author    Eric Cheng <none@none>  2008-12-04 18:16:10 -0800
committer Eric Cheng <none@none>  2008-12-04 18:16:10 -0800
commit    da14cebe459d3275048785f25bd869cb09b5307f (patch)
tree      a394d2c61ec4d7591782a4a5db4e3a157c3ca89a /usr/src
parent    03361682bf38acf5bcc36ee83a0d6277731eee68 (diff)
download  illumos-joyent-da14cebe459d3275048785f25bd869cb09b5307f.tar.gz
PSARC/2006/357 Crossbow - Network Virtualization and Resource Management
6498311 Crossbow - Network Virtualization and Resource Management
6402493 DLPI provider loopback behavior should be improved
6453165 move mac capabs definitions outside mac.h
6338667 Need ability to use NAT for non-global zones
6692884 several threads hung due to deadlock scenario between aggr and mac
6768302 dls: soft_ring_bind/unbind race can panic in thread_affinity_set with cpu_id == -1
6635849 race between lacp_xmit_sm() and aggr_m_stop() ends in panic
6742712 potential message double free in the aggr driver
6754299 a potential race between aggr_m_tx() and aggr_port_delete()
6485324 mi_data_lock recursively held when enabling promiscuous mode on an aggregation
6442559 Forwarding perf bottleneck due to mac_rx() calls
6505462 assertion failure after removing a port from a snooped aggregation
6716664 need to add src/dst IP address to soft ring fanout
--HG--
rename : usr/src/uts/common/io/dls/dls_soft_ring.c => usr/src/uts/common/io/mac/mac_soft_ring.c
rename : usr/src/uts/common/inet/ip/ip_cksum.c => usr/src/uts/common/os/ip_cksum.c
rename : usr/src/uts/common/inet/sctp_crc32.c => usr/src/uts/common/os/sctp_crc32.c
rename : usr/src/uts/common/sys/dls_soft_ring.h => usr/src/uts/common/sys/mac_soft_ring.h
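For orientation before the diffstat: the administrative surface this putback adds is visible in the usage strings further down -- new dladm(1M) subcommands (create-vnic, create-etherstub, show-vnic, show-usage), the new flowadm(1M) utility, and a "net" instance of extended accounting in acctadm(1M). A sketch of a session assembled only from those usage strings (link and file names are invented for illustration):

    # build a virtual switch in software: an etherstub with a VNIC on it
    dladm create-etherstub stub0
    dladm create-vnic -l stub0 vnic1
    dladm show-vnic

    # record per-link network usage, then summarize it from the log
    acctadm -e extended -f /var/adm/exacct/net net
    dladm show-usage -f /var/adm/exacct/net vnic1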
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/cmd/Makefile | 2
-rw-r--r--  usr/src/cmd/Makefile.cmd | 4
-rw-r--r--  usr/src/cmd/acctadm/Makefile | 4
-rw-r--r--  usr/src/cmd/acctadm/acctadm.xcl | 15
-rw-r--r--  usr/src/cmd/acctadm/aconf.c | 43
-rw-r--r--  usr/src/cmd/acctadm/extended-accounting.xml | 39
-rw-r--r--  usr/src/cmd/acctadm/main.c | 111
-rw-r--r--  usr/src/cmd/acctadm/res.c | 52
-rw-r--r--  usr/src/cmd/acctadm/utils.c | 16
-rw-r--r--  usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile | 3
-rw-r--r--  usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c | 5
-rw-r--r--  usr/src/cmd/dladm/Makefile | 2
-rw-r--r--  usr/src/cmd/dladm/dladm.c | 2318
-rw-r--r--  usr/src/cmd/dladm/dladm.xcl | 281
-rw-r--r--  usr/src/cmd/dladm/vnic.conf | 29
-rw-r--r--  usr/src/cmd/flowadm/Makefile | 76
-rw-r--r--  usr/src/cmd/flowadm/flowadm.c | 1963
-rw-r--r--  usr/src/cmd/flowadm/flowadm.conf | 28
-rw-r--r--  usr/src/cmd/flowadm/flowadm.xcl | 113
-rw-r--r--  usr/src/cmd/flowadm/flowprop.conf | 29
-rw-r--r--  usr/src/cmd/mdb/Makefile.common | 4
-rw-r--r--  usr/src/cmd/mdb/common/modules/mac/mac.c | 685
-rw-r--r--  usr/src/cmd/mdb/intel/amd64/mac/Makefile | 34
-rw-r--r--  usr/src/cmd/mdb/intel/ia32/mac/Makefile | 33
-rw-r--r--  usr/src/cmd/mdb/sparc/v9/mac/Makefile | 34
-rw-r--r--  usr/src/cmd/rcm_daemon/Makefile.com | 6
-rw-r--r--  usr/src/cmd/rcm_daemon/common/vlan_rcm.c | 29
-rw-r--r--  usr/src/cmd/rcm_daemon/common/vnic_rcm.c | 1329
-rw-r--r--  usr/src/cmd/svc/milestone/net-physical | 10
-rw-r--r--  usr/src/cmd/svc/profile/generic_limited_net.xml | 1
-rw-r--r--  usr/src/cmd/svc/profile/generic_open.xml | 1
-rw-r--r--  usr/src/cmd/truss/codes.c | 41
-rw-r--r--  usr/src/cmd/vna/Makefile | 3
-rw-r--r--  usr/src/cmd/vna/vna.c | 13
-rw-r--r--  usr/src/lib/Makefile | 2
-rw-r--r--  usr/src/lib/libdladm/Makefile | 10
-rw-r--r--  usr/src/lib/libdladm/Makefile.com | 10
-rw-r--r--  usr/src/lib/libdladm/common/flowattr.c | 411
-rw-r--r--  usr/src/lib/libdladm/common/flowprop.c | 611
-rw-r--r--  usr/src/lib/libdladm/common/libdladm.c | 326
-rw-r--r--  usr/src/lib/libdladm/common/libdladm.h | 105
-rw-r--r--  usr/src/lib/libdladm/common/libdladm_impl.h | 68
-rw-r--r--  usr/src/lib/libdladm/common/libdlaggr.c | 3
-rw-r--r--  usr/src/lib/libdladm/common/libdlflow.c | 903
-rw-r--r--  usr/src/lib/libdladm/common/libdlflow.h | 93
-rw-r--r--  usr/src/lib/libdladm/common/libdlflow_impl.h | 138
-rw-r--r--  usr/src/lib/libdladm/common/libdllink.c | 300
-rw-r--r--  usr/src/lib/libdladm/common/libdllink.h | 47
-rw-r--r--  usr/src/lib/libdladm/common/libdlstat.c | 684
-rw-r--r--  usr/src/lib/libdladm/common/libdlstat.h | 71
-rw-r--r--  usr/src/lib/libdladm/common/libdlvlan.c | 288
-rw-r--r--  usr/src/lib/libdladm/common/libdlvlan.h | 6
-rw-r--r--  usr/src/lib/libdladm/common/libdlvnic.c | 695
-rw-r--r--  usr/src/lib/libdladm/common/libdlvnic.h | 40
-rw-r--r--  usr/src/lib/libdladm/common/linkprop.c | 965
-rw-r--r--  usr/src/lib/libdladm/common/llib-ldladm | 4
-rw-r--r--  usr/src/lib/libdladm/common/mapfile-vers | 55
-rw-r--r--  usr/src/lib/libdladm/common/propfuncs.c | 699
-rw-r--r--  usr/src/lib/libdladm/common/usage.c | 1437
-rw-r--r--  usr/src/lib/libsecdb/exec_attr.txt | 2
-rw-r--r--  usr/src/lib/libsecdb/help/auths/Makefile | 2
-rw-r--r--  usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html | 37
-rw-r--r--  usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html | 35
-rw-r--r--  usr/src/lib/libsecdb/help/profiles/Makefile | 1
-rw-r--r--  usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html | 39
-rw-r--r--  usr/src/lib/libsecdb/prof_attr.txt | 1
-rw-r--r--  usr/src/pkgdefs/SUNW0on/prototype_com | 3
-rw-r--r--  usr/src/pkgdefs/SUNWcnetr/postinstall | 38
-rw-r--r--  usr/src/pkgdefs/SUNWcnetr/prototype_com | 4
-rw-r--r--  usr/src/pkgdefs/SUNWcsu/prototype_com | 5
-rw-r--r--  usr/src/pkgdefs/SUNWmdb/prototype_i386 | 2
-rw-r--r--  usr/src/pkgdefs/SUNWmdb/prototype_sparc | 4
-rw-r--r--  usr/src/pkgdefs/SUNWmdbr/prototype_i386 | 2
-rw-r--r--  usr/src/pkgdefs/SUNWmdbr/prototype_sparc | 4
-rw-r--r--  usr/src/pkgdefs/etc/exception_list_i386 | 13
-rw-r--r--  usr/src/pkgdefs/etc/exception_list_sparc | 13
-rw-r--r--  usr/src/tools/scripts/bfu.sh | 72
-rw-r--r--  usr/src/uts/common/Makefile | 8
-rw-r--r--  usr/src/uts/common/Makefile.files | 30
-rw-r--r--  usr/src/uts/common/inet/ip.h | 191
-rw-r--r--  usr/src/uts/common/inet/ip/icmp.c | 6
-rw-r--r--  usr/src/uts/common/inet/ip/igmp.c | 6
-rw-r--r--  usr/src/uts/common/inet/ip/ip.c | 649
-rw-r--r--  usr/src/uts/common/inet/ip/ip6.c | 53
-rw-r--r--  usr/src/uts/common/inet/ip/ip_ftable.c | 395
-rw-r--r--  usr/src/uts/common/inet/ip/ip_if.c | 1362
-rw-r--r--  usr/src/uts/common/inet/ip/ip_ire.c | 31
-rw-r--r--  usr/src/uts/common/inet/ip/ip_mroute.c | 4
-rw-r--r--  usr/src/uts/common/inet/ip/ip_multi.c | 4
-rw-r--r--  usr/src/uts/common/inet/ip/ip_netinfo.c | 2
-rw-r--r--  usr/src/uts/common/inet/ip/ip_squeue.c | 1336
-rw-r--r--  usr/src/uts/common/inet/ip/spd.c | 7
-rw-r--r--  usr/src/uts/common/inet/ip/tun.c | 2
-rw-r--r--  usr/src/uts/common/inet/ip_ftable.h | 4
-rw-r--r--  usr/src/uts/common/inet/ip_if.h | 15
-rw-r--r--  usr/src/uts/common/inet/ip_impl.h | 70
-rw-r--r--  usr/src/uts/common/inet/ip_ire.h | 1
-rw-r--r--  usr/src/uts/common/inet/ip_stack.h | 9
-rw-r--r--  usr/src/uts/common/inet/ipclassifier.h | 13
-rw-r--r--  usr/src/uts/common/inet/ipdrop.h | 1
-rw-r--r--  usr/src/uts/common/inet/squeue.c | 1970
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c | 297
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_kssl.c | 19
-rw-r--r--  usr/src/uts/common/inet/udp/udp.c | 66
-rw-r--r--  usr/src/uts/common/inet/udp_impl.h | 8
-rw-r--r--  usr/src/uts/common/io/afe/afe.c | 1
-rw-r--r--  usr/src/uts/common/io/afe/afeimpl.h | 4
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_ctl.c | 40
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_dev.c | 38
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_grp.c | 1221
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_lacp.c | 766
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_port.c | 351
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_recv.c | 58
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_send.c | 73
-rw-r--r--  usr/src/uts/common/io/ath/ath_main.c | 3
-rw-r--r--  usr/src/uts/common/io/bge/bge.conf | 2
-rw-r--r--  usr/src/uts/common/io/bge/bge_chip2.c | 44
-rw-r--r--  usr/src/uts/common/io/bge/bge_hw.h | 65
-rw-r--r--  usr/src/uts/common/io/bge/bge_impl.h | 27
-rw-r--r--  usr/src/uts/common/io/bge/bge_main2.c | 563
-rw-r--r--  usr/src/uts/common/io/bge/bge_recv2.c | 123
-rw-r--r--  usr/src/uts/common/io/bge/bge_send.c | 28
-rw-r--r--  usr/src/uts/common/io/dld/dld_drv.c | 493
-rw-r--r--  usr/src/uts/common/io/dld/dld_flow.c | 119
-rw-r--r--  usr/src/uts/common/io/dld/dld_proto.c | 1159
-rw-r--r--  usr/src/uts/common/io/dld/dld_str.c | 1171
-rw-r--r--  usr/src/uts/common/io/dls/dls.c | 996
-rw-r--r--  usr/src/uts/common/io/dls/dls_link.c | 881
-rw-r--r--  usr/src/uts/common/io/dls/dls_mgmt.c | 782
-rw-r--r--  usr/src/uts/common/io/dls/dls_mod.c | 17
-rw-r--r--  usr/src/uts/common/io/dls/dls_soft_ring.c | 773
-rw-r--r--  usr/src/uts/common/io/dls/dls_stat.c | 74
-rw-r--r--  usr/src/uts/common/io/dls/dls_vlan.c | 561
-rw-r--r--  usr/src/uts/common/io/dmfe/dmfe_impl.h | 6
-rw-r--r--  usr/src/uts/common/io/dmfe/dmfe_main.c | 46
-rw-r--r--  usr/src/uts/common/io/e1000g/e1000g_main.c | 887
-rw-r--r--  usr/src/uts/common/io/e1000g/e1000g_rx.c | 45
-rw-r--r--  usr/src/uts/common/io/e1000g/e1000g_stat.c | 5
-rw-r--r--  usr/src/uts/common/io/e1000g/e1000g_sw.h | 40
-rw-r--r--  usr/src/uts/common/io/e1000g/e1000g_tx.c | 105
-rw-r--r--  usr/src/uts/common/io/hxge/hxge.h | 2
-rw-r--r--  usr/src/uts/common/io/hxge/hxge_impl.h | 29
-rw-r--r--  usr/src/uts/common/io/hxge/hxge_kstats.c | 122
-rw-r--r--  usr/src/uts/common/io/hxge/hxge_main.c | 377
-rw-r--r--  usr/src/uts/common/io/hxge/hxge_rxdma.c | 8
-rw-r--r--  usr/src/uts/common/io/hxge/hxge_rxdma.h | 1
-rw-r--r--  usr/src/uts/common/io/hxge/hxge_virtual.c | 71
-rw-r--r--  usr/src/uts/common/io/ib/clients/ibd/ibd.c | 9
-rw-r--r--  usr/src/uts/common/io/igb/igb.conf | 40
-rw-r--r--  usr/src/uts/common/io/igb/igb_gld.c | 388
-rw-r--r--  usr/src/uts/common/io/igb/igb_hw.h | 21
-rw-r--r--  usr/src/uts/common/io/igb/igb_main.c | 488
-rw-r--r--  usr/src/uts/common/io/igb/igb_osdep.c | 76
-rw-r--r--  usr/src/uts/common/io/igb/igb_osdep.h | 30
-rw-r--r--  usr/src/uts/common/io/igb/igb_rx.c | 48
-rw-r--r--  usr/src/uts/common/io/igb/igb_sw.h | 85
-rw-r--r--  usr/src/uts/common/io/igb/igb_tx.c | 82
-rw-r--r--  usr/src/uts/common/io/ipw/ipw2100.c | 3
-rw-r--r--  usr/src/uts/common/io/iwh/iwh.c | 3
-rw-r--r--  usr/src/uts/common/io/iwi/ipw2200.c | 3
-rw-r--r--  usr/src/uts/common/io/iwk/iwk2.c | 3
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe.conf | 45
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_common.c | 47
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_gld.c | 292
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_main.c | 677
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_rx.c | 54
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_stat.c | 175
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_sw.h | 112
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_tx.c | 337
-rw-r--r--  usr/src/uts/common/io/mac/README | 80
-rw-r--r--  usr/src/uts/common/io/mac/mac.c | 6101
-rw-r--r--  usr/src/uts/common/io/mac/mac_bcast.c | 668
-rw-r--r--  usr/src/uts/common/io/mac/mac_client.c | 3763
-rw-r--r--  usr/src/uts/common/io/mac/mac_datapath_setup.c | 3347
-rw-r--r--  usr/src/uts/common/io/mac/mac_flow.c | 2373
-rw-r--r--  usr/src/uts/common/io/mac/mac_hio.c | 182
-rw-r--r--  usr/src/uts/common/io/mac/mac_provider.c | 1031
-rw-r--r--  usr/src/uts/common/io/mac/mac_sched.c | 3819
-rw-r--r--  usr/src/uts/common/io/mac/mac_soft_ring.c | 732
-rw-r--r--  usr/src/uts/common/io/mac/mac_util.c | 823
-rw-r--r--  usr/src/uts/common/io/mac/plugins/mac_ether.c | 3
-rw-r--r--  usr/src/uts/common/io/mac/plugins/mac_wifi.c | 3
-rw-r--r--  usr/src/uts/common/io/mxfe/mxfe.c | 1
-rw-r--r--  usr/src/uts/common/io/mxfe/mxfeimpl.h | 4
-rw-r--r--  usr/src/uts/common/io/net80211/net80211.c | 3
-rw-r--r--  usr/src/uts/common/io/net80211/net80211_input.c | 1
-rw-r--r--  usr/src/uts/common/io/net80211/net80211_ioctl.c | 2
-rw-r--r--  usr/src/uts/common/io/nge/nge.h | 2
-rw-r--r--  usr/src/uts/common/io/nge/nge_main.c | 7
-rw-r--r--  usr/src/uts/common/io/ntxn/unm_nic.h | 2
-rw-r--r--  usr/src/uts/common/io/ntxn/unm_nic_main.c | 4
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_fzc.c | 3
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_hcall.s | 6
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_hio.c | 813
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_hio_guest.c | 10
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_hv.c | 6
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_hw.c | 14
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_mac.c | 161
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_main.c | 908
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_ndd.c | 4
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_rxdma.c | 441
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_send.c | 420
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_txdma.c | 124
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_virtual.c | 349
-rw-r--r--  usr/src/uts/common/io/pcan/pcan.c | 3
-rw-r--r--  usr/src/uts/common/io/pcwl/pcwl.c | 3
-rw-r--r--  usr/src/uts/common/io/ral/rt2560.c | 3
-rw-r--r-- [-rwxr-xr-x]  usr/src/uts/common/io/rge/rge.h | 7
-rw-r--r--  usr/src/uts/common/io/rge/rge_chip.c | 5
-rw-r--r--  usr/src/uts/common/io/rge/rge_main.c | 32
-rw-r--r-- [-rwxr-xr-x]  usr/src/uts/common/io/rge/rge_rxtx.c | 4
-rw-r--r--  usr/src/uts/common/io/rtw/rtw.c | 3
-rw-r--r--  usr/src/uts/common/io/rum/rum.c | 3
-rw-r--r--  usr/src/uts/common/io/sfe/sfe_util.c | 61
-rw-r--r--  usr/src/uts/common/io/sfe/sfe_util.h | 7
-rw-r--r--  usr/src/uts/common/io/softmac/softmac_ctl.c | 99
-rw-r--r--  usr/src/uts/common/io/softmac/softmac_dev.c | 17
-rw-r--r--  usr/src/uts/common/io/softmac/softmac_main.c | 444
-rw-r--r--  usr/src/uts/common/io/softmac/softmac_pkt.c | 38
-rw-r--r--  usr/src/uts/common/io/strplumb.c | 2
-rw-r--r--  usr/src/uts/common/io/ural/ural.c | 3
-rw-r--r--  usr/src/uts/common/io/vnic/vnic_bcast.c | 468
-rw-r--r--  usr/src/uts/common/io/vnic/vnic_cl.c | 319
-rw-r--r--  usr/src/uts/common/io/vnic/vnic_ctl.c | 266
-rw-r--r--  usr/src/uts/common/io/vnic/vnic_dev.c | 1684
-rw-r--r--  usr/src/uts/common/io/wpi/wpi.c | 3
-rw-r--r--  usr/src/uts/common/io/xge/drv/xge.c | 389
-rw-r--r--  usr/src/uts/common/io/xge/drv/xge_osdep.h | 2
-rw-r--r--  usr/src/uts/common/io/xge/drv/xgell.c | 1205
-rw-r--r--  usr/src/uts/common/io/xge/drv/xgell.h | 205
-rw-r--r--  usr/src/uts/common/io/xge/hal/include/xgehal-channel.h | 8
-rw-r--r--  usr/src/uts/common/io/xge/hal/include/xgehal-regs.h | 9
-rw-r--r--  usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c | 24
-rw-r--r--  usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c | 2
-rw-r--r--  usr/src/uts/common/os/exacct.c | 268
-rw-r--r--  usr/src/uts/common/os/ip_cksum.c (renamed from usr/src/uts/common/inet/ip/ip_cksum.c) | 47
-rw-r--r--  usr/src/uts/common/os/modhash.c | 20
-rw-r--r--  usr/src/uts/common/os/policy.c | 21
-rw-r--r--  usr/src/uts/common/os/sctp_crc32.c (renamed from usr/src/uts/common/inet/sctp_crc32.c) | 11
-rw-r--r--  usr/src/uts/common/os/space.c | 21
-rw-r--r--  usr/src/uts/common/os/strsubr.c | 51
-rw-r--r--  usr/src/uts/common/sys/Makefile | 8
-rw-r--r--  usr/src/uts/common/sys/acctctl.h | 55
-rw-r--r--  usr/src/uts/common/sys/aggr.h | 5
-rw-r--r--  usr/src/uts/common/sys/aggr_impl.h | 120
-rw-r--r--  usr/src/uts/common/sys/aggr_lacp.h | 19
-rw-r--r--  usr/src/uts/common/sys/dld.h | 196
-rw-r--r--  usr/src/uts/common/sys/dld_impl.h | 238
-rw-r--r--  usr/src/uts/common/sys/dld_ioc.h | 8
-rw-r--r--  usr/src/uts/common/sys/dlpi.h | 88
-rw-r--r--  usr/src/uts/common/sys/dls.h | 240
-rw-r--r--  usr/src/uts/common/sys/dls_impl.h | 195
-rw-r--r--  usr/src/uts/common/sys/dls_mgmt.h | 218
-rw-r--r--  usr/src/uts/common/sys/dls_soft_ring.h | 96
-rw-r--r--  usr/src/uts/common/sys/exacct.h | 8
-rw-r--r--  usr/src/uts/common/sys/exacct_catalog.h | 43
-rw-r--r--  usr/src/uts/common/sys/exacct_impl.h | 45
-rw-r--r--  usr/src/uts/common/sys/ib/clients/ibd/ibd.h | 4
-rw-r--r--  usr/src/uts/common/sys/mac.h | 571
-rw-r--r--  usr/src/uts/common/sys/mac_client.h | 184
-rw-r--r--  usr/src/uts/common/sys/mac_client_impl.h | 318
-rw-r--r--  usr/src/uts/common/sys/mac_client_priv.h | 149
-rw-r--r--  usr/src/uts/common/sys/mac_flow.h | 210
-rw-r--r--  usr/src/uts/common/sys/mac_flow_impl.h | 537
-rw-r--r--  usr/src/uts/common/sys/mac_impl.h | 710
-rw-r--r--  usr/src/uts/common/sys/mac_provider.h | 478
-rw-r--r--  usr/src/uts/common/sys/mac_soft_ring.h | 724
-rw-r--r--  usr/src/uts/common/sys/modhash.h | 11
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge.h | 42
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge_common.h | 18
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge_defs.h | 6
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge_fflp_hw.h | 3
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge_flow.h | 1
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge_hio.h | 27
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge_impl.h | 21
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge_rxdma.h | 20
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge_serialize.h | 101
-rw-r--r--  usr/src/uts/common/sys/nxge/nxge_txdma.h | 13
-rw-r--r--  usr/src/uts/common/sys/policy.h | 1
-rw-r--r--  usr/src/uts/common/sys/softmac_impl.h | 57
-rw-r--r--  usr/src/uts/common/sys/squeue.h | 53
-rw-r--r--  usr/src/uts/common/sys/squeue_impl.h | 111
-rw-r--r--  usr/src/uts/common/sys/stream.h | 3
-rw-r--r--  usr/src/uts/common/sys/strsubr.h | 4
-rw-r--r--  usr/src/uts/common/sys/vlan.h | 6
-rw-r--r--  usr/src/uts/common/sys/vnic.h | 108
-rw-r--r--  usr/src/uts/common/sys/vnic_impl.h | 163
-rw-r--r--  usr/src/uts/common/syscall/acctctl.c | 36
-rw-r--r--  usr/src/uts/common/xen/io/xnb.c | 3
-rw-r--r--  usr/src/uts/common/xen/io/xnbo.c | 142
-rw-r--r--  usr/src/uts/common/xen/io/xnbu.c | 41
-rw-r--r--  usr/src/uts/common/xen/io/xnf.c | 47
-rw-r--r--  usr/src/uts/common/xen/io/xnf.h | 1
-rw-r--r--  usr/src/uts/i86xpv/xnb/Makefile | 5
-rw-r--r--  usr/src/uts/intel/ia32/ml/modstubs.s | 7
-rw-r--r--  usr/src/uts/intel/io/amd8111s/amd8111s_main.c | 31
-rw-r--r-- [-rwxr-xr-x]  usr/src/uts/intel/io/amd8111s/amd8111s_main.h | 11
-rw-r--r--  usr/src/uts/intel/ip/Makefile | 3
-rw-r--r--  usr/src/uts/intel/ip/ip.global-objs.debug64 | 29
-rw-r--r--  usr/src/uts/intel/ip/ip.global-objs.obj64 | 26
-rw-r--r--  usr/src/uts/intel/mac/Makefile | 7
-rw-r--r--  usr/src/uts/intel/vnic/Makefile | 5
-rw-r--r--  usr/src/uts/intel/xge/Makefile | 4
-rw-r--r--  usr/src/uts/sparc/ip/Makefile | 4
-rw-r--r--  usr/src/uts/sparc/ip/ip.global-objs.debug64 | 29
-rw-r--r--  usr/src/uts/sparc/ip/ip.global-objs.obj64 | 26
-rw-r--r--  usr/src/uts/sparc/mac/Makefile | 6
-rw-r--r--  usr/src/uts/sparc/ml/modstubs.s | 7
-rw-r--r--  usr/src/uts/sparc/vnic/Makefile | 5
-rw-r--r--  usr/src/uts/sparc/xge/Makefile | 4
-rw-r--r--  usr/src/uts/sun/io/eri/eri.c | 4
-rw-r--r--  usr/src/uts/sun/io/hme.c | 3
-rw-r--r--  usr/src/uts/sun/io/qfe.c | 2
-rw-r--r--  usr/src/uts/sun4u/io/rmclomv.c | 24
-rw-r--r--  usr/src/uts/sun4v/io/vnet.c | 2
-rw-r--r--  usr/src/uts/sun4v/io/vnet_gen.c | 2
-rw-r--r--  usr/src/uts/sun4v/io/vsw.c | 352
-rw-r--r--  usr/src/uts/sun4v/io/vsw_hio.c | 207
-rw-r--r--  usr/src/uts/sun4v/io/vsw_ldc.c | 109
-rw-r--r--  usr/src/uts/sun4v/io/vsw_phys.c | 1713
-rw-r--r--  usr/src/uts/sun4v/io/vsw_switching.c | 417
-rw-r--r--  usr/src/uts/sun4v/os/mach_startup.c | 16
-rw-r--r--  usr/src/uts/sun4v/sys/vnet_res.h | 4
-rw-r--r--  usr/src/uts/sun4v/sys/vsw.h | 94
-rw-r--r--  usr/src/uts/sun4v/sys/vsw_hio.h | 9
-rw-r--r--  usr/src/uts/sun4v/sys/vsw_ldc.h | 16
326 files changed, 55600 insertions, 23414 deletions
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile
index 927f5ca801..5d7c5f0f8c 100644
--- a/usr/src/cmd/Makefile
+++ b/usr/src/cmd/Makefile
@@ -161,6 +161,7 @@ COMMON_SUBDIRS= \
file \
filebench \
find \
+ flowadm \
fm \
fmli \
fmt \
@@ -582,6 +583,7 @@ MSGSUBDIRS= \
file \
filesync \
find \
+ flowadm \
fm \
fold \
fs.d \
diff --git a/usr/src/cmd/Makefile.cmd b/usr/src/cmd/Makefile.cmd
index 44364753b2..8abf748eab 100644
--- a/usr/src/cmd/Makefile.cmd
+++ b/usr/src/cmd/Makefile.cmd
@@ -66,6 +66,7 @@ ROOTETCTSOL= $(ROOTETCSECURITY)/tsol
ROOTETCSECLIB= $(ROOTETCSECURITY)/lib
ROOTETCZONES= $(ROOTETC)/zones
+ROOTETCINET= $(ROOT)/etc/inet
ROOTCCSBIN= $(ROOT)/usr/ccs/bin
ROOTCCSBIN64= $(ROOTCCSBIN)/$(MACH64)
ROOTCCSBINLINKDIR= $(ROOT)/../../bin
@@ -316,6 +317,9 @@ $(ROOTUSRSBIN64)/%: %
$(ROOTETC)/%: %
$(INS.file)
+$(ROOTETCINET)/%: %
+ $(INS.file)
+
$(ROOTETCDEFAULT)/%: %.dfl
$(INS.rename)
diff --git a/usr/src/cmd/acctadm/Makefile b/usr/src/cmd/acctadm/Makefile
index 554135fa78..09343cbca7 100644
--- a/usr/src/cmd/acctadm/Makefile
+++ b/usr/src/cmd/acctadm/Makefile
@@ -19,8 +19,6 @@
# CDDL HEADER END
#
#
-#ident "%Z%%M% %I% %E% SMI"
-#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -37,7 +35,7 @@ include ../Makefile.cmd
ROOTMANIFESTDIR = $(ROOTSVCSYSTEM)
CFLAGS += $(CCVERBOSE)
-LDLIBS += -lexacct -lscf -lsecdb
+LDLIBS += -lexacct -lscf -lsecdb -ldladm
POFILE = acctadm.po
XGETFLAGS = -a -x acctadm.xcl
FILEMODE = 0555
diff --git a/usr/src/cmd/acctadm/acctadm.xcl b/usr/src/cmd/acctadm/acctadm.xcl
index 4926a94690..e8d2b4572d 100644
--- a/usr/src/cmd/acctadm/acctadm.xcl
+++ b/usr/src/cmd/acctadm/acctadm.xcl
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -19,6 +18,11 @@
#
# CDDL HEADER END
#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
msgid "ruxf:e:d:"
msgid "/etc/acctadm.conf"
msgid ""
@@ -26,6 +30,7 @@ msgid "process"
msgid "proc"
msgid "task"
msgid "flow"
+msgid "net"
msgid "no"
msgid "none"
msgid "yes"
@@ -41,6 +46,10 @@ msgid "ACCTADM_FLOW_ENABLE"
msgid "ACCTADM_FLOW_FILE"
msgid "ACCTADM_FLOW_TRACKED"
msgid "ACCTADM_FLOW_UNTRACKED"
+msgid "ACCTADM_NET_ENABLE"
+msgid "ACCTADM_NET_FILE"
+msgid "ACCTADM_NET_TRACKED"
+msgid "ACCTADM_NET_UNTRACKED"
msgid "r+"
msgid "r"
msgid " %[^=]=%s \n%n"
diff --git a/usr/src/cmd/acctadm/aconf.c b/usr/src/cmd/acctadm/aconf.c
index 70c5f7618d..8453a4fa8f 100644
--- a/usr/src/cmd/acctadm/aconf.c
+++ b/usr/src/cmd/acctadm/aconf.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/acctctl.h>
#include <unistd.h>
@@ -32,6 +30,7 @@
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
+#include <libdllink.h>
#include <libscf.h>
#include <pwd.h>
#include <auth_attr.h>
@@ -47,6 +46,7 @@
#define FMRI_FLOW_ACCT "svc:/system/extended-accounting:flow"
#define FMRI_PROC_ACCT "svc:/system/extended-accounting:process"
#define FMRI_TASK_ACCT "svc:/system/extended-accounting:task"
+#define FMRI_NET_ACCT "svc:/system/extended-accounting:net"
#define NELEM(x) (sizeof (x)) / (sizeof (x[0]))
@@ -134,13 +134,14 @@ aconf_setup(const char *fmri)
}
/*
- * Flow accounting is not available in non-global zones and
+ * Net/Flow accounting is not available in non-global zones and
* the service instance should therefore never be 'enabled' in
* non-global zones. This is enforced by acctadm(1M), but there is
* nothing that prevents someone from calling svcadm enable directly,
* so we handle that case here by disabling the instance.
*/
- if (type == AC_FLOW && getzoneid() != GLOBAL_ZONEID) {
+ if ((type == AC_FLOW || type == AC_NET) &&
+ getzoneid() != GLOBAL_ZONEID) {
(void) smf_disable_instance(fmri, 0);
warn(gettext("%s accounting cannot be configured in "
"non-global zones\n"), ac_type_name(type));
@@ -210,6 +211,19 @@ aconf_setup(const char *fmri)
ret = SMF_EXIT_ERR_FATAL;
}
(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
+
+ if (state == AC_ON && type == AC_NET) {
+ /*
+ * Start logging.
+ */
+ (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG,
+ NULL);
+ (void) dladm_start_usagelog(strncmp(tracked, "basic",
+ strlen("basic")) == 0 ? DLADM_LOGTYPE_LINK :
+ DLADM_LOGTYPE_FLOW, 20);
+ (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG,
+ NULL);
+ }
out:
aconf_scf_fini();
return (ret);
@@ -219,7 +233,7 @@ void
aconf_print(FILE *fp, int types)
{
acctconf_t ac;
- int print_order[] = { AC_TASK, AC_PROC, AC_FLOW };
+ int print_order[] = { AC_TASK, AC_PROC, AC_FLOW, AC_NET };
int i;
for (i = 0; i < NELEM(print_order); i++) {
@@ -279,6 +293,21 @@ aconf_print_type(acctconf_t *acp, FILE *fp, int type)
gettext(" Untracked flow resources: %s\n"),
acp->untracked);
break;
+ case AC_NET:
+ (void) fprintf(fp,
+ gettext(" Net accounting: %s\n"),
+ acp->state == AC_ON ?
+ gettext("active") : gettext("inactive"));
+ (void) fprintf(fp,
+ gettext(" Net accounting file: %s\n"),
+ acp->file);
+ (void) fprintf(fp,
+ gettext(" Tracked net resources: %s\n"),
+ acp->tracked);
+ (void) fprintf(fp,
+ gettext(" Untracked net resources: %s\n"),
+ acp->untracked);
+ break;
}
}
@@ -369,6 +398,8 @@ aconf_type2fmri(int type)
return (FMRI_TASK_ACCT);
case AC_FLOW:
return (FMRI_FLOW_ACCT);
+ case AC_NET:
+ return (FMRI_NET_ACCT);
default:
die(gettext("invalid type %d\n"), type);
}
@@ -385,6 +416,8 @@ aconf_fmri2type(const char *fmri)
return (AC_TASK);
else if (strcmp(fmri, FMRI_FLOW_ACCT) == 0)
return (AC_FLOW);
+ else if (strcmp(fmri, FMRI_NET_ACCT) == 0)
+ return (AC_NET);
else
return (-1);
}
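The bracketing in this hunk -- raise PRIV_SYS_DL_CONFIG only around the libdladm call, then drop it again -- is the idiom this putback uses everywhere it starts or stops usage logging. Condensed into one helper (a sketch, not code from the patch; the helper name is invented, and 20 is the default logging interval in seconds that the patch hardcodes):

    #include <priv.h>
    #include <libdllink.h>

    /* Start usage logging at link or flow granularity, with least privilege. */
    static void
    net_logging_start(const char *resources)
    {
            (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG, NULL);
            (void) dladm_start_usagelog(strncmp(resources, "basic",
                strlen("basic")) == 0 ? DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW,
                20);
            (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG, NULL);
    }

"basic" resources map to link-level logging; anything else is logged at flow granularity, matching the ternary used in aconf_setup() above and in main.c below.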
diff --git a/usr/src/cmd/acctadm/extended-accounting.xml b/usr/src/cmd/acctadm/extended-accounting.xml
index 2c68130080..07cb9af9c1 100644
--- a/usr/src/cmd/acctadm/extended-accounting.xml
+++ b/usr/src/cmd/acctadm/extended-accounting.xml
@@ -23,8 +23,6 @@
CDDL HEADER END
- ident "%Z%%M% %I% %E% SMI"
-
NOTE: This service manifest is not editable; its contents will
be overwritten by package or patch operations, including
operating system upgrade. Make customizations in a different
@@ -175,6 +173,43 @@
</documentation>
</template>
</instance>
+
+ <instance name='net' enabled='false'>
+
+ <property_group name='general' type='framework'>
+ <propval name='action_authorization' type='astring'
+ value='solaris.smf.manage.extended-accounting.net' />
+ <propval name='value_authorization' type='astring'
+ value='solaris.smf.manage.extended-accounting.net' />
+ </property_group>
+
+ <property_group name='config' type='application'>
+ <propval name='value_authorization' type='astring'
+ value='solaris.smf.value.extended-accounting.net' />
+ <propval name='enabled' type='boolean'
+ value='false' />
+ <propval name='file' type='astring'
+ value='none' />
+ <propval name='tracked' type='astring'
+ value='none' />
+ <propval name='untracked' type='astring'
+ value='extended' />
+ </property_group>
+
+ <template>
+ <common_name>
+ <loctext xml:lang='C'>
+ configure net extended accounting
+ </loctext>
+ </common_name>
+
+ <documentation>
+ <manpage
+ title='acctadm' section='1M'
+ manpath='/usr/share/man' />
+ </documentation>
+ </template>
+ </instance>
<stability value='Unstable' />
</service>
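The manifest delivers the new accounting instance disabled, with its configuration mirrored into the 'config' property group that acctadm(1M) reads and writes. The instance is visible through the ordinary SMF tooling (illustrative commands, not part of the patch):

    svcs -l svc:/system/extended-accounting:net
    svcprop -p config svc:/system/extended-accounting:net

Note that the aconf.c change above guards against bypassing acctadm with a bare 'svcadm enable' in a non-global zone: the start method simply disables the instance again.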
diff --git a/usr/src/cmd/acctadm/main.c b/usr/src/cmd/acctadm/main.c
index f83c1ec73c..484caf8988 100644
--- a/usr/src/cmd/acctadm/main.c
+++ b/usr/src/cmd/acctadm/main.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/acctctl.h>
#include <assert.h>
#include <stdio.h>
@@ -33,6 +31,7 @@
#include <string.h>
#include <errno.h>
#include <libintl.h>
+#include <libdllink.h>
#include <locale.h>
#include <priv.h>
#include <libscf.h>
@@ -44,12 +43,12 @@
static const char USAGE[] = "\
Usage:\n\
- acctadm [ {process | task | flow} ]\n\
+ acctadm [ {process | task | flow | net} ]\n\
acctadm -s\n\
- acctadm -r [ {process | task | flow} ]\n\
- acctadm -x|-E|-D {process | task | flow}\n\
- acctadm -f filename {process | task | flow}\n\
- acctadm -e resources -d resources {process | task | flow}\n";
+ acctadm -r [ {process | task | flow | net} ]\n\
+ acctadm -x|-E|-D {process | task | flow | net}\n\
+ acctadm -f filename {process | task | flow | net}\n\
+ acctadm -e resources -d resources {process | task | flow | net}\n";
static const char OPTS[] = "rsxf:e:d:ED";
@@ -77,6 +76,7 @@ setup_privs()
(void) priv_addset(privset, PRIV_SYS_ACCT);
(void) priv_addset(privset, PRIV_FILE_DAC_WRITE);
+ (void) priv_addset(privset, PRIV_SYS_DL_CONFIG);
(void) priv_delset(privset, PRIV_FILE_LINK_ANY);
(void) priv_delset(privset, PRIV_PROC_EXEC);
(void) priv_delset(privset, PRIV_PROC_FORK);
@@ -98,10 +98,11 @@ setup_privs()
die(gettext("cannot setup privileges"));
/*
- * Turn off the sys_acct and file_dac_write privileges until needed.
+ * Turn off the sys_acct, file_dac_write and dl_config privileges
+ * until needed.
*/
(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_FILE_DAC_WRITE,
- PRIV_SYS_ACCT, NULL);
+ PRIV_SYS_ACCT, PRIV_SYS_DL_CONFIG, NULL);
}
int
@@ -183,7 +184,7 @@ main(int argc, char *argv[])
if (!(disabled || enabled || Dflg || Eflg || file || sflg ||
xflg))
(void) priv_set(PRIV_OFF, PRIV_PERMITTED,
- PRIV_SYS_ACCT, NULL);
+ PRIV_SYS_ACCT, PRIV_SYS_DL_CONFIG, NULL);
if (optind < argc) {
if (typestr != NULL) {
@@ -203,20 +204,34 @@ main(int argc, char *argv[])
type |= AC_TASK;
else if (strcmp(typestr, "flow") == 0)
type |= AC_FLOW;
+ else if (strcmp(typestr, "net") == 0)
+ type |= AC_NET;
else {
warn(gettext("unknown accounting type -- %s\n"),
typestr);
usage();
}
} else
- type = AC_PROC | AC_TASK | AC_FLOW;
+ type = AC_PROC | AC_TASK | AC_FLOW | AC_NET;
/*
+ * Drop the DL config privilege if we are not working with
+ * net.
+ */
+ if ((type & AC_NET) == 0) {
+ (void) priv_set(PRIV_OFF, PRIV_PERMITTED,
+ PRIV_SYS_DL_CONFIG, NULL);
+ }
+ /*
* check for invalid options
*/
if (optcnt > 1)
usage();
+ /*
+ * XXX For AC_NET, enabled/disabled should only be "basic" or
+ * "extended" - need to check it here.
+ */
if ((enabled || disabled) && (rflg || Dflg || sflg || xflg || Eflg))
usage();
@@ -253,9 +268,10 @@ main(int argc, char *argv[])
return (E_ERROR);
}
- assert(type == AC_PROC || type == AC_TASK || type == AC_FLOW);
+ assert(type == AC_PROC || type == AC_TASK || type == AC_FLOW ||
+ type == AC_NET);
- if (type == AC_FLOW && getzoneid() != GLOBAL_ZONEID)
+ if ((type == AC_FLOW || type == AC_NET) && getzoneid() != GLOBAL_ZONEID)
die(gettext("%s accounting cannot be configured in "
"non-global zones\n"), ac_type_name(type));
@@ -277,6 +293,18 @@ main(int argc, char *argv[])
/*
* Turn off the specified accounting and close its file
*/
+
+ /*
+ * Stop net logging before turning it off so that the last
+ * set of logs can be written.
+ */
+ if (type & AC_NET) {
+ (void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ (void) dladm_stop_usagelog(DLADM_LOGTYPE_FLOW);
+ (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ }
state = AC_OFF;
(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
@@ -311,8 +339,22 @@ main(int argc, char *argv[])
free(buf);
die(gettext("cannot obtain list of resources\n"));
}
- if (disabled)
+ if (disabled) {
+ /*
+ * Stop net logging before turning it off so that the
+ * last set of logs can be written.
+ */
+ if (type & AC_NET) {
+ (void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ (void) dladm_stop_usagelog(strncmp(disabled,
+ "basic", strlen("basic")) == 0 ?
+ DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW);
+ (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ }
str2buf(buf, disabled, AC_OFF, type);
+ }
if (enabled)
str2buf(buf, enabled, AC_ON, type);
@@ -332,6 +374,24 @@ main(int argc, char *argv[])
if (aconf_set_string(AC_PROP_UNTRACKED, untracked) == -1)
die(gettext("cannot update %s property\n"),
AC_PROP_UNTRACKED);
+ /*
+ * We will enable net logging after turning it on so that
+ * it can immediately start writing log.
+ */
+ if (type & AC_NET && enabled != NULL) {
+ /*
+ * Default logging interval for AC_NET is 20.
+ * XXX need to find the right place to
+ * configure it.
+ */
+ (void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ (void) dladm_start_usagelog(strncmp(enabled, "basic",
+ strlen("basic")) == 0 ? DLADM_LOGTYPE_LINK :
+ DLADM_LOGTYPE_FLOW, 20);
+ (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ }
free(tracked);
free(untracked);
free(buf);
@@ -365,6 +425,18 @@ main(int argc, char *argv[])
/*
* Disable accounting
*/
+
+ /*
+ * Stop net logging before turning it off so that the last
+ * set of logs can be written.
+ */
+ if (type & AC_NET) {
+ (void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ (void) dladm_stop_usagelog(DLADM_LOGTYPE_FLOW);
+ (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ }
state = AC_OFF;
(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
@@ -395,6 +467,17 @@ main(int argc, char *argv[])
die(gettext("cannot update %s property\n"),
AC_PROP_STATE);
modified++;
+ if (type & AC_NET) {
+ /*
+ * Default logging interval for AC_NET is 20,
+ * XXX need to find the right place to configure it.
+ */
+ (void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ (void) dladm_start_usagelog(DLADM_LOGTYPE_FLOW, 20);
+ (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+ PRIV_SYS_DL_CONFIG, NULL);
+ }
}
(void) priv_set(PRIV_OFF, PRIV_PERMITTED, PRIV_SYS_ACCT, NULL);
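main.c now drives dladm usage logging from every accounting state change: logging is stopped before accounting is switched off (so the final set of records is flushed) and started right after it is enabled. In command terms, the paths exercised above look roughly like this (file name invented for illustration):

    acctadm -e extended -f /var/adm/exacct/net net   # enable; starts usage logging
    acctadm net                                      # print the current configuration
    acctadm -x net                                   # stop logging, then switch accounting off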
diff --git a/usr/src/cmd/acctadm/res.c b/usr/src/cmd/acctadm/res.c
index 844e3641c1..7f9484f12b 100644
--- a/usr/src/cmd/acctadm/res.c
+++ b/usr/src/cmd/acctadm/res.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdlib.h>
#include <stdio.h>
#include <libintl.h>
@@ -89,6 +87,33 @@ static ac_resname_t ac_names[] = {
{ AC_FLOW, AC_FLOW_ANAME, "action" },
/*
+ * Net accounting resources
+ */
+
+ { AC_NET, AC_NET_NAME, "name" },
+ { AC_NET, AC_NET_EHOST, "ehost" },
+ { AC_NET, AC_NET_EDEST, "edest" },
+ { AC_NET, AC_NET_VLAN_TPID, "vlan_pid" },
+ { AC_NET, AC_NET_VLAN_TCI, "vlan_tci" },
+ { AC_NET, AC_NET_SAP, "sap" },
+ { AC_NET, AC_NET_PRIORITY, "priority" },
+ { AC_NET, AC_NET_BWLIMIT, "bwlimit" },
+ { AC_NET, AC_NET_DEVNAME, "devname" },
+ { AC_NET, AC_NET_SADDR, "src_ip" },
+ { AC_NET, AC_NET_DADDR, "dst_ip" },
+ { AC_NET, AC_NET_SPORT, "src_port" },
+ { AC_NET, AC_NET_DPORT, "dst_port" },
+ { AC_NET, AC_NET_PROTOCOL, "protocol" },
+ { AC_NET, AC_NET_DSFIELD, "dsfield" },
+ { AC_NET, AC_NET_CURTIME, "curtime" },
+ { AC_NET, AC_NET_IBYTES, "ibytes" },
+ { AC_NET, AC_NET_OBYTES, "obytes" },
+ { AC_NET, AC_NET_IPKTS, "ipkts" },
+ { AC_NET, AC_NET_OPKTS, "opkts" },
+ { AC_NET, AC_NET_IERRPKTS, "ierrpkts" },
+ { AC_NET, AC_NET_OERRPKTS, "oerrpkts" },
+
+ /*
* These are included for compatibility with old acctadm that
* didn't have resource groups for individual accounting types.
* It was possible to have resource "pid" enabled for task
@@ -134,6 +159,19 @@ static ac_group_t ac_groups[] = {
{ AC_FLOW_SADDR, AC_FLOW_DADDR, AC_FLOW_SPORT, AC_FLOW_DPORT,
AC_FLOW_PROTOCOL, AC_FLOW_NBYTES, AC_FLOW_NPKTS, AC_FLOW_ANAME,
AC_NONE } },
+ { AC_NET, "extended",
+ { AC_NET_NAME, AC_NET_EHOST, AC_NET_EDEST, AC_NET_VLAN_TPID,
+ AC_NET_VLAN_TCI, AC_NET_SAP, AC_NET_PRIORITY,
+ AC_NET_BWLIMIT, AC_NET_DEVNAME, AC_NET_SADDR, AC_NET_DADDR,
+ AC_NET_SPORT, AC_NET_DPORT, AC_NET_PROTOCOL, AC_NET_DSFIELD,
+ AC_NET_CURTIME, AC_NET_IBYTES, AC_NET_OBYTES, AC_NET_IPKTS,
+ AC_NET_OPKTS, AC_NET_IERRPKTS, AC_NET_OERRPKTS, AC_NONE } },
+ { AC_NET, "basic",
+ { AC_NET_NAME, AC_NET_DEVNAME, AC_NET_EHOST, AC_NET_EDEST,
+ AC_NET_VLAN_TPID, AC_NET_VLAN_TCI, AC_NET_SAP,
+ AC_NET_PRIORITY, AC_NET_BWLIMIT, AC_NET_CURTIME, AC_NET_IBYTES,
+ AC_NET_OBYTES, AC_NET_IPKTS, AC_NET_OPKTS, AC_NET_IERRPKTS,
+ AC_NET_OERRPKTS, AC_NONE } },
{ AC_NONE, NULL,
{ AC_NONE } }
};
@@ -202,9 +240,10 @@ printgroups(int type)
{
int header = 0;
- if ((type & AC_PROC) && (type & AC_TASK) && (type & AC_FLOW))
+ if ((type & AC_PROC) && (type & AC_TASK) && (type & AC_FLOW) &&
+ (type & AC_NET)) {
header = 1;
-
+ }
if (type & AC_PROC) {
if (header == 1)
(void) printf("process:\n");
@@ -220,6 +259,11 @@ printgroups(int type)
(void) printf("flow:\n");
printgroup(AC_FLOW);
}
+ if (type & AC_NET) {
+ if (header == 1)
+ (void) printf("net:\n");
+ printgroup(AC_NET);
+ }
}
/*
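res.c gives AC_NET two resource groups: "basic", roughly the per-link counters, and "extended", which adds the flow attributes (addresses, ports, protocol, dsfield). As with the other accounting types, the groups can be listed from the command line (illustrative):

    acctadm -r net    # list the resource groups available for net accounting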
diff --git a/usr/src/cmd/acctadm/utils.c b/usr/src/cmd/acctadm/utils.c
index 26482d5ccd..bbee653eeb 100644
--- a/usr/src/cmd/acctadm/utils.c
+++ b/usr/src/cmd/acctadm/utils.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <assert.h>
#include <sys/types.h>
#include <sys/acctctl.h>
@@ -107,6 +105,8 @@ ac_type_name(int type)
return (gettext("flow"));
case AC_TASK:
return (gettext("task"));
+ case AC_NET:
+ return (gettext("net"));
default:
die(gettext("invalid type %d\n"), type);
}
@@ -217,8 +217,9 @@ verify_exacct_file(const char *file, int type)
} else {
/*
* A non-header object exists. Insist that it be
- * either a process, task, or flow accounting record,
- * the same type as is desired.
+ * either a process, task, flow or net accounting
+ * record, the same type as is desired.
+ * xxx-venu:check 101 merge for EXD_GROUP_NET_*
*/
uint_t c = eo.eo_catalog & EXD_DATA_MASK;
@@ -226,7 +227,12 @@ verify_exacct_file(const char *file, int type)
(eo.eo_catalog & EXC_CATALOG_MASK) != EXC_NONE ||
(!(c == EXD_GROUP_PROC && type == AC_PROC ||
c == EXD_GROUP_TASK && type == AC_TASK ||
- c == EXD_GROUP_FLOW && type == AC_FLOW))) {
+ c == EXD_GROUP_FLOW && type == AC_FLOW ||
+ (c == EXD_GROUP_NET_LINK_DESC ||
+ c == EXD_GROUP_NET_FLOW_DESC ||
+ c == EXD_GROUP_NET_LINK_STATS ||
+ c == EXD_GROUP_NET_FLOW_STATS) &&
+ type == AC_NET))) {
(void) ea_close(&ef);
return (B_FALSE);
}
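verify_exacct_file() above now accepts any of the four EXD_GROUP_NET_* record groups when validating a net accounting file. For context, a minimal libexacct(3LIB) consumer that walks such a file and prints the same eo_catalog field the check inspects might look like this (a sketch, not code from this putback):

    #include <exacct.h>
    #include <fcntl.h>
    #include <stdio.h>

    /* Print the catalog tag of every top-level object in an exacct file. */
    static int
    dump_catalogs(const char *file)
    {
            ea_file_t ef;
            ea_object_t eo;

            if (ea_open(&ef, file, NULL, 0, O_RDONLY, 0) == -1)
                    return (-1);
            while (ea_get_object(&ef, &eo) != EO_ERROR) {
                    (void) printf("catalog 0x%x\n", (unsigned int)eo.eo_catalog);
                    if (eo.eo_type == EO_ITEM)
                            (void) ea_free_item(&eo, EUP_ALLOC);
            }
            (void) ea_close(&ef);
            return (0);
    }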
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
index 4924d2fe4e..69e91758ea 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
@@ -22,7 +22,6 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
#
PROG = ifconfig
@@ -39,7 +38,7 @@ COMMONSRCS= $(CMDINETCOMMONDIR)/$(COMMONOBJS:%.o=%.c)
SRCS= $(LOCALSRCS) $(COMMONSRCS)
CPPFLAGS += -I$(CMDINETCOMMONDIR) -I$(SRC)/common/net/dhcp
-LDLIBS += -ldhcpagent -linetcfg -ldlpi
+LDLIBS += -ldhcpagent -linetcfg -ldlpi -ldladm
LINTFLAGS += -m
ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%)
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
index b33fc6c1b6..79e2991164 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
@@ -13,6 +13,7 @@
#include "ifconfig.h"
#include <compat.h>
#include <libdlpi.h>
+#include <libdllink.h>
#include <inet/ip.h>
#include <inet/ipsec_impl.h>
@@ -4499,7 +4500,11 @@ static boolean_t
ni_entry(const char *linkname, void *arg)
{
dlpi_handle_t dh;
+ datalink_class_t class;
+ (void) dladm_name2info(linkname, NULL, NULL, &class, NULL);
+ if (class == DATALINK_CLASS_ETHERSTUB)
+ return (_B_FALSE);
if (dlpi_open(linkname, &dh, 0) != DLPI_SUCCESS)
return (_B_FALSE);
diff --git a/usr/src/cmd/dladm/Makefile b/usr/src/cmd/dladm/Makefile
index 94e6842ff3..6757c63d89 100644
--- a/usr/src/cmd/dladm/Makefile
+++ b/usr/src/cmd/dladm/Makefile
@@ -22,7 +22,6 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
#
PROG= dladm
@@ -35,6 +34,7 @@ ROOTCFGFILES= $(CFGFILES:%=$(ROOTCFGDIR)/%)
include ../Makefile.cmd
XGETFLAGS += -a -x $(PROG).xcl
+LDLIBS += -L$(ROOT)/lib -lsocket
LDLIBS += -ldladm -ldlpi -lkstat -lsecdb -lbsm -linetutil -ldevinfo
$(ROOTCFGFILES) := OWNER= dladm
diff --git a/usr/src/cmd/dladm/dladm.c b/usr/src/cmd/dladm/dladm.c
index 466adfe6c0..9422a31da3 100644
--- a/usr/src/cmd/dladm/dladm.c
+++ b/usr/src/cmd/dladm/dladm.c
@@ -46,7 +46,9 @@
#include <libintl.h>
#include <libdevinfo.h>
#include <libdlpi.h>
+#include <libdladm.h>
#include <libdllink.h>
+#include <libdlstat.h>
#include <libdlaggr.h>
#include <libdlwlan.h>
#include <libdlvlan.h>
@@ -54,11 +56,18 @@
#include <libinetutil.h>
#include <bsm/adt.h>
#include <bsm/adt_event.h>
+#include <libdlvnic.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/processor.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <net/if_types.h>
#include <stddef.h>
-#define AGGR_DRV "aggr"
#define STR_UNDEF_VAL "--"
#define MAXPORT 256
+#define MAXVNIC 256
#define BUFLEN(lim, ptr) (((lim) > (ptr)) ? ((lim) - (ptr)) : 0)
#define MAXLINELEN 1024
#define SMF_UPGRADE_FILE "/var/svc/profile/upgrade"
@@ -131,9 +140,7 @@
* with a callback function that will be called for each field to be printed.
* The callback function will be passed a pointer to the print_field_t
* for the field, and the pf_index may then be used to identify the
- * system call required to find the value to be printed. An example of
- * this implementation may be found in the do_show_dev() and print_dev()
- * invocation.
+ * system call required to find the value to be printed.
*/
typedef struct print_field_s {
@@ -192,15 +199,6 @@ static char *dladm_print_field(print_field_t *, void *);
#define MAX_FIELD_LEN 32
-typedef struct pktsum_s {
- uint64_t ipackets;
- uint64_t opackets;
- uint64_t rbytes;
- uint64_t obytes;
- uint32_t ierrors;
- uint32_t oerrors;
-} pktsum_t;
-
typedef struct show_state {
boolean_t ls_firstonly;
boolean_t ls_donefirst;
@@ -210,6 +208,8 @@ typedef struct show_state {
print_state_t ls_print;
boolean_t ls_parseable;
boolean_t ls_printheader;
+ boolean_t ls_mac;
+ boolean_t ls_hwgrp;
} show_state_t;
typedef struct show_grp_state {
@@ -226,9 +226,37 @@ typedef struct show_grp_state {
print_state_t gs_print;
} show_grp_state_t;
+typedef struct show_vnic_state {
+ datalink_id_t vs_vnic_id;
+ datalink_id_t vs_link_id;
+ char vs_vnic[MAXLINKNAMELEN];
+ char vs_link[MAXLINKNAMELEN];
+ boolean_t vs_parseable;
+ boolean_t vs_printheader;
+ boolean_t vs_found;
+ boolean_t vs_firstonly;
+ boolean_t vs_donefirst;
+ boolean_t vs_stats;
+ boolean_t vs_printstats;
+ pktsum_t vs_totalstats;
+ pktsum_t vs_prevstats[MAXVNIC];
+ boolean_t vs_etherstub;
+ dladm_status_t vs_status;
+ uint32_t vs_flags;
+ print_state_t vs_print;
+} show_vnic_state_t;
+
+typedef struct show_usage_state_s {
+ boolean_t us_plot;
+ boolean_t us_parseable;
+ boolean_t us_printheader;
+ boolean_t us_first;
+ print_state_t us_print;
+} show_usage_state_t;
+
typedef void cmdfunc_t(int, char **, const char *);
-static cmdfunc_t do_show_link, do_show_dev, do_show_wifi, do_show_phys;
+static cmdfunc_t do_show_link, do_show_wifi, do_show_phys;
static cmdfunc_t do_create_aggr, do_delete_aggr, do_add_aggr, do_remove_aggr;
static cmdfunc_t do_modify_aggr, do_show_aggr, do_up_aggr;
static cmdfunc_t do_scan_wifi, do_connect_wifi, do_disconnect_wifi;
@@ -239,21 +267,25 @@ static cmdfunc_t do_create_vlan, do_delete_vlan, do_up_vlan, do_show_vlan;
static cmdfunc_t do_rename_link, do_delete_phys, do_init_phys;
static cmdfunc_t do_show_linkmap;
static cmdfunc_t do_show_ether;
+static cmdfunc_t do_create_vnic, do_delete_vnic, do_show_vnic;
+static cmdfunc_t do_up_vnic;
+static cmdfunc_t do_create_etherstub, do_delete_etherstub, do_show_etherstub;
+static cmdfunc_t do_show_usage;
+
+static void do_up_vnic_common(int, char **, const char *, boolean_t);
static void altroot_cmd(char *, int, char **);
static int show_linkprop_onelink(datalink_id_t, void *);
static void link_stats(datalink_id_t, uint_t, char *, show_state_t *);
static void aggr_stats(datalink_id_t, show_grp_state_t *, uint_t);
-static void dev_stats(const char *dev, uint32_t, char *, show_state_t *);
+static void vnic_stats(show_vnic_state_t *, uint32_t);
static int get_one_kstat(const char *, const char *, uint8_t,
void *, boolean_t);
static void get_mac_stats(const char *, pktsum_t *);
static void get_link_stats(const char *, pktsum_t *);
static uint64_t get_ifspeed(const char *, boolean_t);
-static void stats_total(pktsum_t *, pktsum_t *, pktsum_t *);
-static void stats_diff(pktsum_t *, pktsum_t *, pktsum_t *);
static const char *get_linkstate(const char *, boolean_t, char *);
static const char *get_linkduplex(const char *, boolean_t, char *);
@@ -286,8 +318,6 @@ static cmd_t cmds[] = {
"\tshow-link\t[-pP] [-o <field>,..] [-s [-i <interval>]] [<link>]"},
{ "rename-link", do_rename_link,
"\trename-link\t[-R <root-dir>] <oldlink> <newlink>\n" },
- { "show-dev", do_show_dev,
- "\tshow-dev\t[-p] [-o <field>,..] [-s [-i <interval>]] [<dev>]\n" },
{ "create-aggr", do_create_aggr,
"\tcreate-aggr\t[-t] [-R <root-dir>] [-P <policy>] [-L <mode>]\n"
"\t\t\t[-T <time>] [-u <address>] [-l <link>] ... <link>" },
@@ -343,9 +373,30 @@ static cmd_t cmds[] = {
{ "delete-phys", do_delete_phys,
"\tdelete-phys\t<link>" },
{ "show-phys", do_show_phys,
- "\tshow-phys\t[-pP] [-o <field>,..] [<link>]" },
+ "\tshow-phys\t[-pP] [-o <field>,..] [-H] [<link>]" },
{ "init-phys", do_init_phys, NULL },
- { "show-linkmap", do_show_linkmap, NULL }
+ { "show-linkmap", do_show_linkmap, NULL },
+ { "create-vnic", do_create_vnic,
+ "\tcreate-vnic [-t] [-R <root-dir>] -l <link> [-m <value> |"
+ " auto |\n"
+ "\t {factory [-n <slot-identifier>]} |\n"
+ "\t {random [-r <prefix>]}] [-v vlan-tag [-f]]\n"
+ "\t -p <prop>=<value>[,...] [-H]"
+ " <vnic-link>\n" },
+ { "delete-vnic", do_delete_vnic,
+ "\tdelete-vnic [-t] [-R <root-dir>] <vnic-link>\n" },
+ { "show-vnic", do_show_vnic,
+ "\tshow-vnic [-pP] [-l <link>] [-s [-i <interval>]]" },
+ { "up-vnic", do_up_vnic, NULL },
+ { "create-etherstub", do_create_etherstub,
+ "\tcreate-etherstub [-t] [-R <root-dir>] <link>\n" },
+ { "delete-etherstub", do_delete_etherstub,
+ "\tdelete-etherstub [-t] [-R <root-dir>] <link>\n" },
+ { "show-etherstub", do_show_etherstub,
+ "\tshow-etherstub [-t] [-R <root-dir>] [<link>]\n" },
+ { "show-usage", do_show_usage,
+ "\tshow-usage [-d|-p -F <format>] [-f <filename>]\n"
+ "\t [-s <time>] [-e <time>] <link>\n" }
};
static const struct option lopts[] = {
@@ -360,11 +411,15 @@ static const struct option lopts[] = {
{"root-dir", required_argument, 0, 'R'},
{"link", required_argument, 0, 'l'},
{"forcible", no_argument, 0, 'f'},
+ {"bw-limit", required_argument, 0, 'b'},
+ {"mac-address", required_argument, 0, 'm'},
+ {"slot", required_argument, 0, 'n'},
{ 0, 0, 0, 0 }
};
static const struct option show_lopts[] = {
{"statistics", no_argument, 0, 's'},
+ {"continuous", no_argument, 0, 'S'},
{"interval", required_argument, 0, 'i'},
{"parseable", no_argument, 0, 'p'},
{"extended", no_argument, 0, 'x'},
@@ -409,6 +464,24 @@ static const struct option showeth_lopts[] = {
{ 0, 0, 0, 0 }
};
+static const struct option vnic_lopts[] = {
+ {"temporary", no_argument, 0, 't' },
+ {"root-dir", required_argument, 0, 'R' },
+ {"dev", required_argument, 0, 'd' },
+ {"mac-address", required_argument, 0, 'm' },
+ {"cpus", required_argument, 0, 'c' },
+ {"bw-limit", required_argument, 0, 'b' },
+ {"slot", required_argument, 0, 'n' },
+ {"mac-prefix", required_argument, 0, 'r' },
+ { 0, 0, 0, 0 }
+};
+
+static const struct option etherstub_lopts[] = {
+ {"temporary", no_argument, 0, 't' },
+ {"root-dir", required_argument, 0, 'R' },
+ { 0, 0, 0, 0 }
+};
+
/*
* structures for 'dladm show-ether'
*/
@@ -451,26 +524,7 @@ typedef struct print_ether_state {
} print_ether_state_t;
/*
- * structures for 'dladm show-dev'.
- */
-typedef enum {
- DEV_LINK,
- DEV_STATE,
- DEV_SPEED,
- DEV_DUPLEX
-} dev_field_index_t;
-
-static print_field_t dev_fields[] = {
-/* name, header, field width, index, cmdtype */
-{ "link", "LINK", 15, DEV_LINK, CMD_TYPE_ANY},
-{ "state", "STATE", 6, DEV_STATE, CMD_TYPE_ANY},
-{ "speed", "SPEED", 8, DEV_SPEED, CMD_TYPE_ANY},
-{ "duplex", "DUPLEX", 8, DEV_DUPLEX, CMD_TYPE_ANY}}
-;
-#define DEV_MAX_FIELDS (sizeof (dev_fields) / sizeof (print_field_t))
-
-/*
- * structures for 'dladm show-dev -s' (print statistics)
+ * structures for 'dladm show-link -s' (print statistics)
*/
typedef enum {
DEVS_LINK,
@@ -493,12 +547,6 @@ static print_field_t devs_fields[] = {
{ "oerrors", "OERRORS", 8, DEVS_OERRORS, CMD_TYPE_ANY}}
;
#define DEVS_MAX_FIELDS (sizeof (devs_fields) / sizeof (print_field_t))
-typedef struct dev_args_s {
- char *devs_link;
- pktsum_t *devs_psum;
-} dev_args_t;
-static char *print_dev_stats(print_field_t *, void *);
-static char *print_dev(print_field_t *, void *);
/*
* buffer used by print functions for show-{link,phys,vlan} commands.
@@ -635,10 +683,10 @@ static print_field_t aggr_s_fields[] = {
CMD_TYPE_ANY}}
;
#define AGGR_S_MAX_FIELDS \
- (sizeof (aggr_l_fields) / sizeof (print_field_t))
+ (sizeof (aggr_s_fields) / sizeof (print_field_t))
/*
- * structures for 'dladm show-dev -L'.
+ * structures for 'dladm show-aggr -L'.
*/
typedef enum {
AGGR_L_LINK,
@@ -697,6 +745,50 @@ static print_field_t phys_fields[] = {
#define PHYS_MAX_FIELDS (sizeof (phys_fields) / sizeof (print_field_t))
/*
+ * structures for 'dladm show-phys -m'
+ */
+
+typedef enum {
+ PHYS_M_LINK,
+ PHYS_M_SLOT,
+ PHYS_M_ADDRESS,
+ PHYS_M_INUSE,
+ PHYS_M_CLIENT
+} phys_m_field_index_t;
+
+static print_field_t phys_m_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12, PHYS_M_LINK, CMD_TYPE_ANY},
+{ "slot", "SLOT", 8, PHYS_M_SLOT, CMD_TYPE_ANY},
+{ "address", "ADDRESS", 18, PHYS_M_ADDRESS, CMD_TYPE_ANY},
+{ "inuse", "INUSE", 4, PHYS_M_INUSE, CMD_TYPE_ANY},
+{ "client", "CLIENT", 12, PHYS_M_CLIENT, CMD_TYPE_ANY}}
+;
+#define PHYS_M_MAX_FIELDS (sizeof (phys_m_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-phys -H'
+ */
+
+typedef enum {
+ PHYS_H_LINK,
+ PHYS_H_GROUP,
+ PHYS_H_GRPTYPE,
+ PHYS_H_RINGS,
+ PHYS_H_CLIENTS
+} phys_h_field_index_t;
+
+static print_field_t phys_h_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12, PHYS_H_LINK, CMD_TYPE_ANY},
+{ "group", "GROUP", 8, PHYS_H_GROUP, CMD_TYPE_ANY},
+{ "grouptype", "TYPE", 6, PHYS_H_GRPTYPE, CMD_TYPE_ANY},
+{ "rings", "NUM-RINGS", 16, PHYS_H_RINGS, CMD_TYPE_ANY},
+{ "clients", "CLIENTS", 20, PHYS_H_CLIENTS, CMD_TYPE_ANY}}
+;
+#define PHYS_H_MAX_FIELDS (sizeof (phys_h_fields) / sizeof (print_field_t))
+
+/*
* structures for 'dladm show-vlan'
*/
static print_field_t vlan_fields[] = {
@@ -712,6 +804,7 @@ static print_field_t vlan_fields[] = {
;
#define VLAN_MAX_FIELDS (sizeof (vlan_fields) / sizeof (print_field_t))
+
/*
* structures for 'dladm show-wifi'
*/
@@ -764,34 +857,28 @@ static print_field_t linkprop_fields[] = {
#define LINKPROP_MAX_FIELDS \
(sizeof (linkprop_fields) / sizeof (print_field_t))
-#define MAX_PROPS 32
#define MAX_PROP_LINE 512
-typedef struct prop_info {
- char *pi_name;
- char *pi_val[DLADM_MAX_PROP_VALCNT];
- uint_t pi_count;
-} prop_info_t;
-
-typedef struct prop_list {
- prop_info_t pl_info[MAX_PROPS];
- uint_t pl_count;
- char *pl_buf;
-} prop_list_t;
-
typedef struct show_linkprop_state {
- char ls_link[MAXLINKNAMELEN];
- char *ls_line;
- char **ls_propvals;
- prop_list_t *ls_proplist;
- boolean_t ls_parseable;
- boolean_t ls_persist;
- boolean_t ls_header;
- dladm_status_t ls_status;
- dladm_status_t ls_retstatus;
- print_state_t ls_print;
+ char ls_link[MAXLINKNAMELEN];
+ char *ls_line;
+ char **ls_propvals;
+ dladm_arg_list_t *ls_proplist;
+ boolean_t ls_parseable;
+ boolean_t ls_persist;
+ boolean_t ls_header;
+ dladm_status_t ls_status;
+ dladm_status_t ls_retstatus;
+ print_state_t ls_print;
} show_linkprop_state_t;
+typedef struct set_linkprop_state {
+ const char *ls_name;
+ boolean_t ls_reset;
+ boolean_t ls_temp;
+ dladm_status_t ls_status;
+} set_linkprop_state_t;
+
typedef struct linkprop_args_s {
show_linkprop_state_t *ls_state;
char *ls_propname;
@@ -817,9 +904,108 @@ static print_field_t secobj_fields[] = {
;
#define DEV_SOBJ_FIELDS (sizeof (secobj_fields) / sizeof (print_field_t))
+/*
+ * structures for 'dladm show-vnic'
+ */
+typedef struct vnic_fields_buf_s
+{
+ char vnic_link[DLPI_LINKNAME_MAX];
+ char vnic_over[DLPI_LINKNAME_MAX];
+ char vnic_speed[6];
+ char vnic_macaddr[19];
+ char vnic_macaddrtype[19];
+ char vnic_vid[6];
+} vnic_fields_buf_t;
+
+static print_field_t vnic_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12,
+ offsetof(vnic_fields_buf_t, vnic_link), CMD_TYPE_ANY},
+{ "over", "OVER", 12,
+ offsetof(vnic_fields_buf_t, vnic_over), CMD_TYPE_ANY},
+{ "speed", "SPEED", 6,
+ offsetof(vnic_fields_buf_t, vnic_speed), CMD_TYPE_ANY},
+{ "macaddr", "MACADDRESS", 20,
+ offsetof(vnic_fields_buf_t, vnic_macaddr), CMD_TYPE_ANY},
+{ "macaddrtype", "MACADDRTYPE", 19,
+ offsetof(vnic_fields_buf_t, vnic_macaddrtype), CMD_TYPE_ANY},
+{ "vid", "VID", 6,
+ offsetof(vnic_fields_buf_t, vnic_vid), CMD_TYPE_ANY}}
+;
+#define VNIC_MAX_FIELDS (sizeof (vnic_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-usage'
+ */
+
+typedef struct usage_fields_buf_s {
+ char usage_link[12];
+ char usage_duration[10];
+ char usage_ipackets[9];
+ char usage_rbytes[10];
+ char usage_opackets[9];
+ char usage_obytes[10];
+ char usage_bandwidth[14];
+} usage_fields_buf_t;
+
+static print_field_t usage_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12,
+ offsetof(usage_fields_buf_t, usage_link), CMD_TYPE_ANY},
+{ "duration", "DURATION", 10,
+ offsetof(usage_fields_buf_t, usage_duration), CMD_TYPE_ANY},
+{ "ipackets", "IPACKETS", 9,
+ offsetof(usage_fields_buf_t, usage_ipackets), CMD_TYPE_ANY},
+{ "rbytes", "RBYTES", 10,
+ offsetof(usage_fields_buf_t, usage_rbytes), CMD_TYPE_ANY},
+{ "opackets", "OPACKETS", 9,
+ offsetof(usage_fields_buf_t, usage_opackets), CMD_TYPE_ANY},
+{ "obytes", "OBYTES", 10,
+ offsetof(usage_fields_buf_t, usage_obytes), CMD_TYPE_ANY},
+{ "bandwidth", "BANDWIDTH", 14,
+ offsetof(usage_fields_buf_t, usage_bandwidth), CMD_TYPE_ANY}}
+;
+
+#define USAGE_MAX_FIELDS (sizeof (usage_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-usage link'
+ */
+
+typedef struct usage_l_fields_buf_s {
+ char usage_l_link[12];
+ char usage_l_stime[13];
+ char usage_l_etime[13];
+ char usage_l_rbytes[8];
+ char usage_l_obytes[8];
+ char usage_l_bandwidth[14];
+} usage_l_fields_buf_t;
+
+static print_field_t usage_l_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12,
+ offsetof(usage_l_fields_buf_t, usage_l_link), CMD_TYPE_ANY},
+{ "start", "START", 13,
+ offsetof(usage_l_fields_buf_t, usage_l_stime), CMD_TYPE_ANY},
+{ "end", "END", 13,
+ offsetof(usage_l_fields_buf_t, usage_l_etime), CMD_TYPE_ANY},
+{ "rbytes", "RBYTES", 8,
+ offsetof(usage_l_fields_buf_t, usage_l_rbytes), CMD_TYPE_ANY},
+{ "obytes", "OBYTES", 8,
+ offsetof(usage_l_fields_buf_t, usage_l_obytes), CMD_TYPE_ANY},
+{ "bandwidth", "BANDWIDTH", 14,
+ offsetof(usage_l_fields_buf_t, usage_l_bandwidth), CMD_TYPE_ANY}}
+;
+
+#define USAGE_L_MAX_FIELDS \
+ (sizeof (usage_l_fields) /sizeof (print_field_t))
+
static char *progname;
static sig_atomic_t signalled;
+#define DLADM_ETHERSTUB_NAME "etherstub"
+#define DLADM_IS_ETHERSTUB(id) (id == DATALINK_INVALID_LINKID)
+
static void
usage(void)
{
@@ -867,6 +1053,254 @@ main(int argc, char *argv[])
return (0);
}
+/*ARGSUSED*/
+static int
+show_usage_date(dladm_usage_t *usage, void *arg)
+{
+
+ time_t stime;
+ char timebuf[20];
+
+ stime = usage->du_stime;
+ (void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y",
+ localtime(&stime));
+ (void) printf("%s\n", timebuf);
+
+ return (DLADM_STATUS_OK);
+}
+
+static int
+show_usage_time(dladm_usage_t *usage, void *arg)
+{
+ show_usage_state_t *state = (show_usage_state_t *)arg;
+ char buf[DLADM_STRSIZE];
+ usage_l_fields_buf_t ubuf;
+ time_t time;
+ double bw;
+
+ if (state->us_plot) {
+ if (!state->us_printheader) {
+ if (state->us_first) {
+ (void) printf("# Time");
+ state->us_first = B_FALSE;
+ }
+ (void) printf(" %s", usage->du_name);
+ if (usage->du_last) {
+ (void) printf("\n");
+ state->us_first = B_TRUE;
+ state->us_printheader = B_TRUE;
+ }
+ } else {
+ if (state->us_first) {
+ time = usage->du_etime;
+ (void) strftime(buf, sizeof (buf), "%T",
+ localtime(&time));
+ state->us_first = B_FALSE;
+ (void) printf("%s", buf);
+ }
+ bw = (double)usage->du_bandwidth/1000;
+ (void) printf(" %.2f", bw);
+ if (usage->du_last) {
+ (void) printf("\n");
+ state->us_first = B_TRUE;
+ }
+ }
+ return (DLADM_STATUS_OK);
+ }
+
+ bzero(&ubuf, sizeof (ubuf));
+
+ (void) snprintf(ubuf.usage_l_link, sizeof (ubuf.usage_l_link), "%s",
+ usage->du_name);
+ time = usage->du_stime;
+ (void) strftime(buf, sizeof (buf), "%T", localtime(&time));
+ (void) snprintf(ubuf.usage_l_stime, sizeof (ubuf.usage_l_stime), "%s",
+ buf);
+ time = usage->du_etime;
+ (void) strftime(buf, sizeof (buf), "%T", localtime(&time));
+ (void) snprintf(ubuf.usage_l_etime, sizeof (ubuf.usage_l_etime), "%s",
+ buf);
+ (void) snprintf(ubuf.usage_l_rbytes, sizeof (ubuf.usage_l_rbytes),
+ "%llu", usage->du_rbytes);
+ (void) snprintf(ubuf.usage_l_obytes, sizeof (ubuf.usage_l_obytes),
+ "%llu", usage->du_obytes);
+ (void) snprintf(ubuf.usage_l_bandwidth, sizeof (ubuf.usage_l_bandwidth),
+ "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf));
+
+ if (!state->us_parseable && !state->us_printheader) {
+ print_header(&state->us_print);
+ state->us_printheader = B_TRUE;
+ }
+
+ dladm_print_output(&state->us_print, state->us_parseable,
+ dladm_print_field, (void *)&ubuf);
+
+ return (DLADM_STATUS_OK);
+}
+
+static int
+show_usage_res(dladm_usage_t *usage, void *arg)
+{
+ show_usage_state_t *state = (show_usage_state_t *)arg;
+ char buf[DLADM_STRSIZE];
+ usage_fields_buf_t ubuf;
+
+ bzero(&ubuf, sizeof (ubuf));
+
+ (void) snprintf(ubuf.usage_link, sizeof (ubuf.usage_link), "%s",
+ usage->du_name);
+ (void) snprintf(ubuf.usage_duration, sizeof (ubuf.usage_duration),
+ "%llu", usage->du_duration);
+ (void) snprintf(ubuf.usage_ipackets, sizeof (ubuf.usage_ipackets),
+ "%llu", usage->du_ipackets);
+ (void) snprintf(ubuf.usage_rbytes, sizeof (ubuf.usage_rbytes),
+ "%llu", usage->du_rbytes);
+ (void) snprintf(ubuf.usage_opackets, sizeof (ubuf.usage_opackets),
+ "%llu", usage->du_opackets);
+ (void) snprintf(ubuf.usage_obytes, sizeof (ubuf.usage_obytes),
+ "%llu", usage->du_obytes);
+ (void) snprintf(ubuf.usage_bandwidth, sizeof (ubuf.usage_bandwidth),
+ "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf));
+
+ if (!state->us_parseable && !state->us_printheader) {
+ print_header(&state->us_print);
+ state->us_printheader = B_TRUE;
+ }
+
+ dladm_print_output(&state->us_print, state->us_parseable,
+ dladm_print_field, (void *)&ubuf);
+
+ return (DLADM_STATUS_OK);
+}
+
+static boolean_t
+valid_formatspec(char *formatspec_str)
+{
+ if (strcmp(formatspec_str, "gnuplot") == 0)
+ return (B_TRUE);
+ return (B_FALSE);
+
+}
+
+/*ARGSUSED*/
+static void
+do_show_usage(int argc, char *argv[], const char *use)
+{
+ char *file = NULL;
+ int opt;
+ dladm_status_t status;
+ boolean_t d_arg = B_FALSE;
+ boolean_t p_arg = B_FALSE;
+ char *stime = NULL;
+ char *etime = NULL;
+ char *resource = NULL;
+ show_usage_state_t state;
+ boolean_t o_arg = B_FALSE;
+ boolean_t F_arg = B_FALSE;
+ char *fields_str = NULL;
+ char *formatspec_str = NULL;
+ print_field_t **fields;
+ uint_t nfields;
+ char *all_fields =
+ "link,duration,ipackets,rbytes,opackets,obytes,bandwidth";
+ char *all_l_fields =
+ "link,start,end,rbytes,obytes,bandwidth";
+
+ bzero(&state, sizeof (show_usage_state_t));
+ state.us_parseable = B_FALSE;
+ state.us_printheader = B_FALSE;
+ state.us_plot = B_FALSE;
+ state.us_first = B_TRUE;
+
+ while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) {
+ switch (opt) {
+ case 'd':
+ d_arg = B_TRUE;
+ break;
+ case 'p':
+ state.us_plot = p_arg = B_TRUE;
+ break;
+ case 'f':
+ file = optarg;
+ break;
+ case 's':
+ stime = optarg;
+ break;
+ case 'e':
+ etime = optarg;
+ break;
+ case 'o':
+ o_arg = B_TRUE;
+ fields_str = optarg;
+ break;
+ case 'F':
+ F_arg = B_TRUE;
+ formatspec_str = optarg;
+ break;
+ default:
+ die_opterr(optopt, opt, use);
+ break;
+ }
+ }
+
+ if (file == NULL)
+ die("show-usage requires a file");
+
+ if (optind == (argc-1)) {
+ resource = argv[optind];
+ }
+
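+	/*
+	 * With no resource and no time bounds we show the summary view;
+	 * otherwise we show individual log records, which use the
+	 * time-stamped field set.
+	 */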
+ if (resource == NULL && stime == NULL && etime == NULL) {
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+ fields_str = all_fields;
+ fields = parse_output_fields(fields_str, usage_fields,
+ USAGE_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+ } else {
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+ fields_str = all_l_fields;
+ fields = parse_output_fields(fields_str, usage_l_fields,
+ USAGE_L_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+ }
+
+ if (fields == NULL) {
+		die("invalid field(s) specified");
+ return;
+ }
+ state.us_print.ps_fields = fields;
+ state.us_print.ps_nfields = nfields;
+
+ if (p_arg && d_arg)
+ die("plot and date options are incompatible");
+
+ if (p_arg && !F_arg)
+		die("specify a format specifier: -F <format>");
+
+ if (F_arg && valid_formatspec(formatspec_str) == B_FALSE)
+		die("format specifier '%s' is not supported", formatspec_str);
+
+ if (d_arg) {
+ /* Print log dates */
+ status = dladm_usage_dates(show_usage_date,
+ DLADM_LOGTYPE_LINK, file, resource, &state);
+ } else if (resource == NULL && stime == NULL && etime == NULL &&
+ !p_arg) {
+ /* Print summary */
+ status = dladm_usage_summary(show_usage_res,
+ DLADM_LOGTYPE_LINK, file, &state);
+ } else if (resource != NULL) {
+ /* Print log entries for named resource */
+ status = dladm_walk_usage_res(show_usage_time,
+ DLADM_LOGTYPE_LINK, file, resource, stime, etime, &state);
+ } else {
+ /* Print time and information for each link */
+ status = dladm_walk_usage_time(show_usage_time,
+ DLADM_LOGTYPE_LINK, file, stime, etime, &state);
+ }
+
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "show-usage");
+}
+
static void
do_create_aggr(int argc, char *argv[], const char *use)
{
@@ -889,9 +1323,13 @@ do_create_aggr(int argc, char *argv[], const char *use)
char *devs[MAXPORT];
char *links[MAXPORT];
dladm_status_t status;
+ dladm_status_t pstatus;
+ dladm_arg_list_t *proplist = NULL;
+ int i;
+ datalink_id_t linkid;
ndev = nlink = opterr = 0;
- while ((option = getopt_long(argc, argv, ":d:l:L:P:R:tfu:T:",
+ while ((option = getopt_long(argc, argv, ":d:l:L:P:R:tfu:T:p:",
lopts, NULL)) != -1) {
switch (option) {
case 'd':
@@ -955,6 +1393,11 @@ do_create_aggr(int argc, char *argv[], const char *use)
case 'R':
altroot = optarg;
break;
+ case 'p':
+ if (dladm_parse_link_props(optarg, &proplist, B_FALSE)
+ != DLADM_STATUS_OK)
+ die("invalid aggregation property");
+ break;
default:
die_opterr(optopt, option, use);
break;
@@ -1000,7 +1443,30 @@ do_create_aggr(int argc, char *argv[], const char *use)
status = dladm_aggr_create(name, key, ndev + nlink, port, policy,
mac_addr_fixed, (const uchar_t *)mac_addr, lacp_mode,
lacp_timer, flags);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ if (proplist == NULL)
+ return;
+
+ status = dladm_name2info(name, &linkid, NULL, NULL, NULL);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
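+	/* Apply each property from -p to the newly created aggregation. */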
+ for (i = 0; i < proplist->al_count; i++) {
+ dladm_arg_info_t *aip = &proplist->al_info[i];
+
+ pstatus = dladm_set_linkprop(linkid, aip->ai_name,
+ aip->ai_val, aip->ai_count, flags);
+
+ if (pstatus != DLADM_STATUS_OK) {
+ die_dlerr(pstatus,
+ "aggr creation succeeded but "
+ "could not set property '%s'", aip->ai_name);
+ }
+ }
done:
+ dladm_free_props(proplist);
if (status != DLADM_STATUS_OK) {
if (status == DLADM_STATUS_NONOTIF) {
die_dlerr(status, "not all links have link up/down "
@@ -1379,19 +1845,21 @@ done:
static void
do_create_vlan(int argc, char *argv[], const char *use)
{
- char *link = NULL;
- char drv[DLPI_LINKNAME_MAX];
- uint_t ppa;
- datalink_id_t linkid;
- int vid = 0;
- char option;
- uint32_t flags = (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST);
- char *altroot = NULL;
- char vlan[MAXLINKNAMELEN];
- dladm_status_t status;
+ char *link = NULL;
+ char drv[DLPI_LINKNAME_MAX];
+ uint_t ppa;
+ datalink_id_t linkid;
+ datalink_id_t dev_linkid;
+ int vid = 0;
+ char option;
+ uint32_t flags = (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST);
+ char *altroot = NULL;
+ char vlan[MAXLINKNAMELEN];
+ dladm_arg_list_t *proplist = NULL;
+ dladm_status_t status;
opterr = 0;
- while ((option = getopt_long(argc, argv, ":tfl:v:",
+ while ((option = getopt_long(argc, argv, ":tfR:l:v:p:",
lopts, NULL)) != -1) {
switch (option) {
case 'v':
@@ -1408,15 +1876,21 @@ do_create_vlan(int argc, char *argv[], const char *use)
link = optarg;
break;
- case 'f':
- flags |= DLADM_OPT_FORCE;
- break;
case 't':
flags &= ~DLADM_OPT_PERSIST;
break;
case 'R':
altroot = optarg;
break;
+ case 'p':
+ if (dladm_parse_link_props(optarg, &proplist, B_FALSE)
+ != DLADM_STATUS_OK) {
+ die("invalid vlan property");
+ }
+ break;
+ case 'f':
+ flags |= DLADM_OPT_FORCE;
+ break;
default:
die_opterr(optopt, option, use);
break;
@@ -1444,19 +1918,14 @@ do_create_vlan(int argc, char *argv[], const char *use)
if (altroot != NULL)
altroot_cmd(altroot, argc, argv);
- if (dladm_name2info(link, &linkid, NULL, NULL, NULL) !=
+ if (dladm_name2info(link, &dev_linkid, NULL, NULL, NULL) !=
DLADM_STATUS_OK) {
die("invalid link name '%s'", link);
}
- if ((status = dladm_vlan_create(vlan, linkid, vid, flags)) !=
- DLADM_STATUS_OK) {
- if (status == DLADM_STATUS_NOTSUP) {
- die_dlerr(status, "VLAN over '%s' may require lowered "
- "MTU; must use -f (see dladm(1M))\n", link);
- } else {
- die_dlerr(status, "create operation failed");
- }
+ if ((status = dladm_vlan_create(vlan, dev_linkid, vid, proplist, flags,
+ &linkid)) != DLADM_STATUS_OK) {
+ die_dlerr(status, "create operation over %s failed", link);
}
}
@@ -1505,31 +1974,7 @@ done:
static void
do_up_vlan(int argc, char *argv[], const char *use)
{
- datalink_id_t linkid = DATALINK_ALL_LINKID;
- dladm_status_t status;
-
- /*
- * get the name of the VLAN (optional last argument)
- */
- if (argc > 2)
- usage();
-
- if (argc == 2) {
- status = dladm_name2info(argv[1], &linkid, NULL, NULL, NULL);
- if (status != DLADM_STATUS_OK)
- goto done;
- }
-
- status = dladm_vlan_up(linkid);
-done:
- if (status != DLADM_STATUS_OK) {
- if (argc == 2) {
- die_dlerr(status,
- "could not bring up VLAN '%s'", argv[1]);
- } else {
- die_dlerr(status, "could not bring VLANs up");
- }
- }
+ do_up_vnic_common(argc, argv, use, B_TRUE);
}
static void
@@ -1724,7 +2169,7 @@ print_link_topology(show_state_t *state, datalink_id_t linkid,
}
free(ginfo.lg_ports);
} else if (class == DATALINK_CLASS_VNIC) {
- dladm_vnic_attr_sys_t vinfo;
+ dladm_vnic_attr_t vinfo;
if ((status = dladm_vnic_info(linkid, &vinfo, flags)) !=
DLADM_STATUS_OK || (status = dladm_datalink_id2info(
@@ -1816,7 +2261,6 @@ done:
return (status);
}
-
static int
show_link(datalink_id_t linkid, void *arg)
{
@@ -1854,7 +2298,6 @@ show_link_stats(datalink_id_t linkid, void *arg)
show_state_t *state = (show_state_t *)arg;
pktsum_t stats, diff_stats;
dladm_phys_attr_t dpa;
- dev_args_t largs;
if (state->ls_firstonly) {
if (state->ls_donefirst)
@@ -1881,12 +2324,15 @@ show_link_stats(datalink_id_t linkid, void *arg)
} else {
get_link_stats(link, &stats);
}
- stats_diff(&diff_stats, &stats, &state->ls_prevstats);
+ dladm_stats_diff(&diff_stats, &stats, &state->ls_prevstats);
- largs.devs_link = link;
- largs.devs_psum = &diff_stats;
- dladm_print_output(&state->ls_print, state->ls_parseable,
- print_dev_stats, &largs);
+ (void) printf("%-12s", link);
+ (void) printf("%-10llu", diff_stats.ipackets);
+ (void) printf("%-12llu", diff_stats.rbytes);
+ (void) printf("%-8llu", diff_stats.ierrors);
+ (void) printf("%-10llu", diff_stats.opackets);
+ (void) printf("%-12llu", diff_stats.obytes);
+ (void) printf("%-8llu\n", diff_stats.oerrors);
state->ls_prevstats = stats;
return (DLADM_WALK_CONTINUE);
@@ -2192,7 +2638,7 @@ print_aggr_stats_callback(print_field_t *pf, void *arg)
goto err;
}
- stats_diff(&diff_stats, &port_stat, l->laggr_prevstats);
+ dladm_stats_diff(&diff_stats, &port_stat, l->laggr_prevstats);
}
switch (pf->pf_index) {
@@ -2296,7 +2742,8 @@ print_aggr_stats(show_grp_state_t *state, const char *link,
}
get_mac_stats(dpa.dp_dev, &port_stat);
- stats_total(&pktsumtot, &port_stat, &state->gs_prevstats[i]);
+ dladm_stats_total(&pktsumtot, &port_stat,
+ &state->gs_prevstats[i]);
}
if (!state->gs_parseable && !state->gs_printheader) {
@@ -2381,127 +2828,17 @@ done:
return (DLADM_WALK_CONTINUE);
}
-static char *
-print_dev(print_field_t *pf, void *arg)
-{
- const char *dev = arg;
- static char buf[DLADM_STRSIZE];
-
- switch (pf->pf_index) {
- case DEV_LINK:
- (void) snprintf(buf, sizeof (buf), "%s", dev);
- break;
- case DEV_STATE:
- (void) get_linkstate(dev, B_FALSE, buf);
- break;
- case DEV_SPEED:
- (void) snprintf(buf, sizeof (buf), "%uMb",
- (unsigned int)(get_ifspeed(dev, B_FALSE) / 1000000ull));
- break;
- case DEV_DUPLEX:
- (void) get_linkduplex(dev, B_FALSE, buf);
- break;
- default:
- die("invalid index '%d'", pf->pf_index);
- break;
- }
- return (buf);
-}
-
-static int
-show_dev(const char *dev, void *arg)
-{
- show_state_t *state = arg;
-
- if (!state->ls_parseable && !state->ls_printheader) {
- print_header(&state->ls_print);
- state->ls_printheader = B_TRUE;
- }
-
- dladm_print_output(&state->ls_print, state->ls_parseable,
- print_dev, (void *)dev);
-
- return (DLADM_WALK_CONTINUE);
-}
-
-static char *
-print_dev_stats(print_field_t *pf, void *arg)
-{
- dev_args_t *dargs = arg;
- pktsum_t *diff_stats = dargs->devs_psum;
- static char buf[DLADM_STRSIZE];
-
- switch (pf->pf_index) {
- case DEVS_LINK:
- (void) snprintf(buf, sizeof (buf), "%s", dargs->devs_link);
- break;
- case DEVS_IPKTS:
- (void) snprintf(buf, sizeof (buf), "%llu",
- diff_stats->ipackets);
- break;
- case DEVS_RBYTES:
- (void) snprintf(buf, sizeof (buf), "%llu",
- diff_stats->rbytes);
- break;
- case DEVS_IERRORS:
- (void) snprintf(buf, sizeof (buf), "%u",
- diff_stats->ierrors);
- break;
- case DEVS_OPKTS:
- (void) snprintf(buf, sizeof (buf), "%llu",
- diff_stats->opackets);
- break;
- case DEVS_OBYTES:
- (void) snprintf(buf, sizeof (buf), "%llu",
- diff_stats->obytes);
- break;
- case DEVS_OERRORS:
- (void) snprintf(buf, sizeof (buf), "%u",
- diff_stats->oerrors);
- break;
- default:
- die("invalid input");
- break;
- }
- return (buf);
-}
-
-static int
-show_dev_stats(const char *dev, void *arg)
-{
- show_state_t *state = arg;
- pktsum_t stats, diff_stats;
- dev_args_t dargs;
-
- if (state->ls_firstonly) {
- if (state->ls_donefirst)
- return (DLADM_WALK_CONTINUE);
- state->ls_donefirst = B_TRUE;
- } else {
- bzero(&state->ls_prevstats, sizeof (state->ls_prevstats));
- }
-
- get_mac_stats(dev, &stats);
- stats_diff(&diff_stats, &stats, &state->ls_prevstats);
-
- dargs.devs_link = (char *)dev;
- dargs.devs_psum = &diff_stats;
- dladm_print_output(&state->ls_print, state->ls_parseable,
- print_dev_stats, &dargs);
-
- state->ls_prevstats = stats;
- return (DLADM_WALK_CONTINUE);
-}
-
static void
do_show_link(int argc, char *argv[], const char *use)
{
int option;
boolean_t s_arg = B_FALSE;
+ boolean_t S_arg = B_FALSE;
boolean_t i_arg = B_FALSE;
uint32_t flags = DLADM_OPT_ACTIVE;
boolean_t p_arg = B_FALSE;
datalink_id_t linkid = DATALINK_ALL_LINKID;
+ char linkname[MAXLINKNAMELEN];
int interval = 0;
show_state_t state;
dladm_status_t status;
@@ -2517,7 +2854,7 @@ do_show_link(int argc, char *argv[], const char *use)
bzero(&state, sizeof (state));
opterr = 0;
- while ((option = getopt_long(argc, argv, ":pPsi:o:",
+ while ((option = getopt_long(argc, argv, ":pPsSi:o:",
show_lopts, NULL)) != -1) {
switch (option) {
case 'p':
@@ -2538,6 +2875,12 @@ do_show_link(int argc, char *argv[], const char *use)
flags = DLADM_OPT_PERSIST;
break;
+ case 'S':
+ if (S_arg)
+ die_optdup(option);
+
+ S_arg = B_TRUE;
+ break;
case 'o':
o_arg = B_TRUE;
fields_str = optarg;
@@ -2556,19 +2899,32 @@ do_show_link(int argc, char *argv[], const char *use)
}
}
- if (i_arg && !s_arg)
- die("the option -i can be used only with -s");
+ if (i_arg && !(s_arg || S_arg))
+ die("the option -i can be used only with -s or -S");
+
+ if (s_arg && S_arg)
+ die("the -s option cannot be used with -S");
if (s_arg && flags != DLADM_OPT_ACTIVE)
die("the option -P cannot be used with -s");
+ if (S_arg && (p_arg || flags != DLADM_OPT_ACTIVE))
+ die("the option -%c cannot be used with -S", p_arg ? 'p' : 'P');
+
/* get link name (optional last argument) */
if (optind == (argc-1)) {
uint32_t f;
- if ((status = dladm_name2info(argv[optind], &linkid, &f,
+ if (strlcpy(linkname, argv[optind], MAXLINKNAMELEN)
+ >= MAXLINKNAMELEN) {
+ (void) fprintf(stderr,
+ gettext("%s: link name too long\n"),
+ progname);
+ exit(1);
+ }
+ if ((status = dladm_name2info(linkname, &linkid, &f,
NULL, NULL)) != DLADM_STATUS_OK) {
- die_dlerr(status, "link %s is not valid", argv[optind]);
+ die_dlerr(status, "link %s is not valid", linkname);
}
if (!(f & flags)) {
@@ -2583,6 +2939,11 @@ do_show_link(int argc, char *argv[], const char *use)
if (p_arg && !o_arg)
die("-p requires -o");
+ if (S_arg) {
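+		/* -S: display continuously updated link statistics. */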
+ dladm_continuous(linkid, NULL, interval, LINK_REPORT);
+ return;
+ }
+
if (p_arg && strcasecmp(fields_str, "all") == 0)
die("\"-o all\" is invalid with -p");
@@ -2604,7 +2965,6 @@ do_show_link(int argc, char *argv[], const char *use)
return;
}
-
fields = parse_output_fields(fields_str, link_fields, DEV_LINK_FIELDS,
CMD_TYPE_ANY, &nfields);
@@ -2641,17 +3001,17 @@ do_show_aggr(int argc, char *argv[], const char *use)
int interval = 0;
int key;
dladm_status_t status;
- boolean_t o_arg = B_FALSE;
- char *fields_str = NULL;
- print_field_t **fields;
- uint_t nfields;
- char *all_fields =
+ boolean_t o_arg = B_FALSE;
+ char *fields_str = NULL;
+ print_field_t **fields;
+ uint_t nfields;
+ char *all_fields =
"link,policy,addrpolicy,lacpactivity,lacptimer,flags";
- char *all_lacp_fields =
+ char *all_lacp_fields =
"link,port,aggregatable,sync,coll,dist,defaulted,expired";
- char *all_stats_fields =
+ char *all_stats_fields =
"link,port,ipackets,rbytes,opackets,obytes,ipktdist,opktdist";
- char *all_extended_fields =
+ char *all_extended_fields =
"link,port,speed,duplex,state,address,portstate";
print_field_t *pf;
int pfmax;
@@ -2806,138 +3166,222 @@ do_show_aggr(int argc, char *argv[], const char *use)
}
}
-static void
-do_show_dev(int argc, char *argv[], const char *use)
+static dladm_status_t
+print_phys_default(show_state_t *state, datalink_id_t linkid,
+ const char *link, uint32_t flags, uint32_t media)
{
- int option;
- char *dev = NULL;
- boolean_t s_arg = B_FALSE;
- boolean_t i_arg = B_FALSE;
- boolean_t o_arg = B_FALSE;
- boolean_t p_arg = B_FALSE;
- datalink_id_t linkid;
- int interval = 0;
- show_state_t state;
- char *fields_str = NULL;
- print_field_t **fields;
- uint_t nfields;
- char *all_fields = "link,state,speed,duplex";
- static char *allstat_fields =
- "link,ipackets,rbytes,ierrors,opackets,obytes,oerrors";
+ dladm_phys_attr_t dpa;
+ dladm_status_t status;
+ link_fields_buf_t pattr;
- bzero(&state, sizeof (state));
- fields_str = all_fields;
+ status = dladm_phys_info(linkid, &dpa, state->ls_flags);
+ if (status != DLADM_STATUS_OK)
+ goto done;
- opterr = 0;
- while ((option = getopt_long(argc, argv, ":psi:o:",
- show_lopts, NULL)) != -1) {
- switch (option) {
- case 'p':
- if (p_arg)
- die_optdup(option);
+ (void) snprintf(pattr.link_phys_device,
+ sizeof (pattr.link_phys_device), "%s", dpa.dp_dev);
+ (void) dladm_media2str(media, pattr.link_phys_media);
+ if (state->ls_flags == DLADM_OPT_ACTIVE) {
+ boolean_t islink;
- p_arg = B_TRUE;
- break;
- case 's':
- if (s_arg)
- die_optdup(option);
+ if (!dpa.dp_novanity) {
+ (void) strlcpy(pattr.link_name, link,
+ sizeof (pattr.link_name));
+ islink = B_TRUE;
+ } else {
+ /*
+ * This is a physical link that does not have
+ * vanity naming support.
+ */
+ (void) strlcpy(pattr.link_name, dpa.dp_dev,
+ sizeof (pattr.link_name));
+ islink = B_FALSE;
+ }
- s_arg = B_TRUE;
- break;
- case 'o':
- o_arg = B_TRUE;
- fields_str = optarg;
- break;
- case 'i':
- if (i_arg)
- die_optdup(option);
+ (void) get_linkstate(pattr.link_name, islink,
+ pattr.link_phys_state);
+ (void) snprintf(pattr.link_phys_speed,
+ sizeof (pattr.link_phys_speed), "%u",
+ (uint_t)((get_ifspeed(pattr.link_name,
+ islink)) / 1000000ull));
+ (void) get_linkduplex(pattr.link_name, islink,
+ pattr.link_phys_duplex);
+ } else {
+ (void) snprintf(pattr.link_name, sizeof (pattr.link_name),
+ "%s", link);
+ (void) snprintf(pattr.link_flags, sizeof (pattr.link_flags),
+ "%c----", flags & DLADM_OPT_ACTIVE ? '-' : 'r');
+ }
- i_arg = B_TRUE;
- if (!str2int(optarg, &interval) || interval == 0)
- die("invalid interval value '%s'", optarg);
- break;
- default:
- die_opterr(optopt, option, use);
- break;
- }
+ if (!state->ls_parseable && !state->ls_printheader) {
+ print_header(&state->ls_print);
+ state->ls_printheader = B_TRUE;
}
- if (p_arg && !o_arg)
- die("-p requires -o");
+ dladm_print_output(&state->ls_print, state->ls_parseable,
+ dladm_print_field, (void *)&pattr);
- if (p_arg && strcasecmp(fields_str, "all") == 0)
- die("\"-o all\" is invalid with -p");
+done:
+ return (status);
+}
- if (i_arg && !s_arg)
- die("the option -i can be used only with -s");
+typedef struct {
+ show_state_t *ms_state;
+ char *ms_link;
+ dladm_macaddr_attr_t *ms_mac_attr;
+} print_phys_mac_state_t;
- if (o_arg && strcasecmp(fields_str, "all") == 0) {
- if (!s_arg)
- fields_str = all_fields;
+/* callback of dladm_print_output() */
+static char *
+print_phys_one_mac_callback(print_field_t *pf, void *arg)
+{
+ print_phys_mac_state_t *mac_state = arg;
+ dladm_macaddr_attr_t *attr = mac_state->ms_mac_attr;
+ static char buf[DLADM_STRSIZE];
+ boolean_t is_primary = (attr->ma_slot == 0);
+ boolean_t is_parseable = mac_state->ms_state->ls_parseable;
+
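+	/* slot 0 is reported as the primary MAC address */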
+ switch (pf->pf_index) {
+ case PHYS_M_LINK:
+ (void) snprintf(buf, sizeof (buf), "%s",
+ (is_primary || is_parseable) ? mac_state->ms_link : " ");
+ break;
+ case PHYS_M_SLOT:
+ if (is_primary)
+			(void) snprintf(buf, sizeof (buf), "%s",
+			    gettext("primary"));
else
- fields_str = allstat_fields;
+ (void) snprintf(buf, sizeof (buf), "%d", attr->ma_slot);
+ break;
+ case PHYS_M_ADDRESS:
+ (void) dladm_aggr_macaddr2str(attr->ma_addr, buf);
+ break;
+ case PHYS_M_INUSE:
+ (void) snprintf(buf, sizeof (buf), "%s",
+ attr->ma_flags & DLADM_MACADDR_USED ? gettext("yes") :
+ gettext("no"));
+ break;
+ case PHYS_M_CLIENT:
+ /*
+ * CR 6678526: resolve link id to actual link name if
+ * it is valid.
+ */
+ (void) snprintf(buf, sizeof (buf), "%s", attr->ma_client_name);
+ break;
}
- if (!o_arg && s_arg)
- fields_str = allstat_fields;
-
- if (s_arg && p_arg)
- die("the option -s cannot be used with -p");
-
- /* get dev name (optional last argument) */
- if (optind == (argc-1)) {
- uint32_t flags;
+ return (buf);
+}
- dev = argv[optind];
+typedef struct {
+ show_state_t *hs_state;
+ char *hs_link;
+ dladm_hwgrp_attr_t *hs_grp_attr;
+} print_phys_hwgrp_state_t;
- if (dladm_dev2linkid(dev, &linkid) != DLADM_STATUS_OK)
- die("invalid device %s", dev);
+static char *
+print_phys_one_hwgrp_callback(print_field_t *pf, void *arg)
+{
+ print_phys_hwgrp_state_t *hg_state = arg;
+ dladm_hwgrp_attr_t *attr = hg_state->hs_grp_attr;
+ static char buf[DLADM_STRSIZE];
- if ((dladm_datalink_id2info(linkid, &flags, NULL, NULL,
- NULL, 0) != DLADM_STATUS_OK) ||
- !(flags & DLADM_OPT_ACTIVE)) {
- die("device %s has been removed", dev);
+ switch (pf->pf_index) {
+ case PHYS_H_LINK:
+ (void) snprintf(buf, sizeof (buf), "%s", attr->hg_link_name);
+ break;
+ case PHYS_H_GROUP:
+ (void) snprintf(buf, sizeof (buf), "%d", attr->hg_grp_num);
+ break;
+ case PHYS_H_GRPTYPE:
+ (void) snprintf(buf, sizeof (buf), "%s",
+ attr->hg_grp_type == DLADM_HWGRP_TYPE_RX ? "RX" : "TX");
+ break;
+ case PHYS_H_RINGS:
+ (void) snprintf(buf, sizeof (buf), "%d", attr->hg_n_rings);
+ break;
+ case PHYS_H_CLIENTS:
+ if (attr->hg_client_names[0] == '\0') {
+ (void) snprintf(buf, sizeof (buf), "--");
+ } else {
+ (void) snprintf(buf, sizeof (buf), "%s ",
+ attr->hg_client_names);
}
- } else if (optind != argc) {
- usage();
+ break;
}
- state.ls_parseable = p_arg;
- state.ls_donefirst = B_FALSE;
+ return (buf);
+}
- if (s_arg) {
- dev_stats(dev, interval, fields_str, &state);
- return;
+/* callback of dladm_walk_macaddr, invoked for each MAC address slot */
+static boolean_t
+print_phys_mac_callback(void *arg, dladm_macaddr_attr_t *attr)
+{
+ print_phys_mac_state_t *mac_state = arg;
+ show_state_t *state = mac_state->ms_state;
+
+ if (!state->ls_parseable && !state->ls_printheader) {
+ print_header(&state->ls_print);
+ state->ls_printheader = B_TRUE;
}
- fields = parse_output_fields(fields_str, dev_fields, DEV_MAX_FIELDS,
- CMD_TYPE_ANY, &nfields);
+ mac_state->ms_mac_attr = attr;
+ dladm_print_output(&state->ls_print, state->ls_parseable,
+ print_phys_one_mac_callback, mac_state);
- if (fields == NULL) {
- die("invalid field(s) specified");
- return;
- }
+ return (B_TRUE);
+}
- state.ls_print.ps_fields = fields;
- state.ls_print.ps_nfields = nfields;
+/* invoked by show-phys -m for each physical data-link */
+static dladm_status_t
+print_phys_mac(show_state_t *state, datalink_id_t linkid, char *link)
+{
+ print_phys_mac_state_t mac_state;
- if (dev == NULL) {
- (void) dladm_mac_walk(show_dev, &state);
- } else {
- (void) show_dev(dev, &state);
+ mac_state.ms_state = state;
+ mac_state.ms_link = link;
+
+ return (dladm_walk_macaddr(linkid, &mac_state,
+ print_phys_mac_callback));
+}
+
+/* callback of dladm_walk_hwgrp, invoked for each MAC hwgrp */
+static boolean_t
+print_phys_hwgrp_callback(void *arg, dladm_hwgrp_attr_t *attr)
+{
+ print_phys_hwgrp_state_t *hwgrp_state = arg;
+ show_state_t *state = hwgrp_state->hs_state;
+
+ if (!state->ls_parseable && !state->ls_printheader) {
+ print_header(&state->ls_print);
+ state->ls_printheader = B_TRUE;
}
+ hwgrp_state->hs_grp_attr = attr;
+ dladm_print_output(&state->ls_print, state->ls_parseable,
+ print_phys_one_hwgrp_callback, hwgrp_state);
+
+ return (B_TRUE);
}
+/* invoked by show-phys -H for each physical data-link */
+static dladm_status_t
+print_phys_hwgrp(show_state_t *state, datalink_id_t linkid, char *link)
+{
+ print_phys_hwgrp_state_t hwgrp_state;
+
+ hwgrp_state.hs_state = state;
+ hwgrp_state.hs_link = link;
+ return (dladm_walk_hwgrp(linkid, &hwgrp_state,
+ print_phys_hwgrp_callback));
+}
static dladm_status_t
-print_phys(show_state_t *state, datalink_id_t linkid, link_fields_buf_t *pattr)
+print_phys(show_state_t *state, datalink_id_t linkid)
{
char link[MAXLINKNAMELEN];
- dladm_phys_attr_t dpa;
uint32_t flags;
+ dladm_status_t status;
datalink_class_t class;
uint32_t media;
- dladm_status_t status;
if ((status = dladm_datalink_id2info(linkid, &flags, &class, &media,
link, MAXLINKNAMELEN)) != DLADM_STATUS_OK) {
@@ -2954,44 +3398,12 @@ print_phys(show_state_t *state, datalink_id_t linkid, link_fields_buf_t *pattr)
goto done;
}
- status = dladm_phys_info(linkid, &dpa, state->ls_flags);
- if (status != DLADM_STATUS_OK)
- goto done;
-
- (void) snprintf(pattr->link_phys_device,
- sizeof (pattr->link_phys_device), "%s", dpa.dp_dev);
- (void) dladm_media2str(media, pattr->link_phys_media);
- if (state->ls_flags == DLADM_OPT_ACTIVE) {
- boolean_t islink;
-
- if (!dpa.dp_novanity) {
- (void) strlcpy(pattr->link_name, link,
- sizeof (pattr->link_name));
- islink = B_TRUE;
- } else {
- /*
- * This is a physical link that does not have
- * vanity naming support.
- */
- (void) strlcpy(pattr->link_name, dpa.dp_dev,
- sizeof (pattr->link_name));
- islink = B_FALSE;
- }
-
- (void) get_linkstate(pattr->link_name, islink,
- pattr->link_phys_state);
- (void) snprintf(pattr->link_phys_speed,
- sizeof (pattr->link_phys_speed), "%u",
- (uint_t)((get_ifspeed(pattr->link_name,
- islink)) / 1000000ull));
- (void) get_linkduplex(pattr->link_name, islink,
- pattr->link_phys_duplex);
- } else {
- (void) snprintf(pattr->link_name, sizeof (pattr->link_name),
- "%s", link);
- (void) snprintf(pattr->link_flags, sizeof (pattr->link_flags),
- "%c----", flags & DLADM_OPT_ACTIVE ? '-' : 'r');
- }
+ if (state->ls_mac)
+ status = print_phys_mac(state, linkid, link);
+ else if (state->ls_hwgrp)
+ status = print_phys_hwgrp(state, linkid, link);
+ else
+ status = print_phys_default(state, linkid, link, flags, media);
done:
return (status);
@@ -3000,29 +3412,12 @@ done:
static int
show_phys(datalink_id_t linkid, void *arg)
{
- show_state_t *state = arg;
- dladm_status_t status;
- link_fields_buf_t pattr;
-
- bzero(&pattr, sizeof (link_fields_buf_t));
- status = print_phys(state, linkid, &pattr);
- if (status != DLADM_STATUS_OK)
- goto done;
-
- if (!state->ls_parseable && !state->ls_printheader) {
- print_header(&state->ls_print);
- state->ls_printheader = B_TRUE;
- }
-
- dladm_print_output(&state->ls_print, state->ls_parseable,
- dladm_print_field, (void *)&pattr);
+ show_state_t *state = arg;
-done:
- state->ls_status = status;
+ state->ls_status = print_phys(state, linkid);
return (DLADM_WALK_CONTINUE);
}
-
/*
* Print the active topology information.
*/
@@ -3052,8 +3447,8 @@ print_vlan(show_state_t *state, datalink_id_t linkid, link_fields_buf_t *l)
(void) snprintf(l->link_vlan_vid, sizeof (l->link_vlan_vid), "%d",
vinfo.dv_vid);
- (void) snprintf(l->link_flags, sizeof (l->link_flags), "%c%c---",
- vinfo.dv_force ? 'f' : '-', vinfo.dv_implicit ? 'i' : '-');
+ (void) snprintf(l->link_flags, sizeof (l->link_flags), "%c----",
+ vinfo.dv_force ? 'f' : '-');
done:
return (status);
@@ -3091,6 +3486,8 @@ do_show_phys(int argc, char *argv[], const char *use)
uint32_t flags = DLADM_OPT_ACTIVE;
boolean_t p_arg = B_FALSE;
boolean_t o_arg = B_FALSE;
+ boolean_t m_arg = B_FALSE;
+ boolean_t H_arg = B_FALSE;
datalink_id_t linkid = DATALINK_ALL_LINKID;
show_state_t state;
dladm_status_t status;
@@ -3100,10 +3497,15 @@ do_show_phys(int argc, char *argv[], const char *use)
char *all_active_fields =
"link,media,state,speed,duplex,device";
char *all_inactive_fields = "link,device,media,flags";
+ char *all_mac_fields = "link,slot,address,inuse,client";
+ char *all_hwgrp_fields =
+ "link,group,grouptype,rings,clients";
+ print_field_t *pf;
+ int pfmax;
bzero(&state, sizeof (state));
opterr = 0;
- while ((option = getopt_long(argc, argv, ":pPo:",
+ while ((option = getopt_long(argc, argv, ":pPo:mH",
show_lopts, NULL)) != -1) {
switch (option) {
case 'p':
@@ -3122,6 +3524,12 @@ do_show_phys(int argc, char *argv[], const char *use)
o_arg = B_TRUE;
fields_str = optarg;
break;
+ case 'm':
+ m_arg = B_TRUE;
+ break;
+ case 'H':
+ H_arg = B_TRUE;
+ break;
default:
die_opterr(optopt, option, use);
break;
@@ -3131,6 +3539,9 @@ do_show_phys(int argc, char *argv[], const char *use)
if (p_arg && !o_arg)
die("-p requires -o");
+ if (m_arg && H_arg)
+		die("-m cannot be combined with -H");
+
if (p_arg && strcasecmp(fields_str, "all") == 0)
die("\"-o all\" is invalid with -p");
@@ -3147,16 +3558,42 @@ do_show_phys(int argc, char *argv[], const char *use)
state.ls_parseable = p_arg;
state.ls_flags = flags;
state.ls_donefirst = B_FALSE;
+ state.ls_mac = m_arg;
+ state.ls_hwgrp = H_arg;
+
+ if (m_arg && !(flags & DLADM_OPT_ACTIVE)) {
+ /*
+ * We can only display the factory MAC addresses of
+ * active data-links.
+ */
+ die("-m not compatible with -P");
+ }
if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) {
- if (state.ls_flags & DLADM_OPT_ACTIVE)
+ if (state.ls_mac)
+ fields_str = all_mac_fields;
+ else if (state.ls_hwgrp)
+ fields_str = all_hwgrp_fields;
+ else if (state.ls_flags & DLADM_OPT_ACTIVE) {
fields_str = all_active_fields;
- else
+ } else {
fields_str = all_inactive_fields;
+ }
+ }
+
+ if (state.ls_mac) {
+ pf = phys_m_fields;
+ pfmax = PHYS_M_MAX_FIELDS;
+ } else if (state.ls_hwgrp) {
+ pf = phys_h_fields;
+ pfmax = PHYS_H_MAX_FIELDS;
+ } else {
+ pf = phys_fields;
+ pfmax = PHYS_MAX_FIELDS;
}
- fields = parse_output_fields(fields_str, phys_fields,
- PHYS_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+ fields = parse_output_fields(fields_str, pf,
+ pfmax, CMD_TYPE_ANY, &nfields);
if (fields == NULL) {
die("invalid field(s) specified");
@@ -3267,6 +3704,661 @@ do_show_vlan(int argc, char *argv[], const char *use)
}
static void
+do_create_vnic(int argc, char *argv[], const char *use)
+{
+ datalink_id_t linkid, dev_linkid;
+ char devname[MAXLINKNAMELEN];
+ char name[MAXLINKNAMELEN];
+ boolean_t l_arg = B_FALSE;
+ uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST;
+ char *altroot = NULL;
+ char option;
+ char *endp = NULL;
+ dladm_status_t status;
+ vnic_mac_addr_type_t mac_addr_type = VNIC_MAC_ADDR_TYPE_AUTO;
+ uchar_t *mac_addr;
+ int mac_slot = -1, maclen = 0, mac_prefix_len = 0;
+ dladm_arg_list_t *proplist = NULL;
+ uint16_t vid = 0;
+
+ opterr = 0;
+ while ((option = getopt_long(argc, argv, ":tfR:l:m:n:p:r:v:H",
+ vnic_lopts, NULL)) != -1) {
+ switch (option) {
+ case 't':
+ flags &= ~DLADM_OPT_PERSIST;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ case 'l':
+ if (strlcpy(devname, optarg, MAXLINKNAMELEN) >=
+ MAXLINKNAMELEN)
+ die("link name too long");
+ l_arg = B_TRUE;
+ break;
+ case 'm':
+ if (strcmp(optarg, "fixed") == 0) {
+ /*
+ * A fixed MAC address must be specified
+ * by its value, not by the keyword 'fixed'.
+ */
+ die("'fixed' is not a valid MAC address");
+ }
+ if (dladm_vnic_str2macaddrtype(optarg,
+ &mac_addr_type) != DLADM_STATUS_OK) {
+ mac_addr_type = VNIC_MAC_ADDR_TYPE_FIXED;
+ /* MAC address specified by value */
+ mac_addr = _link_aton(optarg, &maclen);
+ if (mac_addr == NULL) {
+ if (maclen == -1)
+ die("invalid MAC address");
+ else
+ die("out of memory");
+ exit(1);
+ }
+ }
+ break;
+ case 'n':
+ errno = 0;
+ mac_slot = (int)strtol(optarg, &endp, 10);
+ if (errno != 0 || *endp != '\0')
+ die("invalid slot number");
+ break;
+ case 'p':
+ if (dladm_parse_link_props(optarg, &proplist, B_FALSE)
+ != DLADM_STATUS_OK)
+ die("invalid vnic property");
+ break;
+ case 'r':
+ mac_addr = _link_aton(optarg, &mac_prefix_len);
+ if (mac_addr == NULL) {
+ if (mac_prefix_len == -1)
+ die("invalid MAC address");
+ else
+ die("out of memory");
+ exit(1);
+ }
+ break;
+		case 'v':
+			errno = 0;
+			vid = (int)strtol(optarg, &endp, 10);
+			if (errno != 0 || *endp != '\0' || vid == 0)
+ /* VID of 0 is invalid */
+ die("invalid VLAN id");
+ break;
+ case 'f':
+ flags |= DLADM_OPT_FORCE;
+ break;
+ case 'H':
+ flags |= DLADM_OPT_HWRINGS;
+ break;
+ default:
+ die_opterr(optopt, option, use);
+ }
+ }
+
+	/*
+	 * The -f (force) flag can be specified only together with -v (VLAN).
+	 */
+ if ((flags & DLADM_OPT_FORCE) != 0 && vid == 0)
+ die("-f option can only be used with -v");
+
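+	/* a MAC address prefix (-r) applies only to random or fixed types */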
+ if (mac_prefix_len != 0 && mac_addr_type != VNIC_MAC_ADDR_TYPE_RANDOM &&
+ mac_addr_type != VNIC_MAC_ADDR_TYPE_FIXED)
+ usage();
+
+ /* check required options */
+ if (!l_arg)
+ usage();
+
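+	/* a slot number (-n) is only meaningful with a factory MAC address */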
+ if (mac_slot != -1 && mac_addr_type != VNIC_MAC_ADDR_TYPE_FACTORY)
+ usage();
+
+ /* the VNIC id is the required operand */
+ if (optind != (argc - 1))
+ usage();
+
+ if (strlcpy(name, argv[optind], MAXLINKNAMELEN) >= MAXLINKNAMELEN)
+ die("link name too long '%s'", argv[optind]);
+
+ if (!dladm_valid_linkname(name))
+ die("invalid link name '%s'", argv[optind]);
+
+ if (altroot != NULL)
+ altroot_cmd(altroot, argc, argv);
+
+ if (dladm_name2info(devname, &dev_linkid, NULL, NULL, NULL) !=
+ DLADM_STATUS_OK)
+ die("invalid link name '%s'", devname);
+
+ status = dladm_vnic_create(name, dev_linkid, mac_addr_type, mac_addr,
+ maclen, &mac_slot, mac_prefix_len, vid, &linkid, proplist, flags);
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "vnic creation over %s failed", devname);
+
+ dladm_free_props(proplist);
+}
+
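+/*
+ * Verify that the link to be deleted matches the command: delete-vnic
+ * must name a vnic and delete-etherstub an etherstub, where an
+ * etherstub is a vnic whose underlying link id is invalid.
+ */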
+static void
+do_etherstub_check(const char *name, datalink_id_t linkid, boolean_t etherstub,
+ uint32_t flags)
+{
+ boolean_t is_etherstub;
+ dladm_vnic_attr_t attr;
+
+ if (dladm_vnic_info(linkid, &attr, flags) != DLADM_STATUS_OK) {
+ /*
+ * Let the delete continue anyway.
+ */
+ return;
+ }
+ is_etherstub = (attr.va_link_id == DATALINK_INVALID_LINKID);
+ if (is_etherstub != etherstub) {
+ die("'%s' is not %s", name,
+ (is_etherstub ? "a vnic" : "an etherstub"));
+ }
+}
+
+static void
+do_delete_vnic_common(int argc, char *argv[], const char *use,
+ boolean_t etherstub)
+{
+ char option;
+ uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST;
+ datalink_id_t linkid;
+ char *altroot = NULL;
+ dladm_status_t status;
+
+ opterr = 0;
+ while ((option = getopt_long(argc, argv, ":R:t", lopts,
+ NULL)) != -1) {
+ switch (option) {
+ case 't':
+ flags &= ~DLADM_OPT_PERSIST;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ default:
+ die_opterr(optopt, option, use);
+ }
+ }
+
+ /* get vnic name (required last argument) */
+ if (optind != (argc - 1))
+ usage();
+
+ if (altroot != NULL)
+ altroot_cmd(altroot, argc, argv);
+
+ status = dladm_name2info(argv[optind], &linkid, NULL, NULL, NULL);
+ if (status != DLADM_STATUS_OK)
+ die("invalid link name '%s'", argv[optind]);
+
+ if ((flags & DLADM_OPT_ACTIVE) != 0) {
+ do_etherstub_check(argv[optind], linkid, etherstub,
+ DLADM_OPT_ACTIVE);
+ }
+ if ((flags & DLADM_OPT_PERSIST) != 0) {
+ do_etherstub_check(argv[optind], linkid, etherstub,
+ DLADM_OPT_PERSIST);
+ }
+
+ status = dladm_vnic_delete(linkid, flags);
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "vnic deletion failed");
+}
+
+static void
+do_delete_vnic(int argc, char *argv[], const char *use)
+{
+ do_delete_vnic_common(argc, argv, use, B_FALSE);
+}
+
+/* ARGSUSED */
+static void
+do_up_vnic_common(int argc, char *argv[], const char *use, boolean_t vlan)
+{
+ datalink_id_t linkid = DATALINK_ALL_LINKID;
+ dladm_status_t status;
+ char *type;
+
+ type = vlan ? "vlan" : "vnic";
+
+ /*
+ * get the id or the name of the vnic/vlan (optional last argument)
+ */
+ if (argc == 2) {
+ status = dladm_name2info(argv[1], &linkid, NULL, NULL, NULL);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ } else if (argc > 2) {
+ usage();
+ }
+
+ if (vlan)
+ status = dladm_vlan_up(linkid);
+ else
+ status = dladm_vnic_up(linkid, 0);
+
+done:
+ if (status != DLADM_STATUS_OK) {
+ if (argc == 2) {
+ die_dlerr(status,
+ "could not bring up %s '%s'", type, argv[1]);
+ } else {
+ die_dlerr(status, "could not bring %ss up", type);
+ }
+ }
+}
+
+static void
+do_up_vnic(int argc, char *argv[], const char *use)
+{
+ do_up_vnic_common(argc, argv, use, B_FALSE);
+}
+
+static void
+dump_vnics_head(const char *dev)
+{
+ if (strlen(dev))
+ (void) printf("%s", dev);
+
+ (void) printf("\tipackets rbytes opackets obytes ");
+
+ if (strlen(dev))
+ (void) printf("%%ipkts %%opkts\n");
+ else
+ (void) printf("\n");
+}
+
+static void
+dump_vnic_stat(const char *name, datalink_id_t vnic_id,
+ show_vnic_state_t *state, pktsum_t *vnic_stats, pktsum_t *tot_stats)
+{
+ pktsum_t diff_stats;
+ pktsum_t *old_stats = &state->vs_prevstats[vnic_id];
+
+ dladm_stats_diff(&diff_stats, vnic_stats, old_stats);
+
+ (void) printf("%s", name);
+
+ (void) printf("\t%-10llu", diff_stats.ipackets);
+ (void) printf("%-12llu", diff_stats.rbytes);
+ (void) printf("%-10llu", diff_stats.opackets);
+ (void) printf("%-12llu", diff_stats.obytes);
+
+ if (tot_stats) {
+ if (tot_stats->ipackets == 0) {
+ (void) printf("\t-");
+ } else {
+ (void) printf("\t%-6.1f", (double)diff_stats.ipackets/
+ (double)tot_stats->ipackets * 100);
+ }
+ if (tot_stats->opackets == 0) {
+ (void) printf("\t-");
+ } else {
+ (void) printf("\t%-6.1f", (double)diff_stats.opackets/
+ (double)tot_stats->opackets * 100);
+ }
+ }
+ (void) printf("\n");
+
+ *old_stats = *vnic_stats;
+}
+
+/*
+ * Called for each vnic by the datalink walker (see show_vnic()) to
+ * display vnic information or statistics.
+ */
+static dladm_status_t
+print_vnic(show_vnic_state_t *state, datalink_id_t linkid)
+{
+ dladm_vnic_attr_t attr, *vnic = &attr;
+ dladm_status_t status;
+ boolean_t is_etherstub;
+ char devname[MAXLINKNAMELEN];
+ char vnic_name[MAXLINKNAMELEN];
+ char mstr[MAXMACADDRLEN * 3];
+ vnic_fields_buf_t vbuf;
+
+ if ((status = dladm_vnic_info(linkid, vnic, state->vs_flags)) !=
+ DLADM_STATUS_OK)
+ return (status);
+
+ is_etherstub = (vnic->va_link_id == DATALINK_INVALID_LINKID);
+ if (state->vs_etherstub != is_etherstub) {
+ /*
+ * Want all etherstub but it's not one, or want
+ * non-etherstub and it's one.
+ */
+ return (DLADM_STATUS_OK);
+ }
+
+ if (state->vs_link_id != DATALINK_ALL_LINKID) {
+ if (state->vs_link_id != vnic->va_link_id)
+ return (DLADM_STATUS_OK);
+ }
+
+ if (dladm_datalink_id2info(linkid, NULL, NULL,
+ NULL, vnic_name, sizeof (vnic_name)) != DLADM_STATUS_OK)
+ return (DLADM_STATUS_BADARG);
+
+ bzero(devname, sizeof (devname));
+ if (!is_etherstub &&
+ dladm_datalink_id2info(vnic->va_link_id, NULL, NULL,
+ NULL, devname, sizeof (devname)) != DLADM_STATUS_OK)
+ return (DLADM_STATUS_BADARG);
+
+ state->vs_found = B_TRUE;
+ if (state->vs_stats) {
+ /* print vnic statistics */
+ pktsum_t vnic_stats;
+
+ if (state->vs_firstonly) {
+ if (state->vs_donefirst)
+				return (DLADM_STATUS_OK);
+ state->vs_donefirst = B_TRUE;
+ }
+
+ if (!state->vs_printstats) {
+ /*
+ * get vnic statistics and add to the sum for the
+ * named device.
+ */
+ get_link_stats(vnic_name, &vnic_stats);
+ dladm_stats_total(&state->vs_totalstats, &vnic_stats,
+ &state->vs_prevstats[vnic->va_vnic_id]);
+ } else {
+ /* get and print vnic statistics */
+ get_link_stats(vnic_name, &vnic_stats);
+ dump_vnic_stat(vnic_name, linkid, state, &vnic_stats,
+ &state->vs_totalstats);
+ }
+ return (DLADM_STATUS_OK);
+ } else {
+ (void) snprintf(vbuf.vnic_link, sizeof (vbuf.vnic_link),
+ "%s", vnic_name);
+
+ if (!is_etherstub) {
+
+ (void) snprintf(vbuf.vnic_over, sizeof (vbuf.vnic_over),
+ "%s", devname);
+ (void) snprintf(vbuf.vnic_speed,
+ sizeof (vbuf.vnic_speed), "%u",
+ (uint_t)((get_ifspeed(vnic_name, B_TRUE))
+ / 1000000ull));
+
+ switch (vnic->va_mac_addr_type) {
+ case VNIC_MAC_ADDR_TYPE_FIXED:
+ case VNIC_MAC_ADDR_TYPE_PRIMARY:
+				(void) snprintf(vbuf.vnic_macaddrtype,
+				    sizeof (vbuf.vnic_macaddrtype),
+				    "%s", gettext("fixed"));
+ break;
+ case VNIC_MAC_ADDR_TYPE_RANDOM:
+				(void) snprintf(vbuf.vnic_macaddrtype,
+				    sizeof (vbuf.vnic_macaddrtype),
+				    "%s", gettext("random"));
+ break;
+ case VNIC_MAC_ADDR_TYPE_FACTORY:
+ (void) snprintf(vbuf.vnic_macaddrtype,
+ sizeof (vbuf.vnic_macaddrtype),
+ gettext("factory, slot %d"),
+ vnic->va_mac_slot);
+ break;
+ }
+
+ if (strlen(vbuf.vnic_macaddrtype) > 0) {
+ (void) snprintf(vbuf.vnic_macaddr,
+ sizeof (vbuf.vnic_macaddr), "%s",
+ dladm_aggr_macaddr2str(vnic->va_mac_addr,
+ mstr));
+ }
+
+ (void) snprintf(vbuf.vnic_vid, sizeof (vbuf.vnic_vid),
+ "%d", vnic->va_vid);
+ }
+
+ if (!state->vs_parseable && !state->vs_printheader) {
+ print_header(&state->vs_print);
+ state->vs_printheader = B_TRUE;
+ }
+
+ dladm_print_output(&state->vs_print, state->vs_parseable,
+ dladm_print_field, (void *)&vbuf);
+
+ return (DLADM_STATUS_OK);
+ }
+}
+
+static int
+show_vnic(datalink_id_t linkid, void *arg)
+{
+ show_vnic_state_t *state = arg;
+
+ state->vs_status = print_vnic(state, linkid);
+ return (DLADM_WALK_CONTINUE);
+}
+
+static void
+do_show_vnic_common(int argc, char *argv[], const char *use,
+ boolean_t etherstub)
+{
+ int option;
+ boolean_t s_arg = B_FALSE;
+ boolean_t i_arg = B_FALSE;
+ boolean_t l_arg = B_FALSE;
+ char *endp = NULL;
+ uint32_t interval = 0, flags = DLADM_OPT_ACTIVE;
+ datalink_id_t linkid = DATALINK_ALL_LINKID;
+ datalink_id_t dev_linkid = DATALINK_ALL_LINKID;
+ show_vnic_state_t state;
+ dladm_status_t status;
+ boolean_t o_arg = B_FALSE;
+ char *fields_str = NULL;
+ print_field_t **fields;
+ print_field_t *pf;
+ int pfmax;
+ uint_t nfields;
+ char *all_fields =
+ "link,over,speed,macaddr,macaddrtype,vid";
+ char *all_e_fields =
+ "link";
+
+ bzero(&state, sizeof (state));
+ opterr = 0;
+ while ((option = getopt_long(argc, argv, ":pPl:si:o:", lopts,
+ NULL)) != -1) {
+ switch (option) {
+ case 'p':
+ state.vs_parseable = B_TRUE;
+ break;
+ case 'P':
+ flags = DLADM_OPT_PERSIST;
+ break;
+ case 'l':
+ if (etherstub)
+ die("option not supported for this command");
+
+ if (strlcpy(state.vs_link, optarg, MAXLINKNAMELEN) >=
+ MAXLINKNAMELEN)
+ die("link name too long");
+
+ l_arg = B_TRUE;
+ break;
+ case 's':
+ if (s_arg) {
+ die("the option -s cannot be specified "
+ "more than once");
+ }
+ s_arg = B_TRUE;
+ break;
+ case 'i':
+ if (i_arg) {
+ die("the option -i cannot be specified "
+ "more than once");
+ }
+ i_arg = B_TRUE;
+			errno = 0;
+			interval = (int)strtol(optarg, &endp, 10);
+ if (errno != 0 || interval == 0 || *endp != '\0')
+ die("invalid interval value '%s'", optarg);
+ break;
+ case 'o':
+ o_arg = B_TRUE;
+ fields_str = optarg;
+ break;
+ default:
+ die_opterr(optopt, option, use);
+ }
+ }
+
+ if (i_arg && !s_arg)
+ die("the option -i can be used only with -s");
+
+ /* get vnic ID (optional last argument) */
+ if (optind == (argc - 1)) {
+ status = dladm_name2info(argv[optind], &linkid, NULL,
+ NULL, NULL);
+ if (status != DLADM_STATUS_OK) {
+ die_dlerr(status, "invalid vnic name '%s'",
+ argv[optind]);
+ }
+ (void) strlcpy(state.vs_vnic, argv[optind], MAXLINKNAMELEN);
+ } else if (optind != argc) {
+ usage();
+ }
+
+ if (l_arg) {
+ status = dladm_name2info(state.vs_link, &dev_linkid, NULL,
+ NULL, NULL);
+ if (status != DLADM_STATUS_OK) {
+ die_dlerr(status, "invalid link name '%s'",
+ state.vs_link);
+ }
+ }
+
+ state.vs_vnic_id = linkid;
+ state.vs_link_id = dev_linkid;
+ state.vs_etherstub = etherstub;
+ state.vs_found = B_FALSE;
+ state.vs_flags = flags;
+
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) {
+ if (etherstub)
+ fields_str = all_e_fields;
+ else
+ fields_str = all_fields;
+ }
+
+ pf = vnic_fields;
+ pfmax = VNIC_MAX_FIELDS;
+
+ fields = parse_output_fields(fields_str, pf, pfmax, CMD_TYPE_ANY,
+ &nfields);
+
+ if (fields == NULL) {
+ die("invalid field(s) specified");
+ return;
+ }
+
+ state.vs_print.ps_fields = fields;
+ state.vs_print.ps_nfields = nfields;
+
+ if (s_arg) {
+ /* Display vnic statistics */
+ vnic_stats(&state, interval);
+ return;
+ }
+
+ /* Display vnic information */
+ state.vs_donefirst = B_FALSE;
+
+ if (linkid == DATALINK_ALL_LINKID) {
+ (void) dladm_walk_datalink_id(show_vnic, &state,
+ DATALINK_CLASS_VNIC | DATALINK_CLASS_ETHERSTUB,
+ DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+ } else {
+ (void) show_vnic(linkid, &state);
+ if (state.vs_status != DLADM_STATUS_OK) {
+ die_dlerr(state.vs_status, "failed to show vnic '%s'",
+ state.vs_vnic);
+ }
+ }
+}
+
+static void
+do_show_vnic(int argc, char *argv[], const char *use)
+{
+ do_show_vnic_common(argc, argv, use, B_FALSE);
+}
+
+static void
+do_create_etherstub(int argc, char *argv[], const char *use)
+{
+ uint32_t flags;
+ char *altroot = NULL;
+ char option;
+ dladm_status_t status;
+ char name[MAXLINKNAMELEN];
+ uchar_t mac_addr[ETHERADDRL];
+
+ name[0] = '\0';
+ bzero(mac_addr, sizeof (mac_addr));
+ flags = DLADM_OPT_ANCHOR | DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST;
+
+ opterr = 0;
+ while ((option = getopt_long(argc, argv, "tR:",
+ etherstub_lopts, NULL)) != -1) {
+ switch (option) {
+ case 't':
+ flags &= ~DLADM_OPT_PERSIST;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ default:
+ die_opterr(optopt, option, use);
+ }
+ }
+
+ /* the etherstub id is the required operand */
+ if (optind != (argc - 1))
+ usage();
+
+ if (strlcpy(name, argv[optind], MAXLINKNAMELEN) >= MAXLINKNAMELEN)
+ die("link name too long '%s'", argv[optind]);
+
+ if (!dladm_valid_linkname(name))
+ die("invalid link name '%s'", argv[optind]);
+
+ if (altroot != NULL)
+ altroot_cmd(altroot, argc, argv);
+
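+	/*
+	 * An etherstub is created as an anchored vnic over no real link:
+	 * DATALINK_INVALID_LINKID plus the DLADM_OPT_ANCHOR flag.
+	 */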
+ status = dladm_vnic_create(name, DATALINK_INVALID_LINKID,
+ VNIC_MAC_ADDR_TYPE_AUTO, mac_addr, ETHERADDRL, NULL, 0, 0, NULL,
+ NULL, flags);
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "etherstub creation failed");
+}
+
+static void
+do_delete_etherstub(int argc, char *argv[], const char *use)
+{
+ do_delete_vnic_common(argc, argv, use, B_TRUE);
+}
+
+/* ARGSUSED */
+static void
+do_show_etherstub(int argc, char *argv[], const char *use)
+{
+ do_show_vnic_common(argc, argv, use, B_TRUE);
+}
+
+static void
link_stats(datalink_id_t linkid, uint_t interval, char *fields_str,
show_state_t *state)
{
@@ -3333,147 +4425,134 @@ aggr_stats(datalink_id_t linkid, show_grp_state_t *state, uint_t interval)
}
}
+/* ARGSUSED */
static void
-dev_stats(const char *dev, uint32_t interval, char *fields_str,
- show_state_t *state)
+vnic_stats(show_vnic_state_t *sp, uint32_t interval)
{
- print_field_t **fields;
- uint_t nfields;
-
- fields = parse_output_fields(fields_str, devs_fields, DEVS_MAX_FIELDS,
- CMD_TYPE_ANY, &nfields);
+ show_vnic_state_t state;
+ boolean_t specific_link, specific_dev;
- if (fields == NULL) {
- die("invalid field(s) specified");
- return;
- }
-
- state->ls_print.ps_fields = fields;
- state->ls_print.ps_nfields = nfields;
+ /* Display vnic statistics */
+ dump_vnics_head(sp->vs_link);
+ bzero(&state, sizeof (state));
+ state.vs_stats = B_TRUE;
+ state.vs_vnic_id = sp->vs_vnic_id;
+ state.vs_link_id = sp->vs_link_id;
/*
- * If an interval is specified, continuously show the stats
- * only for the first MAC port.
+ * If an interval is specified, and a vnic ID is not specified,
+ * continuously show the stats only for the first vnic.
*/
- state->ls_firstonly = (interval != 0);
+ specific_link = (sp->vs_vnic_id != DATALINK_ALL_LINKID);
+ specific_dev = (sp->vs_link_id != DATALINK_ALL_LINKID);
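+
+	/*
+	 * Each pass of the loop walks the vnics twice: first with
+	 * vs_printstats clear to accumulate the totals, then with it set
+	 * to print each vnic's share of those totals.
+	 */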
for (;;) {
+ /* Get stats for each vnic */
+ state.vs_found = B_FALSE;
+ state.vs_donefirst = B_FALSE;
+ state.vs_printstats = B_FALSE;
+ state.vs_flags = DLADM_OPT_ACTIVE;
+
+ if (!specific_link) {
+ (void) dladm_walk_datalink_id(show_vnic, &state,
+ DATALINK_CLASS_VNIC, DATALINK_ANY_MEDIATYPE,
+ DLADM_OPT_ACTIVE);
+ } else {
+ (void) show_vnic(sp->vs_vnic_id, &state);
+ if (state.vs_status != DLADM_STATUS_OK) {
+ die_dlerr(state.vs_status,
+ "failed to show vnic '%s'", sp->vs_vnic);
+ }
+ }
- if (!state->ls_parseable)
- print_header(&state->ls_print);
- state->ls_donefirst = B_FALSE;
+ if (specific_link && !state.vs_found)
+ die("non-existent vnic '%s'", sp->vs_vnic);
+ if (specific_dev && !state.vs_found)
+ die("device %s has no vnics", sp->vs_link);
+
+ /* Show totals */
+		if ((specific_link || specific_dev) && !interval) {
+ (void) printf("Total");
+ (void) printf("\t%-10llu",
+ state.vs_totalstats.ipackets);
+ (void) printf("%-12llu",
+ state.vs_totalstats.rbytes);
+ (void) printf("%-10llu",
+ state.vs_totalstats.opackets);
+ (void) printf("%-12llu\n",
+ state.vs_totalstats.obytes);
+ }
- if (dev == NULL)
- (void) dladm_mac_walk(show_dev_stats, state);
- else
- (void) show_dev_stats(dev, state);
+ /* Show stats for each vnic */
+ state.vs_donefirst = B_FALSE;
+ state.vs_printstats = B_TRUE;
+
+ if (!specific_link) {
+ (void) dladm_walk_datalink_id(show_vnic, &state,
+ DATALINK_CLASS_VNIC, DATALINK_ANY_MEDIATYPE,
+ DLADM_OPT_ACTIVE);
+ } else {
+ (void) show_vnic(sp->vs_vnic_id, &state);
+ if (state.vs_status != DLADM_STATUS_OK) {
+ die_dlerr(state.vs_status,
+ "failed to show vnic '%s'", sp->vs_vnic);
+ }
+ }
if (interval == 0)
break;
(void) sleep(interval);
}
-
- if (dev != NULL && state->ls_status != DLADM_STATUS_OK)
- die_dlerr(state->ls_status, "cannot show device '%s'", dev);
}
-/* accumulate stats (s1 += (s2 - s3)) */
static void
-stats_total(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3)
-{
- s1->ipackets += (s2->ipackets - s3->ipackets);
- s1->opackets += (s2->opackets - s3->opackets);
- s1->rbytes += (s2->rbytes - s3->rbytes);
- s1->obytes += (s2->obytes - s3->obytes);
- s1->ierrors += (s2->ierrors - s3->ierrors);
- s1->oerrors += (s2->oerrors - s3->oerrors);
-}
-
-/* compute stats differences (s1 = s2 - s3) */
-static void
-stats_diff(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3)
-{
- s1->ipackets = s2->ipackets - s3->ipackets;
- s1->opackets = s2->opackets - s3->opackets;
- s1->rbytes = s2->rbytes - s3->rbytes;
- s1->obytes = s2->obytes - s3->obytes;
- s1->ierrors = s2->ierrors - s3->ierrors;
- s1->oerrors = s2->oerrors - s3->oerrors;
-}
-
-static void
-get_stats(char *module, int instance, const char *name, pktsum_t *stats)
+get_mac_stats(const char *dev, pktsum_t *stats)
{
kstat_ctl_t *kcp;
kstat_t *ksp;
+ char module[DLPI_LINKNAME_MAX];
+ uint_t instance;
- if ((kcp = kstat_open()) == NULL) {
- warn("kstat open operation failed");
+
+ bzero(stats, sizeof (*stats));
+
+ if (dlpi_parselink(dev, module, &instance) != DLPI_SUCCESS)
return;
- }
- if ((ksp = kstat_lookup(kcp, module, instance, (char *)name)) == NULL) {
- /*
- * The kstat query could fail if the underlying MAC
- * driver was already detached.
- */
- (void) kstat_close(kcp);
+ if ((kcp = kstat_open()) == NULL) {
+ warn("kstat open operation failed");
return;
}
- if (kstat_read(kcp, ksp, NULL) == -1)
- goto bail;
-
- if (dladm_kstat_value(ksp, "ipackets64", KSTAT_DATA_UINT64,
- &stats->ipackets) < 0)
- goto bail;
-
- if (dladm_kstat_value(ksp, "opackets64", KSTAT_DATA_UINT64,
- &stats->opackets) < 0)
- goto bail;
-
- if (dladm_kstat_value(ksp, "rbytes64", KSTAT_DATA_UINT64,
- &stats->rbytes) < 0)
- goto bail;
-
- if (dladm_kstat_value(ksp, "obytes64", KSTAT_DATA_UINT64,
- &stats->obytes) < 0)
- goto bail;
-
- if (dladm_kstat_value(ksp, "ierrors", KSTAT_DATA_UINT32,
- &stats->ierrors) < 0)
- goto bail;
-
- if (dladm_kstat_value(ksp, "oerrors", KSTAT_DATA_UINT32,
- &stats->oerrors) < 0)
- goto bail;
+ ksp = dladm_kstat_lookup(kcp, module, instance, "mac", NULL);
+ if (ksp != NULL)
+ dladm_get_stats(kcp, ksp, stats);
-bail:
(void) kstat_close(kcp);
- return;
}
static void
-get_mac_stats(const char *dev, pktsum_t *stats)
+get_link_stats(const char *link, pktsum_t *stats)
{
- char module[DLPI_LINKNAME_MAX];
- uint_t instance;
+ kstat_ctl_t *kcp;
+ kstat_t *ksp;
bzero(stats, sizeof (*stats));
- if (dlpi_parselink(dev, module, &instance) != DLPI_SUCCESS)
+
+ if ((kcp = kstat_open()) == NULL) {
+ warn("kstat_open operation failed");
return;
+ }
- get_stats(module, instance, "mac", stats);
-}
+ ksp = dladm_kstat_lookup(kcp, "link", 0, link, NULL);
-static void
-get_link_stats(const char *link, pktsum_t *stats)
-{
- bzero(stats, sizeof (*stats));
- get_stats("link", 0, link, stats);
+ if (ksp != NULL)
+ dladm_get_stats(kcp, ksp, stats);
+
+ (void) kstat_close(kcp);
}
static int
@@ -3547,7 +4626,7 @@ get_linkstate(const char *name, boolean_t islink, char *buf)
if (get_one_kstat(name, "link_state", KSTAT_DATA_UINT32,
&linkstate, islink) != 0) {
- (void) strlcpy(buf, "unknown", DLADM_STRSIZE);
+ (void) strlcpy(buf, "?", DLADM_STRSIZE);
return (buf);
}
return (dladm_linkstate2str(linkstate, buf));
@@ -4271,92 +5350,6 @@ do_disconnect_wifi(int argc, char **argv, const char *use)
die_dlerr(status, "cannot disconnect");
}
-
-static void
-free_props(prop_list_t *list)
-{
- if (list != NULL) {
- free(list->pl_buf);
- free(list);
- }
-}
-
-static int
-parse_props(char *str, prop_list_t **listp, boolean_t novalues)
-{
- prop_list_t *list;
- prop_info_t *pip;
- char *buf, *curr;
- int len, i;
-
- list = malloc(sizeof (prop_list_t));
- if (list == NULL)
- return (-1);
-
- list->pl_count = 0;
- list->pl_buf = buf = strdup(str);
- if (buf == NULL)
- goto fail;
-
- /*
- * buf is a string of form [<propname>=<value>][,<propname>=<value>]+
- * where each <value> string itself could be a comma-separated array.
- * The loop below will count the number of propname assignments
- * in pl_count; for each property, there is a pip entry with
- * pi_name == <propname>, pi_count == # of elements in <value> array.
- * pi_val[] contains the actual values.
- *
- * This could really be a combination of calls to
- * strtok (token delimiter is ",") and strchr (chr '=')
- * with appropriate null/string-bound-checks.
- */
-
- curr = buf;
- len = strlen(buf);
- pip = NULL;
- for (i = 0; i < len; i++) {
- char c = buf[i];
- boolean_t match = (c == '=' || c == ',');
-
- if (!match && i != len - 1)
- continue;
-
- if (match) {
- buf[i] = '\0';
- if (*curr == '\0')
- goto fail;
- }
-
- if (pip != NULL && c != '=') {
- if (pip->pi_count > DLADM_MAX_PROP_VALCNT)
- goto fail;
-
- if (novalues)
- goto fail;
-
- pip->pi_val[pip->pi_count] = curr;
- pip->pi_count++;
- } else {
- if (list->pl_count > MAX_PROPS)
- goto fail;
-
- pip = &list->pl_info[list->pl_count];
- pip->pi_name = curr;
- pip->pi_count = 0;
- list->pl_count++;
- if (c == ',')
- pip = NULL;
- }
- curr = buf + i + 1;
- }
- *listp = list;
- return (0);
-
-fail:
- free_props(list);
- return (-1);
-}
-
static void
print_linkprop(datalink_id_t linkid, show_linkprop_state_t *statep,
const char *propname, dladm_prop_type_t type,
@@ -4365,7 +5358,7 @@ print_linkprop(datalink_id_t linkid, show_linkprop_state_t *statep,
int i;
char *ptr, *lim;
char buf[DLADM_STRSIZE];
- char *unknown = "?", *notsup = "";
+ char *unknown = "--", *notsup = "";
char **propvals = statep->ls_propvals;
uint_t valcnt = DLADM_MAX_PROP_VALCNT;
dladm_status_t status;
@@ -4545,7 +5538,7 @@ static void
do_show_linkprop(int argc, char **argv, const char *use)
{
int option;
- prop_list_t *proplist = NULL;
+ dladm_arg_list_t *proplist = NULL;
datalink_id_t linkid = DATALINK_ALL_LINKID;
show_linkprop_state_t state;
uint32_t flags = DLADM_OPT_ACTIVE;
@@ -4570,7 +5563,8 @@ do_show_linkprop(int argc, char **argv, const char *use)
prop_longopts, NULL)) != -1) {
switch (option) {
case 'p':
- if (parse_props(optarg, &proplist, B_TRUE) < 0)
+ if (dladm_parse_link_props(optarg, &proplist, B_TRUE)
+ != DLADM_STATUS_OK)
die("invalid link properties specified");
break;
case 'c':
@@ -4628,7 +5622,7 @@ do_show_linkprop(int argc, char **argv, const char *use)
} else {
(void) show_linkprop_onelink(linkid, &state);
}
- free_props(proplist);
+ dladm_free_props(proplist);
if (state.ls_retstatus != DLADM_STATUS_OK)
exit(EXIT_FAILURE);
@@ -4640,7 +5634,7 @@ show_linkprop_onelink(datalink_id_t linkid, void *arg)
int i;
char *buf;
uint32_t flags;
- prop_list_t *proplist = NULL;
+ dladm_arg_list_t *proplist = NULL;
show_linkprop_state_t *statep = arg;
dlpi_handle_t dh = NULL;
@@ -4689,9 +5683,9 @@ show_linkprop_onelink(datalink_id_t linkid, void *arg)
(sizeof (char *) + DLADM_PROP_VAL_MAX) * DLADM_MAX_PROP_VALCNT;
if (proplist != NULL) {
- for (i = 0; i < proplist->pl_count; i++) {
+ for (i = 0; i < proplist->al_count; i++) {
(void) show_linkprop(linkid,
- proplist->pl_info[i].pi_name, statep);
+ proplist->al_info[i].ai_name, statep);
}
} else {
(void) dladm_walk_linkprop(linkid, statep, show_linkprop);
@@ -4712,30 +5706,58 @@ set_linkprop_persist(datalink_id_t linkid, const char *prop_name,
DLADM_OPT_PERSIST);
if (status != DLADM_STATUS_OK) {
- warn_dlerr(status, "cannot persistently %s link property",
- reset ? "reset" : "set");
+ warn_dlerr(status, "cannot persistently %s link property '%s'",
+ reset ? "reset" : "set", prop_name);
}
return (status);
}
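+
+/*
+ * dladm_walk_linkprop() callback: reset a single link property to its
+ * default value, actively and (unless -t was given) persistently.
+ */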
+static int
+reset_one_linkprop(datalink_id_t linkid, const char *propname, void *arg)
+{
+ set_linkprop_state_t *statep = arg;
+ dladm_status_t status;
+
+ status = dladm_set_linkprop(linkid, propname, NULL, 0,
+ DLADM_OPT_ACTIVE);
+ if (status != DLADM_STATUS_OK) {
+ warn_dlerr(status, "cannot reset link property '%s' on '%s'",
+ propname, statep->ls_name);
+ }
+ if (!statep->ls_temp) {
+ dladm_status_t s;
+
+ s = set_linkprop_persist(linkid, propname, NULL, 0,
+ statep->ls_reset);
+ if (s != DLADM_STATUS_OK)
+ status = s;
+ }
+ if (status != DLADM_STATUS_OK)
+ statep->ls_status = status;
+
+ return (DLADM_WALK_CONTINUE);
+}
+
static void
set_linkprop(int argc, char **argv, boolean_t reset, const char *use)
{
- int i, option;
- char errmsg[DLADM_STRSIZE];
- char *altroot = NULL;
- datalink_id_t linkid;
- prop_list_t *proplist = NULL;
- boolean_t temp = B_FALSE;
- dladm_status_t status = DLADM_STATUS_OK;
+ int i, option;
+ char errmsg[DLADM_STRSIZE];
+ char *altroot = NULL;
+ datalink_id_t linkid;
+ boolean_t temp = B_FALSE;
+ dladm_status_t status = DLADM_STATUS_OK;
+ dladm_arg_list_t *proplist = NULL;
opterr = 0;
while ((option = getopt_long(argc, argv, ":p:R:t",
prop_longopts, NULL)) != -1) {
switch (option) {
case 'p':
- if (parse_props(optarg, &proplist, reset) < 0)
+ if (dladm_parse_link_props(optarg, &proplist, reset) !=
+ DLADM_STATUS_OK) {
die("invalid link properties specified");
+ }
break;
case 't':
temp = B_TRUE;
@@ -4757,7 +5779,7 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use)
die("link property must be specified");
if (altroot != NULL) {
- free_props(proplist);
+ dladm_free_props(proplist);
altroot_cmd(altroot, argc, argv);
}
@@ -4766,24 +5788,21 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use)
die_dlerr(status, "link %s is not valid", argv[optind]);
if (proplist == NULL) {
- status = dladm_set_linkprop(linkid, NULL, NULL, 0,
- DLADM_OPT_ACTIVE);
- if (status != DLADM_STATUS_OK) {
- warn_dlerr(status, "cannot reset link property "
- "on '%s'", argv[optind]);
- }
- if (!temp) {
- dladm_status_t s;
+ set_linkprop_state_t state;
- s = set_linkprop_persist(linkid, NULL, NULL, 0, reset);
- if (s != DLADM_STATUS_OK)
- status = s;
- }
+ state.ls_name = argv[optind];
+ state.ls_reset = reset;
+ state.ls_temp = temp;
+ state.ls_status = DLADM_STATUS_OK;
+
+ (void) dladm_walk_linkprop(linkid, &state, reset_one_linkprop);
+
+ status = state.ls_status;
goto done;
}
- for (i = 0; i < proplist->pl_count; i++) {
- prop_info_t *pip = &proplist->pl_info[i];
+ for (i = 0; i < proplist->al_count; i++) {
+ dladm_arg_info_t *aip = &proplist->al_info[i];
char **val;
uint_t count;
dladm_status_t s;
@@ -4792,21 +5811,21 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use)
val = NULL;
count = 0;
} else {
- val = pip->pi_val;
- count = pip->pi_count;
+ val = aip->ai_val;
+ count = aip->ai_count;
if (count == 0) {
warn("no value specified for '%s'",
- pip->pi_name);
+ aip->ai_name);
status = DLADM_STATUS_BADARG;
continue;
}
}
- s = dladm_set_linkprop(linkid, pip->pi_name, val, count,
+ s = dladm_set_linkprop(linkid, aip->ai_name, val, count,
DLADM_OPT_ACTIVE);
if (s == DLADM_STATUS_OK) {
if (!temp) {
s = set_linkprop_persist(linkid,
- pip->pi_name, val, count, reset);
+ aip->ai_name, val, count, reset);
if (s != DLADM_STATUS_OK)
status = s;
}
@@ -4815,7 +5834,7 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use)
status = s;
switch (s) {
case DLADM_STATUS_NOTFOUND:
- warn("invalid link property '%s'", pip->pi_name);
+ warn("invalid link property '%s'", aip->ai_name);
break;
case DLADM_STATUS_BADVAL: {
int j;
@@ -4837,12 +5856,12 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use)
j * DLADM_PROP_VAL_MAX;
}
s = dladm_get_linkprop(linkid,
- DLADM_PROP_VAL_MODIFIABLE, pip->pi_name, propvals,
+ DLADM_PROP_VAL_MODIFIABLE, aip->ai_name, propvals,
&valcnt);
if (s != DLADM_STATUS_OK) {
warn_dlerr(status, "cannot set link property "
- "'%s' on '%s'", pip->pi_name, argv[optind]);
+ "'%s' on '%s'", aip->ai_name, argv[optind]);
free(propvals);
break;
}
@@ -4859,7 +5878,7 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use)
if (ptr > errmsg) {
*(ptr - 1) = '\0';
warn("link property '%s' must be one of: %s",
- pip->pi_name, errmsg);
+ aip->ai_name, errmsg);
} else
warn("invalid link property '%s'", *val);
free(propvals);
@@ -4868,16 +5887,16 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use)
default:
if (reset) {
warn_dlerr(status, "cannot reset link property "
- "'%s' on '%s'", pip->pi_name, argv[optind]);
+ "'%s' on '%s'", aip->ai_name, argv[optind]);
} else {
warn_dlerr(status, "cannot set link property "
- "'%s' on '%s'", pip->pi_name, argv[optind]);
+ "'%s' on '%s'", aip->ai_name, argv[optind]);
}
break;
}
}
done:
- free_props(proplist);
+ dladm_free_props(proplist);
if (status != DLADM_STATUS_OK)
exit(1);
}
@@ -5414,7 +6433,7 @@ i_dladm_init_linkprop(datalink_id_t linkid, void *arg)
}
/*ARGSUSED*/
-static void
+void
do_init_linkprop(int argc, char **argv, const char *use)
{
int option;
@@ -5890,6 +6909,7 @@ show_ether_xprop(datalink_id_t linkid, void *arg)
(void) snprintf(ebuf.eth_ptype, sizeof (ebuf.eth_ptype),
"%s", "peeradv");
(void) snprintf(ebuf.eth_state, sizeof (ebuf.eth_state), "");
+
(void) dladm_get_single_mac_stat(linkid, "lp_cap_autoneg",
KSTAT_DATA_UINT32, &autoneg);
(void) snprintf(ebuf.eth_autoneg, sizeof (ebuf.eth_autoneg),
diff --git a/usr/src/cmd/dladm/dladm.xcl b/usr/src/cmd/dladm/dladm.xcl
index b849b22f79..09192c7f4d 100644
--- a/usr/src/cmd/dladm/dladm.xcl
+++ b/usr/src/cmd/dladm/dladm.xcl
@@ -21,244 +21,343 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
#
-msgid " %-9s\t%s"
-msgid " %s"
-msgid " Total"
-msgid " address-type=%s\n"
-msgid " address=%s"
-msgid " device=%s address=%s"
-msgid " duplex=%s"
-msgid " duplex=%s\n"
-msgid " lacp-mode=%s"
-msgid " lacp-timer=%s\n"
-msgid " link=%s"
-msgid " policy=%s"
-msgid " port=%s"
-msgid " speed=%u"
+
msgid ""
-msgid "%%ipkts %%opkts\n"
-msgid "%-*s "
+msgid "\t%-10llu"
+msgid "\t%-6.1f"
+msgid "\t-"
+msgid "\tipackets rbytes opackets obytes "
+msgid "\n"
+msgid " "
+msgid " "
+msgid " %-18s"
+msgid " MACADDRESS"
+msgid " %-18s"
+msgid " MACADDRTYPE"
+msgid " dev=%s"
+msgid " mac_addr=%s"
+msgid " speed=%u"
+msgid " vid=%d\n"
+msgid "%%ipkts %%opkts\n"
msgid "%-*s"
msgid "%-10llu"
msgid "%-12llu"
msgid "%-12llu\n"
-msgid "%-14s "
-msgid "%-15s "
-msgid "%-15s %-14s %-14s %-30s \n"
-msgid "%-20s %-20s "
-msgid "%-30s "
-msgid "%-30s"
-msgid "%-8u"
-msgid "%-8u\n"
-msgid "%s type=%s mtu=%d device=%s\n"
-msgid "%s type=%s mtu=%d key=%u\n"
-msgid "%s type=legacy mtu=%d device=%s\n"
+msgid "%-12s"
+msgid "%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n"
+msgid "%-12s%-12s"
+msgid "%-12s%-12s%10s%-20s%-19s%-6s\n"
+msgid "%-12s%8d %-12s%-20s %6d\n"
+msgid "%-12s%8s %-12s%-20s %6s\n"
+msgid "%-6.1f"
+msgid "%-6d\n"
+msgid "%-8llu"
+msgid "%-8llu\n"
+msgid "%5u Mbps"
+msgid "%c----"
+msgid "%d"
+msgid "%d%c-%c"
+msgid "%llu"
msgid "%s"
+msgid "%s\n"
+msgid "%s "
msgid "%s,"
msgid "%s: "
-msgid "%s=\"%s\" "
msgid "%s=\"%s\""
-msgid "%s\n"
+msgid "%sfdx"
+msgid "%shdx"
+msgid "%u"
+msgid "%uMb"
msgid ","
-msgid "--"
-msgid "--,"
-msgid "/dev/%s"
+msgid "-R"
+msgid "-f"
+msgid "-fh"
+msgid "-h"
+msgid "/"
+msgid "/%s"
+msgid "/%s/%s"
+msgid "/sbin/dladm "
msgid "0x"
-msgid "0x%-30s"
-msgid ": %s (%s)\n"
+msgid "100M"
+msgid "10M"
+msgid "1G"
msgid ": %s\n"
-msgid ":Lpsi:"
+msgid ":L:l:P:R:tu:T:"
+msgid ":LpPxsi:o:"
+msgid ":R:"
msgid ":R:t"
msgid ":a"
-msgid ":d:R:t"
-msgid ":d:l:P:R:tu:T:"
+msgid ":d:l:L:P:R:tfu:T:"
+msgid ":d:l:R:t"
+msgid ":d:l:R:tf"
msgid ":e:i:a:m:b:s:k:T:c"
msgid ":f:c:R:t"
-msgid ":l:P:R:tu:T:"
msgid ":o:p"
msgid ":p:R:t"
-msgid ":p:cP"
-msgid ":pPd"
-msgid ":psi:"
+msgid ":p:cPo:"
+msgid ":pPo:"
+msgid ":pPo:m"
+msgid ":pPsSi:o:"
+msgid ":psi:o:"
+msgid ":tl:v:p:"
msgid "?"
-msgid "100M"
+msgid "ADDRESS"
+msgid "ADDRPOLICY"
msgid "ADT_dladm_create_secobj"
msgid "ADT_dladm_delete_secobj"
+msgid "AGGREGATABLE"
msgid "AUTH"
msgid "AUTO"
msgid "BSSID/IBSSID"
msgid "BSSTYPE"
msgid "CLASS"
+msgid "CLIENT"
+msgid "COLL"
msgid "DEFAULT"
+msgid "DEFAULTED"
+msgid "DEVICE"
+msgid "DIST"
msgid "DUPLEX"
msgid "ESSID"
+msgid "EXPIRED"
+msgid "FLAGS"
+msgid "IERRORS"
+msgid "INUSE"
+msgid "IPACKETS"
+msgid "IPKTDIST"
+msgid "LACPACTIVITY"
+msgid "LACPTIMER"
msgid "LINK"
+msgid "LINK\n"
+msgid "LINKID"
+msgid "MEDIA"
msgid "MODE"
+msgid "MTU"
msgid "Mb"
+msgid "NAME"
msgid "OBJECT"
-msgid "OBJECT=\"%s\" CLASS=\"%s\" "
+msgid "OBYTES"
+msgid "OERRORS"
+msgid "OPACKETS"
+msgid "OPKTDIST"
+msgid "OVER"
msgid "PAUSE"
+msgid "POLICY"
+msgid "PORT"
+msgid "PORTSTATE"
msgid "POSSIBLE"
msgid "PROPERTY"
-msgid "PROPERTY=\"%s\" "
+msgid "PTYPE"
+msgid "RBYTES"
msgid "REM_FAULT"
msgid "SEC"
+msgid "SLOT"
msgid "SPEED"
msgid "SPEED-DUPLEX"
+msgid "STATE"
msgid "STATUS"
msgid "STRENGTH"
+msgid "SYNC"
+msgid "Total"
msgid "VALUE"
-msgid "VALUE=\"0x%s\""
-msgid "\n"
-msgid "\t %5uMb"
-msgid "\t%-10llu"
-msgid "\t%-6.1f"
-msgid "\t%s"
-msgid "\t%s\n"
-msgid "\t-"
-msgid "\t\t%-10llu"
-msgid "\t\tipackets rbytes ierrors "
-msgid "\tipackets rbytes opackets obytes "
-msgid "active"
+msgid "VID"
+msgid "a+"
+msgid "add-aggr"
+msgid "address"
+msgid "addrpolicy"
+msgid "adt_alloc_event (%s): %s"
+msgid "adt_start_session: %s"
+msgid "adv"
msgid "adv_cap_10"
msgid "adv_cap_100"
msgid "adv_cap_1000"
msgid "adv_cap_asmpause"
msgid "adv_cap_autoneg"
msgid "adv_cap_pause"
-msgid "add-aggr"
-msgid "adt_alloc_event (%s): %s"
-msgid "adt_start_session: %s"
-msgid "aggr key=%d"
-msgid "aggr"
+msgid "adv_rem_fault"
+msgid "aggr%d"
+msgid "aggregatable"
msgid "all"
msgid "all-links"
-msgid "attached"
msgid "auth"
msgid "auto"
+msgid "bi"
msgid "bssid"
msgid "bsstype"
-msgid "cap_pause"
+msgid "bw-limit"
msgid "cap_10"
+msgid "cap_100"
msgid "cap_1000"
+msgid "cap_asmpause"
msgid "cap_autoneg"
+msgid "cap_pause"
+msgid "cap_rem_fault"
msgid "capable"
+msgid "class"
+msgid "client"
+msgid "coll"
msgid "connect-wifi"
+msgid "continuous"
+msgid "cpus"
msgid "create-aggr"
+msgid "create-etherstub"
msgid "create-ibss"
msgid "create-secobj"
msgid "create-vlan"
+msgid "create-vnic"
msgid "current"
+msgid "default"
+msgid "defaulted"
msgid "delete-aggr"
+msgid "delete-etherstub"
msgid "delete-phys"
msgid "delete-secobj"
msgid "delete-vlan"
-msgid "dev key=%d"
+msgid "delete-vnic"
msgid "dev"
+msgid "device"
msgid "disconnect-wifi"
-msgid "down"
-msgid "down-aggr"
+msgid "dist"
+msgid "down-vnic"
msgid "duplex"
msgid "essid"
+msgid "expired"
+msgid "extended"
msgid "fault"
msgid "file"
+msgid "fixed"
+msgid "fixed (%s)"
+msgid "flags"
+msgid "forcible"
msgid "forever"
-msgid "full"
-msgid "half"
msgid "ibssid"
msgid "ierrors"
msgid "ifspeed"
msgid "init-linkprop"
+msgid "init-phys"
msgid "init-secobj"
msgid "interval"
-msgid "invalid input"
-msgid "ipackets64"
+msgid "inuse"
+msgid "ipackets"
+msgid "ipktdist"
msgid "key"
msgid "lacp"
msgid "lacp-mode"
msgid "lacp-timer"
+msgid "lacpactivity"
+msgid "lacptimer"
msgid "link"
msgid "link,class,mtu,state,over"
+msgid "link,class,over"
msgid "link,device,media,flags"
msgid "link,essid,bssid,sec,strength,mode,speed"
-msgid "link,essid,bssid,sec,strength,mode,speed,auth,bsstype"
+msgid "link,essid,bssid,sec,strength,mode,speed,bsstype"
msgid "link,ipackets,rbytes,ierrors,opackets,obytes,oerrors"
msgid "link,media,state,speed,duplex,device"
-msgid "link,property,value,default,possible"
msgid "link,policy,addrpolicy,lacpactivity,lacptimer,flags"
-msigd "link,port,aggregatable,sync,coll,dist,defaulted,expired"
+msgid "link,port,aggregatable,sync,coll,dist,defaulted,expired"
msgid "link,port,ipackets,rbytes,opackets,obytes,ipktdist,opktdist"
msgid "link,port,speed,duplex,state,address,portstate"
+msgid "link,property,value,default,possible"
+msgid "link,ptype,state,auto,speed-duplex,pause"
+msgid "link,ptype,state,auto,speed-duplex,pause,rem_fault"
+msgid "link,slot,address,inuse,client"
msgid "link,state,speed,duplex"
-msgid "link,vid,over,flags"
msgid "link,status,essid,sec,strength,mode,speed"
msgid "link,status,essid,sec,strength,mode,speed,auth,bssid,bsstype"
+msgid "link,vid,over,flags"
+msgid "link=%s"
msgid "link_asmpause"
msgid "link_autoneg"
msgid "link_duplex"
msgid "link_pause"
msgid "link_state"
-msgid "long"
msgid "lp_cap_10"
msgid "lp_cap_100"
msgid "lp_cap_1000"
-msgid "lp_cap_autoneg"
msgid "lp_cap_asmpause"
+msgid "lp_cap_autoneg"
msgid "lp_cap_pause"
msgid "lp_rem_fault"
msgid "mac"
+msgid "mac-address"
+msgid "mac-prefix"
+msgid "media"
msgid "mode"
msgid "modify-aggr"
-msgid "net_rawaccess"
+msgid "mtu"
msgid "no"
-msgid "obytes64"
+msgid "none"
+msgid "o:px"
+msgid "object"
+msgid "object,class"
+msgid "object,class,value"
+msgid "obytes"
msgid "oerrors"
-msgid "opackets obytes oerrors\n"
-msgid "opackets64"
+msgid "opackets"
+msgid "opktdist"
msgid "output"
+msgid "over"
msgid "parseable"
-msgid "passive"
msgid "pause"
+msgid "pd:si:"
msgid "peeradv"
msgid "persistent"
msgid "policy"
+msgid "port"
+msgid "portstate"
+msgid "possible"
+msgid "primary"
msgid "prop"
+msgid "property"
+msgid "ptype"
msgid "r"
-msgid "rbytes64"
+msgid "random"
+msgid "rbytes"
msgid "rem_fault"
msgid "remove-aggr"
msgid "rename-link"
+msgid "reset"
msgid "reset-linkprop"
msgid "root-dir"
msgid "scan-wifi"
msgid "sec"
+msgid "set"
msgid "set-linkprop"
-msgid "short"
msgid "show-aggr"
msgid "show-dev"
+msgid "show-ether"
+msgid "show-etherstub"
msgid "show-link"
+msgid "show-linkmap"
msgid "show-linkprop"
msgid "show-phys"
msgid "show-secobj"
-msgid "show-wifi"
+msgid "show-usage"
msgid "show-vlan"
-msgid "show-ether"
-msgid "solaris.network.link.security"
+msgid "show-vnic"
+msgid "show-wifi"
+msgid "slot"
msgid "speed"
msgid "speed-duplex"
-msgid "standby"
+msgid "state"
msgid "statistics"
msgid "status"
msgid "strength"
-msgid "sys_net_config"
+msgid "sync"
+msgid "tR:"
+msgid "tR:d:m:n:p:r:v:"
+msgid "tdps:e:f:"
msgid "temporary"
+msgid "timeout"
+msgid "tx"
msgid "unicast"
msgid "unknown"
-msgid "up"
msgid "up-aggr"
msgid "up-vlan"
+msgid "up-vnic"
+msgid "value"
+msgid "vid"
msgid "vlan-id"
-msgid "wep"
msgid "yes"
diff --git a/usr/src/cmd/dladm/vnic.conf b/usr/src/cmd/dladm/vnic.conf
new file mode 100644
index 0000000000..d156a65ec1
--- /dev/null
+++ b/usr/src/cmd/dladm/vnic.conf
@@ -0,0 +1,29 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#
+# DO NOT EDIT OR PARSE THIS FILE!
+#
+# Use the dladm(1m) command to change the contents of this file.
+
diff --git a/usr/src/cmd/flowadm/Makefile b/usr/src/cmd/flowadm/Makefile
new file mode 100644
index 0000000000..b6af8b2b79
--- /dev/null
+++ b/usr/src/cmd/flowadm/Makefile
@@ -0,0 +1,76 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+PROG=flowadm
+
+ROOTFS_PROG= $(PROG)
+
+POFILE= $(PROG).po
+CONFIGFILES= flowadm.conf flowprop.conf
+
+include ../Makefile.cmd
+
+XGETFLAGS += -a -x $(PROG).xcl
+LDLIBS += -L$(ROOT)/lib
+LDLIBS += -ldladm -lkstat
+
+ROOTCFGDIR= $(ROOTETC)/dladm
+ROOTCFGFILES= $(CONFIGFILES:%=$(ROOTCFGDIR)/%)
+
+$(ROOTCFGFILES):= FILEMODE= 644
+$(ROOTCFGFILES):= OWNER= dladm
+$(ROOTCFGFILES):= GROUP= sys
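+
+# The conditional macros above install the configuration files under
+# $(ROOTETC)/dladm with mode 644, owned by user dladm, group sys.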
+
+.KEEP_STATE:
+
+all: $(ROOTFS_PROG)
+
+#
+# Message catalog
+#
+_msg: $(POFILE)
+
+$(POFILE): $(PROG).c
+ $(RM) $@
+ $(COMPILE.cpp) $(PROG).c > $(POFILE).i
+ $(XGETTEXT) $(XGETFLAGS) $(POFILE).i
+ sed "/^domain/d" messages.po > $@
+ $(RM) messages.po $(POFILE).i
+
+install: all $(ROOTSBINPROG) $(ROOTCFGDIR) $(ROOTCFGFILES)
+ $(RM) $(ROOTUSRSBINPROG)
+ -$(SYMLINK) ../../sbin/$(PROG) $(ROOTUSRSBINPROG)
+
+clean:
+
+lint: lint_PROG
+
+$(ROOTCFGDIR):
+ $(INS.dir)
+
+$(ROOTCFGDIR)/%: $(ROOTCFGDIR) %
+ $(INS.file)
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/flowadm/flowadm.c b/usr/src/cmd/flowadm/flowadm.c
new file mode 100644
index 0000000000..f4c3859172
--- /dev/null
+++ b/usr/src/cmd/flowadm/flowadm.c
@@ -0,0 +1,1963 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stropts.h>
+#include <errno.h>
+#include <kstat.h>
+#include <strings.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <priv.h>
+#include <netdb.h>
+#include <libintl.h>
+#include <libdlflow.h>
+#include <libdllink.h>
+#include <libdlstat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/ethernet.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <stddef.h>
+
+#define CMD_TYPE_ANY 0xffffffff
+#define STR_UNDEF_VAL "--"
+
+
+/*
+ * data structures and routines for printing output.
+ */
+
+typedef struct print_field_s {
+ const char *pf_name;
+ const char *pf_header;
+ uint_t pf_width;
+ union {
+ uint_t _pf_index;
+ size_t _pf_offset;
+ }_pf_un;
+#define pf_index _pf_un._pf_index
+#define pf_offset _pf_un._pf_offset
+ uint_t pf_cmdtype;
+} print_field_t;
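+
+/*
+ * The field tables below locate their output in one of two ways: the
+ * flow and usage tables record a byte offset into a fields buffer
+ * (pf_offset), while the flowprop table records an enum index
+ * (pf_index) that flowprop_callback() switches on.
+ */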
+
+typedef struct print_state_s {
+ print_field_t **ps_fields;
+ uint_t ps_nfields;
+ boolean_t ps_lastfield;
+ uint_t ps_overflow;
+} print_state_t;
+
+typedef struct show_usage_state_s {
+ boolean_t us_plot;
+ boolean_t us_parseable;
+ boolean_t us_printheader;
+ boolean_t us_first;
+ print_state_t us_print;
+} show_usage_state_t;
+
+typedef char *(*print_callback_t)(print_field_t *, void *);
+static print_field_t **parse_output_fields(char *, print_field_t *, int,
+ uint_t, uint_t *);
+
+static void print_header(print_state_t *);
+static void print_field(print_state_t *, print_field_t *, const char *,
+ boolean_t);
+
+static void flowadm_print_output(print_state_t *, boolean_t,
+ print_callback_t, void *);
+
+/*
+ * Helper function that, when invoked as flowadm_print_field(pf, buf),
+ * prints the string located at offset pf->pf_offset within buf.
+ */
+static char *flowadm_print_field(print_field_t *, void *);
+
+#define MAX_FIELD_LEN 32
+
+typedef void cmdfunc_t(int, char **);
+
+static cmdfunc_t do_add_flow, do_remove_flow, do_init_flow, do_show_flow;
+static cmdfunc_t do_show_flowprop, do_set_flowprop, do_reset_flowprop;
+static cmdfunc_t do_show_usage;
+
+static int show_flow(dladm_flow_attr_t *, void *);
+static int show_flows_onelink(datalink_id_t, void *);
+
+static void flow_stats(const char *, datalink_id_t, uint_t);
+static void get_flow_stats(const char *, pktsum_t *);
+static int show_flow_stats(dladm_flow_attr_t *, void *);
+static int show_link_flow_stats(datalink_id_t, void *);
+
+static int remove_flow(dladm_flow_attr_t *, void *);
+
+static int show_flowprop(dladm_flow_attr_t *, void *);
+static void show_flowprop_one_flow(void *, const char *);
+static int show_flowprop_onelink(datalink_id_t, void *);
+
+static void die(const char *, ...);
+static void die_optdup(int);
+static void die_opterr(int, int);
+static void die_dlerr(dladm_status_t, const char *, ...);
+static void warn(const char *, ...);
+static void warn_dlerr(dladm_status_t, const char *, ...);
+
+typedef struct cmd {
+ char *c_name;
+ void (*c_fn)(int, char **);
+} cmd_t;
+
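+/*
+ * Subcommand dispatch table; main() matches argv[1] against c_name and
+ * invokes the corresponding c_fn with the remaining arguments.
+ */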
+static cmd_t cmds[] = {
+ { "add-flow", do_add_flow },
+ { "remove-flow", do_remove_flow },
+ { "show-flowprop", do_show_flowprop },
+ { "set-flowprop", do_set_flowprop },
+ { "reset-flowprop", do_reset_flowprop },
+ { "show-flow", do_show_flow },
+ { "init-flow", do_init_flow },
+ { "show-usage", do_show_usage }
+};
+
+static const struct option longopts[] = {
+ {"link", required_argument, 0, 'l'},
+ {"parseable", no_argument, 0, 'p'},
+ {"statistics", no_argument, 0, 's'},
+ {"interval", required_argument, 0, 'i'},
+ {"temporary", no_argument, 0, 't'},
+ {"root-dir", required_argument, 0, 'R'},
+ { 0, 0, 0, 0 }
+};
+
+static const struct option prop_longopts[] = {
+ {"link", required_argument, 0, 'l'},
+ {"temporary", no_argument, 0, 't'},
+ {"root-dir", required_argument, 0, 'R'},
+ {"prop", required_argument, 0, 'p'},
+ {"attr", required_argument, 0, 'a'},
+ { 0, 0, 0, 0 }
+};
+
+/*
+ * structures for 'flowadm show-flow'
+ */
+
+typedef struct show_flow_state {
+ boolean_t fs_firstonly;
+ boolean_t fs_donefirst;
+ pktsum_t fs_prevstats;
+ uint32_t fs_flags;
+ dladm_status_t fs_status;
+ print_state_t fs_print;
+ const char *fs_flow;
+ const char *fs_link;
+ boolean_t fs_parseable;
+ boolean_t fs_printheader;
+ boolean_t fs_persist;
+ boolean_t fs_stats;
+ uint64_t fs_mask;
+} show_flow_state_t;
+
+/*
+ * structures for 'flowadm remove-flow'
+ */
+
+typedef struct remove_flow_state {
+ boolean_t fs_tempop;
+ const char *fs_altroot;
+ dladm_status_t fs_status;
+} remove_flow_state_t;
+
+typedef struct flow_args_s {
+ const char *fa_link;
+ int fa_attrno; /* -1 indicates flow itself */
+ uint64_t fa_mask;
+ dladm_flow_attr_t *fa_finfop;
+ dladm_status_t *fa_status;
+ boolean_t fa_parseable;
+} flow_args_t;
+
+#define PROTO_MAXSTR_LEN 7
+#define PORT_MAXSTR_LEN 6
+#define DSFIELD_MAXSTR_LEN 10
+
+typedef struct flow_fields_buf_s
+{
+ char flow_name[MAXNAMELEN];
+ char flow_link[MAXLINKNAMELEN];
+ char flow_ipaddr[INET6_ADDRSTRLEN+4];
+ char flow_proto[PROTO_MAXSTR_LEN];
+ char flow_port[PORT_MAXSTR_LEN];
+ char flow_dsfield[DSFIELD_MAXSTR_LEN];
+} flow_fields_buf_t;
+
+static print_field_t flow_fields[] = {
+/* name, header, field width, index, cmdtype */
+{ "flow", "FLOW", 11,
+ offsetof(flow_fields_buf_t, flow_name), CMD_TYPE_ANY},
+{ "link", "LINK", 11,
+ offsetof(flow_fields_buf_t, flow_link), CMD_TYPE_ANY},
+{ "ipaddr", "IP ADDR", 30,
+ offsetof(flow_fields_buf_t, flow_ipaddr), CMD_TYPE_ANY},
+{ "transport", "PROTO", 6,
+ offsetof(flow_fields_buf_t, flow_proto), CMD_TYPE_ANY},
+{ "port", "PORT", 7,
+ offsetof(flow_fields_buf_t, flow_port), CMD_TYPE_ANY},
+{ "dsfield", "DSFLD", 9,
+    offsetof(flow_fields_buf_t, flow_dsfield), CMD_TYPE_ANY}};
+
+#define FLOW_MAX_FIELDS (sizeof (flow_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'flowadm show-flowprop'
+ */
+typedef enum {
+ FLOWPROP_FLOW,
+ FLOWPROP_PROPERTY,
+ FLOWPROP_VALUE,
+ FLOWPROP_DEFAULT,
+ FLOWPROP_POSSIBLE
+} flowprop_field_index_t;
+
+static print_field_t flowprop_fields[] = {
+/* name, header, fieldwidth, index, cmdtype */
+{ "flow", "FLOW", 12, FLOWPROP_FLOW, CMD_TYPE_ANY},
+{ "property", "PROPERTY", 15, FLOWPROP_PROPERTY, CMD_TYPE_ANY},
+{ "value", "VALUE", 14, FLOWPROP_VALUE, CMD_TYPE_ANY},
+{ "default", "DEFAULT", 14, FLOWPROP_DEFAULT, CMD_TYPE_ANY},
+{ "possible", "POSSIBLE", 20, FLOWPROP_POSSIBLE, CMD_TYPE_ANY}}
+;
+#define FLOWPROP_MAX_FIELDS \
+ (sizeof (flowprop_fields) / sizeof (print_field_t))
+
+#define MAX_PROP_LINE 512
+
+typedef struct show_flowprop_state {
+ const char *fs_flow;
+ datalink_id_t fs_linkid;
+ char *fs_line;
+ char **fs_propvals;
+ dladm_arg_list_t *fs_proplist;
+ boolean_t fs_parseable;
+ boolean_t fs_persist;
+ boolean_t fs_header;
+ dladm_status_t fs_status;
+ dladm_status_t fs_retstatus;
+ print_state_t fs_print;
+} show_flowprop_state_t;
+
+typedef struct set_flowprop_state {
+ const char *fs_name;
+ boolean_t fs_reset;
+ boolean_t fs_temp;
+ dladm_status_t fs_status;
+} set_flowprop_state_t;
+
+typedef struct flowprop_args_s {
+ show_flowprop_state_t *fs_state;
+ char *fs_propname;
+ char *fs_flowname;
+} flowprop_args_t;
+
+/*
+ * structures for 'flow show-usage'
+ */
+
+typedef struct usage_fields_buf_s {
+ char usage_flow[12];
+ char usage_duration[10];
+ char usage_ipackets[9];
+ char usage_rbytes[10];
+ char usage_opackets[9];
+ char usage_obytes[10];
+ char usage_bandwidth[14];
+} usage_fields_buf_t;
+
+static print_field_t usage_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "flow", "FLOW", 12,
+ offsetof(usage_fields_buf_t, usage_flow), CMD_TYPE_ANY},
+{ "duration", "DURATION", 10,
+ offsetof(usage_fields_buf_t, usage_duration), CMD_TYPE_ANY},
+{ "ipackets", "IPACKETS", 9,
+ offsetof(usage_fields_buf_t, usage_ipackets), CMD_TYPE_ANY},
+{ "rbytes", "RBYTES", 10,
+ offsetof(usage_fields_buf_t, usage_rbytes), CMD_TYPE_ANY},
+{ "opackets", "OPACKETS", 9,
+ offsetof(usage_fields_buf_t, usage_opackets), CMD_TYPE_ANY},
+{ "obytes", "OBYTES", 10,
+ offsetof(usage_fields_buf_t, usage_obytes), CMD_TYPE_ANY},
+{ "bandwidth", "BANDWIDTH", 14,
+    offsetof(usage_fields_buf_t, usage_bandwidth), CMD_TYPE_ANY}};
+
+#define USAGE_MAX_FIELDS (sizeof (usage_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-usage link'
+ */
+
+typedef struct usage_l_fields_buf_s {
+ char usage_l_flow[12];
+ char usage_l_stime[13];
+ char usage_l_etime[13];
+ char usage_l_rbytes[8];
+ char usage_l_obytes[8];
+ char usage_l_bandwidth[14];
+} usage_l_fields_buf_t;
+
+static print_field_t usage_l_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "flow", "FLOW", 12,
+ offsetof(usage_l_fields_buf_t, usage_l_flow), CMD_TYPE_ANY},
+{ "start", "START", 13,
+ offsetof(usage_l_fields_buf_t, usage_l_stime), CMD_TYPE_ANY},
+{ "end", "END", 13,
+ offsetof(usage_l_fields_buf_t, usage_l_etime), CMD_TYPE_ANY},
+{ "rbytes", "RBYTES", 8,
+ offsetof(usage_l_fields_buf_t, usage_l_rbytes), CMD_TYPE_ANY},
+{ "obytes", "OBYTES", 8,
+ offsetof(usage_l_fields_buf_t, usage_l_obytes), CMD_TYPE_ANY},
+{ "bandwidth", "BANDWIDTH", 14,
+    offsetof(usage_l_fields_buf_t, usage_l_bandwidth), CMD_TYPE_ANY}};
+
+#define USAGE_L_MAX_FIELDS \
+ (sizeof (usage_l_fields) /sizeof (print_field_t))
+
+#define PRI_HI 100
+#define PRI_LO 10
+#define PRI_NORM 50
+
+#define FLOWADM_CONF "/etc/dladm/flowadm.conf"
+#define BLANK_LINE(s) ((s[0] == '\0') || (s[0] == '#') || (s[0] == '\n'))
+
+static char *progname;
+
+boolean_t t_arg = B_FALSE; /* changes are persistent */
+char *altroot = NULL;
+
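+/*
+ * attr_table lists the valid -a attribute names; match_attr() checks
+ * candidates against it so that a trailing attribute is not mistaken
+ * for the flow name in do_add_flow().
+ */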
+static const char *attr_table[] =
+ {"local_ip", "remote_ip", "transport", "local_port", "dsfield"};
+
+#define NATTR (sizeof (attr_table)/sizeof (char *))
+
+static void
+usage(void)
+{
+ (void) fprintf(stderr, gettext("usage: flowadm <subcommand>"
+ " <args>...\n"
+ "\tadd-flow [-t] [-R <root-dir>] -l <link>\n"
+ "\t\t-a attr=value[,...] [-p prop=value,...]\n"
+ "\t\tflow-name\n"
+ "\tremove-flow [-t] [-R <root-dir>] {-l <link> | flow-name}\n"
+ "\tset-flowprop [-t] [-R <root-dir>] \n"
+ "\t\t-p prop=value[,...] flowname\n"
+ "\treset-flowprop [-t] [-R <root-dir>] \n"
+ "\t\t[-p prop,...] flowname\n"
+ "\tshow-flowprop [-cP] [-l <link>] [-p prop,...] [flow-name]\n"
+ "\tshow-flow [-p] [-s [-i <interval>]] [-l <link>] [flow-name]\n"
+ "\tshow-usage [-d|-p -F <format>] [-s <DD/MM/YYYY,HH:MM:SS>]\n"
+ "\t\t[-e <DD/MM/YYYY,HH:MM:SS>]] -f <logfile> [<name>]\n"));
+ exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ int i, arglen, cmdlen;
+ cmd_t *cmdp;
+
+ (void) setlocale(LC_ALL, "");
+#if !defined(TEXT_DOMAIN)
+#define TEXT_DOMAIN "SYS_TEST"
+#endif
+ (void) textdomain(TEXT_DOMAIN);
+
+ progname = argv[0];
+
+ if (argc < 2)
+ usage();
+
+ for (i = 0; i < sizeof (cmds) / sizeof (cmds[0]); i++) {
+ cmdp = &cmds[i];
+ arglen = strlen(argv[1]);
+ cmdlen = strlen(cmdp->c_name);
+ if ((arglen == cmdlen) && (strncmp(argv[1], cmdp->c_name,
+ cmdlen) == 0)) {
+ cmdp->c_fn(argc - 1, &argv[1]);
+ exit(0);
+ }
+ }
+
+ (void) fprintf(stderr, gettext("%s: unknown subcommand '%s'\n"),
+ progname, argv[1]);
+ usage();
+
+ return (0);
+}
+
+static const char *
+match_attr(char *attr)
+{
+ int i;
+
+ for (i = 0; i < NATTR; i++) {
+ if (strlen(attr) == strlen(attr_table[i]) &&
+ strncmp(attr, attr_table[i], strlen(attr_table[i])) == 0) {
+ return (attr);
+ }
+ }
+ return (NULL);
+}
+
+/* ARGSUSED */
+static void
+do_init_flow(int argc, char *argv[])
+{
+ dladm_status_t status;
+
+ status = dladm_flow_init();
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "flows initialization failed");
+}
+
+/* ARGSUSED */
+static int
+show_usage_date(dladm_usage_t *usage, void *arg)
+{
+ time_t stime;
+ char timebuf[20];
+
+ stime = usage->du_stime;
+ (void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y",
+ localtime(&stime));
+ (void) printf("%s\n", timebuf);
+
+ return (DLADM_STATUS_OK);
+}
+
+static int
+show_usage_time(dladm_usage_t *usage, void *arg)
+{
+ show_usage_state_t *state = (show_usage_state_t *)arg;
+ char buf[DLADM_STRSIZE];
+ usage_l_fields_buf_t ubuf;
+ time_t time;
+ double bw;
+
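+	/*
+	 * In plot mode (-p) the output is gnuplot-friendly: a single
+	 * "# Time" header line naming every flow, followed by one row
+	 * per sample carrying each flow's bandwidth.
+	 */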
+ if (state->us_plot) {
+ if (!state->us_printheader) {
+ if (state->us_first) {
+ (void) printf("# Time");
+ state->us_first = B_FALSE;
+ }
+ (void) printf(" %s", usage->du_name);
+ if (usage->du_last) {
+ (void) printf("\n");
+ state->us_first = B_TRUE;
+ state->us_printheader = B_TRUE;
+ }
+ } else {
+ if (state->us_first) {
+ time = usage->du_etime;
+ (void) strftime(buf, sizeof (buf), "%T",
+ localtime(&time));
+ state->us_first = B_FALSE;
+ (void) printf("%s", buf);
+ }
+ bw = (double)usage->du_bandwidth/1000;
+ (void) printf(" %.2f", bw);
+ if (usage->du_last) {
+ (void) printf("\n");
+ state->us_first = B_TRUE;
+ }
+ }
+ return (DLADM_STATUS_OK);
+ }
+
+ bzero(&ubuf, sizeof (ubuf));
+
+ (void) snprintf(ubuf.usage_l_flow, sizeof (ubuf.usage_l_flow), "%s",
+ usage->du_name);
+ time = usage->du_stime;
+ (void) strftime(buf, sizeof (buf), "%T", localtime(&time));
+ (void) snprintf(ubuf.usage_l_stime, sizeof (ubuf.usage_l_stime), "%s",
+ buf);
+ time = usage->du_etime;
+ (void) strftime(buf, sizeof (buf), "%T", localtime(&time));
+ (void) snprintf(ubuf.usage_l_etime, sizeof (ubuf.usage_l_etime), "%s",
+ buf);
+ (void) snprintf(ubuf.usage_l_rbytes, sizeof (ubuf.usage_l_rbytes),
+ "%llu", usage->du_rbytes);
+ (void) snprintf(ubuf.usage_l_obytes, sizeof (ubuf.usage_l_obytes),
+ "%llu", usage->du_obytes);
+ (void) snprintf(ubuf.usage_l_bandwidth, sizeof (ubuf.usage_l_bandwidth),
+ "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf));
+
+ if (!state->us_parseable && !state->us_printheader) {
+ print_header(&state->us_print);
+ state->us_printheader = B_TRUE;
+ }
+
+ flowadm_print_output(&state->us_print, state->us_parseable,
+ flowadm_print_field, (void *)&ubuf);
+
+ return (DLADM_STATUS_OK);
+}
+
+static int
+show_usage_res(dladm_usage_t *usage, void *arg)
+{
+ show_usage_state_t *state = (show_usage_state_t *)arg;
+ char buf[DLADM_STRSIZE];
+ usage_fields_buf_t ubuf;
+
+ bzero(&ubuf, sizeof (ubuf));
+
+ (void) snprintf(ubuf.usage_flow, sizeof (ubuf.usage_flow), "%s",
+ usage->du_name);
+ (void) snprintf(ubuf.usage_duration, sizeof (ubuf.usage_duration),
+ "%llu", usage->du_duration);
+ (void) snprintf(ubuf.usage_ipackets, sizeof (ubuf.usage_ipackets),
+ "%llu", usage->du_ipackets);
+ (void) snprintf(ubuf.usage_rbytes, sizeof (ubuf.usage_rbytes),
+ "%llu", usage->du_rbytes);
+ (void) snprintf(ubuf.usage_opackets, sizeof (ubuf.usage_opackets),
+ "%llu", usage->du_opackets);
+ (void) snprintf(ubuf.usage_obytes, sizeof (ubuf.usage_obytes),
+ "%llu", usage->du_obytes);
+ (void) snprintf(ubuf.usage_bandwidth, sizeof (ubuf.usage_bandwidth),
+ "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf));
+
+ if (!state->us_parseable && !state->us_printheader) {
+ print_header(&state->us_print);
+ state->us_printheader = B_TRUE;
+ }
+
+ flowadm_print_output(&state->us_print, state->us_parseable,
+ flowadm_print_field, (void *)&ubuf);
+
+ return (DLADM_STATUS_OK);
+}
+
+static boolean_t
+valid_formatspec(char *formatspec_str)
+{
+ if (strcmp(formatspec_str, "gnuplot") == 0)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/* ARGSUSED */
+static void
+do_show_usage(int argc, char *argv[])
+{
+ char *file = NULL;
+ int opt;
+ dladm_status_t status;
+ boolean_t d_arg = B_FALSE;
+ boolean_t p_arg = B_FALSE;
+ char *stime = NULL;
+ char *etime = NULL;
+ char *resource = NULL;
+ show_usage_state_t state;
+ boolean_t o_arg = B_FALSE;
+ boolean_t F_arg = B_FALSE;
+ char *fields_str = NULL;
+ char *formatspec_str = NULL;
+ print_field_t **fields;
+ uint_t nfields;
+ char *all_fields =
+ "flow,duration,ipackets,rbytes,opackets,obytes,bandwidth";
+ char *all_l_fields =
+ "flow,start,end,rbytes,obytes,bandwidth";
+
+ bzero(&state, sizeof (show_usage_state_t));
+ state.us_parseable = B_FALSE;
+ state.us_printheader = B_FALSE;
+ state.us_plot = B_FALSE;
+ state.us_first = B_TRUE;
+
+ while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) {
+ switch (opt) {
+ case 'd':
+ d_arg = B_TRUE;
+ break;
+ case 'p':
+ state.us_plot = p_arg = B_TRUE;
+ break;
+ case 'f':
+ file = optarg;
+ break;
+ case 's':
+ stime = optarg;
+ break;
+ case 'e':
+ etime = optarg;
+ break;
+ case 'o':
+ o_arg = B_TRUE;
+ fields_str = optarg;
+ break;
+ case 'F':
+ F_arg = B_TRUE;
+ formatspec_str = optarg;
+ break;
+ default:
+ die_opterr(optopt, opt);
+ }
+ }
+
+ if (file == NULL)
+ die("show-usage requires a file");
+
+ if (optind == (argc-1)) {
+ resource = argv[optind];
+ }
+
+ if (resource == NULL && stime == NULL && etime == NULL) {
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+ fields_str = all_fields;
+ fields = parse_output_fields(fields_str, usage_fields,
+ USAGE_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+ } else {
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+ fields_str = all_l_fields;
+ fields = parse_output_fields(fields_str, usage_l_fields,
+ USAGE_L_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+ }
+
+ if (fields == NULL) {
+ die("invalid fields(s) specified");
+ return;
+ }
+ state.us_print.ps_fields = fields;
+ state.us_print.ps_nfields = nfields;
+
+ if (p_arg && d_arg)
+ die("plot and date options are incompatible");
+
+ if (p_arg && !F_arg)
+ die("specify format speicifier: -F <format>");
+
+ if (F_arg && valid_formatspec(formatspec_str) == B_FALSE)
+ die("Format specifier %s not supported", formatspec_str);
+
+ if (d_arg) {
+ /* Print log dates */
+ status = dladm_usage_dates(show_usage_date,
+ DLADM_LOGTYPE_FLOW, file, resource, &state);
+ } else if (resource == NULL && stime == NULL && etime == NULL &&
+ !p_arg) {
+ /* Print summary */
+ status = dladm_usage_summary(show_usage_res,
+ DLADM_LOGTYPE_FLOW, file, &state);
+ } else if (resource != NULL) {
+ /* Print log entries for named resource */
+ status = dladm_walk_usage_res(show_usage_time,
+ DLADM_LOGTYPE_FLOW, file, resource, stime, etime, &state);
+ } else {
+ /* Print time and information for each link */
+ status = dladm_walk_usage_time(show_usage_time,
+ DLADM_LOGTYPE_FLOW, file, stime, etime, &state);
+ }
+
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "show-usage");
+}
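+
+/*
+ * Illustrative invocations (the log file path is hypothetical):
+ *
+ *	flowadm show-usage -f /var/log/net.log		per-flow summary
+ *	flowadm show-usage -d -f /var/log/net.log	dates in the log
+ *	flowadm show-usage -p -F gnuplot -f /var/log/net.log
+ */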
+
+static void
+do_add_flow(int argc, char *argv[])
+{
+ char devname[MAXNAMELEN];
+ char *name = NULL;
+ uint_t index;
+ datalink_id_t linkid;
+
+	int		option;
+ boolean_t l_arg = B_FALSE;
+ dladm_arg_list_t *proplist = NULL;
+ dladm_arg_list_t *attrlist = NULL;
+ dladm_status_t status;
+
+ while ((option = getopt_long(argc, argv, "tR:l:a:p:",
+ prop_longopts, NULL)) != -1) {
+ switch (option) {
+ case 't':
+ t_arg = B_TRUE;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ case 'l':
+ if (strlcpy(devname, optarg,
+ MAXNAMELEN) >= MAXNAMELEN) {
+ die("link name too long");
+ }
+ if (dladm_name2info(devname, &linkid, NULL,
+ NULL, NULL) != DLADM_STATUS_OK)
+ die("invalid link '%s'", devname);
+ l_arg = B_TRUE;
+ break;
+ case 'a':
+ if (dladm_parse_flow_attrs(optarg, &attrlist, B_FALSE)
+ != DLADM_STATUS_OK)
+ die("invalid flow attribute specified");
+ break;
+ case 'p':
+ if (dladm_parse_flow_props(optarg, &proplist, B_FALSE)
+ != DLADM_STATUS_OK)
+ die("invalid flow property specified");
+ break;
+ default:
+ die_opterr(optopt, option);
+ }
+ }
+ if (!l_arg) {
+ die("link is required");
+ }
+
+ opterr = 0;
+ index = optind;
+
+ if ((index != (argc - 1)) || match_attr(argv[index]) != NULL) {
+ die("flow name is required");
+ } else {
+ /* get flow name; required last argument */
+ if (strlen(argv[index]) >= MAXFLOWNAME)
+ die("flow name too long");
+ name = argv[index];
+ }
+
+ status = dladm_flow_add(linkid, attrlist, proplist, name,
+ t_arg, altroot);
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "add flow failed");
+
+ dladm_free_attrs(attrlist);
+ dladm_free_props(proplist);
+}
+
+static void
+do_remove_flow(int argc, char *argv[])
+{
+	int		option;
+ char *flowname = NULL;
+ char linkname[MAXNAMELEN];
+ datalink_id_t linkid = DATALINK_ALL_LINKID;
+ boolean_t l_arg = B_FALSE;
+ remove_flow_state_t state;
+ dladm_status_t status;
+
+ bzero(&state, sizeof (state));
+
+ opterr = 0;
+ while ((option = getopt_long(argc, argv, ":tR:l:",
+ longopts, NULL)) != -1) {
+ switch (option) {
+ case 't':
+ t_arg = B_TRUE;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ case 'l':
+ if (strlcpy(linkname, optarg,
+ MAXLINKNAMELEN) >= MAXLINKNAMELEN) {
+ die("link name too long");
+ }
+ if (dladm_name2info(linkname, &linkid, NULL,
+ NULL, NULL) != DLADM_STATUS_OK) {
+ die("invalid link '%s'", linkname);
+ }
+ l_arg = B_TRUE;
+ break;
+ default:
+ die_opterr(optopt, option);
+ break;
+ }
+ }
+
+ /* when link not specified get flow name */
+ if (!l_arg) {
+ if (optind != (argc-1)) {
+ usage();
+ } else {
+ if (strlen(argv[optind]) >= MAXFLOWNAME)
+ die("flow name too long");
+ flowname = argv[optind];
+ }
+ status = dladm_flow_remove(flowname, t_arg, altroot);
+ } else {
+ /* if link is specified then flow name should not be there */
+ if (optind == argc-1)
+ usage();
+ /* walk the link to find flows and remove them */
+ state.fs_tempop = t_arg;
+ state.fs_altroot = altroot;
+ state.fs_status = DLADM_STATUS_OK;
+ status = dladm_walk_flow(remove_flow, linkid, &state, B_FALSE);
+ /*
+ * check if dladm_walk_flow terminated early and see if the
+		 * walker function has any status for us
+ */
+ if (status == DLADM_STATUS_OK)
+ status = state.fs_status;
+ }
+
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "remove flow failed");
+}
+
+/*
+ * Walker function for removing a flow through dladm_walk_flow();
+ */
+static int
+remove_flow(dladm_flow_attr_t *attr, void *arg)
+{
+ remove_flow_state_t *state = (remove_flow_state_t *)arg;
+
+ state->fs_status = dladm_flow_remove(attr->fa_flowname,
+ state->fs_tempop, state->fs_altroot);
+
+ if (state->fs_status == DLADM_STATUS_OK)
+ return (DLADM_WALK_CONTINUE);
+ else
+ return (DLADM_WALK_TERMINATE);
+}
+
+static char *
+flowadm_print_field(print_field_t *pf, void *arg)
+{
+ char *value;
+
+ value = (char *)arg + pf->pf_offset;
+ return (value);
+}
+
+/*ARGSUSED*/
+static dladm_status_t
+print_flow(show_flow_state_t *state, dladm_flow_attr_t *attr,
+ flow_fields_buf_t *fbuf)
+{
+ char link[MAXLINKNAMELEN];
+ dladm_status_t status;
+
+ if ((status = dladm_datalink_id2info(attr->fa_linkid, NULL, NULL,
+ NULL, link, sizeof (link))) != DLADM_STATUS_OK) {
+ return (status);
+ }
+
+ (void) snprintf(fbuf->flow_name, sizeof (fbuf->flow_name),
+ "%s", attr->fa_flowname);
+ (void) snprintf(fbuf->flow_link, sizeof (fbuf->flow_link),
+ "%s", link);
+
+ (void) dladm_flow_attr_ip2str(attr, fbuf->flow_ipaddr,
+ sizeof (fbuf->flow_ipaddr));
+ (void) dladm_flow_attr_proto2str(attr, fbuf->flow_proto,
+ sizeof (fbuf->flow_proto));
+ (void) dladm_flow_attr_port2str(attr, fbuf->flow_port,
+ sizeof (fbuf->flow_port));
+ (void) dladm_flow_attr_dsfield2str(attr, fbuf->flow_dsfield,
+ sizeof (fbuf->flow_dsfield));
+
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Walker function for showing flow attributes through dladm_walk_flow().
+ */
+static int
+show_flow(dladm_flow_attr_t *attr, void *arg)
+{
+ show_flow_state_t *statep = arg;
+ dladm_status_t status;
+ flow_fields_buf_t fbuf;
+
+ /*
+ * first get all the flow attributes into fbuf;
+ */
+ bzero(&fbuf, sizeof (fbuf));
+ status = print_flow(statep, attr, &fbuf);
+
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ if (!statep->fs_parseable && !statep->fs_printheader) {
+ print_header(&statep->fs_print);
+ statep->fs_printheader = B_TRUE;
+ }
+
+ flowadm_print_output(&statep->fs_print, statep->fs_parseable,
+ flowadm_print_field, (void *)&fbuf);
+
+done:
+ statep->fs_status = status;
+ return (DLADM_WALK_CONTINUE);
+}
+
+static void
+show_one_flow(void *arg, const char *name)
+{
+ dladm_flow_attr_t attr;
+ dladm_status_t status;
+
+ if (dladm_flow_info(name, &attr) != DLADM_STATUS_OK)
+ die("invalid flow: '%s'", name);
+ else
+ show_flow(&attr, arg);
+}
+
+/*
+ * Wrapper of dladm_walk_flow(show_flow, ...) to make it usable by
+ * dladm_walk_datalink_id(). Used for showing flow attributes for
+ * all flows on all links.
+ */
+static int
+show_flows_onelink(datalink_id_t linkid, void *arg)
+{
+ show_flow_state_t *state = arg;
+
+ (void) dladm_walk_flow(show_flow, linkid, arg, state->fs_persist);
+
+ return (DLADM_WALK_CONTINUE);
+}
+
+static void
+get_flow_stats(const char *flowname, pktsum_t *stats)
+{
+ kstat_ctl_t *kcp;
+ kstat_t *ksp;
+
+ bzero(stats, sizeof (*stats));
+
+ if ((kcp = kstat_open()) == NULL) {
+ warn("kstat open operation failed");
+ return;
+ }
+
+ ksp = dladm_kstat_lookup(kcp, NULL, -1, flowname, "flow");
+
+ if (ksp != NULL)
+ dladm_get_stats(kcp, ksp, stats);
+
+ (void) kstat_close(kcp);
+}
+
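+/*
+ * Print one line of counters for a flow.  When stats are sampled
+ * repeatedly (-s -i), fs_prevstats holds the previous sample, so each
+ * line reports per-interval deltas rather than running totals.
+ */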
+/* ARGSUSED */
+static int
+show_flow_stats(dladm_flow_attr_t *attr, void *arg)
+{
+ show_flow_state_t *state = (show_flow_state_t *)arg;
+ const char *name = attr->fa_flowname;
+ pktsum_t stats, diff_stats;
+
+ if (state->fs_firstonly) {
+ if (state->fs_donefirst)
+ return (DLADM_WALK_TERMINATE);
+ state->fs_donefirst = B_TRUE;
+ } else {
+ bzero(&state->fs_prevstats, sizeof (state->fs_prevstats));
+ }
+
+ get_flow_stats(name, &stats);
+ dladm_stats_diff(&diff_stats, &stats, &state->fs_prevstats);
+
+ (void) printf("%-12s", name);
+ (void) printf("%-10llu", diff_stats.ipackets);
+ (void) printf("%-12llu", diff_stats.rbytes);
+ (void) printf("%-8llu", diff_stats.ierrors);
+ (void) printf("%-10llu", diff_stats.opackets);
+ (void) printf("%-12llu", diff_stats.obytes);
+ (void) printf("%-8llu\n", diff_stats.oerrors);
+
+ state->fs_prevstats = stats;
+
+ return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * Wrapper of dladm_walk_flow(show_flow_stats, ...) to make it usable
+ * by dladm_walk_datalink_id(). Used for showing flow stats for
+ * all flows on all links.
+ */
+static int
+show_link_flow_stats(datalink_id_t linkid, void *arg)
+{
+ if (dladm_walk_flow(show_flow_stats, linkid, arg, B_FALSE)
+ == DLADM_STATUS_OK)
+ return (DLADM_WALK_CONTINUE);
+ else
+ return (DLADM_WALK_TERMINATE);
+}
+
+/* ARGSUSED */
+static void
+flow_stats(const char *flow, datalink_id_t linkid, uint_t interval)
+{
+ show_flow_state_t state;
+ dladm_flow_attr_t attr;
+
+ if (flow != NULL && dladm_flow_info(flow, &attr) != DLADM_STATUS_OK)
+ die("invalid flow %s", flow);
+
+ bzero(&state, sizeof (state));
+
+ /*
+ * If an interval is specified, continuously show the stats
+ * for only the first flow.
+ */
+ state.fs_firstonly = (interval != 0);
+
+ for (;;) {
+ if (!state.fs_donefirst)
+ (void) printf("%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n",
+ "FLOW", "IPACKETS", "RBYTES", "IERRORS",
+ "OPACKETS", "OBYTES", "OERRORS");
+
+ state.fs_donefirst = B_FALSE;
+
+ /* Show stats for named flow */
+ if (flow != NULL) {
+ state.fs_flow = flow;
+ (void) show_flow_stats(&attr, &state);
+
+ /* Show all stats on a link */
+ } else if (linkid != DATALINK_INVALID_LINKID) {
+ (void) dladm_walk_flow(show_flow_stats, linkid, &state,
+ B_FALSE);
+
+ /* Show all stats by datalink */
+ } else {
+ (void) dladm_walk_datalink_id(show_link_flow_stats,
+ &state, DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE,
+ DLADM_OPT_ACTIVE);
+ }
+
+ if (interval == 0)
+ break;
+
+ (void) sleep(interval);
+ }
+}
+
+static void
+do_show_flow(int argc, char *argv[])
+{
+ char flowname[MAXFLOWNAME];
+ char linkname[MAXNAMELEN];
+ datalink_id_t linkid = DATALINK_ALL_LINKID;
+ int option;
+ boolean_t s_arg = B_FALSE;
+ boolean_t S_arg = B_FALSE;
+ boolean_t i_arg = B_FALSE;
+ boolean_t l_arg = B_FALSE;
+ boolean_t o_arg = B_FALSE;
+ uint32_t interval = 0;
+ char *endp = NULL;
+ show_flow_state_t state;
+ char *fields_str = NULL;
+ print_field_t **fields;
+ uint_t nfields;
+ char *all_fields =
+ "flow,link,ipaddr,transport,port,dsfield";
+ dladm_status_t status;
+
+ bzero(&state, sizeof (state));
+
+ opterr = 0;
+ while ((option = getopt_long(argc, argv, ":pPsSi:l:o:",
+ longopts, NULL)) != -1) {
+ switch (option) {
+ case 'p':
+ state.fs_parseable = B_TRUE;
+ break;
+ case 'P':
+ state.fs_persist = B_TRUE;
+ break;
+ case 's':
+ if (s_arg)
+ die_optdup(option);
+
+ s_arg = B_TRUE;
+ break;
+ case 'S':
+ if (S_arg)
+ die_optdup(option);
+
+ S_arg = B_TRUE;
+ break;
+ case 'o':
+ if (o_arg)
+ die_optdup(option);
+
+ o_arg = B_TRUE;
+ fields_str = optarg;
+ break;
+ case 'i':
+ if (i_arg)
+ die_optdup(option);
+
+ i_arg = B_TRUE;
+
+ errno = 0;
+ interval = (int)strtol(optarg, &endp, 10);
+ if (errno != 0 || interval == 0 || *endp != '\0')
+ die("invalid interval value" " '%d'\n",
+ interval);
+ break;
+ case 'l':
+ if (strlcpy(linkname, optarg, MAXLINKNAMELEN)
+ >= MAXLINKNAMELEN)
+ die("link name too long\n");
+ if (dladm_name2info(linkname, &linkid, NULL,
+ NULL, NULL) != DLADM_STATUS_OK)
+ die("invalid link '%s'", linkname);
+ l_arg = B_TRUE;
+ break;
+ default:
+ die_opterr(optopt, option);
+ break;
+ }
+ }
+ if (i_arg && !(s_arg || S_arg))
+ die("the -i option can be used only with -s or -S");
+
+ if (s_arg && S_arg)
+ die("the -s option cannot be used with -S");
+
+	/* get flow name (optional last argument) */
+ if (optind == (argc-1)) {
+ if (strlcpy(flowname, argv[optind], MAXFLOWNAME)
+ >= MAXFLOWNAME)
+ die("flow name too long");
+ state.fs_flow = flowname;
+ }
+
+ if (s_arg) {
+ flow_stats(state.fs_flow, linkid, interval);
+ return;
+ }
+
+ if (S_arg) {
+ dladm_continuous(linkid, state.fs_flow, interval, FLOW_REPORT);
+ return;
+ }
+
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+ fields_str = all_fields;
+
+ fields = parse_output_fields(fields_str, flow_fields, FLOW_MAX_FIELDS,
+ CMD_TYPE_ANY, &nfields);
+
+ if (fields == NULL) {
+ die("invalid fields(s) specified");
+ return;
+ }
+
+ state.fs_print.ps_fields = fields;
+ state.fs_print.ps_nfields = nfields;
+
+ /* Show attributes of one flow */
+ if (state.fs_flow != NULL) {
+ show_one_flow(&state, state.fs_flow);
+
+ /* Show attributes of flows on one link */
+ } else if (l_arg) {
+ (void) show_flows_onelink(linkid, &state);
+
+ /* Show attributes of all flows on all links */
+ } else {
+ (void) dladm_walk_datalink_id(show_flows_onelink, &state,
+ DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE,
+ DLADM_OPT_ACTIVE);
+ }
+}
+
+static dladm_status_t
+set_flowprop_persist(const char *flow, const char *prop_name, char **prop_val,
+ uint_t val_cnt, boolean_t reset)
+{
+ dladm_status_t status;
+ char *errprop;
+
+ status = dladm_set_flowprop(flow, prop_name, prop_val, val_cnt,
+ DLADM_OPT_PERSIST, &errprop);
+
+ if (status != DLADM_STATUS_OK) {
+ warn_dlerr(status, "cannot persistently %s flow "
+ "property '%s' on '%s'", reset? "reset": "set",
+ errprop, flow);
+ }
+ return (status);
+}
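+
+/*
+ * Common body for set-flowprop and reset-flowprop.  With no -p list
+ * and reset set, every property of the flow is reset; otherwise each
+ * named property is set (or reset) actively and, unless -t was given,
+ * persistently as well.
+ */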
+
+static void
+set_flowprop(int argc, char **argv, boolean_t reset)
+{
+ int i, option;
+ char errmsg[DLADM_STRSIZE];
+ const char *flow = NULL;
+ dladm_arg_list_t *proplist = NULL;
+ boolean_t temp = B_FALSE;
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ opterr = 0;
+ while ((option = getopt_long(argc, argv, ":p:R:t",
+ prop_longopts, NULL)) != -1) {
+ switch (option) {
+ case 'p':
+ if (dladm_parse_flow_props(optarg, &proplist, reset)
+ != DLADM_STATUS_OK)
+ die("invalid flow property specified");
+ break;
+ case 't':
+ temp = B_TRUE;
+ break;
+ case 'R':
+ status = dladm_set_rootdir(optarg);
+ if (status != DLADM_STATUS_OK) {
+ die_dlerr(status, "invalid directory "
+ "specified");
+ }
+ break;
+ default:
+ die_opterr(optopt, option);
+ break;
+ }
+ }
+
+ if (optind == (argc - 1)) {
+ if (strlen(argv[optind]) >= MAXFLOWNAME)
+ die("flow name too long");
+ flow = argv[optind];
+ } else if (optind != argc) {
+ usage();
+ }
+ if (flow == NULL)
+ die("flow name must be specified");
+
+ if (proplist == NULL) {
+ char *errprop;
+
+ if (!reset)
+ die("flow property must be specified");
+
+ status = dladm_set_flowprop(flow, NULL, NULL, 0,
+ DLADM_OPT_ACTIVE, &errprop);
+ if (status != DLADM_STATUS_OK) {
+ warn_dlerr(status, "cannot reset flow property '%s' "
+ "on '%s'", errprop, flow);
+ }
+ if (!temp) {
+ dladm_status_t s;
+
+ s = set_flowprop_persist(flow, NULL, NULL, 0, reset);
+ if (s != DLADM_STATUS_OK)
+ status = s;
+ }
+ goto done;
+ }
+
+ for (i = 0; i < proplist->al_count; i++) {
+ dladm_arg_info_t *aip = &proplist->al_info[i];
+ char **val;
+ uint_t count;
+ dladm_status_t s;
+
+ if (reset) {
+ val = NULL;
+ count = 0;
+ } else {
+ val = aip->ai_val;
+ count = aip->ai_count;
+ if (count == 0) {
+ warn("no value specified for '%s'",
+ aip->ai_name);
+ status = DLADM_STATUS_BADARG;
+ continue;
+ }
+ }
+ s = dladm_set_flowprop(flow, aip->ai_name, val, count,
+ DLADM_OPT_ACTIVE, NULL);
+ if (s == DLADM_STATUS_OK) {
+ if (!temp) {
+ s = set_flowprop_persist(flow,
+ aip->ai_name, val, count, reset);
+ if (s != DLADM_STATUS_OK)
+ status = s;
+ }
+ continue;
+ }
+ status = s;
+ switch (s) {
+ case DLADM_STATUS_NOTFOUND:
+ warn("invalid flow property '%s'", aip->ai_name);
+ break;
+ case DLADM_STATUS_BADVAL: {
+ int j;
+ char *ptr, *lim;
+ char **propvals = NULL;
+ uint_t valcnt = DLADM_MAX_PROP_VALCNT;
+
+ ptr = malloc((sizeof (char *) +
+ DLADM_PROP_VAL_MAX) * DLADM_MAX_PROP_VALCNT +
+ MAX_PROP_LINE);
+
+ if (ptr == NULL)
+ die("insufficient memory");
+ propvals = (char **)(void *)ptr;
+
+ for (j = 0; j < DLADM_MAX_PROP_VALCNT; j++) {
+ propvals[j] = ptr + sizeof (char *) *
+ DLADM_MAX_PROP_VALCNT +
+ j * DLADM_PROP_VAL_MAX;
+ }
+ s = dladm_get_flowprop(flow, DLADM_PROP_VAL_MODIFIABLE,
+ aip->ai_name, propvals, &valcnt);
+
+ ptr = errmsg;
+ lim = ptr + DLADM_STRSIZE;
+ *ptr = '\0';
+ for (j = 0; j < valcnt && s == DLADM_STATUS_OK; j++) {
+ ptr += snprintf(ptr, lim - ptr, "%s,",
+ propvals[j]);
+ if (ptr >= lim)
+ break;
+ }
+ if (ptr > errmsg) {
+ *(ptr - 1) = '\0';
+ warn("flow property '%s' must be one of: %s",
+ aip->ai_name, errmsg);
+ } else
+ warn("%s is an invalid value for "
+ "flow property %s", *val, aip->ai_name);
+ free(propvals);
+ break;
+ }
+ default:
+ if (reset) {
+ warn_dlerr(status, "cannot reset flow property "
+ "'%s' on '%s'", aip->ai_name, flow);
+ } else {
+ warn_dlerr(status, "cannot set flow property "
+ "'%s' on '%s'", aip->ai_name, flow);
+ }
+ break;
+ }
+ }
+done:
+ dladm_free_props(proplist);
+ if (status != DLADM_STATUS_OK)
+ exit(1);
+}
+
+static void
+do_set_flowprop(int argc, char **argv)
+{
+ set_flowprop(argc, argv, B_FALSE);
+}
+
+static void
+do_reset_flowprop(int argc, char **argv)
+{
+ set_flowprop(argc, argv, B_TRUE);
+}
+
+static void
+warn(const char *format, ...)
+{
+ va_list alist;
+
+ format = gettext(format);
+ (void) fprintf(stderr, "%s: warning: ", progname);
+
+ va_start(alist, format);
+ (void) vfprintf(stderr, format, alist);
+ va_end(alist);
+
+	(void) putc('\n', stderr);
+}
+
+/* PRINTFLIKE2 */
+static void
+warn_dlerr(dladm_status_t err, const char *format, ...)
+{
+ va_list alist;
+ char errmsg[DLADM_STRSIZE];
+
+ format = gettext(format);
+ (void) fprintf(stderr, gettext("%s: warning: "), progname);
+
+ va_start(alist, format);
+ (void) vfprintf(stderr, format, alist);
+ va_end(alist);
+ (void) fprintf(stderr, ": %s\n", dladm_status2str(err, errmsg));
+}
+
+/* PRINTFLIKE1 */
+static void
+die(const char *format, ...)
+{
+ va_list alist;
+
+ format = gettext(format);
+ (void) fprintf(stderr, "%s: ", progname);
+
+ va_start(alist, format);
+ (void) vfprintf(stderr, format, alist);
+ va_end(alist);
+
+	(void) putc('\n', stderr);
+ exit(EXIT_FAILURE);
+}
+
+static void
+die_optdup(int opt)
+{
+ die("the option -%c cannot be specified more than once", opt);
+}
+
+static void
+die_opterr(int opt, int opterr)
+{
+ switch (opterr) {
+ case ':':
+ die("option '-%c' requires a value", opt);
+ break;
+ case '?':
+ default:
+ die("unrecognized option '-%c'", opt);
+ break;
+ }
+}
+
+/* PRINTFLIKE2 */
+static void
+die_dlerr(dladm_status_t err, const char *format, ...)
+{
+ va_list alist;
+ char errmsg[DLADM_STRSIZE];
+
+ format = gettext(format);
+ (void) fprintf(stderr, "%s: ", progname);
+
+ va_start(alist, format);
+ (void) vfprintf(stderr, format, alist);
+ va_end(alist);
+ (void) fprintf(stderr, ": %s\n", dladm_status2str(err, errmsg));
+
+ exit(EXIT_FAILURE);
+}
+
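+/*
+ * Query one property of 'flowname' via dladm_get_flowprop() and append
+ * its comma-separated values to the output line.  "--" stands in for
+ * values that cannot be determined; the empty string marks properties
+ * that are not supported in the requested display.
+ */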
+static void
+print_flowprop(const char *flowname, show_flowprop_state_t *statep,
+ const char *propname, dladm_prop_type_t type,
+ const char *format, char **pptr)
+{
+ int i;
+ char *ptr, *lim;
+ char buf[DLADM_STRSIZE];
+ char *unknown = "--", *notsup = "";
+ char **propvals = statep->fs_propvals;
+ uint_t valcnt = DLADM_MAX_PROP_VALCNT;
+ dladm_status_t status;
+
+ status = dladm_get_flowprop(flowname, type, propname, propvals,
+ &valcnt);
+ if (status != DLADM_STATUS_OK) {
+ if (status == DLADM_STATUS_TEMPONLY) {
+ if (type == DLADM_PROP_VAL_MODIFIABLE &&
+ statep->fs_persist) {
+ valcnt = 1;
+ propvals = &unknown;
+ } else {
+ statep->fs_status = status;
+ statep->fs_retstatus = status;
+ return;
+ }
+ } else if (status == DLADM_STATUS_NOTSUP ||
+ statep->fs_persist) {
+ valcnt = 1;
+ if (type == DLADM_PROP_VAL_CURRENT)
+ propvals = &unknown;
+ else
+ propvals = &notsup;
+ } else {
+ if ((statep->fs_proplist != NULL) &&
+ statep->fs_status == DLADM_STATUS_OK) {
+ warn("invalid flow property '%s'", propname);
+ }
+ statep->fs_status = status;
+ statep->fs_retstatus = status;
+ return;
+ }
+ }
+
+ statep->fs_status = DLADM_STATUS_OK;
+
+ ptr = buf;
+ lim = buf + DLADM_STRSIZE;
+ for (i = 0; i < valcnt; i++) {
+ if (propvals[i][0] == '\0' && !statep->fs_parseable)
+ ptr += snprintf(ptr, lim - ptr, STR_UNDEF_VAL",");
+ else
+ ptr += snprintf(ptr, lim - ptr, "%s,", propvals[i]);
+ if (ptr >= lim)
+ break;
+ }
+ if (valcnt > 0)
+ buf[strlen(buf) - 1] = '\0';
+
+ lim = statep->fs_line + MAX_PROP_LINE;
+ if (statep->fs_parseable) {
+ *pptr += snprintf(*pptr, lim - *pptr,
+ "%s", buf);
+ } else {
+ *pptr += snprintf(*pptr, lim - *pptr, format, buf);
+ }
+}
+
+static char *
+flowprop_callback(print_field_t *pf, void *fs_arg)
+{
+ flowprop_args_t *arg = fs_arg;
+ char *propname = arg->fs_propname;
+ show_flowprop_state_t *statep = arg->fs_state;
+ char *ptr = statep->fs_line;
+ char *lim = ptr + MAX_PROP_LINE;
+ char *flowname = arg->fs_flowname;
+
+ switch (pf->pf_index) {
+ case FLOWPROP_FLOW:
+ (void) snprintf(ptr, lim - ptr, "%s", statep->fs_flow);
+ break;
+ case FLOWPROP_PROPERTY:
+ (void) snprintf(ptr, lim - ptr, "%s", propname);
+ break;
+ case FLOWPROP_VALUE:
+ print_flowprop(flowname, statep, propname,
+ statep->fs_persist ? DLADM_PROP_VAL_PERSISTENT :
+ DLADM_PROP_VAL_CURRENT, "%s", &ptr);
+ /*
+ * If we failed to query the flow property, for example, query
+ * the persistent value of a non-persistable flow property,
+ * simply skip the output.
+ */
+ if (statep->fs_status != DLADM_STATUS_OK)
+ goto skip;
+ ptr = statep->fs_line;
+ break;
+ case FLOWPROP_DEFAULT:
+ print_flowprop(flowname, statep, propname,
+ DLADM_PROP_VAL_DEFAULT, "%s", &ptr);
+ if (statep->fs_status != DLADM_STATUS_OK)
+ goto skip;
+ ptr = statep->fs_line;
+ break;
+ case FLOWPROP_POSSIBLE:
+ print_flowprop(flowname, statep, propname,
+ DLADM_PROP_VAL_MODIFIABLE, "%s ", &ptr);
+ if (statep->fs_status != DLADM_STATUS_OK)
+ goto skip;
+ ptr = statep->fs_line;
+ break;
+ default:
+ die("invalid input");
+ break;
+ }
+ return (ptr);
+skip:
+ if (statep->fs_status != DLADM_STATUS_OK)
+ return (NULL);
+ else
+ return ("");
+}
+
+static int
+show_one_flowprop(void *arg, const char *propname)
+{
+ show_flowprop_state_t *statep = arg;
+ flowprop_args_t fs_arg;
+
+ bzero(&fs_arg, sizeof (fs_arg));
+ fs_arg.fs_state = statep;
+ fs_arg.fs_propname = (char *)propname;
+ fs_arg.fs_flowname = (char *)statep->fs_flow;
+
+ if (statep->fs_header) {
+ statep->fs_header = B_FALSE;
+		if (!statep->fs_parseable)
+ print_header(&statep->fs_print);
+ }
+ flowadm_print_output(&statep->fs_print, statep->fs_parseable,
+ flowprop_callback, (void *)&fs_arg);
+
+ return (DLADM_WALK_CONTINUE);
+}
+
+/* Walker function called by dladm_walk_flow to display flow properties */
+static int
+show_flowprop(dladm_flow_attr_t *attr, void *arg)
+{
+ show_flowprop_one_flow(arg, attr->fa_flowname);
+ return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * Wrapper around dladm_walk_flow(show_flowprop, ...) that makes it
+ * usable with dladm_walk_datalink_id()
+ */
+static int
+show_flowprop_onelink(datalink_id_t linkid, void *arg)
+{
+ char name[MAXLINKNAMELEN];
+
+ if (dladm_datalink_id2info(linkid, NULL, NULL, NULL,
+ name, sizeof (name)) != DLADM_STATUS_OK)
+ return (DLADM_WALK_TERMINATE);
+
+ (void) dladm_walk_flow(show_flowprop, linkid, arg, B_FALSE);
+
+ return (DLADM_WALK_CONTINUE);
+}
+
+static void
+do_show_flowprop(int argc, char **argv)
+{
+ int option;
+ dladm_arg_list_t *proplist = NULL;
+ show_flowprop_state_t state;
+ char *fields_str = NULL;
+ print_field_t **fields;
+ uint_t nfields;
+ char *all_fields =
+ "flow,property,value,default,possible";
+
+ fields_str = all_fields;
+ opterr = 0;
+ state.fs_propvals = NULL;
+ state.fs_line = NULL;
+ state.fs_parseable = B_FALSE;
+ state.fs_persist = B_FALSE;
+ state.fs_header = B_TRUE;
+ state.fs_retstatus = DLADM_STATUS_OK;
+ state.fs_linkid = DATALINK_INVALID_LINKID;
+ state.fs_flow = NULL;
+
+ while ((option = getopt_long(argc, argv, ":p:cPl:o:",
+ prop_longopts, NULL)) != -1) {
+ switch (option) {
+ case 'p':
+ if (dladm_parse_flow_props(optarg, &proplist, B_TRUE)
+ != DLADM_STATUS_OK)
+ die("invalid flow properties specified");
+ break;
+ case 'c':
+ state.fs_parseable = B_TRUE;
+ break;
+ case 'P':
+ state.fs_persist = B_TRUE;
+ break;
+ case 'l':
+ if (dladm_name2info(optarg, &state.fs_linkid,
+ NULL, NULL, NULL) != DLADM_STATUS_OK)
+ die("invalid link '%s'", optarg);
+ break;
+ case 'o':
+ if (strcasecmp(optarg, "all") == 0)
+ fields_str = all_fields;
+ else
+ fields_str = optarg;
+ break;
+ default:
+ die_opterr(optopt, option);
+ break;
+ }
+ }
+
+ if (optind == (argc - 1)) {
+ if (strlen(argv[optind]) >= MAXFLOWNAME)
+ die("flow name too long");
+ state.fs_flow = argv[optind];
+ } else if (optind != argc) {
+ usage();
+ }
+ bzero(&state.fs_print, sizeof (print_state_t));
+ state.fs_proplist = proplist;
+ state.fs_status = DLADM_STATUS_OK;
+
+ fields = parse_output_fields(fields_str, flowprop_fields,
+ FLOWPROP_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+
+	if (fields == NULL)
+		die("invalid field(s) specified");
+
+ state.fs_print.ps_fields = fields;
+ state.fs_print.ps_nfields = nfields;
+
+ /* Show properties for one flow */
+ if (state.fs_flow != NULL) {
+ show_flowprop_one_flow(&state, state.fs_flow);
+
+ /* Show properties for all flows on one link */
+ } else if (state.fs_linkid != DATALINK_INVALID_LINKID) {
+ (void) show_flowprop_onelink(state.fs_linkid, &state);
+
+ /* Show properties for all flows on all links */
+ } else {
+ (void) dladm_walk_datalink_id(show_flowprop_onelink, &state,
+ DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE,
+ DLADM_OPT_ACTIVE);
+ }
+
+ dladm_free_props(proplist);
+}
+
+static void
+show_flowprop_one_flow(void *arg, const char *flow)
+{
+ int i;
+ char *buf;
+ dladm_status_t status;
+ dladm_arg_list_t *proplist = NULL;
+ show_flowprop_state_t *statep = arg;
+ dladm_flow_attr_t attr;
+ const char *savep;
+
+ /*
+ * Do not print flow props for invalid flows.
+ */
+ if ((status = dladm_flow_info(flow, &attr)) != DLADM_STATUS_OK) {
+ die("invalid flow: '%s'", flow);
+ }
+
+ savep = statep->fs_flow;
+ statep->fs_flow = flow;
+
+ proplist = statep->fs_proplist;
+
+ buf = malloc((sizeof (char *) + DLADM_PROP_VAL_MAX)
+ * DLADM_MAX_PROP_VALCNT + MAX_PROP_LINE);
+ if (buf == NULL)
+ die("insufficient memory");
+
+ statep->fs_propvals = (char **)(void *)buf;
+ for (i = 0; i < DLADM_MAX_PROP_VALCNT; i++) {
+ statep->fs_propvals[i] = buf +
+ sizeof (char *) * DLADM_MAX_PROP_VALCNT +
+ i * DLADM_PROP_VAL_MAX;
+ }
+ statep->fs_line = buf +
+ (sizeof (char *) + DLADM_PROP_VAL_MAX) * DLADM_MAX_PROP_VALCNT;
+
+ /* show only specified flow properties */
+ if (proplist != NULL) {
+ for (i = 0; i < proplist->al_count; i++) {
+			if (show_one_flowprop(statep,
+			    proplist->al_info[i].ai_name) !=
+			    DLADM_WALK_CONTINUE)
+				break;
+ }
+
+ /* show all flow properties */
+ } else {
+ status = dladm_walk_flowprop(show_one_flowprop, flow, statep);
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "show-flowprop");
+ }
+ free(buf);
+ statep->fs_flow = savep;
+}
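+
+/*
+ * Layout of the single allocation made by show_flowprop_one_flow() above:
+ *
+ *	+---------------------------------------------+ <- fs_propvals
+ *	| DLADM_MAX_PROP_VALCNT pointers (char *)      |
+ *	+---------------------------------------------+ <- fs_propvals[0]
+ *	| DLADM_MAX_PROP_VALCNT value buffers of       |
+ *	| DLADM_PROP_VAL_MAX bytes each                |
+ *	+---------------------------------------------+ <- fs_line
+ *	| MAX_PROP_LINE bytes of output line           |
+ *	+---------------------------------------------+
+ *
+ * Each pointer is aimed at its own value buffer, so a single free(buf)
+ * releases the pointer array, the value buffers and the line buffer.
+ */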
+
+typedef struct {
+ char *s_buf;
+ char **s_fields; /* array of pointer to the fields in s_buf */
+ uint_t s_nfields; /* the number of fields in s_buf */
+} split_t;
+
+/*
+ * Free the split_t structure pointed to by `sp'.
+ */
+static void
+splitfree(split_t *sp)
+{
+ free(sp->s_buf);
+ free(sp->s_fields);
+ free(sp);
+}
+
+/*
+ * Split `str' into at most `maxfields' fields, each field at most `maxlen' in
+ * length. Return a pointer to a split_t containing the split fields, or NULL
+ * on failure.
+ */
+static split_t *
+split(const char *str, uint_t maxfields, uint_t maxlen)
+{
+ char *field, *token, *lasts = NULL;
+ split_t *sp;
+
+ if (*str == '\0' || maxfields == 0 || maxlen == 0)
+ return (NULL);
+
+ sp = calloc(sizeof (split_t), 1);
+ if (sp == NULL)
+ return (NULL);
+
+ sp->s_buf = strdup(str);
+ sp->s_fields = malloc(sizeof (char *) * maxfields);
+ if (sp->s_buf == NULL || sp->s_fields == NULL)
+ goto fail;
+
+ token = sp->s_buf;
+ while ((field = strtok_r(token, ",", &lasts)) != NULL) {
+ if (sp->s_nfields == maxfields || strlen(field) > maxlen)
+ goto fail;
+ token = NULL;
+ sp->s_fields[sp->s_nfields++] = field;
+ }
+ return (sp);
+fail:
+ splitfree(sp);
+ return (NULL);
+}
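+
+/*
+ * A minimal sketch of how split() is consumed.  The function name and the
+ * FLOWADM_EXAMPLES guard below are hypothetical, and the block is compiled
+ * out so it cannot affect the command.
+ */
+#ifdef FLOWADM_EXAMPLES
+static void
+split_example(void)
+{
+	split_t	*sp;
+	uint_t	i;
+
+	/* At most 10 fields, each at most MAX_FIELD_LEN characters long. */
+	sp = split("flow,property,value", 10, MAX_FIELD_LEN);
+	if (sp == NULL)
+		return;
+
+	/* s_nfields is 3: "flow", "property" and "value". */
+	for (i = 0; i < sp->s_nfields; i++)
+		(void) printf("field %u: %s\n", i, sp->s_fields[i]);
+	splitfree(sp);
+}
+#endif	/* FLOWADM_EXAMPLES */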
+
+static print_field_t **
+parse_output_fields(char *str, print_field_t *template, int max_fields,
+ uint_t cmdtype, uint_t *countp)
+{
+ split_t *sp;
+ boolean_t good_match = B_FALSE;
+ uint_t i, j;
+ print_field_t **pf = NULL;
+
+ sp = split(str, max_fields, MAX_FIELD_LEN);
+
+ if (sp == NULL)
+ return (NULL);
+
+ pf = malloc(sp->s_nfields * sizeof (print_field_t *));
+ if (pf == NULL)
+ goto fail;
+
+ for (i = 0; i < sp->s_nfields; i++) {
+ for (j = 0; j < max_fields; j++) {
+ if (strcasecmp(sp->s_fields[i],
+ template[j].pf_name) == 0) {
+				good_match = template[j].pf_cmdtype & cmdtype;
+ break;
+ }
+ }
+ if (!good_match)
+ goto fail;
+
+ good_match = B_FALSE;
+ pf[i] = &template[j];
+ }
+ *countp = i;
+ splitfree(sp);
+ return (pf);
+fail:
+ free(pf);
+ splitfree(sp);
+ return (NULL);
+}
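+
+/*
+ * A sketch of driving parse_output_fields() with the flowprop_fields
+ * template used by do_show_flowprop() above.  The example function and its
+ * guard are hypothetical; the block is compiled out.
+ */
+#ifdef FLOWADM_EXAMPLES
+static void
+parse_output_fields_example(void)
+{
+	char		fields[] = "FLOW,value";	/* case-insensitive */
+	print_field_t	**pf;
+	uint_t		nfields;
+
+	pf = parse_output_fields(fields, flowprop_fields,
+	    FLOWPROP_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+	if (pf == NULL)
+		return;		/* unknown field name or too many fields */
+
+	/* nfields is 2; pf[0] and pf[1] point into flowprop_fields[]. */
+	free(pf);
+}
+#endif	/* FLOWADM_EXAMPLES */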
+
+static void
+flowadm_print_output(print_state_t *statep, boolean_t parseable,
+ print_callback_t fn, void *arg)
+{
+ int i;
+ char *value;
+ print_field_t **pf;
+
+ pf = statep->ps_fields;
+ for (i = 0; i < statep->ps_nfields; i++) {
+ statep->ps_lastfield = (i + 1 == statep->ps_nfields);
+ value = (*fn)(pf[i], arg);
+ if (value != NULL)
+ print_field(statep, pf[i], value, parseable);
+ }
+ (void) putchar('\n');
+}
+
+static void
+print_header(print_state_t *ps)
+{
+ int i;
+ print_field_t **pf;
+
+ pf = ps->ps_fields;
+ for (i = 0; i < ps->ps_nfields; i++) {
+ ps->ps_lastfield = (i + 1 == ps->ps_nfields);
+ print_field(ps, pf[i], pf[i]->pf_header, B_FALSE);
+ }
+ (void) putchar('\n');
+}
+
+static void
+print_field(print_state_t *statep, print_field_t *pfp, const char *value,
+ boolean_t parseable)
+{
+ uint_t width = pfp->pf_width;
+ uint_t valwidth = strlen(value);
+ uint_t compress;
+
+ if (parseable) {
+ (void) printf("%s=\"%s\"", pfp->pf_header, value);
+ } else {
+ if (value[0] == '\0')
+ value = STR_UNDEF_VAL;
+ if (statep->ps_lastfield) {
+ (void) printf("%s", value);
+ return;
+ }
+
+ if (valwidth > width) {
+ statep->ps_overflow += valwidth - width;
+ } else if (valwidth < width && statep->ps_overflow > 0) {
+ compress = min(statep->ps_overflow, width - valwidth);
+ statep->ps_overflow -= compress;
+ width -= compress;
+ }
+ (void) printf("%-*s", width, value);
+ }
+
+ if (!statep->ps_lastfield)
+ (void) putchar(' ');
+}
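+
+/*
+ * A worked example of the ps_overflow logic above: suppose three 8-wide
+ * fields receive values of lengths 11, 3 and 8.  The first value overruns
+ * its width by 3, so ps_overflow becomes 3.  The second value is 5 short
+ * of its width, so min(3, 5) = 3 columns are reclaimed (width 8 -> 5) and
+ * ps_overflow drops back to 0.  The third field prints at its full width.
+ * Later columns thus shift left to absorb an earlier overrun instead of
+ * pushing the rest of the row to the right.
+ */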
diff --git a/usr/src/cmd/flowadm/flowadm.conf b/usr/src/cmd/flowadm/flowadm.conf
new file mode 100644
index 0000000000..3977ddf645
--- /dev/null
+++ b/usr/src/cmd/flowadm/flowadm.conf
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# DO NOT EDIT OR PARSE THIS FILE!
+#
+# Use the flowadm(1m) command to change the contents of this file.
+
diff --git a/usr/src/cmd/flowadm/flowadm.xcl b/usr/src/cmd/flowadm/flowadm.xcl
new file mode 100644
index 0000000000..856a788ed6
--- /dev/null
+++ b/usr/src/cmd/flowadm/flowadm.xcl
@@ -0,0 +1,113 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#
+
+msgid "--"
+msgid "--,"
+msgid ""
+msgid " "
+msgid "%-*s"
+msgid "%-10llu"
+msgid "%-12llu"
+msgid "%-12s"
+msgid "%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n"
+msgid "%-8llu"
+msgid "%-8llu\n"
+msgid "%d"
+msgid "%s"
+msgid "%s "
+msgid "%s,"
+msgid "%s/%d "
+msgid "%s: "
+msgid "%s=\"%s\""
+msgid ","
+msgid "/"
+msgid "0x%x"
+msgid ": %s\n"
+msgid ":d:R:t"
+msgid ":p:R:t"
+msgid ":p:cPl:o:"
+msgid "?"
+msgid "ATTR"
+msgid "DEFAULT"
+msgid "FLOW"
+msgid "ICMPV6"
+msgid "ICMPv6"
+msgid "IERRORS"
+msgid "IPACKETS"
+msgid "LINK"
+msgid "NAME"
+msgid "OBYTES"
+msgid "OERRORS"
+msgid "OPACKETS"
+msgid "POSSIBLE"
+msgid "PROPERTY"
+msgid "RBYTES"
+msgid "SCTP"
+msgid "TCP"
+msgid "UDP"
+msgid "VALUE"
+msgid "add-flow"
+msgid "all"
+msgid "attr"
+msgid "default"
+msgid "dsfield"
+msgid "dsfield_mask"
+msgid "flow"
+msgid "flow,property,value,default,possible"
+msgid "icmp"
+msgid "icmpv6"
+msgid "init-flow"
+msgid "interval"
+msgid "link"
+msgid "local_ip"
+msgid "local_port"
+msgid "name"
+msgid "name,link,attr,value"
+msgid "net_rawaccess"
+msgid "parseable"
+msgid "possible"
+msgid "prop"
+msgid "property"
+msgid "psSi:l:o:"
+msgid "remote_ip"
+msgid "remove-flow"
+msgid "reset"
+msgid "reset-flowprop"
+msgid "root-dir"
+msgid "sctp"
+msgid "set"
+msgid "set-flowprop"
+msgid "show-flow"
+msgid "show-flowprop"
+msgid "show-usage"
+msgid "statistics"
+msgid "sys_net_config"
+msgid "tR:l:a:p:"
+msgid "tcp"
+msgid "tdps:e:f:"
+msgid "temporary"
+msgid "transport"
+msgid "udp"
+msgid "value"
diff --git a/usr/src/cmd/flowadm/flowprop.conf b/usr/src/cmd/flowadm/flowprop.conf
new file mode 100644
index 0000000000..ad6f802040
--- /dev/null
+++ b/usr/src/cmd/flowadm/flowprop.conf
@@ -0,0 +1,29 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#
+# DO NOT EDIT OR PARSE THIS FILE!
+#
+# Use the flowadm(1m) command to change the contents of this file.
+
diff --git a/usr/src/cmd/mdb/Makefile.common b/usr/src/cmd/mdb/Makefile.common
index 5677289bc9..ed27426b8d 100644
--- a/usr/src/cmd/mdb/Makefile.common
+++ b/usr/src/cmd/mdb/Makefile.common
@@ -24,7 +24,8 @@
#
# MDB modules used for debugging user processes that every ISA's build
# subdirectory will need to build.
-#
+#
+
COMMON_MODULES_PROC = \
dof \
libavl \
@@ -70,6 +71,7 @@ COMMON_MODULES_KVM = \
krtld \
lofs \
logindmux \
+ mac \
md \
nca \
nsctl \
diff --git a/usr/src/cmd/mdb/common/modules/mac/mac.c b/usr/src/cmd/mdb/common/modules/mac/mac.c
new file mode 100644
index 0000000000..0f1effb4b2
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/mac/mac.c
@@ -0,0 +1,685 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/mdb_modapi.h>
+#include <sys/types.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+
+#include <sys/mac.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_flow_impl.h>
+#include <sys/mac_soft_ring.h>
+
+#define STRSIZE 64
+#define MAC_RX_SRS_SIZE (MAX_RINGS_PER_GROUP * sizeof (uintptr_t))
+
+#define LAYERED_WALKER_FOR_FLOW "flow_entry_cache"
+#define LAYERED_WALKER_FOR_SRS "mac_srs_cache"
+#define LAYERED_WALKER_FOR_RING "mac_ring_cache"
+
+/* arguments passed to mac_flow dcmd */
+#define MAC_FLOW_NONE 0x01
+#define MAC_FLOW_ATTR 0x02
+#define MAC_FLOW_PROP 0x04
+#define MAC_FLOW_RX 0x08
+#define MAC_FLOW_TX 0x10
+#define MAC_FLOW_USER 0x20
+#define MAC_FLOW_STATS 0x40
+#define MAC_FLOW_MISC 0x80
+
+/* arguments passed to mac_srs dcmd */
+#define MAC_SRS_RX 0x01
+#define MAC_SRS_TX 0x02
+
+static char *
+mac_flow_proto2str(uint8_t protocol)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return ("tcp");
+ case IPPROTO_UDP:
+ return ("udp");
+ case IPPROTO_SCTP:
+ return ("sctp");
+ case IPPROTO_ICMP:
+ return ("icmp");
+ case IPPROTO_ICMPV6:
+ return ("icmpv6");
+ default:
+ return ("--");
+ }
+}
+
+static char *
+mac_flow_priority2str(mac_priority_level_t prio)
+{
+ switch (prio) {
+ case MPL_LOW:
+ return ("low");
+ case MPL_MEDIUM:
+ return ("medium");
+ case MPL_HIGH:
+ return ("high");
+ case MPL_RESET:
+ return ("reset");
+ default:
+ return ("--");
+ }
+}
+
+/*
+ * Convert bandwidth in bps to a string in Mbps.
+ */
+static char *
+mac_flow_bw2str(uint64_t bw, char *buf, ssize_t len)
+{
+ int kbps, mbps;
+
+ kbps = (bw % 1000000)/1000;
+ mbps = bw/1000000;
+ if ((mbps == 0) && (kbps != 0))
+ mdb_snprintf(buf, len, "0.%03u", kbps);
+ else
+ mdb_snprintf(buf, len, "%5u", mbps);
+ return (buf);
+}
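+
+/*
+ * Worked examples of the conversion above:
+ *	bw =  500000 -> kbps = 500, mbps = 0 -> "0.500"
+ *	bw = 1500000 -> kbps = 500, mbps = 1 -> "    1"
+ * i.e. the fractional part is only shown for rates below 1 Mbps.
+ */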
+
+static void
+mac_flow_print_header(uint_t args)
+{
+ switch (args) {
+ case MAC_FLOW_NONE:
+ mdb_printf("%<u>%?s %-32s %-6s %?s %?s %-20s%</u>\n",
+ "ADDR", "FLOW NAME", "LINKID", "MCIP", "MIP",
+ "MIP NAME");
+ break;
+ case MAC_FLOW_ATTR:
+ mdb_printf("%<u>%?s %-32s %-7s %6s "
+ "%-9s %s%</u>\n",
+ "ADDR", "FLOW NAME", "PROTO", "PORT",
+ "DSFLD:MSK", "IPADDR");
+ break;
+ case MAC_FLOW_PROP:
+ mdb_printf("%<u>%?s %-32s %8s %9s%</u>\n",
+ "ADDR", "FLOW NAME", "MAXBW(M)", "PRIORITY");
+ break;
+ case MAC_FLOW_MISC:
+ mdb_printf("%<u>%?s %-32s %10s %10s "
+ "%32s %s%</u>\n",
+ "ADDR", "FLOW NAME", "TYPE", "FLAGS",
+ "MATCH_FN", "ZONE");
+ break;
+ case MAC_FLOW_RX:
+ mdb_printf("%<u>%?s %-24s %-30s %?s "
+ "%?s %7s %s%</u>\n",
+ "ADDR", "FLOW NAME", "CB_FUNC", "CB_ARG1",
+ "CB_ARG2", "SRS_CNT", "RX_SRS");
+ break;
+ case MAC_FLOW_TX:
+ mdb_printf("%<u>%?s %-32s %?s %</u>\n",
+ "ADDR", "FLOW NAME", "TX_SRS");
+ break;
+ case MAC_FLOW_STATS:
+ mdb_printf("%<u>%?s %-32s %?s %?s%</u>\n",
+ "ADDR", "FLOW NAME", "RBYTES", "OBYTES");
+ break;
+ }
+}
+
+/*
+ * Display selected fields of the flow_entry_t structure
+ */
+static int
+mac_flow_dcmd_output(uintptr_t addr, uint_t flags, uint_t args)
+{
+ static const mdb_bitmask_t flow_type_bits[] = {
+ {"P", FLOW_PRIMARY_MAC, FLOW_PRIMARY_MAC},
+ {"V", FLOW_VNIC_MAC, FLOW_VNIC_MAC},
+ {"M", FLOW_MCAST, FLOW_MCAST},
+ {"O", FLOW_OTHER, FLOW_OTHER},
+ {"U", FLOW_USER, FLOW_USER},
+ {"V", FLOW_VNIC, FLOW_VNIC},
+ {"NS", FLOW_NO_STATS, FLOW_NO_STATS},
+ { NULL, 0, 0 }
+ };
+#define FLOW_MAX_TYPE (sizeof (flow_type_bits) / sizeof (mdb_bitmask_t))
+
+ static const mdb_bitmask_t flow_flag_bits[] = {
+ {"Q", FE_QUIESCE, FE_QUIESCE},
+ {"W", FE_WAITER, FE_WAITER},
+ {"T", FE_FLOW_TAB, FE_FLOW_TAB},
+ {"G", FE_G_FLOW_HASH, FE_G_FLOW_HASH},
+ {"I", FE_INCIPIENT, FE_INCIPIENT},
+ {"C", FE_CONDEMNED, FE_CONDEMNED},
+ {"NU", FE_UF_NO_DATAPATH, FE_UF_NO_DATAPATH},
+ {"NC", FE_MC_NO_DATAPATH, FE_MC_NO_DATAPATH},
+ { NULL, 0, 0 }
+ };
+#define FLOW_MAX_FLAGS (sizeof (flow_flag_bits) / sizeof (mdb_bitmask_t))
+ flow_entry_t fe;
+ mac_client_impl_t mcip;
+ mac_impl_t mip;
+
+ if (mdb_vread(&fe, sizeof (fe), addr) == -1) {
+ mdb_warn("failed to read struct flow_entry_s at %p", addr);
+ return (DCMD_ERR);
+ }
+ if (args & MAC_FLOW_USER) {
+ args &= ~MAC_FLOW_USER;
+ if (fe.fe_type & FLOW_MCAST) {
+ if (DCMD_HDRSPEC(flags))
+ mac_flow_print_header(args);
+ return (DCMD_OK);
+ }
+ }
+ if (DCMD_HDRSPEC(flags))
+ mac_flow_print_header(args);
+ bzero(&mcip, sizeof (mcip));
+ bzero(&mip, sizeof (mip));
+ if (fe.fe_mcip != NULL && mdb_vread(&mcip, sizeof (mcip),
+ (uintptr_t)fe.fe_mcip) == sizeof (mcip)) {
+ (void) mdb_vread(&mip, sizeof (mip), (uintptr_t)mcip.mci_mip);
+ }
+ switch (args) {
+ case MAC_FLOW_NONE: {
+ mdb_printf("%?p %-32s %6d %?p "
+ "%?p %-20s\n",
+ addr, fe.fe_flow_name, fe.fe_link_id, fe.fe_mcip,
+ mcip.mci_mip, mip.mi_name);
+ break;
+ }
+ case MAC_FLOW_ATTR: {
+ struct in_addr in4;
+ uintptr_t desc_addr;
+ flow_desc_t fdesc;
+
+ desc_addr = addr + OFFSETOF(flow_entry_t, fe_flow_desc);
+ if (mdb_vread(&fdesc, sizeof (fdesc), desc_addr) == -1) {
+			mdb_warn("failed to read flow_desc_t at %p",
+			    desc_addr);
+ return (DCMD_ERR);
+ }
+ mdb_printf("%?p %-32s "
+ "%-7s %6d"
+ "%4d:%-4d ",
+ addr, fe.fe_flow_name,
+ mac_flow_proto2str(fdesc.fd_protocol), fdesc.fd_local_port,
+ fdesc.fd_dsfield, fdesc.fd_dsfield_mask);
+ if (fdesc.fd_ipversion == IPV4_VERSION) {
+ IN6_V4MAPPED_TO_INADDR(&fdesc.fd_local_addr, &in4);
+ mdb_printf("%I", in4.s_addr);
+ } else if (fdesc.fd_ipversion == IPV6_VERSION) {
+ mdb_printf("%N", &fdesc.fd_local_addr);
+ } else {
+ mdb_printf("%s", "--");
+ }
+ mdb_printf("\n");
+ break;
+ }
+ case MAC_FLOW_PROP: {
+ uintptr_t prop_addr;
+ char bwstr[STRSIZE];
+ mac_resource_props_t fprop;
+
+ prop_addr = addr + OFFSETOF(flow_entry_t, fe_resource_props);
+ if (mdb_vread(&fprop, sizeof (fprop), prop_addr) == -1) {
+			mdb_warn("failed to read struct mac_resource_props "
+ "at %p", prop_addr);
+ return (DCMD_ERR);
+ }
+ mdb_printf("%?p %-32s "
+ "%8s %9s\n",
+ addr, fe.fe_flow_name,
+ mac_flow_bw2str(fprop.mrp_maxbw, bwstr, STRSIZE),
+ mac_flow_priority2str(fprop.mrp_priority));
+ break;
+ }
+ case MAC_FLOW_MISC: {
+ char flow_flags[2 * FLOW_MAX_FLAGS];
+ char flow_type[2 * FLOW_MAX_TYPE];
+ GElf_Sym sym;
+ char func_name[MDB_SYM_NAMLEN] = "";
+ uintptr_t func, match_addr;
+
+ match_addr = addr + OFFSETOF(flow_entry_t, fe_match);
+ (void) mdb_vread(&func, sizeof (func), match_addr);
+ (void) mdb_lookup_by_addr(func, MDB_SYM_EXACT, func_name,
+ MDB_SYM_NAMLEN, &sym);
+ mdb_snprintf(flow_flags, 2 * FLOW_MAX_FLAGS, "%hb",
+ fe.fe_flags, flow_flag_bits);
+ mdb_snprintf(flow_type, 2 * FLOW_MAX_TYPE, "%hb",
+ fe.fe_type, flow_type_bits);
+ mdb_printf("%?p %-32s %10s %10s "
+ "%32s %-d\n",
+ addr, fe.fe_flow_name, flow_type, flow_flags,
+ func_name, fe.fe_zoneid);
+ break;
+ }
+ case MAC_FLOW_RX: {
+ uintptr_t rx_srs[MAX_RINGS_PER_GROUP] = {0};
+ char cb_fn[MDB_SYM_NAMLEN] = "";
+ uintptr_t cb_fnaddr, fnaddr, rxaddr;
+ int i;
+ GElf_Sym sym;
+
+ rxaddr = addr + OFFSETOF(flow_entry_t, fe_rx_srs);
+ (void) mdb_vread(rx_srs, MAC_RX_SRS_SIZE, rxaddr);
+ fnaddr = addr + OFFSETOF(flow_entry_t, fe_cb_fn);
+ (void) mdb_vread(&cb_fnaddr, sizeof (cb_fnaddr), fnaddr);
+ (void) mdb_lookup_by_addr(cb_fnaddr, MDB_SYM_EXACT, cb_fn,
+ MDB_SYM_NAMLEN, &sym);
+ mdb_printf("%?p %-24s %-30s %?p "
+ "%?p %7d ",
+ addr, fe.fe_flow_name, cb_fn, fe.fe_cb_arg1,
+ fe.fe_cb_arg2, fe.fe_rx_srs_cnt);
+ for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
+ if (rx_srs[i] == 0)
+ continue;
+ mdb_printf("%p ", rx_srs[i]);
+ }
+ mdb_printf("\n");
+ break;
+ }
+	case MAC_FLOW_TX: {
+		/* fe_tx_srs was already read in as part of `fe' above */
+		mdb_printf("%?p %-32s %?p\n",
+		    addr, fe.fe_flow_name, fe.fe_tx_srs);
+		break;
+	}
+ case MAC_FLOW_STATS: {
+ mdb_printf("%?p %-32s %16llu %16llu\n",
+ addr, fe.fe_flow_name, fe.fe_flowstats.fs_rbytes,
+ fe.fe_flowstats.fs_obytes);
+ break;
+ }
+ }
+ return (DCMD_OK);
+}
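+
+/*
+ * A note on the bitmask tables above: the flow_type and flow_flags
+ * buffers are sized at twice the number of table entries because every
+ * name in the tables is at most two characters.  The %hb specifier
+ * expands a value against an mdb_bitmask_t table into the names of
+ * whichever bits are set, so a flow with both FE_QUIESCE and FE_WAITER
+ * set renders with both "Q" and "W".
+ */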
+
+/*
+ * Parse the arguments passed to the dcmd and print one or all flow_entry_t
+ * structures.
+ */
+static int
+mac_flow_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ uint_t args = 0;
+
+ if (!(flags & DCMD_ADDRSPEC)) {
+ if (mdb_walk_dcmd("mac_flow", "mac_flow", argc, argv) == -1) {
+ mdb_warn("failed to walk 'mac_flow'");
+ return (DCMD_ERR);
+ }
+ return (DCMD_OK);
+ }
+ if ((mdb_getopts(argc, argv,
+ 'a', MDB_OPT_SETBITS, MAC_FLOW_ATTR, &args,
+ 'p', MDB_OPT_SETBITS, MAC_FLOW_PROP, &args,
+ 'm', MDB_OPT_SETBITS, MAC_FLOW_MISC, &args,
+ 'r', MDB_OPT_SETBITS, MAC_FLOW_RX, &args,
+ 't', MDB_OPT_SETBITS, MAC_FLOW_TX, &args,
+ 's', MDB_OPT_SETBITS, MAC_FLOW_STATS, &args,
+ 'u', MDB_OPT_SETBITS, MAC_FLOW_USER, &args) != argc)) {
+ return (DCMD_USAGE);
+ }
+ if (argc > 2 || (argc == 2 && !(args & MAC_FLOW_USER)))
+ return (DCMD_USAGE);
+	/*
+	 * If no arguments were specified, or only "-u" was specified, then
+	 * default to printing basic flow information.
+	 */
+ if (args == 0 || args == MAC_FLOW_USER)
+ args |= MAC_FLOW_NONE;
+
+ return (mac_flow_dcmd_output(addr, flags, args));
+}
+
+static void
+mac_flow_help(void)
+{
+	mdb_printf("If an address is specified, then the flow_entry structure "
+	    "at that address is printed.  Otherwise, all flows in the system "
+	    "are printed.\n");
+ mdb_printf("Options:\n"
+	    "\t-u\tdisplay user-defined link and VNIC flows\n"
+ "\t-a\tdisplay flow attributes\n"
+ "\t-p\tdisplay flow properties\n"
+ "\t-r\tdisplay rx side information\n"
+ "\t-t\tdisplay tx side information\n"
+ "\t-s\tdisplay flow statistics\n"
+ "\t-m\tdisplay miscellaneous flow information\n\n");
+ mdb_printf("%<u>Interpreting Flow type and Flow flags output.%</u>\n");
+ mdb_printf("Flow Types:\n");
+ mdb_printf("\t P --> FLOW_PRIMARY_MAC\n");
+ mdb_printf("\t V --> FLOW_VNIC_MAC\n");
+ mdb_printf("\t M --> FLOW_MCAST\n");
+ mdb_printf("\t O --> FLOW_OTHER\n");
+ mdb_printf("\t U --> FLOW_USER\n");
+ mdb_printf("\t NS --> FLOW_NO_STATS\n\n");
+ mdb_printf("Flow Flags:\n");
+ mdb_printf("\t Q --> FE_QUIESCE\n");
+ mdb_printf("\t W --> FE_WAITER\n");
+ mdb_printf("\t T --> FE_FLOW_TAB\n");
+ mdb_printf("\t G --> FE_G_FLOW_HASH\n");
+ mdb_printf("\t I --> FE_INCIPIENT\n");
+ mdb_printf("\t C --> FE_CONDEMNED\n");
+ mdb_printf("\t NU --> FE_UF_NO_DATAPATH\n");
+ mdb_printf("\t NC --> FE_MC_NO_DATAPATH\n");
+}
+
+/*
+ * Called once by the debugger when the mac_flow walk begins.
+ */
+static int
+mac_flow_walk_init(mdb_walk_state_t *wsp)
+{
+ if (mdb_layered_walk(LAYERED_WALKER_FOR_FLOW, wsp) == -1) {
+ mdb_warn("failed to walk 'mac_flow'");
+ return (WALK_ERR);
+ }
+ return (WALK_NEXT);
+}
+
+/*
+ * Common walker step function for flow_entry_t, mac_soft_ring_set_t and
+ * mac_ring_t.
+ *
+ * Steps through each structure and calls the callback function.  If the
+ * user executed ::walk mac_flow, the callback just prints the address; if
+ * the user executed ::mac_flow, it displays selected fields of the
+ * flow_entry_t structure by calling mac_flow_dcmd().
+ */
+static int
+mac_common_walk_step(mdb_walk_state_t *wsp)
+{
+ int status;
+
+ if (wsp->walk_addr == NULL)
+ return (WALK_DONE);
+
+ status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data,
+ wsp->walk_cbdata);
+
+ return (status);
+}
+
+static char *
+mac_srs_txmode2str(mac_tx_srs_mode_t mode)
+{
+ switch (mode) {
+ case SRS_TX_DEFAULT:
+ return ("default");
+ case SRS_TX_SERIALIZE:
+ return ("serialize");
+ case SRS_TX_FANOUT:
+ return ("fanout");
+ case SRS_TX_BW:
+ return ("bw");
+ case SRS_TX_BW_FANOUT:
+ return ("bw fanout");
+ }
+ return ("--");
+}
+
+static void
+mac_srs_help(void)
+{
+	mdb_printf("If an address is specified, then the mac_soft_ring_set "
+	    "structure at that address is printed.  Otherwise, all "
+	    "SRSes in the system are printed.\n");
+	mdb_printf("Options:\n"
+	    "\t-r\tdisplay receive side SRS structures\n"
+	    "\t-t\tdisplay transmit side SRS structures\n");
+}
+
+static int
+mac_srs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ uint_t args = 0;
+ mac_soft_ring_set_t srs;
+
+ if (!(flags & DCMD_ADDRSPEC)) {
+ if (mdb_walk_dcmd("mac_srs", "mac_srs", argc, argv) == -1) {
+ mdb_warn("failed to walk 'mac_srs'");
+ return (DCMD_ERR);
+ }
+ return (DCMD_OK);
+ }
+ if ((mdb_getopts(argc, argv,
+ 'r', MDB_OPT_SETBITS, MAC_SRS_RX, &args,
+ 't', MDB_OPT_SETBITS, MAC_SRS_TX, &args) != argc)) {
+ return (DCMD_USAGE);
+ }
+ if (argc > 1)
+ return (DCMD_USAGE);
+
+ if (mdb_vread(&srs, sizeof (srs), addr) == -1) {
+ mdb_warn("failed to read struct mac_soft_ring_set_s at %p",
+ addr);
+ return (DCMD_ERR);
+ }
+
+ switch (args) {
+ case MAC_SRS_RX: {
+ GElf_Sym sym;
+ char func_name[MDB_SYM_NAMLEN] = "";
+ char l_proc_name[MDB_SYM_NAMLEN] = "";
+ uintptr_t func, lproc, funcaddr, lprocaddr, rxaddr;
+
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %8s %-8s "
+ "%8s %-20s %-s%</u>\n",
+ "ADDR", "MBLK_CNT", "Q_BYTES",
+ "POLL_CNT", "SR_FUNC", "SR_LOWER_FUNC");
+ }
+ if (srs.srs_type & SRST_TX)
+ return (DCMD_OK);
+ rxaddr = addr + OFFSETOF(mac_soft_ring_set_t, srs_rx);
+ funcaddr = rxaddr + OFFSETOF(mac_srs_rx_t, sr_func);
+ lprocaddr = rxaddr + OFFSETOF(mac_srs_rx_t, sr_lower_proc);
+ (void) mdb_vread(&func, sizeof (func), funcaddr);
+ (void) mdb_vread(&lproc, sizeof (lproc), lprocaddr);
+ (void) mdb_lookup_by_addr(func, MDB_SYM_EXACT, func_name,
+ MDB_SYM_NAMLEN, &sym);
+ (void) mdb_lookup_by_addr(lproc, MDB_SYM_EXACT, l_proc_name,
+ MDB_SYM_NAMLEN, &sym);
+ mdb_printf("%?p %-8d %-8d "
+ "%-8d %-20s %-s\n",
+ addr, srs.srs_count, srs.srs_size,
+ srs.srs_rx.sr_poll_count, func_name, l_proc_name);
+ break;
+ }
+ case MAC_SRS_TX: {
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %-10s %-5s %-7s %-7s "
+ "%-7s %-7s %-7s%</u>\n",
+ "ADDR", "TX_MODE", "WOKEN", "DROP", "BLOCK",
+ "UNBLOCK", "MBLK", "SR_CNT");
+ }
+ if (!(srs.srs_type & SRST_TX))
+ return (DCMD_OK);
+
+ mdb_printf("%?p %-10s "
+ "%-5d %-7d "
+ "%-7d %-7d "
+ "%-7d %-7d\n",
+ addr, mac_srs_txmode2str(srs.srs_tx.st_mode),
+ srs.srs_tx.st_woken_up, srs.srs_tx.st_drop_count,
+ srs.srs_tx.st_blocked_cnt, srs.srs_tx.st_unblocked_cnt,
+ srs.srs_count, srs.srs_oth_ring_count);
+ break;
+ }
+ default: {
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %?s %?s %?s %-3s "
+ "%-8s %-8s %-7s %</u>\n",
+ "ADDR", "MCIP", "FLENT", "RING", "DIR",
+ "TYPE", "STATE", "SR_CNT");
+ }
+ mdb_printf("%?p %?p %?p %?p "
+ "%-3s "
+	    "%08x %08x %-7d\n",
+ addr, srs.srs_mcip, srs.srs_flent, srs.srs_ring,
+ (srs.srs_type & SRST_TX ? "TX" : "RX"),
+ srs.srs_type, srs.srs_state, srs.srs_soft_ring_count);
+ break;
+ }
+ }
+ return (DCMD_OK);
+}
+
+static int
+mac_srs_walk_init(mdb_walk_state_t *wsp)
+{
+ if (mdb_layered_walk(LAYERED_WALKER_FOR_SRS, wsp) == -1) {
+ mdb_warn("failed to walk 'mac_srs'");
+ return (WALK_ERR);
+ }
+ return (WALK_NEXT);
+}
+
+static char *
+mac_ring_state2str(mac_ring_state_t state)
+{
+ switch (state) {
+ case MR_FREE:
+ return ("free");
+ case MR_NEWLY_ADDED:
+ return ("new");
+ case MR_INUSE:
+ return ("inuse");
+ }
+ return ("--");
+}
+
+static char *
+mac_ring_classify2str(mac_classify_type_t classify)
+{
+ switch (classify) {
+ case MAC_NO_CLASSIFIER:
+ return ("no");
+ case MAC_SW_CLASSIFIER:
+ return ("sw");
+ case MAC_HW_CLASSIFIER:
+ return ("hw");
+ }
+ return ("--");
+}
+
+static int
+mac_ring_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ mac_ring_t ring;
+ mac_group_t group;
+ flow_entry_t flent;
+ mac_soft_ring_set_t srs;
+
+ if (!(flags & DCMD_ADDRSPEC)) {
+ if (mdb_walk_dcmd("mac_ring", "mac_ring", argc, argv) == -1) {
+ mdb_warn("failed to walk 'mac_ring'");
+ return (DCMD_ERR);
+ }
+ return (DCMD_OK);
+ }
+ if (mdb_vread(&ring, sizeof (ring), addr) == -1) {
+ mdb_warn("failed to read struct mac_ring_s at %p", addr);
+ return (DCMD_ERR);
+ }
+ bzero(&flent, sizeof (flent));
+ if (mdb_vread(&srs, sizeof (srs), (uintptr_t)ring.mr_srs) != -1) {
+ (void) mdb_vread(&flent, sizeof (flent),
+ (uintptr_t)srs.srs_flent);
+ }
+ (void) mdb_vread(&group, sizeof (group), (uintptr_t)ring.mr_gh);
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %4s %5s %4s %?s "
+ "%5s %?s %?s %s %</u>\n",
+ "ADDR", "TYPE", "STATE", "FLAG", "GROUP",
+ "CLASS", "MIP", "SRS", "FLOW NAME");
+ }
+ mdb_printf("%?p %-4s "
+ "%5s %04x "
+ "%?p %-5s "
+ "%?p %?p %s\n",
+ addr, ((ring.mr_type == 1)? "RX" : "TX"),
+ mac_ring_state2str(ring.mr_state), ring.mr_flag,
+ ring.mr_gh, mac_ring_classify2str(ring.mr_classify_type),
+ group.mrg_mh, ring.mr_srs, flent.fe_flow_name);
+ return (DCMD_OK);
+}
+
+static int
+mac_ring_walk_init(mdb_walk_state_t *wsp)
+{
+ if (mdb_layered_walk(LAYERED_WALKER_FOR_RING, wsp) == -1) {
+		mdb_warn("failed to walk 'mac_ring'");
+ return (WALK_ERR);
+ }
+ return (WALK_NEXT);
+}
+
+static void
+mac_ring_help(void)
+{
+	mdb_printf("If an address is specified, then the mac_ring_t "
+	    "structure at that address is printed.  Otherwise, all "
+	    "hardware rings in the system are printed.\n");
+}
+
+/* Supported dcmds */
+static const mdb_dcmd_t dcmds[] = {
+ {"mac_flow", "?[-u] [-aprtsm]", "display Flow Entry structures",
+ mac_flow_dcmd, mac_flow_help},
+ {"mac_srs", "?[-rt]", "display MAC Soft Ring Set structures",
+ mac_srs_dcmd, mac_srs_help},
+ {"mac_ring", "?", "display MAC ring (hardware) structures",
+ mac_ring_dcmd, mac_ring_help},
+ { NULL }
+};
+
+/* Supported walkers */
+static const mdb_walker_t walkers[] = {
+ {"mac_flow", "walk list of flow entry structures", mac_flow_walk_init,
+ mac_common_walk_step, NULL, NULL},
+ {"mac_srs", "walk list of mac soft ring set structures",
+ mac_srs_walk_init, mac_common_walk_step, NULL, NULL},
+ {"mac_ring", "walk list of mac ring structures", mac_ring_walk_init,
+ mac_common_walk_step, NULL, NULL},
+ { NULL }
+};
+
+static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, walkers };
+
+const mdb_modinfo_t *
+_mdb_init(void)
+{
+ return (&modinfo);
+}
diff --git a/usr/src/cmd/mdb/intel/amd64/mac/Makefile b/usr/src/cmd/mdb/intel/amd64/mac/Makefile
new file mode 100644
index 0000000000..6f24b28ea6
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/amd64/mac/Makefile
@@ -0,0 +1,34 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+MODULE = mac.so
+MDBTGT = kvm
+
+MODSRCS = mac.c
+
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.amd64
+include ../../../Makefile.module
diff --git a/usr/src/cmd/mdb/intel/ia32/mac/Makefile b/usr/src/cmd/mdb/intel/ia32/mac/Makefile
new file mode 100644
index 0000000000..69c8c97b19
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/ia32/mac/Makefile
@@ -0,0 +1,33 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+MODULE = mac.so
+MDBTGT = kvm
+
+MODSRCS = mac.c
+
+include ../../../../Makefile.cmd
+include ../../Makefile.ia32
+include ../../../Makefile.module
diff --git a/usr/src/cmd/mdb/sparc/v9/mac/Makefile b/usr/src/cmd/mdb/sparc/v9/mac/Makefile
new file mode 100644
index 0000000000..1456211245
--- /dev/null
+++ b/usr/src/cmd/mdb/sparc/v9/mac/Makefile
@@ -0,0 +1,34 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+MODULE = mac.so
+MDBTGT = kvm
+
+MODSRCS = mac.c
+
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.sparcv9
+include ../../../Makefile.module
diff --git a/usr/src/cmd/rcm_daemon/Makefile.com b/usr/src/cmd/rcm_daemon/Makefile.com
index a7293e76f1..365371c45c 100644
--- a/usr/src/cmd/rcm_daemon/Makefile.com
+++ b/usr/src/cmd/rcm_daemon/Makefile.com
@@ -22,8 +22,6 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
include ../../Makefile.cmd
@@ -51,6 +49,7 @@ COMMON_MOD_SRC = \
$(COMMON)/swap_rcm.c \
$(COMMON)/network_rcm.c \
$(COMMON)/vlan_rcm.c \
+ $(COMMON)/vnic_rcm.c \
$(COMMON)/aggr_rcm.c \
$(COMMON)/ip_rcm.c \
$(COMMON)/cluster_rcm.c \
@@ -71,6 +70,7 @@ COMMON_MOD_OBJ = \
swap_rcm.o \
network_rcm.o \
vlan_rcm.o \
+ vnic_rcm.o \
aggr_rcm.o \
ip_rcm.o \
cluster_rcm.o \
@@ -89,6 +89,7 @@ COMMON_RCM_MODS = \
SUNW_swap_rcm.so \
SUNW_network_rcm.so \
SUNW_vlan_rcm.so \
+ SUNW_vnic_rcm.so \
SUNW_aggr_rcm.so \
SUNW_ip_rcm.so \
SUNW_cluster_rcm.so \
@@ -121,6 +122,7 @@ SUNW_pool_rcm.so := LDLIBS_MODULES += -L$(ROOT)/usr/lib -lpool
SUNW_svm_rcm.so := LDLIBS_MODULES += -L$(ROOT)/usr/lib -lmeta
SUNW_network_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm
SUNW_vlan_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm
+SUNW_vnic_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm
SUNW_aggr_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm
SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm
SUNW_ip_anon_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil
diff --git a/usr/src/cmd/rcm_daemon/common/vlan_rcm.c b/usr/src/cmd/rcm_daemon/common/vlan_rcm.c
index 1177d5e384..a657baa2d4 100644
--- a/usr/src/cmd/rcm_daemon/common/vlan_rcm.c
+++ b/usr/src/cmd/rcm_daemon/common/vlan_rcm.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This RCM module adds support to the RCM framework for VLAN links
*/
@@ -68,7 +66,6 @@ typedef struct dl_vlan {
struct dl_vlan *dv_next; /* next VLAN on the same link */
struct dl_vlan *dv_prev; /* prev VLAN on the same link */
datalink_id_t dv_vlanid;
- boolean_t dv_implicit;
vlan_flag_t dv_flags; /* VLAN link flags */
} dl_vlan_t;
@@ -399,7 +396,6 @@ vlan_online_vlan(link_cache_t *node)
if (!(vlan->dv_flags & VLAN_OFFLINED))
continue;
- assert(!vlan->dv_implicit);
if ((status = dladm_vlan_up(vlan->dv_vlanid)) !=
DLADM_STATUS_OK) {
/*
@@ -429,10 +425,6 @@ vlan_offline_vlan(link_cache_t *node, uint32_t flags, cache_node_state_t state)
* Try to delete all explicit created VLAN
*/
for (vlan = node->vc_vlan; vlan != NULL; vlan = vlan->dv_next) {
-
- if (vlan->dv_implicit)
- continue;
-
if ((status = dladm_vlan_delete(vlan->dv_vlanid,
DLADM_OPT_ACTIVE)) != DLADM_STATUS_OK) {
rcm_log_message(RCM_WARNING,
@@ -918,7 +910,6 @@ vlan_update(datalink_id_t vlanid, void *arg)
node->vc_vlan = vlan;
}
- vlan->dv_implicit = vlan_attr.dv_implicit;
node->vc_state &= ~CACHE_NODE_STALE;
if (newnode)
@@ -1186,18 +1177,16 @@ vlan_notify_new_vlan(rcm_handle_t *hd, char *rsrc)
}
for (vlan = node->vc_vlan; vlan != NULL; vlan = vlan->dv_next) {
- if (!vlan->dv_implicit) {
- rcm_log_message(RCM_TRACE2,
- "VLAN: vlan_notify_new_vlan add (%u)\n",
- vlan->dv_vlanid);
+ rcm_log_message(RCM_TRACE2,
+ "VLAN: vlan_notify_new_vlan add (%u)\n",
+ vlan->dv_vlanid);
- id = vlan->dv_vlanid;
- if (nvlist_add_uint64(nvl, RCM_NV_LINKID, id) != 0) {
- rcm_log_message(RCM_ERROR,
- _("VLAN: failed to construct nvlist\n"));
- (void) mutex_unlock(&cache_lock);
- goto done;
- }
+ id = vlan->dv_vlanid;
+ if (nvlist_add_uint64(nvl, RCM_NV_LINKID, id) != 0) {
+ rcm_log_message(RCM_ERROR,
+ _("VLAN: failed to construct nvlist\n"));
+ (void) mutex_unlock(&cache_lock);
+ goto done;
}
}
(void) mutex_unlock(&cache_lock);
diff --git a/usr/src/cmd/rcm_daemon/common/vnic_rcm.c b/usr/src/cmd/rcm_daemon/common/vnic_rcm.c
new file mode 100644
index 0000000000..178d3b44a8
--- /dev/null
+++ b/usr/src/cmd/rcm_daemon/common/vnic_rcm.c
@@ -0,0 +1,1329 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This RCM module adds support to the RCM framework for VNIC links
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <synch.h>
+#include <assert.h>
+#include <strings.h>
+#include "rcm_module.h"
+#include <libintl.h>
+#include <libdllink.h>
+#include <libdlvnic.h>
+#include <libdlpi.h>
+
+/*
+ * Definitions
+ */
+#ifndef lint
+#define _(x) gettext(x)
+#else
+#define _(x) x
+#endif
+
+/* Some generic well-known values and defaults used in this module */
+#define RCM_LINK_PREFIX "SUNW_datalink" /* RCM datalink name prefix */
+#define RCM_LINK_RESOURCE_MAX (13 + LINKID_STR_WIDTH)
+
+/* VNIC link flags */
+typedef enum {
+ VNIC_OFFLINED = 0x1,
+ VNIC_CONSUMER_OFFLINED = 0x2,
+ VNIC_STALE = 0x4
+} vnic_flag_t;
+
+/* link representation */
+typedef struct dl_vnic {
+ struct dl_vnic *dlv_next; /* next VNIC on the same link */
+ struct dl_vnic *dlv_prev; /* prev VNIC on the same link */
+ datalink_id_t dlv_vnic_id;
+ vnic_flag_t dlv_flags; /* VNIC link flags */
+} dl_vnic_t;
+
+/* VNIC Cache state flags */
+typedef enum {
+ CACHE_NODE_STALE = 0x1, /* stale cached data */
+ CACHE_NODE_NEW = 0x2, /* new cached nodes */
+ CACHE_NODE_OFFLINED = 0x4 /* nodes offlined */
+} cache_node_state_t;
+
+/* Network Cache lookup options */
+#define CACHE_NO_REFRESH 0x1 /* cache refresh not needed */
+#define CACHE_REFRESH 0x2 /* refresh cache */
+
+/* Cache element */
+typedef struct link_cache {
+ struct link_cache *vc_next; /* next cached resource */
+ struct link_cache *vc_prev; /* prev cached resource */
+ char *vc_resource; /* resource name */
+ datalink_id_t vc_linkid; /* linkid */
+ dl_vnic_t *vc_vnic; /* VNIC list on this link */
+ cache_node_state_t vc_state; /* cache state flags */
+} link_cache_t;
+
+/*
+ * Global cache for network VNICs
+ */
+static link_cache_t cache_head;
+static link_cache_t cache_tail;
+static mutex_t cache_lock;
+static int events_registered = 0;
+
+/*
+ * RCM module interface prototypes
+ */
+static int vnic_register(rcm_handle_t *);
+static int vnic_unregister(rcm_handle_t *);
+static int vnic_get_info(rcm_handle_t *, char *, id_t, uint_t,
+ char **, char **, nvlist_t *, rcm_info_t **);
+static int vnic_suspend(rcm_handle_t *, char *, id_t,
+ timespec_t *, uint_t, char **, rcm_info_t **);
+static int vnic_resume(rcm_handle_t *, char *, id_t, uint_t,
+ char **, rcm_info_t **);
+static int vnic_offline(rcm_handle_t *, char *, id_t, uint_t,
+ char **, rcm_info_t **);
+static int vnic_undo_offline(rcm_handle_t *, char *, id_t, uint_t,
+ char **, rcm_info_t **);
+static int vnic_remove(rcm_handle_t *, char *, id_t, uint_t,
+ char **, rcm_info_t **);
+static int vnic_notify_event(rcm_handle_t *, char *, id_t, uint_t,
+ char **, nvlist_t *, rcm_info_t **);
+static int vnic_configure(rcm_handle_t *, datalink_id_t);
+
+/* Module private routines */
+static void cache_free();
+static int cache_update(rcm_handle_t *);
+static void cache_remove(link_cache_t *);
+static void node_free(link_cache_t *);
+static void cache_insert(link_cache_t *);
+static link_cache_t *cache_lookup(rcm_handle_t *, char *, char);
+static int vnic_consumer_offline(rcm_handle_t *, link_cache_t *,
+ char **, uint_t, rcm_info_t **);
+static void vnic_consumer_online(rcm_handle_t *, link_cache_t *,
+ char **, uint_t, rcm_info_t **);
+static int vnic_offline_vnic(link_cache_t *, uint32_t,
+ cache_node_state_t);
+static void vnic_online_vnic(link_cache_t *);
+static char *vnic_usage(link_cache_t *);
+static void vnic_log_err(datalink_id_t, char **, char *);
+static int vnic_consumer_notify(rcm_handle_t *, datalink_id_t,
+ char **, uint_t, rcm_info_t **);
+
+/* Module-Private data */
+static struct rcm_mod_ops vnic_ops =
+{
+ RCM_MOD_OPS_VERSION,
+ vnic_register,
+ vnic_unregister,
+ vnic_get_info,
+ vnic_suspend,
+ vnic_resume,
+ vnic_offline,
+ vnic_undo_offline,
+ vnic_remove,
+ NULL,
+ NULL,
+ vnic_notify_event
+};
+
+/*
+ * rcm_mod_init() - Initialize the module's cache and return the ops structure.
+ */
+struct rcm_mod_ops *
+rcm_mod_init(void)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: mod_init\n");
+
+ cache_head.vc_next = &cache_tail;
+ cache_head.vc_prev = NULL;
+ cache_tail.vc_prev = &cache_head;
+ cache_tail.vc_next = NULL;
+ (void) mutex_init(&cache_lock, 0, NULL);
+
+ /* Return the ops vectors */
+ return (&vnic_ops);
+}
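+
+/*
+ * A minimal sketch of inserting under the sentinels initialized above.
+ * cache_insert() is declared earlier and defined later in this file; the
+ * function name and guard below are hypothetical, and the block is
+ * compiled out.
+ */
+#ifdef VNIC_RCM_EXAMPLES
+static void
+example_cache_insert(link_cache_t *node)
+{
+	assert(MUTEX_HELD(&cache_lock));
+
+	/*
+	 * Splice `node' between the head sentinel and the first element;
+	 * the sentinels guarantee vc_next/vc_prev are never NULL, so no
+	 * empty-list special case is needed.
+	 */
+	node->vc_next = cache_head.vc_next;
+	node->vc_prev = &cache_head;
+	node->vc_next->vc_prev = node;
+	cache_head.vc_next = node;
+}
+#endif	/* VNIC_RCM_EXAMPLES */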
+
+/*
+ * rcm_mod_info() - Return a string describing this module.
+ */
+const char *
+rcm_mod_info(void)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: mod_info\n");
+
+ return ("VNIC module");
+}
+
+/*
+ * rcm_mod_fini() - Destroy the network VNIC cache.
+ */
+int
+rcm_mod_fini(void)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: mod_fini\n");
+
+	/*
+	 * Note that vnic_unregister() does not seem to be called anywhere,
+	 * so we free the cache nodes here.  In theory we should call
+	 * rcm_unregister_interest() for each node before freeing it, but the
+	 * framework does not provide the rcm_handle that would let us do so.
+	 */
+ cache_free();
+ (void) mutex_destroy(&cache_lock);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_register() - Make sure the cache is properly sync'ed, and its
+ * registrations are in order.
+ */
+static int
+vnic_register(rcm_handle_t *hd)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: register\n");
+
+ if (cache_update(hd) < 0)
+ return (RCM_FAILURE);
+
+ /*
+ * Need to register interest in all new resources
+ * getting attached, so we get attach event notifications
+ */
+ if (!events_registered) {
+ if (rcm_register_event(hd, RCM_RESOURCE_LINK_NEW, 0, NULL)
+ != RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to register %s\n"),
+ RCM_RESOURCE_LINK_NEW);
+ return (RCM_FAILURE);
+ } else {
+ rcm_log_message(RCM_DEBUG, "VNIC: registered %s\n",
+ RCM_RESOURCE_LINK_NEW);
+ events_registered++;
+ }
+ }
+
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_unregister() - Walk the cache, unregistering all cached resources.
+ */
+static int
+vnic_unregister(rcm_handle_t *hd)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: unregister\n");
+
+ /* Walk the cache, unregistering everything */
+ (void) mutex_lock(&cache_lock);
+ node = cache_head.vc_next;
+ while (node != &cache_tail) {
+ if (rcm_unregister_interest(hd, node->vc_resource, 0)
+ != RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to unregister %s\n"),
+ node->vc_resource);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_FAILURE);
+ }
+ cache_remove(node);
+ node_free(node);
+ node = cache_head.vc_next;
+ }
+ (void) mutex_unlock(&cache_lock);
+
+ /*
+ * Unregister interest in all new resources
+ */
+ if (events_registered) {
+ if (rcm_unregister_event(hd, RCM_RESOURCE_LINK_NEW, 0)
+ != RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to unregister %s\n"),
+ RCM_RESOURCE_LINK_NEW);
+ return (RCM_FAILURE);
+ } else {
+ rcm_log_message(RCM_DEBUG, "VNIC: unregistered %s\n",
+ RCM_RESOURCE_LINK_NEW);
+ events_registered--;
+ }
+ }
+
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_offline() - Offline VNICs on a specific node.
+ */
+static int
+vnic_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: offline(%s)\n", rsrc);
+
+ /* Lock the cache and lookup the resource */
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_REFRESH);
+	if (node == NULL) {
+		/* should not happen because the resource is registered. */
+		vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+		    "unrecognized resource");
+		(void) mutex_unlock(&cache_lock);
+		return (RCM_SUCCESS);
+	}
+
+ /*
+ * Inform consumers (IP interfaces) of associated VNICs to be offlined
+ */
+ if (vnic_consumer_offline(hd, node, errorp, flags, info) ==
+ RCM_SUCCESS) {
+ rcm_log_message(RCM_DEBUG,
+ "VNIC: consumers agreed on offline\n");
+ } else {
+ vnic_log_err(node->vc_linkid, errorp,
+ "consumers failed to offline");
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_FAILURE);
+ }
+
+ /* Check if it's a query */
+ if (flags & RCM_QUERY) {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: offline query succeeded(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+ }
+
+ if (vnic_offline_vnic(node, VNIC_OFFLINED, CACHE_NODE_OFFLINED) !=
+ RCM_SUCCESS) {
+ vnic_online_vnic(node);
+ vnic_log_err(node->vc_linkid, errorp, "offline failed");
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_FAILURE);
+ }
+
+ rcm_log_message(RCM_TRACE1, "VNIC: Offline succeeded(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_undo_offline() - Undo offline of a previously offlined node.
+ */
+/*ARGSUSED*/
+static int
+vnic_undo_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: online(%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH);
+ if (node == NULL) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp, "no such link");
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOENT;
+ return (RCM_FAILURE);
+ }
+
+	/* If the link was never offlined, there is nothing to undo here */
+ if (!(node->vc_state & CACHE_NODE_OFFLINED)) {
+ vnic_log_err(node->vc_linkid, errorp, "link not offlined");
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOTSUP;
+ return (RCM_SUCCESS);
+ }
+
+ vnic_online_vnic(node);
+
+ /*
+ * Inform IP interfaces on associated VNICs to be onlined
+ */
+ vnic_consumer_online(hd, node, errorp, flags, info);
+
+ node->vc_state &= ~CACHE_NODE_OFFLINED;
+ rcm_log_message(RCM_TRACE1, "VNIC: online succeeded(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+}
+
+static void
+vnic_online_vnic(link_cache_t *node)
+{
+ dl_vnic_t *vnic;
+ dladm_status_t status;
+ char errmsg[DLADM_STRSIZE];
+
+ /*
+	 * Try to bring all offlined VNICs back online
+ */
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ if (!(vnic->dlv_flags & VNIC_OFFLINED))
+ continue;
+
+ if ((status = dladm_vnic_up(vnic->dlv_vnic_id, 0)) !=
+ DLADM_STATUS_OK) {
+ /*
+ * Print a warning message and continue to online
+ * other VNICs.
+ */
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: VNIC online failed (%u): %s\n"),
+ vnic->dlv_vnic_id,
+ dladm_status2str(status, errmsg));
+ } else {
+ vnic->dlv_flags &= ~VNIC_OFFLINED;
+ }
+ }
+}
+
+static int
+vnic_offline_vnic(link_cache_t *node, uint32_t flags, cache_node_state_t state)
+{
+ dl_vnic_t *vnic;
+ dladm_status_t status;
+ char errmsg[DLADM_STRSIZE];
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_offline_vnic (%s %u %u)\n",
+ node->vc_resource, flags, state);
+
+ /*
+	 * Try to delete all explicitly created VNICs
+ */
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+
+ if ((status = dladm_vnic_delete(vnic->dlv_vnic_id,
+ DLADM_OPT_ACTIVE)) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: VNIC offline failed (%u): %s\n"),
+ vnic->dlv_vnic_id,
+ dladm_status2str(status, errmsg));
+ return (RCM_FAILURE);
+ } else {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: VNIC offline succeeded(%u)\n",
+ vnic->dlv_vnic_id);
+ vnic->dlv_flags |= flags;
+ }
+ }
+
+ node->vc_state |= state;
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_get_info() - Gather usage information for this resource.
+ */
+/*ARGSUSED*/
+int
+vnic_get_info(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **usagep, char **errorp, nvlist_t *props, rcm_info_t **info)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: get_info(%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_REFRESH);
+ if (node == NULL) {
+ rcm_log_message(RCM_INFO,
+ _("VNIC: get_info(%s) unrecognized resource\n"), rsrc);
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOENT;
+ return (RCM_FAILURE);
+ }
+
+ *usagep = vnic_usage(node);
+ (void) mutex_unlock(&cache_lock);
+ if (*usagep == NULL) {
+ /* most likely malloc failure */
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: get_info(%s) malloc failure\n"), rsrc);
+ errno = ENOMEM;
+ return (RCM_FAILURE);
+ }
+
+ /* Set client/role properties */
+ (void) nvlist_add_string(props, RCM_CLIENT_NAME, "VNIC");
+
+ rcm_log_message(RCM_TRACE1, "VNIC: get_info(%s) info = %s\n",
+ rsrc, *usagep);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_suspend() - Nothing to do, always okay
+ */
+/*ARGSUSED*/
+static int
+vnic_suspend(rcm_handle_t *hd, char *rsrc, id_t id, timespec_t *interval,
+ uint_t flags, char **errorp, rcm_info_t **info)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: suspend(%s)\n", rsrc);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_resume() - Nothing to do, always okay
+ */
+/*ARGSUSED*/
+static int
+vnic_resume(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: resume(%s)\n", rsrc);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_consumer_remove()
+ *
+ * Notify VNIC consumers to remove their cached state.
+ */
+static int
+vnic_consumer_remove(rcm_handle_t *hd, link_cache_t *node, uint_t flags,
+ rcm_info_t **info)
+{
+ dl_vnic_t *vnic = NULL;
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+ int ret = RCM_SUCCESS;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_remove (%s)\n",
+ node->vc_resource);
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+
+ /*
+ * This will only be called when the offline operation
+ * succeeds, so the VNIC consumers must have been offlined
+ * at this point.
+ */
+ assert(vnic->dlv_flags & VNIC_CONSUMER_OFFLINED);
+
+ (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u",
+ RCM_LINK_PREFIX, vnic->dlv_vnic_id);
+
+ ret = rcm_notify_remove(hd, rsrc, flags, info);
+ if (ret != RCM_SUCCESS) {
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: notify remove failed (%s)\n"), rsrc);
+ break;
+ }
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_remove done\n");
+ return (ret);
+}
+
+/*
+ * vnic_remove() - remove a resource from cache
+ */
+/*ARGSUSED*/
+static int
+vnic_remove(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ link_cache_t *node;
+ int rv;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: remove(%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH);
+ if (node == NULL) {
+ rcm_log_message(RCM_INFO,
+ _("VNIC: remove(%s) unrecognized resource\n"), rsrc);
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOENT;
+ return (RCM_FAILURE);
+ }
+
+ /* remove the cached entry for the resource */
+ cache_remove(node);
+ (void) mutex_unlock(&cache_lock);
+
+ rv = vnic_consumer_remove(hd, node, flags, info);
+ node_free(node);
+ return (rv);
+}
+
+/*
+ * vnic_notify_event - Project-private implementation to receive new resource
+ *		   events.  It intercepts all new resource events.  If the
+ *		   new resource is a network resource, a notify is passed up
+ *		   for it as well.  The new resource need not be cached here,
+ *		   since caching happens again at registration time.
+ */
+/*ARGSUSED*/
+static int
+vnic_notify_event(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, nvlist_t *nvl, rcm_info_t **info)
+{
+ nvpair_t *nvp = NULL;
+ datalink_id_t linkid;
+ uint64_t id64;
+ int rv = RCM_SUCCESS;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: notify_event(%s)\n", rsrc);
+
+ if (strcmp(rsrc, RCM_RESOURCE_LINK_NEW) != 0) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "unrecognized event");
+ errno = EINVAL;
+ return (RCM_FAILURE);
+ }
+
+ /* Update cache to reflect latest VNICs */
+ if (cache_update(hd) < 0) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+		    "private cache update failed");
+ return (RCM_FAILURE);
+ }
+
+ /*
+	 * Try our best to recover all of the configuration.
+ */
+ rcm_log_message(RCM_DEBUG, "VNIC: process_nvlist\n");
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ if (strcmp(nvpair_name(nvp), RCM_NV_LINKID) != 0)
+ continue;
+
+ if (nvpair_value_uint64(nvp, &id64) != 0) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "cannot get linkid");
+ rv = RCM_FAILURE;
+ continue;
+ }
+
+ linkid = (datalink_id_t)id64;
+ if (vnic_configure(hd, linkid) != 0) {
+ vnic_log_err(linkid, errorp, "configuring failed");
+ rv = RCM_FAILURE;
+ continue;
+ }
+
+ /* Notify all VNIC consumers */
+ if (vnic_consumer_notify(hd, linkid, errorp, flags,
+ info) != 0) {
+ vnic_log_err(linkid, errorp, "consumer notify failed");
+ rv = RCM_FAILURE;
+ }
+ }
+
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: notify_event: link configuration complete\n");
+ return (rv);
+}
+
+/*
+ * vnic_usage - Determine the usage of a link.
+ *	    The returned buffer is owned by the caller, and the caller
+ *	    must free it when done.
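+ *	    For example: "net0 VNICs: vnic1, vnic2" (names illustrative).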
+ */
+static char *
+vnic_usage(link_cache_t *node)
+{
+ dl_vnic_t *vnic;
+ int nvnic;
+ char *buf;
+ const char *fmt;
+ char *sep;
+ char errmsg[DLADM_STRSIZE];
+ char name[MAXLINKNAMELEN];
+ dladm_status_t status;
+ size_t bufsz;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: usage(%s)\n", node->vc_resource);
+
+ assert(MUTEX_HELD(&cache_lock));
+ if ((status = dladm_datalink_id2info(node->vc_linkid, NULL, NULL, NULL,
+ name, sizeof (name))) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: usage(%s) get link name failure(%s)\n"),
+ node->vc_resource, dladm_status2str(status, errmsg));
+ return (NULL);
+ }
+
+ if (node->vc_state & CACHE_NODE_OFFLINED)
+ fmt = _("%1$s offlined");
+ else
+ fmt = _("%1$s VNICs: ");
+
+ /* TRANSLATION_NOTE: separator used between VNIC linkids */
+ sep = _(", ");
+
+ nvnic = 0;
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next)
+ nvnic++;
+
+ /* space for VNICs and separators, plus message */
+ bufsz = nvnic * (MAXLINKNAMELEN + strlen(sep)) +
+ strlen(fmt) + MAXLINKNAMELEN + 1;
+ if ((buf = malloc(bufsz)) == NULL) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: usage(%s) malloc failure(%s)\n"),
+ node->vc_resource, strerror(errno));
+ return (NULL);
+ }
+ (void) snprintf(buf, bufsz, fmt, name);
+
+ if (node->vc_state & CACHE_NODE_OFFLINED) {
+ /* Nothing else to do */
+ rcm_log_message(RCM_TRACE2, "VNIC: usage (%s) info = %s\n",
+ node->vc_resource, buf);
+ return (buf);
+ }
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ rcm_log_message(RCM_DEBUG, "VNIC:= %u\n", vnic->dlv_vnic_id);
+
+ if ((status = dladm_datalink_id2info(vnic->dlv_vnic_id, NULL,
+ NULL, NULL, name, sizeof (name))) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: usage(%s) get vnic %u name failure(%s)\n"),
+ node->vc_resource, vnic->dlv_vnic_id,
+ dladm_status2str(status, errmsg));
+ free(buf);
+ return (NULL);
+ }
+
+ (void) strlcat(buf, name, bufsz);
+ if (vnic->dlv_next != NULL)
+ (void) strlcat(buf, sep, bufsz);
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: usage (%s) info = %s\n",
+ node->vc_resource, buf);
+
+ return (buf);
+}
+
+/*
+ * Cache management routines; all cache management functions must be
+ * called with cache_lock held.
+ */
+
+/*
+ * cache_lookup() - Get a cache node for a resource.
+ * Call with cache lock held.
+ *
+ * This ensures that the cache is consistent with the system state and
+ * returns a pointer to the cache element corresponding to the resource.
+ */
+static link_cache_t *
+cache_lookup(rcm_handle_t *hd, char *rsrc, char options)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: cache lookup(%s)\n", rsrc);
+
+ assert(MUTEX_HELD(&cache_lock));
+ if (options & CACHE_REFRESH) {
+ /* drop lock since update locks cache again */
+ (void) mutex_unlock(&cache_lock);
+ (void) cache_update(hd);
+ (void) mutex_lock(&cache_lock);
+ }
+
+ node = cache_head.vc_next;
+ for (; node != &cache_tail; node = node->vc_next) {
+ if (strcmp(rsrc, node->vc_resource) == 0) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: cache lookup succeeded(%s)\n", rsrc);
+ return (node);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * node_free - Free a node from the cache
+ */
+static void
+node_free(link_cache_t *node)
+{
+ dl_vnic_t *vnic, *next;
+
+ if (node != NULL) {
+ free(node->vc_resource);
+
+ /* free the VNIC list */
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = next) {
+ next = vnic->dlv_next;
+ free(vnic);
+ }
+ free(node);
+ }
+}
+
+/*
+ * cache_insert - Insert a resource node in cache
+ */
+static void
+cache_insert(link_cache_t *node)
+{
+ assert(MUTEX_HELD(&cache_lock));
+
+ /* insert at the head for best performance */
+ node->vc_next = cache_head.vc_next;
+ node->vc_prev = &cache_head;
+
+ node->vc_next->vc_prev = node;
+ node->vc_prev->vc_next = node;
+}
+
+/*
+ * cache_remove() - Remove a resource node from cache.
+ */
+static void
+cache_remove(link_cache_t *node)
+{
+ assert(MUTEX_HELD(&cache_lock));
+ node->vc_next->vc_prev = node->vc_prev;
+ node->vc_prev->vc_next = node->vc_next;
+ node->vc_next = NULL;
+ node->vc_prev = NULL;
+}
+
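+/*
+ * Walker argument for vnic_update(): the RCM handle in, and the
+ * cumulative walk status out.
+ */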
+typedef struct vnic_update_arg_s {
+ rcm_handle_t *hd;
+ int retval;
+} vnic_update_arg_t;
+
+/*
+ * vnic_update() - Update physical interface properties
+ */
+static int
+vnic_update(datalink_id_t vnicid, void *arg)
+{
+ vnic_update_arg_t *vnic_update_argp = arg;
+ rcm_handle_t *hd = vnic_update_argp->hd;
+ link_cache_t *node;
+ dl_vnic_t *vnic;
+ char *rsrc;
+ dladm_vnic_attr_t vnic_attr;
+ dladm_status_t status;
+ char errmsg[DLADM_STRSIZE];
+ boolean_t newnode = B_FALSE;
+ int ret = -1;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_update(%u)\n", vnicid);
+
+ assert(MUTEX_HELD(&cache_lock));
+ status = dladm_vnic_info(vnicid, &vnic_attr, DLADM_OPT_ACTIVE);
+ if (status != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: vnic_update() cannot get vnic information for "
+ "%u(%s)\n", vnicid, dladm_status2str(status, errmsg));
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ if (vnic_attr.va_link_id == DATALINK_INVALID_LINKID) {
+ /*
+ * Skip the etherstubs.
+ */
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: vnic_update(): skip the etherstub %u\n", vnicid);
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ rsrc = malloc(RCM_LINK_RESOURCE_MAX);
+ if (rsrc == NULL) {
+ rcm_log_message(RCM_ERROR, _("VNIC: malloc error(%s): %u\n"),
+ strerror(errno), vnicid);
+ goto done;
+ }
+
+ (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u",
+ RCM_LINK_PREFIX, vnic_attr.va_link_id);
+
+ node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH);
+ if (node != NULL) {
+ rcm_log_message(RCM_DEBUG,
+ "VNIC: %s already registered (vnicid:%d)\n",
+ rsrc, vnic_attr.va_vnic_id);
+ free(rsrc);
+ } else {
+ rcm_log_message(RCM_DEBUG,
+ "VNIC: %s is a new resource (vnicid:%d)\n",
+ rsrc, vnic_attr.va_vnic_id);
+ if ((node = calloc(1, sizeof (link_cache_t))) == NULL) {
+ free(rsrc);
+ rcm_log_message(RCM_ERROR, _("VNIC: calloc: %s\n"),
+ strerror(errno));
+ goto done;
+ }
+
+ node->vc_resource = rsrc;
+ node->vc_vnic = NULL;
+ node->vc_linkid = vnic_attr.va_link_id;
+ node->vc_state |= CACHE_NODE_NEW;
+ newnode = B_TRUE;
+ }
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ if (vnic->dlv_vnic_id == vnicid) {
+ vnic->dlv_flags &= ~VNIC_STALE;
+ break;
+ }
+ }
+
+ if (vnic == NULL) {
+ if ((vnic = calloc(1, sizeof (dl_vnic_t))) == NULL) {
+ rcm_log_message(RCM_ERROR, _("VNIC: malloc: %s\n"),
+ strerror(errno));
+ if (newnode) {
+ free(rsrc);
+ free(node);
+ }
+ goto done;
+ }
+ vnic->dlv_vnic_id = vnicid;
+ vnic->dlv_next = node->vc_vnic;
+ vnic->dlv_prev = NULL;
+ if (node->vc_vnic != NULL)
+ node->vc_vnic->dlv_prev = vnic;
+ node->vc_vnic = vnic;
+ }
+
+ node->vc_state &= ~CACHE_NODE_STALE;
+
+ if (newnode)
+ cache_insert(node);
+
+ rcm_log_message(RCM_TRACE3, "VNIC: vnic_update: succeeded(%u)\n",
+ vnicid);
+ ret = 0;
+done:
+ vnic_update_argp->retval = ret;
+ return (ret == 0 ? DLADM_WALK_CONTINUE : DLADM_WALK_TERMINATE);
+}
+
+/*
+ * vnic_update_all() - Determine all VNIC links in the system
+ */
+static int
+vnic_update_all(rcm_handle_t *hd)
+{
+ vnic_update_arg_t arg = {NULL, 0};
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_update_all\n");
+
+ assert(MUTEX_HELD(&cache_lock));
+ arg.hd = hd;
+ (void) dladm_walk_datalink_id(vnic_update, &arg, DATALINK_CLASS_VNIC,
+ DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+ return (arg.retval);
+}
+
+/*
+ * cache_update() - Update cache with latest interface info
+ */
+static int
+cache_update(rcm_handle_t *hd)
+{
+ link_cache_t *node, *nnode;
+ dl_vnic_t *vnic;
+ int rv;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: cache_update\n");
+
+ (void) mutex_lock(&cache_lock);
+
+ /* first we walk the entire cache, marking each entry stale */
+ node = cache_head.vc_next;
+ for (; node != &cache_tail; node = node->vc_next) {
+ node->vc_state |= CACHE_NODE_STALE;
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next)
+ vnic->dlv_flags |= VNIC_STALE;
+ }
+
+ rv = vnic_update_all(hd);
+
+ /*
+	 * Continue to delete all stale nodes from the cache even if
+	 * vnic_update_all() failed. Unregister links that are not offlined
+	 * and are still in the cache.
+ */
+ for (node = cache_head.vc_next; node != &cache_tail; node = nnode) {
+ dl_vnic_t *vnic, *next;
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = next) {
+ next = vnic->dlv_next;
+
+ /* clear stale VNICs */
+ if (vnic->dlv_flags & VNIC_STALE) {
+ if (vnic->dlv_prev != NULL)
+ vnic->dlv_prev->dlv_next = next;
+ else
+ node->vc_vnic = next;
+
+ if (next != NULL)
+ next->dlv_prev = vnic->dlv_prev;
+ free(vnic);
+ }
+ }
+
+ nnode = node->vc_next;
+ if (node->vc_state & CACHE_NODE_STALE) {
+ (void) rcm_unregister_interest(hd, node->vc_resource,
+ 0);
+ rcm_log_message(RCM_DEBUG, "VNIC: unregistered %s\n",
+ node->vc_resource);
+ assert(node->vc_vnic == NULL);
+ cache_remove(node);
+ node_free(node);
+ continue;
+ }
+
+ if (!(node->vc_state & CACHE_NODE_NEW))
+ continue;
+
+ if (rcm_register_interest(hd, node->vc_resource, 0, NULL) !=
+ RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to register %s\n"),
+ node->vc_resource);
+ rv = -1;
+ } else {
+ rcm_log_message(RCM_DEBUG, "VNIC: registered %s\n",
+ node->vc_resource);
+ node->vc_state &= ~CACHE_NODE_NEW;
+ }
+ }
+
+ (void) mutex_unlock(&cache_lock);
+ return (rv);
+}
+
+/*
+ * cache_free() - Empty the cache
+ */
+static void
+cache_free()
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: cache_free\n");
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_head.vc_next;
+ while (node != &cache_tail) {
+ cache_remove(node);
+ node_free(node);
+ node = cache_head.vc_next;
+ }
+ (void) mutex_unlock(&cache_lock);
+}
+
+/*
+ * vnic_log_err() - RCM error log wrapper
+ */
+static void
+vnic_log_err(datalink_id_t linkid, char **errorp, char *errmsg)
+{
+ char link[MAXLINKNAMELEN];
+ char errstr[DLADM_STRSIZE];
+ dladm_status_t status;
+ int len;
+ const char *errfmt;
+ char *error;
+
+ link[0] = '\0';
+ if (linkid != DATALINK_INVALID_LINKID) {
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+
+ (void) snprintf(rsrc, sizeof (rsrc), "%s/%u",
+ RCM_LINK_PREFIX, linkid);
+
+ rcm_log_message(RCM_ERROR, _("VNIC: %s(%s)\n"), errmsg, rsrc);
+ if ((status = dladm_datalink_id2info(linkid, NULL, NULL,
+ NULL, link, sizeof (link))) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: cannot get link name for (%s) %s\n"),
+ rsrc, dladm_status2str(status, errstr));
+ }
+ } else {
+ rcm_log_message(RCM_ERROR, _("VNIC: %s\n"), errmsg);
+ }
+
+ errfmt = strlen(link) > 0 ? _("VNIC: %s(%s)") : _("VNIC: %s");
+ len = strlen(errfmt) + strlen(errmsg) + MAXLINKNAMELEN + 1;
+ if ((error = malloc(len)) != NULL) {
+ if (strlen(link) > 0)
+ (void) snprintf(error, len, errfmt, errmsg, link);
+ else
+ (void) snprintf(error, len, errfmt, errmsg);
+ }
+
+ if (errorp != NULL)
+ *errorp = error;
+}
+
+/*
+ * vnic_consumer_online()
+ *
+ * Notify online to VNIC consumers.
+ */
+/* ARGSUSED */
+static void
+vnic_consumer_online(rcm_handle_t *hd, link_cache_t *node, char **errorp,
+ uint_t flags, rcm_info_t **info)
+{
+ dl_vnic_t *vnic;
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_online (%s)\n",
+ node->vc_resource);
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ if (!(vnic->dlv_flags & VNIC_CONSUMER_OFFLINED))
+ continue;
+
+ (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u",
+ RCM_LINK_PREFIX, vnic->dlv_vnic_id);
+
+ if (rcm_notify_online(hd, rsrc, flags, info) == RCM_SUCCESS)
+ vnic->dlv_flags &= ~VNIC_CONSUMER_OFFLINED;
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_online done\n");
+}
+
+/*
+ * vnic_consumer_offline()
+ *
+ * Offline VNIC consumers.
+ */
+static int
+vnic_consumer_offline(rcm_handle_t *hd, link_cache_t *node, char **errorp,
+ uint_t flags, rcm_info_t **info)
+{
+ dl_vnic_t *vnic;
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+ int ret = RCM_SUCCESS;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_offline (%s)\n",
+ node->vc_resource);
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u",
+ RCM_LINK_PREFIX, vnic->dlv_vnic_id);
+
+ ret = rcm_request_offline(hd, rsrc, flags, info);
+ if (ret != RCM_SUCCESS)
+ break;
+
+ vnic->dlv_flags |= VNIC_CONSUMER_OFFLINED;
+ }
+
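+	/*
+	 * If any consumer could not be offlined, roll back by onlining
+	 * the consumers that were offlined above.
+	 */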
+ if (vnic != NULL)
+ vnic_consumer_online(hd, node, errorp, flags, info);
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_offline done\n");
+ return (ret);
+}
+
+/*
+ * Send RCM_RESOURCE_LINK_NEW events to other modules about new VNICs.
+ * Return 0 on success, -1 on failure.
+ */
+static int
+vnic_notify_new_vnic(rcm_handle_t *hd, char *rsrc)
+{
+ link_cache_t *node;
+ dl_vnic_t *vnic;
+ nvlist_t *nvl = NULL;
+ uint64_t id;
+ int ret = -1;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_notify_new_vnic (%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ if ((node = cache_lookup(hd, rsrc, CACHE_REFRESH)) == NULL) {
+ (void) mutex_unlock(&cache_lock);
+ return (0);
+ }
+
+ if (nvlist_alloc(&nvl, 0, 0) != 0) {
+ (void) mutex_unlock(&cache_lock);
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: failed to allocate nvlist\n"));
+ goto done;
+ }
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: vnic_notify_new_vnic add (%u)\n", vnic->dlv_vnic_id);
+
+ id = vnic->dlv_vnic_id;
+ if (nvlist_add_uint64(nvl, RCM_NV_LINKID, id) != 0) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to construct nvlist\n"));
+ (void) mutex_unlock(&cache_lock);
+ goto done;
+ }
+ }
+ (void) mutex_unlock(&cache_lock);
+
+ if (rcm_notify_event(hd, RCM_RESOURCE_LINK_NEW, 0, nvl, NULL) !=
+ RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to notify %s event for %s\n"),
+ RCM_RESOURCE_LINK_NEW, node->vc_resource);
+ goto done;
+ }
+
+ ret = 0;
+done:
+ if (nvl != NULL)
+ nvlist_free(nvl);
+ return (ret);
+}
+
+/*
+ * vnic_consumer_notify() - Notify consumers of VNICs coming back online.
+ */
+static int
+vnic_consumer_notify(rcm_handle_t *hd, datalink_id_t linkid, char **errorp,
+ uint_t flags, rcm_info_t **info)
+{
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+ link_cache_t *node;
+
+ /* Check for the interface in the cache */
+ (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u", RCM_LINK_PREFIX,
+ linkid);
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_notify(%s)\n", rsrc);
+
+ /*
+ * Inform IP consumers of the new link.
+ */
+ if (vnic_notify_new_vnic(hd, rsrc) != 0) {
+ (void) mutex_lock(&cache_lock);
+ if ((node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH)) != NULL) {
+ (void) vnic_offline_vnic(node, VNIC_STALE,
+ CACHE_NODE_STALE);
+ }
+ (void) mutex_unlock(&cache_lock);
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: vnic_notify_new_vnic failed(%s)\n", rsrc);
+ return (-1);
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_notify succeeded\n");
+ return (0);
+}
+
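+/*
+ * Walker argument for vnic_up(): only VNICs created over this link are
+ * brought up; retval records whether any of them failed.
+ */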
+typedef struct vnic_up_arg_s {
+ datalink_id_t linkid;
+ int retval;
+} vnic_up_arg_t;
+
+static int
+vnic_up(datalink_id_t vnicid, void *arg)
+{
+ vnic_up_arg_t *vnic_up_argp = arg;
+ dladm_status_t status;
+ dladm_vnic_attr_t vnic_attr;
+ char errmsg[DLADM_STRSIZE];
+
+ status = dladm_vnic_info(vnicid, &vnic_attr, DLADM_OPT_PERSIST);
+ if (status != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: vnic_up(): cannot get information for VNIC %u "
+ "(%s)\n", vnicid, dladm_status2str(status, errmsg));
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ if (vnic_attr.va_link_id != vnic_up_argp->linkid)
+ return (DLADM_WALK_CONTINUE);
+
+ rcm_log_message(RCM_TRACE3, "VNIC: vnic_up(%u)\n", vnicid);
+ if ((status = dladm_vnic_up(vnicid, 0)) == DLADM_STATUS_OK)
+ return (DLADM_WALK_CONTINUE);
+
+ /*
+	 * Log a warning message and continue to bring up the other VNICs.
+ */
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: VNIC up failed (%u): %s\n"),
+ vnicid, dladm_status2str(status, errmsg));
+
+ vnic_up_argp->retval = -1;
+ return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * vnic_configure() - Configure VNICs over a physical link after it attaches
+ */
+static int
+vnic_configure(rcm_handle_t *hd, datalink_id_t linkid)
+{
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+ link_cache_t *node;
+ vnic_up_arg_t arg = {DATALINK_INVALID_LINKID, 0};
+
+ /* Check for the VNICs in the cache */
+ (void) snprintf(rsrc, sizeof (rsrc), "%s/%u", RCM_LINK_PREFIX, linkid);
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_configure(%s)\n", rsrc);
+
+ /* Check if the link is new or was previously offlined */
+ (void) mutex_lock(&cache_lock);
+ if (((node = cache_lookup(hd, rsrc, CACHE_REFRESH)) != NULL) &&
+ (!(node->vc_state & CACHE_NODE_OFFLINED))) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: Skipping configured interface(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (0);
+ }
+ (void) mutex_unlock(&cache_lock);
+
+ arg.linkid = linkid;
+ (void) dladm_walk_datalink_id(vnic_up, &arg, DATALINK_CLASS_VNIC,
+ DATALINK_ANY_MEDIATYPE, DLADM_OPT_PERSIST);
+
+ if (arg.retval == 0) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: vnic_configure succeeded(%s)\n", rsrc);
+ }
+ return (arg.retval);
+}
diff --git a/usr/src/cmd/svc/milestone/net-physical b/usr/src/cmd/svc/milestone/net-physical
index bcee0c9818..8530806768 100644
--- a/usr/src/cmd/svc/milestone/net-physical
+++ b/usr/src/cmd/svc/milestone/net-physical
@@ -26,8 +26,6 @@
# Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T.
# All rights reserved.
#
-#
-# ident "%Z%%M% %I% %E% SMI"
. /lib/svc/share/smf_include.sh
. /lib/svc/share/net_include.sh
@@ -81,6 +79,14 @@ if smf_is_globalzone; then
/sbin/dladm up-aggr
/sbin/dladm up-vlan
/sbin/dladm init-secobj
+ #
+ # Bring up VNICs
+ #
+ /sbin/dladm up-vnic
+ #
+ # Create flows via flowadm.
+ #
+ /sbin/flowadm init-flow
fi
#
diff --git a/usr/src/cmd/svc/profile/generic_limited_net.xml b/usr/src/cmd/svc/profile/generic_limited_net.xml
index 449d06bf1e..5fed0e86bf 100644
--- a/usr/src/cmd/svc/profile/generic_limited_net.xml
+++ b/usr/src/cmd/svc/profile/generic_limited_net.xml
@@ -62,6 +62,7 @@
<instance name='flow' enabled='false'/>
<instance name='process' enabled='false'/>
<instance name='task' enabled='false'/>
+ <instance name='net' enabled='false'/>
</service>
<service name='system/hal' version='1' type='service'>
<instance name='default' enabled='true'/>
diff --git a/usr/src/cmd/svc/profile/generic_open.xml b/usr/src/cmd/svc/profile/generic_open.xml
index 7d837f4b53..34b600cca1 100644
--- a/usr/src/cmd/svc/profile/generic_open.xml
+++ b/usr/src/cmd/svc/profile/generic_open.xml
@@ -59,6 +59,7 @@
<instance name='flow' enabled='false'/>
<instance name='process' enabled='false'/>
<instance name='task' enabled='false'/>
+ <instance name='net' enabled='false'/>
</service>
<service name='system/hal' version='1' type='service'>
<instance name='default' enabled='true'/>
diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c
index 3869b370c1..46b2b5a958 100644
--- a/usr/src/cmd/truss/codes.c
+++ b/usr/src/cmd/truss/codes.c
@@ -84,6 +84,7 @@
#include <sys/ptms.h>
#include <sys/aggr.h>
#include <sys/dld.h>
+#include <sys/vnic.h>
#include <sys/fs/zfs.h>
#include <inet/kssl/kssl.h>
#include <sys/dkio.h>
@@ -844,18 +845,38 @@ const struct ioc {
{ (uint_t)DLDIOC_ATTR, "DLDIOC_ATTR", "dld_ioc_attr"},
{ (uint_t)DLDIOC_PHYS_ATTR, "DLDIOC_PHYS_ATTR",
"dld_ioc_phys_attr"},
- { (uint_t)DLDIOC_VLAN_ATTR, "DLDIOC_VLAN_ATTR",
- "dld_ioc_vlan_attr"},
- { (uint_t)DLDIOC_CREATE_VLAN, "DLDIOC_CREATE_VLAN",
- "dld_ioc_create_vlan"},
- { (uint_t)DLDIOC_DELETE_VLAN, "DLDIOC_DELETE_VLAN",
- "dld_ioc_delete_vlan"},
- { (uint_t)DLDIOC_DOORSERVER, "DLDIOC_DOORSERVER", "dld_ioc_door"},
- { (uint_t)DLDIOC_RENAME, "DLDIOC_RENAME", "dld_ioc_rename"},
- { (uint_t)DLDIOC_SETMACPROP, "DLDIOC_SETMACPROP",
+ { (uint_t)DLDIOC_DOORSERVER, "DLDIOC_DOORSERVER", "dld_ioc_door"},
+ { (uint_t)DLDIOC_RENAME, "DLDIOC_RENAME", "dld_ioc_rename"},
+ { (uint_t)DLDIOC_SECOBJ_GET, "DLDIOC_SECOBJ_GET",
+ "dld_ioc_secobj_get"},
+ { (uint_t)DLDIOC_SECOBJ_SET, "DLDIOC_SECOBJ_SET",
+ "dld_ioc_secobj_set"},
+ { (uint_t)DLDIOC_SECOBJ_UNSET, "DLDIOC_SECOBJ_UNSET",
+ "dld_ioc_secobj_unset"},
+ { (uint_t)DLDIOC_MACADDRGET, "DLDIOC_MACADDRGET",
+ "dld_ioc_macaddrget"},
+ { (uint_t)DLDIOC_SETMACPROP, "DLDIOC_SETMACPROP",
"dld_ioc_macprop_s"},
- { (uint_t)DLDIOC_GETMACPROP, "DLDIOC_GETMACPROP",
+ { (uint_t)DLDIOC_GETMACPROP, "DLDIOC_GETMACPROP",
"dld_ioc_macprop_s"},
+ { (uint_t)DLDIOC_ADDFLOW, "DLDIOC_ADDFLOW",
+ "dld_ioc_addflow"},
+ { (uint_t)DLDIOC_REMOVEFLOW, "DLDIOC_REMOVEFLOW",
+ "dld_ioc_removeflow"},
+ { (uint_t)DLDIOC_MODIFYFLOW, "DLDIOC_MODIFYFLOW",
+ "dld_ioc_modifyflow"},
+ { (uint_t)DLDIOC_WALKFLOW, "DLDIOC_WALKFLOW",
+ "dld_ioc_walkflow"},
+ { (uint_t)DLDIOC_USAGELOG, "DLDIOC_USAGELOG",
+ "dld_ioc_usagelog"},
+
+ /* vnic ioctls */
+ { (uint_t)VNIC_IOC_CREATE, "VNIC_IOC_CREATE",
+ "vnic_ioc_create"},
+ { (uint_t)VNIC_IOC_DELETE, "VNIC_IOC_DELETE",
+ "vnic_ioc_delete"},
+ { (uint_t)VNIC_IOC_INFO, "VNIC_IOC_INFO",
+ "vnic_ioc_info"},
/* ZFS ioctls */
{ (uint_t)ZFS_IOC_POOL_CREATE, "ZFS_IOC_POOL_CREATE",
diff --git a/usr/src/cmd/vna/Makefile b/usr/src/cmd/vna/Makefile
index 4e5e25e85b..6b608e0126 100644
--- a/usr/src/cmd/vna/Makefile
+++ b/usr/src/cmd/vna/Makefile
@@ -22,15 +22,16 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
#
PROG = vna
include ../Makefile.cmd
+LDLIBS += -L$(ROOT)/lib
LDLIBS += -ldladm -lsocket -ldlpi
+
.KEEP_STATE:
all: $(PROG)
diff --git a/usr/src/cmd/vna/vna.c b/usr/src/cmd/vna/vna.c
index 6262de5959..6a05cf1777 100644
--- a/usr/src/cmd/vna/vna.c
+++ b/usr/src/cmd/vna/vna.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This utility constitutes a private interface - it will be removed
* in a future release of Solaris. Neither users nor other software
@@ -40,7 +38,7 @@
#include <libdlpi.h>
typedef struct vnic_attr {
- dladm_vnic_attr_sys_t attr;
+ dladm_vnic_attr_t attr;
char *name;
} vnic_attr_t;
@@ -48,7 +46,7 @@ typedef struct vnic_attr {
static int
v_print(datalink_id_t vnic_id, void *arg)
{
- dladm_vnic_attr_sys_t attr;
+ dladm_vnic_attr_t attr;
char vnic[MAXLINKNAMELEN];
char link[MAXLINKNAMELEN];
@@ -87,8 +85,8 @@ static int
v_find(datalink_id_t vnic_id, void *arg)
{
vnic_attr_t *vattr = arg;
- dladm_vnic_attr_sys_t *specp = &vattr->attr;
- dladm_vnic_attr_sys_t attr;
+ dladm_vnic_attr_t *specp = &vattr->attr;
+ dladm_vnic_attr_t attr;
char linkname[MAXLINKNAMELEN];
if (dladm_vnic_info(vnic_id, &attr, DLADM_OPT_ACTIVE) !=
@@ -221,7 +219,8 @@ v_add(char *link, char *addr, char *name)
*/
status = dladm_vnic_create(name, linkid,
VNIC_MAC_ADDR_TYPE_FIXED, (uchar_t *)ea->ether_addr_octet,
- ETHERADDRL, &vnic_id, DLADM_OPT_ACTIVE);
+ ETHERADDRL, NULL, 0, 0, &vnic_id, NULL, DLADM_OPT_ACTIVE);
+
if (status != DLADM_STATUS_OK) {
(void) fprintf(stderr, "dladm_vnic_create: %s\n",
dladm_status2str(status, buf));
diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile
index b7a9d26795..8b3fb0aaf9 100644
--- a/usr/src/lib/Makefile
+++ b/usr/src/lib/Makefile
@@ -546,7 +546,7 @@ libdevinfo: libnvpair libsec
libdhcpagent: libsocket libdhcputil libuuid libdlpi
libdhcpsvc: libinetutil
libdhcputil: libnsl libgen libinetutil libdlpi
-libdladm: libdevinfo libinetutil libsocket
+libdladm: libdevinfo libinetutil libsocket libnsl libexacct libscf
libdll: libast
libdlpi: libinetutil libdladm
libdscfg: libnsctl libunistat libsocket libnsl
diff --git a/usr/src/lib/libdladm/Makefile b/usr/src/lib/libdladm/Makefile
index 630a7e2e19..ebe6c51eee 100644
--- a/usr/src/lib/libdladm/Makefile
+++ b/usr/src/lib/libdladm/Makefile
@@ -22,14 +22,14 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
#
include $(SRC)/lib/Makefile.lib
HDRS = libdladm.h libdladm_impl.h libdllink.h libdlaggr.h \
libdlwlan.h libdlwlan_impl.h libdlvnic.h libdlvlan.h \
- libdlmgmt.h
+ libdlmgmt.h libdlflow.h libdlflow_impl.h libdlstat.h
+
HDRDIR = common
SUBDIRS = $(MACH)
@@ -39,7 +39,11 @@ POFILE = libdladm.po
MSGFILES = common/libdladm.c common/linkprop.c common/secobj.c \
common/libdllink.c common/libdlaggr.c \
common/libdlwlan.c common/libdlvnic.c \
- common/libdlvlan.c common/libdlmgmt.c
+ common/libdlvlan.c common/libdlmgmt.c \
+ common/flowattr.c common/flowprop.c \
+ common/propfuncs.c common/libdlflow.c \
+ common/libdlstat.c common/flowattr.c
+
XGETFLAGS = -a -x libdladm.xcl
all := TARGET = all
diff --git a/usr/src/lib/libdladm/Makefile.com b/usr/src/lib/libdladm/Makefile.com
index 0f6419bd29..50aa57e710 100644
--- a/usr/src/lib/libdladm/Makefile.com
+++ b/usr/src/lib/libdladm/Makefile.com
@@ -22,13 +22,13 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
LIBRARY = libdladm.a
VERS = .1
OBJECTS = libdladm.o secobj.o linkprop.o libdllink.o libdlaggr.o \
- libdlwlan.o libdlvnic.o libdlmgmt.o libdlvlan.o
+ libdlwlan.o libdlvnic.o libdlmgmt.o libdlvlan.o \
+ flowattr.o flowprop.o propfuncs.o libdlflow.o libdlstat.o \
+ usage.o
include ../../Makefile.lib
@@ -36,8 +36,8 @@ include ../../Makefile.lib
include ../../Makefile.rootfs
LIBS = $(DYNLIB) $(LINTLIB)
-LDLIBS += -ldevinfo -lc -linetutil -lsocket -lscf -lrcm \
- -lnvpair -lkstat
+LDLIBS += -ldevinfo -lc -linetutil -lsocket -lscf -lrcm -lnvpair \
+ -lexacct -lnsl -lkstat -lcurses
SRCDIR = ../common
$(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC)
diff --git a/usr/src/lib/libdladm/common/flowattr.c b/usr/src/lib/libdladm/common/flowattr.c
new file mode 100644
index 0000000000..4fb578e5bc
--- /dev/null
+++ b/usr/src/lib/libdladm/common/flowattr.c
@@ -0,0 +1,411 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/mac_flow.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <net/if_types.h>
+#include <net/if_dl.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+
+#include <libdladm.h>
+#include <libdlflow.h>
+#include <libdlflow_impl.h>
+
+#define V4_PART_OF_V6(v6) ((v6)._S6_un._S6_u32[3])
+
+/* max port number for UDP, TCP & SCTP */
+#define MAX_PORT 65535
+
+static fad_checkf_t do_check_local_ip;
+static fad_checkf_t do_check_remote_ip;
+static fad_checkf_t do_check_protocol;
+static fad_checkf_t do_check_local_port;
+
+static dladm_status_t do_check_port(char *, boolean_t, flow_desc_t *);
+
+static fattr_desc_t attr_table[] = {
+ { "local_ip", do_check_local_ip },
+ { "remote_ip", do_check_remote_ip },
+ { "transport", do_check_protocol },
+ { "local_port", do_check_local_port },
+ { "dsfield", do_check_dsfield },
+};
+
+#define DLADM_MAX_FLOWATTRS (sizeof (attr_table) / sizeof (fattr_desc_t))
+
+static dladm_status_t
+do_check_local_ip(char *attr_val, flow_desc_t *fdesc)
+{
+ return (do_check_ip_addr(attr_val, B_TRUE, fdesc));
+}
+
+static dladm_status_t
+do_check_remote_ip(char *attr_val, flow_desc_t *fdesc)
+{
+ return (do_check_ip_addr(attr_val, B_FALSE, fdesc));
+}
+
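+/*
+ * Validate an IP address attribute of the form addr[/prefixlen], e.g.
+ * "192.168.1.0/24" or "fe80::1/10"; without a prefix length the full
+ * host mask is used.
+ */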
+dladm_status_t
+do_check_ip_addr(char *addr_str, boolean_t local, flow_desc_t *fd)
+{
+ struct addrinfo *info = NULL;
+ dladm_status_t status;
+ int err, prefix_max, prefix_len = 0;
+ char *prefix_str, *endp = NULL;
+ flow_mask_t mask;
+ in6_addr_t *addr;
+ uchar_t *netmask;
+
+ if ((prefix_str = strchr(addr_str, '/')) != NULL) {
+ *prefix_str++ = '\0';
+ errno = 0;
+ prefix_len = (int)strtol(prefix_str, &endp, 10);
+ if (errno != 0 || prefix_len == 0 || *endp != '\0')
+ return (DLADM_STATUS_INVALID_PREFIXLEN);
+ }
+
+ err = getaddrinfo(addr_str, NULL, NULL, &info);
+ if (err != 0)
+ return (DLADM_STATUS_INVALID_IP);
+
+ mask = FLOW_IP_VERSION;
+ if (local) {
+ mask |= FLOW_IP_LOCAL;
+ addr = &fd->fd_local_addr;
+ netmask = (uchar_t *)&fd->fd_local_netmask;
+ } else {
+ mask |= FLOW_IP_REMOTE;
+ addr = &fd->fd_remote_addr;
+ netmask = (uchar_t *)&fd->fd_remote_netmask;
+ }
+
+ if (info->ai_family == AF_INET) {
+ IN6_INADDR_TO_V4MAPPED(&(((struct sockaddr_in *)
+ (void *)info->ai_addr)->sin_addr), addr);
+ prefix_max = IP_ABITS;
+ fd->fd_ipversion = IPV4_VERSION;
+ netmask = (uchar_t *)
+ &(V4_PART_OF_V6((*((in6_addr_t *)(void *)netmask))));
+ } else if (info->ai_family == AF_INET6) {
+ *addr = ((struct sockaddr_in6 *)
+ (void *)info->ai_addr)->sin6_addr;
+ prefix_max = IPV6_ABITS;
+ fd->fd_ipversion = IPV6_VERSION;
+ } else {
+ freeaddrinfo(info);
+ return (DLADM_STATUS_INVALID_IP);
+ }
+
+ if (prefix_len == 0)
+ prefix_len = prefix_max;
+
+ status = dladm_prefixlen2mask(prefix_len, prefix_max, netmask);
+
+ if (status != DLADM_STATUS_OK) {
+ freeaddrinfo(info);
+ return (DLADM_STATUS_INVALID_PREFIXLEN);
+ }
+
+ fd->fd_mask |= mask;
+ freeaddrinfo(info);
+ return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+do_check_protocol(char *attr_val, flow_desc_t *fdesc)
+{
+ uint8_t protocol;
+
+ protocol = dladm_str2proto(attr_val);
+
+ if (protocol != 0) {
+ fdesc->fd_mask |= FLOW_IP_PROTOCOL;
+ fdesc->fd_protocol = protocol;
+ return (DLADM_STATUS_OK);
+ } else {
+ return (DLADM_STATUS_INVALID_PROTOCOL);
+ }
+}
+
+dladm_status_t
+do_check_local_port(char *attr_val, flow_desc_t *fdesc)
+{
+ return (do_check_port(attr_val, B_TRUE, fdesc));
+}
+
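+/*
+ * Validate a transport port attribute. Only local ports are currently
+ * supported; the port is stored in network byte order.
+ */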
+dladm_status_t
+do_check_port(char *attr_val, boolean_t local, flow_desc_t *fdesc)
+{
+ char *endp = NULL;
+ long val;
+
+ if (local) {
+ fdesc->fd_mask |= FLOW_ULP_PORT_LOCAL;
+ val = strtol(attr_val, &endp, 10);
+ if (val < 1 || val > MAX_PORT)
+ return (DLADM_STATUS_INVALID_PORT);
+ fdesc->fd_local_port = htons((uint16_t)val);
+ } else {
+ return (DLADM_STATUS_BADVAL);
+ }
+
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Check for invalid and/or duplicate attribute specification
+ */
+static dladm_status_t
+flow_attrlist_check(dladm_arg_list_t *attrlist)
+{
+ int i, j;
+ boolean_t isset[DLADM_MAX_FLOWATTRS];
+ boolean_t matched;
+
+ for (j = 0; j < DLADM_MAX_FLOWATTRS; j++)
+ isset[j] = B_FALSE;
+
+ for (i = 0; i < attrlist->al_count; i++) {
+ matched = B_FALSE;
+ for (j = 0; j < DLADM_MAX_FLOWATTRS; j++) {
+ if (strcmp(attrlist->al_info[i].ai_name,
+ attr_table[j].ad_name) == 0) {
+ if (isset[j])
+ return (DLADM_STATUS_FLOW_INCOMPATIBLE);
+ else
+ isset[j] = B_TRUE;
+ matched = B_TRUE;
+ }
+ }
+ /*
+		 * If the attribute did not match any of the attributes in
+		 * attr_table, then it is an invalid attribute.
+ */
+ if (!matched)
+ return (DLADM_STATUS_BADARG);
+ }
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert an attribute list to a flow_desc_t using the attribute ad_check()
+ * functions.
+ */
+dladm_status_t
+dladm_flow_attrlist_extract(dladm_arg_list_t *attrlist, flow_desc_t *flowdesc)
+{
+ dladm_status_t status = DLADM_STATUS_BADARG;
+ int i;
+
+ for (i = 0; i < attrlist->al_count; i++) {
+ dladm_arg_info_t *aip = &attrlist->al_info[i];
+ int j;
+
+ for (j = 0; j < DLADM_MAX_FLOWATTRS; j++) {
+ fattr_desc_t *adp = &attr_table[j];
+
+ if (strcasecmp(aip->ai_name, adp->ad_name) != 0)
+ continue;
+
+ if ((aip->ai_val == NULL) || (*aip->ai_val == NULL))
+ return (DLADM_STATUS_BADARG);
+
+ if (adp->ad_check != NULL)
+ status = adp->ad_check(*aip->ai_val, flowdesc);
+ else
+ status = DLADM_STATUS_BADARG;
+
+ if (status != DLADM_STATUS_OK)
+ return (status);
+ }
+ }
+ return (status);
+}
+
+void
+dladm_free_attrs(dladm_arg_list_t *list)
+{
+ dladm_free_args(list);
+}
+
+dladm_status_t
+dladm_parse_flow_attrs(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+	if (dladm_parse_args(str, listp, novalues) != DLADM_STATUS_OK)
+		return (DLADM_STATUS_ATTR_PARSE_ERR);
+
+ if (flow_attrlist_check(*listp) != DLADM_STATUS_OK) {
+ dladm_free_attrs(*listp);
+ return (DLADM_STATUS_ATTR_PARSE_ERR);
+ }
+
+ return (DLADM_STATUS_OK);
+}
+
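+/*
+ * Validate a "dsfield" attribute of the form value[:mask], both in hex,
+ * e.g. "0x2e:0xfc" matches packets whose DS field equals 0x2e under the
+ * mask 0xfc; the mask defaults to 0xff.
+ */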
+dladm_status_t
+do_check_dsfield(char *str, flow_desc_t *fd)
+{
+ char *mask_str, *endp = NULL;
+ uint_t mask = 0xff, value;
+
+ if ((mask_str = strchr(str, ':')) != NULL) {
+ *mask_str++ = '\0';
+ errno = 0;
+ mask = strtoul(mask_str, &endp, 16);
+ if (errno != 0 || mask == 0 || mask > 0xff ||
+ *endp != '\0')
+ return (DLADM_STATUS_INVALID_DSFMASK);
+ }
+ errno = 0;
+ endp = NULL;
+ value = strtoul(str, &endp, 16);
+ if (errno != 0 || value == 0 || value > 0xff || *endp != '\0')
+ return (DLADM_STATUS_INVALID_DSF);
+
+ fd->fd_dsfield = (uint8_t)value;
+ fd->fd_dsfield_mask = (uint8_t)mask;
+ fd->fd_mask |= FLOW_IP_DSFIELD;
+ return (DLADM_STATUS_OK);
+}
+
+char *
+dladm_proto2str(uint8_t protocol)
+{
+ if (protocol == IPPROTO_TCP)
+ return ("tcp");
+ if (protocol == IPPROTO_UDP)
+ return ("udp");
+ if (protocol == IPPROTO_SCTP)
+ return ("sctp");
+ if (protocol == IPPROTO_ICMPV6)
+ return ("icmpv6");
+ if (protocol == IPPROTO_ICMP)
+ return ("icmp");
+ else
+ return ("");
+}
+
+uint8_t
+dladm_str2proto(const char *protostr)
+{
+ if (strncasecmp(protostr, "tcp", 3) == 0)
+ return (IPPROTO_TCP);
+ else if (strncasecmp(protostr, "udp", 3) == 0)
+ return (IPPROTO_UDP);
+ else if (strncasecmp(protostr, "sctp", 4) == 0)
+ return (IPPROTO_SCTP);
+ else if (strncasecmp(protostr, "icmpv6", 6) == 0)
+ return (IPPROTO_ICMPV6);
+ else if (strncasecmp(protostr, "icmp", 4) == 0)
+ return (IPPROTO_ICMP);
+
+ return (0);
+}
+
+void
+dladm_flow_attr_ip2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len)
+{
+ flow_desc_t fdesc = attrp->fa_flow_desc;
+ struct in_addr ipaddr;
+ int prefix_len, prefix_max;
+ char *cp, abuf[INET6_ADDRSTRLEN];
+
+ if (fdesc.fd_mask & FLOW_IP_LOCAL) {
+ if (fdesc.fd_ipversion == IPV6_VERSION) {
+ (void) inet_ntop(AF_INET6, &fdesc.fd_local_addr, abuf,
+ INET6_ADDRSTRLEN);
+ cp = abuf;
+ prefix_max = IPV6_ABITS;
+ } else {
+ ipaddr.s_addr = fdesc.fd_local_addr._S6_un._S6_u32[3];
+ cp = inet_ntoa(ipaddr);
+ prefix_max = IP_ABITS;
+ }
+ (void) dladm_mask2prefixlen(&fdesc.fd_local_netmask,
+ prefix_max, &prefix_len);
+ (void) snprintf(buf, buf_len, "LCL:%s/%d ", cp, prefix_len);
+ } else if (fdesc.fd_mask & FLOW_IP_REMOTE) {
+ if (fdesc.fd_ipversion == IPV6_VERSION) {
+ (void) inet_ntop(AF_INET6, &fdesc.fd_remote_addr, abuf,
+ INET6_ADDRSTRLEN);
+ cp = abuf;
+ prefix_max = IPV6_ABITS;
+ } else {
+ ipaddr.s_addr = fdesc.fd_remote_addr._S6_un._S6_u32[3];
+ cp = inet_ntoa(ipaddr);
+ prefix_max = IP_ABITS;
+ }
+ (void) dladm_mask2prefixlen(&fdesc.fd_remote_netmask,
+ prefix_max, &prefix_len);
+ (void) snprintf(buf, buf_len, "RMT:%s/%d ", cp, prefix_len);
+ } else {
+ buf[0] = '\0';
+ }
+}
+
+void
+dladm_flow_attr_proto2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len)
+{
+ flow_desc_t fdesc = attrp->fa_flow_desc;
+
+ (void) snprintf(buf, buf_len, "%s",
+ dladm_proto2str(fdesc.fd_protocol));
+}
+
+void
+dladm_flow_attr_port2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len)
+{
+ flow_desc_t fdesc = attrp->fa_flow_desc;
+
+ if (fdesc.fd_mask & FLOW_ULP_PORT_LOCAL) {
+ (void) snprintf(buf, buf_len, "%d",
+ ntohs(fdesc.fd_local_port));
+ } else {
+ buf[0] = '\0';
+ }
+}
+
+void
+dladm_flow_attr_dsfield2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len)
+{
+ flow_desc_t fdesc = attrp->fa_flow_desc;
+
+ if (fdesc.fd_mask & FLOW_IP_DSFIELD) {
+ (void) snprintf(buf, buf_len, "0x%x:0x%x",
+ fdesc.fd_dsfield, fdesc.fd_dsfield_mask);
+ } else {
+ buf[0] = '\0';
+ }
+}
diff --git a/usr/src/lib/libdladm/common/flowprop.c b/usr/src/lib/libdladm/common/flowprop.c
new file mode 100644
index 0000000000..a2125a9d33
--- /dev/null
+++ b/usr/src/lib/libdladm/common/flowprop.c
@@ -0,0 +1,611 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdlib.h>
+#include <strings.h>
+#include <errno.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/dld.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <libdevinfo.h>
+#include <libdladm_impl.h>
+#include <libdlflow.h>
+#include <libdlflow_impl.h>
+#include <libintl.h>
+
+#include <dlfcn.h>
+#include <link.h>
+
+/*
+ * XXX duplicate define
+ */
+#define DLADM_PROP_VAL_MAX 32
+
+static dladm_status_t i_dladm_set_flowprop_db(const char *, const char *,
+ char **, uint_t);
+static dladm_status_t i_dladm_get_flowprop_db(const char *, const char *,
+ char **, uint_t *);
+
+static fpd_getf_t do_get_maxbw;
+static fpd_setf_t do_set_maxbw;
+static fpd_checkf_t do_check_maxbw;
+
+static fpd_getf_t do_get_priority;
+static fpd_setf_t do_set_priority;
+static fpd_checkf_t do_check_priority;
+
+static fprop_desc_t prop_table[] = {
+ { "maxbw", { "", NULL }, NULL, 0, B_FALSE,
+ do_set_maxbw, NULL,
+ do_get_maxbw, do_check_maxbw},
+ { "priority", { "", NULL }, NULL, 0, B_FALSE,
+ do_set_priority, NULL,
+ do_get_priority, do_check_priority}
+};
+
+#define DLADM_MAX_FLOWPROPS (sizeof (prop_table) / sizeof (fprop_desc_t))
+
+static prop_table_t prop_tbl = {
+ prop_table,
+ DLADM_MAX_FLOWPROPS
+};
+
+static resource_prop_t rsrc_prop_table[] = {
+ {"maxbw", do_extract_maxbw},
+ {"priority", do_extract_priority}
+};
+#define DLADM_MAX_RSRC_PROP (sizeof (rsrc_prop_table) / \
+ sizeof (resource_prop_t))
+
+static dladm_status_t flow_proplist_check(dladm_arg_list_t *);
+
+dladm_status_t
+dladm_set_flowprop(const char *flow, const char *prop_name, char **prop_val,
+ uint_t val_cnt, uint_t flags, char **errprop)
+{
+ dladm_status_t status = DLADM_STATUS_BADARG;
+
+ if (flow == NULL || (prop_val == NULL && val_cnt > 0) ||
+ (prop_val != NULL && val_cnt == 0) || flags == 0)
+ return (DLADM_STATUS_BADARG);
+
+ if ((flags & DLADM_OPT_ACTIVE) != 0) {
+ status = i_dladm_set_prop_temp(flow, prop_name, prop_val,
+ val_cnt, flags, errprop, &prop_tbl);
+ if (status == DLADM_STATUS_TEMPONLY &&
+ (flags & DLADM_OPT_PERSIST) != 0)
+ return (DLADM_STATUS_TEMPONLY);
+ if (status != DLADM_STATUS_OK)
+ return (status);
+ }
+ if ((flags & DLADM_OPT_PERSIST) != 0) {
+ if (i_dladm_is_prop_temponly(prop_name, errprop, &prop_tbl))
+ return (DLADM_STATUS_TEMPONLY);
+
+ status = i_dladm_set_flowprop_db(flow, prop_name,
+ prop_val, val_cnt);
+ }
+ return (status);
+}
+
+dladm_status_t
+dladm_walk_flowprop(int (*func)(void *, const char *), const char *flow,
+ void *arg)
+{
+ int i;
+
+ if (flow == NULL || func == NULL)
+ return (DLADM_STATUS_BADARG);
+
+ /* Then show data-flow properties if there are any */
+ for (i = 0; i < DLADM_MAX_FLOWPROPS; i++) {
+ if (func(arg, prop_table[i].pd_name) != DLADM_WALK_CONTINUE)
+ break;
+ }
+ return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_get_flowprop(const char *flow, uint32_t type,
+ const char *prop_name, char **prop_val, uint_t *val_cntp)
+{
+ dladm_status_t status;
+
+ if (flow == NULL || prop_name == NULL || prop_val == NULL ||
+ val_cntp == NULL || *val_cntp == 0)
+ return (DLADM_STATUS_BADARG);
+
+ if (type == DLADM_PROP_VAL_PERSISTENT) {
+ if (i_dladm_is_prop_temponly(prop_name, NULL, &prop_tbl))
+ return (DLADM_STATUS_TEMPONLY);
+ return (i_dladm_get_flowprop_db(flow, prop_name,
+ prop_val, val_cntp));
+ }
+
+ status = i_dladm_get_prop_temp(flow, type, prop_name,
+ prop_val, val_cntp, &prop_tbl);
+ if (status != DLADM_STATUS_NOTFOUND)
+ return (status);
+
+ return (DLADM_STATUS_BADARG);
+}
+
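+/*
+ * Read or update the persistent flow property database,
+ * /etc/dladm/flowprop.conf.
+ */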
+#define FLOWPROP_RW_DB(statep, writeop) \
+ (i_dladm_rw_db("/etc/dladm/flowprop.conf", \
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH, process_prop_db, \
+ (statep), (writeop)))
+
+static dladm_status_t
+i_dladm_set_flowprop_db(const char *flow, const char *prop_name,
+ char **prop_val, uint_t val_cnt)
+{
+ prop_db_state_t state;
+
+ state.ls_op = process_prop_set;
+ state.ls_name = flow;
+ state.ls_propname = prop_name;
+ state.ls_propval = prop_val;
+ state.ls_valcntp = &val_cnt;
+ state.ls_initop = NULL;
+
+ return (FLOWPROP_RW_DB(&state, B_TRUE));
+}
+
+static dladm_status_t
+i_dladm_get_flowprop_db(const char *flow, const char *prop_name,
+ char **prop_val, uint_t *val_cntp)
+{
+ prop_db_state_t state;
+
+ state.ls_op = process_prop_get;
+ state.ls_name = flow;
+ state.ls_propname = prop_name;
+ state.ls_propval = prop_val;
+ state.ls_valcntp = val_cntp;
+ state.ls_initop = NULL;
+
+ return (FLOWPROP_RW_DB(&state, B_FALSE));
+}
+
+dladm_status_t
+i_dladm_init_flowprop_db(void)
+{
+ prop_db_state_t state;
+
+ state.ls_op = process_prop_init;
+ state.ls_name = NULL;
+ state.ls_propname = NULL;
+ state.ls_propval = NULL;
+ state.ls_valcntp = NULL;
+ state.ls_initop = dladm_set_flowprop;
+
+ return (FLOWPROP_RW_DB(&state, B_FALSE));
+}
+
+#define MIN_INFO_SIZE (4 * 1024)
+
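+/*
+ * Fetch the kernel flow entry matching the given flow name. The walk
+ * ioctl sets ENOSPC when the buffer is too small, in which case the
+ * buffer is doubled and the ioctl retried.
+ */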
+dladm_status_t
+dladm_flow_info(const char *flow, dladm_flow_attr_t *attr)
+{
+ dld_ioc_walkflow_t *ioc;
+ int bufsize, fd;
+ dld_flowinfo_t *flowinfo;
+
+ if ((flow == NULL) || (attr == NULL))
+ return (DLADM_STATUS_BADARG);
+
+ if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
+ return (dladm_errno2status(errno));
+
+ bufsize = MIN_INFO_SIZE;
+ if ((ioc = calloc(1, bufsize)) == NULL) {
+ (void) close(fd);
+ return (dladm_errno2status(errno));
+ }
+
+ (void) strlcpy(ioc->wf_name, flow, sizeof (ioc->wf_name));
+ ioc->wf_len = bufsize - sizeof (*ioc);
+
+	while (ioctl(fd, DLDIOC_WALKFLOW, ioc) < 0) {
+		if (errno == ENOSPC) {
+			dld_ioc_walkflow_t *nioc;
+
+			bufsize *= 2;
+			/*
+			 * Use a temporary pointer so that the original
+			 * buffer is still freed below if realloc() fails.
+			 */
+			if ((nioc = realloc(ioc, bufsize)) != NULL) {
+				ioc = nioc;
+				(void) strlcpy(ioc->wf_name, flow,
+				    MAXNAMELEN);
+				ioc->wf_len = bufsize - sizeof (*ioc);
+				continue;
+			}
+		}
+		free(ioc);
+		(void) close(fd);
+		return (dladm_errno2status(errno));
+	}
+
+ bzero(attr, sizeof (*attr));
+
+ flowinfo = (dld_flowinfo_t *)(void *)(ioc + 1);
+
+ attr->fa_linkid = flowinfo->fi_linkid;
+ bcopy(&flowinfo->fi_flowname, &attr->fa_flowname,
+ sizeof (attr->fa_flowname));
+ bcopy(&flowinfo->fi_flow_desc, &attr->fa_flow_desc,
+ sizeof (attr->fa_flow_desc));
+ bcopy(&flowinfo->fi_resource_props, &attr->fa_resource_props,
+ sizeof (attr->fa_resource_props));
+
+ free(ioc);
+ (void) close(fd);
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_get_maxbw(const char *flow, char **prop_val, uint_t *val_cnt)
+{
+ mac_resource_props_t *mrp;
+ char buf[DLADM_STRSIZE];
+ dladm_flow_attr_t fa;
+ dladm_status_t status;
+
+ status = dladm_flow_info(flow, &fa);
+ if (status != DLADM_STATUS_OK)
+ return (status);
+ mrp = &(fa.fa_resource_props);
+
+ *val_cnt = 1;
+ if (mrp->mrp_mask & MRP_MAXBW) {
+ (void) snprintf(prop_val[0], DLADM_STRSIZE, "%s",
+ dladm_bw2str(mrp->mrp_maxbw, buf));
+ } else {
+ return (DLADM_STATUS_NOTSUP);
+ }
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_set_maxbw(const char *flow, val_desc_t *vdp, uint_t val_cnt)
+{
+ dld_ioc_modifyflow_t attr;
+ int fd;
+ mac_resource_props_t mrp;
+ void *val;
+
+ if (val_cnt != 1)
+ return (DLADM_STATUS_BADVALCNT);
+
+ bzero(&mrp, sizeof (mrp));
+ if (vdp != NULL && (val = (void *)vdp->vd_val) != NULL) {
+ bcopy(val, &mrp.mrp_maxbw, sizeof (int64_t));
+ free(val);
+ } else {
+ mrp.mrp_maxbw = MRP_MAXBW_RESETVAL;
+ }
+ mrp.mrp_mask = MRP_MAXBW;
+
+ bzero(&attr, sizeof (attr));
+ (void) strlcpy(attr.mf_name, flow, sizeof (attr.mf_name));
+ bcopy(&mrp, &attr.mf_resource_props, sizeof (mac_resource_props_t));
+
+ fd = open(DLD_CONTROL_DEV, O_RDWR);
+ if (fd < 0) {
+ return (dladm_errno2status(errno));
+ }
+
+ if (ioctl(fd, DLDIOC_MODIFYFLOW, &attr) < 0) {
+ (void) close(fd);
+ return (dladm_errno2status(errno));
+ }
+ (void) close(fd);
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_check_maxbw(fprop_desc_t *pdp, char **prop_val, uint_t val_cnt,
+ val_desc_t **vdpp)
+{
+ uint64_t *maxbw;
+ val_desc_t *vdp = NULL;
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ if (val_cnt != 1)
+ return (DLADM_STATUS_BADVALCNT);
+
+ maxbw = malloc(sizeof (uint64_t));
+ if (maxbw == NULL)
+ return (DLADM_STATUS_NOMEM);
+
+ status = dladm_str2bw(*prop_val, maxbw);
+ if (status != DLADM_STATUS_OK) {
+ free(maxbw);
+ return (status);
+ }
+
+ if ((*maxbw < MRP_MAXBW_MINVAL) && (*maxbw != 0)) {
+ free(maxbw);
+ return (DLADM_STATUS_MINMAXBW);
+ }
+
+ vdp = malloc(sizeof (val_desc_t));
+ if (vdp == NULL) {
+ free(maxbw);
+ return (DLADM_STATUS_NOMEM);
+ }
+
+ vdp->vd_val = (uintptr_t)maxbw;
+ *vdpp = vdp;
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_get_priority(const char *flow, char **prop_val, uint_t *val_cnt)
+{
+ mac_resource_props_t *mrp;
+ char buf[DLADM_STRSIZE];
+ dladm_flow_attr_t fa;
+ dladm_status_t status;
+
+ bzero(&fa, sizeof (dladm_flow_attr_t));
+ status = dladm_flow_info(flow, &fa);
+ if (status != DLADM_STATUS_OK)
+ return (status);
+ mrp = &(fa.fa_resource_props);
+
+ *val_cnt = 1;
+ if (mrp->mrp_mask & MRP_PRIORITY) {
+ (void) snprintf(prop_val[0], DLADM_STRSIZE, "%s",
+ dladm_pri2str(mrp->mrp_priority, buf));
+ } else {
+ return (DLADM_STATUS_NOTSUP);
+ }
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_set_priority(const char *flow, val_desc_t *vdp, uint_t val_cnt)
+{
+ dld_ioc_modifyflow_t attr;
+ int fd;
+ mac_resource_props_t mrp;
+ void *val;
+
+ if (val_cnt != 1)
+ return (DLADM_STATUS_BADVALCNT);
+
+ bzero(&mrp, sizeof (mrp));
+ if (vdp != NULL && (val = (void *)vdp->vd_val) != NULL) {
+ bcopy(val, &mrp.mrp_priority, sizeof (mac_priority_level_t));
+ free(val);
+ } else {
+ mrp.mrp_priority = MPL_RESET;
+ }
+ mrp.mrp_mask = MRP_PRIORITY;
+
+ bzero(&attr, sizeof (attr));
+ (void) strlcpy(attr.mf_name, flow, sizeof (attr.mf_name));
+ bcopy(&mrp, &attr.mf_resource_props, sizeof (mac_resource_props_t));
+
+ fd = open(DLD_CONTROL_DEV, O_RDWR);
+ if (fd < 0) {
+ return (dladm_errno2status(errno));
+ }
+
+ if (ioctl(fd, DLDIOC_MODIFYFLOW, &attr) < 0) {
+ (void) close(fd);
+ return (dladm_errno2status(errno));
+ }
+ (void) close(fd);
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_check_priority(fprop_desc_t *pdp, char **prop_val, uint_t val_cnt,
+ val_desc_t **vdpp)
+{
+ mac_priority_level_t *pri;
+ val_desc_t *vdp = NULL;
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ if (val_cnt != 1)
+ return (DLADM_STATUS_BADVALCNT);
+
+ pri = malloc(sizeof (mac_priority_level_t));
+ if (pri == NULL)
+ return (DLADM_STATUS_NOMEM);
+
+ status = dladm_str2pri(*prop_val, pri);
+ if (status != DLADM_STATUS_OK) {
+ free(pri);
+ return (status);
+ }
+
+ if (*pri == -1) {
+ free(pri);
+ return (DLADM_STATUS_BADVAL);
+ }
+
+ vdp = malloc(sizeof (val_desc_t));
+ if (vdp == NULL) {
+ free(pri);
+ return (DLADM_STATUS_NOMEM);
+ }
+
+ vdp->vd_val = (uintptr_t)pri;
+ *vdpp = vdp;
+ return (DLADM_STATUS_OK);
+}
+
+static dladm_status_t
+flow_proplist_check(dladm_arg_list_t *proplist)
+{
+ int i, j;
+ boolean_t matched;
+
+ for (i = 0; i < proplist->al_count; i++) {
+ matched = B_FALSE;
+ for (j = 0; j < DLADM_MAX_FLOWPROPS; j++) {
+ if (strcmp(proplist->al_info[i].ai_name,
+ prop_table[j].pd_name) == 0)
+ matched = B_TRUE;
+ }
+ if (!matched)
+ return (DLADM_STATUS_BADPROP);
+ }
+ return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_parse_flow_props(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+ dladm_status_t status;
+
+ status = dladm_parse_args(str, listp, novalues);
+ if (status != DLADM_STATUS_OK)
+ return (status);
+
+ status = flow_proplist_check(*listp);
+ if (status != DLADM_STATUS_OK) {
+ dladm_free_props(*listp);
+ return (status);
+ }
+
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Retrieve the named property from a proplist, check the value and
+ * convert to a kernel structure.
+ */
+static dladm_status_t
+i_dladm_flow_proplist_extract_one(dladm_arg_list_t *proplist,
+ const char *name, void *val)
+{
+ dladm_status_t status;
+ dladm_arg_info_t *aip = NULL;
+ int i, j;
+
+ /* Find named property in proplist */
+ for (i = 0; i < proplist->al_count; i++) {
+ aip = &proplist->al_info[i];
+ if (strcasecmp(aip->ai_name, name) == 0)
+ break;
+ }
+
+ /* Property not in list */
+ if (i == proplist->al_count)
+ return (DLADM_STATUS_OK);
+
+	for (i = 0; i < DLADM_MAX_FLOWPROPS; i++) {
+		fprop_desc_t *pdp = &prop_table[i];
+		val_desc_t *vdp = NULL;	/* allocated by pd_check() below */
+
+		if (strcasecmp(aip->ai_name, pdp->pd_name) != 0)
+			continue;
+
+		if (aip->ai_val == NULL)
+			return (DLADM_STATUS_BADARG);
+
+ /* Check property value */
+ if (pdp->pd_check != NULL) {
+ status = pdp->pd_check(pdp, aip->ai_val,
+ aip->ai_count, &vdp);
+ } else {
+ status = DLADM_STATUS_BADARG;
+ }
+
+ if (status != DLADM_STATUS_OK)
+ return (status);
+
+ for (j = 0; j < DLADM_MAX_RSRC_PROP; j++) {
+ resource_prop_t *rpp = &rsrc_prop_table[j];
+
+ if (strcasecmp(aip->ai_name, rpp->rp_name) != 0)
+ continue;
+
+ /* Extract kernel structure */
+ if (rpp->rp_extract != NULL) {
+ status = rpp->rp_extract(vdp, val,
+ aip->ai_count);
+ } else {
+ status = DLADM_STATUS_BADARG;
+ }
+ break;
+ }
+
+ if (status != DLADM_STATUS_OK)
+ return (status);
+
+ break;
+ }
+ return (status);
+}
+
+/*
+ * Extract properties from a proplist and convert to mac_resource_props_t.
+ */
+dladm_status_t
+dladm_flow_proplist_extract(dladm_arg_list_t *proplist,
+ mac_resource_props_t *mrp)
+{
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ status = i_dladm_flow_proplist_extract_one(proplist, "maxbw", mrp);
+ if (status != DLADM_STATUS_OK)
+ return (status);
+ status = i_dladm_flow_proplist_extract_one(proplist, "priority", mrp);
+ if (status != DLADM_STATUS_OK)
+ return (status);
+ return (status);
+}
+
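+/*
+ * Persist each property in the list, continuing past failures and
+ * returning the last error seen, if any.
+ */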
+dladm_status_t
+i_dladm_set_flow_proplist_db(char *flow, dladm_arg_list_t *proplist)
+{
+ dladm_status_t status, ssave = DLADM_STATUS_OK;
+ dladm_arg_info_t ai;
+ int i;
+
+ for (i = 0; i < proplist->al_count; i++) {
+ ai = proplist->al_info[i];
+ status = i_dladm_set_flowprop_db(flow, ai.ai_name,
+ ai.ai_val, ai.ai_count);
+ if (status != DLADM_STATUS_OK)
+ ssave = status;
+ }
+ return (ssave);
+}
diff --git a/usr/src/lib/libdladm/common/libdladm.c b/usr/src/lib/libdladm/common/libdladm.c
index fa588df066..cc6bf542f7 100644
--- a/usr/src/lib/libdladm/common/libdladm.c
+++ b/usr/src/lib/libdladm/common/libdladm.c
@@ -29,6 +29,7 @@
#include <fcntl.h>
#include <strings.h>
#include <dirent.h>
+#include <stdlib.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <libdladm_impl.h>
@@ -89,7 +90,7 @@ dladm_status2str(dladm_status_t status, char *buf)
s = "I/O error";
break;
case DLADM_STATUS_TEMPONLY:
- s = "change cannot be persistent, specify -t please";
+ s = "change cannot be persistent";
break;
case DLADM_STATUS_TIMEDOUT:
s = "operation timed out";
@@ -127,6 +128,117 @@ dladm_status2str(dladm_status_t status, char *buf)
case DLADM_STATUS_NONOTIF:
s = "link notification is not supported";
break;
+ case DLADM_STATUS_BADTIMEVAL:
+ s = "invalid time range";
+ break;
+ case DLADM_STATUS_INVALIDMACADDR:
+ s = "invalid MAC address value";
+ break;
+ case DLADM_STATUS_INVALIDMACADDRNIC:
+ s = "MAC address reserved for use by underlying data-link";
+ break;
+ case DLADM_STATUS_INVALIDMACADDRINUSE:
+ s = "MAC address is already in use";
+ break;
+ case DLADM_STATUS_MACFACTORYSLOTINVALID:
+ s = "invalid factory MAC address slot";
+ break;
+ case DLADM_STATUS_MACFACTORYSLOTUSED:
+ s = "factory MAC address slot already used";
+ break;
+ case DLADM_STATUS_MACFACTORYSLOTALLUSED:
+ s = "all factory MAC address slots are in use";
+ break;
+ case DLADM_STATUS_MACFACTORYNOTSUP:
+ s = "factory MAC address slots not supported";
+ break;
+ case DLADM_STATUS_INVALIDMACPREFIX:
+ s = "Invalid MAC address prefix value";
+ break;
+ case DLADM_STATUS_INVALIDMACPREFIXLEN:
+ s = "Invalid MAC address prefix length";
+ break;
+ case DLADM_STATUS_CPUMAX:
+ s = "non-existent processor ID";
+ break;
+ case DLADM_STATUS_CPUERR:
+ s = "could not determine processor status";
+ break;
+ case DLADM_STATUS_CPUNOTONLINE:
+ s = "processor not online";
+ break;
+ case DLADM_STATUS_DB_NOTFOUND:
+ s = "database not found";
+ break;
+ case DLADM_STATUS_DB_PARSE_ERR:
+ s = "database parse error";
+ break;
+ case DLADM_STATUS_PROP_PARSE_ERR:
+ s = "property parse error";
+ break;
+ case DLADM_STATUS_ATTR_PARSE_ERR:
+ s = "attribute parse error";
+ break;
+ case DLADM_STATUS_FLOW_DB_ERR:
+ s = "flow database error";
+ break;
+ case DLADM_STATUS_FLOW_DB_OPEN_ERR:
+ s = "flow database open error";
+ break;
+ case DLADM_STATUS_FLOW_DB_PARSE_ERR:
+ s = "flow database parse error";
+ break;
+ case DLADM_STATUS_FLOWPROP_DB_PARSE_ERR:
+ s = "flow property database parse error";
+ break;
+ case DLADM_STATUS_FLOW_ADD_ERR:
+ s = "flow add error";
+ break;
+ case DLADM_STATUS_FLOW_WALK_ERR:
+ s = "flow walk error";
+ break;
+ case DLADM_STATUS_FLOW_IDENTICAL:
+ s = "a flow with identical attributes exists";
+ break;
+ case DLADM_STATUS_FLOW_INCOMPATIBLE:
+ s = "flow(s) with incompatible attributes exists";
+ break;
+ case DLADM_STATUS_FLOW_EXISTS:
+ s = "link still has flows";
+ break;
+ case DLADM_STATUS_PERSIST_FLOW_EXISTS:
+ s = "persistent flow with the same name exists";
+ break;
+ case DLADM_STATUS_INVALID_IP:
+ s = "invalid IP address";
+ break;
+ case DLADM_STATUS_INVALID_PREFIXLEN:
+ s = "invalid IP prefix length";
+ break;
+ case DLADM_STATUS_INVALID_PROTOCOL:
+ s = "invalid IP protocol";
+ break;
+ case DLADM_STATUS_INVALID_PORT:
+ s = "invalid port number";
+ break;
+ case DLADM_STATUS_INVALID_DSF:
+ s = "invalid dsfield";
+ break;
+ case DLADM_STATUS_INVALID_DSFMASK:
+ s = "invalid dsfield mask";
+ break;
+ case DLADM_STATUS_INVALID_MACMARGIN:
+ s = "MTU check failed, use lower MTU or -f option";
+ break;
+ case DLADM_STATUS_BADPROP:
+ s = "invalid property";
+ break;
+ case DLADM_STATUS_MINMAXBW:
+ s = "minimum value for maxbw is 1.2M";
+ break;
+ case DLADM_STATUS_NO_HWRINGS:
+ s = "request hw rings failed";
+ break;
default:
s = "<unknown error>";
break;
@@ -169,11 +281,100 @@ dladm_errno2status(int err)
return (DLADM_STATUS_LINKBUSY);
case EAGAIN:
return (DLADM_STATUS_TRYAGAIN);
+ case ENOTEMPTY:
+ return (DLADM_STATUS_FLOW_EXISTS);
+ case EOPNOTSUPP:
+ return (DLADM_STATUS_FLOW_INCOMPATIBLE);
+ case EALREADY:
+ return (DLADM_STATUS_FLOW_IDENTICAL);
default:
return (DLADM_STATUS_FAILED);
}
}
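+
+/*
+ * Convert a bandwidth string to a value in bits per second. The input
+ * is an integer with an optional K, M or G suffix; a bare number is
+ * taken as Mbps. For example, "500K" yields 500000 and "2" yields
+ * 2000000; a trailing '%' is not yet supported.
+ */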
+dladm_status_t
+dladm_str2bw(char *oarg, uint64_t *bw)
+{
+ char *endp = NULL;
+ int64_t n;
+ int mult = 1;
+
+	errno = 0;
+	n = strtoull(oarg, &endp, 10);
+
+	if ((errno != 0) || (strlen(endp) > 1))
+ return (DLADM_STATUS_BADARG);
+
+ if (n < 0)
+ return (DLADM_STATUS_BADVAL);
+
+ switch (*endp) {
+ case 'k':
+ case 'K':
+ mult = 1000;
+ break;
+ case 'm':
+ case 'M':
+ case '\0':
+ mult = 1000000;
+ break;
+ case 'g':
+ case 'G':
+ mult = 1000000000;
+ break;
+ case '%':
+ /*
+ * percentages not supported for now,
+ * see RFE 6540675
+ */
+ return (DLADM_STATUS_NOTSUP);
+ default:
+ return (DLADM_STATUS_BADVAL);
+ }
+
+ *bw = n * mult;
+
+ /* check for overflow */
+ if (*bw / mult != n)
+ return (DLADM_STATUS_BADARG);
+
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert bandwidth in bps to a string in Mbps.  For values with no
+ * fractional Mbps component, print a whole Mbps value.  For values
+ * with fractional Mbps in whole Kbps, print the bandwidth in a manner
+ * similar to a floating-point format.
+ *
+ * bps string
+ * 0 0
+ * 100 0
+ * 2000 0.002
+ * 431000 0.431
+ * 1000000 1
+ * 1030000 1.030
+ * 100000000 100
+ */
+const char *
+dladm_bw2str(int64_t bw, char *buf)
+{
+ int kbps, mbps;
+
+	kbps = (bw % 1000000) / 1000;
+	mbps = bw / 1000000;
+ if (kbps != 0) {
+ if (mbps == 0)
+ (void) snprintf(buf, DLADM_STRSIZE, "0.%03u", kbps);
+ else
+ (void) snprintf(buf, DLADM_STRSIZE, "%5u.%03u", mbps,
+ kbps);
+ } else {
+ (void) snprintf(buf, DLADM_STRSIZE, "%5u", mbps);
+ }
+
+ return (buf);
+}
+
#define LOCK_DB_PERMS S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH
static int
@@ -241,6 +442,9 @@ dladm_class2str(datalink_class_t class, char *buf)
case DATALINK_CLASS_VNIC:
s = "vnic";
break;
+ case DATALINK_CLASS_ETHERSTUB:
+ s = "etherstub";
+ break;
default:
s = "unknown";
break;
@@ -491,3 +695,123 @@ dladm_valid_linkname(const char *link)
return (B_TRUE);
}
+
+/*
+ * Convert priority string to a value.
+ */
+dladm_status_t
+dladm_str2pri(char *token, mac_priority_level_t *pri)
+{
+	if (strcasecmp(token, "low") == 0) {
+		*pri = MPL_LOW;
+	} else if (strcasecmp(token, "medium") == 0) {
+		*pri = MPL_MEDIUM;
+	} else if (strcasecmp(token, "high") == 0) {
+		*pri = MPL_HIGH;
+	} else {
+		return (DLADM_STATUS_BADVAL);
+	}
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert priority value to a string.
+ */
+const char *
+dladm_pri2str(mac_priority_level_t pri, char *buf)
+{
+ const char *s;
+
+ switch (pri) {
+ case MPL_LOW:
+ s = "low";
+ break;
+ case MPL_MEDIUM:
+ s = "medium";
+ break;
+ case MPL_HIGH:
+ s = "high";
+ break;
+ default:
+ s = "--";
+ break;
+ }
+ (void) snprintf(buf, DLADM_STRSIZE, "%s", dgettext(TEXT_DOMAIN, s));
+ return (buf);
+}
+
+void
+dladm_free_args(dladm_arg_list_t *list)
+{
+ if (list != NULL) {
+ free(list->al_buf);
+ free(list);
+ }
+}
+
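+/*
+ * Parse a string of comma-separated name[=value[,value]...] pairs
+ * into a dladm_arg_list_t.  Example (hypothetical input): parsing
+ * "maxbw=100,priority=high" yields al_count == 2, with "maxbw"
+ * holding one value ("100") and "priority" holding one value
+ * ("high"); "cpus=1,2,3" yields a single name with three values.
+ */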
+dladm_status_t
+dladm_parse_args(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+ dladm_arg_list_t *list;
+ dladm_arg_info_t *aip;
+ char *buf, *curr;
+ int len, i;
+
+	list = malloc(sizeof (dladm_arg_list_t));
+	if (list == NULL)
+		return (dladm_errno2status(errno));
+
+	list->al_count = 0;
+	list->al_buf = buf = strdup(str);
+	if (buf == NULL) {
+		free(list);
+		return (dladm_errno2status(errno));
+	}
+
+ curr = buf;
+ len = strlen(buf);
+ aip = NULL;
+ for (i = 0; i < len; i++) {
+ char c = buf[i];
+ boolean_t match = (c == '=' || c == ',');
+
+ if (!match && i != len - 1)
+ continue;
+
+ if (match) {
+ buf[i] = '\0';
+ if (*curr == '\0')
+ goto fail;
+ }
+
+ if (aip != NULL && c != '=') {
+			if (aip->ai_count >= DLADM_MAX_ARG_VALS)
+ goto fail;
+
+ if (novalues)
+ goto fail;
+
+ aip->ai_val[aip->ai_count] = curr;
+ aip->ai_count++;
+ } else {
+			if (list->al_count >= DLADM_MAX_ARG_CNT)
+ goto fail;
+
+ aip = &list->al_info[list->al_count];
+ aip->ai_name = curr;
+ aip->ai_count = 0;
+ list->al_count++;
+ if (c == ',')
+ aip = NULL;
+ }
+ curr = buf + i + 1;
+ }
+
+ *listp = list;
+ return (DLADM_STATUS_OK);
+
+fail:
+ dladm_free_args(list);
+ return (DLADM_STATUS_FAILED);
+}
diff --git a/usr/src/lib/libdladm/common/libdladm.h b/usr/src/lib/libdladm/common/libdladm.h
index df69a54615..a76245d478 100644
--- a/usr/src/lib/libdladm/common/libdladm.h
+++ b/usr/src/lib/libdladm/common/libdladm.h
@@ -26,7 +26,7 @@
#ifndef _LIBDLADM_H
#define _LIBDLADM_H
-#include <sys/dls.h>
+#include <sys/dls_mgmt.h>
#include <sys/dlpi.h>
/*
@@ -60,16 +60,28 @@ extern "C" {
*
* - DLADM_OPT_PREFIX:
* The function requests to generate a link name using the specified prefix.
+ *
+ * - DLADM_OPT_VLAN:
+ *    Signifies the VLAN creation code path.
+ *
+ * - DLADM_OPT_HWRINGS:
+ *    Requires a hardware group of rings when creating a VNIC.
*/
#define DLADM_OPT_ACTIVE 0x00000001
#define DLADM_OPT_PERSIST 0x00000002
#define DLADM_OPT_CREATE 0x00000004
#define DLADM_OPT_FORCE 0x00000008
#define DLADM_OPT_PREFIX 0x00000010
+#define DLADM_OPT_ANCHOR 0x00000020
+#define DLADM_OPT_VLAN 0x00000040
+#define DLADM_OPT_HWRINGS 0x00000080
#define DLADM_WALK_TERMINATE 0
#define DLADM_WALK_CONTINUE -1
+#define DLADM_MAX_ARG_CNT 32
+#define DLADM_MAX_ARG_VALS 32
+
typedef enum {
DLADM_STATUS_OK = 0,
DLADM_STATUS_BADARG,
@@ -99,7 +111,44 @@ typedef enum {
DLADM_STATUS_VIDINVAL,
DLADM_STATUS_NONOTIF,
DLADM_STATUS_TRYAGAIN,
- DLADM_STATUS_NOTDEFINED
+ DLADM_STATUS_BADTIMEVAL,
+ DLADM_STATUS_INVALIDMACADDR,
+ DLADM_STATUS_INVALIDMACADDRNIC,
+ DLADM_STATUS_INVALIDMACADDRINUSE,
+ DLADM_STATUS_MACFACTORYSLOTINVALID,
+ DLADM_STATUS_MACFACTORYSLOTUSED,
+ DLADM_STATUS_MACFACTORYSLOTALLUSED,
+ DLADM_STATUS_MACFACTORYNOTSUP,
+ DLADM_STATUS_INVALIDMACPREFIX,
+ DLADM_STATUS_INVALIDMACPREFIXLEN,
+ DLADM_STATUS_CPUMAX,
+ DLADM_STATUS_CPUERR,
+ DLADM_STATUS_CPUNOTONLINE,
+ DLADM_STATUS_DB_NOTFOUND,
+ DLADM_STATUS_DB_PARSE_ERR,
+ DLADM_STATUS_PROP_PARSE_ERR,
+ DLADM_STATUS_ATTR_PARSE_ERR,
+ DLADM_STATUS_FLOW_DB_ERR,
+ DLADM_STATUS_FLOW_DB_OPEN_ERR,
+ DLADM_STATUS_FLOW_DB_PARSE_ERR,
+ DLADM_STATUS_FLOWPROP_DB_PARSE_ERR,
+ DLADM_STATUS_FLOW_ADD_ERR,
+ DLADM_STATUS_FLOW_WALK_ERR,
+ DLADM_STATUS_FLOW_IDENTICAL,
+ DLADM_STATUS_FLOW_INCOMPATIBLE,
+ DLADM_STATUS_FLOW_EXISTS,
+ DLADM_STATUS_PERSIST_FLOW_EXISTS,
+ DLADM_STATUS_INVALID_IP,
+ DLADM_STATUS_INVALID_PREFIXLEN,
+ DLADM_STATUS_INVALID_PROTOCOL,
+ DLADM_STATUS_INVALID_PORT,
+ DLADM_STATUS_INVALID_DSF,
+ DLADM_STATUS_INVALID_DSFMASK,
+ DLADM_STATUS_INVALID_MACMARGIN,
+ DLADM_STATUS_NOTDEFINED,
+ DLADM_STATUS_BADPROP,
+ DLADM_STATUS_MINMAXBW,
+ DLADM_STATUS_NO_HWRINGS
} dladm_status_t;
typedef enum {
@@ -111,11 +160,63 @@ typedef enum {
typedef int dladm_conf_t;
#define DLADM_INVALID_CONF 0
+typedef struct dladm_arg_info {
+ const char *ai_name;
+ char *ai_val[DLADM_MAX_ARG_VALS];
+ uint_t ai_count;
+} dladm_arg_info_t;
+
+typedef struct dladm_arg_list {
+ dladm_arg_info_t al_info[DLADM_MAX_ARG_CNT];
+ uint_t al_count;
+ char *al_buf;
+} dladm_arg_list_t;
+
+typedef enum {
+ DLADM_LOGTYPE_LINK = 1,
+ DLADM_LOGTYPE_FLOW
+} dladm_logtype_t;
+
+typedef struct dladm_usage {
+ char du_name[MAXLINKNAMELEN];
+ uint64_t du_duration;
+ uint64_t du_stime;
+ uint64_t du_etime;
+ uint64_t du_ipackets;
+ uint64_t du_rbytes;
+ uint64_t du_opackets;
+ uint64_t du_obytes;
+ uint64_t du_bandwidth;
+ boolean_t du_last;
+} dladm_usage_t;
+
extern const char *dladm_status2str(dladm_status_t, char *);
extern dladm_status_t dladm_set_rootdir(const char *);
extern const char *dladm_class2str(datalink_class_t, char *);
extern const char *dladm_media2str(uint32_t, char *);
extern boolean_t dladm_valid_linkname(const char *);
+extern dladm_status_t dladm_str2bw(char *, uint64_t *);
+extern const char *dladm_bw2str(int64_t, char *);
+
+extern dladm_status_t dladm_parse_flow_props(char *, dladm_arg_list_t **,
+ boolean_t);
+extern dladm_status_t dladm_parse_link_props(char *, dladm_arg_list_t **,
+ boolean_t);
+extern void dladm_free_props(dladm_arg_list_t *);
+extern dladm_status_t dladm_parse_flow_attrs(char *, dladm_arg_list_t **,
+ boolean_t);
+extern void dladm_free_attrs(dladm_arg_list_t *);
+
+extern dladm_status_t dladm_start_usagelog(dladm_logtype_t, uint_t);
+extern dladm_status_t dladm_stop_usagelog(dladm_logtype_t);
+extern dladm_status_t dladm_walk_usage_res(int (*)(dladm_usage_t *, void *),
+ int, char *, char *, char *, char *, void *);
+extern dladm_status_t dladm_walk_usage_time(int (*)(dladm_usage_t *, void *),
+ int, char *, char *, char *, void *);
+extern dladm_status_t dladm_usage_summary(int (*)(dladm_usage_t *, void *),
+ int, char *, void *);
+extern dladm_status_t dladm_usage_dates(int (*)(dladm_usage_t *, void *),
+ int, char *, char *, void *);
#ifdef __cplusplus
}
diff --git a/usr/src/lib/libdladm/common/libdladm_impl.h b/usr/src/lib/libdladm/common/libdladm_impl.h
index d4a5a52445..41f09b3a46 100644
--- a/usr/src/lib/libdladm/common/libdladm_impl.h
+++ b/usr/src/lib/libdladm/common/libdladm_impl.h
@@ -36,18 +36,17 @@ extern "C" {
#define MAXLINELEN 1024
#define BUFLEN(lim, ptr) (((lim) > (ptr)) ? ((lim) - (ptr)) : 0)
-typedef struct val_desc {
- char *vd_name;
- uintptr_t vd_val;
-} val_desc_t;
-
-#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t))
-
extern dladm_status_t dladm_errno2status(int);
extern dladm_status_t i_dladm_rw_db(const char *, mode_t,
dladm_status_t (*)(void *, FILE *, FILE *),
void *, boolean_t);
+extern const char *dladm_pri2str(mac_priority_level_t, char *);
+extern dladm_status_t dladm_str2pri(char *, mac_priority_level_t *);
+extern dladm_status_t dladm_parse_args(char *, dladm_arg_list_t **,
+ boolean_t);
+extern void dladm_free_args(dladm_arg_list_t *);
+
/*
* Link attributes persisted by dlmgmtd.
*/
@@ -65,11 +64,64 @@ extern dladm_status_t i_dladm_rw_db(const char *, mode_t,
#define FPORTS "portnames" /* string */
#define FPOLICY "policy" /* uint64_t */
#define FFIXMACADDR "fix_macaddr" /* boolean_t */
-#define FMACADDR "macaddr" /* string */
#define FFORCE "force" /* boolean_t */
#define FLACPMODE "lacp_mode" /* uint64_t */
#define FLACPTIMER "lacp_timer" /* uint64_t */
+/*
+ * Set for VNICs only
+ */
+#define FMADDRTYPE "maddrtype" /* uint64_t */
+#define FMADDRLEN "maddrlen" /* uint64_t */
+#define FMADDRSLOT "maddrslot" /* uint64_t */
+#define FMADDRPREFIXLEN "maddrpreflen" /* uint64_t */
+#define FHWRINGS "hwrings" /* boolean_t */
+
+/*
+ * Common fields
+ */
+#define FMACADDR "macaddr" /* string */
+
+/*
+ * Data structures used for implementing temporary properties
+ */
+
+typedef struct val_desc {
+ char *vd_name;
+ uintptr_t vd_val;
+} val_desc_t;
+
+#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t))
+
+extern dladm_status_t dladm_link_proplist_extract(dladm_arg_list_t *,
+ mac_resource_props_t *);
+
+extern dladm_status_t dladm_flow_proplist_extract(dladm_arg_list_t *,
+ mac_resource_props_t *);
+
+/*
+ * The prop extract() callback.
+ *
+ * rp_extract extracts the kernel structure from the val_desc_t created
+ * by the pd_check function.
+ */
+typedef dladm_status_t rp_extractf_t(val_desc_t *propval, void *arg,
+ uint_t cnt);
+extern rp_extractf_t do_extract_maxbw, do_extract_priority,
+ do_extract_cpus;
+
+typedef struct resource_prop_s {
+ /*
+ * resource property name
+ */
+ char *rp_name;
+
+ /*
+ * callback to extract kernel structure
+ */
+ rp_extractf_t *rp_extract;
+} resource_prop_t;
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/libdladm/common/libdlaggr.c b/usr/src/lib/libdladm/common/libdlaggr.c
index dba84441ea..5a155fcad9 100644
--- a/usr/src/lib/libdladm/common/libdlaggr.c
+++ b/usr/src/lib/libdladm/common/libdlaggr.c
@@ -37,6 +37,7 @@
#include <libintl.h>
#include <net/if_types.h>
#include <net/if_dl.h>
+#include <sys/dld.h>
#include <libdllink.h>
#include <libdlvlan.h>
#include <libdlaggr.h>
@@ -1110,7 +1111,7 @@ dladm_aggr_create(const char *name, uint16_t key, uint32_t nports,
for (i = 0; i < nports; i++) {
if ((dladm_datalink_id2info(ports[i].lp_linkid, NULL,
&class, &media, NULL, 0) != DLADM_STATUS_OK) ||
- (class != DATALINK_CLASS_PHYS) && (media != DL_ETHER)) {
+ !((class == DATALINK_CLASS_PHYS) && (media == DL_ETHER))) {
return (DLADM_STATUS_BADARG);
}
}
diff --git a/usr/src/lib/libdladm/common/libdlflow.c b/usr/src/lib/libdladm/common/libdlflow.c
new file mode 100644
index 0000000000..3ec77705a7
--- /dev/null
+++ b/usr/src/lib/libdladm/common/libdlflow.c
@@ -0,0 +1,903 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ethernet.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <strings.h>
+#include <libintl.h>
+#include <netdb.h>
+#include <net/if_types.h>
+#include <net/if_dl.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <libdlflow.h>
+#include <libdlflow_impl.h>
+#include <libdladm_impl.h>
+
+/* minimum buffer size for DLDIOCWALKFLOW */
+#define MIN_INFO_SIZE (4 * 1024)
+
+#define DLADM_FLOW_DB "/etc/dladm/flowadm.conf"
+#define DLADM_FLOW_DB_TMP "/etc/dladm/flowadm.conf.new"
+#define DLADM_FLOW_DB_LOCK "/tmp/flowadm.conf.lock"
+
+#define DLADM_FLOW_DB_PERMS S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH
+#define DLADM_FLOW_DB_OWNER UID_DLADM
+#define DLADM_FLOW_DB_GROUP GID_SYS
+
+#define BLANK_LINE(s) ((s[0] == '\0') || (s[0] == '#') || (s[0] == '\n'))
+#define MAXLINELEN 1024
+#define MAXPATHLEN 1024
+
+#define V4_PART_OF_V6(v6) ((v6)._S6_un._S6_u32[3])
+
+/* database file parameters */
+static const char *BW_LIMIT = "bw_limit";
+static const char *PRIORITY = "priority";
+static const char *LOCAL_IP_ADDR = "local_ip";
+static const char *REMOTE_IP_ADDR = "remote_ip";
+static const char *TRANSPORT = "transport";
+static const char *LOCAL_PORT = "local_port";
+static const char *DSFIELD = "dsfield";
+
+/*
+ * Open and lock the flowadm configuration lock file.  The lock is
+ * acquired as a reader (F_RDLCK) or writer (F_WRLCK).
+ */
+static int
+i_dladm_flow_lock_db(short type)
+{
+ int lock_fd;
+ struct flock lock;
+
+ if ((lock_fd = open(DLADM_FLOW_DB_LOCK, O_RDWR | O_CREAT | O_TRUNC,
+ DLADM_FLOW_DB_PERMS)) < 0)
+ return (-1);
+
+ lock.l_type = type;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+
+ if (fcntl(lock_fd, F_SETLKW, &lock) < 0) {
+ (void) close(lock_fd);
+ (void) unlink(DLADM_FLOW_DB_LOCK);
+ return (-1);
+ }
+ return (lock_fd);
+}
+
+/*
+ * Unlock and close the specified file.
+ */
+static void
+i_dladm_flow_unlock_db(int fd)
+{
+ struct flock lock;
+
+ if (fd < 0)
+ return;
+
+ lock.l_type = F_UNLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+
+ (void) fcntl(fd, F_SETLKW, &lock);
+ (void) close(fd);
+ (void) unlink(DLADM_FLOW_DB_LOCK);
+}
+
+/*
+ * Parse one line of the flowadm DB.
+ * Returns DLADM_STATUS_OK on success, DLADM_STATUS_FLOW_DB_PARSE_ERR
+ * (or another dladm status) on failure.
+ */
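+/*
+ * A DB line is a flow name followed by tab-separated name=value
+ * pairs, as written by i_dladm_flow_fput_grp(), e.g. (illustrative
+ * values):
+ *
+ *	http1	linkid=2	bw_limit=5000000	local_port=80
+ */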
+dladm_status_t
+dladm_flow_parse_db(char *line, dld_flowinfo_t *attr)
+{
+ char *token;
+ char *value, *name = NULL;
+ char *endp = NULL;
+ char *lasts = NULL;
+ dladm_status_t status = DLADM_STATUS_FLOW_DB_PARSE_ERR;
+
+ bzero(attr, sizeof (*attr));
+
+ /* flow name */
+ if ((token = strtok_r(line, " \t", &lasts)) == NULL)
+ goto done;
+
+ if (strlcpy(attr->fi_flowname, token, MAXNAMELEN) >= MAXNAMELEN)
+ goto done;
+
+ /* resource control and flow descriptor parameters */
+ while ((token = strtok_r(NULL, " \t", &lasts)) != NULL) {
+ if ((name = strdup(token)) == NULL)
+ goto done;
+
+ (void) strtok(name, "=");
+ value = strtok(NULL, "=");
+ if (value == NULL)
+ goto done;
+
+ if (strcmp(name, "linkid") == 0) {
+ if ((attr->fi_linkid =
+ (uint32_t)strtol(value, &endp, 10)) ==
+ DATALINK_INVALID_LINKID)
+ goto done;
+
+ } else if (strcmp(name, BW_LIMIT) == 0) {
+ attr->fi_resource_props.mrp_mask |=
+ MRP_MAXBW;
+ attr->fi_resource_props.mrp_maxbw =
+ (uint64_t)strtol(value, &endp, 0);
+
+ } else if (strcmp(name, PRIORITY) == 0) {
+ attr->fi_resource_props.mrp_mask |= MRP_PRIORITY;
+ status = dladm_str2pri(value,
+ &attr->fi_resource_props.mrp_priority);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ } else if (strcmp(name, DSFIELD) == 0) {
+ status = do_check_dsfield(value,
+ &attr->fi_flow_desc);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ } else if (strcmp(name, LOCAL_IP_ADDR) == 0) {
+ status = do_check_ip_addr(value, B_TRUE,
+ &attr->fi_flow_desc);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ } else if (strcmp(name, REMOTE_IP_ADDR) == 0) {
+ status = do_check_ip_addr(value, B_FALSE,
+ &attr->fi_flow_desc);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ } else if (strcmp(name, TRANSPORT) == 0) {
+ attr->fi_flow_desc.fd_mask |= FLOW_IP_PROTOCOL;
+ attr->fi_flow_desc.fd_protocol =
+ (uint8_t)strtol(value, &endp, 0);
+
+ } else if (strcmp(name, LOCAL_PORT) == 0) {
+ attr->fi_flow_desc.fd_mask |= FLOW_ULP_PORT_LOCAL;
+ attr->fi_flow_desc.fd_local_port =
+ (uint16_t)strtol(value, &endp, 10);
+ attr->fi_flow_desc.fd_local_port =
+ htons(attr->fi_flow_desc.fd_local_port);
+ }
+ free(name);
+ name = NULL;
+ }
+ if (attr->fi_linkid != DATALINK_INVALID_LINKID)
+ status = DLADM_STATUS_OK;
+done:
+ free(name);
+ return (status);
+}
+
+#define	FPRINTF_ERR(fcall)	do { if ((fcall) < 0) return (-1); } while (0)
+
+/*
+ * Write the attribute of a group to the specified file. Returns 0 on
+ * success, -1 on failure.
+ */
+static int
+i_dladm_flow_fput_grp(FILE *fp, dld_flowinfo_t *attr)
+{
+ FPRINTF_ERR(fprintf(fp, "%s\tlinkid=%d\t",
+ attr->fi_flowname, attr->fi_linkid));
+
+ /* flow policy */
+ if (attr->fi_resource_props.mrp_mask & MRP_MAXBW)
+ FPRINTF_ERR(fprintf(fp, "%s=%" PRIu64 "\t", BW_LIMIT,
+ attr->fi_resource_props.mrp_maxbw));
+
+ if (attr->fi_resource_props.mrp_mask & MRP_PRIORITY)
+ FPRINTF_ERR(fprintf(fp, "%s=%d\t", PRIORITY,
+ attr->fi_resource_props.mrp_priority));
+
+ /* flow descriptor */
+ if (attr->fi_flow_desc.fd_mask & FLOW_IP_DSFIELD)
+ FPRINTF_ERR(fprintf(fp, "%s=%x:%x\t", DSFIELD,
+ attr->fi_flow_desc.fd_dsfield,
+ attr->fi_flow_desc.fd_dsfield_mask));
+
+ if (attr->fi_flow_desc.fd_mask & FLOW_IP_LOCAL) {
+ char abuf[INET6_ADDRSTRLEN], *ap;
+ struct in_addr ipaddr;
+ int prefix_len, prefix_max;
+
+ if (attr->fi_flow_desc.fd_ipversion != 6) {
+			ipaddr.s_addr =
+			    V4_PART_OF_V6(attr->fi_flow_desc.fd_local_addr);
+
+ ap = inet_ntoa(ipaddr);
+ prefix_max = IP_ABITS;
+ } else {
+ (void) inet_ntop(AF_INET6,
+ &attr->fi_flow_desc.fd_local_addr,
+ abuf, INET6_ADDRSTRLEN);
+
+ ap = abuf;
+ prefix_max = IPV6_ABITS;
+ }
+ (void) dladm_mask2prefixlen(
+ &attr->fi_flow_desc.fd_local_netmask, prefix_max,
+ &prefix_len);
+
+ FPRINTF_ERR(fprintf(fp, "%s=%s/%d\t", LOCAL_IP_ADDR,
+ ap, prefix_len));
+ }
+ if (attr->fi_flow_desc.fd_mask & FLOW_IP_REMOTE) {
+ char abuf[INET6_ADDRSTRLEN], *ap;
+ struct in_addr ipaddr;
+ int prefix_len, prefix_max;
+
+ if (attr->fi_flow_desc.fd_ipversion != 6) {
+			ipaddr.s_addr =
+			    V4_PART_OF_V6(attr->fi_flow_desc.fd_remote_addr);
+
+ ap = inet_ntoa(ipaddr);
+ prefix_max = IP_ABITS;
+ } else {
+ (void) inet_ntop(AF_INET6,
+ &(attr->fi_flow_desc.fd_remote_addr),
+ abuf, INET6_ADDRSTRLEN);
+
+ ap = abuf;
+ prefix_max = IPV6_ABITS;
+ }
+ (void) dladm_mask2prefixlen(
+ &attr->fi_flow_desc.fd_remote_netmask, prefix_max,
+ &prefix_len);
+
+ FPRINTF_ERR(fprintf(fp, "%s=%s/%d\t", REMOTE_IP_ADDR,
+ ap, prefix_len));
+ }
+ if (attr->fi_flow_desc.fd_mask & FLOW_IP_PROTOCOL)
+ FPRINTF_ERR(fprintf(fp, "%s=%d\t", TRANSPORT,
+ attr->fi_flow_desc.fd_protocol));
+
+ if (attr->fi_flow_desc.fd_mask & FLOW_ULP_PORT_LOCAL)
+ FPRINTF_ERR(fprintf(fp, "%s=%d\t", LOCAL_PORT,
+ ntohs(attr->fi_flow_desc.fd_local_port)));
+
+ FPRINTF_ERR(fprintf(fp, "\n"));
+
+ return (0);
+}
+
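+/*
+ * Walk the flow DB, invoking fn() on each parsed entry while the DB
+ * is rewritten through a temporary file.  fn() returns -1 to abort
+ * the walk, 0 to keep the (possibly modified) entry, or 1 to drop it.
+ */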
+static dladm_status_t
+i_dladm_flow_walk_rw_db(int (*fn)(void *, dld_flowinfo_t *),
+ void *arg,
+ const char *root)
+{
+ FILE *fp, *nfp;
+ int nfd, fn_rc, lock_fd;
+ char line[MAXLINELEN];
+ dld_flowinfo_t attr;
+ char *db_file, *tmp_db_file;
+ char db_file_buf[MAXPATHLEN];
+ char tmp_db_file_buf[MAXPATHLEN];
+ dladm_status_t status = DLADM_STATUS_FLOW_DB_ERR;
+
+ if (root == NULL) {
+ db_file = DLADM_FLOW_DB;
+ tmp_db_file = DLADM_FLOW_DB_TMP;
+ } else {
+ (void) snprintf(db_file_buf, MAXPATHLEN, "%s%s", root,
+ DLADM_FLOW_DB);
+ (void) snprintf(tmp_db_file_buf, MAXPATHLEN, "%s%s", root,
+ DLADM_FLOW_DB_TMP);
+ db_file = db_file_buf;
+ tmp_db_file = tmp_db_file_buf;
+ }
+
+ if ((lock_fd = i_dladm_flow_lock_db(F_WRLCK)) < 0)
+ return (DLADM_STATUS_FLOW_DB_ERR);
+
+ if ((fp = fopen(db_file, "r")) == NULL) {
+ i_dladm_flow_unlock_db(lock_fd);
+ return (DLADM_STATUS_FLOW_DB_OPEN_ERR);
+ }
+
+ if ((nfd = open(tmp_db_file, O_WRONLY|O_CREAT|O_TRUNC,
+ DLADM_FLOW_DB_PERMS)) == -1) {
+ (void) fclose(fp);
+ i_dladm_flow_unlock_db(lock_fd);
+ return (DLADM_STATUS_FLOW_DB_OPEN_ERR);
+ }
+
+ if ((nfp = fdopen(nfd, "w")) == NULL) {
+ (void) close(nfd);
+ (void) fclose(fp);
+ (void) unlink(tmp_db_file);
+ i_dladm_flow_unlock_db(lock_fd);
+ return (DLADM_STATUS_FLOW_DB_OPEN_ERR);
+ }
+
+ while (fgets(line, MAXLINELEN, fp) != NULL) {
+
+		/* pass blank lines and comments through unchanged */
+ if (BLANK_LINE(line)) {
+ if (fputs(line, nfp) == EOF)
+ goto failed;
+ continue;
+ }
+ (void) strtok(line, " \n");
+
+ if ((status = dladm_flow_parse_db(line, &attr)) !=
+ DLADM_STATUS_OK)
+ goto failed;
+
+ fn_rc = fn(arg, &attr);
+
+ switch (fn_rc) {
+ case -1:
+ /* failure, stop walking */
+ goto failed;
+ case 0:
+ /*
+ * Success, write group attributes, which could
+ * have been modified by fn().
+ */
+ if (i_dladm_flow_fput_grp(nfp, &attr) != 0)
+ goto failed;
+ break;
+ case 1:
+ /* skip current group */
+ break;
+ }
+ }
+ if (fchmod(nfd, DLADM_FLOW_DB_PERMS) == -1)
+ goto failed;
+
+ if (fchown(nfd, DLADM_FLOW_DB_OWNER, DLADM_FLOW_DB_GROUP) == -1)
+ goto failed;
+
+ if (fflush(nfp) == EOF)
+ goto failed;
+
+ (void) fclose(fp);
+ (void) fclose(nfp);
+
+ if (rename(tmp_db_file, db_file) == -1) {
+ (void) unlink(tmp_db_file);
+ i_dladm_flow_unlock_db(lock_fd);
+ return (DLADM_STATUS_FLOW_DB_ERR);
+ }
+ i_dladm_flow_unlock_db(lock_fd);
+ return (DLADM_STATUS_OK);
+
+failed:
+ (void) fclose(fp);
+ (void) fclose(nfp);
+ (void) unlink(tmp_db_file);
+ i_dladm_flow_unlock_db(lock_fd);
+
+ return (status);
+}
+
+/*
+ * Remove existing flow from DB.
+ */
+
+typedef struct remove_db_state {
+ dld_flowinfo_t rs_newattr;
+ dld_flowinfo_t rs_oldattr;
+ boolean_t rs_found;
+} remove_db_state_t;
+
+static int
+i_dladm_flow_remove_db_fn(void *arg, dld_flowinfo_t *grp)
+{
+ remove_db_state_t *state = (remove_db_state_t *)arg;
+ dld_flowinfo_t *attr = &state->rs_newattr;
+
+	if (strcmp(grp->fi_flowname, attr->fi_flowname) != 0)
+		return (0);
+
+	bcopy(grp, &state->rs_oldattr, sizeof (dld_flowinfo_t));
+	state->rs_found = B_TRUE;
+	return (1);
+}
+
+/* ARGSUSED */
+static int
+i_dladm_flow_remove_db(remove_db_state_t *state, const char *root)
+{
+ if (i_dladm_flow_walk_rw_db(i_dladm_flow_remove_db_fn, state, root)
+ != 0)
+ return (-1);
+
+ if (!state->rs_found) {
+ errno = ENOENT;
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Create a flow in the DB.
+ */
+
+typedef struct modify_db_state {
+ dld_flowinfo_t ms_newattr;
+ dld_flowinfo_t ms_oldattr;
+ boolean_t ms_found;
+} modify_db_state_t;
+
+static dladm_status_t
+i_dladm_flow_create_db(dld_flowinfo_t *attr, const char *root)
+{
+ FILE *fp;
+ char line[MAXLINELEN];
+ char *db_file;
+ char db_file_buf[MAXPATHLEN];
+ int lock_fd;
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ if (root == NULL) {
+ db_file = DLADM_FLOW_DB;
+ } else {
+ (void) snprintf(db_file_buf, MAXPATHLEN, "%s%s", root,
+ DLADM_FLOW_DB);
+ db_file = db_file_buf;
+ }
+
+ if ((lock_fd = i_dladm_flow_lock_db(F_WRLCK)) < 0)
+ return (DLADM_STATUS_FLOW_DB_ERR);
+
+ if ((fp = fopen(db_file, "r+")) == NULL &&
+ (fp = fopen(db_file, "w")) == NULL) {
+ i_dladm_flow_unlock_db(lock_fd);
+ return (DLADM_STATUS_FLOW_DB_OPEN_ERR);
+ }
+
+ /* look for existing group with same flowname */
+ while (fgets(line, MAXLINELEN, fp) != NULL) {
+ char *holder, *lasts;
+
+		/* skip blank lines and comments */
+ if (BLANK_LINE(line))
+ continue;
+
+ /* ignore corrupted lines */
+ holder = strtok_r(line, " \t", &lasts);
+ if (holder == NULL)
+ continue;
+
+ /* flow id */
+ if (strcmp(holder, attr->fi_flowname) == 0) {
+ /* group with flow id already exists */
+ status = DLADM_STATUS_PERSIST_FLOW_EXISTS;
+ goto failed;
+ }
+ }
+	/*
+	 * If we get here, we've verified that no existing group has the
+	 * same flow name.  It's now time to add the new group to the DB.
+	 */
+ if (i_dladm_flow_fput_grp(fp, attr) != 0)
+ status = DLADM_STATUS_FLOW_DB_PARSE_ERR;
+
+failed:
+ (void) fclose(fp);
+ i_dladm_flow_unlock_db(lock_fd);
+ return (status);
+}
+
+static dladm_status_t
+i_dladm_flow_add(char *flowname, datalink_id_t linkid, flow_desc_t *flowdesc,
+ mac_resource_props_t *mrp)
+{
+ dld_ioc_addflow_t attr;
+ int fd;
+
+ /* create flow */
+ bzero(&attr, sizeof (attr));
+ bcopy(flowdesc, &attr.af_flow_desc, sizeof (flow_desc_t));
+ if (mrp != NULL) {
+ bcopy(mrp, &attr.af_resource_props,
+ sizeof (mac_resource_props_t));
+ }
+
+ (void) strlcpy(attr.af_name, flowname, sizeof (attr.af_name));
+ attr.af_linkid = linkid;
+
+ fd = open(DLD_CONTROL_DEV, O_RDWR);
+ if (fd < 0)
+ return (dladm_errno2status(errno));
+
+ if (ioctl(fd, DLDIOC_ADDFLOW, &attr) < 0) {
+ (void) close(fd);
+ return (dladm_errno2status(errno));
+ }
+
+ (void) close(fd);
+
+ return (DLADM_STATUS_OK);
+}
+
+static dladm_status_t
+i_dladm_flow_remove(char *flowname)
+{
+ dld_ioc_removeflow_t attr;
+ int fd;
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ (void) strlcpy(attr.rf_name, flowname,
+ sizeof (attr.rf_name));
+
+ fd = open(DLD_CONTROL_DEV, O_RDWR);
+ if (fd < 0)
+ return (dladm_errno2status(errno));
+
+ if (ioctl(fd, DLDIOC_REMOVEFLOW, &attr) < 0)
+ status = dladm_errno2status(errno);
+
+ (void) close(fd);
+
+ return (status);
+}
+
+/* ARGSUSED */
+dladm_status_t
+dladm_flow_add(datalink_id_t linkid, dladm_arg_list_t *attrlist,
+ dladm_arg_list_t *proplist, char *flowname, boolean_t tempop,
+ const char *root)
+{
+ dld_flowinfo_t db_attr;
+ flow_desc_t flowdesc;
+ mac_resource_props_t mrp;
+ dladm_status_t status;
+
+ /* Extract flow attributes from attrlist */
+ bzero(&flowdesc, sizeof (flow_desc_t));
+ if (attrlist != NULL && (status = dladm_flow_attrlist_extract(attrlist,
+ &flowdesc)) != DLADM_STATUS_OK) {
+ return (status);
+ }
+
+ /* Extract resource_ctl and cpu_list from proplist */
+ bzero(&mrp, sizeof (mac_resource_props_t));
+ if (proplist != NULL && (status = dladm_flow_proplist_extract(proplist,
+ &mrp)) != DLADM_STATUS_OK) {
+ return (status);
+ }
+
+ /* Add flow in kernel */
+ status = i_dladm_flow_add(flowname, linkid, &flowdesc, &mrp);
+ if (status != DLADM_STATUS_OK)
+ return (status);
+
+ /* Add flow to DB */
+ if (!tempop) {
+ bzero(&db_attr, sizeof (db_attr));
+ bcopy(&flowdesc, &db_attr.fi_flow_desc, sizeof (flow_desc_t));
+ (void) strlcpy(db_attr.fi_flowname, flowname,
+ sizeof (db_attr.fi_flowname));
+ db_attr.fi_linkid = linkid;
+
+ if ((status = i_dladm_flow_create_db(&db_attr, root)) !=
+ DLADM_STATUS_OK) {
+ (void) i_dladm_flow_remove(flowname);
+ return (status);
+ }
+ /* set flow properties */
+ if (proplist != NULL) {
+ status = i_dladm_set_flow_proplist_db(flowname,
+ proplist);
+ if (status != DLADM_STATUS_OK) {
+ (void) i_dladm_flow_remove(flowname);
+ return (status);
+ }
+ }
+ }
+ return (status);
+}
+
+/*
+ * Remove a flow.
+ */
+/* ARGSUSED */
+dladm_status_t
+dladm_flow_remove(char *flowname, boolean_t tempop,
+ const char *root)
+{
+ remove_db_state_t state;
+ dladm_status_t status = DLADM_STATUS_OK;
+ dladm_status_t s = DLADM_STATUS_OK;
+
+ /* remove flow */
+ status = i_dladm_flow_remove(flowname);
+ if ((status != DLADM_STATUS_OK) &&
+ (tempop || status != DLADM_STATUS_NOTFOUND))
+ goto done;
+
+ /* remove flow from DB */
+ if (!tempop) {
+ bzero(&state, sizeof (state));
+ (void) strlcpy(state.rs_newattr.fi_flowname, flowname,
+ sizeof (state.rs_newattr.fi_flowname));
+ state.rs_found = B_FALSE;
+
+ /* flow DB */
+ if (i_dladm_flow_remove_db(&state, root) < 0) {
+ s = dladm_errno2status(errno);
+ goto done;
+ }
+
+ /* flow prop DB */
+ s = dladm_set_flowprop(flowname, NULL, NULL, 0,
+ DLADM_OPT_PERSIST, NULL);
+ }
+
+done:
+ if (!tempop) {
+ if (s == DLADM_STATUS_OK) {
+ if (status == DLADM_STATUS_NOTFOUND)
+ status = s;
+ } else {
+ if (s != DLADM_STATUS_NOTFOUND)
+ status = s;
+ }
+ }
+ return (status);
+}
+
+/*
+ * Get an existing flow in the DB.
+ */
+
+typedef struct get_db_state {
+ int (*gs_fn)(dladm_flow_attr_t *, void *);
+ void *gs_arg;
+ datalink_id_t gs_linkid;
+} get_db_state_t;
+
+/*
+ * For each flow which matches the linkid, copy all flow information
+ * to a new dladm_flow_attr_t structure and call the provided
+ * function.  This is used to display persistent flows from
+ * the database.
+ */
+
+static int
+i_dladm_flow_get_db_fn(void *arg, dld_flowinfo_t *grp)
+{
+ get_db_state_t *state = (get_db_state_t *)arg;
+ dladm_flow_attr_t attr;
+
+ if (grp->fi_linkid == state->gs_linkid) {
+ attr.fa_linkid = state->gs_linkid;
+ bcopy(grp->fi_flowname, &attr.fa_flowname,
+ sizeof (attr.fa_flowname));
+ bcopy(&grp->fi_flow_desc, &attr.fa_flow_desc,
+ sizeof (attr.fa_flow_desc));
+ bcopy(&grp->fi_resource_props, &attr.fa_resource_props,
+ sizeof (attr.fa_resource_props));
+ (void) state->gs_fn(&attr, state->gs_arg);
+ }
+ return (0);
+}
+
+/*
+ * Walk through the flows defined on the system and for each flow
+ * invoke <fn>(<arg>, <flow>);
+ * Currently used for show-flow.
+ */
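+/*
+ * A minimal caller sketch (hypothetical callback):
+ *
+ *	static int
+ *	show_one(dladm_flow_attr_t *attr, void *arg)
+ *	{
+ *		(void) printf("%s\n", attr->fa_flowname);
+ *		return (DLADM_WALK_CONTINUE);
+ *	}
+ *
+ *	(void) dladm_walk_flow(show_one, linkid, NULL, B_FALSE);
+ */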
+/* ARGSUSED */
+dladm_status_t
+dladm_walk_flow(int (*fn)(dladm_flow_attr_t *, void *),
+ datalink_id_t linkid, void *arg, boolean_t persist)
+{
+ dld_flowinfo_t *flow;
+ int i, bufsize, fd;
+ dld_ioc_walkflow_t *ioc = NULL;
+ dladm_flow_attr_t attr;
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ if (fn == NULL)
+ return (DLADM_STATUS_BADARG);
+
+ if (persist) {
+ get_db_state_t state;
+
+ bzero(&state, sizeof (state));
+
+ state.gs_linkid = linkid;
+ state.gs_fn = fn;
+ state.gs_arg = arg;
+ status = i_dladm_flow_walk_rw_db(i_dladm_flow_get_db_fn,
+ &state, NULL);
+ if (status != DLADM_STATUS_OK)
+ return (status);
+ } else {
+ if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
+ return (dladm_errno2status(errno));
+
+ bufsize = MIN_INFO_SIZE;
+ if ((ioc = calloc(1, bufsize)) == NULL) {
+ status = dladm_errno2status(errno);
+ (void) close(fd);
+ return (status);
+ }
+
+ ioc->wf_linkid = linkid;
+ ioc->wf_len = bufsize - sizeof (*ioc);
+
+		while (ioctl(fd, DLDIOC_WALKFLOW, ioc) < 0) {
+			if (errno == ENOSPC) {
+				dld_ioc_walkflow_t *newioc;
+
+				bufsize *= 2;
+				/* grow via a temporary to avoid a leak */
+				newioc = realloc(ioc, bufsize);
+				if (newioc != NULL) {
+					ioc = newioc;
+					ioc->wf_linkid = linkid;
+					ioc->wf_len = bufsize - sizeof (*ioc);
+					continue;
+				}
+			}
+			status = dladm_errno2status(errno);
+			goto bail;
+		}
+
+ flow = (dld_flowinfo_t *)(void *)(ioc + 1);
+ for (i = 0; i < ioc->wf_nflows; i++, flow++) {
+ bzero(&attr, sizeof (attr));
+
+ attr.fa_linkid = flow->fi_linkid;
+ bcopy(&flow->fi_flowname, &attr.fa_flowname,
+ sizeof (attr.fa_flowname));
+ bcopy(&flow->fi_flow_desc, &attr.fa_flow_desc,
+ sizeof (attr.fa_flow_desc));
+ bcopy(&flow->fi_resource_props, &attr.fa_resource_props,
+ sizeof (attr.fa_resource_props));
+
+ if (fn(&attr, arg) == DLADM_WALK_TERMINATE)
+ break;
+ }
+ }
+
+bail:
+ free(ioc);
+ (void) close(fd);
+ return (status);
+}
+
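+/*
+ * Re-create the flows described in the persistent flow DB, typically
+ * run once at boot.  A parse or add failure is remembered in the
+ * returned status but does not stop the walk.
+ */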
+dladm_status_t
+dladm_flow_init(void)
+{
+ flow_desc_t flowdesc;
+ datalink_id_t linkid;
+ dladm_status_t s, status = DLADM_STATUS_OK;
+ char name[MAXNAMELEN];
+ char line[MAXLINELEN];
+ dld_flowinfo_t attr;
+ FILE *fp;
+
+ if ((fp = fopen(DLADM_FLOW_DB, "r")) == NULL)
+ return (DLADM_STATUS_DB_NOTFOUND);
+
+ while (fgets(line, MAXLINELEN, fp) != NULL) {
+		/* skip blank lines and comments */
+ if (BLANK_LINE(line))
+ continue;
+
+ (void) strtok(line, " \n");
+
+ s = dladm_flow_parse_db(line, &attr);
+ if (s != DLADM_STATUS_OK) {
+ status = s;
+ continue;
+ }
+ bzero(&flowdesc, sizeof (flowdesc));
+ bcopy(&attr.fi_flow_desc, &flowdesc, sizeof (flow_desc_t));
+		(void) strlcpy(name, attr.fi_flowname, sizeof (name));
+ linkid = attr.fi_linkid;
+
+ s = i_dladm_flow_add(name, linkid, &flowdesc, NULL);
+ if (s != DLADM_STATUS_OK)
+ status = s;
+ }
+ s = i_dladm_init_flowprop_db();
+ if (s != DLADM_STATUS_OK)
+ status = s;
+
+ (void) fclose(fp);
+ return (status);
+}
+
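+/*
+ * Fill in a netmask of the given prefix length.  Example: on a zeroed
+ * 4-byte buffer, dladm_prefixlen2mask(20, IP_ABITS, mask) yields
+ * 0xff 0xff 0xf0 0x00 (255.255.240.0).  The caller must supply a
+ * zeroed buffer; the function only ORs bits in.
+ */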
+dladm_status_t
+dladm_prefixlen2mask(int prefixlen, int maxlen, uchar_t *mask)
+{
+ if (prefixlen < 0 || prefixlen > maxlen)
+ return (DLADM_STATUS_BADARG);
+
+ while (prefixlen > 0) {
+ if (prefixlen >= 8) {
+ *mask++ = 0xFF;
+ prefixlen -= 8;
+ continue;
+ }
+ *mask |= 1 << (8 - prefixlen);
+ prefixlen--;
+ }
+ return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_mask2prefixlen(in6_addr_t *mask, int plen, int *prefixlen)
+{
+ int bits;
+ int i, end;
+
+ switch (plen) {
+ case IP_ABITS:
+ end = 3;
+ break;
+ case IPV6_ABITS:
+ end = 0;
+ break;
+ default:
+ return (DLADM_STATUS_BADARG);
+ }
+
+ for (i = 3; i >= end; i--) {
+ if (mask->_S6_un._S6_u32[i] == 0) {
+ plen -= 32;
+ continue;
+ }
+ bits = ffs(ntohl(mask->_S6_un._S6_u32[i])) - 1;
+ if (bits == 0)
+ break;
+ plen -= bits;
+ }
+ *prefixlen = plen;
+ return (DLADM_STATUS_OK);
+}
diff --git a/usr/src/lib/libdladm/common/libdlflow.h b/usr/src/lib/libdladm/common/libdlflow.h
new file mode 100644
index 0000000000..d35631ba4b
--- /dev/null
+++ b/usr/src/lib/libdladm/common/libdlflow.h
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBDLFLOW_H
+#define _LIBDLFLOW_H
+
+/*
+ * This file includes structures, macros and routines used by general
+ * flow administration.
+ */
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <sys/mac_flow.h>
+#include <sys/dld.h>
+#include <sys/param.h>
+#include <sys/mac.h>
+#include <libdladm.h>
+#include <libdladm_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct dladm_flow_attr {
+ datalink_id_t fa_linkid;
+ char fa_flowname[MAXNAMELEN];
+ flow_desc_t fa_flow_desc;
+ mac_resource_props_t fa_resource_props;
+ uint64_t fa_mask;
+ int fa_nattr;
+} dladm_flow_attr_t;
+
+extern dladm_status_t dladm_flow_add(datalink_id_t, dladm_arg_list_t *,
+ dladm_arg_list_t *, char *, boolean_t,
+ const char *);
+extern dladm_status_t dladm_flow_remove(char *, boolean_t, const char *);
+extern dladm_status_t dladm_flow_init(void);
+
+extern dladm_status_t dladm_flow_parse_db(char *, dld_flowinfo_t *);
+extern dladm_status_t dladm_walk_flow(int (*)(dladm_flow_attr_t *,
+ void *), datalink_id_t, void *, boolean_t);
+extern dladm_status_t dladm_flow_info(const char *, dladm_flow_attr_t *);
+
+extern dladm_status_t dladm_set_flowprop(const char *, const char *,
+ char **, uint_t, uint_t, char **);
+extern dladm_status_t dladm_get_flowprop(const char *, uint32_t,
+ const char *, char **, uint_t *);
+extern dladm_status_t dladm_walk_flowprop(int (*)(void *, const char *),
+ const char *, void *);
+
+extern void dladm_flow_attr_mask(uint64_t, dladm_flow_attr_t *);
+extern dladm_status_t dladm_flow_attr_check(dladm_arg_list_t *);
+extern dladm_status_t dladm_prefixlen2mask(int, int, uchar_t *);
+extern dladm_status_t dladm_mask2prefixlen(in6_addr_t *, int, int *);
+extern char *dladm_proto2str(uint8_t);
+extern uint8_t dladm_str2proto(const char *);
+
+extern void dladm_flow_attr_ip2str(dladm_flow_attr_t *,
+ char *, size_t);
+extern void dladm_flow_attr_proto2str(dladm_flow_attr_t *,
+ char *, size_t);
+extern void dladm_flow_attr_port2str(dladm_flow_attr_t *,
+ char *, size_t);
+extern void dladm_flow_attr_dsfield2str(dladm_flow_attr_t *,
+ char *, size_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBDLFLOW_H */
diff --git a/usr/src/lib/libdladm/common/libdlflow_impl.h b/usr/src/lib/libdladm/common/libdlflow_impl.h
new file mode 100644
index 0000000000..09b6d55bc1
--- /dev/null
+++ b/usr/src/lib/libdladm/common/libdlflow_impl.h
@@ -0,0 +1,138 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBDLFLOW_IMPL_H
+#define _LIBDLFLOW_IMPL_H
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/mac.h>
+#include <libdladm.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct fprop_desc;
+struct fattr_desc;
+
+typedef dladm_status_t fpd_getf_t(const char *, char **, uint_t *);
+typedef dladm_status_t fpd_setf_t(const char *, val_desc_t *, uint_t);
+typedef dladm_status_t fpd_checkf_t(struct fprop_desc *, char **,
+ uint_t, val_desc_t **);
+
+typedef struct fprop_desc {
+ char *pd_name;
+ val_desc_t pd_defval;
+ val_desc_t *pd_modval;
+ uint_t pd_nmodval;
+ boolean_t pd_temponly;
+ fpd_setf_t *pd_set;
+ fpd_getf_t *pd_getmod;
+ fpd_getf_t *pd_get;
+ fpd_checkf_t *pd_check;
+} fprop_desc_t;
+
+typedef struct prop_table {
+ fprop_desc_t *pt_table;
+ uint_t pt_size;
+} prop_table_t;
+
+typedef enum {
+ DLADM_PROP_VAL_CURRENT = 1,
+ DLADM_PROP_VAL_DEFAULT,
+ DLADM_PROP_VAL_MODIFIABLE,
+ DLADM_PROP_VAL_PERSISTENT
+} prop_type_t;
+
+typedef dladm_status_t fad_checkf_t(char *, flow_desc_t *);
+
+extern dladm_status_t do_check_ip_addr(char *, boolean_t, flow_desc_t *);
+extern dladm_status_t do_check_dsfield(char *, flow_desc_t *);
+
+typedef struct fattr_desc {
+ const char *ad_name;
+ fad_checkf_t *ad_check;
+} fattr_desc_t;
+
+extern dladm_status_t i_dladm_get_prop_temp(const char *, prop_type_t,
+ const char *, char **, uint_t *, prop_table_t *);
+extern dladm_status_t i_dladm_set_prop_temp(const char *, const char *,
+ char **, uint_t, uint_t, char **, prop_table_t *);
+extern boolean_t i_dladm_is_prop_temponly(const char *prop_name,
+ char **, prop_table_t *);
+/*
+ * Data structures used for implementing persistent properties
+ */
+typedef struct prop_val {
+ const char *lv_name;
+ struct prop_val *lv_nextval;
+} prop_val_t;
+
+typedef struct prop_db_info {
+ const char *li_name;
+ struct prop_db_info *li_nextprop;
+ struct prop_val *li_val;
+} prop_db_info_t;
+
+typedef struct prop_db_state prop_db_state_t;
+
+typedef boolean_t (*prop_db_op_t)(prop_db_state_t *,
+ char *, prop_db_info_t *, dladm_status_t *);
+
+typedef dladm_status_t (*prop_db_initop_t)(const char *, const char *,
+ char **, uint_t, uint_t, char **);
+
+struct prop_db_state {
+ prop_db_op_t ls_op;
+ const char *ls_name;
+ const char *ls_propname;
+ char **ls_propval;
+ uint_t *ls_valcntp;
+ prop_db_initop_t ls_initop;
+};
+
+extern boolean_t process_prop_set(prop_db_state_t *lsp, char *buf,
+ prop_db_info_t *listp, dladm_status_t *statusp);
+extern boolean_t process_prop_get(prop_db_state_t *lsp, char *buf,
+ prop_db_info_t *listp, dladm_status_t *statusp);
+extern boolean_t process_prop_init(prop_db_state_t *lsp, char *buf,
+ prop_db_info_t *listp, dladm_status_t *statusp);
+extern dladm_status_t process_prop_db(void *arg, FILE *fp, FILE *nfp);
+
+extern dladm_status_t i_dladm_init_flowprop_db(void);
+extern dladm_status_t i_dladm_set_flow_proplist_db(char *,
+ dladm_arg_list_t *);
+extern dladm_status_t i_dladm_flow_check_restriction(datalink_id_t,
+ flow_desc_t *, mac_resource_props_t *, boolean_t);
+
+extern dladm_status_t dladm_flow_attrlist_extract(dladm_arg_list_t *,
+ flow_desc_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBDLFLOW_IMPL_H */
diff --git a/usr/src/lib/libdladm/common/libdllink.c b/usr/src/lib/libdladm/common/libdllink.c
index 8deed6fe76..5698409442 100644
--- a/usr/src/lib/libdladm/common/libdllink.c
+++ b/usr/src/lib/libdladm/common/libdllink.c
@@ -62,6 +62,50 @@ i_dladm_info(int fd, const datalink_id_t linkid, dladm_attr_t *dap)
return (DLADM_STATUS_OK);
}
+static dladm_status_t
+dladm_usagelog(dladm_logtype_t type, dld_ioc_usagelog_t *log_info)
+{
+ int fd;
+
+ fd = open(DLD_CONTROL_DEV, O_RDWR);
+ if (fd < 0)
+ return (DLADM_STATUS_IOERR);
+
+ if (type == DLADM_LOGTYPE_FLOW)
+ log_info->ul_type = MAC_LOGTYPE_FLOW;
+ else
+ log_info->ul_type = MAC_LOGTYPE_LINK;
+
+ if (ioctl(fd, DLDIOC_USAGELOG, log_info) < 0) {
+ (void) close(fd);
+ return (DLADM_STATUS_IOERR);
+ }
+ (void) close(fd);
+ return (DLADM_STATUS_OK);
+}
+
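+/*
+ * Usage sketch (illustrative): dladm_start_usagelog(DLADM_LOGTYPE_LINK,
+ * 20) turns on link usage logging with a logging interval of 20 (time
+ * units as consumed by DLDIOC_USAGELOG; assumed to be seconds), and
+ * dladm_stop_usagelog(DLADM_LOGTYPE_LINK) turns it off.
+ */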
+dladm_status_t
+dladm_start_usagelog(dladm_logtype_t type, uint_t interval)
+{
+ dld_ioc_usagelog_t log_info;
+
+ log_info.ul_onoff = B_TRUE;
+ log_info.ul_interval = interval;
+
+ return (dladm_usagelog(type, &log_info));
+}
+
+dladm_status_t
+dladm_stop_usagelog(dladm_logtype_t type)
+{
+ dld_ioc_usagelog_t log_info;
+
+ log_info.ul_onoff = B_FALSE;
+ log_info.ul_interval = 0;
+
+ return (dladm_usagelog(type, &log_info));
+}
+
struct i_dladm_walk_arg {
dladm_walkcb_t *fn;
void *arg;
@@ -96,6 +140,112 @@ dladm_walk(dladm_walkcb_t *fn, void *arg, datalink_class_t class,
class, dmedia, flags));
}
+#define MAXGRPPERLINK 64
+
+int
+dladm_walk_hwgrp(datalink_id_t linkid, void *arg,
+ boolean_t (*fn)(void *, dladm_hwgrp_attr_t *))
+{
+ int fd, bufsize, ret;
+ int nhwgrp = MAXGRPPERLINK;
+ dld_ioc_hwgrpget_t *iomp = NULL;
+
+ if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
+ return (-1);
+
+ bufsize = sizeof (dld_ioc_hwgrpget_t) +
+ nhwgrp * sizeof (dld_hwgrpinfo_t);
+
+ if ((iomp = (dld_ioc_hwgrpget_t *)calloc(1, bufsize)) == NULL)
+ return (-1);
+
+ iomp->dih_size = nhwgrp * sizeof (dld_hwgrpinfo_t);
+ iomp->dih_linkid = linkid;
+
+ ret = ioctl(fd, DLDIOC_GETHWGRP, iomp);
+ if (ret == 0) {
+ int i;
+ dld_hwgrpinfo_t *dhip;
+ dladm_hwgrp_attr_t attr;
+
+ dhip = (dld_hwgrpinfo_t *)(iomp + 1);
+ for (i = 0; i < iomp->dih_n_groups; i++) {
+ bzero(&attr, sizeof (attr));
+
+ (void) strlcpy(attr.hg_link_name,
+ dhip->dhi_link_name, sizeof (attr.hg_link_name));
+ attr.hg_grp_num = dhip->dhi_grp_num;
+ attr.hg_grp_type = dhip->dhi_grp_type;
+ attr.hg_n_rings = dhip->dhi_n_rings;
+ attr.hg_n_clnts = dhip->dhi_n_clnts;
+ (void) strlcpy(attr.hg_client_names,
+ dhip->dhi_clnts, sizeof (attr.hg_client_names));
+
+ if (!(*fn)(arg, &attr))
+ break;
+ dhip++;
+ }
+ }
+ free(iomp);
+ (void) close(fd);
+ return (ret);
+}
+
+/*
+ * Invoke the specified callback for each MAC address entry defined on
+ * the specified device.
+ */
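+/*
+ * A minimal caller sketch (hypothetical callback); returning B_FALSE
+ * from the callback terminates the walk early:
+ *
+ *	static boolean_t
+ *	print_slot(void *arg, dladm_macaddr_attr_t *attr)
+ *	{
+ *		(void) printf("slot %u in use: %s\n", attr->ma_slot,
+ *		    (attr->ma_flags & DLADM_MACADDR_USED) ? "yes" : "no");
+ *		return (B_TRUE);
+ *	}
+ *
+ *	(void) dladm_walk_macaddr(linkid, NULL, print_slot);
+ */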
+int
+dladm_walk_macaddr(datalink_id_t linkid, void *arg,
+ boolean_t (*fn)(void *, dladm_macaddr_attr_t *))
+{
+ int fd, bufsize, ret;
+ int nmacaddr = 1024;
+ dld_ioc_macaddrget_t *iomp = NULL;
+
+ if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
+ return (-1);
+
+ bufsize = sizeof (dld_ioc_macaddrget_t) +
+ nmacaddr * sizeof (dld_macaddrinfo_t);
+
+ if ((iomp = (dld_ioc_macaddrget_t *)calloc(1, bufsize)) == NULL)
+ return (-1);
+
+ iomp->dig_size = nmacaddr * sizeof (dld_macaddrinfo_t);
+ iomp->dig_linkid = linkid;
+
+ ret = ioctl(fd, DLDIOC_MACADDRGET, iomp);
+ if (ret == 0) {
+ int i;
+ dld_macaddrinfo_t *dmip;
+ dladm_macaddr_attr_t attr;
+
+ dmip = (dld_macaddrinfo_t *)(iomp + 1);
+ for (i = 0; i < iomp->dig_count; i++) {
+ bzero(&attr, sizeof (attr));
+
+ attr.ma_slot = dmip->dmi_slot;
+ attr.ma_flags = 0;
+ if (dmip->dmi_flags & DLDIOCMACADDR_USED)
+ attr.ma_flags |= DLADM_MACADDR_USED;
+ bcopy(dmip->dmi_addr, attr.ma_addr,
+ dmip->dmi_addrlen);
+ attr.ma_addrlen = dmip->dmi_addrlen;
+ (void) strlcpy(attr.ma_client_name,
+ dmip->dmi_client_name, MAXNAMELEN);
+ attr.ma_client_linkid = dmip->dma_client_linkid;
+
+ if (!(*fn)(arg, &attr))
+ break;
+ dmip++;
+ }
+ }
+ free(iomp);
+ (void) close(fd);
+ return (ret);
+}
+
/*
* These routines are used by administration tools such as dladm(1M) to
* iterate through the list of MAC interfaces
@@ -253,84 +403,22 @@ dladm_linkduplex2str(link_duplex_t duplex, char *buf)
/*
* Set zoneid of a given link. Note that this function takes a link name
* argument instead of a linkid, because a data-link (and its linkid) could
- * be created implicitly as the result of this function. For example, a VLAN
- * could be created if a VLAN PPA hack name is assigned to an exclusive
- * non-global zone.
+ * be created implicitly as the result of this function.
*/
dladm_status_t
dladm_setzid(const char *dlname, char *zone_name)
{
datalink_id_t linkid;
- char *val;
- char **prop_val;
- char link[MAXLINKNAMELEN];
- uint_t ppa;
- char dev[DLPI_LINKNAME_MAX];
- int valsize;
dladm_status_t status = DLADM_STATUS_OK;
- char *prop_name = "zone";
- boolean_t needfree = B_FALSE;
- char delim = ':';
/* If the link does not exist, it is a ppa-hacked vlan. */
status = dladm_name2info(dlname, &linkid, NULL, NULL, NULL);
- switch (status) {
- case DLADM_STATUS_NOTFOUND:
- if (strlen(dlname) > MAXLINKNAMELEN)
- return (DLADM_STATUS_BADVAL);
-
- if (strlen(zone_name) > ZONENAME_MAX)
- return (DLADM_STATUS_BADVAL);
-
- status = dladm_parselink(dlname, dev, &ppa);
- if (status != DLADM_STATUS_OK)
- return (status);
-
- ppa = (uint_t)DLS_PPA2INST(ppa);
- (void) snprintf(link, sizeof (link), "%s%d", dev, ppa);
-
- status = dladm_name2info(link, &linkid, NULL, NULL, NULL);
- if (status != DLADM_STATUS_OK)
- return (status);
-
- /*
- * Since the link does not exist as yet, we've to pass the
- * link name too as part of data, so that the kernel can
- * create the link. Hence, we're packing the zone_name and
- * the link name into val.
- */
- valsize = ZONENAME_MAX + MAXLINKNAMELEN + 1;
- val = malloc(valsize);
- if (val == NULL)
- return (DLADM_STATUS_NOMEM);
- needfree = B_TRUE;
-
- (void) snprintf(val, valsize, "%s%c%s", zone_name,
- delim, dlname);
-
- break;
- case DLADM_STATUS_OK:
- /*
- * The link exists, so only the zone_name is being passed as
- * val. We could also pass zone_name + linkname like in the
- * previous case just to maintain consistency, but other calls
- * like set_linkprop() in dladm.c [which is called when we run
- * 'dladm set-linkprop -p zone <linkname>' at the command line]
- * pass in the value entered at the command line [which is zone
- * name] as val.
- */
- val = zone_name;
- break;
- default:
- return (DLADM_STATUS_FAILED);
- }
+ if (status != DLADM_STATUS_OK)
+ return (status);
- prop_val = &val;
- status = dladm_set_linkprop(linkid, prop_name, prop_val, 1,
+ status = dladm_set_linkprop(linkid, "zone", &zone_name, 1,
DLADM_OPT_ACTIVE);
- if (needfree)
- free(val);
return (status);
}
@@ -958,86 +1046,6 @@ done:
}
dladm_status_t
-dladm_get_single_mac_stat(datalink_id_t linkid, const char *name, uint8_t type,
- void *val)
-{
- char module[DLPI_LINKNAME_MAX];
- uint_t instance;
- char link[DLPI_LINKNAME_MAX];
- dladm_status_t status;
- uint32_t flags, media;
- kstat_ctl_t *kcp;
- kstat_t *ksp;
- dladm_phys_attr_t dpap;
-
- if ((status = dladm_datalink_id2info(linkid, &flags, NULL, &media,
- link, DLPI_LINKNAME_MAX)) != DLADM_STATUS_OK)
- return (status);
-
- if (media != DL_ETHER)
- return (DLADM_STATUS_LINKINVAL);
-
- status = dladm_phys_info(linkid, &dpap, DLADM_OPT_PERSIST);
-
- if (status != DLADM_STATUS_OK)
- return (status);
-
- status = dladm_parselink(dpap.dp_dev, module, &instance);
-
- if (status != DLADM_STATUS_OK)
- return (status);
-
- if ((kcp = kstat_open()) == NULL)
- return (dladm_errno2status(errno));
-
- /*
- * The kstat query could fail if the underlying MAC
- * driver was already detached.
- */
- if ((ksp = kstat_lookup(kcp, module, instance, "mac")) == NULL &&
- (ksp = kstat_lookup(kcp, module, instance, NULL)) == NULL)
- goto bail;
-
- if (kstat_read(kcp, ksp, NULL) == -1)
- goto bail;
-
- if (dladm_kstat_value(ksp, name, type, val) < 0)
- goto bail;
-
- (void) kstat_close(kcp);
- return (DLADM_STATUS_OK);
-bail:
- (void) kstat_close(kcp);
- return (dladm_errno2status(errno));
-
-}
-
-int
-dladm_kstat_value(kstat_t *ksp, const char *name, uint8_t type, void *buf)
-{
- kstat_named_t *knp;
-
- if ((knp = kstat_data_lookup(ksp, (char *)name)) == NULL)
- return (-1);
-
- if (knp->data_type != type)
- return (-1);
-
- switch (type) {
- case KSTAT_DATA_UINT64:
- *(uint64_t *)buf = knp->value.ui64;
- break;
- case KSTAT_DATA_UINT32:
- *(uint32_t *)buf = knp->value.ui32;
- break;
- default:
- return (-1);
- }
-
- return (0);
-}
-
-dladm_status_t
dladm_parselink(const char *dev, char *provider, uint_t *ppa)
{
ifspec_t ifsp;
diff --git a/usr/src/lib/libdladm/common/libdllink.h b/usr/src/lib/libdladm/common/libdllink.h
index ea51087a83..29d078470c 100644
--- a/usr/src/lib/libdladm/common/libdllink.h
+++ b/usr/src/lib/libdladm/common/libdllink.h
@@ -31,17 +31,19 @@
* link administration (i.e. not limited to one specific type of link).
*/
+#include <stdio.h>
#include <sys/types.h>
#include <sys/param.h>
#include <libdladm.h>
-#include <kstat.h>
+#include <libdladm_impl.h>
+#include <sys/mac_flow.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct dladm_attr {
- uint_t da_max_sdu;
+ uint_t da_max_sdu;
} dladm_attr_t;
typedef struct dladm_phys_attr {
@@ -86,6 +88,32 @@ typedef int dladm_secobj_class_t;
typedef int (dladm_walkcb_t)(const char *, void *);
+/* possible flags for ma_flags below */
+#define DLADM_MACADDR_USED 0x1
+
+typedef enum {
+ DLADM_HWGRP_TYPE_RX = 0x1,
+ DLADM_HWGRP_TYPE_TX
+} dladm_hwgrp_type_t;
+
+typedef struct dladm_hwgrp_attr {
+ char hg_link_name[MAXLINKNAMELEN];
+ uint_t hg_grp_num;
+ dladm_hwgrp_type_t hg_grp_type;
+ uint_t hg_n_rings;
+ uint_t hg_n_clnts;
+ char hg_client_names[MAXCLIENTNAMELEN];
+} dladm_hwgrp_attr_t;
+
+typedef struct dladm_macaddr_attr {
+ uint_t ma_slot;
+ uint_t ma_flags;
+ uchar_t ma_addr[MAXMACADDRLEN];
+ uint_t ma_addrlen;
+ char ma_client_name[MAXNAMELEN];
+ datalink_id_t ma_client_linkid;
+} dladm_macaddr_attr_t;
+
extern dladm_status_t dladm_walk(dladm_walkcb_t *, void *, datalink_class_t,
datalink_media_t, uint32_t);
extern dladm_status_t dladm_mac_walk(dladm_walkcb_t *, void *);
@@ -148,12 +176,19 @@ extern dladm_status_t dladm_phys_delete(datalink_id_t);
extern dladm_status_t dladm_phys_info(datalink_id_t, dladm_phys_attr_t *,
uint32_t);
-extern dladm_status_t dladm_get_single_mac_stat(datalink_id_t, const char *,
- uint8_t, void *);
-extern int dladm_kstat_value(kstat_t *, const char *, uint8_t,
- void *);
extern dladm_status_t dladm_parselink(const char *, char *, uint_t *);
+extern int dladm_walk_macaddr(datalink_id_t, void *,
+ boolean_t (*)(void *, dladm_macaddr_attr_t *));
+extern int dladm_walk_hwgrp(datalink_id_t, void *,
+ boolean_t (*)(void *, dladm_hwgrp_attr_t *));
+
+extern dladm_status_t dladm_link_get_proplist(datalink_id_t,
+ dladm_arg_list_t **);
+
+extern dladm_status_t i_dladm_set_link_proplist_db(char *,
+ dladm_arg_list_t *);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/libdladm/common/libdlstat.c b/usr/src/lib/libdladm/common/libdlstat.c
new file mode 100644
index 0000000000..1990d27c67
--- /dev/null
+++ b/usr/src/lib/libdladm/common/libdlstat.c
@@ -0,0 +1,684 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <err.h>
+#include <errno.h>
+#include <kstat.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/dld.h>
+
+#include <libdllink.h>
+#include <libdlflow.h>
+#include <libdlstat.h>
+
+/*
+ * x86 <sys/regs> ERR conflicts with <curses.h> ERR.
+ * Include curses.h last.
+ */
+#if defined(ERR)
+#undef ERR
+#endif
+#include <curses.h>
+
+struct flowlist {
+ char flowname[MAXNAMELEN];
+ datalink_id_t linkid;
+	uint64_t	ifspeed;	/* read via KSTAT_DATA_UINT64 */
+ boolean_t first;
+ boolean_t display;
+ pktsum_t prevstats;
+ pktsum_t diffstats;
+};
+
+static int maxx, maxy, redraw = 0;
+static volatile uint_t handle_resize = 0, handle_break = 0;
+
+pktsum_t totalstats;
+struct flowlist *stattable = NULL;
+static int statentry = -1, maxstatentries = 0;
+
+#define STATGROWSIZE 16
+
+
+/*
+ * Search for a flowlist entry in stattable which matches
+ * the flowname and linkid.  If no match is found, use the
+ * next available slot.  If no slots are available,
+ * reallocate the table with more slots.
+ *
+ * Returns a pointer to the matching flowlist entry, or
+ * NULL if reallocation fails.
+ */
+
+static struct flowlist *
+findstat(const char *flowname, datalink_id_t linkid)
+{
+ int match = 0;
+ struct flowlist *flist;
+
+	/* Look for a match in the stattable */
+	for (match = 0, flist = stattable;
+	    match <= statentry;
+	    match++, flist++) {
+		if (flowname != NULL) {
+			/* match on the flow name */
+			if (strncmp(flowname, flist->flowname,
+			    MAXNAMELEN) == 0)
+				return (flist);
+		} else {
+			/* match on the linkid */
+			if (linkid == flist->linkid)
+				return (flist);
+		}
+	}
+
+ /*
+ * No match found in the table. Store statistics in the next slot.
+ * If necessary, make room for this entry.
+ */
+	statentry++;
+	if ((maxstatentries == 0) || (maxstatentries == statentry)) {
+		struct flowlist *newtable;
+
+		/* grow via a temporary to avoid leaking the old table */
+		newtable = realloc(stattable,
+		    (maxstatentries + STATGROWSIZE) *
+		    sizeof (struct flowlist));
+		if (newtable == NULL) {
+			perror("realloc");
+			statentry--;
+			return (NULL);
+		}
+		stattable = newtable;
+		maxstatentries += STATGROWSIZE;
+	}
+ flist = &stattable[statentry];
+ bzero(flist, sizeof (struct flowlist));
+ flist->first = B_TRUE;
+
+	if (flowname != NULL)
+		(void) strlcpy(flist->flowname, flowname, MAXNAMELEN);
+ flist->linkid = linkid;
+ return (flist);
+}
+
+static void
+print_flow_stats(struct flowlist *flist)
+{
+ struct flowlist *fcurr;
+ double ikbs, okbs;
+ double ipks, opks;
+ double dlt;
+ int fcount;
+ static boolean_t first = B_TRUE;
+
+ if (first) {
+ first = B_FALSE;
+ (void) printw("please wait...\n");
+ return;
+ }
+
+ for (fcount = 0, fcurr = flist;
+ fcount <= statentry;
+ fcount++, fcurr++) {
+		if (fcurr->flowname[0] != '\0' && fcurr->display) {
+ char linkname[MAXNAMELEN];
+
+ (void) dladm_datalink_id2info(fcurr->linkid, NULL, NULL,
+ NULL, linkname, sizeof (linkname));
+ dlt = (double)fcurr->diffstats.snaptime/(double)NANOSEC;
+ ikbs = fcurr->diffstats.rbytes * 8 / dlt / 1024;
+ okbs = fcurr->diffstats.obytes * 8 / dlt / 1024;
+ ipks = fcurr->diffstats.ipackets / dlt;
+ opks = fcurr->diffstats.opackets / dlt;
+ (void) printw("%-15.15s", fcurr->flowname);
+ (void) printw("%-10.10s", linkname);
+ (void) printw("%9.2f %9.2f %9.2f %9.2f ",
+ ikbs, okbs, ipks, opks);
+ (void) printw("\n");
+ }
+ }
+}
+
+/*ARGSUSED*/
+static int
+flow_kstats(dladm_flow_attr_t *attr, void *arg)
+{
+ kstat_ctl_t *kcp = (kstat_ctl_t *)arg;
+ kstat_t *ksp;
+ struct flowlist *flist;
+ pktsum_t currstats, *prevstats, *diffstats;
+
+ flist = findstat(attr->fa_flowname, attr->fa_linkid);
+ if (flist != NULL) {
+ prevstats = &flist->prevstats;
+ diffstats = &flist->diffstats;
+ } else {
+		return (DLADM_WALK_TERMINATE);
+ }
+
+ /* lookup kstat entry */
+ ksp = dladm_kstat_lookup(kcp, NULL, -1, attr->fa_flowname, "flow");
+
+ if (ksp == NULL)
+ return (DLADM_WALK_TERMINATE);
+ else
+ flist->display = B_TRUE;
+
+ dladm_get_stats(kcp, ksp, &currstats);
+ if (flist->ifspeed == 0)
+ (void) dladm_kstat_value(ksp, "ifspeed", KSTAT_DATA_UINT64,
+ &flist->ifspeed);
+
+ if (flist->first)
+ flist->first = B_FALSE;
+ else {
+ dladm_stats_diff(diffstats, &currstats, prevstats);
+ dladm_stats_total(&totalstats, diffstats, &totalstats);
+ }
+
+ bcopy(&currstats, prevstats, sizeof (pktsum_t));
+ return (DLADM_WALK_CONTINUE);
+}
+
+static void
+print_link_stats(struct flowlist *flist)
+{
+ struct flowlist *fcurr;
+ double ikbs, okbs;
+ double ipks, opks;
+ double util;
+ double dlt;
+ int fcount;
+ static boolean_t first = B_TRUE;
+
+ if (first) {
+ first = B_FALSE;
+ (void) printw("please wait...\n");
+ return;
+ }
+
+ for (fcount = 0, fcurr = flist;
+ fcount <= statentry;
+ fcount++, fcurr++) {
+ if ((fcurr->linkid != DATALINK_INVALID_LINKID) &&
+ fcurr->display) {
+ char linkname[MAXNAMELEN];
+
+ (void) dladm_datalink_id2info(fcurr->linkid, NULL, NULL,
+ NULL, linkname, sizeof (linkname));
+ dlt = (double)fcurr->diffstats.snaptime/(double)NANOSEC;
+ ikbs = (double)fcurr->diffstats.rbytes * 8 / dlt / 1024;
+ okbs = (double)fcurr->diffstats.obytes * 8 / dlt / 1024;
+ ipks = (double)fcurr->diffstats.ipackets / dlt;
+ opks = (double)fcurr->diffstats.opackets / dlt;
+ (void) printw("%-10.10s", linkname);
+ (void) printw("%9.2f %9.2f %9.2f %9.2f ",
+ ikbs, okbs, ipks, opks);
+ if (fcurr->ifspeed != 0)
+ util = ((ikbs + okbs) * 1024) *
+				    100 / fcurr->ifspeed;
+ else
+ util = (double)0;
+ (void) attron(A_BOLD);
+ (void) printw(" %6.2f", util);
+ (void) attroff(A_BOLD);
+ (void) printw("\n");
+ }
+ }
+}
+
+/*
+ * This function is called through the dladm_walk_datalink_id() walker and
+ * calls the dladm_walk_flow() walker.
+ */
+
+/*ARGSUSED*/
+static int
+link_flowstats(datalink_id_t linkid, void *arg)
+{
+	if (dladm_walk_flow(flow_kstats, linkid, arg, B_FALSE) ==
+	    DLADM_STATUS_OK)
+		return (DLADM_WALK_CONTINUE);
+	else
+		return (DLADM_WALK_TERMINATE);
+}
+
+/*ARGSUSED*/
+static int
+link_kstats(datalink_id_t linkid, void *arg)
+{
+ kstat_ctl_t *kcp = (kstat_ctl_t *)arg;
+ struct flowlist *flist;
+ pktsum_t currstats, *prevstats, *diffstats;
+ kstat_t *ksp;
+ char linkname[MAXNAMELEN];
+
+ /* find the flist entry */
+ flist = findstat(NULL, linkid);
+ if (flist != NULL) {
+ prevstats = &flist->prevstats;
+ diffstats = &flist->diffstats;
+ } else {
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ /* lookup kstat entry */
+	if (dladm_datalink_id2info(linkid, NULL, NULL, NULL, linkname,
+	    sizeof (linkname)) != DLADM_STATUS_OK) {
+		warn("no linkname for linkid");
+		return (DLADM_WALK_TERMINATE);
+	}
+
+ ksp = dladm_kstat_lookup(kcp, NULL, -1, linkname, "net");
+
+ if (ksp == NULL)
+ return (DLADM_WALK_TERMINATE);
+ else
+ flist->display = B_TRUE;
+
+ /* read packet and byte stats */
+ dladm_get_stats(kcp, ksp, &currstats);
+
+ if (flist->ifspeed == 0)
+ (void) dladm_kstat_value(ksp, "ifspeed", KSTAT_DATA_UINT64,
+ &flist->ifspeed);
+
+ if (flist->first == B_TRUE)
+ flist->first = B_FALSE;
+ else
+ dladm_stats_diff(diffstats, &currstats, prevstats);
+
+ bcopy(&currstats, prevstats, sizeof (*prevstats));
+
+ return (DLADM_WALK_CONTINUE);
+}
+
+/*ARGSUSED*/
+static void
+sig_break(int s)
+{
+ handle_break = 1;
+}
+
+/*ARGSUSED*/
+static void
+sig_resize(int s)
+{
+ handle_resize = 1;
+}
+
+static void
+curses_init()
+{
+ maxx = maxx; /* lint */
+ maxy = maxy; /* lint */
+
+ /* Install signal handlers */
+ (void) signal(SIGINT, sig_break);
+ (void) signal(SIGQUIT, sig_break);
+ (void) signal(SIGTERM, sig_break);
+ (void) signal(SIGWINCH, sig_resize);
+
+ /* Initialize ncurses */
+ (void) initscr();
+ (void) cbreak();
+ (void) noecho();
+ (void) curs_set(0);
+ timeout(0);
+ getmaxyx(stdscr, maxy, maxx);
+}
+
+static void
+curses_fin()
+{
+ (void) printw("\n");
+ (void) curs_set(1);
+ (void) nocbreak();
+ (void) endwin();
+
+ free(stattable);
+}
+
+static void
+stat_report(kstat_ctl_t *kcp, datalink_id_t linkid, const char *flowname,
+ int opt)
+{
+
+ double dlt, ikbs, okbs, ipks, opks;
+
+ if ((opt != LINK_REPORT) && (opt != FLOW_REPORT))
+ return;
+
+ /* Handle window resizes */
+ if (handle_resize) {
+ (void) endwin();
+ (void) initscr();
+ (void) cbreak();
+ (void) noecho();
+ (void) curs_set(0);
+ timeout(0);
+ getmaxyx(stdscr, maxy, maxx);
+ redraw = 1;
+ handle_resize = 0;
+ }
+
+ /* Print title */
+ (void) erase();
+ (void) attron(A_BOLD);
+ (void) move(0, 0);
+ if (opt == FLOW_REPORT)
+ (void) printw("%-15.15s", "Flow");
+ (void) printw("%-10.10s", "Link");
+ (void) printw("%9.9s %9.9s %9.9s %9.9s ",
+ "iKb/s", "oKb/s", "iPk/s", "oPk/s");
+ if (opt == LINK_REPORT)
+ (void) printw(" %6.6s", "%Util");
+ (void) printw("\n");
+ (void) attroff(A_BOLD);
+
+ (void) move(2, 0);
+
+ /* Print stats for each link or flow */
+ bzero(&totalstats, sizeof (totalstats));
+ if (opt == LINK_REPORT) {
+ /* Display all links */
+ if (linkid == DATALINK_ALL_LINKID) {
+ (void) dladm_walk_datalink_id(link_kstats,
+ (void *)kcp, DATALINK_CLASS_ALL,
+ DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+ /* Display 1 link */
+ } else {
+ (void) link_kstats(linkid, kcp);
+ }
+		print_link_stats(stattable);
+
+ } else if (opt == FLOW_REPORT) {
+ /* Display 1 flow */
+ if (flowname != NULL) {
+ dladm_flow_attr_t fattr;
+ if (dladm_flow_info(flowname, &fattr) !=
+ DLADM_STATUS_OK)
+ return;
+ (void) flow_kstats(&fattr, kcp);
+ /* Display all flows on all links */
+ } else if (linkid == DATALINK_ALL_LINKID) {
+ (void) dladm_walk_datalink_id(link_flowstats,
+ (void *)kcp, DATALINK_CLASS_ALL,
+ DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+ /* Display all flows on a link */
+ } else if (linkid != DATALINK_INVALID_LINKID) {
+ (void) dladm_walk_flow(flow_kstats, linkid, kcp,
+ B_FALSE);
+ }
+		print_flow_stats(stattable);
+
+ /* Print totals */
+ (void) attron(A_BOLD);
+ dlt = (double)totalstats.snaptime / (double)NANOSEC;
+ ikbs = totalstats.rbytes / dlt / 1024;
+ okbs = totalstats.obytes / dlt / 1024;
+ ipks = totalstats.ipackets / dlt;
+ opks = totalstats.opackets / dlt;
+ (void) printw("\n%-25.25s", "Totals");
+ (void) printw("%9.2f %9.2f %9.2f %9.2f ",
+ ikbs, okbs, ipks, opks);
+ (void) attroff(A_BOLD);
+ }
+
+ if (redraw)
+ (void) clearok(stdscr, 1);
+
+ if (refresh() == ERR)
+ return;
+
+ if (redraw) {
+ (void) clearok(stdscr, 0);
+ redraw = 0;
+ }
+}
+
+/* Exported functions */
+
+/*
+ * Continuously display link or flow statistics using a
+ * libcurses-based display.
+ */
+
+void
+dladm_continuous(datalink_id_t linkid, const char *flowname, int interval,
+ int opt)
+{
+ kstat_ctl_t *kcp;
+
+ if ((kcp = kstat_open()) == NULL) {
+ warn("kstat open operation failed");
+ return;
+ }
+
+ curses_init();
+
+ for (;;) {
+
+ if (handle_break)
+ break;
+
+ stat_report(kcp, linkid, flowname, opt);
+
+ (void) sleep(max(1, interval));
+ }
+
+ (void) curses_fin();
+ (void) kstat_close(kcp);
+}
+
+/*
+ * dladm_kstat_lookup() is a modified version of kstat_lookup which
+ * adds the class as a selector.
+ */
+
+kstat_t *
+dladm_kstat_lookup(kstat_ctl_t *kcp, const char *module, int instance,
+ const char *name, const char *class)
+{
+ kstat_t *ksp = NULL;
+
+ for (ksp = kcp->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+ if ((module == NULL || strcmp(ksp->ks_module, module) == 0) &&
+ (instance == -1 || ksp->ks_instance == instance) &&
+ (name == NULL || strcmp(ksp->ks_name, name) == 0) &&
+ (class == NULL || strcmp(ksp->ks_class, class) == 0))
+ return (ksp);
+ }
+
+ errno = ENOENT;
+ return (NULL);
+}
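
A caller-side sketch of how the class selector is meant to be used (the helper snap_link() and its link name are hypothetical, not part of the library; "net" is the kstat class the link walker above selects on): module and instance are wildcarded, and the link's kstat is found by name alone.

#include <kstat.h>
#include <libdllink.h>
#include <libdlstat.h>

/* Sketch: snapshot one link's packet/byte counters by link name. */
static int
snap_link(kstat_ctl_t *kcp, const char *linkname, pktsum_t *sum)
{
	kstat_t *ksp;

	/* wildcard module (NULL) and instance (-1), select by class */
	ksp = dladm_kstat_lookup(kcp, NULL, -1, linkname, "net");
	if (ksp == NULL)
		return (-1);	/* errno set to ENOENT */

	dladm_get_stats(kcp, ksp, sum);
	return (0);
}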
+
+/*
+ * dladm_get_stats() populates the supplied pktsum_t structure with
+ * the input and output packet and byte kstats from the kstat_t
+ * found with dladm_kstat_lookup.
+ */
+void
+dladm_get_stats(kstat_ctl_t *kcp, kstat_t *ksp, pktsum_t *stats)
+{
+	uint32_t u32;
+
+ if (kstat_read(kcp, ksp, NULL) == -1)
+ return;
+
+ stats->snaptime = gethrtime();
+
+ if (dladm_kstat_value(ksp, "ipackets64", KSTAT_DATA_UINT64,
+ &stats->ipackets) < 0) {
+ if (dladm_kstat_value(ksp, "ipackets", KSTAT_DATA_UINT64,
+ &stats->ipackets) < 0)
+ return;
+ }
+
+ if (dladm_kstat_value(ksp, "opackets64", KSTAT_DATA_UINT64,
+ &stats->opackets) < 0) {
+ if (dladm_kstat_value(ksp, "opackets", KSTAT_DATA_UINT64,
+ &stats->opackets) < 0)
+ return;
+ }
+
+ if (dladm_kstat_value(ksp, "rbytes64", KSTAT_DATA_UINT64,
+ &stats->rbytes) < 0) {
+ if (dladm_kstat_value(ksp, "rbytes", KSTAT_DATA_UINT64,
+ &stats->rbytes) < 0)
+ return;
+ }
+
+ if (dladm_kstat_value(ksp, "obytes64", KSTAT_DATA_UINT64,
+ &stats->obytes) < 0) {
+ if (dladm_kstat_value(ksp, "obytes", KSTAT_DATA_UINT64,
+ &stats->obytes) < 0)
+ return;
+ }
+
+	/*
+	 * The error counters may be exported as either 32-bit or 64-bit
+	 * kstats; read 32-bit values through a temporary so we never
+	 * store only half of a 64-bit pktsum_t field.
+	 */
+	if (dladm_kstat_value(ksp, "ierrors", KSTAT_DATA_UINT32,
+	    &u32) == 0) {
+		stats->ierrors = u32;
+	} else if (dladm_kstat_value(ksp, "ierrors", KSTAT_DATA_UINT64,
+	    &stats->ierrors) < 0) {
+		return;
+	}
+
+	if (dladm_kstat_value(ksp, "oerrors", KSTAT_DATA_UINT32,
+	    &u32) == 0) {
+		stats->oerrors = u32;
+	} else if (dladm_kstat_value(ksp, "oerrors", KSTAT_DATA_UINT64,
+	    &stats->oerrors) < 0) {
+		return;
+	}
+}
+
+int
+dladm_kstat_value(kstat_t *ksp, const char *name, uint8_t type, void *buf)
+{
+ kstat_named_t *knp;
+
+ if ((knp = kstat_data_lookup(ksp, (char *)name)) == NULL)
+ return (-1);
+
+ if (knp->data_type != type)
+ return (-1);
+
+ switch (type) {
+ case KSTAT_DATA_UINT64:
+ *(uint64_t *)buf = knp->value.ui64;
+ break;
+ case KSTAT_DATA_UINT32:
+ *(uint32_t *)buf = knp->value.ui32;
+ break;
+ default:
+ return (-1);
+ }
+
+ return (0);
+}
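
A minimal sketch of reading a single named counter through dladm_kstat_value() (the library always performs a kstat_read() first, via dladm_get_stats(); this hypothetical helper does so explicitly):

#include <kstat.h>
#include <libdllink.h>
#include <libdlstat.h>

/* Sketch: fetch the 64-bit "ifspeed" statistic from a kstat. */
static uint64_t
link_ifspeed(kstat_ctl_t *kcp, kstat_t *ksp)
{
	uint64_t speed = 0;

	if (kstat_read(kcp, ksp, NULL) != -1)
		(void) dladm_kstat_value(ksp, "ifspeed",
		    KSTAT_DATA_UINT64, &speed);
	return (speed);
}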
+
+dladm_status_t
+dladm_get_single_mac_stat(datalink_id_t linkid, const char *name, uint8_t type,
+ void *val)
+{
+ kstat_ctl_t *kcp;
+ char module[DLPI_LINKNAME_MAX];
+ uint_t instance;
+ char link[DLPI_LINKNAME_MAX];
+ dladm_status_t status;
+ uint32_t flags, media;
+ kstat_t *ksp;
+ dladm_phys_attr_t dpap;
+
+	if ((status = dladm_datalink_id2info(linkid, &flags, NULL, &media,
+	    link, DLPI_LINKNAME_MAX)) != DLADM_STATUS_OK)
+		return (status);
+
+	if (media != DL_ETHER)
+		return (DLADM_STATUS_LINKINVAL);
+
+	status = dladm_phys_info(linkid, &dpap, DLADM_OPT_PERSIST);
+
+	if (status != DLADM_STATUS_OK)
+		return (status);
+
+	status = dladm_parselink(dpap.dp_dev, module, &instance);
+
+	if (status != DLADM_STATUS_OK)
+		return (status);
+
+	/*
+	 * Open the kstat chain only after the link has been validated so
+	 * that the early-return paths above cannot leak the handle.
+	 */
+	if ((kcp = kstat_open()) == NULL) {
+		warn("kstat_open operation failed");
+		return (DLADM_STATUS_FAILED);
+	}
+
+ /*
+ * The kstat query could fail if the underlying MAC
+ * driver was already detached.
+ */
+ if ((ksp = kstat_lookup(kcp, module, instance, "mac")) == NULL &&
+ (ksp = kstat_lookup(kcp, module, instance, NULL)) == NULL)
+ goto bail;
+
+ if (kstat_read(kcp, ksp, NULL) == -1)
+ goto bail;
+
+ if (dladm_kstat_value(ksp, name, type, val) < 0)
+ goto bail;
+
+ (void) kstat_close(kcp);
+ return (DLADM_STATUS_OK);
+
+bail:
+ (void) kstat_close(kcp);
+ return (dladm_errno2status(errno));
+}
+
+/* Compute sum of 2 pktsums (s1 = s2 + s3) */
+void
+dladm_stats_total(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3)
+{
+ s1->rbytes = s2->rbytes + s3->rbytes;
+ s1->ipackets = s2->ipackets + s3->ipackets;
+ s1->ierrors = s2->ierrors + s3->ierrors;
+ s1->obytes = s2->obytes + s3->obytes;
+ s1->opackets = s2->opackets + s3->opackets;
+ s1->oerrors = s2->oerrors + s3->oerrors;
+ s1->snaptime = s2->snaptime;
+}
+
+/* Compute differences between 2 pktsums (s1 = s2 - s3) */
+void
+dladm_stats_diff(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3)
+{
+ s1->rbytes = s2->rbytes - s3->rbytes;
+ s1->ipackets = s2->ipackets - s3->ipackets;
+ s1->ierrors = s2->ierrors - s3->ierrors;
+ s1->obytes = s2->obytes - s3->obytes;
+ s1->opackets = s2->opackets - s3->opackets;
+ s1->oerrors = s2->oerrors - s3->oerrors;
+ s1->snaptime = s2->snaptime - s3->snaptime;
+	s1->snaptime = s2->snaptime - s3->snaptime;
+}
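
These two helpers are the heart of the display loop: each refresh diffs the current snapshot against the previous one and scales by the elapsed snaptime. A sketch of the same computation outside curses, reusing the hypothetical snap_link() from the earlier sketch (NANOSEC comes from <sys/time.h>):

#include <unistd.h>
#include <sys/time.h>

/* Sketch: receive rate in Kb/s over an interval, as the display does. */
static double
rx_kbps(kstat_ctl_t *kcp, const char *linkname, int interval)
{
	pktsum_t prev, curr, diff;
	double dlt;

	if (snap_link(kcp, linkname, &prev) != 0)
		return (0.0);
	(void) sleep(interval);
	if (snap_link(kcp, linkname, &curr) != 0)
		return (0.0);

	dladm_stats_diff(&diff, &curr, &prev);
	dlt = (double)diff.snaptime / (double)NANOSEC;
	return ((double)diff.rbytes * 8 / dlt / 1024);
}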
diff --git a/usr/src/lib/libdladm/common/libdlstat.h b/usr/src/lib/libdladm/common/libdlstat.h
new file mode 100644
index 0000000000..a142275268
--- /dev/null
+++ b/usr/src/lib/libdladm/common/libdlstat.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBDLSTAT_H
+#define _LIBDLSTAT_H
+
+/*
+ * This file includes structures, macros and routine declarations shared
+ * by the data-link administration code and used to retrieve and display
+ * statistics.
+ */
+
+#include <kstat.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LINK_REPORT 1
+#define FLOW_REPORT 2
+
+typedef struct pktsum_s {
+ hrtime_t snaptime;
+ uint64_t ipackets;
+ uint64_t opackets;
+ uint64_t rbytes;
+ uint64_t obytes;
+ uint64_t ierrors;
+ uint64_t oerrors;
+} pktsum_t;
+
+extern void dladm_continuous(datalink_id_t, const char *, int, int);
+
+extern kstat_t *dladm_kstat_lookup(kstat_ctl_t *, const char *, int,
+ const char *, const char *);
+extern void dladm_get_stats(kstat_ctl_t *, kstat_t *, pktsum_t *);
+extern int dladm_kstat_value(kstat_t *, const char *, uint8_t,
+ void *);
+extern dladm_status_t dladm_get_single_mac_stat(datalink_id_t, const char *,
+ uint8_t, void *);
+
+extern void dladm_stats_total(pktsum_t *, pktsum_t *, pktsum_t *);
+extern void dladm_stats_diff(pktsum_t *, pktsum_t *, pktsum_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBDLSTAT_H */
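
Driving the whole monitor through the exported entry point is a one-liner; a sketch of a minimal consumer of this header:

#include <libdllink.h>
#include <libdlstat.h>

/* Sketch: curses monitor over all active links, 1-second refresh. */
int
main(void)
{
	dladm_continuous(DATALINK_ALL_LINKID, NULL, 1, LINK_REPORT);
	return (0);
}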
diff --git a/usr/src/lib/libdladm/common/libdlvlan.c b/usr/src/lib/libdladm/common/libdlvlan.c
index f6d855db72..1dc04bf4eb 100644
--- a/usr/src/lib/libdladm/common/libdlvlan.c
+++ b/usr/src/lib/libdladm/common/libdlvlan.c
@@ -23,16 +23,8 @@
* Use is subject to license terms.
*/
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <assert.h>
-#include <sys/dld.h>
-#include <libdladm_impl.h>
-#include <libdllink.h>
#include <libdlvlan.h>
+#include <libdlvnic.h>
/*
* VLAN Administration Library.
@@ -44,106 +36,19 @@
/*
* Returns the current attributes of the specified VLAN.
*/
-static dladm_status_t
-i_dladm_vlan_info_active(datalink_id_t vlanid, dladm_vlan_attr_t *dvap)
-{
- int fd;
- dld_ioc_vlan_attr_t div;
- dladm_status_t status = DLADM_STATUS_OK;
-
- if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
- return (dladm_errno2status(errno));
-
- div.div_vlanid = vlanid;
-
- if (ioctl(fd, DLDIOC_VLAN_ATTR, &div) < 0)
- status = dladm_errno2status(errno);
-
- dvap->dv_vid = div.div_vid;
- dvap->dv_linkid = div.div_linkid;
- dvap->dv_force = div.div_force;
- dvap->dv_implicit = div.div_implicit;
-done:
- (void) close(fd);
- return (status);
-}
-
-/*
- * Returns the persistent attributes of the specified VLAN.
- */
-static dladm_status_t
-i_dladm_vlan_info_persist(datalink_id_t vlanid, dladm_vlan_attr_t *dvap)
-{
- dladm_conf_t conf = DLADM_INVALID_CONF;
- dladm_status_t status;
- uint64_t u64;
-
- if ((status = dladm_read_conf(vlanid, &conf)) != DLADM_STATUS_OK)
- return (status);
-
- status = dladm_get_conf_field(conf, FLINKOVER, &u64, sizeof (u64));
- if (status != DLADM_STATUS_OK)
- goto done;
- dvap->dv_linkid = (datalink_id_t)u64;
-
- status = dladm_get_conf_field(conf, FFORCE, &dvap->dv_force,
- sizeof (boolean_t));
- if (status != DLADM_STATUS_OK)
- goto done;
-
- dvap->dv_implicit = B_FALSE;
-
- status = dladm_get_conf_field(conf, FVLANID, &u64, sizeof (u64));
- if (status != DLADM_STATUS_OK)
- goto done;
- dvap->dv_vid = (uint16_t)u64;
-
-done:
- dladm_destroy_conf(conf);
- return (status);
-}
-
dladm_status_t
dladm_vlan_info(datalink_id_t vlanid, dladm_vlan_attr_t *dvap, uint32_t flags)
{
- assert(flags == DLADM_OPT_ACTIVE || flags == DLADM_OPT_PERSIST);
- if (flags == DLADM_OPT_ACTIVE)
- return (i_dladm_vlan_info_active(vlanid, dvap));
- else
- return (i_dladm_vlan_info_persist(vlanid, dvap));
-}
-
-static dladm_status_t
-dladm_persist_vlan_conf(const char *vlan, datalink_id_t vlanid,
- boolean_t force, datalink_id_t linkid, uint16_t vid)
-{
- dladm_conf_t conf = DLADM_INVALID_CONF;
- dladm_status_t status;
- uint64_t u64;
+ dladm_status_t status;
+ dladm_vnic_attr_t attr, *vnic = &attr;
- if ((status = dladm_create_conf(vlan, vlanid, DATALINK_CLASS_VLAN,
- DL_ETHER, &conf)) != DLADM_STATUS_OK) {
+ if ((status = dladm_vnic_info(vlanid, vnic, flags)) !=
+ DLADM_STATUS_OK)
return (status);
- }
- u64 = linkid;
- status = dladm_set_conf_field(conf, FLINKOVER, DLADM_TYPE_UINT64, &u64);
- if (status != DLADM_STATUS_OK)
- goto done;
-
- status = dladm_set_conf_field(conf, FFORCE, DLADM_TYPE_BOOLEAN, &force);
- if (status != DLADM_STATUS_OK)
- goto done;
-
- u64 = vid;
- status = dladm_set_conf_field(conf, FVLANID, DLADM_TYPE_UINT64, &u64);
- if (status != DLADM_STATUS_OK)
- goto done;
-
- status = dladm_write_conf(conf);
-
-done:
- dladm_destroy_conf(conf);
+ dvap->dv_vid = vnic->va_vid;
+ dvap->dv_linkid = vnic->va_link_id;
+ dvap->dv_force = vnic->va_force;
return (status);
}
@@ -152,63 +57,11 @@ done:
*/
dladm_status_t
dladm_vlan_create(const char *vlan, datalink_id_t linkid, uint16_t vid,
- uint32_t flags)
+ dladm_arg_list_t *proplist, uint32_t flags, datalink_id_t *vlan_id_out)
{
- dld_ioc_create_vlan_t dic;
- int fd;
- datalink_id_t vlanid = DATALINK_INVALID_LINKID;
- uint_t media;
- datalink_class_t class;
- dladm_status_t status;
-
- if (vid < 1 || vid > 4094)
- return (DLADM_STATUS_VIDINVAL);
-
- if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
- return (dladm_errno2status(errno));
-
- status = dladm_datalink_id2info(linkid, NULL, &class, &media, NULL, 0);
- if (status != DLADM_STATUS_OK || media != DL_ETHER ||
- class == DATALINK_CLASS_VLAN) {
- return (DLADM_STATUS_BADARG);
- }
-
- status = dladm_create_datalink_id(vlan, DATALINK_CLASS_VLAN, DL_ETHER,
- flags & (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST), &vlanid);
- if (status != DLADM_STATUS_OK)
- goto fail;
-
- if (flags & DLADM_OPT_PERSIST) {
- status = dladm_persist_vlan_conf(vlan, vlanid,
- (flags & DLADM_OPT_FORCE) != 0, linkid, vid);
- if (status != DLADM_STATUS_OK)
- goto fail;
- }
-
- if (flags & DLADM_OPT_ACTIVE) {
- dic.dic_vlanid = vlanid;
- dic.dic_linkid = linkid;
- dic.dic_vid = vid;
- dic.dic_force = (flags & DLADM_OPT_FORCE) != 0;
-
- if (ioctl(fd, DLDIOC_CREATE_VLAN, &dic) < 0) {
- status = dladm_errno2status(errno);
- if (flags & DLADM_OPT_PERSIST)
- (void) dladm_remove_conf(vlanid);
- goto fail;
- }
- }
-
- (void) close(fd);
- return (DLADM_STATUS_OK);
-
-fail:
- if (vlanid != DATALINK_INVALID_LINKID) {
- (void) dladm_destroy_datalink_id(vlanid,
- flags & (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST));
- }
- (void) close(fd);
- return (status);
+ return (dladm_vnic_create(vlan, linkid, VNIC_MAC_ADDR_TYPE_PRIMARY,
+ NULL, 0, NULL, 0, vid, vlan_id_out, proplist,
+ flags | DLADM_OPT_VLAN));
}
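
With the VLAN path now delegating to the VNIC code, creating a persistent VLAN is a single call against the new signature. A sketch (the link name "bge5000" and the already-resolved linkid are hypothetical):

#include <libdladm.h>
#include <libdlvlan.h>

/* Sketch: create persistent VLAN 5 over an existing Ethernet link. */
static dladm_status_t
make_vlan5(datalink_id_t physlinkid, datalink_id_t *vlanidp)
{
	return (dladm_vlan_create("bge5000", physlinkid, 5, NULL,
	    DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST, vlanidp));
}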
/*
@@ -217,124 +70,11 @@ fail:
dladm_status_t
dladm_vlan_delete(datalink_id_t vlanid, uint32_t flags)
{
- dld_ioc_delete_vlan_t did;
- int fd;
- datalink_class_t class;
- dladm_status_t status = DLADM_STATUS_OK;
-
- if ((dladm_datalink_id2info(vlanid, NULL, &class, NULL, NULL, 0) !=
- DLADM_STATUS_OK) || (class != DATALINK_CLASS_VLAN)) {
- return (DLADM_STATUS_BADARG);
- }
-
- if (flags & DLADM_OPT_ACTIVE) {
- if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
- return (dladm_errno2status(errno));
-
- did.did_linkid = vlanid;
- if ((ioctl(fd, DLDIOC_DELETE_VLAN, &did) < 0) &&
- ((errno != ENOENT) || !(flags & DLADM_OPT_PERSIST))) {
- (void) close(fd);
- return (dladm_errno2status(errno));
- }
- (void) close(fd);
-
- /*
- * Delete active linkprop before this active link is deleted.
- */
- (void) dladm_set_linkprop(vlanid, NULL, NULL, 0,
- DLADM_OPT_ACTIVE);
- }
-
- (void) dladm_destroy_datalink_id(vlanid,
- flags & (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST));
-
- if (flags & DLADM_OPT_PERSIST)
- (void) dladm_remove_conf(vlanid);
-
- return (status);
-}
-
-/*
- * Callback used by dladm_vlan_up()
- */
-static int
-i_dladm_vlan_up(datalink_id_t vlanid, void *arg)
-{
- dladm_vlan_attr_t dva;
- dld_ioc_create_vlan_t dic;
- dladm_status_t *statusp = arg;
- uint32_t flags;
- int fd;
- dladm_status_t status;
-
- status = dladm_vlan_info(vlanid, &dva, DLADM_OPT_PERSIST);
- if (status != DLADM_STATUS_OK)
- goto done;
-
- /*
- * Validate (and delete) the link associated with this VLAN, see if
- * the specific hardware has been removed during system shutdown.
- */
- if ((status = dladm_datalink_id2info(dva.dv_linkid, &flags, NULL,
- NULL, NULL, 0)) != DLADM_STATUS_OK) {
- goto done;
- }
-
- if (!(flags & DLADM_OPT_ACTIVE)) {
- status = DLADM_STATUS_BADARG;
- goto done;
- }
-
- dic.dic_linkid = dva.dv_linkid;
- dic.dic_force = dva.dv_force;
- dic.dic_vid = dva.dv_vid;
-
- if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) {
- status = dladm_errno2status(errno);
- goto done;
- }
-
- dic.dic_vlanid = vlanid;
- if (ioctl(fd, DLDIOC_CREATE_VLAN, &dic) < 0) {
- status = dladm_errno2status(errno);
- goto done;
- }
-
- if ((status = dladm_up_datalink_id(vlanid)) != DLADM_STATUS_OK) {
- dld_ioc_delete_vlan_t did;
-
- did.did_linkid = vlanid;
- (void) ioctl(fd, DLDIOC_DELETE_VLAN, &did);
- } else {
- /*
- * Reset the active linkprop of this specific link.
- */
- (void) dladm_init_linkprop(vlanid, B_FALSE);
- }
-
- (void) close(fd);
-done:
- *statusp = status;
- return (DLADM_WALK_CONTINUE);
+ return (dladm_vnic_delete(vlanid, flags | DLADM_OPT_VLAN));
}
-/*
- * Bring up one VLAN, or all persistent VLANs. In the latter case, the
- * walk may terminate early if bringup of a VLAN fails.
- */
dladm_status_t
dladm_vlan_up(datalink_id_t linkid)
{
- dladm_status_t status;
-
- if (linkid == DATALINK_ALL_LINKID) {
- (void) dladm_walk_datalink_id(i_dladm_vlan_up, &status,
- DATALINK_CLASS_VLAN, DATALINK_ANY_MEDIATYPE,
- DLADM_OPT_PERSIST);
- return (DLADM_STATUS_OK);
- } else {
- (void) i_dladm_vlan_up(linkid, &status);
- return (status);
- }
+ return (dladm_vnic_up(linkid, DLADM_OPT_VLAN));
}
diff --git a/usr/src/lib/libdladm/common/libdlvlan.h b/usr/src/lib/libdladm/common/libdlvlan.h
index 7a305443df..91f6ee8671 100644
--- a/usr/src/lib/libdladm/common/libdlvlan.h
+++ b/usr/src/lib/libdladm/common/libdlvlan.h
@@ -26,8 +26,6 @@
#ifndef _LIBDLVLAN_H
#define _LIBDLVLAN_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file includes structures, macros and routines used by VLAN link
* administration.
@@ -43,13 +41,13 @@ typedef struct dladm_vlan_attr {
uint16_t dv_vid;
datalink_id_t dv_linkid;
boolean_t dv_force;
- boolean_t dv_implicit;
} dladm_vlan_attr_t;
extern dladm_status_t dladm_vlan_info(datalink_id_t, dladm_vlan_attr_t *,
uint32_t);
extern dladm_status_t dladm_vlan_create(const char *, datalink_id_t,
- uint16_t, uint32_t);
+ uint16_t, dladm_arg_list_t *, uint32_t,
+ datalink_id_t *);
extern dladm_status_t dladm_vlan_delete(datalink_id_t, uint32_t);
extern dladm_status_t dladm_vlan_up(datalink_id_t);
diff --git a/usr/src/lib/libdladm/common/libdlvnic.c b/usr/src/lib/libdladm/common/libdlvnic.c
index ac97372785..dfa58bcac5 100644
--- a/usr/src/lib/libdladm/common/libdlvnic.c
+++ b/usr/src/lib/libdladm/common/libdlvnic.c
@@ -36,6 +36,7 @@
#include <libintl.h>
#include <net/if_types.h>
#include <net/if_dl.h>
+#include <sys/dld.h>
#include <libdladm_impl.h>
#include <libdllink.h>
#include <libdlvnic.h>
@@ -44,137 +45,258 @@
* VNIC administration library.
*/
-/* Limits on buffer size for VNIC_IOC_INFO request */
-#define MIN_INFO_SIZE (4*1024)
-#define MAX_INFO_SIZE (128*1024)
-
-/* configuration database entry */
-typedef struct dladm_vnic_attr_db {
- datalink_id_t vt_vnic_id;
- datalink_id_t vt_link_id;
- vnic_mac_addr_type_t vt_mac_addr_type;
- uint_t vt_mac_len;
- uchar_t vt_mac_addr[MAXMACADDRLEN];
-} dladm_vnic_attr_db_t;
-
-typedef struct dladm_vnic_modify_attr {
- vnic_mac_addr_type_t vm_mac_addr_type;
- int vm_mac_len;
- uchar_t vm_mac_addr[MAXMACADDRLEN];
-} dladm_vnic_modify_attr_t;
+/*
+ * Default random MAC address prefix (locally administered).
+ */
+static char dladm_vnic_def_prefix[] = {0x02, 0x08, 0x20};
+
+static dladm_status_t dladm_vnic_persist_conf(const char *name,
+ dladm_vnic_attr_t *, datalink_class_t);
+static const char *dladm_vnic_macaddr2str(const uchar_t *, char *);
+static dladm_status_t dladm_vnic_str2macaddr(const char *, uchar_t *);
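
An aside on the prefix above: 0x02 in the first octet sets the locally-administered bit, so addresses generated from it cannot collide with vendor-assigned OUIs. A sketch of the resulting address layout (illustration only; the random tail is actually filled in by the VNIC driver, and rand() merely stands in for its RNG):

#include <sys/types.h>
#include <stdlib.h>
#include <strings.h>

/* Sketch: 02:08:20:xx:xx:xx -- default prefix plus a random tail. */
static void
example_random_mac(uchar_t mac[6])
{
	static const uchar_t prefix[] = { 0x02, 0x08, 0x20 };
	int i;

	bcopy(prefix, mac, sizeof (prefix));
	for (i = sizeof (prefix); i < 6; i++)
		mac[i] = rand() & 0xff;
}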
/*
- * Send a create command to the VNIC driver.
+ * Convert a diagnostic returned by the kernel into a dladm_status_t.
*/
static dladm_status_t
-i_dladm_vnic_create_sys(int fd, dladm_vnic_attr_db_t *attr)
+dladm_vnic_diag2status(vnic_ioc_diag_t ioc_diag)
{
- vnic_ioc_create_t ioc;
-
- ioc.vc_vnic_id = attr->vt_vnic_id;
- ioc.vc_link_id = attr->vt_link_id;
- ioc.vc_mac_addr_type = attr->vt_mac_addr_type;
- ioc.vc_mac_len = attr->vt_mac_len;
- bcopy(attr->vt_mac_addr, ioc.vc_mac_addr, attr->vt_mac_len);
-
- if (ioctl(fd, VNIC_IOC_CREATE, &ioc) < 0)
- return (dladm_errno2status(errno));
-
+ switch (ioc_diag) {
+ case VNIC_IOC_DIAG_MACADDR_INVALID:
+ return (DLADM_STATUS_INVALIDMACADDR);
+ case VNIC_IOC_DIAG_MACADDRLEN_INVALID:
+ return (DLADM_STATUS_INVALIDMACADDRLEN);
+ case VNIC_IOC_DIAG_MACADDR_NIC:
+ return (DLADM_STATUS_INVALIDMACADDRNIC);
+ case VNIC_IOC_DIAG_MACADDR_INUSE:
+ return (DLADM_STATUS_INVALIDMACADDRINUSE);
+ case VNIC_IOC_DIAG_MACFACTORYSLOTINVALID:
+ return (DLADM_STATUS_MACFACTORYSLOTINVALID);
+ case VNIC_IOC_DIAG_MACFACTORYSLOTUSED:
+ return (DLADM_STATUS_MACFACTORYSLOTUSED);
+ case VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED:
+ return (DLADM_STATUS_MACFACTORYSLOTALLUSED);
+ case VNIC_IOC_DIAG_MACFACTORYNOTSUP:
+ return (DLADM_STATUS_MACFACTORYNOTSUP);
+ case VNIC_IOC_DIAG_MACPREFIX_INVALID:
+ return (DLADM_STATUS_INVALIDMACPREFIX);
+ case VNIC_IOC_DIAG_MACPREFIXLEN_INVALID:
+ return (DLADM_STATUS_INVALIDMACPREFIXLEN);
+ case VNIC_IOC_DIAG_MACMARGIN_INVALID:
+ return (DLADM_STATUS_INVALID_MACMARGIN);
+ case VNIC_IOC_DIAG_NO_HWRINGS:
+ return (DLADM_STATUS_NO_HWRINGS);
+ }
return (DLADM_STATUS_OK);
}
/*
- * Send a modify command to the VNIC driver.
+ * Send a create command to the VNIC driver.
*/
-static dladm_status_t
-i_dladm_vnic_modify_sys(datalink_id_t vnic_id, uint32_t modify_mask,
- dladm_vnic_modify_attr_t *attr)
+dladm_status_t
+i_dladm_vnic_create_sys(dladm_vnic_attr_t *attr)
{
+ int rc, fd;
+ vnic_ioc_create_t ioc;
dladm_status_t status = DLADM_STATUS_OK;
- int fd;
- vnic_ioc_modify_t ioc;
-
- ioc.vm_vnic_id = vnic_id;
- ioc.vm_modify_mask = 0;
- if (modify_mask & DLADM_VNIC_MODIFY_ADDR)
- ioc.vm_modify_mask |= VNIC_IOC_MODIFY_ADDR;
-
- ioc.vm_mac_addr_type = attr->vm_mac_addr_type;
- ioc.vm_mac_len = attr->vm_mac_len;
- bcopy(attr->vm_mac_addr, ioc.vm_mac_addr, MAXMACADDRLEN);
+ bzero(&ioc, sizeof (ioc));
+ ioc.vc_vnic_id = attr->va_vnic_id;
+ ioc.vc_link_id = attr->va_link_id;
+ ioc.vc_mac_addr_type = attr->va_mac_addr_type;
+ ioc.vc_mac_len = attr->va_mac_len;
+ ioc.vc_mac_slot = attr->va_mac_slot;
+ ioc.vc_mac_prefix_len = attr->va_mac_prefix_len;
+ ioc.vc_vid = attr->va_vid;
+ ioc.vc_flags = attr->va_force ? VNIC_IOC_CREATE_FORCE : 0;
+ ioc.vc_flags |= attr->va_hwrings ? VNIC_IOC_CREATE_REQ_HWRINGS : 0;
+
+ if (attr->va_mac_len > 0 || ioc.vc_mac_prefix_len > 0)
+ bcopy(attr->va_mac_addr, ioc.vc_mac_addr, MAXMACADDRLEN);
+ bcopy(&attr->va_resource_props, &ioc.vc_resource_props,
+ sizeof (mac_resource_props_t));
+ if (attr->va_link_id == DATALINK_INVALID_LINKID)
+ ioc.vc_flags |= VNIC_IOC_CREATE_ANCHOR;
if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
return (dladm_errno2status(errno));
- if (ioctl(fd, VNIC_IOC_MODIFY, &ioc) < 0)
+ rc = ioctl(fd, VNIC_IOC_CREATE, &ioc);
+ if (rc < 0)
status = dladm_errno2status(errno);
(void) close(fd);
+ if (status != DLADM_STATUS_OK) {
+ if (ioc.vc_diag != VNIC_IOC_DIAG_NONE)
+ status = dladm_vnic_diag2status(ioc.vc_diag);
+ }
+ if (status != DLADM_STATUS_OK)
+ return (status);
+
+ attr->va_mac_addr_type = ioc.vc_mac_addr_type;
+ switch (ioc.vc_mac_addr_type) {
+ case VNIC_MAC_ADDR_TYPE_FACTORY:
+ attr->va_mac_slot = ioc.vc_mac_slot;
+ break;
+ case VNIC_MAC_ADDR_TYPE_RANDOM:
+ bcopy(ioc.vc_mac_addr, attr->va_mac_addr, MAXMACADDRLEN);
+ attr->va_mac_len = ioc.vc_mac_len;
+ break;
+ }
return (status);
}
/*
* Get the configuration information of the given VNIC.
*/
-dladm_status_t
-dladm_vnic_info(datalink_id_t vnic_id, dladm_vnic_attr_sys_t *attrp,
- uint32_t flags)
+static dladm_status_t
+i_dladm_vnic_info_active(datalink_id_t linkid, dladm_vnic_attr_t *attrp)
{
- vnic_ioc_info_t *ioc;
- vnic_ioc_info_vnic_t *vnic;
- int bufsize, fd;
+ vnic_ioc_info_t ioc;
+ vnic_info_t *vnic;
+ int rc, fd;
dladm_status_t status = DLADM_STATUS_OK;
- /* for now, only temporary creations are supported */
- if (flags & DLADM_OPT_PERSIST)
- return (dladm_errno2status(ENOTSUP));
-
if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) == -1)
return (dladm_errno2status(errno));
- bufsize = sizeof (vnic_ioc_info_t) + sizeof (vnic_ioc_info_vnic_t);
- ioc = (vnic_ioc_info_t *)calloc(1, bufsize);
- if (ioc == NULL) {
- (void) close(fd);
- return (dladm_errno2status(ENOMEM));
- }
+ bzero(&ioc, sizeof (ioc));
+ vnic = &ioc.vi_info;
+ vnic->vn_vnic_id = linkid;
- ioc->vi_vnic_id = vnic_id;
- ioc->vi_size = bufsize - sizeof (vnic_ioc_info_t);
- if (ioctl(fd, VNIC_IOC_INFO, ioc) != 0) {
+ rc = ioctl(fd, VNIC_IOC_INFO, &ioc);
+ if (rc != 0) {
status = dladm_errno2status(errno);
goto bail;
}
- vnic = (vnic_ioc_info_vnic_t *)(ioc + 1);
-
attrp->va_vnic_id = vnic->vn_vnic_id;
attrp->va_link_id = vnic->vn_link_id;
attrp->va_mac_addr_type = vnic->vn_mac_addr_type;
- bcopy(vnic->vn_mac_addr, attrp->va_mac_addr, ETHERADDRL);
+ bcopy(vnic->vn_mac_addr, attrp->va_mac_addr, MAXMACADDRLEN);
attrp->va_mac_len = vnic->vn_mac_len;
+ attrp->va_mac_slot = vnic->vn_mac_slot;
+ attrp->va_mac_prefix_len = vnic->vn_mac_prefix_len;
+ attrp->va_vid = vnic->vn_vid;
+ attrp->va_force = vnic->vn_force;
bail:
- free(ioc);
(void) close(fd);
return (status);
}
+static dladm_status_t
+i_dladm_vnic_info_persist(datalink_id_t linkid, dladm_vnic_attr_t *attrp)
+{
+ dladm_conf_t conf;
+ dladm_status_t status;
+ char macstr[ETHERADDRL * 3];
+ uint64_t u64;
+ datalink_class_t class;
+
+ attrp->va_vnic_id = linkid;
+ if ((status = dladm_read_conf(linkid, &conf)) != DLADM_STATUS_OK)
+ return (status);
+
+ status = dladm_get_conf_field(conf, FLINKOVER, &u64, sizeof (u64));
+ attrp->va_link_id = ((status == DLADM_STATUS_OK) ?
+ (datalink_id_t)u64 : DATALINK_INVALID_LINKID);
+
+ status = dladm_get_conf_field(conf, FHWRINGS, &attrp->va_hwrings,
+ sizeof (boolean_t));
+
+ if (status != DLADM_STATUS_OK && status != DLADM_STATUS_NOTFOUND)
+ goto done;
+ if (status == DLADM_STATUS_NOTFOUND)
+ attrp->va_hwrings = B_FALSE;
+
+ if ((status = dladm_datalink_id2info(linkid, NULL, &class,
+ NULL, NULL, 0)) != DLADM_STATUS_OK)
+ goto done;
+
+ if (class == DATALINK_CLASS_VLAN) {
+ if (attrp->va_link_id == DATALINK_INVALID_LINKID) {
+ status = DLADM_STATUS_BADARG;
+ goto done;
+ }
+ attrp->va_mac_addr_type = VNIC_MAC_ADDR_TYPE_PRIMARY;
+ attrp->va_mac_len = 0;
+ } else {
+ status = dladm_get_conf_field(conf, FMADDRTYPE, &u64,
+ sizeof (u64));
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ attrp->va_mac_addr_type = (vnic_mac_addr_type_t)u64;
+
+ status = dladm_get_conf_field(conf, FMADDRLEN, &u64,
+ sizeof (u64));
+ attrp->va_mac_len = ((status == DLADM_STATUS_OK) ?
+ (uint_t)u64 : ETHERADDRL);
+
+ status = dladm_get_conf_field(conf, FMADDRSLOT, &u64,
+ sizeof (u64));
+ attrp->va_mac_slot = ((status == DLADM_STATUS_OK) ?
+ (int)u64 : -1);
+
+ status = dladm_get_conf_field(conf, FMADDRPREFIXLEN, &u64,
+ sizeof (u64));
+ attrp->va_mac_prefix_len = ((status == DLADM_STATUS_OK) ?
+ (uint_t)u64 : sizeof (dladm_vnic_def_prefix));
+
+ status = dladm_get_conf_field(conf, FMACADDR, macstr,
+ sizeof (macstr));
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ status = dladm_vnic_str2macaddr(macstr, attrp->va_mac_addr);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+ }
+
+ status = dladm_get_conf_field(conf, FVLANID, &u64, sizeof (u64));
+ attrp->va_vid = ((status == DLADM_STATUS_OK) ? (uint16_t)u64 : 0);
+
+ status = DLADM_STATUS_OK;
+done:
+ dladm_destroy_conf(conf);
+ return (status);
+}
+
+dladm_status_t
+dladm_vnic_info(datalink_id_t linkid, dladm_vnic_attr_t *attrp,
+ uint32_t flags)
+{
+ if (flags == DLADM_OPT_ACTIVE)
+ return (i_dladm_vnic_info_active(linkid, attrp));
+ else if (flags == DLADM_OPT_PERSIST)
+ return (i_dladm_vnic_info_persist(linkid, attrp));
+ else
+ return (DLADM_STATUS_BADARG);
+}
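
A caller-side sketch of the active/persistent dispatch (show_vnic() is hypothetical; the linkid is assumed to have been resolved beforehand, e.g. via dladm_name2info()):

#include <stdio.h>
#include <libdlvnic.h>

/* Sketch: print a few active attributes of one VNIC. */
static void
show_vnic(datalink_id_t linkid)
{
	dladm_vnic_attr_t attr;

	if (dladm_vnic_info(linkid, &attr, DLADM_OPT_ACTIVE) !=
	    DLADM_STATUS_OK)
		return;

	(void) printf("vnic %u over link %u, vid %u\n",
	    attr.va_vnic_id, attr.va_link_id, attr.va_vid);
}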
+
/*
* Remove a VNIC from the kernel.
*/
-static dladm_status_t
-i_dladm_vnic_delete_sys(int fd, dladm_vnic_attr_sys_t *attr)
+dladm_status_t
+i_dladm_vnic_delete_sys(datalink_id_t linkid)
{
vnic_ioc_delete_t ioc;
+ dladm_status_t status = DLADM_STATUS_OK;
+ int rc, fd;
- ioc.vd_vnic_id = attr->va_vnic_id;
-
- if (ioctl(fd, VNIC_IOC_DELETE, &ioc) < 0)
+ if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
return (dladm_errno2status(errno));
- return (DLADM_STATUS_OK);
+ ioc.vd_vnic_id = linkid;
+
+ rc = ioctl(fd, VNIC_IOC_DELETE, &ioc);
+ if (rc < 0)
+ status = dladm_errno2status(errno);
+
+ (void) close(fd);
+ return (status);
}
/*
@@ -182,20 +304,32 @@ i_dladm_vnic_delete_sys(int fd, dladm_vnic_attr_sys_t *attr)
*/
typedef struct dladm_vnic_addr_type_s {
- char *va_str;
- vnic_mac_addr_type_t va_type;
+ const char *va_str;
+ vnic_mac_addr_type_t va_type;
} dladm_vnic_addr_type_t;
static dladm_vnic_addr_type_t addr_types[] = {
{"fixed", VNIC_MAC_ADDR_TYPE_FIXED},
+ {"random", VNIC_MAC_ADDR_TYPE_RANDOM},
+ {"factory", VNIC_MAC_ADDR_TYPE_FACTORY},
+ {"auto", VNIC_MAC_ADDR_TYPE_AUTO},
+ {"fixed", VNIC_MAC_ADDR_TYPE_PRIMARY}
};
#define NADDR_TYPES (sizeof (addr_types) / sizeof (dladm_vnic_addr_type_t))
-/*
- * Return DLADM_STATUS_OK if a matching type was found,
- * DLADM_STATUS_BADARG otherwise
- */
+static const char *
+dladm_vnic_macaddrtype2str(vnic_mac_addr_type_t type)
+{
+ int i;
+
+ for (i = 0; i < NADDR_TYPES; i++) {
+ if (type == addr_types[i].va_type)
+ return (addr_types[i].va_str);
+ }
+ return (NULL);
+}
+
dladm_status_t
dladm_vnic_str2macaddrtype(const char *str, vnic_mac_addr_type_t *val)
{
@@ -209,136 +343,397 @@ dladm_vnic_str2macaddrtype(const char *str, vnic_mac_addr_type_t *val)
return (DLADM_STATUS_OK);
}
}
-
return (DLADM_STATUS_BADARG);
}
+
/*
- * Create a new VNIC. Update the configuration file and bring it up.
+ * Create a new VNIC / VLAN. Update the configuration file and bring it up.
*/
dladm_status_t
dladm_vnic_create(const char *vnic, datalink_id_t linkid,
vnic_mac_addr_type_t mac_addr_type, uchar_t *mac_addr, int mac_len,
- datalink_id_t *vnic_id_out, uint32_t flags)
+ int *mac_slot, uint_t mac_prefix_len, uint16_t vid,
+ datalink_id_t *vnic_id_out, dladm_arg_list_t *proplist, uint32_t flags)
{
- dladm_vnic_attr_db_t attr;
- int i, fd;
+ dladm_vnic_attr_t attr;
datalink_id_t vnic_id;
datalink_class_t class;
- uint32_t media;
- char *name = (char *)vnic;
+ uint32_t media = DL_ETHER;
+ char name[MAXLINKNAMELEN];
+ uchar_t tmp_addr[MAXMACADDRLEN];
dladm_status_t status;
+ boolean_t is_vlan;
+ boolean_t is_etherstub;
+ int i;
/*
* Sanity test arguments.
*/
- if (flags & DLADM_OPT_PERSIST)
- return (dladm_errno2status(ENOTSUP));
+ if ((flags & DLADM_OPT_ACTIVE) == 0)
+ return (DLADM_STATUS_NOTSUP);
+
+ is_vlan = ((flags & DLADM_OPT_VLAN) != 0);
+	if (is_vlan && (vid < 1 || vid > 4094))
+ return (DLADM_STATUS_VIDINVAL);
+
+ is_etherstub = (linkid == DATALINK_INVALID_LINKID);
if (mac_len > MAXMACADDRLEN)
return (DLADM_STATUS_INVALIDMACADDRLEN);
- for (i = 0; i < NADDR_TYPES; i++) {
- if (mac_addr_type == addr_types[i].va_type)
- break;
- }
- if (i == NADDR_TYPES)
+	if (dladm_vnic_macaddrtype2str(mac_addr_type) == NULL)
return (DLADM_STATUS_INVALIDMACADDRTYPE);
- if ((status = dladm_datalink_id2info(linkid, NULL, &class, &media,
- NULL, 0)) != DLADM_STATUS_OK) {
- return (status);
+ /*
+ * If a random address might be generated, but no prefix
+ * was specified by the caller, use the default MAC address
+ * prefix.
+ */
+ if ((mac_addr_type == VNIC_MAC_ADDR_TYPE_RANDOM ||
+ mac_addr_type == VNIC_MAC_ADDR_TYPE_AUTO) &&
+ mac_prefix_len == 0) {
+ mac_prefix_len = sizeof (dladm_vnic_def_prefix);
+ mac_addr = tmp_addr;
+ bcopy(dladm_vnic_def_prefix, mac_addr, mac_prefix_len);
}
- if (class == DATALINK_CLASS_VNIC)
- return (DLADM_STATUS_BADARG);
+ if ((flags & DLADM_OPT_ANCHOR) == 0) {
+ if ((status = dladm_datalink_id2info(linkid, NULL, &class,
+ &media, NULL, 0)) != DLADM_STATUS_OK)
+ return (status);
+
+ if (class == DATALINK_CLASS_VNIC ||
+ class == DATALINK_CLASS_VLAN)
+ return (DLADM_STATUS_BADARG);
+ } else {
+ /* it's an anchor VNIC */
+ if (linkid != DATALINK_INVALID_LINKID || vid != 0)
+ return (DLADM_STATUS_BADARG);
+ }
if (vnic == NULL) {
flags |= DLADM_OPT_PREFIX;
- name = "vnic";
+ (void) strlcpy(name, "vnic", sizeof (name));
+ } else {
+ (void) strlcpy(name, vnic, sizeof (name));
}
- if ((status = dladm_create_datalink_id(name, DATALINK_CLASS_VNIC,
- media, flags, &vnic_id)) != DLADM_STATUS_OK) {
+ class = is_vlan ? DATALINK_CLASS_VLAN :
+ (is_etherstub ? DATALINK_CLASS_ETHERSTUB : DATALINK_CLASS_VNIC);
+ if ((status = dladm_create_datalink_id(name, class,
+ media, flags, &vnic_id)) != DLADM_STATUS_OK)
return (status);
+
+ if ((flags & DLADM_OPT_PREFIX) != 0) {
+		(void) snprintf(name + 4, sizeof (name) - 4, "%llu",
+		    (u_longlong_t)vnic_id);
+ flags &= ~DLADM_OPT_PREFIX;
}
bzero(&attr, sizeof (attr));
- attr.vt_vnic_id = vnic_id;
- attr.vt_link_id = linkid;
- attr.vt_mac_addr_type = mac_addr_type;
- attr.vt_mac_len = mac_len;
- bcopy(mac_addr, attr.vt_mac_addr, mac_len);
- if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) {
- status = dladm_errno2status(errno);
+	/* Extract the resource properties (maxbw, cpus, priority) */
+ if (proplist != NULL) {
+ status = dladm_link_proplist_extract(proplist,
+ &attr.va_resource_props);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+ }
+
+ attr.va_vnic_id = vnic_id;
+ attr.va_link_id = linkid;
+ attr.va_mac_addr_type = mac_addr_type;
+ attr.va_mac_len = mac_len;
+ if (mac_slot != NULL)
+ attr.va_mac_slot = *mac_slot;
+ if (mac_len > 0)
+ bcopy(mac_addr, attr.va_mac_addr, mac_len);
+ else if (mac_prefix_len > 0)
+ bcopy(mac_addr, attr.va_mac_addr, mac_prefix_len);
+ attr.va_mac_prefix_len = mac_prefix_len;
+ attr.va_vid = vid;
+ attr.va_force = (flags & DLADM_OPT_FORCE) != 0;
+ attr.va_hwrings = (flags & DLADM_OPT_HWRINGS) != 0;
+
+ status = i_dladm_vnic_create_sys(&attr);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ /* Save vnic configuration and its properties */
+ if (!(flags & DLADM_OPT_PERSIST))
+ goto done;
+
+ status = dladm_vnic_persist_conf(name, &attr, class);
+ if (status != DLADM_STATUS_OK) {
+ (void) i_dladm_vnic_delete_sys(vnic_id);
goto done;
}
- status = i_dladm_vnic_create_sys(fd, &attr);
- (void) close(fd);
+ if (proplist != NULL) {
+ for (i = 0; i < proplist->al_count; i++) {
+ dladm_arg_info_t *aip = &proplist->al_info[i];
+
+ status = dladm_set_linkprop(vnic_id, aip->ai_name,
+ aip->ai_val, aip->ai_count, DLADM_OPT_PERSIST);
+ if (status != DLADM_STATUS_OK)
+ break;
+ }
+
+ if (status != DLADM_STATUS_OK) {
+ (void) dladm_remove_conf(vnic_id);
+ (void) i_dladm_vnic_delete_sys(vnic_id);
+ }
+ }
done:
if (status != DLADM_STATUS_OK) {
- (void) dladm_destroy_datalink_id(vnic_id,
- flags & ~DLADM_OPT_PREFIX);
+ (void) dladm_destroy_datalink_id(vnic_id, flags);
} else {
- *vnic_id_out = vnic_id;
+ if (vnic_id_out != NULL)
+ *vnic_id_out = vnic_id;
+ if (mac_slot != NULL)
+ *mac_slot = attr.va_mac_slot;
}
-
return (status);
}
/*
- * Modify the properties of a VNIC.
+ * Delete a VNIC / VLAN.
*/
dladm_status_t
-dladm_vnic_modify(datalink_id_t vnic_id, uint32_t modify_mask,
- vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr,
- uint32_t flags)
+dladm_vnic_delete(datalink_id_t linkid, uint32_t flags)
{
- dladm_vnic_modify_attr_t new_attr;
+ dladm_status_t status;
+ datalink_class_t class;
- /* for now, only temporary creations are supported */
- if (flags & DLADM_OPT_PERSIST)
- return (dladm_errno2status(ENOTSUP));
+ if (flags == 0)
+ return (DLADM_STATUS_BADARG);
- bzero(&new_attr, sizeof (new_attr));
+ if ((dladm_datalink_id2info(linkid, NULL, &class, NULL, NULL, 0) !=
+ DLADM_STATUS_OK))
+ return (DLADM_STATUS_BADARG);
- if (modify_mask & DLADM_VNIC_MODIFY_ADDR) {
- new_attr.vm_mac_addr_type = mac_addr_type;
- new_attr.vm_mac_len = mac_len;
- bcopy(mac_addr, new_attr.vm_mac_addr, MAXMACADDRLEN);
+ if ((flags & DLADM_OPT_VLAN) != 0) {
+ if (class != DATALINK_CLASS_VLAN)
+ return (DLADM_STATUS_BADARG);
+ } else {
+ if (class != DATALINK_CLASS_VNIC &&
+ class != DATALINK_CLASS_ETHERSTUB)
+ return (DLADM_STATUS_BADARG);
}
- /* update the properties of the existing VNIC */
- return (i_dladm_vnic_modify_sys(vnic_id, modify_mask, &new_attr));
+ if ((flags & DLADM_OPT_ACTIVE) != 0) {
+ status = i_dladm_vnic_delete_sys(linkid);
+ if (status == DLADM_STATUS_OK) {
+ (void) dladm_set_linkprop(linkid, NULL, NULL, 0,
+ DLADM_OPT_ACTIVE);
+ (void) dladm_destroy_datalink_id(linkid,
+ DLADM_OPT_ACTIVE);
+ } else if (status != DLADM_STATUS_NOTFOUND ||
+ !(flags & DLADM_OPT_PERSIST)) {
+ return (status);
+ }
+ }
+ if ((flags & DLADM_OPT_PERSIST) != 0) {
+ (void) dladm_destroy_datalink_id(linkid, DLADM_OPT_PERSIST);
+ (void) dladm_remove_conf(linkid);
+ }
+ return (DLADM_STATUS_OK);
}
-/*
- * Delete a VNIC.
- */
-dladm_status_t
-dladm_vnic_delete(datalink_id_t vnic_id, uint32_t flags)
+static const char *
+dladm_vnic_macaddr2str(const uchar_t *mac, char *buf)
{
- dladm_status_t status;
- dladm_vnic_attr_sys_t sys_attr;
- int fd;
+ static char unknown_mac[] = {0, 0, 0, 0, 0, 0};
- /* for now, only temporary deletes are supported */
- if (flags & DLADM_OPT_PERSIST)
- return (dladm_errno2status(ENOTSUP));
+ if (buf == NULL)
+ return (NULL);
- if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
- return (dladm_errno2status(errno));
+ if (bcmp(unknown_mac, mac, ETHERADDRL) == 0)
+ (void) strlcpy(buf, "unknown", DLADM_STRSIZE);
+ else
+ return (_link_ntoa(mac, buf, ETHERADDRL, IFT_OTHER));
- sys_attr.va_vnic_id = vnic_id;
- status = i_dladm_vnic_delete_sys(fd, &sys_attr);
- (void) close(fd);
+ return (buf);
+}
- if (status != DLADM_STATUS_OK)
+static dladm_status_t
+dladm_vnic_str2macaddr(const char *str, uchar_t *buf)
+{
+ int len = 0;
+ uchar_t *b = _link_aton(str, &len);
+
+ if (b == NULL || len >= MAXMACADDRLEN)
+ return (DLADM_STATUS_BADARG);
+
+ bcopy(b, buf, len);
+ free(b);
+ return (DLADM_STATUS_OK);
+}
+
+static dladm_status_t
+dladm_vnic_persist_conf(const char *name, dladm_vnic_attr_t *attrp,
+ datalink_class_t class)
+{
+ dladm_conf_t conf = DLADM_INVALID_CONF;
+ dladm_status_t status;
+ char macstr[ETHERADDRL * 3];
+ uint64_t u64;
+
+ if ((status = dladm_create_conf(name, attrp->va_vnic_id,
+ class, DL_ETHER, &conf)) != DLADM_STATUS_OK)
return (status);
- (void) dladm_destroy_datalink_id(vnic_id, flags);
+ if (attrp->va_link_id != DATALINK_INVALID_LINKID) {
+ u64 = attrp->va_link_id;
+ status = dladm_set_conf_field(conf, FLINKOVER,
+ DLADM_TYPE_UINT64, &u64);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+ }
+
+ if (class != DATALINK_CLASS_VLAN) {
+ u64 = attrp->va_mac_addr_type;
+ status = dladm_set_conf_field(conf, FMADDRTYPE,
+ DLADM_TYPE_UINT64, &u64);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ if (attrp->va_mac_len != ETHERADDRL) {
+ u64 = attrp->va_mac_len;
+ status = dladm_set_conf_field(conf, FMADDRLEN,
+ DLADM_TYPE_UINT64, &u64);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+ }
+ }
+
+ if (attrp->va_hwrings) {
+ boolean_t hwrings = attrp->va_hwrings;
+ status = dladm_set_conf_field(conf, FHWRINGS,
+ DLADM_TYPE_BOOLEAN, &hwrings);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+ }
+
+ if (class != DATALINK_CLASS_VLAN) {
+ if (attrp->va_mac_slot != -1) {
+ u64 = attrp->va_mac_slot;
+ status = dladm_set_conf_field(conf, FMADDRSLOT,
+ DLADM_TYPE_UINT64, &u64);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+ }
+
+ if (attrp->va_mac_prefix_len !=
+ sizeof (dladm_vnic_def_prefix)) {
+ u64 = attrp->va_mac_prefix_len;
+ status = dladm_set_conf_field(conf, FMADDRPREFIXLEN,
+ DLADM_TYPE_UINT64, &u64);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+ }
+
+ (void) dladm_vnic_macaddr2str(attrp->va_mac_addr, macstr);
+ status = dladm_set_conf_field(conf, FMACADDR, DLADM_TYPE_STR,
+ macstr);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+ }
+
+ if (attrp->va_vid != 0) {
+ u64 = attrp->va_vid;
+ status = dladm_set_conf_field(conf, FVLANID,
+ DLADM_TYPE_UINT64, &u64);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+ }
+
+ /*
+ * Commit the link configuration.
+ */
+ status = dladm_write_conf(conf);
+
+done:
+ dladm_destroy_conf(conf);
return (status);
}
+
+typedef struct dladm_vnic_up_arg_s {
+ uint32_t flags;
+ dladm_status_t status;
+} dladm_vnic_up_arg_t;
+
+#define DLADM_VNIC_UP_FIRST_WALK 0x1
+#define DLADM_VNIC_UP_SECOND_WALK 0x2
+
+static int
+i_dladm_vnic_up(datalink_id_t linkid, void *arg)
+{
+ dladm_status_t *statusp = &(((dladm_vnic_up_arg_t *)arg)->status);
+ dladm_vnic_attr_t attr;
+ dladm_status_t status;
+ dladm_arg_list_t *proplist;
+ uint32_t flags = ((dladm_vnic_up_arg_t *)arg)->flags;
+
+ bzero(&attr, sizeof (attr));
+
+ status = dladm_vnic_info(linkid, &attr, DLADM_OPT_PERSIST);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ /*
+	 * Create the VNICs that request hardware rings in the first
+	 * walk, and the remaining VNICs in the second walk.
+ */
+ if ((flags == DLADM_VNIC_UP_FIRST_WALK && !attr.va_hwrings) ||
+ (flags == DLADM_VNIC_UP_SECOND_WALK && attr.va_hwrings))
+ goto done;
+
+ /* Get all properties for this vnic */
+ status = dladm_link_get_proplist(linkid, &proplist);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+	if (proplist != NULL) {
+		status = dladm_link_proplist_extract(proplist,
+		    &attr.va_resource_props);
+		if (status != DLADM_STATUS_OK)
+			goto done;
+	}
+
+ status = i_dladm_vnic_create_sys(&attr);
+ if (status != DLADM_STATUS_OK)
+ goto done;
+
+ if ((status = dladm_up_datalink_id(linkid)) != DLADM_STATUS_OK) {
+ (void) i_dladm_vnic_delete_sys(linkid);
+ goto done;
+ }
+done:
+ *statusp = status;
+ return (DLADM_WALK_CONTINUE);
+}
+
+dladm_status_t
+dladm_vnic_up(datalink_id_t linkid, uint32_t flags)
+{
+ dladm_vnic_up_arg_t vnic_arg;
+ datalink_class_t class;
+
+ class = ((flags & DLADM_OPT_VLAN) != 0) ? DATALINK_CLASS_VLAN :
+ (DATALINK_CLASS_VNIC | DATALINK_CLASS_ETHERSTUB);
+
+ if (linkid == DATALINK_ALL_LINKID) {
+ vnic_arg.flags = DLADM_VNIC_UP_FIRST_WALK;
+ (void) dladm_walk_datalink_id(i_dladm_vnic_up, &vnic_arg,
+ class, DATALINK_ANY_MEDIATYPE, DLADM_OPT_PERSIST);
+ vnic_arg.flags = DLADM_VNIC_UP_SECOND_WALK;
+ (void) dladm_walk_datalink_id(i_dladm_vnic_up, &vnic_arg,
+ class, DATALINK_ANY_MEDIATYPE, DLADM_OPT_PERSIST);
+ return (DLADM_STATUS_OK);
+	} else {
+		vnic_arg.flags = 0;
+		(void) i_dladm_vnic_up(linkid, &vnic_arg);
+		return (vnic_arg.status);
+	}
+}
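
Boot-time bring-up then reduces to two walks over the persistent configuration; a sketch (a flags value of 0 selects the VNIC and etherstub classes per the class computation above, while the VLAN wrapper passes DLADM_OPT_VLAN):

#include <libdlvnic.h>
#include <libdlvlan.h>

/* Sketch: bring up all persistent VNICs/etherstubs, then all VLANs. */
static void
bringup_all(void)
{
	(void) dladm_vnic_up(DATALINK_ALL_LINKID, 0);
	(void) dladm_vlan_up(DATALINK_ALL_LINKID);
}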
diff --git a/usr/src/lib/libdladm/common/libdlvnic.h b/usr/src/lib/libdladm/common/libdlvnic.h
index 79b4b01ba2..77f78130be 100644
--- a/usr/src/lib/libdladm/common/libdlvnic.h
+++ b/usr/src/lib/libdladm/common/libdlvnic.h
@@ -26,39 +26,43 @@
#ifndef _LIBDLVNIC_H
#define _LIBDLVNIC_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <netinet/in.h>
#include <libdladm.h>
+#include <libdladm_impl.h>
+#include <sys/mac_flow.h>
#include <sys/vnic.h>
#ifdef __cplusplus
extern "C" {
#endif
-typedef struct dladm_vnic_attr_sys {
+typedef struct dladm_vnic_attr {
datalink_id_t va_vnic_id;
datalink_id_t va_link_id;
vnic_mac_addr_type_t va_mac_addr_type;
- uchar_t va_mac_addr[ETHERADDRL];
uint_t va_mac_len;
-} dladm_vnic_attr_sys_t;
+ uchar_t va_mac_addr[MAXMACADDRLEN];
+ int va_mac_slot;
+ uint_t va_mac_prefix_len;
+ uint16_t va_vid;
+ boolean_t va_force;
+ boolean_t va_hwrings;
+ mac_resource_props_t va_resource_props;
+} dladm_vnic_attr_t;
-/*
- * Modification flags for dladm_vnic_modify().
- */
-#define DLADM_VNIC_MODIFY_ADDR 0x01
+extern dladm_status_t dladm_vnic_create(const char *, datalink_id_t,
+ vnic_mac_addr_type_t, uchar_t *, int, int *,
+ uint_t, uint16_t, datalink_id_t *,
+ dladm_arg_list_t *, uint32_t);
+
+extern dladm_status_t dladm_vnic_delete(datalink_id_t, uint32_t);
+extern dladm_status_t dladm_vnic_info(datalink_id_t, dladm_vnic_attr_t *,
+ uint32_t);
-extern dladm_status_t dladm_vnic_create(const char *, datalink_id_t,
- vnic_mac_addr_type_t, uchar_t *, int, uint_t *, uint32_t);
-extern dladm_status_t dladm_vnic_modify(datalink_id_t, uint32_t,
- vnic_mac_addr_type_t, uint_t, uchar_t *, uint32_t);
-extern dladm_status_t dladm_vnic_delete(datalink_id_t, uint32_t);
-extern dladm_status_t dladm_vnic_info(datalink_id_t, dladm_vnic_attr_sys_t *,
- uint32_t);
-extern dladm_status_t dladm_vnic_str2macaddrtype(const char *,
- vnic_mac_addr_type_t *);
+extern dladm_status_t dladm_vnic_up(datalink_id_t, uint32_t);
+extern dladm_status_t dladm_vnic_str2macaddrtype(const char *,
+ vnic_mac_addr_type_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/lib/libdladm/common/linkprop.c b/usr/src/lib/libdladm/common/linkprop.c
index 8a570c70ef..2d58b585f8 100644
--- a/usr/src/lib/libdladm/common/linkprop.c
+++ b/usr/src/lib/libdladm/common/linkprop.c
@@ -41,30 +41,34 @@
#include <libdlwlan_impl.h>
#include <libdlwlan.h>
#include <libdlvlan.h>
+#include <libdlvnic.h>
+#include <libintl.h>
#include <dlfcn.h>
#include <link.h>
#include <inet/wifi_ioctl.h>
#include <libdladm.h>
+#include <libdlstat.h>
#include <sys/param.h>
+#include <sys/debug.h>
+#include <sys/dld.h>
+#include <sys/mac_flow.h>
#include <inttypes.h>
#include <sys/ethernet.h>
#include <net/wpa.h>
#include <sys/sysmacros.h>
-#define PERM_READ_ONLY "r-"
-#define PERM_READ_WRITE "rw"
-
/*
* The linkprop get() callback.
- * - pd: pointer to the struct prop_desc
+ * - pd: pointer to the prop_desc_t
* - propstrp: a property string array to keep the returned property.
* Caller allocated.
* - cntp: number of returned properties.
* Caller also uses it to indicate how many it expects.
*/
struct prop_desc;
+typedef struct prop_desc prop_desc_t;
-typedef dladm_status_t pd_getf_t(struct prop_desc *pd,
+typedef dladm_status_t pd_getf_t(prop_desc_t *pdp,
datalink_id_t, char **propstp, uint_t *cntp,
datalink_media_t, uint_t, uint_t *);
@@ -79,10 +83,9 @@ typedef dladm_status_t pd_getf_t(struct prop_desc *pd,
* of ioctl buffers etc. pd_set() may call another common routine (used
* by all other pd_sets) which invokes the ioctl.
*/
-typedef dladm_status_t pd_setf_t(struct prop_desc *, datalink_id_t,
- val_desc_t *propval, uint_t cnt, uint_t flags,
- datalink_media_t);
-
+typedef dladm_status_t pd_setf_t(prop_desc_t *, datalink_id_t,
+ val_desc_t *propval, uint_t cnt, uint_t flags,
+ datalink_media_t);
/*
* The linkprop check() callback.
@@ -98,9 +101,8 @@ typedef dladm_status_t pd_setf_t(struct prop_desc *, datalink_id_t,
* with either a val_desc_t found on the pd_modval list or something
* generated on the fly.
*/
-typedef dladm_status_t pd_checkf_t(struct prop_desc *pd,
- datalink_id_t, char **propstrp,
- uint_t cnt, val_desc_t *propval,
+typedef dladm_status_t pd_checkf_t(prop_desc_t *pdp, datalink_id_t,
+ char **propstrp, uint_t cnt, val_desc_t *propval,
datalink_media_t);
typedef struct link_attr_s {
@@ -110,39 +112,45 @@ typedef struct link_attr_s {
} link_attr_t;
static dld_ioc_macprop_t *i_dladm_buf_alloc_by_name(size_t, datalink_id_t,
- const char *, uint_t, dladm_status_t *);
+ const char *, uint_t, dladm_status_t *);
static dld_ioc_macprop_t *i_dladm_buf_alloc_by_id(size_t, datalink_id_t,
- mac_prop_id_t, uint_t,
- dladm_status_t *);
+ mac_prop_id_t, uint_t, dladm_status_t *);
+static dld_ioc_macprop_t *i_dladm_get_public_prop(datalink_id_t, char *, uint_t,
+ dladm_status_t *, uint_t *);
+
static dladm_status_t i_dladm_set_prop(datalink_id_t, const char *, char **,
uint_t, uint_t);
static dladm_status_t i_dladm_get_prop(datalink_id_t, const char *, char **,
uint_t *, dladm_prop_type_t, uint_t);
static link_attr_t *dladm_name2prop(const char *);
static link_attr_t *dladm_id2prop(mac_prop_id_t);
-static dld_ioc_macprop_t *i_dladm_get_public_prop(datalink_id_t, char *, uint_t,
- dladm_status_t *);
+
static pd_getf_t do_get_zone, do_get_autopush, do_get_rate_mod,
do_get_rate_prop, do_get_channel_prop,
do_get_powermode_prop, do_get_radio_prop,
i_dladm_duplex_get, i_dladm_status_get,
i_dladm_binary_get, i_dladm_uint32_get,
- i_dladm_flowctl_get;
+ i_dladm_flowctl_get, dld_maxbw_get, dld_cpus_get,
+ dld_priority_get;
+
static pd_setf_t do_set_zone, do_set_rate_prop,
do_set_powermode_prop, do_set_radio_prop,
- i_dladm_set_public_prop;
+ i_dladm_set_public_prop, do_set_res, do_set_cpus;
+
static pd_checkf_t do_check_zone, do_check_autopush, do_check_rate,
- i_dladm_defmtu_check;
+ i_dladm_defmtu_check, do_check_maxbw, do_check_cpus,
+ do_check_priority;
-static dladm_status_t i_dladm_speed_get(struct prop_desc *, datalink_id_t,
- char **, uint_t *, uint_t);
+static dladm_status_t i_dladm_speed_get(prop_desc_t *, datalink_id_t,
+ char **, uint_t *, uint_t, uint_t *);
static dladm_status_t i_dladm_wlan_get_legacy_ioctl(datalink_id_t, void *,
uint_t, uint_t);
static dladm_status_t i_dladm_wlan_set_legacy_ioctl(datalink_id_t, void *,
uint_t, uint_t);
static dladm_status_t i_dladm_macprop(void *, boolean_t);
+static const char *dladm_perm2str(uint_t, char *);
-typedef struct prop_desc {
+struct prop_desc {
/*
* link property name
*/
@@ -202,7 +210,7 @@ typedef struct prop_desc {
* indicate link media type this property applies to.
*/
datalink_media_t pd_dmedia;
-} prop_desc_t;
+};
#define MAC_PROP_BUFSIZE(v) sizeof (dld_ioc_macprop_t) + (v) - 1
@@ -303,7 +311,14 @@ static link_attr_t link_attr[] = {
{ MAC_PROP_WL_MLME, sizeof (wl_mlme_t), "mlme"},
+ { MAC_PROP_MAXBW, sizeof (mac_resource_props_t), "maxbw"},
+
+ { MAC_PROP_PRIO, sizeof (mac_resource_props_t), "priority"},
+
+ { MAC_PROP_BIND_CPU, sizeof (mac_resource_props_t), "cpus"},
+
{ MAC_PROP_PRIVATE, 0, "driver-private"}
};
static val_desc_t link_duplex_vals[] = {
@@ -324,8 +339,11 @@ static val_desc_t link_flow_vals[] = {
{ "rx", LINK_FLOWCTRL_RX },
{ "bi", LINK_FLOWCTRL_BI }
};
-
-#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t))
+static val_desc_t link_priority_vals[] = {
+ { "low", MPL_LOW },
+ { "medium", MPL_MEDIUM },
+ { "high", MPL_HIGH }
+};
static val_desc_t dladm_wlan_radio_vals[] = {
{ "on", DLADM_WLAN_RADIO_ON },
@@ -338,8 +356,10 @@ static val_desc_t dladm_wlan_powermode_vals[] = {
{ "max", DLADM_WLAN_PM_MAX }
};
-static prop_desc_t prop_table[] = {
+#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t))
+#define RESET_VAL ((uintptr_t)-1)
+static prop_desc_t prop_table[] = {
{ "channel", { NULL, 0 },
NULL, 0, NULL, NULL,
do_get_channel_prop, NULL, 0,
@@ -372,12 +392,12 @@ static prop_desc_t prop_table[] = {
do_get_zone, do_check_zone, PD_TEMPONLY|PD_CHECK_ALLOC,
DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE },
- { "duplex", { "", 0 },
+ { "duplex", { "", 0 },
link_duplex_vals, VALCNT(link_duplex_vals),
NULL, NULL, i_dladm_duplex_get, NULL,
0, DATALINK_CLASS_PHYS, DL_ETHER },
- { "state", { "up", LINK_STATE_UP },
+ { "state", { "up", LINK_STATE_UP },
link_status_vals, VALCNT(link_status_vals),
NULL, NULL, i_dladm_status_get, NULL,
0, DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE },
@@ -455,12 +475,34 @@ static prop_desc_t prop_table[] = {
{ "en_10hdx_cap", { "", 0 },
link_01_vals, VALCNT(link_01_vals),
i_dladm_set_public_prop, NULL, i_dladm_binary_get, NULL,
- 0, DATALINK_CLASS_PHYS, DL_ETHER }
+ 0, DATALINK_CLASS_PHYS, DL_ETHER },
+
+ { "maxbw", { "--", RESET_VAL }, NULL, 0,
+ do_set_res, NULL,
+ dld_maxbw_get, do_check_maxbw, PD_CHECK_ALLOC,
+ DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE },
+ { "cpus", { "--", RESET_VAL }, NULL, 0,
+ do_set_cpus, NULL,
+ dld_cpus_get, do_check_cpus, 0,
+ DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE },
+
+ { "priority", { "high", RESET_VAL },
+ link_priority_vals, VALCNT(link_priority_vals), do_set_res, NULL,
+ dld_priority_get, do_check_priority, PD_CHECK_ALLOC,
+ DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE },
};
#define DLADM_MAX_PROPS (sizeof (prop_table) / sizeof (prop_desc_t))
+static resource_prop_t rsrc_prop_table[] = {
+ {"maxbw", do_extract_maxbw},
+ {"priority", do_extract_priority},
+ {"cpus", do_extract_cpus}
+};
+#define DLADM_MAX_RSRC_PROP (sizeof (rsrc_prop_table) / \
+ sizeof (resource_prop_t))
+
/*
* when retrieving private properties, we pass down a buffer with
* DLADM_PROP_BUF_CHUNK of space for the driver to return the property value.
@@ -477,6 +519,9 @@ static dladm_status_t i_dladm_set_linkprop(datalink_id_t, const char *,
char **, uint_t, uint_t);
static dladm_status_t i_dladm_getset_defval(prop_desc_t *, datalink_id_t,
datalink_media_t, uint_t);
+
+static dladm_status_t link_proplist_check(dladm_arg_list_t *);
+
/*
* Unfortunately, MAX_SCAN_SUPPORT_RATES is too small to allow all
* rates to be retrieved. However, we cannot increase it at this
@@ -539,17 +584,13 @@ i_dladm_set_single_prop(datalink_id_t linkid, datalink_class_t class,
if (pdp->pd_set == NULL)
return (DLADM_STATUS_PROPRDONLY);
- if (pdp->pd_flags & PD_CHECK_ALLOC)
- needfree = B_TRUE;
- else
- needfree = B_FALSE;
if (prop_val != NULL) {
vdp = malloc(sizeof (val_desc_t) * val_cnt);
if (vdp == NULL)
return (DLADM_STATUS_NOMEM);
-
if (pdp->pd_check != NULL) {
+ needfree = ((pdp->pd_flags & PD_CHECK_ALLOC) != 0);
status = pdp->pd_check(pdp, linkid, prop_val, val_cnt,
vdp, media);
} else if (pdp->pd_optval != NULL) {
@@ -563,23 +604,25 @@ i_dladm_set_single_prop(datalink_id_t linkid, datalink_class_t class,
cnt = val_cnt;
} else {
+ boolean_t defval = B_FALSE;
+
if (pdp->pd_defval.vd_name == NULL)
return (DLADM_STATUS_NOTSUP);
cnt = 1;
- if ((pdp->pd_flags & PD_CHECK_ALLOC) != 0 ||
- strlen(pdp->pd_defval.vd_name) > 0) {
+ defval = (strlen(pdp->pd_defval.vd_name) > 0);
+ if ((pdp->pd_flags & PD_CHECK_ALLOC) != 0 || defval) {
if ((vdp = malloc(sizeof (val_desc_t))) == NULL)
return (DLADM_STATUS_NOMEM);
- if (pdp->pd_check != NULL) {
+ if (defval) {
+ (void) memcpy(vdp, &pdp->pd_defval,
+ sizeof (val_desc_t));
+ } else if (pdp->pd_check != NULL) {
status = pdp->pd_check(pdp, linkid, prop_val,
cnt, vdp, media);
if (status != DLADM_STATUS_OK)
goto done;
- } else {
- (void) memcpy(vdp, &pdp->pd_defval,
- sizeof (val_desc_t));
}
} else {
status = i_dladm_getset_defval(pdp, linkid,
@@ -618,7 +661,6 @@ i_dladm_set_linkprop(datalink_id_t linkid, const char *prop_name,
if (prop_name != NULL &&
(strcasecmp(prop_name, pdp->pd_name) != 0))
continue;
-
found = B_TRUE;
s = i_dladm_set_single_prop(linkid, class, media, pdp, prop_val,
val_cnt, flags);
@@ -774,16 +816,8 @@ dladm_get_linkprop(datalink_id_t linkid, dladm_prop_type_t type,
}
*prop_val[0] = '\0';
- switch (perm_flags) {
- case MAC_PROP_PERM_READ:
- (void) strncpy(*prop_val, PERM_READ_ONLY,
- DLADM_PROP_VAL_MAX);
- break;
- case MAC_PROP_PERM_RW:
- (void) strncpy(*prop_val, PERM_READ_WRITE,
- DLADM_PROP_VAL_MAX);
- break;
- }
+ if (status == DLADM_STATUS_OK)
+ (void) dladm_perm2str(perm_flags, *prop_val);
break;
case DLADM_PROP_VAL_DEFAULT:
@@ -879,7 +913,16 @@ done:
static int
i_dladm_init_linkprop(datalink_id_t linkid, void *arg)
{
- (void) dladm_init_linkprop(linkid, B_TRUE);
+ datalink_class_t class;
+ dladm_status_t status;
+
+ status = dladm_datalink_id2info(linkid, NULL, &class, NULL, NULL, 0);
+ if (status != DLADM_STATUS_OK)
+ return (DLADM_WALK_TERMINATE);
+
+ if ((class & (DATALINK_CLASS_VNIC | DATALINK_CLASS_VLAN)) == 0)
+ (void) dladm_init_linkprop(linkid, B_TRUE);
+
return (DLADM_WALK_CONTINUE);
}
@@ -904,24 +947,24 @@ dladm_init_linkprop(datalink_id_t linkid, boolean_t any_media)
/* ARGSUSED */
static dladm_status_t
-do_get_zone(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+do_get_zone(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
- char zone_name[ZONENAME_MAX];
- zoneid_t zid;
- dladm_status_t status;
- char *cp;
+ char zone_name[ZONENAME_MAX];
+ zoneid_t zid;
+ dladm_status_t status;
+ char *cp;
dld_ioc_macprop_t *dip;
if (flags != 0)
return (DLADM_STATUS_NOTSUP);
- dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status);
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, perm_flags);
if (status != DLADM_STATUS_OK)
return (status);
- *perm_flags = dip->pr_perm_flags;
cp = dip->pr_val;
(void) memcpy(&zid, cp, sizeof (zid));
free(dip);
@@ -929,14 +972,12 @@ do_get_zone(struct prop_desc *pd, datalink_id_t linkid,
*val_cnt = 1;
if (zid != GLOBAL_ZONEID) {
if (getzonenamebyid(zid, zone_name, sizeof (zone_name)) < 0) {
- *perm_flags = 0;
return (dladm_errno2status(errno));
}
(void) strncpy(*prop_val, zone_name, DLADM_PROP_VAL_MAX);
} else {
*prop_val[0] = '\0';
- *perm_flags = 0;
}
return (DLADM_STATUS_OK);
@@ -1011,13 +1052,13 @@ cleanup:
/* ARGSUSED */
static dladm_status_t
-do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp,
+do_set_zone(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp,
uint_t val_cnt, uint_t flags, datalink_media_t media)
{
- dladm_status_t status = DLADM_STATUS_OK;
- zoneid_t zid_old, zid_new;
- char link[MAXLINKNAMELEN];
- char *cp;
+ dladm_status_t status = DLADM_STATUS_OK;
+ zoneid_t zid_old, zid_new;
+ char link[MAXLINKNAMELEN];
+ char *cp;
dld_ioc_macprop_t *dip;
dld_ioc_zid_t *dzp;
@@ -1026,25 +1067,14 @@ do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp,
dzp = (dld_ioc_zid_t *)vdp->vd_val;
- /*
- * If diz_is_ppa_hack is set, then an implicit vlan must be created.
- * There is no old value to compare against, and vdp->vd_val is
- * already populated with the zoneid and linkname in the function
- * do_check_zone().
- */
-
- if (dzp->diz_is_ppa_hack) {
- zid_old = GLOBAL_ZONEID;
- } else {
- dip = i_dladm_get_public_prop(linkid, pd->pd_name,
- flags, &status);
- if (status != DLADM_STATUS_OK)
- return (status);
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, NULL);
+ if (status != DLADM_STATUS_OK)
+ return (status);
- cp = dip->pr_val;
- (void) memcpy(&zid_old, cp, sizeof (zid_old));
- free(dip);
- }
+ cp = dip->pr_val;
+ (void) memcpy(&zid_old, cp, sizeof (zid_old));
+ free(dip);
zid_new = dzp->diz_zid;
(void) strlcpy(link, dzp->diz_link, MAXLINKNAMELEN);
@@ -1066,7 +1096,7 @@ do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp,
* link and prevent a link renaming, so we need to do it
* before other operations.
*/
- status = i_dladm_set_public_prop(pd, linkid, vdp, val_cnt,
+ status = i_dladm_set_public_prop(pdp, linkid, vdp, val_cnt,
flags, media);
if (status != DLADM_STATUS_OK)
return (status);
@@ -1092,16 +1122,9 @@ do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp,
goto rollback2;
}
- if (dzp->diz_is_ppa_hack) {
- if ((status = dladm_name2info(link, &linkid, NULL, NULL,
- NULL)) != DLADM_STATUS_OK) {
- return (status);
- }
- }
-
(void) i_dladm_update_deventry(zid_new, linkid, B_TRUE);
} else {
- status = i_dladm_set_public_prop(pd, linkid, vdp, val_cnt,
+ status = i_dladm_set_public_prop(pdp, linkid, vdp, val_cnt,
flags, media);
if (status != DLADM_STATUS_OK)
goto rollback2;
@@ -1117,7 +1140,7 @@ rollback2:
rollback1:
if (zid_new != GLOBAL_ZONEID) {
dzp->diz_zid = zid_old;
- (void) i_dladm_set_public_prop(pd, linkid, vdp, val_cnt,
+ (void) i_dladm_set_public_prop(pdp, linkid, vdp, val_cnt,
flags, media);
}
@@ -1126,15 +1149,13 @@ rollback1:
/* ARGSUSED */
static dladm_status_t
-do_check_zone(struct prop_desc *pd, datalink_id_t linkid, char **prop_val,
+do_check_zone(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val,
uint_t val_cnt, val_desc_t *vdp, datalink_media_t media)
{
char *zone_name;
char linkname[MAXLINKNAMELEN];
zoneid_t zoneid;
- char *cp;
dladm_status_t status = DLADM_STATUS_OK;
- boolean_t is_ppa_hack = B_FALSE;
dld_ioc_zid_t *dzp;
if (val_cnt != 1)
@@ -1144,32 +1165,12 @@ do_check_zone(struct prop_desc *pd, datalink_id_t linkid, char **prop_val,
if (dzp == NULL)
return (DLADM_STATUS_NOMEM);
- if (prop_val) {
- /*
- * The prop_val contains zone_name{:linkname}. The linkname is
- * present only when the link is a ppa-hacked vlan.
- */
- cp = strchr(*prop_val, ':');
- if (cp) {
- (void) strlcpy(linkname, cp + 1, MAXLINKNAMELEN);
- *cp = '\0';
- is_ppa_hack = B_TRUE;
- } else {
- status = dladm_datalink_id2info(linkid, NULL, NULL,
- NULL, linkname, MAXLINKNAMELEN);
- if (status != DLADM_STATUS_OK) {
- goto done;
- }
- }
- zone_name = *prop_val;
- } else {
- zone_name = GLOBAL_ZONENAME;
- if ((status = dladm_datalink_id2info(linkid, NULL, NULL, NULL,
- linkname, MAXLINKNAMELEN)) != DLADM_STATUS_OK) {
- goto done;
- }
+ if ((status = dladm_datalink_id2info(linkid, NULL, NULL, NULL,
+ linkname, MAXLINKNAMELEN)) != DLADM_STATUS_OK) {
+ goto done;
}
+ zone_name = (prop_val != NULL) ? *prop_val : GLOBAL_ZONENAME;
if (strlen(linkname) > MAXLINKNAMELEN) {
status = DLADM_STATUS_BADVAL;
goto done;
@@ -1199,7 +1200,6 @@ do_check_zone(struct prop_desc *pd, datalink_id_t linkid, char **prop_val,
dzp->diz_zid = zoneid;
(void) strlcpy(dzp->diz_link, linkname, MAXLINKNAMELEN);
- dzp->diz_is_ppa_hack = is_ppa_hack;
vdp->vd_val = (uintptr_t)dzp;
return (DLADM_STATUS_OK);
@@ -1210,9 +1210,359 @@ done:
/* ARGSUSED */
static dladm_status_t
-do_get_autopush(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+dld_maxbw_get(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
+{
+ dld_ioc_macprop_t *dip;
+ mac_resource_props_t mrp;
+ dladm_status_t status;
+
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, perm_flags);
+ if (dip == NULL)
+ return (status);
+
+ bcopy(dip->pr_val, &mrp, sizeof (mac_resource_props_t));
+ free(dip);
+
+ if ((mrp.mrp_mask & MRP_MAXBW) == 0) {
+ (*prop_val)[0] = '\0';
+ } else {
+ (void) dladm_bw2str(mrp.mrp_maxbw, prop_val[0]);
+ }
+ *val_cnt = 1;
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_check_maxbw(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val,
+ uint_t val_cnt, val_desc_t *vdp, datalink_media_t media)
+{
+ uint64_t *maxbw;
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ if (val_cnt != 1)
+ return (DLADM_STATUS_BADVALCNT);
+
+ maxbw = malloc(sizeof (uint64_t));
+ if (maxbw == NULL)
+ return (DLADM_STATUS_NOMEM);
+
+ status = dladm_str2bw(*prop_val, maxbw);
+ if (status != DLADM_STATUS_OK) {
+ free(maxbw);
+ return (status);
+ }
+
+ if ((*maxbw < MRP_MAXBW_MINVAL) && (*maxbw != 0)) {
+ free(maxbw);
+ return (DLADM_STATUS_MINMAXBW);
+ }
+
+ vdp->vd_val = (uintptr_t)maxbw;
+ return (DLADM_STATUS_OK);
+}
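As a quick illustration of the conversion pair used by do_check_maxbw() and dld_maxbw_get(), here is a minimal caller-side sketch. It leans only on the dladm_str2bw()/dladm_bw2str() signatures implied by their call sites in this patch; the accepted input syntax ("100" here) and the exact output format of dladm_bw2str() are assumptions.

	uint64_t	maxbw;
	char		buf[DLADM_STRSIZE];

	/* Parse a bandwidth string, validate it as above, format it back. */
	if (dladm_str2bw("100", &maxbw) == DLADM_STATUS_OK &&
	    (maxbw == 0 || maxbw >= MRP_MAXBW_MINVAL))
		(void) dladm_bw2str(maxbw, buf);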
+
+/* ARGSUSED */
+dladm_status_t
+do_extract_maxbw(val_desc_t *vdp, void *arg, uint_t cnt)
+{
+ mac_resource_props_t *mrp = (mac_resource_props_t *)arg;
+
+ bcopy((char *)vdp->vd_val, &mrp->mrp_maxbw, sizeof (uint64_t));
+ mrp->mrp_mask |= MRP_MAXBW;
+
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+dld_cpus_get(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
+{
+ dld_ioc_macprop_t *dip;
+ mac_resource_props_t mrp;
+ int i;
+ uint32_t ncpus;
+ uchar_t *cp;
+ dladm_status_t status;
+
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, perm_flags);
+ if (dip == NULL)
+ return (status);
+
+ cp = (uchar_t *)dip->pr_val;
+ (void) memcpy(&mrp, cp, sizeof (mac_resource_props_t));
+ free(dip);
+
+ ncpus = mrp.mrp_ncpus;
+
+ if (ncpus > *val_cnt)
+ return (DLADM_STATUS_TOOSMALL);
+
+ if (ncpus == 0) {
+ (*prop_val)[0] = '\0';
+ *val_cnt = 1;
+ return (DLADM_STATUS_OK);
+ }
+
+ *val_cnt = ncpus;
+ for (i = 0; i < ncpus; i++) {
+ (void) snprintf(prop_val[i], DLADM_PROP_VAL_MAX,
+ "%u", mrp.mrp_cpu[i]);
+ }
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_set_res(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp,
+ uint_t val_cnt, uint_t flags, datalink_media_t media)
+{
+ mac_resource_props_t mrp;
+ dladm_status_t status = DLADM_STATUS_OK;
+ dld_ioc_macprop_t *dip;
+
+ bzero(&mrp, sizeof (mac_resource_props_t));
+ dip = i_dladm_buf_alloc_by_name(0, linkid, pdp->pd_name,
+ flags, &status);
+
+ if (dip == NULL)
+ return (status);
+
+ if (vdp->vd_val == RESET_VAL) {
+ switch (dip->pr_num) {
+ case MAC_PROP_MAXBW:
+ mrp.mrp_maxbw = MRP_MAXBW_RESETVAL;
+ mrp.mrp_mask = MRP_MAXBW;
+ break;
+ case MAC_PROP_PRIO:
+ mrp.mrp_priority = MPL_RESET;
+ mrp.mrp_mask = MRP_PRIORITY;
+ break;
+ default:
+ free(dip);
+ return (DLADM_STATUS_BADARG);
+ }
+ } else {
+ switch (dip->pr_num) {
+ case MAC_PROP_MAXBW:
+ bcopy((void *)vdp->vd_val, &mrp.mrp_maxbw,
+ sizeof (uint64_t));
+ mrp.mrp_mask = MRP_MAXBW;
+ break;
+ case MAC_PROP_PRIO:
+ bcopy((void *)vdp->vd_val, &mrp.mrp_priority,
+ sizeof (mac_priority_level_t));
+ mrp.mrp_mask = MRP_PRIORITY;
+ break;
+ default:
+ free(dip);
+ return (DLADM_STATUS_BADARG);
+ }
+ }
+
+ (void) memcpy(dip->pr_val, &mrp, dip->pr_valsize);
+ status = i_dladm_macprop(dip, B_TRUE);
+ free(dip);
+ return (status);
+}
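For context, the RESET_VAL branch above is reached when a property is set with no value: i_dladm_set_single_prop() copies the prop_table entry's pd_defval ({ "--", RESET_VAL }) into the val_desc_t. A hedged sketch of a caller-side reset, assuming the public dladm_set_linkprop() takes the same arguments as the static i_dladm_set_linkprop() prototype earlier in this file and that linkid is already resolved:

	dladm_status_t	status;

	/* A NULL value list selects pd_defval, whose value is RESET_VAL. */
	status = dladm_set_linkprop(linkid, "maxbw", NULL, 0,
	    DLADM_OPT_ACTIVE);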
+
+/* ARGSUSED */
+static dladm_status_t
+do_set_cpus(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp,
+ uint_t val_cnt, uint_t flags, datalink_media_t media)
+{
+ mac_resource_props_t mrp;
+ dladm_status_t status;
+ dld_ioc_macprop_t *dip;
+ datalink_class_t class;
+
+ /*
+	 * CPU bindings can be set on VNICs as well as on regular physical
+	 * links. However, VNICs fail the dladm_phys_info() test, so apply
+	 * the phys_info test only to physical links.
+ */
+ if ((status = dladm_datalink_id2info(linkid, NULL, &class,
+ NULL, NULL, 0)) != DLADM_STATUS_OK) {
+ return (status);
+ }
+
+ /*
+	 * We set mrp_intr_cpu to -1; the interrupt will be retargeted,
+	 * if possible, once the setup completes in the MAC layer.
+ */
+ bzero(&mrp, sizeof (mac_resource_props_t));
+ mrp.mrp_mask = MRP_CPUS;
+ if (vdp != NULL && vdp->vd_val != RESET_VAL) {
+ mac_resource_props_t *vmrp;
+
+ vmrp = (mac_resource_props_t *)vdp->vd_val;
+ if (vmrp->mrp_ncpus > 0) {
+ bcopy(vmrp, &mrp, sizeof (mac_resource_props_t));
+ mrp.mrp_mask = MRP_CPUS;
+ }
+ mrp.mrp_mask |= MRP_CPUS_USERSPEC;
+ mrp.mrp_fanout_mode = MCM_CPUS;
+ mrp.mrp_intr_cpu = -1;
+ }
+
+ dip = i_dladm_buf_alloc_by_name(0, linkid, pdp->pd_name,
+ flags, &status);
+ if (dip == NULL)
+ return (status);
+
+ (void) memcpy(dip->pr_val, &mrp, dip->pr_valsize);
+ status = i_dladm_macprop(dip, B_TRUE);
+ free(dip);
+ return (status);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_check_cpus(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val,
+ uint_t val_cnt, val_desc_t *vdp, datalink_media_t media)
+{
+ uint32_t cpuid;
+ int i, j, rc;
+ long nproc = sysconf(_SC_NPROCESSORS_CONF);
+ mac_resource_props_t *mrp;
+
+ mrp = malloc(sizeof (mac_resource_props_t));
+ if (mrp == NULL)
+ return (DLADM_STATUS_NOMEM);
+
+ for (i = 0; i < val_cnt; i++) {
+ errno = 0;
+ cpuid = strtol(prop_val[i], (char **)NULL, 10);
+ if (errno != 0 || cpuid >= nproc) {
+ free(mrp);
+ return (DLADM_STATUS_CPUMAX);
+ }
+ rc = p_online(cpuid, P_STATUS);
+ if (rc < 1) {
+ free(mrp);
+ return (DLADM_STATUS_CPUERR);
+ }
+ if (rc != P_ONLINE) {
+ free(mrp);
+ return (DLADM_STATUS_CPUNOTONLINE);
+ }
+ mrp->mrp_cpu[i] = cpuid;
+ }
+ mrp->mrp_ncpus = (uint32_t)val_cnt;
+
+ /* Check for duplicates */
+ for (i = 0; i < val_cnt; i++) {
+ for (j = 0; j < val_cnt; j++) {
+ if (i != j && mrp->mrp_cpu[i] == mrp->mrp_cpu[j]) {
+ free(mrp);
+ return (DLADM_STATUS_BADARG);
+ }
+ }
+ }
+ vdp->vd_val = (uintptr_t)mrp;
+
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+dladm_status_t
+do_extract_cpus(val_desc_t *vdp, void *arg, uint_t cnt)
+{
+ mac_resource_props_t *mrp = (mac_resource_props_t *)arg;
+ mac_resource_props_t *vmrp = (mac_resource_props_t *)vdp->vd_val;
+ int i;
+
+ for (i = 0; i < vmrp->mrp_ncpus; i++) {
+ mrp->mrp_cpu[i] = vmrp->mrp_cpu[i];
+ }
+ mrp->mrp_ncpus = vmrp->mrp_ncpus;
+ mrp->mrp_mask |= (MRP_CPUS|MRP_CPUS_USERSPEC);
+ mrp->mrp_fanout_mode = MCM_CPUS;
+
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+dld_priority_get(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
+{
+ dld_ioc_macprop_t *dip;
+ mac_resource_props_t mrp;
+ mac_priority_level_t pri;
+ dladm_status_t status;
+
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, perm_flags);
+ if (dip == NULL)
+ return (status);
+
+ bcopy(dip->pr_val, &mrp, sizeof (mac_resource_props_t));
+ free(dip);
+
+ pri = ((mrp.mrp_mask & MRP_PRIORITY) == 0) ? MPL_HIGH :
+ mrp.mrp_priority;
+
+ (void) dladm_pri2str(pri, prop_val[0]);
+ *val_cnt = 1;
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_check_priority(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val,
+ uint_t val_cnt, val_desc_t *vdp, datalink_media_t media)
+{
+ mac_priority_level_t *pri;
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ if (val_cnt != 1)
+ return (DLADM_STATUS_BADVALCNT);
+
+ pri = malloc(sizeof (mac_priority_level_t));
+ if (pri == NULL)
+ return (DLADM_STATUS_NOMEM);
+
+ status = dladm_str2pri(*prop_val, pri);
+ if (status != DLADM_STATUS_OK) {
+ free(pri);
+ return (status);
+ }
+
+ if (*pri < MPL_LOW || *pri > MPL_HIGH) {
+ free(pri);
+ return (DLADM_STATUS_BADVAL);
+ }
+
+ vdp->vd_val = (uintptr_t)pri;
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+dladm_status_t
+do_extract_priority(val_desc_t *vdp, void *arg, uint_t cnt)
+{
+ mac_resource_props_t *mrp = (mac_resource_props_t *)arg;
+
+ bcopy((char *)vdp->vd_val, &mrp->mrp_priority,
+ sizeof (mac_priority_level_t));
+ mrp->mrp_mask |= MRP_PRIORITY;
+
+ return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_get_autopush(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
struct dlautopush dlap;
int i, len;
@@ -1223,10 +1573,11 @@ do_get_autopush(struct prop_desc *pd, datalink_id_t linkid,
return (DLADM_STATUS_NOTDEFINED);
*val_cnt = 1;
- dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status);
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, perm_flags);
if (dip == NULL) {
(*prop_val)[0] = '\0';
- goto done;
+ return (DLADM_STATUS_OK);
}
(void) memcpy(&dlap, dip->pr_val, sizeof (dlap));
@@ -1246,8 +1597,6 @@ do_get_autopush(struct prop_desc *pd, datalink_id_t linkid,
len += (strlen(AP_ANCHOR) + 1);
}
}
-
- *perm_flags = dip->pr_perm_flags;
free(dip);
done:
return (DLADM_STATUS_OK);
@@ -1292,7 +1641,7 @@ i_dladm_add_ap_module(const char *module, struct dlautopush *dlap)
*/
/* ARGSUSED */
static dladm_status_t
-do_check_autopush(struct prop_desc *pd, datalink_id_t linkid, char **prop_val,
+do_check_autopush(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val,
uint_t val_cnt, val_desc_t *vdp, datalink_media_t media)
{
char *module;
@@ -1331,8 +1680,8 @@ do_check_autopush(struct prop_desc *pd, datalink_id_t linkid, char **prop_val,
/* ARGSUSED */
static dladm_status_t
-do_get_rate_common(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, uint_t id)
+do_get_rate_common(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, uint_t id, uint_t *perm_flags)
{
wl_rates_t *wrp;
uint_t i;
@@ -1363,6 +1712,7 @@ do_get_rate_common(struct prop_desc *pd, datalink_id_t linkid,
(float)wrp->wl_rates_rates[i] / 2);
}
*val_cnt = wrp->wl_rates_num;
+ *perm_flags = MAC_PROP_PERM_RW;
done:
free(wrp);
@@ -1370,29 +1720,25 @@ done:
}
static dladm_status_t
-do_get_rate_prop(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+do_get_rate_prop(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
if (media != DL_WIFI) {
- *perm_flags = MAC_PROP_PERM_READ;
- return (i_dladm_speed_get(pd, linkid, prop_val,
- val_cnt, flags));
+ return (i_dladm_speed_get(pdp, linkid, prop_val,
+ val_cnt, flags, perm_flags));
}
- *perm_flags = MAC_PROP_PERM_RW;
- return (do_get_rate_common(pd, linkid, prop_val, val_cnt,
- MAC_PROP_WL_DESIRED_RATES));
+ return (do_get_rate_common(pdp, linkid, prop_val, val_cnt,
+ MAC_PROP_WL_DESIRED_RATES, perm_flags));
}
/* ARGSUSED */
static dladm_status_t
-do_get_rate_mod(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+do_get_rate_mod(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
- *perm_flags = MAC_PROP_PERM_READ;
-
switch (media) {
case DL_ETHER:
/*
@@ -1402,8 +1748,8 @@ do_get_rate_mod(struct prop_desc *pd, datalink_id_t linkid,
return (DLADM_STATUS_NOTSUP);
case DL_WIFI:
- return (do_get_rate_common(pd, linkid, prop_val, val_cnt,
- MAC_PROP_WL_SUPPORTED_RATES));
+ return (do_get_rate_common(pdp, linkid, prop_val, val_cnt,
+ MAC_PROP_WL_SUPPORTED_RATES, perm_flags));
default:
return (DLADM_STATUS_BADARG);
}
@@ -1437,7 +1783,7 @@ do_set_rate(datalink_id_t linkid, dladm_wlan_rates_t *rates)
/* ARGSUSED */
static dladm_status_t
-do_set_rate_prop(prop_desc_t *pd, datalink_id_t linkid,
+do_set_rate_prop(prop_desc_t *pdp, datalink_id_t linkid,
val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media)
{
dladm_wlan_rates_t rates;
@@ -1463,7 +1809,7 @@ done:
/* ARGSUSED */
static dladm_status_t
-do_check_rate(struct prop_desc *pd, datalink_id_t linkid, char **prop_val,
+do_check_rate(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val,
uint_t val_cnt, val_desc_t *vdp, datalink_media_t media)
{
int i;
@@ -1517,16 +1863,15 @@ do_get_phyconf(datalink_id_t linkid, void *buf, int buflen)
/* ARGSUSED */
static dladm_status_t
-do_get_channel_prop(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+do_get_channel_prop(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
uint32_t channel;
char buf[WLDP_BUFSIZE];
dladm_status_t status = DLADM_STATUS_OK;
wl_phy_conf_t wl_phy_conf;
- *perm_flags = MAC_PROP_PERM_READ;
if ((status = do_get_phyconf(linkid, buf, sizeof (buf)))
!= DLADM_STATUS_OK)
goto done;
@@ -1539,7 +1884,7 @@ do_get_channel_prop(struct prop_desc *pd, datalink_id_t linkid,
(void) snprintf(*prop_val, DLADM_STRSIZE, "%u", channel);
*val_cnt = 1;
-
+ *perm_flags = MAC_PROP_PERM_READ;
done:
return (status);
}
@@ -1553,9 +1898,9 @@ do_get_powermode(datalink_id_t linkid, void *buf, int buflen)
/* ARGSUSED */
static dladm_status_t
-do_get_powermode_prop(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+do_get_powermode_prop(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
wl_ps_mode_t mode;
const char *s;
@@ -1583,12 +1928,8 @@ do_get_powermode_prop(struct prop_desc *pd, datalink_id_t linkid,
}
(void) snprintf(*prop_val, DLADM_STRSIZE, "%s", s);
*val_cnt = 1;
-
+ *perm_flags = MAC_PROP_PERM_RW;
done:
- if (status == DLADM_STATUS_OK)
- *perm_flags = MAC_PROP_PERM_RW;
- else
- *perm_flags = 0;
return (status);
}
@@ -1618,7 +1959,7 @@ do_set_powermode(datalink_id_t linkid, dladm_wlan_powermode_t *pm)
/* ARGSUSED */
static dladm_status_t
-do_set_powermode_prop(prop_desc_t *pd, datalink_id_t linkid,
+do_set_powermode_prop(prop_desc_t *pdp, datalink_id_t linkid,
val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media)
{
dladm_wlan_powermode_t powermode = (dladm_wlan_powermode_t)vdp->vd_val;
@@ -1641,9 +1982,9 @@ do_get_radio(datalink_id_t linkid, void *buf, int buflen)
/* ARGSUSED */
static dladm_status_t
-do_get_radio_prop(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+do_get_radio_prop(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
wl_radio_t radio;
const char *s;
@@ -1668,12 +2009,8 @@ do_get_radio_prop(struct prop_desc *pd, datalink_id_t linkid,
}
(void) snprintf(*prop_val, DLADM_STRSIZE, "%s", s);
*val_cnt = 1;
-
+ *perm_flags = MAC_PROP_PERM_RW;
done:
- if (status == DLADM_STATUS_OK)
- *perm_flags = MAC_PROP_PERM_RW;
- else
- *perm_flags = 0;
return (status);
}
@@ -1698,7 +2035,7 @@ do_set_radio(datalink_id_t linkid, dladm_wlan_radio_t *radio)
/* ARGSUSED */
static dladm_status_t
-do_set_radio_prop(prop_desc_t *pd, datalink_id_t linkid,
+do_set_radio_prop(prop_desc_t *pdp, datalink_id_t linkid,
val_desc_t *vdp, uint_t val_cnt, uint_t fags, datalink_media_t media)
{
dladm_wlan_radio_t radio = (dladm_wlan_radio_t)vdp->vd_val;
@@ -1860,7 +2197,7 @@ i_dladm_buf_alloc_by_id(size_t valsize, datalink_id_t linkid,
/* ARGSUSED */
static dladm_status_t
-i_dladm_set_public_prop(prop_desc_t *pd, datalink_id_t linkid,
+i_dladm_set_public_prop(prop_desc_t *pdp, datalink_id_t linkid,
val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media)
{
dld_ioc_macprop_t *dip;
@@ -1870,11 +2207,11 @@ i_dladm_set_public_prop(prop_desc_t *pd, datalink_id_t linkid,
uint32_t u32;
void *val;
- dip = i_dladm_buf_alloc_by_name(0, linkid, pd->pd_name, 0, &status);
+ dip = i_dladm_buf_alloc_by_name(0, linkid, pdp->pd_name, 0, &status);
if (dip == NULL)
return (status);
- if (pd->pd_flags & PD_CHECK_ALLOC)
+ if (pdp->pd_flags & PD_CHECK_ALLOC)
val = (void *)vdp->vd_val;
else {
/*
@@ -1931,7 +2268,7 @@ i_dladm_macprop(void *dip, boolean_t set)
static dld_ioc_macprop_t *
i_dladm_get_public_prop(datalink_id_t linkid, char *prop_name, uint_t flags,
- dladm_status_t *status)
+ dladm_status_t *status, uint_t *perm_flags)
{
dld_ioc_macprop_t *dip = NULL;
@@ -1944,12 +2281,15 @@ i_dladm_get_public_prop(datalink_id_t linkid, char *prop_name, uint_t flags,
free(dip);
return (NULL);
}
+ if (perm_flags != NULL)
+ *perm_flags = dip->pr_perm_flags;
+
return (dip);
}
/* ARGSUSED */
static dladm_status_t
-i_dladm_defmtu_check(struct prop_desc *pd, datalink_id_t linkid,
+i_dladm_defmtu_check(prop_desc_t *pdp, datalink_id_t linkid,
char **prop_val, uint_t val_cnt, val_desc_t *v, datalink_media_t media)
{
if (val_cnt != 1)
@@ -1960,9 +2300,9 @@ i_dladm_defmtu_check(struct prop_desc *pd, datalink_id_t linkid,
/* ARGSUSED */
static dladm_status_t
-i_dladm_duplex_get(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+i_dladm_duplex_get(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
link_duplex_t link_duplex;
dladm_status_t status;
@@ -1988,8 +2328,8 @@ i_dladm_duplex_get(struct prop_desc *pd, datalink_id_t linkid,
/* ARGSUSED */
static dladm_status_t
-i_dladm_speed_get(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, uint_t flags)
+i_dladm_speed_get(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, uint_t flags, uint_t *perm_flags)
{
uint64_t ifspeed = 0;
dladm_status_t status;
@@ -2006,23 +2346,26 @@ i_dladm_speed_get(struct prop_desc *pd, datalink_id_t linkid,
"%llu", ifspeed / 1000000); /* Mbps */
}
*val_cnt = 1;
+ *perm_flags = MAC_PROP_PERM_READ;
return (DLADM_STATUS_OK);
}
/* ARGSUSED */
static dladm_status_t
-i_dladm_status_get(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+i_dladm_status_get(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
- link_state_t link_state;
- dladm_status_t status;
- uchar_t *cp;
- dld_ioc_macprop_t *dip;
+ link_state_t link_state;
+ dladm_status_t status;
+ uchar_t *cp;
+ dld_ioc_macprop_t *dip;
- dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status);
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, perm_flags);
if (status != DLADM_STATUS_OK)
return (status);
+
cp = (uchar_t *)dip->pr_val;
(void) memcpy(&link_state, cp, sizeof (link_state));
@@ -2038,25 +2381,25 @@ i_dladm_status_get(struct prop_desc *pd, datalink_id_t linkid,
break;
}
*val_cnt = 1;
- *perm_flags = dip->pr_perm_flags;
free(dip);
return (DLADM_STATUS_OK);
}
/* ARGSUSED */
static dladm_status_t
-i_dladm_binary_get(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+i_dladm_binary_get(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
dld_ioc_macprop_t *dip;
dladm_status_t status;
- dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status);
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, perm_flags);
if (dip == NULL)
return (status);
+
(void) snprintf(*prop_val, DLADM_PROP_VAL_MAX, "%x", dip->pr_val[0]);
- *perm_flags = dip->pr_perm_flags;
free(dip);
*val_cnt = 1;
return (DLADM_STATUS_OK);
@@ -2064,22 +2407,23 @@ i_dladm_binary_get(struct prop_desc *pd, datalink_id_t linkid,
/* ARGSUSED */
static dladm_status_t
-i_dladm_uint32_get(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+i_dladm_uint32_get(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
dld_ioc_macprop_t *dip;
- uint32_t v = 0;
+ uint32_t v = 0;
uchar_t *cp;
dladm_status_t status;
- dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status);
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, perm_flags);
if (dip == NULL)
return (status);
+
cp = (uchar_t *)dip->pr_val;
(void) memcpy(&v, cp, sizeof (v));
(void) snprintf(*prop_val, DLADM_PROP_VAL_MAX, "%ld", v);
- *perm_flags = dip->pr_perm_flags;
free(dip);
*val_cnt = 1;
return (DLADM_STATUS_OK);
@@ -2087,18 +2431,20 @@ i_dladm_uint32_get(struct prop_desc *pd, datalink_id_t linkid,
/* ARGSUSED */
static dladm_status_t
-i_dladm_flowctl_get(struct prop_desc *pd, datalink_id_t linkid,
- char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags,
- uint_t *perm_flags)
+i_dladm_flowctl_get(prop_desc_t *pdp, datalink_id_t linkid,
+ char **prop_val, uint_t *val_cnt, datalink_media_t media,
+ uint_t flags, uint_t *perm_flags)
{
dld_ioc_macprop_t *dip;
link_flowctrl_t v;
dladm_status_t status;
uchar_t *cp;
- dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status);
+ dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags,
+ &status, perm_flags);
if (dip == NULL)
return (status);
+
cp = (uchar_t *)dip->pr_val;
(void) memcpy(&v, cp, sizeof (v));
switch (v) {
@@ -2115,7 +2461,6 @@ i_dladm_flowctl_get(struct prop_desc *pd, datalink_id_t linkid,
(void) sprintf(*prop_val, "bi");
break;
}
- *perm_flags = dip->pr_perm_flags;
free(dip);
*val_cnt = 1;
return (DLADM_STATUS_OK);
@@ -2220,17 +2565,7 @@ i_dladm_get_prop(datalink_id_t linkid, const char *prop_name,
if ((status = i_dladm_macprop(dip, B_FALSE)) == DLADM_STATUS_OK) {
if (type == DLADM_PROP_VAL_PERM) {
- switch (dip->pr_perm_flags) {
- case MAC_PROP_PERM_READ:
- (void) strncpy(*prop_val,
- PERM_READ_ONLY, DLADM_PROP_VAL_MAX);
- break;
- case MAC_PROP_PERM_RW:
- (void) strncpy(*prop_val,
- PERM_READ_WRITE,
- DLADM_PROP_VAL_MAX);
- break;
- }
+ (void) dladm_perm2str(dip->pr_perm_flags, *prop_val);
} else {
(void) strncpy(*prop_val, dip->pr_val,
DLADM_PROP_VAL_MAX);
@@ -2434,3 +2769,189 @@ i_dladm_wlan_set_legacy_ioctl(datalink_id_t linkid, void *buf, uint_t buflen,
free(gbuf);
return (status);
}
+
+static dladm_status_t
+link_proplist_check(dladm_arg_list_t *proplist)
+{
+ int i, j;
+ boolean_t matched;
+
+ for (i = 0; i < proplist->al_count; i++) {
+ matched = B_FALSE;
+		for (j = 0; j < DLADM_MAX_PROPS; j++) {
+			if (strcmp(proplist->al_info[i].ai_name,
+			    prop_table[j].pd_name) == 0) {
+				matched = B_TRUE;
+				break;
+			}
+		}
+ if (!matched)
+ return (DLADM_STATUS_BADPROP);
+ }
+ return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_parse_link_props(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+ dladm_status_t status;
+
+ status = dladm_parse_args(str, listp, novalues);
+ if (status != DLADM_STATUS_OK)
+ return (status);
+
+ status = link_proplist_check(*listp);
+ if (status != DLADM_STATUS_OK) {
+ dladm_free_props(*listp);
+ return (status);
+ }
+
+ return (DLADM_STATUS_OK);
+}
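A short usage sketch for the parser above. The property-string syntax is an assumption (a comma-separated name=value list, as dladm_parse_args() accepts for the dladm -p option):

	dladm_arg_list_t	*proplist = NULL;
	char			propstr[] = "maxbw=100,priority=high";

	if (dladm_parse_link_props(propstr, &proplist, B_FALSE) ==
	    DLADM_STATUS_OK) {
		/* ... consume proplist ... */
		dladm_free_props(proplist);
	}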
+
+/*
+ * Retrieve a single link property from the database.
+ */
+/*ARGSUSED*/
+static int
+i_dladm_get_one_prop(datalink_id_t linkid, const char *prop_name, void *arg)
+{
+ dladm_arg_list_t *proplist = arg;
+ dladm_arg_info_t *aip = NULL;
+
+ aip = &proplist->al_info[proplist->al_count];
+ /*
+	 * It is fine for ai_name to point at prop_name, since prop_name
+	 * itself points into prop_table[n].pd_name.
+ */
+ aip->ai_name = prop_name;
+
+ (void) dladm_get_linkprop(linkid, DLADM_PROP_VAL_PERSISTENT, prop_name,
+ aip->ai_val, &aip->ai_count);
+
+ if (aip->ai_count != 0)
+ proplist->al_count++;
+
+ return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * Retrieve all link properties for a link from the database and
+ * return a property list.
+ */
+dladm_status_t
+dladm_link_get_proplist(datalink_id_t linkid, dladm_arg_list_t **listp)
+{
+ dladm_arg_list_t *list;
+ dladm_status_t status = DLADM_STATUS_OK;
+
+ list = calloc(1, sizeof (dladm_arg_list_t));
+ if (list == NULL)
+ return (dladm_errno2status(errno));
+
+ status = dladm_walk_linkprop(linkid, list, i_dladm_get_one_prop);
+
+ *listp = list;
+ return (status);
+}
+
+/*
+ * Retrieve the named property from a proplist, check the value and
+ * convert to a kernel structure.
+ */
+static dladm_status_t
+i_dladm_link_proplist_extract_one(dladm_arg_list_t *proplist,
+ const char *name, void *val)
+{
+ dladm_status_t status;
+ dladm_arg_info_t *aip = NULL;
+ int i, j;
+
+ /* Find named property in proplist */
+ for (i = 0; i < proplist->al_count; i++) {
+ aip = &proplist->al_info[i];
+ if (strcasecmp(aip->ai_name, name) == 0)
+ break;
+ }
+
+ /* Property not in list */
+ if (i == proplist->al_count)
+ return (DLADM_STATUS_OK);
+
+ for (i = 0; i < DLADM_MAX_PROPS; i++) {
+ prop_desc_t *pdp = &prop_table[i];
+ val_desc_t *vdp;
+
+		if (strcasecmp(aip->ai_name, pdp->pd_name) != 0)
+			continue;
+
+		if (aip->ai_val == NULL)
+			return (DLADM_STATUS_BADARG);
+
+		/*
+		 * Allocate the value descriptors only once the property
+		 * name matches, so non-matching iterations do not leak
+		 * the allocation.
+		 */
+		vdp = malloc(sizeof (val_desc_t) * aip->ai_count);
+		if (vdp == NULL)
+			return (DLADM_STATUS_NOMEM);
+
+ /* Check property value */
+ if (pdp->pd_check != NULL) {
+ status = pdp->pd_check(pdp, 0, aip->ai_val,
+ aip->ai_count, vdp, 0);
+ } else {
+ status = DLADM_STATUS_BADARG;
+ }
+
+ if (status != DLADM_STATUS_OK)
+ return (status);
+
+ for (j = 0; j < DLADM_MAX_RSRC_PROP; j++) {
+ resource_prop_t *rpp = &rsrc_prop_table[j];
+
+ if (strcasecmp(aip->ai_name, rpp->rp_name) != 0)
+ continue;
+
+ /* Extract kernel structure */
+ if (rpp->rp_extract != NULL) {
+ status = rpp->rp_extract(vdp, val,
+ aip->ai_count);
+ } else {
+ status = DLADM_STATUS_BADARG;
+ }
+ break;
+ }
+
+ if (status != DLADM_STATUS_OK)
+ return (status);
+
+ break;
+ }
+ return (status);
+}
+
+/*
+ * Extract properties from a proplist and convert to mac_resource_props_t.
+ */
+dladm_status_t
+dladm_link_proplist_extract(dladm_arg_list_t *proplist,
+ mac_resource_props_t *mrp)
+{
+	dladm_status_t	status = DLADM_STATUS_OK;
+	int		i;
+
+	for (i = 0; i < DLADM_MAX_RSRC_PROP; i++) {
+		status = i_dladm_link_proplist_extract_one(proplist,
+		    rsrc_prop_table[i].rp_name, mrp);
+		if (status != DLADM_STATUS_OK)
+			break;
+	}
+	return (status);
+}
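Putting the pieces together, a hedged sketch of how a consumer (a VNIC or flow creation path, say) might turn a parsed property list into the kernel resource structure; proplist is assumed to come from dladm_parse_link_props() as sketched earlier:

	mac_resource_props_t	mrp;

	bzero(&mrp, sizeof (mrp));
	if (dladm_link_proplist_extract(proplist, &mrp) == DLADM_STATUS_OK) {
		/*
		 * mrp_mask now carries MRP_MAXBW, MRP_PRIORITY and/or
		 * MRP_CPUS for whichever properties were present.
		 */
	}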
+
+static const char *
+dladm_perm2str(uint_t perm, char *buf)
+{
+ (void) snprintf(buf, DLADM_STRSIZE, "%c%c",
+ ((perm & MAC_PROP_PERM_READ) != 0) ? 'r' : '-',
+ ((perm & MAC_PROP_PERM_WRITE) != 0) ? 'w' : '-');
+ return (buf);
+}
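For illustration, dladm_perm2str() renders the permission bits as a fixed two-character string. Expected outputs, on the assumption that MAC_PROP_PERM_RW is defined as (MAC_PROP_PERM_READ | MAC_PROP_PERM_WRITE):

	char	buf[DLADM_STRSIZE];

	(void) dladm_perm2str(MAC_PROP_PERM_RW, buf);	/* "rw" */
	(void) dladm_perm2str(MAC_PROP_PERM_READ, buf);	/* "r-" */
	(void) dladm_perm2str(0, buf);			/* "--" */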
diff --git a/usr/src/lib/libdladm/common/llib-ldladm b/usr/src/lib/libdladm/common/llib-ldladm
index a6fc19b517..ae8bb981bf 100644
--- a/usr/src/lib/libdladm/common/llib-ldladm
+++ b/usr/src/lib/libdladm/common/llib-ldladm
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*LINTLIBRARY*/
/*PROTOLIB1*/
@@ -34,3 +32,5 @@
#include <libdlvnic.h>
#include <libdlvlan.h>
#include <libdlmgmt.h>
+#include <libdlflow.h>
+#include <libdlstat.h>
diff --git a/usr/src/lib/libdladm/common/mapfile-vers b/usr/src/lib/libdladm/common/mapfile-vers
index 9c61b84883..bd8d6a9eb1 100644
--- a/usr/src/lib/libdladm/common/mapfile-vers
+++ b/usr/src/lib/libdladm/common/mapfile-vers
@@ -35,7 +35,6 @@ SUNWprivate_1.1 {
dladm_valid_linkname;
dladm_mac_walk;
dladm_init_linkprop;
- dladm_get_single_mac_stat;
dladm_get_linkprop;
dladm_set_linkprop;
dladm_walk_linkprop;
@@ -44,6 +43,8 @@ SUNWprivate_1.1 {
dladm_set_secobj;
dladm_unset_secobj;
dladm_walk_secobj;
+ dladm_bw2str;
+ dladm_str2bw;
dladm_secobjclass2str;
dladm_str2secobjclass;
dladm_aggr_up;
@@ -118,12 +119,60 @@ SUNWprivate_1.1 {
dladm_wlan_wpa_set_key;
dladm_wlan_wpa_set_mlme;
dladm_vnic_create;
- dladm_vnic_modify;
dladm_vnic_delete;
dladm_vnic_info;
dladm_vnic_str2macaddrtype;
- dladm_kstat_value;
+ dladm_vnic_up;
+ dladm_walk_macaddr;
+ dladm_walk_hwgrp;
+ dladm_pri2str;
+ dladm_str2pri;
+ dladm_start_usagelog;
+ dladm_stop_usagelog;
+ dladm_walk_usage_res;
+ dladm_walk_usage_time;
+ dladm_usage_summary;
+ dladm_usage_dates;
+
+ dladm_flow_add;
+ dladm_flow_remove;
+ dladm_flow_parse_db;
+ dladm_walk_flow;
+ dladm_flow_init;
+ dladm_flow_info;
+ dladm_prefixlen2mask;
+ dladm_mask2prefixlen;
+ dladm_str2proto;
+ dladm_proto2str;
+
+ dladm_free_attrs;
+ dladm_parse_flow_attrs;
+
+ dladm_flow_attr_ip2str;
+ dladm_flow_attr_proto2str;
+ dladm_flow_attr_port2str;
+ dladm_flow_attr_dsfield2str;
+
+ dladm_free_props;
+ dladm_parse_link_props;
+ dladm_parse_flow_props;
+ dladm_get_flowprop;
+ dladm_set_flowprop;
+ dladm_walk_flowprop;
+
dladm_parselink;
+
+ dladm_continuous;
+ dladm_kstat_lookup;
+ dladm_get_stats;
+ dladm_kstat_value;
+ dladm_get_single_mac_stat;
+ dladm_stats_total;
+ dladm_stats_diff;
+
local:
*;
};
diff --git a/usr/src/lib/libdladm/common/propfuncs.c b/usr/src/lib/libdladm/common/propfuncs.c
new file mode 100644
index 0000000000..74964511eb
--- /dev/null
+++ b/usr/src/lib/libdladm/common/propfuncs.c
@@ -0,0 +1,699 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdlib.h>
+#include <strings.h>
+#include <errno.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/dld.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <libdladm_impl.h>
+#include <libdlflow_impl.h>
+
+/*
+ * XXX duplicate defines
+ */
+#define DLADM_PROP_VAL_MAX 32
+#define DLADM_MAX_PROPS 32
+
+static void
+free_props(prop_db_info_t *lip)
+{
+ prop_db_info_t *lip_next;
+ prop_val_t *lvp, *lvp_next;
+
+ for (; lip != NULL; lip = lip_next) {
+ lip_next = lip->li_nextprop;
+ for (lvp = lip->li_val; lvp != NULL; lvp = lvp_next) {
+ lvp_next = lvp->lv_nextval;
+ free(lvp);
+ }
+ free(lip);
+ }
+}
+
+/*
+ * Generate an entry in the property database.
+ * Each entry has this format:
+ * <name> <prop0>=<val0>,...,<valn>;...;<propn>=<val0>,...,<valn>;
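+ * For example (the name and its property list are tab-separated):
+ *	net0	maxbw=100;priority=high;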
+ */
+static void
+generate_prop_line(const char *name, char *buf,
+ prop_db_info_t *listp, dladm_status_t *statusp)
+{
+ char tmpbuf[MAXLINELEN];
+ char *ptr, *lim = tmpbuf + MAXLINELEN;
+ prop_db_info_t *lip = listp;
+ prop_val_t *lvp = NULL;
+
+ /*
+ * Delete line if there are no properties left.
+ */
+ if (lip == NULL ||
+ (lip->li_val == NULL && lip->li_nextprop == NULL)) {
+ buf[0] = '\0';
+ return;
+ }
+ ptr = tmpbuf;
+ ptr += snprintf(ptr, BUFLEN(lim, ptr), "%s\t", name);
+ for (; lip != NULL; lip = lip->li_nextprop) {
+ /*
+ * Skip properties without values.
+ */
+ if (lip->li_val == NULL)
+ continue;
+
+ ptr += snprintf(ptr, BUFLEN(lim, ptr), "%s=", lip->li_name);
+ for (lvp = lip->li_val; lvp != NULL; lvp = lvp->lv_nextval) {
+ ptr += snprintf(ptr, BUFLEN(lim, ptr), "%s%c",
+ lvp->lv_name,
+ ((lvp->lv_nextval == NULL) ? ';' : ','));
+ }
+ }
+ if (ptr > lim) {
+ *statusp = DLADM_STATUS_TOOSMALL;
+ return;
+ }
+ (void) snprintf(buf, MAXLINELEN, "%s\n", tmpbuf);
+}
+
+/*
+ * This function is used to update or create an entry in the persistent db.
+ * process_prop_db() will first scan the db for an entry matching the
+ * specified name. If a match is found, this function is invoked with the
+ * entry's contents (buf) and its linked-list representation (listp). lsp
+ * holds the name and values of the property to be added or updated; this
+ * information will be merged with listp. Subsequently, an updated entry
+ * will be written to buf, which will in turn be written to disk by
+ * process_prop_db(). If no entry matches the specified name, listp
+ * will be NULL; a new entry will be generated in this case and it will
+ * contain only the property information in lsp.
+ */
+boolean_t
+process_prop_set(prop_db_state_t *lsp, char *buf,
+ prop_db_info_t *listp, dladm_status_t *statusp)
+{
+ dladm_status_t status;
+ prop_db_info_t *lastp = NULL, *lip = listp, *nlip = NULL;
+ prop_val_t **lvpp;
+ int i;
+
+ if (lsp->ls_propname == NULL) {
+ buf[0] = '\0';
+ return (B_FALSE);
+ }
+
+ /*
+ * Find the prop we want to change.
+ */
+ for (; lip != NULL; lip = lip->li_nextprop) {
+ if (strcmp(lip->li_name, lsp->ls_propname) == 0)
+ break;
+
+ lastp = lip;
+ }
+
+ if (lip == NULL) {
+ /*
+ * If the prop is not found, append it to the list.
+ */
+ if ((nlip = malloc(sizeof (prop_db_info_t))) == NULL) {
+ status = DLADM_STATUS_NOMEM;
+ goto fail;
+ }
+ /*
+ * nlip will need to be freed later if there is no list to
+ * append to.
+ */
+ if (lastp != NULL)
+ lastp->li_nextprop = nlip;
+ nlip->li_name = lsp->ls_propname;
+ nlip->li_nextprop = NULL;
+ nlip->li_val = NULL;
+ lvpp = &nlip->li_val;
+ } else {
+ prop_val_t *lvp, *lvp_next;
+
+ /*
+ * If the prop is found, delete the existing values from it.
+ */
+ for (lvp = lip->li_val; lvp != NULL; lvp = lvp_next) {
+ lvp_next = lvp->lv_nextval;
+ free(lvp);
+ }
+ lip->li_val = NULL;
+ lvpp = &lip->li_val;
+ }
+
+ /*
+ * Fill our prop with the specified values.
+ */
+ for (i = 0; i < *lsp->ls_valcntp; i++) {
+ if ((*lvpp = malloc(sizeof (prop_val_t))) == NULL) {
+ status = DLADM_STATUS_NOMEM;
+ goto fail;
+ }
+ (*lvpp)->lv_name = lsp->ls_propval[i];
+ (*lvpp)->lv_nextval = NULL;
+ lvpp = &(*lvpp)->lv_nextval;
+ }
+
+ if (listp != NULL) {
+ generate_prop_line(lsp->ls_name, buf, listp, statusp);
+ } else {
+ generate_prop_line(lsp->ls_name, buf, nlip, statusp);
+ free_props(nlip);
+ }
+ return (B_FALSE);
+
+fail:
+ *statusp = status;
+ if (listp == NULL)
+ free_props(nlip);
+
+ return (B_FALSE);
+}
+
+/*
+ * This function is used for retrieving the values for a specific property.
+ * It gets called if an entry matching the specified name exists in the db.
+ * The entry is converted into a linked-list listp. This list is then scanned
+ * for the specified property name; if a matching property exists, its
+ * associated values are copied to the array lsp->ls_propval.
+ */
+/* ARGSUSED */
+boolean_t
+process_prop_get(prop_db_state_t *lsp, char *buf,
+ prop_db_info_t *listp, dladm_status_t *statusp)
+{
+ prop_db_info_t *lip = listp;
+ prop_val_t *lvp;
+ uint_t valcnt = 0;
+
+ /*
+ * Find the prop we want to get.
+ */
+ for (; lip != NULL; lip = lip->li_nextprop) {
+ if (strcmp(lip->li_name, lsp->ls_propname) == 0)
+ break;
+ }
+ if (lip == NULL) {
+ *statusp = DLADM_STATUS_NOTFOUND;
+ return (B_FALSE);
+ }
+
+ for (lvp = lip->li_val; lvp != NULL; lvp = lvp->lv_nextval) {
+ (void) strncpy(lsp->ls_propval[valcnt], lvp->lv_name,
+ DLADM_PROP_VAL_MAX);
+
+ if (++valcnt >= *lsp->ls_valcntp && lvp->lv_nextval != NULL) {
+ *statusp = DLADM_STATUS_TOOSMALL;
+ return (B_FALSE);
+ }
+ }
+ /*
+ * This function is meant to be called at most once for each call
+	 * to process_prop_db(). For this reason, it is safe to overwrite
+	 * the caller's value count with the actual number of values
+	 * returned.
+ */
+ *lsp->ls_valcntp = valcnt;
+ return (B_FALSE);
+}
+
+/*
+ * This is used for initializing properties.
+ * Unlike the other routines, this gets called for every entry in the
+ * database. lsp->ls_name is not user-specified but instead is set to
+ * the current name being processed.
+ */
+/* ARGSUSED */
+boolean_t
+process_prop_init(prop_db_state_t *lsp, char *buf,
+ prop_db_info_t *listp, dladm_status_t *statusp)
+{
+ dladm_status_t status = DLADM_STATUS_OK;
+ prop_db_info_t *lip = listp;
+ prop_val_t *lvp;
+ uint_t valcnt, i;
+ char **propval;
+
+ for (; lip != NULL; lip = lip->li_nextprop) {
+ /*
+		 * Count the values for this property, then construct the
+		 * propval array and fill it from listp.
+ */
+ for (lvp = lip->li_val, valcnt = 0;
+ lvp != NULL; lvp = lvp->lv_nextval, valcnt++) {
+ }
+
+ propval = malloc(sizeof (char *) * valcnt);
+ if (propval == NULL) {
+ *statusp = DLADM_STATUS_NOMEM;
+ break;
+ }
+ lvp = lip->li_val;
+ for (i = 0; i < valcnt; i++, lvp = lvp->lv_nextval)
+ propval[i] = (char *)lvp->lv_name;
+
+ status = (*lsp->ls_initop)(lsp->ls_name, lip->li_name,
+ propval, valcnt, DLADM_OPT_ACTIVE, NULL);
+
+ /*
+ * We continue with initializing other properties even
+ * after encountering an error. This error will be
+ * propagated to the caller via 'statusp'.
+ */
+ if (status != DLADM_STATUS_OK)
+ *statusp = status;
+
+ free(propval);
+ }
+ return (B_TRUE);
+}
+
+static int
+parse_props(char *buf, prop_db_info_t **lipp)
+{
+ int i, len;
+ char *curr;
+ prop_db_info_t *lip = NULL;
+ prop_db_info_t **tailp = lipp;
+ prop_val_t *lvp = NULL;
+ prop_val_t **vtailp = NULL;
+
+ curr = buf;
+ len = strlen(buf);
+ for (i = 0; i < len; i++) {
+ char c = buf[i];
+ boolean_t match = (c == '=' || c == ',' || c == ';');
+
+ /*
+ * Move to the next character if there is no match and
+ * if we have not reached the last character.
+ */
+ if (!match && i != len - 1)
+ continue;
+
+ if (match) {
+ /*
+ * Nul-terminate the string pointed to by 'curr'.
+ */
+ buf[i] = '\0';
+ if (*curr == '\0')
+ goto fail;
+ }
+
+ if (lip != NULL) {
+ /*
+ * We get here after we have processed the "<prop>="
+ * pattern. The pattern we are now interested in is
+ * "<val0>,<val1>,...,<valn>;". For each value we
+ * find, a prop_val_t will be allocated and
+ * added to the current 'lip'.
+ */
+ if (c == '=')
+ goto fail;
+
+ lvp = malloc(sizeof (*lvp));
+ if (lvp == NULL)
+ goto fail;
+
+ lvp->lv_name = curr;
+ lvp->lv_nextval = NULL;
+ *vtailp = lvp;
+ vtailp = &lvp->lv_nextval;
+
+ if (c == ';') {
+ tailp = &lip->li_nextprop;
+ vtailp = NULL;
+ lip = NULL;
+ }
+ } else {
+ /*
+			 * lip == NULL indicates that 'curr' must be referring
+			 * to a property name. Allocate a new prop_db_info_t
+			 * and append it to the list given by the caller.
+ */
+ if (c != '=')
+ goto fail;
+
+ lip = malloc(sizeof (*lip));
+ if (lip == NULL)
+ goto fail;
+
+ lip->li_name = curr;
+ lip->li_val = NULL;
+ lip->li_nextprop = NULL;
+ *tailp = lip;
+ vtailp = &lip->li_val;
+ }
+ curr = buf + i + 1;
+ }
+ /*
+ * The list must be non-empty and the last character must be ';'.
+ */
+ if (*lipp == NULL || lip != NULL)
+ goto fail;
+
+ return (0);
+
+fail:
+ free_props(*lipp);
+ *lipp = NULL;
+ return (-1);
+}
+
+static boolean_t
+process_prop_line(prop_db_state_t *lsp, char *buf,
+ dladm_status_t *statusp)
+{
+ prop_db_info_t *lip = NULL;
+ int i, len, llen;
+ char *str, *lasts;
+ boolean_t cont, noname = B_FALSE;
+
+ /*
+ * Skip leading spaces, blank lines, and comments.
+ */
+ len = strlen(buf);
+ for (i = 0; i < len; i++) {
+ if (!isspace(buf[i]))
+ break;
+ }
+ if (i == len || buf[i] == '#')
+ return (B_TRUE);
+
+ str = buf + i;
+ if (lsp->ls_name != NULL) {
+ /*
+ * Skip names we're not interested in.
+ * Note that strncmp() and isspace() are used here
+ * instead of strtok() and strcmp() because we don't
+ * want to modify buf in case it does not contain the
+ * specified name.
+ */
+ llen = strlen(lsp->ls_name);
+ if (strncmp(str, lsp->ls_name, llen) != 0 ||
+ !isspace(str[llen]))
+ return (B_TRUE);
+ } else {
+ /*
+ * If a name is not specified, find the name
+ * and assign it to lsp->ls_name.
+ */
+ if (strtok_r(str, " \n\t", &lasts) == NULL)
+ goto fail;
+
+ llen = strlen(str);
+ lsp->ls_name = str;
+ noname = B_TRUE;
+ }
+ str += llen + 1;
+ if (str >= buf + len)
+ goto fail;
+
+ /*
+ * Now find the list of properties.
+ */
+ if ((str = strtok_r(str, " \n\t", &lasts)) == NULL)
+ goto fail;
+
+ if (parse_props(str, &lip) < 0)
+ goto fail;
+
+ cont = (*lsp->ls_op)(lsp, buf, lip, statusp);
+ free_props(lip);
+ if (noname)
+ lsp->ls_name = NULL;
+ return (cont);
+
+fail:
+ free_props(lip);
+ if (noname)
+ lsp->ls_name = NULL;
+
+ /*
+ * Delete corrupted line.
+ */
+ buf[0] = '\0';
+ return (B_TRUE);
+}
+
+dladm_status_t
+process_prop_db(void *arg, FILE *fp, FILE *nfp)
+{
+ prop_db_state_t *lsp = arg;
+ dladm_status_t status = DLADM_STATUS_OK;
+ char buf[MAXLINELEN];
+ boolean_t cont = B_TRUE;
+
+ /*
+ * This loop processes each line of the configuration file.
+ * buf can potentially be modified by process_prop_line().
+	 * If this is a write operation and buf has not been emptied, buf
+	 * will be written to disk. process_prop_line() is no longer
+	 * called after it returns B_FALSE, at which point the remainder
+ * of the file will continue to be read and, if necessary, written
+ * to disk as well.
+ */
+ while (fgets(buf, MAXLINELEN, fp) != NULL) {
+ if (cont)
+ cont = process_prop_line(lsp, buf, &status);
+
+ if (nfp != NULL && buf[0] != '\0' && fputs(buf, nfp) == EOF) {
+ status = dladm_errno2status(errno);
+ break;
+ }
+ }
+
+ if (status != DLADM_STATUS_OK || !cont)
+ return (status);
+
+ if (lsp->ls_op == process_prop_set) {
+ /*
+ * If the specified name is not found above, we add the
+ * name and its properties to the configuration file.
+ */
+ (void) (*lsp->ls_op)(lsp, buf, NULL, &status);
+ if (status == DLADM_STATUS_OK && fputs(buf, nfp) == EOF)
+ status = dladm_errno2status(errno);
+ }
+
+ if (lsp->ls_op == process_prop_get)
+ status = DLADM_STATUS_NOTFOUND;
+
+ return (status);
+}
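A hedged sketch of driving the walker above for a 'set' pass. The prop_db_state_t field names are taken from their uses in this file; fp and nfp (the current database and a temporary replacement, opened by the caller) and the final rename-into-place step are assumed and omitted:

	prop_db_state_t	state;
	dladm_status_t	status;
	char		*vals[] = { "100" };
	uint_t		valcnt = 1;

	bzero(&state, sizeof (state));
	state.ls_op = process_prop_set;
	state.ls_name = "net0";		/* hypothetical entry name */
	state.ls_propname = "maxbw";
	state.ls_propval = vals;
	state.ls_valcntp = &valcnt;

	status = process_prop_db(&state, fp, nfp);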
+
+dladm_status_t
+i_dladm_get_prop_temp(const char *name, prop_type_t type,
+ const char *prop_name, char **prop_val, uint_t *val_cntp,
+ prop_table_t *prop_tbl)
+{
+ int i;
+ dladm_status_t status;
+ uint_t cnt;
+ fprop_desc_t *pdp;
+
+ if (name == NULL || prop_name == NULL || prop_val == NULL ||
+ val_cntp == NULL || *val_cntp == 0)
+ return (DLADM_STATUS_BADARG);
+
+ for (i = 0; i < prop_tbl->pt_size; i++)
+ if (strcasecmp(prop_name, prop_tbl->pt_table[i].pd_name) == 0)
+ break;
+
+ if (i == prop_tbl->pt_size)
+ return (DLADM_STATUS_NOTFOUND);
+
+ pdp = &prop_tbl->pt_table[i];
+ status = DLADM_STATUS_OK;
+
+ switch (type) {
+ case DLADM_PROP_VAL_CURRENT:
+ status = pdp->pd_get(name, prop_val, val_cntp);
+ break;
+ case DLADM_PROP_VAL_DEFAULT:
+ if (pdp->pd_defval.vd_name == NULL) {
+ status = DLADM_STATUS_NOTSUP;
+ break;
+ }
+ (void) strcpy(*prop_val, pdp->pd_defval.vd_name);
+ *val_cntp = 1;
+ break;
+
+ case DLADM_PROP_VAL_MODIFIABLE:
+ if (pdp->pd_getmod != NULL) {
+ status = pdp->pd_getmod(name, prop_val, val_cntp);
+ break;
+ }
+ cnt = pdp->pd_nmodval;
+ if (cnt == 0) {
+ status = DLADM_STATUS_NOTSUP;
+ } else if (cnt > *val_cntp) {
+ status = DLADM_STATUS_TOOSMALL;
+ } else {
+ for (i = 0; i < cnt; i++) {
+ (void) strcpy(prop_val[i],
+ pdp->pd_modval[i].vd_name);
+ }
+ *val_cntp = cnt;
+ }
+ break;
+ default:
+ status = DLADM_STATUS_BADARG;
+ break;
+ }
+
+ return (status);
+}
+
+static dladm_status_t
+i_dladm_set_one_prop_temp(const char *name, fprop_desc_t *pdp, char **prop_val,
+ uint_t val_cnt, uint_t flags)
+{
+ dladm_status_t status;
+ val_desc_t *vdp = NULL;
+ uint_t cnt;
+
+ if (pdp->pd_temponly && (flags & DLADM_OPT_PERSIST) != 0)
+ return (DLADM_STATUS_TEMPONLY);
+
+ if (pdp->pd_set == NULL)
+ return (DLADM_STATUS_PROPRDONLY);
+
+ if (prop_val != NULL) {
+ if (pdp->pd_check != NULL)
+ status = pdp->pd_check(pdp, prop_val, val_cnt, &vdp);
+ else
+ status = DLADM_STATUS_BADARG;
+
+ if (status != DLADM_STATUS_OK)
+ return (status);
+
+ cnt = val_cnt;
+ } else {
+ if (pdp->pd_defval.vd_name == NULL)
+ return (DLADM_STATUS_NOTSUP);
+
+ if ((vdp = malloc(sizeof (val_desc_t))) == NULL)
+ return (DLADM_STATUS_NOMEM);
+
+ (void) memcpy(vdp, &pdp->pd_defval, sizeof (val_desc_t));
+ cnt = 1;
+ }
+
+ status = pdp->pd_set(name, vdp, cnt);
+
+ free(vdp);
+ return (status);
+}
+
+dladm_status_t
+i_dladm_set_prop_temp(const char *name, const char *prop_name, char **prop_val,
+ uint_t val_cnt, uint_t flags, char **errprop, prop_table_t *prop_tbl)
+{
+ int i;
+ dladm_status_t status = DLADM_STATUS_OK;
+ boolean_t found = B_FALSE;
+
+ for (i = 0; i < prop_tbl->pt_size; i++) {
+ fprop_desc_t *pdp = &prop_tbl->pt_table[i];
+ dladm_status_t s;
+
+ if (prop_name != NULL &&
+ (strcasecmp(prop_name, pdp->pd_name) != 0))
+ continue;
+
+ found = B_TRUE;
+ s = i_dladm_set_one_prop_temp(name, pdp, prop_val, val_cnt,
+ flags);
+
+ if (prop_name != NULL) {
+ status = s;
+ break;
+ } else {
+ if (s != DLADM_STATUS_OK &&
+ s != DLADM_STATUS_NOTSUP) {
+ if (errprop != NULL)
+ *errprop = pdp->pd_name;
+ status = s;
+ break;
+ }
+ }
+ }
+
+ if (!found)
+ status = DLADM_STATUS_NOTFOUND;
+
+ return (status);
+}
+
+boolean_t
+i_dladm_is_prop_temponly(const char *prop_name, char **errprop,
+ prop_table_t *prop_tbl)
+{
+ int i;
+
+ if (prop_name == NULL)
+ return (B_FALSE);
+
+ for (i = 0; i < prop_tbl->pt_size; i++) {
+ fprop_desc_t *pdp = &prop_tbl->pt_table[i];
+
+ if (strcasecmp(prop_name, pdp->pd_name) != 0)
+ continue;
+
+ if (errprop != NULL)
+ *errprop = pdp->pd_name;
+
+ if (pdp->pd_temponly)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+void
+dladm_free_props(dladm_arg_list_t *list)
+{
+ dladm_free_args(list);
+}
+
+dladm_status_t
+dladm_parse_props(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+	if (dladm_parse_args(str, listp, novalues) != DLADM_STATUS_OK) {
+		dladm_free_args(*listp);
+		return (DLADM_STATUS_PROP_PARSE_ERR);
+	}
+
+	return (DLADM_STATUS_OK);
+}
diff --git a/usr/src/lib/libdladm/common/usage.c b/usr/src/lib/libdladm/common/usage.c
new file mode 100644
index 0000000000..07ef7bbb22
--- /dev/null
+++ b/usr/src/lib/libdladm/common/usage.c
@@ -0,0 +1,1437 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <exacct.h>
+#include <libdladm.h>
+
+#define TIMEBUFLEN 20
+#define GBIT 1000000000
+#define MBIT 1000000
+#define KBIT 1000
+
+#define NET_RESET_TOT(tbytes, ttime, tibytes, tobytes, step) { \
+ (step) = 1; \
+ (tbytes) = 0; \
+ (ttime) = 0; \
+ (tibytes) = 0; \
+ (tobytes) = 0; \
+ }
+
+/* Flow/Link Descriptor */
+typedef struct net_desc_s {
+ char net_desc_name[LIFNAMSIZ];
+ char net_desc_devname[LIFNAMSIZ];
+ uchar_t net_desc_ehost[ETHERADDRL];
+ uchar_t net_desc_edest[ETHERADDRL];
+ ushort_t net_desc_vlan_tpid;
+ ushort_t net_desc_vlan_tci;
+ ushort_t net_desc_sap;
+ ushort_t net_desc_cpuid;
+ ushort_t net_desc_priority;
+ uint64_t net_desc_bw_limit;
+ in6_addr_t net_desc_saddr;
+ in6_addr_t net_desc_daddr;
+ boolean_t net_desc_isv4;
+ in_port_t net_desc_sport;
+ in_port_t net_desc_dport;
+ uint8_t net_desc_protocol;
+ uint8_t net_desc_dsfield;
+ boolean_t net_desc_newrec;
+} net_desc_t;
+
+/* Time structure: Year, Month, Day, Hour, Min, Sec */
+typedef struct net_time_s {
+ int net_time_yr;
+ int net_time_mon;
+ int net_time_day;
+ int net_time_hr;
+ int net_time_min;
+ int net_time_sec;
+} net_time_t;
+
+/* Flow/Link Stats */
+typedef struct net_stat_s {
+ char net_stat_name[LIFNAMSIZ];
+ uint64_t net_stat_ibytes;
+ uint64_t net_stat_obytes;
+ uint64_t net_stat_ipackets;
+ uint64_t net_stat_opackets;
+ uint64_t net_stat_ierrors;
+ uint64_t net_stat_oerrors;
+ uint64_t net_stat_tibytes;
+ uint64_t net_stat_tobytes;
+ uint64_t net_stat_tipackets;
+ uint64_t net_stat_topackets;
+ uint64_t net_stat_tierrors;
+ uint64_t net_stat_toerrors;
+ uint64_t net_stat_ctime;
+ uint64_t net_stat_tdiff;
+ net_time_t net_stat_time;
+ struct net_stat_s *net_stat_next;
+ net_desc_t *net_stat_desc;
+ boolean_t net_stat_isref;
+} net_stat_t;
+
+/* Used to create the [gnu]plot file */
+typedef struct net_plot_entry_s {
+ char *net_pe_name;
+ uint64_t net_pe_tottime;
+ uint64_t net_pe_totbytes;
+ uint64_t net_pe_totibytes;
+ uint64_t net_pe_totobytes;
+ uint64_t net_pe_lasttime;
+} net_plot_entry_t;
+
+/* Stats entry */
+typedef struct net_entry_s {
+ net_desc_t *net_entry_desc;
+ net_stat_t *net_entry_shead;
+ net_stat_t *net_entry_stail;
+ int net_entry_scount;
+ net_stat_t *net_entry_sref;
+ net_stat_t *net_entry_tstats;
+ uint64_t net_entry_ttime;
+ struct net_entry_s *net_entry_next;
+} net_entry_t;
+
+/* Time sorted list */
+typedef struct net_time_entry_s {
+ net_stat_t *my_time_stat;
+ struct net_time_entry_s *net_time_entry_next;
+ struct net_time_entry_s *net_time_entry_prev;
+} net_time_entry_t;
+
+/* The parsed table */
+typedef struct net_table_s {
+ /* List of stats */
+ net_entry_t *net_table_head;
+ net_entry_t *net_table_tail;
+ int net_entries;
+
+ /*
+ * Optimization I : List sorted by time, i.e:
+ * Time Resource ..
+ * -------------------------------
+ * 11.15.10 bge0
+ * 11.15.10 ce0
+ * 11.15.10 vnic1
+ * 11.15.15 bge0
+ * 11.15.15 ce0
+ * 11.15.15 vnic1
+ */
+ net_time_entry_t *net_time_head;
+ net_time_entry_t *net_time_tail;
+
+ /*
+ * Optimization II : List sorted by resources
+ * Time Resource ..
+ * -------------------------------
+ * 11.15.10 bge0
+ * 11.15.15 bge0
+ * 11.15.10 ce0
+ * 11.15.15 ce0
+ * 11.15.10 vnic1
+ * 11.15.15 vnic1
+ */
+ net_time_entry_t *net_ctime_head;
+ net_time_entry_t *net_ctime_tail;
+
+ /* Common to both the above (sorted) lists. */
+ int net_time_entries;
+} net_table_t;
+
+#define NET_DATE_GREATER 0
+#define NET_DATE_LESSER 1
+#define NET_DATE_EQUAL 2
+
+#define NET_TIME_GREATER 0
+#define NET_TIME_LESSER 1
+#define NET_TIME_EQUAL 2
+
+#ifndef _LP64
+#define FMT_UINT64 "%-15llu"
+#else
+#define FMT_UINT64 "%-15lu"
+#endif
+
+/*
+ * Given a timebuf of the form M/D/Y,H:M:S break it into individual elements.
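+ * For example, "12/04/2008,18:16:10" yields mon = 12, day = 4, yr = 2008,
+ * hr = 18, min = 16 and sec = 10; a two-digit year is taken as 2000-based.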
+ */
+static void
+dissect_time(char *tbuf, net_time_t *nt)
+{
+ char *d;
+ char *t;
+ char *dd;
+ char *h;
+ char *endp;
+
+ if (tbuf == NULL || nt == NULL)
+ return;
+
+ d = strtok(tbuf, ","); /* Date */
+ t = strtok(NULL, ","); /* Time */
+
+ /* Month */
+ dd = strtok(d, "/");
+ if (dd == NULL)
+ return;
+ nt->net_time_mon = strtol(dd, &endp, 10);
+
+ /* Day */
+ dd = strtok(NULL, "/");
+ if (dd == NULL)
+ return;
+ nt->net_time_day = strtol(dd, &endp, 10);
+
+ /* Year */
+ dd = strtok(NULL, "/");
+ if (dd == NULL)
+ return;
+ nt->net_time_yr = strtol(dd, &endp, 10);
+ if (strlen(dd) <= 2)
+ nt->net_time_yr += 2000;
+
+ if (t == NULL)
+ return;
+
+ /* Hour */
+ h = strtok(t, ":");
+ if (h == NULL)
+ return;
+ nt->net_time_hr = strtol(h, &endp, 10);
+
+ /* Min */
+ h = strtok(NULL, ":");
+ if (h == NULL)
+ return;
+ nt->net_time_min = strtol(h, &endp, 10);
+
+ /* Sec */
+ h = strtok(NULL, ":");
+ if (h == NULL)
+ return;
+ nt->net_time_sec = strtol(h, &endp, 10);
+}
+
+/* Get a stat item from an object in the exacct file */
+static void
+add_stat_item(ea_object_t *o, net_stat_t *ns)
+{
+ switch (o->eo_catalog & EXT_TYPE_MASK) {
+ case EXT_STRING:
+ if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_STATS_NAME) {
+			(void) strncpy(ns->net_stat_name, o->eo_item.ei_string,
+			    sizeof (ns->net_stat_name) - 1);
+ }
+ break;
+ case EXT_UINT64:
+ if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_STATS_CURTIME) {
+ time_t _time;
+ char timebuf[TIMEBUFLEN];
+
+ ns->net_stat_ctime = o->eo_item.ei_uint64;
+ _time = ns->net_stat_ctime;
+ (void) strftime(timebuf, sizeof (timebuf),
+ "%m/%d/%Y,%T\n", localtime(&_time));
+ dissect_time(timebuf, &ns->net_stat_time);
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_STATS_IBYTES) {
+ ns->net_stat_ibytes = o->eo_item.ei_uint64;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_STATS_OBYTES) {
+ ns->net_stat_obytes = o->eo_item.ei_uint64;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_STATS_IPKTS) {
+ ns->net_stat_ipackets = o->eo_item.ei_uint64;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_STATS_OPKTS) {
+ ns->net_stat_opackets = o->eo_item.ei_uint64;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_STATS_IERRPKTS) {
+ ns->net_stat_ierrors = o->eo_item.ei_uint64;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_STATS_OERRPKTS) {
+ ns->net_stat_oerrors = o->eo_item.ei_uint64;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+/* Get a description item from an object in the exacct file */
+static void
+add_desc_item(ea_object_t *o, net_desc_t *nd)
+{
+ switch (o->eo_catalog & EXT_TYPE_MASK) {
+ case EXT_STRING:
+ if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_NAME) {
+			(void) strncpy(nd->net_desc_name, o->eo_item.ei_string,
+			    sizeof (nd->net_desc_name) - 1);
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_DEVNAME) {
+			(void) strncpy(nd->net_desc_devname,
+			    o->eo_item.ei_string,
+			    sizeof (nd->net_desc_devname) - 1);
+ }
+ break;
+ case EXT_UINT8:
+ if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_PROTOCOL) {
+ nd->net_desc_protocol = o->eo_item.ei_uint8;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_DSFIELD) {
+ nd->net_desc_dsfield = o->eo_item.ei_uint8;
+ }
+ break;
+ case EXT_UINT16:
+ if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_SPORT) {
+ nd->net_desc_sport = o->eo_item.ei_uint16;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_DPORT) {
+ nd->net_desc_dport = o->eo_item.ei_uint16;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_SAP) {
+ nd->net_desc_sap = o->eo_item.ei_uint16;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_VLAN_TPID) {
+ nd->net_desc_vlan_tpid = o->eo_item.ei_uint16;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_VLAN_TCI) {
+ nd->net_desc_vlan_tci = o->eo_item.ei_uint16;
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_PRIORITY) {
+ nd->net_desc_priority = o->eo_item.ei_uint16;
+ }
+ break;
+ case EXT_UINT32:
+ if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V4SADDR ||
+ (o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V4DADDR) {
+ struct in_addr addr;
+
+ addr.s_addr = htonl(o->eo_item.ei_uint32);
+
+ if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_V4SADDR) {
+ IN6_INADDR_TO_V4MAPPED(&addr,
+ &nd->net_desc_saddr);
+ } else {
+ IN6_INADDR_TO_V4MAPPED(&addr,
+ &nd->net_desc_daddr);
+ }
+ }
+ break;
+ case EXT_UINT64:
+ if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_BWLIMIT)
+ nd->net_desc_bw_limit = o->eo_item.ei_uint64;
+ break;
+ case EXT_RAW:
+ if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V6SADDR ||
+ (o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V6DADDR) {
+ in6_addr_t addr;
+
+ addr = *(in6_addr_t *)o->eo_item.ei_raw;
+ if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_V6SADDR) {
+ nd->net_desc_saddr = addr;
+ } else {
+ nd->net_desc_daddr = addr;
+ }
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_EHOST) {
+ bcopy((uchar_t *)o->eo_item.ei_raw, nd->net_desc_ehost,
+ ETHERADDRL);
+ } else if ((o->eo_catalog & EXD_DATA_MASK) ==
+ EXD_NET_DESC_EDEST) {
+ bcopy((uchar_t *)o->eo_item.ei_raw, nd->net_desc_edest,
+ ETHERADDRL);
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+/* Add a description item to the table */
+static dladm_status_t
+add_desc_to_tbl(net_table_t *net_table, net_desc_t *nd)
+{
+ net_entry_t *ne;
+
+ if ((ne = calloc(1, sizeof (net_entry_t))) == NULL)
+ return (DLADM_STATUS_NOMEM);
+
+ if ((ne->net_entry_tstats = calloc(1, sizeof (net_stat_t))) == NULL) {
+ free(ne);
+ return (DLADM_STATUS_NOMEM);
+ }
+
+ ne->net_entry_desc = nd;
+ ne->net_entry_shead = NULL;
+ ne->net_entry_stail = NULL;
+ ne->net_entry_scount = 0;
+
+ if (net_table->net_table_head == NULL) {
+ net_table->net_table_head = ne;
+ net_table->net_table_tail = ne;
+ } else {
+ net_table->net_table_tail->net_entry_next = ne;
+ net_table->net_table_tail = ne;
+ }
+ net_table->net_entries++;
+ return (DLADM_STATUS_OK);
+}
+
+/* Compare dates and return if t1 is equal, greater or lesser than t2 */
+static int
+compare_date(net_time_t *t1, net_time_t *t2)
+{
+ if (t1->net_time_yr == t2->net_time_yr &&
+ t1->net_time_mon == t2->net_time_mon &&
+ t1->net_time_day == t2->net_time_day) {
+ return (NET_DATE_EQUAL);
+ }
+ if (t1->net_time_yr > t2->net_time_yr ||
+ (t1->net_time_yr == t2->net_time_yr &&
+ t1->net_time_mon > t2->net_time_mon) ||
+ (t1->net_time_yr == t2->net_time_yr &&
+ t1->net_time_mon == t2->net_time_mon &&
+ t1->net_time_day > t2->net_time_day)) {
+ return (NET_DATE_GREATER);
+ }
+ return (NET_DATE_LESSER);
+}
+
+/* Compare times and return if t1 is equal, greater or lesser than t2 */
+static int
+compare_time(net_time_t *t1, net_time_t *t2)
+{
+ int cd;
+
+ cd = compare_date(t1, t2);
+
+ if (cd == NET_DATE_GREATER) {
+ return (NET_TIME_GREATER);
+ } else if (cd == NET_DATE_LESSER) {
+ return (NET_TIME_LESSER);
+ } else {
+ if (t1->net_time_hr == t2->net_time_hr &&
+ t1->net_time_min == t2->net_time_min &&
+ t1->net_time_sec == t2->net_time_sec) {
+ return (NET_TIME_EQUAL);
+ }
+ if (t1->net_time_hr > t2->net_time_hr ||
+ (t1->net_time_hr == t2->net_time_hr &&
+ t1->net_time_min > t2->net_time_min) ||
+ (t1->net_time_hr == t2->net_time_hr &&
+ t1->net_time_min == t2->net_time_min &&
+ t1->net_time_sec > t2->net_time_sec)) {
+ return (NET_TIME_GREATER);
+ }
+ }
+ return (NET_TIME_LESSER);
+}
+
+/*
+ * Given a start and end time and start and end entries, check whether
+ * the times are within the range and adjust, if needed.
+ */
+static dladm_status_t
+chk_time_bound(net_time_t *s, net_time_t *e, net_time_t *sns,
+ net_time_t *ens)
+{
+ if (s != NULL && e != NULL) {
+ if (compare_time(s, e) == NET_TIME_GREATER)
+ return (DLADM_STATUS_BADTIMEVAL);
+ }
+ if (s != NULL) {
+ if (compare_time(s, sns) == NET_TIME_LESSER) {
+ s->net_time_yr = sns->net_time_yr;
+ s->net_time_mon = sns->net_time_mon;
+ s->net_time_day = sns->net_time_day;
+ s->net_time_hr = sns->net_time_hr;
+ s->net_time_min = sns->net_time_min;
+ s->net_time_sec = sns->net_time_sec;
+ }
+ }
+ if (e != NULL) {
+ if (compare_time(e, ens) == NET_TIME_GREATER) {
+ e->net_time_yr = ens->net_time_yr;
+ e->net_time_mon = ens->net_time_mon;
+ e->net_time_day = ens->net_time_day;
+ e->net_time_hr = ens->net_time_hr;
+ e->net_time_min = ens->net_time_min;
+ e->net_time_sec = ens->net_time_sec;
+ }
+ }
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Given a start and end time (strings), convert them into net_time_t
+ * and also check the range against the head and tail of the list.
+ * If stime is lower than the head or etime is greater than the tail,
+ * adjust them.
+ */
+static dladm_status_t
+get_time_range(net_time_entry_t *head, net_time_entry_t *tail,
+ net_time_t *st, net_time_t *et, char *stime, char *etime)
+{
+ bzero(st, sizeof (net_time_t));
+ bzero(et, sizeof (net_time_t));
+
+	if (stime == NULL && etime == NULL)
+		return (DLADM_STATUS_OK);
+
+ if (stime != NULL)
+ dissect_time(stime, st);
+ if (etime != NULL)
+ dissect_time(etime, et);
+
+ if (stime != NULL || etime != NULL) {
+ return (chk_time_bound(stime == NULL ? NULL : st,
+ etime == NULL ? NULL : et,
+ &head->my_time_stat->net_stat_time,
+ &tail->my_time_stat->net_stat_time));
+ }
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Walk the list from a given starting point and return when we find
+ * an entry that is greater than or equal to st. lasttime will point
+ * to the previous time entry.
+ */
+static void
+get_starting_point(net_time_entry_t *head, net_time_entry_t **start,
+ net_time_t *st, char *stime, uint64_t *lasttime)
+{
+ net_time_entry_t *next = head;
+
+ if (head == NULL) {
+ *start = NULL;
+ return;
+ }
+ if (stime == NULL) {
+ *start = head;
+ *lasttime = head->my_time_stat->net_stat_ctime;
+ return;
+ }
+ *start = NULL;
+ while (next != NULL) {
+ if (compare_time(st,
+ &next->my_time_stat->net_stat_time) != NET_TIME_LESSER) {
+ *lasttime = next->my_time_stat->net_stat_ctime;
+ next = next->net_time_entry_next;
+ continue;
+ }
+ *start = next;
+ break;
+ }
+}
+
+/*
+ * Point entry (pe) functions
+ */
+/* Clear all the counters. Done after the contents are written to the file */
+static void
+clear_pe(net_plot_entry_t *pe, int entries, int *pentries)
+{
+ int count;
+
+ for (count = 0; count < entries; count++) {
+ pe[count].net_pe_totbytes = 0;
+ pe[count].net_pe_totibytes = 0;
+ pe[count].net_pe_totobytes = 0;
+ pe[count].net_pe_tottime = 0;
+ }
+ *pentries = 0;
+}
+
+/* Update an entry in the point entry table */
+static void
+update_pe(net_plot_entry_t *pe, net_stat_t *nns, int nentries,
+ int *pentries, uint64_t lasttime)
+{
+ int count;
+
+ for (count = 0; count < nentries; count++) {
+ if ((strlen(nns->net_stat_name) ==
+ strlen(pe[count].net_pe_name)) &&
+ (strncmp(pe[count].net_pe_name, nns->net_stat_name,
+ strlen(nns->net_stat_name)) == 0)) {
+ break;
+ }
+ }
+ if (count == nentries)
+ return;
+
+ if (pe[count].net_pe_totbytes == 0)
+ pe[count].net_pe_lasttime = lasttime;
+
+ pe[count].net_pe_totbytes += nns->net_stat_ibytes +
+ nns->net_stat_obytes;
+ pe[count].net_pe_tottime += nns->net_stat_tdiff;
+ pe[count].net_pe_totibytes += nns->net_stat_ibytes;
+ pe[count].net_pe_totobytes += nns->net_stat_obytes;
+ (*pentries)++;
+}
+
+/* Flush the contents of the point entry table to the file. */
+static void
+add_pe_to_file(int (*fn)(dladm_usage_t *, void *), net_plot_entry_t *pe,
+ net_stat_t *ns, int entries, void *arg)
+{
+ int count;
+ dladm_usage_t usage;
+ uint64_t tottime;
+
+ bcopy(&ns->net_stat_ctime, &usage.du_etime, sizeof (usage.du_etime));
+ for (count = 0; count < entries; count++) {
+ bcopy(pe[count].net_pe_name, &usage.du_name,
+ sizeof (usage.du_name));
+ bcopy(&pe[count].net_pe_lasttime, &usage.du_stime,
+ sizeof (usage.du_stime));
+ usage.du_rbytes = pe[count].net_pe_totibytes;
+ usage.du_obytes = pe[count].net_pe_totobytes;
+ tottime = pe[count].net_pe_tottime;
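+		/*
+		 * Average bandwidth in bits per second: total bytes times
+		 * eight over the seconds accumulated for this entry.
+		 */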
+ usage.du_bandwidth = (tottime > 0) ?
+ ((pe[count].net_pe_totbytes * 8) / tottime) : 0;
+ usage.du_last = (count == entries-1);
+ fn(&usage, arg);
+ }
+}
+
+/*
+ * Net entry functions
+ */
+static net_entry_t *
+get_ne_from_table(net_table_t *net_table, char *name)
+{
+ int count;
+ net_desc_t *nd;
+ net_entry_t *ne = net_table->net_table_head;
+
+ for (count = 0; count < net_table->net_entries; count++) {
+ nd = ne->net_entry_desc;
+ if ((strlen(name) == strlen(nd->net_desc_name)) &&
+ (strncmp(name, nd->net_desc_name, strlen(name)) == 0)) {
+ return (ne);
+ }
+ ne = ne->net_entry_next;
+ }
+ return (NULL);
+}
+
+/* Get the entry for the descriptor, if it exists */
+static net_desc_t *
+get_ndesc(net_table_t *net_table, net_desc_t *nd)
+{
+ int count;
+ net_desc_t *nd1;
+ net_entry_t *ne = net_table->net_table_head;
+
+ for (count = 0; count < net_table->net_entries; count++) {
+ nd1 = ne->net_entry_desc;
+ if (strlen(nd1->net_desc_name) == strlen(nd->net_desc_name) &&
+ strlen(nd1->net_desc_devname) ==
+ strlen(nd->net_desc_devname) &&
+ strncmp(nd1->net_desc_name, nd->net_desc_name,
+ strlen(nd1->net_desc_name)) == 0 &&
+ strncmp(nd1->net_desc_devname, nd->net_desc_devname,
+ strlen(nd1->net_desc_devname)) == 0 &&
+ bcmp(nd1->net_desc_ehost, nd->net_desc_ehost,
+ ETHERADDRL) == 0 &&
+ bcmp(nd1->net_desc_edest, nd->net_desc_edest,
+ ETHERADDRL) == 0 &&
+ nd1->net_desc_vlan_tpid == nd->net_desc_vlan_tpid &&
+ nd1->net_desc_vlan_tci == nd->net_desc_vlan_tci &&
+ nd1->net_desc_sap == nd->net_desc_sap &&
+ nd1->net_desc_cpuid == nd->net_desc_cpuid &&
+ nd1->net_desc_priority == nd->net_desc_priority &&
+ nd1->net_desc_bw_limit == nd->net_desc_bw_limit &&
+ nd1->net_desc_sport == nd->net_desc_sport &&
+ nd1->net_desc_dport == nd->net_desc_dport &&
+ nd1->net_desc_protocol == nd->net_desc_protocol &&
+ nd1->net_desc_dsfield == nd->net_desc_dsfield &&
+ IN6_ARE_ADDR_EQUAL(&nd1->net_desc_saddr,
+ &nd->net_desc_saddr) &&
+ IN6_ARE_ADDR_EQUAL(&nd1->net_desc_daddr,
+ &nd->net_desc_daddr)) {
+ return (nd1);
+ }
+ ne = ne->net_entry_next;
+ }
+ return (NULL);
+}
+
+/*
+ * Update the stat entries. The stats in the file are cumulative, so in
+ * order to have increments we maintain a reference stat entry, which
+ * contains the stats when the record was first written, and a total stat
+ * entry, which maintains the running count. When a record is itself the
+ * reference stat entry, we don't come here. For subsequent entries, we
+ * get the increment by subtracting the reference stat and the running
+ * total from the current value.
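+ *
+ * For example, if a link's records carry cumulative byte counts of 100,
+ * 250 and 400, the first record becomes the reference (100) and the
+ * computed increments are 150 and 150 (illustrative values only).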
+ */
+static void
+update_stats(net_stat_t *ns1, net_entry_t *ne, net_stat_t *ref)
+{
+	/* get the increment */
+ ns1->net_stat_ibytes -= (ref->net_stat_ibytes + ref->net_stat_tibytes);
+ ns1->net_stat_obytes -= (ref->net_stat_obytes + ref->net_stat_tobytes);
+ ns1->net_stat_ipackets -= (ref->net_stat_ipackets +
+ ref->net_stat_tipackets);
+ ns1->net_stat_opackets -= (ref->net_stat_opackets +
+ ref->net_stat_topackets);
+ ns1->net_stat_ierrors -= (ref->net_stat_ierrors +
+ ref->net_stat_tierrors);
+ ns1->net_stat_oerrors -= (ref->net_stat_oerrors +
+ ref->net_stat_toerrors);
+
+ /* update total bytes */
+ ref->net_stat_tibytes += ns1->net_stat_ibytes;
+ ref->net_stat_tobytes += ns1->net_stat_obytes;
+ ref->net_stat_tipackets += ns1->net_stat_ipackets;
+ ref->net_stat_topackets += ns1->net_stat_opackets;
+ ref->net_stat_tierrors += ns1->net_stat_ierrors;
+ ref->net_stat_toerrors += ns1->net_stat_oerrors;
+
+ ne->net_entry_tstats->net_stat_ibytes += ns1->net_stat_ibytes;
+ ne->net_entry_tstats->net_stat_obytes += ns1->net_stat_obytes;
+ ne->net_entry_tstats->net_stat_ipackets += ns1->net_stat_ipackets;
+ ne->net_entry_tstats->net_stat_opackets += ns1->net_stat_opackets;
+ ne->net_entry_tstats->net_stat_ierrors += ns1->net_stat_ierrors;
+ ne->net_entry_tstats->net_stat_oerrors += ns1->net_stat_oerrors;
+}
+
+/* Add the stat entry into the table */
+static dladm_status_t
+add_stat_to_tbl(net_table_t *net_table, net_stat_t *ns)
+{
+ net_entry_t *ne;
+
+ ne = get_ne_from_table(net_table, ns->net_stat_name);
+ if (ne == NULL)
+ return (DLADM_STATUS_NOMEM);
+
+ /* Ptr to flow desc */
+ ns->net_stat_desc = ne->net_entry_desc;
+ if (ns->net_stat_desc->net_desc_newrec) {
+ ns->net_stat_desc->net_desc_newrec = B_FALSE;
+ ns->net_stat_isref = B_TRUE;
+ ne->net_entry_sref = ns;
+ } else if (ns->net_stat_ibytes < ne->net_entry_sref->net_stat_tibytes ||
+ (ns->net_stat_obytes < ne->net_entry_sref->net_stat_tobytes)) {
+ ns->net_stat_isref = B_TRUE;
+ ne->net_entry_sref = ns;
+ } else {
+ ns->net_stat_isref = B_FALSE;
+ update_stats(ns, ne, ne->net_entry_sref);
+ }
+ if (ne->net_entry_shead == NULL) {
+ ne->net_entry_shead = ns;
+ ne->net_entry_stail = ns;
+ } else {
+ if (!ns->net_stat_isref) {
+ ne->net_entry_ttime += (ns->net_stat_ctime -
+ ne->net_entry_stail->net_stat_ctime);
+ ns->net_stat_tdiff = ns->net_stat_ctime -
+ ne->net_entry_stail->net_stat_ctime;
+ }
+ ne->net_entry_stail->net_stat_next = ns;
+ ne->net_entry_stail = ns;
+ }
+
+ ne->net_entry_scount++;
+ return (DLADM_STATUS_OK);
+}
+
+/* Add a flow/link descriptor record to the table */
+static dladm_status_t
+add_desc(net_table_t *net_table, ea_file_t *ef, int nobjs)
+{
+ net_desc_t *nd;
+ net_desc_t *dnd;
+ int count;
+ ea_object_t scratch;
+
+ if ((nd = calloc(1, sizeof (net_desc_t))) == NULL)
+ return (DLADM_STATUS_NOMEM);
+ nd->net_desc_newrec = B_TRUE;
+
+ for (count = 0; count < nobjs; count++) {
+ if (ea_get_object(ef, &scratch) == -1) {
+ free(nd);
+ return (DLADM_STATUS_NOMEM);
+ }
+ add_desc_item(&scratch, nd);
+ }
+ if ((dnd = get_ndesc(net_table, nd)) != NULL) {
+ dnd->net_desc_newrec = B_TRUE;
+ free(nd);
+ return (DLADM_STATUS_OK);
+ }
+ if (add_desc_to_tbl(net_table, nd) != 0) {
+ free(nd);
+ return (DLADM_STATUS_NOMEM);
+ }
+ return (DLADM_STATUS_OK);
+}
+
+/* Make an entry into the time sorted list */
+static void
+addto_time_list(net_table_t *net_table, net_time_entry_t *nt,
+ net_time_entry_t *ntc)
+{
+ net_stat_t *ns = nt->my_time_stat;
+ net_stat_t *ns1;
+ net_time_entry_t *end;
+ net_time_t *t1;
+ int count;
+
+ t1 = &ns->net_stat_time;
+
+ net_table->net_time_entries++;
+
+ if (net_table->net_time_head == NULL) {
+ net_table->net_time_head = nt;
+ net_table->net_time_tail = nt;
+ } else {
+ net_table->net_time_tail->net_time_entry_next = nt;
+ nt->net_time_entry_prev = net_table->net_time_tail;
+ net_table->net_time_tail = nt;
+ }
+
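+	/*
+	 * For the per-resource (ctime) list, scan backwards from the tail
+	 * for the most recent entry with the same name and insert after it,
+	 * so that a resource's records stay together in time order.
+	 */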
+ if (net_table->net_ctime_head == NULL) {
+ net_table->net_ctime_head = ntc;
+ net_table->net_ctime_tail = ntc;
+ } else {
+ end = net_table->net_ctime_tail;
+ count = 0;
+ while (count < net_table->net_time_entries - 1) {
+ ns1 = end->my_time_stat;
+ /* Just add it to the tail */
+ if (compare_date(t1, &ns1->net_stat_time) ==
+ NET_DATE_GREATER) {
+ break;
+ }
+ if ((strlen(ns1->net_stat_name) ==
+ strlen(ns->net_stat_name)) &&
+ (strncmp(ns1->net_stat_name, ns->net_stat_name,
+ strlen(ns1->net_stat_name)) == 0)) {
+ ntc->net_time_entry_next =
+ end->net_time_entry_next;
+ if (end->net_time_entry_next != NULL) {
+ end->net_time_entry_next->
+ net_time_entry_prev = ntc;
+ } else {
+ net_table->net_ctime_tail = ntc;
+ }
+ end->net_time_entry_next = ntc;
+ ntc->net_time_entry_prev = end;
+ return;
+ }
+ count++;
+ end = end->net_time_entry_prev;
+ }
+ net_table->net_ctime_tail->net_time_entry_next = ntc;
+ ntc->net_time_entry_prev = net_table->net_ctime_tail;
+ net_table->net_ctime_tail = ntc;
+ }
+}
+
+/* Add stat entry into the lists */
+static dladm_status_t
+add_stats(net_table_t *net_table, ea_file_t *ef, int nobjs)
+{
+ net_stat_t *ns;
+ int count;
+ ea_object_t scratch;
+ net_time_entry_t *nt;
+ net_time_entry_t *ntc;
+
+ if ((ns = calloc(1, sizeof (net_stat_t))) == NULL)
+ return (DLADM_STATUS_NOMEM);
+
+ if ((nt = calloc(1, sizeof (net_time_entry_t))) == NULL) {
+ free(ns);
+ return (DLADM_STATUS_NOMEM);
+ }
+ if ((ntc = calloc(1, sizeof (net_time_entry_t))) == NULL) {
+ free(ns);
+ free(nt);
+ return (DLADM_STATUS_NOMEM);
+ }
+
+ nt->my_time_stat = ns;
+ ntc->my_time_stat = ns;
+
+ for (count = 0; count < nobjs; count++) {
+ if (ea_get_object(ef, &scratch) == -1) {
+ free(ns);
+ free(nt);
+ free(ntc);
+ return (DLADM_STATUS_NOMEM);
+ }
+ add_stat_item(&scratch, ns);
+ }
+ if (add_stat_to_tbl(net_table, ns) != 0) {
+ free(ns);
+ free(nt);
+ free(ntc);
+ return (DLADM_STATUS_NOMEM);
+ }
+ addto_time_list(net_table, nt, ntc);
+ return (DLADM_STATUS_OK);
+}
+
+/* Free the entire table */
+static void
+free_logtable(net_table_t *net_table)
+{
+ net_entry_t *head;
+ net_entry_t *next;
+ net_stat_t *ns;
+ net_stat_t *ns1;
+ net_time_entry_t *thead;
+ net_time_entry_t *tnext;
+
+ thead = net_table->net_time_head;
+ while (thead != NULL) {
+ thead->my_time_stat = NULL;
+ tnext = thead->net_time_entry_next;
+ thead->net_time_entry_next = NULL;
+ thead->net_time_entry_prev = NULL;
+ free(thead);
+ thead = tnext;
+ }
+ net_table->net_time_head = NULL;
+ net_table->net_time_tail = NULL;
+
+ thead = net_table->net_ctime_head;
+ while (thead != NULL) {
+ thead->my_time_stat = NULL;
+ tnext = thead->net_time_entry_next;
+ thead->net_time_entry_next = NULL;
+ thead->net_time_entry_prev = NULL;
+ free(thead);
+ thead = tnext;
+ }
+ net_table->net_ctime_head = NULL;
+ net_table->net_ctime_tail = NULL;
+
+ net_table->net_time_entries = 0;
+
+ head = net_table->net_table_head;
+ while (head != NULL) {
+ next = head->net_entry_next;
+ head->net_entry_next = NULL;
+ ns = head->net_entry_shead;
+ while (ns != NULL) {
+ ns1 = ns->net_stat_next;
+ free(ns);
+ ns = ns1;
+ }
+ head->net_entry_scount = 0;
+ head->net_entry_sref = NULL;
+ free(head->net_entry_desc);
+ free(head->net_entry_tstats);
+ free(head);
+ head = next;
+ }
+ net_table->net_table_head = NULL;
+ net_table->net_table_tail = NULL;
+ net_table->net_time_entries = 0;
+ free(net_table);
+}
+
+/* Parse the exacct file, and return the parsed table. */
+static void *
+parse_logfile(char *file, int logtype, dladm_status_t *status)
+{
+ ea_file_t ef;
+ ea_object_t scratch;
+ net_table_t *net_table;
+
+ *status = DLADM_STATUS_OK;
+ if ((net_table = calloc(1, sizeof (net_table_t))) == NULL) {
+ *status = DLADM_STATUS_NOMEM;
+ return (NULL);
+ }
+ if (ea_open(&ef, file, NULL, 0, O_RDONLY, 0) == -1) {
+ *status = DLADM_STATUS_BADARG;
+ free(net_table);
+ return (NULL);
+ }
+ bzero(&scratch, sizeof (ea_object_t));
+ while (ea_get_object(&ef, &scratch) != -1) {
+ if (scratch.eo_type != EO_GROUP) {
+ (void) ea_free_item(&scratch, EUP_ALLOC);
+ (void) bzero(&scratch, sizeof (ea_object_t));
+ continue;
+ }
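+		/*
+		 * The group's catalog tag identifies the record type; the
+		 * remaining eg_nobjs - 1 objects are consumed by add_desc()
+		 * or add_stats().
+		 */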
+ /* Read Link Desc/Stat records */
+ if (logtype == DLADM_LOGTYPE_FLOW) {
+ /* Flow Descriptor */
+ if ((scratch.eo_catalog &
+ EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_DESC) {
+ (void) add_desc(net_table, &ef,
+ scratch.eo_group.eg_nobjs - 1);
+ /* Flow Stats */
+ } else if ((scratch.eo_catalog &
+ EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_STATS) {
+ (void) add_stats(net_table, &ef,
+ scratch.eo_group.eg_nobjs - 1);
+ }
+ } else if (logtype == DLADM_LOGTYPE_LINK) {
+ /* Link Descriptor */
+ if ((scratch.eo_catalog &
+ EXD_DATA_MASK) == EXD_GROUP_NET_LINK_DESC) {
+ (void) add_desc(net_table, &ef,
+ scratch.eo_group.eg_nobjs - 1);
+ /* Link Stats */
+ } else if ((scratch.eo_catalog &
+ EXD_DATA_MASK) == EXD_GROUP_NET_LINK_STATS) {
+ (void) add_stats(net_table, &ef,
+ scratch.eo_group.eg_nobjs - 1);
+ }
+ } else {
+ if (((scratch.eo_catalog & EXD_DATA_MASK) ==
+ EXD_GROUP_NET_LINK_DESC) || ((scratch.eo_catalog &
+ EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_DESC)) {
+ (void) add_desc(net_table, &ef,
+ scratch.eo_group.eg_nobjs - 1);
+ } else if (((scratch.eo_catalog & EXD_DATA_MASK) ==
+ EXD_GROUP_NET_LINK_STATS) || ((scratch.eo_catalog &
+ EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_STATS)) {
+ (void) add_stats(net_table, &ef,
+ scratch.eo_group.eg_nobjs - 1);
+ }
+ }
+ (void) ea_free_item(&scratch, EUP_ALLOC);
+ (void) bzero(&scratch, sizeof (ea_object_t));
+ }
+
+ (void) ea_close(&ef);
+ return ((void *)net_table);
+}
+
+/*
+ * Walk the ctime list. This is used when looking for usage records
+ * based on a "resource" name.
+ */
+dladm_status_t
+dladm_walk_usage_res(int (*fn)(dladm_usage_t *, void *), int logtype,
+ char *logfile, char *resource, char *stime, char *etime, void *arg)
+{
+ net_table_t *net_table;
+ net_time_t st, et;
+ net_time_entry_t *start;
+ net_stat_t *ns = NULL;
+ net_stat_t *nns;
+ uint64_t tot_time = 0;
+ uint64_t last_time;
+ uint64_t tot_bytes = 0;
+ uint64_t tot_ibytes = 0;
+ uint64_t tot_obytes = 0;
+ boolean_t gotstart = B_FALSE;
+ dladm_status_t status;
+ dladm_usage_t usage;
+ int step = 1;
+
+ /* Parse the log file */
+ net_table = parse_logfile(logfile, logtype, &status);
+ if (net_table == NULL)
+ return (status);
+
+	if (net_table->net_entries == 0) {
+		free_logtable(net_table);
+		return (DLADM_STATUS_OK);
+	}
+	start = net_table->net_ctime_head;
+
+	/* Time range */
+	status = get_time_range(net_table->net_ctime_head,
+	    net_table->net_ctime_tail, &st, &et, stime, etime);
+	if (status != DLADM_STATUS_OK) {
+		free_logtable(net_table);
+		return (status);
+	}
+
+ while (start != NULL) {
+ nns = start->my_time_stat;
+
+ /* Get to the resource we are interested in */
+ if ((strlen(resource) != strlen(nns->net_stat_name)) ||
+ (strncmp(resource, nns->net_stat_name,
+ strlen(nns->net_stat_name)) != 0)) {
+ start = start->net_time_entry_next;
+ continue;
+ }
+
+ /* Find the first record */
+ if (!gotstart) {
+ get_starting_point(start, &start, &st, stime,
+ &last_time);
+ if (start == NULL)
+ break;
+ nns = start->my_time_stat;
+ gotstart = B_TRUE;
+ }
+
+ /* Write one entry and return if we are out of the range */
+ if (etime != NULL && compare_time(&nns->net_stat_time, &et)
+ == NET_TIME_GREATER) {
+ if (tot_bytes != 0) {
+ bcopy(ns->net_stat_name, &usage.du_name,
+ sizeof (usage.du_name));
+ bcopy(&last_time, &usage.du_stime,
+ sizeof (usage.du_stime));
+ bcopy(&ns->net_stat_ctime, &usage.du_etime,
+ sizeof (usage.du_etime));
+ usage.du_rbytes = tot_ibytes;
+ usage.du_obytes = tot_obytes;
+				usage.du_bandwidth = (tot_time > 0) ?
+				    (tot_bytes * 8 / tot_time) : 0;
+ usage.du_last = B_TRUE;
+ fn(&usage, arg);
+ }
+			free_logtable(net_table);
+			return (DLADM_STATUS_OK);
+ }
+
+ /*
+ * If this is a reference entry, just print what we have
+ * and proceed.
+ */
+ if (nns->net_stat_isref) {
+ if (tot_bytes != 0) {
+ bcopy(&nns->net_stat_name, &usage.du_name,
+ sizeof (usage.du_name));
+ bcopy(&nns->net_stat_ctime, &usage.du_stime,
+ sizeof (usage.du_stime));
+ usage.du_rbytes = tot_ibytes;
+ usage.du_obytes = tot_obytes;
+				usage.du_bandwidth = (tot_time > 0) ?
+				    (tot_bytes * 8 / tot_time) : 0;
+ usage.du_last = B_TRUE;
+ fn(&usage, arg);
+ NET_RESET_TOT(tot_bytes, tot_time, tot_ibytes,
+ tot_obytes, step);
+ }
+ last_time = nns->net_stat_ctime;
+ start = start->net_time_entry_next;
+ continue;
+ }
+
+ ns = nns;
+ if (--step == 0) {
+ tot_bytes += ns->net_stat_ibytes + ns->net_stat_obytes;
+ tot_ibytes += ns->net_stat_ibytes;
+ tot_obytes += ns->net_stat_obytes;
+ tot_time += ns->net_stat_tdiff;
+ bcopy(&ns->net_stat_name, &usage.du_name,
+ sizeof (usage.du_name));
+ bcopy(&last_time, &usage.du_stime,
+ sizeof (usage.du_stime));
+ bcopy(&ns->net_stat_ctime, &usage.du_etime,
+ sizeof (usage.du_etime));
+ usage.du_rbytes = tot_ibytes;
+ usage.du_obytes = tot_obytes;
+			usage.du_bandwidth = (tot_time > 0) ?
+			    (tot_bytes * 8 / tot_time) : 0;
+ usage.du_last = B_TRUE;
+ fn(&usage, arg);
+
+ NET_RESET_TOT(tot_bytes, tot_time, tot_ibytes,
+ tot_obytes, step);
+ last_time = ns->net_stat_ctime;
+ } else {
+ tot_bytes += ns->net_stat_ibytes + ns->net_stat_obytes;
+ tot_ibytes += ns->net_stat_ibytes;
+ tot_obytes += ns->net_stat_obytes;
+ tot_time += ns->net_stat_tdiff;
+ }
+ start = start->net_time_entry_next;
+ }
+
+ if (tot_bytes != 0) {
+ bcopy(&ns->net_stat_name, &usage.du_name,
+ sizeof (usage.du_name));
+ bcopy(&last_time, &usage.du_stime,
+ sizeof (usage.du_stime));
+ bcopy(&ns->net_stat_ctime, &usage.du_etime,
+ sizeof (usage.du_etime));
+ usage.du_rbytes = tot_ibytes;
+ usage.du_obytes = tot_obytes;
+		usage.du_bandwidth = (tot_time > 0) ?
+		    (tot_bytes * 8 / tot_time) : 0;
+ usage.du_last = B_TRUE;
+ fn(&usage, arg);
+ }
+
+ free_logtable(net_table);
+ return (status);
+}
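+
+/*
+ * A minimal caller sketch (illustrative only; the log path and link name
+ * below are assumptions, not fixed interfaces):
+ *
+ *	static int
+ *	print_usage(dladm_usage_t *du, void *arg)
+ *	{
+ *		(void) printf("%s: %llu rbytes, %llu obytes\n", du->du_name,
+ *		    (u_longlong_t)du->du_rbytes, (u_longlong_t)du->du_obytes);
+ *		return (0);
+ *	}
+ *
+ *	(void) dladm_walk_usage_res(print_usage, DLADM_LOGTYPE_LINK,
+ *	    "/var/log/net.exacct", "bge0", "12/04/2008,00:00:00", NULL, NULL);
+ */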
+
+/*
+ * Walk the time sorted list if a resource is not specified.
+ */
+dladm_status_t
+dladm_walk_usage_time(int (*fn)(dladm_usage_t *, void *), int logtype,
+ char *logfile, char *stime, char *etime, void *arg)
+{
+ net_table_t *net_table;
+ net_time_entry_t *start;
+ net_stat_t *ns = NULL, *nns;
+ net_time_t st, et, *t1;
+ net_desc_t *nd;
+ net_entry_t *ne;
+ net_plot_entry_t *pe;
+ int count;
+ int step = 1;
+ int nentries = 0, pentries = 0;
+ uint64_t last_time;
+ dladm_status_t status;
+
+ /* Parse the log file */
+ net_table = parse_logfile(logfile, logtype, &status);
+ if (net_table == NULL)
+ return (status);
+
+	if (net_table->net_entries == 0) {
+		free_logtable(net_table);
+		return (DLADM_STATUS_OK);
+	}
+	start = net_table->net_time_head;
+
+	/* Find the first and last records and the starting point */
+	status = get_time_range(net_table->net_time_head,
+	    net_table->net_time_tail, &st, &et, stime, etime);
+	if (status != DLADM_STATUS_OK) {
+		free_logtable(net_table);
+		return (status);
+	}
+ get_starting_point(start, &start, &st, stime, &last_time);
+	/*
+	 * start could be asserted to be non-NULL here, since
+	 * get_time_range() has already clamped the bounds.
+	 */
+	if (start == NULL) {
+		free_logtable(net_table);
+		return (DLADM_STATUS_BADTIMEVAL);
+	}
+
+ /*
+ * Collect entries for all resources in a time slot before
+ * writing to the file.
+ */
+ nentries = net_table->net_entries;
+
+	pe = malloc(sizeof (net_plot_entry_t) * (net_table->net_entries + 1));
+	if (pe == NULL) {
+		free_logtable(net_table);
+		return (DLADM_STATUS_NOMEM);
+	}
+
+ ne = net_table->net_table_head;
+ for (count = 0; count < nentries; count++) {
+ nd = ne->net_entry_desc;
+ pe[count].net_pe_name = nd->net_desc_name;
+ ne = ne->net_entry_next;
+ }
+
+ clear_pe(pe, nentries, &pentries);
+
+	/* Writing a header entry to the file is currently disabled. */
+	/* add_pe_to_file(fn, pe, ns, nentries, arg); */
+
+ t1 = &start->my_time_stat->net_stat_time;
+
+ while (start != NULL) {
+
+ nns = start->my_time_stat;
+ /*
+ * We have crossed the time boundary, check if we need to
+ * print out now.
+ */
+ if (compare_time(&nns->net_stat_time, t1) ==
+ NET_TIME_GREATER) {
+ /* return if we are out of the range */
+ if (etime != NULL &&
+ compare_time(&nns->net_stat_time, &et) ==
+ NET_TIME_GREATER) {
+ if (pentries > 0) {
+ add_pe_to_file(fn, pe, ns, nentries,
+ arg);
+ clear_pe(pe, nentries, &pentries);
+ }
+				free(pe);
+				free_logtable(net_table);
+				return (DLADM_STATUS_OK);
+ }
+ /* update the stats from the ns. */
+ t1 = &nns->net_stat_time;
+ last_time = ns->net_stat_ctime;
+ if (--step == 0) {
+ if (pentries > 0) {
+ add_pe_to_file(fn, pe, ns, nentries,
+ arg);
+ clear_pe(pe, nentries, &pentries);
+ }
+ step = 1;
+ }
+ }
+
+		/*
+		 * If this is a reference entry, just print what we have
+		 * for this resource and proceed. We will end up writing
+		 * the stats for all the entries when we hit a ref element,
+		 * which means 'steps' for some might not be accurate. That
+		 * is fine; the alternative is to write only the resource
+		 * for which we hit a reference entry.
+		 */
+ if (nns->net_stat_isref) {
+ if (pentries > 0) {
+ add_pe_to_file(fn, pe, ns, nentries, arg);
+ clear_pe(pe, nentries, &pentries);
+ }
+ step = 1;
+ } else {
+ update_pe(pe, nns, nentries, &pentries, last_time);
+ }
+ ns = nns;
+ start = start->net_time_entry_next;
+ }
+
+ if (pentries > 0)
+ add_pe_to_file(fn, pe, ns, nentries, arg);
+
+ free(pe);
+ free_logtable(net_table);
+
+ return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_usage_summary(int (*fn)(dladm_usage_t *, void *), int logtype,
+ char *logfile, void *arg)
+{
+ net_table_t *net_table;
+ net_entry_t *ne;
+ net_desc_t *nd;
+ net_stat_t *ns;
+ int count;
+ dladm_usage_t usage;
+ dladm_status_t status;
+
+ /* Parse the log file */
+ net_table = parse_logfile(logfile, logtype, &status);
+ if (net_table == NULL)
+ return (status);
+
+	if (net_table->net_entries == 0) {
+		free_logtable(net_table);
+		return (DLADM_STATUS_OK);
+	}
+
+ ne = net_table->net_table_head;
+ for (count = 0; count < net_table->net_entries; count++) {
+ ns = ne->net_entry_tstats;
+ nd = ne->net_entry_desc;
+
+		if (ns->net_stat_ibytes + ns->net_stat_obytes == 0) {
+			ne = ne->net_entry_next;
+			continue;
+		}
+ bcopy(&nd->net_desc_name, &usage.du_name,
+ sizeof (usage.du_name));
+ usage.du_duration = ne->net_entry_ttime;
+ usage.du_ipackets = ns->net_stat_ipackets;
+ usage.du_rbytes = ns->net_stat_ibytes;
+ usage.du_opackets = ns->net_stat_opackets;
+ usage.du_obytes = ns->net_stat_obytes;
+		usage.du_bandwidth = (usage.du_duration > 0) ?
+		    ((ns->net_stat_ibytes + ns->net_stat_obytes) * 8 /
+		    usage.du_duration) : 0;
+ usage.du_last = (count == net_table->net_entries-1);
+ fn(&usage, arg);
+
+ ne = ne->net_entry_next;
+ }
+
+ free_logtable(net_table);
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Walk the ctime list and display the dates of the records.
+ */
+dladm_status_t
+dladm_usage_dates(int (*fn)(dladm_usage_t *, void *), int logtype,
+ char *logfile, char *resource, void *arg)
+{
+ net_table_t *net_table;
+ net_time_entry_t *start;
+ net_stat_t *nns;
+ net_time_t st;
+ net_time_t *lasttime = NULL;
+ uint64_t last_time;
+ boolean_t gotstart = B_FALSE;
+ dladm_status_t status;
+ dladm_usage_t usage;
+
+ /* Parse the log file */
+ net_table = parse_logfile(logfile, logtype, &status);
+ if (net_table == NULL)
+ return (status);
+
+	if (net_table->net_entries == 0) {
+		free_logtable(net_table);
+		return (DLADM_STATUS_OK);
+	}
+
+ start = net_table->net_ctime_head;
+
+ while (start != NULL) {
+ nns = start->my_time_stat;
+
+ /* get to the resource we are interested in */
+ if (resource != NULL) {
+ if ((strlen(resource) != strlen(nns->net_stat_name)) ||
+ (strncmp(resource, nns->net_stat_name,
+ strlen(nns->net_stat_name)) != 0)) {
+ start = start->net_time_entry_next;
+ continue;
+ }
+ }
+
+ /* get the starting point in the logfile */
+ if (!gotstart) {
+ get_starting_point(start, &start, &st, NULL,
+ &last_time);
+ if (start == NULL)
+ break;
+ nns = start->my_time_stat;
+ gotstart = B_TRUE;
+ }
+
+ if (lasttime == NULL ||
+ compare_date(&nns->net_stat_time, lasttime) ==
+ NET_DATE_GREATER) {
+ bzero(&usage, sizeof (dladm_usage_t));
+ bcopy(&nns->net_stat_ctime, &usage.du_stime,
+ sizeof (usage.du_stime));
+ fn(&usage, arg);
+ lasttime = &nns->net_stat_time;
+ }
+
+		start = start->net_time_entry_next;
+ }
+
+ free_logtable(net_table);
+ return (status);
+}
diff --git a/usr/src/lib/libsecdb/exec_attr.txt b/usr/src/lib/libsecdb/exec_attr.txt
index ae7d769e2a..e0ef11b073 100644
--- a/usr/src/lib/libsecdb/exec_attr.txt
+++ b/usr/src/lib/libsecdb/exec_attr.txt
@@ -193,6 +193,8 @@ Network Management:solaris:cmd:::/sbin/routeadm:euid=0;\
privs=proc_chroot,proc_owner,sys_ip_config
Network Management:solaris:cmd:::/sbin/dladm:euid=dladm;egid=sys;\
privs=sys_dl_config,net_rawaccess,proc_audit
+Network Management:solaris:cmd:::/sbin/flowadm:euid=dladm;egid=sys;\
+ privs=sys_dl_config,net_rawaccess,proc_audit
Network Management:suser:cmd:::/usr/bin/netstat:uid=0
Network Management:suser:cmd:::/usr/bin/rup:euid=0
Network Management:suser:cmd:::/usr/bin/ruptime:euid=0
diff --git a/usr/src/lib/libsecdb/help/auths/Makefile b/usr/src/lib/libsecdb/help/auths/Makefile
index 8bc756895f..42d1d72c96 100644
--- a/usr/src/lib/libsecdb/help/auths/Makefile
+++ b/usr/src/lib/libsecdb/help/auths/Makefile
@@ -70,6 +70,7 @@ HTMLENTS = \
SmfExAcctFlowStates.html \
SmfExAcctProcessStates.html \
SmfExAcctTaskStates.html \
+ SmfExAcctNetStates.html \
SmfHeader.html \
SmfInetdStates.html \
SmfIPsecStates.html \
@@ -93,6 +94,7 @@ HTMLENTS = \
SmfValueExAcctFlow.html \
SmfValueExAcctProcess.html \
SmfValueExAcctTask.html \
+ SmfValueExAcctNet.html \
SmfVtStates.html \
SmfValueHeader.html \
SmfValueInetd.html \
diff --git a/usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html b/usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html
new file mode 100644
index 0000000000..e042637323
--- /dev/null
+++ b/usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html
@@ -0,0 +1,37 @@
+<HTML>
+<!--
+ CDDL HEADER START
+
+ The contents of this file are subject to the terms of the
+ Common Development and Distribution License (the "License").
+ You may not use this file except in compliance with the License.
+
+ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ or http://www.opensolaris.org/os/licensing.
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+Use is subject to license terms.
+-->
+<!--
+ <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
+-->
+<BODY>
+When Manage Net Extended Accounting Service States is in the Authorizations
+Included column, it grants the authorization to enable or disable net
+extended accounting.
+<p>
+If Manage Net Extended Accounting Service States is grayed, then you are not
+entitled to Add or Remove this authorization.
+<BR>&nbsp;
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html b/usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html
new file mode 100644
index 0000000000..52f735c4b9
--- /dev/null
+++ b/usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html
@@ -0,0 +1,35 @@
+<HTML>
+<!--
+ CDDL HEADER START
+
+ The contents of this file are subject to the terms of the
+ Common Development and Distribution License (the "License").
+ You may not use this file except in compliance with the License.
+
+ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ or http://www.opensolaris.org/os/licensing.
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+Use is subject to license terms.
+-->
+
+<BODY>
+When Change Values of Net Extended Accounting Service Properties is in the
+Authorizations Included column, it grants the authorization to change
+net extended accounting configuration parameter values.
+<P>
+If Change Values of Net Extended Accounting Service Properties is grayed,
+then you are not entitled to Add or Remove this authorization.
+<p>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/help/profiles/Makefile b/usr/src/lib/libsecdb/help/profiles/Makefile
index 37f9608f0b..0d93f0929b 100644
--- a/usr/src/lib/libsecdb/help/profiles/Makefile
+++ b/usr/src/lib/libsecdb/help/profiles/Makefile
@@ -38,6 +38,7 @@ HTMLENTS = \
RtExAcctFlow.html \
RtExAcctProcess.html \
RtExAcctTask.html \
+ RtExAcctNet.html \
RtLogMngmnt.html \
RtDeviceMngmnt.html \
RtDeviceSecurity.html \
diff --git a/usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html b/usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html
new file mode 100644
index 0000000000..25861d980e
--- /dev/null
+++ b/usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html
@@ -0,0 +1,39 @@
+<HTML>
+<!--
+ CDDL HEADER START
+
+ The contents of this file are subject to the terms of the
+ Common Development and Distribution License (the "License").
+ You may not use this file except in compliance with the License.
+
+ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ or http://www.opensolaris.org/os/licensing.
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+-- Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+-- Use is subject to license terms.
+-->
+<HEAD>
+ <TITLE> </TITLE>
+
+
+</HEAD>
+<BODY>
+When Manage the Net Extended Accounting service is in the Rights Included
+column, it grants the right to commands needed to administer net extended
+accounting.
+<p>
+If Manage the Net Extended Accounting service is grayed, then you are not
+entitled to Add or Remove this right.
+<p>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/prof_attr.txt b/usr/src/lib/libsecdb/prof_attr.txt
index 9799ec15c2..ccf8b5081f 100644
--- a/usr/src/lib/libsecdb/prof_attr.txt
+++ b/usr/src/lib/libsecdb/prof_attr.txt
@@ -44,6 +44,7 @@ DHCP Management:::Manage the DHCP service:auths=solaris.dhcpmgr.*;help=RtDHCPMng
Extended Accounting Flow Management:::Manage the Flow Extended Accounting service:auths=solaris.smf.manage.extended-accounting.flow,solaris.smf.value.extended-accounting.flow;profiles=acctadm;help=RtExActtFlow.html
Extended Accounting Process Management:::Manage the Process Extended Accounting service:auths=solaris.smf.manage.extended-accounting.process,solaris.smf.value.extended-accounting.process;profiles=acctadm;hep=RtExAcctProcess.html
Extended Accounting Task Management:::Manage the Task Extended Accounting service:auths=solaris.smf.manage.extended-accounting.task,solaris.smf.value.extended-accounting.task;profiles=acctadm;help=RtExAcctTask.html
+Extended Accounting Net Management:::Manage the Net Extended Accounting service:auths=solaris.smf.manage.extended-accounting.net,solaris.smf.value.extended-accounting.net;profiles=acctadm;help=RtExAcctNet.html
File System Management:::Manage, mount, share file systems:profiles=SMB Management,VSCAN Management,SMBFS Management;auths=solaris.smf.manage.autofs,solaris.smf.manage.shares.*,solaris.smf.value.shares.*;help=RtFileSysMngmnt.html
File System Security:::Manage file system security attributes:help=RtFileSysSecurity.html
HAL Management:::Manage HAL SMF service:auths=solaris.smf.manage.hal;help=RtHALMngmnt.html
diff --git a/usr/src/pkgdefs/SUNW0on/prototype_com b/usr/src/pkgdefs/SUNW0on/prototype_com
index 14419f0097..34c71c492a 100644
--- a/usr/src/pkgdefs/SUNW0on/prototype_com
+++ b/usr/src/pkgdefs/SUNW0on/prototype_com
@@ -242,6 +242,7 @@ f none usr/lib/help/auths/locale/SmfCronStates.html 444 root bin
f none usr/lib/help/auths/locale/SmfExAcctFlowStates.html 444 root bin
f none usr/lib/help/auths/locale/SmfExAcctProcessStates.html 444 root bin
f none usr/lib/help/auths/locale/SmfExAcctTaskStates.html 444 root bin
+f none usr/lib/help/auths/locale/SmfExAcctNetStates.html 444 root bin
f none usr/lib/help/auths/locale/SmfHeader.html 444 root bin
f none usr/lib/help/auths/locale/SmfInetdStates.html 444 root bin
f none usr/lib/help/auths/locale/SmfManageHeader.html 444 root bin
@@ -267,6 +268,7 @@ f none usr/lib/help/auths/locale/SmfValueCoreadm.html 444 root bin
f none usr/lib/help/auths/locale/SmfValueExAcctFlow.html 444 root bin
f none usr/lib/help/auths/locale/SmfValueExAcctProcess.html 444 root bin
f none usr/lib/help/auths/locale/SmfValueExAcctTask.html 444 root bin
+f none usr/lib/help/auths/locale/SmfValueExAcctNet.html 444 root bin
f none usr/lib/help/auths/locale/SmfVtStates.html 444 root bin
f none usr/lib/help/auths/locale/SmfValueHeader.html 444 root bin
f none usr/lib/help/auths/locale/SmfValueInetd.html 444 root bin
@@ -344,6 +346,7 @@ f none usr/lib/help/profiles/locale/RtDeviceMngmnt.html 444 root bin
f none usr/lib/help/profiles/locale/RtExAcctFlow.html 444 root bin
f none usr/lib/help/profiles/locale/RtExAcctProcess.html 444 root bin
f none usr/lib/help/profiles/locale/RtExAcctTask.html 444 root bin
+f none usr/lib/help/profiles/locale/RtExAcctNet.html 444 root bin
f none usr/lib/help/profiles/locale/RtPrntAdmin.html 444 root bin
f none usr/lib/help/profiles/locale/RtConsUser.html 444 root bin
f none usr/lib/help/profiles/locale/RtContractObserver.html 444 root bin
diff --git a/usr/src/pkgdefs/SUNWcnetr/postinstall b/usr/src/pkgdefs/SUNWcnetr/postinstall
index cb6ab86de9..20d09c70ee 100644
--- a/usr/src/pkgdefs/SUNWcnetr/postinstall
+++ b/usr/src/pkgdefs/SUNWcnetr/postinstall
@@ -109,6 +109,44 @@ if [ -f "${ORIG}" ]; then
removef -f $PKGINST > /dev/null 2>&1
fi
+# Convert hostname.xxx and zonecfg vlan entries
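+# A VLAN PPA encodes vid * 1000 + instance: e.g. hostname.bge5001 denotes
+# VLAN id 5 on bge1 and is converted below to
+# "dladm create-vlan -l bge1 -v 5 bge5001".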
+host_ifs=`ls -1 $rootprefix/etc | egrep -e '^hostname.|^hostname6.|^dhcp.'| \
+ cut -d . -f2 | sort -u`
+
+zones=`zoneadm list -c | grep -v global`
+for zone in $zones
+do
+ zonecfg -z $zone info ip-type | grep exclusive >/dev/null
+ if [ $? -eq 0 ]; then
+ zif=`zonecfg -z $zone info net | grep physical | \
+ nawk '{print $2}'`
+ zone_ifs="$zone_ifs $zif"
+ fi
+done
+
+ORIG=$BASEDIR/etc/dladm/datalink.conf
+for ifname in $host_ifs $zone_ifs
+do
+ grep $ifname $ORIG >/dev/null
+ if [ $? != 0 ]; then
+ phys=`echo $ifname | sed "s/[0-9]*$//"`
+ devnum=`echo $ifname | sed "s/$phys//g"`
+ if [ "$phys$devnum" != $ifname -o \
+ -n "`echo $devnum | tr -d '[0-9]'`" ]; then
+ echo "skipping invalid interface $ifname"
+ continue
+ fi
+
+ vid=`expr $devnum / 1000`
+ inst=`expr $devnum % 1000`
+
+ if [ "$vid" != "0" ]; then
+ echo dladm create-vlan -l $phys$inst -v $vid \
+ $ifname >> ${PKG_INSTALL_ROOT}/$UPGRADE_SCRIPT
+ fi
+ fi
+done
+
#
# Change permissions of public IKE certificates and CRLs
# that may have been incorrectly created as private
diff --git a/usr/src/pkgdefs/SUNWcnetr/prototype_com b/usr/src/pkgdefs/SUNWcnetr/prototype_com
index 307a2a7303..7091ec4bc5 100644
--- a/usr/src/pkgdefs/SUNWcnetr/prototype_com
+++ b/usr/src/pkgdefs/SUNWcnetr/prototype_com
@@ -22,7 +22,6 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
#
# This required package information file contains a list of package contents.
# The 'pkgmk' command uses this file to identify the contents of a package
@@ -53,6 +52,8 @@ d none etc 755 root sys
d none etc/dladm 755 dladm sys
e preserve etc/dladm/secobj.conf 600 dladm sys
e preserve etc/dladm/datalink.conf 644 dladm sys
+e preserve etc/dladm/flowadm.conf 644 dladm sys
+e preserve etc/dladm/flowprop.conf 644 dladm sys
d none etc/default 755 root sys
e dhcpagent etc/default/dhcpagent 644 root sys
e preserve etc/default/inetinit 644 root sys
@@ -74,3 +75,4 @@ e sock2path etc/inet/sock2path 444 root sys
s none etc/sock2path=./inet/sock2path
d none sbin 755 root sys
f none sbin/dladm 555 root bin
+f none sbin/flowadm 555 root bin
diff --git a/usr/src/pkgdefs/SUNWcsu/prototype_com b/usr/src/pkgdefs/SUNWcsu/prototype_com
index c3505988cb..b1021d4267 100644
--- a/usr/src/pkgdefs/SUNWcsu/prototype_com
+++ b/usr/src/pkgdefs/SUNWcsu/prototype_com
@@ -482,6 +482,7 @@ f none usr/lib/help/auths/locale/C/SmfCronStates.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfExAcctFlowStates.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfExAcctProcessStates.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfExAcctTaskStates.html 444 root bin
+f none usr/lib/help/auths/locale/C/SmfExAcctNetStates.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfHeader.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfManageHeader.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfMDNSStates.html 444 root bin
@@ -506,6 +507,7 @@ f none usr/lib/help/auths/locale/C/SmfValueCoreadm.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfValueExAcctFlow.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfValueExAcctProcess.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfValueExAcctTask.html 444 root bin
+f none usr/lib/help/auths/locale/C/SmfValueExAcctNet.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfVtStates.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfValueHeader.html 444 root bin
f none usr/lib/help/auths/locale/C/SmfValueInetd.html 444 root bin
@@ -564,6 +566,7 @@ f none usr/lib/help/profiles/locale/C/RtCryptoMngmnt.html 444 root bin
f none usr/lib/help/profiles/locale/C/RtExAcctFlow.html 444 root bin
f none usr/lib/help/profiles/locale/C/RtExAcctProcess.html 444 root bin
f none usr/lib/help/profiles/locale/C/RtExAcctTask.html 444 root bin
+f none usr/lib/help/profiles/locale/C/RtExAcctNet.html 444 root bin
f none usr/lib/help/profiles/locale/C/RtDHCPMngmnt.html 444 root bin
f none usr/lib/help/profiles/locale/C/RtDatAdmin.html 444 root bin
f none usr/lib/help/profiles/locale/C/RtDefault.html 444 root bin
@@ -683,6 +686,7 @@ f none usr/lib/rcm/modules/SUNW_ip_rcm.so 555 root bin
f none usr/lib/rcm/modules/SUNW_mpxio_rcm.so 555 root bin
f none usr/lib/rcm/modules/SUNW_network_rcm.so 555 root bin
f none usr/lib/rcm/modules/SUNW_vlan_rcm.so 555 root bin
+f none usr/lib/rcm/modules/SUNW_vnic_rcm.so 555 root bin
f none usr/lib/rcm/modules/SUNW_aggr_rcm.so 555 root bin
f none usr/lib/rcm/modules/SUNW_swap_rcm.so 555 root bin
f none usr/lib/rcm/rcm_daemon 555 root bin
@@ -828,6 +832,7 @@ s none usr/sbin/edquota=../lib/fs/ufs/edquota
f none usr/sbin/eeprom 2555 root sys
s none usr/sbin/fdisk=../../sbin/fdisk
f none usr/sbin/ff 555 root bin
+s none usr/sbin/flowadm=../../sbin/flowadm
s none usr/sbin/fiocompress=../../sbin/fiocompress
f none usr/sbin/fmthard 555 root sys
f none usr/sbin/format 555 root bin
diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_i386 b/usr/src/pkgdefs/SUNWmdb/prototype_i386
index f7620e480d..05c255e659 100644
--- a/usr/src/pkgdefs/SUNWmdb/prototype_i386
+++ b/usr/src/pkgdefs/SUNWmdb/prototype_i386
@@ -71,6 +71,7 @@ f none usr/lib/mdb/kvm/amd64/ipp.so 555 root sys
f none usr/lib/mdb/kvm/amd64/krtld.so 555 root sys
f none usr/lib/mdb/kvm/amd64/lofs.so 555 root sys
f none usr/lib/mdb/kvm/amd64/logindmux.so 555 root sys
+f none usr/lib/mdb/kvm/amd64/mac.so 555 root sys
f none usr/lib/mdb/kvm/amd64/md.so 555 root sys
f none usr/lib/mdb/kvm/amd64/mdb_kb.so 555 root sys
f none usr/lib/mdb/kvm/amd64/mdb_ks.so 555 root sys
@@ -103,6 +104,7 @@ f none usr/lib/mdb/kvm/ipp.so 555 root sys
f none usr/lib/mdb/kvm/krtld.so 555 root sys
f none usr/lib/mdb/kvm/lofs.so 555 root sys
f none usr/lib/mdb/kvm/logindmux.so 555 root sys
+f none usr/lib/mdb/kvm/mac.so 555 root sys
f none usr/lib/mdb/kvm/md.so 555 root sys
f none usr/lib/mdb/kvm/mdb_kb.so 555 root sys
f none usr/lib/mdb/kvm/mdb_ks.so 555 root sys
diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_sparc b/usr/src/pkgdefs/SUNWmdb/prototype_sparc
index 7e6878d47e..51f5c49182 100644
--- a/usr/src/pkgdefs/SUNWmdb/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWmdb/prototype_sparc
@@ -19,10 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
!include prototype_com
@@ -53,6 +52,7 @@ f none usr/lib/mdb/kvm/sparcv9/isp.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/krtld.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/lofs.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/logindmux.so 555 root sys
+f none usr/lib/mdb/kvm/sparcv9/mac.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/md.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/mdb_ks.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/mpt.so 555 root sys
diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_i386 b/usr/src/pkgdefs/SUNWmdbr/prototype_i386
index 24755c9731..237c1da83b 100644
--- a/usr/src/pkgdefs/SUNWmdbr/prototype_i386
+++ b/usr/src/pkgdefs/SUNWmdbr/prototype_i386
@@ -41,6 +41,7 @@ f none kernel/kmdb/amd64/ipp 555 root sys
f none kernel/kmdb/amd64/krtld 555 root sys
f none kernel/kmdb/amd64/lofs 555 root sys
f none kernel/kmdb/amd64/logindmux 555 root sys
+f none kernel/kmdb/amd64/mac 555 root sys
f none kernel/kmdb/amd64/md 555 root sys
f none kernel/kmdb/amd64/mdb_ds 555 root sys
f none kernel/kmdb/amd64/mpt 555 root sys
@@ -72,6 +73,7 @@ f none kernel/kmdb/ipp 555 root sys
f none kernel/kmdb/krtld 555 root sys
f none kernel/kmdb/lofs 555 root sys
f none kernel/kmdb/logindmux 555 root sys
+f none kernel/kmdb/mac 555 root sys
f none kernel/kmdb/md 555 root sys
f none kernel/kmdb/mdb_ds 555 root sys
f none kernel/kmdb/mpt 555 root sys
diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc
index 99bb424c63..b4057c2328 100644
--- a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc
@@ -19,10 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
#
!include prototype_com
@@ -43,6 +42,7 @@ f none kernel/kmdb/sparcv9/isp 555 root sys
f none kernel/kmdb/sparcv9/krtld 555 root sys
f none kernel/kmdb/sparcv9/lofs 555 root sys
f none kernel/kmdb/sparcv9/logindmux 555 root sys
+f none kernel/kmdb/sparcv9/mac 555 root sys
f none kernel/kmdb/sparcv9/md 555 root sys
f none kernel/kmdb/sparcv9/mdb_ds 555 root sys
f none kernel/kmdb/sparcv9/mpt 555 root sys
diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386
index ee2ddf8352..e7a2d79ed1 100644
--- a/usr/src/pkgdefs/etc/exception_list_i386
+++ b/usr/src/pkgdefs/etc/exception_list_i386
@@ -89,12 +89,21 @@ usr/include/sys/dld.h i386
usr/include/sys/dld_impl.h i386
usr/include/sys/dld_ioc.h i386
usr/include/sys/dls.h i386
+usr/include/sys/dls_mgmt.h i386
usr/include/sys/dls_impl.h i386
usr/include/sys/mac.h i386
+usr/include/sys/mac_client.h i386
+usr/include/sys/mac_client_impl.h i386
+usr/include/sys/mac_flow.h i386
+usr/include/sys/mac_flow_impl.h i386
usr/include/sys/mac_impl.h i386
+usr/include/sys/mac_provider.h i386
+usr/include/sys/mac_soft_ring.h i386
#
# Private GLDv3 userland libraries and headers
#
+usr/include/sys/vnic.h i386
+usr/include/sys/vnic_impl.h i386
usr/include/libdladm.h i386
usr/include/libdladm_impl.h i386
usr/include/libdllink.h i386
@@ -102,8 +111,11 @@ usr/include/libdlaggr.h i386
usr/include/libdlwlan.h i386
usr/include/libdlwlan_impl.h i386
usr/include/libdlvnic.h i386
+usr/include/libdlflow.h i386
+usr/include/libdlflow_impl.h i386
usr/include/libdlvlan.h i386
usr/include/libdlmgmt.h i386
+usr/include/libdlstat.h i386
lib/libdladm.so i386
lib/llib-ldladm.ln i386
lib/amd64/libdladm.so i386
@@ -528,6 +540,7 @@ lib/llib-lmeta.ln i386
# non-public pci header
#
usr/include/sys/pci_impl.h i386
+usr/include/sys/pci_tools.h i386
#
# Exception list for RCM project, included by librcm and rcm_daemon
#
diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc
index ece69f8eef..005ace8c07 100644
--- a/usr/src/pkgdefs/etc/exception_list_sparc
+++ b/usr/src/pkgdefs/etc/exception_list_sparc
@@ -78,21 +78,33 @@ usr/include/sys/dld.h sparc
usr/include/sys/dld_impl.h sparc
usr/include/sys/dld_ioc.h sparc
usr/include/sys/dls.h sparc
+usr/include/sys/dls_mgmt.h sparc
usr/include/sys/dls_impl.h sparc
usr/include/sys/mac.h sparc
+usr/include/sys/mac_client.h sparc
+usr/include/sys/mac_client_impl.h sparc
+usr/include/sys/mac_flow.h sparc
+usr/include/sys/mac_flow_impl.h sparc
usr/include/sys/mac_impl.h sparc
+usr/include/sys/mac_provider.h sparc
+usr/include/sys/mac_soft_ring.h sparc
#
# Private GLDv3 userland libraries and headers
#
+usr/include/sys/vnic.h sparc
+usr/include/sys/vnic_impl.h sparc
usr/include/libdladm.h sparc
usr/include/libdladm_impl.h sparc
usr/include/libdllink.h sparc
usr/include/libdlaggr.h sparc
+usr/include/libdlflow.h sparc
+usr/include/libdlflow_impl.h sparc
usr/include/libdlwlan.h sparc
usr/include/libdlwlan_impl.h sparc
usr/include/libdlvnic.h sparc
usr/include/libdlvlan.h sparc
usr/include/libdlmgmt.h sparc
+usr/include/libdlstat.h sparc
lib/libdladm.so sparc
lib/llib-ldladm.ln sparc
lib/sparcv9/libdladm.so sparc
@@ -531,6 +543,7 @@ lib/llib-lmeta.ln sparc
# non-public pci header
#
usr/include/sys/pci_impl.h sparc
+usr/include/sys/pci_tools.h sparc
#
# Exception list for RCM project, included by librcm and rcm_daemon
#
diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh
index 88404359fd..100d0e594d 100644
--- a/usr/src/tools/scripts/bfu.sh
+++ b/usr/src/tools/scripts/bfu.sh
@@ -666,7 +666,7 @@ inetd_conf_svm_hack() {
}
upgrade_aggr_and_linkprop () {
- # Since aggregation.conf and linkprop.conf are upgraded by
+ # Since aggregation.conf and linkprop.conf are upgraded by
# SUNWcnetr's postinstall script, put the relevant portions of the
# postinstall script here, modified to rename the old files instead
# of removing them.
@@ -756,6 +756,30 @@ upgrade_aggr_and_linkprop () {
fi
}
+upgrade_vlan () {
+	# Convert hostname.* and zonecfg vlan configurations
+ UPGRADE_SCRIPT=/var/svc/profile/upgrade_datalink
+
+ for ifname in $host_ifs $zone_ifs
+ do
+ phys=`echo $ifname | sed "s/[0-9]*$//"`
+ devnum=`echo $ifname | sed "s/$phys//g"`
+ if [ "$phys$devnum" != $ifname -o \
+ -n "`echo $devnum | tr -d '[0-9]'`" ]; then
+ echo "skipping invalid interface $ifname"
+ continue
+ fi
+
+ vid=`expr $devnum / 1000`
+ inst=`expr $devnum % 1000`
+
+ if [ "$vid" != "0" ]; then
+ echo dladm create-vlan -l $phys$inst -v $vid $ifname \
+ >> $rootprefix$UPGRADE_SCRIPT
+ fi
+ done
+}
+
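The vid/inst arithmetic above decodes the legacy VLAN naming convention, in
which a link's instance number encodes both pieces: devnum = vid * 1000 +
physical instance, so hostname.ce123001 becomes "dladm create-vlan -l ce1
-v 123 ce123001". A minimal userland C sketch of the same decoding (the ce
device name is only an example, not something the script assumes):

    /* Hypothetical illustration of the legacy VLAN PPA encoding. */
    #include <stdio.h>

    int
    main(void)
    {
            int devnum = 123001;            /* the "123001" in ce123001 */
            int vid = devnum / 1000;        /* VLAN ID: 123 */
            int inst = devnum % 1000;       /* physical instance: 1 */

            if (vid != 0)
                    (void) printf("dladm create-vlan -l ce%d -v %d ce%d\n",
                        inst, vid, devnum);
            return (0);
    }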
# Update aac.conf for set legacy-name-enable properly
update_aac_conf()
{
@@ -1174,6 +1198,24 @@ migrate_acctadm_conf()
svcadm enable $fmri
fi
+ fmri="svc:/system/extended-accounting:net"
+ svccfg -s $fmri setprop config/file = \
+ ${ACCTADM_NET_FILE:="none"}
+ svccfg -s $fmri setprop config/tracked = \
+ ${ACCTADM_NET_TRACKED:="none"}
+ svccfg -s $fmri setprop config/untracked = \
+ ${ACCTADM_NET_UNTRACKED:="extended"}
+ if [ ${ACCTADM_NET_ENABLE:="no"} = "yes" ]; then
+ svccfg -s $fmri setprop config/enabled = "true"
+ else
+ svccfg -s $fmri setprop config/enabled = "false"
+ fi
+ if [ $ACCTADM_NET_ENABLE = "yes" -o \
+ $ACCTADM_NET_FILE != "none" -o \
+ $ACCTADM_NET_TRACKED != "none" ]; then
+ svcadm enable $fmri
+ fi
+
rm /etc/acctadm.conf
fi
_EOF
@@ -4762,6 +4804,28 @@ then
fi
#
+ # save vlans associated with zones to be upgraded
+	# to the new dladm-based format
+ #
+ flowadm_status="old"
+ if [[ ! -f $root/sbin/flowadm ]] && \
+ archive_file_exists generic.sbin "sbin/flowadm"; then
+ flowadm_status="new"
+ host_ifs=`ls -1 $rootprefix/etc | egrep -e \
+ '^hostname.|^hostname6.|^dhcp.'| cut -d . -f2 | sort -u`
+ zones=`zoneadm list -c | grep -v global`
+ for zone in $zones
+ do
+ zonecfg -z $zone info ip-type | grep exclusive \
+ >/dev/null
+ if [ $? -eq 0 ]; then
+ zif=`zonecfg -z $zone info net | \
+ grep physical | nawk '{print $2}'`
+ zone_ifs="$zone_ifs $zif"
+ fi
+ done
+ fi
+ #
# Stop sendmail so that mail doesn't bounce during the interval
# where /etc/mail/aliases is (effectively) empty.
#
@@ -7593,6 +7657,7 @@ mondo_loop() {
#
rm -f $root/usr/lib/rcm/modules/SUNW_vlan_rcm.so
rm -f $root/usr/lib/rcm/modules/SUNW_aggr_rcm.so
+ rm -f $root/usr/lib/rcm/modules/SUNW_vnic_rcm.so
rm -f $root/kernel/drv/softmac
rm -f $root/kernel/drv/sparcv9/softmac
rm -f $root/kernel/drv/amd64/softmac
@@ -8077,6 +8142,11 @@ mondo_loop() {
fi
fi
+	# upgrade hostname- and zone-based vlans to dladm
+ if [[ $flowadm_status == "new" ]]; then
+ upgrade_vlan
+ fi
+
# The global zone needs to have its /dev/dld symlink created
# during install so that processes can access it early in boot
# before devfsadm is run.
diff --git a/usr/src/uts/common/Makefile b/usr/src/uts/common/Makefile
index 5b8f6bbc6b..7cf2f14f64 100644
--- a/usr/src/uts/common/Makefile
+++ b/usr/src/uts/common/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,10 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2002-2003 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
#
# uts/common/Makefile
#
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 2a54074941..564b2cf72e 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -174,6 +174,7 @@ GENUNIX_OBJS += \
inet_ntop.o \
instance.o \
ioctl.o \
+ ip_cksum.o \
issetugid.o \
ippconf.o \
kcpc.o \
@@ -265,6 +266,7 @@ GENUNIX_OBJS += \
sidsys.o \
sched.o \
schedctl.o \
+ sctp_crc32.o \
seg_dev.o \
seg_kp.o \
seg_kpm.o \
@@ -474,7 +476,7 @@ IP_ICMP_OBJS = icmp.o icmp_opt_data.o
IP_RTS_OBJS = rts.o rts_opt_data.o
IP_TCP_OBJS = tcp.o tcp_fusion.o tcp_kssl.o tcp_opt_data.o tcp_sack.o
IP_UDP_OBJS = udp.o udp_opt_data.o
-IP_SCTP_OBJS = sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \
+IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \
sctp_init.o sctp_input.o sctp_cookie.o \
sctp_conn.o sctp_error.o sctp_snmp.o \
sctp_param.o sctp_shutdown.o sctp_common.o \
@@ -483,7 +485,7 @@ IP_SCTP_OBJS = sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \
sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o
IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
- ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
+ ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \
@@ -560,14 +562,15 @@ CLONE_OBJS += clone.o
CN_OBJS += cons.o
-DLD_OBJS += dld_drv.o dld_proto.o dld_str.o
+DLD_OBJS += dld_drv.o dld_proto.o dld_str.o dld_flow.o
-DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_vlan.o \
- dls_soft_ring.o dls_mgmt.o
+DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_mgmt.o
GLD_OBJS += gld.o gldutil.o
-MAC_OBJS += mac.o mac_mod.o mac_stat.o mac_ndd.o
+MAC_OBJS += mac.o mac_bcast.o mac_client.o mac_datapath_setup.o mac_flow.o \
+ mac_hio.o mac_mod.o mac_ndd.o mac_provider.o mac_sched.o \
+ mac_soft_ring.o mac_stat.o mac_util.o
MAC_ETHER_OBJS += mac_ether.o
@@ -578,8 +581,6 @@ MAC_IB_OBJS += mac_ib.o
AGGR_OBJS += aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \
aggr_send.o aggr_recv.o aggr_lacp.o
-VNIC_OBJS += vnic_ctl.o vnic_dev.o vnic_bcast.o vnic_cl.o
-
SOFTMAC_OBJS += softmac_main.o softmac_ctl.o softmac_capab.o \
softmac_dev.o softmac_stat.o softmac_pkt.o
@@ -588,6 +589,8 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \
net80211_crypto_none.o net80211_crypto_wep.o net80211_ioctl.o \
net80211_crypto_tkip.o net80211_crypto_ccmp.o
+VNIC_OBJS += vnic_ctl.o vnic_dev.o
+
IB_OBJS += ibnex.o ibnex_ioctl.o
IBCM_OBJS += ibcm_impl.o ibcm_sm.o ibcm_ti.o ibcm_utils.o ibcm_path.o \
@@ -1724,18 +1727,17 @@ IXGBE_OBJS = ixgbe_82598.o ixgbe_api.o ixgbe_common.o \
#
# NIU 10G/1G driver module
#
-NXGE_OBJS = nxge_mac.o nxge_ipp.o nxge_rxdma.o \
- nxge_txdma.o nxge_txc.o nxge_main.o \
+NXGE_OBJS = nxge_mac.o nxge_ipp.o nxge_rxdma.o \
+ nxge_txdma.o nxge_txc.o nxge_main.o \
nxge_hw.o nxge_fzc.o nxge_virtual.o \
nxge_send.o nxge_classify.o nxge_fflp.o \
nxge_fflp_hash.o nxge_ndd.o nxge_kstats.o \
- nxge_zcp.o nxge_fm.o nxge_espc.o \
- nxge_serialize.o nxge_hv.o \
+ nxge_zcp.o nxge_fm.o nxge_espc.o nxge_hv.o \
nxge_hio.o nxge_hio_guest.o nxge_intr.o
NXGE_NPI_OBJS = \
- npi.o npi_mac.o npi_ipp.o \
- npi_txdma.o npi_rxdma.o npi_txc.o \
+ npi.o npi_mac.o npi_ipp.o \
+ npi_txdma.o npi_rxdma.o npi_txc.o \
npi_zcp.o npi_espc.o npi_fflp.o \
npi_vir.o
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index 09a34afa80..c7ccff8a14 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -50,6 +50,7 @@ extern "C" {
#ifdef _KERNEL
#include <netinet/ip6.h>
#include <sys/avl.h>
+#include <sys/list.h>
#include <sys/vmem.h>
#include <sys/squeue.h>
#include <net/route.h>
@@ -380,6 +381,13 @@ typedef struct ipf_s {
uint32_t ipf_checksum; /* Partial checksum of fragment data */
} ipf_t;
+/*
+ * IPv4 Fragments
+ */
+#define IS_V4_FRAGMENT(ipha_fragment_offset_and_flags) \
+ (((ntohs(ipha_fragment_offset_and_flags) & IPH_OFFSET) != 0) || \
+ ((ntohs(ipha_fragment_offset_and_flags) & IPH_MF) != 0))
+
#define ipf_src V4_PART_OF_V6(ipf_v6src)
#define ipf_dst V4_PART_OF_V6(ipf_v6dst)
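
A note on the macro just added: the IPH_MF test matters because the first
fragment of a datagram carries offset zero, so checking IPH_OFFSET alone
would miss it. A self-contained sketch of the same logic (IPH_MF 0x2000 and
IPH_OFFSET 0x1fff are the conventional bit values, restated here only so the
sketch compiles on its own):

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    #define IPH_MF      0x2000      /* "more fragments" flag */
    #define IPH_OFFSET  0x1fff      /* fragment offset mask */

    #define IS_V4_FRAGMENT(off_and_flags) \
            (((ntohs(off_and_flags) & IPH_OFFSET) != 0) || \
            ((ntohs(off_and_flags) & IPH_MF) != 0))

    int
    main(void)
    {
            uint16_t first = htons(IPH_MF); /* first fragment: offset 0, MF set */
            uint16_t whole = htons(0);      /* unfragmented datagram */

            /* Prints "first=1 whole=0". */
            (void) printf("first=%d whole=%d\n",
                IS_V4_FRAGMENT(first), IS_V4_FRAGMENT(whole));
            return (0);
    }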
@@ -1718,9 +1726,10 @@ typedef union ill_g_head_u {
#define ILL_CAPAB_MDT 0x04 /* Multidata Transmit */
#define ILL_CAPAB_HCKSUM 0x08 /* Hardware checksumming */
#define ILL_CAPAB_ZEROCOPY 0x10 /* Zero-copy */
-#define ILL_CAPAB_POLL 0x20 /* Polling Toggle */
-#define ILL_CAPAB_SOFT_RING 0x40 /* Soft_Ring capability */
-#define ILL_CAPAB_LSO 0x80 /* Large Segment Offload */
+#define ILL_CAPAB_DLD 0x20 /* DLD capabilities */
+#define ILL_CAPAB_DLD_POLL 0x40 /* Polling */
+#define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */
+#define ILL_CAPAB_DLD_LSO 0x100 /* Large Segment Offload */
/*
* Per-ill Multidata Transmit capabilities.
@@ -1743,9 +1752,9 @@ typedef struct ill_hcksum_capab_s ill_hcksum_capab_t;
typedef struct ill_zerocopy_capab_s ill_zerocopy_capab_t;
/*
- * Per-ill Polling/soft ring capbilities.
+ * DLD capabilities.
*/
-typedef struct ill_dls_capab_s ill_dls_capab_t;
+typedef struct ill_dld_capab_s ill_dld_capab_t;
/*
* Per-ill polling resource map.
@@ -1762,7 +1771,6 @@ typedef struct ill_lso_capab_s ill_lso_capab_t;
#define ILL_CONDEMNED 0x02 /* No more new ref's to the ILL */
#define ILL_CHANGING 0x04 /* ILL not globally visible */
#define ILL_DL_UNBIND_IN_PROGRESS 0x08 /* UNBIND_REQ is sent */
-#define ILL_SOFT_RING_ASSIGN 0x10 /* Making soft ring assignment */
/* Is this an ILL whose source address is used by other ILL's ? */
#define IS_USESRC_ILL(ill) \
@@ -1870,8 +1878,10 @@ typedef struct ill_s {
ill_note_link : 1, /* supports link-up notification */
ill_capab_reneg : 1, /* capability renegotiation to be done */
+ ill_dld_capab_inprog : 1, /* direct dld capab call in prog */
ill_need_recover_multicast : 1,
- ill_pad_to_bit_31 : 17;
+
+ ill_pad_to_bit_31 : 16;
/* Following bit fields protected by ill_lock */
uint_t
@@ -1883,6 +1893,7 @@ typedef struct ill_s {
ill_arp_bringup_pending : 1,
ill_mtu_userspecified : 1, /* SIOCSLIFLNKINFO has set the mtu */
ill_arp_extend : 1, /* ARP has DAD extensions */
+
ill_pad_bit_31 : 25;
/*
@@ -1903,15 +1914,17 @@ typedef struct ill_s {
/*
* Capabilities related fields.
*/
- uint_t ill_dlpi_capab_state; /* State of capability query, IDS_* */
+ uint_t ill_dlpi_capab_state; /* State of capability query, IDCS_* */
+ uint_t ill_capab_pending_cnt;
uint64_t ill_capabilities; /* Enabled capabilities, ILL_CAPAB_* */
ill_mdt_capab_t *ill_mdt_capab; /* Multidata Transmit capabilities */
ill_ipsec_capab_t *ill_ipsec_capab_ah; /* IPsec AH capabilities */
ill_ipsec_capab_t *ill_ipsec_capab_esp; /* IPsec ESP capabilities */
ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */
ill_zerocopy_capab_t *ill_zerocopy_capab; /* Zero-copy capabilities */
- ill_dls_capab_t *ill_dls_capab; /* Polling, soft ring capabilities */
- ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */
+ ill_dld_capab_t *ill_dld_capab; /* DLD capabilities */
+ ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */
+ mblk_t *ill_capab_reset_mp; /* Preallocated mblk for capab reset */
/*
* New fields for IPv6
@@ -1989,6 +2002,7 @@ typedef struct ill_s {
zoneid_t ill_zoneid;
ip_stack_t *ill_ipst; /* Corresponds to a netstack_hold */
uint32_t ill_dhcpinit; /* IP_DHCPINIT_IFs for ill */
+ void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */
uint_t ill_ilm_cnt; /* ilms referencing this ill */
uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */
} ill_t;
@@ -2069,6 +2083,7 @@ typedef struct ill_s {
* ill_type ipsq + down ill only when ill is up
* ill_dlpi_multicast_state ill_lock ill_lock
* ill_dlpi_fastpath_state ill_lock ill_lock
+ * ill_dlpi_capab_state ipsq ipsq
* ill_max_hops ipsq Not atomic
*
* ill_max_mtu
@@ -2110,6 +2125,8 @@ typedef struct ill_s {
* ill_trace ill_lock ill_lock
* ill_usesrc_grp_next ill_g_usesrc_lock ill_g_usesrc_lock
* ill_dhcpinit atomics atomics
+ * ill_flownotify_mh write once write once
+ * ill_capab_pending_cnt ipsq ipsq
*/
/*
@@ -2182,13 +2199,22 @@ typedef struct ipmx_s {
* State for detecting if a driver supports certain features.
* Support for DL_ENABMULTI_REQ uses ill_dlpi_multicast_state.
* Support for DLPI M_DATA fastpath uses ill_dlpi_fastpath_state.
- * Support for DL_CAPABILITY_REQ uses ill_dlpi_capab_state.
*/
#define IDS_UNKNOWN 0 /* No DLPI request sent */
#define IDS_INPROGRESS 1 /* DLPI request sent */
#define IDS_OK 2 /* DLPI request completed successfully */
#define IDS_FAILED 3 /* DLPI request failed */
+/* Support for DL_CAPABILITY_REQ uses ill_dlpi_capab_state. */
+enum {
+ IDCS_UNKNOWN,
+ IDCS_PROBE_SENT,
+ IDCS_OK,
+ IDCS_RESET_SENT,
+ IDCS_RENEG,
+ IDCS_FAILED
+};
+
/* Named Dispatch Parameter Management Structure */
typedef struct ipparam_s {
uint_t ip_param_min;
@@ -3165,6 +3191,8 @@ extern int ip_opt_set_ill(conn_t *, int, boolean_t, boolean_t,
extern void ip_rput(queue_t *, mblk_t *);
extern void ip_input(ill_t *, ill_rx_ring_t *, mblk_t *,
struct mac_header_info_s *);
+extern mblk_t *ip_accept_tcp(ill_t *, ill_rx_ring_t *, squeue_t *,
+ mblk_t *, mblk_t **, uint_t *cnt);
extern void ip_rput_dlpi(queue_t *, mblk_t *);
extern void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
extern void ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *);
@@ -3201,13 +3229,13 @@ extern ipaddr_t ip_net_mask(ipaddr_t);
extern void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t,
ip_stack_t *);
extern ipxmit_state_t ip_xmit_v4(mblk_t *, ire_t *, struct ipsec_out_s *,
- boolean_t);
+ boolean_t, conn_t *);
extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *);
extern struct qinit iprinitv6;
extern struct qinit ipwinitv6;
-extern void conn_drain_insert(conn_t *connp);
+extern void conn_drain_insert(conn_t *connp);
extern int conn_ipsec_length(conn_t *connp);
extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *,
ire_t *);
@@ -3437,17 +3465,22 @@ struct ill_zerocopy_capab_s {
};
struct ill_lso_capab_s {
- uint_t ill_lso_version; /* interface version */
uint_t ill_lso_on; /* on/off switch for LSO on this ILL */
uint_t ill_lso_flags; /* capabilities */
uint_t ill_lso_max; /* maximum size of payload */
};
-/* Possible ill_states */
-#define ILL_RING_INPROC 3 /* Being assigned to squeue */
-#define ILL_RING_INUSE 2 /* Already Assigned to Rx Ring */
-#define ILL_RING_BEING_FREED 1 /* Being Unassigned */
-#define ILL_RING_FREE 0 /* Available to be assigned to Ring */
+/*
+ * rr_ring_state cycles in the order shown below from RR_FREE through
+ * RR_FREE_INPROG and back to RR_FREE.
+ */
+typedef enum {
+ RR_FREE, /* Free slot */
+ RR_SQUEUE_UNBOUND, /* Ring's squeue is unbound */
+ RR_SQUEUE_BIND_INPROG, /* Ring's squeue bind in progress */
+ RR_SQUEUE_BOUND, /* Ring's squeue bound to cpu */
+ RR_FREE_INPROG /* Ring is being freed */
+} ip_ring_state_t;
#define ILL_MAX_RINGS 256 /* Max num of rx rings we can manage */
#define ILL_POLLING 0x01 /* Polling in use */
@@ -3457,73 +3490,92 @@ struct ill_lso_capab_s {
* we need to duplicate the definitions here because we cannot
* include mac/dls header files here.
*/
-typedef void (*ip_mac_blank_t)(void *, time_t, uint_t);
-typedef void (*ip_dld_tx_t)(void *, mblk_t *);
+typedef void *ip_mac_tx_cookie_t;
+typedef void (*ip_mac_intr_disable_t)(void *);
+typedef void (*ip_mac_intr_enable_t)(void *);
+typedef void *(*ip_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t);
+typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t);
+typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *);
+typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t);
-typedef void (*ip_dls_chg_soft_ring_t)(void *, int);
-typedef void (*ip_dls_bind_t)(void *, processorid_t);
-typedef void (*ip_dls_unbind_t)(void *);
+/*
+ * POLLING README
+ * sq_get_pkts() is called to pick packets from softring in poll mode. It
+ * calls rr_rx to get the chain and process it with rr_ip_accept.
+ * rr_rx = mac_soft_ring_poll() to pick packets
+ * rr_ip_accept = ip_accept_tcp() to process packets
+ */
+/*
+ * XXX: With protocol- and service-specific squeues, each will have
+ * its own acceptor function.
+ */
+typedef mblk_t *(*ip_mac_rx_t)(void *, size_t);
+typedef mblk_t *(*ip_accept_t)(ill_t *, ill_rx_ring_t *,
+ squeue_t *, mblk_t *, mblk_t **, uint_t *);
+
+/*
+ * rr_intr_enable, rr_intr_disable, rr_rx_handle, rr_rx:
+ * May be accessed while in the squeue AND after checking that SQS_POLL_CAPAB
+ * is set.
+ *
+ * rr_ring_state: Protected by ill_lock.
+ */
struct ill_rx_ring {
- ip_mac_blank_t rr_blank; /* Driver interrupt blanking func */
- void *rr_handle; /* Handle for Rx ring */
+ ip_mac_intr_disable_t rr_intr_disable; /* Interrupt disabling func */
+ ip_mac_intr_enable_t rr_intr_enable; /* Interrupt enabling func */
+ void *rr_intr_handle; /* Handle interrupt funcs */
+ ip_mac_rx_t rr_rx; /* Driver receive function */
+ ip_accept_t rr_ip_accept; /* IP accept function */
+ void *rr_rx_handle; /* Handle for Rx ring */
squeue_t *rr_sqp; /* Squeue the ring is bound to */
- ill_t *rr_ill; /* back pointer to ill */
- clock_t rr_poll_time; /* Last lbolt polling was used */
- uint32_t rr_poll_state; /* polling state flags */
- uint32_t rr_max_blank_time; /* Max interrupt blank */
- uint32_t rr_min_blank_time; /* Min interrupt blank */
- uint32_t rr_max_pkt_cnt; /* Max pkts before interrupt */
- uint32_t rr_min_pkt_cnt; /* Mix pkts before interrupt */
- uint32_t rr_normal_blank_time; /* Normal intr freq */
- uint32_t rr_normal_pkt_cnt; /* Normal intr pkt cnt */
- uint32_t rr_ring_state; /* State of this ring */
+ ill_t *rr_ill; /* back pointer to ill */
+ ip_ring_state_t rr_ring_state; /* State of this ring */
};
-struct ill_dls_capab_s {
- ip_dld_tx_t ill_tx; /* Driver Tx routine */
- void *ill_tx_handle; /* Driver Tx handle */
- ip_dls_chg_soft_ring_t ill_dls_change_status;
- /* change soft ring fanout */
- ip_dls_bind_t ill_dls_bind; /* to add CPU affinity */
- ip_dls_unbind_t ill_dls_unbind; /* remove CPU affinity */
- ill_rx_ring_t *ill_ring_tbl; /* Ring to Sqp mapping table */
- uint_t ill_dls_soft_ring_cnt; /* Number of soft ring */
- conn_t *ill_unbind_conn; /* Conn used during unplumb */
+/*
+ * IP - DLD direct function call capability
+ * Suffixes: df - dld function, dh - dld handle,
+ * cf - client (IP) function, ch - client handle
+ */
+typedef struct ill_dld_direct_s { /* DLD provided driver Tx */
+ ip_dld_tx_t idd_tx_df; /* str_mdata_fastpath_put */
+ void *idd_tx_dh; /* dld_str_t *dsp */
+ ip_dld_callb_t idd_tx_cb_df; /* mac_tx_srs_notify */
+ void *idd_tx_cb_dh; /* mac_client_handle_t *mch */
+} ill_dld_direct_t;
+
+/* IP - DLD polling capability */
+typedef struct ill_dld_poll_s {
+ ill_rx_ring_t idp_ring_tbl[ILL_MAX_RINGS];
+} ill_dld_poll_t;
+
+/* Describes ill->ill_dld_capab */
+struct ill_dld_capab_s {
+ ip_capab_func_t idc_capab_df; /* dld_capab_func */
+ void *idc_capab_dh; /* dld_str_t *dsp */
+ ill_dld_direct_t idc_direct;
+ ill_dld_poll_t idc_poll;
};
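
To make the shape of this interface concrete: ill_dld_capab_s bundles the
dld-provided entry point (idc_capab_df, dld_capab_func per the comment) with
per-capability state, and IP toggles a capability through that entry point.
A hedged sketch only; DLD_CAPAB_DIRECT appears later in this patch, but
DLD_ENABLE and dld_capab_direct_t are assumptions about the dld side:

    static int
    ill_capab_direct_enable_sketch(struct ill_dld_capab_s *idc)
    {
            dld_capab_direct_t direct;      /* assumed dld-side type */

            /* (handle, capability, argument, flags), per ip_capab_func_t */
            return ((*idc->idc_capab_df)(idc->idc_capab_dh,
                DLD_CAPAB_DIRECT, &direct, DLD_ENABLE));
    }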
/*
* IP squeues exports
*/
-extern int ip_squeue_profile;
-extern int ip_squeue_bind;
extern boolean_t ip_squeue_fanout;
-extern boolean_t ip_squeue_soft_ring;
-extern uint_t ip_threads_per_cpu;
-extern uint_t ip_squeues_per_cpu;
-extern uint_t ip_soft_rings_cnt;
-
-typedef struct squeue_set_s {
- kmutex_t sqs_lock;
- struct squeue_s **sqs_list;
- int sqs_size;
- int sqs_max_size;
- processorid_t sqs_bind;
-} squeue_set_t;
-
-#define IP_SQUEUE_GET(hint) \
- ((!ip_squeue_fanout) ? (CPU->cpu_squeue_set->sqs_list[0]) : \
- ip_squeue_random(hint))
-typedef void (*squeue_func_t)(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t);
+#define IP_SQUEUE_GET(hint) ip_squeue_random(hint)
extern void ip_squeue_init(void (*)(squeue_t *));
extern squeue_t *ip_squeue_random(uint_t);
extern squeue_t *ip_squeue_get(ill_rx_ring_t *);
-extern int ip_squeue_bind_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
+extern squeue_t *ip_squeue_getfree(pri_t);
+extern int ip_squeue_cpu_move(squeue_t *, processorid_t);
+extern void *ip_squeue_add_ring(ill_t *, void *);
+extern void ip_squeue_bind_ring(ill_t *, ill_rx_ring_t *, processorid_t);
+extern void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);
+extern void ip_squeue_quiesce_ring(ill_t *, ill_rx_ring_t *);
+extern void ip_squeue_restart_ring(ill_t *, ill_rx_ring_t *);
extern void ip_squeue_clean_all(ill_t *);
-extern void ip_soft_ring_assignment(ill_t *, ill_rx_ring_t *,
- mblk_t *, struct mac_header_info_s *);
extern void ip_resume_tcp_bind(void *, mblk_t *, void *);
extern void tcp_wput(queue_t *, mblk_t *);
@@ -3580,6 +3632,9 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
#define SQTAG_TCP_KSSL_INPUT 36
#define SQTAG_TCP_DROP_Q0 37
#define SQTAG_TCP_CONN_REQ_2 38
+#define SQTAG_IP_INPUT_RX_RING 39
+#define SQTAG_SQUEUE_CHANGE 40
+#define SQTAG_CONNECT_FINISH 41
#define NOT_OVER_IP(ip_wq) \
(ip_wq->q_next != NULL || \
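
Tying the new ip.h pieces together: per the POLLING README above, a squeue
in poll mode pulls a chain from the soft ring via rr_rx and processes it
inline via rr_ip_accept. A hedged sketch using only the ill_rx_ring fields
defined in this header (sq_get_pkts itself lives in the squeue code and may
differ in detail):

    static mblk_t *
    sq_get_pkts_sketch(ill_rx_ring_t *ring, squeue_t *sqp, size_t bytes)
    {
            mblk_t *chain, *tail;
            uint_t cnt;

            /* rr_rx is mac_soft_ring_poll() underneath. */
            chain = ring->rr_rx(ring->rr_rx_handle, bytes);
            if (chain == NULL)
                    return (NULL);

            /* rr_ip_accept is ip_accept_tcp(); leftovers re-enter ip_input(). */
            return (ring->rr_ip_accept(ring->rr_ill, ring, sqp, chain,
                &tail, &cnt));
    }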
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 553a975c54..90cc6a51d5 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -24,9 +24,6 @@
*/
/* Copyright (c) 1990 Mentat Inc. */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
@@ -4331,8 +4328,7 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop)
}
mblk_setcred(mp, connp->conn_cred);
- ip_output_options(connp, mp, q, IP_WPUT,
- &optinfo);
+ ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
}
static boolean_t
diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c
index ecfafc5e51..091509c71e 100644
--- a/usr/src/uts/common/inet/ip/igmp.c
+++ b/usr/src/uts/common/inet/ip/igmp.c
@@ -24,8 +24,6 @@
*/
/* Copyright (c) 1990 Mentat Inc. */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Internet Group Management Protocol (IGMP) routines.
* Multicast Listener Discovery Protocol (MLD) routines.
@@ -1439,7 +1437,7 @@ igmp_timeout_handler(void *arg)
if (!ill_waiter_inc(ill))
continue;
rw_exit(&ipst->ips_ill_g_lock);
- success = ipsq_enter(ill, B_TRUE);
+ success = ipsq_enter(ill, B_TRUE, NEW_OP);
if (success) {
next = igmp_timeout_handler_per_ill(ill);
if (next < global_next)
@@ -1682,7 +1680,7 @@ mld_timeout_handler(void *arg)
if (!ill_waiter_inc(ill))
continue;
rw_exit(&ipst->ips_ill_g_lock);
- success = ipsq_enter(ill, B_TRUE);
+ success = ipsq_enter(ill, B_TRUE, NEW_OP);
if (success) {
next = mld_timeout_handler_per_ill(ill);
if (next < global_next)
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index 5eb9a7e1d2..b0eaa51983 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -46,6 +46,7 @@
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/priv.h>
+#include <sys/taskq.h>
#include <sys/systm.h>
#include <sys/param.h>
@@ -125,16 +126,17 @@
#include <sys/tsol/tnet.h>
#include <rpc/pmap_prot.h>
+#include <sys/squeue_impl.h>
/*
* Values for squeue switch:
- * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain
- * IP_SQUEUE_ENTER: squeue_enter
- * IP_SQUEUE_FILL: squeue_fill
+ * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
+ * IP_SQUEUE_ENTER: SQ_PROCESS
+ * IP_SQUEUE_FILL: SQ_FILL
*/
int ip_squeue_enter = 2; /* Setable in /etc/system */
-squeue_func_t ip_input_proc;
+int ip_squeue_flag;
#define SET_BPREV_FLAG(x) ((mblk_t *)(uintptr_t)(x))
/*
@@ -391,6 +393,11 @@ void (*cl_inet_idlesa)(uint8_t, uint32_t, sa_family_t, in6_addr_t,
* gcgrp_rwlock -> ire_lock
* gcgrp_rwlock -> gcdb_lock
*
+ * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
+ *
+ * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
+ * sq_lock -> conn_lock -> QLOCK(q)
+ * ill_lock -> ft_lock -> fe_lock
*
* Routing/forwarding table locking notes:
*
@@ -730,7 +737,7 @@ static boolean_t ip_source_route_included(ipha_t *);
static void ip_trash_ire_reclaim_stack(ip_stack_t *);
static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t,
- zoneid_t, ip_stack_t *);
+ zoneid_t, ip_stack_t *, conn_t *);
static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *);
static void ip_wput_local_options(ipha_t *, ip_stack_t *);
static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t,
@@ -763,17 +770,13 @@ static void ip_multirt_bad_mtu(ire_t *, uint32_t);
static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
caddr_t, cred_t *);
-extern int ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
- caddr_t cp, cred_t *cr);
-extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t,
- cred_t *);
static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
cred_t *);
static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t,
cred_t *);
-static squeue_func_t ip_squeue_switch(int);
+static int ip_squeue_switch(int);
static void *ip_kstat_init(netstackid_t, ip_stack_t *);
static void ip_kstat_fini(netstackid_t, kstat_t *);
@@ -790,7 +793,7 @@ static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t,
ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *);
static void ip_rput_process_forward(queue_t *, mblk_t *, ire_t *,
- ipha_t *, ill_t *, boolean_t);
+ ipha_t *, ill_t *, boolean_t, boolean_t);
static void ipobs_init(ip_stack_t *);
static void ipobs_fini(ip_stack_t *);
@@ -934,20 +937,14 @@ static ipndp_t lcl_ndp_arr[] = {
"ip_rput_pullups" },
{ ip_srcid_report, NULL, NULL,
"ip_srcid_status" },
- { ip_param_generic_get, ip_squeue_profile_set,
- (caddr_t)&ip_squeue_profile, "ip_squeue_profile" },
- { ip_param_generic_get, ip_squeue_bind_set,
- (caddr_t)&ip_squeue_bind, "ip_squeue_bind" },
{ ip_param_generic_get, ip_input_proc_set,
(caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
{ ip_param_generic_get, ip_int_set,
(caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
-#define IPNDP_CGTP_FILTER_OFFSET 11
+#define IPNDP_CGTP_FILTER_OFFSET 9
{ ip_cgtp_filter_get, ip_cgtp_filter_set, NULL,
"ip_cgtp_filter" },
- { ip_param_generic_get, ip_int_set,
- (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" },
-#define IPNDP_IPMP_HOOK_OFFSET 13
+#define IPNDP_IPMP_HOOK_OFFSET 10
{ ip_param_generic_get, ipmp_hook_emulation_set, NULL,
"ipmp_hook_emulation" },
{ ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
@@ -2564,8 +2561,8 @@ icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
/* Have to change db_type after any pullupmsg */
DB_TYPE(mp) = M_CTL;
- squeue_fill(connp->conn_sqp, first_mp, tcp_input,
- connp, SQTAG_TCP_INPUT_ICMP_ERR);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp,
+ SQ_FILL, SQTAG_TCP_INPUT_ICMP_ERR);
return;
case IPPROTO_SCTP:
@@ -5367,34 +5364,13 @@ ip_modclose(ill_t *ill)
ipif_t *ipif;
queue_t *q = ill->ill_rq;
ip_stack_t *ipst = ill->ill_ipst;
- clock_t timeout;
-
- /*
- * Wait for the ACKs of all deferred control messages to be processed.
- * In particular, we wait for a potential capability reset initiated
- * in ip_sioctl_plink() to complete before proceeding.
- *
- * Note: we wait for at most ip_modclose_ackwait_ms (by default 3000 ms)
- * in case the driver never replies.
- */
- timeout = lbolt + MSEC_TO_TICK(ip_modclose_ackwait_ms);
- mutex_enter(&ill->ill_lock);
- while (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
- if (cv_timedwait(&ill->ill_cv, &ill->ill_lock, timeout) < 0) {
- /* Timeout */
- break;
- }
- }
- mutex_exit(&ill->ill_lock);
/*
- * Forcibly enter the ipsq after some delay. This is to take
- * care of the case when some ioctl does not complete because
- * we sent a control message to the driver and it did not
- * send us a reply. We want to be able to at least unplumb
- * and replumb rather than force the user to reboot the system.
+ * The punlink prior to this may have initiated a capability
+ * negotiation. But ipsq_enter will block until that finishes or
+ * times out.
*/
- success = ipsq_enter(ill, B_FALSE);
+ success = ipsq_enter(ill, B_FALSE, NEW_OP);
/*
* Open/close/push/pop is guaranteed to be single threaded
@@ -5661,33 +5637,6 @@ ip_conn_input(void *arg1, mblk_t *mp, void *arg2)
putnext(connp->conn_rq, mp);
}
-/* Return the IP checksum for the IP header at "iph". */
-uint16_t
-ip_csum_hdr(ipha_t *ipha)
-{
- uint16_t *uph;
- uint32_t sum;
- int opt_len;
-
- opt_len = (ipha->ipha_version_and_hdr_length & 0xF) -
- IP_SIMPLE_HDR_LENGTH_IN_WORDS;
- uph = (uint16_t *)ipha;
- sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
- uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
- if (opt_len > 0) {
- do {
- sum += uph[10];
- sum += uph[11];
- uph += 2;
- } while (--opt_len);
- }
- sum = (sum & 0xFFFF) + (sum >> 16);
- sum = ~(sum + (sum >> 16)) & 0xFFFF;
- if (sum == 0xffff)
- sum = 0;
- return ((uint16_t)sum);
-}
-
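
ip_csum_hdr() is deleted from ip.c rather than lost: per the Makefile.files
hunk earlier in this patch, ip_cksum.o moves out of IP_OBJS into
GENUNIX_OBJS, so the checksum code is hoisted into genunix. For readers
following the ones-complement arithmetic, a runnable rendition for an
option-less (20-byte) header; with the checksum word zeroed, the classic
sample header below should yield 0xb861:

    #include <stdio.h>
    #include <stdint.h>

    /* Ones-complement checksum of a 20-byte (option-less) IPv4 header. */
    static uint16_t
    csum_hdr_demo(const uint16_t *uph)
    {
            uint32_t sum = 0;
            int i;

            for (i = 0; i < 10; i++)
                    sum += uph[i];
            sum = (sum & 0xFFFF) + (sum >> 16);     /* fold the carries */
            sum = ~(sum + (sum >> 16)) & 0xFFFF;    /* fold again, complement */
            return (sum == 0xffff ? 0 : (uint16_t)sum);
    }

    int
    main(void)
    {
            uint16_t hdr[10] = { 0x4500, 0x0073, 0x0000, 0x4000, 0x4011,
                0x0000, 0xc0a8, 0x0001, 0xc0a8, 0x00c7 };

            (void) printf("checksum: 0x%04x\n", csum_hdr_demo(hdr));
            return (0);
    }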
/*
* Called when the module is about to be unloaded
*/
@@ -5741,6 +5690,11 @@ ip_stack_shutdown(netstackid_t stackid, void *arg)
*/
ipv4_hook_shutdown(ipst);
ipv6_hook_shutdown(ipst);
+
+ mutex_enter(&ipst->ips_capab_taskq_lock);
+ ipst->ips_capab_taskq_quit = B_TRUE;
+ cv_signal(&ipst->ips_capab_taskq_cv);
+ mutex_exit(&ipst->ips_capab_taskq_lock);
}
/*
@@ -5761,6 +5715,10 @@ ip_stack_fini(netstackid_t stackid, void *arg)
ipv6_hook_destroy(ipst);
ip_net_destroy(ipst);
+ mutex_destroy(&ipst->ips_capab_taskq_lock);
+ cv_destroy(&ipst->ips_capab_taskq_cv);
+ list_destroy(&ipst->ips_capab_taskq_list);
+
#ifdef NS_DEBUG
printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
#endif
@@ -5882,7 +5840,7 @@ ip_thread_exit(void *phash)
void
ip_ddi_init(void)
{
- ip_input_proc = ip_squeue_switch(ip_squeue_enter);
+ ip_squeue_flag = ip_squeue_switch(ip_squeue_enter);
/*
* For IP and TCP the minor numbers should start from 2 since we have 4
@@ -6043,6 +6001,16 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
ipv4_hook_init(ipst);
ipv6_hook_init(ipst);
+ /*
+ * Create the taskq dispatcher thread and initialize related stuff.
+ */
+ ipst->ips_capab_taskq_thread = thread_create(NULL, 0,
+ ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri);
+ mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL);
+ list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t),
+ offsetof(mblk_t, b_next));
+
return (ipst);
}
@@ -6839,8 +6807,8 @@ ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha,
BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers);
if (IPCL_IS_TCP(connp)) {
/* do not drain, certain use cases can blow the stack */
- squeue_enter_nodrain(connp->conn_sqp, first_mp,
- connp->conn_recv, connp, SQTAG_IP_FANOUT_TCP);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv,
+ connp, ip_squeue_flag, SQTAG_IP_FANOUT_TCP);
} else {
/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
(connp->conn_recv)(connp, first_mp, NULL);
@@ -7016,9 +6984,10 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp,
if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) {
first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha,
NULL, mctl_present);
+ /* Freed by ipsec_check_inbound_policy(). */
if (first_mp == NULL) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return; /* Freed by ipsec_check_inbound_policy(). */
+ return;
}
}
if (mctl_present)
@@ -9832,6 +9801,9 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
netstack_rele(ipst->ips_netstack);
connp->conn_zoneid = zoneid;
+ connp->conn_sqp = NULL;
+ connp->conn_initial_sqp = NULL;
+ connp->conn_final_sqp = NULL;
connp->conn_upq = q;
q->q_ptr = WR(q)->q_ptr = connp;
@@ -12977,6 +12949,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
mblk_t *mp1;
boolean_t syn_present = B_FALSE;
tcph_t *tcph;
+ uint_t tcph_flags;
uint_t ip_hdr_len;
ill_t *ill = (ill_t *)q->q_ptr;
zoneid_t zoneid = ire->ire_zoneid;
@@ -13121,6 +13094,9 @@ try_again:
goto no_conn;
}
+ tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
+ tcph_flags = tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG);
+
/*
* TCP FAST PATH for AF_INET socket.
*
@@ -13138,12 +13114,17 @@ try_again:
!IPP_ENABLED(IPP_LOCAL_IN, ipst)) {
ASSERT(first_mp == mp);
BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- SET_SQUEUE(mp, tcp_rput_data, connp);
+ if (tcph_flags != (TH_SYN | TH_ACK)) {
+ SET_SQUEUE(mp, tcp_rput_data, connp);
+ return (mp);
+ }
+ mp->b_datap->db_struioflag |= STRUIO_CONNECT;
+ DB_CKSUMSTART(mp) = (intptr_t)ip_squeue_get(ill_ring);
+ SET_SQUEUE(mp, tcp_input, connp);
return (mp);
}
- tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
- if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
+ if (tcph_flags == TH_SYN) {
if (IPCL_IS_TCP(connp)) {
mp->b_datap->db_struioflag |= STRUIO_EAGER;
DB_CKSUMSTART(mp) =
@@ -13165,7 +13146,6 @@ try_again:
}
syn_present = B_TRUE;
}
-
}
if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) {
@@ -13903,6 +13883,12 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
return (NULL);
}
+/*
+ * This is the fast forward path. If we are here, we don't need to
+ * worry about RSVP, CGTP, or TSol. Furthermore, the ftable lookup
+ * needed to find the nexthop in this case is much simpler.
+ */
ire_t *
ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
{
@@ -13928,6 +13914,12 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
*/
ire_refrele(ire);
ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst);
+ /*
+	 * ire_cache_lookup() can return an ire of type IRE_LOCAL in
+	 * transient cases. In such a case, just drop the packet.
+ */
+ if (ire->ire_type != IRE_CACHE)
+ goto drop;
}
/*
@@ -13952,8 +13944,8 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
/* No ire cache of nexthop. So first create one */
if (ire == NULL) {
- ire = ire_forward(dst, &ret_action, NULL, NULL,
- NULL, ipst);
+ ire = ire_forward_simple(dst, &ret_action, ipst);
+
/*
* We only come to ip_fast_forward if ip_cgtp_filter
* is not set. So ire_forward() should not return with
@@ -14001,7 +13993,6 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
pkt_len = ntohs(ipha->ipha_length);
stq_ill = (ill_t *)ire->ire_stq->q_ptr;
if (!(stq_ill->ill_flags & ILLF_ROUTER) ||
- !(ill->ill_flags & ILLF_ROUTER) ||
(ill == stq_ill) ||
(ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) ||
(ire->ire_nce == NULL) ||
@@ -14010,7 +14001,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) ||
ipha->ipha_ttl <= 1) {
ip_rput_process_forward(ill->ill_rq, mp, ire,
- ipha, ill, B_FALSE);
+ ipha, ill, B_FALSE, B_TRUE);
return (ire);
}
BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
@@ -14048,34 +14039,33 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits);
UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len);
- dev_q = ire->ire_stq->q_next;
- if ((dev_q->q_next != NULL || dev_q->q_first != NULL) &&
- !canputnext(ire->ire_stq)) {
- goto indiscard;
+ if (!ILL_DIRECT_CAPABLE(stq_ill) || DB_TYPE(mp) != M_DATA) {
+ dev_q = ire->ire_stq->q_next;
+ if (DEV_Q_FLOW_BLOCKED(dev_q))
+ goto indiscard;
}
- if (ILL_DLS_CAPABLE(stq_ill)) {
- /*
- * Send the packet directly to DLD, where it
- * may be queued depending on the availability
- * of transmit resources at the media layer.
- */
- IP_DLS_ILL_TX(stq_ill, ipha, mp, ipst, hlen);
- } else {
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, stq_ill,
- ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, stq_ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
- if (mp == NULL)
- goto drop;
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
- ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha,
- ip6_t *, NULL, int, 0);
+ DTRACE_PROBE4(ip4__physical__out__start,
+ ill_t *, NULL, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp);
+ FW_HOOKS(ipst->ips_ip4_physical_out_event,
+ ipst->ips_ipv4firewall_physical_out,
+ NULL, stq_ill, ipha, mp, mp, 0, ipst);
+ DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
+ DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha,
+ ip6_t *, NULL, int, 0);
- putnext(ire->ire_stq, mp);
+ if (mp != NULL) {
+ if (ipst->ips_ipobs_enabled) {
+ zoneid_t szone;
+
+ szone = ip_get_zoneid_v4(ipha->ipha_src, mp,
+ ipst, ALL_ZONES);
+ ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
+ ALL_ZONES, ill, IPV4_VERSION, hlen, ipst);
+ }
+
+ ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC);
}
return (ire);
@@ -14096,7 +14086,7 @@ drop:
static void
ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
- ill_t *ill, boolean_t ll_multicast)
+ ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward)
{
ill_group_t *ill_group;
ill_group_t *ire_group;
@@ -14109,6 +14099,16 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */
mp->b_next = NULL; /* ip_rput_noire sets dst here */
+ /*
+	 * If the caller of this function is ip_fast_forward(), skip the
+	 * next three checks as they do not apply.
+ */
+ if (from_ip_fast_forward) {
+ ill_group = ill->ill_group;
+ ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
+ goto skip;
+ }
+
if (ll_multicast != 0) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
goto drop_pkt;
@@ -14147,6 +14147,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
* side-effect of that would be requiring an ire flush
* whenever the ILLF_ROUTER flag changes.
*/
+skip:
if (((ill->ill_flags &
((ill_t *)ire->ire_stq->q_ptr)->ill_flags &
ILLF_ROUTER) == 0) &&
@@ -14253,7 +14254,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
}
sendit:
dev_q = ire->ire_stq->q_next;
- if ((dev_q->q_next || dev_q->q_first) && !canput(dev_q)) {
+ if (DEV_Q_FLOW_BLOCKED(dev_q)) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
freemsg(mp);
return;
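
DEV_Q_FLOW_BLOCKED() replaces this open-coded flow-control test in several
places in this patch. Its assumed expansion, inferred from the removed
lines (the real definition lives in a header outside this diff):

    #define DEV_Q_FLOW_BLOCKED(dev_q) \
            (((dev_q)->q_next != NULL || (dev_q)->q_first != NULL) && \
            !canput(dev_q))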
@@ -14447,7 +14448,7 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha,
ipha->ipha_hdr_checksum = 0;
ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
ip_rput_process_forward(q, mp, ire, ipha,
- ill, ll_multicast);
+ ill, ll_multicast, B_FALSE);
ire_refrele(ire);
return (NULL);
}
@@ -14904,6 +14905,15 @@ ip_fix_dbref(ill_t *ill, mblk_t *mp)
return (mp1);
}
+#define ADD_TO_CHAIN(head, tail, cnt, mp) { \
+ if (tail != NULL) \
+ tail->b_next = mp; \
+ else \
+ head = mp; \
+ tail = mp; \
+ cnt++; \
+}
+
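ADD_TO_CHAIN() appends an mblk to a head/tail-tracked chain in O(1) and
bumps the running count; note it is a bare brace block rather than the
defensive do { ... } while (0) form, so it must be used as a complete
statement. A standalone userland exercise of the same macro with a stand-in
mblk type:

    #include <stdio.h>

    /* Stand-in for mblk_t, just enough to exercise the chain logic. */
    typedef struct mblk { struct mblk *b_next; int id; } mblk_t;

    #define ADD_TO_CHAIN(head, tail, cnt, mp) { \
            if (tail != NULL) \
                    tail->b_next = mp; \
            else \
                    head = mp; \
            tail = mp; \
            cnt++; \
    }

    int
    main(void)
    {
            mblk_t a = { NULL, 1 }, b = { NULL, 2 };
            mblk_t *head = NULL, *tail = NULL;
            int cnt = 0;

            ADD_TO_CHAIN(head, tail, cnt, &a);
            ADD_TO_CHAIN(head, tail, cnt, &b);

            /* Prints "cnt=2 first=1 last=2". */
            (void) printf("cnt=%d first=%d last=%d\n",
                cnt, head->id, tail->id);
            return (0);
    }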
/*
* Direct read side procedure capable of dealing with chains. GLDv3 based
* drivers call this function directly with mblk chains while STREAMS
@@ -14942,20 +14952,23 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
mblk_t *head = NULL;
mblk_t *tail = NULL;
mblk_t *first_mp;
- mblk_t *mp;
- mblk_t *dmp;
int cnt = 0;
ip_stack_t *ipst = ill->ill_ipst;
+ mblk_t *mp;
+ mblk_t *dmp;
+ uint8_t tag;
ASSERT(mp_chain != NULL);
ASSERT(ill != NULL);
TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q);
+ tag = (ip_ring != NULL) ? SQTAG_IP_INPUT_RX_RING : SQTAG_IP_INPUT;
+
#define rptr ((uchar_t *)ipha)
while (mp_chain != NULL) {
- first_mp = mp = mp_chain;
+ mp = mp_chain;
mp_chain = mp_chain->b_next;
mp->b_next = NULL;
ll_multicast = 0;
@@ -14987,6 +15000,15 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
* Given the above assumption, there is no need to walk
* down the entire mblk chain (which could have a
* potential performance problem)
+ *
+ * The "(DB_REF(mp) > 1)" check was moved from ip_rput()
+ * to here because of exclusive ip stacks and vnics.
+	 * Packets transmitted from an exclusive stack over a vnic
+	 * can have db_ref > 1, and when they get looped back to
+	 * another vnic in a different zone, ip_input() ends up
+	 * seeing dblks with db_ref > 1. So if someone
+	 * complains of TCP performance under this scenario,
+	 * take a serious look here at the impact of copymsg().
*/
if (DB_REF(mp) > 1) {
@@ -15056,7 +15078,7 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
}
}
- /* Make sure its an M_DATA and that its aligned */
+ /* Only M_DATA can come here and it is always aligned */
ASSERT(DB_TYPE(mp) == M_DATA);
ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr));
@@ -15140,7 +15162,6 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
continue;
}
dst = ipha->ipha_dst;
-
/*
* Attach any necessary label information to
* this packet
@@ -15194,16 +15215,18 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP &&
!ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) {
if (ire == NULL)
- ire = ire_cache_lookup(dst, ALL_ZONES, NULL,
- ipst);
-
- /* incoming packet is for forwarding */
- if (ire == NULL || (ire->ire_type & IRE_CACHE)) {
+ ire = ire_cache_lookup_simple(dst, ipst);
+ /*
+	 * The incoming packet is for forwarding; don't call
+	 * ip_fast_forward() unless forwarding is enabled.
+ */
+ if ((ill->ill_flags & ILLF_ROUTER) &&
+ (ire == NULL || (ire->ire_type & IRE_CACHE))) {
ire = ip_fast_forward(ire, dst, ill, mp);
continue;
}
/* incoming packet is for local consumption */
- if (ire->ire_type & IRE_LOCAL)
+ if ((ire != NULL) && (ire->ire_type & IRE_LOCAL))
goto local;
}
@@ -15363,7 +15386,7 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
} else if (ire->ire_stq != NULL) {
	/* forwarding? */
ip_rput_process_forward(q, mp, ire, ipha, ill,
- ll_multicast);
+ ll_multicast, B_FALSE);
/* ip_rput_process_forward consumed the packet */
continue;
}
@@ -15414,8 +15437,8 @@ local:
* changes.
*/
IP_STAT(ipst, ip_input_multi_squeue);
- squeue_enter_chain(curr_sqp, head,
- tail, cnt, SQTAG_IP_INPUT);
+ SQUEUE_ENTER(curr_sqp, head,
+ tail, cnt, SQ_PROCESS, tag);
curr_sqp = GET_SQUEUE(mp);
head = mp;
tail = mp;
@@ -15444,33 +15467,231 @@ local:
ire_refrele(ire);
if (head != NULL)
- squeue_enter_chain(curr_sqp, head, tail, cnt, SQTAG_IP_INPUT);
+ SQUEUE_ENTER(curr_sqp, head, tail, cnt, SQ_PROCESS, tag);
- /*
- * This code is there just to make netperf/ttcp look good.
- *
- * Its possible that after being in polling mode (and having cleared
- * the backlog), squeues have turned the interrupt frequency higher
- * to improve latency at the expense of more CPU utilization (less
- * packets per interrupts or more number of interrupts). Workloads
- * like ttcp/netperf do manage to tickle polling once in a while
- * but for the remaining time, stay in higher interrupt mode since
- * their packet arrival rate is pretty uniform and this shows up
- * as higher CPU utilization. Since people care about CPU utilization
- * while running netperf/ttcp, turn the interrupt frequency back to
- * normal/default if polling has not been used in ip_poll_normal_ticks.
- */
- if (ip_ring != NULL && (ip_ring->rr_poll_state & ILL_POLLING)) {
- if (lbolt >= (ip_ring->rr_poll_time + ip_poll_normal_ticks)) {
- ip_ring->rr_poll_state &= ~ILL_POLLING;
- ip_ring->rr_blank(ip_ring->rr_handle,
- ip_ring->rr_normal_blank_time,
- ip_ring->rr_normal_pkt_cnt);
+ TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
+ "ip_input_end: q %p (%S)", q, "end");
+#undef rptr
+}
+
+/*
+ * ip_accept_tcp() - This function is called by the squeue when it retrieves
+ * a chain of packets in the poll mode. The packets have gone through the
+ * data link processing but not IP processing. For performance and latency
+ * reasons, the squeue wants to process the chain in line instead of feeding
+ * it back via ip_input path.
+ *
+ * So this is a light weight function which checks to see if the packets
+ * retrived are indeed TCP packets (TCP squeue always polls TCP soft ring
+ * but we still do the paranoid check) meant for local machine and we don't
+ * have labels etc enabled. Packets that meet the criterion are returned to
+ * the squeue and processed inline while the rest go via ip_input path.
+ */
+/*ARGSUSED*/
+mblk_t *
+ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
+ mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
+{
+ mblk_t *mp;
+ ipaddr_t dst = NULL;
+ ipaddr_t prev_dst;
+ ire_t *ire = NULL;
+ ipha_t *ipha;
+ uint_t pkt_len;
+ ssize_t len;
+ uint_t opt_len;
+ queue_t *q = ill->ill_rq;
+ squeue_t *curr_sqp;
+ mblk_t *ahead = NULL; /* Accepted head */
+ mblk_t *atail = NULL; /* Accepted tail */
+ uint_t acnt = 0; /* Accepted count */
+	mblk_t *utail = NULL;	/* Unaccepted tail */
+	mblk_t *uhead = NULL;	/* Unaccepted head */
+ uint_t ucnt = 0; /* Unaccepted cnt */
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ *cnt = 0;
+
+ ASSERT(ill != NULL);
+ ASSERT(ip_ring != NULL);
+
+ TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_accept_tcp: q %p", q);
+
+#define rptr ((uchar_t *)ipha)
+
+ while (mp_chain != NULL) {
+ mp = mp_chain;
+ mp_chain = mp_chain->b_next;
+ mp->b_next = NULL;
+
+ /*
+ * We do ire caching from one iteration to
+ * another. In the event the packet chain contains
+ * all packets from the same dst, this caching saves
+ * an ire_cache_lookup for each of the succeeding
+ * packets in a packet chain.
+ */
+ prev_dst = dst;
+
+ ipha = (ipha_t *)mp->b_rptr;
+ len = mp->b_wptr - rptr;
+
+ ASSERT(!MBLK_RX_FANOUT_SLOWPATH(mp, ipha));
+
+ /*
+	 * If it is a non-TCP packet, or doesn't have the H/W cksum,
+	 * or doesn't have the min len, reject.
+ */
+ if ((ipha->ipha_protocol != IPPROTO_TCP) || (len <
+ (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH))) {
+ ADD_TO_CHAIN(uhead, utail, ucnt, mp);
+ continue;
}
+
+ pkt_len = ntohs(ipha->ipha_length);
+ if (len != pkt_len) {
+ if (len > pkt_len) {
+ mp->b_wptr = rptr + pkt_len;
+ } else {
+ ADD_TO_CHAIN(uhead, utail, ucnt, mp);
+ continue;
+ }
}
- TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
- "ip_input_end: q %p (%S)", q, "end");
+ opt_len = ipha->ipha_version_and_hdr_length -
+ IP_SIMPLE_HDR_VERSION;
+ dst = ipha->ipha_dst;
+
+ /* IP version bad or there are IP options */
+ if (opt_len && (!ip_rput_multimblk_ipoptions(q, ill,
+ mp, &ipha, &dst, ipst)))
+ continue;
+
+ if (is_system_labeled() || (ill->ill_dhcpinit != 0) ||
+ (ipst->ips_ip_cgtp_filter &&
+ ipst->ips_ip_cgtp_filter_ops != NULL)) {
+ ADD_TO_CHAIN(uhead, utail, ucnt, mp);
+ continue;
+ }
+
+ /*
+ * Reuse the cached ire only if the ipha_dst of the previous
+ * packet is the same as the current packet AND it is not
+ * INADDR_ANY.
+ */
+ if (!(dst == prev_dst && dst != INADDR_ANY) &&
+ (ire != NULL)) {
+ ire_refrele(ire);
+ ire = NULL;
+ }
+
+ if (ire == NULL)
+ ire = ire_cache_lookup_simple(dst, ipst);
+
+ /*
+	 * The incoming packet is for forwarding; don't call
+	 * ip_fast_forward() unless forwarding is enabled.
+ */
+ if ((ill->ill_flags & ILLF_ROUTER) &&
+ (ire == NULL || (ire->ire_type & IRE_CACHE))) {
+
+ DTRACE_PROBE4(ip4__physical__in__start,
+ ill_t *, ill, ill_t *, NULL,
+ ipha_t *, ipha, mblk_t *, mp);
+
+ FW_HOOKS(ipst->ips_ip4_physical_in_event,
+ ipst->ips_ipv4firewall_physical_in,
+ ill, NULL, ipha, mp, mp, 0, ipst);
+
+ DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
+ UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
+ pkt_len);
+
+ ire = ip_fast_forward(ire, dst, ill, mp);
+ continue;
+ }
+
+ /* incoming packet is for local consumption */
+ if ((ire != NULL) && (ire->ire_type & IRE_LOCAL))
+ goto local_accept;
+
+ /*
+ * Disable ire caching for anything more complex
+ * than the simple fast path case we checked for above.
+ */
+ if (ire != NULL) {
+ ire_refrele(ire);
+ ire = NULL;
+ }
+
+ ire = ire_cache_lookup(dst, ALL_ZONES, MBLK_GETLABEL(mp),
+ ipst);
+ if (ire == NULL || ire->ire_type == IRE_BROADCAST ||
+ ire->ire_stq != NULL) {
+ ADD_TO_CHAIN(uhead, utail, ucnt, mp);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ ire = NULL;
+ }
+ continue;
+ }
+
+local_accept:
+
+ if (ire->ire_rfq != q) {
+ ADD_TO_CHAIN(uhead, utail, ucnt, mp);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ ire = NULL;
+ }
+ continue;
+ }
+
+ /*
+ * The event for packets being received from a 'physical'
+ * interface is placed after validation of the source and/or
+ * destination address as being local so that packets can be
+ * redirected to loopback addresses using ipnat.
+ */
+ DTRACE_PROBE4(ip4__physical__in__start,
+ ill_t *, ill, ill_t *, NULL,
+ ipha_t *, ipha, mblk_t *, mp);
+
+ FW_HOOKS(ipst->ips_ip4_physical_in_event,
+ ipst->ips_ipv4firewall_physical_in,
+ ill, NULL, ipha, mp, mp, 0, ipst);
+
+ DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
+ UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len);
+
+ if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, mp,
+ 0, q, ip_ring)) != NULL) {
+ if ((curr_sqp = GET_SQUEUE(mp)) == target_sqp) {
+ ADD_TO_CHAIN(ahead, atail, acnt, mp);
+ } else {
+ SQUEUE_ENTER(curr_sqp, mp, mp, 1,
+ SQ_FILL, SQTAG_IP_INPUT);
+ }
+ }
+ }
+
+ if (ire != NULL)
+ ire_refrele(ire);
+
+ if (uhead != NULL)
+ ip_input(ill, ip_ring, uhead, NULL);
+
+ if (ahead != NULL) {
+ *last = atail;
+ *cnt = acnt;
+ return (ahead);
+ }
+
+ return (NULL);
#undef rptr
}
@@ -15770,11 +15991,18 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
freemsg(mp); /* Don't want to pass this up */
return;
-
- case DL_CAPABILITY_REQ:
case DL_CONTROL_REQ:
+ ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
+ "DL_CONTROL_REQ\n"));
ill_dlpi_done(ill, dlea->dl_error_primitive);
- ill->ill_dlpi_capab_state = IDS_FAILED;
+ freemsg(mp);
+ return;
+ case DL_CAPABILITY_REQ:
+ ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
+	    "DL_CAPABILITY_REQ\n"));
+ if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
+ ill->ill_dlpi_capab_state = IDCS_FAILED;
+ ill_capability_done(ill);
freemsg(mp);
return;
}
@@ -15814,19 +16042,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
dlea->dl_errno, dlea->dl_unix_errno);
break;
case DL_CAPABILITY_ACK:
- /* Call a routine to handle this one. */
- ill_dlpi_done(ill, DL_CAPABILITY_REQ);
ill_capability_ack(ill, mp);
-
/*
- * If the ack is due to renegotiation, we will need to send
- * a new CAPABILITY_REQ to start the renegotiation.
+	 * The message has been handed off to ill_capability_ack()
+	 * and must not be freed below.
*/
- if (ill->ill_capab_reneg) {
- ill->ill_capab_reneg = B_FALSE;
- ill_capability_probe(ill);
- }
+ mp = NULL;
break;
+
case DL_CONTROL_ACK:
/* We treat all of these as "fire and forget" */
ill_dlpi_done(ill, DL_CONTROL_REQ);
@@ -16117,10 +16340,9 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* and the renegotiation has not been started yet;
* nothing needs to be done in this case.
*/
- if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) {
- ill_capability_reset(ill);
- ill->ill_capab_reneg = B_TRUE;
- }
+ ipsq_current_start(ipsq, ill->ill_ipif, 0);
+ ill_capability_reset(ill, B_TRUE);
+ ipsq_current_finish(ipsq);
break;
default:
ip0dbg(("ip_rput_dlpi_writer: unknown notification "
@@ -16661,7 +16883,8 @@ ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill)
max_frag -= secopt_size;
}
- ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, GLOBAL_ZONEID, ipst);
+ ip_wput_frag(ire, mp, IB_PKT, max_frag, 0,
+ GLOBAL_ZONEID, ipst, NULL);
ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n"));
return;
}
@@ -16677,7 +16900,7 @@ ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill)
mp->b_prev = (mblk_t *)IPP_FWD_OUT;
ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n"));
- (void) ip_xmit_v4(mp, ire, NULL, B_FALSE);
+ (void) ip_xmit_v4(mp, ire, NULL, B_FALSE, NULL);
/* ip_xmit_v4 always consumes the packet */
return;
@@ -17049,9 +17272,12 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
ire, ipsec_mp, 0, ill->ill_rq, NULL);
IRE_REFRELE(ire);
- if (mp != NULL)
- squeue_enter_chain(GET_SQUEUE(mp), mp,
- mp, 1, SQTAG_IP_PROTO_AGAIN);
+ if (mp != NULL) {
+
+ SQUEUE_ENTER(GET_SQUEUE(mp), mp,
+ mp, 1, SQ_PROCESS,
+ SQTAG_IP_PROTO_AGAIN);
+ }
break;
case IPPROTO_SCTP:
if (!ire_need_rele)
@@ -21721,7 +21947,7 @@ conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif)
*/
static void
ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid,
- ip_stack_t *ipst)
+ ip_stack_t *ipst, conn_t *connp)
{
ipha_t *ipha;
mblk_t *mp;
@@ -21779,7 +22005,7 @@ ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid,
ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst));
ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag,
- (dont_use ? 0 : frag_flag), zoneid, ipst);
+ (dont_use ? 0 : frag_flag), zoneid, ipst, connp);
}
/*
@@ -22502,9 +22728,9 @@ another:;
queue_t *dev_q = stq->q_next;
/* flow controlled */
- if ((dev_q->q_next || dev_q->q_first) &&
- !canput(dev_q))
+ if (DEV_Q_FLOW_BLOCKED(dev_q))
goto blocked;
+
if ((PROTO == IPPROTO_UDP) &&
(ip_hdr_included != IP_HDR_INCLUDED)) {
hlen = (V_HLEN & 0xF) << 2;
@@ -22685,6 +22911,7 @@ another:;
ipst->ips_ipv4firewall_physical_out,
NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst);
DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
+
if (mp == NULL)
goto release_ire_and_ill;
@@ -22703,7 +22930,9 @@ another:;
}
mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT);
DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire);
- pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE);
+
+ pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE, connp);
+
if ((pktxmit_state == SEND_FAILED) ||
(pktxmit_state == LLHDR_RESLV_FAILED)) {
ip2dbg(("ip_wput_ire: ip_xmit_v4 failed"
@@ -22976,10 +23205,9 @@ broadcast:
#endif
sctph->sh_chksum = sctp_cksum(mp, hlen);
} else {
- queue_t *dev_q = stq->q_next;
+ queue_t *dev_q = stq->q_next;
- if ((dev_q->q_next || dev_q->q_first) &&
- !canput(dev_q)) {
+ if (DEV_Q_FLOW_BLOCKED(dev_q)) {
blocked:
ipha->ipha_ident = ip_hdr_included;
/*
@@ -23314,7 +23542,7 @@ checksumoptions:
DTRACE_PROBE2(ip__xmit__2,
mblk_t *, mp, ire_t *, ire);
pktxmit_state = ip_xmit_v4(mp, ire,
- NULL, B_TRUE);
+ NULL, B_TRUE, connp);
if ((pktxmit_state == SEND_FAILED) ||
(pktxmit_state == LLHDR_RESLV_FAILED)) {
release_ire_and_ill_2:
@@ -23471,13 +23699,14 @@ fragmentit:
"ip_wput_ire_end: q %p (%S)",
q, "last fragmentation");
ip_wput_ire_fragmentit(mp, ire,
- zoneid, ipst);
+ zoneid, ipst, connp);
ire_refrele(ire);
if (conn_outgoing_ill != NULL)
ill_refrele(conn_outgoing_ill);
return;
}
- ip_wput_ire_fragmentit(mp, ire, zoneid, ipst);
+ ip_wput_ire_fragmentit(mp, ire,
+ zoneid, ipst, connp);
}
}
} else {
@@ -24195,7 +24424,7 @@ pbuf_panic:
*/
static void
ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
- uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst)
+ uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst, conn_t *connp)
{
int i1;
mblk_t *ll_hdr_mp;
@@ -24253,7 +24482,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
*/
if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) {
/* If nce_state is ND_INITIAL, trigger ARP query */
- (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE);
+ (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL);
ip1dbg(("ip_wput_frag: mac address for ire is unresolved"
" - dropping packet\n"));
BUMP_MIB(mibptr, ipIfStatsOutFragFails);
@@ -24622,7 +24851,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill,
ipha_t *, ipha, ip6_t *, NULL, int, 0);
- putnext(q, xmit_mp);
+ ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0);
BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits);
UPDATE_MIB(out_ill->ill_ip_mib,
@@ -24932,7 +25161,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
__dtrace_ipsr_ill_t *, out_ill, ipha_t *,
ipha, ip6_t *, NULL, int, 0);
- putnext(q, xmit_mp);
+ ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0);
BUMP_MIB(out_ill->ill_ip_mib,
ipIfStatsHCOutTransmits);
@@ -26286,7 +26515,8 @@ send:
"fragmented accelerated packet!\n"));
freemsg(ipsec_mp);
} else {
- ip_wput_ire_fragmentit(ipsec_mp, ire, zoneid, ipst);
+ ip_wput_ire_fragmentit(ipsec_mp, ire,
+ zoneid, ipst, NULL);
}
if (ire_need_rele)
ire_refrele(ire);
@@ -26461,7 +26691,7 @@ send:
* Call ip_xmit_v4() to trigger ARP query
* in case the nce_state is ND_INITIAL
*/
- (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE);
+ (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL);
goto drop_pkt;
}
@@ -26477,7 +26707,7 @@ send:
ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n"));
pktxmit_state = ip_xmit_v4(mp, ire,
- (io->ipsec_out_accelerated ? io : NULL), B_FALSE);
+ (io->ipsec_out_accelerated ? io : NULL), B_FALSE, NULL);
if ((pktxmit_state == SEND_FAILED) ||
(pktxmit_state == LLHDR_RESLV_FAILED)) {
@@ -27588,9 +27818,9 @@ nak:
*/
ASSERT(ipsq != NULL);
CONN_INC_REF(connp);
- squeue_fill(connp->conn_sqp, mp,
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
ip_resume_tcp_bind, connp,
- SQTAG_BIND_RETRY);
+ SQ_FILL, SQTAG_BIND_RETRY);
} else if (IPCL_IS_UDP(connp)) {
/*
* In the case of UDP endpoint we
@@ -28053,7 +28283,7 @@ nak:
/*
* send out queued packets.
*/
- (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE);
+ (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL);
IRE_REFRELE(ire);
return;
@@ -28558,6 +28788,25 @@ ip_wsrv(queue_t *q)
}
/*
+ * Callback to disable flow control in IP.
+ *
+ * This is a mac client callback added when the DLD_CAPAB_DIRECT capability
+ * is enabled.
+ *
+ * When MAC_TX() is no longer able to send packets, dld sets its queue
+ * to QFULL and enables STREAMS flow control. Later, when the underlying
+ * driver is able to send packets again, it calls mac_tx_(ring_)update(),
+ * which wakes up the corresponding mac worker threads; they in turn
+ * call this callback function, which disables flow control.
+ */
+/* ARGSUSED */
+void
+ill_flow_enable(void *ill, ip_mac_tx_cookie_t cookie)
+{
+ qenable(((ill_t *)ill)->ill_wq);
+}
+
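/*
 * Illustrative sketch (editor's addition, not part of the patch): a
 * minimal user-land model of the flow-control handshake described in
 * the ill_flow_enable() comment below. All names here are hypothetical
 * stand-ins; the real path is dld -> mac_tx_(ring_)update() -> mac
 * worker -> ill_flow_enable() -> qenable().
 */
#include <stdio.h>
#include <stdbool.h>

typedef void (*flow_enable_cb_t)(void *);

struct model_txq {
	bool qfull;			/* models the QFULL state */
	flow_enable_cb_t cb;		/* callback registered by IP */
	void *cb_arg;
};

/* Models MAC_TX() failing and asserting flow control. */
static bool
model_tx(struct model_txq *q)
{
	q->qfull = true;		/* driver ring is saturated */
	return (false);
}

/* Models mac_tx_update(): the driver can send again, wake the client. */
static void
model_tx_update(struct model_txq *q)
{
	q->qfull = false;
	if (q->cb != NULL)
		q->cb(q->cb_arg);	/* corresponds to ill_flow_enable() */
}

static void
model_flow_enable(void *arg)
{
	printf("qenable(%s)\n", (const char *)arg);
}

int
main(void)
{
	struct model_txq q = { false, model_flow_enable, "ill_wq" };

	(void) model_tx(&q);		/* tx blocked: queue goes QFULL */
	model_tx_update(&q);		/* driver drains: callback fires */
	return (0);
}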
+/*
* Walk the list of all conn's calling the function provided with the
* specified argument for each. Note that this only walks conn's that
* have been bound.
@@ -29280,17 +29529,17 @@ ip_cgtp_filter_is_registered(netstackid_t stackid)
return (ret);
}
-static squeue_func_t
+static int
ip_squeue_switch(int val)
{
- squeue_func_t rval = squeue_fill;
+ int rval = SQ_FILL;
switch (val) {
case IP_SQUEUE_ENTER_NODRAIN:
- rval = squeue_enter_nodrain;
+ rval = SQ_NODRAIN;
break;
case IP_SQUEUE_ENTER:
- rval = squeue_enter;
+ rval = SQ_PROCESS;
break;
default:
break;
@@ -29312,7 +29561,7 @@ ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
if (ddi_strtol(value, NULL, 10, &new_value) != 0)
return (EINVAL);
- ip_input_proc = ip_squeue_switch(new_value);
+ ip_squeue_flag = ip_squeue_switch(new_value);
*v = new_value;
return (0);
}
@@ -29983,7 +30232,8 @@ ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4,
* ip_wput_frag can call this function.
*/
ipxmit_state_t
-ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, boolean_t flow_ctl_enabled)
+ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io,
+ boolean_t flow_ctl_enabled, conn_t *connp)
{
nce_t *arpce;
ipha_t *ipha;
@@ -30069,7 +30319,8 @@ ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, boolean_t flow_ctl_enabled)
ipha_t *, ipha, ip6_t *, NULL, int,
0);
- putnext(q, first_mp);
+ ILL_SEND_TX(out_ill,
+ ire, connp, first_mp, 0);
} else {
BUMP_MIB(out_ill->ill_ip_mib,
ipIfStatsOutDiscards);
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index 810cec9e8a..a1d97627b2 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -98,6 +98,7 @@
#include <inet/udp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>
+#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/tsol/label.h>
@@ -108,7 +109,7 @@
/* Temporary; for CR 6451644 work-around */
#include <sys/ethernet.h>
-extern squeue_func_t ip_input_proc;
+extern int ip_squeue_flag;
/*
* Naming conventions:
@@ -887,8 +888,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
goto drop_pkt;
}
- squeue_fill(connp->conn_sqp, first_mp, tcp_input,
- connp, SQTAG_TCP6_INPUT_ICMP_ERR);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp,
+ SQ_FILL, SQTAG_TCP6_INPUT_ICMP_ERR);
return;
}
@@ -2538,8 +2539,9 @@ ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
if (mp != NULL) {
if (IPCL_IS_TCP(connp)) {
CONN_INC_REF(connp);
- squeue_fill(connp->conn_sqp, mp, ip_resume_tcp_bind,
- connp, SQTAG_TCP_RPUTOTHER);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ ip_resume_tcp_bind, connp, SQ_FILL,
+ SQTAG_TCP_RPUTOTHER);
} else if (IPCL_IS_UDP(connp)) {
udp_resume_bind(connp, mp);
} else {
@@ -3637,8 +3639,8 @@ ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill,
BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
if (IPCL_IS_TCP(connp)) {
- (*ip_input_proc)(connp->conn_sqp, first_mp,
- connp->conn_recv, connp, SQTAG_IP6_TCP_INPUT);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv,
+ connp, ip_squeue_flag, SQTAG_IP6_TCP_INPUT);
} else {
/* SOCK_RAW, IPPROTO_TCP case */
(connp->conn_recv)(connp, first_mp, NULL);
@@ -11072,7 +11074,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
/* Driver is flow-controlling? */
if (!IP_FLOW_CONTROLLED_ULP(nexthdr) &&
- ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) {
+ DEV_Q_FLOW_BLOCKED(dev_q)) {
/*
* Queue packet if we have an conn to give back
* pressure. We can't queue packets intended for
@@ -12140,8 +12142,9 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
"connp %p (ENOMEM)\n", (void *)connp));
} else {
CONN_INC_REF(connp);
- squeue_fill(connp->conn_sqp, mdimp, tcp_input,
- connp, SQTAG_TCP_INPUT_MCTL);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mdimp,
+ tcp_input, connp, SQ_FILL,
+ SQTAG_TCP_INPUT_MCTL);
}
}
@@ -12576,34 +12579,8 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
}
} else {
/*
- * Queue packet if we have an conn to give back pressure.
- * We can't queue packets intended for hardware acceleration
- * since we've tossed that state already. If the packet is
- * being fed back from ire_send_v6, we don't know the
- * position in the queue to enqueue the packet and we discard
- * the packet.
- */
- if (ipst->ips_ip_output_queue && (connp != NULL) &&
- (io == NULL) && (caller != IRE_SEND)) {
- if (caller == IP_WSRV) {
- connp->conn_did_putbq = 1;
- (void) putbq(connp->conn_wq, mp);
- conn_drain_insert(connp);
- /*
- * caller == IP_WSRV implies we are
- * the service thread, and the
- * queue is already noenabled.
- * The check for canput and
- * the putbq is not atomic.
- * So we need to check again.
- */
- if (canput(stq->q_next))
- connp->conn_did_putbq = 0;
- } else {
- (void) putq(connp->conn_wq, mp);
- }
- return;
- }
+ * Can't apply backpressure, just discard the packet.
+ */
BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
freemsg(mp);
return;
diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c
index c87267cb29..4fa3c7a74d 100644
--- a/usr/src/uts/common/inet/ip/ip_ftable.c
+++ b/usr/src/uts/common/inet/ip/ip_ftable.c
@@ -101,6 +101,8 @@ static ire_t *ire_round_robin(irb_t *, zoneid_t, ire_ftable_args_t *,
static void ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);
static int ip_send_align_hcksum_flags(mblk_t *, ill_t *);
+static ire_t *ire_ftable_lookup_simple(ipaddr_t,
+ ire_t **, zoneid_t, int, ip_stack_t *);
/*
* Lookup a route in forwarding table. A specific lookup is indicated by
@@ -406,6 +408,157 @@ found_ire_held:
return (ire);
}
+/*
+ * This function is called via ip_fast_forward->ire_forward_simple.
+ * Its optimizations over ire_ftable_lookup are:
+ * o removes unnecessary flag matching
+ * o does a plain longest prefix match instead of further overloading
+ * it with the unnecessary "best_prefix_match"
+ * o does not round-robin the default route for every packet
+ * o inlines the ire_ctable_lookup code to look for a nexthop cache
+ * entry before calling ire_route_lookup
+ */
+static ire_t *
+ire_ftable_lookup_simple(ipaddr_t addr,
+ ire_t **pire, zoneid_t zoneid, int flags,
+ ip_stack_t *ipst)
+{
+ ire_t *ire = NULL;
+ ire_t *tmp_ire = NULL;
+ struct rt_sockaddr rdst;
+ struct rt_entry *rt;
+ irb_t *irb_ptr;
+ ire_t *save_ire;
+ int match_flags;
+
+ rdst.rt_sin_len = sizeof (rdst);
+ rdst.rt_sin_family = AF_INET;
+ rdst.rt_sin_addr.s_addr = addr;
+
+ /*
+ * This is basically inlining a simpler version of ire_match_args
+ */
+ RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
+
+ rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
+ ipst->ips_ip_ftable, NULL, NULL);
+
+ if (rt == NULL) {
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
+ return (NULL);
+ }
+ irb_ptr = &rt->rt_irb;
+ if (irb_ptr == NULL || irb_ptr->irb_ire_cnt == 0) {
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
+ return (NULL);
+ }
+
+ rw_enter(&irb_ptr->irb_lock, RW_READER);
+ for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
+ if (ire->ire_zoneid == zoneid)
+ break;
+ }
+
+ if (ire == NULL || (ire->ire_marks & IRE_MARK_CONDEMNED)) {
+ rw_exit(&irb_ptr->irb_lock);
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
+ return (NULL);
+ }
+ /* we have an ire that matches */
+ if (ire != NULL)
+ IRE_REFHOLD(ire);
+ rw_exit(&irb_ptr->irb_lock);
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
+
+ if ((flags & MATCH_IRE_RJ_BHOLE) &&
+ (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
+ return (ire);
+ }
+ /*
+ * At this point, the IRE that was found must be an IRE_FORWARDTABLE
+ * type. If this is a recursive lookup and an IRE_INTERFACE type was
+ * found, return that. If it was some other IRE_FORWARDTABLE type of
+ * IRE (one of the prefix types), then it is necessary to fill in the
+ * parent IRE pointed to by pire, and then look up the gateway address of
+ * the parent. For backwards compatibility, if this lookup returns an
+ * IRE other than an IRE_CACHETABLE or IRE_INTERFACE, then one more level
+ * of lookup is done.
+ */
+ match_flags = MATCH_IRE_DSTONLY;
+
+ if (ire->ire_type & IRE_INTERFACE)
+ return (ire);
+ *pire = ire;
+ /*
+ * If we can't find an IRE_INTERFACE or the caller has not
+ * asked for pire, we need to REFRELE the save_ire.
+ */
+ save_ire = ire;
+
+ /*
+ * Currently MATCH_IRE_ILL is never used with
+ * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
+ * sending out packets as MATCH_IRE_ILL is used only
+ * for communicating with on-link hosts. We can't assert
+ * that here as RTM_GET calls this function with
+ * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
+ * We have already used the MATCH_IRE_ILL in determining
+ * the right prefix route at this point. To match the
+ * behavior of how we locate routes while sending out
+ * packets, we don't want to use MATCH_IRE_ILL below
+ * while locating the interface route.
+ *
+ * ire_ftable_lookup may end up with an incomplete IRE_CACHE
+ * entry for the gateway (i.e., one for which the
+ * ire_nce->nce_state is not yet ND_REACHABLE). If the caller
+ * has specified MATCH_IRE_COMPLETE, such entries will not
+ * be returned; instead, we return the IF_RESOLVER ire.
+ */
+
+ if (ire->ire_ipif == NULL) {
+ tmp_ire = ire;
+ /*
+ * Look to see if the nexthop entry is in the
+ * cachetable (I am inlining a simpler ire_cache_lookup
+ * here).
+ */
+ ire = ire_cache_lookup_simple(ire->ire_gateway_addr, ipst);
+ if (ire == NULL) {
+ /* Try ire_route_lookup */
+ ire = tmp_ire;
+ } else {
+ goto solved;
+ }
+ }
+ if (ire->ire_ipif != NULL)
+ match_flags |= MATCH_IRE_ILL_GROUP;
+
+ ire = ire_route_lookup(ire->ire_gateway_addr, 0,
+ 0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst);
+solved:
+ DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire,
+ (ire_t *), save_ire);
+ if (ire == NULL) {
+ /*
+ * The recursive lookup failed; release the parent
+ * ire and clear *pire before returning.
+ */
+ ire_refrele(save_ire);
+ *pire = NULL;
+ return (ire);
+ }
+ if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
+ /*
+ * If the caller did not ask for pire, release
+ * it now.
+ */
+ if (pire == NULL) {
+ ire_refrele(save_ire);
+ }
+ }
+ return (ire);
+}
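/*
 * Illustrative sketch (editor's addition, not part of the patch): the
 * longest-prefix-match rule that ire_ftable_lookup_simple() relies on,
 * modeled over a flat array instead of the kernel radix tree. All names
 * are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

struct model_rt {
	uint32_t prefix;
	uint32_t mask;
	const char *label;
};

static const struct model_rt *
model_lpm(const struct model_rt *tbl, int n, uint32_t dst)
{
	const struct model_rt *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if ((dst & tbl[i].mask) != tbl[i].prefix)
			continue;
		/* a wider mask means a longer prefix, i.e. a better match */
		if (best == NULL || tbl[i].mask > best->mask)
			best = &tbl[i];
	}
	return (best);
}

int
main(void)
{
	static const struct model_rt tbl[] = {
		{ 0x0a000000, 0xff000000, "10.0.0.0/8" },
		{ 0x0a010000, 0xffff0000, "10.1.0.0/16" },
		{ 0x00000000, 0x00000000, "default" },
	};

	/* 10.1.2.3 matches all three entries; the /16 route wins */
	printf("%s\n", model_lpm(tbl, 3, 0x0a010203)->label);
	return (0);
}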
/*
* Find an IRE_OFFSUBNET IRE entry for the multicast address 'group'
@@ -1085,6 +1238,246 @@ icmp_err_ret:
ire_refrele(ire);
}
return (NULL);
+}
+
+/*
+ * Since the caller is ip_fast_forward, there is no CGTP or Tsol test.
+ * Also, we don't call the ftable lookup with MATCH_IRE_PARENT.
+ */
+
+ire_t *
+ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action,
+ ip_stack_t *ipst)
+{
+ ipaddr_t gw = 0;
+ ire_t *ire = NULL;
+ ire_t *sire = NULL, *save_ire;
+ ill_t *dst_ill = NULL;
+ int error;
+ zoneid_t zoneid;
+ ipif_t *src_ipif = NULL;
+ mblk_t *res_mp;
+ ushort_t ire_marks = 0;
+
+ zoneid = GLOBAL_ZONEID;
+
+ ire = ire_ftable_lookup_simple(dst, &sire, zoneid,
+ MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
+ MATCH_IRE_RJ_BHOLE, ipst);
+
+ if (ire == NULL) {
+ ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst);
+ goto icmp_err_ret;
+ }
+
+ /*
+ * Verify that the returned IRE does not have either
+ * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is
+ * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
+ */
+ if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
+ ASSERT(ire->ire_type & (IRE_CACHE | IRE_INTERFACE));
+ ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n",
+ (void *)ire));
+ goto icmp_err_ret;
+ }
+
+ /*
+ * If we already have a fully resolved IRE CACHE of the
+ * nexthop router, just hand over the cache entry
+ * and we are done.
+ */
+
+ if (ire->ire_type & IRE_CACHE) {
+
+ /*
+ * If we are using this ire cache entry as a
+ * gateway to forward packets, chances are we
+ * will be using it again. So turn off
+ * the temporary flag, thus reducing its
+ * chances of getting deleted frequently.
+ */
+ if (ire->ire_marks & IRE_MARK_TEMPORARY) {
+ irb_t *irb = ire->ire_bucket;
+ rw_enter(&irb->irb_lock, RW_WRITER);
+ ire->ire_marks &= ~IRE_MARK_TEMPORARY;
+ irb->irb_tmp_ire_cnt--;
+ rw_exit(&irb->irb_lock);
+ }
+
+ if (sire != NULL) {
+ UPDATE_OB_PKT_COUNT(sire);
+ ire_refrele(sire);
+ }
+ *ret_action = Forward_ok;
+ return (ire);
+ }
+ /*
+ * Increment the ire_ob_pkt_count field for ire if it is an
+ * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and
+ * increment the same for the parent IRE, sire, if it is some
+ * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST).
+ */
+ if ((ire->ire_type & IRE_INTERFACE) != 0) {
+ UPDATE_OB_PKT_COUNT(ire);
+ ire->ire_last_used_time = lbolt;
+ }
+
+ /*
+ * sire, if non-NULL, must be neither IRE_CACHETABLE nor
+ * IRE_INTERFACE type; it is one of the prefix IRE types
+ */
+ if (sire != NULL) {
+ gw = sire->ire_gateway_addr;
+ ASSERT((sire->ire_type &
+ (IRE_CACHETABLE | IRE_INTERFACE)) == 0);
+ UPDATE_OB_PKT_COUNT(sire);
+ }
+
+ /* Obtain dst_ill */
+ dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill);
+ if (dst_ill == NULL) {
+ ip2dbg(("ire_forward no dst ill; ire 0x%p\n",
+ (void *)ire));
+ goto icmp_err_ret;
+ }
+
+ ASSERT(src_ipif == NULL);
+ /* Now obtain the src_ipif */
+ src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill,
+ zoneid, &ire_marks);
+ if (src_ipif == NULL)
+ goto icmp_err_ret;
+
+ switch (ire->ire_type) {
+ case IRE_IF_NORESOLVER:
+ /* create ire_cache for ire_addr endpoint */
+ case IRE_IF_RESOLVER:
+ /*
+ * We have the IRE_IF_RESOLVER of the nexthop gateway
+ * and now need to build a IRE_CACHE for it.
+ * In this case, we have the following :
+ *
+ * 1) src_ipif - used for getting a source address.
+ *
+ * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
+ * means packets using the IRE_CACHE that we will build
+ * here will go out on dst_ill.
+ *
+ * 3) sire may or may not be NULL. But, the IRE_CACHE that is
+ * to be created will only be tied to the IRE_INTERFACE
+ * that was derived from the ire_ihandle field.
+ *
+ * If sire is non-NULL, it means the destination is
+ * off-link and we will first create the IRE_CACHE for the
+ * gateway.
+ */
+ res_mp = dst_ill->ill_resolver_mp;
+ if (ire->ire_type == IRE_IF_RESOLVER &&
+ (!OK_RESOLVER_MP(res_mp))) {
+ ire_refrele(ire);
+ ire = NULL;
+ goto out;
+ }
+ /*
+ * To be at this point in the code with a non-zero gw
+ * means that dst is reachable through a gateway that
+ * we have never resolved. By changing dst to the gw
+ * addr we resolve the gateway first.
+ */
+ if (gw != INADDR_ANY) {
+ /*
+ * The source ipif that was determined above was
+ * relative to the destination address, not the
+ * gateway's. If src_ipif was not taken out of
+ * the IRE_IF_RESOLVER entry, we'll need to call
+ * ipif_select_source() again.
+ */
+ if (src_ipif != ire->ire_ipif) {
+ ipif_refrele(src_ipif);
+ src_ipif = ipif_select_source(dst_ill,
+ gw, zoneid);
+ if (src_ipif == NULL)
+ goto icmp_err_ret;
+ }
+ dst = gw;
+ gw = INADDR_ANY;
+ }
+
+ if (ire->ire_type == IRE_IF_NORESOLVER)
+ dst = ire->ire_addr; /* ire_cache for tunnel endpoint */
+
+ save_ire = ire;
+ /*
+ * create an incomplete IRE_CACHE.
+ * An areq_mp will be generated in ire_arpresolve() for
+ * RESOLVER interfaces.
+ */
+ ire = ire_create(
+ (uchar_t *)&dst, /* dest address */
+ (uchar_t *)&ip_g_all_ones, /* mask */
+ (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
+ (uchar_t *)&gw, /* gateway address */
+ (save_ire->ire_type == IRE_IF_RESOLVER ? NULL:
+ &save_ire->ire_max_frag),
+ NULL,
+ dst_ill->ill_rq, /* recv-from queue */
+ dst_ill->ill_wq, /* send-to queue */
+ IRE_CACHE, /* IRE type */
+ src_ipif,
+ ire->ire_mask, /* Parent mask */
+ 0,
+ ire->ire_ihandle, /* Interface handle */
+ 0,
+ &(ire->ire_uinfo),
+ NULL,
+ NULL,
+ ipst);
+ ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire));
+ if (ire != NULL) {
+ ire->ire_marks |= ire_marks;
+ /* add the incomplete ire: */
+ error = ire_add(&ire, NULL, NULL, NULL, B_TRUE);
+ if (error == 0 && ire != NULL) {
+ ire->ire_max_frag = save_ire->ire_max_frag;
+ ip1dbg(("setting max_frag to %d in ire 0x%p\n",
+ ire->ire_max_frag, (void *)ire));
+ } else {
+ ire_refrele(save_ire);
+ goto icmp_err_ret;
+ }
+ }
+
+ ire_refrele(save_ire);
+ break;
+ default:
+ break;
+ }
+
+out:
+ *ret_action = Forward_ok;
+ if (sire != NULL)
+ ire_refrele(sire);
+ if (dst_ill != NULL)
+ ill_refrele(dst_ill);
+ if (src_ipif != NULL)
+ ipif_refrele(src_ipif);
+ return (ire);
+icmp_err_ret:
+ *ret_action = Forward_ret_icmp_err;
+ if (src_ipif != NULL)
+ ipif_refrele(src_ipif);
+ if (dst_ill != NULL)
+ ill_refrele(dst_ill);
+ if (sire != NULL)
+ ire_refrele(sire);
+ if (ire != NULL) {
+ if (ire->ire_flags & RTF_BLACKHOLE)
+ *ret_action = Forward_blackhole;
+ ire_refrele(ire);
+ }
+ /* caller needs to send icmp error message */
+ return (NULL);
}
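/*
 * Illustrative sketch (editor's addition, not part of the patch): the
 * hold/release discipline that ire_forward_simple() follows -- every
 * exit path, success or error, drops exactly the references it still
 * holds. The refcounted object below is hypothetical.
 */
#include <stdio.h>

struct model_obj {
	const char *name;
	int ref;
};

static void
model_hold(struct model_obj *o)
{
	o->ref++;
}

static void
model_rele(struct model_obj *o)
{
	o->ref--;
	printf("rele %s -> %d\n", o->name, o->ref);
}

static int
model_forward(struct model_obj *ire, struct model_obj *sire, int fail)
{
	model_hold(ire);
	model_hold(sire);
	if (fail)
		goto err;		/* analogue of icmp_err_ret */
	model_rele(sire);		/* success path keeps only ire */
	return (0);
err:
	model_rele(sire);
	model_rele(ire);		/* error path drops everything */
	return (-1);
}

int
main(void)
{
	struct model_obj ire = { "ire", 0 }, sire = { "sire", 0 };

	(void) model_forward(&ire, &sire, 1);	/* both refs drop to 0 */
	return (0);
}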
@@ -1439,7 +1832,7 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
* if necessary and send it once ready.
*/
- value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE);
+ value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE, NULL);
cleanup:
ire_refrele(ire_cache);
/*
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index 3b8ff6b5d9..d767b25a76 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -44,6 +44,8 @@
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
+#include <sys/cpuvar.h>
+#include <sys/time.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
@@ -62,6 +64,7 @@
#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
+#include <sys/callb.h>
#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
@@ -94,7 +97,8 @@
#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
-#include <sys/mac.h>
+#include <sys/mac_client.h>
+#include <sys/dld.h>
#include <sys/systeminfo.h>
#include <sys/bootconf.h>
@@ -224,25 +228,27 @@ static void ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
-static void ill_capability_proto(ill_t *, int, mblk_t *);
static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
boolean_t);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
-static void ill_capability_mdt_reset(ill_t *, mblk_t **);
+static void ill_capability_mdt_reset_fill(ill_t *, mblk_t *);
static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
-static void ill_capability_ipsec_reset(ill_t *, mblk_t **);
+static void ill_capability_ipsec_reset_fill(ill_t *, mblk_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
-static void ill_capability_hcksum_reset(ill_t *, mblk_t **);
+static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
dl_capability_sub_t *);
-static void ill_capability_zerocopy_reset(ill_t *, mblk_t **);
-static void ill_capability_lso_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
-static void ill_capability_lso_reset(ill_t *, mblk_t **);
-static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
-static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
-static void ill_capability_dls_reset(ill_t *, mblk_t **);
-static void ill_capability_dls_disable(ill_t *);
+static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
+static int ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *,
+ int *);
+static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
+static void ill_capability_dld_ack(ill_t *, mblk_t *,
+ dl_capability_sub_t *);
+static void ill_capability_dld_enable(ill_t *);
+static void ill_capability_ack_thr(void *);
+static void ill_capability_lso_enable(ill_t *);
+static void ill_capability_send(ill_t *, mblk_t *);
static void illgrp_cache_delete(ire_t *, char *);
static void illgrp_delete(ill_t *ill);
@@ -523,16 +529,6 @@ static ipif_t ipif_zero;
*/
uint_t ill_no_arena = 12; /* Setable in /etc/system */
-/*
- * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
- * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is
- * set through platform specific code (Niagara/Ontario).
- */
-#define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \
- (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)
-
-#define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)
-
static uint_t
ipif_rand(ip_stack_t *ipst)
{
@@ -824,12 +820,8 @@ ill_delete_tail(ill_t *ill)
while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
cv_wait(&ill->ill_cv, &ill->ill_lock);
mutex_exit(&ill->ill_lock);
-
- /*
- * Clean up polling and soft ring capabilities
- */
- if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
- ill_capability_dls_disable(ill);
+ ASSERT(!(ill->ill_capabilities &
+ (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));
if (ill->ill_net_type != IRE_LOOPBACK)
qprocsoff(ill->ill_rq);
@@ -879,17 +871,11 @@ ill_delete_tail(ill_t *ill)
ill->ill_lso_capab = NULL;
}
- if (ill->ill_dls_capab != NULL) {
- CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
- ill->ill_dls_capab->ill_unbind_conn = NULL;
- kmem_free(ill->ill_dls_capab,
- sizeof (ill_dls_capab_t) +
- (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
- ill->ill_dls_capab = NULL;
+ if (ill->ill_dld_capab != NULL) {
+ kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
+ ill->ill_dld_capab = NULL;
}
- ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));
-
while (ill->ill_ipif != NULL)
ipif_free_tail(ill->ill_ipif);
@@ -1478,7 +1464,7 @@ conn_ioctl_cleanup(conn_t *connp)
refheld = ill_waiter_inc(ill);
mutex_exit(&connp->conn_lock);
if (refheld) {
- if (ipsq_enter(ill, B_TRUE)) {
+ if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
ill_waiter_dcr(ill);
/*
* Check whether this ioctl has started and is
@@ -1742,104 +1728,114 @@ ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
void
ill_capability_probe(ill_t *ill)
{
+ mblk_t *mp;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
+ ill->ill_dlpi_capab_state != IDCS_FAILED)
+ return;
+
/*
- * Do so only if capabilities are still unknown.
+ * We are starting a new cycle of capability negotiation.
+ * Free up the capab reset messages of any previous incarnation.
+ * We will do a fresh allocation when we get the response to our probe.
*/
- if (ill->ill_dlpi_capab_state != IDS_UNKNOWN)
- return;
+ if (ill->ill_capab_reset_mp != NULL) {
+ freemsg(ill->ill_capab_reset_mp);
+ ill->ill_capab_reset_mp = NULL;
+ }
- ill->ill_dlpi_capab_state = IDS_INPROGRESS;
ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
- ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL);
+
+ mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
+ if (mp == NULL)
+ return;
+
+ ill_capability_send(ill, mp);
+ ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
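/*
 * Illustrative sketch (editor's addition, not part of the patch): the
 * IDCS_* negotiation states this code moves through. The state names
 * come from the patch; the transition functions below are a simplified
 * model, not kernel code.
 */
#include <stdio.h>

enum model_idcs {
	IDCS_UNKNOWN, IDCS_PROBE_SENT, IDCS_OK,
	IDCS_RESET_SENT, IDCS_RENEG, IDCS_FAILED
};

static enum model_idcs
model_probe(enum model_idcs s)
{
	/* ill_capability_probe(): only probe from UNKNOWN or FAILED */
	if (s == IDCS_UNKNOWN || s == IDCS_FAILED)
		return (IDCS_PROBE_SENT);
	return (s);
}

static enum model_idcs
model_reset(enum model_idcs s, int reneg)
{
	/* ill_capability_reset(): only reset from OK */
	if (s != IDCS_OK)
		return (s);
	return (reneg ? IDCS_RENEG : IDCS_RESET_SENT);
}

int
main(void)
{
	enum model_idcs s = IDCS_UNKNOWN;

	s = model_probe(s);	/* -> IDCS_PROBE_SENT */
	s = IDCS_OK;		/* DL_CAPABILITY_ACK processed */
	s = model_reset(s, 0);	/* -> IDCS_RESET_SENT */
	printf("final state %d\n", (int)s);
	return (0);
}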
void
-ill_capability_reset(ill_t *ill)
-{
- mblk_t *sc_mp = NULL;
- mblk_t *tmp;
-
- /*
- * Note here that we reset the state to UNKNOWN, and later send
- * down the DL_CAPABILITY_REQ without first setting the state to
- * INPROGRESS. We do this in order to distinguish the
- * DL_CAPABILITY_ACK response which may come back in response to
- * a "reset" apart from the "probe" DL_CAPABILITY_REQ. This would
- * also handle the case where the driver doesn't send us back
- * a DL_CAPABILITY_ACK in response, since the "probe" routine
- * requires the state to be in UNKNOWN anyway. In any case, all
- * features are turned off until the state reaches IDS_OK.
- */
- ill->ill_dlpi_capab_state = IDS_UNKNOWN;
- ill->ill_capab_reneg = B_FALSE;
-
- /*
- * Disable sub-capabilities and request a list of sub-capability
- * messages which will be sent down to the driver. Each handler
- * allocates the corresponding dl_capability_sub_t inside an
- * mblk, and links it to the existing sc_mp mblk, or return it
- * as sc_mp if it's the first sub-capability (the passed in
- * sc_mp is NULL). Upon returning from all capability handlers,
- * sc_mp will be pulled-up, before passing it downstream.
- */
- ill_capability_mdt_reset(ill, &sc_mp);
- ill_capability_hcksum_reset(ill, &sc_mp);
- ill_capability_zerocopy_reset(ill, &sc_mp);
- ill_capability_ipsec_reset(ill, &sc_mp);
- ill_capability_dls_reset(ill, &sc_mp);
- ill_capability_lso_reset(ill, &sc_mp);
-
- /* Nothing to send down in order to disable the capabilities? */
- if (sc_mp == NULL)
- return;
+ill_capability_reset(ill_t *ill, boolean_t reneg)
+{
+ ASSERT(IAM_WRITER_ILL(ill));
- tmp = msgpullup(sc_mp, -1);
- freemsg(sc_mp);
- if ((sc_mp = tmp) == NULL) {
- cmn_err(CE_WARN, "ill_capability_reset: unable to send down "
- "DL_CAPABILITY_REQ (ENOMEM)\n");
+ if (ill->ill_dlpi_capab_state != IDCS_OK)
return;
- }
- ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n"));
- ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp);
+ ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;
+
+ ill_capability_send(ill, ill->ill_capab_reset_mp);
+ ill->ill_capab_reset_mp = NULL;
+ /*
+ * We turn off all capabilities except the direct function call
+ * capabilities (ILL_CAPAB_DLD*), which are turned off by the
+ * corresponding reset functions.
+ */
+ ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM |
+ ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP);
}
-/*
- * Request or set new-style hardware capabilities supported by DLS provider.
- */
static void
-ill_capability_proto(ill_t *ill, int type, mblk_t *reqp)
+ill_capability_reset_alloc(ill_t *ill)
{
mblk_t *mp;
- dl_capability_req_t *capb;
- size_t size = 0;
- uint8_t *ptr;
+ size_t size = 0;
+ int err;
+ dl_capability_req_t *capb;
- if (reqp != NULL)
- size = MBLKL(reqp);
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(ill->ill_capab_reset_mp == NULL);
- mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type);
- if (mp == NULL) {
- freemsg(reqp);
- return;
+ if (ILL_MDT_CAPABLE(ill))
+ size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t);
+
+ if (ILL_HCKSUM_CAPABLE(ill)) {
+ size += sizeof (dl_capability_sub_t) +
+ sizeof (dl_capab_hcksum_t);
}
- ptr = mp->b_rptr;
- capb = (dl_capability_req_t *)ptr;
- ptr += sizeof (dl_capability_req_t);
+ if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
+ size += sizeof (dl_capability_sub_t) +
+ sizeof (dl_capab_zerocopy_t);
+ }
- if (reqp != NULL) {
- capb->dl_sub_offset = sizeof (dl_capability_req_t);
- capb->dl_sub_length = size;
- bcopy(reqp->b_rptr, ptr, size);
- ptr += size;
- mp->b_cont = reqp->b_cont;
- freeb(reqp);
+ if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) {
+ size += sizeof (dl_capability_sub_t);
+ size += ill_capability_ipsec_reset_size(ill, NULL, NULL,
+ NULL, NULL);
}
- ASSERT(ptr == mp->b_wptr);
- ill_dlpi_send(ill, mp);
+ if (ill->ill_capabilities & ILL_CAPAB_DLD) {
+ size += sizeof (dl_capability_sub_t) +
+ sizeof (dl_capab_dld_t);
+ }
+
+ mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
+ STR_NOSIG, &err);
+
+ mp->b_datap->db_type = M_PROTO;
+ bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));
+
+ capb = (dl_capability_req_t *)mp->b_rptr;
+ capb->dl_primitive = DL_CAPABILITY_REQ;
+ capb->dl_sub_offset = sizeof (dl_capability_req_t);
+ capb->dl_sub_length = size;
+
+ mp->b_wptr += sizeof (dl_capability_req_t);
+
+ /*
+ * Each handler fills in the corresponding dl_capability_sub_t
+ * inside the mblk and advances b_wptr.
+ */
+ ill_capability_mdt_reset_fill(ill, mp);
+ ill_capability_hcksum_reset_fill(ill, mp);
+ ill_capability_zerocopy_reset_fill(ill, mp);
+ ill_capability_ipsec_reset_fill(ill, mp);
+ ill_capability_dld_reset_fill(ill, mp);
+
+ ill->ill_capab_reset_mp = mp;
}
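/*
 * Illustrative sketch (editor's addition, not part of the patch): the
 * size-then-fill pattern used by ill_capability_reset_alloc() -- sum
 * the space each enabled sub-capability needs, allocate one buffer,
 * then let each *_reset_fill() routine append its record and advance
 * the write pointer. The structures below are hypothetical stand-ins
 * for dl_capability_sub_t and the mblk write pointer.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct model_subcap {
	uint16_t cap;
	uint16_t len;
};

struct model_buf {
	uint8_t *base;
	uint8_t *wptr;		/* plays the role of mp->b_wptr */
};

static void
model_fill(struct model_buf *b, uint16_t cap, const void *body, uint16_t len)
{
	struct model_subcap sc = { cap, len };

	memcpy(b->wptr, &sc, sizeof (sc));	/* sub-capability header */
	b->wptr += sizeof (sc);
	memcpy(b->wptr, body, len);		/* sub-capability body */
	b->wptr += len;				/* advance, like b_wptr */
}

int
main(void)
{
	uint32_t mdt = 0, hck = 0;	/* zeroed bodies disable the caps */
	size_t size = 2 * (sizeof (struct model_subcap) + sizeof (uint32_t));
	struct model_buf b;

	if ((b.base = b.wptr = calloc(1, size)) == NULL)
		return (1);		/* one allocation up front */
	model_fill(&b, 1, &mdt, sizeof (mdt));
	model_fill(&b, 2, &hck, sizeof (hck));
	printf("filled %zu of %zu bytes\n", (size_t)(b.wptr - b.base), size);
	free(b.base);
	return (0);
}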
static void
@@ -1944,7 +1940,6 @@ ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
if (*ill_mdt_capab == NULL) {
*ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t),
KM_NOSLEEP);
-
if (*ill_mdt_capab == NULL) {
cmn_err(CE_WARN, "ill_capability_mdt_ack: "
"could not enable MDT version %d "
@@ -2017,42 +2012,22 @@ ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE;
/* nmp points to a DL_CAPABILITY_REQ message to enable MDT */
- ill_dlpi_send(ill, nmp);
+ ill_capability_send(ill, nmp);
}
}
static void
-ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp)
+ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp)
{
- mblk_t *mp;
dl_capab_mdt_t *mdt_subcap;
dl_capability_sub_t *dl_subcap;
- int size;
if (!ILL_MDT_CAPABLE(ill))
return;
ASSERT(ill->ill_mdt_capab != NULL);
- /*
- * Clear the capability flag for MDT but retain the ill_mdt_capab
- * structure since it's possible that another thread is still
- * referring to it. The structure only gets deallocated when
- * we destroy the ill.
- */
- ill->ill_capabilities &= ~ILL_CAPAB_MDT;
-
- size = sizeof (*dl_subcap) + sizeof (*mdt_subcap);
-
- mp = allocb(size, BPRI_HI);
- if (mp == NULL) {
- ip1dbg(("ill_capability_mdt_reset: unable to allocate "
- "request to disable MDT\n"));
- return;
- }
- mp->b_wptr = mp->b_rptr + size;
-
- dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
+ dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
dl_subcap->dl_cap = DL_CAPAB_MDT;
dl_subcap->dl_length = sizeof (*mdt_subcap);
@@ -2062,10 +2037,26 @@ ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp)
mdt_subcap->mdt_hdr_head = 0;
mdt_subcap->mdt_hdr_tail = 0;
- if (*sc_mp != NULL)
- linkb(*sc_mp, mp);
- else
- *sc_mp = mp;
+ mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap);
+}
+
+static void
+ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
+{
+ dl_capability_sub_t *dl_subcap;
+
+ if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
+ return;
+
+ /*
+ * The dl_capab_dld_t that follows the dl_capability_sub_t is not
+ * initialized below since it is not used by DLD.
+ */
+ dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
+ dl_subcap->dl_cap = DL_CAPAB_DLD;
+ dl_subcap->dl_length = sizeof (dl_capab_dld_t);
+
+ mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}
/*
@@ -2371,7 +2362,7 @@ ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
* nmp points to a DL_CAPABILITY_REQ message to enable
* IPsec hardware acceleration.
*/
- ill_dlpi_send(ill, nmp);
+ ill_capability_send(ill, nmp);
if (need_sadb_dump)
/*
@@ -2457,10 +2448,10 @@ ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen,
}
/* ARGSUSED */
-static void
-ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp)
+static int
+ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp,
+ int *esp_cntp, int *esp_lenp)
{
- mblk_t *mp;
ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah;
ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp;
uint64_t ill_capabilities = ill->ill_capabilities;
@@ -2469,7 +2460,7 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp)
int i, size = 0;
if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)))
- return;
+ return (0);
ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH));
ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP));
@@ -2504,18 +2495,32 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp)
}
}
- if (size == 0) {
- ip1dbg(("ill_capability_ipsec_reset: capabilities exist but "
- "there's nothing to reset\n"));
- return;
- }
+ if (ah_cntp != NULL)
+ *ah_cntp = ah_cnt;
+ if (ah_lenp != NULL)
+ *ah_lenp = ah_len;
+ if (esp_cntp != NULL)
+ *esp_cntp = esp_cnt;
+ if (esp_lenp != NULL)
+ *esp_lenp = esp_len;
- mp = allocb(size, BPRI_HI);
- if (mp == NULL) {
- ip1dbg(("ill_capability_ipsec_reset: unable to allocate "
- "request to disable IPSEC Hardware Acceleration\n"));
+ return (size);
+}
+
+/* ARGSUSED */
+static void
+ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp)
+{
+ ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah;
+ ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp;
+ int ah_cnt = 0, esp_cnt = 0;
+ int ah_len = 0, esp_len = 0;
+ int size;
+
+ size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len,
+ &esp_cnt, &esp_len);
+ if (size == 0)
return;
- }
/*
* Clear the capability flags for IPsec HA but retain the ill
@@ -2527,20 +2532,17 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp)
* hardware acceleration, and by clearing them we ensure that new
* outbound IPsec packets are sent down encrypted.
*/
- ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP);
/* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */
if (ah_cnt > 0) {
ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len,
cap_ah, mp);
- ASSERT(mp->b_rptr + size >= mp->b_wptr);
}
/* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */
if (esp_cnt > 0) {
ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len,
cap_esp, mp);
- ASSERT(mp->b_rptr + size >= mp->b_wptr);
}
/*
@@ -2550,11 +2552,6 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp)
* must stop inbound decryption (by destroying all inbound SAs)
* and let the corresponding packets come in encrypted.
*/
-
- if (*sc_mp != NULL)
- linkb(*sc_mp, mp);
- else
- *sc_mp = mp;
}
static void
@@ -2564,15 +2561,6 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
boolean_t legacy = B_FALSE;
/*
- * If this DL_CAPABILITY_ACK came in as a response to our "reset"
- * DL_CAPABILITY_REQ, ignore it during this cycle. We've just
- * instructed the driver to disable its advertised capabilities,
- * so there's no point in accepting any response at this moment.
- */
- if (ill->ill_dlpi_capab_state == IDS_UNKNOWN)
- return;
-
- /*
* Note that only the following two sub-capabilities may be
* considered as "legacy", since their original definitions
* do not incorporate the dl_mid_t module ID token, and hence
@@ -2611,16 +2599,8 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
case DL_CAPAB_ZEROCOPY:
ill_capability_zerocopy_ack(ill, mp, subp);
break;
- case DL_CAPAB_POLL:
- if (!SOFT_RINGS_ENABLED())
- ill_capability_dls_ack(ill, mp, subp);
- break;
- case DL_CAPAB_SOFT_RING:
- if (SOFT_RINGS_ENABLED())
- ill_capability_dls_ack(ill, mp, subp);
- break;
- case DL_CAPAB_LSO:
- ill_capability_lso_ack(ill, mp, subp);
+ case DL_CAPAB_DLD:
+ ill_capability_dld_ack(ill, mp, subp);
break;
default:
ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
@@ -2629,407 +2609,6 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
}
/*
- * As part of negotiating polling capability, the driver tells us
- * the default (or normal) blanking interval and packet threshold
- * (the receive timer fires if blanking interval is reached or
- * the packet threshold is reached).
- *
- * As part of manipulating the polling interval, we always use our
- * estimated interval (avg service time * number of packets queued
- * on the squeue) but we try to blank for a minimum of
- * rr_normal_blank_time * rr_max_blank_ratio. We disable the
- * packet threshold during this time. When we are not in polling mode
- * we set the blank interval typically lower, rr_normal_pkt_cnt *
- * rr_min_blank_ratio but up the packet cnt by a ratio of
- * rr_min_pkt_cnt_ratio so that we are still getting chains if
- * possible although for a shorter interval.
- */
-#define RR_MAX_BLANK_RATIO 20
-#define RR_MIN_BLANK_RATIO 10
-#define RR_MAX_PKT_CNT_RATIO 3
-#define RR_MIN_PKT_CNT_RATIO 3
-
-/*
- * These can be tuned via /etc/system.
- */
-int rr_max_blank_ratio = RR_MAX_BLANK_RATIO;
-int rr_min_blank_ratio = RR_MIN_BLANK_RATIO;
-int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO;
-int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO;
-
-static mac_resource_handle_t
-ill_ring_add(void *arg, mac_resource_t *mrp)
-{
- ill_t *ill = (ill_t *)arg;
- mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp;
- ill_rx_ring_t *rx_ring;
- int ip_rx_index;
-
- ASSERT(mrp != NULL);
- if (mrp->mr_type != MAC_RX_FIFO) {
- return (NULL);
- }
- ASSERT(ill != NULL);
- ASSERT(ill->ill_dls_capab != NULL);
-
- mutex_enter(&ill->ill_lock);
- for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
- rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index];
- ASSERT(rx_ring != NULL);
-
- if (rx_ring->rr_ring_state == ILL_RING_FREE) {
- time_t normal_blank_time =
- mrfp->mrf_normal_blank_time;
- uint_t normal_pkt_cnt =
- mrfp->mrf_normal_pkt_count;
-
- bzero(rx_ring, sizeof (ill_rx_ring_t));
-
- rx_ring->rr_blank = mrfp->mrf_blank;
- rx_ring->rr_handle = mrfp->mrf_arg;
- rx_ring->rr_ill = ill;
- rx_ring->rr_normal_blank_time = normal_blank_time;
- rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt;
-
- rx_ring->rr_max_blank_time =
- normal_blank_time * rr_max_blank_ratio;
- rx_ring->rr_min_blank_time =
- normal_blank_time * rr_min_blank_ratio;
- rx_ring->rr_max_pkt_cnt =
- normal_pkt_cnt * rr_max_pkt_cnt_ratio;
- rx_ring->rr_min_pkt_cnt =
- normal_pkt_cnt * rr_min_pkt_cnt_ratio;
-
- rx_ring->rr_ring_state = ILL_RING_INUSE;
- mutex_exit(&ill->ill_lock);
-
- DTRACE_PROBE2(ill__ring__add, (void *), ill,
- (int), ip_rx_index);
- return ((mac_resource_handle_t)rx_ring);
- }
- }
-
- /*
- * We ran out of ILL_MAX_RINGS worth rx_ring structures. If
- * we have devices which can overwhelm this limit, ILL_MAX_RING
- * should be made configurable. Meanwhile it cause no panic because
- * driver will pass ip_input a NULL handle which will make
- * IP allocate the default squeue and Polling mode will not
- * be used for this ring.
- */
- cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) "
- "for %s\n", ILL_MAX_RINGS, ill->ill_name);
-
- mutex_exit(&ill->ill_lock);
- return (NULL);
-}
-
-static boolean_t
-ill_capability_dls_init(ill_t *ill)
-{
- ill_dls_capab_t *ill_dls = ill->ill_dls_capab;
- conn_t *connp;
- size_t sz;
- ip_stack_t *ipst = ill->ill_ipst;
-
- if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
- if (ill_dls == NULL) {
- cmn_err(CE_PANIC, "ill_capability_dls_init: "
- "soft_ring enabled for ill=%s (%p) but data "
- "structs uninitialized\n", ill->ill_name,
- (void *)ill);
- }
- return (B_TRUE);
- } else if (ill->ill_capabilities & ILL_CAPAB_POLL) {
- if (ill_dls == NULL) {
- cmn_err(CE_PANIC, "ill_capability_dls_init: "
- "polling enabled for ill=%s (%p) but data "
- "structs uninitialized\n", ill->ill_name,
- (void *)ill);
- }
- return (B_TRUE);
- }
-
- if (ill_dls != NULL) {
- ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl;
- /* Soft_Ring or polling is being re-enabled */
-
- connp = ill_dls->ill_unbind_conn;
- ASSERT(rx_ring != NULL);
- bzero((void *)ill_dls, sizeof (ill_dls_capab_t));
- bzero((void *)rx_ring,
- sizeof (ill_rx_ring_t) * ILL_MAX_RINGS);
- ill_dls->ill_ring_tbl = rx_ring;
- ill_dls->ill_unbind_conn = connp;
- return (B_TRUE);
- }
-
- if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
- ipst->ips_netstack)) == NULL)
- return (B_FALSE);
-
- sz = sizeof (ill_dls_capab_t);
- sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS;
-
- ill_dls = kmem_zalloc(sz, KM_NOSLEEP);
- if (ill_dls == NULL) {
- cmn_err(CE_WARN, "ill_capability_dls_init: could not "
- "allocate dls_capab for %s (%p)\n", ill->ill_name,
- (void *)ill);
- CONN_DEC_REF(connp);
- return (B_FALSE);
- }
-
- /* Allocate space to hold ring table */
- ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1];
- ill->ill_dls_capab = ill_dls;
- ill_dls->ill_unbind_conn = connp;
- return (B_TRUE);
-}
-
-/*
- * ill_capability_dls_disable: disable soft_ring and/or polling
- * capability. Since any of the rings might already be in use, need
- * to call ip_squeue_clean_all() which gets behind the squeue to disable
- * direct calls if necessary.
- */
-static void
-ill_capability_dls_disable(ill_t *ill)
-{
- ill_dls_capab_t *ill_dls = ill->ill_dls_capab;
-
- if (ill->ill_capabilities & ILL_CAPAB_DLS) {
- ip_squeue_clean_all(ill);
- ill_dls->ill_tx = NULL;
- ill_dls->ill_tx_handle = NULL;
- ill_dls->ill_dls_change_status = NULL;
- ill_dls->ill_dls_bind = NULL;
- ill_dls->ill_dls_unbind = NULL;
- }
-
- ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS));
-}
-
-static void
-ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls,
- dl_capability_sub_t *isub)
-{
- uint_t size;
- uchar_t *rptr;
- dl_capab_dls_t dls, *odls;
- ill_dls_capab_t *ill_dls;
- mblk_t *nmp = NULL;
- dl_capability_req_t *ocap;
- uint_t sub_dl_cap = isub->dl_cap;
-
- if (!ill_capability_dls_init(ill))
- return;
- ill_dls = ill->ill_dls_capab;
-
- /* Copy locally to get the members aligned */
- bcopy((void *)idls, (void *)&dls,
- sizeof (dl_capab_dls_t));
-
- /* Get the tx function and handle from dld */
- ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx;
- ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle;
-
- if (sub_dl_cap == DL_CAPAB_SOFT_RING) {
- ill_dls->ill_dls_change_status =
- (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status;
- ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind;
- ill_dls->ill_dls_unbind =
- (ip_dls_unbind_t)dls.dls_ring_unbind;
- ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt;
- }
-
- size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) +
- isub->dl_length;
-
- if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
- cmn_err(CE_WARN, "ill_capability_dls_capable: could "
- "not allocate memory for CAPAB_REQ for %s (%p)\n",
- ill->ill_name, (void *)ill);
- return;
- }
-
- /* initialize dl_capability_req_t */
- rptr = nmp->b_rptr;
- ocap = (dl_capability_req_t *)rptr;
- ocap->dl_sub_offset = sizeof (dl_capability_req_t);
- ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length;
- rptr += sizeof (dl_capability_req_t);
-
- /* initialize dl_capability_sub_t */
- bcopy(isub, rptr, sizeof (*isub));
- rptr += sizeof (*isub);
-
- odls = (dl_capab_dls_t *)rptr;
- rptr += sizeof (dl_capab_dls_t);
-
- /* initialize dl_capab_dls_t to be sent down */
- dls.dls_rx_handle = (uintptr_t)ill;
- dls.dls_rx = (uintptr_t)ip_input;
- dls.dls_ring_add = (uintptr_t)ill_ring_add;
-
- if (sub_dl_cap == DL_CAPAB_SOFT_RING) {
- dls.dls_ring_cnt = ip_soft_rings_cnt;
- dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment;
- dls.dls_flags = SOFT_RING_ENABLE;
- } else {
- dls.dls_flags = POLL_ENABLE;
- ip1dbg(("ill_capability_dls_capable: asking interface %s "
- "to enable polling\n", ill->ill_name));
- }
- bcopy((void *)&dls, (void *)odls,
- sizeof (dl_capab_dls_t));
- ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
- /*
- * nmp points to a DL_CAPABILITY_REQ message to
- * enable either soft_ring or polling
- */
- ill_dlpi_send(ill, nmp);
-}
-
-static void
-ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp)
-{
- mblk_t *mp;
- dl_capab_dls_t *idls;
- dl_capability_sub_t *dl_subcap;
- int size;
-
- if (!(ill->ill_capabilities & ILL_CAPAB_DLS))
- return;
-
- ASSERT(ill->ill_dls_capab != NULL);
-
- size = sizeof (*dl_subcap) + sizeof (*idls);
-
- mp = allocb(size, BPRI_HI);
- if (mp == NULL) {
- ip1dbg(("ill_capability_dls_reset: unable to allocate "
- "request to disable soft_ring\n"));
- return;
- }
-
- mp->b_wptr = mp->b_rptr + size;
-
- dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
- dl_subcap->dl_length = sizeof (*idls);
- if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
- dl_subcap->dl_cap = DL_CAPAB_SOFT_RING;
- else
- dl_subcap->dl_cap = DL_CAPAB_POLL;
-
- idls = (dl_capab_dls_t *)(dl_subcap + 1);
- if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
- idls->dls_flags = SOFT_RING_DISABLE;
- else
- idls->dls_flags = POLL_DISABLE;
-
- if (*sc_mp != NULL)
- linkb(*sc_mp, mp);
- else
- *sc_mp = mp;
-}
-
-/*
- * Process a soft_ring/poll capability negotiation ack received
- * from a DLS Provider.isub must point to the sub-capability
- * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message.
- */
-static void
-ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
-{
- dl_capab_dls_t *idls;
- uint_t sub_dl_cap = isub->dl_cap;
- uint8_t *capend;
-
- ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING ||
- sub_dl_cap == DL_CAPAB_POLL);
-
- if (ill->ill_isv6)
- return;
-
- /*
- * Note: range checks here are not absolutely sufficient to
- * make us robust against malformed messages sent by drivers;
- * this is in keeping with the rest of IP's dlpi handling.
- * (Remember, it's coming from something else in the kernel
- * address space)
- */
- capend = (uint8_t *)(isub + 1) + isub->dl_length;
- if (capend > mp->b_wptr) {
- cmn_err(CE_WARN, "ill_capability_dls_ack: "
- "malformed sub-capability too long for mblk");
- return;
- }
-
- /*
- * There are two types of acks we process here:
- * 1. acks in reply to a (first form) generic capability req
- * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE)
- * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE
- * capability req.
- */
- idls = (dl_capab_dls_t *)(isub + 1);
-
- if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) {
- ip1dbg(("ill_capability_dls_ack: mid token for dls "
- "capability isn't as expected; pass-thru "
- "module(s) detected, discarding capability\n"));
- if (ill->ill_capabilities & ILL_CAPAB_DLS) {
- /*
- * This is a capability renegotitation case.
- * The interface better be unusable at this
- * point other wise bad things will happen
- * if we disable direct calls on a running
- * and up interface.
- */
- ill_capability_dls_disable(ill);
- }
- return;
- }
-
- switch (idls->dls_flags) {
- default:
- /* Disable if unknown flag */
- case SOFT_RING_DISABLE:
- case POLL_DISABLE:
- ill_capability_dls_disable(ill);
- break;
- case SOFT_RING_CAPABLE:
- case POLL_CAPABLE:
- /*
- * If the capability was already enabled, its safe
- * to disable it first to get rid of stale information
- * and then start enabling it again.
- */
- ill_capability_dls_disable(ill);
- ill_capability_dls_capable(ill, idls, isub);
- break;
- case SOFT_RING_ENABLE:
- case POLL_ENABLE:
- mutex_enter(&ill->ill_lock);
- if (sub_dl_cap == DL_CAPAB_SOFT_RING &&
- !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) {
- ASSERT(ill->ill_dls_capab != NULL);
- ill->ill_capabilities |= ILL_CAPAB_SOFT_RING;
- }
- if (sub_dl_cap == DL_CAPAB_POLL &&
- !(ill->ill_capabilities & ILL_CAPAB_POLL)) {
- ASSERT(ill->ill_dls_capab != NULL);
- ill->ill_capabilities |= ILL_CAPAB_POLL;
- ip1dbg(("ill_capability_dls_ack: interface %s "
- "has enabled polling\n", ill->ill_name));
- }
- mutex_exit(&ill->ill_lock);
- break;
- }
-}
-
-/*
* Process a hardware checksum offload capability negotiation ack received
* from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM)
* of a DL_CAPABILITY_ACK message.
@@ -3164,7 +2743,7 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
* nmp points to a DL_CAPABILITY_REQ message to enable
* hardware checksum acceleration.
*/
- ill_dlpi_send(ill, nmp);
+ ill_capability_send(ill, nmp);
} else {
ip1dbg(("ill_capability_hcksum_ack: interface %s has "
"advertised %x hardware checksum capability flags\n",
@@ -3173,37 +2752,17 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
}
static void
-ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp)
+ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
- mblk_t *mp;
dl_capab_hcksum_t *hck_subcap;
dl_capability_sub_t *dl_subcap;
- int size;
if (!ILL_HCKSUM_CAPABLE(ill))
return;
ASSERT(ill->ill_hcksum_capab != NULL);
- /*
- * Clear the capability flag for hardware checksum offload but
- * retain the ill_hcksum_capab structure since it's possible that
- * another thread is still referring to it. The structure only
- * gets deallocated when we destroy the ill.
- */
- ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM;
- size = sizeof (*dl_subcap) + sizeof (*hck_subcap);
-
- mp = allocb(size, BPRI_HI);
- if (mp == NULL) {
- ip1dbg(("ill_capability_hcksum_reset: unable to allocate "
- "request to disable hardware checksum offload\n"));
- return;
- }
-
- mp->b_wptr = mp->b_rptr + size;
-
- dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
+ dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
dl_subcap->dl_length = sizeof (*hck_subcap);
@@ -3211,10 +2770,7 @@ ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp)
hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
hck_subcap->hcksum_txflags = 0;
- if (*sc_mp != NULL)
- linkb(*sc_mp, mp);
- else
- *sc_mp = mp;
+ mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}
static void
@@ -3325,42 +2881,22 @@ ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;
/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
- ill_dlpi_send(ill, nmp);
+ ill_capability_send(ill, nmp);
}
}
static void
-ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp)
+ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
{
- mblk_t *mp;
dl_capab_zerocopy_t *zerocopy_subcap;
dl_capability_sub_t *dl_subcap;
- int size;
if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
return;
ASSERT(ill->ill_zerocopy_capab != NULL);
- /*
- * Clear the capability flag for Zero-copy but retain the
- * ill_zerocopy_capab structure since it's possible that another
- * thread is still referring to it. The structure only gets
- * deallocated when we destroy the ill.
- */
- ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY;
-
- size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
-
- mp = allocb(size, BPRI_HI);
- if (mp == NULL) {
- ip1dbg(("ill_capability_zerocopy_reset: unable to allocate "
- "request to disable Zero-copy\n"));
- return;
- }
- mp->b_wptr = mp->b_rptr + size;
-
- dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
+ dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
dl_subcap->dl_length = sizeof (*zerocopy_subcap);
@@ -3369,30 +2905,24 @@ ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp)
ill->ill_zerocopy_capab->ill_zerocopy_version;
zerocopy_subcap->zerocopy_flags = 0;
- if (*sc_mp != NULL)
- linkb(*sc_mp, mp);
- else
- *sc_mp = mp;
+ mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
}
/*
- * Process Large Segment Offload capability negotiation ack received from a
- * DLS Provider. isub must point to the sub-capability (DL_CAPAB_LSO) of a
- * DL_CAPABILITY_ACK message.
+ * DLD capability
+ * Refer to dld.h for more information regarding the purpose and usage
+ * of this capability.
*/
static void
-ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
+ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
- mblk_t *nmp = NULL;
- dl_capability_req_t *oc;
- dl_capab_lso_t *lso_ic, *lso_oc;
- ill_lso_capab_t **ill_lso_capab;
- uint_t sub_dl_cap = isub->dl_cap;
- uint8_t *capend;
-
- ASSERT(sub_dl_cap == DL_CAPAB_LSO);
+ dl_capab_dld_t *dld_ic, dld;
+ uint_t sub_dl_cap = isub->dl_cap;
+ uint8_t *capend;
+ ill_dld_capab_t *idc;
- ill_lso_capab = (ill_lso_capab_t **)&ill->ill_lso_capab;
+ ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(sub_dl_cap == DL_CAPAB_DLD);
/*
* Note: range checks here are not absolutely sufficient to
@@ -3403,165 +2933,395 @@ ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
*/
capend = (uint8_t *)(isub + 1) + isub->dl_length;
if (capend > mp->b_wptr) {
- cmn_err(CE_WARN, "ill_capability_lso_ack: "
+ cmn_err(CE_WARN, "ill_capability_dld_ack: "
"malformed sub-capability too long for mblk");
return;
}
-
- lso_ic = (dl_capab_lso_t *)(isub + 1);
-
- if (lso_ic->lso_version != LSO_VERSION_1) {
- cmn_err(CE_CONT, "ill_capability_lso_ack: "
- "unsupported LSO sub-capability (version %d, expected %d)",
- lso_ic->lso_version, LSO_VERSION_1);
+ dld_ic = (dl_capab_dld_t *)(isub + 1);
+ if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
+ cmn_err(CE_CONT, "ill_capability_dld_ack: "
+ "unsupported DLD sub-capability (version %d, "
+ "expected %d)", dld_ic->dld_version,
+ DLD_CURRENT_VERSION);
return;
}
-
- if (!dlcapabcheckqid(&lso_ic->lso_mid, ill->ill_lmod_rq)) {
- ip1dbg(("ill_capability_lso_ack: mid token for LSO "
+ if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
+ ip1dbg(("ill_capability_dld_ack: mid token for dld "
"capability isn't as expected; pass-thru module(s) "
"detected, discarding capability\n"));
return;
}
- if ((lso_ic->lso_flags & LSO_TX_ENABLE) &&
- (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4)) {
- if (*ill_lso_capab == NULL) {
- *ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
- KM_NOSLEEP);
+ /*
+ * Copy locally to ensure alignment.
+ */
+ bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
- if (*ill_lso_capab == NULL) {
- cmn_err(CE_WARN, "ill_capability_lso_ack: "
- "could not enable LSO version %d "
- "for %s (ENOMEM)\n", LSO_VERSION_1,
- ill->ill_name);
- return;
- }
+ if ((idc = ill->ill_dld_capab) == NULL) {
+ idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
+ if (idc == NULL) {
+ cmn_err(CE_WARN, "ill_capability_dld_ack: "
+ "could not enable DLD version %d "
+ "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
+ ill->ill_name);
+ return;
}
+ idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
+ idc->idc_capab_dh = (void *)dld.dld_capab_handle;
+ ill->ill_dld_capab = idc;
+ }
+ ip1dbg(("ill_capability_dld_ack: interface %s "
+ "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
- (*ill_lso_capab)->ill_lso_version = lso_ic->lso_version;
- (*ill_lso_capab)->ill_lso_flags = lso_ic->lso_flags;
- (*ill_lso_capab)->ill_lso_max = lso_ic->lso_max;
- ill->ill_capabilities |= ILL_CAPAB_LSO;
+ ill_capability_dld_enable(ill);
+}
- ip1dbg(("ill_capability_lso_ack: interface %s "
- "has enabled LSO\n ", ill->ill_name));
- } else if (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4) {
- uint_t size;
- uchar_t *rptr;
+/*
+ * Typically, capability negotiation between IP and the driver happens via
+ * DLPI message exchange. However, GLD also offers a direct function call
+ * mechanism to exchange the DLD_CAPAB_DIRECT and DLD_CAPAB_POLL
+ * capabilities. Arbitrary function calls into IP or GLD are not permitted,
+ * since both of them are protected by their own perimeter mechanism. The
+ * perimeter can be viewed as a coarse lock or serialization mechanism. The
+ * hierarchy of these perimeters is IP -> MAC. Thus, for example, to enable
+ * squeue polling, IP needs to enter its perimeter, then call
+ * ill_mac_perim_enter to enter the mac perimeter, and then make the direct
+ * function calls into GLD to enable squeue polling. The ring-related
+ * callbacks from the mac into the stack to add, bind, quiesce, restart or
+ * clean up a ring are all protected by the mac perimeter.
+ */
+static void
+ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
+{
+ ill_dld_capab_t *idc = ill->ill_dld_capab;
+ int err;
- size = sizeof (dl_capability_req_t) +
- sizeof (dl_capability_sub_t) + sizeof (dl_capab_lso_t);
+ err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
+ DLD_ENABLE);
+ ASSERT(err == 0);
+}
- if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
- cmn_err(CE_WARN, "ill_capability_lso_ack: "
+static void
+ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
+{
+ ill_dld_capab_t *idc = ill->ill_dld_capab;
+ int err;
+
+ err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
+ DLD_DISABLE);
+ ASSERT(err == 0);
+}
+
+boolean_t
+ill_mac_perim_held(ill_t *ill)
+{
+ ill_dld_capab_t *idc = ill->ill_dld_capab;
+
+ return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
+ DLD_QUERY));
+}
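
The three helpers above reduce to a simple bracket pattern around direct calls into GLD: enter the mac perimeter, make the capability call, exit. The sketch below is a hedged user-land analogue, not kernel code; a pthread mutex stands in for the mac perimeter, and dld_capab_call() is a hypothetical stand-in for the idc_capab_df function pointer.

	#include <pthread.h>
	#include <stdio.h>

	/* A mutex models the mac perimeter (assumption, not the real API). */
	static pthread_mutex_t mac_perim = PTHREAD_MUTEX_INITIALIZER;

	static void mac_perim_enter(void) { pthread_mutex_lock(&mac_perim); }
	static void mac_perim_exit(void)  { pthread_mutex_unlock(&mac_perim); }

	/* Models a direct capability call into GLD. */
	static int
	dld_capab_call(const char *capab, int enable)
	{
		printf("%s %s under the mac perimeter\n",
		    enable ? "enabling" : "disabling", capab);
		return (0);
	}

	int
	main(void)
	{
		/*
		 * The IP perimeter would already be held here; direct calls
		 * into GLD are made only while the mac perimeter is also
		 * held, respecting the IP -> MAC hierarchy described above.
		 */
		mac_perim_enter();
		(void) dld_capab_call("DLD_CAPAB_POLL", 1);
		mac_perim_exit();
		return (0);
	}
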
+
+static void
+ill_capability_direct_enable(ill_t *ill)
+{
+ ill_dld_capab_t *idc = ill->ill_dld_capab;
+ ill_dld_direct_t *idd = &idc->idc_direct;
+ dld_capab_direct_t direct;
+ int rc;
+
+ ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
+
+ bzero(&direct, sizeof (direct));
+ direct.di_rx_cf = (uintptr_t)ip_input;
+ direct.di_rx_ch = ill;
+
+ rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
+ DLD_ENABLE);
+ if (rc == 0) {
+ idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
+ idd->idd_tx_dh = direct.di_tx_dh;
+ idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
+ idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
+ /*
+		 * One-time registration of the flow enable callback function.
+ */
+ ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
+ ill_flow_enable, ill);
+ ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
+ DTRACE_PROBE1(direct_on, (ill_t *), ill);
+ } else {
+ cmn_err(CE_WARN, "warning: could not enable DIRECT "
+ "capability, rc = %d\n", rc);
+ DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
+ }
+}
+
+static void
+ill_capability_poll_enable(ill_t *ill)
+{
+ ill_dld_capab_t *idc = ill->ill_dld_capab;
+ dld_capab_poll_t poll;
+ int rc;
+
+ ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
+
+ bzero(&poll, sizeof (poll));
+ poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
+ poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
+ poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
+ poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
+ poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
+ poll.poll_ring_ch = ill;
+ rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
+ DLD_ENABLE);
+ if (rc == 0) {
+ ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
+ DTRACE_PROBE1(poll_on, (ill_t *), ill);
+ } else {
+ ip1dbg(("warning: could not enable POLL "
+ "capability, rc = %d\n", rc));
+ DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
+ }
+}
+
+/*
+ * Enable the LSO capability.
+ */
+static void
+ill_capability_lso_enable(ill_t *ill)
+{
+ ill_dld_capab_t *idc = ill->ill_dld_capab;
+ dld_capab_lso_t lso;
+ int rc;
+
+ ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
+
+ if (ill->ill_lso_capab == NULL) {
+ ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
+ KM_NOSLEEP);
+ if (ill->ill_lso_capab == NULL) {
+ cmn_err(CE_WARN, "ill_capability_lso_enable: "
"could not enable LSO for %s (ENOMEM)\n",
ill->ill_name);
return;
}
+ }
- rptr = nmp->b_rptr;
- /* initialize dl_capability_req_t */
- oc = (dl_capability_req_t *)nmp->b_rptr;
- oc->dl_sub_offset = sizeof (dl_capability_req_t);
- oc->dl_sub_length = sizeof (dl_capability_sub_t) +
- sizeof (dl_capab_lso_t);
- nmp->b_rptr += sizeof (dl_capability_req_t);
-
- /* initialize dl_capability_sub_t */
- bcopy(isub, nmp->b_rptr, sizeof (*isub));
- nmp->b_rptr += sizeof (*isub);
+ bzero(&lso, sizeof (lso));
+ if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
+ DLD_ENABLE)) == 0) {
+ ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
+ ill->ill_lso_capab->ill_lso_max = lso.lso_max;
+ ill->ill_capabilities |= ILL_CAPAB_DLD_LSO;
+ ip1dbg(("ill_capability_lso_enable: interface %s "
+ "has enabled LSO\n ", ill->ill_name));
+ } else {
+ kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
+ ill->ill_lso_capab = NULL;
+ DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
+ }
+}
- /* initialize dl_capab_lso_t */
- lso_oc = (dl_capab_lso_t *)nmp->b_rptr;
- bcopy(lso_ic, lso_oc, sizeof (*lso_ic));
+static void
+ill_capability_dld_enable(ill_t *ill)
+{
+ mac_perim_handle_t mph;
- nmp->b_rptr = rptr;
- ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
+ ASSERT(IAM_WRITER_ILL(ill));
- /* set ENABLE flag */
- lso_oc->lso_flags |= LSO_TX_ENABLE;
+ if (ill->ill_isv6)
+ return;
- /* nmp points to a DL_CAPABILITY_REQ message to enable LSO */
- ill_dlpi_send(ill, nmp);
- } else {
- ip1dbg(("ill_capability_lso_ack: interface %s has "
- "advertised %x LSO capability flags\n",
- ill->ill_name, lso_ic->lso_flags));
+ ill_mac_perim_enter(ill, &mph);
+ if (!ill->ill_isv6) {
+ ill_capability_direct_enable(ill);
+ ill_capability_poll_enable(ill);
+ ill_capability_lso_enable(ill);
}
+ ill->ill_capabilities |= ILL_CAPAB_DLD;
+ ill_mac_perim_exit(ill, mph);
}
static void
-ill_capability_lso_reset(ill_t *ill, mblk_t **sc_mp)
+ill_capability_dld_disable(ill_t *ill)
{
- mblk_t *mp;
- dl_capab_lso_t *lso_subcap;
- dl_capability_sub_t *dl_subcap;
- int size;
+ ill_dld_capab_t *idc;
+ ill_dld_direct_t *idd;
+ mac_perim_handle_t mph;
- if (!(ill->ill_capabilities & ILL_CAPAB_LSO))
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
return;
- ASSERT(ill->ill_lso_capab != NULL);
- /*
- * Clear the capability flag for LSO but retain the
- * ill_lso_capab structure since it's possible that another
- * thread is still referring to it. The structure only gets
- * deallocated when we destroy the ill.
- */
- ill->ill_capabilities &= ~ILL_CAPAB_LSO;
+ ill_mac_perim_enter(ill, &mph);
+
+ idc = ill->ill_dld_capab;
+ if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
+ /*
+ * For performance we avoid locks in the transmit data path
+ * and don't maintain a count of the number of threads using
+ * direct calls. Thus some threads could be using direct
+ * transmit calls to GLD, even after the capability mechanism
+ * turns it off. This is still safe since the handles used in
+ * the direct calls continue to be valid until the unplumb is
+ * completed. Remove the callback that was added (1-time) at
+ * capab enable time.
+ */
+ mutex_enter(&ill->ill_lock);
+ ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
+ mutex_exit(&ill->ill_lock);
+ if (ill->ill_flownotify_mh != NULL) {
+ idd = &idc->idc_direct;
+ idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
+ ill->ill_flownotify_mh);
+ ill->ill_flownotify_mh = NULL;
+ }
+ (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
+ NULL, DLD_DISABLE);
+ }
- size = sizeof (*dl_subcap) + sizeof (*lso_subcap);
+ if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
+ ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
+ ip_squeue_clean_all(ill);
+ (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
+ NULL, DLD_DISABLE);
+ }
- mp = allocb(size, BPRI_HI);
- if (mp == NULL) {
- ip1dbg(("ill_capability_lso_reset: unable to allocate "
- "request to disable LSO\n"));
- return;
+ if ((ill->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) {
+ ASSERT(ill->ill_lso_capab != NULL);
+ /*
+ * Clear the capability flag for LSO but retain the
+ * ill_lso_capab structure since it's possible that another
+ * thread is still referring to it. The structure only gets
+ * deallocated when we destroy the ill.
+ */
+
+ ill->ill_capabilities &= ~ILL_CAPAB_DLD_LSO;
+ (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
+ NULL, DLD_DISABLE);
}
- mp->b_wptr = mp->b_rptr + size;
+ ill->ill_capabilities &= ~ILL_CAPAB_DLD;
+ ill_mac_perim_exit(ill, mph);
+}
- dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
- dl_subcap->dl_cap = DL_CAPAB_LSO;
- dl_subcap->dl_length = sizeof (*lso_subcap);
+/*
+ * Capability Negotiation protocol
+ *
+ * We don't wait for DLPI capability operations to finish during interface
+ * bringup or teardown. Doing so would introduce more asynchrony and the
+ * interface up/down operations would need multiple returns and restarts.
+ * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
+ * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
+ * exclusive operation won't start until the DLPI operations of the previous
+ * exclusive operation complete.
+ *
+ * The capability state machine is shown below.
+ *
+ * state next state event, action
+ *
+ * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
+ * IDCS_PROBE_SENT IDCS_OK ill_capability_ack
+ * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
+ * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
+ * IDCS_OK IDCS_RESET_SENT ill_capability_reset
+ * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
+ * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
+ * ill_capability_probe.
+ */
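
As an illustration only, the transition table above can be modeled as a small pure function. The IDCS_* names mirror the states listed; the event labels are hypothetical shorthand for the actions in the rightmost column, not identifiers from the source.

	#include <stdio.h>

	/* States mirroring the IDCS_* names in the table above. */
	typedef enum {
		IDCS_UNKNOWN, IDCS_PROBE_SENT, IDCS_OK, IDCS_FAILED,
		IDCS_RENEG, IDCS_RESET_SENT
	} idcs_t;

	/* Hypothetical event labels for the transitions listed above. */
	typedef enum {
		EV_PROBE, EV_ACK, EV_NACK, EV_CAPAB_RENEG, EV_RESET,
		EV_ACK_THR
	} idcs_event_t;

	static idcs_t
	idcs_next(idcs_t s, idcs_event_t e)
	{
		switch (s) {
		case IDCS_UNKNOWN:
			return (e == EV_PROBE ? IDCS_PROBE_SENT : s);
		case IDCS_PROBE_SENT:
			if (e == EV_ACK)
				return (IDCS_OK);
			if (e == EV_NACK)
				return (IDCS_FAILED);
			return (s);
		case IDCS_OK:
			if (e == EV_CAPAB_RENEG)
				return (IDCS_RENEG);
			if (e == EV_RESET)
				return (IDCS_RESET_SENT);
			return (s);
		case IDCS_RESET_SENT:
			return (e == EV_ACK_THR ? IDCS_UNKNOWN : s);
		case IDCS_RENEG:
			return (e == EV_ACK_THR ? IDCS_PROBE_SENT : s);
		default:
			return (s);
		}
	}

	int
	main(void)
	{
		idcs_t s = IDCS_UNKNOWN;

		s = idcs_next(s, EV_PROBE);	/* -> IDCS_PROBE_SENT */
		s = idcs_next(s, EV_ACK);	/* -> IDCS_OK */
		printf("final state: %d\n", (int)s);
		return (0);
	}
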
+
+/*
+ * Dedicated thread started from ip_stack_init that handles capability
+ * disable. This thread ensures the taskq dispatch does not fail by waiting
+ * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
+ * that direct calls to DLD are done in a cv_waitable context.
+ */
+void
+ill_taskq_dispatch(ip_stack_t *ipst)
+{
+ callb_cpr_t cprinfo;
+ char name[64];
+ mblk_t *mp;
- lso_subcap = (dl_capab_lso_t *)(dl_subcap + 1);
- lso_subcap->lso_version = ill->ill_lso_capab->ill_lso_version;
- lso_subcap->lso_flags = 0;
+ (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
+ ipst->ips_netstack->netstack_stackid);
+ CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
+ name);
+ mutex_enter(&ipst->ips_capab_taskq_lock);
- if (*sc_mp != NULL)
- linkb(*sc_mp, mp);
- else
- *sc_mp = mp;
+ for (;;) {
+ mp = list_head(&ipst->ips_capab_taskq_list);
+ while (mp != NULL) {
+ list_remove(&ipst->ips_capab_taskq_list, mp);
+ mutex_exit(&ipst->ips_capab_taskq_lock);
+ VERIFY(taskq_dispatch(system_taskq,
+ ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
+ mutex_enter(&ipst->ips_capab_taskq_lock);
+ mp = list_head(&ipst->ips_capab_taskq_list);
+ }
+
+ if (ipst->ips_capab_taskq_quit)
+ break;
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
+ }
+ VERIFY(list_head(&ipst->ips_capab_taskq_list) == NULL);
+ CALLB_CPR_EXIT(&cprinfo);
+ thread_exit();
}
/*
* Consume a new-style hardware capabilities negotiation ack.
- * Called from ip_rput_dlpi_writer().
+ * Called via taskq on receipt of DL_CAPABILITY_ACK.
*/
-void
-ill_capability_ack(ill_t *ill, mblk_t *mp)
+static void
+ill_capability_ack_thr(void *arg)
{
+ mblk_t *mp = arg;
dl_capability_ack_t *capp;
dl_capability_sub_t *subp, *endp;
+ ill_t *ill;
+ boolean_t reneg;
- if (ill->ill_dlpi_capab_state == IDS_INPROGRESS)
- ill->ill_dlpi_capab_state = IDS_OK;
+ ill = (ill_t *)mp->b_prev;
+ VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
+
+ if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
+ ill->ill_dlpi_capab_state == IDCS_RENEG) {
+ /*
+ * We have received the ack for our DL_CAPAB reset request.
+		 * There isn't anything in the message that needs processing.
+ * All message based capabilities have been disabled, now
+ * do the function call based capability disable.
+ */
+ reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
+ ill_capability_dld_disable(ill);
+ ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
+ if (reneg)
+ ill_capability_probe(ill);
+ goto done;
+ }
+
+ if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
+ ill->ill_dlpi_capab_state = IDCS_OK;
capp = (dl_capability_ack_t *)mp->b_rptr;
- if (capp->dl_sub_length == 0)
+ if (capp->dl_sub_length == 0) {
/* no new-style capabilities */
- return;
+ goto done;
+ }
/* make sure the driver supplied correct dl_sub_length */
if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
"invalid dl_sub_length (%d)\n", capp->dl_sub_length));
- return;
+ goto done;
}
+
#define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
/*
* There are sub-capabilities. Process the ones we know about.
@@ -3582,6 +3342,34 @@ ill_capability_ack(ill_t *ill, mblk_t *mp)
}
}
#undef SC
+done:
+ inet_freemsg(mp);
+ ill_capability_done(ill);
+ ipsq_exit(ill->ill_phyint->phyint_ipsq);
+}
+
+/*
+ * This needs to be started in a taskq thread to provide a cv_waitable
+ * context.
+ */
+void
+ill_capability_ack(ill_t *ill, mblk_t *mp)
+{
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ mp->b_prev = (mblk_t *)ill;
+ if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
+ TQ_NOSLEEP) != 0)
+ return;
+
+ /*
+ * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
+ * which will do the dispatch using TQ_SLEEP to guarantee success.
+ */
+ mutex_enter(&ipst->ips_capab_taskq_lock);
+ list_insert_tail(&ipst->ips_capab_taskq_list, mp);
+ cv_signal(&ipst->ips_capab_taskq_cv);
+ mutex_exit(&ipst->ips_capab_taskq_lock);
}
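
The fallback logic amounts to: try a non-blocking dispatch, and if that fails hand the work to the dedicated thread that is allowed to block. Below is a minimal sketch of that control flow; try_dispatch() and enqueue_for_helper() are hypothetical stand-ins for taskq_dispatch() with TQ_NOSLEEP (made to fail here so the fallback path is exercised) and for queuing the message to the ill_taskq_dispatch thread, which retries with TQ_SLEEP and cannot fail.

	#include <stdbool.h>
	#include <stdio.h>

	static void
	ack_handler(void *arg)
	{
		printf("processing ack %d\n", *(int *)arg);
	}

	static bool
	try_dispatch(void (*fn)(void *), void *arg)
	{
		(void) fn;
		(void) arg;
		return (false);		/* simulate memory pressure */
	}

	static void
	enqueue_for_helper(void (*fn)(void *), void *arg)
	{
		fn(arg);	/* the helper thread would run this later */
	}

	static void
	capability_ack(void *msg)
	{
		if (try_dispatch(ack_handler, msg))
			return;
		/* Fall back to the helper thread, which waits for resources. */
		enqueue_for_helper(ack_handler, msg);
	}

	int
	main(void)
	{
		int msg = 42;

		capability_ack(&msg);
		return (0);
	}
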
/*
@@ -7609,7 +7397,7 @@ ipsq_dq(ipsq_t *ipsq)
*/
#define ENTER_SQ_WAIT_TICKS 100
boolean_t
-ipsq_enter(ill_t *ill, boolean_t force)
+ipsq_enter(ill_t *ill, boolean_t force, int type)
{
ipsq_t *ipsq;
boolean_t waited_enough = B_FALSE;
@@ -7630,7 +7418,8 @@ ipsq_enter(ill_t *ill, boolean_t force)
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
if (ipsq->ipsq_writer == NULL &&
- (ipsq->ipsq_current_ipif == NULL || waited_enough)) {
+ (type == CUR_OP || ipsq->ipsq_current_ipif == NULL ||
+ waited_enough)) {
break;
} else if (ipsq->ipsq_writer != NULL) {
mutex_exit(&ipsq->ipsq_lock);
@@ -7661,6 +7450,18 @@ ipsq_enter(ill_t *ill, boolean_t force)
return (B_TRUE);
}
+boolean_t
+ill_perim_enter(ill_t *ill)
+{
+ return (ipsq_enter(ill, B_FALSE, CUR_OP));
+}
+
+void
+ill_perim_exit(ill_t *ill)
+{
+ ipsq_exit(ill->ill_phyint->phyint_ipsq);
+}
+
/*
* The ipsq_t (ipsq) is the synchronization data structure used to serialize
* certain critical operations like plumbing (i.e. most set ioctls),
@@ -9984,6 +9785,13 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
ill->ill_ip_muxid = islink ? li->l_index : 0;
/*
+ * Mark the ipsq busy until the capability operations initiated below
+ * complete. The PLINK/UNLINK ioctl itself completes when our caller
+ * returns, but the capability operation may complete asynchronously
+ * much later.
+ */
+ ipsq_current_start(ipsq, ill->ill_ipif, ioccmd);
+ /*
* If there's at least one up ipif on this ill, then we're bound to
* the underlying driver via DLPI. In that case, renegotiate
* capabilities to account for any possible change in modules
@@ -9993,8 +9801,9 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
if (islink)
ill_capability_probe(ill);
else
- ill_capability_reset(ill);
+ ill_capability_reset(ill, B_FALSE);
}
+ ipsq_current_finish(ipsq);
if (entered_ipsq)
ipsq_exit(ipsq);
@@ -18244,19 +18053,19 @@ ill_dl_down(ill_t *ill)
ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
mutex_exit(&ill->ill_lock);
/*
- * Reset the capabilities if the negotiation is done or is
- * still in progress. Note that ill_capability_reset() will
- * set ill_dlpi_capab_state to IDS_UNKNOWN, so the subsequent
- * DL_CAPABILITY_ACK and DL_NOTE_CAPAB_RENEG will be ignored.
- *
- * Further, reset ill_capab_reneg to be B_FALSE so that the
- * subsequent DL_CAPABILITY_ACK can be ignored, to prevent
- * the capabilities renegotiation from happening.
+ * ip_rput does not pass up normal (M_PROTO) DLPI messages
+ * after ILL_CONDEMNED is set. So in the unplumb case, we call
+	 * ill_capability_dld_disable right away. If this is not
+ * an unplumb operation then the disable happens on receipt of
+ * the capab ack via ip_rput_dlpi_writer ->
+ * ill_capability_ack_thr. In both cases the order of
+ * the operations seen by DLD is capability disable followed
+ * by DL_UNBIND. Also the DLD capability disable needs a
+ * cv_wait'able context.
*/
- if (ill->ill_dlpi_capab_state != IDS_UNKNOWN)
- ill_capability_reset(ill);
- ill->ill_capab_reneg = B_FALSE;
-
+ if (ill->ill_state_flags & ILL_CONDEMNED)
+ ill_capability_dld_disable(ill);
+ ill_capability_reset(ill, B_FALSE);
ill_dlpi_send(ill, mp);
}
@@ -18314,7 +18123,6 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
ill->ill_dlpi_pending = prim;
}
mutex_exit(&ill->ill_lock);
-
putnext(ill->ill_wq, mp);
}
@@ -18372,6 +18180,26 @@ ill_dlpi_send(ill_t *ill, mblk_t *mp)
ill_dlpi_dispatch(ill, mp);
}
+static void
+ill_capability_send(ill_t *ill, mblk_t *mp)
+{
+ ill->ill_capab_pending_cnt++;
+ ill_dlpi_send(ill, mp);
+}
+
+void
+ill_capability_done(ill_t *ill)
+{
+ ASSERT(ill->ill_capab_pending_cnt != 0);
+
+ ill_dlpi_done(ill, DL_CAPABILITY_REQ);
+
+ ill->ill_capab_pending_cnt--;
+ if (ill->ill_capab_pending_cnt == 0 &&
+ ill->ill_dlpi_capab_state == IDCS_OK)
+ ill_capability_reset_alloc(ill);
+}
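
The pending counter gates the follow-up work: each DL_CAPABILITY_REQ increments it, each completion decrements it, and only the transition to zero (with the negotiation in IDCS_OK) triggers preallocation of the reset message. A hedged user-land model of that pattern follows; printf stands in for ill_capability_reset_alloc().

	#include <assert.h>
	#include <stdio.h>

	static int pending_cnt;
	static int state_ok = 1;	/* stands in for IDCS_OK */

	static void
	capability_send(void)
	{
		pending_cnt++;		/* one outstanding DL_CAPABILITY_REQ */
		printf("request sent, pending=%d\n", pending_cnt);
	}

	static void
	capability_done(void)
	{
		assert(pending_cnt != 0);
		pending_cnt--;
		/* Only when the last request completes, prepare the reset. */
		if (pending_cnt == 0 && state_ok)
			printf("all acks in: preallocate reset message\n");
	}

	int
	main(void)
	{
		capability_send();
		capability_send();
		capability_done();
		capability_done();
		return (0);
	}
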
+
/*
* Send all deferred DLPI messages without waiting for their ACKs.
*/
diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c
index 2e940057f0..405cb653d5 100644
--- a/usr/src/uts/common/inet/ip/ip_ire.c
+++ b/usr/src/uts/common/inet/ip/ip_ire.c
@@ -4277,6 +4277,37 @@ ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl,
return (NULL);
}
+ire_t *
+ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
+{
+ irb_t *irb_ptr;
+ ire_t *ire;
+
+ /*
+	 * Let's look for an ire in the cache table whose
+	 * ire_addr matches the destination.
+	 * Since we are being called by the forwarding fastpath,
+	 * there is no need to check for a Trusted Solaris label.
+ */
+ irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(
+ dst, ipst->ips_ip_cache_table_size)];
+ rw_enter(&irb_ptr->irb_lock, RW_READER);
+ for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
+ if (ire->ire_marks & (IRE_MARK_CONDEMNED |
+ IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) {
+ continue;
+ }
+ if (ire->ire_addr == dst) {
+ IRE_REFHOLD(ire);
+ rw_exit(&irb_ptr->irb_lock);
+ return (ire);
+ }
+ }
+ rw_exit(&irb_ptr->irb_lock);
+ return (NULL);
+}
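
The lookup above is an instance of a common pattern: hash the destination to a bucket, walk the chain under a reader lock while skipping condemned entries, and take a reference before dropping the lock. Here is a user-land sketch of that pattern under stated assumptions: the types are hypothetical simplifications, a pthread rwlock stands in for irb_lock, and an atomic increment stands in for IRE_REFHOLD.

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	#define	NBUCKETS	256

	typedef struct entry {
		struct entry	*next;
		uint32_t	addr;
		atomic_int	refcnt;		/* held before lock is dropped */
		int		condemned;	/* analogue of IRE_MARK_CONDEMNED */
	} entry_t;

	typedef struct bucket {
		pthread_rwlock_t	lock;	/* analogue of irb_lock */
		entry_t			*head;
	} bucket_t;

	static bucket_t table[NBUCKETS];

	static entry_t *
	lookup(uint32_t dst)
	{
		bucket_t *b = &table[dst % NBUCKETS];
		entry_t *e;

		pthread_rwlock_rdlock(&b->lock);
		for (e = b->head; e != NULL; e = e->next) {
			if (e->condemned)
				continue;
			if (e->addr == dst) {
				atomic_fetch_add(&e->refcnt, 1);
				pthread_rwlock_unlock(&b->lock);
				return (e);
			}
		}
		pthread_rwlock_unlock(&b->lock);
		return (NULL);
	}

	int
	main(void)
	{
		entry_t e = { NULL, 0x0a000001, 0, 0 };
		int i;

		for (i = 0; i < NBUCKETS; i++)
			pthread_rwlock_init(&table[i].lock, NULL);
		table[e.addr % NBUCKETS].head = &e;
		printf("found: %d\n", lookup(0x0a000001) != NULL);
		return (0);
	}
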
+
+
/*
* Locate the interface ire that is tied to the cache ire 'cire' via
* cire->ire_ihandle.
diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c
index 34fd3cd765..ac14adf00d 100644
--- a/usr/src/uts/common/inet/ip/ip_mroute.c
+++ b/usr/src/uts/common/inet/ip/ip_mroute.c
@@ -28,8 +28,6 @@
*/
/* Copyright (c) 1990 Mentat Inc. */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Procedures for the kernel part of DVMRP,
* a Distance-Vector Multicast Routing Protocol.
@@ -683,7 +681,7 @@ ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
vifp->v_marks &= ~VIF_MARK_GOOD;
vifp->v_marks |= VIF_MARK_CONDEMNED;
mutex_exit(&(vifp)->v_lock);
- suc = ipsq_enter(ill, B_FALSE);
+ suc = ipsq_enter(ill, B_FALSE, NEW_OP);
ipsq = ill->ill_phyint->phyint_ipsq;
} else {
ipsq = ipsq_try_enter(ipif, NULL,
diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c
index 7a036a34d9..f3c95ae362 100644
--- a/usr/src/uts/common/inet/ip/ip_multi.c
+++ b/usr/src/uts/common/inet/ip/ip_multi.c
@@ -1201,7 +1201,7 @@ ipsq_enter_byifindex(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
return (NULL);
}
ill_refrele(ill);
- in_ipsq = ipsq_enter(ill, B_FALSE);
+ in_ipsq = ipsq_enter(ill, B_FALSE, NEW_OP);
ill_waiter_dcr(ill);
if (!in_ipsq)
ill = NULL;
@@ -3912,7 +3912,7 @@ retry:
* be refheld for cleanup by those routines and it would be
* a mutual deadlock.
*/
- success = ipsq_enter(ill, B_FALSE);
+ success = ipsq_enter(ill, B_FALSE, NEW_OP);
ipsq = ill->ill_phyint->phyint_ipsq;
ill_waiter_dcr(ill);
mutex_enter(&connp->conn_lock);
diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c
index a34b55693e..53665593be 100644
--- a/usr/src/uts/common/inet/ip/ip_netinfo.c
+++ b/usr/src/uts/common/inet/ip/ip_netinfo.c
@@ -1546,7 +1546,7 @@ ip_ni_queue_func_impl(injection_t *inject, boolean_t out)
if (inject->inj_isv6) {
ip_rput_v6(ill->ill_rq, packet->ni_packet);
} else {
- ip_input(ill, NULL, packet->ni_packet, 0);
+ ip_input(ill, NULL, packet->ni_packet, NULL);
}
kmem_free(inject, sizeof (*inject));
ill_refrele(ill);
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index c2b22ab956..9d677c3157 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -26,16 +26,36 @@
/*
* IP interface to squeues.
*
- * IP creates an squeue instance for each CPU. The squeue pointer is saved in
- * cpu_squeue field of the cpu structure. Each squeue is associated with a
- * connection instance (conn_t).
+ * IP uses squeues to force serialization of packets, both incoming and
+ * outgoing. Each squeue is associated with a connection instance (conn_t)
+ * above, and a soft ring (if enabled) below. Each CPU will have a default
+ * squeue for outbound connections, and each soft ring of an interface will
+ * have an squeue to which it sends incoming packets. squeues are never
+ * destroyed, and if they become unused they are kept around against future
+ * needs.
*
- * For CPUs available at system startup time the squeue creation and association
- * with CPU happens at MP initialization time. For CPUs added during dynamic
- * reconfiguration, the initialization happens when the new CPU is configured in
- * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either
- * return per-CPU squeue or random squeue based on the ip_squeue_fanout
- * variable.
+ * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
+ * in the system there will be one squeue set, all of whose squeues will be
+ * bound to that CPU, plus one additional set known as the unbound set. Sets
+ * associated with CPUs will have one default squeue, for outbound
+ * connections, and a linked list of squeues used by various NICs for inbound
+ * packets. The unbound set also has a linked list of squeues, but no default
+ * squeue.
+ *
+ * When a CPU goes offline, its squeue set is destroyed, and all its squeues
+ * are moved to the unbound set. When a CPU comes online, a new squeue set is
+ * created and the unbound set is searched for a default squeue formerly bound
+ * to this CPU. If no default squeue is found, a new one is created.
+ *
+ * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
+ * and not the squeue code. squeue.c will not touch them, and we can modify
+ * them without holding the squeue lock because of the guarantee that squeues
+ * are never destroyed. The sqset_lock must be held, however.
+ *
+ * All the squeue sets are protected by a single lock, the sqset_lock. This
+ * is also used to protect the sq_next and sq_set fields of an squeue_t.
+ *
+ * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock
*
* There are two modes of associating connection with squeues. The first mode
* associates each connection with the CPU that creates the connection (either
@@ -50,18 +70,13 @@
* may process the connection on whatever CPU it is scheduled. The squeue to CPU
* binding is only relevant for the worker thread.
*
- * The list of all created squeues is kept in squeue_set structure. This list is
- * used when ip_squeue_fanout is set and the load is distributed across all
- * squeues.
- *
* INTERFACE:
*
- * squeue_t *ip_squeue_get(hint)
+ * squeue_t *ip_squeue_get(ill_rx_ring_t)
*
- * Find an squeue based on the 'hint' value. The hint is used as an index
- * in the array of IP squeues available. The way hint is computed may
- * affect the effectiveness of the squeue distribution. Currently squeues
- * are assigned in round-robin fashion using lbolt as a hint.
+ * Returns the squeue associated with an ill receive ring. If the ring is
+ * not bound to a CPU, and we're currently servicing the interrupt which
+ * generated the packet, then bind the squeue to that CPU.
*
*
* DR Notes
@@ -78,36 +93,31 @@
* o When the CPU is going online, it creates a new squeue for this CPU if
* necessary and binds the squeue worker thread to this CPU.
*
- * TUNEBALES:
- *
- * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
- * associated with an squeue instance.
+ * TUNABLES:
*
- * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
- * should be compiled with SQUEUE_PROFILE enabled for this variable to have
- * an impact.
+ * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
+ * pick the default squeue from a random CPU, otherwise use our CPU's default
+ * squeue.
*
- * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
- * otherwise get it from CPU->cpu_squeue.
+ * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
+ * /dev/ip.
*
- * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
- * changed using ndd on /dev/tcp or /dev/ip.
- *
- * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
- * created. This is the time squeue code waits before waking up the worker
- * thread after queuing a request.
+ * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
+ * created. This is the time squeue code waits before waking up the worker
+ * thread after queuing a request.
*/
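
To make the set organization above concrete, here is a hedged user-land model of moving an squeue from one set's list to another, mirroring what ip_squeue_set_move() later in this file does when a CPU goes offline. The struct fields are simplified analogues, and locking is omitted for brevity (the real code holds sqset_lock).

	#include <stddef.h>
	#include <stdio.h>

	/* Simplified analogues of squeue_t and squeue_set_t. */
	typedef struct sq {
		struct sq	*sq_next;	/* owned by IP, as noted above */
		struct sqs	*sq_set;
		int		sq_id;
	} sq_t;

	typedef struct sqs {
		sq_t	*sqs_head;	/* list of squeues in this set */
		sq_t	*sqs_default;	/* NULL for the unbound set */
		int	sqs_cpuid;	/* -1 for the unbound set */
	} sqs_t;

	/* Unlink sq from its current set and push it onto newset's list. */
	static void
	sqs_move(sq_t *sq, sqs_t *newset)
	{
		sq_t **lastp = &sq->sq_set->sqs_head;

		while (*lastp != sq)
			lastp = &(*lastp)->sq_next;
		*lastp = sq->sq_next;

		sq->sq_next = newset->sqs_head;
		newset->sqs_head = sq;
		sq->sq_set = newset;
	}

	int
	main(void)
	{
		sqs_t unbound = { NULL, NULL, -1 }, cpu0 = { NULL, NULL, 0 };
		sq_t sq = { NULL, &cpu0, 7 };

		cpu0.sqs_head = &sq;
		sqs_move(&sq, &unbound);	/* e.g. CPU going offline */
		printf("sq %d now in set for cpu %d\n", sq.sq_id,
		    sq.sq_set->sqs_cpuid);
		return (0);
	}
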
#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
-
#include <sys/cmn_err.h>
#include <inet/common.h>
#include <inet/ip.h>
+#include <netinet/ip6.h>
#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
@@ -115,31 +125,21 @@
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>
+#include <sys/tihdr.h>
+#include <inet/udp_impl.h>
+#include <sys/strsubr.h>
+#include <sys/zone.h>
+#include <sys/dld.h>
#include <sys/atomic.h>
/*
- * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
- * mapping between squeue and NIC (or Rx ring) for performance reasons so
- * each squeue can uniquely own a NIC or a Rx ring and do polling
- * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
- * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
- * can be created dynamically as needed.
+ * List of all created squeue sets. The list and its size are protected by
+ * sqset_lock.
*/
-#define MAX_SQUEUES_PER_CPU 32
-#define MIN_SQUEUES_PER_CPU 1
-uint_t ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
+static squeue_set_t **sqset_global_list; /* list 0 is the unbound list */
+static uint_t sqset_global_size;
+kmutex_t sqset_lock;
-#define IP_NUM_SOFT_RINGS 2
-uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
-
-/*
- * List of all created squeue sets. The size is protected by cpu_lock
- */
-squeue_set_t **sqset_global_list;
-uint_t sqset_global_size;
-
-int ip_squeue_bind = B_TRUE;
-int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
/*
@@ -149,82 +149,153 @@ static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
*/
uint_t ip_squeue_worker_wait = 10;
-static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
+static squeue_t *ip_squeue_create(pri_t);
+static squeue_set_t *ip_squeue_set_create(processorid_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
-
-static void ip_squeue_set_bind(squeue_set_t *);
-static void ip_squeue_set_unbind(squeue_set_t *);
-static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t);
+static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
+static void ip_squeue_set_destroy(cpu_t *);
static void ip_squeue_clean(void *, mblk_t *, void *);
-static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);
#define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
+static squeue_t *
+ip_squeue_create(pri_t pri)
+{
+ squeue_t *sqp;
+
+ sqp = squeue_create(ip_squeue_worker_wait, pri);
+ ASSERT(sqp != NULL);
+ if (ip_squeue_create_callback != NULL)
+ ip_squeue_create_callback(sqp);
+ return (sqp);
+}
+
/*
- * Create squeue set containing ip_squeues_per_cpu number of squeues
- * for this CPU and bind them all to the CPU.
+ * Create a new squeue_set. If id == -1, then we're creating the unbound set,
+ * which should only happen once when we are first initialized. Otherwise id
+ * is the id of the CPU that needs a set, either because we are initializing
+ * or because the CPU has come online.
+ *
+ * If id != -1, then we need at a minimum to provide a default squeue for the
+ * new set. We search the unbound set for candidates, and if none are found we
+ * create a new one.
*/
static squeue_set_t *
-ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
+ip_squeue_set_create(processorid_t id)
{
- int i;
squeue_set_t *sqs;
- squeue_t *sqp;
- char sqname[64];
- processorid_t id = cp->cpu_id;
+ squeue_set_t *src = sqset_global_list[0];
+ squeue_t **lastsqp, *sq;
+ squeue_t **defaultq_lastp = NULL;
+
+ sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
+ sqs->sqs_cpuid = id;
+
+ if (id == -1) {
+ ASSERT(sqset_global_size == 0);
+ sqset_global_list[0] = sqs;
+ sqset_global_size = 1;
+ return (sqs);
+ }
- if (reuse) {
- int i;
+ /*
+	 * When we create an squeue set with id != -1, we need to give it a
+ * default squeue, in order to support fanout of conns across
+ * CPUs. Try to find a former default squeue that matches this
+ * cpu id on the unbound squeue set. If no such squeue is found,
+ * find some non-default TCP squeue and steal it. If still no such
+ * candidate is found, create a new squeue.
+ */
- /*
- * We may already have an squeue created for this CPU. Try to
- * find one and reuse it if possible.
- */
- for (i = 0; i < sqset_global_size; i++) {
- sqs = sqset_global_list[i];
- if (id == sqs->sqs_bind)
- return (sqs);
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ mutex_enter(&sqset_lock);
+ lastsqp = &src->sqs_head;
+
+ while (*lastsqp) {
+ if ((*lastsqp)->sq_bind == id &&
+ (*lastsqp)->sq_state & SQS_DEFAULT) {
+ defaultq_lastp = lastsqp;
+ break;
+ }
+ if (defaultq_lastp == NULL &&
+ !((*lastsqp)->sq_state & SQS_DEFAULT)) {
+ defaultq_lastp = lastsqp;
}
+ lastsqp = &(*lastsqp)->sq_next;
+
+ }
+ if (defaultq_lastp) {
+ /* Remove from src set and set SQS_DEFAULT */
+ sq = *defaultq_lastp;
+ *defaultq_lastp = sq->sq_next;
+ sq->sq_next = NULL;
+ if (!(sq->sq_state & SQS_DEFAULT)) {
+ mutex_enter(&sq->sq_lock);
+ sq->sq_state |= SQS_DEFAULT;
+ mutex_exit(&sq->sq_lock);
+ }
+ } else {
+ sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
+ sq->sq_state |= SQS_DEFAULT;
}
- sqs = kmem_zalloc(sizeof (squeue_set_t) +
- (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
- mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
- sqs->sqs_list = (squeue_t **)&sqs[1];
- sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
- sqs->sqs_bind = id;
+ sq->sq_set = sqs;
+ sqs->sqs_default = sq;
+ squeue_bind(sq, id); /* this locks squeue mutex */
- for (i = 0; i < ip_squeues_per_cpu; i++) {
- bzero(sqname, sizeof (sqname));
+ ASSERT(sqset_global_size <= NCPU);
+ sqset_global_list[sqset_global_size++] = sqs;
+ mutex_exit(&sqset_lock);
+ return (sqs);
+}
- (void) snprintf(sqname, sizeof (sqname),
- "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
- cp->cpu_id, i);
+/*
+ * Called by ill_ring_add() to find an squeue to associate with a new ring.
+ */
- sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
- minclsyspri);
+squeue_t *
+ip_squeue_getfree(pri_t pri)
+{
+ squeue_set_t *sqs = sqset_global_list[0];
+ squeue_t *sq;
+ mutex_enter(&sqset_lock);
+ for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
/*
- * The first squeue in each squeue_set is the DEFAULT
- * squeue.
+ * Select a non-default squeue
*/
- sqp->sq_state |= SQS_DEFAULT;
+ if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
+ break;
+ }
- ASSERT(sqp != NULL);
+ if (sq == NULL) {
+ sq = ip_squeue_create(pri);
+ sq->sq_set = sqs;
+ sq->sq_next = sqs->sqs_head;
+ sqs->sqs_head = sq;
+ }
- squeue_profile_enable(sqp);
- sqs->sqs_list[sqs->sqs_size++] = sqp;
+ ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
+ SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
+ SQS_POLL_THR_QUIESCED)));
- if (ip_squeue_create_callback != NULL)
- ip_squeue_create_callback(sqp);
- }
+ mutex_enter(&sq->sq_lock);
+ sq->sq_state |= SQS_ILL_BOUND;
+ mutex_exit(&sq->sq_lock);
+ mutex_exit(&sqset_lock);
- if (ip_squeue_bind && cpu_is_online(cp))
- ip_squeue_set_bind(sqs);
+ if (sq->sq_priority != pri) {
+ thread_lock(sq->sq_worker);
+ (void) thread_change_pri(sq->sq_worker, pri, 0);
+ thread_unlock(sq->sq_worker);
- sqset_global_list[sqset_global_size++] = sqs;
- ASSERT(sqset_global_size <= NCPU);
- return (sqs);
+ thread_lock(sq->sq_poll_thr);
+ (void) thread_change_pri(sq->sq_poll_thr, pri, 0);
+ thread_unlock(sq->sq_poll_thr);
+
+ sq->sq_priority = pri;
+ }
+ return (sq);
}
/*
@@ -234,876 +305,450 @@ void
ip_squeue_init(void (*callback)(squeue_t *))
{
int i;
+ squeue_set_t *sqs;
ASSERT(sqset_global_list == NULL);
- if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
- ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
- else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
- ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
-
ip_squeue_create_callback = callback;
squeue_init();
+ mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
sqset_global_list =
- kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
+ kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
sqset_global_size = 0;
- mutex_enter(&cpu_lock);
+ /*
+ * We are called at system boot time and we don't
+ * expect memory allocation failure.
+ */
+ sqs = ip_squeue_set_create(-1);
+ ASSERT(sqs != NULL);
+ mutex_enter(&cpu_lock);
/* Create squeue for each active CPU available */
for (i = 0; i < NCPU; i++) {
- cpu_t *cp = cpu[i];
+ cpu_t *cp = cpu_get(i);
if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
- cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
+ /*
+ * We are called at system boot time and we don't
+			 * expect memory allocation failure then.
+ */
+ cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
+ ASSERT(cp->cpu_squeue_set != NULL);
}
}
register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
-
mutex_exit(&cpu_lock);
-
- if (ip_squeue_profile)
- squeue_profile_start();
}
/*
- * Get squeue_t structure based on index.
- * Since the squeue list can only grow, no need to grab any lock.
+ * Get a default squeue, either from the current CPU or a CPU derived by hash
+ * from the index argument, depending upon the setting of ip_squeue_fanout.
*/
squeue_t *
ip_squeue_random(uint_t index)
{
- squeue_set_t *sqs;
-
- sqs = sqset_global_list[index % sqset_global_size];
- return (sqs->sqs_list[index % sqs->sqs_size]);
-}
-
-/* ARGSUSED */
-static void
-ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
-{
- squeue_t *sqp = arg2;
- ill_rx_ring_t *ring = (ill_rx_ring_t *)mp->b_wptr;
- ill_t *ill;
-
- ASSERT(sqp != NULL);
- mp->b_wptr = NULL;
-
- if (ring == NULL) {
- return;
- }
+ squeue_set_t *sqs = NULL;
+ squeue_t *sq;
/*
- * Clean up squeue
+ * The minimum value of sqset_global_size is 2, one for the unbound
+ * squeue set and another for the squeue set of the zeroth CPU.
+ * Even though the value could be changing, it can never go below 2,
+ * so the assert does not need the lock protection.
*/
- mutex_enter(&sqp->sq_lock);
- sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
- sqp->sq_rx_ring = NULL;
- mutex_exit(&sqp->sq_lock);
+ ASSERT(sqset_global_size > 1);
- ill = ring->rr_ill;
- if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
- ASSERT(ring->rr_handle != NULL);
- ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
- }
+ /* Protect against changes to sqset_global_list */
+ mutex_enter(&sqset_lock);
- /*
- * Cleanup the ring
- */
-
- ring->rr_blank = NULL;
- ring->rr_handle = NULL;
- ring->rr_sqp = NULL;
+ if (!ip_squeue_fanout)
+ sqs = CPU->cpu_squeue_set;
/*
- * Signal ill that cleanup is done
+ * sqset_global_list[0] corresponds to the unbound squeue set.
+ * The computation below picks a set other than the unbound set.
*/
- mutex_enter(&ill->ill_lock);
- ring->rr_ring_state = ILL_RING_FREE;
- cv_signal(&ill->ill_cv);
- mutex_exit(&ill->ill_lock);
+ if (sqs == NULL)
+ sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
+ sq = sqs->sqs_default;
+
+ mutex_exit(&sqset_lock);
+ ASSERT(sq);
+ return (sq);
}
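
The modulo arithmetic above deliberately skips slot 0, which holds the unbound set. A tiny sketch showing the mapping for a hypothetical machine with four CPUs (sqset_global_size == 5), so index values land in slots 1..4:

	#include <stdio.h>

	int
	main(void)
	{
		unsigned int sqset_global_size = 5; /* unbound + 4 CPU sets */
		unsigned int index;

		/* Slot 0 is the unbound set; indexes map into slots 1..4. */
		for (index = 0; index < 8; index++)
			printf("index %u -> set %u\n", index,
			    (index % (sqset_global_size - 1)) + 1);
		return (0);
	}
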
/*
- * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
- * The real cleanup happens behind the squeue via ip_squeue_clean function but
- * we need to protect ourselves from 2 threads trying to cleanup at the same
- * time (possible with one port going down for aggr and someone tearing down the
- * entire aggr simultaneously). So we use ill_inuse_ref protected by ill_lock
- * to indicate when the cleanup has started (1 ref) and when the cleanup
- * is done (0 ref). When a new ring gets assigned to squeue, we start by
- * putting 2 ref on ill_inuse_ref.
+ * Move squeue from its current set to newset. Not used for default squeues.
+ * Bind or unbind the worker thread as appropriate.
*/
+
static void
-ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
+ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
{
- conn_t *connp;
- squeue_t *sqp;
- mblk_t *mp;
-
- ASSERT(rx_ring != NULL);
+ squeue_set_t *set;
+ squeue_t **lastsqp;
+ processorid_t cpuid = newset->sqs_cpuid;
- /* Just clean one squeue */
- mutex_enter(&ill->ill_lock);
- /*
- * Reset the ILL_SOFT_RING_ASSIGN bit so that
- * ip_squeue_soft_ring_affinty() will not go
- * ahead with assigning rings.
- */
- ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
- while (rx_ring->rr_ring_state == ILL_RING_INPROC)
- /* Some operations pending on the ring. Wait */
- cv_wait(&ill->ill_cv, &ill->ill_lock);
-
- if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
- /*
- * Someone already trying to clean
- * this squeue or it's already been cleaned.
- */
- mutex_exit(&ill->ill_lock);
- return;
- }
- sqp = rx_ring->rr_sqp;
+ ASSERT(!(sq->sq_state & SQS_DEFAULT));
+ ASSERT(!MUTEX_HELD(&sq->sq_lock));
+ ASSERT(MUTEX_HELD(&sqset_lock));
- if (sqp == NULL) {
- /*
- * The rx_ring never had a squeue assigned to it.
- * We are under ill_lock so we can clean it up
- * here itself since no one can get to it.
- */
- rx_ring->rr_blank = NULL;
- rx_ring->rr_handle = NULL;
- rx_ring->rr_sqp = NULL;
- rx_ring->rr_ring_state = ILL_RING_FREE;
- mutex_exit(&ill->ill_lock);
+ set = sq->sq_set;
+ if (set == newset)
return;
- }
-
- /* Indicate that it's being cleaned */
- rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
- ASSERT(sqp != NULL);
- mutex_exit(&ill->ill_lock);
- /*
- * Use the preallocated ill_unbind_conn for this purpose
- */
- connp = ill->ill_dls_capab->ill_unbind_conn;
-
- if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
- connp->conn_tcp->tcp_closemp_used = B_TRUE;
- } else {
- cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
- "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
- (void *)connp, (void *)connp->conn_tcp);
- }
-
- TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
- mp = &connp->conn_tcp->tcp_closemp;
- CONN_INC_REF(connp);
-
- /*
- * Since the field sq_rx_ring for default squeue is NULL,
- * ip_squeue_clean() will have no way to get the ring if we
- * don't pass the pointer to it. We use b_wptr to do so
- * as use of b_wptr for any other purpose is not expected.
- */
-
- ASSERT(mp->b_wptr == NULL);
- mp->b_wptr = (unsigned char *)rx_ring;
- squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);
-
- mutex_enter(&ill->ill_lock);
- while (rx_ring->rr_ring_state != ILL_RING_FREE)
- cv_wait(&ill->ill_cv, &ill->ill_lock);
- mutex_exit(&ill->ill_lock);
+ lastsqp = &set->sqs_head;
+ while (*lastsqp != sq)
+ lastsqp = &(*lastsqp)->sq_next;
+
+ *lastsqp = sq->sq_next;
+ sq->sq_next = newset->sqs_head;
+ newset->sqs_head = sq;
+ sq->sq_set = newset;
+ if (cpuid == -1)
+ squeue_unbind(sq);
+ else
+ squeue_bind(sq, cpuid);
}
-void
-ip_squeue_clean_all(ill_t *ill)
+/*
+ * Move squeue from its current set to cpuid's set and bind to cpuid.
+ */
+
+int
+ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
{
- int idx;
+ cpu_t *cpu;
+ squeue_set_t *set;
- /*
- * No need to clean if poll_capab isn't set for this ill
- */
- if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
- return;
+ if (sq->sq_state & SQS_DEFAULT)
+ return (-1);
- for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
- ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];
+ ASSERT(MUTEX_HELD(&cpu_lock));
- ip_squeue_clean_ring(ill, ipr);
- }
+ cpu = cpu_get(cpuid);
+ if (!CPU_ISON(cpu))
+ return (-1);
- ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
+ mutex_enter(&sqset_lock);
+ set = cpu->cpu_squeue_set;
+ if (set != NULL)
+ ip_squeue_set_move(sq, set);
+ mutex_exit(&sqset_lock);
+ return ((set == NULL) ? -1 : 0);
}
-typedef struct ip_taskq_arg {
- ill_t *ip_taskq_ill;
- ill_rx_ring_t *ip_taskq_ill_rx_ring;
- cpu_t *ip_taskq_cpu;
-} ip_taskq_arg_t;
-
/*
- * Do a Rx ring to squeue binding. Find a unique squeue that is not
- * managing a receive ring. If no such squeue exists, dynamically
- * create a new one in the squeue set.
- *
- * The function runs via the system taskq. The ill passed as an
- * argument can't go away since we hold a ref. The lock order is
- * ill_lock -> sqs_lock -> sq_lock.
- *
- * If we are binding a Rx ring to a squeue attached to the offline CPU,
- * no need to check that because squeues are never destroyed once
- * created.
+ * The mac layer is calling, asking us to move an squeue to a
+ * new CPU. This routine is called with cpu_lock held.
*/
-/* ARGSUSED */
-static void
-ip_squeue_extend(void *arg)
+void
+ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
{
- ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg;
- ill_t *ill = sq_arg->ip_taskq_ill;
- ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
- cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
- squeue_set_t *sqs;
- squeue_t *sqp = NULL;
-
- ASSERT(ill != NULL);
- ASSERT(ill_rx_ring != NULL);
- kmem_free(arg, sizeof (ip_taskq_arg_t));
+ ASSERT(ILL_MAC_PERIM_HELD(ill));
+ ASSERT(rx_ring->rr_ill == ill);
- /*
- * Make sure the CPU that originally took the interrupt still
- * exists.
- */
- if (!CPU_ISON(intr_cpu))
- intr_cpu = CPU;
-
- sqs = intr_cpu->cpu_squeue_set;
-
- /*
- * If this ill represents link aggregation, then there might be
- * multiple NICs trying to register them selves at the same time
- * and in order to ensure that test and assignment of free rings
- * is sequential, we need to hold the ill_lock.
- */
mutex_enter(&ill->ill_lock);
- sqp = ip_find_unused_squeue(sqs, B_FALSE);
- if (sqp == NULL) {
- /*
- * We hit the max limit of squeues allowed per CPU.
- * Assign this rx_ring to DEFAULT squeue of the
- * interrupted CPU but the squeue will not manage
- * the ring. Also print a warning.
- */
- cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
- "has max number of squeues. System performance might "
- "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
-
- /* the first squeue in the list is the default squeue */
- sqp = sqs->sqs_list[0];
- ASSERT(sqp != NULL);
- ill_rx_ring->rr_sqp = sqp;
- ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
-
+ if (rx_ring->rr_ring_state == RR_FREE ||
+ rx_ring->rr_ring_state == RR_FREE_INPROG) {
mutex_exit(&ill->ill_lock);
- ill_waiter_dcr(ill);
return;
}
- ASSERT(MUTEX_HELD(&sqp->sq_lock));
- sqp->sq_rx_ring = ill_rx_ring;
- ill_rx_ring->rr_sqp = sqp;
- ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
-
- sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
- mutex_exit(&sqp->sq_lock);
+ if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
+ rx_ring->rr_ring_state = RR_SQUEUE_BOUND;
mutex_exit(&ill->ill_lock);
-
- /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
- ill_waiter_dcr(ill);
}
-/*
- * Do a Rx ring to squeue binding. Find a unique squeue that is not
- * managing a receive ring. If no such squeue exists, dynamically
- * create a new one in the squeue set.
- *
- * The function runs via the system taskq. The ill passed as an
- * argument can't go away since we hold a ref. The lock order is
- * ill_lock -> sqs_lock -> sq_lock.
- *
- * If we are binding a Rx ring to a squeue attached to the offline CPU,
- * no need to check that because squeues are never destroyed once
- * created.
- */
-/* ARGSUSED */
-static void
-ip_squeue_soft_ring_affinity(void *arg)
+void *
+ip_squeue_add_ring(ill_t *ill, void *mrp)
{
- ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg;
- ill_t *ill = sq_arg->ip_taskq_ill;
- ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab;
- ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
- cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
- cpu_t *bind_cpu;
- int cpu_id = intr_cpu->cpu_id;
- int min_cpu_id, max_cpu_id;
- boolean_t enough_uniq_cpus = B_FALSE;
- boolean_t enough_cpus = B_FALSE;
- squeue_set_t *sqs, *last_sqs;
- squeue_t *sqp = NULL;
- int i, j;
-
- ASSERT(ill != NULL);
- kmem_free(arg, sizeof (ip_taskq_arg_t));
+ mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp;
+ ill_rx_ring_t *rx_ring, *ring_tbl;
+ int ip_rx_index;
+ squeue_t *sq = NULL;
+ pri_t pri;
- /*
- * Make sure the CPU that originally took the interrupt still
- * exists.
- */
- if (!CPU_ISON(intr_cpu)) {
- intr_cpu = CPU;
- cpu_id = intr_cpu->cpu_id;
- }
+ ASSERT(ILL_MAC_PERIM_HELD(ill));
+ ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
+ ASSERT(ill->ill_dld_capab != NULL);
- /*
- * If this ill represents link aggregation, then there might be
- * multiple NICs trying to register them selves at the same time
- * and in order to ensure that test and assignment of free rings
- * is sequential, we need to hold the ill_lock.
- */
- mutex_enter(&ill->ill_lock);
+ ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;
- if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
- mutex_exit(&ill->ill_lock);
- return;
+ mutex_enter(&ill->ill_lock);
+ for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
+ rx_ring = &ring_tbl[ip_rx_index];
+ if (rx_ring->rr_ring_state == RR_FREE)
+ break;
}
- /*
- * We need to fanout the interrupts from the NIC. We do that by
- * telling the driver underneath to create soft rings and use
- * worker threads (if the driver advertized SOFT_RING capability)
- * Its still a big performance win to if we can fanout to the
- * threads on the same core that is taking interrupts.
- *
- * Since we don't know the interrupt to CPU binding, we don't
- * assign any squeues or affinity to worker threads in the NIC.
- * At the time of the first interrupt, we know which CPU is
- * taking interrupts and try to find other threads on the same
- * core. Assuming, ip_threads_per_cpu is correct and cpus are
- * numbered sequentially for each core (XXX need something better
- * than this in future), find the lowest number and highest
- * number thread for that core.
- *
- * If we have one more thread per core than number of soft rings,
- * then don't assign any worker threads to the H/W thread (cpu)
- * taking interrupts (capability negotiation tries to ensure this)
- *
- * If the number of threads per core are same as the number of
- * soft rings, then assign the worker affinity and squeue to
- * the same cpu.
- *
- * Otherwise, just fanout to higher number CPUs starting from
- * the interrupted CPU.
- */
- min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
- max_cpu_id = min_cpu_id + ip_threads_per_cpu;
-
- /*
- * Quickly check if there are enough CPUs present for fanout
- * and also max_cpu_id is less than the id of the active CPU.
- * We use the cpu_id stored in the last squeue_set to get
- * an idea. The scheme is by no means perfect since it doesn't
- * take into account CPU DR operations and the fact that
- * interrupts themselves might change. An ideal scenario
- * would be to ensure that interrupts run cpus by themselves
- * and worker threads never have affinity to those CPUs. If
- * the interrupts move to CPU which had a worker thread, it
- * should be changed. Probably callbacks similar to CPU offline
- * are needed to make it work perfectly.
- */
- last_sqs = sqset_global_list[sqset_global_size - 1];
- if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
- if ((max_cpu_id - min_cpu_id) >
- ill_soft_ring->ill_dls_soft_ring_cnt)
- enough_uniq_cpus = B_TRUE;
- else if ((max_cpu_id - min_cpu_id) >=
- ill_soft_ring->ill_dls_soft_ring_cnt)
- enough_cpus = B_TRUE;
+ if (ip_rx_index == ILL_MAX_RINGS) {
+ /*
+		 * We ran out of ILL_MAX_RINGS worth of rx_ring structures. If
+		 * we have devices which can overwhelm this limit,
+		 * ILL_MAX_RINGS should be made configurable. Meanwhile this
+		 * causes no panic because the driver will pass ip_input a NULL
+		 * handle, which will make IP allocate the default squeue;
+		 * polling mode will not be used for this ring.
+ */
+ cmn_err(CE_NOTE,
+ "Reached maximum number of receiving rings (%d) for %s\n",
+ ILL_MAX_RINGS, ill->ill_name);
+ mutex_exit(&ill->ill_lock);
+ return (NULL);
}
- j = 0;
- for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
- if (enough_uniq_cpus) {
- if ((min_cpu_id + i) == cpu_id) {
- j++;
- continue;
- }
- bind_cpu = cpu[min_cpu_id + i];
- } else if (enough_cpus) {
- bind_cpu = cpu[min_cpu_id + i];
- } else {
- /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
- bind_cpu = cpu[(cpu_id + i) % ncpus];
- }
+ bzero(rx_ring, sizeof (ill_rx_ring_t));
+ rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
+ /* XXX: Hard code it to tcp accept for now */
+ rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;
- /*
- * Check if the CPU actually exist and active. If not,
- * use the interrupted CPU. ip_find_unused_squeue() will
- * find the right CPU to fanout anyway.
- */
- if (!CPU_ISON(bind_cpu))
- bind_cpu = intr_cpu;
+ rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
+ rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
+ rx_ring->rr_intr_disable =
+ (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
+ rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
+ rx_ring->rr_ill = ill;
- sqs = bind_cpu->cpu_squeue_set;
- ASSERT(sqs != NULL);
- ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
+ pri = mrfp->mrf_flow_priority;
- sqp = ip_find_unused_squeue(sqs, B_TRUE);
- if (sqp == NULL) {
- /*
- * We hit the max limit of squeues allowed per CPU.
- * Assign this rx_ring to DEFAULT squeue of the
- * interrupted CPU but thesqueue will not manage
- * the ring. Also print a warning.
- */
- cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
- "%d/%p already has max number of squeues. System "
- "performance might become suboptimal\n",
- sqs->sqs_bind, (void *)sqs);
+ sq = ip_squeue_getfree(pri);
- /* the first squeue in the list is the default squeue */
- sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
- ASSERT(sqp != NULL);
+ mutex_enter(&sq->sq_lock);
+ sq->sq_rx_ring = rx_ring;
+ rx_ring->rr_sqp = sq;
- ill_rx_ring->rr_sqp = sqp;
- ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
- continue;
+ sq->sq_state |= SQS_POLL_CAPAB;
- }
- ASSERT(MUTEX_HELD(&sqp->sq_lock));
- ill_rx_ring->rr_sqp = sqp;
- sqp->sq_rx_ring = ill_rx_ring;
- ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
- sqp->sq_state |= SQS_ILL_BOUND;
-
- /* assign affinity to soft ring */
- if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
- ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
- sqp->sq_bind);
- }
- mutex_exit(&sqp->sq_lock);
- }
+ rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
+ sq->sq_ill = ill;
+ mutex_exit(&sq->sq_lock);
mutex_exit(&ill->ill_lock);
- ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
- SOFT_RING_FANOUT);
+ DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int,
+ ip_rx_index, void *, mrfp->mrf_rx_arg);
- mutex_enter(&ill->ill_lock);
- ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
- mutex_exit(&ill->ill_lock);
+ /* Assign the squeue to the specified CPU as well */
+ mutex_enter(&cpu_lock);
+ (void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
+ mutex_exit(&cpu_lock);
- /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
- ill_waiter_dcr(ill);
+ return (rx_ring);
}
-/* ARGSUSED */
+/*
+ * Sanitize the squeue etc. Some of the processing
+ * needs to be done from inside the perimeter.
+ */
void
-ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
- mblk_t *mp_chain, struct mac_header_info_s *mhip)
+ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
- ip_taskq_arg_t *taskq_arg;
- boolean_t refheld;
-
- mutex_enter(&ill->ill_lock);
- if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
- taskq_arg = (ip_taskq_arg_t *)
- kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
-
- if (taskq_arg == NULL)
- goto out;
+ squeue_t *sqp;
- taskq_arg->ip_taskq_ill = ill;
- taskq_arg->ip_taskq_ill_rx_ring = NULL;
- taskq_arg->ip_taskq_cpu = CPU;
+ ASSERT(ILL_MAC_PERIM_HELD(ill));
+ ASSERT(rx_ring != NULL);
- /*
- * Set ILL_SOFT_RING_ASSIGN flag. We don't want
- * the next interrupt to schedule a task for calling
- * ip_squeue_soft_ring_affinity();
- */
- ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
- } else {
+ /* Just clean one squeue */
+ mutex_enter(&ill->ill_lock);
+ if (rx_ring->rr_ring_state == RR_FREE) {
mutex_exit(&ill->ill_lock);
- goto out;
+ return;
}
+ rx_ring->rr_ring_state = RR_FREE_INPROG;
+ sqp = rx_ring->rr_sqp;
+
+ mutex_enter(&sqp->sq_lock);
+ sqp->sq_state |= SQS_POLL_CLEANUP;
+ cv_signal(&sqp->sq_worker_cv);
mutex_exit(&ill->ill_lock);
- refheld = ill_waiter_inc(ill);
- if (refheld) {
- if (taskq_dispatch(system_taskq,
- ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
- goto out;
-
- /* release ref on ill if taskq dispatch fails */
- ill_waiter_dcr(ill);
- }
+ while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
+ cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
+ sqp->sq_state &= ~(SQS_POLL_CLEANUP_DONE | SQS_ILL_BOUND);
+
+ ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
+ SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
+ SQS_POLL_THR_QUIESCED)));
+
+ cv_signal(&sqp->sq_worker_cv);
+ mutex_exit(&sqp->sq_lock);
+
/*
- * Turn on CAPAB_SOFT_RING so that affinity assignment
- * can be tried again later.
+ * Logically free the squeue. It goes back to the set of unused
+	 * squeues.
*/
+ mutex_enter(&sqset_lock);
+ ip_squeue_set_move(sqp, sqset_global_list[0]);
+ mutex_exit(&sqset_lock);
+
mutex_enter(&ill->ill_lock);
- ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
+ rx_ring->rr_ring_state = RR_FREE;
mutex_exit(&ill->ill_lock);
- kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
-
-out:
- ip_input(ill, NULL, mp_chain, mhip);
}
-static squeue_t *
-ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout)
+/*
+ * Stop the squeue from polling. This needs to be done
+ * from inside the perimeter.
+ */
+void
+ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
- int i;
- squeue_set_t *best_sqs = NULL;
- squeue_set_t *curr_sqs = NULL;
- int min_sq = 0;
- squeue_t *sqp = NULL;
- char sqname[64];
- cpu_t *bind_cpu;
-
- /*
- * If fanout is set and the passed squeue_set already has some
- * squeues which are managing the NICs, try to find squeues on
- * unused CPU.
- */
- if (sqs->sqs_size > 1 && fanout) {
- /*
- * First check to see if any squeue on the CPU passed
- * is managing a NIC.
- */
- mutex_enter(&sqs->sqs_lock);
- for (i = 0; i < sqs->sqs_size; i++) {
- mutex_enter(&sqs->sqs_list[i]->sq_lock);
- if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
- !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
- mutex_exit(&sqs->sqs_list[i]->sq_lock);
- break;
- }
- mutex_exit(&sqs->sqs_list[i]->sq_lock);
- }
- mutex_exit(&sqs->sqs_lock);
- if (i != sqs->sqs_size) {
- best_sqs = NULL;
-
- for (i = sqset_global_size - 1; i >= 0; i--) {
- curr_sqs = sqset_global_list[i];
- /*
- * Check and make sure the CPU that sqs
- * is bound to is valid. There could be
- * sqs's around whose CPUs could have
- * been DR'd out.
- */
- mutex_enter(&cpu_lock);
- if (cpu_get(curr_sqs->sqs_bind) != NULL) {
- if (best_sqs == NULL) {
- best_sqs = curr_sqs;
- min_sq = curr_sqs->sqs_size;
- } else if (curr_sqs->sqs_size <
- min_sq) {
- best_sqs = curr_sqs;
- min_sq = curr_sqs->sqs_size;
- }
- }
- mutex_exit(&cpu_lock);
- }
-
- ASSERT(best_sqs != NULL);
- sqs = best_sqs;
- }
- }
+ squeue_t *sqp;
- mutex_enter(&sqs->sqs_lock);
+ ASSERT(ILL_MAC_PERIM_HELD(ill));
+ ASSERT(rx_ring != NULL);
- for (i = 0; i < sqs->sqs_size; i++) {
- mutex_enter(&sqs->sqs_list[i]->sq_lock);
- if ((sqs->sqs_list[i]->sq_state &
- (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
- sqp = sqs->sqs_list[i];
- break;
- }
- mutex_exit(&sqs->sqs_list[i]->sq_lock);
- }
+ sqp = rx_ring->rr_sqp;
+ mutex_enter(&sqp->sq_lock);
+ sqp->sq_state |= SQS_POLL_QUIESCE;
+ cv_signal(&sqp->sq_worker_cv);
+ while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
+ cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
- if (sqp == NULL) {
- /* Need to create a new squeue */
- if (sqs->sqs_size == sqs->sqs_max_size) {
- /*
- * Reached the max limit for squeue
- * we can allocate on this CPU.
- */
- mutex_exit(&sqs->sqs_lock);
- return (NULL);
- }
+ mutex_exit(&sqp->sq_lock);
+}
- mutex_enter(&cpu_lock);
- if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) {
- /* Too bad, CPU got DR'd out, return NULL */
- mutex_exit(&cpu_lock);
- mutex_exit(&sqs->sqs_lock);
- return (NULL);
- }
+/*
+ * Restart polling etc. Needs to be inside the perimeter to
+ * prevent races.
+ */
+void
+ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
+{
+ squeue_t *sqp;
- bzero(sqname, sizeof (sqname));
- (void) snprintf(sqname, sizeof (sqname),
- "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
- bind_cpu->cpu_id, sqs->sqs_size);
- mutex_exit(&cpu_lock);
+ ASSERT(ILL_MAC_PERIM_HELD(ill));
+ ASSERT(rx_ring != NULL);
- sqp = squeue_create(sqname, sqs->sqs_bind,
- ip_squeue_worker_wait, minclsyspri);
+ sqp = rx_ring->rr_sqp;
+ mutex_enter(&sqp->sq_lock);
+ /*
+ * Handle change in number of rings between the quiesce and
+ * restart operations by checking for a previous quiesce before
+ * attempting a restart.
+ */
+ if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
+ mutex_exit(&sqp->sq_lock);
+ return;
+ }
+ sqp->sq_state |= SQS_POLL_RESTART;
+ cv_signal(&sqp->sq_worker_cv);
+ while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
+ cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
+ sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
+ mutex_exit(&sqp->sq_lock);
+}
- ASSERT(sqp != NULL);
+/*
+ * Sanitize all squeues associated with the ill.
+ */
+void
+ip_squeue_clean_all(ill_t *ill)
+{
+ int idx;
+ ill_rx_ring_t *rx_ring;
- squeue_profile_enable(sqp);
- /*
- * Other functions scanning sqs_list don't take sqs_lock.
- * Once sqp is stored in sqs_list[] global visibility is
- * ensured before incrementing the sqs_size counter.
- */
- sqs->sqs_list[sqs->sqs_size] = sqp;
- membar_producer();
- sqs->sqs_size++;
-
- if (ip_squeue_create_callback != NULL)
- ip_squeue_create_callback(sqp);
-
- if (ip_squeue_bind) {
- mutex_enter(&cpu_lock);
- bind_cpu = cpu_get(sqs->sqs_bind);
- if (bind_cpu != NULL && cpu_is_online(bind_cpu)) {
- squeue_bind(sqp, -1);
- }
- mutex_exit(&cpu_lock);
- }
- mutex_enter(&sqp->sq_lock);
+ for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
+ rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
+ ip_squeue_clean_ring(ill, rx_ring);
}
-
- mutex_exit(&sqs->sqs_lock);
- ASSERT(sqp != NULL);
- return (sqp);
}
/*
- * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
- * owned by a squeue yet, do the assignment. When the NIC registers it
- * Rx rings with IP, we don't know where the interrupts will land and
- * hence we need to wait till this point to do the assignment.
+ * Used by IP to get the squeue associated with a ring. If the squeue isn't
+ * yet bound to a CPU, and we're being called directly from the NIC's
+ * interrupt, then we know what CPU we want to assign the squeue to, so
+ * dispatch that task to a taskq.
*/
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
squeue_t *sqp;
- ill_t *ill;
- int interrupt;
- ip_taskq_arg_t *taskq_arg;
- boolean_t refheld;
-
- if (ill_rx_ring == NULL)
- return (IP_SQUEUE_GET(lbolt));
-
- sqp = ill_rx_ring->rr_sqp;
- /*
- * Do a quick check. If it's not NULL, we are done.
- * Squeues are never destroyed so worse we will bind
- * this connection to a suboptimal squeue.
- *
- * This is the fast path case.
- */
- if (sqp != NULL)
- return (sqp);
-
- ill = ill_rx_ring->rr_ill;
- ASSERT(ill != NULL);
-
- interrupt = servicing_interrupt();
- taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
- KM_NOSLEEP);
- mutex_enter(&ill->ill_lock);
- /*
- * Check sqp under the lock again for atomicity. Possible race with
- * a previously scheduled ip_squeue_get -> ip_squeue_extend.
- * Do the ring to squeue binding only if we are in interrupt context
- * AND the ring is not already bound AND there is no one else trying
- * the bind already.
- */
- sqp = ill_rx_ring->rr_sqp;
- if (sqp != NULL || !interrupt ||
- ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
- /*
- * Note that the ring might get bound once we drop the lock
- * below, if a previous request is in progress i.e. if the ring
- * state is ILL_RING_INPROC. The incoming connection on whose
- * behalf we are currently here might get a suboptimal squeue
- * via the call to IP_SQUEUE_GET below, but there is no
- * correctness issue.
- */
- mutex_exit(&ill->ill_lock);
- if (taskq_arg != NULL)
- kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
- if (sqp != NULL)
- return (sqp);
+ if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
return (IP_SQUEUE_GET(lbolt));
- }
-
- /*
- * No sqp assigned yet. Can't really do that in interrupt
- * context. Assign the default sqp to this connection and
- * trigger creation of new sqp and binding it to this ring
- * via taskq. Need to make sure ill stays around.
- */
- taskq_arg->ip_taskq_ill = ill;
- taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
- taskq_arg->ip_taskq_cpu = CPU;
- ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
- mutex_exit(&ill->ill_lock);
- refheld = ill_waiter_inc(ill);
- if (refheld) {
- if (taskq_dispatch(system_taskq, ip_squeue_extend,
- taskq_arg, TQ_NOSLEEP) != NULL) {
- return (IP_SQUEUE_GET(lbolt));
- }
- }
- /*
- * The ill is closing and we could not get a reference on the ill OR
- * taskq_dispatch failed probably due to memory allocation failure.
- * We will try again next time.
- */
- mutex_enter(&ill->ill_lock);
- ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
- mutex_exit(&ill->ill_lock);
- kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
- if (refheld)
- ill_waiter_dcr(ill);
- return (IP_SQUEUE_GET(lbolt));
+ return (sqp);
}
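
With creation and ring binding handled elsewhere, ip_squeue_get() reduces to a lock-free fast path: return the ring's cached squeue if one is set, otherwise fall back to a default. This is safe only because squeues are never freed, so a stale cached pointer is at worst suboptimal, never invalid. A hedged sketch of the same idiom (names are illustrative):

    #include <stddef.h>
    #include <stdio.h>

    struct squeue { int id; };
    struct rx_ring { struct squeue *rr_sqp; };  /* cached assignment */

    static struct squeue defaults[4] = { {0}, {1}, {2}, {3} };

    /* Stand-in for IP_SQUEUE_GET(lbolt): pick a default by hint. */
    static struct squeue *
    default_squeue(unsigned long hint)
    {
        return (&defaults[hint % 4]);
    }

    /*
     * No lock on the fast path: the cached pointer is either NULL or
     * points at an object that will never be freed.
     */
    static struct squeue *
    ring_get_squeue(struct rx_ring *r, unsigned long hint)
    {
        struct squeue *sqp;

        if (r == NULL || (sqp = r->rr_sqp) == NULL)
            return (default_squeue(hint));
        return (sqp);
    }

    int
    main(void)
    {
        struct rx_ring unbound = { NULL };

        (void) printf("%d\n", ring_get_squeue(&unbound, 7)->id);
        return (0);
    }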
/*
- * NDD hooks for setting ip_squeue_xxx tuneables.
+ * Called when a CPU goes offline. Its squeue_set_t is destroyed, and all
+ * squeues are unbound and moved to the unbound set.
*/
-
-/* ARGSUSED */
-int
-ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
- caddr_t addr, cred_t *cr)
+static void
+ip_squeue_set_destroy(cpu_t *cpu)
{
- int *bind_enabled = (int *)addr;
- long new_value;
int i;
+ squeue_t *sqp, *lastsqp = NULL;
+ squeue_set_t *sqs, *unbound = sqset_global_list[0];
- if (ddi_strtol(value, NULL, 10, &new_value) != 0)
- return (EINVAL);
+ mutex_enter(&sqset_lock);
+ if ((sqs = cpu->cpu_squeue_set) == NULL) {
+ mutex_exit(&sqset_lock);
+ return;
+ }
- if (ip_squeue_bind == new_value)
- return (0);
+ /* Move all squeues to unbound set */
- *bind_enabled = new_value;
- mutex_enter(&cpu_lock);
- if (new_value == 0) {
- for (i = 0; i < sqset_global_size; i++)
- ip_squeue_set_unbind(sqset_global_list[i]);
- } else {
- for (i = 0; i < sqset_global_size; i++)
- ip_squeue_set_bind(sqset_global_list[i]);
+ for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
+ squeue_unbind(sqp);
+ sqp->sq_set = unbound;
+ }
+ if (sqs->sqs_head) {
+ lastsqp->sq_next = unbound->sqs_head;
+ unbound->sqs_head = sqs->sqs_head;
}
- mutex_exit(&cpu_lock);
- return (0);
-}
+ /* Also move default squeue to unbound set */
-/*
- * Set squeue profiling.
- * 0 means "disable"
- * 1 means "enable"
- * 2 means "enable and reset"
- */
-/* ARGSUSED */
-int
-ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
- cred_t *cr)
-{
- int *profile_enabled = (int *)cp;
- long new_value;
- squeue_set_t *sqs;
-
- if (ddi_strtol(value, NULL, 10, &new_value) != 0)
- return (EINVAL);
-
- if (new_value == 0)
- squeue_profile_stop();
- else if (new_value == 1)
- squeue_profile_start();
- else if (new_value == 2) {
- int i, j;
-
- squeue_profile_stop();
- mutex_enter(&cpu_lock);
- for (i = 0; i < sqset_global_size; i++) {
- sqs = sqset_global_list[i];
- for (j = 0; j < sqs->sqs_size; j++) {
- squeue_profile_reset(sqs->sqs_list[j]);
- }
- }
- mutex_exit(&cpu_lock);
+ sqp = sqs->sqs_default;
+ ASSERT(sqp);
+ ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);
- new_value = 1;
- squeue_profile_start();
- }
- *profile_enabled = new_value;
+ sqp->sq_next = unbound->sqs_head;
+ unbound->sqs_head = sqp;
+ squeue_unbind(sqp);
+ sqp->sq_set = unbound;
- return (0);
+ for (i = 1; i < sqset_global_size; i++)
+ if (sqset_global_list[i] == sqs)
+ break;
+
+ ASSERT(i < sqset_global_size);
+ sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
+ sqset_global_list[sqset_global_size - 1] = NULL;
+ sqset_global_size--;
+
+ mutex_exit(&sqset_lock);
+ kmem_free(sqs, sizeof (*sqs));
}
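
ip_squeue_set_destroy() combines two small idioms: splicing a singly linked list onto the head of another (the per-CPU squeues onto the unbound set), and deleting from an unordered array by overwriting the victim with the last element (sqset_global_list). A compact model of both, under illustrative names:

    #include <assert.h>
    #include <stddef.h>

    struct sq { struct sq *next; };

    /* Splice NULL-terminated list 'head' onto the front of '*dest'. */
    static void
    splice_front(struct sq **dest, struct sq *head)
    {
        struct sq *last = head;

        if (head == NULL)
            return;
        while (last->next != NULL)
            last = last->next;
        last->next = *dest;
        *dest = head;
    }

    /* Remove slot i from an unordered array by moving the last entry in. */
    static void
    array_remove(void *arr[], int *size, int i)
    {
        assert(i >= 0 && i < *size);
        arr[i] = arr[*size - 1];
        arr[*size - 1] = NULL;
        (*size)--;
    }

    int
    main(void)
    {
        struct sq a = { NULL }, b = { &a };     /* list: b -> a */
        struct sq *unbound = NULL;
        void *tbl[3] = { (void *)1, (void *)2, (void *)3 };
        int n = 3;

        splice_front(&unbound, &b);             /* unbound: b -> a */
        array_remove(tbl, &n, 0);               /* tbl: {3, 2}, n = 2 */
        return (unbound == &b && n == 2 ? 0 : 1);
    }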
/*
* Reconfiguration callback
*/
-
/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
- cpu_t *cp = cpu[id];
+ cpu_t *cp = cpu_get(id);
ASSERT(MUTEX_HELD(&cpu_lock));
switch (what) {
case CPU_CONFIG:
- /*
- * A new CPU is added. Create an squeue for it but do not bind
- * it yet.
- */
- if (cp->cpu_squeue_set == NULL)
- cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
- break;
case CPU_ON:
case CPU_INIT:
case CPU_CPUPART_IN:
- if (cp->cpu_squeue_set == NULL) {
- cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
- }
- if (ip_squeue_bind)
- ip_squeue_set_bind(cp->cpu_squeue_set);
+ if (cp->cpu_squeue_set == NULL)
+ cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
break;
case CPU_UNCONFIG:
case CPU_OFF:
case CPU_CPUPART_OUT:
ASSERT((cp->cpu_squeue_set != NULL) ||
(cp->cpu_flags & CPU_OFFLINE));
-
if (cp->cpu_squeue_set != NULL) {
- ip_squeue_set_unbind(cp->cpu_squeue_set);
+ ip_squeue_set_destroy(cp);
+ cp->cpu_squeue_set = NULL;
}
break;
default:
@@ -1111,54 +756,3 @@ ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
}
return (0);
}
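
The reworked callback treats every "CPU arriving" event identically (create the squeue set if absent) and every "CPU leaving" event identically (destroy it), which keeps the handler idempotent across repeated notifications. A sketch of that shape, with hypothetical event names standing in for the cpu_setup_t cases:

    #include <stdlib.h>

    enum cpu_event { EV_CONFIG, EV_ON, EV_INIT, EV_UNCONFIG, EV_OFF };

    static void *per_cpu[64];   /* stand-in for cpu_squeue_set */

    static int
    cpu_callback(enum cpu_event what, int id)
    {
        switch (what) {
        case EV_CONFIG:     /* FALLTHRU */
        case EV_ON:         /* FALLTHRU */
        case EV_INIT:
            if (per_cpu[id] == NULL)        /* idempotent create */
                per_cpu[id] = malloc(64);
            break;
        case EV_UNCONFIG:   /* FALLTHRU */
        case EV_OFF:
            if (per_cpu[id] != NULL) {      /* idempotent destroy */
                free(per_cpu[id]);
                per_cpu[id] = NULL;
            }
            break;
        default:
            break;
        }
        return (0);
    }

    int
    main(void)
    {
        (void) cpu_callback(EV_ON, 1);
        (void) cpu_callback(EV_ON, 1);      /* second call is a no-op */
        (void) cpu_callback(EV_OFF, 1);
        return (0);
    }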
-
-/* ARGSUSED */
-static void
-ip_squeue_set_bind(squeue_set_t *sqs)
-{
- int i;
- squeue_t *sqp;
-
- if (!ip_squeue_bind)
- return;
-
- mutex_enter(&sqs->sqs_lock);
- for (i = 0; i < sqs->sqs_size; i++) {
- sqp = sqs->sqs_list[i];
- if (sqp->sq_state & SQS_BOUND)
- continue;
- squeue_bind(sqp, -1);
- }
- mutex_exit(&sqs->sqs_lock);
-}
-
-static void
-ip_squeue_set_unbind(squeue_set_t *sqs)
-{
- int i;
- squeue_t *sqp;
-
- mutex_enter(&sqs->sqs_lock);
- for (i = 0; i < sqs->sqs_size; i++) {
- sqp = sqs->sqs_list[i];
-
- /*
- * CPU is going offline. Remove the thread affinity
- * for any soft ring threads the squeue is managing.
- */
- if (sqp->sq_state & SQS_ILL_BOUND) {
- ill_rx_ring_t *ring = sqp->sq_rx_ring;
- ill_t *ill = ring->rr_ill;
-
- if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
- ASSERT(ring->rr_handle != NULL);
- ill->ill_dls_capab->ill_dls_unbind(
- ring->rr_handle);
- }
- }
- if (!(sqp->sq_state & SQS_BOUND))
- continue;
- squeue_unbind(sqp);
- }
- mutex_exit(&sqs->sqs_lock);
-}
diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c
index 7274576285..f785d8a3f6 100644
--- a/usr/src/uts/common/inet/ip/spd.c
+++ b/usr/src/uts/common/inet/ip/spd.c
@@ -176,13 +176,6 @@ int ipsec_weird_null_inbound_policy = 0;
(((sa1)->ipsa_dst_cid == (sa2)->ipsa_dst_cid))))
/*
- * IPv4 Fragments
- */
-#define IS_V4_FRAGMENT(ipha_fragment_offset_and_flags) \
- (((ntohs(ipha_fragment_offset_and_flags) & IPH_OFFSET) != 0) || \
- ((ntohs(ipha_fragment_offset_and_flags) & IPH_MF) != 0))
-
-/*
* IPv6 Fragments
*/
#define IS_V6_FRAGMENT(ipp) (ipp.ipp_fields & IPPF_FRAGHDR)
diff --git a/usr/src/uts/common/inet/ip/tun.c b/usr/src/uts/common/inet/ip/tun.c
index 24af532b77..632601b5f1 100644
--- a/usr/src/uts/common/inet/ip/tun.c
+++ b/usr/src/uts/common/inet/ip/tun.c
@@ -3202,7 +3202,7 @@ tun_rdata_v4(queue_t *q, mblk_t *ipsec_mp, mblk_t *data_mp, tun_t *atp)
*/
pullup_len = hdrlen + (inner_v4 ? sizeof (ipha_t) : sizeof (ip6_t)) + 4;
if ((data_mp->b_wptr - data_mp->b_rptr) < pullup_len) {
- if (!pullupmsg(data_mp, hdrlen + pullup_len)) {
+ if (!pullupmsg(data_mp, pullup_len)) {
atomic_add_32(&atp->tun_InErrors, 1);
atomic_add_32(&atp->tun_InDiscard, 1);
if (ipsec_mp != NULL)
diff --git a/usr/src/uts/common/inet/ip_ftable.h b/usr/src/uts/common/inet/ip_ftable.h
index e729761147..6a3a05183b 100644
--- a/usr/src/uts/common/inet/ip_ftable.h
+++ b/usr/src/uts/common/inet/ip_ftable.h
@@ -27,8 +27,6 @@
#ifndef _INET_IP_FTABLE_H
#define _INET_IP_FTABLE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -94,6 +92,8 @@ extern void ire_delete_host_redirects(ipaddr_t, ip_stack_t *);
extern ire_t *ire_ihandle_lookup_onlink(ire_t *);
extern ire_t *ire_forward(ipaddr_t, enum ire_forward_action *, ire_t *,
ire_t *, const struct ts_label_s *, ip_stack_t *);
+extern ire_t *ire_forward_simple(ipaddr_t, enum ire_forward_action *,
+ ip_stack_t *);
extern irb_t *ire_get_bucket(ire_t *);
extern uint_t ifindex_lookup(const struct sockaddr *, zoneid_t);
extern int ipfil_sendpkt(const struct sockaddr *, mblk_t *, uint_t, zoneid_t);
diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h
index 1bd5b47a9f..c0a6c51696 100644
--- a/usr/src/uts/common/inet/ip_if.h
+++ b/usr/src/uts/common/inet/ip_if.h
@@ -142,6 +142,12 @@ extern "C" {
#define RESTRICT_TO_GROUP 0x1 /* Restrict to IPMP group */
#define RESTRICT_TO_ILL 0x2 /* Restrict to ILL */
+#ifdef DEBUG
+#define ILL_MAC_PERIM_HELD(ill) ill_mac_perim_held(ill)
+#else
+#define ILL_MAC_PERIM_HELD(ill)
+#endif
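
The #ifdef pairing above relies on ASSERT() discarding its argument in non-DEBUG builds, so the empty expansion of ILL_MAC_PERIM_HELD() never reaches the compiler. A self-contained illustration of the pattern (ASSERT here is a local stand-in for the kernel's):

    #include <stdlib.h>

    struct lock { int held; };

    #ifdef DEBUG
    static int lock_held(struct lock *l) { return (l->held); }
    #define ASSERT(x)     ((void)((x) || (abort(), 0)))
    #define LOCK_HELD(l)  lock_held(l)
    #else
    #define ASSERT(x)     ((void)0)   /* argument never expanded... */
    #define LOCK_HELD(l)              /* ...so an empty body is safe */
    #endif

    static void
    needs_lock(struct lock *l)
    {
        ASSERT(LOCK_HELD(l));   /* evaluated only under -DDEBUG */
        (void) l;               /* silence unused warning otherwise */
    }

    int
    main(void)
    {
        struct lock l = { 1 };

        needs_lock(&l);
        return (0);
    }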
+
/* for ipif_resolver_up */
enum ip_resolver_action {
Res_act_initial, /* initial address establishment */
@@ -158,6 +164,7 @@ extern void ill_dlpi_done(ill_t *, t_uscalar_t);
extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t);
extern void ill_dlpi_send(ill_t *, mblk_t *);
extern void ill_dlpi_send_deferred(ill_t *);
+extern void ill_capability_done(ill_t *);
extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t);
extern ill_t *ill_group_lookup_on_ifindex(uint_t, boolean_t, ip_stack_t *);
@@ -208,9 +215,12 @@ extern void ill_untrace_ref(ill_t *);
extern boolean_t ill_down_start(queue_t *, mblk_t *);
extern ill_t *ill_lookup_group_v6(const in6_addr_t *, zoneid_t,
ip_stack_t *);
+
extern void ill_capability_ack(ill_t *, mblk_t *);
extern void ill_capability_probe(ill_t *);
-extern void ill_capability_reset(ill_t *);
+extern void ill_capability_reset(ill_t *, boolean_t);
+extern void ill_taskq_dispatch(ip_stack_t *);
+
extern void ill_mtu_change(ire_t *, char *);
extern void ill_group_cleanup(ill_t *);
extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *);
@@ -281,10 +291,11 @@ extern void ipsq_current_start(ipsq_t *, ipif_t *, int);
extern void ipsq_current_finish(ipsq_t *);
extern void ipsq_enq(ipsq_t *, queue_t *, mblk_t *, ipsq_func_t, int,
ill_t *);
-extern boolean_t ipsq_enter(ill_t *, boolean_t);
+extern boolean_t ipsq_enter(ill_t *, boolean_t, int);
extern ipsq_t *ipsq_try_enter(ipif_t *, ill_t *, queue_t *, mblk_t *,
ipsq_func_t, int, boolean_t);
extern void ipsq_exit(ipsq_t *);
+extern boolean_t ill_mac_perim_held(ill_t *);
extern mblk_t *ipsq_pending_mp_get(ipsq_t *, conn_t **);
extern boolean_t ipsq_pending_mp_add(conn_t *, ipif_t *, queue_t *,
mblk_t *, int);
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index d993e5f6b4..f7a9b8ff58 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -40,6 +40,7 @@ extern "C" {
#ifdef _KERNEL
#include <sys/sdt.h>
+#include <sys/dld.h>
#define IP_MOD_ID 5701
@@ -359,7 +360,7 @@ typedef struct ip_mdt_info_s {
ill->ill_mdt_capab->ill_mdt_on != 0)
#define ILL_LSO_CAPABLE(ill) \
- (((ill)->ill_capabilities & ILL_CAPAB_LSO) != 0)
+ (((ill)->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0)
/*
* ioctl identifier and structure for Large Segment Offload
@@ -378,12 +379,11 @@ typedef struct ip_lso_info_s {
#define ILL_LSO_USABLE(ill) \
(ILL_LSO_CAPABLE(ill) && \
ill->ill_lso_capab != NULL && \
- ill->ill_lso_capab->ill_lso_version == LSO_VERSION_1 && \
ill->ill_lso_capab->ill_lso_on != 0)
#define ILL_LSO_TCP_USABLE(ill) \
(ILL_LSO_USABLE(ill) && \
- ill->ill_lso_capab->ill_lso_flags & LSO_TX_BASIC_TCP_IPV4)
+ ill->ill_lso_capab->ill_lso_flags & DLD_LSO_TX_BASIC_TCP_IPV4)
/*
* Macro that determines whether or not a given CONN may be considered
@@ -497,43 +497,36 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t;
(connp)->conn_udp->udp_drain_qfull : \
!canputnext((connp)->conn_rq))
-#define ILL_DLS_CAPABLE(ill) \
- (((ill)->ill_capabilities & \
- (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) != 0)
-
-/*
- * Macro that hands off one or more messages directly to DLD
- * when the interface is marked with ILL_CAPAB_POLL.
- */
-#define IP_DLS_ILL_TX(ill, ipha, mp, ipst, hlen) { \
- ill_dls_capab_t *ill_dls = ill->ill_dls_capab; \
- ASSERT(ILL_DLS_CAPABLE(ill)); \
- ASSERT(ill_dls != NULL); \
- ASSERT(ill_dls->ill_tx != NULL); \
- ASSERT(ill_dls->ill_tx_handle != NULL); \
- DTRACE_PROBE4(ip4__physical__out__start, \
- ill_t *, NULL, ill_t *, ill, \
- ipha_t *, ipha, mblk_t *, mp); \
- FW_HOOKS(ipst->ips_ip4_physical_out_event, \
- ipst->ips_ipv4firewall_physical_out, \
- NULL, ill, ipha, mp, mp, 0, ipst); \
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); \
- if (mp != NULL) { \
- if (ipst->ips_ipobs_enabled) { \
- zoneid_t szone; \
- \
- szone = ip_get_zoneid_v4(ipha->ipha_src, mp, \
- ipst, ALL_ZONES); \
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, \
- ALL_ZONES, ill, IPV4_VERSION, hlen, ipst); \
- } \
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, \
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, \
- ipha_t *, ipha, ip6_t *, NULL, int, 0); \
- ill_dls->ill_tx(ill_dls->ill_tx_handle, mp); \
- } \
+/* Macro that follows definitions of flags for mac_tx() (see mac_client.h) */
+#define IP_DROP_ON_NO_DESC 0x01 /* Equivalent to MAC_DROP_ON_NO_DESC */
+
+#define ILL_DIRECT_CAPABLE(ill) \
+ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0)
+
+#define ILL_SEND_TX(ill, ire, hint, mp, flag) { \
+ if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \
+ ill_dld_direct_t *idd; \
+ \
+ idd = &(ill)->ill_dld_capab->idc_direct; \
+ /* \
+ * Send the packet directly to DLD, where it \
+ * may be queued depending on the availability \
+ * of transmit resources at the media layer. \
+			 * Ignore the returned value for the time being. \
+			 * In the future, we may want to take this into \
+			 * account and flow-control the TCP. \
+ */ \
+ (void) idd->idd_tx_df(idd->idd_tx_dh, mp, \
+ (uintptr_t)(hint), flag); \
+ } else { \
+ putnext((ire)->ire_stq, mp); \
+ } \
}
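
ILL_SEND_TX() chooses between a negotiated direct function call into DLD (for M_DATA when the DLD_DIRECT capability is present) and the traditional putnext() onto the STREAMS queue. The underlying pattern is a function-pointer fast path with a queued fallback; a user-level sketch, with putnext() replaced by a hypothetical queue_tx():

    #include <stddef.h>
    #include <stdio.h>

    typedef int (*tx_func_t)(void *handle, const char *pkt, unsigned hint);

    struct ifc {
        tx_func_t tx_df;    /* direct-call entry, set iff negotiated */
        void *tx_dh;        /* opaque handle for the entry point */
    };

    /* Slow path, standing in for putnext() onto the STREAMS queue. */
    static int
    queue_tx(const char *pkt)
    {
        return (printf("queued: %s\n", pkt));
    }

    static int
    direct_tx(void *handle, const char *pkt, unsigned hint)
    {
        (void) handle;
        return (printf("direct (hint %u): %s\n", hint, pkt));
    }

    /* Mirror of ILL_SEND_TX: prefer the negotiated direct call. */
    static int
    send_tx(struct ifc *ifp, const char *pkt, unsigned hint)
    {
        if (ifp->tx_df != NULL)
            return (ifp->tx_df(ifp->tx_dh, pkt, hint));
        return (queue_tx(pkt));
    }

    int
    main(void)
    {
        struct ifc fast = { direct_tx, NULL }, slow = { NULL, NULL };

        (void) send_tx(&fast, "payload-a", 7);
        (void) send_tx(&slow, "payload-b", 7);
        return (0);
    }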
+#define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \
+ (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \
+ (((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr))
+
/*
* In non-global zone exclusive IP stacks, data structures such as IRE
* entries pretend that they're in the global zone. The following
@@ -548,6 +541,7 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t;
extern int ip_wput_frag_mdt_min;
extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t);
extern mblk_t *ip_prepend_zoneid(mblk_t *, zoneid_t, ip_stack_t *);
+extern void ill_flow_enable(void *, ip_mac_tx_cookie_t);
extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_stack_t *, zoneid_t);
extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *,
ip_stack_t *, zoneid_t);
diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h
index c9a0e12ea1..7accbbcfa3 100644
--- a/usr/src/uts/common/inet/ip_ire.h
+++ b/usr/src/uts/common/inet/ip_ire.h
@@ -235,6 +235,7 @@ extern void ire_atomic_end(irb_t *irb_ptr, ire_t *ire);
extern void ire_cache_count(ire_t *, char *);
extern ire_t *ire_cache_lookup(ipaddr_t, zoneid_t,
const struct ts_label_s *, ip_stack_t *);
+extern ire_t *ire_cache_lookup_simple(ipaddr_t, ip_stack_t *);
extern ire_t *ire_cache_lookup_v6(const in6_addr_t *, zoneid_t,
const struct ts_label_s *, ip_stack_t *);
extern void ire_cache_reclaim(ire_t *, char *);
diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h
index b788b95fa0..d0c3953374 100644
--- a/usr/src/uts/common/inet/ip_stack.h
+++ b/usr/src/uts/common/inet/ip_stack.h
@@ -35,7 +35,7 @@ extern "C" {
#include <netinet/igmp_var.h>
#ifdef _KERNEL
-
+#include <sys/list.h>
/*
* IP statistics.
@@ -175,6 +175,13 @@ struct ip_stack {
struct ill_group *ips_illgrp_head_v4; /* Head of IPv4 ill groups */
struct ill_group *ips_illgrp_head_v6; /* Head of IPv6 ill groups */
+ /* Taskq dispatcher for capability operations */
+ kmutex_t ips_capab_taskq_lock;
+ kcondvar_t ips_capab_taskq_cv;
+ list_t ips_capab_taskq_list;
+ kthread_t *ips_capab_taskq_thread;
+ boolean_t ips_capab_taskq_quit;
+
/* ipclassifier.c - keep in ip_stack_t */
/* ipclassifier hash tables */
struct connf_s *ips_rts_clients;
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index dac6d023f7..4665549c69 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -26,8 +26,6 @@
#ifndef _INET_IPCLASSIFIER_H
#define _INET_IPCLASSIFIER_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -222,10 +220,13 @@ struct conn_s {
conn_recvslla : 1, /* IP_RECVSLLA option */
conn_mdt_ok : 1, /* MDT is permitted */
conn_nexthop_set : 1,
- conn_allzones : 1, /* SO_ALLZONES */
+ conn_allzones : 1; /* SO_ALLZONES */
+ unsigned int
conn_lso_ok : 1; /* LSO is usable */
+ squeue_t *conn_initial_sqp; /* Squeue at open time */
+ squeue_t *conn_final_sqp; /* Squeue after connect */
ill_t *conn_nofailover_ill; /* Failover ill */
ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */
ipsec_latch_t *conn_latch; /* latched state */
@@ -286,8 +287,8 @@ struct conn_s {
int conn_orig_bound_ifindex; /* BOUND_IF before MOVE */
int conn_orig_multicast_ifindex;
/* IPv6 MC IF before MOVE */
- struct conn_s *conn_drain_next; /* Next conn in drain list */
- struct conn_s *conn_drain_prev; /* Prev conn in drain list */
+ struct conn_s *conn_drain_next; /* Next conn in drain list */
+ struct conn_s *conn_drain_prev; /* Prev conn in drain list */
idl_t *conn_idl; /* Ptr to the drain list head */
mblk_t *conn_ipsec_opt_mp; /* ipsec option mblk */
uint32_t conn_src_preferences; /* prefs for src addr select */
@@ -499,6 +500,7 @@ struct connf_s {
(connp)->conn_ports = ports; \
(connp)->conn_send = ip_output; \
(connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \
+ (connp)->conn_initial_sqp = (connp)->conn_sqp; \
}
#define IPCL_TCP_EAGER_INIT_V6(connp, protocol, src, rem, ports) { \
@@ -508,6 +510,7 @@ struct connf_s {
(connp)->conn_ports = ports; \
(connp)->conn_send = ip_output_v6; \
(connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \
+ (connp)->conn_initial_sqp = (connp)->conn_sqp; \
}
#define IPCL_UDP_HASH(lport, ipst) \
diff --git a/usr/src/uts/common/inet/ipdrop.h b/usr/src/uts/common/inet/ipdrop.h
index 88dcda264c..9fe672434e 100644
--- a/usr/src/uts/common/inet/ipdrop.h
+++ b/usr/src/uts/common/inet/ipdrop.h
@@ -124,7 +124,6 @@ struct ip_dropstats {
};
#endif /* _KERNEL */
-
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 4895e2249e..559abd9178 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -19,144 +19,95 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
- * Squeues - TCP/IP serialization mechanism.
- *
- * This is a general purpose high-performance serialization mechanism. It is
- * similar to a taskq with a single worker thread, the difference is that it
- * does not imply a context switch - the thread placing a request may actually
- * process it. It is also biased for processing requests in interrupt context.
- *
- * Each squeue has a worker thread which may optionally be bound to a CPU.
- *
- * Only one thread may process requests from a given squeue at any time. This is
- * called "entering" squeue.
- *
- * Each dispatched request is processed either by
- *
- * a) Dispatching thread or
- * b) Some other thread that is currently processing squeue at the time of
- * request or
- * c) worker thread.
- *
- * INTERFACES:
- *
- * squeue_t *squeue_create(name, bind, wait, pri)
- *
- * name: symbolic name for squeue.
- * wait: time to wait before waiking the worker thread after queueing
- * request.
- * bind: preferred CPU binding for the worker thread.
- * pri: thread priority for the worker thread.
- *
- * This function never fails and may sleep. It returns a transparent pointer
- * to the squeue_t structure that is passed to all other squeue operations.
- *
- * void squeue_bind(sqp, bind)
- *
- * Bind squeue worker thread to a CPU specified by the 'bind' argument. The
- * 'bind' value of -1 binds to the preferred thread specified for
- * squeue_create.
- *
- * NOTE: Any value of 'bind' other then -1 is not supported currently, but the
- * API is present - in the future it may be useful to specify different
- * binding.
- *
- * void squeue_unbind(sqp)
- *
- * Unbind the worker thread from its preferred CPU.
- *
- * void squeue_enter(*sqp, *mp, proc, arg, tag)
- *
- * Post a single request for processing. Each request consists of mblock 'mp',
- * function 'proc' to execute and an argument 'arg' to pass to this
- * function. The function is called as (*proc)(arg, mp, sqp); The tag is an
- * arbitrary number from 0 to 255 which will be stored in mp to track exact
- * caller of squeue_enter. The combination of function name and the tag should
- * provide enough information to identify the caller.
- *
- * If no one is processing the squeue, squeue_enter() will call the function
- * immediately. Otherwise it will add the request to the queue for later
- * processing. Once the function is executed, the thread may continue
- * executing all other requests pending on the queue.
+ * Squeues: General purpose serialization mechanism
+ * ------------------------------------------------
*
- * NOTE: The tagging information is only used when SQUEUE_DEBUG is set to 1.
- * NOTE: The argument can be conn_t only. Ideally we'd like to have generic
- * argument, but we want to drop connection reference count here - this
- * improves tail-call optimizations.
- * XXX: The arg should have type conn_t.
+ * Background:
+ * -----------
*
- * void squeue_enter_nodrain(*sqp, *mp, proc, arg, tag)
+ * This is a general purpose high-performance serialization mechanism
+ * currently used by TCP/IP. It is implemented by means of a per-CPU
+ * queue, a worker thread, and a polling thread that are bound to the
+ * CPU associated with the squeue. The squeue is strictly FIFO for both
+ * the read and write sides, and only one thread can process it at any
+ * given time. The design goal of the squeue was to offer a very high
+ * degree of parallelization (on a per H/W execution pipeline basis)
+ * with at most one queuing.
*
- * Same as squeue_enter(), but the entering thread will only try to execute a
- * single request. It will not continue executing any pending requests.
+ * Modules needing protection typically call the squeue_enter() or
+ * squeue_enter_chain() routine as soon as a thread enters the module
+ * from either direction. For each packet, the processing function
+ * and argument are stored in the mblk itself. When the packet is ready
+ * to be processed, the squeue retrieves the stored function and calls
+ * it with the supplied argument and a pointer to the packet itself.
+ * The called function can assume that no other thread is processing
+ * the squeue while it is executing.
*
- * void squeue_fill(*sqp, *mp, proc, arg, tag)
+ * Squeue/connection binding:
+ * --------------------------
*
- * Just place the request on the queue without trying to execute it. Arrange
- * for the worker thread to process the request.
+ * TCP/IP uses an IP classifier in conjunction with squeues: specific
+ * connections are assigned to a specific squeue (based on various
+ * policies) at connection creation time. Once assigned, the connection-
+ * to-squeue mapping is never changed, and all future packets for that
+ * connection are processed on that squeue. The connection ("conn") to
+ * squeue mapping is stored in the "conn_t" member "conn_sqp".
*
- * void squeue_profile_enable(sqp)
- * void squeue_profile_disable(sqp)
+ * Since the processing of the connection cuts across multiple layers
+ * but still allows packets for different connections to be processed on
+ * other CPUs/squeues, squeues are also termed a "Vertical Perimeter" or
+ * "Per Connection Vertical Perimeter".
*
- * Enable or disable profiling for specified 'sqp'. Profiling is only
- * available when SQUEUE_PROFILE is set.
+ * Processing Model:
+ * -----------------
*
- * void squeue_profile_reset(sqp)
+ * The squeue doesn't necessarily process packets with its own worker
+ * thread. Callers can choose to just queue the packet, to process the
+ * packet if nothing is queued, or to drain and process. The first two
+ * modes are typically employed when the packet was generated while
+ * already doing processing behind the squeue, and the last mode (drain
+ * and process) is typically employed when a thread is entering the
+ * squeue for the first time. The squeue still imposes a finite time
+ * limit for which an external thread can do processing, after which it
+ * switches processing to its own worker thread.
*
- * Reset all profiling information to zero. Profiling is only
- * available when SQUEUE_PROFILE is set.
+ * Once created, squeues are never deleted. Hence squeue pointers are
+ * always valid. This means that functions outside the squeue can still
+ * refer safely to conn_sqp and there is no need for ref counts.
*
- * void squeue_profile_start()
- * void squeue_profile_stop()
+ * Only a thread executing in the squeue can change the squeue of the
+ * connection. It does so by calling a squeue framework function.
+ * After changing the squeue, the thread must leave the squeue. It must not
+ * continue to execute any code that needs squeue protection.
*
- * Globally enable or disabled profiling for all squeues.
+ * The squeue framework, after entering the squeue, checks if the current
+ * squeue matches the conn_sqp. If the check fails, the packet is delivered
+ * to the right squeue.
*
- * uintptr_t *squeue_getprivate(sqp, p)
+ * Polling Model:
+ * --------------
*
- * Each squeue keeps small amount of private data space available for various
- * consumers. Current consumers include TCP and NCA. Other consumers need to
- * add their private tag to the sqprivate_t enum. The private information is
- * limited to an uintptr_t value. The squeue has no knowledge of its content
- * and does not manage it in any way.
+ * A squeue can control the rate of packet arrival into itself from the
+ * NIC or a specific Rx ring within a NIC. As part of the capability
+ * negotiation between IP and the MAC layer, a squeue is created for each
+ * TCP soft ring (or TCP Rx ring - to be implemented in the future). As
+ * part of this negotiation, each squeue gets a cookie for the underlying
+ * soft ring or Rx ring, a function to turn off incoming packets, and a
+ * function to call to poll for packets. This helps schedule receive-side
+ * packet processing so that the queue backlog doesn't build up and packet
+ * processing doesn't keep getting disturbed by high-priority interrupts.
+ * In this mode, as soon as a backlog starts building, the squeue turns
+ * off interrupts and switches to polling. When the poll thread goes down
+ * to retrieve packets, it retrieves them in the form of a chain, which
+ * improves performance even more. As the squeue/softring system gets
+ * more packets, it gets more efficient by switching to polling more
+ * often and dealing with larger packet chains.
*
- * The typical use may be a breakdown of data structures per CPU (since
- * squeues are usually per CPU). See NCA for examples of use.
- * Currently 'p' may have one legal value SQPRIVATE_TCP.
- *
- * processorid_t squeue_binding(sqp)
- *
- * Returns the CPU binding for a given squeue.
- *
- * TUNABALES:
- *
- * squeue_intrdrain_ms: Maximum time in ms interrupts spend draining any
- * squeue. Note that this is approximation - squeues have no control on the
- * time it takes to process each request. This limit is only checked
- * between processing individual messages.
- * Default: 20 ms.
- *
- * squeue_writerdrain_ms: Maximum time in ms non-interrupts spend draining any
- * squeue. Note that this is approximation - squeues have no control on the
- * time it takes to process each request. This limit is only checked
- * between processing individual messages.
- * Default: 10 ms.
- *
- * squeue_workerdrain_ms: Maximum time in ms worker thread spends draining any
- * squeue. Note that this is approximation - squeues have no control on the
- * time it takes to process each request. This limit is only checked
- * between processing individual messages.
- * Default: 10 ms.
- *
- * squeue_workerwait_ms: When worker thread is interrupted because workerdrain
- * expired, how much time to wait before waking worker thread again.
- * Default: 10 ms.
*/
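
To make the contract above concrete: only one thread may process the squeue at a time, arrivals during processing are queued strictly FIFO, and the thread that wins entry drains whatever accumulated behind it. A minimal user-level model with POSIX threads (SQ_PROCESS semantics only; the worker thread, polling, and drain time limits are omitted, and all names are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    struct item { struct item *next; int v; };

    static pthread_mutex_t sq_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct item *sq_first, *sq_last;
    static int sq_proc;                 /* models SQS_PROC */

    static void
    process(struct item *it)            /* runs with no lock held */
    {
        (void) printf("processing %d\n", it->v);
    }

    /*
     * Model of entering with SQ_PROCESS: if nobody owns the squeue,
     * take it, run the packet inline, then drain whatever arrived
     * while we were busy; otherwise append FIFO and leave.
     */
    static void
    sq_enter(struct item *it)
    {
        struct item *run;

        pthread_mutex_lock(&sq_lock);
        if (sq_proc || sq_first != NULL) {
            it->next = NULL;            /* strict FIFO append */
            if (sq_last != NULL)
                sq_last->next = it;
            else
                sq_first = it;
            sq_last = it;
            pthread_mutex_unlock(&sq_lock);
            return;
        }
        sq_proc = 1;
        pthread_mutex_unlock(&sq_lock);
        process(it);                    /* inline fast path */

        pthread_mutex_lock(&sq_lock);
        while ((run = sq_first) != NULL) {
            sq_first = run->next;
            if (sq_first == NULL)
                sq_last = NULL;
            pthread_mutex_unlock(&sq_lock);
            process(run);               /* drain later arrivals */
            pthread_mutex_lock(&sq_lock);
        }
        sq_proc = 0;
        pthread_mutex_unlock(&sq_lock);
    }

    int
    main(void)
    {
        struct item a = { NULL, 1 }, b = { NULL, 2 };

        sq_enter(&a);
        sq_enter(&b);
        return (0);
    }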
#include <sys/types.h>
@@ -169,208 +120,30 @@
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/ddi.h>
+#include <sys/sunddi.h>
#include <inet/ipclassifier.h>
#include <inet/udp_impl.h>
-/*
- * State flags.
- * Note: The MDB IP module depends on the values of these flags.
- */
-#define SQS_PROC 0x0001 /* being processed */
-#define SQS_WORKER 0x0002 /* worker thread */
-#define SQS_ENTER 0x0004 /* enter thread */
-#define SQS_FAST 0x0008 /* enter-fast thread */
-#define SQS_USER 0x0010 /* A non interrupt user */
-#define SQS_BOUND 0x0020 /* Worker thread is bound */
-#define SQS_PROFILE 0x0040 /* Enable profiling */
-#define SQS_REENTER 0x0080 /* Re entered thread */
-#define SQS_TMO_PROG 0x0100 /* Timeout is being set */
-
#include <sys/squeue_impl.h>
static void squeue_fire(void *);
static void squeue_drain(squeue_t *, uint_t, hrtime_t);
static void squeue_worker(squeue_t *sqp);
-
-#if SQUEUE_PROFILE
-static kmutex_t squeue_kstat_lock;
-static int squeue_kstat_update(kstat_t *, int);
-#endif
+static void squeue_polling_thread(squeue_t *sqp);
kmem_cache_t *squeue_cache;
#define SQUEUE_MSEC_TO_NSEC 1000000
-int squeue_intrdrain_ms = 20;
-int squeue_writerdrain_ms = 10;
-int squeue_workerdrain_ms = 10;
-int squeue_workerwait_ms = 10;
+int squeue_drain_ms = 20;
+int squeue_workerwait_ms = 0;
/* The values above converted to ticks or nano seconds */
-static int squeue_intrdrain_ns = 0;
-static int squeue_writerdrain_ns = 0;
-static int squeue_workerdrain_ns = 0;
+static int squeue_drain_ns = 0;
static int squeue_workerwait_tick = 0;
-/*
- * The minimum packet queued when worker thread doing the drain triggers
- * polling (if squeue allows it). The choice of 3 is arbitrary. You
- * definitely don't want it to be 1 since that will trigger polling
- * on very low loads as well (ssh seems to do be one such example
- * where packet flow was very low yet somehow 1 packet ended up getting
- * queued and worker thread fires every 10ms and blanking also gets
- * triggered.
- */
-int squeue_worker_poll_min = 3;
-
-#if SQUEUE_PROFILE
-/*
- * Set to B_TRUE to enable profiling.
- */
-static int squeue_profile = B_FALSE;
-#define SQ_PROFILING(sqp) (squeue_profile && ((sqp)->sq_state & SQS_PROFILE))
-
-#define SQSTAT(sqp, x) ((sqp)->sq_stats.x++)
-#define SQDELTA(sqp, x, d) ((sqp)->sq_stats.x += (d))
-
-struct squeue_kstat {
- kstat_named_t sq_count;
- kstat_named_t sq_max_qlen;
- kstat_named_t sq_npackets_worker;
- kstat_named_t sq_npackets_intr;
- kstat_named_t sq_npackets_other;
- kstat_named_t sq_nqueued_intr;
- kstat_named_t sq_nqueued_other;
- kstat_named_t sq_ndrains_worker;
- kstat_named_t sq_ndrains_intr;
- kstat_named_t sq_ndrains_other;
- kstat_named_t sq_time_worker;
- kstat_named_t sq_time_intr;
- kstat_named_t sq_time_other;
-} squeue_kstat = {
- { "count", KSTAT_DATA_UINT64 },
- { "max_qlen", KSTAT_DATA_UINT64 },
- { "packets_worker", KSTAT_DATA_UINT64 },
- { "packets_intr", KSTAT_DATA_UINT64 },
- { "packets_other", KSTAT_DATA_UINT64 },
- { "queued_intr", KSTAT_DATA_UINT64 },
- { "queued_other", KSTAT_DATA_UINT64 },
- { "ndrains_worker", KSTAT_DATA_UINT64 },
- { "ndrains_intr", KSTAT_DATA_UINT64 },
- { "ndrains_other", KSTAT_DATA_UINT64 },
- { "time_worker", KSTAT_DATA_UINT64 },
- { "time_intr", KSTAT_DATA_UINT64 },
- { "time_other", KSTAT_DATA_UINT64 },
-};
-#endif
-
-#define SQUEUE_WORKER_WAKEUP(sqp) { \
- timeout_id_t tid = (sqp)->sq_tid; \
- \
- ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \
- /* \
- * Queue isn't being processed, so take \
- * any post enqueue actions needed before leaving. \
- */ \
- if (tid != 0) { \
- /* \
- * Waiting for an enter() to process mblk(s). \
- */ \
- clock_t waited = lbolt - (sqp)->sq_awaken; \
- \
- if (TICK_TO_MSEC(waited) >= (sqp)->sq_wait) { \
- /* \
- * Times up and have a worker thread \
- * waiting for work, so schedule it. \
- */ \
- (sqp)->sq_tid = 0; \
- (sqp)->sq_awaken = lbolt; \
- cv_signal(&(sqp)->sq_async); \
- mutex_exit(&(sqp)->sq_lock); \
- (void) untimeout(tid); \
- return; \
- } \
- mutex_exit(&(sqp)->sq_lock); \
- return; \
- } else if ((sqp)->sq_state & SQS_TMO_PROG) { \
- mutex_exit(&(sqp)->sq_lock); \
- return; \
- } else if ((sqp)->sq_wait != 0) { \
- clock_t wait = (sqp)->sq_wait; \
- /* \
- * Wait up to sqp->sq_wait ms for an \
- * enter() to process this queue. We \
- * don't want to contend on timeout locks \
- * with sq_lock held for performance reasons, \
- * so drop the sq_lock before calling timeout \
- * but we need to check if timeout is required \
- * after re acquiring the sq_lock. Once \
- * the sq_lock is dropped, someone else could \
- * have processed the packet or the timeout could \
- * have already fired. \
- */ \
- (sqp)->sq_state |= SQS_TMO_PROG; \
- mutex_exit(&(sqp)->sq_lock); \
- tid = timeout(squeue_fire, (sqp), wait); \
- mutex_enter(&(sqp)->sq_lock); \
- /* Check again if we still need the timeout */ \
- if ((((sqp)->sq_state & (SQS_PROC|SQS_TMO_PROG)) == \
- SQS_TMO_PROG) && ((sqp)->sq_tid == 0) && \
- ((sqp)->sq_first != NULL)) { \
- (sqp)->sq_state &= ~SQS_TMO_PROG; \
- (sqp)->sq_awaken = lbolt; \
- (sqp)->sq_tid = tid; \
- mutex_exit(&(sqp)->sq_lock); \
- return; \
- } else { \
- if ((sqp)->sq_state & SQS_TMO_PROG) { \
- (sqp)->sq_state &= ~SQS_TMO_PROG; \
- mutex_exit(&(sqp)->sq_lock); \
- (void) untimeout(tid); \
- } else { \
- /* \
- * The timer fired before we could \
- * reacquire the sq_lock. squeue_fire \
- * removes the SQS_TMO_PROG flag \
- * and we don't need to do anything \
- * else. \
- */ \
- mutex_exit(&(sqp)->sq_lock); \
- } \
- } \
- } else { \
- /* \
- * Schedule the worker thread. \
- */ \
- (sqp)->sq_awaken = lbolt; \
- cv_signal(&(sqp)->sq_async); \
- mutex_exit(&(sqp)->sq_lock); \
- } \
- ASSERT(MUTEX_NOT_HELD(&(sqp)->sq_lock)); \
-}
-
-#define ENQUEUE_MP(sqp, mp, proc, arg) { \
- /* \
- * Enque our mblk. \
- */ \
- (mp)->b_queue = NULL; \
- ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \
- ASSERT((mp)->b_prev == NULL && (mp)->b_next == NULL); \
- (mp)->b_queue = (queue_t *)(proc); \
- (mp)->b_prev = (mblk_t *)(arg); \
- \
- if ((sqp)->sq_last != NULL) \
- (sqp)->sq_last->b_next = (mp); \
- else \
- (sqp)->sq_first = (mp); \
- (sqp)->sq_last = (mp); \
- (sqp)->sq_count++; \
- ASSERT((sqp)->sq_count > 0); \
- DTRACE_PROBE2(squeue__enqueue, squeue_t *, sqp, \
- mblk_t *, mp); \
-}
-
+#define MAX_BYTES_TO_PICKUP 150000
#define ENQUEUE_CHAIN(sqp, mp, tail, cnt) { \
/* \
@@ -390,89 +163,120 @@ struct squeue_kstat {
\
}
-#define SQS_POLLING_ON(sqp, rx_ring) { \
- ASSERT(rx_ring != NULL); \
+#define SQS_POLLING_ON(sqp, sq_poll_capable, rx_ring) { \
ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \
- rx_ring->rr_blank(rx_ring->rr_handle, \
- MIN((sqp->sq_avg_drain_time * sqp->sq_count), \
- rx_ring->rr_max_blank_time), \
- rx_ring->rr_max_pkt_cnt); \
- rx_ring->rr_poll_state |= ILL_POLLING; \
- rx_ring->rr_poll_time = lbolt; \
+ if (sq_poll_capable) { \
+ ASSERT(rx_ring != NULL); \
+ ASSERT(sqp->sq_state & SQS_POLL_CAPAB); \
+ if (!(sqp->sq_state & SQS_POLLING)) { \
+ sqp->sq_state |= SQS_POLLING; \
+ rx_ring->rr_intr_disable(rx_ring->rr_intr_handle); \
+ } \
+ } \
}
+#define SQS_POLLING_OFF(sqp, sq_poll_capable, rx_ring) { \
+ ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \
+ if (sq_poll_capable) { \
+ ASSERT(rx_ring != NULL); \
+ ASSERT(sqp->sq_state & SQS_POLL_CAPAB); \
+ if (sqp->sq_state & SQS_POLLING) { \
+ sqp->sq_state &= ~SQS_POLLING; \
+ rx_ring->rr_intr_enable(rx_ring->rr_intr_handle); \
+ } \
+ } \
+}
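
Both macros are edge-triggered: SQS_POLLING is flipped only on an actual state change, so the ring's interrupt-disable and interrupt-enable callbacks fire exactly once per transition no matter how often the macros run. A small model of that latch (illustrative names; in the real code the state word is protected by sq_lock):

    #include <stdio.h>

    #define POLLING 0x01

    struct ring {
        void (*intr_enable)(void);
        void (*intr_disable)(void);
    };

    static unsigned sq_state;

    static void
    polling_on(struct ring *rr)
    {
        if (!(sq_state & POLLING)) {    /* act only on the transition */
            sq_state |= POLLING;
            rr->intr_disable();
        }
    }

    static void
    polling_off(struct ring *rr)
    {
        if (sq_state & POLLING) {
            sq_state &= ~POLLING;
            rr->intr_enable();
        }
    }

    static void ena(void) { (void) printf("interrupts on\n"); }
    static void dis(void) { (void) printf("interrupts off\n"); }

    int
    main(void)
    {
        struct ring rr = { ena, dis };

        polling_on(&rr);
        polling_on(&rr);    /* no-op: already polling */
        polling_off(&rr);
        return (0);
    }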
-#define SQS_POLLING_OFF(sqp, rx_ring) { \
- ASSERT(rx_ring != NULL); \
+#define SQS_POLL_RING(sqp, sq_poll_capable) { \
ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \
- rx_ring->rr_blank(rx_ring->rr_handle, \
- rx_ring->rr_min_blank_time, \
- rx_ring->rr_min_pkt_cnt); \
+ if (sq_poll_capable) { \
+ ASSERT(sqp->sq_state & SQS_POLL_CAPAB); \
+ if (!(sqp->sq_state & SQS_GET_PKTS)) { \
+ sqp->sq_state |= SQS_GET_PKTS; \
+ cv_signal(&sqp->sq_poll_cv); \
+ } \
+ } \
}
+#ifdef DEBUG
+#define SQUEUE_DBG_SET(sqp, mp, proc, connp, tag) { \
+ (sqp)->sq_curmp = (mp); \
+ (sqp)->sq_curproc = (proc); \
+ (sqp)->sq_connp = (connp); \
+ (mp)->b_tag = (sqp)->sq_tag = (tag); \
+}
+
+#define SQUEUE_DBG_CLEAR(sqp) { \
+ (sqp)->sq_curmp = NULL; \
+ (sqp)->sq_curproc = NULL; \
+ (sqp)->sq_connp = NULL; \
+}
+#else
+#define SQUEUE_DBG_SET(sqp, mp, proc, connp, tag)
+#define SQUEUE_DBG_CLEAR(sqp)
+#endif
+
void
squeue_init(void)
{
squeue_cache = kmem_cache_create("squeue_cache",
sizeof (squeue_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
- squeue_intrdrain_ns = squeue_intrdrain_ms * SQUEUE_MSEC_TO_NSEC;
- squeue_writerdrain_ns = squeue_writerdrain_ms * SQUEUE_MSEC_TO_NSEC;
- squeue_workerdrain_ns = squeue_workerdrain_ms * SQUEUE_MSEC_TO_NSEC;
+ squeue_drain_ns = squeue_drain_ms * SQUEUE_MSEC_TO_NSEC;
squeue_workerwait_tick = MSEC_TO_TICK_ROUNDUP(squeue_workerwait_ms);
}
/* ARGSUSED */
squeue_t *
-squeue_create(char *name, processorid_t bind, clock_t wait, pri_t pri)
+squeue_create(clock_t wait, pri_t pri)
{
squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
bzero(sqp, sizeof (squeue_t));
- (void) strncpy(sqp->sq_name, name, SQ_NAMELEN + 1);
- sqp->sq_name[SQ_NAMELEN] = '\0';
-
- sqp->sq_bind = bind;
+ sqp->sq_bind = PBIND_NONE;
+ sqp->sq_priority = pri;
sqp->sq_wait = MSEC_TO_TICK(wait);
- sqp->sq_avg_drain_time =
- drv_hztousec(NSEC_TO_TICK_ROUNDUP(squeue_intrdrain_ns)) /
- NSEC_TO_TICK_ROUNDUP(squeue_intrdrain_ns);
-
-#if SQUEUE_PROFILE
- if ((sqp->sq_kstat = kstat_create("ip", bind, name,
- "net", KSTAT_TYPE_NAMED,
- sizeof (squeue_kstat) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL)) != NULL) {
- sqp->sq_kstat->ks_lock = &squeue_kstat_lock;
- sqp->sq_kstat->ks_data = &squeue_kstat;
- sqp->sq_kstat->ks_update = squeue_kstat_update;
- sqp->sq_kstat->ks_private = sqp;
- kstat_install(sqp->sq_kstat);
- }
-#endif
-
sqp->sq_worker = thread_create(NULL, 0, squeue_worker,
sqp, 0, &p0, TS_RUN, pri);
+ sqp->sq_poll_thr = thread_create(NULL, 0, squeue_polling_thread,
+ sqp, 0, &p0, TS_RUN, pri);
+
+ sqp->sq_enter = squeue_enter;
+ sqp->sq_drain = squeue_drain;
+
return (sqp);
}
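
squeue_create() no longer takes a name or a CPU: the squeue starts life unbound (sq_bind = PBIND_NONE) with its worker and poll threads created up front, and squeue_bind() attaches it to a CPU later. A hedged user-level analogue of that construction order (thread bodies are empty stubs):

    #include <pthread.h>
    #include <stdlib.h>

    struct sq {
        pthread_t worker;
        pthread_t poller;
        int bind;           /* -1 plays the role of PBIND_NONE */
        int wait_ms;
    };

    static void *worker_main(void *a) { (void) a; return (NULL); }
    static void *poller_main(void *a) { (void) a; return (NULL); }

    /* Analogue of squeue_create(): no name, no CPU; binding comes later. */
    static struct sq *
    sq_create(int wait_ms)
    {
        struct sq *sqp = calloc(1, sizeof (*sqp));

        if (sqp == NULL)
            return (NULL);
        sqp->bind = -1;
        sqp->wait_ms = wait_ms;
        (void) pthread_create(&sqp->worker, NULL, worker_main, sqp);
        (void) pthread_create(&sqp->poller, NULL, poller_main, sqp);
        return (sqp);
    }

    int
    main(void)
    {
        struct sq *sqp = sq_create(10);

        if (sqp == NULL)
            return (1);
        (void) pthread_join(sqp->worker, NULL);
        (void) pthread_join(sqp->poller, NULL);
        free(sqp);
        return (0);
    }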
-/* ARGSUSED */
+/*
+ * Bind squeue worker thread to the specified CPU, given by CPU id.
+ * If the CPU id value is -1 (PBIND_NONE), bind the worker thread to the
+ * value specified in the sq_bind field. If a thread is already bound to
+ * a different CPU, unbind it from the old CPU and bind to the new one.
+ */
+
void
squeue_bind(squeue_t *sqp, processorid_t bind)
{
- ASSERT(bind == -1);
-
mutex_enter(&sqp->sq_lock);
+ ASSERT(sqp->sq_bind != PBIND_NONE || bind != PBIND_NONE);
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
if (sqp->sq_state & SQS_BOUND) {
- mutex_exit(&sqp->sq_lock);
- return;
+ if (sqp->sq_bind == bind) {
+ mutex_exit(&sqp->sq_lock);
+ return;
+ }
+ thread_affinity_clear(sqp->sq_worker);
+ } else {
+ sqp->sq_state |= SQS_BOUND;
}
- sqp->sq_state |= SQS_BOUND;
- mutex_exit(&sqp->sq_lock);
+ if (bind != PBIND_NONE)
+ sqp->sq_bind = bind;
thread_affinity_set(sqp->sq_worker, sqp->sq_bind);
+ mutex_exit(&sqp->sq_lock);
}
void
@@ -485,9 +289,98 @@ squeue_unbind(squeue_t *sqp)
}
sqp->sq_state &= ~SQS_BOUND;
+ thread_affinity_clear(sqp->sq_worker);
mutex_exit(&sqp->sq_lock);
+}
- thread_affinity_clear(sqp->sq_worker);
+void
+squeue_worker_wakeup(squeue_t *sqp)
+{
+ timeout_id_t tid = (sqp)->sq_tid;
+
+ ASSERT(MUTEX_HELD(&(sqp)->sq_lock));
+
+ if (sqp->sq_wait == 0) {
+ ASSERT(tid == 0);
+ ASSERT(!(sqp->sq_state & SQS_TMO_PROG));
+ sqp->sq_awaken = lbolt;
+ cv_signal(&sqp->sq_worker_cv);
+ mutex_exit(&sqp->sq_lock);
+ return;
+ }
+
+ /*
+ * Queue isn't being processed, so take
+ * any post enqueue actions needed before leaving.
+ */
+ if (tid != 0) {
+ /*
+ * Waiting for an enter() to process mblk(s).
+ */
+ clock_t waited = lbolt - sqp->sq_awaken;
+
+ if (TICK_TO_MSEC(waited) >= sqp->sq_wait) {
+ /*
+ * Times up and have a worker thread
+ * waiting for work, so schedule it.
+ */
+ sqp->sq_tid = 0;
+ sqp->sq_awaken = lbolt;
+ cv_signal(&sqp->sq_worker_cv);
+ mutex_exit(&sqp->sq_lock);
+ (void) untimeout(tid);
+ return;
+ }
+ mutex_exit(&sqp->sq_lock);
+ return;
+ } else if (sqp->sq_state & SQS_TMO_PROG) {
+ mutex_exit(&sqp->sq_lock);
+ return;
+ } else {
+ clock_t wait = sqp->sq_wait;
+ /*
+ * Wait up to sqp->sq_wait ms for an
+ * enter() to process this queue. We
+ * don't want to contend on timeout locks
+ * with sq_lock held for performance reasons,
+	 * so drop the sq_lock before calling timeout(),
+	 * but we need to check whether the timeout is
+	 * still required after reacquiring the sq_lock.
+	 * Once the sq_lock is dropped, someone else could
+	 * have processed the packet, or the timeout could
+	 * have already fired.
+ */
+ sqp->sq_state |= SQS_TMO_PROG;
+ mutex_exit(&sqp->sq_lock);
+ tid = timeout(squeue_fire, sqp, wait);
+ mutex_enter(&sqp->sq_lock);
+ /* Check again if we still need the timeout */
+ if (((sqp->sq_state & (SQS_PROC|SQS_TMO_PROG)) ==
+ SQS_TMO_PROG) && (sqp->sq_tid == 0) &&
+ (sqp->sq_first != NULL)) {
+ sqp->sq_state &= ~SQS_TMO_PROG;
+ sqp->sq_tid = tid;
+ mutex_exit(&sqp->sq_lock);
+ return;
+ } else {
+ if (sqp->sq_state & SQS_TMO_PROG) {
+ sqp->sq_state &= ~SQS_TMO_PROG;
+ mutex_exit(&sqp->sq_lock);
+ (void) untimeout(tid);
+ } else {
+ /*
+ * The timer fired before we could
+ * reacquire the sq_lock. squeue_fire
+ * removes the SQS_TMO_PROG flag
+ * and we don't need to do anything
+ * else.
+ */
+ mutex_exit(&sqp->sq_lock);
+ }
+ }
+ }
+
+ ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
}
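
The SQS_TMO_PROG choreography exists because timeout() must not be called with sq_lock held: the function claims an "arming in progress" bit, drops the lock, arms the timer, then reacquires the lock and revalidates, cancelling the timer if it became unnecessary in the window. The drop-lock-and-revalidate pattern in isolation (the timer calls are stubs; in the real code squeue_fire() can clear the bit concurrently):

    #include <pthread.h>

    #define TMO_PROG 0x01           /* a timer is being armed */

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned state;
    static int timer_id;

    static int  arm_timer(void)      { return (42); }  /* timeout() stub */
    static void cancel_timer(int id) { (void) id; }    /* untimeout() stub */

    static void
    schedule_wakeup(void)
    {
        int tid;

        pthread_mutex_lock(&lock);
        state |= TMO_PROG;              /* claim the arming slot */
        pthread_mutex_unlock(&lock);    /* never arm with the lock held */

        tid = arm_timer();

        pthread_mutex_lock(&lock);
        if ((state & TMO_PROG) && timer_id == 0) {
            /* Still wanted: record the timer and keep it. */
            state &= ~TMO_PROG;
            timer_id = tid;
            pthread_mutex_unlock(&lock);
            return;
        }
        if (state & TMO_PROG) {
            /* No longer needed: back the timer out. */
            state &= ~TMO_PROG;
            pthread_mutex_unlock(&lock);
            cancel_timer(tid);
        } else {
            /* The timer fired already and cleared TMO_PROG itself. */
            pthread_mutex_unlock(&lock);
        }
    }

    int
    main(void)
    {
        schedule_wakeup();
        return (timer_id == 42 ? 0 : 1);
    }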
/*
@@ -500,18 +393,20 @@ squeue_unbind(squeue_t *sqp)
*
* The proc and arg for each mblk is already stored in the mblk in
* appropriate places.
+ *
+ * The process_flag specifies if we are allowed to process the mblk
+ * and drain in the entering thread context. If process_flag is
+ * SQ_FILL, then we just queue the mblk and return (after signaling
+ * the worker thread if no one else is processing the squeue).
*/
+/* ARGSUSED */
void
-squeue_enter_chain(squeue_t *sqp, mblk_t *mp, mblk_t *tail,
- uint32_t cnt, uint8_t tag)
+squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
+ int process_flag, uint8_t tag)
{
- int interrupt = servicing_interrupt();
- void *arg;
+ conn_t *connp;
sqproc_t proc;
hrtime_t now;
-#if SQUEUE_PROFILE
- hrtime_t start, delta;
-#endif
ASSERT(sqp != NULL);
ASSERT(mp != NULL);
@@ -520,355 +415,111 @@ squeue_enter_chain(squeue_t *sqp, mblk_t *mp, mblk_t *tail,
ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
mutex_enter(&sqp->sq_lock);
- if (!(sqp->sq_state & SQS_PROC)) {
+
+ /*
+	 * Try to process the packet if the SQ_FILL flag is not set and
+	 * we are allowed to process the squeue. SQ_NODRAIN is
+	 * ignored if the packet chain consists of more than one packet.
+ */
+ if (!(sqp->sq_state & SQS_PROC) && ((process_flag == SQ_PROCESS) ||
+ (process_flag == SQ_NODRAIN && sqp->sq_first == NULL))) {
/*
* See if anything is already queued. If we are the
* first packet, do inline processing else queue the
* packet and do the drain.
*/
- sqp->sq_run = curthread;
if (sqp->sq_first == NULL && cnt == 1) {
/*
* Fast-path, ok to process and nothing queued.
*/
sqp->sq_state |= (SQS_PROC|SQS_FAST);
+ sqp->sq_run = curthread;
mutex_exit(&sqp->sq_lock);
/*
* We are the chain of 1 packet so
* go through this fast path.
*/
- arg = mp->b_prev;
+ ASSERT(mp->b_prev != NULL);
+ ASSERT(mp->b_queue != NULL);
+ connp = (conn_t *)mp->b_prev;
mp->b_prev = NULL;
proc = (sqproc_t)mp->b_queue;
mp->b_queue = NULL;
-
- ASSERT(proc != NULL);
- ASSERT(arg != NULL);
+ ASSERT(proc != NULL && connp != NULL);
ASSERT(mp->b_next == NULL);
-#if SQUEUE_DEBUG
- sqp->sq_isintr = interrupt;
- sqp->sq_curmp = mp;
- sqp->sq_curproc = proc;
- sqp->sq_connp = arg;
- mp->b_tag = sqp->sq_tag = tag;
-#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (interrupt)
- SQSTAT(sqp, sq_npackets_intr);
- else
- SQSTAT(sqp, sq_npackets_other);
- start = gethrtime();
- }
-#endif
- ((conn_t *)arg)->conn_on_sqp = B_TRUE;
- DTRACE_PROBE3(squeue__proc__start, squeue_t *,
- sqp, mblk_t *, mp, conn_t *, arg);
- (*proc)(arg, mp, sqp);
- DTRACE_PROBE2(squeue__proc__end, squeue_t *,
- sqp, conn_t *, arg);
- ((conn_t *)arg)->conn_on_sqp = B_FALSE;
-
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- delta = gethrtime() - start;
- if (interrupt)
- SQDELTA(sqp, sq_time_intr, delta);
- else
- SQDELTA(sqp, sq_time_other, delta);
- }
-#endif
-#if SQUEUE_DEBUG
- sqp->sq_curmp = NULL;
- sqp->sq_curproc = NULL;
- sqp->sq_connp = NULL;
- sqp->sq_isintr = 0;
-#endif
-
- CONN_DEC_REF((conn_t *)arg);
- ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
- mutex_enter(&sqp->sq_lock);
- sqp->sq_state &= ~(SQS_PROC|SQS_FAST);
- if (sqp->sq_first == NULL) {
- /*
- * We processed inline our packet and
- * nothing new has arrived. We are done.
- */
- sqp->sq_run = NULL;
- mutex_exit(&sqp->sq_lock);
- return;
- } else if (sqp->sq_bind != CPU->cpu_id) {
- /*
- * If the current thread is not running
- * on the CPU to which this squeue is bound,
- * then don't allow it to drain.
- */
- sqp->sq_run = NULL;
- SQUEUE_WORKER_WAKEUP(sqp);
- return;
- }
- } else {
- ENQUEUE_CHAIN(sqp, mp, tail, cnt);
-#if SQUEUE_DEBUG
- mp->b_tag = tag;
-#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (servicing_interrupt())
- SQSTAT(sqp, sq_nqueued_intr);
- else
- SQSTAT(sqp, sq_nqueued_other);
- if (sqp->sq_stats.sq_max_qlen < sqp->sq_count)
- sqp->sq_stats.sq_max_qlen =
- sqp->sq_count;
- }
-#endif
- }
-
- /*
- * We are here because either we couldn't do inline
- * processing (because something was already queued),
- * or we had a chanin of more than one packet,
- * or something else arrived after we were done with
- * inline processing.
- */
- ASSERT(MUTEX_HELD(&sqp->sq_lock));
- ASSERT(sqp->sq_first != NULL);
-
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- start = gethrtime();
- }
-#endif
-#if SQUEUE_DEBUG
- sqp->sq_isintr = interrupt;
-#endif
-
- now = gethrtime();
- if (interrupt) {
- squeue_drain(sqp, SQS_ENTER, now +
- squeue_intrdrain_ns);
- } else {
- squeue_drain(sqp, SQS_USER, now +
- squeue_writerdrain_ns);
- }
-
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- delta = gethrtime() - start;
- if (interrupt)
- SQDELTA(sqp, sq_time_intr, delta);
- else
- SQDELTA(sqp, sq_time_other, delta);
- }
-#endif
-#if SQUEUE_DEBUG
- sqp->sq_isintr = 0;
-#endif
-
- /*
- * If we didn't do a complete drain, the worker
- * thread was already signalled by squeue_drain.
- */
- sqp->sq_run = NULL;
- mutex_exit(&sqp->sq_lock);
- return;
- } else {
- ASSERT(sqp->sq_run != NULL);
- /*
- * Queue is already being processed. Just enqueue
- * the packet and go away.
- */
-#if SQUEUE_DEBUG
- mp->b_tag = tag;
-#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (servicing_interrupt())
- SQSTAT(sqp, sq_nqueued_intr);
- else
- SQSTAT(sqp, sq_nqueued_other);
- if (sqp->sq_stats.sq_max_qlen < sqp->sq_count)
- sqp->sq_stats.sq_max_qlen = sqp->sq_count;
- }
-#endif
-
- ENQUEUE_CHAIN(sqp, mp, tail, cnt);
- mutex_exit(&sqp->sq_lock);
- return;
- }
-}
-
-/*
- * squeue_enter() - enter squeue *sqp with mblk *mp with argument of *arg.
- */
-void
-squeue_enter(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg,
- uint8_t tag)
-{
- int interrupt = servicing_interrupt();
- hrtime_t now;
-#if SQUEUE_PROFILE
- hrtime_t start, delta;
-#endif
-#if SQUEUE_DEBUG
- conn_t *connp = (conn_t *)arg;
- ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
- ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
-#endif
-
- ASSERT(proc != NULL);
- ASSERT(sqp != NULL);
- ASSERT(mp != NULL);
- ASSERT(mp->b_next == NULL);
- ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
-
- mutex_enter(&sqp->sq_lock);
- if (!(sqp->sq_state & SQS_PROC)) {
- /*
- * See if anything is already queued. If we are the
- * first packet, do inline processing else queue the
- * packet and do the drain.
- */
- sqp->sq_run = curthread;
- if (sqp->sq_first == NULL) {
/*
- * Fast-path, ok to process and nothing queued.
+ * Handle squeue switching. More details in the
+ * block comment at the top of the file
*/
- sqp->sq_state |= (SQS_PROC|SQS_FAST);
- mutex_exit(&sqp->sq_lock);
-
-#if SQUEUE_DEBUG
- sqp->sq_isintr = interrupt;
- sqp->sq_curmp = mp;
- sqp->sq_curproc = proc;
- sqp->sq_connp = connp;
- mp->b_tag = sqp->sq_tag = tag;
-#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (interrupt)
- SQSTAT(sqp, sq_npackets_intr);
- else
- SQSTAT(sqp, sq_npackets_other);
- start = gethrtime();
+ if (connp->conn_sqp == sqp) {
+ SQUEUE_DBG_SET(sqp, mp, proc, connp,
+ tag);
+ connp->conn_on_sqp = B_TRUE;
+ DTRACE_PROBE3(squeue__proc__start, squeue_t *,
+ sqp, mblk_t *, mp, conn_t *, connp);
+ (*proc)(connp, mp, sqp);
+ DTRACE_PROBE2(squeue__proc__end, squeue_t *,
+ sqp, conn_t *, connp);
+ connp->conn_on_sqp = B_FALSE;
+ SQUEUE_DBG_CLEAR(sqp);
+ CONN_DEC_REF(connp);
+ } else {
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
+ connp, SQ_FILL, SQTAG_SQUEUE_CHANGE);
}
-#endif
- ((conn_t *)arg)->conn_on_sqp = B_TRUE;
- DTRACE_PROBE3(squeue__proc__start, squeue_t *,
- sqp, mblk_t *, mp, conn_t *, arg);
- (*proc)(arg, mp, sqp);
- DTRACE_PROBE2(squeue__proc__end, squeue_t *,
- sqp, conn_t *, arg);
- ((conn_t *)arg)->conn_on_sqp = B_FALSE;
-
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- delta = gethrtime() - start;
- if (interrupt)
- SQDELTA(sqp, sq_time_intr, delta);
- else
- SQDELTA(sqp, sq_time_other, delta);
- }
-#endif
-#if SQUEUE_DEBUG
- sqp->sq_curmp = NULL;
- sqp->sq_curproc = NULL;
- sqp->sq_connp = NULL;
- sqp->sq_isintr = 0;
-#endif
-
- CONN_DEC_REF((conn_t *)arg);
ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
mutex_enter(&sqp->sq_lock);
sqp->sq_state &= ~(SQS_PROC|SQS_FAST);
- if (sqp->sq_first == NULL) {
+ sqp->sq_run = NULL;
+ if (sqp->sq_first == NULL ||
+ process_flag == SQ_NODRAIN) {
+ if (sqp->sq_first != NULL) {
+ squeue_worker_wakeup(sqp);
+ return;
+ }
/*
- * We processed inline our packet and
- * nothing new has arrived. We are done.
+ * We processed inline our packet and nothing
+ * new has arrived. We are done. In case any
+ * control actions are pending, wake up the
+ * worker.
*/
- sqp->sq_run = NULL;
+ if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
+ cv_signal(&sqp->sq_worker_cv);
mutex_exit(&sqp->sq_lock);
return;
- } else if (sqp->sq_bind != CPU->cpu_id) {
- /*
- * If the current thread is not running
- * on the CPU to which this squeue is bound,
- * then don't allow it to drain.
- */
- sqp->sq_run = NULL;
- SQUEUE_WORKER_WAKEUP(sqp);
- return;
}
} else {
- ENQUEUE_MP(sqp, mp, proc, arg);
-#if SQUEUE_DEBUG
+ ENQUEUE_CHAIN(sqp, mp, tail, cnt);
+#ifdef DEBUG
mp->b_tag = tag;
#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (servicing_interrupt())
- SQSTAT(sqp, sq_nqueued_intr);
- else
- SQSTAT(sqp, sq_nqueued_other);
- if (sqp->sq_stats.sq_max_qlen < sqp->sq_count)
- sqp->sq_stats.sq_max_qlen =
- sqp->sq_count;
- }
-#endif
}
-
/*
* We are here because either we couldn't do inline
- * processing (because something was already queued)
+ * processing (because something was already queued),
+ * or we had a chain of more than one packet,
* or something else arrived after we were done with
* inline processing.
*/
ASSERT(MUTEX_HELD(&sqp->sq_lock));
ASSERT(sqp->sq_first != NULL);
-
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- start = gethrtime();
- }
-#endif
-#if SQUEUE_DEBUG
- sqp->sq_isintr = interrupt;
-#endif
-
now = gethrtime();
- if (interrupt) {
- squeue_drain(sqp, SQS_ENTER, now +
- squeue_intrdrain_ns);
- } else {
- squeue_drain(sqp, SQS_USER, now +
- squeue_writerdrain_ns);
- }
-
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- delta = gethrtime() - start;
- if (interrupt)
- SQDELTA(sqp, sq_time_intr, delta);
- else
- SQDELTA(sqp, sq_time_other, delta);
- }
-#endif
-#if SQUEUE_DEBUG
- sqp->sq_isintr = 0;
-#endif
+ sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns);
/*
* If we didn't do a complete drain, the worker
* thread was already signalled by squeue_drain.
+ * In case any control actions are pending, wake
+ * up the worker.
*/
sqp->sq_run = NULL;
+ if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
+ cv_signal(&sqp->sq_worker_cv);
mutex_exit(&sqp->sq_lock);
return;
} else {
- ASSERT(sqp->sq_run != NULL);
/*
* We let a thread processing a squeue reenter only
* once. This helps the case of incoming connection
@@ -878,168 +529,42 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg,
* loopback connection where the two ends are bound
* to the same squeue (which is typical on single
* CPU machines).
+ *
* We let the thread reenter only once for the fear
* of stack getting blown with multiple traversal.
*/
+ connp = (conn_t *)mp->b_prev;
if (!(sqp->sq_state & SQS_REENTER) &&
- (sqp->sq_run == curthread) && sqp->sq_first == NULL &&
- (((conn_t *)arg)->conn_on_sqp == B_FALSE)) {
+ (process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
+ (sqp->sq_run == curthread) && (cnt == 1) &&
+ (connp->conn_on_sqp == B_FALSE)) {
sqp->sq_state |= SQS_REENTER;
mutex_exit(&sqp->sq_lock);
- ((conn_t *)arg)->conn_on_sqp = B_TRUE;
- DTRACE_PROBE3(squeue__proc__start, squeue_t *,
- sqp, mblk_t *, mp, conn_t *, arg);
- (*proc)(arg, mp, sqp);
- DTRACE_PROBE2(squeue__proc__end, squeue_t *,
- sqp, conn_t *, arg);
- ((conn_t *)arg)->conn_on_sqp = B_FALSE;
- CONN_DEC_REF((conn_t *)arg);
-
- mutex_enter(&sqp->sq_lock);
- sqp->sq_state &= ~SQS_REENTER;
- mutex_exit(&sqp->sq_lock);
- return;
- }
- /*
- * Queue is already being processed. Just enqueue
- * the packet and go away.
- */
-#if SQUEUE_DEBUG
- mp->b_tag = tag;
-#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (servicing_interrupt())
- SQSTAT(sqp, sq_nqueued_intr);
- else
- SQSTAT(sqp, sq_nqueued_other);
- if (sqp->sq_stats.sq_max_qlen < sqp->sq_count)
- sqp->sq_stats.sq_max_qlen = sqp->sq_count;
- }
-#endif
-
- ENQUEUE_MP(sqp, mp, proc, arg);
- mutex_exit(&sqp->sq_lock);
- return;
- }
-}
-
-void
-squeue_enter_nodrain(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg,
- uint8_t tag)
-{
- int interrupt = servicing_interrupt();
- boolean_t being_processed;
-#if SQUEUE_DEBUG
- conn_t *connp = (conn_t *)arg;
-#endif
-#if SQUEUE_PROFILE
- hrtime_t start, delta;
-#endif
+ ASSERT(mp->b_prev != NULL);
+ ASSERT(mp->b_queue != NULL);
- ASSERT(proc != NULL);
- ASSERT(sqp != NULL);
- ASSERT(mp != NULL);
- ASSERT(mp->b_next == NULL);
- ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
- ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
- ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
-
- mutex_enter(&sqp->sq_lock);
-
- being_processed = (sqp->sq_state & SQS_PROC);
- if (!being_processed && (sqp->sq_first == NULL)) {
- /*
- * Fast-path, ok to process and nothing queued.
- */
- sqp->sq_state |= (SQS_PROC|SQS_FAST);
- sqp->sq_run = curthread;
- mutex_exit(&sqp->sq_lock);
-
-#if SQUEUE_DEBUG
- sqp->sq_isintr = interrupt;
- sqp->sq_curmp = mp;
- sqp->sq_curproc = proc;
- sqp->sq_connp = connp;
- mp->b_tag = sqp->sq_tag = tag;
-#endif
-
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (interrupt)
- SQSTAT(sqp, sq_npackets_intr);
- else
- SQSTAT(sqp, sq_npackets_other);
- start = gethrtime();
- }
-#endif
-
- ((conn_t *)arg)->conn_on_sqp = B_TRUE;
- DTRACE_PROBE3(squeue__proc__start, squeue_t *,
- sqp, mblk_t *, mp, conn_t *, arg);
- (*proc)(arg, mp, sqp);
- DTRACE_PROBE2(squeue__proc__end, squeue_t *,
- sqp, conn_t *, arg);
- ((conn_t *)arg)->conn_on_sqp = B_FALSE;
-
-#if SQUEUE_DEBUG
- sqp->sq_curmp = NULL;
- sqp->sq_curproc = NULL;
- sqp->sq_connp = NULL;
- sqp->sq_isintr = 0;
-#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- delta = gethrtime() - start;
- if (interrupt)
- SQDELTA(sqp, sq_time_intr, delta);
- else
- SQDELTA(sqp, sq_time_other, delta);
- }
-#endif
+ mp->b_prev = NULL;
+ proc = (sqproc_t)mp->b_queue;
+ mp->b_queue = NULL;
- CONN_DEC_REF((conn_t *)arg);
- mutex_enter(&sqp->sq_lock);
- sqp->sq_state &= ~(SQS_PROC|SQS_FAST);
- sqp->sq_run = NULL;
- if (sqp->sq_first == NULL) {
/*
- * We processed inline our packet and
- * nothing new has arrived. We are done.
+ * Handle squeue switching. More details in the
+ * block comment at the top of the file
*/
- mutex_exit(&sqp->sq_lock);
- } else {
- SQUEUE_WORKER_WAKEUP(sqp);
- }
- return;
- } else {
- /*
- * We let a thread processing a squeue reenter only
- * once. This helps the case of incoming connection
- * where a SYN-ACK-ACK that triggers the conn_ind
- * doesn't have to queue the packet if listener and
- * eager are on the same squeue. Also helps the
- * loopback connection where the two ends are bound
- * to the same squeue (which is typical on single
- * CPU machines).
- * We let the thread reenter only once for the fear
- * of stack getting blown with multiple traversal.
- */
- if (being_processed && !(sqp->sq_state & SQS_REENTER) &&
- (sqp->sq_run == curthread) && sqp->sq_first == NULL &&
- (((conn_t *)arg)->conn_on_sqp == B_FALSE)) {
- sqp->sq_state |= SQS_REENTER;
- mutex_exit(&sqp->sq_lock);
-
- ((conn_t *)arg)->conn_on_sqp = B_TRUE;
- DTRACE_PROBE3(squeue__proc__start, squeue_t *,
- sqp, mblk_t *, mp, conn_t *, arg);
- (*proc)(arg, mp, sqp);
- DTRACE_PROBE2(squeue__proc__end, squeue_t *,
- sqp, conn_t *, arg);
- ((conn_t *)arg)->conn_on_sqp = B_FALSE;
- CONN_DEC_REF((conn_t *)arg);
+ if (connp->conn_sqp == sqp) {
+ connp->conn_on_sqp = B_TRUE;
+ DTRACE_PROBE3(squeue__proc__start, squeue_t *,
+ sqp, mblk_t *, mp, conn_t *, connp);
+ (*proc)(connp, mp, sqp);
+ DTRACE_PROBE2(squeue__proc__end, squeue_t *,
+ sqp, conn_t *, connp);
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ } else {
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
+ connp, SQ_FILL, SQTAG_SQUEUE_CHANGE);
+ }
mutex_enter(&sqp->sq_lock);
sqp->sq_state &= ~SQS_REENTER;
@@ -1047,80 +572,32 @@ squeue_enter_nodrain(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg,
return;
}
-#if SQUEUE_DEBUG
+ /*
+	 * The queue is already being processed or there are already
+	 * one or more packets on the queue. Enqueue the
+	 * packet and wake up the squeue worker thread if the
+	 * squeue is not being processed.
+ */
+#ifdef DEBUG
mp->b_tag = tag;
#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (servicing_interrupt())
- SQSTAT(sqp, sq_nqueued_intr);
- else
- SQSTAT(sqp, sq_nqueued_other);
- if (sqp->sq_stats.sq_max_qlen < sqp->sq_count)
- sqp->sq_stats.sq_max_qlen = sqp->sq_count;
- }
-#endif
- ENQUEUE_MP(sqp, mp, proc, arg);
- if (being_processed) {
- /*
- * Queue is already being processed.
- * No need to do anything.
- */
- mutex_exit(&sqp->sq_lock);
+
+ ENQUEUE_CHAIN(sqp, mp, tail, cnt);
+ if (!(sqp->sq_state & SQS_PROC)) {
+ squeue_worker_wakeup(sqp);
return;
}
- SQUEUE_WORKER_WAKEUP(sqp);
- }
-}
-
-/*
- * squeue_fill() - fill squeue *sqp with mblk *mp with argument of *arg
- * without processing the squeue.
- */
-/* ARGSUSED */
-void
-squeue_fill(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void * arg,
- uint8_t tag)
-{
-#if SQUEUE_DEBUG
- conn_t *connp = (conn_t *)arg;
-#endif
- ASSERT(proc != NULL);
- ASSERT(sqp != NULL);
- ASSERT(mp != NULL);
- ASSERT(mp->b_next == NULL);
- ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
- ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
-
- ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
- mutex_enter(&sqp->sq_lock);
- ENQUEUE_MP(sqp, mp, proc, arg);
-#if SQUEUE_DEBUG
- mp->b_tag = tag;
-#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (servicing_interrupt())
- SQSTAT(sqp, sq_nqueued_intr);
- else
- SQSTAT(sqp, sq_nqueued_other);
- if (sqp->sq_stats.sq_max_qlen < sqp->sq_count)
- sqp->sq_stats.sq_max_qlen = sqp->sq_count;
- }
-#endif
-
- /*
- * If queue is already being processed. No need to do anything.
- */
- if (sqp->sq_state & SQS_PROC) {
+ /*
+ * In case any control actions are pending, wake
+ * up the worker.
+ */
+ if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
+ cv_signal(&sqp->sq_worker_cv);
mutex_exit(&sqp->sq_lock);
return;
}
-
- SQUEUE_WORKER_WAKEUP(sqp);
}
-
/*
* PRIVATE FUNCTIONS
*/
@@ -1151,7 +628,7 @@ squeue_fire(void *arg)
if (!(state & SQS_PROC)) {
sqp->sq_awaken = lbolt;
- cv_signal(&sqp->sq_async);
+ cv_signal(&sqp->sq_worker_cv);
}
mutex_exit(&sqp->sq_lock);
}
@@ -1159,64 +636,52 @@ squeue_fire(void *arg)
static void
squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire)
{
- mblk_t *mp;
- mblk_t *head;
- sqproc_t proc;
- conn_t *connp;
- clock_t start = lbolt;
- clock_t drain_time;
- timeout_id_t tid;
- uint_t cnt;
- uint_t total_cnt = 0;
+ mblk_t *mp;
+ mblk_t *head;
+ sqproc_t proc;
+ conn_t *connp;
+ timeout_id_t tid;
ill_rx_ring_t *sq_rx_ring = sqp->sq_rx_ring;
- int interrupt = servicing_interrupt();
- boolean_t poll_on = B_FALSE;
- hrtime_t now;
+ hrtime_t now;
+ boolean_t did_wakeup = B_FALSE;
+ boolean_t sq_poll_capable;
+ sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0;
+again:
ASSERT(mutex_owned(&sqp->sq_lock));
- ASSERT(!(sqp->sq_state & SQS_PROC));
-
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (interrupt)
- SQSTAT(sqp, sq_ndrains_intr);
- else if (!(proc_type & SQS_WORKER))
- SQSTAT(sqp, sq_ndrains_other);
- else
- SQSTAT(sqp, sq_ndrains_worker);
- }
-#endif
+ ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+ SQS_POLL_QUIESCE_DONE)));
+
+ head = sqp->sq_first;
+ sqp->sq_first = NULL;
+ sqp->sq_last = NULL;
+ sqp->sq_count = 0;
if ((tid = sqp->sq_tid) != 0)
sqp->sq_tid = 0;
sqp->sq_state |= SQS_PROC | proc_type;
- head = sqp->sq_first;
- sqp->sq_first = NULL;
- sqp->sq_last = NULL;
- cnt = sqp->sq_count;
+
/*
* We have backlog built up. Switch to polling mode if the
- * device underneath allows it. Need to do it only for
- * drain by non-interrupt thread so interrupts don't
- * come and disrupt us in between. If its a interrupt thread,
- * no need because most devices will not issue another
- * interrupt till this one returns.
+	 * device underneath allows it. Need to do it so that
+	 * more packets don't come in and disturb us (by contending
+	 * for sq_lock or by a higher priority thread preempting us).
+	 *
+	 * The worker thread is allowed to do active polling, while we
+	 * just disable the interrupts for drain by non-worker (kernel
+	 * or userland) threads so they can peacefully process the
+	 * packets during the time allocated to them.
*/
- if ((sqp->sq_state & SQS_POLL_CAPAB) && !(proc_type & SQS_ENTER) &&
- (sqp->sq_count > squeue_worker_poll_min)) {
- ASSERT(sq_rx_ring != NULL);
- SQS_POLLING_ON(sqp, sq_rx_ring);
- poll_on = B_TRUE;
- }
-
+ SQS_POLLING_ON(sqp, sq_poll_capable, sq_rx_ring);
mutex_exit(&sqp->sq_lock);
if (tid != 0)
(void) untimeout(tid);
-again:
+
while ((mp = head) != NULL) {
+
head = mp->b_next;
mp->b_next = NULL;
@@ -1224,255 +689,548 @@ again:
mp->b_queue = NULL;
connp = (conn_t *)mp->b_prev;
mp->b_prev = NULL;
-#if SQUEUE_DEBUG
- sqp->sq_curmp = mp;
- sqp->sq_curproc = proc;
- sqp->sq_connp = connp;
- sqp->sq_tag = mp->b_tag;
-#endif
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- if (interrupt)
- SQSTAT(sqp, sq_npackets_intr);
- else if (!(proc_type & SQS_WORKER))
- SQSTAT(sqp, sq_npackets_other);
- else
- SQSTAT(sqp, sq_npackets_worker);
+ /*
+ * Handle squeue switching. More details in the
+ * block comment at the top of the file
+ */
+ if (connp->conn_sqp == sqp) {
+ SQUEUE_DBG_SET(sqp, mp, proc, connp,
+ mp->b_tag);
+ connp->conn_on_sqp = B_TRUE;
+ DTRACE_PROBE3(squeue__proc__start, squeue_t *,
+ sqp, mblk_t *, mp, conn_t *, connp);
+ (*proc)(connp, mp, sqp);
+ DTRACE_PROBE2(squeue__proc__end, squeue_t *,
+ sqp, conn_t *, connp);
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ } else {
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp,
+ SQ_FILL, SQTAG_SQUEUE_CHANGE);
}
-#endif
-
- connp->conn_on_sqp = B_TRUE;
- DTRACE_PROBE3(squeue__proc__start, squeue_t *,
- sqp, mblk_t *, mp, conn_t *, connp);
- (*proc)(connp, mp, sqp);
- DTRACE_PROBE2(squeue__proc__end, squeue_t *,
- sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
}
-
-#if SQUEUE_DEBUG
- sqp->sq_curmp = NULL;
- sqp->sq_curproc = NULL;
- sqp->sq_connp = NULL;
-#endif
+ SQUEUE_DBG_CLEAR(sqp);
mutex_enter(&sqp->sq_lock);
- sqp->sq_count -= cnt;
- total_cnt += cnt;
+ /*
+ * Check if there is still work to do (either more arrived or timer
+ * expired). If we are the worker thread and we are polling capable,
+ * continue doing the work since no one else is around to do the
+ * work anyway (but signal the poll thread to retrieve some packets
+	 * in the meantime). If we are not the worker thread, just
+ * signal the worker thread to take up the work if processing time
+ * has expired.
+ */
if (sqp->sq_first != NULL) {
-
- now = gethrtime();
- if (!expire || (now < expire)) {
- /* More arrived and time not expired */
- head = sqp->sq_first;
- sqp->sq_first = NULL;
- sqp->sq_last = NULL;
- cnt = sqp->sq_count;
- mutex_exit(&sqp->sq_lock);
- goto again;
- }
-
/*
- * If we are not worker thread and we
- * reached our time limit to do drain,
- * signal the worker thread to pick
- * up the work.
- * If we were the worker thread, then
- * we take a break to allow an interrupt
- * or writer to pick up the load.
+		 * Still more to process. If the time quantum has not
+		 * expired, we should let the drain go on. The worker
+		 * thread is allowed to drain as long as there is
+		 * anything left.
*/
- if (proc_type != SQS_WORKER) {
+ now = gethrtime();
+ if ((now < expire) || (proc_type == SQS_WORKER)) {
+ /*
+			 * If time has not expired, or we are the worker
+			 * thread and this squeue is polling capable,
+			 * continue to do the drain.
+			 *
+			 * We turn off interrupts for all userland threads
+			 * doing drain, but we do active polling only for
+			 * the worker thread.
+ */
+ if (proc_type == SQS_WORKER)
+ SQS_POLL_RING(sqp, sq_poll_capable);
+ goto again;
+ } else {
+ did_wakeup = B_TRUE;
sqp->sq_awaken = lbolt;
- cv_signal(&sqp->sq_async);
+ cv_signal(&sqp->sq_worker_cv);
}
}
/*
- * Try to see if we can get a time estimate to process a packet.
- * Do it only in interrupt context since less chance of context
- * switch or pinning etc. to get a better estimate.
+ * If the poll thread is already running, just return. The
+ * poll thread continues to hold the proc and will finish
+ * processing.
*/
- if (interrupt && ((drain_time = (lbolt - start)) > 0))
- sqp->sq_avg_drain_time = ((80 * sqp->sq_avg_drain_time) +
- (20 * (drv_hztousec(drain_time)/total_cnt)))/100;
-
- sqp->sq_state &= ~(SQS_PROC | proc_type);
+ if (sqp->sq_state & SQS_GET_PKTS) {
+ ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+ SQS_POLL_QUIESCE_DONE)));
+ sqp->sq_state &= ~proc_type;
+ return;
+ }
/*
- * If polling was turned on, turn it off and reduce the default
- * interrupt blank interval as well to bring new packets in faster
- * (reduces the latency when there is no backlog).
+ *
+ * If we are the worker thread and no work is left, send the poll
+ * thread down once more to see if something arrived. Otherwise,
+ * turn the interrupts back on and we are done.
*/
- if (poll_on && (sqp->sq_state & SQS_POLL_CAPAB)) {
- ASSERT(sq_rx_ring != NULL);
- SQS_POLLING_OFF(sqp, sq_rx_ring);
+ if ((proc_type == SQS_WORKER) &&
+ (sqp->sq_state & SQS_POLL_CAPAB)) {
+ /*
+ * Do one last check to see if anything arrived
+ * in the NIC. We leave the SQS_PROC set to ensure
+ * that poll thread keeps the PROC and can decide
+ * if it needs to turn polling off or continue
+ * processing.
+ *
+		 * If we drop the SQS_PROC here and the poll thread comes
+		 * up empty-handed, it cannot safely turn polling off
+		 * since someone else could have acquired the PROC
+		 * and started draining. The previously running poll
+		 * thread and the current thread doing drain would end
+		 * up in a race for turning polling on/off and more
+		 * complex code would be required to deal with it.
+		 *
+		 * It's a lot simpler for the drain to hand the SQS_PROC
+		 * to the poll thread (if running) and let the poll thread
+		 * finish without worrying about racing with any other thread.
+ */
+ ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+ SQS_POLL_QUIESCE_DONE)));
+ SQS_POLL_RING(sqp, sq_poll_capable);
+ sqp->sq_state &= ~proc_type;
+ } else {
+ /*
+		 * The squeue is either not capable of polling or the
+		 * poll thread already finished processing and didn't
+		 * find anything. Since there is nothing queued and
+		 * we already turned polling on (for all threads doing
+		 * drain), we should turn polling off and relinquish
+		 * the PROC.
+ */
+ ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+ SQS_POLL_QUIESCE_DONE)));
+ SQS_POLLING_OFF(sqp, sq_poll_capable, sq_rx_ring);
+ sqp->sq_state &= ~(SQS_PROC | proc_type);
+ if (!did_wakeup && sqp->sq_first != NULL) {
+ squeue_worker_wakeup(sqp);
+ mutex_enter(&sqp->sq_lock);
+ }
+ /*
+ * If we are not the worker and there is a pending quiesce
+		 * event, wake up the worker.
+ */
+ if ((proc_type != SQS_WORKER) &&
+ (sqp->sq_state & SQS_WORKER_THR_CONTROL))
+ cv_signal(&sqp->sq_worker_cv);
}
}
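
For orientation: squeue_drain() above now dequeues the whole pending chain,
processes each mblk, and then re-checks sq_first against its time quantum
before either looping or handing the backlog to the worker. Below is a
minimal userland sketch of that drain-until-quantum hand-off; it is not part
of this commit, and every name in it is invented.

/*
 * Minimal userland model of drain-until-quantum; not illumos code.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct pkt {
	struct pkt *next;
	int id;
};

static long long
now_ns(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return (ts.tv_sec * 1000000000LL + ts.tv_nsec);
}

/*
 * Process packets until the list is empty or the quantum expires;
 * return any leftover backlog and flag whether a worker wakeup is due.
 */
static struct pkt *
drain(struct pkt *head, long long expire, int *wake_worker)
{
	while (head != NULL) {
		if (now_ns() >= expire) {
			*wake_worker = 1;	/* worker takes the backlog */
			return (head);
		}
		struct pkt *mp = head;
		head = mp->next;
		(void) printf("processed packet %d\n", mp->id);
		free(mp);
	}
	*wake_worker = 0;
	return (NULL);
}

int
main(void)
{
	struct pkt *head = NULL;
	int wake;

	for (int i = 4; i > 0; i--) {
		struct pkt *p = malloc(sizeof (*p));
		p->id = i;
		p->next = head;
		head = p;
	}
	head = drain(head, now_ns() + 1000000LL, &wake);
	(void) printf("wake worker: %d, leftover: %s\n",
	    wake, head == NULL ? "none" : "some");
	return (0);
}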
+/*
+ * Quiesce, Restart, or Cleanup of the squeue poll thread.
+ *
+ * Quiesce and Restart: After an squeue poll thread has been quiesced, it does
+ * not attempt to poll the underlying soft ring any more. The quiesce is
+ * triggered by the mac layer when it wants to quiesce a soft ring. Typically
+ * control operations such as changing the fanout of a NIC or VNIC (dladm
+ * setlinkprop) need to quiesce data flow before changing the wiring.
+ * The operation is done by the mac layer, but it calls back into IP to
+ * quiesce the soft ring. After completing the operation (say increase or
+ * decrease of the fanout) the mac layer then calls back into IP to restart
+ * the quiesced soft ring.
+ *
+ * Cleanup: This is triggered when the squeue binding to a soft ring is
+ * removed permanently. Typically interface plumb and unplumb would trigger
+ * this. It can also be triggered from the mac layer when a soft ring is
+ * being deleted say as the result of a fanout reduction. Since squeues are
+ * never deleted, the cleanup marks the squeue as fit for recycling and
+ * moves it to the zeroth squeue set.
+ */
static void
-squeue_worker(squeue_t *sqp)
+squeue_poll_thr_control(squeue_t *sqp)
+{
+ if (sqp->sq_state & SQS_POLL_THR_RESTART) {
+ /* Restart implies a previous quiesce */
+ ASSERT(sqp->sq_state & SQS_POLL_THR_QUIESCED);
+ sqp->sq_state &= ~(SQS_POLL_THR_QUIESCED |
+ SQS_POLL_THR_RESTART);
+ sqp->sq_state |= SQS_POLL_CAPAB;
+ cv_signal(&sqp->sq_worker_cv);
+ return;
+ }
+
+ if (sqp->sq_state & SQS_POLL_THR_QUIESCE) {
+ sqp->sq_state |= SQS_POLL_THR_QUIESCED;
+ sqp->sq_state &= ~SQS_POLL_THR_QUIESCE;
+ cv_signal(&sqp->sq_worker_cv);
+ return;
+ }
+}
+
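
The control handler above flips a small set of state bits: a restart clears
the quiesced state, while a quiesce request enters it, and in both cases the
worker is signalled. A hypothetical userland model of just those transitions
follows; the flag values here are invented, not the kernel's SQS_* bits.

/*
 * Model of the two poll-thread control transitions; names invented.
 */
#include <stdio.h>

#define	THR_QUIESCE	0x1
#define	THR_QUIESCED	0x2
#define	THR_RESTART	0x4

static unsigned
poll_thr_control(unsigned state)
{
	if (state & THR_RESTART) {
		/* A restart only makes sense after a quiesce. */
		state &= ~(THR_QUIESCED | THR_RESTART);
	} else if (state & THR_QUIESCE) {
		state |= THR_QUIESCED;		/* acknowledge the request */
		state &= ~THR_QUIESCE;
	}
	return (state);	/* the caller would cv_signal() the waiter here */
}

int
main(void)
{
	unsigned s = poll_thr_control(THR_QUIESCE);

	(void) printf("after quiesce: %#x\n", s);
	(void) printf("after restart: %#x\n",
	    poll_thr_control(s | THR_RESTART));
	return (0);
}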
+/*
+ * POLLING Notes
+ *
+ * With polling mode, we want to do as much processing as we possibly can
+ * in worker thread context. The sweet spot is when the worker thread keeps
+ * doing work all the time in polling mode and writers etc. keep dumping
+ * packets to the worker thread. Occasionally, we send the poll thread
+ * (running at a lower priority) down to the NIC to get a chain of packets
+ * to feed to the worker. Sending the poll thread down to the NIC depends
+ * on three criteria:
+ *
+ * 1) It's always driven from squeue_drain and only if the worker thread
+ *    is doing the drain.
+ * 2) We cleared the backlog once and more packets arrived in between.
+ *    Before starting the drain again, send the poll thread down if
+ *    the drain is being done by the worker thread.
+ * 3) Before exiting squeue_drain, if the poll thread is not already
+ *    working and we are the worker thread, try to poll one more time.
+ *
+ * For latency's sake, we do allow any thread calling squeue_enter
+ * to process its packet provided:
+ *
+ * 1) Nothing is queued.
+ * 2) If more packets arrived in between, non-worker threads are allowed
+ *    to do the drain till their time quantum expires, provided
+ *    SQS_GET_PKTS wasn't set in between.
+ *
+ * Avoiding deadlocks with interrupts
+ * ==================================
+ *
+ * One of the big problems is that we can't send the poll thread down while
+ * holding the sq_lock, since the thread can block. So we drop the sq_lock
+ * before calling sq_get_pkts(). We keep holding the SQS_PROC as long as
+ * the poll thread is running so that no other thread can acquire the
+ * perimeter in between. If the squeue_drain gets done (no more work
+ * left), it leaves the SQS_PROC set if the poll thread is running.
+ */
+
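
To make the inline-processing rule in the notes above concrete, here is an
illustrative predicate. It is not kernel code; the field names are invented
stand-ins for sq_first, SQS_PROC, and SQS_GET_PKTS.

/*
 * Illustrative model of "may a squeue_enter caller process inline?".
 */
#include <stdio.h>

struct sq_model {
	int queued;	/* packets already on the squeue (sq_first) */
	int proc_held;	/* some thread owns the squeue (SQS_PROC) */
	int get_pkts;	/* poll thread is fetching from the NIC */
};

static int
may_process_inline(const struct sq_model *sq)
{
	return (sq->queued == 0 && !sq->proc_held && !sq->get_pkts);
}

int
main(void)
{
	struct sq_model idle = { 0, 0, 0 };
	struct sq_model busy = { 3, 1, 0 };

	(void) printf("idle: %d, busy: %d\n",
	    may_process_inline(&idle), may_process_inline(&busy));
	return (0);
}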
+/*
+ * This is the squeue poll thread. In poll mode, it polls the underlying
+ * TCP softring and feeds packets into the squeue. The worker thread then
+ * drains the squeue. The poll thread also responds to control signals for
+ * quiescing, restarting, or cleanup of an squeue. These are driven by
+ * control operations like plumb/unplumb or as a result of dynamic Rx ring
+ * related operations that are driven from the mac layer.
+ */
+static void
+squeue_polling_thread(squeue_t *sqp)
{
kmutex_t *lock = &sqp->sq_lock;
- kcondvar_t *async = &sqp->sq_async;
+ kcondvar_t *async = &sqp->sq_poll_cv;
+ ip_mac_rx_t sq_get_pkts;
+ ip_accept_t ip_accept;
+ ill_rx_ring_t *sq_rx_ring;
+ ill_t *sq_ill;
+ mblk_t *head, *tail, *mp;
+ uint_t cnt;
+ void *sq_mac_handle;
callb_cpr_t cprinfo;
- hrtime_t now;
-#if SQUEUE_PROFILE
- hrtime_t start;
-#endif
+ size_t bytes_to_pickup;
+ uint32_t ctl_state;
- CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "nca");
+ CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_poll");
mutex_enter(lock);
for (;;) {
- while (sqp->sq_first == NULL || (sqp->sq_state & SQS_PROC)) {
- CALLB_CPR_SAFE_BEGIN(&cprinfo);
-still_wait:
- cv_wait(async, lock);
- if (sqp->sq_state & SQS_PROC) {
- goto still_wait;
- }
- CALLB_CPR_SAFE_END(&cprinfo, lock);
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(async, lock);
+ CALLB_CPR_SAFE_END(&cprinfo, lock);
+
+ ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL |
+ SQS_POLL_THR_QUIESCED);
+ if (ctl_state != 0) {
+ /*
+ * If the squeue is quiesced, then wait for a control
+ * request. A quiesced squeue must not poll the
+ * underlying soft ring.
+ */
+ if (ctl_state == SQS_POLL_THR_QUIESCED)
+ continue;
+ /*
+			 * Act on control requests to quiesce, clean up, or
+			 * restart an squeue.
+ */
+ squeue_poll_thr_control(sqp);
+ continue;
}
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- start = gethrtime();
+ if (!(sqp->sq_state & SQS_POLL_CAPAB))
+ continue;
+
+ ASSERT((sqp->sq_state &
+ (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
+ (SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
+
+poll_again:
+ sq_rx_ring = sqp->sq_rx_ring;
+ sq_get_pkts = sq_rx_ring->rr_rx;
+ sq_mac_handle = sq_rx_ring->rr_rx_handle;
+ ip_accept = sq_rx_ring->rr_ip_accept;
+ sq_ill = sq_rx_ring->rr_ill;
+ bytes_to_pickup = MAX_BYTES_TO_PICKUP;
+ mutex_exit(lock);
+ head = sq_get_pkts(sq_mac_handle, bytes_to_pickup);
+ mp = NULL;
+ if (head != NULL) {
+ /*
+ * We got the packet chain from the mac layer. It
+ * would be nice to be able to process it inline
+ * for better performance but we need to give
+ * IP a chance to look at this chain to ensure
+ * that packets are really meant for this squeue
+ * and do the IP processing.
+ */
+ mp = ip_accept(sq_ill, sq_rx_ring, sqp, head,
+ &tail, &cnt);
}
-#endif
+ mutex_enter(lock);
+ if (mp != NULL)
+ ENQUEUE_CHAIN(sqp, mp, tail, cnt);
- ASSERT(squeue_workerdrain_ns != 0);
- now = gethrtime();
- sqp->sq_run = curthread;
- squeue_drain(sqp, SQS_WORKER, now + squeue_workerdrain_ns);
- sqp->sq_run = NULL;
+ ASSERT((sqp->sq_state &
+ (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
+ (SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
- if (sqp->sq_first != NULL) {
+ if (sqp->sq_first != NULL && !(sqp->sq_state & SQS_WORKER)) {
/*
- * Doing too much processing by worker thread
- * in presense of interrupts can be sub optimal.
- * Instead, once a drain is done by worker thread
- * for squeue_writerdrain_ns (the reason we are
- * here), we force wait for squeue_workerwait_tick
- * before doing more processing even if sq_wait is
- * set to 0.
- *
- * This can be counterproductive for performance
- * if worker thread is the only means to process
- * the packets (interrupts or writers are not
- * allowed inside the squeue).
+			 * We have packets to process and the worker thread
+			 * is not running. Check to see if the poll thread is
+			 * allowed to process. Let it do the processing only
+			 * if it picked up some packets from the NIC;
+			 * otherwise wake up the worker thread.
*/
- if (sqp->sq_tid == 0 &&
- !(sqp->sq_state & SQS_TMO_PROG)) {
- timeout_id_t tid;
+ if (mp != NULL) {
+ hrtime_t now;
+
+ now = gethrtime();
+ sqp->sq_run = curthread;
+ sqp->sq_drain(sqp, SQS_POLL_PROC, now +
+ squeue_drain_ns);
+ sqp->sq_run = NULL;
+
+ if (sqp->sq_first == NULL)
+ goto poll_again;
- sqp->sq_state |= SQS_TMO_PROG;
- mutex_exit(&sqp->sq_lock);
- tid = timeout(squeue_fire, sqp,
- squeue_workerwait_tick);
- mutex_enter(&sqp->sq_lock);
/*
- * Check again if we still need
- * the timeout
+				 * Couldn't do the entire drain because the
+				 * time limit expired; let the worker thread
+				 * take over.
*/
- if (((sqp->sq_state & (SQS_TMO_PROG|SQS_PROC))
- == SQS_TMO_PROG) && (sqp->sq_tid == 0) &&
- (sqp->sq_first != NULL)) {
- sqp->sq_state &= ~SQS_TMO_PROG;
- sqp->sq_awaken = lbolt;
- sqp->sq_tid = tid;
- } else if (sqp->sq_state & SQS_TMO_PROG) {
- /* timeout not needed */
- sqp->sq_state &= ~SQS_TMO_PROG;
- mutex_exit(&(sqp)->sq_lock);
- (void) untimeout(tid);
- mutex_enter(&sqp->sq_lock);
- }
}
- CALLB_CPR_SAFE_BEGIN(&cprinfo);
- cv_wait(async, lock);
- CALLB_CPR_SAFE_END(&cprinfo, lock);
- }
-
-#if SQUEUE_PROFILE
- if (SQ_PROFILING(sqp)) {
- SQDELTA(sqp, sq_time_worker, gethrtime() - start);
+ sqp->sq_awaken = lbolt;
+ /*
+			 * Put the SQS_PROC_HELD on so the worker
+			 * thread can distinguish where it's called from. We
+			 * could remove the SQS_PROC flag here and turn off
+			 * the polling so that it wouldn't matter who gets
+			 * the processing, but we get better performance this
+			 * way and save the cost of turning polling off and
+			 * possibly back on as soon as we start draining again.
+ *
+ * We can't remove the SQS_PROC flag without turning
+ * polling off until we can guarantee that control
+ * will return to squeue_drain immediately.
+ */
+ sqp->sq_state |= SQS_PROC_HELD;
+ sqp->sq_state &= ~SQS_GET_PKTS;
+ cv_signal(&sqp->sq_worker_cv);
+ } else if (sqp->sq_first == NULL &&
+ !(sqp->sq_state & SQS_WORKER)) {
+ /*
+			 * Nothing is queued and the worker thread is not
+			 * running. Since we hold the proc, no other thread
+			 * is processing the squeue. This means that there
+			 * is no work to be done and nothing is queued in
+			 * the squeue or in the NIC. Turn polling off and go
+ * back to interrupt mode.
+ */
+ sqp->sq_state &= ~(SQS_PROC|SQS_GET_PKTS);
+ /* LINTED: constant in conditional context */
+ SQS_POLLING_OFF(sqp, B_TRUE, sq_rx_ring);
+ } else {
+ /*
+ * Worker thread is already running. We don't need
+ * to do anything. Indicate that poll thread is done.
+ */
+ sqp->sq_state &= ~SQS_GET_PKTS;
+ }
+ if (sqp->sq_state & SQS_POLL_THR_CONTROL) {
+ /*
+			 * Act on control requests to quiesce, clean up, or
+			 * restart an squeue.
+ */
+ squeue_poll_thr_control(sqp);
}
-#endif
}
}
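
A poll pass in the thread above ends in one of three ways: hand SQS_PROC to
the worker via SQS_PROC_HELD, turn polling off and fall back to interrupt
mode, or simply drop SQS_GET_PKTS because the worker is already running.
A small model of that decision follows; the names are illustrative, not the
kernel's.

/*
 * Model of the three hand-off outcomes at the end of a poll pass.
 */
#include <stdio.h>

enum outcome { HAND_TO_WORKER, POLLING_OFF, WORKER_ALREADY_RUNNING };

static enum outcome
poll_pass_outcome(int backlog, int worker_running)
{
	if (backlog && !worker_running)
		return (HAND_TO_WORKER);	/* set PROC_HELD, wake worker */
	if (!backlog && !worker_running)
		return (POLLING_OFF);		/* back to interrupt mode */
	return (WORKER_ALREADY_RUNNING);	/* just drop GET_PKTS */
}

int
main(void)
{
	(void) printf("%d %d %d\n",
	    poll_pass_outcome(1, 0),
	    poll_pass_outcome(0, 0),
	    poll_pass_outcome(1, 1));
	return (0);
}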
-#if SQUEUE_PROFILE
-static int
-squeue_kstat_update(kstat_t *ksp, int rw)
+/*
+ * The squeue worker thread acts on any control requests to quiesce, clean up,
+ * or restart an ill_rx_ring_t by calling this function. The worker thread
+ * synchronizes with the squeue poll thread to complete the request and finally
+ * wakes up the requestor when the request is completed.
+ */
+static void
+squeue_worker_thr_control(squeue_t *sqp)
{
- struct squeue_kstat *sqsp = &squeue_kstat;
- squeue_t *sqp = ksp->ks_private;
+ ill_t *ill;
+ ill_rx_ring_t *rx_ring;
- if (rw == KSTAT_WRITE)
- return (EACCES);
+ ASSERT(MUTEX_HELD(&sqp->sq_lock));
-#if SQUEUE_DEBUG
- sqsp->sq_count.value.ui64 = sqp->sq_count;
- sqsp->sq_max_qlen.value.ui64 = sqp->sq_stats.sq_max_qlen;
-#endif
- sqsp->sq_npackets_worker.value.ui64 = sqp->sq_stats.sq_npackets_worker;
- sqsp->sq_npackets_intr.value.ui64 = sqp->sq_stats.sq_npackets_intr;
- sqsp->sq_npackets_other.value.ui64 = sqp->sq_stats.sq_npackets_other;
- sqsp->sq_nqueued_intr.value.ui64 = sqp->sq_stats.sq_nqueued_intr;
- sqsp->sq_nqueued_other.value.ui64 = sqp->sq_stats.sq_nqueued_other;
- sqsp->sq_ndrains_worker.value.ui64 = sqp->sq_stats.sq_ndrains_worker;
- sqsp->sq_ndrains_intr.value.ui64 = sqp->sq_stats.sq_ndrains_intr;
- sqsp->sq_ndrains_other.value.ui64 = sqp->sq_stats.sq_ndrains_other;
- sqsp->sq_time_worker.value.ui64 = sqp->sq_stats.sq_time_worker;
- sqsp->sq_time_intr.value.ui64 = sqp->sq_stats.sq_time_intr;
- sqsp->sq_time_other.value.ui64 = sqp->sq_stats.sq_time_other;
- return (0);
-}
-#endif
+ if (sqp->sq_state & SQS_POLL_RESTART) {
+ /* Restart implies a previous quiesce. */
+ ASSERT((sqp->sq_state & (SQS_PROC_HELD |
+ SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER)) ==
+ (SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER));
+ /*
+ * Request the squeue poll thread to restart and wait till
+ * it actually restarts.
+ */
+ sqp->sq_state &= ~SQS_POLL_QUIESCE_DONE;
+ sqp->sq_state |= SQS_POLL_THR_RESTART;
+ cv_signal(&sqp->sq_poll_cv);
+ while (sqp->sq_state & SQS_POLL_THR_QUIESCED)
+ cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock);
+ sqp->sq_state &= ~(SQS_POLL_RESTART | SQS_PROC |
+ SQS_WORKER);
+ /*
+ * Signal any waiter that is waiting for the restart
+ * to complete
+ */
+ sqp->sq_state |= SQS_POLL_RESTART_DONE;
+ cv_signal(&sqp->sq_ctrlop_done_cv);
+ return;
+ }
-void
-squeue_profile_enable(squeue_t *sqp)
-{
- mutex_enter(&sqp->sq_lock);
- sqp->sq_state |= SQS_PROFILE;
- mutex_exit(&sqp->sq_lock);
-}
+ if (sqp->sq_state & SQS_PROC_HELD) {
+ /* The squeue poll thread handed control to us */
+ ASSERT(sqp->sq_state & SQS_PROC);
+ }
-void
-squeue_profile_disable(squeue_t *sqp)
-{
- mutex_enter(&sqp->sq_lock);
- sqp->sq_state &= ~SQS_PROFILE;
+ /*
+ * Prevent any other thread from processing the squeue
+ * until we finish the control actions by setting SQS_PROC.
+	 * But allow ourselves to reenter by setting SQS_WORKER.
+ */
+ sqp->sq_state |= (SQS_PROC | SQS_WORKER);
+
+ /* Signal the squeue poll thread and wait for it to quiesce itself */
+ if (!(sqp->sq_state & SQS_POLL_THR_QUIESCED)) {
+ sqp->sq_state |= SQS_POLL_THR_QUIESCE;
+ cv_signal(&sqp->sq_poll_cv);
+ while (!(sqp->sq_state & SQS_POLL_THR_QUIESCED))
+ cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock);
+ }
+
+ rx_ring = sqp->sq_rx_ring;
+ ill = rx_ring->rr_ill;
+ /*
+ * The lock hierarchy is as follows.
+ * cpu_lock -> ill_lock -> sqset_lock -> sq_lock
+ */
mutex_exit(&sqp->sq_lock);
-}
+ mutex_enter(&ill->ill_lock);
+ mutex_enter(&sqp->sq_lock);
-void
-squeue_profile_reset(squeue_t *sqp)
-{
-#if SQUEUE_PROFILE
- bzero(&sqp->sq_stats, sizeof (sqstat_t));
-#endif
-}
+ SQS_POLLING_OFF(sqp, (sqp->sq_state & SQS_POLL_CAPAB) != 0,
+ sqp->sq_rx_ring);
+ sqp->sq_state &= ~(SQS_POLL_CAPAB | SQS_GET_PKTS | SQS_PROC_HELD);
+ if (sqp->sq_state & SQS_POLL_CLEANUP) {
+ /*
+ * Disassociate this squeue from its ill_rx_ring_t.
+ * The rr_sqp, sq_rx_ring fields are protected by the
+ * corresponding squeue, ill_lock* and sq_lock. Holding any
+ * of them will ensure that the ring to squeue mapping does
+ * not change.
+ */
+ ASSERT(!(sqp->sq_state & SQS_DEFAULT));
-void
-squeue_profile_start(void)
-{
-#if SQUEUE_PROFILE
- squeue_profile = B_TRUE;
-#endif
+ sqp->sq_rx_ring = NULL;
+ rx_ring->rr_sqp = NULL;
+
+ sqp->sq_state &= ~(SQS_POLL_CLEANUP | SQS_POLL_THR_QUIESCED |
+ SQS_POLL_QUIESCE_DONE);
+ sqp->sq_ill = NULL;
+
+ rx_ring->rr_rx_handle = NULL;
+ rx_ring->rr_intr_handle = NULL;
+ rx_ring->rr_intr_enable = NULL;
+ rx_ring->rr_intr_disable = NULL;
+ sqp->sq_state |= SQS_POLL_CLEANUP_DONE;
+ } else {
+ sqp->sq_state &= ~SQS_POLL_QUIESCE;
+ sqp->sq_state |= SQS_POLL_QUIESCE_DONE;
+ }
+ /*
+ * Signal any waiter that is waiting for the quiesce or cleanup
+ * to complete and also wait for it to actually see and reset the
+ * SQS_POLL_CLEANUP_DONE.
+ */
+ cv_signal(&sqp->sq_ctrlop_done_cv);
+ mutex_exit(&ill->ill_lock);
+ if (sqp->sq_state & SQS_POLL_CLEANUP_DONE) {
+ cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock);
+ sqp->sq_state &= ~(SQS_PROC | SQS_WORKER);
+ }
}
-void
-squeue_profile_stop(void)
+static void
+squeue_worker(squeue_t *sqp)
{
-#if SQUEUE_PROFILE
- squeue_profile = B_FALSE;
-#endif
+ kmutex_t *lock = &sqp->sq_lock;
+ kcondvar_t *async = &sqp->sq_worker_cv;
+ callb_cpr_t cprinfo;
+ hrtime_t now;
+
+ CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_worker");
+ mutex_enter(lock);
+
+ for (;;) {
+ for (;;) {
+ /*
+			 * If the poll thread has handed control to us,
+ * we need to break out of the wait.
+ */
+ if (sqp->sq_state & SQS_PROC_HELD)
+ break;
+
+ /*
+ * If the squeue is not being processed and we either
+ * have messages to drain or some thread has signaled
+			 * some control activity, we need to break.
+ */
+ if (!(sqp->sq_state & SQS_PROC) &&
+ ((sqp->sq_state & SQS_WORKER_THR_CONTROL) ||
+ (sqp->sq_first != NULL)))
+ break;
+
+ /*
+ * If we have started some control action, then check
+ * for the SQS_WORKER flag (since we don't
+ * release the squeue) to make sure we own the squeue
+			 * and break out.
+ */
+ if ((sqp->sq_state & SQS_WORKER_THR_CONTROL) &&
+ (sqp->sq_state & SQS_WORKER))
+ break;
+
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(async, lock);
+ CALLB_CPR_SAFE_END(&cprinfo, lock);
+ }
+ if (sqp->sq_state & SQS_WORKER_THR_CONTROL) {
+ squeue_worker_thr_control(sqp);
+ continue;
+ }
+ ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+ SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
+ SQS_WORKER_THR_CONTROL | SQS_POLL_THR_CONTROL)));
+
+ if (sqp->sq_state & SQS_PROC_HELD)
+ sqp->sq_state &= ~SQS_PROC_HELD;
+
+ now = gethrtime();
+ sqp->sq_run = curthread;
+ sqp->sq_drain(sqp, SQS_WORKER, now + squeue_drain_ns);
+ sqp->sq_run = NULL;
+ }
}
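
The nested wait loop in squeue_worker() above encodes three wakeup
conditions. A compact model of that predicate follows, with hypothetical
field names mirroring the sq_state bits; it is a sketch, not the kernel's
code.

/*
 * Model of the worker wakeup predicate; names are stand-ins.
 */
#include <stdio.h>

struct wstate {
	int proc_held;		/* poll thread handed us SQS_PROC */
	int proc;		/* squeue currently being processed */
	int ctl_pending;	/* SQS_WORKER_THR_CONTROL analogue */
	int have_work;		/* sq_first != NULL analogue */
	int we_own_it;		/* SQS_WORKER analogue */
};

static int
worker_should_run(const struct wstate *s)
{
	if (s->proc_held)
		return (1);
	if (!s->proc && (s->ctl_pending || s->have_work))
		return (1);
	if (s->ctl_pending && s->we_own_it)
		return (1);
	return (0);
}

int
main(void)
{
	struct wstate idle = { 0, 0, 0, 0, 0 };
	struct wstate work = { 0, 0, 0, 1, 0 };

	(void) printf("idle=%d work=%d\n",
	    worker_should_run(&idle), worker_should_run(&work));
	return (0);
}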
uintptr_t *
@@ -1482,9 +1240,3 @@ squeue_getprivate(squeue_t *sqp, sqprivate_t p)
return (&sqp->sq_private[p]);
}
-
-processorid_t
-squeue_binding(squeue_t *sqp)
-{
- return (sqp->sq_bind);
-}
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 3b8440b230..4bb50d2344 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -96,6 +96,7 @@
#include <inet/ip_if.h>
#include <inet/ipp_common.h>
#include <inet/ip_netinfo.h>
+#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <inet/kssl/ksslapi.h>
#include <sys/tsol/label.h>
@@ -124,8 +125,8 @@
* The tcp data structure does not use any kind of lock for protecting
* its state but instead uses 'squeues' for mutual exclusion from various
* read and write side threads. To access a tcp member, the thread should
- * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or
- * squeue_fill). Since the squeues allow a direct function call, caller
+ * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS,
+ * always be behind the squeue (via squeue_enter with the SQ_FILL, SQ_PROCESS,
+ * or SQ_NODRAIN flag). Since the squeues allow a direct function call, caller
* (different from traditional STREAMs model where packets come in only
* designated entry points). The list of functions that can be directly
@@ -251,15 +252,12 @@
/*
* Values for squeue switch:
- * 1: squeue_enter_nodrain
- * 2: squeue_enter
- * 3: squeue_fill
+ * 1: SQ_NODRAIN
+ * 2: SQ_PROCESS
+ * 3: SQ_FILL
*/
-int tcp_squeue_close = 2; /* Setable in /etc/system */
-int tcp_squeue_wput = 2;
-
-squeue_func_t tcp_squeue_close_proc;
-squeue_func_t tcp_squeue_wput_proc;
+int tcp_squeue_wput = 2;	/* Settable in /etc/system */
+int tcp_squeue_flag;
/*
* Macros for sodirect:
@@ -940,7 +938,7 @@ static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
tcph_t *tcph, mblk_t *idmp);
-static squeue_func_t tcp_squeue_switch(int);
+static int tcp_squeue_switch(int);
static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
@@ -1865,9 +1863,9 @@ tcp_time_wait_collector(void *arg)
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
- squeue_fill(connp->conn_sqp, mp,
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
tcp_timewait_output, connp,
- SQTAG_TCP_TIMEWAIT);
+ SQ_FILL, SQTAG_TCP_TIMEWAIT);
}
} else {
mutex_enter(&connp->conn_lock);
@@ -1893,8 +1891,9 @@ tcp_time_wait_collector(void *arg)
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
- squeue_fill(connp->conn_sqp, mp,
- tcp_timewait_output, connp, 0);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ tcp_timewait_output, connp,
+ SQ_FILL, SQTAG_TCP_TIMEWAIT);
}
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
}
@@ -2374,10 +2373,10 @@ finish:
* queue.
*/
/*
- * We already have a ref on tcp so no need to do one before squeue_fill
+ * We already have a ref on tcp so no need to do one before squeue_enter
*/
- squeue_fill(eager->tcp_connp->conn_sqp, opt_mp,
- tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH);
+ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish,
+ eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH);
}
/*
@@ -4048,8 +4047,8 @@ tcp_close(queue_t *q, int flags)
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
- (*tcp_squeue_close_proc)(connp->conn_sqp, mp,
- tcp_close_output, connp, SQTAG_IP_TCP_CLOSE);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
+ tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
mutex_enter(&tcp->tcp_closelock);
while (!tcp->tcp_closed) {
@@ -4074,9 +4073,9 @@ tcp_close(queue_t *q, int flags)
/* Entering squeue, bump ref count. */
CONN_INC_REF(connp);
bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
- squeue_enter(connp->conn_sqp, bp,
+ SQUEUE_ENTER_ONE(connp->conn_sqp, bp,
tcp_linger_interrupted, connp,
- SQTAG_IP_TCP_CLOSE);
+ tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
mutex_enter(&tcp->tcp_closelock);
}
break;
@@ -4625,6 +4624,11 @@ tcp_free(tcp_t *tcp)
tcp->tcp_ordrel_mp = NULL;
}
+
if (tcp->tcp_sack_info != NULL) {
if (tcp->tcp_notsack_list != NULL) {
TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
@@ -4825,8 +4829,9 @@ tcp_drop_q0(tcp_t *tcp)
/* Mark the IRE created for this SYN request temporary */
tcp_ip_ire_mark_advice(eager);
- squeue_fill(eager->tcp_connp->conn_sqp, mp,
- tcp_clean_death_wrapper, eager->tcp_connp, SQTAG_TCP_DROP_Q0);
+ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
+ tcp_clean_death_wrapper, eager->tcp_connp,
+ SQ_FILL, SQTAG_TCP_DROP_Q0);
return (B_TRUE);
}
@@ -5302,6 +5307,7 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp)
* The caller already ensured that there is a sqp present.
*/
econnp->conn_sqp = new_sqp;
+ econnp->conn_initial_sqp = new_sqp;
if (connp->conn_policy != NULL) {
ipsec_in_t *ii;
@@ -5681,6 +5687,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
goto error2;
ASSERT(econnp->conn_netstack == connp->conn_netstack);
econnp->conn_sqp = new_sqp;
+ econnp->conn_initial_sqp = new_sqp;
} else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) {
/*
* mp is updated in tcp_get_ipsec_conn().
@@ -6032,8 +6039,9 @@ error:
freemsg(mp1);
eager->tcp_closemp_used = B_TRUE;
TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
- squeue_fill(econnp->conn_sqp, &eager->tcp_closemp, tcp_eager_kill,
- econnp, SQTAG_TCP_CONN_REQ_2);
+ mp1 = &eager->tcp_closemp;
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill,
+ econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
/*
* If a connection already exists, send the mp to that connections so
@@ -6056,8 +6064,8 @@ error:
CONN_DEC_REF(econnp);
freemsg(mp);
} else {
- squeue_fill(econnp->conn_sqp, mp, tcp_input,
- econnp, SQTAG_TCP_CONN_REQ_1);
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, mp,
+ tcp_input, econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
}
} else {
/* Nobody wants this packet */
@@ -6149,8 +6157,8 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
done:
if (connp->conn_sqp != sqp) {
CONN_INC_REF(connp);
- squeue_fill(connp->conn_sqp, mp,
- connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
+ SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND);
} else {
tcp_conn_request(connp, mp, sqp);
}
@@ -7217,8 +7225,8 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum)
CONN_INC_REF(eager->tcp_connp);
mutex_exit(&listener->tcp_eager_lock);
mp = &eager->tcp_closemp;
- squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
- eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF);
+ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
+ eager->tcp_connp, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
return (B_TRUE);
}
@@ -7245,9 +7253,9 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
CONN_INC_REF(eager->tcp_connp);
mp = &eager->tcp_closemp;
- squeue_fill(eager->tcp_connp->conn_sqp, mp,
+ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
tcp_eager_kill, eager->tcp_connp,
- SQTAG_TCP_EAGER_CLEANUP);
+ SQ_FILL, SQTAG_TCP_EAGER_CLEANUP);
}
eager = eager->tcp_eager_next_q;
}
@@ -7261,8 +7269,8 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
CONN_INC_REF(eager->tcp_connp);
mp = &eager->tcp_closemp;
- squeue_fill(eager->tcp_connp->conn_sqp, mp,
- tcp_eager_kill, eager->tcp_connp,
+ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
+ tcp_eager_kill, eager->tcp_connp, SQ_FILL,
SQTAG_TCP_EAGER_CLEANUP_Q0);
}
eager = eager->tcp_eager_next_q0;
@@ -9785,6 +9793,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
return (ENOSR);
}
connp->conn_sqp = IP_SQUEUE_GET(lbolt);
+ connp->conn_initial_sqp = connp->conn_sqp;
tcp = connp->conn_tcp;
q->q_ptr = WR(q)->q_ptr = connp;
@@ -12059,13 +12068,13 @@ enq:
* on the conn structure associated so the tcp is guaranteed to exist
* when we come here. We still need to check the state because it might
* as well has been closed. The squeue processing function i.e. squeue_enter,
- * squeue_enter_nodrain, or squeue_drain is responsible for doing the
- * CONN_DEC_REF.
+ * is responsible for doing the CONN_DEC_REF.
*
* Apart from the default entry point, IP also sends packets directly to
* tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming
* connections.
*/
+boolean_t tcp_outbound_squeue_switch = B_FALSE;
void
tcp_input(void *arg, mblk_t *mp, void *arg2)
{
@@ -12102,10 +12111,33 @@ tcp_input(void *arg, mblk_t *mp, void *arg2)
return;
}
- if (DB_TYPE(mp) == M_DATA)
- tcp_rput_data(connp, mp, arg2);
- else
+ if (DB_TYPE(mp) != M_DATA) {
tcp_rput_common(tcp, mp);
+ return;
+ }
+
+ if (mp->b_datap->db_struioflag & STRUIO_CONNECT) {
+ squeue_t *final_sqp;
+
+ mp->b_datap->db_struioflag &= ~STRUIO_CONNECT;
+ final_sqp = (squeue_t *)DB_CKSUMSTART(mp);
+ DB_CKSUMSTART(mp) = 0;
+ if (tcp->tcp_state == TCPS_SYN_SENT &&
+ connp->conn_final_sqp == NULL &&
+ tcp_outbound_squeue_switch) {
+ ASSERT(connp->conn_initial_sqp == connp->conn_sqp);
+ connp->conn_final_sqp = final_sqp;
+ if (connp->conn_final_sqp != connp->conn_sqp) {
+ CONN_INC_REF(connp);
+ SQUEUE_SWITCH(connp, connp->conn_final_sqp);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ tcp_rput_data, connp, ip_squeue_flag,
+ SQTAG_CONNECT_FINISH);
+ return;
+ }
+ }
+ }
+ tcp_rput_data(connp, mp, arg2);
}
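
The new tcp_input() path above migrates a connection to its final squeue
only under narrow conditions. Below is a hypothetical decision helper, not
the kernel's code, spelling those conditions out: the connection is in
SYN_SENT, no final squeue has been committed, the tunable is on, and the
destination squeue actually differs.

/*
 * Illustrative helper for the outbound squeue-switch decision.
 */
#include <stdio.h>

static int
should_switch_squeue(int state_syn_sent, int final_sqp_set,
    int tunable_on, int same_sqp)
{
	return (state_syn_sent && !final_sqp_set && tunable_on && !same_sqp);
}

int
main(void)
{
	(void) printf("switch? %d\n", should_switch_squeue(1, 0, 1, 0));
	(void) printf("switch? %d\n", should_switch_squeue(1, 0, 1, 1));
	return (0);
}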
/*
@@ -14316,16 +14348,27 @@ process_ack:
CONN_INC_REF(listener->tcp_connp);
if (listener->tcp_connp->conn_sqp ==
connp->conn_sqp) {
+ /*
+ * We optimize by not calling an SQUEUE_ENTER
+ * on the listener since we know that the
+ * listener and eager squeues are the same.
+ * We are able to make this check safely only
+ * because neither the eager nor the listener
+ * can change its squeue. Only an active connect
+				 * can change its squeue.
+ */
tcp_send_conn_ind(listener->tcp_connp, mp,
listener->tcp_connp->conn_sqp);
CONN_DEC_REF(listener->tcp_connp);
} else if (!tcp->tcp_loopback) {
- squeue_fill(listener->tcp_connp->conn_sqp, mp,
- tcp_send_conn_ind,
- listener->tcp_connp, SQTAG_TCP_CONN_IND);
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
+ mp, tcp_send_conn_ind,
+ listener->tcp_connp, SQ_FILL,
+ SQTAG_TCP_CONN_IND);
} else {
- squeue_enter(listener->tcp_connp->conn_sqp, mp,
- tcp_send_conn_ind, listener->tcp_connp,
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
+ mp, tcp_send_conn_ind,
+ listener->tcp_connp, SQ_PROCESS,
SQTAG_TCP_CONN_IND);
}
}
@@ -15884,7 +15927,6 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
return (mp);
}
-
/*
* Handle a *T_BIND_REQ that has failed either due to a T_ERROR_ACK
* or a "bad" IRE detected by tcp_adapt_ire.
@@ -16402,8 +16444,8 @@ tcp_rsrv(queue_t *q)
mutex_exit(&tcp->tcp_rsrv_mp_lock);
CONN_INC_REF(connp);
- squeue_enter(connp->conn_sqp, mp, tcp_rsrv_input, connp,
- SQTAG_TCP_RSRV);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp,
+ SQ_PROCESS, SQTAG_TCP_RSRV);
}
/*
@@ -18768,9 +18810,9 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
/* Need to get inside the listener perimeter */
CONN_INC_REF(listener->tcp_connp);
- squeue_fill(listener->tcp_connp->conn_sqp, mp1,
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
tcp_send_pending, listener->tcp_connp,
- SQTAG_TCP_SEND_PENDING);
+ SQ_FILL, SQTAG_TCP_SEND_PENDING);
}
no_more_eagers:
tcp_eager_unlink(eager);
@@ -18781,10 +18823,13 @@ no_more_eagers:
* but we still have an extra refs on eager (apart from the
* usual tcp references). The ref was placed in tcp_rput_data
* before sending the conn_ind in tcp_send_conn_ind.
- * The ref will be dropped in tcp_accept_finish().
+ * The ref will be dropped in tcp_accept_finish(). As sockfs
+	 * has already established this tcp with its own stream,
+ * it's OK to set tcp_detached to B_FALSE.
*/
- squeue_enter_nodrain(econnp->conn_sqp, opt_mp,
- tcp_accept_finish, econnp, SQTAG_TCP_ACCEPT_FINISH_Q0);
+ econnp->conn_tcp->tcp_detached = B_FALSE;
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish,
+ econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
return;
default:
mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
@@ -18916,7 +18961,6 @@ tcp_wput(queue_t *q, mblk_t *mp)
t_scalar_t type;
uchar_t *rptr;
struct iocblk *iocp;
- uint32_t msize;
tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
ASSERT(connp->conn_ref >= 2);
@@ -18926,18 +18970,16 @@ tcp_wput(queue_t *q, mblk_t *mp)
tcp = connp->conn_tcp;
ASSERT(tcp != NULL);
- msize = msgdsize(mp);
-
mutex_enter(&tcp->tcp_non_sq_lock);
- tcp->tcp_squeue_bytes += msize;
+ tcp->tcp_squeue_bytes += msgdsize(mp);
if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
tcp_setqfull(tcp);
}
mutex_exit(&tcp->tcp_non_sq_lock);
CONN_INC_REF(connp);
- (*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
- tcp_output, connp, SQTAG_TCP_OUTPUT);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
+ tcp_squeue_flag, SQTAG_TCP_OUTPUT);
return;
case M_CMD:
@@ -19030,8 +19072,8 @@ tcp_wput(queue_t *q, mblk_t *mp)
}
CONN_INC_REF(connp);
- (*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
- output_proc, connp, SQTAG_TCP_WPUT_OTHER);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
+ tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
}
/*
@@ -19503,34 +19545,27 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
ntohs(ipha->ipha_length));
- if (ILL_DLS_CAPABLE(ill)) {
- /*
- * Send the packet directly to DLD, where it may be queued
- * depending on the availability of transmit resources at
- * the media layer.
- */
- IP_DLS_ILL_TX(ill, ipha, mp, ipst, ire_fp_mp_len);
- } else {
- ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr;
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, out_ill,
- ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, out_ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
+ DTRACE_PROBE4(ip4__physical__out__start,
+ ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
+ FW_HOOKS(ipst->ips_ip4_physical_out_event,
+ ipst->ips_ipv4firewall_physical_out,
+ NULL, ill, ipha, mp, mp, 0, ipst);
+ DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
+ DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL);
- if (mp != NULL) {
- if (ipst->ips_ipobs_enabled) {
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND,
- IP_REAL_ZONEID(connp->conn_zoneid, ipst),
- ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len,
- ipst);
- }
- DTRACE_IP_FASTPATH(mp, ipha, out_ill, ipha, NULL);
- putnext(ire->ire_stq, mp);
+ if (mp != NULL) {
+ if (ipst->ips_ipobs_enabled) {
+ zoneid_t szone;
+
+ szone = ip_get_zoneid_v4(ipha->ipha_src, mp,
+ ipst, ALL_ZONES);
+ ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
+ ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst);
}
+
+ ILL_SEND_TX(ill, ire, connp, mp, 0);
}
+
IRE_REFRELE(ire);
}
@@ -21327,12 +21362,7 @@ tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head,
}
/* send it down */
- if (ILL_DLS_CAPABLE(ill)) {
- ill_dls_capab_t *ill_dls = ill->ill_dls_capab;
- ill_dls->ill_tx(ill_dls->ill_tx_handle, md_mp_head);
- } else {
- putnext(ire->ire_stq, md_mp_head);
- }
+ putnext(ire->ire_stq, md_mp_head);
/* we're done for TCP/IPv4 */
if (tcp->tcp_ipversion == IPV4_VERSION)
@@ -21478,10 +21508,12 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss,
IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
/*
- * Append LSO flag to DB_LSOFLAGS(mp) and set the mss to DB_LSOMSS(mp).
+ * Append LSO flags and mss to the mp.
*/
- DB_LSOFLAGS(mp) |= HW_LSO;
- DB_LSOMSS(mp) = mss;
+ lso_info_set(mp, mss, HW_LSO);
+
+ ipha->ipha_fragment_offset_and_flags |=
+ (uint32_t)htons(ire->ire_frag_flag);
ire_fp_mp = ire->ire_nce->nce_fp_mp;
ire_fp_mp_len = MBLKL(ire_fp_mp);
@@ -21496,34 +21528,25 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss,
UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
ntohs(ipha->ipha_length));
- if (ILL_DLS_CAPABLE(ill)) {
- /*
- * Send the packet directly to DLD, where it may be queued
- * depending on the availability of transmit resources at
- * the media layer.
- */
- IP_DLS_ILL_TX(ill, ipha, mp, ipst, ire_fp_mp_len);
- } else {
- ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr;
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, out_ill,
- ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, out_ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
+ DTRACE_PROBE4(ip4__physical__out__start,
+ ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
+ FW_HOOKS(ipst->ips_ip4_physical_out_event,
+ ipst->ips_ipv4firewall_physical_out, NULL,
+ ill, ipha, mp, mp, 0, ipst);
+ DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
+ DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL);
- if (mp != NULL) {
- if (ipst->ips_ipobs_enabled) {
- zoneid_t szone = tcp->tcp_connp->conn_zoneid;
+ if (mp != NULL) {
+ if (ipst->ips_ipobs_enabled) {
+ zoneid_t szone;
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
- ALL_ZONES, ill, tcp->tcp_ipversion,
- ire_fp_mp_len, ipst);
- }
- DTRACE_IP_FASTPATH(mp, ipha, out_ill, ipha, NULL);
- putnext(ire->ire_stq, mp);
+ szone = ip_get_zoneid_v4(ipha->ipha_src, mp,
+ ipst, ALL_ZONES);
+ ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
+ ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst);
}
+
+ ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0);
}
}
@@ -24921,9 +24944,6 @@ tcp_ddi_g_init(void)
/* Initialize the random number generator */
tcp_random_init();
- tcp_squeue_wput_proc = tcp_squeue_switch(tcp_squeue_wput);
- tcp_squeue_close_proc = tcp_squeue_switch(tcp_squeue_close);
-
/* A single callback independently of how many netstacks we have */
ip_squeue_init(tcp_squeue_add);
@@ -24932,6 +24952,8 @@ tcp_ddi_g_init(void)
tcp_taskq = taskq_create("tcp_taskq", 1, minclsyspri, 1, 1,
TASKQ_PREPOPULATE);
+ tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput);
+
/*
* We want to be informed each time a stack is created or
* destroyed in the kernel, so we can maintain the
@@ -25420,7 +25442,7 @@ tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp)
* If we get here, we are already on the correct
* squeue. This ioctl follows the following path
* tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn
- * ->tcp_ioctl_abort->squeue_fill (if on a
+ * ->tcp_ioctl_abort->squeue_enter (if on a
* different squeue)
*/
int errcode;
@@ -25487,8 +25509,8 @@ startover:
listhead = listhead->b_next;
tcp = (tcp_t *)mp->b_prev;
mp->b_next = mp->b_prev = NULL;
- squeue_fill(tcp->tcp_connp->conn_sqp, mp,
- tcp_input, tcp->tcp_connp, SQTAG_TCP_ABORT_BUCKET);
+ SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, tcp_input,
+ tcp->tcp_connp, SQ_FILL, SQTAG_TCP_ABORT_BUCKET);
}
*count += nmatch;
@@ -25989,8 +26011,8 @@ tcp_timer_callback(void *arg)
tcpt = (tcp_timer_t *)mp->b_rptr;
connp = tcpt->connp;
- squeue_fill(connp->conn_sqp, mp,
- tcp_timer_handler, connp, SQTAG_TCP_TIMER);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
+ SQ_FILL, SQTAG_TCP_TIMER);
}
static void
@@ -26486,6 +26508,7 @@ tcp_kstat_update(kstat_t *kp, int rw)
netstack_rele(ns);
return (-1);
}
+
tcpkp = (tcp_named_kstat_t *)kp->ks_data;
tcpkp->currEstab.value.ui32 = 0;
@@ -26583,8 +26606,8 @@ tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp)
/* Already has an eager */
if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
TCP_STAT(tcps, tcp_reinput_syn);
- squeue_enter(connp->conn_sqp, mp, connp->conn_recv,
- connp, SQTAG_TCP_REINPUT_EAGER);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
+ SQ_PROCESS, SQTAG_TCP_REINPUT_EAGER);
return;
}
@@ -26609,21 +26632,21 @@ tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp)
DB_CKSUMSTART(mp) = (intptr_t)sqp;
}
- squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp,
- SQTAG_TCP_REINPUT);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
+ SQ_FILL, SQTAG_TCP_REINPUT);
}
-static squeue_func_t
+static int
tcp_squeue_switch(int val)
{
- squeue_func_t rval = squeue_fill;
+ int rval = SQ_FILL;
switch (val) {
case 1:
- rval = squeue_enter_nodrain;
+ rval = SQ_NODRAIN;
break;
case 2:
- rval = squeue_enter;
+ rval = SQ_PROCESS;
break;
default:
break;
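
tcp_squeue_switch() now maps the /etc/system tunable to an SQ_* entry flag
instead of a function pointer. A quick standalone model of that mapping
follows; the SQ_* values below are stand-ins, not the real definitions from
squeue_impl.h.

/*
 * Model of the tunable-to-flag mapping; values are illustrative.
 */
#include <stdio.h>

enum { MODEL_SQ_NODRAIN = 1, MODEL_SQ_PROCESS = 2, MODEL_SQ_FILL = 3 };

static int
squeue_switch_model(int val)
{
	switch (val) {
	case 1:
		return (MODEL_SQ_NODRAIN);
	case 2:
		return (MODEL_SQ_PROCESS);
	default:
		return (MODEL_SQ_FILL);	/* 3 or anything else */
	}
}

int
main(void)
{
	for (int v = 0; v <= 3; v++)
		(void) printf("tcp_squeue_wput=%d -> %d\n",
		    v, squeue_switch_model(v));
	return (0);
}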
diff --git a/usr/src/uts/common/inet/tcp/tcp_kssl.c b/usr/src/uts/common/inet/tcp/tcp_kssl.c
index 0913da33f8..8eb8cddff3 100644
--- a/usr/src/uts/common/inet/tcp/tcp_kssl.c
+++ b/usr/src/uts/common/inet/tcp/tcp_kssl.c
@@ -53,6 +53,7 @@
#include <inet/ipdrop.h>
#include <inet/tcp_impl.h>
+#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <inet/kssl/ksslapi.h>
@@ -70,7 +71,7 @@ static void tcp_kssl_input_asynch(void *, mblk_t *, void *);
extern void tcp_output(void *, mblk_t *, void *);
extern void tcp_send_conn_ind(void *, mblk_t *, void *);
-extern squeue_func_t tcp_squeue_wput_proc;
+extern int tcp_squeue_flag;
/*
* tcp_rput_data() calls this routine for all packet destined to a
@@ -205,10 +206,10 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp)
listener->tcp_connp->conn_sqp);
CONN_DEC_REF(listener->tcp_connp);
} else {
- squeue_fill(
+ SQUEUE_ENTER_ONE(
listener->tcp_connp->conn_sqp,
ind_mp, tcp_send_conn_ind,
- listener->tcp_connp,
+ listener->tcp_connp, SQ_FILL,
SQTAG_TCP_CONN_IND);
}
}
@@ -294,11 +295,11 @@ no_can_do:
listener->tcp_connp->conn_sqp);
CONN_DEC_REF(listener->tcp_connp);
} else {
- squeue_fill(
+ SQUEUE_ENTER_ONE(
listener->tcp_connp->conn_sqp,
ind_mp, tcp_send_conn_ind,
listener->tcp_connp,
- SQTAG_TCP_CONN_IND);
+ SQ_FILL, SQTAG_TCP_CONN_IND);
}
}
if (mp != NULL)
@@ -343,8 +344,8 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd)
mutex_exit(&tcp->tcp_non_sq_lock);
}
CONN_INC_REF(connp);
- (*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
- tcp_output, connp, SQTAG_TCP_OUTPUT);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
+ tcp_squeue_flag, SQTAG_TCP_OUTPUT);
/* FALLTHROUGH */
case KSSL_CMD_NONE:
@@ -375,8 +376,8 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd)
*/
if ((sqmp = allocb(1, BPRI_MED)) != NULL) {
CONN_INC_REF(connp);
- squeue_fill(connp->conn_sqp, sqmp, tcp_kssl_input_asynch,
- connp, SQTAG_TCP_KSSL_INPUT);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, sqmp, tcp_kssl_input_asynch,
+ connp, SQ_FILL, SQTAG_TCP_KSSL_INPUT);
} else {
DTRACE_PROBE(kssl_err__allocb_failed);
}
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 3369ca915e..70677c86d8 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -78,6 +78,7 @@
#include <inet/ipclassifier.h>
#include <inet/ipsec_impl.h>
#include <inet/ipp_common.h>
+#include <sys/squeue_impl.h>
#include <inet/ipnet.h>
/*
@@ -196,14 +197,15 @@ static int udp_rinfop(queue_t *q, infod_t *dp);
static int udp_rrw(queue_t *q, struiod_t *dp);
static int udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
-static void udp_send_data(udp_t *, queue_t *, mblk_t *, ipha_t *);
+static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp,
+ ipha_t *ipha);
static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr,
t_scalar_t destlen, t_scalar_t err);
static void udp_unbind(queue_t *q, mblk_t *mp);
static in_port_t udp_update_next_port(udp_t *udp, in_port_t port,
boolean_t random);
static mblk_t *udp_output_v4(conn_t *, mblk_t *, ipaddr_t, uint16_t, uint_t,
- int *, boolean_t);
+ int *, boolean_t);
static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6,
int *error);
static void udp_wput_other(queue_t *q, mblk_t *mp);
@@ -4401,6 +4403,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
UDP_STAT(us, udp_in_recvucred);
}
+ /* XXX FIXME: apply to AF_INET6 as well */
/*
* If SO_TIMESTAMP is set allocate the appropriate sized
* buffer. Since gethrestime() expects a pointer aligned
@@ -6237,8 +6240,12 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
dev_q = ire->ire_stq->q_next;
ASSERT(dev_q != NULL);
+ ill = ire_to_ill(ire);
+ ASSERT(ill != NULL);
- if (DEV_Q_IS_FLOW_CTLED(dev_q)) {
+ /* is queue flow controlled? */
+ if (q->q_first != NULL || connp->conn_draining ||
+ DEV_Q_FLOW_BLOCKED(dev_q)) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
if (ipst->ips_ip_output_queue)
@@ -6256,8 +6263,6 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
dst = ipha->ipha_dst;
src = ipha->ipha_src;
- ill = ire_to_ill(ire);
- ASSERT(ill != NULL);
BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
@@ -6334,31 +6339,32 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
ntohs(ipha->ipha_length));
- if (ILL_DLS_CAPABLE(ill)) {
- /*
- * Send the packet directly to DLD, where it may be queued
- * depending on the availability of transmit resources at
- * the media layer.
- */
- IP_DLS_ILL_TX(ill, ipha, mp, ipst, ire_fp_mp_len);
- } else {
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, ill,
- ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, ill, ipha, mp, mp, ll_multicast, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
- if (mp != NULL) {
- if (ipst->ips_ipobs_enabled) {
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND,
- IP_REAL_ZONEID(connp->conn_zoneid, ipst),
- ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len,
- ipst);
- }
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
- ipha_t *, ipha, ip6_t *, NULL, int, 0);
+ DTRACE_PROBE4(ip4__physical__out__start,
+ ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
+ FW_HOOKS(ipst->ips_ip4_physical_out_event,
+ ipst->ips_ipv4firewall_physical_out, NULL, ill, ipha, mp, mp,
+ ll_multicast, ipst);
+ DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
+ if (ipst->ips_ipobs_enabled && mp != NULL) {
+ zoneid_t szone;
+
+ szone = ip_get_zoneid_v4(ipha->ipha_src, mp,
+ ipst, ALL_ZONES);
+ ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
+ ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst);
+ }
+
+ if (mp != NULL) {
+ DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
+ void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
+ ipha_t *, ipha, ip6_t *, NULL, int, 0);
+
+ if (ILL_DIRECT_CAPABLE(ill)) {
+ ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct;
+
+ (void) idd->idd_tx_df(idd->idd_tx_dh, mp,
+ (uintptr_t)connp, 0);
+ } else {
putnext(ire->ire_stq, mp);
}
}
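
The rewritten udp_xmit() tail above replaces the ILL_DLS_CAPABLE/IP_DLS_ILL_TX soft-ring path with a direct-call check: if the ill advertises the DLD direct capability, transmit through idd_tx_df(); otherwise fall back to putnext(). Below is a small userland model of that capability-based dispatch; the types and names are invented analogues of ill_dld_direct_t, not the kernel interfaces:

#include <stdio.h>
#include <stddef.h>

/* Invented capability struct, loosely modeled on ill_dld_direct_t. */
typedef struct direct_cap {
	int	(*dc_tx_df)(void *, const char *);	/* direct-call entry */
	void	*dc_tx_dh;				/* driver handle */
} direct_cap_t;

typedef struct link {
	direct_cap_t *l_cap;	/* NULL when the driver is not capable */
} link_t;

static int
drv_tx(void *dh, const char *pkt)
{
	printf("direct tx via %s: %s\n", (const char *)dh, pkt);
	return (0);
}

/* Generic slow path, standing in for putnext(). */
static void
generic_tx(const char *pkt)
{
	printf("queued tx: %s\n", pkt);
}

static void
xmit(link_t *l, const char *pkt)
{
	if (l->l_cap != NULL)	/* ILL_DIRECT_CAPABLE() analogue */
		(void) l->l_cap->dc_tx_df(l->l_cap->dc_tx_dh, pkt);
	else
		generic_tx(pkt);
}

int
main(void)
{
	direct_cap_t cap = { drv_tx, "nic0" };
	link_t fast = { &cap }, slow = { NULL };

	xmit(&fast, "hello");
	xmit(&slow, "world");
	return (0);
}
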
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 04b8dbc22c..468fa553f4 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -26,8 +26,6 @@
#ifndef _UDP_IMPL_H
#define _UDP_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* UDP implementation private declarations. These interfaces are
* used to build the IP module and are not meant to be accessed
@@ -159,7 +157,7 @@ typedef struct udp_fanout_s {
* below IP and if the q_first is NULL, we optimize by not doing
* the canput check
*/
-#define DEV_Q_IS_FLOW_CTLED(dev_q) \
+#define DEV_Q_FLOW_BLOCKED(dev_q) \
(((dev_q)->q_next != NULL || (dev_q)->q_first != NULL) && \
!canput(dev_q))
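
The renamed DEV_Q_FLOW_BLOCKED macro is combined in udp_xmit() with two new conditions (q->q_first != NULL and conn_draining) so a send takes the queuing path whenever earlier packets are still pending. A self-contained sketch of that three-way predicate, using invented structures in place of queue_t and conn_t:

#include <stdbool.h>
#include <stdio.h>

/* Invented state, standing in for the queue_t/conn_t fields tested. */
typedef struct mqueue {
	int	q_len;		/* packets already buffered locally */
	bool	dev_full;	/* device queue cannot accept more */
} mqueue_t;

typedef struct mconn {
	bool	draining;	/* conn is already on the drain list */
} mconn_t;

/*
 * A send must take the queuing path when earlier packets are pending,
 * when the conn is draining, or when the device queue is blocked;
 * sending inline in any of those cases would reorder or drop packets.
 */
static bool
tx_flow_blocked(const mqueue_t *q, const mconn_t *c)
{
	return (q->q_len > 0 || c->draining || q->dev_full);
}

int
main(void)
{
	mqueue_t q = { 0, false };
	mconn_t c = { false };

	printf("blocked: %d\n", tx_flow_blocked(&q, &c));	/* 0 */
	q.dev_full = true;
	printf("blocked: %d\n", tx_flow_blocked(&q, &c));	/* 1 */
	return (0);
}
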
@@ -371,9 +369,7 @@ extern void udp_quiesce_conn(conn_t *);
extern void udp_ddi_init(void);
extern void udp_ddi_destroy(void);
extern void udp_resume_bind(conn_t *, mblk_t *);
-extern void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr,
- socklen_t addrlen);
-extern void udp_wput(queue_t *, mblk_t *);
+extern void udp_wput(queue_t *, mblk_t *);
extern int udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
uchar_t *ptr);
diff --git a/usr/src/uts/common/io/afe/afe.c b/usr/src/uts/common/io/afe/afe.c
index a89926f58f..9f32d0d3f8 100644
--- a/usr/src/uts/common/io/afe/afe.c
+++ b/usr/src/uts/common/io/afe/afe.c
@@ -184,7 +184,6 @@ static mac_callbacks_t afe_m_callbacks = {
afe_m_multicst,
afe_m_unicst,
afe_m_tx,
- NULL, /* mc_resources */
NULL, /* mc_ioctl */
NULL, /* mc_getcapab */
NULL, /* mc_open */
diff --git a/usr/src/uts/common/io/afe/afeimpl.h b/usr/src/uts/common/io/afe/afeimpl.h
index 0dccbe1acd..2b2e0c237d 100644
--- a/usr/src/uts/common/io/afe/afeimpl.h
+++ b/usr/src/uts/common/io/afe/afeimpl.h
@@ -36,10 +36,10 @@
#ifndef _AFEIMPL_H
#define _AFEIMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef _KERNEL
+#include <sys/mac_provider.h>
+
/*
* Compile time tunables.
*/
diff --git a/usr/src/uts/common/io/aggr/aggr_ctl.c b/usr/src/uts/common/io/aggr/aggr_ctl.c
index 0cfb177ed6..ea167fda28 100644
--- a/usr/src/uts/common/io/aggr/aggr_ctl.c
+++ b/usr/src/uts/common/io/aggr/aggr_ctl.c
@@ -29,13 +29,14 @@
#include <sys/aggr.h>
#include <sys/aggr_impl.h>
+#include <sys/priv_names.h>
/*
* Process a LAIOC_MODIFY request.
*/
/* ARGSUSED */
static int
-aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred)
+aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
laioc_modify_t *modify_arg = karg;
uint32_t policy;
@@ -68,8 +69,8 @@ aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred)
lacp_timer = modify_arg->lu_lacp_timer;
}
- return (aggr_grp_modify(modify_arg->lu_linkid, NULL, modify_mask,
- policy, mac_fixed, mac_addr, lacp_mode, lacp_timer));
+ return (aggr_grp_modify(modify_arg->lu_linkid, modify_mask, policy,
+ mac_fixed, mac_addr, lacp_mode, lacp_timer));
}
/*
@@ -77,7 +78,7 @@ aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred)
*/
/* ARGSUSED */
static int
-aggr_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred)
+aggr_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
laioc_create_t *create_arg = karg;
uint16_t nports;
@@ -122,7 +123,7 @@ done:
/* ARGSUSED */
static int
-aggr_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred)
+aggr_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
laioc_delete_t *delete_arg = karg;
@@ -191,7 +192,7 @@ aggr_ioc_info_new_port(void *arg, datalink_id_t linkid, uchar_t *mac,
/*ARGSUSED*/
static int
-aggr_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred)
+aggr_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
laioc_info_t *info_argp = karg;
datalink_id_t linkid;
@@ -249,30 +250,31 @@ done:
/* ARGSUSED */
static int
-aggr_ioc_add(void *karg, intptr_t arg, int mode, cred_t *cred)
+aggr_ioc_add(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
return (aggr_ioc_add_remove(karg, arg, LAIOC_ADD, mode));
}
/* ARGSUSED */
static int
-aggr_ioc_remove(void *karg, intptr_t arg, int mode, cred_t *cred)
+aggr_ioc_remove(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
return (aggr_ioc_add_remove(karg, arg, LAIOC_REMOVE, mode));
}
static dld_ioc_info_t aggr_ioc_list[] = {
- {LAIOC_CREATE, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_create_t),
- aggr_ioc_create},
- {LAIOC_DELETE, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_delete_t),
- aggr_ioc_delete},
- {LAIOC_INFO, DLDCOPYINOUT, sizeof (laioc_info_t), aggr_ioc_info},
- {LAIOC_ADD, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_add_rem_t),
- aggr_ioc_add},
- {LAIOC_REMOVE, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_add_rem_t),
- aggr_ioc_remove},
- {LAIOC_MODIFY, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_modify_t),
- aggr_ioc_modify}
+ {LAIOC_CREATE, DLDCOPYIN, sizeof (laioc_create_t), aggr_ioc_create,
+ {PRIV_SYS_DL_CONFIG}},
+ {LAIOC_DELETE, DLDCOPYIN, sizeof (laioc_delete_t), aggr_ioc_delete,
+ {PRIV_SYS_DL_CONFIG}},
+ {LAIOC_INFO, DLDCOPYINOUT, sizeof (laioc_info_t), aggr_ioc_info,
+ {NULL}},
+ {LAIOC_ADD, DLDCOPYIN, sizeof (laioc_add_rem_t), aggr_ioc_add,
+ {PRIV_SYS_DL_CONFIG}},
+ {LAIOC_REMOVE, DLDCOPYIN, sizeof (laioc_add_rem_t), aggr_ioc_remove,
+ {PRIV_SYS_DL_CONFIG}},
+ {LAIOC_MODIFY, DLDCOPYIN, sizeof (laioc_modify_t), aggr_ioc_modify,
+ {PRIV_SYS_DL_CONFIG}}
};
int
diff --git a/usr/src/uts/common/io/aggr/aggr_dev.c b/usr/src/uts/common/io/aggr/aggr_dev.c
index fc2c396c2b..6640015af5 100644
--- a/usr/src/uts/common/io/aggr/aggr_dev.c
+++ b/usr/src/uts/common/io/aggr/aggr_dev.c
@@ -42,38 +42,8 @@ static int aggr_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int aggr_attach(dev_info_t *, ddi_attach_cmd_t);
static int aggr_detach(dev_info_t *, ddi_detach_cmd_t);
-static struct cb_ops aggr_cb_ops = {
- nulldev, /* open */
- nulldev, /* close */
- nulldev, /* strategy */
- nulldev, /* print */
- nodev, /* dump */
- nodev, /* read */
- nodev, /* write */
- nodev, /* ioctl */
- nodev, /* devmap */
- nodev, /* mmap */
- nodev, /* segmap */
- nochpoll, /* poll */
- ddi_prop_op, /* cb_prop_op */
- 0, /* streamtab */
- D_MP /* Driver compatibility flag */
-};
-
-static struct dev_ops aggr_dev_ops = {
- DEVO_REV, /* devo_rev */
- 0, /* refcnt */
- aggr_getinfo, /* get_dev_info */
- nulldev, /* identify */
- nulldev, /* probe */
- aggr_attach, /* attach */
- aggr_detach, /* detach */
- nodev, /* reset */
- &aggr_cb_ops, /* driver operations */
- NULL, /* bus operations */
- nodev, /* dev power */
- ddi_quiesce_not_supported, /* dev quiesce */
-};
+DDI_DEFINE_STREAM_OPS(aggr_dev_ops, nulldev, nulldev, aggr_attach, aggr_detach,
+ nodev, aggr_getinfo, D_MP, NULL, ddi_quiesce_not_supported);
static struct modldrv aggr_modldrv = {
&mod_driverops, /* Type of module. This one is a driver */
@@ -82,9 +52,7 @@ static struct modldrv aggr_modldrv = {
};
static struct modlinkage modlinkage = {
- MODREV_1,
- &aggr_modldrv,
- NULL
+ MODREV_1, &aggr_modldrv, NULL
};
int
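
DDI_DEFINE_STREAM_OPS collapses the hand-rolled cb_ops/dev_ops tables deleted above into a single macro invocation. The idea generalizes: one macro expands to a fully initialized ops structure, so each driver stops repeating nulldev/nodev boilerplate. An illustrative userland reduction — the macro and types here are invented, not DDI interfaces:

#include <stdio.h>

/* Invented ops table, standing in for struct dev_ops/cb_ops. */
typedef struct drv_ops {
	int		(*attach)(void);
	int		(*detach)(void);
	const char	*name;
} drv_ops_t;

/*
 * One macro that expands to a fully initialized ops structure, so
 * each driver no longer spells out every slot by hand.
 */
#define	DEFINE_DRV_OPS(var, nm, at, de) \
	static drv_ops_t var = { at, de, nm }

static int my_attach(void) { return (0); }
static int my_detach(void) { return (0); }

DEFINE_DRV_OPS(my_ops, "aggr-model", my_attach, my_detach);

int
main(void)
{
	printf("%s attach -> %d\n", my_ops.name, my_ops.attach());
	return (0);
}
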
diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c
index cee6d5e45f..fa90087320 100644
--- a/usr/src/uts/common/io/aggr/aggr_grp.c
+++ b/usr/src/uts/common/io/aggr/aggr_grp.c
@@ -39,6 +39,7 @@
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
+#include <sys/disp.h>
#include <sys/list.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
@@ -52,6 +53,7 @@
#include <sys/id_space.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
+#include <sys/mac_provider.h>
#include <sys/dls.h>
#include <sys/vlan.h>
#include <sys/aggr.h>
@@ -63,7 +65,6 @@ static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
-static void aggr_m_resources(void *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
@@ -76,8 +77,20 @@ static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
-static int aggr_grp_multicst(aggr_grp_t *grp, boolean_t add,
- const uint8_t *addrp);
+
+static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
+static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
+static int aggr_pseudo_disable_intr(mac_intr_handle_t);
+static int aggr_pseudo_enable_intr(mac_intr_handle_t);
+static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
+static void aggr_pseudo_stop_ring(mac_ring_driver_t);
+static int aggr_addmac(void *, const uint8_t *);
+static int aggr_remmac(void *, const uint8_t *);
+static mblk_t *aggr_rx_poll(void *, int);
+static void aggr_fill_ring(void *, mac_ring_type_t, const int,
+ const int, mac_ring_info_t *, mac_ring_handle_t);
+static void aggr_fill_group(void *, mac_ring_type_t, const int,
+ mac_group_info_t *, mac_group_handle_t);
static kmem_cache_t *aggr_grp_cache;
static mod_hash_t *aggr_grp_hash;
@@ -87,10 +100,11 @@ static id_space_t *key_ids;
#define GRP_HASHSZ 64
#define GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid)
+#define AGGR_PORT_NAME_DELIMIT '-'
static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
-#define AGGR_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB)
+#define AGGR_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB)
static mac_callbacks_t aggr_m_callbacks = {
AGGR_M_CALLBACK_FLAGS,
@@ -99,9 +113,8 @@ static mac_callbacks_t aggr_m_callbacks = {
aggr_m_stop,
aggr_m_promisc,
aggr_m_multicst,
- aggr_m_unicst,
+ NULL,
aggr_m_tx,
- aggr_m_resources,
aggr_m_ioctl,
aggr_m_capab_get
};
@@ -113,11 +126,12 @@ aggr_grp_constructor(void *buf, void *arg, int kmflag)
aggr_grp_t *grp = buf;
bzero(grp, sizeof (*grp));
- rw_init(&grp->lg_lock, NULL, RW_DRIVER, NULL);
- rw_init(&grp->aggr.gl_lock, NULL, RW_DRIVER, NULL);
-
+ mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
+ rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
+ mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
grp->lg_link_state = LINK_STATE_UNKNOWN;
-
return (0);
}
@@ -132,8 +146,11 @@ aggr_grp_destructor(void *buf, void *arg)
grp->lg_tx_ports_size * sizeof (aggr_port_t *));
}
- rw_destroy(&grp->aggr.gl_lock);
- rw_destroy(&grp->lg_lock);
+ mutex_destroy(&grp->lg_lacp_lock);
+ cv_destroy(&grp->lg_lacp_cv);
+ mutex_destroy(&grp->lg_port_lock);
+ cv_destroy(&grp->lg_port_cv);
+ rw_destroy(&grp->lg_tx_lock);
}
void
@@ -179,6 +196,51 @@ aggr_grp_count(void)
}
/*
+ * Since both the aggr_port_notify_cb() and aggr_port_timer_thread() functions
+ * require the mac perimeter, this function holds a reference on the aggr,
+ * and aggr won't call mac_unregister() until this reference drops to 0.
+ */
+void
+aggr_grp_port_hold(aggr_port_t *port)
+{
+ aggr_grp_t *grp = port->lp_grp;
+
+ AGGR_PORT_REFHOLD(port);
+ mutex_enter(&grp->lg_port_lock);
+ grp->lg_port_ref++;
+ mutex_exit(&grp->lg_port_lock);
+}
+
+/*
+ * Release the reference on the grp and inform aggr_grp_delete() that
+ * calling mac_unregister() is now safe.
+ */
+void
+aggr_grp_port_rele(aggr_port_t *port)
+{
+ aggr_grp_t *grp = port->lp_grp;
+
+ mutex_enter(&grp->lg_port_lock);
+ if (--grp->lg_port_ref == 0)
+ cv_signal(&grp->lg_port_cv);
+ mutex_exit(&grp->lg_port_lock);
+ AGGR_PORT_REFRELE(port);
+}
+
+/*
+ * Wait for the port's lacp timer thread and the port's notification callback
+ * to exit.
+ */
+void
+aggr_grp_port_wait(aggr_grp_t *grp)
+{
+ mutex_enter(&grp->lg_port_lock);
+ if (grp->lg_port_ref != 0)
+ cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
+ mutex_exit(&grp->lg_port_lock);
+}
+
+/*
* Attach a port to a link aggregation group.
*
* A port is attached to a link aggregation group once its speed
@@ -193,9 +255,8 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
{
boolean_t link_state_changed = B_FALSE;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
return (B_FALSE);
@@ -251,7 +312,7 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
/*
* Set port's receive callback
*/
- port->lp_mrh = mac_rx_add(port->lp_mh, aggr_recv_cb, (void *)port);
+ mac_rx_set(port->lp_mch, aggr_recv_cb, port);
/*
* If LACP is OFF, the port can be used to send data as soon
@@ -270,28 +331,28 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
}
boolean_t
-aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port, boolean_t port_detach)
+aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
{
boolean_t link_state_changed = B_FALSE;
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
+ /* update state */
if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
return (B_FALSE);
- mac_rx_remove(port->lp_mh, port->lp_mrh, B_FALSE);
+ mac_rx_clear(port->lp_mch);
aggr_grp_multicst_port(port, B_FALSE);
if (grp->lg_lacp_mode == AGGR_LACP_OFF)
aggr_send_port_disable(port);
- else if (port_detach)
+ else
aggr_lacp_port_detached(port);
- /* update state */
port->lp_state = AGGR_PORT_STATE_STANDBY;
+
grp->lg_nattached_ports--;
if (grp->lg_nattached_ports == 0) {
/* the last attached MAC port of the group is being detached */
@@ -323,17 +384,15 @@ aggr_grp_update_ports_mac(aggr_grp_t *grp)
{
aggr_port_t *cport;
boolean_t link_state_changed = B_FALSE;
+ mac_perim_handle_t mph;
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
-
- if (grp->lg_closing)
- return (link_state_changed);
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
for (cport = grp->lg_ports; cport != NULL;
cport = cport->lp_next) {
- rw_enter(&cport->lp_lock, RW_WRITER);
- if (aggr_port_unicst(cport, grp->lg_addr) != 0) {
- if (aggr_grp_detach_port(grp, cport, B_TRUE))
+ mac_perim_enter_by_mh(cport->lp_mh, &mph);
+ if (aggr_port_unicst(cport) != 0) {
+ if (aggr_grp_detach_port(grp, cport))
link_state_changed = B_TRUE;
} else {
/*
@@ -346,7 +405,7 @@ aggr_grp_update_ports_mac(aggr_grp_t *grp)
if (aggr_grp_attach_port(grp, cport))
link_state_changed = B_TRUE;
}
- rw_exit(&cport->lp_lock);
+ mac_perim_exit(mph);
}
return (link_state_changed);
}
@@ -365,9 +424,8 @@ void
aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
ASSERT(mac_addr_changedp != NULL);
ASSERT(link_state_changedp != NULL);
@@ -394,9 +452,8 @@ aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
* Update the actual port MAC address to the MAC address
* of the group.
*/
- if (aggr_port_unicst(port, grp->lg_addr) != 0) {
- *link_state_changedp = aggr_grp_detach_port(grp, port,
- B_TRUE);
+ if (aggr_port_unicst(port) != 0) {
+ *link_state_changedp = aggr_grp_detach_port(grp, port);
} else {
/*
* If a port was detached because of a previous
@@ -414,21 +471,25 @@ aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
* Add a port to a link aggregation group.
*/
static int
-aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t linkid, boolean_t force,
+aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
aggr_port_t **pp)
{
aggr_port_t *port, **cport;
+ mac_perim_handle_t mph;
int err;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ /*
+ * lg_mh could be NULL when the function is called during the creation
+ * of the aggregation.
+ */
+ ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
/* create new port */
- err = aggr_port_create(linkid, force, &port);
+ err = aggr_port_create(grp, port_linkid, force, &port);
if (err != 0)
return (err);
- rw_enter(&port->lp_lock, RW_WRITER);
+ mac_perim_enter_by_mh(port->lp_mh, &mph);
/* add port to list of group constituent ports */
cport = &grp->lg_ports;
@@ -446,19 +507,238 @@ aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t linkid, boolean_t force,
grp->lg_nports++;
aggr_lacp_init_port(port);
+ mac_perim_exit(mph);
+
+ if (pp != NULL)
+ *pp = port;
+
+ return (0);
+}
+
+/*
+ * Add a pseudo Rx ring for the given HW ring handle.
+ */
+static int
+aggr_add_pseudo_rx_ring(aggr_port_t *port,
+ aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
+{
+ aggr_pseudo_rx_ring_t *ring;
+ int err;
+ int j;
+
+ for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
+ ring = rx_grp->arg_rings + j;
+ if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
+ break;
+ }
/*
- * Initialize the callback functions for this port. Note that this
- * can only be done after the lp_grp field is set.
+ * No slot for this new Rx ring.
*/
- aggr_port_init_callbacks(port);
+ if (j == MAX_RINGS_PER_GROUP)
+ return (EIO);
- rw_exit(&port->lp_lock);
+ ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
+ ring->arr_hw_rh = hw_rh;
+ ring->arr_port = port;
+ rx_grp->arg_ring_cnt++;
- if (pp != NULL)
- *pp = port;
+ /*
+ * The group is already registered; dynamically add a new ring to the
+ * mac group.
+ */
+ mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring);
+ if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
+ ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
+ ring->arr_hw_rh = NULL;
+ ring->arr_port = NULL;
+ rx_grp->arg_ring_cnt--;
+ mac_hwring_teardown(hw_rh);
+ }
+ return (err);
+}
- return (0);
+/*
+ * Remove the pseudo Rx ring of the given HW ring handle.
+ */
+static void
+aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
+{
+ aggr_pseudo_rx_ring_t *ring;
+ int j;
+
+ for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
+ ring = rx_grp->arg_rings + j;
+ if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
+ ring->arr_hw_rh != hw_rh) {
+ continue;
+ }
+
+ mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
+
+ ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
+ ring->arr_hw_rh = NULL;
+ ring->arr_port = NULL;
+ rx_grp->arg_ring_cnt--;
+ mac_hwring_teardown(hw_rh);
+ break;
+ }
+}
+
+/*
+ * This function is called to create pseudo rings over the hardware rings of
+ * the underlying device. Note that there is a 1:1 mapping between the pseudo
+ * RX rings of the aggr and the hardware rings of the underlying port.
+ */
+static int
+aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
+{
+ aggr_grp_t *grp = port->lp_grp;
+ mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
+ aggr_unicst_addr_t *addr, *a;
+ mac_perim_handle_t pmph;
+ int hw_rh_cnt, i = 0, j;
+ int err = 0;
+
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /*
+ * This function must be called after the aggr registers its mac
+ * and its RX group has been initialized.
+ */
+ ASSERT(rx_grp->arg_gh != NULL);
+
+ /*
+ * Get the list of the underlying HW rings.
+ */
+ hw_rh_cnt = mac_hwrings_get(port->lp_mch, &port->lp_hwgh, hw_rh);
+
+ if (port->lp_hwgh != NULL) {
+ /*
+ * Quiesce the HW ring and the mac srs on the ring. Note
+ * that the HW ring will be restarted when the pseudo ring
+ * is started. At that time all the packets will be
+ * directly passed up to the pseudo RX ring and handled
+ * by mac srs created over the pseudo RX ring.
+ */
+ mac_rx_client_quiesce(port->lp_mch);
+ mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
+ }
+
+ /*
+ * Add all the unicast addresses to the newly added port.
+ */
+ for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
+ if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
+ break;
+ }
+
+ for (i = 0; err == 0 && i < hw_rh_cnt; i++)
+ err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
+
+ if (err != 0) {
+ for (j = 0; j < i; j++)
+ aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
+
+ for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
+ aggr_port_remmac(port, a->aua_addr);
+
+ if (port->lp_hwgh != NULL) {
+ mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
+ mac_rx_client_restart(port->lp_mch);
+ port->lp_hwgh = NULL;
+ }
+ } else {
+ port->lp_grp_added = B_TRUE;
+ }
+done:
+ mac_perim_exit(pmph);
+ return (err);
+}
+
+/*
+ * This function is called by aggr to remove pseudo RX rings over the
+ * HW rings of the underlying port.
+ */
+static void
+aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
+{
+ aggr_grp_t *grp = port->lp_grp;
+ mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
+ aggr_unicst_addr_t *addr;
+ mac_group_handle_t hwgh;
+ mac_perim_handle_t pmph;
+ int hw_rh_cnt, i;
+
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ if (!port->lp_grp_added)
+ goto done;
+
+ ASSERT(rx_grp->arg_gh != NULL);
+ hw_rh_cnt = mac_hwrings_get(port->lp_mch, &hwgh, hw_rh);
+
+ /*
+ * If hw_rh_cnt is 0, it means that the underlying port does not
+ * support RX rings; the ring loop below is then a no-op.
+ */
+ for (i = 0; i < hw_rh_cnt; i++)
+ aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
+
+ for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
+ aggr_port_remmac(port, addr->aua_addr);
+
+ if (port->lp_hwgh != NULL) {
+ port->lp_hwgh = NULL;
+
+ /*
+ * First clear the permanent-quiesced flag of the RX srs then
+ * restart the HW ring and the mac srs on the ring. Note that
+ * the HW ring and associated SRS will soon been removed when
+ * the port is removed from the aggr.
+ */
+ mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
+ mac_rx_client_restart(port->lp_mch);
+ }
+
+ port->lp_grp_added = B_FALSE;
+done:
+ mac_perim_exit(pmph);
+}
+
+static int
+aggr_pseudo_disable_intr(mac_intr_handle_t ih)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
+ return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
+}
+
+static int
+aggr_pseudo_enable_intr(mac_intr_handle_t ih)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
+ return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
+}
+
+static int
+aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
+ int err;
+
+ err = mac_hwring_start(rr_ring->arr_hw_rh);
+ if (err == 0)
+ rr_ring->arr_gen = mr_gen;
+ return (err);
+}
+
+static void
+aggr_pseudo_stop_ring(mac_ring_driver_t arg)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
+ mac_hwring_stop(rr_ring->arr_hw_rh);
}
/*
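
aggr_add_pseudo_rx_ring() above claims a free slot in the fixed-size pseudo-ring array, registers the ring with the mac layer, and unwinds the slot state if mac_group_add_ring() fails. That claim-with-rollback shape can be modeled in isolation; everything below is a userland stand-in with invented names, not mac-layer code:

#include <stdio.h>
#include <string.h>

#define	MAX_RINGS	8
#define	RING_INUSE	0x1

/* Invented pseudo-ring slot, modeled on aggr_pseudo_rx_ring_t. */
typedef struct pring {
	int	flags;
	int	hw_id;		/* stands in for the HW ring handle */
} pring_t;

typedef struct pgroup {
	pring_t	rings[MAX_RINGS];
	int	ring_cnt;
} pgroup_t;

/* Find a free slot, claim it, and "register" it; undo on failure. */
static int
add_pseudo_ring(pgroup_t *g, int hw_id, int fail)
{
	int j;

	for (j = 0; j < MAX_RINGS; j++)
		if (!(g->rings[j].flags & RING_INUSE))
			break;
	if (j == MAX_RINGS)
		return (-1);		/* no free slot: EIO analogue */

	g->rings[j].flags |= RING_INUSE;
	g->rings[j].hw_id = hw_id;
	g->ring_cnt++;

	if (fail) {			/* registration failed: roll back */
		g->rings[j].flags &= ~RING_INUSE;
		g->rings[j].hw_id = -1;
		g->ring_cnt--;
		return (-1);
	}
	return (j);
}

int
main(void)
{
	pgroup_t g;
	int slot;

	memset(&g, 0, sizeof (g));
	slot = add_pseudo_ring(&g, 42, 0);
	printf("slot %d, cnt %d\n", slot, g.ring_cnt);	/* slot 0, cnt 1 */
	slot = add_pseudo_ring(&g, 43, 1);
	printf("slot %d, cnt %d\n", slot, g.ring_cnt);	/* slot -1, cnt 1 */
	return (0);
}
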
@@ -472,6 +752,7 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
aggr_grp_t *grp = NULL;
aggr_port_t *port;
boolean_t link_state_changed = B_FALSE;
+ mac_perim_handle_t mph, pmph;
/* get group corresponding to linkid */
rw_enter(&aggr_grp_lock, RW_READER);
@@ -481,10 +762,12 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
return (ENOENT);
}
AGGR_GRP_REFHOLD(grp);
- rw_exit(&aggr_grp_lock);
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
+ /*
+ * Hold the perimeter so that the aggregation won't be destroyed.
+ */
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ rw_exit(&aggr_grp_lock);
/* add the specified ports to group */
for (i = 0; i < nports; i++) {
@@ -504,29 +787,53 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
goto bail;
}
+ /*
+ * Create the pseudo ring for each HW ring of the underlying
+ * port.
+ */
+ rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
+ if (rc != 0)
+ goto bail;
+
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /* set LACP mode */
+ aggr_port_lacp_set_mode(grp, port);
+
/* start port if group has already been started */
if (grp->lg_started) {
- rw_enter(&port->lp_lock, RW_WRITER);
rc = aggr_port_start(port);
if (rc != 0) {
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
goto bail;
}
- /* set port promiscuous mode */
- rc = aggr_port_promisc(port, grp->lg_promisc);
- if (rc != 0) {
- rw_exit(&port->lp_lock);
- goto bail;
+ /*
+ * Turn on promiscuous mode over the port when it is
+ * needed to receive a non-primary address over the
+ * port, or when the promiscuous mode is enabled over
+ * the aggr.
+ */
+ if (grp->lg_promisc || port->lp_prom_addr != NULL) {
+ rc = aggr_port_promisc(port, B_TRUE);
+ if (rc != 0) {
+ mac_perim_exit(pmph);
+ goto bail;
+ }
}
- rw_exit(&port->lp_lock);
}
+ mac_perim_exit(pmph);
/*
* Attach each port if necessary.
*/
- if (aggr_port_notify_link(grp, port, B_FALSE))
+ if (aggr_port_notify_link(grp, port))
link_state_changed = B_TRUE;
+
+ /*
+ * Initialize the callback functions for this port.
+ */
+ aggr_port_init_callbacks(port);
}
/* update the MAC address of the constituent ports */
@@ -539,64 +846,43 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
bail:
if (rc != 0) {
/* stop and remove ports that have been added */
- for (i = 0; i < nadded && !grp->lg_closing; i++) {
+ for (i = 0; i < nadded; i++) {
port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
ASSERT(port != NULL);
if (grp->lg_started) {
- rw_enter(&port->lp_lock, RW_WRITER);
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+ (void) aggr_port_promisc(port, B_FALSE);
aggr_port_stop(port);
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
}
+ aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
(void) aggr_grp_rem_port(grp, port, NULL, NULL);
}
}
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
- if (rc == 0 && !grp->lg_closing)
+ if (rc == 0)
mac_resource_update(grp->lg_mh);
+ mac_perim_exit(mph);
AGGR_GRP_REFRELE(grp);
return (rc);
}
-/*
- * Update properties of an existing link aggregation group.
- */
-int
-aggr_grp_modify(datalink_id_t linkid, aggr_grp_t *grp_arg, uint8_t update_mask,
- uint32_t policy, boolean_t mac_fixed, const uchar_t *mac_addr,
- aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer)
+static int
+aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
+ boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
+ aggr_lacp_timer_t lacp_timer)
{
- int rc = 0;
- aggr_grp_t *grp = NULL;
boolean_t mac_addr_changed = B_FALSE;
boolean_t link_state_changed = B_FALSE;
+ mac_perim_handle_t pmph;
- if (grp_arg == NULL) {
- /* get group corresponding to linkid */
- rw_enter(&aggr_grp_lock, RW_READER);
- if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
- (mod_hash_val_t *)&grp) != 0) {
- rc = ENOENT;
- goto bail;
- }
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
- } else {
- grp = grp_arg;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
- }
-
- ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock));
- AGGR_GRP_REFHOLD(grp);
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
/* validate fixed address if specified */
if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
(mac_addr[0] & 0x01))) {
- rc = EINVAL;
- goto bail;
+ return (EINVAL);
}
/* update policy if requested */
@@ -616,11 +902,11 @@ aggr_grp_modify(datalink_id_t linkid, aggr_grp_t *grp_arg, uint8_t update_mask,
/* switch from user-supplied to automatic */
aggr_port_t *port = grp->lg_ports;
- rw_enter(&port->lp_lock, RW_WRITER);
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
grp->lg_mac_addr_port = port;
mac_addr_changed = B_TRUE;
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
}
grp->lg_addr_fixed = mac_fixed;
}
@@ -631,36 +917,51 @@ aggr_grp_modify(datalink_id_t linkid, aggr_grp_t *grp_arg, uint8_t update_mask,
if (update_mask & AGGR_MODIFY_LACP_MODE)
aggr_lacp_update_mode(grp, lacp_mode);
- if ((update_mask & AGGR_MODIFY_LACP_TIMER) && !grp->lg_closing)
+ if (update_mask & AGGR_MODIFY_LACP_TIMER)
aggr_lacp_update_timer(grp, lacp_timer);
-bail:
- if (grp != NULL && !grp->lg_closing) {
- /*
- * If grp_arg is non-NULL, this function is called from
- * mac_unicst_set(), and the MAC_NOTE_UNICST notification
- * will be sent there.
- */
- if ((grp_arg == NULL) && mac_addr_changed)
- mac_unicst_update(grp->lg_mh, grp->lg_addr);
+ if (link_state_changed)
+ mac_link_update(grp->lg_mh, grp->lg_link_state);
- if (link_state_changed)
- mac_link_update(grp->lg_mh, grp->lg_link_state);
+ if (mac_addr_changed)
+ mac_unicst_update(grp->lg_mh, grp->lg_addr);
- }
+ return (0);
+}
- if (grp_arg == NULL) {
- if (grp != NULL) {
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
- }
+/*
+ * Update properties of an existing link aggregation group.
+ */
+int
+aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
+ boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
+ aggr_lacp_timer_t lacp_timer)
+{
+ aggr_grp_t *grp = NULL;
+ mac_perim_handle_t mph;
+ int err;
+
+ /* get group corresponding to linkid */
+ rw_enter(&aggr_grp_lock, RW_READER);
+ if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
+ (mod_hash_val_t *)&grp) != 0) {
rw_exit(&aggr_grp_lock);
+ return (ENOENT);
}
+ AGGR_GRP_REFHOLD(grp);
- if (grp != NULL)
- AGGR_GRP_REFRELE(grp);
+ /*
+ * Hold the perimeter so that the aggregation won't be destroyed.
+ */
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ rw_exit(&aggr_grp_lock);
- return (rc);
+ err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
+ mac_addr, lacp_mode, lacp_timer);
+
+ mac_perim_exit(mph);
+ AGGR_GRP_REFRELE(grp);
+ return (err);
}
/*
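
The new aggr_grp_modify() above shows the lookup discipline used throughout this patch: find the group under aggr_grp_lock, take a reference, enter the group's mac perimeter, and only then drop the global lock, so the group cannot be torn down in the window. A compact pthread model of that ordering follows; it is deliberately simplified (the real refhold/refrele macros have their own locking), and all names are invented:

#include <pthread.h>
#include <stdio.h>

/* Invented group, modeling aggr_grp_t's refcount plus perimeter. */
typedef struct grp {
	pthread_mutex_t	perim;	/* stands in for the mac perimeter */
	int		refs;
} grp_t;

static pthread_mutex_t grp_table_lock = PTHREAD_MUTEX_INITIALIZER;
static grp_t the_grp = { PTHREAD_MUTEX_INITIALIZER, 1 };

/*
 * Hold the table lock only long enough to find the group and take a
 * reference, then enter the group's own perimeter before releasing
 * the table lock, so the group cannot be destroyed underneath us.
 */
static grp_t *
grp_hold_and_enter(void)
{
	grp_t *g;

	pthread_mutex_lock(&grp_table_lock);
	g = &the_grp;			/* mod_hash_find() analogue */
	g->refs++;			/* AGGR_GRP_REFHOLD() analogue */
	pthread_mutex_lock(&g->perim);	/* mac_perim_enter_by_mh() */
	pthread_mutex_unlock(&grp_table_lock);
	return (g);
}

static void
grp_exit_and_rele(grp_t *g)
{
	pthread_mutex_unlock(&g->perim);
	g->refs--;			/* AGGR_GRP_REFRELE() analogue */
}

int
main(void)
{
	grp_t *g = grp_hold_and_enter();

	printf("refs while held: %d\n", g->refs);	/* 2 */
	grp_exit_and_rele(g);
	return (0);
}
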
@@ -676,6 +977,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
aggr_port_t *port;
mac_register_t *mac;
boolean_t link_state_changed;
+ mac_perim_handle_t mph;
int err;
int i;
@@ -695,9 +997,6 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
-
grp->lg_refs = 1;
grp->lg_closing = B_FALSE;
grp->lg_force = force;
@@ -707,6 +1006,11 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
grp->lg_started = B_FALSE;
grp->lg_promisc = B_FALSE;
+ grp->lg_lacp_done = B_FALSE;
+ grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
+ grp->lg_lacp_rx_thread = thread_create(NULL, 0,
+ aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
+ bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
aggr_lacp_init_grp(grp);
/* add MAC ports to group */
@@ -723,7 +1027,6 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
goto bail;
}
grp->lg_key = key;
- grp->lg_mcst_list = NULL;
for (i = 0; i < nports; i++) {
err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL);
@@ -748,17 +1051,6 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
grp->lg_mac_addr_port = grp->lg_ports;
}
- /*
- * Update the MAC address of the constituent ports.
- * None of the port is attached at this time, the link state of the
- * aggregation will not change.
- */
- link_state_changed = aggr_grp_update_ports_mac(grp);
- ASSERT(!link_state_changed);
-
- /* update outbound load balancing policy */
- aggr_send_update_policy(grp, policy);
-
/* set the initial group capabilities */
aggr_grp_capab_set(grp);
@@ -775,6 +1067,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
mac->m_min_sdu = 0;
mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
mac->m_margin = aggr_grp_max_margin(grp);
+ mac->m_v12n = MAC_VIRT_LEVEL1;
err = mac_register(mac, &grp->lg_mh);
mac_free(mac);
if (err != 0)
@@ -782,9 +1075,23 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
if ((err = dls_devnet_create(grp->lg_mh, grp->lg_linkid)) != 0) {
(void) mac_unregister(grp->lg_mh);
+ grp->lg_mh = NULL;
goto bail;
}
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+
+ /*
+ * Update the MAC address of the constituent ports.
+ * None of the ports is attached at this time, so the link state of the
+ * aggregation will not change.
+ */
+ link_state_changed = aggr_grp_update_ports_mac(grp);
+ ASSERT(!link_state_changed);
+
+ /* update outbound load balancing policy */
+ aggr_send_update_policy(grp, policy);
+
/* set LACP mode */
aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
@@ -792,8 +1099,19 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
* Attach each port if necessary.
*/
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
- if (aggr_port_notify_link(grp, port, B_FALSE))
+ /*
+ * Create the pseudo ring for each HW ring of the underlying
+ * port. Note that this is done after the aggr registers the
+ * mac.
+ */
+ VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
+ if (aggr_port_notify_link(grp, port))
link_state_changed = B_TRUE;
+
+ /*
+ * Initialize the callback functions for this port.
+ */
+ aggr_port_init_callbacks(port);
}
if (link_state_changed)
@@ -805,31 +1123,35 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
ASSERT(err == 0);
aggr_grp_cnt++;
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
+ mac_perim_exit(mph);
rw_exit(&aggr_grp_lock);
return (0);
bail:
- if (grp != NULL) {
- aggr_port_t *cport;
- grp->lg_closing = B_TRUE;
-
- port = grp->lg_ports;
- while (port != NULL) {
- cport = port->lp_next;
- aggr_port_delete(port);
- port = cport;
- }
+ grp->lg_closing = B_TRUE;
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
+ port = grp->lg_ports;
+ while (port != NULL) {
+ aggr_port_t *cport;
- AGGR_GRP_REFRELE(grp);
+ cport = port->lp_next;
+ aggr_port_delete(port);
+ port = cport;
}
+ /*
+ * Inform the lacp_rx thread to exit.
+ */
+ mutex_enter(&grp->lg_lacp_lock);
+ grp->lg_lacp_done = B_TRUE;
+ cv_signal(&grp->lg_lacp_cv);
+ while (grp->lg_lacp_rx_thread != NULL)
+ cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
+ mutex_exit(&grp->lg_lacp_lock);
+
rw_exit(&aggr_grp_lock);
+ AGGR_GRP_REFRELE(grp);
return (err);
}
@@ -841,7 +1163,7 @@ aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
{
aggr_port_t *port;
- ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
if (port->lp_linkid == linkid)
@@ -862,12 +1184,12 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
aggr_port_t **pport;
boolean_t mac_addr_changed = B_FALSE;
boolean_t link_state_changed = B_FALSE;
+ mac_perim_handle_t mph;
uint64_t val;
uint_t i;
uint_t stat;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
ASSERT(grp->lg_nports > 1);
ASSERT(!grp->lg_closing);
@@ -881,9 +1203,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
}
*pport = port->lp_next;
- atomic_add_32(&port->lp_closing, 1);
-
- rw_enter(&port->lp_lock, RW_WRITER);
+ mac_perim_enter_by_mh(port->lp_mh, &mph);
/*
* If the MAC address of the port being removed was assigned
@@ -900,7 +1220,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
mac_addr_changed = B_TRUE;
}
- link_state_changed = aggr_grp_detach_port(grp, port, B_FALSE);
+ link_state_changed = aggr_grp_detach_port(grp, port);
/*
* Add the counter statistics of the ports while it was aggregated
@@ -909,7 +1229,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
* value of the counter at the moment it was added to the
* aggregation.
*/
- for (i = 0; i < MAC_NSTAT && !grp->lg_closing; i++) {
+ for (i = 0; i < MAC_NSTAT; i++) {
stat = i + MAC_STAT_MIN;
if (!MAC_STAT_ISACOUNTER(stat))
continue;
@@ -917,7 +1237,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
val -= port->lp_stat[i];
grp->lg_stat[i] += val;
}
- for (i = 0; i < ETHER_NSTAT && !grp->lg_closing; i++) {
+ for (i = 0; i < ETHER_NSTAT; i++) {
stat = i + MACTYPE_STAT_MIN;
if (!ETHER_STAT_ISACOUNTER(stat))
continue;
@@ -927,8 +1247,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
}
grp->lg_nports--;
-
- rw_exit(&port->lp_lock);
+ mac_perim_exit(mph);
aggr_port_delete(port);
@@ -960,6 +1279,7 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
aggr_port_t *port;
boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
boolean_t link_state_update = B_FALSE, link_state_changed;
+ mac_perim_handle_t mph, pmph;
/* get group corresponding to linkid */
rw_enter(&aggr_grp_lock, RW_READER);
@@ -969,10 +1289,12 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
return (ENOENT);
}
AGGR_GRP_REFHOLD(grp);
- rw_exit(&aggr_grp_lock);
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
+ /*
+ * Hold the perimeter so that the aggregation won't be destroyed.
+ */
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ rw_exit(&aggr_grp_lock);
/* we need to keep at least one port per group */
if (nports >= grp->lg_nports) {
@@ -989,20 +1311,51 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
}
}
+ /* clear the promiscuous mode for the specified ports */
+ for (i = 0; i < nports && rc == 0; i++) {
+ /* lookup port */
+ port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
+ ASSERT(port != NULL);
+
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+ rc = aggr_port_promisc(port, B_FALSE);
+ mac_perim_exit(pmph);
+ }
+ if (rc != 0) {
+ for (i = 0; i < nports; i++) {
+ port = aggr_grp_port_lookup(grp,
+ ports[i].lp_linkid);
+ ASSERT(port != NULL);
+
+ /*
+ * Turn the promiscuous mode back on if it is required
+ * to receive the non-primary address over a port, or
+ * the promiscuous mode is enabled over the aggr.
+ */
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+ if (port->lp_started && (grp->lg_promisc ||
+ port->lp_prom_addr != NULL)) {
+ (void) aggr_port_promisc(port, B_TRUE);
+ }
+ mac_perim_exit(pmph);
+ }
+ goto bail;
+ }
+
/* remove the specified ports from group */
- for (i = 0; i < nports && !grp->lg_closing; i++) {
+ for (i = 0; i < nports; i++) {
/* lookup port */
port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
ASSERT(port != NULL);
/* stop port if group has already been started */
if (grp->lg_started) {
- rw_enter(&port->lp_lock, RW_WRITER);
- aggr_lacp_port_detached(port);
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
aggr_port_stop(port);
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
}
+ aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
/* remove port from group */
rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
&link_state_changed);
@@ -1012,16 +1365,14 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
}
bail:
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
- if (!grp->lg_closing) {
- if (mac_addr_update)
- mac_unicst_update(grp->lg_mh, grp->lg_addr);
- if (link_state_update)
- mac_link_update(grp->lg_mh, grp->lg_link_state);
- if (rc == 0)
- mac_resource_update(grp->lg_mh);
- }
+ if (mac_addr_update)
+ mac_unicst_update(grp->lg_mh, grp->lg_addr);
+ if (link_state_update)
+ mac_link_update(grp->lg_mh, grp->lg_link_state);
+ if (rc == 0)
+ mac_resource_update(grp->lg_mh);
+
+ mac_perim_exit(mph);
AGGR_GRP_REFRELE(grp);
return (rc);
@@ -1032,9 +1383,9 @@ aggr_grp_delete(datalink_id_t linkid)
{
aggr_grp_t *grp = NULL;
aggr_port_t *port, *cport;
- lg_mcst_addr_t *mcst, *mcst_nextp;
datalink_id_t tmpid;
mod_hash_val_t val;
+ mac_perim_handle_t mph, pmph;
int err;
rw_enter(&aggr_grp_lock, RW_WRITER);
@@ -1051,68 +1402,69 @@ aggr_grp_delete(datalink_id_t linkid)
* aggr_m_stat() and thus has a kstat_hold() on the kstats that
* dls_devnet_destroy() needs to delete.
*/
- if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid)) != 0) {
+ if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
rw_exit(&aggr_grp_lock);
return (err);
}
ASSERT(linkid == tmpid);
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
-
/*
* Unregister from the MAC service module. Since this can
* fail if a client hasn't closed the MAC port, we gracefully
* fail the operation.
*/
- grp->lg_closing = B_TRUE;
if ((err = mac_disable(grp->lg_mh)) != 0) {
- grp->lg_closing = B_FALSE;
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
-
(void) dls_devnet_create(grp->lg_mh, linkid);
rw_exit(&aggr_grp_lock);
return (err);
}
+ (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
+ ASSERT(grp == (aggr_grp_t *)val);
+
+ ASSERT(aggr_grp_cnt > 0);
+ aggr_grp_cnt--;
+ rw_exit(&aggr_grp_lock);
/*
- * Free the list of multicast addresses.
+ * Inform the lacp_rx thread to exit.
*/
- for (mcst = grp->lg_mcst_list; mcst != NULL; mcst = mcst_nextp) {
- mcst_nextp = mcst->lg_mcst_nextp;
- kmem_free(mcst, sizeof (lg_mcst_addr_t));
- }
- grp->lg_mcst_list = NULL;
+ mutex_enter(&grp->lg_lacp_lock);
+ grp->lg_lacp_done = B_TRUE;
+ cv_signal(&grp->lg_lacp_cv);
+ while (grp->lg_lacp_rx_thread != NULL)
+ cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
+ mutex_exit(&grp->lg_lacp_lock);
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+
+ grp->lg_closing = B_TRUE;
/* detach and free MAC ports associated with group */
port = grp->lg_ports;
while (port != NULL) {
cport = port->lp_next;
- rw_enter(&port->lp_lock, RW_WRITER);
- aggr_lacp_port_detached(port);
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
if (grp->lg_started)
aggr_port_stop(port);
- (void) aggr_grp_detach_port(grp, port, B_FALSE);
- rw_exit(&port->lp_lock);
+ (void) aggr_grp_detach_port(grp, port);
+ mac_perim_exit(pmph);
+ aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
aggr_port_delete(port);
port = cport;
}
- VERIFY(mac_unregister(grp->lg_mh) == 0);
+ mac_perim_exit(mph);
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
-
- (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
- ASSERT(grp == (aggr_grp_t *)val);
+ /*
+ * Wait for the port's lacp timer thread and its notification callback
+ * to exit before calling mac_unregister() since both need to access
+ * the mac perimeter of the grp.
+ */
+ aggr_grp_port_wait(grp);
- ASSERT(aggr_grp_cnt > 0);
- aggr_grp_cnt--;
+ VERIFY(mac_unregister(grp->lg_mh) == 0);
+ grp->lg_mh = NULL;
- rw_exit(&aggr_grp_lock);
AGGR_GRP_REFRELE(grp);
-
return (0);
}
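
aggr_grp_delete() above stops the lacp_rx thread with a flag-plus-condvar handshake: set lg_lacp_done, signal lg_lacp_cv, and wait until the thread clears lg_lacp_rx_thread on its way out. The same handshake, reduced to a runnable pthread sketch with invented names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Models the lg_lacp_done / lg_lacp_cv shutdown handshake. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static bool done = false;
static bool thread_exited = false;

static void *
rx_thread(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	while (!done)			/* wait for work or shutdown */
		pthread_cond_wait(&cv, &lock);
	thread_exited = true;		/* lg_lacp_rx_thread = NULL analogue */
	pthread_cond_signal(&cv);	/* wake the waiter in shutdown */
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	(void) pthread_create(&t, NULL, rx_thread, NULL);

	/* Shutdown: set the flag, wake the thread, wait for it to leave. */
	pthread_mutex_lock(&lock);
	done = true;
	pthread_cond_signal(&cv);
	while (!thread_exited)
		pthread_cond_wait(&cv, &lock);
	pthread_mutex_unlock(&lock);

	(void) pthread_join(t, NULL);
	printf("rx thread exited\n");
	return (0);
}
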
@@ -1120,6 +1472,7 @@ void
aggr_grp_free(aggr_grp_t *grp)
{
ASSERT(grp->lg_refs == 0);
+ ASSERT(grp->lg_port_ref == 0);
if (grp->lg_key > AGGR_MAX_KEY) {
id_free(key_ids, grp->lg_key);
grp->lg_key = 0;
@@ -1134,6 +1487,7 @@ aggr_grp_info(datalink_id_t linkid, void *fn_arg,
{
aggr_grp_t *grp;
aggr_port_t *port;
+ mac_perim_handle_t mph, pmph;
int rc = 0;
rw_enter(&aggr_grp_lock, RW_READER);
@@ -1143,8 +1497,10 @@ aggr_grp_info(datalink_id_t linkid, void *fn_arg,
rw_exit(&aggr_grp_lock);
return (ENOENT);
}
+ AGGR_GRP_REFHOLD(grp);
- rw_enter(&grp->lg_lock, RW_READER);
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ rw_exit(&aggr_grp_lock);
rc = new_grp_fn(fn_arg, grp->lg_linkid,
(grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
@@ -1155,32 +1511,21 @@ aggr_grp_info(datalink_id_t linkid, void *fn_arg,
goto bail;
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
- rw_enter(&port->lp_lock, RW_READER);
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
port->lp_state, &port->lp_lacp.ActorOperPortState);
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
if (rc != 0)
goto bail;
}
bail:
- rw_exit(&grp->lg_lock);
- rw_exit(&aggr_grp_lock);
+ mac_perim_exit(mph);
+ AGGR_GRP_REFRELE(grp);
return (rc);
}
-static void
-aggr_m_resources(void *arg)
-{
- aggr_grp_t *grp = arg;
- aggr_port_t *port;
-
- /* Call each port's m_resources function */
- for (port = grp->lg_ports; port != NULL; port = port->lp_next)
- mac_resources(port->lp_mh);
-}
-
/*ARGSUSED*/
static void
aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
@@ -1230,10 +1575,11 @@ aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
static int
aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
{
- aggr_grp_t *grp = arg;
- int rval = 0;
+ aggr_grp_t *grp = arg;
+ mac_perim_handle_t mph;
+ int rval = 0;
- rw_enter(&grp->lg_lock, RW_READER);
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
switch (stat) {
case MAC_STAT_IFSPEED:
@@ -1253,7 +1599,7 @@ aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
rval = aggr_grp_stat(grp, stat, val);
}
- rw_exit(&grp->lg_lock);
+ mac_perim_exit(mph);
return (rval);
}
@@ -1262,9 +1608,9 @@ aggr_m_start(void *arg)
{
aggr_grp_t *grp = arg;
aggr_port_t *port;
+ mac_perim_handle_t mph, pmph;
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
/*
* Attempts to start all configured members of the group.
@@ -1272,23 +1618,27 @@ aggr_m_start(void *arg)
* is received.
*/
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
- rw_enter(&port->lp_lock, RW_WRITER);
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
if (aggr_port_start(port) != 0) {
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
continue;
}
- /* set port promiscuous mode */
- if (aggr_port_promisc(port, grp->lg_promisc) != 0)
- aggr_port_stop(port);
- rw_exit(&port->lp_lock);
+ /*
+ * Turn on the promiscuous mode if it is required to receive
+ * the non-primary address over a port, or the promiscuous
+ * mode is enabled over the aggr.
+ */
+ if (grp->lg_promisc || port->lp_prom_addr != NULL) {
+ if (aggr_port_promisc(port, B_TRUE) != 0)
+ aggr_port_stop(port);
+ }
+ mac_perim_exit(pmph);
}
grp->lg_started = B_TRUE;
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
-
+ mac_perim_exit(mph);
return (0);
}
@@ -1297,21 +1647,22 @@ aggr_m_stop(void *arg)
{
aggr_grp_t *grp = arg;
aggr_port_t *port;
+ mac_perim_handle_t mph, pmph;
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
- rw_enter(&port->lp_lock, RW_WRITER);
- aggr_lacp_port_detached(port);
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /* reset port promiscuous mode */
+ (void) aggr_port_promisc(port, B_FALSE);
+
aggr_port_stop(port);
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
}
grp->lg_started = B_FALSE;
-
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
+ mac_perim_exit(mph);
}
static int
@@ -1320,10 +1671,10 @@ aggr_m_promisc(void *arg, boolean_t on)
aggr_grp_t *grp = arg;
aggr_port_t *port;
boolean_t link_state_changed = B_FALSE;
+ mac_perim_handle_t mph, pmph;
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
AGGR_GRP_REFHOLD(grp);
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
ASSERT(!grp->lg_closing);
@@ -1331,25 +1682,30 @@ aggr_m_promisc(void *arg, boolean_t on)
goto bail;
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
- rw_enter(&port->lp_lock, RW_WRITER);
+ int err = 0;
+
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
AGGR_PORT_REFHOLD(port);
- if (port->lp_started) {
- if (aggr_port_promisc(port, on) != 0) {
- if (aggr_grp_detach_port(grp, port, B_TRUE))
- link_state_changed = B_TRUE;
- } else {
- /*
- * If a port was detached because of a previous
- * failure changing the promiscuity, the port
- * is reattached when it successfully changes
- * the promiscuity now, and this might cause
- * the link state of the aggregation to change.
- */
- if (aggr_grp_attach_port(grp, port))
- link_state_changed = B_TRUE;
- }
+ if (!on && (port->lp_prom_addr == NULL))
+ err = aggr_port_promisc(port, B_FALSE);
+ else if (on && port->lp_started)
+ err = aggr_port_promisc(port, B_TRUE);
+
+ if (err != 0) {
+ if (aggr_grp_detach_port(grp, port))
+ link_state_changed = B_TRUE;
+ } else {
+ /*
+ * If a port was detached because of a previous
+ * failure changing the promiscuity, the port
+ * is reattached when it successfully changes
+ * the promiscuity now, and this might cause
+ * the link state of the aggregation to change.
+ */
+ if (aggr_grp_attach_port(grp, port))
+ link_state_changed = B_TRUE;
}
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
AGGR_PORT_REFRELE(port);
}
@@ -1359,13 +1715,49 @@ aggr_m_promisc(void *arg, boolean_t on)
mac_link_update(grp->lg_mh, grp->lg_link_state);
bail:
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
+ mac_perim_exit(mph);
AGGR_GRP_REFRELE(grp);
return (0);
}
+static void
+aggr_grp_port_rename(const char *new_name, void *arg)
+{
+ /*
+ * An aggr port's mac client name has the format "aggr link name" plus
+ * AGGR_PORT_NAME_DELIMIT plus "underlying link name".
+ */
+ int aggr_len, link_len, clnt_name_len, i;
+ char *str_end, *str_st, *str_del;
+ char aggr_name[MAXNAMELEN];
+ char link_name[MAXNAMELEN];
+ char *clnt_name;
+ aggr_grp_t *aggr_grp = arg;
+ aggr_port_t *aggr_port = aggr_grp->lg_ports;
+
+ for (i = 0; i < aggr_grp->lg_nports; i++) {
+ clnt_name = mac_client_name(aggr_port->lp_mch);
+ clnt_name_len = strlen(clnt_name);
+ str_st = clnt_name;
+ str_end = &(clnt_name[clnt_name_len]);
+ str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
+ ASSERT(str_del != NULL);
+ aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
+ link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
+ bzero(aggr_name, MAXNAMELEN);
+ bzero(link_name, MAXNAMELEN);
+ bcopy(clnt_name, aggr_name, aggr_len);
+ bcopy(str_del, link_name, link_len + 1);
+ bzero(clnt_name, MAXNAMELEN);
+ (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
+ link_name);
+
+ (void) mac_rename_primary(aggr_port->lp_mh, NULL);
+ aggr_port = aggr_port->lp_next;
+ }
+}
+
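
aggr_grp_port_rename() above splits each port's mac client name at AGGR_PORT_NAME_DELIMIT and rebuilds it around the new aggr name. A userland sketch of that string surgery; buffer sizes and names are invented:

#include <stdio.h>
#include <string.h>

#define	DELIM	'-'
#define	NAMELEN	64

/*
 * Rebuild "<aggr name><delim><port name>" around a new aggr name,
 * keeping the per-port suffix intact.
 */
static void
rename_client(char *clnt_name, size_t len, const char *new_name)
{
	char suffix[NAMELEN];
	char *del = strchr(clnt_name, DELIM);

	if (del == NULL)
		return;			/* no delimiter: leave as-is */
	(void) snprintf(suffix, sizeof (suffix), "%s", del);
	(void) snprintf(clnt_name, len, "%s%s", new_name, suffix);
}

int
main(void)
{
	char name[NAMELEN] = "aggr1-bge0";

	rename_client(name, sizeof (name), "aggr7");
	printf("%s\n", name);		/* aggr7-bge0 */
	return (0);
}
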
/*
* Initialize the capabilities that are advertised for the group
* according to the capabilities of the constituent ports.
@@ -1381,51 +1773,245 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
*hcksum_txflags = grp->lg_hcksum_txflags;
break;
}
- case MAC_CAPAB_POLL:
- /*
- * There's nothing for us to fill in, we simply return
- * B_TRUE or B_FALSE to represent the group's support
- * status for this capability.
- */
- return (grp->lg_gldv3_polling);
case MAC_CAPAB_NO_NATIVEVLAN:
return (!grp->lg_vlan);
case MAC_CAPAB_NO_ZCOPY:
return (!grp->lg_zcopy);
+ case MAC_CAPAB_RINGS: {
+ mac_capab_rings_t *cap_rings = cap_data;
+
+ if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
+ cap_rings->mr_rget = aggr_fill_ring;
+
+ /*
+ * An aggregation advertises only one (pseudo) RX
+ * group, which virtualizes the main/primary group of
+ * the underlying devices.
+ */
+ cap_rings->mr_gnum = 1;
+ cap_rings->mr_gget = aggr_fill_group;
+ cap_rings->mr_gaddring = NULL;
+ cap_rings->mr_gremring = NULL;
+ } else {
+ return (B_FALSE);
+ }
+ break;
+ }
+ case MAC_CAPAB_AGGR:
+ {
+ mac_capab_aggr_t *aggr_cap;
+
+ if (cap_data != NULL) {
+ aggr_cap = cap_data;
+ aggr_cap->mca_rename_fn = aggr_grp_port_rename;
+ aggr_cap->mca_unicst = aggr_m_unicst;
+ }
+ return (B_TRUE);
+ }
default:
return (B_FALSE);
}
return (B_TRUE);
}
+/*
+ * Callback function for the MAC layer to register groups.
+ */
+static void
+aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
+ mac_group_info_t *infop, mac_group_handle_t gh)
+{
+ aggr_grp_t *grp = arg;
+ aggr_pseudo_rx_group_t *rx_group;
+
+ ASSERT(rtype == MAC_RING_TYPE_RX && index == 0);
+ rx_group = &grp->lg_rx_group;
+ rx_group->arg_gh = gh;
+ rx_group->arg_grp = grp;
+
+ infop->mgi_driver = (mac_group_driver_t)rx_group;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = aggr_addmac;
+ infop->mgi_remmac = aggr_remmac;
+ infop->mgi_count = rx_group->arg_ring_cnt;
+}
+
+/*
+ * Callback function for the MAC layer to register all rings.
+ */
+static void
+aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
+ const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+ aggr_grp_t *grp = arg;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX: {
+ aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group;
+ aggr_pseudo_rx_ring_t *rx_ring;
+ mac_intr_t aggr_mac_intr;
+
+ ASSERT(rg_index == 0);
+
+ ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
+ rx_ring = rx_group->arg_rings + index;
+ rx_ring->arr_rh = rh;
+
+ /*
+ * Entrypoint to enable interrupt (disable poll) and
+ * disable interrupt (enable poll).
+ */
+ aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
+ aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
+ aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
+
+ infop->mri_driver = (mac_ring_driver_t)rx_ring;
+ infop->mri_start = aggr_pseudo_start_ring;
+ infop->mri_stop = aggr_pseudo_stop_ring;
+
+ infop->mri_intr = aggr_mac_intr;
+ infop->mri_poll = aggr_rx_poll;
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+static mblk_t *
+aggr_rx_poll(void *arg, int bytes_to_pickup)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = arg;
+ aggr_port_t *port = rr_ring->arr_port;
+ aggr_grp_t *grp = port->lp_grp;
+ mblk_t *mp_chain, *mp, **mpp;
+
+ mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
+
+ if (grp->lg_lacp_mode == AGGR_LACP_OFF)
+ return (mp_chain);
+
+ mpp = &mp_chain;
+ while ((mp = *mpp) != NULL) {
+ if (MBLKL(mp) >= sizeof (struct ether_header)) {
+ struct ether_header *ehp;
+
+ ehp = (struct ether_header *)mp->b_rptr;
+ if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
+ *mpp = mp->b_next;
+ mp->b_next = NULL;
+ aggr_recv_lacp(port,
+ (mac_resource_handle_t)rr_ring, mp);
+ continue;
+ }
+ }
+
+ if (!port->lp_collector_enabled) {
+ *mpp = mp->b_next;
+ mp->b_next = NULL;
+ freemsg(mp);
+ continue;
+ }
+ mpp = &mp->b_next;
+ }
+ return (mp_chain);
+}
+
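
aggr_rx_poll() above walks the polled mblk chain with a pointer-to-pointer so LACP control frames can be unlinked in place and handed to aggr_recv_lacp() while data frames stay on the chain. The unlink-in-place idiom, modeled on a plain singly linked list with invented names:

#include <stdio.h>
#include <stdlib.h>

/* Minimal message node, standing in for an mblk_t chained by b_next. */
typedef struct msg {
	int		type;		/* 1 == "control" in this model */
	struct msg	*next;
} msg_t;

/*
 * Walk the chain with a pointer-to-pointer so nodes can be unlinked
 * in place: control messages are stolen and handled separately, data
 * is left on the chain -- the shape of the aggr_rx_poll() loop.
 */
static msg_t *
filter_chain(msg_t *chain)
{
	msg_t **mpp = &chain, *mp;

	while ((mp = *mpp) != NULL) {
		if (mp->type == 1) {		/* steal the control msg */
			*mpp = mp->next;
			mp->next = NULL;
			printf("handled control msg\n");
			free(mp);
			continue;
		}
		mpp = &mp->next;		/* keep data on the chain */
	}
	return (chain);
}

static msg_t *
mknode(int type, msg_t *next)
{
	msg_t *m = malloc(sizeof (*m));

	m->type = type;
	m->next = next;
	return (m);
}

int
main(void)
{
	msg_t *chain = mknode(0, mknode(1, mknode(0, NULL)));
	int n = 0;

	for (msg_t *m = filter_chain(chain); m != NULL; ) {
		msg_t *next = m->next;

		n++;
		free(m);
		m = next;
	}
	printf("%d data msgs kept\n", n);	/* 2 */
	return (0);
}
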
static int
-aggr_grp_multicst(aggr_grp_t *grp, boolean_t add, const uint8_t *addrp)
+aggr_addmac(void *arg, const uint8_t *mac_addr)
{
- lg_mcst_addr_t *mcst, **ppmcst;
+ aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
+ aggr_unicst_addr_t *addr, **pprev;
+ aggr_grp_t *grp = rx_group->arg_grp;
+ aggr_port_t *port, *p;
+ mac_perim_handle_t mph;
+ int err = 0;
+
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+
+ if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
+ mac_perim_exit(mph);
+ return (0);
+ }
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ /*
+ * Insert this mac address into the list of mac addresses owned by
+ * the aggregation pseudo group.
+ */
+ pprev = &rx_group->arg_macaddr;
+ while ((addr = *pprev) != NULL) {
+ if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
+ mac_perim_exit(mph);
+ return (EEXIST);
+ }
+ pprev = &addr->aua_next;
+ }
+ addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
+ bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
+ addr->aua_next = NULL;
+ *pprev = addr;
- for (ppmcst = &(grp->lg_mcst_list); (mcst = *ppmcst) != NULL;
- ppmcst = &(mcst->lg_mcst_nextp)) {
- if (bcmp(mcst->lg_mcst_addr, addrp, MAXMACADDRLEN) == 0)
+ for (port = grp->lg_ports; port != NULL; port = port->lp_next)
+ if ((err = aggr_port_addmac(port, mac_addr)) != 0)
break;
+
+ if (err != 0) {
+ for (p = grp->lg_ports; p != port; p = p->lp_next)
+ aggr_port_remmac(p, mac_addr);
+
+ *pprev = NULL;
+ kmem_free(addr, sizeof (aggr_unicst_addr_t));
}
- if (add) {
- if (mcst != NULL)
- return (0);
- mcst = kmem_zalloc(sizeof (lg_mcst_addr_t), KM_NOSLEEP);
- if (mcst == NULL)
- return (ENOMEM);
- bcopy(addrp, mcst->lg_mcst_addr, MAXMACADDRLEN);
- *ppmcst = mcst;
- } else {
- if (mcst == NULL)
- return (ENOENT);
- *ppmcst = mcst->lg_mcst_nextp;
- kmem_free(mcst, sizeof (lg_mcst_addr_t));
+ mac_perim_exit(mph);
+ return (err);
+}
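
Note the error handling in aggr_addmac() above: the address is added to every port in turn, and on the first failure a second walk removes it from only the ports that had already succeeded, leaving the group unchanged. A generic sketch of this all-or-nothing pattern (item_t and the two callbacks are hypothetical, not driver code):

    /*
     * Illustrative all-or-nothing apply with rollback, mirroring the
     * aggr_port_addmac()/aggr_port_remmac() pairing used above.
     */
    typedef struct item item_t;

    static int
    apply_all(item_t **items, int n, int (*do_op)(item_t *),
        void (*undo_op)(item_t *))
    {
            int i, err = 0;

            for (i = 0; i < n; i++) {
                    if ((err = do_op(items[i])) != 0)
                            break;
            }
            if (err != 0) {
                    /* undo only the entries that already succeeded */
                    while (--i >= 0)
                            undo_op(items[i]);
            }
            return (err);
    }
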
+
+static int
+aggr_remmac(void *arg, const uint8_t *mac_addr)
+{
+ aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
+ aggr_unicst_addr_t *addr, **pprev;
+ aggr_grp_t *grp = rx_group->arg_grp;
+ aggr_port_t *port;
+ mac_perim_handle_t mph;
+ int err = 0;
+
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+
+ if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
+ mac_perim_exit(mph);
+ return (0);
}
- return (0);
+
+ /*
+ * Find this mac address in the list of mac addresses owned by the
+ * aggregation pseudo group.
+ */
+ pprev = &rx_group->arg_macaddr;
+ while ((addr = *pprev) != NULL) {
+ if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
+ pprev = &addr->aua_next;
+ continue;
+ }
+ break;
+ }
+ if (addr == NULL) {
+ mac_perim_exit(mph);
+ return (EINVAL);
+ }
+
+ for (port = grp->lg_ports; port != NULL; port = port->lp_next)
+ aggr_port_remmac(port, mac_addr);
+
+ *pprev = addr->aua_next;
+ kmem_free(addr, sizeof (aggr_unicst_addr_t));
+
+ mac_perim_exit(mph);
+ return (err);
}
/*
@@ -1438,17 +2024,14 @@ void
aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
{
aggr_grp_t *grp = port->lp_grp;
- lg_mcst_addr_t *mcst;
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
if (!port->lp_started)
return;
- for (mcst = grp->lg_mcst_list; mcst != NULL;
- mcst = mcst->lg_mcst_nextp)
- (void) aggr_port_multicst(port, add, mcst->lg_mcst_addr);
+ mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
}
static int
@@ -1456,19 +2039,18 @@ aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
{
aggr_grp_t *grp = arg;
aggr_port_t *port = NULL;
+ mac_perim_handle_t mph;
int err = 0, cerr;
- rw_enter(&grp->lg_lock, RW_WRITER);
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
continue;
cerr = aggr_port_multicst(port, add, addrp);
- if (cerr == 0)
- (void) aggr_grp_multicst(grp, add, addrp);
if (cerr != 0 && err == 0)
err = cerr;
}
- rw_exit(&grp->lg_lock);
+ mac_perim_exit(mph);
return (err);
}
@@ -1476,16 +2058,14 @@ static int
aggr_m_unicst(void *arg, const uint8_t *macaddr)
{
aggr_grp_t *grp = arg;
- int rc;
+ mac_perim_handle_t mph;
+ int err;
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
- rc = aggr_grp_modify(0, grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
0, 0);
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
-
- return (rc);
+ mac_perim_exit(mph);
+ return (err);
}
/*
@@ -1498,11 +2078,10 @@ aggr_grp_capab_set(aggr_grp_t *grp)
uint32_t cksum;
aggr_port_t *port;
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ ASSERT(grp->lg_mh == NULL);
ASSERT(grp->lg_ports != NULL);
grp->lg_hcksum_txflags = (uint32_t)-1;
- grp->lg_gldv3_polling = B_TRUE;
grp->lg_zcopy = B_TRUE;
grp->lg_vlan = B_TRUE;
@@ -1516,9 +2095,6 @@ aggr_grp_capab_set(aggr_grp_t *grp)
grp->lg_zcopy &=
!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
-
- grp->lg_gldv3_polling &=
- mac_capab_get(port->lp_mh, MAC_CAPAB_POLL, NULL);
}
}
@@ -1551,11 +2127,6 @@ aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
return (B_FALSE);
}
- if (mac_capab_get(port->lp_mh, MAC_CAPAB_POLL, NULL) !=
- grp->lg_gldv3_polling) {
- return (B_FALSE);
- }
-
return (B_TRUE);
}
@@ -1568,7 +2139,7 @@ aggr_grp_max_sdu(aggr_grp_t *grp)
uint_t max_sdu = (uint_t)-1;
aggr_port_t *port;
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ ASSERT(grp->lg_mh == NULL);
ASSERT(grp->lg_ports != NULL);
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
@@ -1605,7 +2176,7 @@ aggr_grp_max_margin(aggr_grp_t *grp)
uint32_t margin = UINT32_MAX;
aggr_port_t *port;
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ ASSERT(grp->lg_mh == NULL);
ASSERT(grp->lg_ports != NULL);
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
diff --git a/usr/src/uts/common/io/aggr/aggr_lacp.c b/usr/src/uts/common/io/aggr/aggr_lacp.c
index 09330f8df1..0916533c48 100644
--- a/usr/src/uts/common/io/aggr/aggr_lacp.c
+++ b/usr/src/uts/common/io/aggr/aggr_lacp.c
@@ -29,8 +29,10 @@
#include <sys/types.h>
#include <sys/sysmacros.h>
+#include <sys/callb.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
+#include <sys/disp.h>
#include <sys/list.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
@@ -97,8 +99,8 @@ typedef struct lacp_sel_ports {
static lacp_sel_ports_t *sel_ports = NULL;
static kmutex_t lacp_sel_lock;
-static void periodic_timer_pop_locked(aggr_port_t *);
static void periodic_timer_pop(void *);
+static void periodic_timer_pop_handler(aggr_port_t *);
static void lacp_xmit_sm(aggr_port_t *);
static void lacp_periodic_sm(aggr_port_t *);
static void fill_lacp_pdu(aggr_port_t *, lacp_t *);
@@ -108,16 +110,18 @@ static void lacp_off(aggr_port_t *);
static boolean_t valid_lacp_pdu(aggr_port_t *, lacp_t *);
static void lacp_receive_sm(aggr_port_t *, lacp_t *);
static void aggr_set_coll_dist(aggr_port_t *, boolean_t);
-static void aggr_set_coll_dist_locked(aggr_port_t *, boolean_t);
static void start_wait_while_timer(aggr_port_t *);
static void stop_wait_while_timer(aggr_port_t *);
static void lacp_reset_port(aggr_port_t *);
static void stop_current_while_timer(aggr_port_t *);
static void current_while_timer_pop(void *);
+static void current_while_timer_pop_handler(aggr_port_t *);
static void update_default_selected(aggr_port_t *);
static boolean_t update_selected(aggr_port_t *, lacp_t *);
static boolean_t lacp_sel_ports_add(aggr_port_t *);
static void lacp_sel_ports_del(aggr_port_t *);
+static void wait_while_timer_pop(void *);
+static void wait_while_timer_pop_handler(aggr_port_t *);
void
aggr_lacp_init(void)
@@ -132,13 +136,96 @@ aggr_lacp_fini(void)
}
/*
+ * The following functions are used for handling LACP timers.
+ *
+ * Note that we cannot enter the aggr's mac perimeter from the timeout
+ * handler routine, as that may deadlock with the untimeout() call, which
+ * is usually made with the mac perimeter held. Instead, a
+ * lacp_timer_lock mutex is introduced to protect a bitwise flag
+ * (lacp_timer_bits). The flag is set/cleared by the timeout()/stop_timer()
+ * routines and is checked by a dedicated thread that executes the real
+ * timeout operation.
+ */
+static void
+aggr_port_timer_thread(void *arg)
+{
+ aggr_port_t *port = arg;
+ aggr_lacp_port_t *pl = &port->lp_lacp;
+ aggr_grp_t *grp = port->lp_grp;
+ uint32_t lacp_timer_bits;
+ mac_perim_handle_t mph;
+ callb_cpr_t cprinfo;
+
+ CALLB_CPR_INIT(&cprinfo, &pl->lacp_timer_lock, callb_generic_cpr,
+ "aggr_port_timer_thread");
+
+ mutex_enter(&pl->lacp_timer_lock);
+
+ for (;;) {
+
+ if ((lacp_timer_bits = pl->lacp_timer_bits) == 0) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&pl->lacp_timer_cv, &pl->lacp_timer_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &pl->lacp_timer_lock);
+ continue;
+ }
+ pl->lacp_timer_bits = 0;
+
+ if (lacp_timer_bits & LACP_THREAD_EXIT)
+ break;
+
+ if (lacp_timer_bits & LACP_PERIODIC_TIMEOUT)
+ pl->periodic_timer.id = 0;
+ if (lacp_timer_bits & LACP_WAIT_WHILE_TIMEOUT)
+ pl->wait_while_timer.id = 0;
+ if (lacp_timer_bits & LACP_CURRENT_WHILE_TIMEOUT)
+ pl->current_while_timer.id = 0;
+
+ mutex_exit(&pl->lacp_timer_lock);
+
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ if (port->lp_closing) {
+ mac_perim_exit(mph);
+ mutex_enter(&pl->lacp_timer_lock);
+ break;
+ }
+
+ if (lacp_timer_bits & LACP_PERIODIC_TIMEOUT)
+ periodic_timer_pop_handler(port);
+ if (lacp_timer_bits & LACP_WAIT_WHILE_TIMEOUT)
+ wait_while_timer_pop_handler(port);
+ if (lacp_timer_bits & LACP_CURRENT_WHILE_TIMEOUT)
+ current_while_timer_pop_handler(port);
+ mac_perim_exit(mph);
+
+ mutex_enter(&pl->lacp_timer_lock);
+ if (pl->lacp_timer_bits & LACP_THREAD_EXIT)
+ break;
+ }
+
+ pl->lacp_timer_bits = 0;
+ pl->lacp_timer_thread = NULL;
+ cv_broadcast(&pl->lacp_timer_cv);
+
+ /* CALLB_CPR_EXIT drops the lock */
+ CALLB_CPR_EXIT(&cprinfo);
+
+ /*
+ * Release the reference on the grp so that aggr_grp_delete() can call
+ * mac_unregister() safely.
+ */
+ aggr_grp_port_rele(port);
+ thread_exit();
+}
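
aggr_port_timer_thread() is the consumer half of the deferred-timeout scheme described in the comment above: the timeout() handlers merely set a bit under lacp_timer_lock and broadcast, and this thread performs the real work inside the mac perimeter. Reduced to its essentials, the pattern looks roughly like this (all names are generic stand-ins, illustrative only):

    /*
     * Skeleton of the deferred-timeout pattern: handlers post a bit
     * and wake the worker; the worker does the real work.
     */
    static kmutex_t tmr_lock;
    static kcondvar_t tmr_cv;
    static uint32_t tmr_bits;           /* pending timer events */
    static boolean_t tmr_exit;

    static void
    tmr_pop(void *arg)                  /* runs in timeout() context */
    {
            mutex_enter(&tmr_lock);
            tmr_bits |= (uint32_t)(uintptr_t)arg;   /* which timer fired */
            cv_broadcast(&tmr_cv);
            mutex_exit(&tmr_lock);
    }

    static void
    tmr_worker(void *arg)
    {
            uint32_t bits;

            mutex_enter(&tmr_lock);
            while (!tmr_exit) {
                    if ((bits = tmr_bits) == 0) {
                            cv_wait(&tmr_cv, &tmr_lock);
                            continue;
                    }
                    tmr_bits = 0;
                    mutex_exit(&tmr_lock);
                    /* enter the mac perimeter and service 'bits' here */
                    mutex_enter(&tmr_lock);
            }
            mutex_exit(&tmr_lock);
            thread_exit();
    }
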
+
+/*
* Set the port LACP state to SELECTED. Returns B_FALSE if the operation
* could not be performed due to a memory allocation error, B_TRUE otherwise.
*/
static boolean_t
lacp_port_select(aggr_port_t *portp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
if (!lacp_sel_ports_add(portp))
return (B_FALSE);
@@ -152,7 +239,9 @@ lacp_port_select(aggr_port_t *portp)
static void
lacp_port_unselect(aggr_port_t *portp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ aggr_grp_t *grp = portp->lp_grp;
+
+ ASSERT((grp->lg_mh == NULL) || MAC_PERIM_HELD(grp->lg_mh));
lacp_sel_ports_del(portp);
portp->lp_lacp.sm.selected = AGGR_UNSELECTED;
@@ -180,9 +269,8 @@ aggr_lacp_init_port(aggr_port_t *portp)
aggr_grp_t *aggrp = portp->lp_grp;
aggr_lacp_port_t *pl = &portp->lp_lacp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(aggrp));
- ASSERT(RW_LOCK_HELD(&aggrp->lg_lock));
- ASSERT(RW_LOCK_HELD(&portp->lp_lock));
+ ASSERT(aggrp->lg_mh == NULL || MAC_PERIM_HELD(aggrp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(portp->lp_mh));
/* actor port # */
pl->ActorPortNumber = portp->lp_portid;
@@ -251,6 +339,25 @@ aggr_lacp_init_port(aggr_port_t *portp)
pl->wait_while_timer.id = 0;
pl->wait_while_timer.val = AGGREGATE_WAIT_TIME;
+
+ pl->lacp_timer_bits = 0;
+
+ mutex_init(&pl->lacp_timer_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&pl->lacp_timer_cv, NULL, CV_DRIVER, NULL);
+
+ pl->lacp_timer_thread = thread_create(NULL, 0, aggr_port_timer_thread,
+ portp, 0, &p0, TS_RUN, minclsyspri);
+
+ /*
+ * Hold a reference on both the grp and the port; these references
+ * are released when the thread exits.
+ *
+ * The reference on the port is used for aggr_port_delete() to
+ * continue without waiting for the thread to exit; the reference
+ * on the grp is used for aggr_grp_delete() to wait for the thread
+ * to exit before calling mac_unregister().
+ */
+ aggr_grp_port_hold(portp);
}
/*
@@ -264,7 +371,7 @@ lacp_reset_port(aggr_port_t *portp)
{
aggr_lacp_port_t *pl = &portp->lp_lacp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
pl->NTT = B_FALSE; /* need to transmit */
@@ -306,8 +413,8 @@ lacp_reset_port(aggr_port_t *portp)
static void
aggr_lacp_mcast_on(aggr_port_t *port)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(port->lp_grp));
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
+ ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
return;
@@ -319,8 +426,8 @@ aggr_lacp_mcast_on(aggr_port_t *port)
static void
aggr_lacp_mcast_off(aggr_port_t *port)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(port->lp_grp));
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
+ ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
return;
@@ -332,26 +439,35 @@ aggr_lacp_mcast_off(aggr_port_t *port)
static void
start_periodic_timer(aggr_port_t *portp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
+
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
- if (portp->lp_lacp.periodic_timer.id == 0) {
- portp->lp_lacp.periodic_timer.id =
- timeout(periodic_timer_pop, portp,
+ mutex_enter(&pl->lacp_timer_lock);
+ if (pl->periodic_timer.id == 0) {
+ pl->periodic_timer.id = timeout(periodic_timer_pop, portp,
drv_usectohz(1000000 * portp->lp_lacp.periodic_timer.val));
}
+ mutex_exit(&pl->lacp_timer_lock);
}
static void
stop_periodic_timer(aggr_port_t *portp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
+ timeout_id_t id;
+
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
- if (portp->lp_lacp.periodic_timer.id != 0) {
- AGGR_LACP_UNLOCK(portp->lp_grp);
- (void) untimeout(portp->lp_lacp.periodic_timer.id);
- AGGR_LACP_LOCK_WRITER(portp->lp_grp);
- portp->lp_lacp.periodic_timer.id = 0;
+ mutex_enter(&pl->lacp_timer_lock);
+ if ((id = pl->periodic_timer.id) != 0) {
+ pl->lacp_timer_bits &= ~LACP_PERIODIC_TIMEOUT;
+ pl->periodic_timer.id = 0;
}
+ mutex_exit(&pl->lacp_timer_lock);
+
+ if (id != 0)
+ (void) untimeout(id);
}
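
stop_periodic_timer() (and the wait_while/current_while variants later in this file) all follow the same idiom: capture and clear the timeout id under lacp_timer_lock, drop the lock, and only then call untimeout(), so the cancellation can never block against a handler waiting on the same lock. A standalone sketch of the idiom (the parameters are illustrative):

    /*
     * Standalone sketch of the stop-timer idiom: never call
     * untimeout() while holding the lock the handler takes.
     */
    static void
    stop_timer(kmutex_t *lockp, timeout_id_t *idp, uint32_t *bitsp,
        uint32_t bit)
    {
            timeout_id_t id;

            mutex_enter(lockp);
            if ((id = *idp) != 0) {
                    *bitsp &= ~bit;         /* discard any pending pop */
                    *idp = 0;
            }
            mutex_exit(lockp);

            if (id != 0)
                    (void) untimeout(id);
    }
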
/*
@@ -360,13 +476,29 @@ stop_periodic_timer(aggr_port_t *portp)
* LACPDU. We then set the periodic state and let
* the periodic state machine restart the timer.
*/
+static void
+periodic_timer_pop(void *data)
+{
+ aggr_port_t *portp = data;
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
+
+ mutex_enter(&pl->lacp_timer_lock);
+ pl->lacp_timer_bits |= LACP_PERIODIC_TIMEOUT;
+ cv_broadcast(&pl->lacp_timer_cv);
+ mutex_exit(&pl->lacp_timer_lock);
+}
+/*
+ * When the timer pops, we arrive here to clear out the LACPDU count
+ * as well as to transmit an LACPDU. We then set the periodic state
+ * and let the periodic state machine restart the timer.
+ */
static void
-periodic_timer_pop_locked(aggr_port_t *portp)
+periodic_timer_pop_handler(aggr_port_t *portp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
- portp->lp_lacp.periodic_timer.id = NULL;
portp->lp_lacp_stats.LACPDUsTx = 0;
/* current timestamp */
@@ -390,19 +522,6 @@ periodic_timer_pop_locked(aggr_port_t *portp)
lacp_periodic_sm(portp);
}
-static void
-periodic_timer_pop(void *data)
-{
- aggr_port_t *portp = data;
-
- if (portp->lp_closing)
- return;
-
- AGGR_LACP_LOCK_WRITER(portp->lp_grp);
- periodic_timer_pop_locked(portp);
- AGGR_LACP_UNLOCK(portp->lp_grp);
-}
-
/*
* Invoked from:
* - startup upon aggregation
@@ -417,7 +536,7 @@ lacp_periodic_sm(aggr_port_t *portp)
lacp_periodic_state_t oldstate = portp->lp_lacp.sm.periodic_state;
aggr_lacp_port_t *pl = &portp->lp_lacp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
/* LACP_OFF state not in specification so check here. */
if (!pl->sm.lacp_on) {
@@ -465,7 +584,7 @@ lacp_periodic_sm(aggr_port_t *portp)
* a LACPDU.
*/
stop_periodic_timer(portp);
- periodic_timer_pop_locked(portp);
+ periodic_timer_pop_handler(portp);
}
/* Rearm timer with value provided by partner */
@@ -483,9 +602,8 @@ lacp_xmit_sm(aggr_port_t *portp)
size_t len;
mblk_t *mp;
hrtime_t now, elapsed;
- const mac_txinfo_t *mtp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
/* LACP_OFF state not in specification so check here. */
if (!pl->sm.lacp_on || !pl->NTT || !portp->lp_started)
@@ -534,12 +652,7 @@ lacp_xmit_sm(aggr_port_t *portp)
fill_lacp_pdu(portp,
(lacp_t *)(mp->b_rptr + sizeof (struct ether_header)));
- /*
- * Store the transmit info pointer locally in case it changes between
- * loading mt_fn and mt_arg.
- */
- mtp = portp->lp_txinfo;
- mtp->mt_fn(mtp->mt_arg, mp);
+ (void) mac_tx(portp->lp_mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL);
pl->NTT = B_FALSE;
portp->lp_lacp_stats.LACPDUsTx++;
@@ -563,15 +676,14 @@ fill_lacp_pdu(aggr_port_t *portp, lacp_t *lacp)
{
aggr_lacp_port_t *pl = &portp->lp_lacp;
aggr_grp_t *aggrp = portp->lp_grp;
+ mac_perim_handle_t pmph;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(aggrp->lg_mh));
+ mac_perim_enter_by_mh(portp->lp_mh, &pmph);
lacp->subtype = LACP_SUBTYPE;
lacp->version = LACP_VERSION;
- rw_enter(&aggrp->lg_lock, RW_READER);
- rw_enter(&portp->lp_lock, RW_READER);
-
/*
* Actor Information
*/
@@ -609,8 +721,7 @@ fill_lacp_pdu(aggr_port_t *portp, lacp_t *lacp)
lacp->tlv_terminator = TERMINATOR_TLV;
lacp->terminator_len = 0x0;
- rw_exit(&portp->lp_lock);
- rw_exit(&aggrp->lg_lock);
+ mac_perim_exit(pmph);
}
/*
@@ -633,7 +744,7 @@ lacp_mux_sm(aggr_port_t *portp)
aggr_lacp_port_t *pl = &portp->lp_lacp;
lacp_mux_state_t oldstate = pl->sm.mux_state;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(aggrp));
+ ASSERT(MAC_PERIM_HELD(aggrp->lg_mh));
/* LACP_OFF state not in specification so check here. */
if (!pl->sm.lacp_on) {
@@ -788,29 +899,28 @@ again:
} /* lacp_mux_sm */
-static void
+static int
receive_marker_pdu(aggr_port_t *portp, mblk_t *mp)
{
marker_pdu_t *markerp = (marker_pdu_t *)mp->b_rptr;
- const mac_txinfo_t *mtp;
- AGGR_LACP_LOCK_WRITER(portp->lp_grp);
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
AGGR_LACP_DBG(("trunk link: (%d): MARKER PDU received:\n",
portp->lp_linkid));
/* LACP_OFF state not in specification so check here. */
if (!portp->lp_lacp.sm.lacp_on)
- goto bail;
+ return (-1);
if (MBLKL(mp) < sizeof (marker_pdu_t))
- goto bail;
+ return (-1);
if (markerp->version != MARKER_VERSION) {
AGGR_LACP_DBG(("trunk link (%d): Malformed MARKER PDU: "
"version = %d does not match s/w version %d\n",
portp->lp_linkid, markerp->version, MARKER_VERSION));
- goto bail;
+ return (-1);
}
if (markerp->tlv_marker == MARKER_RESPONSE_TLV) {
@@ -818,21 +928,21 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp)
AGGR_LACP_DBG(("trunk link (%d): MARKER RESPONSE PDU: "
" MARKER TLV = %d - We don't send out info type!\n",
portp->lp_linkid, markerp->tlv_marker));
- goto bail;
+ return (-1);
}
if (markerp->tlv_marker != MARKER_INFO_TLV) {
AGGR_LACP_DBG(("trunk link (%d): Malformed MARKER PDU: "
" MARKER TLV = %d \n", portp->lp_linkid,
markerp->tlv_marker));
- goto bail;
+ return (-1);
}
if (markerp->marker_len != MARKER_INFO_RESPONSE_LENGTH) {
AGGR_LACP_DBG(("trunk link (%d): Malformed MARKER PDU: "
" MARKER length = %d \n", portp->lp_linkid,
markerp->marker_len));
- goto bail;
+ return (-1);
}
if (markerp->requestor_port != portp->lp_lacp.PartnerOperPortNum) {
@@ -840,7 +950,7 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp)
" MARKER Port %d not equal to Partner port %d\n",
portp->lp_linkid, markerp->requestor_port,
portp->lp_lacp.PartnerOperPortNum));
- goto bail;
+ return (-1);
}
if (ether_cmp(&markerp->system_id,
@@ -848,7 +958,7 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp)
AGGR_LACP_DBG(("trunk link (%d): MARKER PDU: "
" MARKER MAC not equal to Partner MAC\n",
portp->lp_linkid));
- goto bail;
+ return (-1);
}
/*
@@ -861,23 +971,9 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp)
ASSERT(MBLKHEAD(mp) >= sizeof (struct ether_header));
mp->b_rptr -= sizeof (struct ether_header);
fill_lacp_ether(portp, (struct ether_header *)mp->b_rptr);
-
- /*
- * Store the transmit info pointer locally in case it changes between
- * loading mt_fn and mt_arg.
- */
- mtp = portp->lp_txinfo;
- AGGR_LACP_UNLOCK(portp->lp_grp);
-
- mtp->mt_fn(mtp->mt_arg, mp);
- return;
-
-bail:
- AGGR_LACP_UNLOCK(portp->lp_grp);
- freemsg(mp);
+ return (0);
}
-
/*
* Update the LACP mode (off, active, or passive) of the specified group.
*/
@@ -887,8 +983,8 @@ aggr_lacp_update_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode)
aggr_lacp_mode_t old_mode = grp->lg_lacp_mode;
aggr_port_t *port;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(!grp->lg_closing);
if (mode == old_mode)
return;
@@ -904,20 +1000,12 @@ aggr_lacp_update_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode)
/* OFF -> {PASSIVE,ACTIVE} */
/* turn OFF Collector_Distributor */
aggr_set_coll_dist(port, B_FALSE);
- rw_enter(&port->lp_lock, RW_WRITER);
lacp_on(port);
- if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
- aggr_lacp_port_attached(port);
- rw_exit(&port->lp_lock);
} else if (mode == AGGR_LACP_OFF) {
/* {PASSIVE,ACTIVE} -> OFF */
- rw_enter(&port->lp_lock, RW_WRITER);
lacp_off(port);
- rw_exit(&port->lp_lock);
- if (!grp->lg_closing) {
- /* Turn ON Collector_Distributor */
- aggr_set_coll_dist(port, B_TRUE);
- }
+ /* Turn ON Collector_Distributor */
+ aggr_set_coll_dist(port, B_TRUE);
} else {
/* PASSIVE->ACTIVE or ACTIVE->PASSIVE */
port->lp_lacp.sm.begin = B_TRUE;
@@ -928,9 +1016,6 @@ aggr_lacp_update_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode)
lacp_receive_sm(port, NULL);
lacp_mux_sm(port);
}
-
- if (grp->lg_closing)
- break;
}
}
@@ -943,8 +1028,7 @@ aggr_lacp_update_timer(aggr_grp_t *grp, aggr_lacp_timer_t timer)
{
aggr_port_t *port;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
if (timer == grp->aggr.PeriodicTimer)
return;
@@ -958,6 +1042,32 @@ aggr_lacp_update_timer(aggr_grp_t *grp, aggr_lacp_timer_t timer)
}
}
+void
+aggr_port_lacp_set_mode(aggr_grp_t *grp, aggr_port_t *port)
+{
+ aggr_lacp_mode_t mode;
+ aggr_lacp_timer_t timer;
+
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+
+ mode = grp->lg_lacp_mode;
+ timer = grp->aggr.PeriodicTimer;
+
+ port->lp_lacp.ActorAdminPortState.bit.activity =
+ port->lp_lacp.ActorOperPortState.bit.activity =
+ (mode == AGGR_LACP_ACTIVE);
+
+ port->lp_lacp.ActorAdminPortState.bit.timeout =
+ port->lp_lacp.ActorOperPortState.bit.timeout =
+ (timer == AGGR_LACP_TIMER_SHORT);
+
+ if (mode == AGGR_LACP_OFF) {
+ /* Turn ON Collector_Distributor */
+ aggr_set_coll_dist(port, B_TRUE);
+ } else { /* LACP_ACTIVE/PASSIVE */
+ lacp_on(port);
+ }
+}
/*
* Sets the initial LACP mode (off, active, passive) and LACP timer
@@ -969,30 +1079,13 @@ aggr_lacp_set_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode,
{
aggr_port_t *port;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
grp->lg_lacp_mode = mode;
grp->aggr.PeriodicTimer = timer;
- for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
- port->lp_lacp.ActorAdminPortState.bit.activity =
- port->lp_lacp.ActorOperPortState.bit.activity =
- (mode == AGGR_LACP_ACTIVE);
-
- port->lp_lacp.ActorAdminPortState.bit.timeout =
- port->lp_lacp.ActorOperPortState.bit.timeout =
- (timer == AGGR_LACP_TIMER_SHORT);
-
- if (grp->lg_lacp_mode == AGGR_LACP_OFF) {
- /* Turn ON Collector_Distributor */
- aggr_set_coll_dist(port, B_TRUE);
- } else { /* LACP_ACTIVE/PASSIVE */
- rw_enter(&port->lp_lock, RW_WRITER);
- lacp_on(port);
- rw_exit(&port->lp_lock);
- }
- }
+ for (port = grp->lg_ports; port != NULL; port = port->lp_next)
+ aggr_port_lacp_set_mode(grp, port);
}
/*
@@ -1148,7 +1241,7 @@ lacp_selection_logic(aggr_port_t *portp)
boolean_t reset_mac = B_FALSE;
aggr_lacp_port_t *pl = &portp->lp_lacp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(aggrp));
+ ASSERT(MAC_PERIM_HELD(aggrp->lg_mh));
/* LACP_OFF state not in specification so check here. */
if (!pl->sm.lacp_on) {
@@ -1377,47 +1470,65 @@ static void
wait_while_timer_pop(void *data)
{
aggr_port_t *portp = data;
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
- if (portp->lp_closing)
- return;
+ mutex_enter(&pl->lacp_timer_lock);
+ pl->lacp_timer_bits |= LACP_WAIT_WHILE_TIMEOUT;
+ cv_broadcast(&pl->lacp_timer_cv);
+ mutex_exit(&pl->lacp_timer_lock);
+}
- AGGR_LACP_LOCK_WRITER(portp->lp_grp);
+/*
+ * wait_while_timer_pop_handler - When the timer pops, we arrive here to
+ * set ready_n and trigger the selection logic.
+ */
+static void
+wait_while_timer_pop_handler(aggr_port_t *portp)
+{
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
AGGR_LACP_DBG(("trunk link:(%d): wait_while_timer pop \n",
portp->lp_linkid));
- portp->lp_lacp.wait_while_timer.id = 0;
portp->lp_lacp.sm.ready_n = B_TRUE;
lacp_selection_logic(portp);
- AGGR_LACP_UNLOCK(portp->lp_grp);
}
static void
start_wait_while_timer(aggr_port_t *portp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
+
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
- if (portp->lp_lacp.wait_while_timer.id == 0) {
- portp->lp_lacp.wait_while_timer.id =
+ mutex_enter(&pl->lacp_timer_lock);
+ if (pl->wait_while_timer.id == 0) {
+ pl->wait_while_timer.id =
timeout(wait_while_timer_pop, portp,
drv_usectohz(1000000 *
portp->lp_lacp.wait_while_timer.val));
}
+ mutex_exit(&pl->lacp_timer_lock);
}
static void
-stop_wait_while_timer(portp)
-aggr_port_t *portp;
+stop_wait_while_timer(aggr_port_t *portp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
+ timeout_id_t id;
+
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
- if (portp->lp_lacp.wait_while_timer.id != 0) {
- AGGR_LACP_UNLOCK(portp->lp_grp);
- (void) untimeout(portp->lp_lacp.wait_while_timer.id);
- AGGR_LACP_LOCK_WRITER(portp->lp_grp);
- portp->lp_lacp.wait_while_timer.id = 0;
+ mutex_enter(&pl->lacp_timer_lock);
+ if ((id = pl->wait_while_timer.id) != 0) {
+ pl->lacp_timer_bits &= ~LACP_WAIT_WHILE_TIMEOUT;
+ pl->wait_while_timer.id = 0;
}
+ mutex_exit(&pl->lacp_timer_lock);
+
+ if (id != 0)
+ (void) untimeout(id);
}
/*
@@ -1432,52 +1543,30 @@ aggr_lacp_port_attached(aggr_port_t *portp)
aggr_grp_t *grp = portp->lp_grp;
aggr_lacp_port_t *pl = &portp->lp_lacp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(portp->lp_mh));
ASSERT(portp->lp_state == AGGR_PORT_STATE_ATTACHED);
- ASSERT(RW_WRITE_HELD(&portp->lp_lock));
AGGR_LACP_DBG(("aggr_lacp_port_attached: port %d\n",
portp->lp_linkid));
portp->lp_lacp.sm.port_enabled = B_TRUE; /* link on */
- if (grp->lg_lacp_mode == AGGR_LACP_OFF) {
- pl->ActorAdminPortState.bit.activity =
- pl->ActorOperPortState.bit.activity = B_FALSE;
-
- /* Turn ON Collector_Distributor */
- aggr_set_coll_dist_locked(portp, B_TRUE);
-
+ if (grp->lg_lacp_mode == AGGR_LACP_OFF)
return;
- }
-
- pl->ActorAdminPortState.bit.activity =
- pl->ActorOperPortState.bit.activity =
- (grp->lg_lacp_mode == AGGR_LACP_ACTIVE);
-
- pl->ActorAdminPortState.bit.timeout =
- pl->ActorOperPortState.bit.timeout =
- (grp->aggr.PeriodicTimer == AGGR_LACP_TIMER_SHORT);
pl->sm.lacp_enabled = B_TRUE;
pl->ActorOperPortState.bit.aggregation = B_TRUE;
pl->sm.begin = B_TRUE;
- if (!pl->sm.lacp_on) {
- /* Turn OFF Collector_Distributor */
- aggr_set_coll_dist_locked(portp, B_FALSE);
-
- lacp_on(portp);
- } else {
- lacp_receive_sm(portp, NULL);
- lacp_mux_sm(portp);
+ lacp_receive_sm(portp, NULL);
+ lacp_mux_sm(portp);
- /* Enable Multicast Slow Protocol address */
- aggr_lacp_mcast_on(portp);
+ /* Enable Multicast Slow Protocol address */
+ aggr_lacp_mcast_on(portp);
- /* periodic_sm is started up from the receive machine */
- lacp_selection_logic(portp);
- }
+ /* periodic_sm is started up from the receive machine */
+ lacp_selection_logic(portp);
}
/*
@@ -1489,8 +1578,8 @@ aggr_lacp_port_detached(aggr_port_t *portp)
{
aggr_grp_t *grp = portp->lp_grp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&portp->lp_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(portp->lp_mh));
AGGR_LACP_DBG(("aggr_lacp_port_detached: port %d\n",
portp->lp_linkid));
@@ -1500,34 +1589,35 @@ aggr_lacp_port_detached(aggr_port_t *portp)
if (grp->lg_lacp_mode == AGGR_LACP_OFF)
return;
- /* Disable Slow Protocol PDUs */
- lacp_off(portp);
-}
-
+ portp->lp_lacp.sm.lacp_enabled = B_FALSE;
+ lacp_selection_logic(portp);
+ lacp_mux_sm(portp);
+ lacp_periodic_sm(portp);
-/*
- * Invoked after the outbound port selection policy has been changed.
- */
-void
-aggr_lacp_policy_changed(aggr_grp_t *grp)
-{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ /*
+ * Disable Slow Protocol Timers.
+ */
+ stop_periodic_timer(portp);
+ stop_current_while_timer(portp);
+ stop_wait_while_timer(portp);
- /* suspend transmission for CollectorMaxDelay time */
- delay(grp->aggr.CollectorMaxDelay * 10);
+ /* Disable Multicast Slow Protocol address */
+ aggr_lacp_mcast_off(portp);
+ aggr_set_coll_dist(portp, B_FALSE);
}
-
/*
* Enable Slow Protocol LACP and Marker PDUs.
*/
static void
lacp_on(aggr_port_t *portp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
- ASSERT(RW_WRITE_HELD(&portp->lp_grp->lg_lock));
- ASSERT(RW_WRITE_HELD(&portp->lp_lock));
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
+ mac_perim_handle_t mph;
+
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
+
+ mac_perim_enter_by_mh(portp->lp_mh, &mph);
/*
* Reset the state machines and Partner operational
@@ -1535,67 +1625,69 @@ lacp_on(aggr_port_t *portp)
* our link state.
*/
lacp_reset_port(portp);
- portp->lp_lacp.sm.lacp_on = B_TRUE;
+ pl->sm.lacp_on = B_TRUE;
AGGR_LACP_DBG(("lacp_on:(%d): \n", portp->lp_linkid));
+ if (portp->lp_state == AGGR_PORT_STATE_ATTACHED) {
+ pl->sm.port_enabled = B_TRUE;
+ pl->sm.lacp_enabled = B_TRUE;
+ pl->ActorOperPortState.bit.aggregation = B_TRUE;
+ }
+
lacp_receive_sm(portp, NULL);
lacp_mux_sm(portp);
- if (portp->lp_state != AGGR_PORT_STATE_ATTACHED)
- return;
-
- /* Enable Multicast Slow Protocol address */
- aggr_lacp_mcast_on(portp);
+ if (portp->lp_state == AGGR_PORT_STATE_ATTACHED) {
+ /* Enable Multicast Slow Protocol address */
+ aggr_lacp_mcast_on(portp);
- /* periodic_sm is started up from the receive machine */
- lacp_selection_logic(portp);
+ /* periodic_sm is started up from the receive machine */
+ lacp_selection_logic(portp);
+ }
+done:
+ mac_perim_exit(mph);
} /* lacp_on */
-
/* Disable Slow Protocol LACP and Marker PDUs */
static void
lacp_off(aggr_port_t *portp)
{
- aggr_grp_t *grp = portp->lp_grp;
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
+ mac_perim_handle_t mph;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
- ASSERT(RW_WRITE_HELD(&portp->lp_lock));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
+ mac_perim_enter_by_mh(portp->lp_mh, &mph);
- portp->lp_lacp.sm.lacp_on = B_FALSE;
+ pl->sm.lacp_on = B_FALSE;
AGGR_LACP_DBG(("lacp_off:(%d): \n", portp->lp_linkid));
- /*
- * Disable Slow Protocol Timers. We must temporarily release
- * the group and port locks to avoid deadlocks. Make sure that
- * neither the port nor group are closing after re-acquiring
- * their locks.
- */
- rw_exit(&portp->lp_lock);
- rw_exit(&grp->lg_lock);
-
- stop_periodic_timer(portp);
- stop_current_while_timer(portp);
- stop_wait_while_timer(portp);
+ if (portp->lp_state == AGGR_PORT_STATE_ATTACHED) {
+ /*
+ * Disable Slow Protocol Timers.
+ */
+ stop_periodic_timer(portp);
+ stop_current_while_timer(portp);
+ stop_wait_while_timer(portp);
- rw_enter(&grp->lg_lock, RW_WRITER);
- rw_enter(&portp->lp_lock, RW_WRITER);
+ /* Disable Multicast Slow Protocol address */
+ aggr_lacp_mcast_off(portp);
- if (!portp->lp_closing && !grp->lg_closing) {
- lacp_mux_sm(portp);
- lacp_periodic_sm(portp);
- lacp_selection_logic(portp);
+ pl->sm.port_enabled = B_FALSE;
+ pl->sm.lacp_enabled = B_FALSE;
+ pl->ActorOperPortState.bit.aggregation = B_FALSE;
}
- /* Turn OFF Collector_Distributor */
- aggr_set_coll_dist_locked(portp, B_FALSE);
+ lacp_mux_sm(portp);
+ lacp_periodic_sm(portp);
+ lacp_selection_logic(portp);
- /* Disable Multicast Slow Protocol address */
- aggr_lacp_mcast_off(portp);
+ /* Turn OFF Collector_Distributor */
+ aggr_set_coll_dist(portp, B_FALSE);
lacp_reset_port(portp);
+ mac_perim_exit(mph);
}
@@ -1627,61 +1719,71 @@ valid_lacp_pdu(aggr_port_t *portp, lacp_t *lacp)
static void
start_current_while_timer(aggr_port_t *portp, uint_t time)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
-
- if (portp->lp_lacp.current_while_timer.id == 0) {
- if (time > 0) {
- portp->lp_lacp.current_while_timer.val = time;
- } else if (portp->lp_lacp.ActorOperPortState.bit.timeout) {
- portp->lp_lacp.current_while_timer.val =
- SHORT_TIMEOUT_TIME;
- } else {
- portp->lp_lacp.current_while_timer.val =
- LONG_TIMEOUT_TIME;
- }
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
+
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
+
+ mutex_enter(&pl->lacp_timer_lock);
+ if (pl->current_while_timer.id == 0) {
+ if (time > 0)
+ pl->current_while_timer.val = time;
+ else if (pl->ActorOperPortState.bit.timeout)
+ pl->current_while_timer.val = SHORT_TIMEOUT_TIME;
+ else
+ pl->current_while_timer.val = LONG_TIMEOUT_TIME;
- portp->lp_lacp.current_while_timer.id =
+ pl->current_while_timer.id =
timeout(current_while_timer_pop, portp,
drv_usectohz((clock_t)1000000 *
(clock_t)portp->lp_lacp.current_while_timer.val));
}
+ mutex_exit(&pl->lacp_timer_lock);
}
static void
stop_current_while_timer(aggr_port_t *portp)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
+ timeout_id_t id;
+
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
- if (portp->lp_lacp.current_while_timer.id != 0) {
- AGGR_LACP_UNLOCK(portp->lp_grp);
- (void) untimeout(portp->lp_lacp.current_while_timer.id);
- AGGR_LACP_LOCK_WRITER(portp->lp_grp);
- portp->lp_lacp.current_while_timer.id = 0;
+ mutex_enter(&pl->lacp_timer_lock);
+ if ((id = pl->current_while_timer.id) != 0) {
+ pl->lacp_timer_bits &= ~LACP_CURRENT_WHILE_TIMEOUT;
+ pl->current_while_timer.id = 0;
}
-}
+ mutex_exit(&pl->lacp_timer_lock);
+ if (id != 0)
+ (void) untimeout(id);
+}
static void
current_while_timer_pop(void *data)
{
aggr_port_t *portp = (aggr_port_t *)data;
+ aggr_lacp_port_t *pl = &portp->lp_lacp;
- if (portp->lp_closing)
- return;
+ mutex_enter(&pl->lacp_timer_lock);
+ pl->lacp_timer_bits |= LACP_CURRENT_WHILE_TIMEOUT;
+ cv_broadcast(&pl->lacp_timer_cv);
+ mutex_exit(&pl->lacp_timer_lock);
+}
- AGGR_LACP_LOCK_WRITER(portp->lp_grp);
+static void
+current_while_timer_pop_handler(aggr_port_t *portp)
+{
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
AGGR_LACP_DBG(("trunk link:(%d): current_while_timer "
"pop id=%p\n", portp->lp_linkid,
portp->lp_lacp.current_while_timer.id));
- portp->lp_lacp.current_while_timer.id = 0;
lacp_receive_sm(portp, NULL);
- AGGR_LACP_UNLOCK(portp->lp_grp);
}
-
/*
* record_Default - Simply copies over administrative values
* to the partner operational values, and sets our state to indicate we
@@ -1692,7 +1794,7 @@ record_Default(aggr_port_t *portp)
{
aggr_lacp_port_t *pl = &portp->lp_lacp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
pl->PartnerOperPortNum = pl->PartnerAdminPortNum;
pl->PartnerOperPortPriority = pl->PartnerAdminPortPriority;
@@ -1713,7 +1815,7 @@ record_PDU(aggr_port_t *portp, lacp_t *lacp)
aggr_lacp_port_t *pl = &portp->lp_lacp;
uint8_t save_sync;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(aggrp->lg_mh));
/*
* Partner Information
@@ -1780,7 +1882,7 @@ update_selected(aggr_port_t *portp, lacp_t *lacp)
{
aggr_lacp_port_t *pl = &portp->lp_lacp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
if ((pl->PartnerOperPortNum != ntohs(lacp->actor_info.port)) ||
(pl->PartnerOperPortPriority !=
@@ -1814,7 +1916,7 @@ update_default_selected(aggr_port_t *portp)
{
aggr_lacp_port_t *pl = &portp->lp_lacp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
if ((pl->PartnerAdminPortNum != pl->PartnerOperPortNum) ||
(pl->PartnerOperPortPriority != pl->PartnerAdminPortPriority) ||
@@ -1844,7 +1946,7 @@ update_NTT(aggr_port_t *portp, lacp_t *lacp)
aggr_grp_t *aggrp = portp->lp_grp;
aggr_lacp_port_t *pl = &portp->lp_lacp;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(aggrp->lg_mh));
if ((pl->ActorPortNumber != ntohs(lacp->partner_info.port)) ||
(pl->ActorPortPriority !=
@@ -1890,7 +1992,7 @@ lacp_receive_sm(aggr_port_t *portp, lacp_t *lacp)
aggr_lacp_port_t *pl = &portp->lp_lacp;
lacp_receive_state_t oldstate = pl->sm.receive_state;
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
+ ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh));
/* LACP_OFF state not in specification so check here. */
if (!pl->sm.lacp_on)
@@ -1918,7 +2020,6 @@ lacp_receive_sm(aggr_port_t *portp, lacp_t *lacp)
pl->sm.receive_state = LACP_DEFAULTED;
}
-
if (!((lacp && (oldstate == LACP_CURRENT) &&
(pl->sm.receive_state == LACP_CURRENT)))) {
AGGR_LACP_DBG(("lacp_receive_sm(%d):%s--->%s\n",
@@ -2068,28 +2169,19 @@ lacp_receive_sm(aggr_port_t *portp, lacp_t *lacp)
static void
aggr_set_coll_dist(aggr_port_t *portp, boolean_t enable)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
- rw_enter(&portp->lp_lock, RW_WRITER);
- aggr_set_coll_dist_locked(portp, enable);
- rw_exit(&portp->lp_lock);
-}
-
-static void
-aggr_set_coll_dist_locked(aggr_port_t *portp, boolean_t enable)
-{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp));
- ASSERT(RW_WRITE_HELD(&portp->lp_lock));
+ mac_perim_handle_t mph;
AGGR_LACP_DBG(("AGGR_SET_COLL_DIST_TYPE: (%d) %s\n",
portp->lp_linkid, enable ? "ENABLED" : "DISABLED"));
+ mac_perim_enter_by_mh(portp->lp_mh, &mph);
if (!enable) {
/*
* Turn OFF Collector_Distributor.
*/
portp->lp_collector_enabled = B_FALSE;
aggr_send_port_disable(portp);
- return;
+ goto done;
}
/*
@@ -2102,14 +2194,21 @@ aggr_set_coll_dist_locked(aggr_port_t *portp, boolean_t enable)
portp->lp_collector_enabled = B_TRUE;
aggr_send_port_enable(portp);
}
+
+done:
+ mac_perim_exit(mph);
}
/*
- * Process a received Marker or LACPDU.
+ * Because the LACP packet processing needs to enter the aggr's mac perimeter
+ * and that could deadlock with the thread in which the grp/port is being
+ * deleted, we defer the packet processing to a worker thread. Here
+ * we only enqueue the received Marker or LACPDU for later processing.
*/
void
-aggr_lacp_rx(aggr_port_t *portp, mblk_t *dmp)
+aggr_lacp_rx_enqueue(aggr_port_t *portp, mblk_t *dmp)
{
+ aggr_grp_t *grp = portp->lp_grp;
lacp_t *lacp;
dmp->b_rptr += sizeof (struct ether_header);
@@ -2120,34 +2219,143 @@ aggr_lacp_rx(aggr_port_t *portp, mblk_t *dmp)
}
lacp = (lacp_t *)dmp->b_rptr;
+ if (lacp->subtype != LACP_SUBTYPE && lacp->subtype != MARKER_SUBTYPE) {
+ AGGR_LACP_DBG(("aggr_lacp_rx_enqueue: (%d): "
+ "Unknown Slow Protocol type %d\n",
+ portp->lp_linkid, lacp->subtype));
+ freemsg(dmp);
+ return;
+ }
+
+ mutex_enter(&grp->lg_lacp_lock);
+
+ /*
+ * If lg_lacp_done is set, this aggregation is in the process of
+ * being deleted; return directly.
+ */
+ if (grp->lg_lacp_done) {
+ mutex_exit(&grp->lg_lacp_lock);
+ freemsg(dmp);
+ return;
+ }
+
+ if (grp->lg_lacp_tail == NULL) {
+ grp->lg_lacp_head = grp->lg_lacp_tail = dmp;
+ } else {
+ grp->lg_lacp_tail->b_next = dmp;
+ grp->lg_lacp_tail = dmp;
+ }
+
+ /*
+ * Hold a reference on the port so that the port won't be freed if it
+ * is removed from the aggr. The b_prev field is borrowed to save the
+ * port pointer.
+ */
+ AGGR_PORT_REFHOLD(portp);
+ dmp->b_prev = (mblk_t *)portp;
+ cv_broadcast(&grp->lg_lacp_cv);
+ mutex_exit(&grp->lg_lacp_lock);
+}
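
aggr_lacp_rx_enqueue() keeps both a head and a tail pointer so appends are O(1); the port pointer rides along in the otherwise-unused b_prev field, pinned by the AGGR_PORT_REFHOLD() taken just before the broadcast. The append itself, as a generic sketch (the helper is hypothetical, not driver code):

    /*
     * Generic O(1) tail append for a b_next-linked mblk queue,
     * matching the head/tail manipulation above.
     */
    static void
    queue_append(mblk_t **headp, mblk_t **tailp, mblk_t *mp)
    {
            mp->b_next = NULL;
            if (*tailp == NULL) {
                    *headp = *tailp = mp;
            } else {
                    (*tailp)->b_next = mp;
                    *tailp = mp;
            }
    }
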
+static void
+aggr_lacp_rx(mblk_t *dmp)
+{
+ aggr_port_t *portp = (aggr_port_t *)dmp->b_prev;
+ mac_perim_handle_t mph;
+ lacp_t *lacp;
+
+ dmp->b_prev = NULL;
+
+ mac_perim_enter_by_mh(portp->lp_grp->lg_mh, &mph);
+ if (portp->lp_closing)
+ goto done;
+
+ lacp = (lacp_t *)dmp->b_rptr;
switch (lacp->subtype) {
case LACP_SUBTYPE:
AGGR_LACP_DBG(("aggr_lacp_rx:(%d): LACPDU received.\n",
portp->lp_linkid));
- AGGR_LACP_LOCK_WRITER(portp->lp_grp);
if (!portp->lp_lacp.sm.lacp_on) {
- AGGR_LACP_UNLOCK(portp->lp_grp);
break;
}
lacp_receive_sm(portp, lacp);
- AGGR_LACP_UNLOCK(portp->lp_grp);
break;
case MARKER_SUBTYPE:
AGGR_LACP_DBG(("aggr_lacp_rx:(%d): Marker Packet received.\n",
portp->lp_linkid));
- (void) receive_marker_pdu(portp, dmp);
- break;
+ if (receive_marker_pdu(portp, dmp) != 0)
+ break;
- default:
- AGGR_LACP_DBG(("aggr_lacp_rx: (%d): "
- "Unknown Slow Protocol type %d\n",
- portp->lp_linkid, lacp->subtype));
- break;
+ (void) mac_tx(portp->lp_mch, dmp, 0, MAC_DROP_ON_NO_DESC, NULL);
+ mac_perim_exit(mph);
+ AGGR_PORT_REFRELE(portp);
+ return;
}
+done:
+ mac_perim_exit(mph);
+ AGGR_PORT_REFRELE(portp);
freemsg(dmp);
}
+
+void
+aggr_lacp_rx_thread(void *arg)
+{
+ callb_cpr_t cprinfo;
+ aggr_grp_t *grp = (aggr_grp_t *)arg;
+ aggr_port_t *port;
+ mblk_t *mp, *nextmp;
+
+ CALLB_CPR_INIT(&cprinfo, &grp->lg_lacp_lock, callb_generic_cpr,
+ "aggr_lacp_rx_thread");
+
+ mutex_enter(&grp->lg_lacp_lock);
+
+ /*
+ * Keep processing queued LACP messages until the grp is deleted.
+ */
+ while (!grp->lg_lacp_done) {
+ if ((mp = grp->lg_lacp_head) == NULL) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &grp->lg_lacp_lock);
+ continue;
+ }
+
+ grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
+ mutex_exit(&grp->lg_lacp_lock);
+
+ while (mp != NULL) {
+ nextmp = mp->b_next;
+ mp->b_next = NULL;
+ aggr_lacp_rx(mp);
+ mp = nextmp;
+ }
+ mutex_enter(&grp->lg_lacp_lock);
+ }
+
+ /*
+ * The grp is being destroyed; simply free all of the LACP messages
+ * left in the queue that did not get a chance to be processed.
+ * We cannot use freemsgchain() here since we need to clear the
+ * b_prev field.
+ */
+ while ((mp = grp->lg_lacp_head) != NULL) {
+ port = (aggr_port_t *)mp->b_prev;
+ AGGR_PORT_REFRELE(port);
+ nextmp = mp->b_next;
+ mp->b_next = NULL;
+ mp->b_prev = NULL;
+ freemsg(mp);
+ mp = nextmp;
+ }
+
+ grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
+ grp->lg_lacp_rx_thread = NULL;
+ cv_broadcast(&grp->lg_lacp_cv);
+ CALLB_CPR_EXIT(&cprinfo);
+ thread_exit();
+}
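
The worker thread above detaches the entire pending chain in one step while holding lg_lacp_lock and processes it with the lock dropped, so the receive path is only ever blocked for the duration of a pointer swap. A self-contained sketch of that drain-and-process loop (mblk_queue_t and process_one() are hypothetical):

    /*
     * Sketch of the drain-and-process loop: swap the whole queue out
     * under the lock, process it with the lock dropped.
     */
    typedef struct {
            kmutex_t        q_lock;
            kcondvar_t      q_cv;
            mblk_t          *q_head;
            mblk_t          *q_tail;
            boolean_t       q_done;
    } mblk_queue_t;

    static void
    drain_loop(mblk_queue_t *q, void (*process_one)(mblk_t *))
    {
            mblk_t *mp, *next;

            mutex_enter(&q->q_lock);
            while (!q->q_done) {
                    if ((mp = q->q_head) == NULL) {
                            cv_wait(&q->q_cv, &q->q_lock);
                            continue;
                    }
                    q->q_head = q->q_tail = NULL;   /* detach whole chain */
                    mutex_exit(&q->q_lock);
                    while (mp != NULL) {            /* process unlocked */
                            next = mp->b_next;
                            mp->b_next = NULL;
                            process_one(mp);
                            mp = next;
                    }
                    mutex_enter(&q->q_lock);
            }
            mutex_exit(&q->q_lock);
    }
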
diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c
index cad61f559f..a84c4a5c2a 100644
--- a/usr/src/uts/common/io/aggr/aggr_port.c
+++ b/usr/src/uts/common/io/aggr/aggr_port.c
@@ -46,6 +46,7 @@
#include <sys/stat.h>
#include <sys/sdt.h>
#include <sys/dlpi.h>
+#include <sys/dls.h>
#include <sys/aggr.h>
#include <sys/aggr_impl.h>
@@ -58,11 +59,7 @@ static void aggr_port_notify_cb(void *, mac_notify_type_t);
static int
aggr_port_constructor(void *buf, void *arg, int kmflag)
{
- aggr_port_t *port = buf;
-
bzero(buf, sizeof (aggr_port_t));
- rw_init(&port->lp_lock, NULL, RW_DRIVER, NULL);
-
return (0);
}
@@ -72,7 +69,10 @@ aggr_port_destructor(void *buf, void *arg)
{
aggr_port_t *port = buf;
- rw_destroy(&port->lp_lock);
+ ASSERT(port->lp_mnh == NULL);
+ ASSERT(port->lp_mphp == NULL);
+ ASSERT(!port->lp_grp_added);
+ ASSERT(port->lp_hwgh == NULL);
}
void
@@ -103,31 +103,37 @@ aggr_port_fini(void)
id_space_destroy(aggr_portids);
}
-mac_resource_handle_t
-aggr_port_resource_add(void *arg, mac_resource_t *mrp)
-{
- aggr_port_t *port = (aggr_port_t *)arg;
- aggr_grp_t *grp = port->lp_grp;
-
- return (mac_resource_add(grp->lg_mh, mrp));
-}
-
+/* ARGSUSED */
void
aggr_port_init_callbacks(aggr_port_t *port)
{
/* add the port's receive callback */
- port->lp_mnh = mac_notify_add(port->lp_mh, aggr_port_notify_cb,
- (void *)port);
-
- /* set port's resource_add callback */
- mac_resource_set(port->lp_mh, aggr_port_resource_add, (void *)port);
+ port->lp_mnh = mac_notify_add(port->lp_mh, aggr_port_notify_cb, port);
+ /*
+ * Hold a reference on both the grp and the port; these references
+ * are released once the notification callback is torn down.
+ *
+ * The reference on the port allows aggr_port_delete() to continue
+ * without waiting for the notification callback to finish; the
+ * reference on the grp makes aggr_grp_delete() wait for it to
+ * finish before calling mac_unregister().
+ *
+ * Note that these references will be released either in
+ * aggr_port_delete() when mac_notify_remove() succeeds, or in
+ * the aggr_port_notify_cb() callback when the port is deleted
+ * (lp_closing is set).
+ */
+ aggr_grp_port_hold(port);
}
+/* ARGSUSED */
int
-aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp)
+aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force,
+ aggr_port_t **pp)
{
int err;
mac_handle_t mh;
+ mac_client_handle_t mch = NULL;
aggr_port_t *port;
uint16_t portid;
uint_t i;
@@ -135,6 +141,11 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp)
const mac_info_t *mip;
uint32_t note;
uint32_t margin;
+ char client_name[MAXNAMELEN];
+ char aggr_name[MAXNAMELEN];
+ char port_name[MAXNAMELEN];
+ mac_diag_t diag;
+ mac_unicast_handle_t mah;
*pp = NULL;
@@ -165,6 +176,20 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp)
}
}
+ if (((err = dls_mgmt_get_linkinfo(grp->lg_linkid,
+ aggr_name, NULL, NULL, NULL)) != 0) ||
+ ((err = dls_mgmt_get_linkinfo(linkid, port_name,
+ NULL, NULL, NULL)) != 0)) {
+ goto fail;
+ }
+
+ (void) snprintf(client_name, MAXNAMELEN, "%s-%s", aggr_name, port_name);
+ if ((err = mac_client_open(mh, &mch, client_name,
+ MAC_OPEN_FLAGS_IS_AGGR_PORT | MAC_OPEN_FLAGS_EXCLUSIVE |
+ MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK)) != 0) {
+ goto fail;
+ }
+
if ((portid = (uint16_t)id_alloc(aggr_portids)) == 0) {
err = ENOMEM;
goto fail;
@@ -180,10 +205,9 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp)
goto fail;
}
- if (!mac_active_set(mh)) {
+ if ((err = mac_unicast_primary_add(mch, &mah, &diag)) != 0) {
VERIFY(mac_margin_remove(mh, margin) == 0);
id_free(aggr_portids, portid);
- err = EBUSY;
goto fail;
}
@@ -192,15 +216,14 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp)
port->lp_refs = 1;
port->lp_next = NULL;
port->lp_mh = mh;
+ port->lp_mch = mch;
port->lp_mip = mip;
port->lp_linkid = linkid;
- port->lp_closing = 0;
+ port->lp_closing = B_FALSE;
+ port->lp_mah = mah;
/* get the port's original MAC address */
- mac_unicst_get(port->lp_mh, port->lp_addr);
-
- /* set port's transmit information */
- port->lp_txinfo = mac_tx_get(port->lp_mh);
+ mac_unicast_primary_get(port->lp_mh, port->lp_addr);
/* initialize state */
port->lp_state = AGGR_PORT_STATE_STANDBY;
@@ -213,6 +236,7 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp)
port->lp_no_link_update = no_link_update;
port->lp_portid = portid;
port->lp_margin = margin;
+ port->lp_prom_addr = NULL;
/*
* Save the current statistics of the port. They will be used
@@ -235,6 +259,8 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp)
return (0);
fail:
+ if (mch != NULL)
+ mac_client_close(mch, MAC_CLOSE_FLAGS_EXCLUSIVE);
mac_close(mh);
return (err);
}
@@ -242,19 +268,48 @@ fail:
void
aggr_port_delete(aggr_port_t *port)
{
+ aggr_lacp_port_t *pl = &port->lp_lacp;
+
+ ASSERT(port->lp_mphp == NULL);
+ ASSERT(!port->lp_promisc_on);
+
+ port->lp_closing = B_TRUE;
+
VERIFY(mac_margin_remove(port->lp_mh, port->lp_margin) == 0);
- mac_rx_remove_wait(port->lp_mh);
- mac_resource_set(port->lp_mh, NULL, NULL);
- mac_notify_remove(port->lp_mh, port->lp_mnh);
- mac_active_clear(port->lp_mh);
+ mac_rx_clear(port->lp_mch);
+ /*
+ * If the notification callback is already in progress and waiting for
+ * the aggr grp's mac perimeter, don't wait for it (otherwise there
+ * would be deadlock). Otherwise, if mac_notify_remove() succeeds, we
+ * can release the reference held since mac_notify_add() was called.
+ */
+ if ((port->lp_mnh != NULL) &&
+ (mac_notify_remove(port->lp_mnh, B_FALSE) == 0)) {
+ aggr_grp_port_rele(port);
+ }
+ port->lp_mnh = NULL;
+
+ /*
+ * Tell the port's LACP timer thread to exit. Note that waiting
+ * for the thread to exit may cause deadlock, since that thread may
+ * need to enter the mac perimeter which we are currently in.
+ * It is fine to continue without waiting, though, since that thread
+ * is holding a reference on the port.
+ */
+ mutex_enter(&pl->lacp_timer_lock);
+ pl->lacp_timer_bits |= LACP_THREAD_EXIT;
+ cv_broadcast(&pl->lacp_timer_cv);
+ mutex_exit(&pl->lacp_timer_lock);
/*
 * Restore the port's MAC address. Note that this is done after the
 * port's notification callback has been removed, which prevents the
 * port's MAC_NOTE_UNICST notify callback function from being called.
*/
- (void) mac_unicst_set(port->lp_mh, port->lp_addr);
+ (void) mac_unicast_primary_set(port->lp_mh, port->lp_addr);
+ (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
+ mac_client_close(port->lp_mch, MAC_CLOSE_FLAGS_EXCLUSIVE);
mac_close(port->lp_mh);
AGGR_PORT_REFRELE(port);
}
@@ -268,6 +323,8 @@ aggr_port_free(aggr_port_t *port)
port->lp_grp = NULL;
id_free(aggr_portids, port->lp_portid);
port->lp_portid = 0;
+ mutex_destroy(&port->lp_lacp.lacp_timer_lock);
+ cv_destroy(&port->lp_lacp.lacp_timer_cv);
kmem_cache_free(aggr_port_cache, port);
}
@@ -276,7 +333,7 @@ aggr_port_free(aggr_port_t *port)
* one of the constituent ports.
*/
boolean_t
-aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port, boolean_t dolock)
+aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port)
{
boolean_t do_attach = B_FALSE;
boolean_t do_detach = B_FALSE;
@@ -284,16 +341,10 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port, boolean_t dolock)
uint64_t ifspeed;
link_state_t link_state;
link_duplex_t link_duplex;
+ mac_perim_handle_t mph;
- if (dolock) {
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
- } else {
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
- }
-
- rw_enter(&port->lp_lock, RW_WRITER);
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ mac_perim_enter_by_mh(port->lp_mh, &mph);
/*
* link state change? For links that do not support link state
@@ -334,15 +385,10 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port, boolean_t dolock)
link_state_changed = aggr_grp_attach_port(grp, port);
} else if (do_detach) {
/* detach the port from the aggregation */
- link_state_changed = aggr_grp_detach_port(grp, port, B_TRUE);
+ link_state_changed = aggr_grp_detach_port(grp, port);
}
- rw_exit(&port->lp_lock);
-
- if (dolock) {
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
- }
+ mac_perim_exit(mph);
return (link_state_changed);
}
@@ -357,21 +403,20 @@ aggr_port_notify_unicst(aggr_grp_t *grp, aggr_port_t *port,
boolean_t mac_addr_changed = B_FALSE;
boolean_t link_state_changed = B_FALSE;
uint8_t mac_addr[ETHERADDRL];
+ mac_perim_handle_t mph;
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
ASSERT(mac_addr_changedp != NULL);
ASSERT(link_state_changedp != NULL);
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
-
- rw_enter(&port->lp_lock, RW_WRITER);
+ mac_perim_enter_by_mh(port->lp_mh, &mph);
/*
* If it is called when setting the MAC address to the
* aggregation group MAC address, do nothing.
*/
- mac_unicst_get(port->lp_mh, mac_addr);
+ mac_unicast_primary_get(port->lp_mh, mac_addr);
if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
- rw_exit(&port->lp_lock);
+ mac_perim_exit(mph);
goto done;
}
@@ -381,10 +426,7 @@ aggr_port_notify_unicst(aggr_grp_t *grp, aggr_port_t *port,
aggr_grp_port_mac_changed(grp, port, &mac_addr_changed,
&link_state_changed);
- rw_exit(&port->lp_lock);
-
- if (grp->lg_closing)
- goto done;
+ mac_perim_exit(mph);
/*
* If this port was used to determine the MAC address of
@@ -397,8 +439,6 @@ aggr_port_notify_unicst(aggr_grp_t *grp, aggr_port_t *port,
done:
*mac_addr_changedp = mac_addr_changed;
*link_state_changedp = link_state_changed;
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
}
/*
@@ -411,22 +451,26 @@ aggr_port_notify_cb(void *arg, mac_notify_type_t type)
aggr_port_t *port = arg;
aggr_grp_t *grp = port->lp_grp;
boolean_t mac_addr_changed, link_state_changed;
+ mac_perim_handle_t mph;
- /*
- * Do nothing if the aggregation or the port is in the deletion
- * process. Note that this is necessary to avoid deadlock.
- */
- if ((grp->lg_closing) || (port->lp_closing))
- return;
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ if (port->lp_closing) {
+ mac_perim_exit(mph);
- AGGR_PORT_REFHOLD(port);
+ /*
+ * Release the reference so it is safe for aggr to call
+ * mac_unregister() now.
+ */
+ aggr_grp_port_rele(port);
+ return;
+ }
switch (type) {
case MAC_NOTE_TX:
mac_tx_update(grp->lg_mh);
break;
case MAC_NOTE_LINK:
- if (aggr_port_notify_link(grp, port, B_TRUE))
+ if (aggr_port_notify_link(grp, port))
mac_link_update(grp->lg_mh, grp->lg_link_state);
break;
case MAC_NOTE_UNICST:
@@ -437,46 +481,34 @@ aggr_port_notify_cb(void *arg, mac_notify_type_t type)
if (link_state_changed)
mac_link_update(grp->lg_mh, grp->lg_link_state);
break;
- case MAC_NOTE_PROMISC:
- port->lp_txinfo = mac_tx_get(port->lp_mh);
- break;
default:
break;
}
- AGGR_PORT_REFRELE(port);
+ mac_perim_exit(mph);
}
int
aggr_port_start(aggr_port_t *port)
{
- int rc;
-
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
-
- if (port->lp_started)
- return (0);
-
- if ((rc = mac_start(port->lp_mh)) != 0)
- return (rc);
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
- /* update the port state */
- port->lp_started = B_TRUE;
+ if (!port->lp_started)
+ port->lp_started = B_TRUE;
- return (rc);
+ return (0);
}
void
aggr_port_stop(aggr_port_t *port)
{
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
if (!port->lp_started)
return;
- aggr_grp_multicst_port(port, B_FALSE);
-
- mac_stop(port->lp_mh);
+ if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
+ aggr_grp_multicst_port(port, B_FALSE);
/* update the port state */
port->lp_started = B_FALSE;
@@ -487,33 +519,46 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on)
{
int rc;
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
if (on == port->lp_promisc_on)
 /* already in desired promiscuous mode */
return (0);
- rc = mac_promisc_set(port->lp_mh, on, MAC_DEVPROMISC);
+ if (on) {
+ mac_rx_clear(port->lp_mch);
+ rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL,
+ aggr_recv_cb, port, &port->lp_mphp,
+ MAC_PROMISC_FLAGS_NO_TX_LOOP);
+ if (rc != 0) {
+ mac_rx_set(port->lp_mch, aggr_recv_cb, port);
+ return (rc);
+ }
+ } else {
+ rc = mac_promisc_remove(port->lp_mphp);
+ if (rc != 0)
+ return (rc);
+ port->lp_mphp = NULL;
+ mac_rx_set(port->lp_mch, aggr_recv_cb, port);
+ }
- if (rc == 0)
- port->lp_promisc_on = on;
+ port->lp_promisc_on = on;
- return (rc);
+ return (0);
}
/*
* Set the MAC address of a port.
*/
int
-aggr_port_unicst(aggr_port_t *port, uint8_t *macaddr)
+aggr_port_unicst(aggr_port_t *port)
{
- int rc;
-
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
+ aggr_grp_t *grp = port->lp_grp;
- rc = mac_unicst_set(port->lp_mh, macaddr);
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
- return (rc);
+ return (mac_unicast_primary_set(port->lp_mh, grp->lg_addr));
}
/*
@@ -524,8 +569,12 @@ aggr_port_multicst(void *arg, boolean_t add, const uint8_t *addrp)
{
aggr_port_t *port = arg;
- return (add ? mac_multicst_add(port->lp_mh, addrp) :
- mac_multicst_remove(port->lp_mh, addrp));
+ if (add) {
+ return (mac_multicast_add(port->lp_mch, addrp));
+ } else {
+ mac_multicast_remove(port->lp_mch, addrp);
+ return (0);
+ }
}
uint64_t
@@ -533,3 +582,101 @@ aggr_port_stat(aggr_port_t *port, uint_t stat)
{
return (mac_stat_get(port->lp_mh, stat));
}
+
+/*
+ * Add a non-primary unicast address to the underlying port. If the port
+ * supports an HW Rx group, try to add the address into the HW Rx group
+ * of the port first. If that fails, or if the port does not support an
+ * HW Rx group, enable the port's promiscuous mode.
+ */
+int
+aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr)
+{
+ aggr_unicst_addr_t *addr, **pprev;
+ mac_perim_handle_t pmph;
+ int err;
+
+ ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /*
+ * If the underlying port supports an HW Rx group, add the mac to
+ * its Rx group directly.
+ */
+ if ((port->lp_hwgh != NULL) &&
+ ((mac_hwgroup_addmac(port->lp_hwgh, mac_addr)) == 0)) {
+ mac_perim_exit(pmph);
+ return (0);
+ }
+
+ /*
+ * If that fails, or if the port does not support an HW Rx group,
+ * enable the port's promiscuous mode. (Note that we turn on
+ * promiscuous mode only if the port is already started.)
+ */
+ if (port->lp_started &&
+ ((err = aggr_port_promisc(port, B_TRUE)) != 0)) {
+ mac_perim_exit(pmph);
+ return (err);
+ }
+
+ /*
+ * Walk through the unicast addresses that require promiscuous mode
+ * to be enabled on this port, and add this address to the end of
+ * the list.
+ */
+ pprev = &port->lp_prom_addr;
+ while ((addr = *pprev) != NULL) {
+ ASSERT(bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0);
+ pprev = &addr->aua_next;
+ }
+ addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
+ bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
+ addr->aua_next = NULL;
+ *pprev = addr;
+ mac_perim_exit(pmph);
+ return (0);
+}
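
aggr_port_addmac() thus prefers hardware classification and degrades to promiscuous mode only when it must, remembering each address that forced the fallback so that aggr_port_remmac() can later decide whether promiscuous mode is still needed. The decision tree in outline (every name below is a hypothetical placeholder, not the aggr implementation):

    /*
     * Outline of the add-address policy: prefer the HW Rx group,
     * fall back to promiscuous mode and track the address.  All
     * names are hypothetical.
     */
    static int
    add_addr_policy(port_t *p, const uint8_t *addr)
    {
            int err;

            /* Prefer hardware classification when available. */
            if (p->hw_group != NULL && hw_group_add(p->hw_group, addr) == 0)
                    return (0);

            /* Fall back to promiscuous mode (only once started). */
            if (p->started && (err = enable_promisc(p)) != 0)
                    return (err);

            /* Remember the address so removal can re-evaluate. */
            track_promisc_addr(p, addr);
            return (0);
    }
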
+
+/*
+ * Remove a non-primary unicast address from the underlying port. This address
+ * must has been added by aggr_port_addmac(). As a result, we probably need to
+ * remove the address from the port's HW Rx group, or to disable the port's
+ * promiscous mode.
+ */
+void
+aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr)
+{
+ aggr_grp_t *grp = port->lp_grp;
+ aggr_unicst_addr_t *addr, **pprev;
+ mac_perim_handle_t pmph;
+
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /*
+ * See whether this address is in the list of addresses that require
+ * the port to be in promiscuous mode.
+ */
+ pprev = &port->lp_prom_addr;
+ while ((addr = *pprev) != NULL) {
+ if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0)
+ break;
+ pprev = &addr->aua_next;
+ }
+ if (addr != NULL) {
+ /*
+ * This unicast address put the port into promiscuous mode, so
+ * delete it from the lp_prom_addr list. If it is the last
+ * address in that list, disable promiscuous mode on the port,
+ * unless the aggregation itself is in promiscuous mode.
+ */
+ *pprev = addr->aua_next;
+ kmem_free(addr, sizeof (aggr_unicst_addr_t));
+ if (port->lp_prom_addr == NULL && !grp->lg_promisc)
+ (void) aggr_port_promisc(port, B_FALSE);
+ } else {
+ ASSERT(port->lp_hwgh != NULL);
+ (void) mac_hwgroup_remmac(port->lp_hwgh, mac_addr);
+ }
+ mac_perim_exit(pmph);
+}
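
Both aggr_port_addmac() and aggr_port_remmac() above walk lp_prom_addr through a pointer-to-pointer (pprev), which lets the tail append and the unlink avoid special cases for an empty list or a head-node match. A minimal user-space model of the same idiom; the node type and helper names here are illustrative, not from the driver:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct node {
        unsigned char   addr[6];        /* stands in for aua_addr */
        struct node     *next;          /* stands in for aua_next */
    } node_t;

    /* Append at the tail; *pprev ends up pointing at the NULL link. */
    static void
    append(node_t **pprev, const unsigned char *addr)
    {
        node_t *n;

        while (*pprev != NULL)
            pprev = &(*pprev)->next;
        n = malloc(sizeof (node_t));
        memcpy(n->addr, addr, sizeof (n->addr));
        n->next = NULL;
        *pprev = n;
    }

    /* Unlink a matching node; no separate "previous" pointer is needed. */
    static void
    unlink_addr(node_t **pprev, const unsigned char *addr)
    {
        node_t *n;

        while ((n = *pprev) != NULL) {
            if (memcmp(n->addr, addr, sizeof (n->addr)) == 0) {
                *pprev = n->next;
                free(n);
                return;
            }
            pprev = &n->next;
        }
    }

    int
    main(void)
    {
        node_t *head = NULL;
        unsigned char a[6] = { 0, 1, 2, 3, 4, 5 };
        unsigned char b[6] = { 0, 1, 2, 3, 4, 6 };

        append(&head, a);
        append(&head, b);
        unlink_addr(&head, a);
        printf("head now ends in %02x\n", head->addr[5]);  /* prints 06 */
        return (0);
    }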
diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c
index bf98e65ee3..2bdb7872e3 100644
--- a/usr/src/uts/common/io/aggr/aggr_recv.c
+++ b/usr/src/uts/common/io/aggr/aggr_recv.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* IEEE 802.3ad Link Aggregation - Receive
*
@@ -42,7 +40,18 @@
#include <sys/aggr_impl.h>
static void
-aggr_recv_lacp(aggr_port_t *port, mblk_t *mp)
+aggr_mac_rx(mac_handle_t lg_mh, mac_resource_handle_t mrh, mblk_t *mp)
+{
+ if (mrh == NULL) {
+ mac_rx(lg_mh, mrh, mp);
+ } else {
+ aggr_pseudo_rx_ring_t *ring = (aggr_pseudo_rx_ring_t *)mrh;
+ mac_rx_ring(lg_mh, ring->arr_rh, mp, ring->arr_gen);
+ }
+}
+
+void
+aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp)
{
aggr_grp_t *grp = port->lp_grp;
@@ -51,35 +60,26 @@ aggr_recv_lacp(aggr_port_t *port, mblk_t *mp)
mblk_t *nmp = copymsg(mp);
if (nmp != NULL)
- mac_rx(grp->lg_mh, NULL, nmp);
+ aggr_mac_rx(grp->lg_mh, mrh, nmp);
}
- aggr_lacp_rx(port, mp);
+ aggr_lacp_rx_enqueue(port, mp);
}
/*
* Callback function invoked by MAC service module when packets are
* made available by a MAC port.
*/
+/* ARGSUSED */
void
-aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
+aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
{
aggr_port_t *port = (aggr_port_t *)arg;
aggr_grp_t *grp = port->lp_grp;
- /*
- * If this message is looped back from the legacy devices, drop
- * it as the Nemo framework will be responsible for looping it
- * back by the mac_txloop() function.
- */
- if (mp->b_flag & MSGNOLOOP) {
- ASSERT(mp->b_next == NULL);
- freemsg(mp);
- return;
- }
-
if (grp->lg_lacp_mode == AGGR_LACP_OFF) {
- mac_rx(grp->lg_mh, mrh, mp);
+ aggr_mac_rx(grp->lg_mh, mrh, mp);
} else {
mblk_t *cmp, *last, *head;
struct ether_header *ehp;
@@ -100,10 +100,12 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
} else {
/* send up accumulated packets */
last->b_next = NULL;
- if (port->lp_collector_enabled)
- mac_rx(grp->lg_mh, mrh, head);
- else
+ if (port->lp_collector_enabled) {
+ aggr_mac_rx(grp->lg_mh, mrh,
+ head);
+ } else {
freemsgchain(head);
+ }
head = cmp->b_next;
cmp->b_next = NULL;
freemsg(cmp);
@@ -126,21 +128,23 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
ASSERT(last == NULL);
head = cmp->b_next;
cmp->b_next = NULL;
- aggr_recv_lacp(port, cmp);
+ aggr_recv_lacp(port, mrh, cmp);
cmp = head;
} else {
/* previously accumulated packets */
ASSERT(last != NULL);
/* send up non-LACP packets */
last->b_next = NULL;
- if (port->lp_collector_enabled)
- mac_rx(grp->lg_mh, mrh, head);
- else
+ if (port->lp_collector_enabled) {
+ aggr_mac_rx(grp->lg_mh, mrh,
+ head);
+ } else {
freemsgchain(head);
+ }
/* unlink and pass up LACP packets */
head = cmp->b_next;
cmp->b_next = NULL;
- aggr_recv_lacp(port, cmp);
+ aggr_recv_lacp(port, mrh, cmp);
cmp = head;
last = NULL;
}
@@ -151,7 +155,7 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
}
if (head != NULL) {
if (port->lp_collector_enabled)
- mac_rx(grp->lg_mh, mrh, head);
+ aggr_mac_rx(grp->lg_mh, mrh, head);
else
freemsgchain(head);
}
diff --git a/usr/src/uts/common/io/aggr/aggr_send.c b/usr/src/uts/common/io/aggr/aggr_send.c
index 467f8541a3..9b4ad24621 100644
--- a/usr/src/uts/common/io/aggr/aggr_send.c
+++ b/usr/src/uts/common/io/aggr/aggr_send.c
@@ -55,18 +55,19 @@
static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *);
-static uint_t
-aggr_send_port(aggr_grp_t *grp, mblk_t *mp)
+static uint64_t
+aggr_send_hash(aggr_grp_t *grp, mblk_t *mp)
{
struct ether_header *ehp;
uint16_t sap;
uint_t skip_len;
uint8_t proto;
uint32_t policy = grp->lg_tx_policy;
- uint32_t hash = 0;
+ uint64_t hash = 0;
ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
+ ASSERT(RW_READ_HELD(&grp->lg_tx_lock));
/* compute MAC hash */
@@ -207,7 +208,7 @@ again:
}
done:
- return (hash % grp->lg_ntx_ports);
+ return (hash);
}
/*
@@ -216,8 +217,7 @@ done:
void
aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
{
- ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
- ASSERT(RW_WRITE_HELD(&grp->lg_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
grp->lg_tx_policy = policy;
}
@@ -231,35 +231,63 @@ aggr_m_tx(void *arg, mblk_t *mp)
aggr_grp_t *grp = arg;
aggr_port_t *port;
mblk_t *nextp;
- const mac_txinfo_t *mtp;
+ mac_tx_cookie_t cookie;
+ uint64_t hash;
+ void *mytx_handle;
for (;;) {
- AGGR_LACP_LOCK_READER(grp)
+ rw_enter(&grp->lg_tx_lock, RW_READER);
if (grp->lg_ntx_ports == 0) {
/*
* We could have returned from aggr_m_start() before
* the ports were actually attached. Drop the chain.
*/
- AGGR_LACP_UNLOCK(grp)
+ rw_exit(&grp->lg_tx_lock);
freemsgchain(mp);
return (NULL);
}
+
nextp = mp->b_next;
mp->b_next = NULL;
- port = grp->lg_tx_ports[aggr_send_port(grp, mp)];
- ASSERT(port->lp_state == AGGR_PORT_STATE_ATTACHED);
+ hash = aggr_send_hash(grp, mp);
+ port = grp->lg_tx_ports[hash % grp->lg_ntx_ports];
/*
- * We store the transmit info pointer locally in case it
- * changes between loading mt_fn and mt_arg.
+ * Bump the active Tx ref count so that the port won't
+ * be deleted. The reference count will be dropped in mac_tx().
*/
- mtp = port->lp_txinfo;
- AGGR_LACP_UNLOCK(grp)
+ mytx_handle = mac_tx_hold(port->lp_mch);
+ rw_exit(&grp->lg_tx_lock);
- if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
- mp->b_next = nextp;
- break;
+ if (mytx_handle == NULL) {
+ /*
+ * The port is quiesced.
+ */
+ freemsg(mp);
+ } else {
+ mblk_t *ret_mp;
+
+ /*
+ * It is fine that the port state changes now.
+ * Set MAC_TX_NO_HOLD to inform mac_tx() not to bump
+ * the active Tx ref again. Use hash as the hint so as
+ * to direct traffic to different Tx rings. Note that the
+ * bit operation below is needed to get the most benefit
+ * from the mac_tx() hash algorithm.
+ */
+ hash = (hash << 24 | hash << 16 | hash);
+ hash = (hash << 32 | hash);
+ cookie = mac_tx(port->lp_mch, mp, (uintptr_t)hash,
+ MAC_TX_NO_ENQUEUE | MAC_TX_NO_HOLD, &ret_mp);
+
+ mac_tx_rele(port->lp_mch, mytx_handle);
+
+ if (cookie != NULL) {
+ ret_mp->b_next = nextp;
+ mp = ret_mp;
+ break;
+ }
}
if ((mp = nextp) == NULL)
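
The hint handed to mac_tx() above replicates the small per-packet hash across the full 64-bit word. A standalone sketch of just that bit manipulation; the sample hash value is arbitrary:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t hash = 0x5a;   /* arbitrary small hash, as from aggr_send_hash() */

        /* Replicate the hash across the 64-bit hint, as aggr_m_tx() does. */
        hash = (hash << 24 | hash << 16 | hash);
        hash = (hash << 32 | hash);

        printf("tx hint = 0x%016llx\n", (unsigned long long)hash);
        return (0);
    }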
@@ -276,6 +304,8 @@ aggr_send_port_enable(aggr_port_t *port)
{
aggr_grp_t *grp = port->lp_grp;
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+
if (port->lp_tx_enabled || (port->lp_state !=
AGGR_PORT_STATE_ATTACHED)) {
/* already enabled or port not yet attached */
@@ -285,6 +315,7 @@ aggr_send_port_enable(aggr_port_t *port)
/*
* Add to group's array of tx ports.
*/
+ rw_enter(&grp->lg_tx_lock, RW_WRITER);
if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
/* current array too small */
aggr_port_t **new_ports;
@@ -308,6 +339,7 @@ aggr_send_port_enable(aggr_port_t *port)
grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
port->lp_tx_idx = grp->lg_ntx_ports-1;
+ rw_exit(&grp->lg_tx_lock);
port->lp_tx_enabled = B_TRUE;
}
@@ -321,13 +353,15 @@ aggr_send_port_disable(aggr_port_t *port)
uint_t idx, ntx;
aggr_grp_t *grp = port->lp_grp;
- ASSERT(RW_WRITE_HELD(&port->lp_lock));
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(port->lp_mh));
if (!port->lp_tx_enabled) {
/* not yet enabled */
return;
}
+ rw_enter(&grp->lg_tx_lock, RW_WRITER);
idx = port->lp_tx_idx;
ntx = grp->lg_ntx_ports;
ASSERT(idx < ntx);
@@ -347,6 +381,7 @@ aggr_send_port_disable(aggr_port_t *port)
port->lp_tx_idx = 0;
grp->lg_ntx_ports--;
+ rw_exit(&grp->lg_tx_lock);
port->lp_tx_enabled = B_FALSE;
}
diff --git a/usr/src/uts/common/io/ath/ath_main.c b/usr/src/uts/common/io/ath/ath_main.c
index b18451e570..451f827415 100644
--- a/usr/src/uts/common/io/ath/ath_main.c
+++ b/usr/src/uts/common/io/ath/ath_main.c
@@ -132,7 +132,7 @@
#include <sys/sunddi.h>
#include <sys/pci.h>
#include <sys/errno.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <sys/list.h>
@@ -232,7 +232,6 @@ static mac_callbacks_t ath_m_callbacks = {
ath_m_multicst,
ath_m_unicst,
ath_m_tx,
- NULL, /* mc_resources; */
ath_m_ioctl,
NULL, /* mc_getcapab */
NULL,
diff --git a/usr/src/uts/common/io/bge/bge.conf b/usr/src/uts/common/io/bge/bge.conf
index 71a44f851a..edabf29ab1 100644
--- a/usr/src/uts/common/io/bge/bge.conf
+++ b/usr/src/uts/common/io/bge/bge.conf
@@ -171,6 +171,6 @@ bge-known-subsystems = 0x108e1647,
# For BCM5705, BCM5782, etc., there is only 1 receive ring and 1 send ring.
# Otherwise, there can be up to 16 receive rings and 4 send rings.
#
-bge-rx-rings = 1;
+bge-rx-rings = 16;
bge-tx-rings = 1;
diff --git a/usr/src/uts/common/io/bge/bge_chip2.c b/usr/src/uts/common/io/bge/bge_chip2.c
index 4c17aaa5a9..d91ac5f0f6 100644
--- a/usr/src/uts/common/io/bge/bge_chip2.c
+++ b/usr/src/uts/common/io/bge/bge_chip2.c
@@ -1838,29 +1838,13 @@ bge_nvmem_id(bge_t *bgep)
static void
bge_init_recv_rule(bge_t *bgep)
{
- bge_recv_rule_t *rulep;
+ bge_recv_rule_t *rulep = bgep->recv_rules;
uint32_t i;
/*
- * receive rule: direct all TCP traffic to ring RULE_MATCH_TO_RING
- * 1. to direct UDP traffic, set:
- * rulep->control = RULE_PROTO_CONTROL;
- * rulep->mask_value = RULE_UDP_MASK_VALUE;
- * 2. to direct ICMP traffic, set:
- * rulep->control = RULE_PROTO_CONTROL;
- * rulep->mask_value = RULE_ICMP_MASK_VALUE;
- * 3. to direct traffic by source ip, set:
- * rulep->control = RULE_SIP_CONTROL;
- * rulep->mask_value = RULE_SIP_MASK_VALUE;
+ * Initialize receive rule registers.
+ * Note that rules may persist across each bge_m_start/stop() call.
*/
- rulep = bgep->recv_rules;
- rulep->control = RULE_PROTO_CONTROL;
- rulep->mask_value = RULE_TCP_MASK_VALUE;
-
- /*
- * set receive rule registers
- */
- rulep = bgep->recv_rules;
for (i = 0; i < RECV_RULES_NUM_MAX; i++, rulep++) {
bge_reg_put32(bgep, RECV_RULE_MASK_REG(i), rulep->mask_value);
bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(i), rulep->control);
@@ -2871,10 +2855,11 @@ bge_chip_sync(bge_t *bgep)
}
bge_reg_put32(bgep, MAC_TX_RANDOM_BACKOFF_REG, fill);
bge_reg_put64(bgep, MAC_ADDRESS_REG(j), macaddr);
- }
- BGE_DEBUG(("bge_chip_sync($%p) setting MAC address %012llx",
- (void *)bgep, macaddr));
+ BGE_DEBUG(("bge_chip_sync($%p) "
+ "setting MAC address %012llx",
+ (void *)bgep, macaddr));
+ }
#ifdef BGE_IPMI_ASF
}
#endif
@@ -5515,14 +5500,25 @@ bge_chip_ioctl(bge_t *bgep, queue_t *wq, mblk_t *mp, struct iocblk *iocp)
/* NOTREACHED */
}
+/* ARGSUSED */
void
-bge_chip_blank(void *arg, time_t ticks, uint_t count)
+bge_chip_blank(void *arg, time_t ticks, uint_t count, int flag)
{
- bge_t *bgep = arg;
+ recv_ring_t *rrp = arg;
+ bge_t *bgep = rrp->bgep;
mutex_enter(bgep->genlock);
+ rrp->poll_flag = flag;
+#ifdef NOT_YET
+ /*
+ * XXX-Sunay: Since most Broadcom cards support only one
+ * interrupt but multiple rx rings, we can't disable the
+ * physical interrupt. This needs to be done via capability
+ * negotiation depending on the NIC.
+ */
bge_reg_put32(bgep, RCV_COALESCE_TICKS_REG, ticks);
bge_reg_put32(bgep, RCV_COALESCE_MAX_BD_REG, count);
+#endif
if (bge_check_acc_handle(bgep, bgep->io_handle) != DDI_FM_OK)
ddi_fm_service_impact(bgep->devinfo, DDI_SERVICE_UNAFFECTED);
mutex_exit(bgep->genlock);
diff --git a/usr/src/uts/common/io/bge/bge_hw.h b/usr/src/uts/common/io/bge/bge_hw.h
index 2ebdc1a7a3..1974faea88 100644
--- a/usr/src/uts/common/io/bge/bge_hw.h
+++ b/usr/src/uts/common/io/bge/bge_hw.h
@@ -858,30 +858,53 @@ extern "C" {
/*
* Receive Rules definition
*/
-#define RULE_MATCH_TO_RING 2
- /* ring that traffic will go into when recv rule matches. */
- /* value is between 1 and 16, not 0 and 15 */
-
+#define ETHERHEADER_DEST_OFFSET 0x00
#define IPHEADER_PROTO_OFFSET 0x08
#define IPHEADER_SIP_OFFSET 0x0c
+#define IPHEADER_DIP_OFFSET 0x10
+#define TCPHEADER_SPORT_OFFSET 0x00
+#define TCPHEADER_DPORT_OFFSET 0x02
+#define UDPHEADER_SPORT_OFFSET 0x00
+#define UDPHEADER_DPORT_OFFSET 0x02
+
+#define RULE_MATCH(ring) (RECV_RULE_CTL_ENABLE | RECV_RULE_CTL_OP_EQ | \
+ RECV_RULE_CTL_CLASS((ring)))
+
+#define RULE_MATCH_MASK(ring) (RULE_MATCH(ring) | RECV_RULE_CTL_MASK)
+
+#define RULE_DEST_MAC_1(ring) (RULE_MATCH(ring) | \
+ RECV_RULE_CTL_HEADER_FRAME | \
+ ETHERHEADER_DEST_OFFSET)
+
+#define RULE_DEST_MAC_2(ring) (RULE_MATCH_MASK(ring) | \
+ RECV_RULE_CTL_HEADER_FRAME | \
+ ETHERHEADER_DEST_OFFSET + 4)
+
+#define RULE_LOCAL_IP(ring) (RULE_MATCH(ring) | RECV_RULE_CTL_HEADER_IP | \
+ IPHEADER_DIP_OFFSET)
+
+#define RULE_REMOTE_IP(ring) (RULE_MATCH(ring) | RECV_RULE_CTL_HEADER_IP | \
+ IPHEADER_SIP_OFFSET)
-#define RULE_PROTO_CONTROL (RECV_RULE_CTL_ENABLE | RECV_RULE_CTL_MASK | \
- RECV_RULE_CTL_OP_EQ | \
+#define RULE_IP_PROTO(ring) (RULE_MATCH_MASK(ring) | \
RECV_RULE_CTL_HEADER_IP | \
- RECV_RULE_CTL_CLASS(RULE_MATCH_TO_RING) | \
IPHEADER_PROTO_OFFSET)
-#define RULE_TCP_MASK_VALUE 0x00ff0006
-#define RULE_UDP_MASK_VALUE 0x00ff0011
-#define RULE_ICMP_MASK_VALUE 0x00ff0001
-#define RULE_SIP_ADDR 0x0a000001
- /* ip address in 32-bit integer,such as, 0x0a000001 is "10.0.0.1" */
+#define RULE_TCP_SPORT(ring) (RULE_MATCH_MASK(ring) | \
+ RECV_RULE_CTL_HEADER_TCP | \
+ TCPHEADER_SPORT_OFFSET)
-#define RULE_SIP_CONTROL (RECV_RULE_CTL_ENABLE | RECV_RULE_CTL_OP_EQ | \
- RECV_RULE_CTL_HEADER_IP | \
- RECV_RULE_CTL_CLASS(RULE_MATCH_TO_RING) | \
- IPHEADER_SIP_OFFSET)
-#define RULE_SIP_MASK_VALUE RULE_SIP_ADDR
+#define RULE_TCP_DPORT(ring) (RULE_MATCH_MASK(ring) | \
+ RECV_RULE_CTL_HEADER_TCP | \
+ TCPHEADER_DPORT_OFFSET)
+
+#define RULE_UDP_SPORT(ring) (RULE_MATCH_MASK(ring) | \
+ RECV_RULE_CTL_HEADER_UDP | \
+ UDPHEADER_SPORT_OFFSET)
+
+#define RULE_UDP_DPORT(ring) (RULE_MATCH_MASK(ring) | \
+ RECV_RULE_CTL_HEADER_UDP | \
+ UDPHEADER_DPORT_OFFSET)
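
Each RULE_*() macro above composes a 32-bit control word from enable/op/class bits plus a header class and a byte offset. A hedged sketch of that composition; the RECV_RULE_CTL_* values below are stand-ins for illustration only (the real bit layout lives elsewhere in bge_hw.h and differs):

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in bit layout; the real RECV_RULE_CTL_* values differ. */
    #define RECV_RULE_CTL_ENABLE            0x80000000u
    #define RECV_RULE_CTL_AND               0x40000000u
    #define RECV_RULE_CTL_OP_EQ             0x00000000u
    #define RECV_RULE_CTL_HEADER_FRAME      0x00000000u
    #define RECV_RULE_CTL_CLASS(c)          (((uint32_t)(c) & 0x1f) << 8)

    #define ETHERHEADER_DEST_OFFSET         0x00

    #define RULE_MATCH(ring)        (RECV_RULE_CTL_ENABLE | \
                                        RECV_RULE_CTL_OP_EQ | \
                                        RECV_RULE_CTL_CLASS(ring))
    #define RULE_DEST_MAC_1(ring)   (RULE_MATCH(ring) | \
                                        RECV_RULE_CTL_HEADER_FRAME | \
                                        ETHERHEADER_DEST_OFFSET)

    int
    main(void)
    {
        /* First dest-MAC sub-rule for ring 2, ANDed with its sibling rule. */
        printf("control = 0x%08x\n", RULE_DEST_MAC_1(2) | RECV_RULE_CTL_AND);
        return (0);
    }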
/*
* 1000BaseX low-level access registers
@@ -1686,6 +1709,14 @@ typedef struct {
} bge_recv_rule_t;
/*
+ * This describes which sub-rule slots are used by a particular rule.
+ */
+typedef struct {
+ int start;
+ int count;
+} bge_rule_info_t;
+
+/*
* Indexes into the <buff_cons_index> array
*/
#ifdef _BIG_ENDIAN
diff --git a/usr/src/uts/common/io/bge/bge_impl.h b/usr/src/uts/common/io/bge/bge_impl.h
index 961bf14064..3d2b73f325 100644
--- a/usr/src/uts/common/io/bge/bge_impl.h
+++ b/usr/src/uts/common/io/bge/bge_impl.h
@@ -71,7 +71,7 @@ extern "C" {
#include <sys/fm/util.h>
#include <sys/fm/io/ddi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#ifdef __amd64
@@ -397,6 +397,13 @@ typedef struct buff_ring {
void *spare[4]; /* padding */
} buff_ring_t; /* 0x100 (256) bytes */
+typedef struct bge_multi_mac {
+ int naddr; /* total supported addresses */
+ int naddrfree; /* free addresses slots */
+ ether_addr_t mac_addr[MAC_ADDRESS_REGS_MAX];
+ boolean_t mac_addr_set[MAC_ADDRESS_REGS_MAX];
+} bge_multi_mac_t;
+
/*
* Software Receive (Return) Ring Control Block
* There's one of these for each receiver return ring (up to 16).
@@ -418,7 +425,6 @@ typedef struct recv_ring {
volatile uint16_t *prod_index_p; /* (const) ptr to h/w */
/* "producer index" */
/* (in status block) */
-
/*
* The rx_lock must be held when updating the h/w consumer index
* mailbox register (*chip_mbox_reg), or the s/w consumer index
@@ -428,10 +434,16 @@ typedef struct recv_ring {
/* index mailbox offset */
kmutex_t rx_lock[1]; /* serialize receive */
uint64_t rx_next; /* next slot to examine */
- mac_resource_handle_t handle; /* per ring cookie */
- /* ("producer index") */
+
+ mac_ring_handle_t ring_handle;
+ mac_group_handle_t ring_group_handle;
+ uint64_t ring_gen_num;
+ bge_rule_info_t *mac_addr_rule;
+ uint8_t mac_addr_val[ETHERADDRL];
+ int poll_flag; /* Polling flag */
} recv_ring_t; /* 0x90 (144) bytes */
+
/*
* Send packet structure
*/
@@ -528,6 +540,7 @@ typedef struct send_ring {
sw_sbd_t *sw_sbds; /* software descriptors */
uint64_t mac_resid; /* special per resource id */
+ uint64_t pushed_bytes;
} send_ring_t; /* 0x100 (256) bytes */
typedef struct {
@@ -760,6 +773,8 @@ typedef struct bge {
* Note: they're not necessarily all used.
*/
buff_ring_t buff[BGE_BUFF_RINGS_MAX]; /* 3*0x0100 */
+
+ /* may be obsoleted */
recv_ring_t recv[BGE_RECV_RINGS_MAX]; /* 16*0x0090 */
send_ring_t send[BGE_SEND_RINGS_MAX]; /* 16*0x0100 */
@@ -1158,7 +1173,8 @@ int bge_chip_sync(bge_t *bgep, boolean_t asf_keeplive);
int bge_chip_reset(bge_t *bgep, boolean_t enable_dma);
int bge_chip_sync(bge_t *bgep);
#endif
-void bge_chip_blank(void *arg, time_t ticks, uint_t count);
+void bge_chip_blank(void *arg, time_t ticks, uint_t count, int flag);
+extern mblk_t *bge_poll_ring(void *, int);
uint_t bge_chip_factotum(caddr_t arg);
void bge_chip_cyclic(void *arg);
enum ioc_reply bge_chip_ioctl(bge_t *bgep, queue_t *wq, mblk_t *mp,
@@ -1222,6 +1238,7 @@ void bge_receive(bge_t *bgep, bge_status_t *bsp);
/* bge_send.c */
mblk_t *bge_m_tx(void *arg, mblk_t *mp);
+mblk_t *bge_ring_tx(void *arg, mblk_t *mp);
void bge_recycle(bge_t *bgep, bge_status_t *bsp);
uint_t bge_send_drain(caddr_t arg);
diff --git a/usr/src/uts/common/io/bge/bge_main2.c b/usr/src/uts/common/io/bge/bge_main2.c
index fc4407214e..c8cef32365 100644
--- a/usr/src/uts/common/io/bge/bge_main2.c
+++ b/usr/src/uts/common/io/bge/bge_main2.c
@@ -26,7 +26,9 @@
#include "bge_impl.h"
#include <sys/sdt.h>
+#include <sys/mac_provider.h>
#include <sys/mac.h>
+#include <sys/mac_flow.h>
/*
* This is the string displayed by modinfo, etc.
@@ -52,6 +54,7 @@ static char default_mtu[] = "default_mtu";
static int bge_add_intrs(bge_t *, int);
static void bge_rem_intrs(bge_t *);
+static int bge_unicst_set(void *, const uint8_t *, int);
/*
* Describes the chip's DMA engine
@@ -104,16 +107,10 @@ static int bge_m_start(void *);
static void bge_m_stop(void *);
static int bge_m_promisc(void *, boolean_t);
static int bge_m_multicst(void *, boolean_t, const uint8_t *);
-static int bge_m_unicst(void *, const uint8_t *);
-static void bge_m_resources(void *);
static void bge_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t bge_m_getcapab(void *, mac_capab_t, void *);
static int bge_unicst_set(void *, const uint8_t *,
- mac_addr_slot_t);
-static int bge_m_unicst_add(void *, mac_multi_addr_t *);
-static int bge_m_unicst_remove(void *, mac_addr_slot_t);
-static int bge_m_unicst_modify(void *, mac_multi_addr_t *);
-static int bge_m_unicst_get(void *, mac_multi_addr_t *);
+ int);
static int bge_m_setprop(void *, const char *, mac_prop_id_t,
uint_t, const void *);
static int bge_m_getprop(void *, const char *, mac_prop_id_t,
@@ -123,8 +120,7 @@ static int bge_set_priv_prop(bge_t *, const char *, uint_t,
static int bge_get_priv_prop(bge_t *, const char *, uint_t,
uint_t, void *);
-#define BGE_M_CALLBACK_FLAGS\
- (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP)
+#define BGE_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP)
static mac_callbacks_t bge_m_callbacks = {
BGE_M_CALLBACK_FLAGS,
@@ -133,9 +129,8 @@ static mac_callbacks_t bge_m_callbacks = {
bge_m_stop,
bge_m_promisc,
bge_m_multicst,
- bge_m_unicst,
+ NULL,
bge_m_tx,
- bge_m_resources,
bge_m_ioctl,
bge_m_getcapab,
NULL,
@@ -152,6 +147,7 @@ mac_priv_prop_t bge_priv_prop[] = {
#define BGE_MAX_PRIV_PROPS \
(sizeof (bge_priv_prop) / sizeof (mac_priv_prop_t))
+uint8_t zero_addr[6] = {0, 0, 0, 0, 0, 0};
/*
* ========== Transmit and receive ring reinitialisation ==========
*/
@@ -590,23 +586,10 @@ bge_m_start(void *arg)
}
/*
- * bge_m_unicst() -- set the physical network address
- */
-static int
-bge_m_unicst(void *arg, const uint8_t *macaddr)
-{
- /*
- * Request to set address in
- * address slot 0, i.e., default address
- */
- return (bge_unicst_set(arg, macaddr, 0));
-}
-
-/*
* bge_unicst_set() -- set the physical network address
*/
static int
-bge_unicst_set(void *arg, const uint8_t *macaddr, mac_addr_slot_t slot)
+bge_unicst_set(void *arg, const uint8_t *macaddr, int slot)
{
bge_t *bgep = arg; /* private device info */
@@ -693,160 +676,6 @@ bge_unicst_set(void *arg, const uint8_t *macaddr, mac_addr_slot_t slot)
return (0);
}
-/*
- * The following four routines are used as callbacks for multiple MAC
- * address support:
- * - bge_m_unicst_add(void *, mac_multi_addr_t *);
- * - bge_m_unicst_remove(void *, mac_addr_slot_t);
- * - bge_m_unicst_modify(void *, mac_multi_addr_t *);
- * - bge_m_unicst_get(void *, mac_multi_addr_t *);
- */
-
-/*
- * bge_m_unicst_add() - will find an unused address slot, set the
- * address value to the one specified, reserve that slot and enable
- * the NIC to start filtering on the new MAC address.
- * address slot. Returns 0 on success.
- */
-static int
-bge_m_unicst_add(void *arg, mac_multi_addr_t *maddr)
-{
- bge_t *bgep = arg; /* private device info */
- mac_addr_slot_t slot;
- int err;
-
- if (mac_unicst_verify(bgep->mh,
- maddr->mma_addr, maddr->mma_addrlen) == B_FALSE)
- return (EINVAL);
-
- mutex_enter(bgep->genlock);
- if (bgep->unicst_addr_avail == 0) {
- /* no slots available */
- mutex_exit(bgep->genlock);
- return (ENOSPC);
- }
-
- /*
- * Primary/default address is in slot 0. The next three
- * addresses are the multiple MAC addresses. So multiple
- * MAC address 0 is in slot 1, 1 in slot 2, and so on.
- * So the first multiple MAC address resides in slot 1.
- */
- for (slot = 1; slot < bgep->unicst_addr_total; slot++) {
- if (bgep->curr_addr[slot].set == B_FALSE) {
- bgep->curr_addr[slot].set = B_TRUE;
- break;
- }
- }
-
- ASSERT(slot < bgep->unicst_addr_total);
- bgep->unicst_addr_avail--;
- mutex_exit(bgep->genlock);
- maddr->mma_slot = slot;
-
- if ((err = bge_unicst_set(bgep, maddr->mma_addr, slot)) != 0) {
- mutex_enter(bgep->genlock);
- bgep->curr_addr[slot].set = B_FALSE;
- bgep->unicst_addr_avail++;
- mutex_exit(bgep->genlock);
- }
- return (err);
-}
-
-/*
- * bge_m_unicst_remove() - removes a MAC address that was added by a
- * call to bge_m_unicst_add(). The slot number that was returned in
- * add() is passed in the call to remove the address.
- * Returns 0 on success.
- */
-static int
-bge_m_unicst_remove(void *arg, mac_addr_slot_t slot)
-{
- bge_t *bgep = arg; /* private device info */
-
- if (slot <= 0 || slot >= bgep->unicst_addr_total)
- return (EINVAL);
-
- mutex_enter(bgep->genlock);
- if (bgep->curr_addr[slot].set == B_TRUE) {
- bgep->curr_addr[slot].set = B_FALSE;
- bgep->unicst_addr_avail++;
- mutex_exit(bgep->genlock);
- /*
- * Copy the default address to the passed slot
- */
- return (bge_unicst_set(bgep, bgep->curr_addr[0].addr, slot));
- }
- mutex_exit(bgep->genlock);
- return (EINVAL);
-}
-
-/*
- * bge_m_unicst_modify() - modifies the value of an address that
- * has been added by bge_m_unicst_add(). The new address, address
- * length and the slot number that was returned in the call to add
- * should be passed to bge_m_unicst_modify(). mma_flags should be
- * set to 0. Returns 0 on success.
- */
-static int
-bge_m_unicst_modify(void *arg, mac_multi_addr_t *maddr)
-{
- bge_t *bgep = arg; /* private device info */
- mac_addr_slot_t slot;
-
- if (mac_unicst_verify(bgep->mh,
- maddr->mma_addr, maddr->mma_addrlen) == B_FALSE)
- return (EINVAL);
-
- slot = maddr->mma_slot;
-
- if (slot <= 0 || slot >= bgep->unicst_addr_total)
- return (EINVAL);
-
- mutex_enter(bgep->genlock);
- if (bgep->curr_addr[slot].set == B_TRUE) {
- mutex_exit(bgep->genlock);
- return (bge_unicst_set(bgep, maddr->mma_addr, slot));
- }
- mutex_exit(bgep->genlock);
-
- return (EINVAL);
-}
-
-/*
- * bge_m_unicst_get() - will get the MAC address and all other
- * information related to the address slot passed in mac_multi_addr_t.
- * mma_flags should be set to 0 in the call.
- * On return, mma_flags can take the following values:
- * 1) MMAC_SLOT_UNUSED
- * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR
- * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR
- * 4) MMAC_SLOT_USED
- */
-static int
-bge_m_unicst_get(void *arg, mac_multi_addr_t *maddr)
-{
- bge_t *bgep = arg; /* private device info */
- mac_addr_slot_t slot;
-
- slot = maddr->mma_slot;
-
- if (slot <= 0 || slot >= bgep->unicst_addr_total)
- return (EINVAL);
-
- mutex_enter(bgep->genlock);
- if (bgep->curr_addr[slot].set == B_TRUE) {
- ethaddr_copy(bgep->curr_addr[slot].addr,
- maddr->mma_addr);
- maddr->mma_flags = MMAC_SLOT_USED;
- } else {
- maddr->mma_flags = MMAC_SLOT_UNUSED;
- }
- mutex_exit(bgep->genlock);
-
- return (0);
-}
-
extern void bge_wake_factotum(bge_t *);
static boolean_t
@@ -1576,6 +1405,295 @@ bge_m_promisc(void *arg, boolean_t on)
return (0);
}
+/*
+ * Find the slot for the specified unicast address
+ */
+int
+bge_unicst_find(bge_t *bgep, const uint8_t *mac_addr)
+{
+ int slot;
+
+ ASSERT(mutex_owned(bgep->genlock));
+
+ for (slot = 0; slot < bgep->unicst_addr_total; slot++) {
+ if (bcmp(bgep->curr_addr[slot].addr, mac_addr, ETHERADDRL) == 0)
+ return (slot);
+ }
+
+ return (-1);
+}
+
+/*
+ * Programs the classifier to start steering packets matching 'mac_addr' to the
+ * specified ring 'arg'.
+ */
+static int
+bge_addmac(void *arg, const uint8_t *mac_addr)
+{
+ recv_ring_t *rrp = (recv_ring_t *)arg;
+ bge_t *bgep = rrp->bgep;
+ bge_recv_rule_t *rulep = bgep->recv_rules;
+ bge_rule_info_t *rinfop = NULL;
+ uint8_t ring = (uint8_t)(rrp - bgep->recv) + 1;
+ int i;
+ uint16_t tmp16;
+ uint32_t tmp32;
+ int slot;
+ int err;
+
+ mutex_enter(bgep->genlock);
+ if (bgep->unicst_addr_avail == 0) {
+ mutex_exit(bgep->genlock);
+ return (ENOSPC);
+ }
+
+ /*
+ * First, add the unicast address to an available slot.
+ */
+ slot = bge_unicst_find(bgep, mac_addr);
+ ASSERT(slot == -1);
+
+ for (slot = 0; slot < bgep->unicst_addr_total; slot++) {
+ if (!bgep->curr_addr[slot].set) {
+ bgep->curr_addr[slot].set = B_TRUE;
+ break;
+ }
+ }
+
+ ASSERT(slot < bgep->unicst_addr_total);
+ bgep->unicst_addr_avail--;
+ mutex_exit(bgep->genlock);
+
+ if ((err = bge_unicst_set(bgep, mac_addr, slot)) != 0)
+ goto fail;
+
+ /* A rule is already here. Deny this. */
+ if (rrp->mac_addr_rule != NULL) {
+ err = ether_cmp(mac_addr, rrp->mac_addr_val) ? EEXIST : EBUSY;
+ goto fail;
+ }
+
+ /*
+ * Allocate a bge_rule_info_t to keep track of which rule slots
+ * are being used.
+ */
+ rinfop = kmem_zalloc(sizeof (bge_rule_info_t), KM_NOSLEEP);
+ if (rinfop == NULL) {
+ err = ENOMEM;
+ goto fail;
+ }
+
+ /*
+ * Look for the starting slot to place the rules.
+ * The two slots we reserve must be contiguous.
+ */
+ for (i = 0; i + 1 < RECV_RULES_NUM_MAX; i++)
+ if ((rulep[i].control & RECV_RULE_CTL_ENABLE) == 0 &&
+ (rulep[i+1].control & RECV_RULE_CTL_ENABLE) == 0)
+ break;
+
+ ASSERT(i + 1 < RECV_RULES_NUM_MAX);
+
+ bcopy(mac_addr, &tmp32, sizeof (tmp32));
+ rulep[i].mask_value = ntohl(tmp32);
+ rulep[i].control = RULE_DEST_MAC_1(ring) | RECV_RULE_CTL_AND;
+ bge_reg_put32(bgep, RECV_RULE_MASK_REG(i), rulep[i].mask_value);
+ bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(i), rulep[i].control);
+
+ bcopy(mac_addr + 4, &tmp16, sizeof (tmp16));
+ rulep[i+1].mask_value = 0xffff0000 | ntohs(tmp16);
+ rulep[i+1].control = RULE_DEST_MAC_2(ring);
+ bge_reg_put32(bgep, RECV_RULE_MASK_REG(i+1), rulep[i+1].mask_value);
+ bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(i+1), rulep[i+1].control);
+ rinfop->start = i;
+ rinfop->count = 2;
+
+ rrp->mac_addr_rule = rinfop;
+ bcopy(mac_addr, rrp->mac_addr_val, ETHERADDRL);
+
+ return (0);
+
+fail:
+ /* Clear the address just set */
+ (void) bge_unicst_set(bgep, zero_addr, slot);
+ mutex_enter(bgep->genlock);
+ bgep->curr_addr[slot].set = B_FALSE;
+ bgep->unicst_addr_avail++;
+ mutex_exit(bgep->genlock);
+
+ return (err);
+}
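
The two mask_value words programmed by bge_addmac() split the 6-byte destination MAC into a 4-byte word and a 2-byte word, with the upper half of the second sub-rule masked off. A runnable user-space check of that arithmetic; the sample address is arbitrary:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>      /* ntohl(), ntohs() */

    int
    main(void)
    {
        const uint8_t mac[6] = { 0x00, 0x14, 0x4f, 0x1c, 0x2a, 0x3b };
        uint32_t tmp32;
        uint16_t tmp16;

        /* Sub-rule i matches the high four bytes of the destination MAC. */
        memcpy(&tmp32, mac, sizeof (tmp32));
        printf("rule[i]   mask_value = 0x%08x\n", ntohl(tmp32));

        /* Sub-rule i+1 matches the low two bytes; the rest is masked off. */
        memcpy(&tmp16, mac + 4, sizeof (tmp16));
        printf("rule[i+1] mask_value = 0x%08x\n", 0xffff0000u | ntohs(tmp16));
        return (0);
    }

For the address above this prints 0x00144f1c and 0xffff2a3b, matching the ntohl()/ntohs() conversions done before the registers are written.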
+
+/*
+ * Stop classifying packets matching the MAC address to the specified ring.
+ */
+static int
+bge_remmac(void *arg, const uint8_t *mac_addr)
+{
+ recv_ring_t *rrp = (recv_ring_t *)arg;
+ bge_t *bgep = rrp->bgep;
+ bge_recv_rule_t *rulep = bgep->recv_rules;
+ bge_rule_info_t *rinfop = rrp->mac_addr_rule;
+ int start;
+ int slot;
+ int err;
+
+ /*
+ * Remove the MAC address from its slot.
+ */
+ mutex_enter(bgep->genlock);
+ slot = bge_unicst_find(bgep, mac_addr);
+ if (slot == -1) {
+ mutex_exit(bgep->genlock);
+ return (EINVAL);
+ }
+
+ ASSERT(bgep->curr_addr[slot].set);
+ mutex_exit(bgep->genlock);
+
+ if ((err = bge_unicst_set(bgep, zero_addr, slot)) != 0)
+ return (err);
+
+ if (rinfop == NULL || ether_cmp(mac_addr, rrp->mac_addr_val) != 0)
+ return (EINVAL);
+
+ start = rinfop->start;
+ rulep[start].mask_value = 0;
+ rulep[start].control = 0;
+ bge_reg_put32(bgep, RECV_RULE_MASK_REG(start), rulep[start].mask_value);
+ bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(start), rulep[start].control);
+ start++;
+ rulep[start].mask_value = 0;
+ rulep[start].control = 0;
+ bge_reg_put32(bgep, RECV_RULE_MASK_REG(start), rulep[start].mask_value);
+ bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(start), rulep[start].control);
+
+ kmem_free(rinfop, sizeof (bge_rule_info_t));
+ rrp->mac_addr_rule = NULL;
+ bzero(rrp->mac_addr_val, ETHERADDRL);
+
+ mutex_enter(bgep->genlock);
+ bgep->curr_addr[slot].set = B_FALSE;
+ bgep->unicst_addr_avail++;
+ mutex_exit(bgep->genlock);
+
+ return (0);
+}
+
+static int
+bge_flag_intr_enable(mac_intr_handle_t ih)
+{
+ recv_ring_t *rrp = (recv_ring_t *)ih;
+ bge_t *bgep = rrp->bgep;
+
+ mutex_enter(bgep->genlock);
+ rrp->poll_flag = 0;
+ mutex_exit(bgep->genlock);
+
+ return (0);
+}
+
+static int
+bge_flag_intr_disable(mac_intr_handle_t ih)
+{
+ recv_ring_t *rrp = (recv_ring_t *)ih;
+ bge_t *bgep = rrp->bgep;
+
+ mutex_enter(bgep->genlock);
+ rrp->poll_flag = 1;
+ mutex_exit(bgep->genlock);
+
+ return (0);
+}
+
+static int
+bge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+{
+ recv_ring_t *rx_ring;
+
+ rx_ring = (recv_ring_t *)rh;
+ mutex_enter(rx_ring->rx_lock);
+ rx_ring->ring_gen_num = mr_gen_num;
+ mutex_exit(rx_ring->rx_lock);
+ return (0);
+}
+
+
+/*
+ * Callback function for the MAC layer to register all rings
+ * for a given ring group, denoted by rg_index.
+ */
+void
+bge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
+ const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+ bge_t *bgep = arg;
+ mac_intr_t *mintr;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX: {
+ recv_ring_t *rx_ring;
+ ASSERT(rg_index >= 0 && rg_index < MIN(bgep->chipid.rx_rings,
+ MAC_ADDRESS_REGS_MAX) && index == 0);
+
+ rx_ring = &bgep->recv[rg_index];
+ rx_ring->ring_handle = rh;
+
+ infop->mri_driver = (mac_ring_driver_t)rx_ring;
+ infop->mri_start = bge_ring_start;
+ infop->mri_stop = NULL;
+ infop->mri_poll = bge_poll_ring;
+
+ mintr = &infop->mri_intr;
+ mintr->mi_handle = (mac_intr_handle_t)rx_ring;
+ mintr->mi_enable = bge_flag_intr_enable;
+ mintr->mi_disable = bge_flag_intr_disable;
+
+ break;
+ }
+ case MAC_RING_TYPE_TX:
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+/*
+ * Fill in the infop passed as an argument with the info of the
+ * respective ring group. Each group has a single ring in it; we
+ * keep it simple and use the same internal handle for rings and
+ * groups.
+ */
+void
+bge_fill_group(void *arg, mac_ring_type_t rtype, const int rg_index,
+ mac_group_info_t *infop, mac_group_handle_t gh)
+{
+ bge_t *bgep = arg;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX: {
+ recv_ring_t *rx_ring;
+
+ ASSERT(rg_index >= 0 && rg_index < MIN(bgep->chipid.rx_rings,
+ MAC_ADDRESS_REGS_MAX));
+ rx_ring = &bgep->recv[rg_index];
+ rx_ring->ring_group_handle = gh;
+
+ infop->mgi_driver = (mac_group_driver_t)rx_ring;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = bge_addmac;
+ infop->mgi_remmac = bge_remmac;
+ infop->mgi_count = 1;
+ break;
+ }
+ case MAC_RING_TYPE_TX:
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
/*ARGSUSED*/
static boolean_t
bge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
@@ -1589,38 +1707,20 @@ bge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
*txflags = HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM;
break;
}
+ case MAC_CAPAB_RINGS: {
+ mac_capab_rings_t *cap_rings = cap_data;
- case MAC_CAPAB_POLL:
- /*
- * There's nothing for us to fill in, simply returning
- * B_TRUE stating that we support polling is sufficient.
- */
- break;
-
- case MAC_CAPAB_MULTIADDRESS: {
- multiaddress_capab_t *mmacp = cap_data;
+ /* Temporarily disable multiple tx rings. */
+ if (cap_rings->mr_type != MAC_RING_TYPE_RX)
+ return (B_FALSE);
- mutex_enter(bgep->genlock);
- /*
- * The number of MAC addresses made available by
- * this capability is one less than the total as
- * the primary address in slot 0 is counted in
- * the total.
- */
- mmacp->maddr_naddr = bgep->unicst_addr_total - 1;
- mmacp->maddr_naddrfree = bgep->unicst_addr_avail;
- /* No multiple factory addresses, set mma_flag to 0 */
- mmacp->maddr_flag = 0;
- mmacp->maddr_handle = bgep;
- mmacp->maddr_add = bge_m_unicst_add;
- mmacp->maddr_remove = bge_m_unicst_remove;
- mmacp->maddr_modify = bge_m_unicst_modify;
- mmacp->maddr_get = bge_m_unicst_get;
- mmacp->maddr_reserve = NULL;
- mutex_exit(bgep->genlock);
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_rnum = cap_rings->mr_gnum =
+ MIN(bgep->chipid.rx_rings, MAC_ADDRESS_REGS_MAX);
+ cap_rings->mr_rget = bge_fill_ring;
+ cap_rings->mr_gget = bge_fill_group;
break;
}
-
default:
return (B_FALSE);
}
@@ -1889,43 +1989,6 @@ bge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
}
}
-static void
-bge_resources_add(bge_t *bgep, time_t time, uint_t pkt_cnt)
-{
-
- recv_ring_t *rrp;
- mac_rx_fifo_t mrf;
- int ring;
-
- /*
- * Register Rx rings as resources and save mac
- * resource id for future reference
- */
- mrf.mrf_type = MAC_RX_FIFO;
- mrf.mrf_blank = bge_chip_blank;
- mrf.mrf_arg = (void *)bgep;
- mrf.mrf_normal_blank_time = time;
- mrf.mrf_normal_pkt_count = pkt_cnt;
-
- for (ring = 0; ring < bgep->chipid.rx_rings; ring++) {
- rrp = &bgep->recv[ring];
- rrp->handle = mac_resource_add(bgep->mh,
- (mac_resource_t *)&mrf);
- }
-}
-
-static void
-bge_m_resources(void *arg)
-{
- bge_t *bgep = arg;
-
- mutex_enter(bgep->genlock);
-
- bge_resources_add(bgep, bgep->chipid.rx_ticks_norm,
- bgep->chipid.rx_count_norm);
- mutex_exit(bgep->genlock);
-}
-
/*
* ========== Per-instance setup/teardown code ==========
*/
@@ -3404,29 +3467,23 @@ bge_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
* Determine whether to override the chip's own MAC address
*/
bge_find_mac_address(bgep, cidp);
- ethaddr_copy(cidp->vendor_addr.addr, bgep->curr_addr[0].addr);
- bgep->curr_addr[0].set = B_TRUE;
bgep->unicst_addr_total = MAC_ADDRESS_REGS_MAX;
- /*
- * Address available is one less than MAX
- * as primary address is not advertised
- * as a multiple MAC address.
- */
- bgep->unicst_addr_avail = MAC_ADDRESS_REGS_MAX - 1;
+ bgep->unicst_addr_avail = MAC_ADDRESS_REGS_MAX;
if ((macp = mac_alloc(MAC_VERSION)) == NULL)
goto attach_fail;
macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
macp->m_driver = bgep;
macp->m_dip = devinfo;
- macp->m_src_addr = bgep->curr_addr[0].addr;
+ macp->m_src_addr = cidp->vendor_addr.addr;
macp->m_callbacks = &bge_m_callbacks;
macp->m_min_sdu = 0;
macp->m_max_sdu = cidp->ethmax_size - sizeof (struct ether_header);
macp->m_margin = VLAN_TAGSZ;
macp->m_priv_props = bge_priv_prop;
macp->m_priv_prop_count = BGE_MAX_PRIV_PROPS;
+ macp->m_v12n = MAC_VIRT_LEVEL1;
/*
* Finally, we're ready to register ourselves with the MAC layer
diff --git a/usr/src/uts/common/io/bge/bge_recv2.c b/usr/src/uts/common/io/bge/bge_recv2.c
index 60df201711..2c8bb20f71 100644
--- a/usr/src/uts/common/io/bge/bge_recv2.c
+++ b/usr/src/uts/common/io/bge/bge_recv2.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "bge_impl.h"
#define U32TOPTR(x) ((void *)(uintptr_t)(uint32_t)(x))
@@ -274,7 +272,9 @@ error:
* the chip to indicate the packets it has accepted from the ring.
*/
static mblk_t *bge_receive_ring(bge_t *bgep, recv_ring_t *rrp);
+#ifndef DEBUG
#pragma inline(bge_receive_ring)
+#endif
static mblk_t *
bge_receive_ring(bge_t *bgep, recv_ring_t *rrp)
@@ -328,36 +328,61 @@ bge_receive_ring(bge_t *bgep, recv_ring_t *rrp)
}
/*
- * Receive all packets in all rings.
- *
- * To give priority to low-numbered rings, whenever we have received any
- * packets in any ring except 0, we restart scanning again from ring 0.
- * Thus, for example, if rings 0, 3, and 10 are carrying traffic, the
- * pattern of receives might go 0, 3, 10, 3, 0, 10, 0:
- *
- * 0 found some - receive them
- * 1..2 none found
- * 3 found some - receive them and restart scan
- * 0..9 none found
- * 10 found some - receive them and restart scan
- * 0..2 none found
- * 3 found some more - receive them and restart scan
- * 0 found some more - receive them
- * 1..9 none found
- * 10 found some more - receive them and restart scan
- * 0 found some more - receive them
- * 1..15 none found
- *
- * The routine returns only when a complete scan has been performed either
- * without finding any packets to receive or BGE_MAXPKT_RCVED packets were
- * received from ring 0 and other rings (if used) are empty.
+ * XXX: Poll a particular ring. The implementation is incomplete.
+ * Once the ring interrupts are disabled, we need to do bge_recycle()
+ * for the ring as well, and re-enable the ring interrupt automatically
+ * if the poll doesn't find any packets in the ring. We need
+ * MSI-X interrupt support for this.
*
- * Note that driver-defined locks may *NOT* be held across calls
- * to gld_recv().
- *
- * Note: the expression (BGE_RECV_RINGS_USED > 1), yields a compile-time
- * constant and allows the compiler to optimise away the outer do-loop
- * if only one receive ring is being used.
+ * The basic poll policy is that rings that are dealing with explicit
+ * flows (like TCP or some service) and are marked as such should
+ * have their own MSI-X interrupt per ring. bge_intr() should leave
+ * that interrupt disabled after an upcall. The ring is in poll mode.
+ * When a poll thread comes down and finds nothing, the MSI-X interrupt
+ * is automatically enabled. Squeue needs to deal with the race of
+ * a new interrupt firing and arriving before the poll thread returns.
+ */
+mblk_t *
+bge_poll_ring(void *arg, int bytes_to_pickup)
+{
+ recv_ring_t *rrp = arg;
+ bge_t *bgep = rrp->bgep;
+ bge_rbd_t *hw_rbd_p;
+ uint64_t slot;
+ mblk_t *head;
+ mblk_t **tail;
+ mblk_t *mp;
+ size_t sz = 0;
+
+ mutex_enter(rrp->rx_lock);
+
+ /*
+ * Sync (all) the receive ring descriptors
+ * before accepting the packets they describe
+ */
+ DMA_SYNC(rrp->desc, DDI_DMA_SYNC_FORKERNEL);
+ hw_rbd_p = DMA_VPTR(rrp->desc);
+ head = NULL;
+ tail = &head;
+ slot = rrp->rx_next;
+
+ /* Note: volatile */
+ while ((slot != *rrp->prod_index_p) && (sz <= bytes_to_pickup)) {
+ if ((mp = bge_receive_packet(bgep, &hw_rbd_p[slot])) != NULL) {
+ *tail = mp;
+ sz += msgdsize(mp);
+ tail = &mp->b_next;
+ }
+ rrp->rx_next = slot = NEXT(slot, rrp->desc.nslots);
+ }
+
+ bge_mbx_put(bgep, rrp->chip_mbx_reg, rrp->rx_next);
+ mutex_exit(rrp->rx_lock);
+ return (head);
+}
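
bge_poll_ring() drains descriptors until the producer index is reached or the byte budget is spent; because the budget is tested before each packet is taken, the chain may overshoot bytes_to_pickup by one packet. A simplified user-space model of that loop; the types and names are illustrative:

    #include <stdio.h>
    #include <stddef.h>

    typedef struct pkt {
        size_t          len;
        struct pkt      *next;  /* stands in for b_next */
    } pkt_t;

    /*
     * Drain [*slotp, prod) until the byte budget is spent. The budget is
     * checked before each pickup, so the chain may exceed it by one packet.
     */
    static pkt_t *
    poll_ring(pkt_t *ring[], int nslots, int *slotp, int prod, size_t budget)
    {
        pkt_t *head = NULL, **tail = &head;
        size_t sz = 0;
        int slot = *slotp;

        while (slot != prod && sz <= budget) {
            pkt_t *p = ring[slot];

            if (p != NULL) {
                *tail = p;
                sz += p->len;
                tail = &p->next;
            }
            slot = (slot + 1) % nslots;
        }
        *tail = NULL;
        *slotp = slot;          /* like rrp->rx_next */
        return (head);
    }

    int
    main(void)
    {
        pkt_t a = { 600, NULL }, b = { 900, NULL }, c = { 400, NULL };
        pkt_t *ring[4] = { &a, &b, &c, NULL };
        int next = 0;
        pkt_t *p;

        for (p = poll_ring(ring, 4, &next, 3, 1000); p != NULL; p = p->next)
            printf("picked %zu bytes\n", p->len);   /* 600, then 900 */
        return (0);
    }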
+
+/*
+ * Receive all packets in all rings.
*/
void bge_receive(bge_t *bgep, bge_status_t *bsp);
#pragma no_inline(bge_receive)
@@ -366,41 +391,31 @@ void
bge_receive(bge_t *bgep, bge_status_t *bsp)
{
recv_ring_t *rrp;
- uint64_t ring;
- uint64_t rx_rings = bgep->chipid.rx_rings;
+ uint64_t index;
mblk_t *mp;
-restart:
- ring = 0;
- rrp = &bgep->recv[ring];
- do {
+ for (index = 0; index < bgep->chipid.rx_rings; index++) {
+ /*
+ * Process each ring in turn.
+ */
+ rrp = &bgep->recv[index];
+
/*
* For each ring, (rrp->prod_index_p) points to the
* proper index within the status block (which has
* already been sync'd by the caller)
*/
- ASSERT(rrp->prod_index_p == RECV_INDEX_P(bsp, ring));
+ ASSERT(rrp->prod_index_p == RECV_INDEX_P(bsp, index));
- if (*rrp->prod_index_p == rrp->rx_next)
+ if (*rrp->prod_index_p == rrp->rx_next || rrp->poll_flag)
continue; /* no packets */
if (mutex_tryenter(rrp->rx_lock) == 0)
continue; /* already in process */
mp = bge_receive_ring(bgep, rrp);
mutex_exit(rrp->rx_lock);
- if (mp != NULL) {
- mac_rx(bgep->mh, rrp->handle, mp);
-
- /*
- * Restart from ring 0, if the driver is compiled
- * with multiple rings and we're not on ring 0 now
- */
- if (rx_rings > 1 && ring > 0)
- goto restart;
- }
-
- /*
- * Loop over all rings (if there *are* multiple rings)
- */
- } while (++rrp, ++ring < rx_rings);
+ if (mp != NULL)
+ mac_rx_ring(bgep->mh, rrp->ring_handle, mp,
+ rrp->ring_gen_num);
+ }
}
diff --git a/usr/src/uts/common/io/bge/bge_send.c b/usr/src/uts/common/io/bge/bge_send.c
index a8c6f16ac2..01b70fd13d 100644
--- a/usr/src/uts/common/io/bge/bge_send.c
+++ b/usr/src/uts/common/io/bge/bge_send.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "bge_impl.h"
@@ -484,11 +482,11 @@ start_tx:
mutex_exit(srp->tx_lock);
}
-static boolean_t
-bge_send(bge_t *bgep, mblk_t *mp)
+mblk_t *
+bge_ring_tx(void *arg, mblk_t *mp)
{
- uint_t ring = 0; /* use ring 0 */
- send_ring_t *srp;
+ send_ring_t *srp = arg;
+ bge_t *bgep = srp->bgep;
struct ether_vlan_header *ehp;
bge_queue_item_t *txbuf_item;
sw_txbuf_t *txbuf;
@@ -499,7 +497,6 @@ bge_send(bge_t *bgep, mblk_t *mp)
char *pbuf;
ASSERT(mp->b_next == NULL);
- srp = &bgep->send[ring];
/*
* Get a s/w tx buffer first
@@ -510,7 +507,7 @@ bge_send(bge_t *bgep, mblk_t *mp)
srp->tx_nobuf++;
bgep->tx_resched_needed = B_TRUE;
bge_send_serial(bgep, srp);
- return (B_FALSE);
+ return (mp);
}
/*
@@ -564,12 +561,23 @@ bge_send(bge_t *bgep, mblk_t *mp)
*/
bge_send_serial(bgep, srp);
+ srp->pushed_bytes += MBLKL(mp);
+
/*
* We've copied the contents, the message can be freed right away
*/
freemsg(mp);
+ return (NULL);
+}
+
+static mblk_t *
+bge_send(bge_t *bgep, mblk_t *mp)
+{
+ send_ring_t *ring;
+
+ ring = &bgep->send[0]; /* ring 0 */
- return (B_TRUE);
+ return (bge_ring_tx(ring, mp));
}
uint_t
@@ -621,7 +629,7 @@ bge_m_tx(void *arg, mblk_t *mp)
next = mp->b_next;
mp->b_next = NULL;
- if (!bge_send(bgep, mp)) {
+ if ((mp = bge_send(bgep, mp)) != NULL) {
mp->b_next = next;
break;
}
diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c
index 615006d86e..55e4d161db 100644
--- a/usr/src/uts/common/io/dld/dld_drv.c
+++ b/usr/src/uts/common/io/dld/dld_drv.c
@@ -31,14 +31,17 @@
#include <sys/mkdev.h>
#include <sys/modctl.h>
#include <sys/stat.h>
-#include <sys/vlan.h>
-#include <sys/mac.h>
#include <sys/dld_impl.h>
#include <sys/dls_impl.h>
#include <sys/softmac.h>
-#include <sys/vlan.h>
-#include <sys/policy.h>
+#include <sys/mac.h>
+#include <sys/mac_ether.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
#include <inet/common.h>
+#include <sys/policy.h>
+#include <sys/priv_names.h>
static void drv_init(void);
static int drv_fini(void);
@@ -150,6 +153,7 @@ drv_init(void)
{
drv_secobj_init();
dld_str_init();
+
/*
* Create a hash table for autopush configuration.
*/
@@ -179,7 +183,6 @@ drv_fini(void)
rw_enter(&dld_ap_hash_lock, RW_READER);
mod_hash_walk(dld_ap_hashp, drv_ap_exist, &exist);
rw_exit(&dld_ap_hash_lock);
-
if (exist)
return (EBUSY);
@@ -314,24 +317,33 @@ drv_open(dev_t *devp, int flag, int sflag, cred_t *credp)
*/
/* ARGSUSED */
static int
-drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred)
+drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
dld_ioc_attr_t *diap = karg;
dls_dl_handle_t dlh;
- dls_vlan_t *dvp;
+ dls_link_t *dlp;
int err;
+ mac_perim_handle_t mph;
if ((err = dls_devnet_hold_tmp(diap->dia_linkid, &dlh)) != 0)
return (err);
- if ((err = dls_vlan_hold(dls_devnet_mac(dlh),
- dls_devnet_vid(dlh), &dvp, B_FALSE, B_FALSE)) != 0) {
+ if ((err = mac_perim_enter_by_macname(
+ dls_devnet_mac(dlh), &mph)) != 0) {
dls_devnet_rele_tmp(dlh);
return (err);
}
- mac_sdu_get(dvp->dv_dlp->dl_mh, NULL, &diap->dia_max_sdu);
- dls_vlan_rele(dvp);
+ if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) {
+ mac_perim_exit(mph);
+ dls_devnet_rele_tmp(dlh);
+ return (err);
+ }
+
+ mac_sdu_get(dlp->dl_mh, NULL, &diap->dia_max_sdu);
+
+ dls_link_rele(dlp);
+ mac_perim_exit(mph);
dls_devnet_rele_tmp(dlh);
return (0);
@@ -342,7 +354,7 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred)
*/
/* ARGSUSED */
static int
-drv_ioc_phys_attr(void *karg, intptr_t arg, int mode, cred_t *cred)
+drv_ioc_phys_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
dld_ioc_phys_attr_t *dipp = karg;
int err;
@@ -387,64 +399,184 @@ drv_ioc_phys_attr(void *karg, intptr_t arg, int mode, cred_t *cred)
return (0);
}
+/* ARGSUSED */
+static int
+drv_ioc_hwgrpget(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ dld_ioc_hwgrpget_t *hwgrpp = karg;
+ dld_hwgrpinfo_t hwgrp, *hip;
+ mac_handle_t mh = NULL;
+ int i, err, grpnum;
+ uint_t bytes_left;
+
+ hwgrpp->dih_n_groups = 0;
+ err = mac_open_by_linkid(hwgrpp->dih_linkid, &mh);
+ if (err != 0)
+ goto done;
+
+ hip = (dld_hwgrpinfo_t *)
+ ((uchar_t *)arg + sizeof (dld_ioc_hwgrpget_t));
+ bytes_left = hwgrpp->dih_size;
+ grpnum = mac_hwgrp_num(mh);
+ for (i = 0; i < grpnum; i++) {
+ if (sizeof (dld_hwgrpinfo_t) > bytes_left) {
+ err = ENOSPC;
+ goto done;
+ }
+
+ bzero(&hwgrp, sizeof (hwgrp));
+ bcopy(mac_name(mh), hwgrp.dhi_link_name,
+ sizeof (hwgrp.dhi_link_name));
+ mac_get_hwgrp_info(mh, i, &hwgrp.dhi_grp_num,
+ &hwgrp.dhi_n_rings, &hwgrp.dhi_grp_type,
+ &hwgrp.dhi_n_clnts, hwgrp.dhi_clnts);
+ if (copyout(&hwgrp, hip, sizeof (hwgrp)) != 0) {
+ err = EFAULT;
+ goto done;
+ }
+
+ hip++;
+ bytes_left -= sizeof (dld_hwgrpinfo_t);
+ }
+
+done:
+ if (mh != NULL)
+ dld_mac_close(mh);
+ if (err == 0)
+ hwgrpp->dih_n_groups = grpnum;
+ return (err);
+}
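
The handler above streams fixed-size records into a caller-supplied buffer, decrementing bytes_left and bailing out with ENOSPC before a record would overflow. A small user-space sketch of the same bookkeeping; the record type and names are made up for illustration:

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    typedef struct rec {
        int     id;
        char    name[32];
    } rec_t;

    /*
     * Copy nrecs fixed-size records into a bufsize-byte buffer, failing
     * with ENOSPC before a record would overflow, as drv_ioc_hwgrpget()
     * does around its per-group copyout().
     */
    static int
    fill_records(rec_t *out, size_t bufsize, int nrecs)
    {
        size_t bytes_left = bufsize;
        int i;

        for (i = 0; i < nrecs; i++) {
            rec_t r;

            if (sizeof (rec_t) > bytes_left)
                return (ENOSPC);
            memset(&r, 0, sizeof (r));
            r.id = i;
            (void) snprintf(r.name, sizeof (r.name), "group%d", i);
            memcpy(out, &r, sizeof (r));    /* copyout() in the driver */
            out++;
            bytes_left -= sizeof (rec_t);
        }
        return (0);
    }

    int
    main(void)
    {
        rec_t buf[4];

        printf("fits:     %d\n", fill_records(buf, sizeof (buf), 4));
        printf("overflow: %d\n", fill_records(buf, sizeof (buf), 5));
        return (0);
    }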
+
+/* ARGSUSED */
+static int
+drv_ioc_macaddrget(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ dld_ioc_macaddrget_t *magp = karg;
+ dld_macaddrinfo_t mai, *maip;
+ mac_handle_t mh = NULL;
+ int i, err;
+ uint_t bytes_left;
+ boolean_t is_used;
+
+ magp->dig_count = 0;
+ err = mac_open_by_linkid(magp->dig_linkid, &mh);
+ if (err != 0)
+ goto done;
+
+ maip = (dld_macaddrinfo_t *)
+ ((uchar_t *)arg + sizeof (dld_ioc_macaddrget_t));
+ bytes_left = magp->dig_size;
+
+ for (i = 0; i < mac_addr_factory_num(mh) + 1; i++) {
+ if (sizeof (dld_macaddrinfo_t) > bytes_left) {
+ err = ENOSPC;
+ goto done;
+ }
+
+ bzero(&mai, sizeof (mai));
+
+ if (i == 0) {
+ /* primary MAC address */
+ mac_unicast_primary_get(mh, mai.dmi_addr);
+ mai.dmi_addrlen = mac_addr_len(mh);
+ mac_unicast_primary_info(mh, mai.dmi_client_name,
+ &is_used);
+ } else {
+ /* factory MAC address slot */
+ mac_addr_factory_value(mh, i, mai.dmi_addr,
+ &mai.dmi_addrlen, mai.dmi_client_name, &is_used);
+ }
+
+ mai.dmi_slot = i;
+ if (is_used)
+ mai.dmi_flags |= DLDIOCMACADDR_USED;
+
+ if (copyout(&mai, maip, sizeof (mai)) != 0) {
+ err = EFAULT;
+ goto done;
+ }
+
+ maip++;
+ bytes_left -= sizeof (dld_macaddrinfo_t);
+ }
+
+done:
+ if (mh != NULL)
+ dld_mac_close(mh);
+ if (err == 0)
+ magp->dig_count = mac_addr_factory_num(mh) + 1;
+ return (err);
+}
+
/*
- * DLDIOC_SETPROP
+ * DLDIOC_SET/GETPROP
*/
static int
-drv_ioc_prop_common(dld_ioc_macprop_t *dipp, intptr_t arg, boolean_t set,
+drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set,
int mode)
{
- int err = EINVAL;
- size_t dsize;
- dld_ioc_macprop_t *kdipp;
- dls_dl_handle_t dlh;
- dls_vlan_t *dvp;
- datalink_id_t linkid;
+ int err = EINVAL;
+ dls_dl_handle_t dlh = NULL;
+ dls_link_t *dlp = NULL;
+ mac_perim_handle_t mph = NULL;
mac_prop_t macprop;
- uchar_t *cp;
- struct dlautopush *dlap;
- dld_ioc_zid_t *dzp;
+ dld_ioc_macprop_t *kprop;
+ datalink_id_t linkid;
+ uint_t dsize;
+
/*
- * We only use pr_valsize from dipp, as the caller only did a
+ * We only use pr_valsize from prop, as the caller only did a
* copyin() for sizeof (dld_ioc_prop_t), which doesn't cover
* the property data. We copyin the full dld_ioc_prop_t
- * including the data into kdipp down below.
+ * including the data into kprop down below.
*/
- dsize = sizeof (dld_ioc_macprop_t) + dipp->pr_valsize - 1;
- if (dsize < dipp->pr_valsize)
+ dsize = sizeof (dld_ioc_macprop_t) + prop->pr_valsize - 1;
+ if (dsize < prop->pr_valsize)
return (EINVAL);
/*
* The property data is variable size, so we need to allocate
* a buffer for kernel use as this data was not part of the
- * dipp allocation and copyin() done by the framework.
+ * prop allocation and copyin() done by the framework.
*/
- if ((kdipp = kmem_alloc(dsize, KM_NOSLEEP)) == NULL)
+ if ((kprop = kmem_alloc(dsize, KM_NOSLEEP)) == NULL)
return (ENOMEM);
- if (ddi_copyin((void *)arg, kdipp, dsize, mode) != 0) {
+
+ if (ddi_copyin((void *)arg, kprop, dsize, mode) != 0) {
err = EFAULT;
goto done;
}
- linkid = kdipp->pr_linkid;
+ linkid = kprop->pr_linkid;
+ if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0)
+ goto done;
+
+ if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh),
+ &mph)) != 0) {
+ goto done;
+ }
- switch (dipp->pr_num) {
- case MAC_PROP_ZONE:
+ switch (kprop->pr_num) {
+ case MAC_PROP_ZONE: {
if (set) {
- dzp = (dld_ioc_zid_t *)kdipp->pr_val;
+ dld_ioc_zid_t *dzp = (dld_ioc_zid_t *)kprop->pr_val;
+
err = dls_devnet_setzid(dzp->diz_link, dzp->diz_zid);
goto done;
} else {
- kdipp->pr_perm_flags = MAC_PROP_PERM_RW;
- cp = (uchar_t *)kdipp->pr_val;
- err = dls_devnet_getzid(linkid, (zoneid_t *)cp);
+ kprop->pr_perm_flags = MAC_PROP_PERM_RW;
+ err = dls_devnet_getzid(linkid,
+ (zoneid_t *)kprop->pr_val);
goto done;
}
- case MAC_PROP_AUTOPUSH:
+ }
+ case MAC_PROP_AUTOPUSH: {
+ struct dlautopush *dlap =
+ (struct dlautopush *)kprop->pr_val;
+
if (set) {
- if (dipp->pr_valsize != 0) {
- dlap = (struct dlautopush *)kdipp->pr_val;
+ if (kprop->pr_valsize != 0) {
err = drv_ioc_setap(linkid, dlap);
goto done;
} else {
@@ -452,125 +584,73 @@ drv_ioc_prop_common(dld_ioc_macprop_t *dipp, intptr_t arg, boolean_t set,
goto done;
}
} else {
- kdipp->pr_perm_flags = MAC_PROP_PERM_RW;
- dlap = (struct dlautopush *)kdipp->pr_val;
+ kprop->pr_perm_flags = MAC_PROP_PERM_RW;
err = drv_ioc_getap(linkid, dlap);
goto done;
}
-
+ }
default:
break;
}
- if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0)
- goto done;
-
- if ((err = dls_vlan_hold(dls_devnet_mac(dlh),
- dls_devnet_vid(dlh), &dvp, B_FALSE, B_FALSE)) != 0) {
- dls_devnet_rele_tmp(dlh);
+ if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0)
goto done;
- }
- macprop.mp_name = kdipp->pr_name;
- macprop.mp_id = kdipp->pr_num;
- macprop.mp_flags = kdipp->pr_flags;
+ macprop.mp_name = kprop->pr_name;
+ macprop.mp_id = kprop->pr_num;
+ macprop.mp_flags = kprop->pr_flags;
if (set) {
- err = mac_set_prop(dvp->dv_dlp->dl_mh, &macprop,
- kdipp->pr_val, kdipp->pr_valsize);
+ err = mac_set_prop(dlp->dl_mh, &macprop, kprop->pr_val,
+ kprop->pr_valsize);
} else {
- kdipp->pr_perm_flags = MAC_PROP_PERM_RW;
- err = mac_get_prop(dvp->dv_dlp->dl_mh, &macprop,
- kdipp->pr_val, kdipp->pr_valsize, &kdipp->pr_perm_flags);
+ kprop->pr_perm_flags = MAC_PROP_PERM_RW;
+ err = mac_get_prop(dlp->dl_mh, &macprop, kprop->pr_val,
+ kprop->pr_valsize, &kprop->pr_perm_flags);
}
- dls_vlan_rele(dvp);
- dls_devnet_rele_tmp(dlh);
done:
if (!set && err == 0 &&
- ddi_copyout(kdipp, (void *)arg, dsize, mode) != 0)
+ ddi_copyout(kprop, (void *)arg, dsize, mode) != 0)
err = EFAULT;
- kmem_free(kdipp, dsize);
- return (err);
-}
-/* ARGSUSED */
-static int
-drv_ioc_setprop(void *karg, intptr_t arg, int mode, cred_t *cred)
-{
- return (drv_ioc_prop_common(karg, arg, B_TRUE, mode));
-}
+ if (dlp != NULL)
+ dls_link_rele(dlp);
-/* ARGSUSED */
-static int
-drv_ioc_getprop(void *karg, intptr_t arg, int mode, cred_t *cred)
-{
- return (drv_ioc_prop_common(karg, arg, B_FALSE, mode));
-}
+ if (mph != NULL) {
+ int32_t cpuid;
+ void *mdip = NULL;
-/*
- * DLDIOC_CREATE_VLAN
- */
-/* ARGSUSED */
-static int
-drv_ioc_create_vlan(void *karg, intptr_t arg, int mode, cred_t *cred)
-{
- dld_ioc_create_vlan_t *dicp = karg;
+ if (dlp != NULL && set && err == 0) {
+ cpuid = mac_client_intr_cpu(dlp->dl_mch);
+ mdip = mac_get_devinfo(dlp->dl_mh);
+ }
- return (dls_devnet_create_vlan(dicp->dic_vlanid, dicp->dic_linkid,
- dicp->dic_vid, dicp->dic_force));
+ mac_perim_exit(mph);
+
+ if (mdip != NULL)
+ mac_client_set_intr_cpu(mdip, dlp->dl_mch, cpuid);
+ }
+ if (dlh != NULL)
+ dls_devnet_rele_tmp(dlh);
+
+ if (kprop != NULL)
+ kmem_free(kprop, dsize);
+ return (err);
}
-/*
- * DLDIOC_DELETE_VLAN
- */
/* ARGSUSED */
static int
-drv_ioc_delete_vlan(void *karg, intptr_t arg, int mode, cred_t *cred)
+drv_ioc_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
- dld_ioc_delete_vlan_t *didp = karg;
-
- return (dls_devnet_destroy_vlan(didp->did_linkid));
+ return (drv_ioc_prop_common(karg, arg, B_TRUE, mode));
}
-/*
- * DLDIOC_VLAN_ATTR
- */
/* ARGSUSED */
static int
-drv_ioc_vlan_attr(void *karg, intptr_t arg, int mode, cred_t *cred)
+drv_ioc_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
- dld_ioc_vlan_attr_t *divp = karg;
- dls_dl_handle_t dlh;
- uint16_t vid;
- dls_vlan_t *dvp;
- int err;
-
- /*
- * Hold this link to prevent it from being deleted.
- */
- if ((err = dls_devnet_hold_tmp(divp->div_vlanid, &dlh)) != 0)
- return (err);
-
- if ((vid = dls_devnet_vid(dlh)) == VLAN_ID_NONE) {
- dls_devnet_rele_tmp(dlh);
- return (EINVAL);
- }
-
- err = dls_vlan_hold(dls_devnet_mac(dlh), vid, &dvp, B_FALSE, B_FALSE);
- if (err != 0) {
- dls_devnet_rele_tmp(dlh);
- return (err);
- }
-
- divp->div_linkid = dls_devnet_linkid(dlh);
- divp->div_implicit = !dls_devnet_is_explicit(dlh);
- divp->div_vid = vid;
- divp->div_force = dvp->dv_force;
-
- dls_vlan_rele(dvp);
- dls_devnet_rele_tmp(dlh);
- return (0);
+ return (drv_ioc_prop_common(karg, arg, B_FALSE, mode));
}
/*
@@ -581,7 +661,7 @@ drv_ioc_vlan_attr(void *karg, intptr_t arg, int mode, cred_t *cred)
*/
/* ARGSUSED */
static int
-drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred)
+drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
dld_ioc_rename_t *dir = karg;
mod_hash_key_t key;
@@ -719,7 +799,7 @@ drv_ioc_clrap(datalink_id_t linkid)
*/
/* ARGSUSED */
static int
-drv_ioc_doorserver(void *karg, intptr_t arg, int mode, cred_t *cred)
+drv_ioc_doorserver(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
dld_ioc_door_t *did = karg;
@@ -727,6 +807,76 @@ drv_ioc_doorserver(void *karg, intptr_t arg, int mode, cred_t *cred)
}
/*
+ * DLDIOC_USAGELOG
+ */
+/* ARGSUSED */
+static int
+drv_ioc_usagelog(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ dld_ioc_usagelog_t *log_info = (dld_ioc_usagelog_t *)karg;
+
+ if (log_info->ul_type < MAC_LOGTYPE_LINK ||
+ log_info->ul_type > MAC_LOGTYPE_FLOW)
+ return (EINVAL);
+
+ if (log_info->ul_onoff)
+ mac_start_logusage(log_info->ul_type, log_info->ul_interval);
+ else
+ mac_stop_logusage(log_info->ul_type);
+ return (0);
+}
+
+/*
+ * Process a DLDIOC_ADDFLOW request.
+ */
+/* ARGSUSED */
+static int
+drv_ioc_addflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ dld_ioc_addflow_t *afp = karg;
+
+ return (dld_add_flow(afp->af_linkid, afp->af_name,
+ &afp->af_flow_desc, &afp->af_resource_props));
+}
+
+/*
+ * Process a DLDIOC_REMOVEFLOW request.
+ */
+/* ARGSUSED */
+static int
+drv_ioc_removeflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ dld_ioc_removeflow_t *rfp = karg;
+
+ return (dld_remove_flow(rfp->rf_name));
+}
+
+/*
+ * Process a DLDIOC_MODIFYFLOW request.
+ */
+/* ARGSUSED */
+static int
+drv_ioc_modifyflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ dld_ioc_modifyflow_t *mfp = karg;
+
+ return (dld_modify_flow(mfp->mf_name, &mfp->mf_resource_props));
+}
+
+/*
+ * Process a DLDIOC_WALKFLOW request.
+ */
+/* ARGSUSED */
+static int
+drv_ioc_walkflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ dld_ioc_walkflow_t *wfp = karg;
+
+ return (dld_walk_flow(wfp, arg));
+}
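
For reference, a hedged sketch of how a userland consumer might drive DLDIOC_ADDFLOW directly, using the field names visible in the handlers above. The "/dev/dld" control-node path and the header set are assumptions for illustration; in practice libdladm wraps these ioctls:

    #include <sys/types.h>
    #include <sys/dld.h>        /* dld_ioc_addflow_t, DLDIOC_ADDFLOW */
    #include <fcntl.h>
    #include <string.h>
    #include <stropts.h>        /* ioctl() */
    #include <unistd.h>

    /* Sketch only: the control-node path is assumed, not taken from dld.h. */
    int
    add_flow_sketch(datalink_id_t linkid, const char *name,
        const flow_desc_t *desc, const mac_resource_props_t *mrp)
    {
        dld_ioc_addflow_t af;
        int fd, rc;

        (void) memset(&af, 0, sizeof (af));
        af.af_linkid = linkid;
        (void) strlcpy(af.af_name, name, sizeof (af.af_name));
        af.af_flow_desc = *desc;
        af.af_resource_props = *mrp;

        if ((fd = open("/dev/dld", O_RDWR)) < 0)
            return (-1);
        rc = ioctl(fd, DLDIOC_ADDFLOW, &af);
        (void) close(fd);
        return (rc);
    }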
+
+/*
* Check for GLDv3 autopush information. There are three cases:
*
* 1. If devp points to a GLDv3 datalink and it has autopush configuration,
@@ -809,7 +959,7 @@ drv_secobj_fini(void)
/* ARGSUSED */
static int
-drv_ioc_secobj_set(void *karg, intptr_t arg, int mode, cred_t *cred)
+drv_ioc_secobj_set(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
dld_ioc_secobj_set_t *ssp = karg;
dld_secobj_t *sobjp, *objp;
@@ -885,14 +1035,13 @@ drv_secobj_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
/* ARGSUSED */
static int
-drv_ioc_secobj_get(void *karg, intptr_t arg, int mode, cred_t *cred)
+drv_ioc_secobj_get(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
dld_ioc_secobj_get_t *sgp = karg;
dld_secobj_t *sobjp, *objp;
int err;
sobjp = &sgp->sg_obj;
-
if (sobjp->so_name[DLD_SECOBJ_NAME_MAX - 1] != '\0')
return (EINVAL);
@@ -932,7 +1081,8 @@ drv_ioc_secobj_get(void *karg, intptr_t arg, int mode, cred_t *cred)
/* ARGSUSED */
static int
-drv_ioc_secobj_unset(void *karg, intptr_t arg, int mode, cred_t *cred)
+drv_ioc_secobj_unset(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
{
dld_ioc_secobj_unset_t *sup = karg;
dld_secobj_t *objp;
@@ -959,32 +1109,56 @@ drv_ioc_secobj_unset(void *karg, intptr_t arg, int mode, cred_t *cred)
return (0);
}
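+/*
+ * Per-ioctl privilege policy: the caller must pass secpolicy_dld_ioctl()
+ * for each privilege listed in the entry's di_priv[] array; failing
+ * that, fall back to the generic secpolicy_net_config() check.
+ */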
+static int
+drv_check_policy(dld_ioc_info_t *info, cred_t *cred)
+{
+ int i, err = 0;
+
+ for (i = 0; i < DLD_MAX_PRIV && info->di_priv[i] != NULL; i++) {
+ if ((err = secpolicy_dld_ioctl(cred, info->di_priv[i],
+ "dld ioctl")) != 0) {
+ break;
+ }
+ }
+ if (err == 0)
+ return (0);
+
+ return (secpolicy_net_config(cred, B_FALSE));
+}
+
static dld_ioc_info_t drv_ioc_list[] = {
{DLDIOC_ATTR, DLDCOPYINOUT, sizeof (dld_ioc_attr_t),
- drv_ioc_attr},
+ drv_ioc_attr, {NULL}},
{DLDIOC_PHYS_ATTR, DLDCOPYINOUT, sizeof (dld_ioc_phys_attr_t),
- drv_ioc_phys_attr},
- {DLDIOC_SECOBJ_SET, DLDCOPYIN | DLDDLCONFIG,
- sizeof (dld_ioc_secobj_set_t), drv_ioc_secobj_set},
- {DLDIOC_SECOBJ_GET, DLDCOPYINOUT | DLDDLCONFIG,
- sizeof (dld_ioc_secobj_get_t), drv_ioc_secobj_get},
- {DLDIOC_SECOBJ_UNSET, DLDCOPYIN | DLDDLCONFIG,
- sizeof (dld_ioc_secobj_unset_t), drv_ioc_secobj_unset},
- {DLDIOC_CREATE_VLAN, DLDCOPYIN | DLDDLCONFIG,
- sizeof (dld_ioc_create_vlan_t), drv_ioc_create_vlan},
- {DLDIOC_DELETE_VLAN, DLDCOPYIN | DLDDLCONFIG,
- sizeof (dld_ioc_delete_vlan_t),
- drv_ioc_delete_vlan},
- {DLDIOC_VLAN_ATTR, DLDCOPYINOUT, sizeof (dld_ioc_vlan_attr_t),
- drv_ioc_vlan_attr},
- {DLDIOC_DOORSERVER, DLDCOPYIN | DLDDLCONFIG, sizeof (dld_ioc_door_t),
- drv_ioc_doorserver},
- {DLDIOC_RENAME, DLDCOPYIN | DLDDLCONFIG, sizeof (dld_ioc_rename_t),
- drv_ioc_rename},
+ drv_ioc_phys_attr, {NULL}},
+ {DLDIOC_SECOBJ_SET, DLDCOPYIN, sizeof (dld_ioc_secobj_set_t),
+ drv_ioc_secobj_set, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_SECOBJ_GET, DLDCOPYINOUT, sizeof (dld_ioc_secobj_get_t),
+ drv_ioc_secobj_get, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_SECOBJ_UNSET, DLDCOPYIN, sizeof (dld_ioc_secobj_unset_t),
+ drv_ioc_secobj_unset, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_DOORSERVER, DLDCOPYIN, sizeof (dld_ioc_door_t),
+ drv_ioc_doorserver, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_RENAME, DLDCOPYIN, sizeof (dld_ioc_rename_t),
+ drv_ioc_rename, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_MACADDRGET, DLDCOPYINOUT, sizeof (dld_ioc_macaddrget_t),
+ drv_ioc_macaddrget, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_ADDFLOW, DLDCOPYIN, sizeof (dld_ioc_addflow_t),
+ drv_ioc_addflow, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_REMOVEFLOW, DLDCOPYIN, sizeof (dld_ioc_removeflow_t),
+ drv_ioc_removeflow, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_MODIFYFLOW, DLDCOPYIN, sizeof (dld_ioc_modifyflow_t),
+ drv_ioc_modifyflow, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_WALKFLOW, DLDCOPYINOUT, sizeof (dld_ioc_walkflow_t),
+ drv_ioc_walkflow, {NULL}},
+ {DLDIOC_USAGELOG, DLDCOPYIN, sizeof (dld_ioc_usagelog_t),
+ drv_ioc_usagelog, {PRIV_SYS_DL_CONFIG}},
+ {DLDIOC_SETMACPROP, DLDCOPYIN, sizeof (dld_ioc_macprop_t),
+ drv_ioc_setprop, {PRIV_SYS_DL_CONFIG}},
{DLDIOC_GETMACPROP, DLDCOPYIN, sizeof (dld_ioc_macprop_t),
- drv_ioc_getprop},
- {DLDIOC_SETMACPROP, DLDCOPYIN | DLDDLCONFIG, sizeof (dld_ioc_macprop_t),
- drv_ioc_setprop}
+ drv_ioc_getprop, {NULL}},
+ {DLDIOC_GETHWGRP, DLDCOPYINOUT, sizeof (dld_ioc_hwgrpget_t),
+ drv_ioc_hwgrpget, {PRIV_SYS_DL_CONFIG}},
};
typedef struct dld_ioc_modentry {
@@ -1090,11 +1264,8 @@ drv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp)
}
info = &dim->dim_list[i];
-
- if ((info->di_flags & DLDDLCONFIG) && secpolicy_dl_config(cred) != 0) {
- err = EPERM;
+ if ((err = drv_check_policy(info, cred)) != 0)
goto done;
- }
sz = info->di_argsize;
if ((buf = kmem_zalloc(sz, KM_NOSLEEP)) == NULL) {
@@ -1108,7 +1279,7 @@ drv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp)
goto done;
}
- err = info->di_func(buf, arg, mode, cred);
+ err = info->di_func(buf, arg, mode, cred, rvalp);
if ((info->di_flags & DLDCOPYOUT) &&
ddi_copyout(buf, (void *)arg, sz, mode) != 0 && err == 0)
diff --git a/usr/src/uts/common/io/dld/dld_flow.c b/usr/src/uts/common/io/dld/dld_flow.c
new file mode 100644
index 0000000000..b57368484f
--- /dev/null
+++ b/usr/src/uts/common/io/dld/dld_flow.c
@@ -0,0 +1,119 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Flow ioctl implementation.
+ */
+
+#include <sys/dld.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
+
+/*
+ * Implements the flow add, remove, and modify ioctls.
+ */
+int
+dld_add_flow(datalink_id_t linkid, char *flow_name, flow_desc_t *flow_desc,
+ mac_resource_props_t *mrp)
+{
+ return (mac_link_flow_add(linkid, flow_name, flow_desc, mrp));
+}
+
+int
+dld_remove_flow(char *flow_name)
+{
+ return (mac_link_flow_remove(flow_name));
+}
+
+int
+dld_modify_flow(char *flow_name, mac_resource_props_t *mrp)
+{
+ return (mac_link_flow_modify(flow_name, mrp));
+}
+
+
+/*
+ * Callback function and structure used by dld_walk_flow().
+ */
+typedef struct flowinfo_state_s {
+ int fi_bufsize;
+ int fi_nflows;
+ uchar_t *fi_fl;
+} flowinfo_state_t;
+
+static int
+dld_walk_flow_cb(mac_flowinfo_t *finfo, void *arg)
+{
+ flowinfo_state_t *statep = arg;
+ dld_flowinfo_t fi;
+
+ if (statep->fi_bufsize < sizeof (dld_flowinfo_t))
+ return (ENOSPC);
+
+ (void) strlcpy(fi.fi_flowname, finfo->fi_flow_name,
+ sizeof (fi.fi_flowname));
+ fi.fi_linkid = finfo->fi_link_id;
+ fi.fi_flow_desc = finfo->fi_flow_desc;
+ fi.fi_resource_props = finfo->fi_resource_props;
+
+ if (copyout(&fi, statep->fi_fl, sizeof (fi)) != 0) {
+ return (EFAULT);
+ }
+ statep->fi_nflows++;
+ statep->fi_bufsize -= sizeof (dld_flowinfo_t);
+ statep->fi_fl += sizeof (dld_flowinfo_t);
+ return (0);
+}
+
+/*
+ * Implements the flow walk ioctl.
+ * Retrieves a specific flow or a list of flows from the specified link.
+ * ENOSPC is returned if a bigger buffer is needed.
+ */
+int
+dld_walk_flow(dld_ioc_walkflow_t *wf, intptr_t uaddr)
+{
+ flowinfo_state_t state;
+ mac_flowinfo_t finfo;
+ int err = 0;
+
+ state.fi_bufsize = wf->wf_len;
+ state.fi_fl = (uchar_t *)uaddr + sizeof (*wf);
+ state.fi_nflows = 0;
+
+ if (wf->wf_name[0] == '\0') {
+ err = mac_link_flow_walk(wf->wf_linkid, dld_walk_flow_cb,
+ &state);
+ } else {
+ err = mac_link_flow_info(wf->wf_name, &finfo);
+ if (err != 0)
+ return (err);
+
+ err = dld_walk_flow_cb(&finfo, &state);
+ }
+ wf->wf_nflows = state.fi_nflows;
+ return (err);
+}
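
For orientation, a hedged caller-side sketch of the buffer protocol implemented above: a dld_ioc_walkflow_t header immediately followed by wf_len bytes of dld_flowinfo_t records, growing the buffer whenever the kernel answers ENOSPC. The helper name, sizing policy, and include set are illustrative; the layout follows dld_walk_flow() above:

#include <sys/types.h>
#include <sys/dld.h>
#include <stropts.h>
#include <stdlib.h>
#include <strings.h>
#include <errno.h>

/* Sketch only: walk all flows on a link via DLDIOC_WALKFLOW. */
static int
walk_flows(int fd, datalink_id_t linkid)
{
	dld_ioc_walkflow_t *wf;
	uint_t n = 8;

	for (;;) {
		wf = malloc(sizeof (*wf) + n * sizeof (dld_flowinfo_t));
		if (wf == NULL)
			return (ENOMEM);
		bzero(wf, sizeof (*wf));
		wf->wf_linkid = linkid;
		wf->wf_name[0] = '\0';		/* empty name: walk all */
		wf->wf_len = n * sizeof (dld_flowinfo_t);

		if (ioctl(fd, DLDIOC_WALKFLOW, wf) == 0)
			break;			/* wf_nflows records follow */
		free(wf);
		if (errno != ENOSPC)
			return (errno);
		n *= 2;				/* kernel wants a bigger buffer */
	}
	free(wf);
	return (0);
}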
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index 5bc1fc5322..2c3d0f7ecb 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -23,32 +23,19 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Data-Link Driver
*/
-
-#include <sys/types.h>
-#include <sys/debug.h>
#include <sys/sysmacros.h>
-#include <sys/stream.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/strsun.h>
-#include <sys/cpuvar.h>
-#include <sys/dlpi.h>
-#include <netinet/in.h>
-#include <sys/sdt.h>
#include <sys/strsubr.h>
+#include <sys/strsun.h>
#include <sys/vlan.h>
-#include <sys/mac.h>
-#include <sys/dls.h>
-#include <sys/dld.h>
#include <sys/dld_impl.h>
-#include <sys/dls_soft_ring.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
-typedef boolean_t proto_reqfunc_t(dld_str_t *, union DL_primitives *, mblk_t *);
+typedef void proto_reqfunc_t(dld_str_t *, mblk_t *);
static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req,
proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req,
@@ -56,13 +43,8 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req,
proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req,
proto_notify_req, proto_passive_req;
-static void proto_poll_disable(dld_str_t *);
-static boolean_t proto_poll_enable(dld_str_t *, dl_capab_dls_t *);
-
-static void proto_soft_ring_disable(dld_str_t *);
-static boolean_t proto_soft_ring_enable(dld_str_t *, dl_capab_dls_t *);
-static boolean_t proto_capability_advertise(dld_str_t *, mblk_t *);
-static void proto_change_soft_ring_fanout(dld_str_t *, int);
+static void proto_capability_advertise(dld_str_t *, mblk_t *);
+static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *);
#define DL_ACK_PENDING(state) \
((state) == DL_ATTACH_PENDING || \
@@ -79,70 +61,72 @@ static void proto_change_soft_ring_fanout(dld_str_t *, int);
* by the above primitives.
*/
void
-dld_wput_proto_nondata(dld_str_t *dsp, mblk_t *mp)
+dld_proto(dld_str_t *dsp, mblk_t *mp)
{
- union DL_primitives *udlp;
t_uscalar_t prim;
- ASSERT(MBLKL(mp) >= sizeof (t_uscalar_t));
-
- udlp = (union DL_primitives *)mp->b_rptr;
- prim = udlp->dl_primitive;
+ if (MBLKL(mp) < sizeof (t_uscalar_t)) {
+ freemsg(mp);
+ return;
+ }
+ prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
switch (prim) {
case DL_INFO_REQ:
- (void) proto_info_req(dsp, udlp, mp);
+ proto_info_req(dsp, mp);
break;
case DL_BIND_REQ:
- (void) proto_bind_req(dsp, udlp, mp);
+ proto_bind_req(dsp, mp);
break;
case DL_UNBIND_REQ:
- (void) proto_unbind_req(dsp, udlp, mp);
+ proto_unbind_req(dsp, mp);
+ break;
+ case DL_UNITDATA_REQ:
+ proto_unitdata_req(dsp, mp);
break;
case DL_UDQOS_REQ:
- (void) proto_udqos_req(dsp, udlp, mp);
+ proto_udqos_req(dsp, mp);
break;
case DL_ATTACH_REQ:
- (void) proto_attach_req(dsp, udlp, mp);
+ proto_attach_req(dsp, mp);
break;
case DL_DETACH_REQ:
- (void) proto_detach_req(dsp, udlp, mp);
+ proto_detach_req(dsp, mp);
break;
case DL_ENABMULTI_REQ:
- (void) proto_enabmulti_req(dsp, udlp, mp);
+ proto_enabmulti_req(dsp, mp);
break;
case DL_DISABMULTI_REQ:
- (void) proto_disabmulti_req(dsp, udlp, mp);
+ proto_disabmulti_req(dsp, mp);
break;
case DL_PROMISCON_REQ:
- (void) proto_promiscon_req(dsp, udlp, mp);
+ proto_promiscon_req(dsp, mp);
break;
case DL_PROMISCOFF_REQ:
- (void) proto_promiscoff_req(dsp, udlp, mp);
+ proto_promiscoff_req(dsp, mp);
break;
case DL_PHYS_ADDR_REQ:
- (void) proto_physaddr_req(dsp, udlp, mp);
+ proto_physaddr_req(dsp, mp);
break;
case DL_SET_PHYS_ADDR_REQ:
- (void) proto_setphysaddr_req(dsp, udlp, mp);
+ proto_setphysaddr_req(dsp, mp);
break;
case DL_NOTIFY_REQ:
- (void) proto_notify_req(dsp, udlp, mp);
+ proto_notify_req(dsp, mp);
break;
case DL_CAPABILITY_REQ:
- (void) proto_capability_req(dsp, udlp, mp);
+ proto_capability_req(dsp, mp);
break;
case DL_PASSIVE_REQ:
- (void) proto_passive_req(dsp, udlp, mp);
+ proto_passive_req(dsp, mp);
break;
default:
- (void) proto_req(dsp, udlp, mp);
+ proto_req(dsp, mp);
break;
}
}
#define NEG(x) -(x)
-
typedef struct dl_info_ack_wrapper {
dl_info_ack_t dl_info;
uint8_t dl_addr[MAXMACADDRLEN + sizeof (uint16_t)];
@@ -154,9 +138,8 @@ typedef struct dl_info_ack_wrapper {
/*
* DL_INFO_REQ
*/
-/*ARGSUSED*/
-static boolean_t
-proto_info_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_info_req(dld_str_t *dsp, mblk_t *mp)
{
dl_info_ack_wrapper_t *dlwp;
dl_info_ack_t *dlp;
@@ -176,9 +159,7 @@ proto_info_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
*/
if ((mp = mexchange(q, mp, sizeof (dl_info_ack_wrapper_t),
M_PCPROTO, 0)) == NULL)
- return (B_FALSE);
-
- rw_enter(&dsp->ds_lock, RW_READER);
+ return;
bzero(mp->b_rptr, sizeof (dl_info_ack_wrapper_t));
dlwp = (dl_info_ack_wrapper_t *)mp->b_rptr;
@@ -307,7 +288,8 @@ proto_info_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
*/
dlp->dl_addr_offset = (uintptr_t)addr - (uintptr_t)dlp;
if (addr_length > 0)
- bcopy(dsp->ds_curr_addr, addr, addr_length);
+ mac_unicast_primary_get(dsp->ds_mh, addr);
+
*(uint16_t *)(addr + addr_length) = dsp->ds_sap;
}
@@ -319,25 +301,20 @@ done:
ASSERT(IMPLY(dlp->dl_brdcst_addr_offset != 0,
dlp->dl_brdcst_addr_length != 0));
- rw_exit(&dsp->ds_lock);
-
qreply(q, mp);
- return (B_TRUE);
}
/*
* DL_ATTACH_REQ
*/
-static boolean_t
-proto_attach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_attach_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_attach_req_t *dlp = (dl_attach_req_t *)udlp;
+ dl_attach_req_t *dlp = (dl_attach_req_t *)mp->b_rptr;
int err = 0;
t_uscalar_t dl_err;
queue_t *q = dsp->ds_wq;
- rw_enter(&dsp->ds_lock, RW_WRITER);
-
if (MBLKL(mp) < sizeof (dl_attach_req_t) ||
dlp->dl_ppa < 0 || dsp->ds_style == DL_STYLE1) {
dl_err = DL_BADPRIM;
@@ -366,25 +343,22 @@ proto_attach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
ASSERT(dsp->ds_dlstate == DL_UNBOUND);
- rw_exit(&dsp->ds_lock);
-
dlokack(q, mp, DL_ATTACH_REQ);
- return (B_TRUE);
+ return;
+
failed:
- rw_exit(&dsp->ds_lock);
dlerrorack(q, mp, DL_ATTACH_REQ, dl_err, (t_uscalar_t)err);
- return (B_FALSE);
}
-/*ARGSUSED*/
-static boolean_t
-proto_detach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+/*
+ * DL_DETACH_REQ
+ */
+static void
+proto_detach_req(dld_str_t *dsp, mblk_t *mp)
{
queue_t *q = dsp->ds_wq;
t_uscalar_t dl_err;
- rw_enter(&dsp->ds_lock, RW_WRITER);
-
if (MBLKL(mp) < sizeof (dl_detach_req_t)) {
dl_err = DL_BADPRIM;
goto failed;
@@ -400,37 +374,34 @@ proto_detach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
+ ASSERT(dsp->ds_datathr_cnt == 0);
dsp->ds_dlstate = DL_DETACH_PENDING;
- dld_str_detach(dsp);
- rw_exit(&dsp->ds_lock);
+ dld_str_detach(dsp);
dlokack(dsp->ds_wq, mp, DL_DETACH_REQ);
- return (B_TRUE);
+ return;
+
failed:
- rw_exit(&dsp->ds_lock);
dlerrorack(q, mp, DL_DETACH_REQ, dl_err, 0);
- return (B_FALSE);
}
/*
* DL_BIND_REQ
*/
-static boolean_t
-proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_bind_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_bind_req_t *dlp = (dl_bind_req_t *)udlp;
+ dl_bind_req_t *dlp = (dl_bind_req_t *)mp->b_rptr;
int err = 0;
uint8_t dlsap_addr[MAXMACADDRLEN + sizeof (uint16_t)];
uint_t dlsap_addr_length;
t_uscalar_t dl_err;
t_scalar_t sap;
queue_t *q = dsp->ds_wq;
+ mac_perim_handle_t mph;
+ void *mdip;
+ int32_t intr_cpu;
- /*
- * Because control message processing is serialized, we don't need
- * to hold any locks to read any fields of dsp; we only need ds_lock
- * to update the ds_dlstate, ds_sap and ds_passivestate fields.
- */
if (MBLKL(mp) < sizeof (dl_bind_req_t)) {
dl_err = DL_BADPRIM;
goto failed;
@@ -451,24 +422,26 @@ proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+
if (dsp->ds_passivestate == DLD_UNINITIALIZED &&
- !dls_active_set(dsp->ds_dc)) {
+ ((err = dls_active_set(dsp)) != 0)) {
dl_err = DL_SYSERR;
- err = EBUSY;
- goto failed;
+ goto failed2;
}
+ dsp->ds_dlstate = DL_BIND_PENDING;
/*
* Set the receive callback.
*/
- dls_rx_set(dsp->ds_dc, (dsp->ds_mode == DLD_RAW) ?
+ dls_rx_set(dsp, (dsp->ds_mode == DLD_RAW) ?
dld_str_rx_raw : dld_str_rx_unitdata, dsp);
/*
* Bind the channel such that it can receive packets.
*/
sap = dlp->dl_sap;
- err = dls_bind(dsp->ds_dc, sap);
+ err = dls_bind(dsp, sap);
if (err != 0) {
switch (err) {
case EINVAL:
@@ -480,17 +453,28 @@ proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
break;
}
+ dsp->ds_dlstate = DL_UNBOUND;
if (dsp->ds_passivestate == DLD_UNINITIALIZED)
- dls_active_clear(dsp->ds_dc);
-
- goto failed;
+ dls_active_clear(dsp);
+ goto failed2;
}
+ intr_cpu = mac_client_intr_cpu(dsp->ds_mch);
+ mdip = mac_get_devinfo(dsp->ds_mh);
+ mac_perim_exit(mph);
+
+ /*
+ * We do this after we get out of the perim to avoid deadlocks
+ * etc. since part of mac_client_retarget_intr is to walk the
+ * device tree in order to find and retarget the interrupts.
+ */
+ mac_client_set_intr_cpu(mdip, dsp->ds_mch, intr_cpu);
+
/*
* Copy in MAC address.
*/
dlsap_addr_length = dsp->ds_mip->mi_addr_length;
- bcopy(dsp->ds_curr_addr, dlsap_addr, dlsap_addr_length);
+ mac_unicast_primary_get(dsp->ds_mh, dlsap_addr);
/*
* Copy in the SAP.
@@ -498,37 +482,28 @@ proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
*(uint16_t *)(dlsap_addr + dlsap_addr_length) = sap;
dlsap_addr_length += sizeof (uint16_t);
- rw_enter(&dsp->ds_lock, RW_WRITER);
-
dsp->ds_dlstate = DL_IDLE;
if (dsp->ds_passivestate == DLD_UNINITIALIZED)
dsp->ds_passivestate = DLD_ACTIVE;
- dsp->ds_sap = sap;
-
- if (dsp->ds_mode == DLD_FASTPATH)
- dsp->ds_tx = str_mdata_fastpath_put;
- else if (dsp->ds_mode == DLD_RAW)
- dsp->ds_tx = str_mdata_raw_put;
- dsp->ds_unitdata_tx = dld_wput_proto_data;
-
- rw_exit(&dsp->ds_lock);
dlbindack(q, mp, sap, dlsap_addr, dlsap_addr_length, 0, 0);
- return (B_TRUE);
+ return;
+
+failed2:
+ mac_perim_exit(mph);
failed:
dlerrorack(q, mp, DL_BIND_REQ, dl_err, (t_uscalar_t)err);
- return (B_FALSE);
}
/*
* DL_UNBIND_REQ
*/
-/*ARGSUSED*/
-static boolean_t
-proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_unbind_req(dld_str_t *dsp, mblk_t *mp)
{
queue_t *q = dsp->ds_wq;
t_uscalar_t dl_err;
+ mac_perim_handle_t mph;
if (MBLKL(mp) < sizeof (dl_unbind_req_t)) {
dl_err = DL_BADPRIM;
@@ -540,32 +515,27 @@ proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
- /*
- * Flush any remaining packets scheduled for transmission.
- */
- dld_tx_flush(dsp);
+ mutex_enter(&dsp->ds_lock);
+ while (dsp->ds_datathr_cnt != 0)
+ cv_wait(&dsp->ds_datathr_cv, &dsp->ds_lock);
- /*
- * Unbind the channel to stop packets being received.
- */
- dls_unbind(dsp->ds_dc);
+ dsp->ds_dlstate = DL_UNBIND_PENDING;
+ mutex_exit(&dsp->ds_lock);
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
/*
- * Clear the receive callback.
+ * Unbind the channel to stop packets being received.
*/
- dls_rx_set(dsp->ds_dc, NULL, NULL);
-
- rw_enter(&dsp->ds_lock, RW_WRITER);
+ if (dls_unbind(dsp) != 0) {
+ dl_err = DL_OUTSTATE;
+ mac_perim_exit(mph);
+ goto failed;
+ }
/*
* Disable polling mode, if it is enabled.
*/
- proto_poll_disable(dsp);
-
- /*
- * If soft rings were enabled, the workers should be quiesced.
- */
- dls_soft_ring_disable(dsp->ds_dc);
+ (void) dld_capab_poll_disable(dsp, NULL);
/*
* Clear LSO flags.
@@ -574,38 +544,37 @@ proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
dsp->ds_lso_max = 0;
/*
+ * Clear the receive callback.
+ */
+ dls_rx_set(dsp, NULL, NULL);
+ dsp->ds_direct = B_FALSE;
+
+ /*
* Set the mode back to the default (unitdata).
*/
dsp->ds_mode = DLD_UNITDATA;
dsp->ds_dlstate = DL_UNBOUND;
- DLD_TX_QUIESCE(dsp);
- rw_exit(&dsp->ds_lock);
-
- dlokack(q, mp, DL_UNBIND_REQ);
- return (B_TRUE);
+ mac_perim_exit(mph);
+ dlokack(dsp->ds_wq, mp, DL_UNBIND_REQ);
+ return;
failed:
dlerrorack(q, mp, DL_UNBIND_REQ, dl_err, 0);
- return (B_FALSE);
}
/*
* DL_PROMISCON_REQ
*/
-static boolean_t
-proto_promiscon_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_promiscon_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_promiscon_req_t *dlp = (dl_promiscon_req_t *)udlp;
+ dl_promiscon_req_t *dlp = (dl_promiscon_req_t *)mp->b_rptr;
int err = 0;
t_uscalar_t dl_err;
- uint32_t promisc;
+ uint32_t promisc_saved;
queue_t *q = dsp->ds_wq;
+ mac_perim_handle_t mph;
- /*
- * Because control message processing is serialized, we don't need
- * to hold any locks to read any fields of dsp; we only need ds_lock
- * to update the ds_promisc and ds_passivestate fields.
- */
if (MBLKL(mp) < sizeof (dl_promiscon_req_t)) {
dl_err = DL_BADPRIM;
goto failed;
@@ -617,70 +586,73 @@ proto_promiscon_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
+ promisc_saved = dsp->ds_promisc;
switch (dlp->dl_level) {
case DL_PROMISC_SAP:
- promisc = DLS_PROMISC_SAP;
+ dsp->ds_promisc |= DLS_PROMISC_SAP;
break;
+
case DL_PROMISC_MULTI:
- promisc = DLS_PROMISC_MULTI;
+ dsp->ds_promisc |= DLS_PROMISC_MULTI;
break;
+
case DL_PROMISC_PHYS:
- promisc = DLS_PROMISC_PHYS;
+ dsp->ds_promisc |= DLS_PROMISC_PHYS;
break;
+
default:
dl_err = DL_NOTSUPPORTED;
goto failed;
}
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+
if (dsp->ds_passivestate == DLD_UNINITIALIZED &&
- !dls_active_set(dsp->ds_dc)) {
+ ((err = dls_active_set(dsp)) != 0)) {
+ dsp->ds_promisc = promisc_saved;
dl_err = DL_SYSERR;
- err = EBUSY;
- goto failed;
+ goto failed2;
}
/*
* Adjust channel promiscuity.
*/
- promisc = (dsp->ds_promisc | promisc);
- err = dls_promisc(dsp->ds_dc, promisc);
+ err = dls_promisc(dsp, promisc_saved);
+
if (err != 0) {
dl_err = DL_SYSERR;
+ dsp->ds_promisc = promisc_saved;
if (dsp->ds_passivestate == DLD_UNINITIALIZED)
- dls_active_clear(dsp->ds_dc);
- goto failed;
+ dls_active_clear(dsp);
+ goto failed2;
}
- rw_enter(&dsp->ds_lock, RW_WRITER);
+ mac_perim_exit(mph);
+
if (dsp->ds_passivestate == DLD_UNINITIALIZED)
dsp->ds_passivestate = DLD_ACTIVE;
- dsp->ds_promisc = promisc;
- rw_exit(&dsp->ds_lock);
-
dlokack(q, mp, DL_PROMISCON_REQ);
- return (B_TRUE);
+ return;
+
+failed2:
+ mac_perim_exit(mph);
failed:
dlerrorack(q, mp, DL_PROMISCON_REQ, dl_err, (t_uscalar_t)err);
- return (B_FALSE);
}
/*
* DL_PROMISCOFF_REQ
*/
-static boolean_t
-proto_promiscoff_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_promiscoff_req_t *dlp = (dl_promiscoff_req_t *)udlp;
+ dl_promiscoff_req_t *dlp = (dl_promiscoff_req_t *)mp->b_rptr;
int err = 0;
t_uscalar_t dl_err;
- uint32_t promisc;
+ uint32_t promisc_saved;
queue_t *q = dsp->ds_wq;
+ mac_perim_handle_t mph;
- /*
- * Because control messages processing is serialized, we don't need
- * to hold any lock to read any field of dsp; we hold ds_lock to
- * update the ds_promisc field.
- */
if (MBLKL(mp) < sizeof (dl_promiscoff_req_t)) {
dl_err = DL_BADPRIM;
goto failed;
@@ -692,60 +664,66 @@ proto_promiscoff_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
+ promisc_saved = dsp->ds_promisc;
switch (dlp->dl_level) {
case DL_PROMISC_SAP:
- promisc = DLS_PROMISC_SAP;
+ if (!(dsp->ds_promisc & DLS_PROMISC_SAP)) {
+ dl_err = DL_NOTENAB;
+ goto failed;
+ }
+ dsp->ds_promisc &= ~DLS_PROMISC_SAP;
break;
+
case DL_PROMISC_MULTI:
- promisc = DLS_PROMISC_MULTI;
+ if (!(dsp->ds_promisc & DLS_PROMISC_MULTI)) {
+ dl_err = DL_NOTENAB;
+ goto failed;
+ }
+ dsp->ds_promisc &= ~DLS_PROMISC_MULTI;
break;
+
case DL_PROMISC_PHYS:
- promisc = DLS_PROMISC_PHYS;
+ if (!(dsp->ds_promisc & DLS_PROMISC_PHYS)) {
+ dl_err = DL_NOTENAB;
+ goto failed;
+ }
+ dsp->ds_promisc &= ~DLS_PROMISC_PHYS;
break;
+
default:
dl_err = DL_NOTSUPPORTED;
goto failed;
}
- if (!(dsp->ds_promisc & promisc)) {
- dl_err = DL_NOTENAB;
- goto failed;
- }
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+ /*
+ * Adjust channel promiscuity.
+ */
+ err = dls_promisc(dsp, promisc_saved);
+ mac_perim_exit(mph);
- promisc = (dsp->ds_promisc & ~promisc);
- err = dls_promisc(dsp->ds_dc, promisc);
if (err != 0) {
dl_err = DL_SYSERR;
goto failed;
}
-
- rw_enter(&dsp->ds_lock, RW_WRITER);
- dsp->ds_promisc = promisc;
- rw_exit(&dsp->ds_lock);
-
dlokack(q, mp, DL_PROMISCOFF_REQ);
- return (B_TRUE);
+ return;
failed:
dlerrorack(q, mp, DL_PROMISCOFF_REQ, dl_err, (t_uscalar_t)err);
- return (B_FALSE);
}
/*
* DL_ENABMULTI_REQ
*/
-static boolean_t
-proto_enabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_enabmulti_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_enabmulti_req_t *dlp = (dl_enabmulti_req_t *)udlp;
+ dl_enabmulti_req_t *dlp = (dl_enabmulti_req_t *)mp->b_rptr;
int err = 0;
t_uscalar_t dl_err;
queue_t *q = dsp->ds_wq;
+ mac_perim_handle_t mph;
- /*
- * Because control messages processing is serialized, we don't need
- * to hold any lock to read any field of dsp; we hold ds_lock to
- * update the ds_passivestate field.
- */
if (dsp->ds_dlstate == DL_UNATTACHED ||
DL_ACK_PENDING(dsp->ds_dlstate)) {
dl_err = DL_OUTSTATE;
@@ -759,14 +737,16 @@ proto_enabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+
if (dsp->ds_passivestate == DLD_UNINITIALIZED &&
- !dls_active_set(dsp->ds_dc)) {
+ ((err = dls_active_set(dsp)) != 0)) {
dl_err = DL_SYSERR;
- err = EBUSY;
- goto failed;
+ goto failed2;
}
- err = dls_multicst_add(dsp->ds_dc, mp->b_rptr + dlp->dl_addr_offset);
+ err = dls_multicst_add(dsp, mp->b_rptr + dlp->dl_addr_offset);
+
if (err != 0) {
switch (err) {
case EINVAL:
@@ -781,40 +761,37 @@ proto_enabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
dl_err = DL_SYSERR;
break;
}
-
if (dsp->ds_passivestate == DLD_UNINITIALIZED)
- dls_active_clear(dsp->ds_dc);
+ dls_active_clear(dsp);
- goto failed;
+ goto failed2;
}
- rw_enter(&dsp->ds_lock, RW_WRITER);
+ mac_perim_exit(mph);
+
if (dsp->ds_passivestate == DLD_UNINITIALIZED)
dsp->ds_passivestate = DLD_ACTIVE;
- rw_exit(&dsp->ds_lock);
-
dlokack(q, mp, DL_ENABMULTI_REQ);
- return (B_TRUE);
+ return;
+
+failed2:
+ mac_perim_exit(mph);
failed:
dlerrorack(q, mp, DL_ENABMULTI_REQ, dl_err, (t_uscalar_t)err);
- return (B_FALSE);
}
/*
* DL_DISABMULTI_REQ
*/
-static boolean_t
-proto_disabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_disabmulti_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_disabmulti_req_t *dlp = (dl_disabmulti_req_t *)udlp;
+ dl_disabmulti_req_t *dlp = (dl_disabmulti_req_t *)mp->b_rptr;
int err = 0;
t_uscalar_t dl_err;
queue_t *q = dsp->ds_wq;
+ mac_perim_handle_t mph;
- /*
- * Because control messages processing is serialized, we don't need
- * to hold any lock to read any field of dsp.
- */
if (dsp->ds_dlstate == DL_UNATTACHED ||
DL_ACK_PENDING(dsp->ds_dlstate)) {
dl_err = DL_OUTSTATE;
@@ -828,45 +805,46 @@ proto_disabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
- err = dls_multicst_remove(dsp->ds_dc, mp->b_rptr + dlp->dl_addr_offset);
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+ err = dls_multicst_remove(dsp, mp->b_rptr + dlp->dl_addr_offset);
+ mac_perim_exit(mph);
+
if (err != 0) {
- switch (err) {
+ switch (err) {
case EINVAL:
dl_err = DL_BADADDR;
err = 0;
break;
+
case ENOENT:
dl_err = DL_NOTENAB;
err = 0;
break;
+
default:
dl_err = DL_SYSERR;
break;
}
goto failed;
}
-
dlokack(q, mp, DL_DISABMULTI_REQ);
- return (B_TRUE);
+ return;
failed:
dlerrorack(q, mp, DL_DISABMULTI_REQ, dl_err, (t_uscalar_t)err);
- return (B_FALSE);
}
/*
* DL_PHYS_ADDR_REQ
*/
-static boolean_t
-proto_physaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_physaddr_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_phys_addr_req_t *dlp = (dl_phys_addr_req_t *)udlp;
+ dl_phys_addr_req_t *dlp = (dl_phys_addr_req_t *)mp->b_rptr;
queue_t *q = dsp->ds_wq;
t_uscalar_t dl_err;
char *addr;
uint_t addr_length;
- rw_enter(&dsp->ds_lock, RW_READER);
-
if (MBLKL(mp) < sizeof (dl_phys_addr_req_t)) {
dl_err = DL_BADPRIM;
goto failed;
@@ -886,50 +864,34 @@ proto_physaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
addr_length = dsp->ds_mip->mi_addr_length;
if (addr_length > 0) {
- addr = kmem_alloc(addr_length, KM_NOSLEEP);
- if (addr == NULL) {
- rw_exit(&dsp->ds_lock);
- merror(q, mp, ENOSR);
- return (B_FALSE);
- }
-
- /*
- * Copy out the address before we drop the lock; we don't
- * want to call dlphysaddrack() while holding ds_lock.
- */
- bcopy((dlp->dl_addr_type == DL_CURR_PHYS_ADDR) ?
- dsp->ds_curr_addr : dsp->ds_fact_addr, addr, addr_length);
+ addr = kmem_alloc(addr_length, KM_SLEEP);
+ if (dlp->dl_addr_type == DL_CURR_PHYS_ADDR)
+ mac_unicast_primary_get(dsp->ds_mh, (uint8_t *)addr);
+ else
+ bcopy(dsp->ds_mip->mi_unicst_addr, addr, addr_length);
- rw_exit(&dsp->ds_lock);
dlphysaddrack(q, mp, addr, (t_uscalar_t)addr_length);
kmem_free(addr, addr_length);
} else {
- rw_exit(&dsp->ds_lock);
dlphysaddrack(q, mp, NULL, 0);
}
- return (B_TRUE);
+ return;
failed:
- rw_exit(&dsp->ds_lock);
dlerrorack(q, mp, DL_PHYS_ADDR_REQ, dl_err, 0);
- return (B_FALSE);
}
/*
* DL_SET_PHYS_ADDR_REQ
*/
-static boolean_t
-proto_setphysaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_setphysaddr_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_set_phys_addr_req_t *dlp = (dl_set_phys_addr_req_t *)udlp;
+ dl_set_phys_addr_req_t *dlp = (dl_set_phys_addr_req_t *)mp->b_rptr;
int err = 0;
t_uscalar_t dl_err;
queue_t *q = dsp->ds_wq;
+ mac_perim_handle_t mph;
- /*
- * Because control message processing is serialized, we don't need
- * to hold any locks to read any fields of dsp; we only need ds_lock
- * to update the ds_passivestate field.
- */
if (dsp->ds_dlstate == DL_UNATTACHED ||
DL_ACK_PENDING(dsp->ds_dlstate)) {
dl_err = DL_OUTSTATE;
@@ -943,14 +905,16 @@ proto_setphysaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+
if (dsp->ds_passivestate == DLD_UNINITIALIZED &&
- !dls_active_set(dsp->ds_dc)) {
+ ((err = dls_active_set(dsp)) != 0)) {
dl_err = DL_SYSERR;
- err = EBUSY;
- goto failed;
+ goto failed2;
}
- err = mac_unicst_set(dsp->ds_mh, mp->b_rptr + dlp->dl_addr_offset);
+ err = mac_unicast_primary_set(dsp->ds_mh,
+ mp->b_rptr + dlp->dl_addr_offset);
if (err != 0) {
switch (err) {
case EINVAL:
@@ -962,32 +926,33 @@ proto_setphysaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
dl_err = DL_SYSERR;
break;
}
-
if (dsp->ds_passivestate == DLD_UNINITIALIZED)
- dls_active_clear(dsp->ds_dc);
+ dls_active_clear(dsp);
+
+ goto failed2;
- goto failed;
}
- rw_enter(&dsp->ds_lock, RW_WRITER);
+ mac_perim_exit(mph);
+
if (dsp->ds_passivestate == DLD_UNINITIALIZED)
dsp->ds_passivestate = DLD_ACTIVE;
- rw_exit(&dsp->ds_lock);
-
dlokack(q, mp, DL_SET_PHYS_ADDR_REQ);
- return (B_TRUE);
+ return;
+
+failed2:
+ mac_perim_exit(mph);
failed:
dlerrorack(q, mp, DL_SET_PHYS_ADDR_REQ, dl_err, (t_uscalar_t)err);
- return (B_FALSE);
}
/*
* DL_UDQOS_REQ
*/
-static boolean_t
-proto_udqos_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_udqos_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_udqos_req_t *dlp = (dl_udqos_req_t *)udlp;
+ dl_udqos_req_t *dlp = (dl_udqos_req_t *)mp->b_rptr;
dl_qos_cl_sel1_t *selp;
int off, len;
t_uscalar_t dl_err;
@@ -1013,21 +978,11 @@ proto_udqos_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
goto failed;
}
- if (dsp->ds_dlstate == DL_UNATTACHED ||
- DL_ACK_PENDING(dsp->ds_dlstate)) {
- dl_err = DL_OUTSTATE;
- goto failed;
- }
-
- rw_enter(&dsp->ds_lock, RW_WRITER);
dsp->ds_pri = selp->dl_priority;
- rw_exit(&dsp->ds_lock);
-
dlokack(q, mp, DL_UDQOS_REQ);
- return (B_TRUE);
+ return;
failed:
dlerrorack(q, mp, DL_UDQOS_REQ, dl_err, 0);
- return (B_FALSE);
}
static boolean_t
@@ -1047,19 +1002,16 @@ check_ip_above(queue_t *q)
/*
* DL_CAPABILITY_REQ
*/
-/*ARGSUSED*/
-static boolean_t
-proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_capability_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_capability_req_t *dlp = (dl_capability_req_t *)udlp;
+ dl_capability_req_t *dlp = (dl_capability_req_t *)mp->b_rptr;
dl_capability_sub_t *sp;
size_t size, len;
offset_t off, end;
t_uscalar_t dl_err;
queue_t *q = dsp->ds_wq;
- rw_enter(&dsp->ds_lock, RW_WRITER);
-
if (MBLKL(mp) < sizeof (dl_capability_req_t)) {
dl_err = DL_BADPRIM;
goto failed;
@@ -1077,8 +1029,8 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
* support. Otherwise we enable the set of capabilities requested.
*/
if (dlp->dl_sub_length == 0) {
- /* callee drops lock */
- return (proto_capability_advertise(dsp, mp));
+ proto_capability_advertise(dsp, mp);
+ return;
}
if (!MBLKIN(mp, dlp->dl_sub_offset, dlp->dl_sub_length)) {
@@ -1122,137 +1074,37 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
break;
}
- /*
- * Large segment offload. (LSO)
- */
- case DL_CAPAB_LSO: {
- dl_capab_lso_t *lsop;
- dl_capab_lso_t lso;
-
- lsop = (dl_capab_lso_t *)&sp[1];
- /*
- * Copy for alignment.
- */
- bcopy(lsop, &lso, sizeof (dl_capab_lso_t));
- dlcapabsetqid(&(lso.lso_mid), dsp->ds_rq);
- bcopy(&lso, lsop, sizeof (dl_capab_lso_t));
- break;
- }
-
- /*
- * IP polling interface.
- */
- case DL_CAPAB_POLL: {
- dl_capab_dls_t *pollp;
- dl_capab_dls_t poll;
-
- pollp = (dl_capab_dls_t *)&sp[1];
- /*
- * Copy for alignment.
- */
- bcopy(pollp, &poll, sizeof (dl_capab_dls_t));
-
- switch (poll.dls_flags) {
- default:
- /*FALLTHRU*/
- case POLL_DISABLE:
- proto_poll_disable(dsp);
- break;
-
- case POLL_ENABLE:
- ASSERT(!(dld_opt & DLD_OPT_NO_POLL));
-
- /*
- * Make sure polling is disabled.
- */
- proto_poll_disable(dsp);
-
- /*
- * Note that only IP should enable POLL.
- */
- if (check_ip_above(dsp->ds_rq) &&
- proto_poll_enable(dsp, &poll)) {
- bzero(&poll, sizeof (dl_capab_dls_t));
- poll.dls_flags = POLL_ENABLE;
- } else {
- bzero(&poll, sizeof (dl_capab_dls_t));
- poll.dls_flags = POLL_DISABLE;
- }
- break;
- }
-
- dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq);
- bcopy(&poll, pollp, sizeof (dl_capab_dls_t));
- break;
- }
- case DL_CAPAB_SOFT_RING: {
- dl_capab_dls_t *soft_ringp;
- dl_capab_dls_t soft_ring;
+ case DL_CAPAB_DLD: {
+ dl_capab_dld_t *dldp;
+ dl_capab_dld_t dld;
- soft_ringp = (dl_capab_dls_t *)&sp[1];
+ dldp = (dl_capab_dld_t *)&sp[1];
/*
* Copy for alignment.
*/
- bcopy(soft_ringp, &soft_ring,
- sizeof (dl_capab_dls_t));
-
- switch (soft_ring.dls_flags) {
- default:
- /*FALLTHRU*/
- case SOFT_RING_DISABLE:
- proto_soft_ring_disable(dsp);
- break;
-
- case SOFT_RING_ENABLE:
- ASSERT(!(dld_opt & DLD_OPT_NO_SOFTRING));
- /*
- * Make sure soft_ring is disabled.
- */
- proto_soft_ring_disable(dsp);
-
- /*
- * Note that only IP can enable soft ring.
- */
- if (check_ip_above(dsp->ds_rq) &&
- proto_soft_ring_enable(dsp, &soft_ring)) {
- bzero(&soft_ring,
- sizeof (dl_capab_dls_t));
- soft_ring.dls_flags = SOFT_RING_ENABLE;
- } else {
- bzero(&soft_ring,
- sizeof (dl_capab_dls_t));
- soft_ring.dls_flags = SOFT_RING_DISABLE;
- }
- break;
- }
-
- dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq);
- bcopy(&soft_ring, soft_ringp,
- sizeof (dl_capab_dls_t));
+ bcopy(dldp, &dld, sizeof (dl_capab_dld_t));
+ dlcapabsetqid(&(dld.dld_mid), dsp->ds_rq);
+ bcopy(&dld, dldp, sizeof (dl_capab_dld_t));
break;
}
default:
break;
}
-
off += size;
}
- rw_exit(&dsp->ds_lock);
qreply(q, mp);
- return (B_TRUE);
+ return;
failed:
- rw_exit(&dsp->ds_lock);
dlerrorack(q, mp, DL_CAPABILITY_REQ, dl_err, 0);
- return (B_FALSE);
}
/*
* DL_NOTIFY_REQ
*/
-static boolean_t
-proto_notify_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_notify_req(dld_str_t *dsp, mblk_t *mp)
{
- dl_notify_req_t *dlp = (dl_notify_req_t *)udlp;
+ dl_notify_req_t *dlp = (dl_notify_req_t *)mp->b_rptr;
t_uscalar_t dl_err;
queue_t *q = dsp->ds_wq;
uint_t note =
@@ -1264,8 +1116,6 @@ proto_notify_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
DL_NOTE_CAPAB_RENEG |
DL_NOTE_SPEED;
- rw_enter(&dsp->ds_lock, RW_WRITER);
-
if (MBLKL(mp) < sizeof (dl_notify_req_t)) {
dl_err = DL_BADPRIM;
goto failed;
@@ -1283,7 +1133,6 @@ proto_notify_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
* Cache the notifications that are being enabled.
*/
dsp->ds_notifications = dlp->dl_notifications & note;
- rw_exit(&dsp->ds_lock);
/*
* The ACK carries all notifications regardless of which set is
* being enabled.
@@ -1291,27 +1140,21 @@ proto_notify_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
dlnotifyack(q, mp, note);
/*
- * Solicit DL_NOTIFY_IND messages for each enabled notification.
+ * Generate DL_NOTIFY_IND messages for each enabled notification.
*/
- rw_enter(&dsp->ds_lock, RW_READER);
if (dsp->ds_notifications != 0) {
- rw_exit(&dsp->ds_lock);
dld_str_notify_ind(dsp);
- } else {
- rw_exit(&dsp->ds_lock);
}
- return (B_TRUE);
+ return;
failed:
- rw_exit(&dsp->ds_lock);
dlerrorack(q, mp, DL_NOTIFY_REQ, dl_err, 0);
- return (B_FALSE);
}
/*
- * DL_UNITDATA_REQ
+ * DL_UNITDATA_REQ
*/
void
-dld_wput_proto_data(dld_str_t *dsp, mblk_t *mp)
+proto_unitdata_req(dld_str_t *dsp, mblk_t *mp)
{
queue_t *q = dsp->ds_wq;
dl_unitdata_req_t *dlp = (dl_unitdata_req_t *)mp->b_rptr;
@@ -1326,10 +1169,19 @@ dld_wput_proto_data(dld_str_t *dsp, mblk_t *mp)
uint_t max_sdu;
if (MBLKL(mp) < sizeof (dl_unitdata_req_t) || mp->b_cont == NULL) {
- dl_err = DL_BADPRIM;
- goto failed;
+ dlerrorack(q, mp, DL_UNITDATA_REQ, DL_BADPRIM, 0);
+ return;
}
+ mutex_enter(&dsp->ds_lock);
+ if (dsp->ds_dlstate != DL_IDLE) {
+ mutex_exit(&dsp->ds_lock);
+ dlerrorack(q, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0);
+ return;
+ }
+ DLD_DATATHR_INC(dsp);
+ mutex_exit(&dsp->ds_lock);
+
addr_length = dsp->ds_mip->mi_addr_length;
off = dlp->dl_dest_addr_offset;
@@ -1367,7 +1219,7 @@ dld_wput_proto_data(dld_str_t *dsp, mblk_t *mp)
/*
* Build a packet header.
*/
- if ((bp = dls_header(dsp->ds_dc, addr, sap, dlp->dl_priority.dl_max,
+ if ((bp = dls_header(dsp, addr, sap, dlp->dl_priority.dl_max,
&payload)) == NULL) {
dl_err = DL_BADADDR;
goto failed;
@@ -1390,32 +1242,37 @@ dld_wput_proto_data(dld_str_t *dsp, mblk_t *mp)
*/
ASSERT(bp->b_cont == NULL);
bp->b_cont = payload;
- dld_tx_single(dsp, bp);
+
+ /*
+ * No lock can be held across modules and putnext()'s,
+ * which can happen here with the call from DLD_TX().
+ */
+ if (DLD_TX(dsp, bp, 0, 0) != NULL) {
+ /* flow-controlled */
+ DLD_SETQFULL(dsp);
+ }
+ DLD_DATATHR_DCR(dsp);
return;
+
failed:
dlerrorack(q, mp, DL_UNITDATA_REQ, dl_err, 0);
+ DLD_DATATHR_DCR(dsp);
return;
baddata:
dluderrorind(q, mp, (void *)addr, len, DL_BADDATA, 0);
+ DLD_DATATHR_DCR(dsp);
}
/*
* DL_PASSIVE_REQ
*/
-/* ARGSUSED */
-static boolean_t
-proto_passive_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_passive_req(dld_str_t *dsp, mblk_t *mp)
{
t_uscalar_t dl_err;
/*
- * READER lock is enough because ds_passivestate can only be changed
- * as the result of non-data message processing.
- */
- rw_enter(&dsp->ds_lock, RW_READER);
-
- /*
* If we've already become active by issuing an active primitive,
* then it's too late to try to become passive.
*/
@@ -1430,209 +1287,281 @@ proto_passive_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
}
dsp->ds_passivestate = DLD_PASSIVE;
- rw_exit(&dsp->ds_lock);
dlokack(dsp->ds_wq, mp, DL_PASSIVE_REQ);
- return (B_TRUE);
+ return;
failed:
- rw_exit(&dsp->ds_lock);
dlerrorack(dsp->ds_wq, mp, DL_PASSIVE_REQ, dl_err, 0);
- return (B_FALSE);
}
+
/*
* Catch-all handler.
*/
-static boolean_t
-proto_req(dld_str_t *dsp, union DL_primitives *dlp, mblk_t *mp)
+static void
+proto_req(dld_str_t *dsp, mblk_t *mp)
{
+ union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr;
+
dlerrorack(dsp->ds_wq, mp, dlp->dl_primitive, DL_UNSUPPORTED, 0);
- return (B_FALSE);
}
-static void
-proto_poll_disable(dld_str_t *dsp)
+static int
+dld_capab_perim(dld_str_t *dsp, void *data, uint_t flags)
{
- mac_handle_t mh;
+ switch (flags) {
+ case DLD_ENABLE:
+ mac_perim_enter_by_mh(dsp->ds_mh, (mac_perim_handle_t *)data);
+ return (0);
- ASSERT(RW_WRITE_HELD(&dsp->ds_lock));
+ case DLD_DISABLE:
+ mac_perim_exit((mac_perim_handle_t)data);
+ return (0);
- if (!dsp->ds_polling)
- return;
+ case DLD_QUERY:
+ return (mac_perim_held(dsp->ds_mh));
+ }
+ return (0);
+}
- /*
- * It should be impossible to enable raw mode if polling is turned on.
- */
- ASSERT(dsp->ds_mode != DLD_RAW);
+static int
+dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags)
+{
+ dld_capab_direct_t *direct = data;
- /*
- * Reset the resource_add callback.
- */
- mh = dls_mac(dsp->ds_dc);
- mac_resource_set(mh, NULL, NULL);
- mac_resources(mh);
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
- /*
- * Set receive function back to default.
- */
- dls_rx_set(dsp->ds_dc, (dsp->ds_mode == DLD_FASTPATH) ?
- dld_str_rx_fastpath : dld_str_rx_unitdata, dsp);
+ switch (flags) {
+ case DLD_ENABLE:
+ dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf,
+ direct->di_rx_ch);
+ /*
+ * TODO: XXXGopi
+ *
+ * Direct pointer to functions in the MAC layer
+ * should be passed here:
+ *
+ * 1) pass mac_tx() and mac_client_handle instead
+ * of str_mdata_fastpath_put() and dld_str_t. But
+ * not done presently because of some VLAN
+ * processing stuff in str_mdata_fastpath_put().
+ *
+ * 2) pass a MAC layer callback instead of
+ * dld_flow_ctl_callb().
+ */
+ direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+ direct->di_tx_dh = dsp;
- /*
- * Note that polling is disabled.
- */
- dsp->ds_polling = B_FALSE;
+ direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify;
+ direct->di_tx_cb_dh = dsp->ds_mch;
+ dsp->ds_direct = B_TRUE;
+
+ return (0);
+
+ case DLD_DISABLE:
+ dls_rx_set(dsp, (dsp->ds_mode == DLD_FASTPATH) ?
+ dld_str_rx_fastpath : dld_str_rx_unitdata, (void *)dsp);
+ dsp->ds_direct = B_FALSE;
+
+ return (0);
+ }
+ return (ENOTSUP);
}
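
Seen from the client (IP) side, the negotiation above is symmetric: the client supplies its Rx upcall, and DLD fills in its Tx entry points. A hedged sketch of the enable half follows; the capab_fn/capab_handle pair comes from the DL_CAPAB_DLD advertisement built in proto_capability_advertise() below, and the dld_capab_func_t typedef name is an assumption:

#include <sys/systm.h>
#include <sys/dld.h>

/*
 * Sketch only: enable the direct call path. Must be called with the
 * MAC perimeter held (dld_capab_direct() asserts it).
 */
static int
client_enable_direct(dld_capab_func_t capab_fn, void *capab_handle,
    uintptr_t rx_func, void *rx_arg)
{
	dld_capab_direct_t direct;
	int err;

	bzero(&direct, sizeof (direct));
	direct.di_rx_cf = rx_func;	/* client's Rx upcall */
	direct.di_rx_ch = rx_arg;

	err = capab_fn(capab_handle, DLD_CAPAB_DIRECT, &direct, DLD_ENABLE);
	if (err != 0)
		return (err);

	/*
	 * DLD has now filled in di_tx_df/di_tx_dh (data transmit) and
	 * di_tx_cb_df/di_tx_cb_dh (Tx-notify registration) for our use.
	 */
	return (0);
}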
-static boolean_t
-proto_poll_enable(dld_str_t *dsp, dl_capab_dls_t *pollp)
+/*
+ * dld_capab_poll_enable()
+ *
+ * This function is misnamed. All polling and fanout are run out of the
+ * lower MAC (in the case of a VNIC) or the only MAC (in the case of a
+ * NIC). Rx ring availability and promiscuous mode are all handled
+ * between the soft ring set (mac_srs), the Rx ring, and the S/W
+ * classifier. Any fanout necessary is done by the soft rings that are
+ * part of the mac_srs (by default the mac_srs sends packets up via a
+ * TCP and a non-TCP soft ring).
+ *
+ * The mac_srs (or its associated soft rings) always stores the ill_rx_ring
+ * (the cookie returned when it registered with IP during plumb) as its
+ * 2nd argument, which is passed up as the mac_resource_handle_t. The
+ * upcall function and 1st argument are what the caller registered when
+ * it called mac_rx_classify_flow_add() to register the flow. For a VNIC,
+ * the function is vnic_rx and the argument is the vnic_t. For the
+ * regular NIC case, they are mac_rx_default and the mac_handle_t. As
+ * explained above, the mac_srs (or its soft ring) supplies the stored
+ * ill_rx_ring (mac_resource_handle_t) as the 2nd argument of the upcall.
+ */
+static int
+dld_capab_poll_enable(dld_str_t *dsp, dld_capab_poll_t *poll)
{
- mac_handle_t mh;
+ if (dsp->ds_polling)
+ return (EINVAL);
- ASSERT(RW_WRITE_HELD(&dsp->ds_lock));
- ASSERT(!dsp->ds_polling);
+ if ((dld_opt & DLD_OPT_NO_POLL) != 0 || dsp->ds_mode == DLD_RAW)
+ return (ENOTSUP);
/*
- * We cannot enable polling if raw mode
- * has been enabled.
+ * Enable client polling if and only if DLS bypass is possible.
+ * Special cases like VLANs need DLS processing in the Rx data path.
+ * In such a case we can neither allow the client (IP) to directly
+ * poll the softring (since DLS processing hasn't been done) nor can
+ * we allow DLS bypass.
*/
- if (dsp->ds_mode == DLD_RAW)
- return (B_FALSE);
-
- mh = dls_mac(dsp->ds_dc);
+ if (!mac_rx_bypass_set(dsp->ds_mch, dsp->ds_rx, dsp->ds_rx_arg))
+ return (ENOTSUP);
/*
- * Register resources.
+ * Register soft ring resources. This will come in handy later if
+ * the user decides to modify CPU bindings to use more CPUs for the
+ * device, in which case we will switch to fanout using soft rings.
*/
- mac_resource_set(mh, (mac_resource_add_t)pollp->dls_ring_add,
- (void *)pollp->dls_rx_handle);
-
- mac_resources(mh);
+ mac_resource_set_common(dsp->ds_mch,
+ (mac_resource_add_t)poll->poll_ring_add_cf,
+ (mac_resource_remove_t)poll->poll_ring_remove_cf,
+ (mac_resource_quiesce_t)poll->poll_ring_quiesce_cf,
+ (mac_resource_restart_t)poll->poll_ring_restart_cf,
+ (mac_resource_bind_t)poll->poll_ring_bind_cf,
+ poll->poll_ring_ch);
- /*
- * Set the upstream receive function.
- */
- dls_rx_set(dsp->ds_dc, (dls_rx_t)pollp->dls_rx,
- (void *)pollp->dls_rx_handle);
+ mac_client_poll_enable(dsp->ds_mch);
- /*
- * Note that polling is enabled. This prevents further DLIOCHDRINFO
- * ioctls from overwriting the receive function pointer.
- */
dsp->ds_polling = B_TRUE;
- return (B_TRUE);
+ return (0);
}
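
A hedged sketch of the upcall convention the comment above describes; the receive-function signature is an assumption modeled on mac_rx_t:

#include <sys/stream.h>
#include <sys/mac_client.h>

/*
 * Sketch only: how an SRS/soft ring delivers a chain upstream. The
 * function and 1st argument were registered at flow-add time; the
 * stored ill_rx_ring cookie rides along as the mac_resource_handle_t.
 */
static void
srs_deliver_sketch(void (*rx_func)(void *, mac_resource_handle_t,
    mblk_t *, boolean_t), void *rx_arg1,
    mac_resource_handle_t ill_rx_ring, mblk_t *chain)
{
	/* VNIC: rx_func == vnic_rx, rx_arg1 == (vnic_t *). */
	/* NIC:  rx_func == mac_rx_default, rx_arg1 == mac_handle_t. */
	rx_func(rx_arg1, ill_rx_ring, chain, B_FALSE);
}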
-static void
-proto_soft_ring_disable(dld_str_t *dsp)
+/* ARGSUSED */
+static int
+dld_capab_poll_disable(dld_str_t *dsp, dld_capab_poll_t *poll)
{
- ASSERT(RW_WRITE_HELD(&dsp->ds_lock));
+ if (!dsp->ds_polling)
+ return (EINVAL);
- if (!dsp->ds_soft_ring)
- return;
+ mac_client_poll_disable(dsp->ds_mch);
+ mac_resource_set(dsp->ds_mch, NULL, NULL);
- /*
- * It should be impossible to enable raw mode if soft_ring is turned on.
- */
- ASSERT(dsp->ds_mode != DLD_RAW);
- proto_change_soft_ring_fanout(dsp, SOFT_RING_NONE);
- /*
- * Note that fanout is disabled.
- */
- dsp->ds_soft_ring = B_FALSE;
+ dsp->ds_polling = B_FALSE;
+ return (0);
}
-static boolean_t
-proto_soft_ring_enable(dld_str_t *dsp, dl_capab_dls_t *soft_ringp)
+static int
+dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags)
{
- ASSERT(RW_WRITE_HELD(&dsp->ds_lock));
- ASSERT(!dsp->ds_soft_ring);
+ dld_capab_poll_t *poll = data;
- /*
- * We cannot enable soft_ring if raw mode
- * has been enabled.
- */
- if (dsp->ds_mode == DLD_RAW)
- return (B_FALSE);
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
- if (dls_soft_ring_enable(dsp->ds_dc, soft_ringp) == B_FALSE)
- return (B_FALSE);
+ switch (flags) {
+ case DLD_ENABLE:
+ return (dld_capab_poll_enable(dsp, poll));
+ case DLD_DISABLE:
+ return (dld_capab_poll_disable(dsp, poll));
+ }
+ return (ENOTSUP);
+}
- dsp->ds_soft_ring = B_TRUE;
- return (B_TRUE);
+static int
+dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags)
+{
+ dld_capab_lso_t *lso = data;
+
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+
+ switch (flags) {
+ case DLD_ENABLE: {
+ mac_capab_lso_t mac_lso;
+
+ /*
+ * Check if LSO is supported on this MAC & enable LSO
+ * accordingly.
+ */
+ if (mac_capab_get(dsp->ds_mh, MAC_CAPAB_LSO, &mac_lso)) {
+ lso->lso_max = mac_lso.lso_basic_tcp_ipv4.lso_max;
+ lso->lso_flags = 0;
+ /* translate the flag for mac clients */
+ if ((mac_lso.lso_flags & LSO_TX_BASIC_TCP_IPV4) != 0)
+ lso->lso_flags |= DLD_LSO_TX_BASIC_TCP_IPV4;
+ dsp->ds_lso = B_TRUE;
+ dsp->ds_lso_max = lso->lso_max;
+ } else {
+ dsp->ds_lso = B_FALSE;
+ dsp->ds_lso_max = 0;
+ return (ENOTSUP);
+ }
+ return (0);
+ }
+ case DLD_DISABLE: {
+ dsp->ds_lso = B_FALSE;
+ dsp->ds_lso_max = 0;
+ return (0);
+ }
+ }
+ return (ENOTSUP);
}
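
On the client side, the translated flag gates whether large sends may be handed down at all, and lso_max caps their size. A minimal hedged sketch, using only the fields and flag shown above:

#include <sys/dld.h>

/*
 * Sketch only: after a successful DLD_CAPAB_LSO enable, cap each
 * large TCP/IPv4 send at the advertised maximum; 0 means fall back
 * to MSS-sized packets.
 */
static size_t
client_lso_send_max(const dld_capab_lso_t *lso, size_t want)
{
	if ((lso->lso_flags & DLD_LSO_TX_BASIC_TCP_IPV4) == 0)
		return (0);
	return (want < lso->lso_max ? want : (size_t)lso->lso_max);
}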
-static void
-proto_change_soft_ring_fanout(dld_str_t *dsp, int type)
+static int
+dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags)
{
- dls_channel_t dc = dsp->ds_dc;
+ int err;
- if (type == SOFT_RING_NONE) {
- dls_rx_set(dc, (dsp->ds_mode == DLD_FASTPATH) ?
- dld_str_rx_fastpath : dld_str_rx_unitdata, dsp);
- } else if (type != SOFT_RING_NONE) {
- dls_rx_set(dc, (dls_rx_t)dls_soft_ring_fanout, dc);
+ /*
+ * Don't enable direct callback capabilities unless the caller is
+ * the IP client. When a module is inserted in a stream (_I_INSERT)
+ * the stack initiates capability disable, but due to races, the
+ * module insertion may complete before the capability disable
+ * completes. So we limit the check to the DLD_ENABLE case.
+ */
+ if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) &&
+ (dsp->ds_sap != ETHERTYPE_IP || !check_ip_above(dsp->ds_rq))) {
+ return (ENOTSUP);
}
+
+ switch (type) {
+ case DLD_CAPAB_DIRECT:
+ err = dld_capab_direct(dsp, data, flags);
+ break;
+
+ case DLD_CAPAB_POLL:
+ err = dld_capab_poll(dsp, data, flags);
+ break;
+
+ case DLD_CAPAB_PERIM:
+ err = dld_capab_perim(dsp, data, flags);
+ break;
+
+ case DLD_CAPAB_LSO:
+ err = dld_capab_lso(dsp, data, flags);
+ break;
+
+ default:
+ err = ENOTSUP;
+ break;
+ }
+
+ return (err);
}
/*
* DL_CAPABILITY_ACK/DL_ERROR_ACK
*/
-static boolean_t
+static void
proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
{
dl_capability_ack_t *dlap;
dl_capability_sub_t *dlsp;
size_t subsize;
- dl_capab_dls_t poll;
- dl_capab_dls_t soft_ring;
+ dl_capab_dld_t dld;
dl_capab_hcksum_t hcksum;
- dl_capab_lso_t lso;
dl_capab_zerocopy_t zcopy;
uint8_t *ptr;
queue_t *q = dsp->ds_wq;
mblk_t *mp1;
- boolean_t is_vlan = (dsp->ds_vid != VLAN_ID_NONE);
- boolean_t poll_capable = B_FALSE;
- boolean_t soft_ring_capable = B_FALSE;
+ boolean_t is_vlan;
boolean_t hcksum_capable = B_FALSE;
boolean_t zcopy_capable = B_FALSE;
- boolean_t lso_capable = B_FALSE;
- mac_capab_lso_t mac_lso;
-
- ASSERT(RW_WRITE_HELD(&dsp->ds_lock));
+ boolean_t dld_capable = B_FALSE;
/*
* Initially assume no capabilities.
*/
subsize = 0;
-
- /*
- * Check if soft ring can be enabled on this interface. Note that we
- * do not enable softring on any legacy drivers, because doing that
- * would hurt the performance if the legacy driver has its own taskq
- * implementation. Further, most high-performance legacy drivers do
- * have their own taskq implementation.
- *
- * If advertising DL_CAPAB_SOFT_RING has not been explicitly disabled,
- * reserve space for that capability.
- */
- if (!mac_is_legacy(dsp->ds_mh) && !(dld_opt & DLD_OPT_NO_SOFTRING)) {
- soft_ring_capable = B_TRUE;
- subsize += sizeof (dl_capability_sub_t) +
- sizeof (dl_capab_dls_t);
- }
-
- /*
- * Check if polling can be enabled on this interface.
- * If advertising DL_CAPAB_POLL has not been explicitly disabled
- * then reserve space for that capability.
- */
- if (mac_capab_get(dsp->ds_mh, MAC_CAPAB_POLL, NULL) &&
- !(dld_opt & DLD_OPT_NO_POLL) && !is_vlan) {
- poll_capable = B_TRUE;
- subsize += sizeof (dl_capability_sub_t) +
- sizeof (dl_capab_dls_t);
- }
+ is_vlan = (mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE);
/*
* Check if checksum offload is supported on this MAC. Don't
@@ -1652,16 +1581,6 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
}
/*
- * Check if LSO is supported on this MAC, then reserve space for
- * the DL_CAPAB_LSO capability.
- */
- if (mac_capab_get(dsp->ds_mh, MAC_CAPAB_LSO, &mac_lso)) {
- lso_capable = B_TRUE;
- subsize += sizeof (dl_capability_sub_t) +
- sizeof (dl_capab_lso_t);
- }
-
- /*
* Check if zerocopy is supported on this interface.
* If advertising DL_CAPAB_ZEROCOPY has not been explicitly disabled
* then reserve space for that capability.
@@ -1674,14 +1593,22 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
}
/*
+ * Direct capability negotiation interface between IP and DLD
+ */
+ if (dsp->ds_sap == ETHERTYPE_IP && check_ip_above(dsp->ds_rq)) {
+ dld_capable = B_TRUE;
+ subsize += sizeof (dl_capability_sub_t) +
+ sizeof (dl_capab_dld_t);
+ }
+
+ /*
* If there are no capabilities to advertise or if we
* can't allocate a response, send a DL_ERROR_ACK.
*/
if ((mp1 = reallocb(mp,
sizeof (dl_capability_ack_t) + subsize, 0)) == NULL) {
- rw_exit(&dsp->ds_lock);
dlerrorack(q, mp, DL_CAPABILITY_REQ, DL_NOTSUPPORTED, 0);
- return (B_FALSE);
+ return;
}
mp = mp1;
@@ -1695,56 +1622,6 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
ptr = (uint8_t *)&dlap[1];
/*
- * IP polling interface.
- */
- if (poll_capable) {
- /*
- * Attempt to disable just in case this is a re-negotiation;
- * READER lock is enough because ds_polling can only be
- * changed as the result of non-data message processing.
- */
- proto_poll_disable(dsp);
-
- dlsp = (dl_capability_sub_t *)ptr;
-
- dlsp->dl_cap = DL_CAPAB_POLL;
- dlsp->dl_length = sizeof (dl_capab_dls_t);
- ptr += sizeof (dl_capability_sub_t);
-
- bzero(&poll, sizeof (dl_capab_dls_t));
- poll.dls_version = POLL_VERSION_1;
- poll.dls_flags = POLL_CAPABLE;
- poll.dls_tx_handle = (uintptr_t)dsp;
- poll.dls_tx = (uintptr_t)str_mdata_fastpath_put;
- dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq);
- bcopy(&poll, ptr, sizeof (dl_capab_dls_t));
- ptr += sizeof (dl_capab_dls_t);
- }
-
-
- if (soft_ring_capable) {
- dlsp = (dl_capability_sub_t *)ptr;
-
- dlsp->dl_cap = DL_CAPAB_SOFT_RING;
- dlsp->dl_length = sizeof (dl_capab_dls_t);
- ptr += sizeof (dl_capability_sub_t);
-
- bzero(&soft_ring, sizeof (dl_capab_dls_t));
- soft_ring.dls_version = SOFT_RING_VERSION_1;
- soft_ring.dls_flags = SOFT_RING_CAPABLE;
- soft_ring.dls_tx_handle = (uintptr_t)dsp;
- soft_ring.dls_tx = (uintptr_t)str_mdata_fastpath_put;
- soft_ring.dls_ring_change_status =
- (uintptr_t)proto_change_soft_ring_fanout;
- soft_ring.dls_ring_bind = (uintptr_t)soft_ring_bind;
- soft_ring.dls_ring_unbind = (uintptr_t)soft_ring_unbind;
-
- dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq);
- bcopy(&soft_ring, ptr, sizeof (dl_capab_dls_t));
- ptr += sizeof (dl_capab_dls_t);
- }
-
- /*
* TCP/IP checksum offload.
*/
if (hcksum_capable) {
@@ -1761,32 +1638,6 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
}
/*
- * Large segment offload. (LSO)
- */
- if (lso_capable) {
- dlsp = (dl_capability_sub_t *)ptr;
-
- dlsp->dl_cap = DL_CAPAB_LSO;
- dlsp->dl_length = sizeof (dl_capab_lso_t);
- ptr += sizeof (dl_capability_sub_t);
-
- lso.lso_version = LSO_VERSION_1;
- lso.lso_flags = mac_lso.lso_flags;
- lso.lso_max = mac_lso.lso_basic_tcp_ipv4.lso_max;
-
- /* Simply enable LSO with DLD */
- dsp->ds_lso = B_TRUE;
- dsp->ds_lso_max = lso.lso_max;
-
- dlcapabsetqid(&(lso.lso_mid), dsp->ds_rq);
- bcopy(&lso, ptr, sizeof (dl_capab_lso_t));
- ptr += sizeof (dl_capab_lso_t);
- } else {
- dsp->ds_lso = B_FALSE;
- dsp->ds_lso_max = 0;
- }
-
- /*
* Zero copy
*/
if (zcopy_capable) {
@@ -1805,11 +1656,28 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
ptr += sizeof (dl_capab_zerocopy_t);
}
- ASSERT(ptr == mp->b_rptr + sizeof (dl_capability_ack_t) + subsize);
+ /*
+ * Direct capability negotiation interface between IP and DLD.
+ * Refer to dld.h for details.
+ */
+ if (dld_capable) {
+ dlsp = (dl_capability_sub_t *)ptr;
+ dlsp->dl_cap = DL_CAPAB_DLD;
+ dlsp->dl_length = sizeof (dl_capab_dld_t);
+ ptr += sizeof (dl_capability_sub_t);
- rw_exit(&dsp->ds_lock);
+ bzero(&dld, sizeof (dl_capab_dld_t));
+ dld.dld_version = DLD_CURRENT_VERSION;
+ dld.dld_capab = (uintptr_t)dld_capab;
+ dld.dld_capab_handle = (uintptr_t)dsp;
+
+ dlcapabsetqid(&(dld.dld_mid), dsp->ds_rq);
+ bcopy(&dld, ptr, sizeof (dl_capab_dld_t));
+ ptr += sizeof (dl_capab_dld_t);
+ }
+
+ ASSERT(ptr == mp->b_rptr + sizeof (dl_capability_ack_t) + subsize);
qreply(q, mp);
- return (B_TRUE);
}
/*
@@ -1819,8 +1687,5 @@ void
dld_capabilities_disable(dld_str_t *dsp)
{
if (dsp->ds_polling)
- proto_poll_disable(dsp);
-
- if (dsp->ds_soft_ring)
- proto_soft_ring_disable(dsp);
+ (void) dld_capab_poll_disable(dsp, NULL);
}
diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c
index 8694b9d6c4..cf7e7010dc 100644
--- a/usr/src/uts/common/io/dld/dld_str.c
+++ b/usr/src/uts/common/io/dld/dld_str.c
@@ -27,17 +27,17 @@
* Data-Link Driver
*/
+#include <inet/common.h>
+#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
-#include <sys/strsubr.h>
-#include <sys/atomic.h>
-#include <sys/disp.h>
-#include <sys/callb.h>
#include <sys/vlan.h>
-#include <sys/dld.h>
#include <sys/dld_impl.h>
-#include <sys/dls_impl.h>
-#include <inet/common.h>
+#include <sys/cpuvar.h>
+#include <sys/callb.h>
+#include <sys/list.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
static int str_constructor(void *, void *, int);
static void str_destructor(void *, void *);
@@ -49,111 +49,80 @@ static void str_notify_link_up(dld_str_t *);
static void str_notify_link_down(dld_str_t *);
static void str_notify_capab_reneg(dld_str_t *);
static void str_notify_speed(dld_str_t *, uint32_t);
-static void str_notify(void *, mac_notify_type_t);
static void ioc_native(dld_str_t *, mblk_t *);
static void ioc_margin(dld_str_t *, mblk_t *);
static void ioc_raw(dld_str_t *, mblk_t *);
static void ioc_fast(dld_str_t *, mblk_t *);
static void ioc(dld_str_t *, mblk_t *);
-static void dld_tx_enqueue(dld_str_t *, mblk_t *, mblk_t *, boolean_t,
- uint_t, uint_t);
+static void dld_ioc(dld_str_t *, mblk_t *);
static void dld_wput_nondata(dld_str_t *, mblk_t *);
-static void dld_wput_nondata_task(void *);
-static void dld_flush_nondata(dld_str_t *);
+
+static void str_mdata_raw_put(dld_str_t *, mblk_t *);
static mblk_t *i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t);
static mblk_t *i_dld_ether_header_strip_tag(mblk_t *);
static uint32_t str_count;
static kmem_cache_t *str_cachep;
-static taskq_t *dld_disp_taskq = NULL;
static mod_hash_t *str_hashp;
#define STR_HASHSZ 64
#define STR_HASH_KEY(key) ((mod_hash_key_t)(uintptr_t)(key))
-static inline uint_t mp_getsize(mblk_t *);
+#define dld_taskq system_taskq
-/*
- * Interval to count the TX queued depth. Default is 1s (1000000us).
- * Count the queue depth immediately (not by timeout) if this is set to 0.
- * See more details above dld_tx_enqueue().
- */
-uint_t tx_qdepth_interval = 1000000;
+static kmutex_t dld_taskq_lock;
+static kcondvar_t dld_taskq_cv;
+static list_t dld_taskq_list; /* List of dld_str_t */
+boolean_t dld_taskq_quit;
+boolean_t dld_taskq_done;
+
+static void dld_taskq_dispatch(void);
/*
- * Some notes on entry points, flow-control, queueing and locking:
+ * Some notes on entry points, flow-control and queueing:
*
* This driver exports the traditional STREAMS put entry point as well as
* the non-STREAMS fast-path transmit routine which is provided to IP via
* the DL_CAPAB_POLL negotiation. The put procedure handles all control
* and data operations, while the fast-path routine deals only with M_DATA
* fast-path packets. Regardless of the entry point, all outbound packets
- * will end up in dld_tx_single(), where they will be delivered to the MAC
- * driver.
+ * will end up in DLD_TX(), where they will be delivered to the MAC layer.
*
- * The transmit logic operates in two modes: a "not busy" mode where the
- * packets will be delivered to the MAC for a send attempt, or "busy" mode
- * where they will be enqueued in the internal queue because of flow-control.
- * Flow-control happens when the MAC driver indicates the packets couldn't
- * be transmitted due to lack of resources (e.g. running out of descriptors).
- * In such case, the driver will place a dummy message on its write-side
- * STREAMS queue so that the queue is marked as "full". Any subsequent
- * packets arriving at the driver will be enqueued in the internal queue,
- * which is drained in the context of the service thread that gets scheduled
- * whenever the driver is in the "busy" mode. When all packets have been
- * successfully delivered by MAC and the internal queue is empty, it will
- * transition to the "not busy" mode by removing the dummy message from the
- * write-side STREAMS queue; in effect this will trigger backenabling.
- * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
- * to the above reasons.
+ * The transmit logic operates in the following way: All packets coming
+ * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
+ * happens when the MAC layer indicates the packets couldn't be
+ * transmitted due to 1) lack of resources (e.g. running out of
+ * descriptors), or 2) reaching the allowed bandwidth limit for this
+ * particular flow. The indication comes in the form of a Tx cookie that
+ * identifies the blocked ring. In that case, DLD will place a
+ * dummy message on its write-side STREAMS queue so that the queue is
+ * marked as "full". Any subsequent packets arriving at the driver will
+ * still be sent to the MAC layer, where they either get queued in the Tx
+ * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
+ * queue gets enabled when the MAC layer notifies DLD through MAC_NOTE_TX.
+ * When the write service procedure runs, it will remove the dummy
+ * message from the write-side STREAMS queue; in effect this will trigger
+ * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
+ * respectively, due to the above reasons.
*
- * The driver implements an internal transmit queue independent of STREAMS.
- * This allows for flexibility and provides a fast enqueue/dequeue mechanism
- * compared to the putq() and get() STREAMS interfaces. The only putq() and
- * getq() operations done by the driver are those related to placing and
- * removing the dummy message to/from the write-side STREAMS queue for flow-
- * control purposes.
+ * All non-data operations, both DLPI and ioctls, are single-threaded on a
+ * per dld_str_t endpoint basis. This is done using a taskq so that the
+ * control operation has kernel context and can cv_wait for resources. In
+ * addition, all set-type operations that involve mac-level state
+ * modification are serialized on a per mac endpoint basis using the
+ * perimeter mechanism provided by the mac layer. This serializes all mac
+ * clients trying to modify a single mac endpoint, treating the entire
+ * sequence of mac calls made by that client as an atomic unit. The mac
+ * framework locking is described in mac.c. A critical element is that
+ * DLD/DLS does not hold any locks across the mac perimeter.
*
- * Locking is done independent of STREAMS due to the driver being fully MT.
- * Threads entering the driver (either from put or service entry points)
- * will most likely be readers, with the exception of a few writer cases
- * such those handling DLPI attach/detach/bind/unbind/etc. or any of the
- * DLD-related ioctl requests. The DLPI detach case is special, because
- * it involves freeing resources and therefore must be single-threaded.
- * Unfortunately the readers/writers lock can't be used to protect against
- * it, because the lock is dropped prior to the driver calling places where
- * putnext() may be invoked, and such places may depend on those resources
- * to exist. Because of this, the driver always completes the DLPI detach
- * process when there are no other threads running in the driver. This is
- * done by keeping track of the number of threads, such that the the last
- * thread leaving the driver will finish the pending DLPI detach operation.
- */
-
-/*
- * dld_max_q_count is the queue depth threshold used to limit the number of
- * outstanding packets or bytes allowed in the queue; once this limit is
- * reached the driver will free any incoming ones until the queue depth
- * drops below the threshold.
- *
- * This buffering is provided to accomodate clients which do not employ
- * their own buffering scheme, and to handle occasional packet bursts.
- * Clients which handle their own buffering will receive positive feedback
- * from this driver as soon as it transitions into the "busy" state, i.e.
- * when the queue is initially filled up; they will get backenabled once
- * the queue is empty.
- *
- * The value chosen here is rather arbitrary; in future some intelligent
- * heuristics may be involved which could take into account the hardware's
- * transmit ring size, etc.
- */
-uint_t dld_max_q_count = (16 * 1024 *1024);
-
-/*
* dld_finddevinfo() returns the dev_info_t * corresponding to a particular
* dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
* match dev_t. If a stream is found and it is attached, its dev_info_t *
- * is returned.
+ * is returned. If the mac handle is non-null, it can be safely accessed
+ * below. The mac handle won't be freed until mac_unregister() is called,
+ * which won't happen until the driver detaches. The DDI framework ensures
+ * that the detach won't happen while a getinfo is in progress.
*/
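The flow-control scheme in the comment above can be pictured with a small userland analogue. This is only a sketch with hypothetical names: the real driver never blocks the sender (later packets still reach the Tx SRS), but the full/backenable handshake has the same shape.

	#include <pthread.h>
	#include <stdbool.h>

	typedef struct {
		pthread_mutex_t	lock;
		pthread_cond_t	backenable;	/* stands in for MAC_NOTE_TX */
		bool		qfull;		/* stands in for the dummy mblk */
	} endpoint_t;

	/* Stand-in for DLD_TX(): a non-NULL cookie means the Tx ring blocked. */
	extern void *tx_try(endpoint_t *, void *);

	static void
	send_pkt(endpoint_t *ep, void *pkt)
	{
		if (tx_try(ep, pkt) != NULL) {
			/* DLD_SETQFULL analogue: mark full, wait for backenable. */
			pthread_mutex_lock(&ep->lock);
			ep->qfull = true;
			while (ep->qfull)
				pthread_cond_wait(&ep->backenable, &ep->lock);
			pthread_mutex_unlock(&ep->lock);
		}
	}

	/* MAC_NOTE_TX analogue: the blocked ring drained, so backenable. */
	static void
	note_tx(endpoint_t *ep)
	{
		pthread_mutex_lock(&ep->lock);
		ep->qfull = false;		/* DLD_CLRQFULL analogue */
		pthread_cond_broadcast(&ep->backenable);
		pthread_mutex_unlock(&ep->lock);
	}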
typedef struct i_dld_str_state_s {
major_t ds_major;
@@ -167,35 +136,31 @@ i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
i_dld_str_state_t *statep = arg;
dld_str_t *dsp = (dld_str_t *)val;
+ mac_handle_t mh;
if (statep->ds_major != dsp->ds_major)
return (MH_WALK_CONTINUE);
ASSERT(statep->ds_minor != 0);
+ mh = dsp->ds_mh;
- /*
- * Access to ds_mh needs to be protected by ds_lock.
- */
- rw_enter(&dsp->ds_lock, RW_READER);
if (statep->ds_minor == dsp->ds_minor) {
/*
* Clone: a clone minor is unique. we can terminate the
* walk if we find a matching stream -- even if we fail
* to obtain the devinfo.
*/
- if (dsp->ds_mh != NULL)
- statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
- rw_exit(&dsp->ds_lock);
+ if (mh != NULL)
+ statep->ds_dip = mac_devinfo_get(mh);
return (MH_WALK_TERMINATE);
}
- rw_exit(&dsp->ds_lock);
return (MH_WALK_CONTINUE);
}
static dev_info_t *
dld_finddevinfo(dev_t dev)
{
- dev_info_t *dip;
+ dev_info_t *dip;
i_dld_str_state_t state;
if (getminor(dev) == 0)
@@ -204,7 +169,7 @@ dld_finddevinfo(dev_t dev)
/*
* See if it's a minor node of a link
*/
- if ((dip = dls_finddevinfo(dev)) != NULL)
+ if ((dip = dls_link_devinfo(dev)) != NULL)
return (dip);
state.ds_minor = getminor(dev);
@@ -319,11 +284,24 @@ dld_close(queue_t *rq)
dld_str_t *dsp = rq->q_ptr;
/*
+ * All modules on top have been popped off, so there can't be any
+ * threads coming in from above.
+ */
+ ASSERT(dsp->ds_datathr_cnt == 0);
+
+ /*
+ * Wait until pending DLPI requests are processed.
+ */
+ mutex_enter(&dsp->ds_lock);
+ while (dsp->ds_dlpi_pending)
+ cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
+ mutex_exit(&dsp->ds_lock);
+
+ /*
* Disable the queue srv(9e) routine.
*/
qprocsoff(rq);
- dld_finish_pending_task(dsp);
/*
* This stream was open to a provider node. Check to see
@@ -348,58 +326,55 @@ dld_close(queue_t *rq)
void
dld_wput(queue_t *wq, mblk_t *mp)
{
- dld_str_t *dsp = wq->q_ptr;
+ dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
+ dld_str_mode_t mode;
switch (DB_TYPE(mp)) {
- case M_DATA: {
- dld_tx_t tx;
-
- DLD_TX_ENTER(dsp);
- if ((tx = dsp->ds_tx) != NULL)
- tx(dsp, mp);
- else
- freemsg(mp);
- DLD_TX_EXIT(dsp);
+ case M_DATA:
+ mutex_enter(&dsp->ds_lock);
+ if (dsp->ds_dlstate == DL_IDLE) {
+ mode = dsp->ds_mode;
+ if (mode == DLD_FASTPATH || mode == DLD_RAW) {
+ DLD_DATATHR_INC(dsp);
+ mutex_exit(&dsp->ds_lock);
+ if (mode == DLD_FASTPATH) {
+ (void) str_mdata_fastpath_put(dsp, mp,
+ 0, 0);
+ } else {
+ str_mdata_raw_put(dsp, mp);
+ }
+ DLD_DATATHR_DCR(dsp);
+ break;
+ }
+ }
+ mutex_exit(&dsp->ds_lock);
+ freemsg(mp);
break;
- }
+
case M_PROTO:
case M_PCPROTO: {
t_uscalar_t prim;
- dld_tx_t tx;
- if (MBLKL(mp) < sizeof (t_uscalar_t)) {
- freemsg(mp);
- return;
- }
+ if (MBLKL(mp) < sizeof (t_uscalar_t))
+ break;
prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
- if (prim != DL_UNITDATA_REQ) {
- /* Control path */
+
+ if (prim == DL_UNITDATA_REQ) {
+ proto_unitdata_req(dsp, mp);
+ } else {
dld_wput_nondata(dsp, mp);
- break;
}
-
- /* Data path */
- DLD_TX_ENTER(dsp);
- if ((tx = dsp->ds_unitdata_tx) != NULL)
- tx(dsp, mp);
- else
- dlerrorack(wq, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0);
- DLD_TX_EXIT(dsp);
break;
}
+
case M_IOCTL:
- case M_IOCDATA:
- /* Control path */
dld_wput_nondata(dsp, mp);
break;
+
case M_FLUSH:
- /*
- * Flush both the data messages and the control messages.
- */
if (*mp->b_rptr & FLUSHW) {
- dld_flush_nondata(dsp);
- dld_tx_flush(dsp);
+ DLD_CLRQFULL(dsp);
*mp->b_rptr &= ~FLUSHW;
}
@@ -409,6 +384,7 @@ dld_wput(queue_t *wq, mblk_t *mp)
freemsg(mp);
}
break;
+
default:
freemsg(mp);
break;
@@ -416,122 +392,14 @@ dld_wput(queue_t *wq, mblk_t *mp)
}
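dld_wput() above increments a data-thread count before dropping ds_lock so that close and detach can wait out in-flight senders (see the ASSERTs on ds_datathr_cnt elsewhere in this patch). Below is a minimal userland sketch of that count-plus-condvar pattern, with hypothetical names:

	#include <pthread.h>

	typedef struct {
		pthread_mutex_t	lock;
		pthread_cond_t	cv;
		int		datathr_cnt;	/* threads in the data path */
	} str_t;

	static void
	data_enter(str_t *sp)
	{
		pthread_mutex_lock(&sp->lock);
		sp->datathr_cnt++;		/* DLD_DATATHR_INC analogue */
		pthread_mutex_unlock(&sp->lock);
	}

	static void
	data_exit(str_t *sp)
	{
		pthread_mutex_lock(&sp->lock);
		if (--sp->datathr_cnt == 0)	/* DLD_DATATHR_DCR analogue */
			pthread_cond_broadcast(&sp->cv);
		pthread_mutex_unlock(&sp->lock);
	}

	/* Close/detach side: wait until no data threads remain. */
	static void
	quiesce(str_t *sp)
	{
		pthread_mutex_lock(&sp->lock);
		while (sp->datathr_cnt != 0)
			pthread_cond_wait(&sp->cv, &sp->lock);
		pthread_mutex_unlock(&sp->lock);
	}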
/*
- * Called by GLDv3 control node to process the ioctls. It will start
- * a taskq to allow the ioctl processing to block. This is a temporary
- * solution, and will be replaced by a more graceful approach afterwards.
- */
-void
-dld_ioctl(queue_t *wq, mblk_t *mp)
-{
- dld_wput_nondata(wq->q_ptr, mp);
-}
-
-/*
* qi_srvp: srv(9e)
*/
void
dld_wsrv(queue_t *wq)
{
- mblk_t *mp, *head, *tail;
dld_str_t *dsp = wq->q_ptr;
- uint_t cnt, msgcnt;
- timeout_id_t tid = 0;
-
- rw_enter(&dsp->ds_lock, RW_READER);
- /*
- * Grab all packets (chained via b_next) off our transmit queue
- * and try to send them all to the MAC layer. Since the queue
- * is independent of streams, we are able to dequeue all messages
- * at once without looping through getq() and manually chaining
- * them. Note that the queue size parameters (byte and message
- * counts) are cleared as well, but we postpone the backenabling
- * until after the MAC transmit since some packets may end up
- * back at our transmit queue.
- */
- mutex_enter(&dsp->ds_tx_list_lock);
- if ((mp = dsp->ds_tx_list_head) == NULL) {
- ASSERT(!dsp->ds_tx_qbusy);
- ASSERT(dsp->ds_tx_flow_mp != NULL);
- ASSERT(dsp->ds_tx_list_head == NULL);
- ASSERT(dsp->ds_tx_list_tail == NULL);
- ASSERT(dsp->ds_tx_cnt == 0);
- ASSERT(dsp->ds_tx_msgcnt == 0);
- mutex_exit(&dsp->ds_tx_list_lock);
- rw_exit(&dsp->ds_lock);
- return;
- }
- head = mp;
- tail = dsp->ds_tx_list_tail;
- dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
- cnt = dsp->ds_tx_cnt;
- msgcnt = dsp->ds_tx_msgcnt;
- dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
- mutex_exit(&dsp->ds_tx_list_lock);
-
- /*
- * Discard packets unless we are attached and bound; note that
- * the driver mode (fastpath/raw/unitdata) is irrelevant here,
- * because regardless of the mode all transmit will end up in
- * dld_tx_single() where the packets may be queued.
- */
- ASSERT((DB_TYPE(mp) == M_DATA) || (DB_TYPE(mp) == M_MULTIDATA));
- if (dsp->ds_dlstate != DL_IDLE) {
- freemsgchain(mp);
- goto done;
- }
-
- /*
- * Attempt to transmit one or more packets. If the MAC can't
- * send them all, re-queue the packet(s) at the beginning of
- * the transmit queue to avoid any re-ordering.
- */
- mp = dls_tx(dsp->ds_dc, mp);
- if (mp == head) {
- /*
- * No message was sent out. Take the saved the queue depth
- * as the input, so that dld_tx_enqueue() need not to
- * calculate it again.
- */
- dld_tx_enqueue(dsp, mp, tail, B_TRUE, msgcnt, cnt);
- } else if (mp != NULL) {
- /*
- * Some but not all messages were sent out. dld_tx_enqueue()
- * needs to start the timer to calculate the queue depth if
- * timer has not been started.
- *
- * Note that a timer is used to calculate the queue depth
- * to improve network performance, especially for TCP, in
- * which case packets are sent without canput() being checked,
- * and mostly end up in dld_tx_enqueue() under heavy load.
- */
- dld_tx_enqueue(dsp, mp, tail, B_TRUE, 0, 0);
- }
-
-done:
- /*
- * Grab the list lock again and check if the transmit queue is
- * really empty; if so, lift up flow-control and backenable any
- * writer queues. If the queue is not empty, schedule service
- * thread to drain it.
- */
- mutex_enter(&dsp->ds_tx_list_lock);
- if (dsp->ds_tx_list_head == NULL) {
- dsp->ds_tx_flow_mp = getq(wq);
- ASSERT(dsp->ds_tx_flow_mp != NULL);
- dsp->ds_tx_qbusy = B_FALSE;
- if ((tid = dsp->ds_tx_qdepth_tid) != 0)
- dsp->ds_tx_qdepth_tid = 0;
- }
- mutex_exit(&dsp->ds_tx_list_lock);
-
- /*
- * Note that ds_tx_list_lock (which is acquired by the timeout
- * callback routine) cannot be held across the call to untimeout().
- */
- if (tid != 0)
- (void) untimeout(tid);
- rw_exit(&dsp->ds_lock);
+ DLD_CLRQFULL(dsp);
}
void
@@ -602,12 +470,6 @@ dld_str_init(void)
ASSERT(str_cachep != NULL);
/*
- * Create taskq to process DLPI requests.
- */
- dld_disp_taskq = taskq_create("dld_disp_taskq", 1024, MINCLSYSPRI, 2,
- INT_MAX, TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
-
- /*
* Create a hash table for maintaining dld_str_t's.
* The ds_minor field (the clone minor number) of a dld_str_t
* is used as a key for this hash table because this number is
@@ -615,6 +477,16 @@ dld_str_init(void)
*/
str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
mod_hash_null_valdtor);
+
+ mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
+
+ dld_taskq_quit = B_FALSE;
+ dld_taskq_done = B_FALSE;
+ list_create(&dld_taskq_list, sizeof (dld_str_t),
+ offsetof(dld_str_t, ds_tqlist));
+ (void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
+ &p0, TS_RUN, minclsyspri);
}
/*
@@ -629,10 +501,16 @@ dld_str_fini(void)
if (str_count != 0)
return (EBUSY);
- ASSERT(dld_disp_taskq != NULL);
- taskq_destroy(dld_disp_taskq);
- dld_disp_taskq = NULL;
-
+ /*
+ * Ask the dld_taskq thread to quit and wait for it to be done.
+ */
+ mutex_enter(&dld_taskq_lock);
+ dld_taskq_quit = B_TRUE;
+ cv_signal(&dld_taskq_cv);
+ while (!dld_taskq_done)
+ cv_wait(&dld_taskq_cv, &dld_taskq_lock);
+ mutex_exit(&dld_taskq_lock);
+ list_destroy(&dld_taskq_list);
/*
* Destroy object cache.
*/
@@ -668,7 +546,6 @@ dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
dsp->ds_type = type;
dsp->ds_major = major;
dsp->ds_style = style;
- dsp->ds_tx = dsp->ds_unitdata_tx = NULL;
/*
* Initialize the queue pointers.
@@ -690,20 +567,6 @@ dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
return (dsp);
}
-void
-dld_finish_pending_task(dld_str_t *dsp)
-{
- /*
- * Wait until the pending requests are processed by the worker thread.
- */
- mutex_enter(&dsp->ds_disp_lock);
- dsp->ds_closing = B_TRUE;
- while (dsp->ds_tid != NULL)
- cv_wait(&dsp->ds_disp_cv, &dsp->ds_disp_lock);
- dsp->ds_closing = B_FALSE;
- mutex_exit(&dsp->ds_disp_lock);
-}
-
/*
* Destroy a dld_str_t object.
*/
@@ -713,30 +576,29 @@ dld_str_destroy(dld_str_t *dsp)
queue_t *rq;
queue_t *wq;
mod_hash_val_t val;
+
/*
* Clear the queue pointers.
*/
rq = dsp->ds_rq;
wq = dsp->ds_wq;
ASSERT(wq == WR(rq));
-
rq->q_ptr = wq->q_ptr = NULL;
dsp->ds_rq = dsp->ds_wq = NULL;
- ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
- ASSERT(dsp->ds_tx_list_head == NULL);
- ASSERT(dsp->ds_tx_list_tail == NULL);
- ASSERT(dsp->ds_tx_cnt == 0);
- ASSERT(dsp->ds_tx_msgcnt == 0);
- ASSERT(dsp->ds_tx_qdepth_tid == 0);
- ASSERT(!dsp->ds_tx_qbusy);
+ ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
+ ASSERT(dsp->ds_sap == 0);
+ ASSERT(dsp->ds_mh == NULL);
+ ASSERT(dsp->ds_mch == NULL);
+ ASSERT(dsp->ds_promisc == 0);
+ ASSERT(dsp->ds_mph == NULL);
+ ASSERT(dsp->ds_mip == NULL);
+ ASSERT(dsp->ds_mnh == NULL);
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
- ASSERT(dsp->ds_pending_head == NULL);
- ASSERT(dsp->ds_pending_tail == NULL);
- ASSERT(dsp->ds_tx == NULL);
- ASSERT(dsp->ds_unitdata_tx == NULL);
+ ASSERT(dsp->ds_polling == B_FALSE);
+ ASSERT(dsp->ds_direct == B_FALSE);
+ ASSERT(dsp->ds_lso == B_FALSE);
+ ASSERT(dsp->ds_lso_max == 0);
/*
* Reinitialize all the flags.
@@ -746,6 +608,18 @@ dld_str_destroy(dld_str_t *dsp)
dsp->ds_mode = DLD_UNITDATA;
dsp->ds_native = B_FALSE;
+ ASSERT(dsp->ds_datathr_cnt == 0);
+ ASSERT(dsp->ds_pending_head == NULL);
+ ASSERT(dsp->ds_pending_tail == NULL);
+ ASSERT(!dsp->ds_dlpi_pending);
+
+ ASSERT(dsp->ds_dlp == NULL);
+ ASSERT(dsp->ds_dmap == NULL);
+ ASSERT(dsp->ds_rx == NULL);
+ ASSERT(dsp->ds_rx_arg == NULL);
+ ASSERT(dsp->ds_next == NULL);
+ ASSERT(dsp->ds_head == NULL);
+
/*
* Free the dummy mblk if exists.
*/
@@ -786,12 +660,9 @@ str_constructor(void *buf, void *cdrarg, int kmflags)
*/
dsp->ds_dlstate = DL_UNATTACHED;
- rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
- mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
- mutex_init(&dsp->ds_disp_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&dsp->ds_disp_cv, NULL, CV_DRIVER, NULL);
- mutex_init(&dsp->ds_tx_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&dsp->ds_tx_cv, NULL, CV_DRIVER, NULL);
+ mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
+ cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
return (0);
}
@@ -806,78 +677,20 @@ str_destructor(void *buf, void *cdrarg)
dld_str_t *dsp = buf;
/*
- * Make sure the DLPI state machine was reset.
- */
- ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
-
- /*
- * Make sure the data-link interface was closed.
- */
- ASSERT(dsp->ds_mh == NULL);
- ASSERT(dsp->ds_dc == NULL);
- ASSERT(dsp->ds_tx == NULL);
- ASSERT(dsp->ds_unitdata_tx == NULL);
- ASSERT(dsp->ds_intx_cnt == 0);
- ASSERT(dsp->ds_detaching == B_FALSE);
-
- /*
- * Make sure enabled notifications are cleared.
- */
- ASSERT(dsp->ds_notifications == 0);
-
- /*
- * Make sure polling is disabled.
- */
- ASSERT(!dsp->ds_polling);
-
- /*
* Release the minor number.
*/
mac_minor_rele(dsp->ds_minor);
- ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
- rw_destroy(&dsp->ds_lock);
-
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
- mutex_destroy(&dsp->ds_tx_list_lock);
ASSERT(dsp->ds_tx_flow_mp == NULL);
- ASSERT(dsp->ds_pending_head == NULL);
- ASSERT(dsp->ds_pending_tail == NULL);
- ASSERT(!dsp->ds_closing);
-
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
- mutex_destroy(&dsp->ds_disp_lock);
- cv_destroy(&dsp->ds_disp_cv);
-
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_lock));
- mutex_destroy(&dsp->ds_tx_lock);
- cv_destroy(&dsp->ds_tx_cv);
-}
-
-void
-dld_tx_single(dld_str_t *dsp, mblk_t *mp)
-{
- /*
- * If we are busy enqueue the packet and return.
- * Otherwise hand them over to the MAC driver for transmission.
- * If the message didn't get sent it will be queued.
- *
- * Note here that we don't grab the list lock prior to checking
- * the busy flag. This is okay, because a missed transition
- * will not cause any packet reordering for any particular TCP
- * connection (which is single-threaded). The enqueue routine
- * will atomically set the busy flag and schedule the service
- * thread to run; the flag is only cleared by the service thread
- * when there is no more packet to be transmitted.
- */
- if (dsp->ds_tx_qbusy || ((mp = dls_tx(dsp->ds_dc, mp)) != NULL))
- dld_tx_enqueue(dsp, mp, mp, B_FALSE, 1, mp_getsize(mp));
+ mutex_destroy(&dsp->ds_lock);
+ cv_destroy(&dsp->ds_datathr_cv);
+ cv_destroy(&dsp->ds_dlpi_pending_cv);
}
/*
* Update the priority bits and VID (may need to insert tag if mp points
- * to an untagged packet).
+ * to an untagged packet).
* If vid is VLAN_ID_NONE, use the VID encoded in the packet.
*/
static mblk_t *
@@ -960,18 +773,16 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid)
}
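For reference, the priority and VID fields that i_dld_ether_header_update_tag() rewrites live in the standard 802.1Q TCI. A small standalone sketch of that bit layout (helper names here are made up):

	#include <stdint.h>

	/* 802.1Q TCI: 3-bit priority, 1-bit CFI/DEI, 12-bit VLAN ID. */
	#define	VID_MASK	0x0fffu

	static inline uint16_t
	make_tci(uint16_t pri, uint16_t cfi, uint16_t vid)
	{
		return ((pri & 0x7) << 13 | (cfi & 0x1) << 12 | (vid & VID_MASK));
	}

	static inline uint16_t
	tci_vid(uint16_t tci)	/* VLAN_ID() analogue */
	{
		return (tci & VID_MASK);
	}

	static inline uint16_t
	tci_pri(uint16_t tci)	/* VLAN_PRI() analogue */
	{
		return (tci >> 13);
	}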
/*
- * M_DATA put
- *
- * The poll callback function for DLS clients which are not in the per-stream
- * mode. This function is called from an upper layer protocol (currently only
- * tcp and udp).
+ * M_DATA put (IP fast-path mode)
*/
-void
-str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
+mac_tx_cookie_t
+str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
+ uint16_t flag)
{
boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
mblk_t *newmp;
uint_t pri;
+ mac_tx_cookie_t cookie;
if (is_ethernet) {
/*
@@ -988,25 +799,28 @@ str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
}
}
- dld_tx_single(dsp, mp);
- return;
+ if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
+ DLD_SETQFULL(dsp);
+ }
+ return (cookie);
discard:
/* TODO: bump kstat? */
freemsg(mp);
+ return (NULL);
}
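A non-NULL cookie from str_mdata_fastpath_put() identifies the blocked Tx ring, so an upper layer can pause just the affected flow rather than the whole stream. A sketch of such a caller, with hypothetical types (this is not the real IP/DLD interface):

	#include <stdint.h>

	typedef uintptr_t tx_cookie_t;

	typedef struct {
		tx_cookie_t	blocked_on;	/* 0 when the flow may transmit */
	} flow_t;

	static void
	flow_send(flow_t *fp, tx_cookie_t (*put)(void *), void *pkt)
	{
		tx_cookie_t ck = put(pkt);

		if (ck != 0)
			fp->blocked_on = ck;	/* pause only this flow */
	}

	/* On a Tx update for cookie 'ck', unblock matching flows. */
	static void
	flow_tx_notify(flow_t *fp, tx_cookie_t ck)
	{
		if (fp->blocked_on == ck)
			fp->blocked_on = 0;
	}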
/*
- * M_DATA put (DLIOCRAW mode).
+ * M_DATA put (DLIOCRAW mode)
*/
-void
+static void
str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
{
boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
mblk_t *bp, *newmp;
size_t size;
mac_header_info_t mhi;
- uint_t pri, vid;
+ uint_t pri, vid, dvid;
uint_t max_sdu;
/*
@@ -1039,7 +853,7 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
size += MBLKL(bp);
}
- if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
+ if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
goto discard;
mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
@@ -1052,12 +866,14 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
goto discard;
if (is_ethernet) {
+ dvid = mac_client_vid(dsp->ds_mch);
+
/*
* Discard the packet if this is a VLAN stream but the VID in
* the packet is not correct.
*/
vid = VLAN_ID(mhi.mhi_tci);
- if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
+ if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
goto discard;
/*
@@ -1074,16 +890,19 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
* packets on a VLAN stream.
*/
pri = (pri == 0) ? dsp->ds_pri : 0;
- if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) {
+ if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
if ((newmp = i_dld_ether_header_update_tag(mp,
- pri, dsp->ds_vid)) == NULL) {
+ pri, dvid)) == NULL) {
goto discard;
}
mp = newmp;
}
}
- dld_tx_single(dsp, mp);
+ if (DLD_TX(dsp, mp, 0, 0) != NULL) {
+ /* Turn on flow-control for dld */
+ DLD_SETQFULL(dsp);
+ }
return;
discard:
@@ -1097,18 +916,21 @@ discard:
int
dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
{
- dev_t dev;
- int err;
- const char *drvname;
- dls_channel_t dc;
- uint_t addr_length;
- boolean_t qassociated = B_FALSE;
-
- ASSERT(dsp->ds_dc == NULL);
+ dev_t dev;
+ int err;
+ const char *drvname;
+ mac_perim_handle_t mph;
+ boolean_t qassociated = B_FALSE;
+ dls_link_t *dlp = NULL;
+ dls_dl_handle_t ddp = NULL;
+ boolean_t entered_perim = B_FALSE;
if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
return (EINVAL);
+ if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
+ return (ENOTSUP);
+
/*
* /dev node access. This will still be supported for backward
* compatibility reason.
@@ -1120,46 +942,22 @@ dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
qassociated = B_TRUE;
}
- /*
- * Open a channel.
- */
- if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA) {
- /*
- * style-2 VLAN open, this is a /dev VLAN ppa open
- * which might result in a newly created dls_vlan_t.
- */
- err = dls_open_style2_vlan(dsp->ds_major, ppa, &dc);
- if (err != 0) {
- if (qassociated)
- (void) qassociate(dsp->ds_wq, -1);
- return (err);
- }
- } else {
- dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
- if ((err = dls_open_by_dev(dev, &dc)) != 0) {
- if (qassociated)
- (void) qassociate(dsp->ds_wq, -1);
- return (err);
- }
- }
-
- /*
- * Cache the MAC interface handle, a pointer to the immutable MAC
- * information and the current and 'factory' MAC address.
- */
- dsp->ds_mh = dls_mac(dc);
- dsp->ds_mip = mac_info(dsp->ds_mh);
-
- mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
+ dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
+ if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
+ goto failed;
- addr_length = dsp->ds_mip->mi_addr_length;
- bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);
+ if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
+ goto failed;
+ entered_perim = B_TRUE;
/*
- * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
- * a non-VLAN interface).
+ * Open a channel.
*/
- dsp->ds_vid = dls_vid(dc);
+ if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
+ goto failed;
+
+ if ((err = dls_open(dlp, ddp, dsp)) != 0)
+ goto failed;
/*
* Set the default packet priority.
@@ -1169,12 +967,22 @@ dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
/*
* Add a notify function so that the we get updates from the MAC.
*/
- dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);
-
- dsp->ds_dc = dc;
+ dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
dsp->ds_dlstate = DL_UNBOUND;
-
+ mac_perim_exit(mph);
return (0);
+
+failed:
+ if (dlp != NULL)
+ dls_link_rele(dlp);
+ if (entered_perim)
+ mac_perim_exit(mph);
+ if (ddp != NULL)
+ dls_devnet_rele(ddp);
+ if (qassociated)
+ (void) qassociate(dsp->ds_wq, -1);
+
+ return (err);
}
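dld_str_attach() above shows the perimeter discipline: enter the per-mac perimeter, perform the whole sequence of state changes as one unit, and unwind carefully on failure. A compact userland sketch of the same pattern (one lock standing in for the perimeter, hypothetical helpers):

	#include <pthread.h>

	typedef struct mac {
		pthread_mutex_t	perim;		/* per-mac perimeter stand-in */
	} mac_t;

	typedef struct str {
		mac_t	*mh;
	} str_t;

	/* Stands in for dls_link_hold() + dls_open(); may fail. */
	extern int open_link(str_t *, mac_t *);

	static int
	str_attach(str_t *sp, mac_t *mac)
	{
		int err;

		pthread_mutex_lock(&mac->perim);	/* mac_perim_enter_by_mh() */
		if ((err = open_link(sp, mac)) != 0) {
			/* Undo any partial state before leaving the perimeter. */
			pthread_mutex_unlock(&mac->perim);
			return (err);
		}
		sp->mh = mac;
		pthread_mutex_unlock(&mac->perim);	/* mac_perim_exit() */
		return (0);
	}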
/*
@@ -1184,35 +992,56 @@ dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
void
dld_str_detach(dld_str_t *dsp)
{
+ mac_perim_handle_t mph;
+ int err;
+
+ ASSERT(dsp->ds_datathr_cnt == 0);
+
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
/*
* Remove the notify function.
+ *
+ * Note that we cannot wait for the notification callback to be removed
+ * since it could cause the deadlock with str_notify() since they both
+ * need the mac perimeter. Continue if we cannot remove the
+ * notification callback right now and wait after we leave the
+ * perimeter.
*/
- mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);
+ err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
+ dsp->ds_mnh = NULL;
/*
- * Disable the capabilities and clear the promisc flag.
+ * Disable the capabilities
*/
- ASSERT(!dsp->ds_polling);
- ASSERT(!dsp->ds_soft_ring);
dld_capabilities_disable(dsp);
- dsp->ds_promisc = 0;
- DLD_TX_QUIESCE(dsp);
+ /*
+ * Clear LSO flags.
+ */
+ dsp->ds_lso = B_FALSE;
+ dsp->ds_lso_max = 0;
+
+ dls_close(dsp);
+ mac_perim_exit(mph);
/*
- * Flush all pending packets which are sitting in the transmit queue.
+ * Now we leave the mac perimeter. If mac_notify_remove() failed
+ * because the notification callback was in progress, wait for
+ * it to finish before we proceed.
*/
- dld_tx_flush(dsp);
+ if (err != 0)
+ mac_notify_remove_wait(dsp->ds_mh);
/*
- * Clear LSO flags.
+ * An unreferenced tagged (non-persistent) vlan gets destroyed
+ * automatically in the call to dls_devnet_rele.
*/
- dsp->ds_lso = B_FALSE;
- dsp->ds_lso_max = 0;
+ dls_devnet_rele(dsp->ds_ddh);
- dls_close(dsp->ds_dc);
- dsp->ds_dc = NULL;
+ dsp->ds_sap = 0;
dsp->ds_mh = NULL;
+ dsp->ds_mch = NULL;
+ dsp->ds_mip = NULL;
if (dsp->ds_style == DL_STYLE2)
(void) qassociate(dsp->ds_wq, -1);
@@ -1221,7 +1050,6 @@ dld_str_detach(dld_str_t *dsp)
* Re-initialize the DLPI state machine.
*/
dsp->ds_dlstate = DL_UNATTACHED;
-
}
/*
@@ -1314,7 +1142,8 @@ dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
/*
* Strip the VLAN tag for VLAN streams.
*/
- if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) {
+ if (is_ethernet &&
+ mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
newmp = i_dld_ether_header_strip_tag(mp);
if (newmp == NULL) {
freemsg(mp);
@@ -1366,7 +1195,8 @@ dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
* * Otherwise, strip the whole VLAN header.
* - Untagged packets. Strip the whole MAC header.
*/
- if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
+ if (mhip->mhi_istagged &&
+ (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
((dsp->ds_sap == ETHERTYPE_VLAN) ||
(dsp->ds_promisc & DLS_PROMISC_SAP))) {
offset = VLAN_TAGSZ;
@@ -1418,7 +1248,8 @@ dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
/*
* See MAC header stripping rules in the dld_str_rx_fastpath() function.
*/
- if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
+ if (mhip->mhi_istagged &&
+ (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
((dsp->ds_sap == ETHERTYPE_VLAN) ||
(dsp->ds_promisc & DLS_PROMISC_SAP))) {
offset = VLAN_TAGSZ;
@@ -1534,7 +1365,7 @@ str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
/*
* Get the packet header information.
*/
- if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
+ if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
return (NULL);
/*
@@ -1805,11 +1636,14 @@ str_notify_fastpath_flush(dld_str_t *dsp)
/*
* MAC notification callback.
*/
-static void
+void
str_notify(void *arg, mac_notify_type_t type)
{
dld_str_t *dsp = (dld_str_t *)arg;
queue_t *q = dsp->ds_wq;
+ mac_handle_t mh = dsp->ds_mh;
+ mac_client_handle_t mch = dsp->ds_mch;
+ uint8_t addr[MAXMACADDRLEN];
switch (type) {
case MAC_NOTE_TX:
@@ -1820,26 +1654,23 @@ str_notify(void *arg, mac_notify_type_t type)
/*
* Send the appropriate DL_NOTIFY_IND.
*/
- if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
+ if (mac_promisc_get(mh, MAC_DEVPROMISC))
str_notify_promisc_on_phys(dsp);
else
str_notify_promisc_off_phys(dsp);
break;
- case MAC_NOTE_PROMISC:
- break;
-
case MAC_NOTE_UNICST:
/*
- * This notification is sent whenever the MAC unicast address
- * changes. We need to re-cache the address.
+ * This notification is sent whenever the MAC unicast
+ * address changes.
*/
- mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
+ mac_unicast_primary_get(mh, addr);
/*
* Send the appropriate DL_NOTIFY_IND.
*/
- str_notify_phys_addr(dsp, dsp->ds_curr_addr);
+ str_notify_phys_addr(dsp, addr);
break;
case MAC_NOTE_LINK:
@@ -1847,7 +1678,7 @@ str_notify(void *arg, mac_notify_type_t type)
* This notification is sent every time the MAC driver
* updates the link state.
*/
- switch (mac_link_get(dsp->ds_mh)) {
+ switch (mac_client_stat_get(mch, MAC_STAT_LINK_STATE)) {
case LINK_STATE_UP: {
uint64_t speed;
/*
@@ -1856,7 +1687,7 @@ str_notify(void *arg, mac_notify_type_t type)
*/
str_notify_link_up(dsp);
- speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
+ speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
break;
}
@@ -1874,7 +1705,7 @@ str_notify(void *arg, mac_notify_type_t type)
break;
case MAC_NOTE_RESOURCE:
- case MAC_NOTE_VNIC:
+ case MAC_NOTE_CAPAB_CHG:
/*
* This notification is sent whenever the MAC resources
* change or capabilities change. We need to renegotiate
@@ -1897,334 +1728,177 @@ str_notify(void *arg, mac_notify_type_t type)
case MAC_NOTE_MARGIN:
break;
+ case MAC_NOTE_PROMISC:
+ break;
+
default:
ASSERT(B_FALSE);
break;
}
}
-static inline uint_t
-mp_getsize(mblk_t *mp)
-{
- ASSERT(DB_TYPE(mp) == M_DATA);
- return ((mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp));
-}
-
/*
- * Calculate the dld queue depth, free the messages that exceed the threshold.
+ * This function is called via a taskq mechanism to process all control
+ * messages on a per 'dsp' endpoint basis.
*/
static void
-dld_tx_qdepth_timer(void *arg)
+dld_wput_nondata_task(void *arg)
{
- dld_str_t *dsp = (dld_str_t *)arg;
- mblk_t *prev, *mp;
- uint_t cnt, msgcnt, size;
-
- mutex_enter(&dsp->ds_tx_list_lock);
-
- /* Calculate total size and count of the packet(s) */
- cnt = msgcnt = 0;
- for (prev = NULL, mp = dsp->ds_tx_list_head; mp != NULL;
- prev = mp, mp = mp->b_next) {
- size = mp_getsize(mp);
- cnt += size;
- msgcnt++;
- if (cnt >= dld_max_q_count || msgcnt >= dld_max_q_count) {
- ASSERT(dsp->ds_tx_qbusy);
- dsp->ds_tx_list_tail = prev;
- if (prev == NULL)
- dsp->ds_tx_list_head = NULL;
- else
- prev->b_next = NULL;
- freemsgchain(mp);
- cnt -= size;
- msgcnt--;
+ dld_str_t *dsp = arg;
+ mblk_t *mp;
+
+ mutex_enter(&dsp->ds_lock);
+ while (dsp->ds_pending_head != NULL) {
+ mp = dsp->ds_pending_head;
+ dsp->ds_pending_head = mp->b_next;
+ mp->b_next = NULL;
+ if (dsp->ds_pending_head == NULL)
+ dsp->ds_pending_tail = NULL;
+ mutex_exit(&dsp->ds_lock);
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ dld_proto(dsp, mp);
break;
+ case M_IOCTL:
+ dld_ioc(dsp, mp);
+ break;
+ default:
+ ASSERT(0);
}
+
+ mutex_enter(&dsp->ds_lock);
}
- dsp->ds_tx_cnt = cnt;
- dsp->ds_tx_msgcnt = msgcnt;
- dsp->ds_tx_qdepth_tid = 0;
- mutex_exit(&dsp->ds_tx_list_lock);
+ ASSERT(dsp->ds_pending_tail == NULL);
+ dsp->ds_dlpi_pending = 0;
+ cv_broadcast(&dsp->ds_dlpi_pending_cv);
+ mutex_exit(&dsp->ds_lock);
}
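dld_wput_nondata_task() drains the per-endpoint queue and only clears ds_dlpi_pending once the queue is empty, which is what dld_close() waits on. A userland sketch of this single-consumer handshake, with hypothetical names:

	#include <pthread.h>
	#include <stddef.h>

	typedef struct msg {
		struct msg	*next;
	} msg_t;

	typedef struct {
		pthread_mutex_t	lock;
		pthread_cond_t	idle_cv;
		msg_t		*head, *tail;
		int		pending;	/* drain task outstanding? */
	} ep_t;

	/* Producer: enqueue; claim the consumer role only if none is active. */
	static int
	put_nondata(ep_t *ep, msg_t *mp)
	{
		int start;

		mp->next = NULL;
		pthread_mutex_lock(&ep->lock);
		if (ep->head == NULL)
			ep->head = mp;
		else
			ep->tail->next = mp;
		ep->tail = mp;
		start = !ep->pending;
		ep->pending = 1;
		pthread_mutex_unlock(&ep->lock);
		return (start);	/* caller dispatches the drain task if set */
	}

	/* Consumer: drain until empty, then wake anyone waiting in close. */
	static void
	drain(ep_t *ep, void (*process)(msg_t *))
	{
		pthread_mutex_lock(&ep->lock);
		while (ep->head != NULL) {
			msg_t *mp = ep->head;

			if ((ep->head = mp->next) == NULL)
				ep->tail = NULL;
			pthread_mutex_unlock(&ep->lock);
			process(mp);	/* may block; lock is dropped */
			pthread_mutex_lock(&ep->lock);
		}
		ep->pending = 0;
		pthread_cond_broadcast(&ep->idle_cv);
		pthread_mutex_unlock(&ep->lock);
	}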
/*
- * Enqueue one or more messages on the transmit queue. Caller specifies:
- * - the insertion position (head/tail).
- * - the message count and the total message size of messages to be queued
- * if they are known to the caller; or 0 if they are not known.
- *
- * If the caller does not know the message size information, this usually
- * means that dld_wsrv() managed to send some but not all of the queued
- * messages. For performance reasons, we do not calculate the queue depth
- * every time. Instead, a timer is started to calculate the queue depth
- * every 1 second (can be changed by tx_qdepth_interval).
+ * Kernel thread to handle taskq dispatch failures in dld_wput_nondata().
+ * This thread is started at boot time.
*/
static void
-dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, mblk_t *tail, boolean_t head_insert,
- uint_t msgcnt, uint_t cnt)
+dld_taskq_dispatch(void)
{
- queue_t *q = dsp->ds_wq;
- uint_t tot_cnt, tot_msgcnt;
- mblk_t *next;
-
- mutex_enter(&dsp->ds_tx_list_lock);
-
- /*
- * Simply enqueue the message and calculate the queue depth via
- * timer if:
- *
- * - the current queue depth is incorrect, and the timer is already
- * started; or
- *
- * - the given message size is unknown and it is allowed to start the
- * timer;
- */
- if ((dsp->ds_tx_qdepth_tid != 0) ||
- (msgcnt == 0 && tx_qdepth_interval != 0)) {
- goto enqueue;
- }
+ callb_cpr_t cprinfo;
+ dld_str_t *dsp;
- /*
- * The timer is not allowed, so calculate the message size now.
- */
- if (msgcnt == 0) {
- for (next = mp; next != NULL; next = next->b_next) {
- cnt += mp_getsize(next);
- msgcnt++;
+ CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
+ "dld_taskq_dispatch");
+ mutex_enter(&dld_taskq_lock);
+
+ while (!dld_taskq_quit) {
+ dsp = list_head(&dld_taskq_list);
+ while (dsp != NULL) {
+ list_remove(&dld_taskq_list, dsp);
+ mutex_exit(&dld_taskq_lock);
+ VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
+ dsp, TQ_SLEEP) != 0);
+ mutex_enter(&dld_taskq_lock);
+ dsp = list_head(&dld_taskq_list);
}
- }
-
- /*
- * Grow the queue depth using the input messesge size.
- *
- * If the queue depth would exceed the allowed threshold, drop
- * new packet(s) and drain those already in the queue.
- */
- tot_cnt = dsp->ds_tx_cnt + cnt;
- tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
-
- if (!head_insert && (tot_cnt >= dld_max_q_count ||
- tot_msgcnt >= dld_max_q_count)) {
- ASSERT(dsp->ds_tx_qbusy);
- mutex_exit(&dsp->ds_tx_list_lock);
- freemsgchain(mp);
- goto done;
- }
- /* Update the queue size parameters */
- dsp->ds_tx_cnt = tot_cnt;
- dsp->ds_tx_msgcnt = tot_msgcnt;
-
-enqueue:
- /*
- * If the transmit queue is currently empty and we are
- * about to deposit the packet(s) there, switch mode to
- * "busy" and raise flow-control condition.
- */
- if (!dsp->ds_tx_qbusy) {
- dsp->ds_tx_qbusy = B_TRUE;
- ASSERT(dsp->ds_tx_flow_mp != NULL);
- (void) putq(q, dsp->ds_tx_flow_mp);
- dsp->ds_tx_flow_mp = NULL;
- }
-
- if (!head_insert) {
- /* Tail insertion */
- if (dsp->ds_tx_list_head == NULL)
- dsp->ds_tx_list_head = mp;
- else
- dsp->ds_tx_list_tail->b_next = mp;
- dsp->ds_tx_list_tail = tail;
- } else {
- /* Head insertion */
- tail->b_next = dsp->ds_tx_list_head;
- if (dsp->ds_tx_list_head == NULL)
- dsp->ds_tx_list_tail = tail;
- dsp->ds_tx_list_head = mp;
- }
-
- if (msgcnt == 0 && dsp->ds_tx_qdepth_tid == 0 &&
- tx_qdepth_interval != 0) {
- /*
- * The message size is not given so that we need to start
- * the timer to calculate the queue depth.
- */
- dsp->ds_tx_qdepth_tid = timeout(dld_tx_qdepth_timer, dsp,
- drv_usectohz(tx_qdepth_interval));
- ASSERT(dsp->ds_tx_qdepth_tid != NULL);
- }
- mutex_exit(&dsp->ds_tx_list_lock);
-done:
- /* Schedule service thread to drain the transmit queue */
- if (!head_insert)
- qenable(q);
-}
-void
-dld_tx_flush(dld_str_t *dsp)
-{
- timeout_id_t tid = 0;
-
- mutex_enter(&dsp->ds_tx_list_lock);
- if (dsp->ds_tx_list_head != NULL) {
- freemsgchain(dsp->ds_tx_list_head);
- dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
- dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
- if (dsp->ds_tx_qbusy) {
- dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
- ASSERT(dsp->ds_tx_flow_mp != NULL);
- dsp->ds_tx_qbusy = B_FALSE;
- }
- if ((tid = dsp->ds_tx_qdepth_tid) != 0)
- dsp->ds_tx_qdepth_tid = 0;
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&dld_taskq_cv, &dld_taskq_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
}
- mutex_exit(&dsp->ds_tx_list_lock);
- /*
- * Note that ds_tx_list_lock (which is acquired by the timeout
- * callback routine) cannot be held across the call to untimeout().
- */
- if (tid != 0)
- (void) untimeout(tid);
+ dld_taskq_done = B_TRUE;
+ cv_signal(&dld_taskq_cv);
+ CALLB_CPR_EXIT(&cprinfo);
+ thread_exit();
}
/*
- * Process a non-data message.
+ * All control operations are serialized on the 'dsp' and are also funneled
+ * through a taskq mechanism to ensure that subsequent processing has kernel
+ * context and can safely use cv_wait.
+ *
+ * Mechanisms to handle taskq dispatch failures:
+ *
+ * The only way to be sure that a taskq dispatch does not fail is either to
+ * specify TQ_SLEEP, or to use a static taskq, prepopulate it with some
+ * number of entries, and make sure that the number of outstanding requests
+ * is less than that number. We can't use TQ_SLEEP since we don't know the
+ * context. Nor can we bound the total number of 'dsp' end points. So we are
+ * unable to use either of the above schemes, and are forced to deal with
+ * taskq dispatch failures. Note that even dynamic taskq could fail in
+ * dispatch if TQ_NOSLEEP is specified, since this flag is translated
+ * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
+ * framework.
+ *
+ * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
+ * We also have a single global thread to retry the taskq dispatch. This
+ * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
+ * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
*/
static void
dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
{
- ASSERT((dsp->ds_type == DLD_DLPI && dsp->ds_ioctl == NULL) ||
- (dsp->ds_type == DLD_CONTROL && dsp->ds_ioctl != NULL));
-
- mutex_enter(&dsp->ds_disp_lock);
-
- /*
- * The processing of the message might block. Enqueue the
- * message for later processing.
- */
- if (dsp->ds_pending_head == NULL) {
- dsp->ds_pending_head = dsp->ds_pending_tail = mp;
- } else {
+ ASSERT(mp->b_next == NULL);
+ mutex_enter(&dsp->ds_lock);
+ if (dsp->ds_pending_head != NULL) {
+ ASSERT(dsp->ds_dlpi_pending);
dsp->ds_pending_tail->b_next = mp;
dsp->ds_pending_tail = mp;
+ mutex_exit(&dsp->ds_lock);
+ return;
}
-
+ ASSERT(dsp->ds_pending_tail == NULL);
+ dsp->ds_pending_head = dsp->ds_pending_tail = mp;
/*
- * If there is no task pending, kick off the task.
+ * At this point if ds_dlpi_pending is set, it implies that the taskq
+ * thread is still active and is processing the last message, though
+ * the pending queue has been emptied.
*/
- if (dsp->ds_tid == NULL) {
- dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
- dld_wput_nondata_task, dsp, TQ_SLEEP);
- ASSERT(dsp->ds_tid != NULL);
+ if (dsp->ds_dlpi_pending) {
+ mutex_exit(&dsp->ds_lock);
+ return;
}
- mutex_exit(&dsp->ds_disp_lock);
+
+ dsp->ds_dlpi_pending = 1;
+ mutex_exit(&dsp->ds_lock);
+
+ if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
+ TQ_NOSLEEP) != 0)
+ return;
+
+ mutex_enter(&dld_taskq_lock);
+ list_insert_tail(&dld_taskq_list, dsp);
+ cv_signal(&dld_taskq_cv);
+ mutex_exit(&dld_taskq_lock);
}
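The dispatch-failure handling described in the comment above reduces to: try a non-blocking dispatch, and on failure hand the work to a thread that is allowed to block. A userland sketch of that pattern, where dispatch_nosleep/dispatch_sleep stand in for taskq_dispatch with TQ_NOSLEEP/TQ_SLEEP (hypothetical names):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stddef.h>

	typedef struct work {
		struct work	*next;
		void		(*fn)(void *);
		void		*arg;
	} work_t;

	static pthread_mutex_t	retry_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t	retry_cv = PTHREAD_COND_INITIALIZER;
	static work_t		*retry_head;
	static bool		retry_quit;	/* dld_taskq_quit analogue */

	extern bool dispatch_nosleep(void (*)(void *), void *);	/* may fail */
	extern void dispatch_sleep(void (*)(void *), void *);	/* may block */

	static void
	submit(work_t *w)
	{
		if (dispatch_nosleep(w->fn, w->arg))
			return;
		/* Dispatch failed: queue the work for the retry thread. */
		pthread_mutex_lock(&retry_lock);
		w->next = retry_head;
		retry_head = w;
		pthread_cond_signal(&retry_cv);
		pthread_mutex_unlock(&retry_lock);
	}

	static void *
	retry_thread(void *unused)
	{
		(void) unused;
		pthread_mutex_lock(&retry_lock);
		for (;;) {
			while (retry_head == NULL && !retry_quit)
				pthread_cond_wait(&retry_cv, &retry_lock);
			if (retry_head == NULL) {	/* quitting, drained */
				pthread_mutex_unlock(&retry_lock);
				return (NULL);
			}
			work_t *w = retry_head;
			retry_head = w->next;
			pthread_mutex_unlock(&retry_lock);
			dispatch_sleep(w->fn, w->arg);	/* always succeeds */
			pthread_mutex_lock(&retry_lock);
		}
	}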
/*
- * The worker thread which processes non-data messages. Note we only process
- * one message at one time in order to be able to "flush" the queued message
- * and serialize the processing.
+ * Process an M_IOCTL message.
*/
static void
-dld_wput_nondata_task(void *arg)
+dld_ioc(dld_str_t *dsp, mblk_t *mp)
{
- dld_str_t *dsp = (dld_str_t *)arg;
- mblk_t *mp;
-
- mutex_enter(&dsp->ds_disp_lock);
- ASSERT(dsp->ds_pending_head != NULL);
- ASSERT(dsp->ds_tid != NULL);
-
- if (dsp->ds_closing)
- goto closing;
-
- mp = dsp->ds_pending_head;
- if ((dsp->ds_pending_head = mp->b_next) == NULL)
- dsp->ds_pending_tail = NULL;
- mp->b_next = NULL;
+ uint_t cmd;
- mutex_exit(&dsp->ds_disp_lock);
+ cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
+ ASSERT(dsp->ds_type == DLD_DLPI);
- switch (DB_TYPE(mp)) {
- case M_PROTO:
- case M_PCPROTO:
- ASSERT(dsp->ds_type == DLD_DLPI);
- dld_wput_proto_nondata(dsp, mp);
+ switch (cmd) {
+ case DLIOCNATIVE:
+ ioc_native(dsp, mp);
break;
- case M_IOCTL: {
- uint_t cmd;
-
- if (dsp->ds_type == DLD_CONTROL) {
- ASSERT(dsp->ds_ioctl != NULL);
- dsp->ds_ioctl(dsp->ds_wq, mp);
- break;
- }
-
- cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
-
- switch (cmd) {
- case DLIOCNATIVE:
- ioc_native(dsp, mp);
- break;
- case DLIOCMARGININFO:
- ioc_margin(dsp, mp);
- break;
- case DLIOCRAW:
- ioc_raw(dsp, mp);
- break;
- case DLIOCHDRINFO:
- ioc_fast(dsp, mp);
- break;
- default:
- ioc(dsp, mp);
- break;
- }
+ case DLIOCMARGININFO:
+ ioc_margin(dsp, mp);
break;
- }
- case M_IOCDATA:
- ASSERT(dsp->ds_type == DLD_DLPI);
- ioc(dsp, mp);
+ case DLIOCRAW:
+ ioc_raw(dsp, mp);
break;
+ case DLIOCHDRINFO:
+ ioc_fast(dsp, mp);
+ break;
+ default:
+ ioc(dsp, mp);
}
-
- mutex_enter(&dsp->ds_disp_lock);
-
- if (dsp->ds_closing)
- goto closing;
-
- if (dsp->ds_pending_head != NULL) {
- dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
- dld_wput_nondata_task, dsp, TQ_SLEEP);
- ASSERT(dsp->ds_tid != NULL);
- } else {
- dsp->ds_tid = NULL;
- }
- mutex_exit(&dsp->ds_disp_lock);
- return;
-
- /*
- * If the stream is closing, flush all queued messages and inform
- * the stream once it is done.
- */
-closing:
- freemsgchain(dsp->ds_pending_head);
- dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
- dsp->ds_tid = NULL;
- cv_signal(&dsp->ds_disp_cv);
- mutex_exit(&dsp->ds_disp_lock);
-}
-
-/*
- * Flush queued non-data messages.
- */
-static void
-dld_flush_nondata(dld_str_t *dsp)
-{
- mutex_enter(&dsp->ds_disp_lock);
- freemsgchain(dsp->ds_pending_head);
- dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
- mutex_exit(&dsp->ds_disp_lock);
}
/*
@@ -2236,8 +1910,6 @@ ioc_native(dld_str_t *dsp, mblk_t *mp)
queue_t *q = dsp->ds_wq;
const mac_info_t *mip = dsp->ds_mip;
- rw_enter(&dsp->ds_lock, RW_WRITER);
-
/*
* Native mode can be enabled if it's disabled and if the
* native media type is different.
@@ -2245,8 +1917,6 @@ ioc_native(dld_str_t *dsp, mblk_t *mp)
if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
dsp->ds_native = B_TRUE;
- rw_exit(&dsp->ds_lock);
-
if (dsp->ds_native)
miocack(q, mp, 0, mip->mi_nativemedia);
else
@@ -2286,22 +1956,34 @@ static void
ioc_raw(dld_str_t *dsp, mblk_t *mp)
{
queue_t *q = dsp->ds_wq;
+ mac_perim_handle_t mph;
+
+ if (dsp->ds_mh == NULL) {
+ dsp->ds_mode = DLD_RAW;
+ miocack(q, mp, 0, 0);
+ return;
+ }
- if (dsp->ds_polling || dsp->ds_soft_ring) {
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+ if (dsp->ds_polling || dsp->ds_direct) {
+ mac_perim_exit(mph);
miocnak(q, mp, 0, EPROTO);
return;
}
- rw_enter(&dsp->ds_lock, RW_WRITER);
- if ((dsp->ds_mode != DLD_RAW) && (dsp->ds_dlstate == DL_IDLE)) {
+ if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
/*
* Set the receive callback.
*/
- dls_rx_set(dsp->ds_dc, dld_str_rx_raw, dsp);
- dsp->ds_tx = str_mdata_raw_put;
+ dls_rx_set(dsp, dld_str_rx_raw, dsp);
}
+
+ /*
+ * Note that raw mode is enabled.
+ */
dsp->ds_mode = DLD_RAW;
- rw_exit(&dsp->ds_lock);
+ mac_perim_exit(mph);
+
miocack(q, mp, 0, 0);
}
@@ -2321,6 +2003,7 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp)
uint_t addr_length;
queue_t *q = dsp->ds_wq;
int err;
+ mac_perim_handle_t mph;
if (dld_opt & DLD_OPT_NO_FASTPATH) {
err = ENOTSUP;
@@ -2352,11 +2035,6 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp)
goto failed;
}
- /*
- * We don't need to hold any locks to access ds_dlstate, because
- * control message prossessing (which updates this field) is
- * serialized.
- */
if (dsp->ds_dlstate != DL_IDLE) {
err = ENOTSUP;
goto failed;
@@ -2371,24 +2049,31 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp)
addr = nmp->b_rptr + off;
sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
- if ((hmp = dls_header(dsp->ds_dc, addr, sap, 0, NULL)) == NULL) {
+ if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
err = ENOMEM;
goto failed;
}
- rw_enter(&dsp->ds_lock, RW_WRITER);
- ASSERT(dsp->ds_dlstate == DL_IDLE);
+ /*
+ * This ioctl might happen concurrently with a direct call to dld_capab
+ * that tries to enable direct and/or poll capabilities. Since the
+ * stack does not serialize them, we do so here to avoid mixing
+ * the callbacks.
+ */
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
if (dsp->ds_mode != DLD_FASTPATH) {
/*
- * Set the receive callback (unless polling or
- * soft-ring is enabled).
+ * Set the receive callback (unless polling is enabled).
+ */
+ if (!dsp->ds_polling && !dsp->ds_direct)
+ dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
+
+ /*
+ * Note that fast-path mode is enabled.
*/
dsp->ds_mode = DLD_FASTPATH;
- if (!dsp->ds_polling && !dsp->ds_soft_ring)
- dls_rx_set(dsp->ds_dc, dld_str_rx_fastpath, dsp);
- dsp->ds_tx = str_mdata_fastpath_put;
}
- rw_exit(&dsp->ds_lock);
+ mac_perim_exit(mph);
freemsg(nmp->b_cont);
nmp->b_cont = hmp;
@@ -2399,17 +2084,17 @@ failed:
miocnak(q, mp, 0, err);
}
+/*
+ * Catch-all handler.
+ */
static void
ioc(dld_str_t *dsp, mblk_t *mp)
{
queue_t *q = dsp->ds_wq;
- mac_handle_t mh;
if (dsp->ds_dlstate == DL_UNATTACHED) {
miocnak(q, mp, 0, EINVAL);
return;
}
- mh = dsp->ds_mh;
- ASSERT(mh != NULL);
- mac_ioctl(mh, q, mp);
+ mac_ioctl(dsp->ds_mh, q, mp);
}
diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c
index 2002e994bf..064217c8f2 100644
--- a/usr/src/uts/common/io/dls/dls.c
+++ b/usr/src/uts/common/io/dls/dls.c
@@ -23,583 +23,285 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Data-Link Services Module
*/
-#include <sys/types.h>
-#include <sys/stream.h>
#include <sys/strsun.h>
-#include <sys/sysmacros.h>
-#include <sys/atomic.h>
-#include <sys/stat.h>
-#include <sys/dlpi.h>
#include <sys/vlan.h>
-#include <sys/ethernet.h>
-#include <sys/byteorder.h>
-#include <sys/mac.h>
-
-#include <sys/dls.h>
-#include <sys/dls_impl.h>
-#include <sys/dls_soft_ring.h>
-
-static kmem_cache_t *i_dls_impl_cachep;
-static uint32_t i_dls_impl_count;
-
-static kstat_t *dls_ksp = (kstat_t *)NULL;
-struct dls_kstats dls_kstat =
-{
- { "soft_ring_pkt_drop", KSTAT_DATA_UINT32 },
-};
-
-static int dls_open(dls_vlan_t *, dls_dl_handle_t ddh, dls_channel_t *);
-
-/*
- * Private functions.
- */
-
-/*ARGSUSED*/
-static int
-i_dls_constructor(void *buf, void *arg, int kmflag)
-{
- dls_impl_t *dip = buf;
-
- bzero(buf, sizeof (dls_impl_t));
-
- rw_init(&(dip->di_lock), NULL, RW_DRIVER, NULL);
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-i_dls_destructor(void *buf, void *arg)
-{
- dls_impl_t *dip = buf;
-
- ASSERT(dip->di_dvp == NULL);
- ASSERT(dip->di_mnh == NULL);
- ASSERT(dip->di_dmap == NULL);
- ASSERT(!dip->di_local);
- ASSERT(!dip->di_bound);
- ASSERT(dip->di_rx == NULL);
- ASSERT(dip->di_txinfo == NULL);
-
- rw_destroy(&(dip->di_lock));
-}
-
-static void
-i_dls_notify(void *arg, mac_notify_type_t type)
-{
- dls_impl_t *dip = arg;
-
- switch (type) {
- case MAC_NOTE_UNICST:
- mac_unicst_get(dip->di_mh, dip->di_unicst_addr);
- break;
-
- case MAC_NOTE_PROMISC:
- case MAC_NOTE_VNIC:
- /*
- * Every time the MAC interface changes promiscuity or
- * the VNIC characteristics change we need to reset
- * our transmit information.
- */
- dip->di_txinfo = mac_tx_get(dip->di_mh);
- break;
- }
-}
-
-static void
-dls_stat_init()
-{
- if ((dls_ksp = kstat_create("dls", 0, "dls_stat",
- "net", KSTAT_TYPE_NAMED,
- sizeof (dls_kstat) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL)) == NULL) {
- cmn_err(CE_WARN,
- "DLS: failed to create kstat structure for dls stats");
- return;
- }
- dls_ksp->ks_data = (void *)&dls_kstat;
- kstat_install(dls_ksp);
-}
-
-static void
-dls_stat_destroy()
-{
- kstat_delete(dls_ksp);
-}
-
-/*
- * Module initialization functions.
- */
-
-void
-dls_init(void)
-{
- /*
- * Create a kmem_cache of dls_impl_t.
- */
- i_dls_impl_cachep = kmem_cache_create("dls_cache",
- sizeof (dls_impl_t), 0, i_dls_constructor, i_dls_destructor, NULL,
- NULL, NULL, 0);
- ASSERT(i_dls_impl_cachep != NULL);
- soft_ring_init();
- dls_stat_init();
-}
+#include <sys/dld_impl.h>
int
-dls_fini(void)
+dls_open(dls_link_t *dlp, dls_dl_handle_t ddh, dld_str_t *dsp)
{
- /*
- * If there are any dls_impl_t in use then return EBUSY.
- */
- if (i_dls_impl_count != 0)
- return (EBUSY);
-
- /*
- * Destroy the kmem_cache.
- */
- kmem_cache_destroy(i_dls_impl_cachep);
- dls_stat_destroy();
- return (0);
-}
-
-/*
- * Client functions.
- */
-
-/*
- * /dev node style-2 VLAN PPA access. This might result in a newly created
- * dls_vlan_t. Note that this dls_vlan_t is different from others, in that
- * this VLAN might not have a link name that is managed by dlmgmtd (we cannot
- * use its VLAN ppa hack name as it might conflict with a vanity name).
- */
-int
-dls_open_style2_vlan(major_t major, uint_t ppa, dls_channel_t *dcp)
-{
- dev_t dev = makedevice(major, DLS_PPA2INST(ppa) + 1);
- uint_t vid = DLS_PPA2VID(ppa);
- dls_vlan_t *lndvp, *dvp;
- int err;
-
- /*
- * First find the dls_vlan_t this VLAN is created on. This must be
- * a GLDv3 driver based device.
- */
- if ((err = dls_vlan_hold_by_dev(dev, &lndvp)) != 0)
- return (err);
-
- if (vid > VLAN_ID_MAX)
- return (ENOENT);
-
- err = dls_vlan_hold(lndvp->dv_dlp->dl_name, vid, &dvp, B_FALSE, B_TRUE);
- if (err != 0)
- goto done;
-
- if ((err = dls_open(dvp, NULL, dcp)) != 0)
- dls_vlan_rele(dvp);
-
-done:
- dls_vlan_rele(lndvp);
- return (err);
-}
-
-int
-dls_open_by_dev(dev_t dev, dls_channel_t *dcp)
-{
- dls_dl_handle_t ddh;
- dls_vlan_t *dvp;
- int err;
-
- /*
- * Get a reference to the given dls_vlan_t.
- */
- if ((err = dls_devnet_open_by_dev(dev, &dvp, &ddh)) != 0)
- return (err);
-
- if ((err = dls_open(dvp, ddh, dcp)) != 0) {
- if (ddh != NULL)
- dls_devnet_close(ddh);
- else
- dls_vlan_rele(dvp);
- }
-
- return (err);
-}
-
-static int
-dls_open(dls_vlan_t *dvp, dls_dl_handle_t ddh, dls_channel_t *dcp)
-{
- dls_impl_t *dip;
- dls_link_t *dlp;
- int err;
zoneid_t zid = getzoneid();
boolean_t local;
/*
- * Check whether this client belongs to the zone of this dvp. Note that
- * a global zone client is allowed to open a local zone dvp.
+ * Check whether this client belongs to the zone of this dlp. Note that
+ * a global zone client is allowed to open a local zone dlp.
*/
- mutex_enter(&dvp->dv_lock);
- if (zid != GLOBAL_ZONEID && dvp->dv_zid != zid) {
- mutex_exit(&dvp->dv_lock);
+ if (zid != GLOBAL_ZONEID && dlp->dl_zid != zid)
return (ENOENT);
- }
- local = (zid == dvp->dv_zid);
- dvp->dv_zone_ref += (local ? 1 : 0);
- mutex_exit(&dvp->dv_lock);
-
- dlp = dvp->dv_dlp;
- if ((err = mac_start(dlp->dl_mh)) != 0) {
- mutex_enter(&dvp->dv_lock);
- dvp->dv_zone_ref -= (local ? 1 : 0);
- mutex_exit(&dvp->dv_lock);
- return (err);
- }
- /*
- * Allocate a new dls_impl_t.
- */
- dip = kmem_cache_alloc(i_dls_impl_cachep, KM_SLEEP);
- dip->di_dvp = dvp;
- dip->di_ddh = ddh;
+ local = (zid == dlp->dl_zid);
+ dlp->dl_zone_ref += (local ? 1 : 0);
/*
* Cache a copy of the MAC interface handle, a pointer to the
- * immutable MAC info and a copy of the current MAC address.
+ * immutable MAC info.
*/
- dip->di_mh = dlp->dl_mh;
- dip->di_mip = dlp->dl_mip;
+ dsp->ds_dlp = dlp;
+ dsp->ds_mh = dlp->dl_mh;
+ dsp->ds_mch = dlp->dl_mch;
+ dsp->ds_mip = dlp->dl_mip;
+ dsp->ds_ddh = ddh;
+ dsp->ds_local = local;
- mac_unicst_get(dip->di_mh, dip->di_unicst_addr);
-
- /*
- * Set the MAC transmit information.
- */
- dip->di_txinfo = mac_tx_get(dip->di_mh);
-
- /*
- * Add a notification function so that we get updates from
- * the MAC.
- */
- dip->di_mnh = mac_notify_add(dip->di_mh, i_dls_notify,
- (void *)dip);
-
- /*
- * Bump the kmem_cache count to make sure it is not prematurely
- * destroyed.
- */
- atomic_add_32(&i_dls_impl_count, 1);
-
- dip->di_local = local;
-
- /*
- * Hand back a reference to the dls_impl_t.
- */
- *dcp = (dls_channel_t)dip;
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
return (0);
}
void
-dls_close(dls_channel_t dc)
+dls_close(dld_str_t *dsp)
{
- dls_impl_t *dip = (dls_impl_t *)dc;
- dls_vlan_t *dvp = dip->di_dvp;
- dls_link_t *dlp = dvp->dv_dlp;
+ dls_link_t *dlp = dsp->ds_dlp;
dls_multicst_addr_t *p;
dls_multicst_addr_t *nextp;
- dls_dl_handle_t ddh = dip->di_ddh;
+ uint32_t old_flags;
- if (dip->di_local) {
- mutex_enter(&dvp->dv_lock);
- dvp->dv_zone_ref--;
- mutex_exit(&dvp->dv_lock);
- }
- dip->di_local = B_FALSE;
+ ASSERT(dsp->ds_datathr_cnt == 0);
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
- dls_active_clear(dc);
+ if (dsp->ds_local)
+ dlp->dl_zone_ref--;
+ dsp->ds_local = B_FALSE;
- rw_enter(&(dip->di_lock), RW_WRITER);
/*
- * Remove the notify function.
+ * Walk the list of multicast addresses, disabling each at the MAC.
+ * Note that we must remove the multicast addresses before
+ * mac_unicast_remove() (called by dls_active_clear()) because
+ * mac_multicast_remove() relies on the unicast flows on the mac
+ * client.
*/
- mac_notify_remove(dip->di_mh, dip->di_mnh);
- dip->di_mnh = NULL;
-
- /*
- * If the dls_impl_t is bound then unbind it.
- */
- if (dip->di_bound) {
- rw_exit(&(dip->di_lock));
- dls_link_remove(dlp, dip);
- rw_enter(&(dip->di_lock), RW_WRITER);
- dip->di_bound = B_FALSE;
- }
-
- /*
- * Walk the list of multicast addresses, disabling each at
- * the MAC.
- */
- for (p = dip->di_dmap; p != NULL; p = nextp) {
- (void) mac_multicst_remove(dip->di_mh, p->dma_addr);
+ for (p = dsp->ds_dmap; p != NULL; p = nextp) {
+ (void) mac_multicast_remove(dsp->ds_mch, p->dma_addr);
nextp = p->dma_nextp;
kmem_free(p, sizeof (dls_multicst_addr_t));
}
- dip->di_dmap = NULL;
+ dsp->ds_dmap = NULL;
- dip->di_rx = NULL;
- dip->di_rx_arg = NULL;
- rw_exit(&(dip->di_lock));
+ dls_active_clear(dsp);
/*
- * If the MAC has been set in promiscuous mode then disable it.
+ * If the dld_str_t is bound then unbind it.
*/
- (void) dls_promisc(dc, 0);
- dip->di_txinfo = NULL;
+ if (dsp->ds_dlstate == DL_IDLE) {
+ (void) dls_unbind(dsp);
+ dsp->ds_dlstate = DL_UNBOUND;
+ }
/*
- * Free the dls_impl_t back to the cache.
+ * If the MAC has been set in promiscuous mode then disable it.
+ * This needs to be done before resetting ds_rx.
*/
- dip->di_txinfo = NULL;
-
- if (dip->di_soft_ring_list != NULL) {
- soft_ring_set_destroy(dip->di_soft_ring_list,
- dip->di_soft_ring_size);
- dip->di_soft_ring_list = NULL;
- }
- dip->di_soft_ring_size = 0;
+ old_flags = dsp->ds_promisc;
+ dsp->ds_promisc = 0;
+ (void) dls_promisc(dsp, old_flags);
/*
- * Decrement the reference count to allow the cache to be destroyed
- * if there are no more dls_impl_t.
+ * At this point we have cut off inbound packet flow from the mac
+ * for this 'dsp'. The dls_link_remove() above cut off packets meant
+ * for us and waited for upcalls to finish. Similarly the dls_promisc()
+ * reset above waited for promisc callbacks to finish. Now we can
+ * safely reset ds_rx to NULL.
*/
- atomic_add_32(&i_dls_impl_count, -1);
-
- dip->di_dvp = NULL;
+ dsp->ds_rx = NULL;
+ dsp->ds_rx_arg = NULL;
- kmem_cache_free(i_dls_impl_cachep, dip);
-
- mac_stop(dvp->dv_dlp->dl_mh);
+ dsp->ds_dlp = NULL;
/*
- * Release our reference to the dls_vlan_t allowing that to be
- * destroyed if there are no more dls_impl_t. An unreferenced tagged
- * (non-persistent) vlan gets destroyed automatically.
+ * Release our reference to the dls_link_t, allowing it to be
+ * destroyed once there are no remaining clients.
*/
- if (ddh != NULL)
- dls_devnet_close(ddh);
- else
- dls_vlan_rele(dvp);
-}
-
-mac_handle_t
-dls_mac(dls_channel_t dc)
-{
- return (((dls_impl_t *)dc)->di_mh);
-}
-
-uint16_t
-dls_vid(dls_channel_t dc)
-{
- return (((dls_impl_t *)dc)->di_dvp->dv_id);
+ dls_link_rele(dlp);
}
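
For context, a hedged sketch of the expected caller: the dld close path is assumed to quiesce its data threads (ds_datathr_cnt reaching zero), enter the MAC perimeter, and only then call dls_close(), which is what the two new ASSERTs encode. This sequence is an assumption, not part of this hunk.

	mac_perim_handle_t	mph;

	/* Assumed caller-side sequence in dld's close path. */
	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
	dls_close(dsp);
	mac_perim_exit(mph);
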
int
-dls_bind(dls_channel_t dc, uint32_t sap)
+dls_bind(dld_str_t *dsp, uint32_t sap)
{
- dls_impl_t *dip = (dls_impl_t *)dc;
- dls_link_t *dlp;
uint32_t dls_sap;
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+
/*
* Check to see the value is legal for the media type.
*/
- if (!mac_sap_verify(dip->di_mh, sap, &dls_sap))
+ if (!mac_sap_verify(dsp->ds_mh, sap, &dls_sap))
return (EINVAL);
- if (dip->di_promisc & DLS_PROMISC_SAP)
+
+ if (dsp->ds_promisc & DLS_PROMISC_SAP)
dls_sap = DLS_SAP_PROMISC;
/*
- * Set up the dls_impl_t to mark it as able to receive packets.
+ * Set up the dld_str_t to mark it as able to receive packets.
*/
- rw_enter(&(dip->di_lock), RW_WRITER);
- ASSERT(!dip->di_bound);
- dip->di_sap = sap;
- dip->di_bound = B_TRUE;
- rw_exit(&(dip->di_lock));
+ dsp->ds_sap = sap;
/*
- * Now bind the dls_impl_t by adding it into the hash table in the
- * dls_link_t.
+ * The MAC layer does the VLAN demultiplexing and will only pass up
+ * untagged packets to non-promiscuous primary MAC clients. In order to
+ * support the binding to the VLAN SAP which is required by DLPI, dls
+ * needs to get a copy of all tagged packets when the client binds to
+ * the VLAN SAP. We do this by registering a separate promiscuous
+ * callback for each dls client binding to that SAP.
*
- * NOTE: This must be done without the dls_impl_t lock being held
- * otherwise deadlock may ensue.
- */
- dlp = dip->di_dvp->dv_dlp;
- dls_link_add(dlp, dls_sap, dip);
+	 * Note: even though there are two promiscuous handles in dld_str_t,
+	 * ds_mph is for the regular promiscuous mode and ds_vlan_mph is the
+	 * handle used to receive VLAN packets when promiscuous mode is not
+	 * on. Only one of them can be non-NULL at any given time, to avoid
+	 * receiving duplicate copies of packets.
+ */
+ if (sap == ETHERTYPE_VLAN && dsp->ds_promisc == 0) {
+ int err;
+
+ if (dsp->ds_vlan_mph != NULL)
+ return (EINVAL);
+ err = mac_promisc_add(dsp->ds_mch,
+ MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp,
+ &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS);
+ return (err);
+ }
+ /*
+ * Now bind the dld_str_t by adding it into the hash table in the
+ * dls_link_t.
+ */
+ dls_link_add(dsp->ds_dlp, dls_sap, dsp);
return (0);
}
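
The handle exclusivity described in the comment above can be stated as a predicate; a sketch only, this helper is illustrative and not part of the patch:

/* At most one of ds_mph and ds_vlan_mph is non-NULL at a time. */
static boolean_t
dls_promisc_handles_ok(dld_str_t *dsp)
{
	return (dsp->ds_mph == NULL || dsp->ds_vlan_mph == NULL);
}
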
-void
-dls_unbind(dls_channel_t dc)
+int
+dls_unbind(dld_str_t *dsp)
{
- dls_impl_t *dip = (dls_impl_t *)dc;
- dls_link_t *dlp;
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
/*
- * Unbind the dls_impl_t by removing it from the hash table in the
- * dls_link_t.
- *
- * NOTE: This must be done without the dls_impl_t lock being held
- * otherise deadlock may enuse.
+	 * For the VLAN SAP, a promisc handle was registered by dls_bind().
+	 * When unbinding this dls link, we need to remove that promisc
+	 * handle. See the comments in dls_bind().
*/
- dlp = dip->di_dvp->dv_dlp;
- dls_link_remove(dlp, dip);
+ if (dsp->ds_vlan_mph != NULL) {
+ int err;
+
+ err = mac_promisc_remove(dsp->ds_vlan_mph);
+ ASSERT(err == 0);
+ dsp->ds_vlan_mph = NULL;
+ return (err);
+ }
/*
- * Mark the dls_impl_t as unable to receive packets This will make
- * sure that 'receives in flight' will not come our way.
+ * Unbind the dld_str_t by removing it from the hash table in the
+ * dls_link_t.
*/
- dip->di_bound = B_FALSE;
+ dls_link_remove(dsp->ds_dlp, dsp);
+ dsp->ds_sap = 0;
+ return (0);
}
int
-dls_promisc(dls_channel_t dc, uint32_t flags)
+dls_promisc(dld_str_t *dsp, uint32_t old_flags)
{
- dls_impl_t *dip = (dls_impl_t *)dc;
- dls_link_t *dlp;
int err = 0;
- ASSERT(!(flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI |
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+ ASSERT(!(dsp->ds_promisc & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI |
DLS_PROMISC_PHYS)));
- /*
- * Check if we need to turn on 'all sap' mode.
- */
- rw_enter(&(dip->di_lock), RW_WRITER);
- dlp = dip->di_dvp->dv_dlp;
- if ((flags & DLS_PROMISC_SAP) &&
- !(dip->di_promisc & DLS_PROMISC_SAP)) {
- dip->di_promisc |= DLS_PROMISC_SAP;
- if (!dip->di_bound)
- goto multi;
-
- rw_exit(&(dip->di_lock));
- dls_link_remove(dlp, dip);
- dls_link_add(dlp, DLS_SAP_PROMISC, dip);
- rw_enter(&(dip->di_lock), RW_WRITER);
- goto multi;
- }
-
- /*
- * Check if we need to turn off 'all sap' mode.
- */
- if (!(flags & DLS_PROMISC_SAP) &&
- (dip->di_promisc & DLS_PROMISC_SAP)) {
- uint32_t dls_sap;
-
- dip->di_promisc &= ~DLS_PROMISC_SAP;
- if (!dip->di_bound)
- goto multi;
-
- rw_exit(&(dip->di_lock));
- dls_link_remove(dlp, dip);
- (void) mac_sap_verify(dip->di_mh, dip->di_sap, &dls_sap);
- dls_link_add(dlp, dls_sap, dip);
- rw_enter(&(dip->di_lock), RW_WRITER);
- }
-
-multi:
- /*
- * It's easiest to add the txloop callback up-front; if promiscuous
- * mode cannot be enabled, then we'll remove it before returning.
- * Use dl_promisc_lock to prevent racing with another thread also
- * manipulating the promiscuous state on another dls_impl_t associated
- * with the same dls_link_t.
- */
- mutex_enter(&dlp->dl_promisc_lock);
- if ((dlp->dl_npromisc == 0) && (flags & DLS_PROMISC_PHYS)) {
- ASSERT(dlp->dl_mth == NULL);
- dlp->dl_mth = mac_txloop_add(dlp->dl_mh, dls_link_txloop, dlp);
- }
-
- /*
- * Turn on or off 'all multicast' mode, if necessary.
- */
- if (flags & DLS_PROMISC_MULTI) {
- if (!(dip->di_promisc & DLS_PROMISC_MULTI)) {
- if ((err = mac_promisc_set(dip->di_mh, B_TRUE,
- MAC_DEVPROMISC)) != 0) {
- goto done;
- }
- dip->di_promisc |= DLS_PROMISC_MULTI;
- }
- } else {
- if (dip->di_promisc & DLS_PROMISC_MULTI) {
- if ((err = mac_promisc_set(dip->di_mh, B_FALSE,
- MAC_DEVPROMISC)) != 0) {
- goto done;
- }
- dip->di_promisc &= ~DLS_PROMISC_MULTI;
- }
- }
-
- /*
- * Turn on or off 'all physical' mode, if necessary.
- */
- if (flags & DLS_PROMISC_PHYS) {
- if (!(dip->di_promisc & DLS_PROMISC_PHYS)) {
- err = mac_promisc_set(dip->di_mh, B_TRUE, MAC_PROMISC);
- if (err != 0)
- goto done;
-
- dip->di_promisc |= DLS_PROMISC_PHYS;
- dlp->dl_npromisc++;
+ if (old_flags == 0 && dsp->ds_promisc != 0) {
+ /*
+		 * If only DLS_PROMISC_SAP is set, we don't turn on
+		 * physical promiscuous mode.
+ */
+ err = mac_promisc_add(dsp->ds_mch, MAC_CLIENT_PROMISC_ALL,
+ dls_rx_promisc, dsp, &dsp->ds_mph,
+ (dsp->ds_promisc != DLS_PROMISC_SAP) ? 0 :
+ MAC_PROMISC_FLAGS_NO_PHYS);
+ if (err != 0)
+ return (err);
+
+		/* Remove the vlan promisc handle to avoid duplicate copies up */
+ if (dsp->ds_vlan_mph != NULL) {
+ err = mac_promisc_remove(dsp->ds_vlan_mph);
+ dsp->ds_vlan_mph = NULL;
}
- } else {
- if (dip->di_promisc & DLS_PROMISC_PHYS) {
- err = mac_promisc_set(dip->di_mh, B_FALSE, MAC_PROMISC);
- if (err != 0)
- goto done;
-
- dip->di_promisc &= ~DLS_PROMISC_PHYS;
- dlp->dl_npromisc--;
+ } else if (old_flags != 0 && dsp->ds_promisc == 0) {
+ ASSERT(dsp->ds_mph != NULL);
+ err = mac_promisc_remove(dsp->ds_mph);
+ /*
+		 * The failure only relates to resetting the device
+		 * promiscuity. The mac layer does not fail in freeing up
+		 * the promiscuous data structures, so we clear ds_mph. The
+		 * dld stream may be closing and we can't fail that.
+ */
+ dsp->ds_mph = NULL;
+ if (err != 0)
+ return (err);
+
+ if (dsp->ds_sap == ETHERTYPE_VLAN &&
+ dsp->ds_dlstate != DL_UNBOUND) {
+ int err;
+
+ if (dsp->ds_vlan_mph != NULL)
+ return (EINVAL);
+ err = mac_promisc_add(dsp->ds_mch,
+ MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp,
+ &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS);
+ return (err);
}
+ } else if (old_flags == DLS_PROMISC_SAP && dsp->ds_promisc != 0 &&
+ dsp->ds_promisc != old_flags) {
+ /*
+		 * If the old flags were DLS_PROMISC_SAP, but the current
+		 * flags have changed to some new non-zero value, we need
+		 * to turn on physical promiscuous mode.
+ */
+ ASSERT(dsp->ds_mph != NULL);
+ err = mac_promisc_remove(dsp->ds_mph);
+ if (err != 0)
+ return (err);
+ err = mac_promisc_add(dsp->ds_mch, MAC_CLIENT_PROMISC_ALL,
+ dls_rx_promisc, dsp, &dsp->ds_mph, 0);
}
-done:
- if (dlp->dl_npromisc == 0 && dlp->dl_mth != NULL) {
- mac_txloop_remove(dlp->dl_mh, dlp->dl_mth);
- dlp->dl_mth = NULL;
- }
-
- ASSERT(dlp->dl_npromisc == 0 || dlp->dl_mth != NULL);
- mutex_exit(&dlp->dl_promisc_lock);
-
- rw_exit(&(dip->di_lock));
return (err);
}
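
Note the calling convention: the new flags are stored in ds_promisc before the call and the previous flags are passed as the argument. A hedged caller sketch follows; the rollback is an assumption about how a caller would recover, not code from this patch:

	int		err;
	uint32_t	old_flags = dsp->ds_promisc;

	dsp->ds_promisc |= DLS_PROMISC_PHYS;
	if ((err = dls_promisc(dsp, old_flags)) != 0)
		dsp->ds_promisc = old_flags;	/* restore on failure */
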
int
-dls_multicst_add(dls_channel_t dc, const uint8_t *addr)
+dls_multicst_add(dld_str_t *dsp, const uint8_t *addr)
{
- dls_impl_t *dip = (dls_impl_t *)dc;
int err;
dls_multicst_addr_t **pp;
dls_multicst_addr_t *p;
uint_t addr_length;
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+
/*
* Check whether the address is in the list of enabled addresses for
- * this dls_impl_t.
+ * this dld_str_t.
+ */
+ addr_length = dsp->ds_mip->mi_addr_length;
+
+ /*
+ * Protect against concurrent access of ds_dmap by data threads using
+ * ds_rw_lock. The mac perimeter serializes the dls_multicst_add and
+ * remove operations. Dropping the ds_rw_lock across mac calls is thus
+ * ok and is also required by the locking protocol.
*/
- rw_enter(&(dip->di_lock), RW_WRITER);
- addr_length = dip->di_mip->mi_addr_length;
- for (pp = &(dip->di_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) {
+ rw_enter(&dsp->ds_rw_lock, RW_WRITER);
+ for (pp = &(dsp->ds_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) {
if (bcmp(addr, p->dma_addr, addr_length) == 0) {
/*
* It is there so there's nothing to do.
@@ -610,92 +312,92 @@ dls_multicst_add(dls_channel_t dc, const uint8_t *addr)
}
/*
- * Allocate a new list item.
+ * Allocate a new list item and add it to the list.
*/
- if ((p = kmem_zalloc(sizeof (dls_multicst_addr_t),
- KM_NOSLEEP)) == NULL) {
- err = ENOMEM;
- goto done;
- }
+ p = kmem_zalloc(sizeof (dls_multicst_addr_t), KM_SLEEP);
+ bcopy(addr, p->dma_addr, addr_length);
+ *pp = p;
+ rw_exit(&dsp->ds_rw_lock);
/*
* Enable the address at the MAC.
*/
- if ((err = mac_multicst_add(dip->di_mh, addr)) != 0) {
- kmem_free(p, sizeof (dls_multicst_addr_t));
- goto done;
- }
-
- /*
- * The address is now enabled at the MAC so add it to the list.
- */
- bcopy(addr, p->dma_addr, addr_length);
- *pp = p;
+ err = mac_multicast_add(dsp->ds_mch, addr);
+ if (err == 0)
+ return (0);
+ /* Undo the operation as it has failed */
+ rw_enter(&dsp->ds_rw_lock, RW_WRITER);
+ ASSERT(*pp == p && p->dma_nextp == NULL);
+ *pp = NULL;
+ kmem_free(p, sizeof (dls_multicst_addr_t));
done:
- rw_exit(&(dip->di_lock));
+ rw_exit(&dsp->ds_rw_lock);
return (err);
}
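
The reader side of the ds_rw_lock protocol above lives in the receive path (see dls_accept_common() later in this patch); schematically, a data thread walks the list as:

	dls_multicst_addr_t	*dmap;

	rw_enter(&dsp->ds_rw_lock, RW_READER);
	for (dmap = dsp->ds_dmap; dmap != NULL; dmap = dmap->dma_nextp) {
		/* compare dmap->dma_addr against the packet's daddr */
	}
	rw_exit(&dsp->ds_rw_lock);
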
int
-dls_multicst_remove(dls_channel_t dc, const uint8_t *addr)
+dls_multicst_remove(dld_str_t *dsp, const uint8_t *addr)
{
- dls_impl_t *dip = (dls_impl_t *)dc;
- int err;
dls_multicst_addr_t **pp;
dls_multicst_addr_t *p;
uint_t addr_length;
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+
/*
* Find the address in the list of enabled addresses for this
- * dls_impl_t.
+ * dld_str_t.
*/
- rw_enter(&(dip->di_lock), RW_WRITER);
- addr_length = dip->di_mip->mi_addr_length;
- for (pp = &(dip->di_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) {
+ addr_length = dsp->ds_mip->mi_addr_length;
+
+ /*
+ * Protect against concurrent access to ds_dmap by data threads using
+ * ds_rw_lock. The mac perimeter serializes the dls_multicst_add and
+ * remove operations. Dropping the ds_rw_lock across mac calls is thus
+ * ok and is also required by the locking protocol.
+ */
+ rw_enter(&dsp->ds_rw_lock, RW_WRITER);
+ for (pp = &(dsp->ds_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) {
if (bcmp(addr, p->dma_addr, addr_length) == 0)
break;
}
/*
* If we walked to the end of the list then the given address is
- * not currently enabled for this dls_impl_t.
+ * not currently enabled for this dld_str_t.
*/
if (p == NULL) {
- err = ENOENT;
- goto done;
+ rw_exit(&dsp->ds_rw_lock);
+ return (ENOENT);
}
/*
- * Disable the address at the MAC.
+ * Remove the address from the list.
*/
- if ((err = mac_multicst_remove(dip->di_mh, addr)) != 0)
- goto done;
+ *pp = p->dma_nextp;
+ rw_exit(&dsp->ds_rw_lock);
/*
- * Remove the address from the list.
+ * Disable the address at the MAC.
*/
- *pp = p->dma_nextp;
+ mac_multicast_remove(dsp->ds_mch, addr);
kmem_free(p, sizeof (dls_multicst_addr_t));
-
-done:
- rw_exit(&(dip->di_lock));
- return (err);
+ return (0);
}
mblk_t *
-dls_header(dls_channel_t dc, const uint8_t *addr, uint16_t sap, uint_t pri,
+dls_header(dld_str_t *dsp, const uint8_t *addr, uint16_t sap, uint_t pri,
mblk_t **payloadp)
{
- dls_impl_t *dip = (dls_impl_t *)dc;
uint16_t vid;
size_t extra_len;
uint16_t mac_sap;
mblk_t *mp, *payload;
- boolean_t is_ethernet = (dip->di_mip->mi_media == DL_ETHER);
+ boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
struct ether_vlan_header *evhp;
- vid = dip->di_dvp->dv_id;
+ vid = mac_client_vid(dsp->ds_mch);
payload = (payloadp == NULL) ? NULL : (*payloadp);
/*
@@ -719,7 +421,7 @@ dls_header(dls_channel_t dc, const uint8_t *addr, uint16_t sap, uint_t pri,
mac_sap = sap;
}
- mp = mac_header(dip->di_mh, addr, mac_sap, payload, extra_len);
+ mp = mac_header(dsp->ds_mh, addr, mac_sap, payload, extra_len);
if (mp == NULL)
return (NULL);
@@ -772,209 +474,207 @@ dls_header(dls_channel_t dc, const uint8_t *addr, uint16_t sap, uint_t pri,
return (mp);
}
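
A hedged usage sketch for dls_header(): daddr and payload are assumed to be supplied by the caller (for example, from a DLPI unitdata request), and the error handling is illustrative only. The returned header block is simply chained onto the payload.

	mblk_t	*hdr;

	if ((hdr = dls_header(dsp, daddr, ETHERTYPE_IP, 0, NULL)) == NULL)
		return (ENOMEM);	/* assumed error handling */
	hdr->b_cont = payload;
	/* hdr is now ready for the transmit path */
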
-int
-dls_header_info(dls_channel_t dc, mblk_t *mp, mac_header_info_t *mhip)
-{
- return (dls_link_header_info(((dls_impl_t *)dc)->di_dvp->dv_dlp,
- mp, mhip));
-}
-
void
-dls_rx_set(dls_channel_t dc, dls_rx_t rx, void *arg)
-{
- dls_impl_t *dip = (dls_impl_t *)dc;
-
- rw_enter(&(dip->di_lock), RW_WRITER);
- dip->di_rx = rx;
- dip->di_rx_arg = arg;
- rw_exit(&(dip->di_lock));
-}
-
-mblk_t *
-dls_tx(dls_channel_t dc, mblk_t *mp)
+dls_rx_set(dld_str_t *dsp, dls_rx_t rx, void *arg)
{
- const mac_txinfo_t *mtp = ((dls_impl_t *)dc)->di_txinfo;
-
- return (mtp->mt_fn(mtp->mt_arg, mp));
+ mutex_enter(&dsp->ds_lock);
+ dsp->ds_rx = rx;
+ dsp->ds_rx_arg = arg;
+ mutex_exit(&dsp->ds_lock);
}
-boolean_t
-dls_accept(dls_impl_t *dip, mac_header_info_t *mhip, dls_rx_t *di_rx,
- void **di_rx_arg)
+static boolean_t
+dls_accept_common(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx,
+ void **ds_rx_arg, boolean_t promisc, boolean_t promisc_loopback)
{
dls_multicst_addr_t *dmap;
- size_t addr_length = dip->di_mip->mi_addr_length;
+ size_t addr_length = dsp->ds_mip->mi_addr_length;
/*
- * We must not accept packets if the dls_impl_t is not marked as bound
+ * We must not accept packets if the dld_str_t is not marked as bound
* or is being removed.
*/
- rw_enter(&(dip->di_lock), RW_READER);
- if (!dip->di_bound || dip->di_removing)
+ if (dsp->ds_dlstate != DL_IDLE)
goto refuse;
- /*
- * If the dls_impl_t is in 'all physical' mode then always accept.
- */
- if (dip->di_promisc & DLS_PROMISC_PHYS)
- goto accept;
+ if (dsp->ds_promisc != 0) {
+ /*
+ * Filter out packets that arrived from the data path
+ * (i_dls_link_rx) when promisc mode is on.
+ */
+ if (!promisc)
+ goto refuse;
+ /*
+		 * If the dld_str_t is in 'all physical' mode then
+ * always accept.
+ */
+ if (dsp->ds_promisc & DLS_PROMISC_PHYS)
+ goto accept;
- /*
- * For non-promiscs-phys streams, filter out the packets looped back
- * from the underlying driver because of promiscuous setting.
- */
- if (mhip->mhi_prom_looped)
- goto refuse;
+ /*
+		 * Loopback packets, i.e. packets sent out by DLS on a given
+		 * mac end point, are accepted back by DLS on loopback from
+		 * the mac only in 'all physical' mode, which has already
+		 * been covered by the check above.
+ */
+ if (promisc_loopback)
+ goto refuse;
+ }
switch (mhip->mhi_dsttype) {
case MAC_ADDRTYPE_UNICAST:
+ case MAC_ADDRTYPE_BROADCAST:
/*
- * Check to see if the destination address matches the
- * dls_impl_t unicast address.
+ * We can accept unicast and broadcast packets because
+ * filtering is already done by the mac layer.
*/
- if (memcmp(mhip->mhi_daddr, dip->di_unicst_addr, addr_length) ==
- 0) {
- goto accept;
- }
- break;
+ goto accept;
case MAC_ADDRTYPE_MULTICAST:
/*
- * Check the address against the list of addresses enabled
- * for this dls_impl_t or accept it unconditionally if the
- * dls_impl_t is in 'all multicast' mode.
+ * Additional filtering is needed for multicast addresses
+ * because different streams may be interested in different
+ * addresses.
*/
- if (dip->di_promisc & DLS_PROMISC_MULTI)
+ if (dsp->ds_promisc & DLS_PROMISC_MULTI)
goto accept;
- for (dmap = dip->di_dmap; dmap != NULL;
+
+ rw_enter(&dsp->ds_rw_lock, RW_READER);
+ for (dmap = dsp->ds_dmap; dmap != NULL;
dmap = dmap->dma_nextp) {
if (memcmp(mhip->mhi_daddr, dmap->dma_addr,
addr_length) == 0) {
+ rw_exit(&dsp->ds_rw_lock);
goto accept;
}
}
+ rw_exit(&dsp->ds_rw_lock);
break;
- case MAC_ADDRTYPE_BROADCAST:
- /*
- * If the address is broadcast then the dls_impl_t will
- * always accept it.
- */
- goto accept;
}
refuse:
- rw_exit(&(dip->di_lock));
return (B_FALSE);
accept:
/*
- * Since we hold di_lock here, the returned di_rx and di_rx_arg will
- * always be in sync.
+	 * Since we hold ds_lock here, the returned ds_rx and ds_rx_arg
+	 * will always be in sync.
*/
- *di_rx = dip->di_rx;
- *di_rx_arg = dip->di_rx_arg;
- rw_exit(&(dip->di_lock));
+ mutex_enter(&dsp->ds_lock);
+ *ds_rx = dsp->ds_rx;
+ *ds_rx_arg = dsp->ds_rx_arg;
+ mutex_exit(&dsp->ds_lock);
+
return (B_TRUE);
}
/* ARGSUSED */
boolean_t
-dls_accept_loopback(dls_impl_t *dip, mac_header_info_t *mhip, dls_rx_t *di_rx,
- void **di_rx_arg)
+dls_accept(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx,
+ void **ds_rx_arg)
{
- /*
- * We must not accept packets if the dls_impl_t is not marked as bound
- * or is being removed.
- */
- rw_enter(&(dip->di_lock), RW_READER);
- if (!dip->di_bound || dip->di_removing)
- goto refuse;
-
- /*
- * A dls_impl_t should only accept loopback packets if it is in
- * 'all physical' mode.
- */
- if (dip->di_promisc & DLS_PROMISC_PHYS)
- goto accept;
-
-refuse:
- rw_exit(&(dip->di_lock));
- return (B_FALSE);
-
-accept:
- /*
- * Since we hold di_lock here, the returned di_rx and di_rx_arg will
- * always be in sync.
- */
- *di_rx = dip->di_rx;
- *di_rx_arg = dip->di_rx_arg;
- rw_exit(&(dip->di_lock));
- return (B_TRUE);
+ return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_FALSE,
+ B_FALSE));
}
boolean_t
+dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx,
+ void **ds_rx_arg, boolean_t loopback)
+{
+ return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE,
+ loopback));
+}
+
+int
dls_mac_active_set(dls_link_t *dlp)
{
- mutex_enter(&dlp->dl_lock);
+ int err = 0;
/*
- * If this is the first active client on this link, notify
- * the mac that we're becoming an active client.
+	 * If this is the first active client on this link, bring it up:
+	 * add the primary unicast address, set the rx function and push
+	 * down any flows defined on the link.
*/
- if (dlp->dl_nactive == 0 && !mac_active_shareable_set(dlp->dl_mh)) {
- mutex_exit(&dlp->dl_lock);
- return (B_FALSE);
+ if (dlp->dl_nactive == 0) {
+ mac_diag_t diag;
+
+ /* request the primary MAC address */
+ if ((err = mac_unicast_primary_add(dlp->dl_mch, &dlp->dl_mah,
+ &diag)) != 0) {
+ return (err);
+ }
+
+ /*
+ * Set the function to start receiving packets.
+ */
+ mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp);
+
+ /*
+ * We've got a MAC client for this link now.
+ * Push down the flows that were defined on this link
+ * hitherto. The flows are added to the active flow table
+ * and SRS, softrings etc. are created as needed.
+ */
+ mac_link_init_flows(dlp->dl_mch);
}
dlp->dl_nactive++;
- mutex_exit(&dlp->dl_lock);
- return (B_TRUE);
+ return (0);
}
void
dls_mac_active_clear(dls_link_t *dlp)
{
- mutex_enter(&dlp->dl_lock);
- if (--dlp->dl_nactive == 0)
- mac_active_clear(dlp->dl_mh);
- mutex_exit(&dlp->dl_lock);
+ if (--dlp->dl_nactive == 0) {
+ ASSERT(dlp->dl_mah != NULL);
+ /*
+		 * We would have initialized subflows etc. only if we
+		 * brought up the primary client and set the unicast
+		 * address etc. Deactivate the flows. The flow entries
+		 * will be removed from the active flow tables, and the
+		 * associated SRSes, softrings etc. will be deleted. But
+		 * the flow entries themselves won't be destroyed;
+		 * instead they will continue to be archived off the
+		 * global flow hash list, for a possible future
+		 * activation when, say, IP is plumbed again.
+ */
+
+ mac_link_release_flows(dlp->dl_mch);
+ (void) mac_unicast_remove(dlp->dl_mch, dlp->dl_mah);
+ dlp->dl_mah = NULL;
+ mac_rx_clear(dlp->dl_mch);
+ }
}
-boolean_t
-dls_active_set(dls_channel_t dc)
+int
+dls_active_set(dld_str_t *dsp)
{
- dls_impl_t *dip = (dls_impl_t *)dc;
- dls_link_t *dlp = dip->di_dvp->dv_dlp;
+ int err = 0;
- rw_enter(&dip->di_lock, RW_WRITER);
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
/* If we're already active, then there's nothing more to do. */
- if (dip->di_active) {
- rw_exit(&dip->di_lock);
- return (B_TRUE);
- }
+ if (dsp->ds_active)
+ return (0);
- if (!dls_mac_active_set(dlp)) {
- rw_exit(&dip->di_lock);
- return (B_FALSE);
+ if ((err = dls_mac_active_set(dsp->ds_dlp)) != 0) {
+ /* except for ENXIO all other errors are mapped to EBUSY */
+ if (err != ENXIO)
+ return (EBUSY);
+ return (err);
}
- dip->di_active = B_TRUE;
- rw_exit(&dip->di_lock);
- return (B_TRUE);
+
+ dsp->ds_active = B_TRUE;
+ return (0);
}
void
-dls_active_clear(dls_channel_t dc)
+dls_active_clear(dld_str_t *dsp)
{
- dls_impl_t *dip = (dls_impl_t *)dc;
- dls_link_t *dlp = dip->di_dvp->dv_dlp;
-
- rw_enter(&dip->di_lock, RW_WRITER);
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
- if (!dip->di_active)
- goto out;
- dip->di_active = B_FALSE;
-
- dls_mac_active_clear(dlp);
+ if (!dsp->ds_active)
+ return;
-out:
- rw_exit(&dip->di_lock);
+ dls_mac_active_clear(dsp->ds_dlp);
+ dsp->ds_active = B_FALSE;
}
diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c
index 759fb97f0a..852b87d24b 100644
--- a/usr/src/uts/common/io/dls/dls_link.c
+++ b/usr/src/uts/common/io/dls/dls_link.c
@@ -23,34 +23,21 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Data-Link Services Module
*/
-#include <sys/types.h>
-#include <sys/stream.h>
-#include <sys/strsun.h>
-#include <sys/strsubr.h>
#include <sys/sysmacros.h>
-#include <sys/atomic.h>
-#include <sys/modhash.h>
-#include <sys/dlpi.h>
-#include <sys/ethernet.h>
-#include <sys/byteorder.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
#include <sys/vlan.h>
-#include <sys/mac.h>
-#include <sys/sdt.h>
-
-#include <sys/dls.h>
#include <sys/dld_impl.h>
-#include <sys/dls_impl.h>
+#include <sys/sdt.h>
+#include <sys/atomic.h>
static kmem_cache_t *i_dls_link_cachep;
static mod_hash_t *i_dls_link_hash;
static uint_t i_dls_link_count;
-static krwlock_t i_dls_link_lock;
#define LINK_HASHSZ 67 /* prime */
#define IMPL_HASHSZ 67 /* prime */
@@ -58,15 +45,8 @@ static krwlock_t i_dls_link_lock;
/*
- * Construct a hash key encompassing both DLSAP value and VLAN idenitifier.
+ * Construct a hash key from the DLSAP value.
*/
-#define MAKE_KEY(_sap, _vid) \
- ((mod_hash_key_t)(uintptr_t) \
- (((_sap) << VLAN_ID_SIZE) | (_vid) & VLAN_ID_MASK))
-
-/*
- * Extract the DLSAP value from the hash key.
- */
-#define KEY_SAP(_key) \
- (((uint32_t)(uintptr_t)(_key)) >> VLAN_ID_SIZE)
+#define MAKE_KEY(_sap) \
+ ((mod_hash_key_t)(uintptr_t)((_sap) << VLAN_ID_SIZE))
#define DLS_STRIP_PADDING(pktsize, p) { \
if (pktsize != 0) { \
@@ -91,12 +71,9 @@ i_dls_link_constructor(void *buf, void *arg, int kmflag)
bzero(buf, sizeof (dls_link_t));
(void) snprintf(name, MAXNAMELEN, "dls_link_t_%p_hash", buf);
- dlp->dl_impl_hash = mod_hash_create_idhash(name, IMPL_HASHSZ,
+ dlp->dl_str_hash = mod_hash_create_idhash(name, IMPL_HASHSZ,
mod_hash_null_valdtor);
- mutex_init(&dlp->dl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&dlp->dl_promisc_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&dlp->dl_impl_lock, NULL, RW_DEFAULT, NULL);
return (0);
}
@@ -108,14 +85,12 @@ i_dls_link_destructor(void *buf, void *arg)
ASSERT(dlp->dl_ref == 0);
ASSERT(dlp->dl_mh == NULL);
+ ASSERT(dlp->dl_mah == NULL);
ASSERT(dlp->dl_unknowns == 0);
- mod_hash_destroy_idhash(dlp->dl_impl_hash);
- dlp->dl_impl_hash = NULL;
+ mod_hash_destroy_idhash(dlp->dl_str_hash);
+ dlp->dl_str_hash = NULL;
- mutex_destroy(&dlp->dl_lock);
- mutex_destroy(&dlp->dl_promisc_lock);
- rw_destroy(&dlp->dl_impl_lock);
}
/*
@@ -195,8 +170,7 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip,
*/
if (memcmp(mhip->mhi_daddr, cmhi.mhi_daddr, addr_size) != 0 ||
memcmp(mhip->mhi_saddr, cmhi.mhi_saddr, addr_size) != 0 ||
- mhip->mhi_bindsap != cmhi.mhi_bindsap ||
- mhip->mhi_prom_looped != cmhi.mhi_prom_looped) {
+ mhip->mhi_bindsap != cmhi.mhi_bindsap) {
/*
* Note that we don't need to restore the padding.
*/
@@ -239,16 +213,34 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip,
return (mp);
}
-static void
-i_dls_head_hold(dls_head_t *dhp)
+/* ARGSUSED */
+static int
+i_dls_head_hold(mod_hash_key_t key, mod_hash_val_t val)
{
- atomic_inc_32(&dhp->dh_ref);
+ dls_head_t *dhp = (dls_head_t *)val;
+
+ /*
+	 * The lock order is mod_hash's internal lock -> dh_lock, as in the
+	 * call path i_dls_link_rx -> mod_hash_find_cb_rval ->
+	 * i_dls_head_hold.
+ */
+ mutex_enter(&dhp->dh_lock);
+ if (dhp->dh_removing) {
+ mutex_exit(&dhp->dh_lock);
+ return (-1);
+ }
+ dhp->dh_ref++;
+ mutex_exit(&dhp->dh_lock);
+ return (0);
}
-static void
+void
i_dls_head_rele(dls_head_t *dhp)
{
- atomic_dec_32(&dhp->dh_ref);
+ mutex_enter(&dhp->dh_lock);
+ dhp->dh_ref--;
+ if (dhp->dh_ref == 0 && dhp->dh_removing != 0)
+ cv_broadcast(&dhp->dh_cv);
+ mutex_exit(&dhp->dh_lock);
}
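
i_dls_head_hold() and i_dls_head_rele() form one half of a quiesce protocol; the other half, in dls_link_remove() later in this patch, drains the references as:

	mutex_enter(&dhp->dh_lock);
	dhp->dh_removing = B_TRUE;
	while (dhp->dh_ref != 0)
		cv_wait(&dhp->dh_cv, &dhp->dh_lock);
	mutex_exit(&dhp->dh_lock);
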
static dls_head_t *
@@ -276,83 +268,86 @@ i_dls_head_free(dls_head_t *dhp)
*/
static uint_t
i_dls_link_rx_func(dls_link_t *dlp, mac_resource_handle_t mrh,
- mac_header_info_t *mhip, mblk_t *mp, uint32_t sap, uint16_t vid,
+ mac_header_info_t *mhip, mblk_t *mp, uint32_t sap,
boolean_t (*acceptfunc)())
{
- mod_hash_t *hash = dlp->dl_impl_hash;
+ mod_hash_t *hash = dlp->dl_str_hash;
mod_hash_key_t key;
dls_head_t *dhp;
- dls_impl_t *dip;
+ dld_str_t *dsp;
mblk_t *nmp;
- dls_rx_t di_rx;
- void *di_rx_arg;
+ dls_rx_t ds_rx;
+ void *ds_rx_arg;
uint_t naccepted = 0;
+ int rval;
/*
-	 * Construct a hash key from the VLAN identifier and the
-	 * DLSAP that represents dls_impl_t in promiscuous mode.
+	 * Construct a hash key from the DLSAP that represents a
+	 * dld_str_t in promiscuous mode.
*/
- key = MAKE_KEY(sap, vid);
+ key = MAKE_KEY(sap);
/*
- * Search the hash table for dls_impl_t eligible to receive
- * a packet chain for this DLSAP/VLAN combination.
+	 * Search the hash table for a dld_str_t eligible to receive
+	 * a packet chain for this DLSAP. The mod hash's
+ * internal lock serializes find/insert/remove from the mod hash list.
+ * Incrementing the dh_ref (while holding the mod hash lock) ensures
+ * dls_link_remove will wait for the upcall to finish.
*/
- rw_enter(&dlp->dl_impl_lock, RW_READER);
- if (mod_hash_find(hash, key, (mod_hash_val_t *)&dhp) != 0) {
- rw_exit(&dlp->dl_impl_lock);
+ if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp,
+ i_dls_head_hold, &rval) != 0 || (rval != 0)) {
return (B_FALSE);
}
- i_dls_head_hold(dhp);
- rw_exit(&dlp->dl_impl_lock);
/*
- * Find dls_impl_t that will accept the sub-chain.
+ * Find dld_str_t that will accept the sub-chain.
*/
- for (dip = dhp->dh_list; dip != NULL; dip = dip->di_nextp) {
- if (!acceptfunc(dip, mhip, &di_rx, &di_rx_arg))
+ for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next) {
+ if (!acceptfunc(dsp, mhip, &ds_rx, &ds_rx_arg))
continue;
/*
* We have at least one acceptor.
*/
- naccepted ++;
+ naccepted++;
/*
- * There will normally be at least more dls_impl_t
+ * There will normally be at least more dld_str_t
* (since we've yet to check for non-promiscuous
- * dls_impl_t) so dup the sub-chain.
+ * dld_str_t) so dup the sub-chain.
*/
if ((nmp = copymsgchain(mp)) != NULL)
- di_rx(di_rx_arg, mrh, nmp, mhip);
+ ds_rx(ds_rx_arg, mrh, nmp, mhip);
}
/*
- * Release the hold on the dls_impl_t chain now that we have
+ * Release the hold on the dld_str_t chain now that we have
* finished walking it.
*/
i_dls_head_rele(dhp);
return (naccepted);
}
-static void
-i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
+/* ARGSUSED */
+void
+i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
{
dls_link_t *dlp = arg;
- mod_hash_t *hash = dlp->dl_impl_hash;
+ mod_hash_t *hash = dlp->dl_str_hash;
mblk_t *nextp;
mac_header_info_t mhi;
dls_head_t *dhp;
- dls_impl_t *dip;
- dls_impl_t *ndip;
+ dld_str_t *dsp;
+ dld_str_t *ndsp;
mblk_t *nmp;
mod_hash_key_t key;
uint_t npacket;
boolean_t accepted;
- dls_rx_t di_rx, ndi_rx;
- void *di_rx_arg, *ndi_rx_arg;
+ dls_rx_t ds_rx, nds_rx;
+ void *ds_rx_arg, *nds_rx_arg;
uint16_t vid;
- int err;
+ int err, rval;
/*
* Walk the packet chain.
@@ -384,11 +379,11 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
if (mhi.mhi_istagged) {
/*
* If it is tagged traffic, send it upstream to
- * all dls_impl_t which are attached to the physical
+ * all dld_str_t which are attached to the physical
* link and bound to SAP 0x8100.
*/
if (i_dls_link_rx_func(dlp, mrh, &mhi, mp,
- ETHERTYPE_VLAN, VLAN_ID_NONE, dls_accept) > 0) {
+ ETHERTYPE_VLAN, dls_accept) > 0) {
accepted = B_TRUE;
}
@@ -413,33 +408,30 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
-		 * Construct a hash key from the VLAN identifier and the
-		 * DLSAP.
+		 * Construct a hash key from the DLSAP.
*/
- key = MAKE_KEY(mhi.mhi_bindsap, vid);
+ key = MAKE_KEY(mhi.mhi_bindsap);
/*
-		 * Search the has table for dls_impl_t eligible to receive
-		 * a packet chain for this DLSAP/VLAN combination.
+		 * Search the hash table for a dld_str_t eligible to receive
+		 * a packet chain for this DLSAP.
*/
- rw_enter(&dlp->dl_impl_lock, RW_READER);
- if (mod_hash_find(hash, key, (mod_hash_val_t *)&dhp) != 0) {
- rw_exit(&dlp->dl_impl_lock);
+ if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp,
+ i_dls_head_hold, &rval) != 0 || (rval != 0)) {
freemsgchain(mp);
goto loop;
}
- i_dls_head_hold(dhp);
- rw_exit(&dlp->dl_impl_lock);
/*
- * Find the first dls_impl_t that will accept the sub-chain.
+ * Find the first dld_str_t that will accept the sub-chain.
*/
- for (dip = dhp->dh_list; dip != NULL; dip = dip->di_nextp)
- if (dls_accept(dip, &mhi, &di_rx, &di_rx_arg))
+ for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next)
+ if (dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg))
break;
/*
- * If we did not find any dls_impl_t willing to accept the
+ * If we did not find any dld_str_t willing to accept the
* sub-chain then throw it away.
*/
- if (dip == NULL) {
+ if (dsp == NULL) {
i_dls_head_rele(dhp);
freemsgchain(mp);
goto loop;
@@ -451,43 +443,43 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
accepted = B_TRUE;
for (;;) {
/*
- * Find the next dls_impl_t that will accept the
+ * Find the next dld_str_t that will accept the
* sub-chain.
*/
- for (ndip = dip->di_nextp; ndip != NULL;
- ndip = ndip->di_nextp)
- if (dls_accept(ndip, &mhi, &ndi_rx,
- &ndi_rx_arg))
+ for (ndsp = dsp->ds_next; ndsp != NULL;
+ ndsp = ndsp->ds_next)
+ if (dls_accept(ndsp, &mhi, &nds_rx,
+ &nds_rx_arg))
break;
/*
- * If there are no more dls_impl_t that are willing
+ * If there are no more dld_str_t that are willing
* to accept the sub-chain then we don't need to dup
* it before handing it to the current one.
*/
- if (ndip == NULL) {
- di_rx(di_rx_arg, mrh, mp, &mhi);
+ if (ndsp == NULL) {
+ ds_rx(ds_rx_arg, mrh, mp, &mhi);
/*
- * Since there are no more dls_impl_t, we're
+ * Since there are no more dld_str_t, we're
* done.
*/
break;
}
/*
- * There are more dls_impl_t so dup the sub-chain.
+ * There are more dld_str_t so dup the sub-chain.
*/
if ((nmp = copymsgchain(mp)) != NULL)
- di_rx(di_rx_arg, mrh, nmp, &mhi);
+ ds_rx(ds_rx_arg, mrh, nmp, &mhi);
- dip = ndip;
- di_rx = ndi_rx;
- di_rx_arg = ndi_rx_arg;
+ dsp = ndsp;
+ ds_rx = nds_rx;
+ ds_rx_arg = nds_rx_arg;
}
/*
- * Release the hold on the dls_impl_t chain now that we have
+ * Release the hold on the dld_str_t chain now that we have
* finished walking it.
*/
i_dls_head_rele(dhp);
@@ -502,220 +494,119 @@ loop:
}
}
-/*
- * Try to send mp up to the DLS_SAP_PROMISC listeners. Return B_TRUE if this
- * message is sent to any streams.
- */
-static uint_t
-i_dls_link_rx_common_promisc(dls_link_t *dlp, mac_resource_handle_t mrh,
- mac_header_info_t *mhip, mblk_t *mp, uint16_t vid,
- boolean_t (*acceptfunc)())
+/* ARGSUSED */
+void
+dls_rx_vlan_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
{
- uint_t naccepted;
+ dld_str_t *dsp = arg;
+ dls_link_t *dlp = dsp->ds_dlp;
+ mac_header_info_t mhi;
+ dls_rx_t ds_rx;
+ void *ds_rx_arg;
+ int err;
- naccepted = i_dls_link_rx_func(dlp, mrh, mhip, mp, DLS_SAP_PROMISC,
- vid, acceptfunc);
+ DLS_PREPARE_PKT(dlp, mp, &mhi, err);
+ if (err != 0)
+ goto drop;
- if (vid != VLAN_ID_NONE) {
- naccepted += i_dls_link_rx_func(dlp, mrh, mhip, mp,
- DLS_SAP_PROMISC, VLAN_ID_NONE, acceptfunc);
+ /*
+	 * If there is a promiscuous handle for the VLAN, we filter out
+	 * untagged packets and packets that are not destined for the
+	 * primary unicast address.
+ */
+ if (dsp->ds_vlan_mph != NULL) {
+ uint8_t prim_addr[MAXMACADDRLEN];
+ size_t addr_length = dsp->ds_mip->mi_addr_length;
+
+ if (!(mhi.mhi_istagged))
+ goto drop;
+ ASSERT(dsp->ds_mh != NULL);
+ mac_unicast_primary_get(dsp->ds_mh, (uint8_t *)prim_addr);
+ if (memcmp(mhi.mhi_daddr, prim_addr, addr_length) != 0)
+ goto drop;
+
+ if (!dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg))
+ goto drop;
+
+ ds_rx(ds_rx_arg, NULL, mp, &mhi);
+ return;
}
- return (naccepted);
+
+drop:
+ atomic_add_32(&dlp->dl_unknowns, 1);
+ freemsg(mp);
}
-static void
-i_dls_link_rx_common(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
- boolean_t (*acceptfunc)())
+/* ARGSUSED */
+void
+dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
{
- dls_link_t *dlp = arg;
- mod_hash_t *hash = dlp->dl_impl_hash;
- mblk_t *nextp;
+ dld_str_t *dsp = arg;
+ dls_link_t *dlp = dsp->ds_dlp;
mac_header_info_t mhi;
- uint16_t vid, vidkey, pri;
+ dls_rx_t ds_rx;
+ void *ds_rx_arg;
+ int err;
dls_head_t *dhp;
- dls_impl_t *dip;
- mblk_t *nmp;
mod_hash_key_t key;
- uint_t npacket;
- uint32_t sap;
- boolean_t accepted;
- dls_rx_t di_rx, fdi_rx;
- void *di_rx_arg, *fdi_rx_arg;
- boolean_t pass2;
- int err;
+
+ DLS_PREPARE_PKT(dlp, mp, &mhi, err);
+ if (err != 0)
+ goto drop;
/*
- * Walk the packet chain.
+	 * In order to filter out packets for SAPs that no dls channel is
+	 * listening on, search the hash table for a dld_str_t eligible to
+	 * receive the packet.
*/
- for (; mp != NULL; mp = nextp) {
- /*
- * Wipe the accepted state and the receive information of
- * the first eligible dls_impl_t.
- */
- accepted = B_FALSE;
- pass2 = B_FALSE;
- fdi_rx = NULL;
- fdi_rx_arg = NULL;
-
- DLS_PREPARE_PKT(dlp, mp, &mhi, err);
- if (err != 0) {
- if (acceptfunc == dls_accept)
- atomic_add_32(&(dlp->dl_unknowns), 1);
- nextp = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- continue;
- }
-
- /*
- * Grab the longest sub-chain we can process as a single
- * unit.
- */
- nextp = i_dls_link_subchain(dlp, mp, &mhi, &npacket);
- ASSERT(npacket != 0);
-
- vid = VLAN_ID(mhi.mhi_tci);
- pri = VLAN_PRI(mhi.mhi_tci);
-
- vidkey = vid;
-
- /*
- * Note that we need to first send to the dls_impl_t
- * in promiscuous mode in order to avoid the packet reordering
- * when snooping.
- */
- if (i_dls_link_rx_common_promisc(dlp, mrh, &mhi, mp, vidkey,
- acceptfunc) > 0) {
- accepted = B_TRUE;
- }
-
- /*
- * Non promisc case. Two passes:
- * 1. send tagged packets to ETHERTYPE_VLAN listeners
- * 2. send packets to listeners bound to the specific SAP.
- */
- if (mhi.mhi_istagged) {
- vidkey = VLAN_ID_NONE;
- sap = ETHERTYPE_VLAN;
- } else {
- goto non_promisc_loop;
- }
-non_promisc:
- /*
- * Construct a hash key from the VLAN identifier and the
- * DLSAP.
- */
- key = MAKE_KEY(sap, vidkey);
-
- /*
- * Search the has table for dls_impl_t eligible to receive
- * a packet chain for this DLSAP/VLAN combination.
- */
- rw_enter(&dlp->dl_impl_lock, RW_READER);
- if (mod_hash_find(hash, key, (mod_hash_val_t *)&dhp) != 0) {
- rw_exit(&dlp->dl_impl_lock);
- goto non_promisc_loop;
- }
- i_dls_head_hold(dhp);
- rw_exit(&dlp->dl_impl_lock);
-
- /*
- * Find the first dls_impl_t that will accept the sub-chain.
- */
- for (dip = dhp->dh_list; dip != NULL; dip = dip->di_nextp) {
- if (!acceptfunc(dip, &mhi, &di_rx, &di_rx_arg))
- continue;
-
- accepted = B_TRUE;
-
- /*
- * To avoid the extra copymsgchain(), if this
- * is the first eligible dls_impl_t, remember required
- * information and send up the message afterwards.
- */
- if (fdi_rx == NULL) {
- fdi_rx = di_rx;
- fdi_rx_arg = di_rx_arg;
- continue;
- }
+ if ((dsp->ds_promisc & DLS_PROMISC_SAP) == 0) {
+ key = MAKE_KEY(mhi.mhi_bindsap);
+ if (mod_hash_find(dsp->ds_dlp->dl_str_hash, key,
+ (mod_hash_val_t *)&dhp) != 0)
+ goto drop;
+ }
- if ((nmp = copymsgchain(mp)) != NULL)
- di_rx(di_rx_arg, mrh, nmp, &mhi);
- }
+ if (!dls_accept_promisc(dsp, &mhi, &ds_rx, &ds_rx_arg, loopback))
+ goto drop;
- /*
- * Release the hold on the dls_impl_t chain now that we have
- * finished walking it.
- */
- i_dls_head_rele(dhp);
+ ds_rx(ds_rx_arg, NULL, mp, &mhi);
+ return;
-non_promisc_loop:
- /*
- * Don't pass the packets up again if:
- * - First pass is done and the packets are tagged and their:
- * - VID and priority are both zero (invalid packets).
- * - their sap is ETHERTYPE_VLAN and their VID is zero
- * (they have already been sent upstreams).
- * - Second pass is done:
- */
- if (pass2 || (mhi.mhi_istagged &&
- ((vid == VLAN_ID_NONE && pri == 0) ||
- (mhi.mhi_bindsap == ETHERTYPE_VLAN &&
- vid == VLAN_ID_NONE)))) {
- /*
- * Send the message up to the first eligible dls_impl_t.
- */
- if (fdi_rx != NULL)
- fdi_rx(fdi_rx_arg, mrh, mp, &mhi);
- else
- freemsgchain(mp);
- } else {
- vidkey = vid;
- sap = mhi.mhi_bindsap;
- pass2 = B_TRUE;
- goto non_promisc;
- }
-
- /*
- * If there were no acceptors then add the packet count to the
- * 'unknown' count.
- */
- if (!accepted && (acceptfunc == dls_accept))
- atomic_add_32(&(dlp->dl_unknowns), npacket);
- }
+drop:
+ atomic_add_32(&dlp->dl_unknowns, 1);
+ freemsg(mp);
}
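
For reference, both receive callbacks above are wired up through mac_promisc_add() in dls_bind() and dls_promisc() earlier in this patch, e.g.:

	err = mac_promisc_add(dsp->ds_mch, MAC_CLIENT_PROMISC_ALL,
	    dls_rx_promisc, dsp, &dsp->ds_mph,
	    (dsp->ds_promisc != DLS_PROMISC_SAP) ? 0 :
	    MAC_PROMISC_FLAGS_NO_PHYS);
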
static void
-i_dls_link_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
-{
- i_dls_link_rx_common(arg, mrh, mp, dls_accept);
-}
-
-void
-dls_link_txloop(void *arg, mblk_t *mp)
+i_dls_link_destroy(dls_link_t *dlp)
{
- i_dls_link_rx_common(arg, NULL, mp, dls_accept_loopback);
-}
+ ASSERT(dlp->dl_nactive == 0);
+ ASSERT(dlp->dl_impl_count == 0);
+ ASSERT(dlp->dl_zone_ref == 0);
-/*ARGSUSED*/
-static uint_t
-i_dls_link_walk(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
-{
- boolean_t *promiscp = arg;
- uint32_t sap = KEY_SAP(key);
+ /*
+	 * Tear down the mac client and mac handles, then free the
+	 * structure back to the cache.
+ */
+ if (dlp->dl_mch != NULL)
+ mac_client_close(dlp->dl_mch, 0);
- if (sap == DLS_SAP_PROMISC) {
- *promiscp = B_TRUE;
- return (MH_WALK_TERMINATE);
+ if (dlp->dl_mh != NULL) {
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
+ mac_close(dlp->dl_mh);
}
- return (MH_WALK_CONTINUE);
+ dlp->dl_mh = NULL;
+ dlp->dl_mch = NULL;
+ dlp->dl_mip = NULL;
+ dlp->dl_unknowns = 0;
+ kmem_cache_free(i_dls_link_cachep, dlp);
}
static int
i_dls_link_create(const char *name, dls_link_t **dlpp)
{
dls_link_t *dlp;
+ int err;
/*
* Allocate a new dls_link_t structure.
@@ -728,32 +619,34 @@ i_dls_link_create(const char *name, dls_link_t **dlpp)
(void) strlcpy(dlp->dl_name, name, sizeof (dlp->dl_name));
/*
- * Initialize promiscuous bookkeeping fields.
+ * First reference; hold open the MAC interface.
*/
- dlp->dl_npromisc = 0;
- dlp->dl_mth = NULL;
+ ASSERT(dlp->dl_mh == NULL);
+ err = mac_open(dlp->dl_name, &dlp->dl_mh);
+ if (err != 0)
+ goto bail;
+
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
+ dlp->dl_mip = mac_info(dlp->dl_mh);
+
+ /* DLS is the "primary" MAC client */
+ ASSERT(dlp->dl_mch == NULL);
+
+ err = mac_client_open(dlp->dl_mh, &dlp->dl_mch, NULL,
+ MAC_OPEN_FLAGS_TAG_DISABLE | MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK |
+ MAC_OPEN_FLAGS_USE_DATALINK_NAME);
+ if (err != 0)
+ goto bail;
+
+ DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *,
+ dlp->dl_mch);
*dlpp = dlp;
return (0);
-}
-static void
-i_dls_link_destroy(dls_link_t *dlp)
-{
- ASSERT(dlp->dl_npromisc == 0);
- ASSERT(dlp->dl_nactive == 0);
- ASSERT(dlp->dl_mth == NULL);
- ASSERT(dlp->dl_macref == 0);
- ASSERT(dlp->dl_mh == NULL);
- ASSERT(dlp->dl_mip == NULL);
- ASSERT(dlp->dl_impl_count == 0);
- ASSERT(dlp->dl_mrh == NULL);
-
- /*
- * Free the structure back to the cache.
- */
- dlp->dl_unknowns = 0;
- kmem_cache_free(i_dls_link_cachep, dlp);
+bail:
+ i_dls_link_destroy(dlp);
+ return (err);
}
/*
@@ -777,7 +670,6 @@ dls_link_init(void)
i_dls_link_hash = mod_hash_create_extended("dls_link_hash",
IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
- rw_init(&i_dls_link_lock, NULL, RW_DEFAULT, NULL);
i_dls_link_count = 0;
}
@@ -796,7 +688,6 @@ dls_link_fini(void)
* Destroy the hash table and associated lock.
*/
mod_hash_destroy_hash(i_dls_link_hash);
- rw_destroy(&i_dls_link_lock);
return (0);
}
@@ -804,32 +695,33 @@ dls_link_fini(void)
* Exported functions.
*/
-int
-dls_link_hold(const char *name, dls_link_t **dlpp)
+static int
+dls_link_hold_common(const char *name, dls_link_t **dlpp, boolean_t create)
{
dls_link_t *dlp;
int err;
/*
- * Look up a dls_link_t corresponding to the given mac_handle_t
- * in the global hash table. We need to hold i_dls_link_lock in
- * order to atomically find and insert a dls_link_t into the
- * hash table.
+ * Look up a dls_link_t corresponding to the given macname in the
+ * global hash table. The i_dls_link_hash itself is protected by the
+ * mod_hash package's internal lock which synchronizes
+	 * find/insert/remove into the global mod_hash list. This assumes
+	 * that inserts and removes are serialized per mac end point by
+	 * the mac perimeter.
*/
- rw_enter(&i_dls_link_lock, RW_WRITER);
if ((err = mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name,
(mod_hash_val_t *)&dlp)) == 0)
goto done;
ASSERT(err == MH_ERR_NOTFOUND);
+ if (!create)
+ return (ENOENT);
/*
* We didn't find anything so we need to create one.
*/
- if ((err = i_dls_link_create(name, &dlp)) != 0) {
- rw_exit(&i_dls_link_lock);
+ if ((err = i_dls_link_create(name, &dlp)) != 0)
return (err);
- }
/*
* Insert the dls_link_t.
@@ -838,124 +730,200 @@ dls_link_hold(const char *name, dls_link_t **dlpp)
(mod_hash_val_t)dlp);
ASSERT(err == 0);
- i_dls_link_count++;
+ atomic_add_32(&i_dls_link_count, 1);
ASSERT(i_dls_link_count != 0);
done:
-
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
/*
* Bump the reference count and hand back the reference.
*/
dlp->dl_ref++;
*dlpp = dlp;
- rw_exit(&i_dls_link_lock);
return (0);
}
+int
+dls_link_hold_create(const char *name, dls_link_t **dlpp)
+{
+ return (dls_link_hold_common(name, dlpp, B_TRUE));
+}
+
+int
+dls_link_hold(const char *name, dls_link_t **dlpp)
+{
+ return (dls_link_hold_common(name, dlpp, B_FALSE));
+}
+
+dev_info_t *
+dls_link_devinfo(dev_t dev)
+{
+ dls_link_t *dlp;
+ dev_info_t *dip;
+ char macname[MAXNAMELEN];
+ char *drv;
+ mac_perim_handle_t mph;
+
+ if ((drv = ddi_major_to_name(getmajor(dev))) == NULL)
+ return (NULL);
+ (void) snprintf(macname, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1);
+
+ /*
+ * The code below assumes that the name constructed above is the
+ * macname. This is not the case for legacy devices. Currently this
+ * is ok because this function is only called in the getinfo(9e) path,
+ * which for a legacy device would directly end up in the driver's
+	 * getinfo, rather than here.
+ */
+ if (mac_perim_enter_by_macname(macname, &mph) != 0)
+ return (NULL);
+
+ if (dls_link_hold(macname, &dlp) != 0) {
+ mac_perim_exit(mph);
+ return (NULL);
+ }
+
+ dip = mac_devinfo_get(dlp->dl_mh);
+ dls_link_rele(dlp);
+ mac_perim_exit(mph);
+
+ return (dip);
+}
+
+dev_t
+dls_link_dev(dls_link_t *dlp)
+{
+ return (makedevice(ddi_driver_major(mac_devinfo_get(dlp->dl_mh)),
+ mac_minor(dlp->dl_mh)));
+}
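
A hedged sketch of the intended consumer, a getinfo(9e) handler mapping a dev_t back to its dev_info_t; the enclosing switch and declarations are assumed:

	case DDI_INFO_DEVT2DEVINFO:
		if ((dip = dls_link_devinfo((dev_t)arg)) == NULL)
			return (DDI_FAILURE);
		*result = (void *)dip;
		return (DDI_SUCCESS);
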
+
void
dls_link_rele(dls_link_t *dlp)
{
mod_hash_val_t val;
- rw_enter(&i_dls_link_lock, RW_WRITER);
-
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
/*
* Check if there are any more references.
*/
- if (--dlp->dl_ref != 0) {
+ if (--dlp->dl_ref == 0) {
+ (void) mod_hash_remove(i_dls_link_hash,
+ (mod_hash_key_t)dlp->dl_name, &val);
+ ASSERT(dlp == (dls_link_t *)val);
+
/*
- * There are more references so there's nothing more to do.
+ * Destroy the dls_link_t.
*/
- goto done;
+ i_dls_link_destroy(dlp);
+ ASSERT(i_dls_link_count > 0);
+ atomic_add_32(&i_dls_link_count, -1);
}
+}
+
+int
+dls_link_rele_by_name(const char *name)
+{
+ dls_link_t *dlp;
+
+ if (mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name,
+ (mod_hash_val_t *)&dlp) != 0)
+ return (ENOENT);
- (void) mod_hash_remove(i_dls_link_hash,
- (mod_hash_key_t)dlp->dl_name, &val);
- ASSERT(dlp == (dls_link_t *)val);
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
/*
- * Destroy the dls_link_t.
+ * Must fail detach if mac client is busy.
*/
- i_dls_link_destroy(dlp);
- ASSERT(i_dls_link_count > 0);
- i_dls_link_count--;
-done:
- rw_exit(&i_dls_link_lock);
+ ASSERT(dlp->dl_ref > 0 && dlp->dl_mch != NULL);
+ if (mac_link_has_flows(dlp->dl_mch))
+ return (ENOTEMPTY);
+
+ dls_link_rele(dlp);
+ return (0);
}
int
-dls_mac_hold(dls_link_t *dlp)
+dls_link_setzid(const char *name, zoneid_t zid)
{
- mac_handle_t mh;
- int err = 0;
+ dls_link_t *dlp;
+ int err = 0;
+ zoneid_t old_zid;
+
+ if ((err = dls_link_hold_create(name, &dlp)) != 0)
+ return (err);
- err = mac_open(dlp->dl_name, &mh);
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
- mutex_enter(&dlp->dl_lock);
+ if ((old_zid = dlp->dl_zid) == zid)
+ goto done;
- ASSERT(IMPLY(dlp->dl_macref != 0, dlp->dl_mh != NULL));
- ASSERT(IMPLY(dlp->dl_macref == 0, dlp->dl_mh == NULL));
- if (err == 0) {
- ASSERT(dlp->dl_mh == NULL || dlp->dl_mh == mh);
- if (dlp->dl_mh == NULL) {
- dlp->dl_mh = mh;
- dlp->dl_mip = mac_info(mh);
+ /*
+	 * Check whether this dlp is used by its own zone; if so, we
+	 * cannot change its zoneid.
+ */
+ if (dlp->dl_zone_ref != 0) {
+ err = EBUSY;
+ goto done;
+ }
+
+ if (zid == GLOBAL_ZONEID) {
+ /*
+ * Move the link from the local zone to the global zone,
+ * and release the reference to this link. At the same time
+ * reset the link's active state so that an aggregation is
+ * allowed to be created over it.
+ */
+ dlp->dl_zid = zid;
+ dls_mac_active_clear(dlp);
+ dls_link_rele(dlp);
+ goto done;
+ } else if (old_zid == GLOBAL_ZONEID) {
+ /*
+ * Move the link from the global zone to the local zone,
+ * and hold a reference to this link. Also, set the link
+ * to the "active" state so that the global zone is
+ * not able to create an aggregation over this link.
+ * TODO: revisit once we allow creating aggregations
+ * within a local zone.
+ */
+ if ((err = dls_mac_active_set(dlp)) != 0) {
+ if (err != ENXIO)
+ err = EBUSY;
+ goto done;
}
- dlp->dl_macref++;
+ dlp->dl_zid = zid;
+ return (0);
+ } else {
+ /*
+ * Move the link from a local zone to another local zone.
+ */
+ dlp->dl_zid = zid;
}
- mutex_exit(&dlp->dl_lock);
+done:
+ dls_link_rele(dlp);
return (err);
}
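
Since dls_link_hold_create() asserts MAC_PERIM_HELD(), callers of dls_link_setzid() are expected to enter the perimeter first; a hedged sketch, where the link name is a placeholder:

	mac_perim_handle_t	mph;

	if (mac_perim_enter_by_macname("e1000g0", &mph) == 0) {
		err = dls_link_setzid("e1000g0", zid);
		mac_perim_exit(mph);
	}
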
void
-dls_mac_rele(dls_link_t *dlp)
-{
- mutex_enter(&dlp->dl_lock);
- ASSERT(dlp->dl_mh != NULL);
-
- mac_close(dlp->dl_mh);
-
- if (--dlp->dl_macref == 0) {
- dlp->dl_mh = NULL;
- dlp->dl_mip = NULL;
- }
- mutex_exit(&dlp->dl_lock);
-}
-
-void
-dls_link_add(dls_link_t *dlp, uint32_t sap, dls_impl_t *dip)
+dls_link_add(dls_link_t *dlp, uint32_t sap, dld_str_t *dsp)
{
- dls_vlan_t *dvp = dip->di_dvp;
- mod_hash_t *hash = dlp->dl_impl_hash;
+ mod_hash_t *hash = dlp->dl_str_hash;
mod_hash_key_t key;
dls_head_t *dhp;
- dls_impl_t *p;
- mac_rx_t rx;
+ dld_str_t *p;
int err;
- boolean_t promisc = B_FALSE;
- /*
- * Generate a hash key based on the sap and the VLAN id.
- */
- key = MAKE_KEY(sap, dvp->dv_id);
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
/*
- * We need dl_lock here because we want to be able to walk
- * the hash table *and* set the mac rx func atomically. if
- * these two operations are separate, someone else could
- * insert/remove dls_impl_t from the hash table after we
- * drop the hash lock and this could cause our chosen rx
- * func to be incorrect. note that we cannot call mac_rx_add
- * when holding the hash lock because this can cause deadlock.
+ * Generate a hash key based on the sap.
*/
- mutex_enter(&dlp->dl_lock);
+ key = MAKE_KEY(sap);
/*
* Search the table for a list head with this key.
*/
- rw_enter(&dlp->dl_impl_lock, RW_WRITER);
-
if ((err = mod_hash_find(hash, key, (mod_hash_val_t *)&dhp)) != 0) {
ASSERT(err == MH_ERR_NOTFOUND);
@@ -965,94 +933,68 @@ dls_link_add(dls_link_t *dlp, uint32_t sap, dls_impl_t *dip)
}
/*
- * Add the dls_impl_t to the head of the list.
+ * Add the dld_str_t to the head of the list. List walkers in
+ * i_dls_link_rx_* bump up dh_ref to ensure the list does not change
+ * while they walk the list. The membar below ensures that list walkers
+ * see exactly the old list or the new list.
*/
- ASSERT(dip->di_nextp == NULL);
+ ASSERT(dsp->ds_next == NULL);
p = dhp->dh_list;
- dip->di_nextp = p;
- dhp->dh_list = dip;
+ dsp->ds_next = p;
- /*
- * Save a pointer to the list head.
- */
- dip->di_headp = dhp;
- dlp->dl_impl_count++;
+ membar_producer();
- /*
- * Walk the bound dls_impl_t to see if there are any
- * in promiscuous 'all sap' mode.
- */
- mod_hash_walk(hash, i_dls_link_walk, (void *)&promisc);
- rw_exit(&dlp->dl_impl_lock);
+ dhp->dh_list = dsp;
/*
- * If there are then we need to use a receive routine
- * which will route packets to those dls_impl_t as well
- * as ones bound to the DLSAP of the packet.
+ * Save a pointer to the list head.
*/
- if (promisc)
- rx = i_dls_link_rx_promisc;
- else
- rx = i_dls_link_rx;
-
- /* Replace the existing receive function if there is one. */
- if (dlp->dl_mrh != NULL)
- mac_rx_remove(dlp->dl_mh, dlp->dl_mrh, B_TRUE);
- dlp->dl_mrh = mac_active_rx_add(dlp->dl_mh, rx, (void *)dlp);
- mutex_exit(&dlp->dl_lock);
+ dsp->ds_head = dhp;
+ dlp->dl_impl_count++;
}
void
-dls_link_remove(dls_link_t *dlp, dls_impl_t *dip)
+dls_link_remove(dls_link_t *dlp, dld_str_t *dsp)
{
- mod_hash_t *hash = dlp->dl_impl_hash;
- dls_impl_t **pp;
- dls_impl_t *p;
+ mod_hash_t *hash = dlp->dl_str_hash;
+ dld_str_t **pp;
+ dld_str_t *p;
dls_head_t *dhp;
- mac_rx_t rx;
- /*
- * We need dl_lock here because we want to be able to walk
- * the hash table *and* set the mac rx func atomically. if
- * these two operations are separate, someone else could
- * insert/remove dls_impl_t from the hash table after we
- * drop the hash lock and this could cause our chosen rx
- * func to be incorrect. note that we cannot call mac_rx_add
- * when holding the hash lock because this can cause deadlock.
- */
- mutex_enter(&dlp->dl_lock);
- rw_enter(&dlp->dl_impl_lock, RW_WRITER);
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
/*
- * Poll the hash table entry until all references have been dropped.
- * We need to drop all locks before sleeping because we don't want
- * the interrupt handler to block. We set di_removing here to
- * tell the receive callbacks not to pass up packets anymore.
- * This is only a hint to quicken the decrease of the refcnt so
- * the assignment need not be protected by any lock.
+ * We set dh_removing here to tell the receive callbacks not to pass
+ * up packets anymore. Then wait till the current callbacks are done.
+ * This happens either in the close path or in processing the
+ * DL_UNBIND_REQ via a taskq thread, and it is ok to cv_wait in either.
+	 * The dh_ref ensures that no upcalls are, or will be, walking or
+	 * using the dh_list. The mod hash internal lock ensures
+ * that the insert/remove of the dls_head_t itself synchronizes with
+ * any i_dls_link_rx trying to locate it. The perimeter ensures that
+ * there isn't another simultaneous dls_link_add/remove.
*/
- dhp = dip->di_headp;
- dip->di_removing = B_TRUE;
- while (dhp->dh_ref != 0) {
- rw_exit(&dlp->dl_impl_lock);
- mutex_exit(&dlp->dl_lock);
- delay(drv_usectohz(1000)); /* 1ms delay */
- mutex_enter(&dlp->dl_lock);
- rw_enter(&dlp->dl_impl_lock, RW_WRITER);
- }
+ dhp = dsp->ds_head;
+
+ mutex_enter(&dhp->dh_lock);
+ dhp->dh_removing = B_TRUE;
+ while (dhp->dh_ref != 0)
+ cv_wait(&dhp->dh_cv, &dhp->dh_lock);
+ mutex_exit(&dhp->dh_lock);
/*
- * Walk the list and remove the dls_impl_t.
+ * Walk the list and remove the dld_str_t.
*/
- for (pp = &dhp->dh_list; (p = *pp) != NULL; pp = &(p->di_nextp)) {
- if (p == dip)
+ for (pp = &dhp->dh_list; (p = *pp) != NULL; pp = &(p->ds_next)) {
+ if (p == dsp)
break;
}
ASSERT(p != NULL);
- *pp = p->di_nextp;
- p->di_nextp = NULL;
+ *pp = p->ds_next;
+ p->ds_next = NULL;
+ p->ds_head = NULL;
- ASSERT(dlp->dl_impl_count > 0);
+ ASSERT(dlp->dl_impl_count != 0);
dlp->dl_impl_count--;
if (dhp->dh_list == NULL) {
@@ -1064,41 +1006,11 @@ dls_link_remove(dls_link_t *dlp, dls_impl_t *dip)
(void) mod_hash_remove(hash, dhp->dh_key, &val);
ASSERT(dhp == (dls_head_t *)val);
i_dls_head_free(dhp);
- }
- dip->di_removing = B_FALSE;
-
- /*
- * If there are no dls_impl_t then there's no need to register a
- * receive function with the mac.
- */
- if (dlp->dl_impl_count == 0) {
- rw_exit(&dlp->dl_impl_lock);
- mac_rx_remove(dlp->dl_mh, dlp->dl_mrh, B_TRUE);
- dlp->dl_mrh = NULL;
} else {
- boolean_t promisc = B_FALSE;
-
- /*
- * Walk the bound dls_impl_t to see if there are any
- * in promiscuous 'all sap' mode.
- */
- mod_hash_walk(hash, i_dls_link_walk, (void *)&promisc);
- rw_exit(&dlp->dl_impl_lock);
-
- /*
- * If there are then we need to use a receive routine
- * which will route packets to those dls_impl_t as well
- * as ones bound to the DLSAP of the packet.
- */
- if (promisc)
- rx = i_dls_link_rx_promisc;
- else
- rx = i_dls_link_rx;
-
- mac_rx_remove(dlp->dl_mh, dlp->dl_mrh, B_TRUE);
- dlp->dl_mrh = mac_active_rx_add(dlp->dl_mh, rx, (void *)dlp);
+ mutex_enter(&dhp->dh_lock);
+ dhp->dh_removing = B_FALSE;
+ mutex_exit(&dhp->dh_lock);
}
- mutex_exit(&dlp->dl_lock);
}
int
@@ -1153,10 +1065,5 @@ dls_link_header_info(dls_link_t *dlp, mblk_t *mp, mac_header_info_t *mhip)
mhip->mhi_tci = 0;
}
- /*
- * The messsage is looped back from the underlying driver.
- */
- mhip->mhi_prom_looped = (mp->b_flag & MSGNOLOOP);
-
return (0);
}
diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c
index bf5fc0a814..bb922423b3 100644
--- a/usr/src/uts/common/io/dls/dls_mgmt.c
+++ b/usr/src/uts/common/io/dls/dls_mgmt.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Datalink management routines.
*/
@@ -38,11 +36,17 @@
#include <sys/kstat.h>
#include <sys/vnode.h>
#include <sys/cmn_err.h>
-#include <sys/vlan.h>
#include <sys/softmac.h>
#include <sys/dls.h>
#include <sys/dls_impl.h>
+/*
+ * This vanity name management module is treated as part of the GLD framework
+ * and we don't hold any GLD framework lock across a call to any mac
+ * function that needs to acquire the mac perimeter. The hierarchy is
+ * mac perimeter -> framework locks.
+ */
+
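
Concretely, the hierarchy means the perimeter is entered before any lock in this module is taken (dls_devnet_stat_update() below follows this order); a sketch:

	mac_perim_handle_t	mph;

	if (mac_perim_enter_by_macname(ddp->dd_mac, &mph) == 0) {
		mutex_enter(&ddp->dd_mutex);
		/* access dls_devnet_t state here */
		mutex_exit(&ddp->dd_mutex);
		mac_perim_exit(mph);
	}
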
static kmem_cache_t *i_dls_devnet_cachep;
static kmutex_t i_dls_mgmt_lock;
static krwlock_t i_dls_devnet_lock;
@@ -56,25 +60,22 @@ boolean_t devnet_need_rebuild;
/* Upcall door handle */
static door_handle_t dls_mgmt_dh = NULL;
+#define DD_CONDEMNED 0x1
+
/*
- * This structure is used to keep the <linkid, macname, vid> mapping.
+ * This structure is used to keep the <linkid, macname> mapping.
*/
typedef struct dls_devnet_s {
- datalink_id_t dd_vlanid;
datalink_id_t dd_linkid;
char dd_mac[MAXNAMELEN];
- uint16_t dd_vid;
- char dd_spa[MAXSPALEN];
- boolean_t dd_explicit;
kstat_t *dd_ksp;
-
uint32_t dd_ref;
kmutex_t dd_mutex;
kcondvar_t dd_cv;
uint32_t dd_tref;
+ uint_t dd_flags;
- kmutex_t dd_zid_mutex;
zoneid_t dd_zid;
boolean_t dd_prop_loaded;
@@ -90,7 +91,6 @@ i_dls_devnet_constructor(void *buf, void *arg, int kmflag)
bzero(buf, sizeof (dls_devnet_t));
mutex_init(&ddp->dd_mutex, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ddp->dd_zid_mutex, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ddp->dd_cv, NULL, CV_DEFAULT, NULL);
return (0);
}
@@ -104,9 +104,7 @@ i_dls_devnet_destructor(void *buf, void *arg)
ASSERT(ddp->dd_ksp == NULL);
ASSERT(ddp->dd_ref == 0);
ASSERT(ddp->dd_tref == 0);
- ASSERT(!ddp->dd_explicit);
mutex_destroy(&ddp->dd_mutex);
- mutex_destroy(&ddp->dd_zid_mutex);
cv_destroy(&ddp->dd_cv);
}
@@ -128,13 +126,13 @@ dls_mgmt_init(void)
ASSERT(i_dls_devnet_cachep != NULL);
/*
- * Create a hash table, keyed by dd_vlanid, of dls_devnet_t.
+ * Create a hash table, keyed by dd_linkid, of dls_devnet_t.
*/
i_dls_devnet_id_hash = mod_hash_create_idhash("dls_devnet_id_hash",
VLAN_HASHSZ, mod_hash_null_valdtor);
/*
- * Create a hash table, keyed by dd_spa.
+	 * Create a hash table, keyed by dd_mac.
*/
i_dls_devnet_hash = mod_hash_create_extended("dls_devnet_hash",
VLAN_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
@@ -310,7 +308,6 @@ done:
* registration of its mac
* - class datalink class
* - media type media type; DL_OTHER means unknown
- * - vid VLAN ID (for VLANs)
* - persist whether to persist the datalink
*/
int
@@ -546,7 +543,7 @@ dls_devnet_prop_task(void *arg)
{
dls_devnet_t *ddp = arg;
- (void) dls_mgmt_linkprop_init(ddp->dd_vlanid);
+ (void) dls_mgmt_linkprop_init(ddp->dd_linkid);
mutex_enter(&ddp->dd_mutex);
ddp->dd_prop_loaded = B_TRUE;
@@ -567,58 +564,48 @@ dls_devnet_prop_task_wait(dls_dl_handle_t ddp)
mutex_exit(&ddp->dd_mutex);
}
-/*
- * Hold the vanity naming structure (dls_devnet_t) temporarily. The request to
- * delete the dls_devnet_t will wait until the temporary reference is released.
- */
+void
+dls_devnet_rele_tmp(dls_dl_handle_t dlh)
+{
+ dls_devnet_t *ddp = dlh;
+
+ mutex_enter(&ddp->dd_mutex);
+ ASSERT(ddp->dd_tref != 0);
+ if (--ddp->dd_tref == 0)
+ cv_signal(&ddp->dd_cv);
+ mutex_exit(&ddp->dd_mutex);
+}
+
int
-dls_devnet_hold_tmp(datalink_id_t linkid, dls_dl_handle_t *ddhp)
+dls_devnet_hold_link(datalink_id_t linkid, dls_dl_handle_t *ddhp,
+ dls_link_t **dlpp)
{
- dls_devnet_t *ddp;
- dls_dev_handle_t ddh = NULL;
- dev_t phydev = 0;
- int err;
+ dls_dl_handle_t dlh;
+ dls_link_t *dlp;
+ int err;
- /*
- * Hold this link to prevent it being detached (if physical link).
- */
- if (dls_mgmt_get_phydev(linkid, &phydev) == 0)
- (void) softmac_hold_device(phydev, &ddh);
+ if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0)
+ return (err);
- rw_enter(&i_dls_devnet_lock, RW_READER);
- if ((err = mod_hash_find(i_dls_devnet_id_hash,
- (mod_hash_key_t)(uintptr_t)linkid, (mod_hash_val_t *)&ddp)) != 0) {
- ASSERT(err == MH_ERR_NOTFOUND);
- rw_exit(&i_dls_devnet_lock);
- softmac_rele_device(ddh);
- return (ENOENT);
+ if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) {
+ dls_devnet_rele_tmp(dlh);
+ return (err);
}
- /*
- * At least one reference was held when this datalink was created.
- */
- ASSERT(ddp->dd_ref > 0);
- mutex_enter(&ddp->dd_mutex);
- ddp->dd_tref++;
- mutex_exit(&ddp->dd_mutex);
- rw_exit(&i_dls_devnet_lock);
- softmac_rele_device(ddh);
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
-done:
- *ddhp = ddp;
+ *ddhp = dlh;
+ *dlpp = dlp;
return (0);
}
void
-dls_devnet_rele_tmp(dls_dl_handle_t dlh)
+dls_devnet_rele_link(dls_dl_handle_t dlh, dls_link_t *dlp)
{
- dls_devnet_t *ddp = dlh;
+ ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
- mutex_enter(&ddp->dd_mutex);
- ASSERT(ddp->dd_tref != 0);
- if (--ddp->dd_tref == 0)
- cv_signal(&ddp->dd_cv);
- mutex_exit(&ddp->dd_mutex);
+ dls_link_rele(dlp);
+ dls_devnet_rele_tmp(dlh);
}
/*
@@ -632,15 +619,23 @@ static int
dls_devnet_stat_update(kstat_t *ksp, int rw)
{
dls_devnet_t *ddp = ksp->ks_private;
- dls_vlan_t *dvp;
+ dls_link_t *dlp;
int err;
+ mac_perim_handle_t mph;
- err = dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, B_FALSE, B_FALSE);
+ err = mac_perim_enter_by_macname(ddp->dd_mac, &mph);
if (err != 0)
return (err);
- err = dls_stat_update(ksp, dvp, rw);
- dls_vlan_rele(dvp);
+ err = dls_link_hold(ddp->dd_mac, &dlp);
+ if (err != 0) {
+ mac_perim_exit(mph);
+ return (err);
+ }
+
+ err = dls_stat_update(ksp, dlp, rw);
+ dls_link_rele(dlp);
+ mac_perim_exit(mph);
return (err);
}
@@ -653,7 +648,7 @@ dls_devnet_stat_create(dls_devnet_t *ddp)
char link[MAXLINKNAMELEN];
kstat_t *ksp;
- if ((dls_mgmt_get_linkinfo(ddp->dd_vlanid, link,
+ if ((dls_mgmt_get_linkinfo(ddp->dd_linkid, link,
NULL, NULL, NULL)) != 0) {
return;
}
@@ -704,114 +699,53 @@ dls_devnet_stat_rename(dls_devnet_t *ddp, const char *link)
}
/*
- * Associate a linkid with a given link (identified by <macname/vid>)
- *
- * Several cases:
- * a. implicit VLAN creation: (non-NULL "vlan")
- * b. explicit VLAN creation: (NULL "vlan")
- * c. explicit non-VLAN creation:
- * (NULL "vlan" and linkid could be INVALID_LINKID if the physical device
- * was created before the daemon was started)
+ * Associate a linkid with a given link (identified by macname)
*/
static int
-dls_devnet_set(const char *macname, uint16_t vid,
- datalink_id_t vlan_linkid, datalink_id_t linkid, const char *vlan,
- dls_devnet_t **ddpp)
+dls_devnet_set(const char *macname, datalink_id_t linkid, dls_devnet_t **ddpp)
{
dls_devnet_t *ddp = NULL;
- char spa[MAXSPALEN];
- boolean_t explicit = (vlan == NULL);
datalink_class_t class;
int err;
- ASSERT(vid != VLAN_ID_NONE || explicit);
- ASSERT(vlan_linkid != DATALINK_INVALID_LINKID || !explicit ||
- vid == VLAN_ID_NONE);
-
- (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid);
rw_enter(&i_dls_devnet_lock, RW_WRITER);
if ((err = mod_hash_find(i_dls_devnet_hash,
- (mod_hash_key_t)spa, (mod_hash_val_t *)&ddp)) == 0) {
- char link[MAXLINKNAMELEN];
-
- if (explicit) {
- if ((vid != VLAN_ID_NONE) ||
- (ddp->dd_vlanid != DATALINK_INVALID_LINKID)) {
- err = EEXIST;
- goto done;
- }
-
- /*
- * This might be a physical link that has already
- * been created, but which does not have a vlan_linkid
- * because dlmgmtd was not running when it was created.
- */
- if ((err = dls_mgmt_get_linkinfo(vlan_linkid, NULL,
- &class, NULL, NULL)) != 0) {
- goto done;
- }
-
- if (class != DATALINK_CLASS_PHYS) {
- err = EINVAL;
- goto done;
- }
-
- goto newphys;
+ (mod_hash_key_t)macname, (mod_hash_val_t *)&ddp)) == 0) {
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID) {
+ err = EEXIST;
+ goto done;
}
/*
- * Implicit VLAN, but the same name has already
- * been associated with another linkid. Check if the name
- * of that link matches the given VLAN name.
+ * This might be a physical link that has already
+ * been created, but which does not have a linkid
+ * because dlmgmtd was not running when it was created.
*/
- ASSERT(vid != VLAN_ID_NONE);
- if ((err = dls_mgmt_get_linkinfo(ddp->dd_vlanid, link,
- NULL, NULL, NULL)) != 0) {
+ if ((err = dls_mgmt_get_linkinfo(linkid, NULL,
+ &class, NULL, NULL)) != 0) {
goto done;
}
- if (strcmp(link, vlan) != 0) {
- err = EEXIST;
+ if (class != DATALINK_CLASS_PHYS) {
+ err = EINVAL;
goto done;
}
- /*
- * This is not an implicit created VLAN any more, return
- * this existing datalink.
- */
- ASSERT(ddp->dd_ref > 0);
- ddp->dd_ref++;
- goto done;
- }
-
- /*
- * Request the daemon to create a new vlan_linkid for this implicitly
- * created vlan.
- */
- if (!explicit && ((err = dls_mgmt_create(vlan, 0,
- DATALINK_CLASS_VLAN, DL_ETHER, B_FALSE, &vlan_linkid)) != 0)) {
- goto done;
+ goto newphys;
}
-
ddp = kmem_cache_alloc(i_dls_devnet_cachep, KM_SLEEP);
- ddp->dd_vid = vid;
- ddp->dd_explicit = explicit;
ddp->dd_tref = 0;
ddp->dd_ref++;
ddp->dd_zid = GLOBAL_ZONEID;
(void) strncpy(ddp->dd_mac, macname, MAXNAMELEN);
- (void) snprintf(ddp->dd_spa, MAXSPALEN, "%s/%d", macname, vid);
VERIFY(mod_hash_insert(i_dls_devnet_hash,
- (mod_hash_key_t)ddp->dd_spa, (mod_hash_val_t)ddp) == 0);
+ (mod_hash_key_t)ddp->dd_mac, (mod_hash_val_t)ddp) == 0);
newphys:
-
- ddp->dd_vlanid = vlan_linkid;
- if (ddp->dd_vlanid != DATALINK_INVALID_LINKID) {
+ if (linkid != DATALINK_INVALID_LINKID) {
ddp->dd_linkid = linkid;
-
VERIFY(mod_hash_insert(i_dls_devnet_id_hash,
- (mod_hash_key_t)(uintptr_t)vlan_linkid,
+ (mod_hash_key_t)(uintptr_t)linkid,
(mod_hash_val_t)ddp) == 0);
devnet_need_rebuild = B_TRUE;
dls_devnet_stat_create(ddp);
@@ -832,90 +766,83 @@ done:
return (err);
}
-static void
-dls_devnet_unset_common(dls_devnet_t *ddp)
-{
- mod_hash_val_t val;
-
- ASSERT(RW_WRITE_HELD(&i_dls_devnet_lock));
-
- ASSERT(ddp->dd_ref == 0);
-
- /*
- * Remove this dls_devnet_t from the hash table.
- */
- VERIFY(mod_hash_remove(i_dls_devnet_hash,
- (mod_hash_key_t)ddp->dd_spa, &val) == 0);
-
- if (ddp->dd_vlanid != DATALINK_INVALID_LINKID) {
- VERIFY(mod_hash_remove(i_dls_devnet_id_hash,
- (mod_hash_key_t)(uintptr_t)ddp->dd_vlanid, &val) == 0);
-
- dls_devnet_stat_destroy(ddp);
- devnet_need_rebuild = B_TRUE;
- }
-
- /*
- * Wait until all temporary references are released.
- */
- mutex_enter(&ddp->dd_mutex);
- while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != NULL))
- cv_wait(&ddp->dd_cv, &ddp->dd_mutex);
-
- ddp->dd_prop_loaded = B_FALSE;
- mutex_exit(&ddp->dd_mutex);
-
- if (!ddp->dd_explicit) {
- ASSERT(ddp->dd_vid != VLAN_ID_NONE);
- ASSERT(ddp->dd_vlanid != DATALINK_INVALID_LINKID);
- (void) dls_mgmt_destroy(ddp->dd_vlanid, B_FALSE);
- }
-
- ddp->dd_vlanid = DATALINK_INVALID_LINKID;
- ddp->dd_zid = GLOBAL_ZONEID;
- ddp->dd_explicit = B_FALSE;
- kmem_cache_free(i_dls_devnet_cachep, ddp);
-}
-
/*
- * Disassociate a linkid with a given link (identified by <macname/vid>)
+ * Disassociate a linkid from a given link (identified by macname).
+ * This waits until temporary references to the dls_devnet_t are gone.
*/
static int
-dls_devnet_unset(const char *macname, uint16_t vid, datalink_id_t *id)
+dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
{
dls_devnet_t *ddp;
- char spa[MAXSPALEN];
int err;
-
- (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid);
+ mod_hash_val_t val;
rw_enter(&i_dls_devnet_lock, RW_WRITER);
if ((err = mod_hash_find(i_dls_devnet_hash,
- (mod_hash_key_t)spa, (mod_hash_val_t *)&ddp)) != 0) {
+ (mod_hash_key_t)macname, (mod_hash_val_t *)&ddp)) != 0) {
ASSERT(err == MH_ERR_NOTFOUND);
rw_exit(&i_dls_devnet_lock);
return (ENOENT);
}
- ASSERT(ddp->dd_ref != 0);
+ mutex_enter(&ddp->dd_mutex);
- if (ddp->dd_ref != 1) {
+ /*
+ * Make sure downcalls into softmac_create or softmac_destroy from
+ * devfs don't cv_wait on any devfs-related condition, for fear of
+ * deadlock. Return EBUSY if the asynchronous thread started for
+ * property loading as part of post-attach hasn't yet completed.
+ */
+ ASSERT(ddp->dd_ref != 0);
+ if ((ddp->dd_ref != 1) || (!wait &&
+ (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) {
+ mutex_exit(&ddp->dd_mutex);
rw_exit(&i_dls_devnet_lock);
return (EBUSY);
}
+ ddp->dd_flags |= DD_CONDEMNED;
ddp->dd_ref--;
+ *id = ddp->dd_linkid;
- if (id != NULL)
- *id = ddp->dd_vlanid;
+ /*
+ * Remove this dls_devnet_t from the hash table.
+ */
+ VERIFY(mod_hash_remove(i_dls_devnet_hash,
+ (mod_hash_key_t)ddp->dd_mac, &val) == 0);
- dls_devnet_unset_common(ddp);
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID) {
+ VERIFY(mod_hash_remove(i_dls_devnet_id_hash,
+ (mod_hash_key_t)(uintptr_t)ddp->dd_linkid, &val) == 0);
+
+ dls_devnet_stat_destroy(ddp);
+ devnet_need_rebuild = B_TRUE;
+ }
rw_exit(&i_dls_devnet_lock);
+
+ if (wait) {
+ /*
+ * Wait until all temporary references are released.
+ */
+ while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != NULL))
+ cv_wait(&ddp->dd_cv, &ddp->dd_mutex);
+ } else {
+ ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL);
+ }
+
+ ddp->dd_prop_loaded = B_FALSE;
+ ddp->dd_linkid = DATALINK_INVALID_LINKID;
+ ddp->dd_zid = GLOBAL_ZONEID;
+ ddp->dd_flags = 0;
+ mutex_exit(&ddp->dd_mutex);
+ kmem_cache_free(i_dls_devnet_cachep, ddp);
+
return (0);
}
static int
-dls_devnet_hold(datalink_id_t linkid, dls_devnet_t **ddpp)
+dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp,
+ boolean_t tmp_hold)
{
dls_devnet_t *ddp;
dev_t phydev = 0;
@@ -938,39 +865,70 @@ dls_devnet_hold(datalink_id_t linkid, dls_devnet_t **ddpp)
return (ENOENT);
}
+ mutex_enter(&ddp->dd_mutex);
ASSERT(ddp->dd_ref > 0);
- ddp->dd_ref++;
+ if (ddp->dd_flags & DD_CONDEMNED) {
+ mutex_exit(&ddp->dd_mutex);
+ rw_exit(&i_dls_devnet_lock);
+ softmac_rele_device(ddh);
+ return (ENOENT);
+ }
+ if (tmp_hold)
+ ddp->dd_tref++;
+ else
+ ddp->dd_ref++;
+ mutex_exit(&ddp->dd_mutex);
rw_exit(&i_dls_devnet_lock);
+
softmac_rele_device(ddh);
-done:
*ddpp = ddp;
return (0);
}
+int
+dls_devnet_hold(datalink_id_t linkid, dls_devnet_t **ddpp)
+{
+ return (dls_devnet_hold_common(linkid, ddpp, B_FALSE));
+}
+
+/*
+ * Hold the vanity naming structure (dls_devnet_t) temporarily. The request to
+ * delete the dls_devnet_t will wait until the temporary reference is released.
+ */
+int
+dls_devnet_hold_tmp(datalink_id_t linkid, dls_devnet_t **ddpp)
+{
+ return (dls_devnet_hold_common(linkid, ddpp, B_TRUE));
+}
+
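The temporary-reference protocol above (dls_devnet_hold_tmp()/dls_devnet_rele_tmp(), with dls_devnet_unset() either waiting out or refusing to wait on outstanding holds) can be modeled in user-space C. A minimal sketch, assuming POSIX threads; devnet_model_t and the function names are hypothetical stand-ins for the kernel code:

#include <errno.h>
#include <pthread.h>

typedef struct devnet_model {
    pthread_mutex_t dm_mutex;
    pthread_cond_t  dm_cv;
    unsigned int    dm_tref;       /* temporary references, like dd_tref */
    int             dm_condemned;  /* teardown started, like DD_CONDEMNED */
} devnet_model_t;

static int
hold_tmp_model(devnet_model_t *dm)
{
    pthread_mutex_lock(&dm->dm_mutex);
    if (dm->dm_condemned) {        /* refuse new holds once condemned */
        pthread_mutex_unlock(&dm->dm_mutex);
        return (ENOENT);
    }
    dm->dm_tref++;
    pthread_mutex_unlock(&dm->dm_mutex);
    return (0);
}

static void
rele_tmp_model(devnet_model_t *dm)
{
    pthread_mutex_lock(&dm->dm_mutex);
    if (--dm->dm_tref == 0)
        pthread_cond_signal(&dm->dm_cv);   /* wake a waiting destroyer */
    pthread_mutex_unlock(&dm->dm_mutex);
}

static int
unset_model(devnet_model_t *dm, int wait)
{
    pthread_mutex_lock(&dm->dm_mutex);
    if (!wait && dm->dm_tref != 0) {       /* devfs downcall: cannot block */
        pthread_mutex_unlock(&dm->dm_mutex);
        return (EBUSY);
    }
    dm->dm_condemned = 1;
    while (dm->dm_tref != 0)               /* wait out temporary holders */
        pthread_cond_wait(&dm->dm_cv, &dm->dm_mutex);
    pthread_mutex_unlock(&dm->dm_mutex);
    return (0);
}

int
main(void)
{
    devnet_model_t dm = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0
    };

    if (hold_tmp_model(&dm) == 0)
        rele_tmp_model(&dm);
    return (unset_model(&dm, 1));
}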
/*
 * This function is called when a DLS client tries to open a device node.
 * This dev_t could be the result of a /dev/net node access (returned by
* devnet_create_rvp->dls_devnet_open()) or a direct /dev node access.
- * In both cases, this function returns 0. In the first case, bump the
- * reference count of the dls_devnet_t structure, so that it will not be
- * freed when devnet_inactive_callback->dls_devnet_close() is called
- * (Note that devnet_inactive_callback() is called right after dld_open,
- * not when the /dev/net access is done). In the second case, ddhp would
- * be NULL.
- *
- * To undo this function, call dls_devnet_close() in the first case, and call
- * dls_vlan_rele() in the second case.
+ * In both cases, this function bumps up the reference count of the
+ * dls_devnet_t structure. The reference is held as long as the device node
+ * is open. In the /dev/net case, the initial reference taken by
+ * devnet_create_rvp->dls_devnet_open() is released immediately in
+ * devnet_inactive_callback->dls_devnet_close(). (Note that
+ * devnet_inactive_callback() is called right after dld_open completes,
+ * not when the /dev/net node is being closed.)
+ * To undo this function, call dls_devnet_rele().
*/
int
-dls_devnet_open_by_dev(dev_t dev, dls_vlan_t **dvpp, dls_dl_handle_t *ddhp)
+dls_devnet_hold_by_dev(dev_t dev, dls_dl_handle_t *ddhp)
{
+ char name[MAXNAMELEN];
+ char *drv;
dls_dev_handle_t ddh = NULL;
- char spa[MAXSPALEN];
dls_devnet_t *ddp;
- dls_vlan_t *dvp;
int err;
+ if ((drv = ddi_major_to_name(getmajor(dev))) == NULL)
+ return (EINVAL);
+
+ (void) snprintf(name, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1);
+
/*
* Hold this link to prevent it being detached in case of a
* GLDv3 physical link.
@@ -978,64 +936,49 @@ dls_devnet_open_by_dev(dev_t dev, dls_vlan_t **dvpp, dls_dl_handle_t *ddhp)
if (getminor(dev) - 1 < MAC_MAX_MINOR)
(void) softmac_hold_device(dev, &ddh);
- /*
- * Found the dls_vlan_t with the given dev.
- */
- err = dls_vlan_hold_by_dev(dev, &dvp);
- softmac_rele_device(ddh);
-
- if (err != 0)
- return (err);
-
- (void) snprintf(spa, MAXSPALEN, "%s/%d",
- dvp->dv_dlp->dl_name, dvp->dv_id);
-
rw_enter(&i_dls_devnet_lock, RW_WRITER);
if ((err = mod_hash_find(i_dls_devnet_hash,
- (mod_hash_key_t)spa, (mod_hash_val_t *)&ddp)) != 0) {
+ (mod_hash_key_t)name, (mod_hash_val_t *)&ddp)) != 0) {
ASSERT(err == MH_ERR_NOTFOUND);
rw_exit(&i_dls_devnet_lock);
- *ddhp = NULL;
- *dvpp = dvp;
- return (0);
+ softmac_rele_device(ddh);
+ return (ENOENT);
}
-
+ mutex_enter(&ddp->dd_mutex);
ASSERT(ddp->dd_ref > 0);
+ if (ddp->dd_flags & DD_CONDEMNED) {
+ mutex_exit(&ddp->dd_mutex);
+ rw_exit(&i_dls_devnet_lock);
+ softmac_rele_device(ddh);
+ return (ENOENT);
+ }
ddp->dd_ref++;
+ mutex_exit(&ddp->dd_mutex);
rw_exit(&i_dls_devnet_lock);
+
+ softmac_rele_device(ddh);
+
*ddhp = ddp;
- *dvpp = dvp;
return (0);
}
-static void
+void
dls_devnet_rele(dls_devnet_t *ddp)
{
- rw_enter(&i_dls_devnet_lock, RW_WRITER);
- ASSERT(ddp->dd_ref != 0);
- if (--ddp->dd_ref != 0) {
- rw_exit(&i_dls_devnet_lock);
- return;
- }
- /*
- * This should only happen for implicitly-created VLAN.
- */
- ASSERT(ddp->dd_vid != VLAN_ID_NONE);
- dls_devnet_unset_common(ddp);
- rw_exit(&i_dls_devnet_lock);
+ mutex_enter(&ddp->dd_mutex);
+ ASSERT(ddp->dd_ref > 1);
+ ddp->dd_ref--;
+ mutex_exit(&ddp->dd_mutex);
}
static int
-dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid)
+dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
{
- char link_under[MAXLINKNAMELEN];
char drv[MAXLINKNAMELEN];
uint_t ppa;
major_t major;
dev_t phy_dev, tmp_dev;
- uint_t vid;
datalink_id_t linkid;
- dls_devnet_t *ddp;
dls_dev_handle_t ddh;
int err;
@@ -1056,35 +999,8 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid)
if (ddi_parse(link, drv, &ppa) != DDI_SUCCESS)
return (ENOENT);
- if ((vid = DLS_PPA2VID(ppa)) > VLAN_ID_MAX)
- return (ENOENT);
-
- ppa = (uint_t)DLS_PPA2INST(ppa);
- (void) snprintf(link_under, sizeof (link_under), "%s%d", drv, ppa);
-
- if (vid != VLAN_ID_NONE) {
- /*
- * Only global zone can implicitly create a VLAN.
- */
- if (zid != GLOBAL_ZONEID)
- return (ENOENT);
-
- /*
- * This is potentially an implicitly-created VLAN. Hold the
- * link this VLAN is created on.
- */
- if (dls_mgmt_get_linkid(link_under, &linkid) == 0 &&
- dls_devnet_hold_tmp(linkid, &ddp) == 0) {
- if (ddp->dd_vid != VLAN_ID_NONE) {
- dls_devnet_rele_tmp(ddp);
- return (ENOENT);
- }
- goto implicit;
- }
- }
-
/*
- * If this link (or the link that an implicit vlan is created on)
+ * If this link:
* (a) is a physical device, (b) this is the first boot, (c) the MAC
* is not registered yet, and (d) we cannot find its linkid, then the
* linkname is the same as the devname.
@@ -1102,7 +1018,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid)
* At this time, the MAC should be registered, check its phy_dev using
* the given name.
*/
- if ((err = dls_mgmt_get_linkid(link_under, &linkid)) != 0 ||
+ if ((err = dls_mgmt_get_linkid(link, &linkid)) != 0 ||
(err = dls_mgmt_get_phydev(linkid, &tmp_dev)) != 0) {
softmac_rele_device(ddh);
return (err);
@@ -1112,65 +1028,45 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid)
return (ENOENT);
}
- if (vid == VLAN_ID_NONE) {
- /*
- * For non-VLAN, we are done.
- */
- err = dls_devnet_hold(linkid, ddpp);
- softmac_rele_device(ddh);
- return (err);
- }
-
- /*
- * If this is an implicit VLAN, temporarily hold this non-VLAN.
- */
- VERIFY(dls_devnet_hold_tmp(linkid, &ddp) == 0);
+ err = dls_devnet_hold(linkid, ddpp);
softmac_rele_device(ddh);
- ASSERT(ddp->dd_vid == VLAN_ID_NONE);
-
- /*
- * Again, this is potentially an implicitly-created VLAN.
- */
-
-implicit:
- ASSERT(vid != VLAN_ID_NONE);
- err = dls_devnet_set(ddp->dd_mac, vid, DATALINK_INVALID_LINKID,
- linkid, link, ddpp);
- dls_devnet_rele_tmp(ddp);
return (err);
}
-/*
- * Get linkid for the given dev.
- */
int
-dls_devnet_dev2linkid(dev_t dev, datalink_id_t *linkidp)
+dls_devnet_macname2linkid(const char *macname, datalink_id_t *linkidp)
{
- dls_vlan_t *dvp;
dls_devnet_t *ddp;
- char spa[MAXSPALEN];
- int err;
-
- if ((err = dls_vlan_hold_by_dev(dev, &dvp)) != 0)
- return (err);
-
- (void) snprintf(spa, MAXSPALEN, "%s/%d",
- dvp->dv_dlp->dl_name, dvp->dv_id);
rw_enter(&i_dls_devnet_lock, RW_READER);
- if (mod_hash_find(i_dls_devnet_hash, (mod_hash_key_t)spa,
+ if (mod_hash_find(i_dls_devnet_hash, (mod_hash_key_t)macname,
(mod_hash_val_t *)&ddp) != 0) {
rw_exit(&i_dls_devnet_lock);
- dls_vlan_rele(dvp);
return (ENOENT);
}
- *linkidp = ddp->dd_vlanid;
+ *linkidp = ddp->dd_linkid;
rw_exit(&i_dls_devnet_lock);
- dls_vlan_rele(dvp);
return (0);
}
+
+/*
+ * Get linkid for the given dev.
+ */
+int
+dls_devnet_dev2linkid(dev_t dev, datalink_id_t *linkidp)
+{
+ char macname[MAXNAMELEN];
+ char *drv;
+
+ if ((drv = ddi_major_to_name(getmajor(dev))) == NULL)
+ return (EINVAL);
+
+ (void) snprintf(macname, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1);
+ return (dls_devnet_macname2linkid(macname, linkidp));
+}
+
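dls_devnet_dev2linkid() above derives the macname from the dev_t: the driver name for the major number, with (minor - 1) as the instance. A self-contained illustration; "bge" and the minor number are made-up example values:

#include <stdio.h>

int
main(void)
{
    const char *drv = "bge";    /* would come from ddi_major_to_name() */
    int minor = 2;              /* would come from getminor(dev) */
    char macname[32];

    /* minor n names instance n - 1, e.g. minor 2 -> "bge1" */
    (void) snprintf(macname, sizeof (macname), "%s%d", drv, minor - 1);
    (void) printf("%s\n", macname);
    return (0);
}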
/*
 * Get the link's physical dev_t. If this is a VLAN, get the dev_t of the
* link this VLAN is created on.
@@ -1213,6 +1109,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
int err = 0;
dev_t phydev = 0;
dls_devnet_t *ddp;
+ mac_perim_handle_t mph = NULL;
mac_handle_t mh;
mod_hash_val_t val;
@@ -1232,6 +1129,14 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
if (dls_mgmt_get_phydev(id1, &phydev) == 0)
(void) softmac_hold_device(phydev, &ddh);
+ /*
+ * The framework does not hold locks across calls to the
+ * mac perimeter, hence enter the perimeter first. This also waits
+ * for the property loading to finish.
+ */
+ if ((err = mac_perim_enter_by_linkid(id1, &mph)) != 0)
+ goto done;
+
rw_enter(&i_dls_devnet_lock, RW_WRITER);
if ((err = mod_hash_find(i_dls_devnet_id_hash,
(mod_hash_key_t)(uintptr_t)id1, (mod_hash_val_t *)&ddp)) != 0) {
@@ -1241,41 +1146,21 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
}
/*
- * Let the property loading thread finish.
- * Unfortunately, we have to drop i_dls_devnet_lock temporarily
- * to avoid deadlocks, and ensure ddp is still in the hash after
- * reacquiring it. Observe lock order as well.
- */
- mutex_enter(&ddp->dd_mutex);
- if (ddp->dd_prop_taskid != NULL) {
- rw_exit(&i_dls_devnet_lock);
- while (ddp->dd_prop_taskid != NULL)
- cv_wait(&ddp->dd_cv, &ddp->dd_mutex);
- mutex_exit(&ddp->dd_mutex);
- rw_enter(&i_dls_devnet_lock, RW_WRITER);
-
- if ((err = mod_hash_find(i_dls_devnet_id_hash,
- (mod_hash_key_t)(uintptr_t)id1,
- (mod_hash_val_t *)&ddp)) != 0) {
- ASSERT(err == MH_ERR_NOTFOUND);
- err = ENOENT;
- goto done;
- }
- } else {
- mutex_exit(&ddp->dd_mutex);
- }
-
- /*
* Return EBUSY if any applications have this link open.
*/
- if ((ddp->dd_explicit && ddp->dd_ref > 1) ||
- (!ddp->dd_explicit && ddp->dd_ref > 0)) {
+ if (ddp->dd_ref > 1) {
err = EBUSY;
goto done;
}
if (id2 == DATALINK_INVALID_LINKID) {
(void) strlcpy(linkname, link, sizeof (linkname));
+
+ /* Rename the mac client name and its flow, if one exists. */
+ if ((err = mac_open(ddp->dd_mac, &mh)) != 0)
+ goto done;
+ (void) mac_rename_primary(mh, link);
+ mac_close(mh);
goto done;
}
@@ -1294,7 +1179,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
/*
* We release the reference of the MAC which mac_open() is
* holding. Note that this mac will not be unregistered
- * because the physical device is hold.
+ * because the physical device is held.
*/
mac_close(mh);
@@ -1302,7 +1187,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
 * Check if there are any other MAC clients; if not, hold this mac
* exclusively until we are done.
*/
- if ((err = mac_hold_exclusive(mh)) != 0)
+ if ((err = mac_mark_exclusive(mh)) != 0)
goto done;
/*
@@ -1310,23 +1195,25 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
*/
if ((err = mod_hash_find(i_dls_devnet_id_hash,
(mod_hash_key_t)(uintptr_t)id2, &val)) != MH_ERR_NOTFOUND) {
- mac_rele_exclusive(mh);
+ mac_unmark_exclusive(mh);
err = EEXIST;
goto done;
}
err = dls_mgmt_get_linkinfo(id2, linkname, NULL, NULL, NULL);
if (err != 0) {
- mac_rele_exclusive(mh);
+ mac_unmark_exclusive(mh);
goto done;
}
(void) mod_hash_remove(i_dls_devnet_id_hash,
(mod_hash_key_t)(uintptr_t)id1, &val);
- ddp->dd_vlanid = id2;
+ ddp->dd_linkid = id2;
(void) mod_hash_insert(i_dls_devnet_id_hash,
- (mod_hash_key_t)(uintptr_t)ddp->dd_vlanid, (mod_hash_val_t)ddp);
+ (mod_hash_key_t)(uintptr_t)ddp->dd_linkid, (mod_hash_val_t)ddp);
+
+ mac_unmark_exclusive(mh);
/* load properties for new id */
mutex_enter(&ddp->dd_mutex);
@@ -1335,8 +1222,6 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
dls_devnet_prop_task, ddp, TQ_SLEEP);
mutex_exit(&ddp->dd_mutex);
- mac_rele_exclusive(mh);
-
done:
/*
* Change the name of the kstat based on the new link name.
@@ -1345,6 +1230,8 @@ done:
dls_devnet_stat_rename(ddp, linkname);
rw_exit(&i_dls_devnet_lock);
+ if (mph != NULL)
+ mac_perim_exit(mph);
softmac_rele_device(ddh);
return (err);
}
@@ -1355,26 +1242,30 @@ dls_devnet_setzid(const char *link, zoneid_t zid)
dls_devnet_t *ddp;
int err;
zoneid_t old_zid;
+ mac_perim_handle_t mph;
+
+ if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0)
+ return (err);
- if ((err = dls_devnet_hold_by_name(link, &ddp, GLOBAL_ZONEID)) != 0)
+ err = mac_perim_enter_by_macname(ddp->dd_mac, &mph);
+ if (err != 0)
return (err);
- mutex_enter(&ddp->dd_zid_mutex);
if ((old_zid = ddp->dd_zid) == zid) {
- mutex_exit(&ddp->dd_zid_mutex);
+ mac_perim_exit(mph);
dls_devnet_rele(ddp);
return (0);
}
- if ((err = dls_vlan_setzid(ddp->dd_mac, ddp->dd_vid, zid)) != 0) {
- mutex_exit(&ddp->dd_zid_mutex);
+ if ((err = dls_link_setzid(ddp->dd_mac, zid)) != 0) {
+ mac_perim_exit(mph);
dls_devnet_rele(ddp);
return (err);
}
ddp->dd_zid = zid;
devnet_need_rebuild = B_TRUE;
- mutex_exit(&ddp->dd_zid_mutex);
+ mac_perim_exit(mph);
/*
* Keep this open reference only if it belonged to the global zone
@@ -1402,9 +1293,7 @@ dls_devnet_getzid(datalink_id_t linkid, zoneid_t *zidp)
if ((err = dls_devnet_hold_tmp(linkid, &ddp)) != 0)
return (err);
- mutex_enter(&ddp->dd_zid_mutex);
*zidp = ddp->dd_zid;
- mutex_exit(&ddp->dd_zid_mutex);
dls_devnet_rele_tmp(ddp);
return (0);
@@ -1417,13 +1306,16 @@ int
dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
{
dls_devnet_t *ddp;
- dls_vlan_t *dvp;
+ dls_link_t *dlp;
zoneid_t zid = getzoneid();
int err;
+ mac_perim_handle_t mph;
- if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0)
+ if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0)
return (err);
+ dls_devnet_prop_task_wait(ddp);
+
/*
* Opening a link that does not belong to the current non-global zone
* is not allowed.
@@ -1433,16 +1325,22 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
return (ENOENT);
}
- err = dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, B_FALSE, B_TRUE);
+ err = mac_perim_enter_by_macname(ddp->dd_mac, &mph);
if (err != 0) {
dls_devnet_rele(ddp);
return (err);
}
- dls_devnet_prop_task_wait(ddp);
+ err = dls_link_hold_create(ddp->dd_mac, &dlp);
+ mac_perim_exit(mph);
+
+ if (err != 0) {
+ dls_devnet_rele(ddp);
+ return (err);
+ }
*dhp = ddp;
- *devp = dvp->dv_dev;
+ *devp = dls_link_dev(dlp);
return (0);
}
@@ -1453,15 +1351,20 @@ void
dls_devnet_close(dls_dl_handle_t dlh)
{
dls_devnet_t *ddp = dlh;
- dls_vlan_t *dvp;
+ dls_link_t *dlp;
+ mac_perim_handle_t mph;
+
+ VERIFY(mac_perim_enter_by_macname(ddp->dd_mac, &mph) == 0);
+ VERIFY(dls_link_hold(ddp->dd_mac, &dlp) == 0);
/*
- * The VLAN is held in dls_open_devnet_link().
+ * One rele for the hold placed in dls_devnet_open(), another for
+ * the hold taken just above.
*/
- VERIFY((dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, B_FALSE,
- B_FALSE)) == 0);
- dls_vlan_rele(dvp);
- dls_vlan_rele(dvp);
+ dls_link_rele(dlp);
+ dls_link_rele(dlp);
+ mac_perim_exit(mph);
+
dls_devnet_rele(ddp);
}
@@ -1481,15 +1384,27 @@ dls_devnet_rebuild()
int
dls_devnet_create(mac_handle_t mh, datalink_id_t linkid)
{
+ dls_link_t *dlp;
int err;
+ mac_perim_handle_t mph;
- if ((err = dls_vlan_create(mac_name(mh), 0, B_FALSE)) != 0)
- return (err);
-
- err = dls_devnet_set(mac_name(mh), 0, linkid, linkid, NULL, NULL);
- if (err != 0)
- (void) dls_vlan_destroy(mac_name(mh), 0);
+ mac_perim_enter_by_mh(mh, &mph);
+ /*
+ * Make this association before we call dls_link_hold_create as
+ * we need to use the linkid to get the user name for the link
+ * when we create the MAC client.
+ */
+ if ((err = dls_devnet_set(mac_name(mh), linkid, NULL)) != 0) {
+ mac_perim_exit(mph);
+ return (err);
+ }
+ if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) {
+ (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE);
+ mac_perim_exit(mph);
+ return (err);
+ }
+ mac_perim_exit(mph);
return (err);
}
@@ -1503,134 +1418,39 @@ int
dls_devnet_recreate(mac_handle_t mh, datalink_id_t linkid)
{
ASSERT(linkid != DATALINK_INVALID_LINKID);
- return (dls_devnet_set(mac_name(mh), 0, linkid, linkid, NULL, NULL));
+ return (dls_devnet_set(mac_name(mh), linkid, NULL));
}
int
-dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp)
+dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait)
{
- int err;
+ int err;
+ mac_perim_handle_t mph;
*idp = DATALINK_INVALID_LINKID;
- err = dls_devnet_unset(mac_name(mh), 0, idp);
+ err = dls_devnet_unset(mac_name(mh), idp, wait);
if (err != 0 && err != ENOENT)
return (err);
- if ((err = dls_vlan_destroy(mac_name(mh), 0)) == 0)
- return (0);
-
- (void) dls_devnet_set(mac_name(mh), 0, *idp, *idp, NULL, NULL);
- return (err);
-}
+ mac_perim_enter_by_mh(mh, &mph);
+ err = dls_link_rele_by_name(mac_name(mh));
+ mac_perim_exit(mph);
-int
-dls_devnet_create_vlan(datalink_id_t vlanid, datalink_id_t linkid,
- uint16_t vid, boolean_t force)
-{
- dls_devnet_t *lnddp, *ddp;
- dls_vlan_t *dvp;
- int err;
-
- /*
- * Hold the link the VLAN is being created on (which must not be a
- * VLAN).
- */
- ASSERT(vid != VLAN_ID_NONE);
- if ((err = dls_devnet_hold_tmp(linkid, &lnddp)) != 0)
- return (err);
-
- if (lnddp->dd_vid != VLAN_ID_NONE) {
- err = EINVAL;
- goto done;
- }
-
- /*
- * A new link.
- */
- err = dls_devnet_set(lnddp->dd_mac, vid, vlanid, linkid, NULL, &ddp);
- if (err != 0)
- goto done;
-
- /*
- * Hold the dls_vlan_t (and create it if needed).
- */
- err = dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, force, B_TRUE);
- if (err != 0)
- VERIFY(dls_devnet_unset(lnddp->dd_mac, vid, NULL) == 0);
+ if (err == 0)
+ return (0);
-done:
- dls_devnet_rele_tmp(lnddp);
+ (void) dls_devnet_set(mac_name(mh), *idp, NULL);
return (err);
}
-int
-dls_devnet_destroy_vlan(datalink_id_t vlanid)
-{
- char macname[MAXNAMELEN];
- uint16_t vid;
- dls_devnet_t *ddp;
- dls_vlan_t *dvp;
- int err;
-
- if ((err = dls_devnet_hold_tmp(vlanid, &ddp)) != 0)
- return (err);
-
- if (ddp->dd_vid == VLAN_ID_NONE) {
- dls_devnet_rele_tmp(ddp);
- return (EINVAL);
- }
-
- if (!ddp->dd_explicit) {
- dls_devnet_rele_tmp(ddp);
- return (EBUSY);
- }
-
- (void) strncpy(macname, ddp->dd_mac, MAXNAMELEN);
- vid = ddp->dd_vid;
-
- /*
- * It is safe to release the temporary reference we just held, as the
- * reference from VLAN creation is still held.
- */
- dls_devnet_rele_tmp(ddp);
-
- if ((err = dls_devnet_unset(macname, vid, NULL)) != 0)
- return (err);
-
- /*
- * This VLAN has already been held as the result of VLAN creation.
- */
- VERIFY(dls_vlan_hold(macname, vid, &dvp, B_FALSE, B_FALSE) == 0);
-
- /*
- * Release the reference which was held when this VLAN was created,
- * and the reference which was just held.
- */
- dls_vlan_rele(dvp);
- dls_vlan_rele(dvp);
- return (0);
-}
-
const char *
dls_devnet_mac(dls_dl_handle_t ddh)
{
return (ddh->dd_mac);
}
-uint16_t
-dls_devnet_vid(dls_dl_handle_t ddh)
-{
- return (ddh->dd_vid);
-}
-
datalink_id_t
dls_devnet_linkid(dls_dl_handle_t ddh)
{
return (ddh->dd_linkid);
}
-
-boolean_t
-dls_devnet_is_explicit(dls_dl_handle_t ddh)
-{
- return (ddh->dd_explicit);
-}
diff --git a/usr/src/uts/common/io/dls/dls_mod.c b/usr/src/uts/common/io/dls/dls_mod.c
index b93befd45c..5f594a0ff9 100644
--- a/usr/src/uts/common/io/dls/dls_mod.c
+++ b/usr/src/uts/common/io/dls/dls_mod.c
@@ -23,18 +23,12 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Data-Link Services Module
*/
-#include <sys/types.h>
#include <sys/modctl.h>
-#include <sys/mac.h>
-
-#include <sys/dls.h>
-#include <sys/dls_impl.h>
+#include <sys/dld_impl.h>
static struct modlmisc i_dls_modlmisc = {
&mod_miscops,
@@ -54,8 +48,6 @@ static struct modlinkage i_dls_modlinkage = {
static void
i_dls_mod_init(void)
{
- dls_init();
- dls_vlan_init();
dls_link_init();
dls_mgmt_init();
}
@@ -69,13 +61,6 @@ i_dls_mod_fini(void)
return (err);
dls_mgmt_fini();
-
- err = dls_vlan_fini();
- ASSERT(err == 0);
-
- err = dls_fini();
- ASSERT(err == 0);
-
return (0);
}
diff --git a/usr/src/uts/common/io/dls/dls_soft_ring.c b/usr/src/uts/common/io/dls/dls_soft_ring.c
deleted file mode 100644
index 078b9a9e4c..0000000000
--- a/usr/src/uts/common/io/dls/dls_soft_ring.c
+++ /dev/null
@@ -1,773 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * General Soft rings - Simulating Rx rings in S/W.
- *
- * This is a general purpose high-performance soft ring mechanism. It is
- * similar to a taskq with a single worker thread. The dls creates a
- * set of these rings to simulate the H/W Rx ring (DMA channels) some
- * NICs have. The purpose is to present a common interface to IP
- * so the individual squeues can control these rings and switch them
- * between polling and interrupt mode.
- *
- * This code also serves as a fanout mechanism for fast NIC feeding slow
- * CPU where incoming traffic can be separated into multiple soft rings
- * based on capability negotiation with IP and IP also creates thread
- * affinity to soft ring worker threads to CPU so that connection to
- * CPU/Squeue affinity is never broken.
- *
- * The soft rings can also be driven by a classifier which can direct
- * traffic to individual soft rings based on the input from IP.
- */
-
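The mechanism described above, a queue drained by a single worker thread, can be modeled outside the kernel. A sketch only, assuming POSIX threads; the names are hypothetical and free() stands in for the upcall that would deliver each message:

#include <pthread.h>
#include <stdlib.h>

typedef struct msg {
    struct msg *m_next;             /* chains like an mblk's b_next */
} msg_t;

typedef struct soft_ring_model {
    pthread_mutex_t sr_lock;
    pthread_cond_t  sr_async;
    msg_t          *sr_first;       /* head of the queued chain */
    msg_t          *sr_last;        /* tail of the queued chain */
    int             sr_dead;        /* set to make the worker exit */
} soft_ring_model_t;

static void
enqueue_model(soft_ring_model_t *sr, msg_t *mp)
{
    pthread_mutex_lock(&sr->sr_lock);
    if (sr->sr_last != NULL)
        sr->sr_last->m_next = mp;
    else
        sr->sr_first = mp;
    sr->sr_last = mp;
    pthread_cond_signal(&sr->sr_async);     /* wake the worker */
    pthread_mutex_unlock(&sr->sr_lock);
}

static void *
worker_model(void *arg)
{
    soft_ring_model_t *sr = arg;
    msg_t *mp, *next;

    pthread_mutex_lock(&sr->sr_lock);
    for (;;) {
        while (sr->sr_first == NULL && !sr->sr_dead)
            pthread_cond_wait(&sr->sr_async, &sr->sr_lock);
        if (sr->sr_first == NULL)           /* dead and fully drained */
            break;
        mp = sr->sr_first;                  /* take the whole chain */
        sr->sr_first = sr->sr_last = NULL;
        pthread_mutex_unlock(&sr->sr_lock); /* deliver without the lock */
        while (mp != NULL) {
            next = mp->m_next;
            free(mp);                       /* stand-in for the upcall */
            mp = next;
        }
        pthread_mutex_lock(&sr->sr_lock);
    }
    pthread_mutex_unlock(&sr->sr_lock);
    return (NULL);
}

int
main(void)
{
    soft_ring_model_t sr = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
        NULL, NULL, 0
    };
    pthread_t tid;

    if (pthread_create(&tid, NULL, worker_model, &sr) != 0)
        return (1);
    enqueue_model(&sr, calloc(1, sizeof (msg_t)));

    pthread_mutex_lock(&sr.sr_lock);
    sr.sr_dead = 1;                         /* drain, then exit */
    pthread_cond_signal(&sr.sr_async);
    pthread_mutex_unlock(&sr.sr_lock);
    return (pthread_join(tid, NULL));
}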
-#include <sys/types.h>
-#include <sys/cmn_err.h>
-#include <sys/debug.h>
-#include <sys/kmem.h>
-#include <sys/cpuvar.h>
-#include <sys/condvar_impl.h>
-#include <sys/systm.h>
-#include <sys/callb.h>
-#include <sys/sdt.h>
-#include <sys/ddi.h>
-#include <sys/strsun.h>
-#include <sys/strsubr.h>
-#include <inet/common.h>
-#include <inet/ip.h>
-#include <inet/ipsec_impl.h>
-#include <inet/sadb.h>
-#include <inet/ipsecah.h>
-
-#include <sys/dls_impl.h>
-#include <sys/dls_soft_ring.h>
-
-static void soft_ring_fire(void *);
-static void soft_ring_drain(soft_ring_t *, clock_t);
-static void soft_ring_worker(soft_ring_t *);
-static void soft_ring_stop_workers(soft_ring_t **, int);
-static void dls_taskq_stop_soft_ring(void *);
-
-typedef struct soft_ring_taskq {
- soft_ring_t **ringp_list;
- uint_t ring_size;
-} soft_ring_taskq_t;
-
-kmem_cache_t *soft_ring_cache;
-
-
-int soft_ring_workerwait_ms = 10;
-int soft_ring_max_q_cnt = (4 * 1024 * 1024);
-
-/* The values above converted to ticks */
-static int soft_ring_workerwait_tick = 0;
-
-#define SOFT_RING_WORKER_WAKEUP(ringp) { \
- timeout_id_t tid = (ringp)->s_ring_tid; \
- \
- ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \
- /* \
- * Queue isn't being processed, so take \
- * any post enqueue actions needed before leaving. \
- */ \
- if (tid != 0) { \
- /* \
- * Waiting for an enter() to process mblk(s). \
- */ \
- clock_t waited = lbolt - (ringp)->s_ring_awaken; \
- \
- if (TICK_TO_MSEC(waited) >= (ringp)->s_ring_wait) { \
- /* \
- * Times up and have a worker thread \
- * waiting for work, so schedule it. \
- */ \
- (ringp)->s_ring_tid = 0; \
- cv_signal(&(ringp)->s_ring_async); \
- mutex_exit(&(ringp)->s_ring_lock); \
- (void) untimeout(tid); \
- } else { \
- mutex_exit(&(ringp)->s_ring_lock); \
- } \
- } else if ((ringp)->s_ring_wait != 0) { \
- (ringp)->s_ring_awaken = lbolt; \
- (ringp)->s_ring_tid = timeout(soft_ring_fire, (ringp), \
- (ringp)->s_ring_wait); \
- mutex_exit(&(ringp)->s_ring_lock); \
- } else { \
- /* \
- * Schedule the worker thread. \
- */ \
- cv_signal(&(ringp)->s_ring_async); \
- mutex_exit(&(ringp)->s_ring_lock); \
- } \
- ASSERT(MUTEX_NOT_HELD(&(ringp)->s_ring_lock)); \
-}
-
-
-#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt) { \
- /* \
- * Enqueue our mblk chain. \
- */ \
- ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \
- \
- if ((ringp)->s_ring_last != NULL) \
- (ringp)->s_ring_last->b_next = (mp); \
- else \
- (ringp)->s_ring_first = (mp); \
- (ringp)->s_ring_last = (tail); \
- (ringp)->s_ring_count += (cnt); \
- ASSERT((ringp)->s_ring_count > 0); \
-}
-
-void
-soft_ring_init(void)
-{
- soft_ring_cache = kmem_cache_create("soft_ring_cache",
- sizeof (soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
-
- soft_ring_workerwait_tick =
- MSEC_TO_TICK_ROUNDUP(soft_ring_workerwait_ms);
-}
-
-/* ARGSUSED */
-soft_ring_t *
-soft_ring_create(char *name, processorid_t bind, clock_t wait,
- uint_t type, pri_t pri)
-{
- soft_ring_t *ringp;
-
- ringp = kmem_cache_alloc(soft_ring_cache, KM_NOSLEEP);
- if (ringp == NULL)
- return (NULL);
-
- bzero(ringp, sizeof (soft_ring_t));
- (void) strncpy(ringp->s_ring_name, name, S_RING_NAMELEN + 1);
- ringp->s_ring_name[S_RING_NAMELEN] = '\0';
- mutex_init(&ringp->s_ring_lock, NULL, MUTEX_DEFAULT, NULL);
-
- ringp->s_ring_type = type;
- ringp->s_ring_bind = bind;
- if (bind != S_RING_BIND_NONE)
- soft_ring_bind(ringp, bind);
- ringp->s_ring_wait = MSEC_TO_TICK(wait);
-
- ringp->s_ring_worker = thread_create(NULL, 0, soft_ring_worker,
- ringp, 0, &p0, TS_RUN, pri);
-
- return (ringp);
-}
-
-soft_ring_t **
-soft_ring_set_create(char *name, processorid_t bind, clock_t wait,
- uint_t type, pri_t pri, int ring_size)
-{
- int i;
- soft_ring_t **ringp_list;
-
- if ((ringp_list =
- (soft_ring_t **) kmem_zalloc(sizeof (soft_ring_t *) * ring_size,
- KM_NOSLEEP)) != NULL) {
- for (i = 0; i < ring_size; i++) {
- ringp_list[i] = soft_ring_create(name, bind, wait,
- type, pri);
- if (ringp_list[i] == NULL)
- break;
- }
- if (i != ring_size) {
- soft_ring_stop_workers(ringp_list, ring_size);
- soft_ring_set_destroy(ringp_list, ring_size);
- ringp_list = NULL;
- }
- }
- return (ringp_list);
-}
-
-static void
-soft_ring_stop_workers(soft_ring_t **ringp_set, int ring_size)
-{
- int i;
- soft_ring_t *ringp;
- timeout_id_t tid;
- kt_did_t t_did = 0;
-
- for (i = 0; (i < ring_size) && (ringp_set[i] != NULL); i++) {
- ringp = ringp_set[i];
-
- soft_ring_unbind((void *)ringp);
- mutex_enter(&ringp->s_ring_lock);
- if ((tid = ringp->s_ring_tid) != 0)
- (void) untimeout(tid);
-
- ringp->s_ring_tid = 0;
-
- if (!(ringp->s_ring_state & S_RING_DEAD)) {
- ringp->s_ring_state |= S_RING_DESTROY;
- t_did = ringp->s_ring_worker->t_did;
-
-
- /* Wake the worker so it can exit */
- cv_signal(&(ringp)->s_ring_async);
- }
- mutex_exit(&ringp->s_ring_lock);
-
- /*
- * Here comes the tricky part. IP and driver ensure
- * that packet flow has stopped but worker thread
- * might still be draining the soft ring. We have
- * already set the S_RING_DESTROY flag. We wait till
- * the worker thread takes notice and stops processing
- * the soft_ring and exits. It sets S_RING_DEAD on
- * exiting.
- */
- if (t_did)
- thread_join(t_did);
- }
-}
-
-void
-soft_ring_set_destroy(soft_ring_t **ringp_set, int ring_size)
-{
- int i;
- mblk_t *mp;
- soft_ring_t *ringp;
-
- for (i = 0; (i < ring_size) && (ringp_set[i] != NULL); i++) {
- ringp = ringp_set[i];
-
- mutex_enter(&ringp->s_ring_lock);
-
- ASSERT(ringp->s_ring_state & S_RING_DEAD);
-
- while ((mp = ringp->s_ring_first) != NULL) {
- ringp->s_ring_first = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- }
- ringp->s_ring_last = NULL;
- mutex_exit(&ringp->s_ring_lock);
-
- /*
- * IP/driver ensure that no packets are flowing
- * when we are destroying the soft rings otherwise bad
- * things will happen.
- */
- kmem_cache_free(soft_ring_cache, ringp);
- ringp_set[i] = NULL;
- }
- kmem_free(ringp_set, sizeof (soft_ring_t *) * ring_size);
-}
-
-/* ARGSUSED */
-void
-soft_ring_bind(void *arg, processorid_t bind)
-{
- cpu_t *cp;
- soft_ring_t *ringp = (soft_ring_t *)arg;
-
- mutex_enter(&ringp->s_ring_lock);
- if (ringp->s_ring_state & S_RING_BOUND) {
- mutex_exit(&ringp->s_ring_lock);
- return;
- }
-
- ringp->s_ring_state |= S_RING_BOUND;
- ringp->s_ring_bind = bind;
- mutex_exit(&ringp->s_ring_lock);
-
- cp = cpu[bind];
- mutex_enter(&cpu_lock);
- if (cpu_is_online(cp)) {
- thread_affinity_set(ringp->s_ring_worker, ringp->s_ring_bind);
- }
- mutex_exit(&cpu_lock);
-}
-
-void
-soft_ring_unbind(void *arg)
-{
- soft_ring_t *ringp = (soft_ring_t *)arg;
-
- mutex_enter(&ringp->s_ring_lock);
- if (!(ringp->s_ring_state & S_RING_BOUND)) {
- mutex_exit(&ringp->s_ring_lock);
- return;
- }
-
- ringp->s_ring_state &= ~S_RING_BOUND;
- ringp->s_ring_bind = S_RING_BIND_NONE;
- mutex_exit(&ringp->s_ring_lock);
-
- thread_affinity_clear(ringp->s_ring_worker);
-}
-
-/*
- * soft_ring_enter() - enter soft_ring sqp with mblk mp (which can be
- * a chain), while tail points to the end and cnt is the number of
- * mblks in the chain.
- *
- * For a chain of single packet (i.e. mp == tail), go through the
- * fast path if no one is processing the soft_ring and nothing is queued.
- *
- * The proc and arg for each mblk are already stored in the mblk in
- * appropriate places.
- */
-/* ARGSUSED */
-static void
-soft_ring_process(soft_ring_t *ringp,
- mblk_t *mp_chain, mblk_t *tail, uint_t count)
-{
- void *arg1, *arg2;
- s_ring_proc_t proc;
-
- ASSERT(ringp != NULL);
- ASSERT(mp_chain != NULL);
- ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
-
- mutex_enter(&ringp->s_ring_lock);
-
- ringp->s_ring_total_inpkt += count;
- if (!(ringp->s_ring_state & S_RING_PROC) &&
- !(ringp->s_ring_type == S_RING_WORKER_ONLY)) {
- /*
- * See if anything is already queued. If we are the
- * first packet, do inline processing else queue the
- * packet and do the drain.
- */
- if (ringp->s_ring_first == NULL && count == 1) {
- /*
- * Fast-path, ok to process and nothing queued.
- */
- ringp->s_ring_run = curthread;
- ringp->s_ring_state |= (S_RING_PROC);
-
- /*
- * We are the chain of 1 packet so
- * go through this fast path.
- */
- ASSERT(mp_chain->b_next == NULL);
- proc = ringp->s_ring_upcall;
- arg1 = ringp->s_ring_upcall_arg1;
- arg2 = ringp->s_ring_upcall_arg2;
-
- mutex_exit(&ringp->s_ring_lock);
- (*proc)(arg1, arg2, mp_chain, NULL);
-
- ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
- mutex_enter(&ringp->s_ring_lock);
- ringp->s_ring_run = NULL;
- ringp->s_ring_state &= ~S_RING_PROC;
- if (ringp->s_ring_first == NULL) {
- /*
- * We processed inline our packet and
- * nothing new has arrived. We are done.
- */
- mutex_exit(&ringp->s_ring_lock);
- return;
- }
- } else {
- SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, count);
- }
-
- /*
- * We are here because either we couldn't do inline
- * processing (because something was already queued),
- * or we had a chain of more than one packet,
- * or something else arrived after we were done with
- * inline processing.
- */
- ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
- ASSERT(ringp->s_ring_first != NULL);
-
-
- soft_ring_drain(ringp, -1);
- mutex_exit(&ringp->s_ring_lock);
- return;
- } else {
- /*
- * Queue is already being processed. Just enqueue
- * the packet and go away.
- */
- if (ringp->s_ring_count > soft_ring_max_q_cnt) {
- freemsgchain(mp_chain);
- DLS_BUMP_STAT(dlss_soft_ring_pkt_drop, count);
- } else
- SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, count);
- if (!(ringp->s_ring_state & S_RING_PROC)) {
- SOFT_RING_WORKER_WAKEUP(ringp);
- } else {
- ASSERT(ringp->s_ring_run != NULL);
- mutex_exit(&ringp->s_ring_lock);
- }
- return;
- }
-}
-
-/*
- * PRIVATE FUNCTIONS
- */
-
-static void
-soft_ring_fire(void *arg)
-{
- soft_ring_t *ringp = arg;
-
- mutex_enter(&ringp->s_ring_lock);
- if (ringp->s_ring_tid == 0) {
- mutex_exit(&ringp->s_ring_lock);
- return;
- }
-
- ringp->s_ring_tid = 0;
-
- if (!(ringp->s_ring_state & S_RING_PROC)) {
- cv_signal(&ringp->s_ring_async);
- }
- mutex_exit(&ringp->s_ring_lock);
-}
-
-/* ARGSUSED */
-static void
-soft_ring_drain(soft_ring_t *ringp, clock_t expire)
-{
- mblk_t *mp;
- s_ring_proc_t proc;
- void *arg1, *arg2;
- timeout_id_t tid;
-
- ringp->s_ring_run = curthread;
- ASSERT(mutex_owned(&ringp->s_ring_lock));
- ASSERT(!(ringp->s_ring_state & S_RING_PROC));
-
- if ((tid = ringp->s_ring_tid) != 0)
- ringp->s_ring_tid = 0;
-
- ringp->s_ring_state |= S_RING_PROC;
-
-
- proc = ringp->s_ring_upcall;
- arg1 = ringp->s_ring_upcall_arg1;
- arg2 = ringp->s_ring_upcall_arg2;
-
- while (ringp->s_ring_first != NULL) {
- mp = ringp->s_ring_first;
- ringp->s_ring_first = NULL;
- ringp->s_ring_last = NULL;
- ringp->s_ring_count = 0;
- mutex_exit(&ringp->s_ring_lock);
-
- if (tid != 0) {
- (void) untimeout(tid);
- tid = 0;
- }
-
- (*proc)(arg1, arg2, mp, NULL);
-
- mutex_enter(&ringp->s_ring_lock);
- }
-
- ringp->s_ring_state &= ~S_RING_PROC;
- ringp->s_ring_run = NULL;
-}
-
-static void
-soft_ring_worker(soft_ring_t *ringp)
-{
- kmutex_t *lock = &ringp->s_ring_lock;
- kcondvar_t *async = &ringp->s_ring_async;
- callb_cpr_t cprinfo;
-
- CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "soft_ring");
- mutex_enter(lock);
-
- for (;;) {
- while (ringp->s_ring_first == NULL ||
- (ringp->s_ring_state & S_RING_PROC)) {
- CALLB_CPR_SAFE_BEGIN(&cprinfo);
- if (ringp->s_ring_state & S_RING_DESTROY)
- goto destroy;
-still_wait:
- cv_wait(async, lock);
- if (ringp->s_ring_state & S_RING_DESTROY) {
-destroy:
- if (ringp->s_ring_state & S_RING_DESTROY) {
- ringp->s_ring_state |= S_RING_DEAD;
- CALLB_CPR_EXIT(&cprinfo);
- thread_exit();
- }
- }
- if (ringp->s_ring_state & S_RING_PROC) {
- goto still_wait;
- }
- CALLB_CPR_SAFE_END(&cprinfo, lock);
- }
- soft_ring_drain(ringp, -1);
- }
-}
-
-void
-dls_soft_ring_disable(dls_channel_t dc)
-{
- dls_impl_t *dip = (dls_impl_t *)dc;
- soft_ring_t **ringp_list = NULL;
- int ring_size;
-
- rw_enter(&(dip->di_lock), RW_READER);
- if (dip->di_soft_ring_list != NULL) {
- ringp_list = dip->di_soft_ring_list;
- ring_size = dip->di_soft_ring_size;
- }
- rw_exit(&(dip->di_lock));
-
- if (ringp_list != NULL)
- soft_ring_stop_workers(ringp_list, ring_size);
-}
-
-static void
-dls_taskq_stop_soft_ring(void *arg)
-{
- soft_ring_taskq_t *ring_taskq;
- soft_ring_t **ringp_list;
- int ring_size;
-
- ring_taskq = (soft_ring_taskq_t *)arg;
- ringp_list = ring_taskq->ringp_list;
- ring_size = ring_taskq->ring_size;
- kmem_free(ring_taskq, sizeof (soft_ring_taskq_t));
-
- soft_ring_stop_workers(ringp_list, ring_size);
- soft_ring_set_destroy(ringp_list, ring_size);
-}
-
-boolean_t
-dls_soft_ring_enable(dls_channel_t dc, dl_capab_dls_t *soft_ringp)
-{
- dls_impl_t *dip;
- int i;
- soft_ring_t **softring_set;
- soft_ring_t *softring;
- mac_rx_fifo_t mrf;
- soft_ring_taskq_t *ring_taskq;
- char name[64];
-
- dip = (dls_impl_t *)dc;
-
- rw_enter(&(dip->di_lock), RW_WRITER);
-
- if (dip->di_soft_ring_list != NULL) {
- /*
- * Both ds_lock and di_lock are held as writer.
- * As soft_ring_stop_workers() blocks for the
- * worker thread(s) to complete, there is a possibility
- * that the worker thread(s) could be in the process
- * of draining the queue and is blocked waiting for
- * either ds_lock or di_lock. Moreover the NIC interrupt
- * thread could be blocked in dls_accept().
- * To avoid a deadlock condition, a taskq thread is
- * created to handle soft_ring_stop_workers() and any
- * blocking it requires, which avoids holding
- * both ds_lock and di_lock.
- * NOTE: we cannot drop either locks here, due to
- * weird race conditions seen.
- */
- ring_taskq = (soft_ring_taskq_t *)
- kmem_zalloc(sizeof (soft_ring_taskq_t), KM_NOSLEEP);
- if (ring_taskq == NULL) {
- rw_exit(&(dip->di_lock));
- return (B_FALSE);
- }
- ring_taskq->ringp_list = dip->di_soft_ring_list;
- ring_taskq->ring_size = dip->di_soft_ring_size;
- if (taskq_dispatch(system_taskq, dls_taskq_stop_soft_ring,
- ring_taskq, TQ_NOSLEEP) == NULL) {
- rw_exit(&(dip->di_lock));
- kmem_free(ring_taskq, sizeof (soft_ring_taskq_t));
- return (B_FALSE);
- }
- dip->di_soft_ring_list = NULL;
- }
- dip->di_soft_ring_size = 0;
-
- bzero(name, sizeof (name));
- (void) snprintf(name, sizeof (name), "dls_soft_ring_%p", (void *)dip);
- dip->di_soft_ring_list = soft_ring_set_create(name, S_RING_BIND_NONE,
- 0, S_RING_WORKER_ONLY, minclsyspri, soft_ringp->dls_ring_cnt);
-
- if (dip->di_soft_ring_list == NULL) {
- rw_exit(&(dip->di_lock));
- return (B_FALSE);
- }
-
- dip->di_soft_ring_size = soft_ringp->dls_ring_cnt;
- softring_set = dip->di_soft_ring_list;
-
- dip->di_ring_add = (mac_resource_add_t)soft_ringp->dls_ring_add;
- dip->di_rx = (dls_rx_t)soft_ringp->dls_ring_assign;
- dip->di_rx_arg = (void *)soft_ringp->dls_rx_handle;
-
- bzero(&mrf, sizeof (mac_rx_fifo_t));
- mrf.mrf_type = MAC_RX_FIFO;
- for (i = 0; i < soft_ringp->dls_ring_cnt; i++) {
- softring = softring_set[i];
- mrf.mrf_arg = softring;
- softring->s_ring_upcall_arg1 =
- (void *)soft_ringp->dls_rx_handle;
- softring->s_ring_upcall_arg2 =
- dip->di_ring_add((void *)soft_ringp->dls_rx_handle,
- (mac_resource_t *)&mrf);
- softring->s_ring_upcall =
- (s_ring_proc_t)soft_ringp->dls_rx;
- }
-
- /*
- * Note that soft_ring is enabled. This prevents further DLIOCHDRINFO
- * ioctls from overwriting the receive function pointer.
- */
- rw_exit(&(dip->di_lock));
- return (B_TRUE);
-}
-
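The comment in dls_soft_ring_enable() above explains why teardown is pushed to a taskq thread: soft_ring_stop_workers() joins worker threads and may block, which must not happen while ds_lock and di_lock are held. A user-space sketch of the same idea, assuming POSIX threads; every name here is a hypothetical stand-in, and pthread_create()/pthread_detach() play the role of taskq_dispatch():

#include <pthread.h>
#include <stdlib.h>

/*
 * Stand-in for soft_ring_stop_workers() + soft_ring_set_destroy(): it may
 * block waiting for worker threads, so it must never run under the
 * caller's locks.
 */
static void
stop_and_destroy_model(void **list, int size)
{
    (void) size;
    free(list);
}

typedef struct cleanup_arg {
    void **ca_list;
    int    ca_size;
} cleanup_arg_t;

static void *
cleanup_thread(void *arg)
{
    cleanup_arg_t *ca = arg;

    stop_and_destroy_model(ca->ca_list, ca->ca_size);
    free(ca);
    return (NULL);
}

/*
 * Hand the ring list to a helper thread (the taskq in the real code) so
 * the caller can clear its pointer and drop its locks without blocking.
 */
static int
defer_teardown(void **list, int size)
{
    cleanup_arg_t *ca;
    pthread_t tid;

    if ((ca = malloc(sizeof (*ca))) == NULL)
        return (-1);
    ca->ca_list = list;
    ca->ca_size = size;
    if (pthread_create(&tid, NULL, cleanup_thread, ca) != 0) {
        free(ca);
        return (-1);
    }
    return (pthread_detach(tid));
}

int
main(void)
{
    void **list = calloc(4, sizeof (void *));

    /* fire and forget; the process may exit before the helper runs */
    return (list == NULL ? 1 : defer_teardown(list, 4));
}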
-int dls_bad_ip_pkt = 0;
-
-static mblk_t *
-dls_skip_mblk(mblk_t *bp, mblk_t *mp, int *skip_lenp)
-{
- while (MBLKL(bp) <= *skip_lenp) {
- *skip_lenp -= MBLKL(bp);
- bp = bp->b_cont;
- if (bp == NULL) {
- dls_bad_ip_pkt++;
- freemsg(mp);
- return (NULL);
- }
- }
- return (bp);
-}
-
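dls_skip_mblk() above advances through a b_cont chain until a byte offset lands inside a segment. The same walk over a generic segment chain, as a self-contained sketch (seg_t and the lengths are illustrative; the real function also frees the message when the chain runs out, which is omitted here):

#include <stddef.h>
#include <stdio.h>

typedef struct seg {
    struct seg *s_cont;    /* next segment, like an mblk's b_cont */
    size_t      s_len;     /* bytes in this segment, like MBLKL() */
} seg_t;

static seg_t *
skip_bytes(seg_t *sp, size_t *skip_lenp)
{
    /* step over whole segments until the offset lands inside one */
    while (sp != NULL && sp->s_len <= *skip_lenp) {
        *skip_lenp -= sp->s_len;
        sp = sp->s_cont;
    }
    return (sp);    /* NULL: the chain was shorter than the skip */
}

int
main(void)
{
    seg_t b = { NULL, 40 };
    seg_t a = { &b, 14 };     /* 14-byte segment chained to a 40-byte one */
    size_t skip = 20;         /* skip a 20-byte header */

    if (skip_bytes(&a, &skip) == &b)
        (void) printf("offset lands %zu bytes into the second segment\n",
            skip);
    return (0);
}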
-#define HASH32(x) (((x) >> 24) ^ ((x) >> 16) ^ ((x) >> 8) ^ (x))
-#define COMPUTE_INDEX(key, sz) (key % sz)
-
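Applied to concrete values, the fanout above reduces to a few shifts, XORs, and a modulo over the ring count. A self-contained illustration; the addresses and ring count are made-up example values:

#include <stdio.h>
#include <stdint.h>

#define HASH32(x) (((x) >> 24) ^ ((x) >> 16) ^ ((x) >> 8) ^ (x))
#define COMPUTE_INDEX(key, sz) (key % sz)

int
main(void)
{
    uint32_t src = 0xC0A80101;    /* 192.168.1.1, example only */
    uint32_t dst = 0x0A000002;    /* 10.0.0.2, example only */
    uint32_t nrings = 4;          /* soft ring count, example only */
    uint32_t hash = HASH32(src) ^ HASH32(dst);

    /* the packet chain would be queued on this soft ring */
    (void) printf("ring index = %u\n",
        (unsigned int)COMPUTE_INDEX(hash, nrings));
    return (0);
}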
-/*
- * dls_soft_ring_fanout():
- */
-/* ARGSUSED */
-void
-dls_soft_ring_fanout(void *rx_handle, void *rx_cookie, mblk_t *mp_chain,
- mac_header_info_t *mhip)
-{
- mblk_t *mp, *bp, *head, *tail;
- ipha_t *ipha;
- dls_impl_t *dip = (dls_impl_t *)rx_handle;
- int indx, saved_indx;
- int hash = 0;
- int skip_len;
- uint8_t protocol;
- int count = 0;
-
- head = tail = NULL;
-
- while (mp_chain != NULL) {
- bp = mp = mp_chain;
- mp_chain = mp_chain->b_next;
- mp->b_next = NULL;
- if ((MBLKL(mp) < sizeof (ipha_t)) || !OK_32PTR(mp->b_rptr)) {
- mp = msgpullup(bp, sizeof (ipha_t));
- freemsg(bp);
- if (mp == NULL) {
- dls_bad_ip_pkt++;
- continue;
- }
- bp = mp;
- }
-
- ipha = (ipha_t *)mp->b_rptr;
- skip_len = IPH_HDR_LENGTH(ipha);
- protocol = ipha->ipha_protocol;
- again:
- switch (protocol) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_SCTP:
- case IPPROTO_ESP:
- /*
- * Note that for ESP, we fanout on SPI and it is at the
- * same offset as the 2x16-bit ports. So it is clumped
- * along with TCP, UDP and SCTP.
- */
- if (MBLKL(bp) <= skip_len) {
- bp = dls_skip_mblk(bp, mp, &skip_len);
- if (bp == NULL)
- continue;
- }
-
- hash = HASH32(*(uint32_t *)(bp->b_rptr + skip_len));
- break;
-
- case IPPROTO_AH: {
- ah_t *ah;
- uint_t ah_length;
-
- if (MBLKL(bp) <= skip_len) {
- bp = dls_skip_mblk(bp, mp, &skip_len);
- if (bp == NULL)
- continue;
- }
-
- ah = (ah_t *)(bp->b_rptr + skip_len);
- protocol = ah->ah_nexthdr;
- ah_length = AH_TOTAL_LEN(ah);
- skip_len += ah_length;
- goto again;
- }
-
- default:
- /*
- * Send the packet to a ring based on src/dest addresses
- */
- hash =
- (HASH32(ipha->ipha_src) ^ HASH32(ipha->ipha_dst));
- break;
- }
-
- indx = COMPUTE_INDEX(hash, dip->di_soft_ring_size);
- if (head == NULL) {
- saved_indx = indx;
- head = tail = mp;
- count++;
- } else if (indx == saved_indx) {
- tail->b_next = mp;
- tail = mp;
- count++;
- } else {
- soft_ring_process(dip->di_soft_ring_list[saved_indx],
- head, tail, count);
- head = tail = mp;
- saved_indx = indx;
- count = 1;
- }
- }
- if (head != NULL)
- soft_ring_process(dip->di_soft_ring_list[saved_indx],
- head, tail, count);
-}
diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c
index 99f41d0c7d..a6f89a8b49 100644
--- a/usr/src/uts/common/io/dls/dls_stat.c
+++ b/usr/src/uts/common/io/dls/dls_stat.c
@@ -23,22 +23,12 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Data-Link Services Module
*/
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-#include <sys/atomic.h>
-#include <sys/kstat.h>
-#include <sys/vlan.h>
-#include <sys/mac.h>
+#include <sys/dld_impl.h>
#include <sys/mac_ether.h>
-#include <sys/ctype.h>
-#include <sys/dls.h>
-#include <sys/dls_impl.h>
static mac_stat_info_t i_dls_si[] = {
{ MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 },
@@ -66,35 +56,18 @@ static mac_stat_info_t i_dls_si[] = {
#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0]))
/*
- * Private functions.
- */
-
-static int
-i_dls_mac_stat_update(kstat_t *ksp, int rw)
-{
- dls_vlan_t *dvp = ksp->ks_private;
-
- return (dls_stat_update(ksp, dvp, rw));
-}
-
-/*
* Exported functions.
*/
int
-dls_stat_update(kstat_t *ksp, dls_vlan_t *dvp, int rw)
+dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw)
{
- dls_link_t *dlp = dvp->dv_dlp;
kstat_named_t *knp;
uint_t i;
uint64_t val;
- int err;
if (rw != KSTAT_READ)
return (EACCES);
- if ((err = dls_mac_hold(dlp)) != 0)
- return (err);
-
knp = (kstat_named_t *)ksp->ks_data;
for (i = 0; i < STAT_INFO_COUNT; i++) {
val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat);
@@ -124,7 +97,6 @@ dls_stat_update(kstat_t *ksp, dls_vlan_t *dvp, int rw)
}
knp++;
knp->value.ui32 = dlp->dl_unknowns;
- dls_mac_rele(dlp);
return (0);
}
@@ -158,45 +130,3 @@ dls_stat_create(const char *module, int instance, const char *name,
*kspp = ksp;
return (0);
}
-
-void
-dls_mac_stat_create(dls_vlan_t *dvp)
-{
- kstat_t *ksp = NULL;
- major_t major;
-
- /*
- * Create the legacy kstats to provide backward compatibility.
- * These kstats need to be created even when this link does not
- * have a link name, i.e., when the VLAN is accessed using its
- * /dev node.
- *
- * Note that we only need to create the legacy kstats for GLDv3
- * physical links, aggregation links which are created using
- * the 'key' option, and any VLAN links created over them.
- * This can be determined by checking its dv_ppa.
- */
- ASSERT(dvp->dv_ksp == NULL);
- if (dvp->dv_ppa >= MAC_MAX_MINOR)
- return;
-
- major = getmajor(dvp->dv_dev);
- ASSERT(GLDV3_DRV(major) && (dvp->dv_ksp == NULL));
-
- if (dls_stat_create(ddi_major_to_name(major),
- dvp->dv_id * 1000 + dvp->dv_ppa, NULL,
- i_dls_mac_stat_update, dvp, &ksp) != 0) {
- return;
- }
- ASSERT(ksp != NULL);
- dvp->dv_ksp = ksp;
-}
-
-void
-dls_mac_stat_destroy(dls_vlan_t *dvp)
-{
- if (dvp->dv_ksp != NULL) {
- kstat_delete(dvp->dv_ksp);
- dvp->dv_ksp = NULL;
- }
-}
diff --git a/usr/src/uts/common/io/dls/dls_vlan.c b/usr/src/uts/common/io/dls/dls_vlan.c
deleted file mode 100644
index 9df000e86a..0000000000
--- a/usr/src/uts/common/io/dls/dls_vlan.c
+++ /dev/null
@@ -1,561 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Data-Link Services Module
- */
-
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-#include <sys/modhash.h>
-#include <sys/stat.h>
-#include <sys/kstat.h>
-#include <sys/vlan.h>
-#include <sys/mac.h>
-#include <sys/ctype.h>
-#include <sys/dls.h>
-#include <sys/dls_impl.h>
-
-static kmem_cache_t *i_dls_vlan_cachep;
-static mod_hash_t *i_dls_vlan_hash;
-static mod_hash_t *i_dls_vlan_dev_hash;
-static krwlock_t i_dls_vlan_lock;
-static uint_t i_dls_vlan_count;
-
-#define VLAN_HASHSZ 67 /* prime */
-
-/*
- * Private functions.
- */
-
-/*ARGSUSED*/
-static int
-i_dls_vlan_constructor(void *buf, void *arg, int kmflag)
-{
- dls_vlan_t *dvp = buf;
-
- bzero(buf, sizeof (dls_vlan_t));
- mutex_init(&dvp->dv_lock, NULL, MUTEX_DEFAULT, NULL);
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-i_dls_vlan_destructor(void *buf, void *arg)
-{
- dls_vlan_t *dvp = buf;
-
- ASSERT(dvp->dv_ref == 0);
- ASSERT(dvp->dv_zone_ref == 0);
- mutex_destroy(&dvp->dv_lock);
-}
-
-/*
- * Module initialization functions.
- */
-void
-dls_vlan_init(void)
-{
- /*
- * Create a kmem_cache of dls_vlan_t structures.
- */
- i_dls_vlan_cachep = kmem_cache_create("dls_vlan_cache",
- sizeof (dls_vlan_t), 0, i_dls_vlan_constructor,
- i_dls_vlan_destructor, NULL, NULL, NULL, 0);
- ASSERT(i_dls_vlan_cachep != NULL);
-
- /*
- * Create a hash table, keyed by dv_spa, of dls_vlan_t.
- */
- i_dls_vlan_hash = mod_hash_create_extended("dls_vlan_hash",
- VLAN_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
- mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
-
- /*
- * Create a hash table, keyed by dv_dev, of dls_vlan_t.
- */
- i_dls_vlan_dev_hash = mod_hash_create_ptrhash("dls_vlan_dev_hash",
- VLAN_HASHSZ, mod_hash_null_valdtor, sizeof (dev_t));
-
- rw_init(&i_dls_vlan_lock, NULL, RW_DEFAULT, NULL);
- i_dls_vlan_count = 0;
-}
-
-int
-dls_vlan_fini(void)
-{
- if (i_dls_vlan_count > 0)
- return (EBUSY);
-
- /*
- * Destroy the hash table
- */
- mod_hash_destroy_hash(i_dls_vlan_hash);
- mod_hash_destroy_hash(i_dls_vlan_dev_hash);
- rw_destroy(&i_dls_vlan_lock);
-
- /*
- * Destroy the kmem_cache.
- */
- kmem_cache_destroy(i_dls_vlan_cachep);
- return (0);
-}
-
-/*
- * Exported functions.
- */
-
-/*
- * If vid is VLAN_ID_NONE, then the minor_t to access this dls_vlan_t is
- * ppa + 1, otherwise, we need to allocate the minor_t in this function.
- *
- * If ppa is greater than DLS_MAX_PPA, it means that we do not need to create
- * the VLAN minor node for this MAC, as this MAC is (a) a legacy device, (b)
- * an aggr created without the "key" argument, or (c) a new type of link
- * whose ppa is allocated by mac_minor_hold() in mac_register().
- */
-int
-dls_vlan_create(const char *macname, uint16_t vid, boolean_t force)
-{
- char node[MAXPATHLEN];
- char spa[MAXSPALEN];
- char *driver;
- dls_link_t *dlp;
- dls_vlan_t *dvp;
- minor_t minor = 0;
- mac_handle_t mh;
- int ppa;
- dev_info_t *dip;
- uint32_t margin = VLAN_TAGSZ;
- int err = 0;
-
- if ((err = mac_open(macname, &mh)) != 0)
- return (err);
-
- /*
- * First check whether VLANs are able to be created on this MAC.
- */
- if (vid != VLAN_ID_NONE) {
- if ((mac_info(mh)->mi_media != DL_ETHER) ||
- (mac_info(mh)->mi_nativemedia != DL_ETHER)) {
- mac_close(mh);
- return (EINVAL);
- }
- if (!force &&
- ((err = mac_margin_add(mh, &margin, B_FALSE)) != 0)) {
- mac_close(mh);
- return (err);
- }
- }
-
- /*
- * Get a reference to a dls_link_t representing the MAC. This call
- * will create one if necessary.
- */
- if ((err = dls_link_hold(macname, &dlp)) != 0) {
- if (vid != VLAN_ID_NONE && !force)
- VERIFY(mac_margin_remove(mh, margin) == 0);
- mac_close(mh);
- return (err);
- }
-
- rw_enter(&i_dls_vlan_lock, RW_WRITER);
-
- /*
- * Try to find this VLAN in i_dls_vlan_hash first. The spa
- * is in the <macname/vid> form.
- */
- (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid);
- if ((err = mod_hash_find(i_dls_vlan_hash,
- (mod_hash_key_t)spa, (mod_hash_val_t)&dvp)) == 0) {
- err = EEXIST;
- goto fail;
- }
-
- ppa = mac_minor(mh) - 1;
- dip = mac_devinfo_get(mh);
-
- if (vid == VLAN_ID_NONE) {
- /*
- * Derives minor number directly from non-VLAN link's PPA.
- */
- minor = ppa + 1;
- } else if ((minor = mac_minor_hold(B_TRUE)) == 0) {
- /*
- * Allocate minor number from minor_arenap for VLANs.
- */
- err = ENOMEM;
- goto fail;
- }
-
- /*
- * First create its minor node for non-legacy links, including VLANs
- * and non-VLANs. This is for /dev nodes backward compatibility.
- */
- if (vid != VLAN_ID_NONE && ppa < MAC_MAX_MINOR) {
-
- driver = (char *)ddi_driver_name(dip);
-
- /* Create a style-1 DLPI device */
- (void) snprintf(node, MAXPATHLEN, "%s%d", driver,
- vid * 1000 + ppa);
- if (ddi_create_minor_node(dip, node, S_IFCHR, minor,
- DDI_NT_NET, 0) != DDI_SUCCESS) {
- err = EINVAL;
- goto fail;
- }
- }
-
- dvp = kmem_cache_alloc(i_dls_vlan_cachep, KM_SLEEP);
- dvp->dv_id = vid;
- dvp->dv_dlp = dlp;
- dvp->dv_dev = makedevice(ddi_driver_major(dip), minor);
- dvp->dv_dip = dip;
- dvp->dv_ppa = ppa;
- dvp->dv_force = force;
- dvp->dv_ref = 0;
- dvp->dv_zone_ref = 0;
- dvp->dv_zid = GLOBAL_ZONEID;
- (void) strlcpy(dvp->dv_spa, spa, MAXSPALEN);
- dls_mac_stat_create(dvp);
-
- err = mod_hash_insert(i_dls_vlan_hash,
- (mod_hash_key_t)dvp->dv_spa, (mod_hash_val_t)dvp);
- ASSERT(err == 0);
-
- err = mod_hash_insert(i_dls_vlan_dev_hash,
- (mod_hash_key_t)dvp->dv_dev, (mod_hash_val_t)dvp);
- ASSERT(err == 0);
-
- i_dls_vlan_count++;
- rw_exit(&i_dls_vlan_lock);
-
- /*
- * Hold the underlying MAC for VLANs to keep the margin request.
- * We cannot hold the mac for non-VLANs, because a reference would
- * prevent the device from detaching.
- */
- if (vid != VLAN_ID_NONE)
- VERIFY(dls_mac_hold(dvp->dv_dlp) == 0);
-
- mac_close(mh);
- return (0);
-
-fail:
- rw_exit(&i_dls_vlan_lock);
- if (vid != VLAN_ID_NONE && minor != 0)
- mac_minor_rele(minor);
- dls_link_rele(dlp);
- if (vid != VLAN_ID_NONE && !force)
- VERIFY(mac_margin_remove(mh, margin) == 0);
- mac_close(mh);
- return (err);
-}
-
-int
-dls_vlan_destroy(const char *macname, uint16_t vid)
-{
- char spa[MAXSPALEN];
- dls_vlan_t *dvp;
- mod_hash_val_t val;
- int err;
-
- /*
- * Try to find this VLAN in i_dls_vlan_hash first. The spa
- * is in the <macname/vid> form.
- */
- (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid);
-
- rw_enter(&i_dls_vlan_lock, RW_WRITER);
-
- if ((err = mod_hash_find(i_dls_vlan_hash,
- (mod_hash_key_t)spa, (mod_hash_val_t)&dvp)) != 0) {
- rw_exit(&i_dls_vlan_lock);
- return (ENOENT);
- }
-
- /*
- * Check to see if it is referenced by any dls_impl_t.
- */
- if (dvp->dv_ref != 0) {
- rw_exit(&i_dls_vlan_lock);
- return (EBUSY);
- }
-
- ASSERT(dvp->dv_zone_ref == 0);
-
- /*
- * Remove and destroy the hash table entry.
- */
- err = mod_hash_remove(i_dls_vlan_hash,
- (mod_hash_key_t)dvp->dv_spa, (mod_hash_val_t *)&val);
- ASSERT(err == 0);
- ASSERT(dvp == (dls_vlan_t *)val);
-
- err = mod_hash_remove(i_dls_vlan_dev_hash,
- (mod_hash_key_t)dvp->dv_dev, (mod_hash_val_t *)&val);
- ASSERT(err == 0);
- ASSERT(dvp == (dls_vlan_t *)val);
-
- if (vid != VLAN_ID_NONE && dvp->dv_ppa < MAC_MAX_MINOR) {
- char node[MAXPATHLEN];
- char *driver;
-
- /*
- * Remove the minor nodes for this link.
- */
- driver = (char *)ddi_driver_name(dvp->dv_dip);
- (void) snprintf(node, MAXPATHLEN, "%s%d", driver,
- vid * 1000 + dvp->dv_ppa);
- ddi_remove_minor_node(dvp->dv_dip, node);
- }
-
- dls_mac_stat_destroy(dvp);
-
- ASSERT(i_dls_vlan_count > 0);
- i_dls_vlan_count--;
- rw_exit(&i_dls_vlan_lock);
-
- if (vid != VLAN_ID_NONE) {
- if (!dvp->dv_force) {
- (void) mac_margin_remove(dvp->dv_dlp->dl_mh,
- VLAN_TAGSZ);
- }
- dls_mac_rele(dvp->dv_dlp);
- }
-
- /*
- * Release minor to dls_minor_arenap for VLANs
- */
- if (vid != VLAN_ID_NONE)
- mac_minor_rele(getminor(dvp->dv_dev));
-
- /*
- * Release the dls_link_t. This will destroy the dls_link_t and
- * release the MAC if there are no more dls_vlan_t.
- */
- dls_link_rele(dvp->dv_dlp);
- kmem_cache_free(i_dls_vlan_cachep, dvp);
- return (0);
-}
-
-int
-dls_vlan_hold(const char *macname, uint16_t vid, dls_vlan_t **dvpp,
- boolean_t force, boolean_t create_vlan)
-{
- char spa[MAXSPALEN];
- dls_vlan_t *dvp;
- boolean_t vlan_created;
- int err = 0;
-
- (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid);
-
-again:
- rw_enter(&i_dls_vlan_lock, RW_WRITER);
- if ((err = mod_hash_find(i_dls_vlan_hash,
- (mod_hash_key_t)spa, (mod_hash_val_t)&dvp)) != 0) {
-
- ASSERT(err == MH_ERR_NOTFOUND);
-
- vlan_created = B_FALSE;
- if (!create_vlan || vid == VLAN_ID_NONE) {
- rw_exit(&i_dls_vlan_lock);
- return (ENOENT);
- }
- rw_exit(&i_dls_vlan_lock);
-
- err = dls_vlan_create(macname, vid, force);
- if ((err != 0) && (err != EEXIST))
- return (err);
-
- /*
- * At this point someone else could do a dls_vlan_hold and
- * dls_vlan_rele on this new vlan and causes it to be
- * destroyed. This will at worst cause us to spin a few
- * times.
- */
- vlan_created = (err != EEXIST);
- goto again;
- }
-
- dvp->dv_ref++;
- rw_exit(&i_dls_vlan_lock);
-
- if ((err = dls_mac_hold(dvp->dv_dlp)) != 0) {
- rw_enter(&i_dls_vlan_lock, RW_WRITER);
- dvp->dv_ref--;
- rw_exit(&i_dls_vlan_lock);
- if (vlan_created)
- (void) dls_vlan_destroy(macname, vid);
- return (err);
- }
-
- *dvpp = dvp;
- return (0);
-}
-
-int
-dls_vlan_hold_by_dev(dev_t dev, dls_vlan_t **dvpp)
-{
- dls_vlan_t *dvp;
- int err;
-
- rw_enter(&i_dls_vlan_lock, RW_WRITER);
- if ((err = mod_hash_find(i_dls_vlan_dev_hash, (mod_hash_key_t)dev,
- (mod_hash_val_t *)&dvp)) != 0) {
- ASSERT(err == MH_ERR_NOTFOUND);
- rw_exit(&i_dls_vlan_lock);
- return (ENOENT);
- }
-
- dvp->dv_ref++;
- rw_exit(&i_dls_vlan_lock);
-
- if ((err = dls_mac_hold(dvp->dv_dlp)) != 0) {
- rw_enter(&i_dls_vlan_lock, RW_WRITER);
- dvp->dv_ref--;
- rw_exit(&i_dls_vlan_lock);
- return (err);
- }
-
- *dvpp = dvp;
- return (0);
-}
-
-/*
- * Free the dvp if this is a VLAN and this is the last reference.
- */
-void
-dls_vlan_rele(dls_vlan_t *dvp)
-{
- char macname[MAXNAMELEN];
- uint16_t vid;
- boolean_t destroy_vlan = B_FALSE;
-
- dls_mac_rele(dvp->dv_dlp);
-
- rw_enter(&i_dls_vlan_lock, RW_WRITER);
- if (--dvp->dv_ref != 0) {
- rw_exit(&i_dls_vlan_lock);
- return;
- }
-
- if (dvp->dv_id != VLAN_ID_NONE) {
- destroy_vlan = B_TRUE;
- (void) strncpy(macname, dvp->dv_dlp->dl_name, MAXNAMELEN);
- vid = dvp->dv_id;
- }
- rw_exit(&i_dls_vlan_lock);
-
- if (destroy_vlan)
- (void) dls_vlan_destroy(macname, vid);
-}
-
-int
-dls_vlan_setzid(const char *mac, uint16_t vid, zoneid_t zid)
-{
- dls_vlan_t *dvp;
- int err;
- zoneid_t old_zid;
-
- if ((err = dls_vlan_hold(mac, vid, &dvp, B_FALSE, B_TRUE)) != 0)
- return (err);
-
- mutex_enter(&dvp->dv_lock);
- if ((old_zid = dvp->dv_zid) == zid) {
- mutex_exit(&dvp->dv_lock);
- goto done;
- }
-
- /*
- * Check whether this dvp is used by its own zones, if yes,
- * we cannot change its zoneid.
- */
- if (dvp->dv_zone_ref != 0) {
- mutex_exit(&dvp->dv_lock);
- err = EBUSY;
- goto done;
- }
-
- if (zid == GLOBAL_ZONEID) {
- /*
- * Move the link from the local zone to the global zone,
- * and release the reference to this link. At the same time
- * reset the link's active state so that an aggregation is
- * allowed to be created over it.
- */
- dvp->dv_zid = zid;
- mutex_exit(&dvp->dv_lock);
- dls_mac_active_clear(dvp->dv_dlp);
- dls_vlan_rele(dvp);
- goto done;
- } else if (old_zid == GLOBAL_ZONEID) {
- /*
- * Move the link from the global zone to the local zone,
- * and hold a reference to this link. Also, set the link
- * to the "active" state so that the global zone is
- * not able to create an aggregation over this link.
- * TODO: revisit once we allow creating aggregations
- * within a local zone.
- */
- if (!dls_mac_active_set(dvp->dv_dlp)) {
- mutex_exit(&dvp->dv_lock);
- err = EBUSY;
- goto done;
- }
- dvp->dv_zid = zid;
- mutex_exit(&dvp->dv_lock);
- return (0);
- } else {
- /*
- * Move the link from a local zone to another local zone.
- */
- dvp->dv_zid = zid;
- mutex_exit(&dvp->dv_lock);
- }
-
-done:
- dls_vlan_rele(dvp);
- return (err);
-}
-
-/*
- * Find dev_info_t based on the minor node of the link.
- */
-dev_info_t *
-dls_finddevinfo(dev_t dev)
-{
- dls_vlan_t *dvp;
- dev_info_t *dip;
-
- if (dls_vlan_hold_by_dev(dev, &dvp) != 0)
- return (NULL);
-
- dip = dvp->dv_dip;
- dls_vlan_rele(dvp);
- return (dip);
-}
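
The removed dls_vlan_create() derives a VLAN's style-1 /dev node name
from the driver name, VLAN id and PPA as "<driver><vid * 1000 + ppa>".
A minimal user-space sketch of that naming scheme (illustrative only,
not part of the tree):

#include <stdio.h>

/*
 * Mirror the node-name computation from the removed dls_vlan_create():
 * VLAN 2 over bge0 (ppa 0) yields "bge2000".
 */
static void
vlan_node_name(const char *driver, int vid, int ppa,
    char *node, size_t len)
{
	(void) snprintf(node, len, "%s%d", driver, vid * 1000 + ppa);
}

int
main(void)
{
	char node[64];

	vlan_node_name("bge", 2, 0, node, sizeof (node));
	(void) printf("%s\n", node);		/* prints "bge2000" */
	return (0);
}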
diff --git a/usr/src/uts/common/io/dmfe/dmfe_impl.h b/usr/src/uts/common/io/dmfe/dmfe_impl.h
index 6792f540bd..978229574d 100644
--- a/usr/src/uts/common/io/dmfe/dmfe_impl.h
+++ b/usr/src/uts/common/io/dmfe/dmfe_impl.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_DMFE_IMPL_H
#define _SYS_DMFE_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -56,7 +54,7 @@ extern "C" {
#include <sys/sunddi.h>
#include <sys/miiregs.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include "dmfe.h"
diff --git a/usr/src/uts/common/io/dmfe/dmfe_main.c b/usr/src/uts/common/io/dmfe/dmfe_main.c
index 152c14f1e8..c231f61ec4 100644
--- a/usr/src/uts/common/io/dmfe/dmfe_main.c
+++ b/usr/src/uts/common/io/dmfe/dmfe_main.c
@@ -207,12 +207,11 @@ static int dmfe_m_promisc(void *, boolean_t);
static int dmfe_m_multicst(void *, boolean_t, const uint8_t *);
static int dmfe_m_unicst(void *, const uint8_t *);
static void dmfe_m_ioctl(void *, queue_t *, mblk_t *);
-static boolean_t dmfe_m_getcapab(void *, mac_capab_t, void *);
static mblk_t *dmfe_m_tx(void *, mblk_t *);
static int dmfe_m_stat(void *, uint_t, uint64_t *);
static mac_callbacks_t dmfe_m_callbacks = {
- (MC_IOCTL | MC_GETCAPAB),
+ (MC_IOCTL),
dmfe_m_stat,
dmfe_m_start,
dmfe_m_stop,
@@ -220,9 +219,8 @@ static mac_callbacks_t dmfe_m_callbacks = {
dmfe_m_multicst,
dmfe_m_unicst,
dmfe_m_tx,
- NULL,
dmfe_m_ioctl,
- dmfe_m_getcapab,
+ NULL,
};
@@ -1621,46 +1619,6 @@ dmfe_m_promisc(void *arg, boolean_t on)
return (0);
}
-/*ARGSUSED*/
-static boolean_t
-dmfe_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
-{
- /*
- * Note that the chip could support some form of polling and
- * multiaddress support. We should look into adding polling
- * support later, once Solaris is better positioned to take
- * advantage of it, although it may be of little use since
- * even a lowly 500MHz US-IIe should be able to keep up with
- * 100Mbps. (Esp. if the packets are not unreasonably sized.)
- *
- * Multiaddress support, however, is likely to be of more
- * utility with crossbow and virtualized NICs. Although, the
- * fact that dmfe is only supported on low-end US-IIe hardware
- * makes one wonder whether VNICs are likely to be used on
- * such platforms. The chip certainly supports the notion,
- * since it can be run in HASH-ONLY mode. (Though this would
- * require software to drop unicast packets that are
- * incorrectly received due to hash collision of the
- * destination mac address.)
- *
- * Interestingly enough, modern Davicom chips (the 9102D)
- * support full IP checksum offload, though its unclear
- * whether any of these chips are used on any systems that can
- * run Solaris.
- *
- * If this driver is ever supported on x86 hardware, then
- * these assumptions should be revisited.
- */
- switch (cap) {
- case MAC_CAPAB_POLL:
- case MAC_CAPAB_MULTIADDRESS:
- case MAC_CAPAB_HCKSUM:
- default:
- return (B_FALSE);
- }
-}
-
-
#undef DMFE_DBG
diff --git a/usr/src/uts/common/io/e1000g/e1000g_main.c b/usr/src/uts/common/io/e1000g/e1000g_main.c
index 8bde171cbb..e7fe619c3e 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_main.c
+++ b/usr/src/uts/common/io/e1000g/e1000g_main.c
@@ -64,8 +64,6 @@ static uint_t e1000g_intr_pciexpress(caddr_t);
static uint_t e1000g_intr(caddr_t);
static void e1000g_intr_work(struct e1000g *, uint32_t);
#pragma inline(e1000g_intr_work)
-static uint32_t e1000g_get_itr(uint32_t, uint32_t, uint32_t);
-#pragma inline(e1000g_get_itr)
static int e1000g_init(struct e1000g *);
static int e1000g_start(struct e1000g *, boolean_t);
static void e1000g_stop(struct e1000g *, boolean_t);
@@ -73,11 +71,6 @@ static int e1000g_m_start(void *);
static void e1000g_m_stop(void *);
static int e1000g_m_promisc(void *, boolean_t);
static boolean_t e1000g_m_getcapab(void *, mac_capab_t, void *);
-static int e1000g_m_unicst(void *, const uint8_t *);
-static int e1000g_m_unicst_add(void *, mac_multi_addr_t *);
-static int e1000g_m_unicst_remove(void *, mac_addr_slot_t);
-static int e1000g_m_unicst_modify(void *, mac_multi_addr_t *);
-static int e1000g_m_unicst_get(void *, mac_multi_addr_t *);
static int e1000g_m_multicst(void *, boolean_t, const uint8_t *);
static void e1000g_m_ioctl(void *, queue_t *, mblk_t *);
static int e1000g_m_setprop(void *, const char *, mac_prop_id_t,
@@ -98,7 +91,7 @@ static int e1000g_register_mac(struct e1000g *);
static boolean_t e1000g_rx_drain(struct e1000g *);
static boolean_t e1000g_tx_drain(struct e1000g *);
static void e1000g_init_unicst(struct e1000g *);
-static int e1000g_unicst_set(struct e1000g *, const uint8_t *, mac_addr_slot_t);
+static int e1000g_unicst_set(struct e1000g *, const uint8_t *, int);
/*
* Local routines
@@ -172,10 +165,8 @@ mac_priv_prop_t e1000g_priv_props[] = {
{"_rx_intr_abs_delay", MAC_PROP_PERM_RW},
{"_intr_throttling_rate", MAC_PROP_PERM_RW},
{"_intr_adaptive", MAC_PROP_PERM_RW},
- {"_tx_recycle_thresh", MAC_PROP_PERM_RW},
{"_adv_pause_cap", MAC_PROP_PERM_READ},
{"_adv_asym_pause_cap", MAC_PROP_PERM_READ},
- {"_tx_recycle_num", MAC_PROP_PERM_RW}
};
#define E1000G_MAX_PRIV_PROPS \
(sizeof (e1000g_priv_props)/sizeof (mac_priv_prop_t))
@@ -245,9 +236,8 @@ static mac_callbacks_t e1000g_m_callbacks = {
e1000g_m_stop,
e1000g_m_promisc,
e1000g_m_multicst,
- e1000g_m_unicst,
- e1000g_m_tx,
NULL,
+ e1000g_m_tx,
e1000g_m_ioctl,
e1000g_m_getcapab,
NULL,
@@ -607,6 +597,7 @@ e1000g_register_mac(struct e1000g *Adapter)
mac->m_margin = VLAN_TAGSZ;
mac->m_priv_props = e1000g_priv_props;
mac->m_priv_prop_count = E1000G_MAX_PRIV_PROPS;
+ mac->m_v12n = MAC_VIRT_LEVEL1;
err = mac_register(mac, &Adapter->mh);
mac_free(mac);
@@ -935,17 +926,17 @@ e1000g_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
if (Adapter == NULL)
return (DDI_FAILURE);
+ rx_drain = e1000g_rx_drain(Adapter);
+ if (!rx_drain && !e1000g_force_detach)
+ return (DDI_FAILURE);
+
if (mac_unregister(Adapter->mh) != 0) {
e1000g_log(Adapter, CE_WARN, "Unregister MAC failed");
return (DDI_FAILURE);
}
Adapter->attach_progress &= ~ATTACH_PROGRESS_MAC;
-
- if (Adapter->chip_state != E1000G_STOP)
- e1000g_stop(Adapter, B_TRUE);
-
- rx_drain = e1000g_rx_drain(Adapter);
+ ASSERT(Adapter->chip_state == E1000G_STOP);
/*
* If e1000g_force_detach is enabled, driver detach is safe.
@@ -955,9 +946,6 @@ e1000g_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
*/
if (e1000g_force_detach) {
e1000g_free_priv_devi_node(Adapter, rx_drain);
- } else {
- if (!rx_drain)
- return (DDI_FAILURE);
}
e1000g_unattach(devinfo, Adapter);
@@ -1122,6 +1110,8 @@ e1000g_init_locks(struct e1000g *Adapter)
MUTEX_DRIVER, DDI_INTR_PRI(Adapter->intr_pri));
mutex_init(&rx_ring->freelist_lock, NULL,
MUTEX_DRIVER, DDI_INTR_PRI(Adapter->intr_pri));
+ mutex_init(&rx_ring->recycle_lock, NULL,
+ MUTEX_DRIVER, DDI_INTR_PRI(Adapter->intr_pri));
}
static void
@@ -1138,6 +1128,7 @@ e1000g_destroy_locks(struct e1000g *Adapter)
rx_ring = Adapter->rx_ring;
mutex_destroy(&rx_ring->rx_lock);
mutex_destroy(&rx_ring->freelist_lock);
+ mutex_destroy(&rx_ring->recycle_lock);
mutex_destroy(&Adapter->link_lock);
mutex_destroy(&Adapter->watchdog_lock);
@@ -1432,6 +1423,8 @@ e1000g_init(struct e1000g *Adapter)
goto init_fail;
}
+ Adapter->poll_mode = e1000g_poll_mode;
+
rw_exit(&Adapter->chip_lock);
return (DDI_SUCCESS);
@@ -1549,6 +1542,106 @@ e1000g_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
}
}
+/*
+ * The default value of e1000g_poll_mode == 0 assumes that the NIC is
+ * capable of supporting only one interrupt and that we shouldn't
+ * disable the physical interrupt. In this case we let the interrupt
+ * fire and queue the packets on the rx ring itself if we are in
+ * polling mode (better latency, but slightly lower performance and a
+ * very high interrupt count in mpstat, which is harmless).
+ *
+ * e1000g_poll_mode == 1 assumes that we have a per-Rx-ring interrupt
+ * which can be disabled in poll mode. This gives better overall
+ * throughput (compared to the mode above) and a very low interrupt
+ * count, but slightly higher latency, since we pick up the packets
+ * only when the poll thread does its polling.
+ *
+ * Currently, this flag should be enabled only while doing performance
+ * measurement or when it can be guaranteed that the entire NIC going
+ * into poll mode will not harm any traffic, such as cluster heartbeats.
+ */
+int e1000g_poll_mode = 0;
+
+/*
+ * Called from the upper layers when driver is in polling mode to
+ * pick up any queued packets. Care should be taken to not block
+ * this thread.
+ */
+static mblk_t *
+e1000g_poll_ring(void *arg, int bytes_to_pickup)
+{
+ e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)arg;
+ mblk_t *mp = NULL;
+ mblk_t *tail;
+ uint_t sz = 0;
+ struct e1000g *adapter;
+
+ adapter = rx_ring->adapter;
+
+ mutex_enter(&rx_ring->rx_lock);
+ ASSERT(rx_ring->poll_flag);
+
+ /*
+ * Get any packets that have arrived. Works only if we
+ * actually disable the physical adapter/rx_ring interrupt
+ * (e1000g_poll_mode == 1). If e1000g_poll_mode == 0,
+ * packets will have already been added to the poll list
+ * by the interrupt (see e1000g_intr_work()).
+ */
+ if (adapter->poll_mode) {
+ mp = e1000g_receive(rx_ring, &tail, &sz);
+ if (mp != NULL) {
+ if (rx_ring->poll_list_head == NULL)
+ rx_ring->poll_list_head = mp;
+ else
+ rx_ring->poll_list_tail->b_next = mp;
+ rx_ring->poll_list_tail = tail;
+ rx_ring->poll_list_sz += sz;
+ }
+ }
+
+ mp = rx_ring->poll_list_head;
+ if (mp == NULL) {
+ mutex_exit(&rx_ring->rx_lock);
+ return (NULL);
+ }
+
+ /* Check if we can sendup the entire chain */
+ if (bytes_to_pickup >= rx_ring->poll_list_sz) {
+ mp = rx_ring->poll_list_head;
+ rx_ring->poll_list_head = NULL;
+ rx_ring->poll_list_tail = NULL;
+ rx_ring->poll_list_sz = 0;
+ mutex_exit(&rx_ring->rx_lock);
+ return (mp);
+ }
+
+ /*
+ * We need to find out how much of the chain we can send up.
+ * We are guaranteed that at least one packet will go up,
+ * since we already checked that above.
+ */
+ tail = mp;
+ sz = 0;
+ while (mp != NULL) {
+ sz += MBLKL(mp);
+ if (sz > bytes_to_pickup) {
+ sz -= MBLKL(mp);
+ break;
+ }
+ tail = mp;
+ mp = mp->b_next;
+ }
+
+ mp = rx_ring->poll_list_head;
+ rx_ring->poll_list_head = tail->b_next;
+ if (rx_ring->poll_list_head == NULL)
+ rx_ring->poll_list_tail = NULL;
+ rx_ring->poll_list_sz -= sz;
+ tail->b_next = NULL;
+ mutex_exit(&rx_ring->rx_lock);
+ return (mp);
+}
+
static int
e1000g_m_start(void *arg)
{
@@ -1912,7 +2005,6 @@ e1000g_intr_work(struct e1000g *Adapter, uint32_t icr)
struct e1000_hw *hw;
hw = &Adapter->shared;
e1000g_tx_ring_t *tx_ring = Adapter->tx_ring;
- uint32_t itr;
Adapter->rx_pkt_cnt = 0;
Adapter->tx_pkt_cnt = 0;
@@ -1929,16 +2021,79 @@ e1000g_intr_work(struct e1000g *Adapter, uint32_t icr)
}
if (icr & E1000_ICR_RXT0) {
- mblk_t *mp;
+ mblk_t *mp;
+ uint_t sz = 0;
+ mblk_t *tmp, *tail = NULL;
+ e1000g_rx_ring_t *rx_ring;
- mutex_enter(&Adapter->rx_ring->rx_lock);
- mp = e1000g_receive(Adapter);
- mutex_exit(&Adapter->rx_ring->rx_lock);
+ rx_ring = Adapter->rx_ring;
+ mutex_enter(&rx_ring->rx_lock);
+ /*
+ * If the real interrupt for the Rx ring was
+ * not disabled (e1000g_poll_mode == 0), then
+ * we still pick up the packets and queue them
+ * on the Rx ring if we are in polling mode.
+ * This enables the polling thread to pick up
+ * packets really fast in polling mode and
+ * helps improve latency.
+ */
+ mp = e1000g_receive(rx_ring, &tail, &sz);
rw_exit(&Adapter->chip_lock);
- if (mp != NULL)
- mac_rx(Adapter->mh, Adapter->mrh, mp);
+ if (mp != NULL) {
+ ASSERT(tail != NULL);
+ if (!rx_ring->poll_flag) {
+ /*
+ * If not polling, see if something was
+ * already queued. Take care not to
+ * reorder packets.
+ */
+ if (rx_ring->poll_list_head == NULL) {
+ mutex_exit(&rx_ring->rx_lock);
+ mac_rx_ring(Adapter->mh, rx_ring->mrh,
+ mp, rx_ring->ring_gen_num);
+ } else {
+ tmp = rx_ring->poll_list_head;
+ rx_ring->poll_list_head = NULL;
+ rx_ring->poll_list_tail->b_next = mp;
+ rx_ring->poll_list_tail = NULL;
+ rx_ring->poll_list_sz = 0;
+ mutex_exit(&rx_ring->rx_lock);
+ mac_rx_ring(Adapter->mh, rx_ring->mrh,
+ tmp, rx_ring->ring_gen_num);
+ }
+ } else {
+ /*
+ * We are in a polling mode. Put the
+ * processed packets on the poll list.
+ */
+ if (rx_ring->poll_list_head == NULL)
+ rx_ring->poll_list_head = mp;
+ else
+ rx_ring->poll_list_tail->b_next = mp;
+ rx_ring->poll_list_tail = tail;
+ rx_ring->poll_list_sz += sz;
+ mutex_exit(&rx_ring->rx_lock);
+ }
+ } else if (!rx_ring->poll_flag &&
+ rx_ring->poll_list_head != NULL) {
+ /*
+ * Nothing new has arrived (then why
+ * was the interrupt raised?). Check
+ * if something was queued from the
+ * last time.
+ */
+ tmp = rx_ring->poll_list_head;
+ rx_ring->poll_list_head = NULL;
+ rx_ring->poll_list_tail = NULL;
+ rx_ring->poll_list_sz = 0;
+ mutex_exit(&rx_ring->rx_lock);
+ mac_rx_ring(Adapter->mh, rx_ring->mrh,
+ tmp, rx_ring->ring_gen_num);
+ } else {
+ mutex_exit(&rx_ring->rx_lock);
+ }
} else
rw_exit(&Adapter->chip_lock);
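
Collapsed to its essentials, the dispatch above has three cases, keyed
on the ring's poll flag and a possibly non-empty poll list, with the
invariant that packets already queued are always delivered ahead of
newly received ones. A compact model of that policy (names are
illustrative stand-ins, locking omitted; deliver() plays the role of
mac_rx_ring()):

typedef struct pkt { struct pkt *b_next; } pkt_t;

typedef struct ring {
	int	poll_flag;	/* nonzero: poll thread will pick up */
	pkt_t	*q_head;	/* queued, not yet delivered packets */
	pkt_t	*q_tail;
} ring_t;

static void
deliver(pkt_t *chain)		/* stands in for mac_rx_ring() */
{
	(void) chain;
}

static void
rx_dispatch(ring_t *r, pkt_t *newp, pkt_t *newtail)
{
	if (newp != NULL && r->q_head != NULL) {
		/* Chain new packets behind queued ones: no reordering. */
		r->q_tail->b_next = newp;
		r->q_tail = newtail;
		newp = NULL;
	}
	if (!r->poll_flag && r->q_head != NULL) {
		/* Interrupt mode: send the whole queue up now. */
		deliver(r->q_head);
		r->q_head = r->q_tail = NULL;
	} else if (!r->poll_flag && newp != NULL) {
		/* Interrupt mode, nothing queued: send directly. */
		deliver(newp);
	} else if (newp != NULL) {
		/* Poll mode, empty queue: start the queue. */
		r->q_head = newp;
		r->q_tail = newtail;
	}
}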
@@ -1952,7 +2107,6 @@ e1000g_intr_work(struct e1000g *Adapter, uint32_t icr)
E1000G_DEBUG_STAT(tx_ring->stat_recycle_intr);
rw_exit(&Adapter->chip_lock);
- /* Schedule the re-transmit */
if (tx_ring->resched_needed &&
(tx_ring->tbd_avail > DEFAULT_TX_UPDATE_THRESHOLD)) {
tx_ring->resched_needed = B_FALSE;
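
The reschedule logic here is one half of a handshake: when the send
path runs out of descriptors it records a timestamp and sets
resched_needed, and either this interrupt path or the new
e1000g_timer_tx_resched() watchdog clears the flag and calls
mac_tx_update() once enough descriptors are free. A minimal model of
that handshake (illustrative names; resume() stands in for
mac_tx_update()):

typedef struct txstate {
	int		resched_needed;		/* blocked flag */
	long		resched_timestamp;	/* when we blocked */
	unsigned	tbd_avail;		/* free descriptors */
} txstate_t;

static void
resume(void)			/* stands in for mac_tx_update() */
{
}

/* Send path, on descriptor exhaustion. */
static void
tx_block(txstate_t *t, long now)
{
	t->resched_timestamp = now;
	t->resched_needed = 1;
}

/* Interrupt or timer path, once descriptors are recycled. */
static void
tx_unblock(txstate_t *t, unsigned threshold)
{
	if (t->resched_needed && t->tbd_avail > threshold) {
		t->resched_needed = 0;
		resume();
	}
}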
@@ -1961,15 +2115,6 @@ e1000g_intr_work(struct e1000g *Adapter, uint32_t icr)
}
}
- if (Adapter->intr_adaptive) {
- itr = e1000g_get_itr(Adapter->rx_pkt_cnt, Adapter->tx_pkt_cnt,
- Adapter->intr_throttling_rate);
- if (itr) {
- E1000_WRITE_REG(hw, E1000_ITR, itr);
- Adapter->intr_throttling_rate = itr;
- }
- }
-
/*
* The Receive Sequence errors RXSEQ and the link status change LSC
* are checked to detect that the cable has been pulled out. For
@@ -2040,40 +2185,6 @@ e1000g_intr_work(struct e1000g *Adapter, uint32_t icr)
}
}
-static uint32_t
-e1000g_get_itr(uint32_t rx_packet, uint32_t tx_packet, uint32_t cur_itr)
-{
- uint32_t new_itr;
-
- /*
- * Determine a propper itr according to rx/tx packet count
- * per interrupt, the value of itr are based on document
- * and testing.
- */
- if ((rx_packet < DEFAULT_INTR_PACKET_LOW) ||
- (tx_packet < DEFAULT_INTR_PACKET_LOW)) {
- new_itr = DEFAULT_INTR_THROTTLING_LOW;
- goto itr_done;
- }
- if ((rx_packet > DEFAULT_INTR_PACKET_HIGH) ||
- (tx_packet > DEFAULT_INTR_PACKET_HIGH)) {
- new_itr = DEFAULT_INTR_THROTTLING_LOW;
- goto itr_done;
- }
- if (cur_itr < DEFAULT_INTR_THROTTLING_HIGH) {
- new_itr = cur_itr + (DEFAULT_INTR_THROTTLING_HIGH >> 2);
- if (new_itr > DEFAULT_INTR_THROTTLING_HIGH)
- new_itr = DEFAULT_INTR_THROTTLING_HIGH;
- } else
- new_itr = DEFAULT_INTR_THROTTLING_HIGH;
-
-itr_done:
- if (cur_itr == new_itr)
- return (0);
- else
- return (new_itr);
-}
-
static void
e1000g_init_unicst(struct e1000g *Adapter)
{
@@ -2082,45 +2193,33 @@ e1000g_init_unicst(struct e1000g *Adapter)
hw = &Adapter->shared;
- if (!Adapter->unicst_init) {
+ if (Adapter->init_count == 0) {
/* Initialize the multiple unicast addresses */
Adapter->unicst_total = MAX_NUM_UNICAST_ADDRESSES;
+ /* Workaround for an erratum of the 82571 chipset */
if ((hw->mac.type == e1000_82571) &&
(e1000_get_laa_state_82571(hw) == B_TRUE))
Adapter->unicst_total--;
- Adapter->unicst_avail = Adapter->unicst_total - 1;
+ Adapter->unicst_avail = Adapter->unicst_total;
- /* Store the default mac address */
- e1000_rar_set(hw, hw->mac.addr, 0);
- if ((hw->mac.type == e1000_82571) &&
- (e1000_get_laa_state_82571(hw) == B_TRUE))
- e1000_rar_set(hw, hw->mac.addr, LAST_RAR_ENTRY);
-
- bcopy(hw->mac.addr, Adapter->unicst_addr[0].mac.addr,
- ETHERADDRL);
- Adapter->unicst_addr[0].mac.set = 1;
-
- for (slot = 1; slot < Adapter->unicst_total; slot++)
- Adapter->unicst_addr[slot].mac.set = 0;
-
- Adapter->unicst_init = B_TRUE;
+ for (slot = 0; slot < Adapter->unicst_total; slot++) {
+ /* Clear both the flag and MAC address */
+ Adapter->unicst_addr[slot].reg.high = 0;
+ Adapter->unicst_addr[slot].reg.low = 0;
+ }
} else {
- /* Recover the default mac address */
- bcopy(Adapter->unicst_addr[0].mac.addr, hw->mac.addr,
- ETHERADDRL);
-
- /* Store the default mac address */
- e1000_rar_set(hw, hw->mac.addr, 0);
+ /* Workaround for an erratum of the 82571 chipset */
if ((hw->mac.type == e1000_82571) &&
(e1000_get_laa_state_82571(hw) == B_TRUE))
e1000_rar_set(hw, hw->mac.addr, LAST_RAR_ENTRY);
/* Re-configure the RAR registers */
- for (slot = 1; slot < Adapter->unicst_total; slot++)
- e1000_rar_set(hw,
- Adapter->unicst_addr[slot].mac.addr, slot);
+ for (slot = 0; slot < Adapter->unicst_total; slot++)
+ if (Adapter->unicst_addr[slot].mac.set == 1)
+ e1000_rar_set(hw,
+ Adapter->unicst_addr[slot].mac.addr, slot);
}
if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK)
@@ -2128,22 +2227,8 @@ e1000g_init_unicst(struct e1000g *Adapter)
}
static int
-e1000g_m_unicst(void *arg, const uint8_t *mac_addr)
-{
- struct e1000g *Adapter;
-
- Adapter = (struct e1000g *)arg;
-
- /* Store the default MAC address */
- bcopy(mac_addr, Adapter->shared.mac.addr, ETHERADDRL);
-
- /* Set MAC address in address slot 0, which is the default address */
- return (e1000g_unicst_set(Adapter, mac_addr, 0));
-}
-
-static int
e1000g_unicst_set(struct e1000g *Adapter, const uint8_t *mac_addr,
- mac_addr_slot_t slot)
+ int slot)
{
struct e1000_hw *hw;
@@ -2166,14 +2251,36 @@ e1000g_unicst_set(struct e1000g *Adapter, const uint8_t *mac_addr,
E1000_WRITE_REG(hw, E1000_RCTL, E1000_RCTL_RST);
msec_delay(5);
}
+ if (mac_addr == NULL) {
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, slot << 1, 0);
+ E1000_WRITE_FLUSH(hw);
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, (slot << 1) + 1, 0);
+ E1000_WRITE_FLUSH(hw);
+ /* Clear both the flag and MAC address */
+ Adapter->unicst_addr[slot].reg.high = 0;
+ Adapter->unicst_addr[slot].reg.low = 0;
+ } else {
+ bcopy(mac_addr, Adapter->unicst_addr[slot].mac.addr,
+ ETHERADDRL);
+ e1000_rar_set(hw, (uint8_t *)mac_addr, slot);
+ Adapter->unicst_addr[slot].mac.set = 1;
+ }
- bcopy(mac_addr, Adapter->unicst_addr[slot].mac.addr, ETHERADDRL);
- e1000_rar_set(hw, (uint8_t *)mac_addr, slot);
-
+ /* Workaround for an erratum of the 82571 chipset */
if (slot == 0) {
if ((hw->mac.type == e1000_82571) &&
(e1000_get_laa_state_82571(hw) == B_TRUE))
- e1000_rar_set(hw, (uint8_t *)mac_addr, LAST_RAR_ENTRY);
+ if (mac_addr == NULL) {
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA,
+ slot << 1, 0);
+ E1000_WRITE_FLUSH(hw);
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA,
+ (slot << 1) + 1, 0);
+ E1000_WRITE_FLUSH(hw);
+ } else {
+ e1000_rar_set(hw, (uint8_t *)mac_addr,
+ LAST_RAR_ENTRY);
+ }
}
/*
@@ -2192,7 +2299,6 @@ e1000g_unicst_set(struct e1000g *Adapter, const uint8_t *mac_addr,
}
rw_exit(&Adapter->chip_lock);
-
if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
return (EIO);
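
The register writes in e1000g_unicst_set() above reflect the receive-
address table layout: each RAR slot spans two 32-bit words, so slot n
occupies array indices 2n (the low four bytes of the address) and
2n + 1 (the remaining bytes plus the address-valid bit), and clearing
a slot takes two writes. A sketch of that indexing (reg_write() is an
illustrative stand-in for E1000_WRITE_REG_ARRAY(hw, E1000_RA, ...)):

static void
reg_write(unsigned index, unsigned value)
{
	(void) index;
	(void) value;
}

static void
rar_clear(unsigned slot)
{
	reg_write(slot << 1, 0);	/* low word of the address */
	reg_write((slot << 1) + 1, 0);	/* high word + valid bit */
}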
@@ -2201,163 +2307,6 @@ e1000g_unicst_set(struct e1000g *Adapter, const uint8_t *mac_addr,
return (0);
}
-/*
- * e1000g_m_unicst_add() - will find an unused address slot, set the
- * address value to the one specified, reserve that slot and enable
- * the NIC to start filtering on the new MAC address.
- * Returns 0 on success.
- */
-static int
-e1000g_m_unicst_add(void *arg, mac_multi_addr_t *maddr)
-{
- struct e1000g *Adapter = (struct e1000g *)arg;
- mac_addr_slot_t slot;
- int err;
-
- if (mac_unicst_verify(Adapter->mh,
- maddr->mma_addr, maddr->mma_addrlen) == B_FALSE)
- return (EINVAL);
-
- rw_enter(&Adapter->chip_lock, RW_WRITER);
- if (Adapter->unicst_avail == 0) {
- /* no slots available */
- rw_exit(&Adapter->chip_lock);
- return (ENOSPC);
- }
-
- /*
- * Primary/default address is in slot 0. The next addresses
- * are the multiple MAC addresses. So multiple MAC address 0
- * is in slot 1, 1 in slot 2, and so on. So the first multiple
- * MAC address resides in slot 1.
- */
- for (slot = 1; slot < Adapter->unicst_total; slot++) {
- if (Adapter->unicst_addr[slot].mac.set == 0) {
- Adapter->unicst_addr[slot].mac.set = 1;
- break;
- }
- }
-
- ASSERT((slot > 0) && (slot < Adapter->unicst_total));
-
- Adapter->unicst_avail--;
- rw_exit(&Adapter->chip_lock);
-
- maddr->mma_slot = slot;
-
- if ((err = e1000g_unicst_set(Adapter, maddr->mma_addr, slot)) != 0) {
- rw_enter(&Adapter->chip_lock, RW_WRITER);
- Adapter->unicst_addr[slot].mac.set = 0;
- Adapter->unicst_avail++;
- rw_exit(&Adapter->chip_lock);
- }
-
- return (err);
-}
-
-/*
- * e1000g_m_unicst_remove() - removes a MAC address that was added by a
- * call to e1000g_m_unicst_add(). The slot number that was returned in
- * e1000g_m_unicst_add() is passed in the call to remove the address.
- * Returns 0 on success.
- */
-static int
-e1000g_m_unicst_remove(void *arg, mac_addr_slot_t slot)
-{
- struct e1000g *Adapter = (struct e1000g *)arg;
- int err;
-
- if ((slot <= 0) || (slot >= Adapter->unicst_total))
- return (EINVAL);
-
- rw_enter(&Adapter->chip_lock, RW_WRITER);
- if (Adapter->unicst_addr[slot].mac.set == 1) {
- Adapter->unicst_addr[slot].mac.set = 0;
- Adapter->unicst_avail++;
- rw_exit(&Adapter->chip_lock);
-
- /* Copy the default address to the passed slot */
- if ((err = e1000g_unicst_set(Adapter,
- Adapter->unicst_addr[0].mac.addr, slot)) != 0) {
- rw_enter(&Adapter->chip_lock, RW_WRITER);
- Adapter->unicst_addr[slot].mac.set = 1;
- Adapter->unicst_avail--;
- rw_exit(&Adapter->chip_lock);
- }
- return (err);
- }
- rw_exit(&Adapter->chip_lock);
-
- return (EINVAL);
-}
-
-/*
- * e1000g_m_unicst_modify() - modifies the value of an address that
- * has been added by e1000g_m_unicst_add(). The new address, address
- * length and the slot number that was returned in the call to add
- * should be passed to e1000g_m_unicst_modify(). mma_flags should be
- * set to 0. Returns 0 on success.
- */
-static int
-e1000g_m_unicst_modify(void *arg, mac_multi_addr_t *maddr)
-{
- struct e1000g *Adapter = (struct e1000g *)arg;
- mac_addr_slot_t slot;
-
- if (mac_unicst_verify(Adapter->mh,
- maddr->mma_addr, maddr->mma_addrlen) == B_FALSE)
- return (EINVAL);
-
- slot = maddr->mma_slot;
-
- if ((slot <= 0) || (slot >= Adapter->unicst_total))
- return (EINVAL);
-
- rw_enter(&Adapter->chip_lock, RW_WRITER);
- if (Adapter->unicst_addr[slot].mac.set == 1) {
- rw_exit(&Adapter->chip_lock);
-
- return (e1000g_unicst_set(Adapter, maddr->mma_addr, slot));
- }
- rw_exit(&Adapter->chip_lock);
-
- return (EINVAL);
-}
-
-/*
- * e1000g_m_unicst_get() - will get the MAC address and all other
- * information related to the address slot passed in mac_multi_addr_t.
- * mma_flags should be set to 0 in the call.
- * On return, mma_flags can take the following values:
- * 1) MMAC_SLOT_UNUSED
- * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR
- * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR
- * 4) MMAC_SLOT_USED
- */
-static int
-e1000g_m_unicst_get(void *arg, mac_multi_addr_t *maddr)
-{
- struct e1000g *Adapter = (struct e1000g *)arg;
- mac_addr_slot_t slot;
-
- slot = maddr->mma_slot;
-
- if ((slot <= 0) || (slot >= Adapter->unicst_total))
- return (EINVAL);
-
- rw_enter(&Adapter->chip_lock, RW_WRITER);
- if (Adapter->unicst_addr[slot].mac.set == 1) {
- bcopy(Adapter->unicst_addr[slot].mac.addr,
- maddr->mma_addr, ETHERADDRL);
- maddr->mma_flags = MMAC_SLOT_USED;
- } else {
- maddr->mma_flags = MMAC_SLOT_UNUSED;
- }
- rw_exit(&Adapter->chip_lock);
-
- return (0);
-}
-
static int
multicst_add(struct e1000g *Adapter, const uint8_t *multiaddr)
{
@@ -2586,6 +2535,274 @@ e1000g_m_promisc(void *arg, boolean_t on)
return (0);
}
+/*
+ * Entry points to enable and disable interrupts at the granularity of
+ * a group.
+ * These turn poll_mode for the whole adapter on and off, to enable or
+ * override the ring-level polling control over the hardware interrupts.
+ */
+static int
+e1000g_rx_group_intr_enable(mac_intr_handle_t arg)
+{
+ struct e1000g *adapter = (struct e1000g *)arg;
+ e1000g_rx_ring_t *rx_ring = adapter->rx_ring;
+
+ /*
+ * Later interrupts at the granularity of this ring will
+ * invoke mac_rx() with NULL, indicating the need for
+ * software classification.
+ * We have a single usable ring per adapter now, so we only need to
+ * reset the rx handle for that one.
+ * When more RX rings can be used, we should update each one of them.
+ */
+ mutex_enter(&rx_ring->rx_lock);
+ rx_ring->mrh = NULL;
+ adapter->poll_mode = B_FALSE;
+ mutex_exit(&rx_ring->rx_lock);
+ return (0);
+}
+
+static int
+e1000g_rx_group_intr_disable(mac_intr_handle_t arg)
+{
+ struct e1000g *adapter = (struct e1000g *)arg;
+ e1000g_rx_ring_t *rx_ring = adapter->rx_ring;
+
+ mutex_enter(&rx_ring->rx_lock);
+
+ /*
+ * Later interrupts at the granularity of this ring will
+ * invoke mac_rx() with the handle for this ring.
+ */
+ adapter->poll_mode = B_TRUE;
+ rx_ring->mrh = rx_ring->mrh_init;
+ mutex_exit(&rx_ring->rx_lock);
+ return (0);
+}
+
+/*
+ * Entry points to enable and disable interrupts at the granularity of
+ * a ring.
+ * The adapter's poll_mode controls whether we actually proceed with
+ * hardware interrupt toggling.
+ */
+static int
+e1000g_rx_ring_intr_enable(mac_intr_handle_t intrh)
+{
+ e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)intrh;
+ struct e1000g *adapter = rx_ring->adapter;
+ struct e1000_hw *hw = &adapter->shared;
+ uint32_t intr_mask;
+ boolean_t poll_mode;
+
+ mutex_enter(&rx_ring->rx_lock);
+ rx_ring->poll_flag = 0;
+ poll_mode = adapter->poll_mode;
+ mutex_exit(&rx_ring->rx_lock);
+
+ if (poll_mode) {
+ /* Rx interrupt enabling for MSI and legacy */
+ intr_mask = E1000_READ_REG(hw, E1000_IMS);
+ intr_mask |= E1000_IMS_RXT0;
+ E1000_WRITE_REG(hw, E1000_IMS, intr_mask);
+ E1000_WRITE_FLUSH(hw);
+
+ /* Trigger a Rx interrupt to check Rx ring */
+ E1000_WRITE_REG(hw, E1000_ICS, E1000_IMS_RXT0);
+ E1000_WRITE_FLUSH(hw);
+ }
+ return (0);
+}
+
+static int
+e1000g_rx_ring_intr_disable(mac_intr_handle_t intrh)
+{
+ e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)intrh;
+ struct e1000g *adapter = rx_ring->adapter;
+ struct e1000_hw *hw = &adapter->shared;
+ boolean_t poll_mode;
+
+ /*
+ * Once the adapter can support per-Rx-ring interrupts,
+ * we should disable the real interrupt instead of just setting
+ * the flag.
+ */
+ mutex_enter(&rx_ring->rx_lock);
+ rx_ring->poll_flag = 1;
+ poll_mode = adapter->poll_mode;
+ mutex_exit(&rx_ring->rx_lock);
+
+ if (poll_mode) {
+ /* Rx interrupt disabling for MSI and legacy */
+ E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0);
+ E1000_WRITE_FLUSH(hw);
+ }
+ return (0);
+}
+
+/*
+ * e1000g_unicst_find - Find the slot for the specified unicast address
+ */
+static int
+e1000g_unicst_find(struct e1000g *Adapter, const uint8_t *mac_addr)
+{
+ int slot;
+
+ ASSERT(mutex_owned(&Adapter->gen_lock));
+
+ for (slot = 0; slot < Adapter->unicst_total; slot++) {
+ if (Adapter->unicst_addr[slot].mac.set == 1) {
+ if (bcmp(Adapter->unicst_addr[slot].mac.addr,
+ mac_addr, ETHERADDRL) == 0)
+ return (slot);
+ } else
+ continue;
+ }
+
+ return (-1);
+}
+
+/*
+ * Entry points to add and remove a MAC address to a ring group.
+ * The caller takes care of adding and removing the MAC addresses
+ * to the filter via these two routines.
+ */
+
+static int
+e1000g_addmac(void *arg, const uint8_t *mac_addr)
+{
+ struct e1000g *Adapter = (struct e1000g *)arg;
+ int slot;
+
+ mutex_enter(&Adapter->gen_lock);
+
+ if (e1000g_unicst_find(Adapter, mac_addr) != -1) {
+ /* The same address is already in slot */
+ mutex_exit(&Adapter->gen_lock);
+ return (0);
+ }
+
+ if (Adapter->unicst_avail == 0) {
+ /* no slots available */
+ mutex_exit(&Adapter->gen_lock);
+ return (ENOSPC);
+ }
+
+ /* Search for a free slot */
+ for (slot = 0; slot < Adapter->unicst_total; slot++) {
+ if (Adapter->unicst_addr[slot].mac.set == 0)
+ break;
+ }
+ ASSERT(slot < Adapter->unicst_total);
+
+ e1000g_unicst_set(Adapter, mac_addr, slot);
+ Adapter->unicst_avail--;
+
+ mutex_exit(&Adapter->gen_lock);
+
+ return (0);
+}
+
+static int
+e1000g_remmac(void *arg, const uint8_t *mac_addr)
+{
+ struct e1000g *Adapter = (struct e1000g *)arg;
+ int slot;
+
+ mutex_enter(&Adapter->gen_lock);
+
+ slot = e1000g_unicst_find(Adapter, mac_addr);
+ if (slot == -1) {
+ mutex_exit(&Adapter->gen_lock);
+ return (EINVAL);
+ }
+
+ ASSERT(Adapter->unicst_addr[slot].mac.set);
+
+ /* Clear this slot */
+ e1000g_unicst_set(Adapter, NULL, slot);
+ Adapter->unicst_avail++;
+
+ mutex_exit(&Adapter->gen_lock);
+
+ return (0);
+}
+
+static int
+e1000g_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+{
+ e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)rh;
+
+ mutex_enter(&rx_ring->rx_lock);
+ rx_ring->ring_gen_num = mr_gen_num;
+ mutex_exit(&rx_ring->rx_lock);
+ return (0);
+}
+
+/*
+ * Callback function for the MAC layer to register all rings.
+ *
+ * The hardware supports a single group with currently only one ring
+ * available.
+ * Though not offering virtualization ability per se, exposing the
+ * group/ring still enables polling and interrupt toggling.
+ */
+void
+e1000g_fill_ring(void *arg, mac_ring_type_t rtype, const int grp_index,
+ const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+ struct e1000g *Adapter = (struct e1000g *)arg;
+ e1000g_rx_ring_t *rx_ring = Adapter->rx_ring;
+ mac_intr_t *mintr;
+
+ /*
+ * We advertised only RX group/rings, so the MAC framework shouldn't
+ * ask for anything else.
+ */
+ ASSERT(rtype == MAC_RING_TYPE_RX && grp_index == 0 && ring_index == 0);
+
+ rx_ring->mrh = rx_ring->mrh_init = rh;
+ infop->mri_driver = (mac_ring_driver_t)rx_ring;
+ infop->mri_start = e1000g_ring_start;
+ infop->mri_stop = NULL;
+ infop->mri_poll = e1000g_poll_ring;
+
+ /* Ring level interrupts */
+ mintr = &infop->mri_intr;
+ mintr->mi_handle = (mac_intr_handle_t)rx_ring;
+ mintr->mi_enable = e1000g_rx_ring_intr_enable;
+ mintr->mi_disable = e1000g_rx_ring_intr_disable;
+}
+
+static void
+e1000g_fill_group(void *arg, mac_ring_type_t rtype, const int grp_index,
+ mac_group_info_t *infop, mac_group_handle_t gh)
+{
+ struct e1000g *Adapter = (struct e1000g *)arg;
+ mac_intr_t *mintr;
+
+ /*
+ * We advertised a single RX ring. Getting a request for anything else
+ * signifies a bug in the MAC framework.
+ */
+ ASSERT(rtype == MAC_RING_TYPE_RX && grp_index == 0);
+
+ Adapter->rx_group = gh;
+
+ infop->mgi_driver = (mac_group_driver_t)Adapter;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = e1000g_addmac;
+ infop->mgi_remmac = e1000g_remmac;
+ infop->mgi_count = 1;
+
+ /* Group level interrupts */
+ mintr = &infop->mgi_intr;
+ mintr->mi_handle = (mac_intr_handle_t)Adapter;
+ mintr->mi_enable = e1000g_rx_group_intr_enable;
+ mintr->mi_disable = e1000g_rx_group_intr_disable;
+}
+
static boolean_t
e1000g_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
@@ -2602,34 +2819,6 @@ e1000g_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
return (B_FALSE);
break;
}
- case MAC_CAPAB_POLL:
- /*
- * There's nothing for us to fill in, simply returning
- * B_TRUE stating that we support polling is sufficient.
- */
- break;
-
- case MAC_CAPAB_MULTIADDRESS: {
- multiaddress_capab_t *mmacp = cap_data;
-
- /*
- * The number of MAC addresses made available by
- * this capability is one less than the total as
- * the primary address in slot 0 is counted in
- * the total.
- */
- mmacp->maddr_naddr = Adapter->unicst_total - 1;
- mmacp->maddr_naddrfree = Adapter->unicst_avail;
- /* No multiple factory addresses, set mma_flag to 0 */
- mmacp->maddr_flag = 0;
- mmacp->maddr_handle = Adapter;
- mmacp->maddr_add = e1000g_m_unicst_add;
- mmacp->maddr_remove = e1000g_m_unicst_remove;
- mmacp->maddr_modify = e1000g_m_unicst_modify;
- mmacp->maddr_get = e1000g_m_unicst_get;
- mmacp->maddr_reserve = NULL;
- break;
- }
case MAC_CAPAB_LSO: {
mac_capab_lso_t *cap_lso = cap_data;
@@ -2642,7 +2831,20 @@ e1000g_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
return (B_FALSE);
break;
}
+ case MAC_CAPAB_RINGS: {
+ mac_capab_rings_t *cap_rings = cap_data;
+ /* No TX rings exposed yet */
+ if (cap_rings->mr_type != MAC_RING_TYPE_RX)
+ return (B_FALSE);
+
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_rnum = 1;
+ cap_rings->mr_gnum = 1;
+ cap_rings->mr_rget = e1000g_fill_ring;
+ cap_rings->mr_gget = e1000g_fill_group;
+ break;
+ }
default:
return (B_FALSE);
}
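
The e1000g_addmac()/e1000g_remmac() entry points registered above
implement simple slot bookkeeping: adding an address already in the
table is a no-op, adding with no free slots fails with ENOSPC, and
removing frees the slot. A self-contained user-space model of those
semantics (fixed-size table, no locking; illustrative only):

#include <string.h>
#include <errno.h>

#define	NSLOTS	16
#define	MACLEN	6

static unsigned char	table[NSLOTS][MACLEN];
static int		used[NSLOTS];

static int
find_slot(const unsigned char *mac)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		if (used[i] && memcmp(table[i], mac, MACLEN) == 0)
			return (i);
	return (-1);
}

static int
add_mac(const unsigned char *mac)
{
	int i;

	if (find_slot(mac) != -1)
		return (0);		/* already filtered: no-op */
	for (i = 0; i < NSLOTS; i++) {
		if (!used[i]) {
			(void) memcpy(table[i], mac, MACLEN);
			used[i] = 1;
			return (0);
		}
	}
	return (ENOSPC);		/* no free slots */
}

static int
rem_mac(const unsigned char *mac)
{
	int i = find_slot(mac);

	if (i == -1)
		return (EINVAL);
	used[i] = 0;
	return (0);
}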
@@ -3124,32 +3326,6 @@ e1000g_set_priv_prop(struct e1000g *Adapter, const char *pr_name,
}
return (err);
}
- if (strcmp(pr_name, "_tx_recycle_thresh") == 0) {
- if (pr_val == NULL) {
- err = EINVAL;
- return (err);
- }
- (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
- if (result < MIN_TX_RECYCLE_THRESHOLD ||
- result > MAX_TX_RECYCLE_THRESHOLD)
- err = EINVAL;
- else
- Adapter->tx_recycle_thresh = (uint32_t)result;
- return (err);
- }
- if (strcmp(pr_name, "_tx_recycle_num") == 0) {
- if (pr_val == NULL) {
- err = EINVAL;
- return (err);
- }
- (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
- if (result < MIN_TX_RECYCLE_NUM ||
- result > MAX_TX_RECYCLE_NUM)
- err = EINVAL;
- else
- Adapter->tx_recycle_num = (uint32_t)result;
- return (err);
- }
return (ENOTSUP);
}
@@ -3236,18 +3412,6 @@ e1000g_get_priv_prop(struct e1000g *Adapter, const char *pr_name,
err = 0;
goto done;
}
- if (strcmp(pr_name, "_tx_recycle_thresh") == 0) {
- value = (is_default ? DEFAULT_TX_RECYCLE_THRESHOLD :
- Adapter->tx_recycle_thresh);
- err = 0;
- goto done;
- }
- if (strcmp(pr_name, "_tx_recycle_num") == 0) {
- value = (is_default ? DEFAULT_TX_RECYCLE_NUM :
- Adapter->tx_recycle_num);
- err = 0;
- goto done;
- }
done:
if (err == 0) {
(void) snprintf(pr_val, pr_valsize, "%d", value);
@@ -3368,22 +3532,6 @@ e1000g_get_conf(struct e1000g *Adapter)
B_TRUE : B_FALSE;
/*
- * Tx recycle threshold
- */
- Adapter->tx_recycle_thresh =
- e1000g_get_prop(Adapter, "tx_recycle_thresh",
- MIN_TX_RECYCLE_THRESHOLD, MAX_TX_RECYCLE_THRESHOLD,
- DEFAULT_TX_RECYCLE_THRESHOLD);
-
- /*
- * Tx recycle descriptor number
- */
- Adapter->tx_recycle_num =
- e1000g_get_prop(Adapter, "tx_recycle_num",
- MIN_TX_RECYCLE_NUM, MAX_TX_RECYCLE_NUM,
- DEFAULT_TX_RECYCLE_NUM);
-
- /*
* Hardware checksum enable/disable parameter
*/
Adapter->tx_hcksum_enable =
@@ -3672,6 +3820,23 @@ e1000g_reset_link(struct e1000g *Adapter)
}
static void
+e1000g_timer_tx_resched(struct e1000g *Adapter)
+{
+ e1000g_tx_ring_t *tx_ring = Adapter->tx_ring;
+
+ if (tx_ring->resched_needed &&
+ ((ddi_get_lbolt() - tx_ring->resched_timestamp) >
+ drv_usectohz(1000000)) &&
+ (Adapter->chip_state == E1000G_START) &&
+ (tx_ring->tbd_avail >= DEFAULT_TX_NO_RESOURCE)) {
+ tx_ring->resched_needed = B_FALSE;
+ mac_tx_update(Adapter->mh);
+ E1000G_STAT(tx_ring->stat_reschedule);
+ E1000G_STAT(tx_ring->stat_timer_reschedule);
+ }
+}
+
+static void
e1000g_local_timer(void *ws)
{
struct e1000g *Adapter = (struct e1000g *)ws;
@@ -3683,10 +3848,11 @@ e1000g_local_timer(void *ws)
if (Adapter->chip_state == E1000G_ERROR) {
Adapter->reset_count++;
- if (e1000g_global_reset(Adapter))
+ if (e1000g_global_reset(Adapter)) {
ddi_fm_service_impact(Adapter->dip,
DDI_SERVICE_RESTORED);
- else
+ e1000g_timer_tx_resched(Adapter);
+ } else
ddi_fm_service_impact(Adapter->dip,
DDI_SERVICE_LOST);
return;
@@ -3697,10 +3863,11 @@ e1000g_local_timer(void *ws)
"Tx stall detected. Activate automatic recovery.\n");
e1000g_fm_ereport(Adapter, DDI_FM_DEVICE_STALL);
Adapter->reset_count++;
- if (e1000g_reset_adapter(Adapter))
+ if (e1000g_reset_adapter(Adapter)) {
ddi_fm_service_impact(Adapter->dip,
DDI_SERVICE_RESTORED);
- else
+ e1000g_timer_tx_resched(Adapter);
+ } else
ddi_fm_service_impact(Adapter->dip,
DDI_SERVICE_LOST);
return;
@@ -3769,6 +3936,8 @@ e1000g_local_timer(void *ws)
if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK)
ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
+ else
+ e1000g_timer_tx_resched(Adapter);
restart_watchdog_timer(Adapter);
}
diff --git a/usr/src/uts/common/io/e1000g/e1000g_rx.c b/usr/src/uts/common/io/e1000g/e1000g_rx.c
index 3bb4a5e90f..15d22b8c9a 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_rx.c
+++ b/usr/src/uts/common/io/e1000g/e1000g_rx.c
@@ -20,7 +20,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDLv1.
+ * Use is subject to license terms.
*/
/*
@@ -147,10 +147,16 @@ e1000g_rxfree_func(p_rx_sw_packet_t packet)
}
}
- mutex_enter(&rx_ring->freelist_lock);
- QUEUE_PUSH_TAIL(&rx_ring->free_list, &packet->Link);
- rx_ring->avail_freepkt++;
- mutex_exit(&rx_ring->freelist_lock);
+ /*
+ * Enqueue the recycled packets in a recycle queue. When the freelist
+ * dries up, move the entire chain of packets from the recycle queue
+ * to the freelist. This helps avoid per-packet mutex contention
+ * on the freelist.
+ */
+ mutex_enter(&rx_ring->recycle_lock);
+ QUEUE_PUSH_TAIL(&rx_ring->recycle_list, &packet->Link);
+ rx_ring->recycle_freepkt++;
+ mutex_exit(&rx_ring->recycle_lock);
rw_exit(&e1000g_rx_detach_lock);
}
@@ -236,6 +242,8 @@ e1000g_rx_setup(struct e1000g *Adapter)
/* Init the list of "Free Receive Buffer" */
QUEUE_INIT_LIST(&rx_ring->free_list);
+ /* Init the list of recycled receive buffers */
+ QUEUE_INIT_LIST(&rx_ring->recycle_list);
/*
* Setup Receive list and the Free list. Note that
* the both were allocated in one packet area.
@@ -263,6 +271,7 @@ e1000g_rx_setup(struct e1000g *Adapter)
&packet->Link);
}
rx_ring->avail_freepkt = Adapter->rx_freelist_num;
+ rx_ring->recycle_freepkt = 0;
Adapter->rx_buffer_setup = B_TRUE;
} else {
@@ -414,8 +423,23 @@ e1000g_get_buf(e1000g_rx_ring_t *rx_ring)
mutex_enter(&rx_ring->freelist_lock);
packet = (p_rx_sw_packet_t)
QUEUE_POP_HEAD(&rx_ring->free_list);
- if (packet != NULL)
+ if (packet != NULL) {
rx_ring->avail_freepkt--;
+ } else {
+ /*
+ * If the freelist has no packets, check the recycle list
+ * to see if any packets are available there.
+ */
+ mutex_enter(&rx_ring->recycle_lock);
+ QUEUE_SWITCH(&rx_ring->free_list, &rx_ring->recycle_list);
+ rx_ring->avail_freepkt = rx_ring->recycle_freepkt;
+ rx_ring->recycle_freepkt = 0;
+ mutex_exit(&rx_ring->recycle_lock);
+ packet = (p_rx_sw_packet_t)
+ QUEUE_POP_HEAD(&rx_ring->free_list);
+ if (packet != NULL)
+ rx_ring->avail_freepkt--;
+ }
mutex_exit(&rx_ring->freelist_lock);
return (packet);
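
The pattern above splits buffer recycling across two locks: producers
push returned packets onto a recycle list under recycle_lock, and
consumers pop from the freelist under freelist_lock, switching the
entire recycle chain over in O(1) only when the freelist runs dry.
A minimal user-space model of the idea (pthread locks, LIFO lists
instead of the kernel's FIFO queues; names are illustrative):

#include <pthread.h>
#include <stddef.h>

typedef struct node { struct node *next; } node_t;

static pthread_mutex_t	free_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t	recycle_lock = PTHREAD_MUTEX_INITIALIZER;
static node_t		*free_head;
static node_t		*recycle_head;

/* Producer side: cheap push, touching only recycle_lock. */
static void
recycle(node_t *n)
{
	(void) pthread_mutex_lock(&recycle_lock);
	n->next = recycle_head;
	recycle_head = n;
	(void) pthread_mutex_unlock(&recycle_lock);
}

/* Consumer side: refill from the recycle list only when empty. */
static node_t *
get_buf(void)
{
	node_t *n;

	(void) pthread_mutex_lock(&free_lock);
	if (free_head == NULL) {
		(void) pthread_mutex_lock(&recycle_lock);
		free_head = recycle_head;	/* O(1) list switch */
		recycle_head = NULL;
		(void) pthread_mutex_unlock(&recycle_lock);
	}
	if ((n = free_head) != NULL)
		free_head = n->next;
	(void) pthread_mutex_unlock(&free_lock);
	return (n);
}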
@@ -427,7 +451,7 @@ e1000g_get_buf(e1000g_rx_ring_t *rx_ring)
* This routine will process packets received in an interrupt
*/
mblk_t *
-e1000g_receive(struct e1000g *Adapter)
+e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz)
{
struct e1000_hw *hw;
mblk_t *nmp;
@@ -443,7 +467,7 @@ e1000g_receive(struct e1000g *Adapter)
boolean_t accept_frame;
boolean_t end_of_packet;
boolean_t need_copy;
- e1000g_rx_ring_t *rx_ring;
+ struct e1000g *Adapter;
dma_buffer_t *rx_buf;
uint16_t cksumflags;
@@ -452,9 +476,10 @@ e1000g_receive(struct e1000g *Adapter)
pkt_count = 0;
desc_count = 0;
cksumflags = 0;
+ *sz = 0;
+ Adapter = rx_ring->adapter;
hw = &Adapter->shared;
- rx_ring = Adapter->rx_ring;
/* Sync the Rx descriptor DMA buffers */
(void) ddi_dma_sync(rx_ring->rbd_dma_handle,
@@ -805,6 +830,8 @@ rx_end_of_packet:
ret_nmp = rx_ring->rx_mblk;
}
ret_nmp->b_next = NULL;
+ *tail = ret_nmp;
+ *sz += length;
rx_ring->rx_mblk = NULL;
rx_ring->rx_mblk_tail = NULL;
diff --git a/usr/src/uts/common/io/e1000g/e1000g_stat.c b/usr/src/uts/common/io/e1000g/e1000g_stat.c
index 7df4317e9e..0c67c914a5 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_stat.c
+++ b/usr/src/uts/common/io/e1000g/e1000g_stat.c
@@ -20,7 +20,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDLv1.
+ * Use is subject to license terms.
*/
/*
@@ -185,7 +185,8 @@ e1000g_update_stats(kstat_t *ksp, int rw)
e1000g_ksp->rx_none.value.ul = rx_ring->stat_none;
e1000g_ksp->rx_multi_desc.value.ul = rx_ring->stat_multi_desc;
e1000g_ksp->rx_no_freepkt.value.ul = rx_ring->stat_no_freepkt;
- e1000g_ksp->rx_avail_freepkt.value.ul = rx_ring->avail_freepkt;
+ e1000g_ksp->rx_avail_freepkt.value.ul = rx_ring->avail_freepkt +
+ rx_ring->recycle_freepkt;
e1000g_ksp->tx_under_size.value.ul = tx_ring->stat_under_size;
e1000g_ksp->tx_exceed_frags.value.ul = tx_ring->stat_exceed_frags;
diff --git a/usr/src/uts/common/io/e1000g/e1000g_sw.h b/usr/src/uts/common/io/e1000g/e1000g_sw.h
index 605440cd48..e7c56a5877 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_sw.h
+++ b/usr/src/uts/common/io/e1000g/e1000g_sw.h
@@ -54,7 +54,7 @@ extern "C" {
#include <sys/kstat.h>
#include <sys/modctl.h>
#include <sys/errno.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/vlan.h>
#include <sys/ddi.h>
@@ -114,8 +114,6 @@ extern "C" {
#define MAX_INTR_THROTTLING 65535
#define MAX_RX_BCOPY_THRESHOLD E1000_RX_BUFFER_SIZE_2K
#define MAX_TX_BCOPY_THRESHOLD E1000_TX_BUFFER_SIZE_2K
-#define MAX_TX_RECYCLE_THRESHOLD MAX_NUM_TX_DESCRIPTOR
-#define MAX_TX_RECYCLE_NUM MAX_NUM_TX_DESCRIPTOR
#define MIN_NUM_TX_DESCRIPTOR 80
#define MIN_NUM_RX_DESCRIPTOR 80
@@ -129,8 +127,6 @@ extern "C" {
#define MIN_INTR_THROTTLING 0
#define MIN_RX_BCOPY_THRESHOLD 0
#define MIN_TX_BCOPY_THRESHOLD ETHERMIN
-#define MIN_TX_RECYCLE_THRESHOLD 0
-#define MIN_TX_RECYCLE_NUM MAX_TX_DESC_PER_PACKET
#define DEFAULT_NUM_RX_DESCRIPTOR 2048
#define DEFAULT_NUM_TX_DESCRIPTOR 2048
@@ -143,13 +139,11 @@ extern "C" {
#define MIN_INTR_PER_SEC 3000
#define DEFAULT_INTR_PACKET_LOW 5
#define DEFAULT_INTR_PACKET_HIGH 128
-#define DEFAULT_TX_RECYCLE_THRESHOLD 512
#else
#define MAX_INTR_PER_SEC 15000
#define MIN_INTR_PER_SEC 4000
#define DEFAULT_INTR_PACKET_LOW 10
#define DEFAULT_INTR_PACKET_HIGH 48
-#define DEFAULT_TX_RECYCLE_THRESHOLD DEFAULT_TX_NO_RESOURCE
#endif
#define DEFAULT_RX_INTR_DELAY 0
@@ -162,7 +156,6 @@ extern "C" {
#define DEFAULT_RX_BCOPY_THRESHOLD 128
#define DEFAULT_TX_BCOPY_THRESHOLD 512
-#define DEFAULT_TX_RECYCLE_NUM 64
#define DEFAULT_TX_UPDATE_THRESHOLD 256
#define DEFAULT_TX_NO_RESOURCE MAX_TX_DESC_PER_PACKET
@@ -402,6 +395,14 @@ extern "C" {
(_LH1)->Blink = ((PSINGLE_LIST_LINK)(_LH2)->Blink); \
}
+
+#define QUEUE_SWITCH(_LH1, _LH2) \
+ if ((_LH2)->Flink) { \
+ (_LH1)->Flink = (_LH2)->Flink; \
+ (_LH1)->Blink = (_LH2)->Blink; \
+ (_LH2)->Flink = (_LH2)->Blink = (PSINGLE_LIST_LINK)0; \
+ }
+
/*
* Property lookups
*/
@@ -717,6 +718,7 @@ typedef struct _e1000g_tx_ring {
* reschedule when tx resource is available
*/
boolean_t resched_needed;
+ clock_t resched_timestamp;
uint32_t stall_watchdog;
uint32_t recycle_fail;
mblk_list_t mblks;
@@ -727,6 +729,7 @@ typedef struct _e1000g_tx_ring {
uint32_t stat_no_desc;
uint32_t stat_send_fail;
uint32_t stat_reschedule;
+ uint32_t stat_timer_reschedule;
uint32_t stat_over_size;
#ifdef E1000G_DEBUG
uint32_t stat_under_size;
@@ -752,6 +755,7 @@ typedef struct _e1000g_tx_ring {
typedef struct _e1000g_rx_ring {
kmutex_t rx_lock;
kmutex_t freelist_lock;
+ kmutex_t recycle_lock;
/*
* Descriptor queue definitions
*/
@@ -768,13 +772,23 @@ typedef struct _e1000g_rx_ring {
p_rx_sw_packet_t packet_area;
LIST_DESCRIBER recv_list;
LIST_DESCRIBER free_list;
+ LIST_DESCRIBER recycle_list;
p_rx_sw_packet_t pending_list;
uint32_t pending_count;
uint32_t avail_freepkt;
+ uint32_t recycle_freepkt;
uint32_t rx_mblk_len;
mblk_t *rx_mblk;
mblk_t *rx_mblk_tail;
+ mac_ring_handle_t mrh;
+ mac_ring_handle_t mrh_init;
+ uint64_t ring_gen_num;
+ mblk_t *poll_list_head;
+ mblk_t *poll_list_tail;
+ uint_t poll_list_sz;
+ boolean_t poll_flag;
+
/*
* Statistics
*/
@@ -833,8 +847,6 @@ typedef struct e1000g {
boolean_t intr_adaptive;
boolean_t tx_intr_enable;
- uint32_t tx_recycle_thresh;
- uint32_t tx_recycle_num;
uint32_t tx_intr_delay;
uint32_t tx_intr_abs_delay;
uint32_t rx_intr_delay;
@@ -853,6 +865,9 @@ typedef struct e1000g {
e1000g_rx_ring_t rx_ring[1];
e1000g_tx_ring_t tx_ring[1];
+ mac_group_handle_t rx_group;
+
+ kmutex_t gen_lock; /* General lock for the whole struct e1000g */
/*
* Rx and Tx packet count for interrupt adaptive setting
@@ -909,6 +924,8 @@ typedef struct e1000g {
kstat_t *e1000g_ksp;
+ boolean_t poll_mode;
+
uint16_t phy_ctrl; /* contents of PHY_CTRL */
uint16_t phy_status; /* contents of PHY_STATUS */
uint16_t phy_an_adv; /* contents of PHY_AUTONEG_ADV */
@@ -980,7 +997,7 @@ void e1000g_free_tx_swpkt(p_tx_sw_packet_t packet);
void e1000g_tx_freemsg(e1000g_tx_ring_t *tx_ring);
uint_t e1000g_tx_softint_worker(caddr_t arg1, caddr_t arg2);
mblk_t *e1000g_m_tx(void *arg, mblk_t *mp);
-mblk_t *e1000g_receive(struct e1000g *Adapter);
+mblk_t *e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz);
void e1000g_rxfree_func(p_rx_sw_packet_t packet);
int e1000g_m_stat(void *arg, uint_t stat, uint64_t *val);
@@ -1008,6 +1025,7 @@ extern boolean_t e1000g_force_detach;
extern uint32_t e1000g_mblks_pending;
extern krwlock_t e1000g_rx_detach_lock;
extern private_devi_list_t *e1000g_private_devi_list;
+extern int e1000g_poll_mode;
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/io/e1000g/e1000g_tx.c b/usr/src/uts/common/io/e1000g/e1000g_tx.c
index 4255c098b4..d67b67ff63 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_tx.c
+++ b/usr/src/uts/common/io/e1000g/e1000g_tx.c
@@ -20,7 +20,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDLv1.
+ * Use is subject to license terms.
*/
/*
@@ -211,8 +211,7 @@ e1000g_send(struct e1000g *Adapter, mblk_t *mp)
* Descriptors... As you may run short of them before getting any
* transmit interrupt...
*/
- if (tx_ring->resched_needed ||
- (tx_ring->tbd_avail < Adapter->tx_recycle_thresh)) {
+ if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
(void) e1000g_recycle(tx_ring);
E1000G_DEBUG_STAT(tx_ring->stat_recycle);
@@ -406,6 +405,7 @@ tx_send_failed:
* Enable Transmit interrupts, so that the interrupt routine can
* call mac_tx_update() when transmit descriptors become available.
*/
+ tx_ring->resched_timestamp = ddi_get_lbolt();
tx_ring->resched_needed = B_TRUE;
if (!Adapter->tx_intr_enable)
e1000g_mask_tx_interrupt(Adapter);
@@ -434,6 +434,7 @@ tx_no_resource:
* Enable Transmit interrupts, so that the interrupt routine can
* call mac_tx_update() when transmit descriptors become available.
*/
+ tx_ring->resched_timestamp = ddi_get_lbolt();
tx_ring->resched_needed = B_TRUE;
if (!Adapter->tx_intr_enable)
e1000g_mask_tx_interrupt(Adapter);
@@ -449,9 +450,14 @@ e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context,
uintptr_t ip_start;
uintptr_t tcp_start;
mblk_t *nmp;
+ uint32_t lsoflags;
+ uint32_t mss;
bzero(cur_context, sizeof (context_data_t));
+ /* first check lso information */
+ lso_info_get(mp, &mss, &lsoflags);
+
/* retrieve checksum info */
hcksum_retrieve(mp, NULL, NULL, &cur_context->cksum_start,
&cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags);
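
lso_info_get() hands back the MSS and LSO flags the stack attached to
the message; the hardware then slices the TCP payload into MSS-sized
segments, with pay_len computed as msg_size - hdr_len further down.
A worked example of that arithmetic (values are illustrative):

#include <stdio.h>

int
main(void)
{
	unsigned msg_size = 65536;		/* LSO message */
	unsigned hdr_len = 14 + 20 + 20;	/* Ether + IP + TCP */
	unsigned mss = 1460;
	unsigned pay_len = msg_size - hdr_len;
	unsigned segs = (pay_len + mss - 1) / mss;	/* ceiling */

	(void) printf("payload %u bytes -> %u segments\n", pay_len, segs);
	return (0);
}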
@@ -464,45 +470,48 @@ e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context,
cur_context->ether_header_size =
sizeof (struct ether_header);
- if (cur_context->cksum_flags & HW_LSO) {
- if ((cur_context->mss = DB_LSOMSS(mp)) != 0) {
- /* free the invaid packet */
- if (!((cur_context->cksum_flags & HCK_PARTIALCKSUM) &&
- (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) {
- return (B_FALSE);
- }
- cur_context->lso_flag = B_TRUE;
- /*
- * Some fields are cleared for the hardware to fill
- * in. We don't assume Ethernet header, IP header and
- * TCP header are always in the same mblk fragment,
- * while we assume each header is always within one
- * mblk fragment and Ethernet header is always in the
- * first mblk fragment.
- */
- nmp = mp;
- ip_start = (uintptr_t)(nmp->b_rptr)
- + cur_context->ether_header_size;
- if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
- ip_start = (uintptr_t)nmp->b_cont->b_rptr
- + (ip_start - (uintptr_t)(nmp->b_wptr));
- nmp = nmp->b_cont;
- }
- tcp_start = ip_start +
- IPH_HDR_LENGTH((ipha_t *)ip_start);
- if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
- tcp_start = (uintptr_t)nmp->b_cont->b_rptr
- + (tcp_start - (uintptr_t)(nmp->b_wptr));
- nmp = nmp->b_cont;
- }
- cur_context->hdr_len = cur_context->ether_header_size
- + IPH_HDR_LENGTH((ipha_t *)ip_start)
- + TCP_HDR_LENGTH((tcph_t *)tcp_start);
- ((ipha_t *)ip_start)->ipha_length = 0;
- ((ipha_t *)ip_start)->ipha_hdr_checksum = 0;
- /* calculate the TCP packet payload length */
- cur_context->pay_len = msg_size - cur_context->hdr_len;
+ if (lsoflags & HW_LSO) {
+ ASSERT(mss != 0);
+
+ /* free the invalid packet */
+ if (mss == 0 ||
+ !((cur_context->cksum_flags & HCK_PARTIALCKSUM) &&
+ (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) {
+ return (B_FALSE);
+ }
+ cur_context->mss = (uint16_t)mss;
+ cur_context->lso_flag = B_TRUE;
+
+ /*
+ * Some fields are cleared for the hardware to fill
+ * in. We don't assume Ethernet header, IP header and
+ * TCP header are always in the same mblk fragment,
+ * while we assume each header is always within one
+ * mblk fragment and Ethernet header is always in the
+ * first mblk fragment.
+ */
+ nmp = mp;
+ ip_start = (uintptr_t)(nmp->b_rptr)
+ + cur_context->ether_header_size;
+ if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
+ ip_start = (uintptr_t)nmp->b_cont->b_rptr
+ + (ip_start - (uintptr_t)(nmp->b_wptr));
+ nmp = nmp->b_cont;
}
+ tcp_start = ip_start +
+ IPH_HDR_LENGTH((ipha_t *)ip_start);
+ if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
+ tcp_start = (uintptr_t)nmp->b_cont->b_rptr
+ + (tcp_start - (uintptr_t)(nmp->b_wptr));
+ nmp = nmp->b_cont;
+ }
+ cur_context->hdr_len = cur_context->ether_header_size
+ + IPH_HDR_LENGTH((ipha_t *)ip_start)
+ + TCP_HDR_LENGTH((tcph_t *)tcp_start);
+ ((ipha_t *)ip_start)->ipha_length = 0;
+ ((ipha_t *)ip_start)->ipha_hdr_checksum = 0;
+ /* calculate the TCP packet payload length */
+ cur_context->pay_len = msg_size - cur_context->hdr_len;
}
return (B_TRUE);
}
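A note on the LSO rework above: lso_info_get() now supplies the MSS and LSO flags up front, and the packet is rejected unless the required checksum offloads accompany the LSO request. A minimal sketch of that validation, assuming only the flag names visible in the hunk (HW_LSO, HCK_PARTIALCKSUM, HCK_IPV4_HDRCKSUM):

	/*
	 * Sketch only, not part of the patch: an LSO packet must carry
	 * a nonzero MSS and request both partial TCP/UDP checksum and
	 * IPv4 header checksum offload; anything else is invalid and
	 * the caller frees the packet.
	 */
	static boolean_t
	lso_request_valid(uint32_t lsoflags, uint32_t mss, uint32_t cksum_flags)
	{
		if ((lsoflags & HW_LSO) == 0)
			return (B_TRUE);	/* not LSO, nothing to check */
		if (mss == 0 ||
		    !(cksum_flags & HCK_PARTIALCKSUM) ||
		    !(cksum_flags & HCK_IPV4_HDRCKSUM))
			return (B_FALSE);
		return (B_TRUE);
	}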
@@ -816,7 +825,6 @@ e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list,
return (desc_count);
}
-
/*
* e1000g_tx_setup - setup tx data structures
*
@@ -955,7 +963,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring)
mblk_t *nmp;
struct e1000_tx_desc *descriptor;
int desc_count;
- int is_intr;
/*
* This function will examine each TxSwPacket in the 'used' queue
@@ -972,13 +979,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring)
return (0);
}
- is_intr = servicing_interrupt();
-
- if (is_intr)
- mutex_enter(&tx_ring->usedlist_lock);
- else if (mutex_tryenter(&tx_ring->usedlist_lock) == 0)
- return (0);
-
desc_count = 0;
QUEUE_INIT_LIST(&pending_list);
@@ -987,7 +987,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring)
0, 0, DDI_DMA_SYNC_FORKERNEL);
if (e1000g_check_dma_handle(
tx_ring->tbd_dma_handle) != DDI_FM_OK) {
- mutex_exit(&tx_ring->usedlist_lock);
ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
Adapter->chip_state = E1000G_ERROR;
return (0);
@@ -996,6 +995,7 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring)
/*
* While there are still TxSwPackets in the used queue check them
*/
+ mutex_enter(&tx_ring->usedlist_lock);
while ((packet =
(p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) {
@@ -1030,9 +1030,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring)
descriptor + 1;
desc_count += packet->num_desc;
-
- if (is_intr && (desc_count >= Adapter->tx_recycle_num))
- break;
} else {
/*
* Found a sw packet that the e1000g is not done
diff --git a/usr/src/uts/common/io/hxge/hxge.h b/usr/src/uts/common/io/hxge/hxge.h
index 837cbbc90c..37183afc7d 100644
--- a/usr/src/uts/common/io/hxge/hxge.h
+++ b/usr/src/uts/common/io/hxge/hxge.h
@@ -202,7 +202,6 @@ typedef struct _hxge_stats_t {
hxge_pfc_stats_t pfc_stats; /* pfc stats */
hxge_port_stats_t port_stats; /* port stats */
- hxge_mmac_stats_t mmac_stats; /* Multi mac. stats */
hxge_peu_sys_stats_t peu_sys_stats; /* PEU system stats */
} hxge_stats_t, *p_hxge_stats_t;
@@ -357,7 +356,6 @@ struct _hxge_t {
uint32_t hxge_port_rbr_size;
uint32_t hxge_port_rcr_size;
uint32_t hxge_port_tx_ring_size;
- hxge_mmac_t hxge_mmac_info;
kmutex_t pio_lock;
hxge_timeout timeout;
diff --git a/usr/src/uts/common/io/hxge/hxge_impl.h b/usr/src/uts/common/io/hxge/hxge_impl.h
index 57ad2c9a21..67bab83787 100644
--- a/usr/src/uts/common/io/hxge/hxge_impl.h
+++ b/usr/src/uts/common/io/hxge/hxge_impl.h
@@ -68,8 +68,7 @@ extern "C" {
#include <sys/netlb.h>
#include <sys/ddi_intr.h>
-#include <sys/mac.h>
-#include <sys/mac_impl.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
/*
@@ -315,32 +314,6 @@ typedef struct _dev_regs_t {
unsigned char *hxge_romp; /* fcode pointer */
} dev_regs_t, *p_dev_regs_t;
-typedef struct _nxge_mac_addr_t {
- ether_addr_t addr;
- uint_t flags;
-} hxge_mac_addr_t;
-
-/*
- * Driver alternate mac address structure.
- */
-typedef struct _hxge_mmac_t {
- uint8_t total_factory_macs;
- uint8_t num_mmac;
- uint8_t num_factory_mmac;
- hxge_mac_addr_t mac_pool[16];
- ether_addr_t factory_mac_pool[16];
- uint8_t naddrfree; /* number of alt mac addr available */
-} hxge_mmac_t;
-
-/*
- * mmac stats structure
- */
-typedef struct _hxge_mmac_stats_t {
- uint8_t mmac_max_cnt;
- uint8_t mmac_avail_cnt;
- struct ether_addr mmac_avail_pool[16];
-} hxge_mmac_stats_t, *p_hxge_mmac_stats_t;
-
#include <hxge_common_impl.h>
#include <hxge_common.h>
#include <hxge_rxdma.h>
diff --git a/usr/src/uts/common/io/hxge/hxge_kstats.c b/usr/src/uts/common/io/hxge/hxge_kstats.c
index 9e3a86e953..1629c7c828 100644
--- a/usr/src/uts/common/io/hxge/hxge_kstats.c
+++ b/usr/src/uts/common/io/hxge/hxge_kstats.c
@@ -261,50 +261,6 @@ hxge_kstat_index_t hxge_pfc_stats[] = {
};
typedef enum {
- MMAC_MAX_ADDR,
- MMAC_AVAIL_ADDR,
- MMAC_ADDR_POOL1,
- MMAC_ADDR_POOL2,
- MMAC_ADDR_POOL3,
- MMAC_ADDR_POOL4,
- MMAC_ADDR_POOL5,
- MMAC_ADDR_POOL6,
- MMAC_ADDR_POOL7,
- MMAC_ADDR_POOL8,
- MMAC_ADDR_POOL9,
- MMAC_ADDR_POOL10,
- MMAC_ADDR_POOL11,
- MMAC_ADDR_POOL12,
- MMAC_ADDR_POOL13,
- MMAC_ADDR_POOL14,
- MMAC_ADDR_POOL15,
- MMAC_ADDR_POOL16,
- MMAC_STATS_END
-} hxge_mmac_stat_index_t;
-
-hxge_kstat_index_t hxge_mmac_stats[] = {
- {MMAC_MAX_ADDR, KSTAT_DATA_UINT64, "max_mmac_addr"},
- {MMAC_AVAIL_ADDR, KSTAT_DATA_UINT64, "avail_mmac_addr"},
- {MMAC_ADDR_POOL1, KSTAT_DATA_UINT64, "mmac_addr_1"},
- {MMAC_ADDR_POOL2, KSTAT_DATA_UINT64, "mmac_addr_2"},
- {MMAC_ADDR_POOL3, KSTAT_DATA_UINT64, "mmac_addr_3"},
- {MMAC_ADDR_POOL4, KSTAT_DATA_UINT64, "mmac_addr_4"},
- {MMAC_ADDR_POOL5, KSTAT_DATA_UINT64, "mmac_addr_5"},
- {MMAC_ADDR_POOL6, KSTAT_DATA_UINT64, "mmac_addr_6"},
- {MMAC_ADDR_POOL7, KSTAT_DATA_UINT64, "mmac_addr_7"},
- {MMAC_ADDR_POOL8, KSTAT_DATA_UINT64, "mmac_addr_8"},
- {MMAC_ADDR_POOL9, KSTAT_DATA_UINT64, "mmac_addr_9"},
- {MMAC_ADDR_POOL10, KSTAT_DATA_UINT64, "mmac_addr_10"},
- {MMAC_ADDR_POOL11, KSTAT_DATA_UINT64, "mmac_addr_11"},
- {MMAC_ADDR_POOL12, KSTAT_DATA_UINT64, "mmac_addr_12"},
- {MMAC_ADDR_POOL13, KSTAT_DATA_UINT64, "mmac_addr_13"},
- {MMAC_ADDR_POOL14, KSTAT_DATA_UINT64, "mmac_addr_14"},
- {MMAC_ADDR_POOL15, KSTAT_DATA_UINT64, "mmac_addr_15"},
- {MMAC_ADDR_POOL16, KSTAT_DATA_UINT64, "mmac_addr_16"},
- {MMAC_STATS_END, NULL, NULL},
-};
-
-typedef enum {
SPC_ACC_ERR = 0,
TDC_PIOACC_ERR,
RDC_PIOACC_ERR,
@@ -580,75 +536,6 @@ hxge_pfc_stat_update(kstat_t *ksp, int rw)
return (0);
}
-static uint64_t
-hxge_mac_octet_to_u64(struct ether_addr addr)
-{
- int i;
- uint64_t addr64 = 0;
-
- for (i = ETHERADDRL - 1; i >= 0; i--) {
- addr64 <<= 8;
- addr64 |= addr.ether_addr_octet[i];
- }
- return (addr64);
-}
-
-/* ARGSUSED */
-int
-hxge_mmac_stat_update(kstat_t *ksp, int rw)
-{
- p_hxge_t hxgep;
- p_hxge_mmac_kstat_t mmac_kstatsp;
- p_hxge_mmac_stats_t statsp;
-
- hxgep = (p_hxge_t)ksp->ks_private;
- if (hxgep == NULL)
- return (-1);
-
- HXGE_DEBUG_MSG((hxgep, KST_CTL, "==> hxge_mmac_stat_update"));
-
- mmac_kstatsp = (p_hxge_mmac_kstat_t)ksp->ks_data;
- statsp = (p_hxge_mmac_stats_t)&hxgep->statsp->mmac_stats;
-
- mmac_kstatsp->mmac_max_addr_cnt.value.ul = statsp->mmac_max_cnt;
- mmac_kstatsp->mmac_avail_addr_cnt.value.ul = statsp->mmac_avail_cnt;
- mmac_kstatsp->mmac_addr1.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[0]);
- mmac_kstatsp->mmac_addr2.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[1]);
- mmac_kstatsp->mmac_addr3.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[2]);
- mmac_kstatsp->mmac_addr4.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[3]);
- mmac_kstatsp->mmac_addr5.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[4]);
- mmac_kstatsp->mmac_addr6.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[5]);
- mmac_kstatsp->mmac_addr7.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[6]);
- mmac_kstatsp->mmac_addr8.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[7]);
- mmac_kstatsp->mmac_addr9.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[8]);
- mmac_kstatsp->mmac_addr10.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[9]);
- mmac_kstatsp->mmac_addr11.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[10]);
- mmac_kstatsp->mmac_addr12.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[11]);
- mmac_kstatsp->mmac_addr13.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[12]);
- mmac_kstatsp->mmac_addr14.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[13]);
- mmac_kstatsp->mmac_addr15.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[14]);
- mmac_kstatsp->mmac_addr16.value.ul =
- hxge_mac_octet_to_u64(statsp->mmac_avail_pool[15]);
-
- HXGE_DEBUG_MSG((hxgep, KST_CTL, "<== hxge_mmac_stat_update"));
- return (0);
-}
-
/* ARGSUSED */
int
hxge_peu_sys_stat_update(kstat_t *ksp, int rw)
@@ -722,7 +609,6 @@ hxge_setup_kstats(p_hxge_t hxgep)
p_hxge_port_kstat_t hxgekp;
size_t hxge_kstat_sz;
char stat_name[64];
- char mmac_name[64];
int i;
HXGE_DEBUG_MSG((hxgep, KST_CTL, "==> hxge_setup_kstats"));
@@ -779,14 +665,6 @@ hxge_setup_kstats(p_hxge_t hxgep)
if (hxgep->statsp->vmac_ksp == NULL)
cmn_err(CE_WARN, "kstat_create failed for vmac");
- /* Setup MMAC statistics */
- (void) sprintf(mmac_name, "MMAC Stats%d", hxgep->instance);
- hxgep->statsp->mmac_ksp = hxge_setup_local_kstat(hxgep,
- hxgep->instance, "MMAC",
- &hxge_mmac_stats[0], MMAC_STATS_END, hxge_mmac_stat_update);
- if (hxgep->statsp->mmac_ksp == NULL)
- cmn_err(CE_WARN, "kstat_create failed for mmac");
-
/* Setup PEU System statistics */
hxgep->statsp->peu_sys_ksp = hxge_setup_local_kstat(hxgep,
hxgep->instance, "PEU", &hxge_peu_sys_stats[0],
diff --git a/usr/src/uts/common/io/hxge/hxge_main.c b/usr/src/uts/common/io/hxge/hxge_main.c
index b58bf49d8d..47a61060bf 100644
--- a/usr/src/uts/common/io/hxge/hxge_main.c
+++ b/usr/src/uts/common/io/hxge/hxge_main.c
@@ -151,13 +151,8 @@ static int hxge_m_unicst(void *, const uint8_t *);
static int hxge_m_multicst(void *, boolean_t, const uint8_t *);
static int hxge_m_promisc(void *, boolean_t);
static void hxge_m_ioctl(void *, queue_t *, mblk_t *);
-static void hxge_m_resources(void *);
static hxge_status_t hxge_mac_register(p_hxge_t hxgep);
-static int hxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr);
-static int hxge_m_mmac_remove(void *arg, mac_addr_slot_t slot);
-static int hxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr);
-static int hxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr);
static boolean_t hxge_m_getcapab(void *, mac_capab_t, void *);
static boolean_t hxge_param_locked(mac_prop_id_t pr_num);
static int hxge_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
@@ -196,7 +191,7 @@ mac_priv_prop_t hxge_priv_props[] = {
#define MAX_DUMP_SZ 256
#define HXGE_M_CALLBACK_FLAGS \
- (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP)
+ (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP)
extern mblk_t *hxge_m_tx(void *arg, mblk_t *mp);
extern hxge_status_t hxge_pfc_set_default_mac_addr(p_hxge_t hxgep);
@@ -210,7 +205,6 @@ static mac_callbacks_t hxge_m_callbacks = {
hxge_m_multicst,
hxge_m_unicst,
hxge_m_tx,
- hxge_m_resources,
hxge_m_ioctl,
hxge_m_getcapab,
NULL,
@@ -2697,386 +2691,17 @@ hxge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
HXGE_DEBUG_MSG((hxgep, NEMO_CTL, "<== hxge_m_ioctl"));
}
-extern void hxge_rx_hw_blank(void *arg, time_t ticks, uint_t count);
-
-static void
-hxge_m_resources(void *arg)
-{
- p_hxge_t hxgep = arg;
- mac_rx_fifo_t mrf;
- p_rx_rcr_rings_t rcr_rings;
- p_rx_rcr_ring_t *rcr_p;
- p_rx_rcr_ring_t rcrp;
- uint32_t i, ndmas;
- int status;
-
- HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_m_resources"));
-
- MUTEX_ENTER(hxgep->genlock);
-
- if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) {
- status = hxge_init(hxgep);
- if (status != HXGE_OK) {
- HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_m_resources: "
- "hxge_init failed"));
- MUTEX_EXIT(hxgep->genlock);
- return;
- }
- }
-
- mrf.mrf_type = MAC_RX_FIFO;
- mrf.mrf_blank = hxge_rx_hw_blank;
- mrf.mrf_arg = (void *)hxgep;
-
- mrf.mrf_normal_blank_time = RXDMA_RCR_TO_DEFAULT;
- mrf.mrf_normal_pkt_count = RXDMA_RCR_PTHRES_DEFAULT;
-
- rcr_rings = hxgep->rx_rcr_rings;
- rcr_p = rcr_rings->rcr_rings;
- ndmas = rcr_rings->ndmas;
-
- /*
- * Export our receive resources to the MAC layer.
- */
- for (i = 0; i < ndmas; i++) {
- rcrp = (void *)(p_rx_rcr_ring_t)rcr_p[i];
- rcrp->rcr_mac_handle =
- mac_resource_add(hxgep->mach, (mac_resource_t *)&mrf);
-
- HXGE_DEBUG_MSG((hxgep, RX_CTL,
- "==> hxge_m_resources: vdma %d dma %d "
- "rcrptr 0x%016llx mac_handle 0x%016llx",
- i, rcrp->rdc, rcr_p[i], rcrp->rcr_mac_handle));
- }
-
- MUTEX_EXIT(hxgep->genlock);
-
- HXGE_DEBUG_MSG((hxgep, RX_CTL, "<== hxge_m_resources"));
-}
-
-/*
- * Set an alternate MAC address
- */
-static int
-hxge_altmac_set(p_hxge_t hxgep, uint8_t *maddr, mac_addr_slot_t slot)
-{
- uint64_t address;
- uint64_t tmp;
- hpi_status_t status;
- uint8_t addrn;
- int i;
-
- /*
- * Convert a byte array to a 48 bit value.
- * Need to check endianess if in doubt
- */
- address = 0;
- for (i = 0; i < ETHERADDRL; i++) {
- tmp = maddr[i];
- address <<= 8;
- address |= tmp;
- }
-
- addrn = (uint8_t)slot;
- status = hpi_pfc_set_mac_address(hxgep->hpi_handle, addrn, address);
- if (status != HPI_SUCCESS)
- return (EIO);
-
- return (0);
-}
-
-static void
-hxge_mmac_kstat_update(p_hxge_t hxgep, mac_addr_slot_t slot)
-{
- p_hxge_mmac_stats_t mmac_stats;
- int i;
- hxge_mmac_t *mmac_info;
-
- mmac_info = &hxgep->hxge_mmac_info;
- mmac_stats = &hxgep->statsp->mmac_stats;
- mmac_stats->mmac_max_cnt = mmac_info->num_mmac;
- mmac_stats->mmac_avail_cnt = mmac_info->naddrfree;
-
- for (i = 0; i < ETHERADDRL; i++) {
- mmac_stats->mmac_avail_pool[slot].ether_addr_octet[i] =
- mmac_info->mac_pool[slot].addr[(ETHERADDRL - 1) - i];
- }
-}
-
-/*
- * Find an unused address slot, set the address value to the one specified,
- * enable the port to start filtering on the new MAC address.
- * Returns: 0 on success.
- */
-int
-hxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr)
-{
- p_hxge_t hxgep = arg;
- mac_addr_slot_t slot;
- hxge_mmac_t *mmac_info;
- int err;
- hxge_status_t status;
-
- mutex_enter(hxgep->genlock);
-
- /*
- * Make sure that hxge is initialized, if _start() has
- * not been called.
- */
- if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) {
- status = hxge_init(hxgep);
- if (status != HXGE_OK) {
- mutex_exit(hxgep->genlock);
- return (ENXIO);
- }
- }
-
- mmac_info = &hxgep->hxge_mmac_info;
- if (mmac_info->naddrfree == 0) {
- mutex_exit(hxgep->genlock);
- return (ENOSPC);
- }
-
- if (!mac_unicst_verify(hxgep->mach, maddr->mma_addr,
- maddr->mma_addrlen)) {
- mutex_exit(hxgep->genlock);
- return (EINVAL);
- }
-
- /*
- * Search for the first available slot. Because naddrfree
- * is not zero, we are guaranteed to find one.
- * Slot 0 is for unique (primary) MAC. The first alternate
- * MAC slot is slot 1.
- */
- for (slot = 1; slot < mmac_info->num_mmac; slot++) {
- if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED))
- break;
- }
-
- ASSERT(slot < mmac_info->num_mmac);
- if ((err = hxge_altmac_set(hxgep, maddr->mma_addr, slot)) != 0) {
- mutex_exit(hxgep->genlock);
- return (err);
- }
- bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, ETHERADDRL);
- mmac_info->mac_pool[slot].flags |= MMAC_SLOT_USED;
- mmac_info->naddrfree--;
- hxge_mmac_kstat_update(hxgep, slot);
-
- maddr->mma_slot = slot;
-
- mutex_exit(hxgep->genlock);
- return (0);
-}
-
-/*
- * Remove the specified mac address and update
- * the h/w not to filter the mac address anymore.
- * Returns: 0, on success.
- */
-int
-hxge_m_mmac_remove(void *arg, mac_addr_slot_t slot)
-{
- p_hxge_t hxgep = arg;
- hxge_mmac_t *mmac_info;
- int err = 0;
- hxge_status_t status;
-
- mutex_enter(hxgep->genlock);
-
- /*
- * Make sure that hxge is initialized, if _start() has
- * not been called.
- */
- if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) {
- status = hxge_init(hxgep);
- if (status != HXGE_OK) {
- mutex_exit(hxgep->genlock);
- return (ENXIO);
- }
- }
-
- mmac_info = &hxgep->hxge_mmac_info;
- if (slot <= 0 || slot >= mmac_info->num_mmac) {
- mutex_exit(hxgep->genlock);
- return (EINVAL);
- }
-
- if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) {
- if (hpi_pfc_mac_addr_disable(hxgep->hpi_handle, slot) ==
- HPI_SUCCESS) {
- mmac_info->mac_pool[slot].flags &= ~MMAC_SLOT_USED;
- mmac_info->naddrfree++;
- /*
- * Clear mac_pool[slot].addr so that kstat shows 0
- * alternate MAC address if the slot is not used.
- */
- bzero(mmac_info->mac_pool[slot].addr, ETHERADDRL);
- hxge_mmac_kstat_update(hxgep, slot);
- } else {
- err = EIO;
- }
- } else {
- err = EINVAL;
- }
-
- mutex_exit(hxgep->genlock);
- return (err);
-}
-
-/*
- * Modify a mac address added by hxge_mmac_add().
- * Returns: 0, on success.
- */
-int
-hxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr)
-{
- p_hxge_t hxgep = arg;
- mac_addr_slot_t slot;
- hxge_mmac_t *mmac_info;
- int err = 0;
- hxge_status_t status;
-
- if (!mac_unicst_verify(hxgep->mach, maddr->mma_addr,
- maddr->mma_addrlen))
- return (EINVAL);
-
- slot = maddr->mma_slot;
-
- mutex_enter(hxgep->genlock);
-
- /*
- * Make sure that hxge is initialized, if _start() has
- * not been called.
- */
- if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) {
- status = hxge_init(hxgep);
- if (status != HXGE_OK) {
- mutex_exit(hxgep->genlock);
- return (ENXIO);
- }
- }
-
- mmac_info = &hxgep->hxge_mmac_info;
- if (slot <= 0 || slot >= mmac_info->num_mmac) {
- mutex_exit(hxgep->genlock);
- return (EINVAL);
- }
-
- if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) {
- if ((err = hxge_altmac_set(hxgep, maddr->mma_addr,
- slot)) == 0) {
- bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr,
- ETHERADDRL);
- hxge_mmac_kstat_update(hxgep, slot);
- }
- } else {
- err = EINVAL;
- }
-
- mutex_exit(hxgep->genlock);
- return (err);
-}
-
-/*
- * static int
- * hxge_m_mmac_get() - Get the MAC address and other information
- * related to the slot. mma_flags should be set to 0 in the call.
- * Note: although kstat shows MAC address as zero when a slot is
- * not used, Crossbow expects hxge_m_mmac_get to copy factory MAC
- * to the caller as long as the slot is not using a user MAC address.
- * The following table shows the rules,
- *
- * USED VENDOR mma_addr
- * ------------------------------------------------------------
- * (1) Slot uses a user MAC: yes no user MAC
- * (2) Slot uses a factory MAC: yes yes factory MAC
- * (3) Slot is not used but is
- * factory MAC capable: no yes factory MAC
- * (4) Slot is not used and is
- * not factory MAC capable: no no 0
- * ------------------------------------------------------------
- */
-int
-hxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr)
-{
- hxge_t *hxgep = arg;
- mac_addr_slot_t slot;
- hxge_mmac_t *mmac_info;
- hxge_status_t status;
-
- slot = maddr->mma_slot;
-
- mutex_enter(hxgep->genlock);
-
- /*
- * Make sure that hxge is initialized, if _start() has
- * not been called.
- */
- if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) {
- status = hxge_init(hxgep);
- if (status != HXGE_OK) {
- mutex_exit(hxgep->genlock);
- return (ENXIO);
- }
- }
-
- mmac_info = &hxgep->hxge_mmac_info;
- if (slot <= 0 || slot >= mmac_info->num_mmac) {
- mutex_exit(hxgep->genlock);
- return (EINVAL);
- }
-
- maddr->mma_flags = 0;
- if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) {
- maddr->mma_flags |= MMAC_SLOT_USED;
- bcopy(mmac_info->mac_pool[slot].addr,
- maddr->mma_addr, ETHERADDRL);
- maddr->mma_addrlen = ETHERADDRL;
- }
-
- mutex_exit(hxgep->genlock);
- return (0);
-}
-
/*ARGSUSED*/
boolean_t
hxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
- p_hxge_t hxgep = (p_hxge_t)arg;
uint32_t *txflags = cap_data;
- multiaddress_capab_t *mmacp = cap_data;
switch (cap) {
case MAC_CAPAB_HCKSUM:
*txflags = HCKSUM_INET_PARTIAL;
break;
- case MAC_CAPAB_POLL:
- /*
- * There's nothing for us to fill in, simply returning B_TRUE
- * stating that we support polling is sufficient.
- */
- break;
-
- case MAC_CAPAB_MULTIADDRESS:
- /*
- * The number of MAC addresses made available by
- * this capability is one less than the total as
- * the primary address in slot 0 is counted in
- * the total.
- */
- mmacp->maddr_naddr = PFC_N_MAC_ADDRESSES - 1;
- mmacp->maddr_naddrfree = hxgep->hxge_mmac_info.naddrfree;
- mmacp->maddr_flag = 0; /* No multiple factory macs */
- mmacp->maddr_handle = hxgep;
- mmacp->maddr_add = hxge_m_mmac_add;
- mmacp->maddr_remove = hxge_m_mmac_remove;
- mmacp->maddr_modify = hxge_m_mmac_modify;
- mmacp->maddr_get = hxge_m_mmac_get;
- mmacp->maddr_reserve = NULL; /* No multiple factory macs */
- break;
default:
return (B_FALSE);
}
diff --git a/usr/src/uts/common/io/hxge/hxge_rxdma.c b/usr/src/uts/common/io/hxge/hxge_rxdma.c
index 0c3747f6bd..2de507a8e9 100644
--- a/usr/src/uts/common/io/hxge/hxge_rxdma.c
+++ b/usr/src/uts/common/io/hxge/hxge_rxdma.c
@@ -1228,10 +1228,8 @@ hxge_rx_pkts_vring(p_hxge_t hxgep, uint_t vindex, p_hxge_ldv_t ldvp,
#ifdef HXGE_DEBUG
HXGE_DEBUG_MSG((hxgep, RX_CTL,
"==> hxge_rx_pkts_vring:calling mac_rx (NEMO) "
- "LEN %d mp $%p mp->b_next $%p rcrp $%p "
- "mac_handle $%p",
- (mp->b_wptr - mp->b_rptr), mp, mp->b_next,
- rcrp, rcrp->rcr_mac_handle));
+ "LEN %d mp $%p mp->b_next $%p rcrp $%p",
+ (mp->b_wptr - mp->b_rptr), mp, mp->b_next, rcrp));
HXGE_DEBUG_MSG((hxgep, RX_CTL,
"==> hxge_rx_pkts_vring: dump packets "
"(mp $%p b_rptr $%p b_wptr $%p):\n %s",
@@ -1257,7 +1255,7 @@ hxge_rx_pkts_vring(p_hxge_t hxgep, uint_t vindex, p_hxge_ldv_t ldvp,
HXGE_DEBUG_MSG((hxgep, RX_CTL,
"==> hxge_rx_pkts_vring: send packet to stack"));
- mac_rx(hxgep->mach, rcrp->rcr_mac_handle, mp);
+ mac_rx(hxgep->mach, NULL, mp);
HXGE_DEBUG_MSG((hxgep, RX_CTL, "<== hxge_rx_pkts_vring"));
}
diff --git a/usr/src/uts/common/io/hxge/hxge_rxdma.h b/usr/src/uts/common/io/hxge/hxge_rxdma.h
index c5277ca590..0d1808a67c 100644
--- a/usr/src/uts/common/io/hxge/hxge_rxdma.h
+++ b/usr/src/uts/common/io/hxge/hxge_rxdma.h
@@ -344,7 +344,6 @@ typedef struct _rx_rcr_ring_t {
uint32_t intr_timeout;
uint32_t intr_threshold;
uint64_t max_receive_pkts;
- mac_resource_handle_t rcr_mac_handle;
uint32_t rcvd_pkt_bytes; /* Received bytes of a packet */
} rx_rcr_ring_t, *p_rx_rcr_ring_t;
diff --git a/usr/src/uts/common/io/hxge/hxge_virtual.c b/usr/src/uts/common/io/hxge/hxge_virtual.c
index b1eff782aa..bbc65993d0 100644
--- a/usr/src/uts/common/io/hxge/hxge_virtual.c
+++ b/usr/src/uts/common/io/hxge/hxge_virtual.c
@@ -36,7 +36,6 @@ static void hxge_set_hw_dma_config(p_hxge_t);
static void hxge_set_hw_class_config(p_hxge_t);
static void hxge_ldgv_setup(p_hxge_ldg_t *ldgp, p_hxge_ldv_t *ldvp, uint8_t ldv,
uint8_t endldg, int *ngrps);
-static hxge_status_t hxge_mmac_init(p_hxge_t);
extern uint16_t hxge_rcr_timeout;
extern uint16_t hxge_rcr_threshold;
@@ -894,35 +893,11 @@ hxge_intr_mask_mgmt_set(p_hxge_t hxgep, boolean_t on)
static hxge_status_t
hxge_get_mac_addr_properties(p_hxge_t hxgep)
{
- uint32_t num_macs;
- hxge_status_t status;
-
HXGE_DEBUG_MSG((hxgep, DDI_CTL, "==> hxge_get_mac_addr_properties "));
(void) hxge_pfc_mac_addrs_get(hxgep);
hxgep->ouraddr = hxgep->factaddr;
- /*
- * Get the number of MAC addresses the Hydra supports per blade.
- */
- if (hxge_pfc_num_macs_get(hxgep, &num_macs) == HXGE_OK) {
- hxgep->hxge_mmac_info.num_mmac = (uint8_t)num_macs;
- } else {
- HXGE_ERROR_MSG((NULL, HXGE_ERR_CTL,
- "hxge_get_mac_addr_properties: get macs failed"));
- return (HXGE_ERROR);
- }
-
- /*
- * Initialize alt. mac addr. in the mac pool
- */
- status = hxge_mmac_init(hxgep);
- if (status != HXGE_OK) {
- HXGE_ERROR_MSG((NULL, HXGE_ERR_CTL,
- "hxge_get_mac_addr_properties: init mmac failed"));
- return (HXGE_ERROR);
- }
-
HXGE_DEBUG_MSG((hxgep, DDI_CTL, "<== hxge_get_mac_addr_properties "));
return (HXGE_OK);
}
@@ -971,49 +946,3 @@ hxge_ldgv_setup(p_hxge_ldg_t *ldgp, p_hxge_ldv_t *ldvp, uint8_t ldv,
HXGE_DEBUG_MSG((NULL, INT_CTL, "<== hxge_ldgv_setup"));
}
-
-/*
- * Note: This function assumes the following distribution of mac
- * addresses for a hydra blade:
- *
- * -------------
- * 0| |0 - local-mac-address for blade
- * -------------
- * | |1 - Start of alt. mac addr. for blade
- * | |
- * | |
- * | |15
- * --------------
- */
-
-static hxge_status_t
-hxge_mmac_init(p_hxge_t hxgep)
-{
- int slot;
- hxge_mmac_t *mmac_info;
-
- mmac_info = (hxge_mmac_t *)&hxgep->hxge_mmac_info;
-
- /* Set flags for unique MAC */
- mmac_info->mac_pool[0].flags |= MMAC_SLOT_USED | MMAC_VENDOR_ADDR;
- mmac_info->num_factory_mmac = 1;
-
- /*
- * Skip the factory/default address which is in slot 0.
- * Initialze all other mac addr. to "AVAILABLE" state.
- * Clear flags of all alternate MAC slots.
- */
- for (slot = 1; slot < mmac_info->num_mmac; slot++) {
- (void) hpi_pfc_clear_mac_address(hxgep->hpi_handle, slot);
- mmac_info->mac_pool[slot].flags = 0;
- }
-
- /* Exclude the factory mac address */
- mmac_info->naddrfree = mmac_info->num_mmac - 1;
-
- /* Initialize the first two parameters for mmac kstat */
- hxgep->statsp->mmac_stats.mmac_max_cnt = mmac_info->num_mmac;
- hxgep->statsp->mmac_stats.mmac_avail_cnt = mmac_info->naddrfree;
-
- return (HXGE_OK);
-}
diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd.c b/usr/src/uts/common/io/ib/clients/ibd/ibd.c
index 099e2036c8..7992e1007b 100644
--- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c
+++ b/usr/src/uts/common/io/ib/clients/ibd/ibd.c
@@ -37,6 +37,7 @@
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
+#include <sys/mac_provider.h>
#include <sys/pattr.h> /* for HCK_PARTIALCKSUM */
#include <sys/sysmacros.h> /* for offsetof */
@@ -310,7 +311,6 @@ static mac_callbacks_t ib_m_callbacks = {
ibd_m_unicst,
ibd_m_tx,
NULL,
- NULL,
ibd_m_getcapab
};
@@ -4102,13 +4102,6 @@ ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
return (B_FALSE);
break;
}
- case MAC_CAPAB_POLL:
- /*
- * Fallthrough to default, as we don't support GLDv3
- * polling. When blanking is implemented, we will need to
- * change this to return B_TRUE in addition to registering
- * an mc_resources callback.
- */
default:
return (B_FALSE);
}
diff --git a/usr/src/uts/common/io/igb/igb.conf b/usr/src/uts/common/io/igb/igb.conf
index c2ae8d4cd3..93860209f0 100644
--- a/usr/src/uts/common/io/igb/igb.conf
+++ b/usr/src/uts/common/io/igb/igb.conf
@@ -1,19 +1,17 @@
#
# CDDL HEADER START
#
-# Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
-# You can obtain a copy of the license at:
-# http://www.opensolaris.org/os/licensing.
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
-# When using or redistributing this file, you may do so under the
-# License only. No other modification of this header is permitted.
-#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
@@ -21,11 +19,11 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms of the CDDL.
+# Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
#
#
# Driver.conf file for Intel 1Gb ethernet driver (igb)
@@ -121,29 +119,29 @@
# flow_control = 3;
#
# -------------------- Transmit/Receive Queues --------------------
-# tx_queue_number
-# The number of the transmit queues
-# Allowed values: 1 - 4
-# Default value: 1
#
# tx_ring_size
# The number of the transmit descriptors per transmit queue
# Allowed values: 64 - 4096
# Default value: 512
#
-# rx_queue_number
-# The number of the receive queues
-# Allowed values: 1 - 4
-# Default value: 1
-#
# rx_ring_size
# The number of the receive descriptors per receive queue
# Allowed values: 64 - 4096
# Default value: 512
#
-# Note: The final values of tx_queue_number and rx_queue_number are decided
-# by the number of interrupt vectors obtained by the driver. They could be
-# less than the specified values because of limited interrupt vector number.
+# mr_enable
+# Enable multiple rx queues and tx queues
+# Allowed values: 0, 1
+# Default value: 1
+#
+# rx_group_number
+# The number of the receive ring groups
+# Allowed values: 1 - 4
+# Default value: 1
+#
+# Note: If the specified value of rx_group_number is not supported by the
+# hardware, rx_group_number will be downgraded to an acceptable value.
#
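#
# As a hypothetical example (values assumed, not defaults): the two
# settings below would enable multiple rings and split the rx rings
# into two ring groups.
#
# mr_enable = 1;
# rx_group_number = 2;
#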
# -------- How to set parameters for a particular interface ---------
# The example below shows how to locate the device path and set a parameter
diff --git a/usr/src/uts/common/io/igb/igb_gld.c b/usr/src/uts/common/io/igb/igb_gld.c
index d897a484e3..c1213647ec 100644
--- a/usr/src/uts/common/io/igb/igb_gld.c
+++ b/usr/src/uts/common/io/igb/igb_gld.c
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,11 +20,13 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
#include "igb_sw.h"
@@ -555,37 +555,6 @@ igb_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
}
/*
- * Set a new device unicast address.
- */
-int
-igb_m_unicst(void *arg, const uint8_t *mac_addr)
-{
- igb_t *igb = (igb_t *)arg;
- int result;
-
- mutex_enter(&igb->gen_lock);
-
- if (igb->igb_state & IGB_SUSPENDED) {
- mutex_exit(&igb->gen_lock);
- return (ECANCELED);
- }
-
- /*
- * Store the new MAC address.
- */
- bcopy(mac_addr, igb->hw.mac.addr, ETHERADDRL);
-
- /*
- * Set MAC address in address slot 0, which is the default address.
- */
- result = igb_unicst_set(igb, mac_addr, 0);
-
- mutex_exit(&igb->gen_lock);
-
- return (result);
-}
-
-/*
* Pass on M_IOCTL messages passed to the DLD, and support
* private IOCTLs for debugging and ndd.
*/
@@ -654,18 +623,16 @@ igb_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
}
}
-
/*
- * Find an unused address slot, set the address to it, reserve
- * this slot and enable the device to start filtering on the
- * new address.
+ * Add a MAC address to the target RX group.
*/
-int
-igb_m_unicst_add(void *arg, mac_multi_addr_t *maddr)
+static int
+igb_addmac(void *arg, const uint8_t *mac_addr)
{
- igb_t *igb = (igb_t *)arg;
- mac_addr_slot_t slot;
- int err;
+ igb_rx_group_t *rx_group = (igb_rx_group_t *)arg;
+ igb_t *igb = rx_group->igb;
+ struct e1000_hw *hw = &igb->hw;
+ int i, slot;
mutex_enter(&igb->gen_lock);
@@ -674,12 +641,6 @@ igb_m_unicst_add(void *arg, mac_multi_addr_t *maddr)
return (ECANCELED);
}
- if (mac_unicst_verify(igb->mac_hdl,
- maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) {
- mutex_exit(&igb->gen_lock);
- return (EINVAL);
- }
-
if (igb->unicst_avail == 0) {
/* no slots available */
mutex_exit(&igb->gen_lock);
@@ -687,39 +648,55 @@ igb_m_unicst_add(void *arg, mac_multi_addr_t *maddr)
}
/*
- * Primary/default address is in slot 0. The next addresses
- * are the multiple MAC addresses. So multiple MAC address 0
- * is in slot 1, 1 in slot 2, and so on. So the first multiple
- * MAC address resides in slot 1.
+	 * Slots 0 through igb->num_rx_groups - 1 are reserved slots,
+	 * mapped 1:1 to the group index. The remaining slots are shared
+	 * among all the groups. When adding a MAC address, the group's
+	 * reserved slot is tried first, then the shared slots.
*/
- for (slot = 1; slot < igb->unicst_total; slot++) {
- if (igb->unicst_addr[slot].mac.set == 0)
- break;
- }
+ slot = -1;
+ if (igb->unicst_addr[rx_group->index].mac.set == 1) {
+ /*
+ * The reserved slot for current group is used, find the free
+ * slots in the shared slots.
+ */
+ for (i = igb->num_rx_groups; i < igb->unicst_total; i++) {
+ if (igb->unicst_addr[i].mac.set == 0) {
+ slot = i;
+ break;
+ }
+ }
+ } else
+ slot = rx_group->index;
- ASSERT((slot > 0) && (slot < igb->unicst_total));
+ if (slot == -1) {
+ /* no slots available in the shared slots */
+ mutex_exit(&igb->gen_lock);
+ return (ENOSPC);
+ }
- maddr->mma_slot = slot;
+ /* Set VMDq according to the mode supported by hardware. */
+ e1000_rar_set_vmdq(hw, mac_addr, slot, igb->vmdq_mode, rx_group->index);
- if ((err = igb_unicst_set(igb, maddr->mma_addr, slot)) == 0) {
- igb->unicst_addr[slot].mac.set = 1;
- igb->unicst_avail--;
- }
+ bcopy(mac_addr, igb->unicst_addr[slot].mac.addr, ETHERADDRL);
+ igb->unicst_addr[slot].mac.group_index = rx_group->index;
+ igb->unicst_addr[slot].mac.set = 1;
+ igb->unicst_avail--;
mutex_exit(&igb->gen_lock);
- return (err);
+ return (0);
}
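The slot-selection policy of igb_addmac() can be restated on its own. The helper below is a hypothetical sketch, not part of the patch; it assumes the igb_t and igb_rx_group_t definitions used above:

	/*
	 * Hypothetical restatement of igb_addmac()'s slot selection:
	 * the slot whose index equals the group index is reserved for
	 * that group; slots num_rx_groups..unicst_total-1 are shared
	 * by all groups.
	 */
	static int
	igb_pick_slot(igb_t *igb, igb_rx_group_t *rx_group)
	{
		int i;

		if (igb->unicst_addr[rx_group->index].mac.set == 0)
			return (rx_group->index);	/* reserved slot is free */

		for (i = igb->num_rx_groups; i < igb->unicst_total; i++) {
			if (igb->unicst_addr[i].mac.set == 0)
				return (i);	/* first free shared slot */
		}
		return (-1);	/* no slot; caller returns ENOSPC */
	}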
-
/*
- * Removes a MAC address that was added before.
+ * Remove a MAC address from the specified RX group.
*/
-int
-igb_m_unicst_remove(void *arg, mac_addr_slot_t slot)
+static int
+igb_remmac(void *arg, const uint8_t *mac_addr)
{
- igb_t *igb = (igb_t *)arg;
- int err;
+ igb_rx_group_t *rx_group = (igb_rx_group_t *)arg;
+ igb_t *igb = rx_group->igb;
+ struct e1000_hw *hw = &igb->hw;
+ int slot;
mutex_enter(&igb->gen_lock);
@@ -728,7 +705,8 @@ igb_m_unicst_remove(void *arg, mac_addr_slot_t slot)
return (ECANCELED);
}
- if ((slot <= 0) || (slot >= igb->unicst_total)) {
+ slot = igb_unicst_find(igb, mac_addr);
+ if (slot == -1) {
mutex_exit(&igb->gen_lock);
return (EINVAL);
}
@@ -738,104 +716,189 @@ igb_m_unicst_remove(void *arg, mac_addr_slot_t slot)
return (EINVAL);
}
- /* Copy the default address to the passed slot */
- if ((err = igb_unicst_set(igb,
- igb->unicst_addr[0].mac.addr, slot)) == 0) {
- igb->unicst_addr[slot].mac.set = 0;
- igb->unicst_avail++;
- }
+	/* Clear the MAC address in the slot */
+ e1000_rar_clear(hw, slot);
+ igb->unicst_addr[slot].mac.set = 0;
+ igb->unicst_avail++;
mutex_exit(&igb->gen_lock);
- return (err);
+ return (0);
}
/*
- * Modifies the value of an address that has been added before.
- * The new address length and the slot number that was returned
- * in the call to add should be passed in. mma_flags should be
- * set to 0.
- * Returns 0 on success.
+ * Enable the interrupt on the specified rx ring.
*/
int
-igb_m_unicst_modify(void *arg, mac_multi_addr_t *maddr)
+igb_rx_ring_intr_enable(mac_intr_handle_t intrh)
{
- igb_t *igb = (igb_t *)arg;
- mac_addr_slot_t slot;
- int err;
-
- mutex_enter(&igb->gen_lock);
+ igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)intrh;
+ igb_t *igb = rx_ring->igb;
+ struct e1000_hw *hw = &igb->hw;
+ uint32_t index = rx_ring->index;
- if (igb->igb_state & IGB_SUSPENDED) {
- mutex_exit(&igb->gen_lock);
- return (ECANCELED);
+ if (igb->intr_type == DDI_INTR_TYPE_MSIX) {
+ /* Interrupt enabling for MSI-X */
+ igb->eims_mask |= (E1000_EICR_RX_QUEUE0 << index);
+ E1000_WRITE_REG(hw, E1000_EIMS, igb->eims_mask);
+ E1000_WRITE_REG(hw, E1000_EIAC, igb->eims_mask);
+ } else {
+ ASSERT(index == 0);
+ /* Interrupt enabling for MSI and legacy */
+ igb->ims_mask |= E1000_IMS_RXT0;
+ E1000_WRITE_REG(hw, E1000_IMS, igb->ims_mask);
}
- if (mac_unicst_verify(igb->mac_hdl,
- maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) {
- mutex_exit(&igb->gen_lock);
- return (EINVAL);
- }
+ E1000_WRITE_FLUSH(hw);
- slot = maddr->mma_slot;
+ return (0);
+}
- if ((slot <= 0) || (slot >= igb->unicst_total)) {
- mutex_exit(&igb->gen_lock);
- return (EINVAL);
+/*
+ * Disable the interrupt on the specified rx ring.
+ */
+int
+igb_rx_ring_intr_disable(mac_intr_handle_t intrh)
+{
+ igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)intrh;
+ igb_t *igb = rx_ring->igb;
+ struct e1000_hw *hw = &igb->hw;
+ uint32_t index = rx_ring->index;
+
+ if (igb->intr_type == DDI_INTR_TYPE_MSIX) {
+ /* Interrupt disabling for MSI-X */
+ igb->eims_mask &= ~(E1000_EICR_RX_QUEUE0 << index);
+ E1000_WRITE_REG(hw, E1000_EIMC,
+ (E1000_EICR_RX_QUEUE0 << index));
+ E1000_WRITE_REG(hw, E1000_EIAC, igb->eims_mask);
+ } else {
+ ASSERT(index == 0);
+ /* Interrupt disabling for MSI and legacy */
+ igb->ims_mask &= ~E1000_IMS_RXT0;
+ E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0);
}
- if (igb->unicst_addr[slot].mac.set == 0) {
- mutex_exit(&igb->gen_lock);
- return (EINVAL);
+ E1000_WRITE_FLUSH(hw);
+
+ return (0);
+}
+
+/*
+ * Get the global ring index for a given ring index within a group.
+ */
+int
+igb_get_rx_ring_index(igb_t *igb, int gindex, int rindex)
+{
+ igb_rx_ring_t *rx_ring;
+ int i;
+
+ for (i = 0; i < igb->num_rx_rings; i++) {
+ rx_ring = &igb->rx_rings[i];
+ if (rx_ring->group_index == gindex)
+ rindex--;
+ if (rindex < 0)
+ return (i);
}
- err = igb_unicst_set(igb, maddr->mma_addr, slot);
+ return (-1);
+}
- mutex_exit(&igb->gen_lock);
+static int
+igb_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+{
+ igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)rh;
- return (err);
+ mutex_enter(&rx_ring->rx_lock);
+ rx_ring->ring_gen_num = mr_gen_num;
+ mutex_exit(&rx_ring->rx_lock);
+ return (0);
}
/*
- * Get the MAC address and all other information related to
- * the address slot passed in mac_multi_addr_t.
- * mma_flags should be set to 0 in the call.
- * On return, mma_flags can take the following values:
- * 1) MMAC_SLOT_UNUSED
- * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR
- * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR
- * 4) MMAC_SLOT_USED
+ * Callback function for the MAC layer to register all rings.
*/
-int
-igb_m_unicst_get(void *arg, mac_multi_addr_t *maddr)
+/* ARGSUSED */
+void
+igb_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
+ const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
{
igb_t *igb = (igb_t *)arg;
- mac_addr_slot_t slot;
+ mac_intr_t *mintr = &infop->mri_intr;
- mutex_enter(&igb->gen_lock);
+ switch (rtype) {
+ case MAC_RING_TYPE_RX: {
+ igb_rx_ring_t *rx_ring;
+ int global_index;
- if (igb->igb_state & IGB_SUSPENDED) {
- mutex_exit(&igb->gen_lock);
- return (ECANCELED);
- }
+ /*
+ * 'index' is the ring index within the group.
+	 * We need to find the global ring index by searching the group.
+ */
+ global_index = igb_get_rx_ring_index(igb, rg_index, index);
- slot = maddr->mma_slot;
+ ASSERT(global_index >= 0);
- if ((slot <= 0) || (slot >= igb->unicst_total)) {
- mutex_exit(&igb->gen_lock);
- return (EINVAL);
+ rx_ring = &igb->rx_rings[global_index];
+ rx_ring->ring_handle = rh;
+
+ infop->mri_driver = (mac_ring_driver_t)rx_ring;
+ infop->mri_start = igb_ring_start;
+ infop->mri_stop = NULL;
+ infop->mri_poll = (mac_ring_poll_t)igb_rx_ring_poll;
+
+ mintr->mi_handle = (mac_intr_handle_t)rx_ring;
+ mintr->mi_enable = igb_rx_ring_intr_enable;
+ mintr->mi_disable = igb_rx_ring_intr_disable;
+
+ break;
}
+ case MAC_RING_TYPE_TX: {
+ ASSERT(index < igb->num_tx_rings);
- if (igb->unicst_addr[slot].mac.set == 1) {
- bcopy(igb->unicst_addr[slot].mac.addr,
- maddr->mma_addr, ETHERADDRL);
- maddr->mma_flags = MMAC_SLOT_USED;
- } else {
- maddr->mma_flags = MMAC_SLOT_UNUSED;
+ igb_tx_ring_t *tx_ring = &igb->tx_rings[index];
+ tx_ring->ring_handle = rh;
+
+ infop->mri_driver = (mac_ring_driver_t)tx_ring;
+ infop->mri_start = NULL;
+ infop->mri_stop = NULL;
+ infop->mri_tx = igb_tx_ring_send;
+
+ break;
}
- mutex_exit(&igb->gen_lock);
+ default:
+ break;
+ }
+}
- return (0);
+void
+igb_fill_group(void *arg, mac_ring_type_t rtype, const int index,
+ mac_group_info_t *infop, mac_group_handle_t gh)
+{
+ igb_t *igb = (igb_t *)arg;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX: {
+ igb_rx_group_t *rx_group;
+
+ ASSERT((index >= 0) && (index < igb->num_rx_groups));
+
+ rx_group = &igb->rx_groups[index];
+ rx_group->group_handle = gh;
+
+ infop->mgi_driver = (mac_group_driver_t)rx_group;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = igb_addmac;
+ infop->mgi_remmac = igb_remmac;
+ infop->mgi_count = (igb->num_rx_rings / igb->num_rx_groups);
+
+ break;
+ }
+ case MAC_RING_TYPE_TX:
+ break;
+ default:
+ break;
+ }
}
/*
@@ -863,27 +926,34 @@ igb_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
*tx_hcksum_flags = HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM;
break;
}
- case MAC_CAPAB_MULTIADDRESS: {
- multiaddress_capab_t *mmacp = cap_data;
+ case MAC_CAPAB_RINGS: {
+ mac_capab_rings_t *cap_rings = cap_data;
+
+ switch (cap_rings->mr_type) {
+ case MAC_RING_TYPE_RX:
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_rnum = igb->num_rx_rings;
+ cap_rings->mr_gnum = igb->num_rx_groups;
+ cap_rings->mr_rget = igb_fill_ring;
+ cap_rings->mr_gget = igb_fill_group;
+ cap_rings->mr_gaddring = NULL;
+ cap_rings->mr_gremring = NULL;
- /*
- * The number of MAC addresses made available by
- * this capability is one less than the total as
- * the primary address in slot 0 is counted in
- * the total.
- */
- mmacp->maddr_naddr = igb->unicst_total - 1;
- mmacp->maddr_naddrfree = igb->unicst_avail;
- /* No multiple factory addresses, set mma_flag to 0 */
- mmacp->maddr_flag = 0;
- mmacp->maddr_handle = igb;
- mmacp->maddr_add = igb_m_unicst_add;
- mmacp->maddr_remove = igb_m_unicst_remove;
- mmacp->maddr_modify = igb_m_unicst_modify;
- mmacp->maddr_get = igb_m_unicst_get;
- mmacp->maddr_reserve = NULL;
+ break;
+ case MAC_RING_TYPE_TX:
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_rnum = igb->num_tx_rings;
+ cap_rings->mr_gnum = 0;
+ cap_rings->mr_rget = igb_fill_ring;
+ cap_rings->mr_gget = NULL;
+
+ break;
+ default:
+ break;
+ }
break;
}
+
default:
return (B_FALSE);
}
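For orientation, the rings capability filled in above is consumed by the MAC layer rather than by the driver itself. The fragment below is an assumed sketch of that framework-side flow (not code from this patch); the callback signatures match igb_fill_group() and igb_fill_ring() as defined earlier:

	/*
	 * Assumed MAC-layer flow: query the capability, then walk the
	 * groups and rings through the supplied callbacks.
	 */
	mac_capab_rings_t cap;
	mac_group_info_t ginfo;
	mac_group_handle_t gh = NULL;	/* framework-allocated in reality */
	int g;

	cap.mr_type = MAC_RING_TYPE_RX;
	if (igb_m_getcapab(igb, MAC_CAPAB_RINGS, &cap)) {
		for (g = 0; g < cap.mr_gnum; g++)
			cap.mr_gget(igb, MAC_RING_TYPE_RX, g, &ginfo, gh);
		/* cap.mr_rget() is then called per ring within each group */
	}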
diff --git a/usr/src/uts/common/io/igb/igb_hw.h b/usr/src/uts/common/io/igb/igb_hw.h
index 814b0c09fb..04c410d7d1 100644
--- a/usr/src/uts/common/io/igb/igb_hw.h
+++ b/usr/src/uts/common/io/igb/igb_hw.h
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,8 +20,12 @@
*/
/*
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
+ */
+
+/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Use is subject to license terms.
*/
/* IntelVersion: 1.357 v2007-12-10_dragonlake5 */
@@ -31,8 +33,6 @@
#ifndef _IGB_HW_H
#define _IGB_HW_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -620,6 +620,9 @@ s32 e1000_read_pcie_cap_reg(struct e1000_hw *hw, u32 reg, u16 *value);
void e1000_free_dev_spec_struct(struct e1000_hw *hw);
void e1000_read_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value);
void e1000_write_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value);
+void e1000_rar_clear(struct e1000_hw *hw, uint32_t);
+void e1000_rar_set_vmdq(struct e1000_hw *hw, const uint8_t *, uint32_t,
+ uint32_t, uint8_t);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/io/igb/igb_main.c b/usr/src/uts/common/io/igb/igb_main.c
index 18a7050e7e..ed475f0014 100644
--- a/usr/src/uts/common/io/igb/igb_main.c
+++ b/usr/src/uts/common/io/igb/igb_main.c
@@ -60,6 +60,8 @@ static void igb_setup_tx(igb_t *);
static void igb_setup_rx_ring(igb_rx_ring_t *);
static void igb_setup_tx_ring(igb_tx_ring_t *);
static void igb_setup_rss(igb_t *);
+static void igb_setup_mac_rss_classify(igb_t *);
+static void igb_setup_mac_classify(igb_t *);
static void igb_init_unicst(igb_t *);
static void igb_setup_multicst(igb_t *);
static void igb_get_phy_state(igb_t *);
@@ -93,10 +95,11 @@ static void igb_setup_adapter_msix(igb_t *);
static uint_t igb_intr_legacy(void *, void *);
static uint_t igb_intr_msi(void *, void *);
static uint_t igb_intr_rx(void *, void *);
+static uint_t igb_intr_tx(void *, void *);
static uint_t igb_intr_tx_other(void *, void *);
static void igb_intr_rx_work(igb_rx_ring_t *);
static void igb_intr_tx_work(igb_tx_ring_t *);
-static void igb_intr_other_work(igb_t *);
+static void igb_intr_link_work(igb_t *);
static void igb_get_driver_control(struct e1000_hw *);
static void igb_release_driver_control(struct e1000_hw *);
@@ -175,14 +178,12 @@ static mac_callbacks_t igb_m_callbacks = {
igb_m_stop,
igb_m_promisc,
igb_m_multicst,
- igb_m_unicst,
- igb_m_tx,
+ NULL,
NULL,
igb_m_ioctl,
igb_m_getcapab
};
-
/*
* Module Initialization Functions
*/
@@ -339,7 +340,7 @@ igb_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
* interrupts are allocated.
*/
if (igb_alloc_rings(igb) != IGB_SUCCESS) {
- igb_error(igb, "Failed to allocate rx and tx rings");
+ igb_error(igb, "Failed to allocate rx/tx rings or groups");
goto attach_fail;
}
igb->attach_progress |= ATTACH_PROGRESS_ALLOC_RINGS;
@@ -378,10 +379,13 @@ igb_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
/*
* Initialize chipset hardware
*/
+ mutex_enter(&igb->gen_lock);
if (igb_init(igb) != IGB_SUCCESS) {
+ mutex_exit(&igb->gen_lock);
igb_error(igb, "Failed to initialize adapter");
goto attach_fail;
}
+ mutex_exit(&igb->gen_lock);
igb->attach_progress |= ATTACH_PROGRESS_INIT;
/*
@@ -710,6 +714,7 @@ igb_register_mac(igb_t *igb)
mac->m_max_sdu = igb->max_frame_size -
sizeof (struct ether_vlan_header) - ETHERFCSL;
mac->m_margin = VLAN_TAGSZ;
+ mac->m_v12n = MAC_VIRT_LEVEL1;
status = mac_register(mac, &igb->mac_hdl);
@@ -1019,7 +1024,7 @@ igb_init(igb_t *igb)
uint32_t pba;
uint32_t high_water;
- mutex_enter(&igb->gen_lock);
+ ASSERT(mutex_owned(&igb->gen_lock));
/*
* Reset chipset to put the hardware in a known state
@@ -1121,7 +1126,6 @@ igb_init(igb_t *igb)
goto init_fail;
}
- mutex_exit(&igb->gen_lock);
return (IGB_SUCCESS);
init_fail:
@@ -1131,8 +1135,6 @@ init_fail:
if (e1000_check_reset_block(hw) == E1000_SUCCESS)
(void) e1000_phy_hw_reset(hw);
- mutex_exit(&igb->gen_lock);
-
ddi_fm_service_impact(igb->dip, DDI_SERVICE_LOST);
return (IGB_FAILURE);
@@ -1541,9 +1543,12 @@ igb_start(igb_t *igb)
/*
* Start the chipset hardware
*/
- if (igb_chip_start(igb) != IGB_SUCCESS) {
- igb_fm_ereport(igb, DDI_FM_DEVICE_INVAL_STATE);
- goto start_failure;
+ if (!(igb->attach_progress & ATTACH_PROGRESS_INIT)) {
+ if (igb_init(igb) != IGB_SUCCESS) {
+ igb_fm_ereport(igb, DDI_FM_DEVICE_INVAL_STATE);
+ goto start_failure;
+ }
+ igb->attach_progress |= ATTACH_PROGRESS_INIT;
}
/*
@@ -1591,6 +1596,8 @@ igb_stop(igb_t *igb)
ASSERT(mutex_owned(&igb->gen_lock));
+	igb->attach_progress &= ~ATTACH_PROGRESS_INIT;
+
/*
* Disable the adapter interrupts
*/
@@ -1656,6 +1663,23 @@ igb_alloc_rings(igb_t *igb)
return (IGB_FAILURE);
}
+ /*
+ * Allocate memory space for rx ring groups
+ */
+ igb->rx_groups = kmem_zalloc(
+ sizeof (igb_rx_group_t) * igb->num_rx_groups,
+ KM_NOSLEEP);
+
+ if (igb->rx_groups == NULL) {
+ kmem_free(igb->rx_rings,
+ sizeof (igb_rx_ring_t) * igb->num_rx_rings);
+ kmem_free(igb->tx_rings,
+ sizeof (igb_tx_ring_t) * igb->num_tx_rings);
+ igb->rx_rings = NULL;
+ igb->tx_rings = NULL;
+ return (IGB_FAILURE);
+ }
+
return (IGB_SUCCESS);
}
@@ -1676,6 +1700,12 @@ igb_free_rings(igb_t *igb)
sizeof (igb_tx_ring_t) * igb->num_tx_rings);
igb->tx_rings = NULL;
}
+
+ if (igb->rx_groups != NULL) {
+ kmem_free(igb->rx_groups,
+ sizeof (igb_rx_group_t) * igb->num_rx_groups);
+ igb->rx_groups = NULL;
+ }
}
/*
@@ -1782,8 +1812,10 @@ static void
igb_setup_rx(igb_t *igb)
{
igb_rx_ring_t *rx_ring;
+ igb_rx_group_t *rx_group;
struct e1000_hw *hw = &igb->hw;
uint32_t reg_val;
+ uint32_t ring_per_group;
int i;
/*
@@ -1804,12 +1836,24 @@ igb_setup_rx(igb_t *igb)
E1000_WRITE_REG(hw, E1000_RCTL, reg_val);
+ for (i = 0; i < igb->num_rx_groups; i++) {
+ rx_group = &igb->rx_groups[i];
+ rx_group->index = i;
+ rx_group->igb = igb;
+ }
+
/*
* igb_setup_rx_ring must be called after configuring RCTL
*/
+ ring_per_group = igb->num_rx_rings / igb->num_rx_groups;
for (i = 0; i < igb->num_rx_rings; i++) {
rx_ring = &igb->rx_rings[i];
igb_setup_rx_ring(rx_ring);
+
+ /*
+ * Map a ring to a group by assigning a group index
+ */
+ rx_ring->group_index = i / ring_per_group;
}
/*
@@ -1829,10 +1873,32 @@ igb_setup_rx(igb_t *igb)
}
/*
- * Setup RSS for multiple receive queues
+ * Setup classify and RSS for multiple receive queues
*/
- if (igb->num_rx_rings > 1)
- igb_setup_rss(igb);
+ switch (igb->vmdq_mode) {
+ case E1000_VMDQ_OFF:
+ /*
+ * One ring group, only RSS is needed when more than
+ * one ring enabled.
+ */
+ if (igb->num_rx_rings > 1)
+ igb_setup_rss(igb);
+ break;
+ case E1000_VMDQ_MAC:
+ /*
+ * Multiple groups, each group has one ring,
+ * only the MAC classification is needed.
+ */
+ igb_setup_mac_classify(igb);
+ break;
+ case E1000_VMDQ_MAC_RSS:
+ /*
+ * Multiple groups and multiple rings, both
+ * MAC classification and RSS are needed.
+ */
+ igb_setup_mac_rss_classify(igb);
+ break;
+ }
}
static void
@@ -1848,6 +1914,7 @@ igb_setup_tx_ring(igb_tx_ring_t *tx_ring)
ASSERT(mutex_owned(&tx_ring->tx_lock));
ASSERT(mutex_owned(&igb->gen_lock));
+
/*
* Initialize the length register
*/
@@ -1922,6 +1989,14 @@ igb_setup_tx_ring(igb_tx_ring_t *tx_ring)
}
/*
+	 * Enable the specific tx ring; this is required by the multiple
+	 * tx ring support.
+ */
+ reg_val = E1000_READ_REG(hw, E1000_TXDCTL(tx_ring->index));
+ reg_val |= E1000_TXDCTL_QUEUE_ENABLE;
+ E1000_WRITE_REG(hw, E1000_TXDCTL(tx_ring->index), reg_val);
+
+ /*
* Initialize hardware checksum offload settings
*/
tx_ring->hcksum_context.hcksum_flags = 0;
@@ -2036,6 +2111,117 @@ igb_setup_rss(igb_t *igb)
}
/*
+ * igb_setup_mac_rss_classify - Setup MAC classification and rss
+ */
+static void
+igb_setup_mac_rss_classify(igb_t *igb)
+{
+ struct e1000_hw *hw = &igb->hw;
+ uint32_t i, mrqc, vmdctl, rxcsum;
+ uint32_t ring_per_group;
+ int shift_group0, shift_group1;
+ uint32_t random;
+ union e1000_reta {
+ uint32_t dword;
+ uint8_t bytes[4];
+ } reta;
+
+ ring_per_group = igb->num_rx_rings / igb->num_rx_groups;
+
+ /* Setup the Redirection Table, it is shared between two groups */
+ shift_group0 = 2;
+ shift_group1 = 6;
+ for (i = 0; i < (32 * 4); i++) {
+ reta.bytes[i & 3] = ((i % ring_per_group) << shift_group0) |
+ ((ring_per_group + (i % ring_per_group)) << shift_group1);
+ if ((i & 3) == 3) {
+ E1000_WRITE_REG(hw,
+ (E1000_RETA(0) + (i & ~3)), reta.dword);
+ }
+ }
+
+ /* Fill out hash function seeds */
+ for (i = 0; i < 10; i++) {
+ (void) random_get_pseudo_bytes((uint8_t *)&random,
+ sizeof (uint32_t));
+ E1000_WRITE_REG(hw, E1000_RSSRK(i), random);
+ }
+
+ /*
+ * Setup the Multiple Receive Queue Control register,
+ * enable VMDq based on packet destination MAC address and RSS.
+ */
+ mrqc = E1000_MRQC_ENABLE_VMDQ_MAC_RSS_GROUP;
+ mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 |
+ E1000_MRQC_RSS_FIELD_IPV4_TCP |
+ E1000_MRQC_RSS_FIELD_IPV6 |
+ E1000_MRQC_RSS_FIELD_IPV6_TCP |
+ E1000_MRQC_RSS_FIELD_IPV4_UDP |
+ E1000_MRQC_RSS_FIELD_IPV6_UDP |
+ E1000_MRQC_RSS_FIELD_IPV6_UDP_EX |
+ E1000_MRQC_RSS_FIELD_IPV6_TCP_EX);
+
+ E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
+
+ /* Define the default group and default queues */
+ vmdctl = E1000_VMDQ_MAC_GROUP_DEFAULT_QUEUE;
+ E1000_WRITE_REG(hw, E1000_VMD_CTL, vmdctl);
+
+ /*
+ * Disable Packet Checksum to enable RSS for multiple receive queues.
+ *
+ * The Packet Checksum is not ethernet CRC. It is another kind of
+ * checksum offloading provided by the 82575 chipset besides the IP
+ * header checksum offloading and the TCP/UDP checksum offloading.
+ * The Packet Checksum is by default computed over the entire packet
+ * from the first byte of the DA through the last byte of the CRC,
+ * including the Ethernet and IP headers.
+ *
+ * It is a hardware limitation that Packet Checksum is mutually
+ * exclusive with RSS.
+ */
+ rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
+ rxcsum |= E1000_RXCSUM_PCSD;
+ E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
+}
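The redirection-table loop above packs two queue indices into each byte: group 0's target queue in bits 2-3 (shift_group0) and group 1's in bits 6-7 (shift_group1). A small standalone sketch, assuming two rings per group, reproduces the encoding:

	/* Standalone sketch of the RETA byte encoding used above. */
	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint32_t ring_per_group = 2;	/* assumed: 2 groups x 2 rings */
		uint32_t i;

		for (i = 0; i < 4; i++) {
			/* group 0's queue in bits 2-3, group 1's in bits 6-7 */
			uint8_t b = ((i % ring_per_group) << 2) |
			    ((ring_per_group + (i % ring_per_group)) << 6);
			(void) printf("reta[%u] = 0x%02x\n", (unsigned)i, b);
		}
		return (0);	/* prints 0x80, 0xc4, 0x80, 0xc4 */
	}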
+
+/*
+ * igb_setup_mac_classify - Setup MAC classification feature
+ */
+static void
+igb_setup_mac_classify(igb_t *igb)
+{
+ struct e1000_hw *hw = &igb->hw;
+ uint32_t mrqc, rxcsum;
+
+ /*
+ * Setup the Multiple Receive Queue Control register,
+ * enable VMDq based on packet destination MAC address.
+ */
+ mrqc = E1000_MRQC_ENABLE_VMDQ_MAC_GROUP;
+ E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
+
+ /*
+ * Disable Packet Checksum to enable RSS for multiple receive queues.
+ *
+ * The Packet Checksum is not ethernet CRC. It is another kind of
+ * checksum offloading provided by the 82575 chipset besides the IP
+ * header checksum offloading and the TCP/UDP checksum offloading.
+ * The Packet Checksum is by default computed over the entire packet
+ * from the first byte of the DA through the last byte of the CRC,
+ * including the Ethernet and IP headers.
+ *
+ * It is a hardware limitation that Packet Checksum is mutually
+ * exclusive with RSS.
+ */
+ rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
+ rxcsum |= E1000_RXCSUM_PCSD;
+ E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
+}
+
+/*
* igb_init_unicst - Initialize the unicast addresses
*/
static void
@@ -2049,41 +2235,39 @@ igb_init_unicst(igb_t *igb)
*
* 1. Chipset is initialized the first time
* Initialize the multiple unicast addresses, and
- * save the default mac address.
+ * save the default MAC address.
*
* 2. Chipset is reset
* Recover the multiple unicast addresses from the
* software data structure to the RAR registers.
*/
- if (!igb->unicst_init) {
- /* Initialize the multiple unicast addresses */
- igb->unicst_total = MAX_NUM_UNICAST_ADDRESSES;
- igb->unicst_avail = igb->unicst_total - 1;
+ /*
+	 * Clear the default MAC address in the RAR0 register, which
+	 * is loaded from the EEPROM at system boot or chip reset;
+	 * otherwise it conflicts with the add_mac/rem_mac entry points
+	 * when VMDq is enabled. For this reason, RAR0 must be cleared
+	 * in both of the cases mentioned above.
+ */
+ e1000_rar_clear(hw, 0);
- /* Store the default mac address */
- e1000_rar_set(hw, hw->mac.addr, 0);
+ if (!igb->unicst_init) {
- bcopy(hw->mac.addr, igb->unicst_addr[0].mac.addr,
- ETHERADDRL);
- igb->unicst_addr[0].mac.set = 1;
+ /* Initialize the multiple unicast addresses */
+ igb->unicst_total = MAX_NUM_UNICAST_ADDRESSES;
+ igb->unicst_avail = igb->unicst_total;
- for (slot = 1; slot < igb->unicst_total; slot++)
+ for (slot = 0; slot < igb->unicst_total; slot++)
igb->unicst_addr[slot].mac.set = 0;
igb->unicst_init = B_TRUE;
} else {
- /* Recover the default mac address */
- bcopy(igb->unicst_addr[0].mac.addr, hw->mac.addr,
- ETHERADDRL);
-
- /* Store the default mac address */
- e1000_rar_set(hw, hw->mac.addr, 0);
-
/* Re-configure the RAR registers */
- for (slot = 1; slot < igb->unicst_total; slot++)
- e1000_rar_set(hw,
- igb->unicst_addr[slot].mac.addr, slot);
+ for (slot = 0; slot < igb->unicst_total; slot++) {
+ e1000_rar_set_vmdq(hw, igb->unicst_addr[slot].mac.addr,
+ slot, igb->vmdq_mode,
+ igb->unicst_addr[slot].mac.group_index);
+ }
}
if (igb_check_acc_handle(igb->osdep.reg_handle) != DDI_FM_OK)
@@ -2091,11 +2275,30 @@ igb_init_unicst(igb_t *igb)
}
/*
+ * igb_unicst_find - Find the slot for the specified unicast address
+ */
+int
+igb_unicst_find(igb_t *igb, const uint8_t *mac_addr)
+{
+ int slot;
+
+ ASSERT(mutex_owned(&igb->gen_lock));
+
+ for (slot = 0; slot < igb->unicst_total; slot++) {
+ if (bcmp(igb->unicst_addr[slot].mac.addr,
+ mac_addr, ETHERADDRL) == 0)
+ return (slot);
+ }
+
+ return (-1);
+}
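
igb_unicst_find() returns -1 when the address is not programmed, which makes it a natural building block for a rem_mac-style entry point. A hedged sketch of such a consumer, composed only of functions added in this patch (the function name and bookkeeping are illustrative, not the patch's actual entry point):

static int
igb_remmac_sketch(igb_t *igb, const uint8_t *mac_addr)
{
	int slot;

	ASSERT(mutex_owned(&igb->gen_lock));

	/* Locate the slot holding this address, if any. */
	slot = igb_unicst_find(igb, mac_addr);
	if (slot == -1)
		return (EINVAL);

	/* Release the slot and invalidate its RAR entry. */
	bzero(igb->unicst_addr[slot].mac.addr, ETHERADDRL);
	igb->unicst_addr[slot].mac.set = 0;
	igb->unicst_avail++;
	e1000_rar_clear(&igb->hw, slot);

	return (0);
}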
+
+/*
* igb_unicst_set - Set the unicast address to the specified slot
*/
int
igb_unicst_set(igb_t *igb, const uint8_t *mac_addr,
- mac_addr_slot_t slot)
+ int slot)
{
struct e1000_hw *hw = &igb->hw;
@@ -2232,6 +2435,8 @@ igb_get_conf(igb_t *igb)
struct e1000_hw *hw = &igb->hw;
uint32_t default_mtu;
uint32_t flow_control;
+ uint32_t ring_per_group;
+ int i;
/*
* igb driver supports the following user configurations:
@@ -2299,16 +2504,66 @@ igb_get_conf(igb_t *igb)
/*
* Multiple rings configurations
*/
- igb->num_tx_rings = igb_get_prop(igb, PROP_TX_QUEUE_NUM,
- MIN_TX_QUEUE_NUM, MAX_TX_QUEUE_NUM, DEFAULT_TX_QUEUE_NUM);
igb->tx_ring_size = igb_get_prop(igb, PROP_TX_RING_SIZE,
MIN_TX_RING_SIZE, MAX_TX_RING_SIZE, DEFAULT_TX_RING_SIZE);
-
- igb->num_rx_rings = igb_get_prop(igb, PROP_RX_QUEUE_NUM,
- MIN_RX_QUEUE_NUM, MAX_RX_QUEUE_NUM, DEFAULT_RX_QUEUE_NUM);
igb->rx_ring_size = igb_get_prop(igb, PROP_RX_RING_SIZE,
MIN_RX_RING_SIZE, MAX_RX_RING_SIZE, DEFAULT_RX_RING_SIZE);
+ igb->mr_enable = igb_get_prop(igb, PROP_MR_ENABLE, 0, 1, 1);
+ igb->num_rx_groups = igb_get_prop(igb, PROP_RX_GROUP_NUM,
+ MIN_RX_GROUP_NUM, MAX_RX_GROUP_NUM, DEFAULT_RX_GROUP_NUM);
+
+ if (igb->mr_enable) {
+ igb->num_tx_rings = DEFAULT_TX_QUEUE_NUM;
+ igb->num_rx_rings = DEFAULT_RX_QUEUE_NUM;
+ } else {
+ igb->num_tx_rings = 1;
+ igb->num_rx_rings = 1;
+
+ if (igb->num_rx_groups > 1) {
+ igb_error(igb,
+ "Invalid rx groups number. Please enable multiple "
+ "rings first");
+ igb->num_rx_groups = 1;
+ }
+ }
+
+ /*
+ * Check the divisibility between rx rings and rx groups.
+ */
+ for (i = igb->num_rx_groups; i > 0; i--) {
+ if ((igb->num_rx_rings % i) == 0)
+ break;
+ }
+ if (i != igb->num_rx_groups) {
+ igb_error(igb,
+ "Invalid rx groups number. Downgrade the rx group "
+ "number to %d.", i);
+ igb->num_rx_groups = i;
+ }
+
+ /*
+ * Get the ring number per group.
+ */
+ ring_per_group = igb->num_rx_rings / igb->num_rx_groups;
+
+ if (igb->num_rx_groups == 1) {
+ /*
+ * One rx ring group, the rx ring number is num_rx_rings.
+ */
+ igb->vmdq_mode = E1000_VMDQ_OFF;
+ } else if (ring_per_group == 1) {
+ /*
+ * Multiple rx groups, each group has one rx ring.
+ */
+ igb->vmdq_mode = E1000_VMDQ_MAC;
+ } else {
+ /*
+ * Multiple groups and multiple rings.
+ */
+ igb->vmdq_mode = E1000_VMDQ_MAC_RSS;
+ }
+
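The selection above reduces to a small decision table; a sketch restating it as a pure function (the E1000_VMDQ_* values match the defines added to igb_osdep.h later in this patch):

#include <stdint.h>

#define	E1000_VMDQ_OFF		0
#define	E1000_VMDQ_MAC		1
#define	E1000_VMDQ_MAC_RSS	2

/*
 * Sketch: derive the VMDq mode once the group count has been
 * downgraded to an even divisor of the ring count, as the loop
 * above guarantees.
 */
static uint32_t
vmdq_mode_for(uint32_t num_rx_rings, uint32_t num_rx_groups)
{
	uint32_t ring_per_group = num_rx_rings / num_rx_groups;

	if (num_rx_groups == 1)
		return (E1000_VMDQ_OFF);	/* RSS fanout only */
	if (ring_per_group == 1)
		return (E1000_VMDQ_MAC);	/* MAC classification only */
	return (E1000_VMDQ_MAC_RSS);		/* MAC groups + RSS */
}
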
/*
* Tunable used to force an interrupt type. The only use is
* for testing of the lesser interrupt types.
@@ -2861,6 +3116,7 @@ igb_enable_adapter_interrupts(igb_t *igb)
/* Interrupt enabling for MSI-X */
E1000_WRITE_REG(hw, E1000_EIMS, igb->eims_mask);
E1000_WRITE_REG(hw, E1000_EIAC, igb->eims_mask);
+ igb->ims_mask = E1000_IMS_LSC;
E1000_WRITE_REG(hw, E1000_IMS, E1000_IMS_LSC);
/* Enable MSI-X PBA support */
@@ -2873,6 +3129,7 @@ igb_enable_adapter_interrupts(igb_t *igb)
E1000_WRITE_REG(hw, E1000_CTRL_EXT, reg);
} else {
/* Interrupt enabling for MSI and legacy */
+ igb->ims_mask = IMS_ENABLE_MASK;
E1000_WRITE_REG(hw, E1000_IMS, IMS_ENABLE_MASK);
}
@@ -3176,11 +3433,12 @@ igb_intr_rx_work(igb_rx_ring_t *rx_ring)
mblk_t *mp;
mutex_enter(&rx_ring->rx_lock);
- mp = igb_rx(rx_ring);
+ mp = igb_rx(rx_ring, IGB_NO_POLL);
mutex_exit(&rx_ring->rx_lock);
if (mp != NULL)
- mac_rx(rx_ring->igb->mac_hdl, NULL, mp);
+ mac_rx_ring(rx_ring->igb->mac_hdl, rx_ring->ring_handle, mp,
+ rx_ring->ring_gen_num);
}
#pragma inline(igb_intr_tx_work)
@@ -3197,17 +3455,17 @@ igb_intr_tx_work(igb_tx_ring_t *tx_ring)
if (tx_ring->reschedule &&
(tx_ring->tbd_free >= tx_ring->resched_thresh)) {
tx_ring->reschedule = B_FALSE;
- mac_tx_update(tx_ring->igb->mac_hdl);
+ mac_tx_ring_update(tx_ring->igb->mac_hdl, tx_ring->ring_handle);
IGB_DEBUG_STAT(tx_ring->stat_reschedule);
}
}
-#pragma inline(igb_intr_other_work)
+#pragma inline(igb_intr_link_work)
/*
- * igb_intr_other_work - other processing of ISR
+ * igb_intr_link_work - link-status-change processing of ISR
*/
static void
-igb_intr_other_work(igb_t *igb)
+igb_intr_link_work(igb_t *igb)
{
boolean_t link_changed;
@@ -3273,7 +3531,7 @@ igb_intr_legacy(void *arg1, void *arg2)
ASSERT(igb->num_tx_rings == 1);
if (icr & E1000_ICR_RXT0) {
- mp = igb_rx(&igb->rx_rings[0]);
+ mp = igb_rx(&igb->rx_rings[0], IGB_NO_POLL);
}
if (icr & E1000_ICR_TXDW) {
@@ -3320,7 +3578,7 @@ igb_intr_legacy(void *arg1, void *arg2)
if (tx_reschedule) {
tx_ring->reschedule = B_FALSE;
- mac_tx_update(igb->mac_hdl);
+ mac_tx_ring_update(igb->mac_hdl, tx_ring->ring_handle);
IGB_DEBUG_STAT(tx_ring->stat_reschedule);
}
@@ -3359,7 +3617,7 @@ igb_intr_msi(void *arg1, void *arg2)
}
if (icr & E1000_ICR_LSC) {
- igb_intr_other_work(igb);
+ igb_intr_link_work(igb);
}
return (DDI_INTR_CLAIMED);
@@ -3385,10 +3643,27 @@ igb_intr_rx(void *arg1, void *arg2)
}
/*
+ * igb_intr_tx - Interrupt handler for tx
+ */
+static uint_t
+igb_intr_tx(void *arg1, void *arg2)
+{
+ igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg1;
+
+ _NOTE(ARGUNUSED(arg2));
+
+ /*
+	 * This handler is only used with a dedicated MSI-X vector,
+	 * so don't check the cause bits; just clean the given ring.
+ */
+ igb_intr_tx_work(tx_ring);
+
+ return (DDI_INTR_CLAIMED);
+}
+
+/*
* igb_intr_tx_other - Interrupt handler for both tx and other
*
- * Always look for Tx cleanup work. Only look for other work if the right
- * bits are set in the Interrupt Cause Register.
*/
static uint_t
igb_intr_tx_other(void *arg1, void *arg2)
@@ -3401,17 +3676,18 @@ igb_intr_tx_other(void *arg1, void *arg2)
icr = E1000_READ_REG(&igb->hw, E1000_ICR);
/*
- * Always look for Tx cleanup work. We don't have separate
- * transmit vectors, so we have only one tx ring enabled.
+	 * Look for tx reclaiming work first. Remember that in the
+	 * interrupt-sharing case only one tx ring is used.
*/
- ASSERT(igb->num_tx_rings == 1);
igb_intr_tx_work(&igb->tx_rings[0]);
/*
- * Check for "other" causes.
+	 * Check the cause bits; only a link-status change is
+	 * processed here.
*/
if (icr & E1000_ICR_LSC) {
- igb_intr_other_work(igb);
+ igb_intr_link_work(igb);
}
return (DDI_INTR_CLAIMED);
@@ -3504,23 +3780,12 @@ static int
igb_alloc_intr_handles(igb_t *igb, int intr_type)
{
dev_info_t *devinfo;
- int request, count, avail, actual;
- int rx_rings, minimum;
+ int orig, request, count, avail, actual;
+ int diff, minimum;
int rc;
devinfo = igb->dip;
- /*
- * Currently only 1 tx ring is supported. More tx rings
- * will be supported with future enhancement.
- */
- if (igb->num_tx_rings > 1) {
- igb->num_tx_rings = 1;
- igb_log(igb,
- "Use only 1 MSI-X vector for tx, "
- "force tx queue number to 1");
- }
-
switch (intr_type) {
case DDI_INTR_TYPE_FIXED:
request = 1; /* Request 1 legacy interrupt handle */
@@ -3536,12 +3801,12 @@ igb_alloc_intr_handles(igb_t *igb, int intr_type)
case DDI_INTR_TYPE_MSIX:
/*
- * Best number of vectors for the adapter is
- * # rx rings + # tx rings + 1 for other
- * But currently we only support number of vectors of
- * # rx rings + 1 for tx & other
+		 * The number of vectors requested for the adapter is
+		 * # rx rings + # tx rings. One of the tx vectors also
+		 * handles the "other" causes.
*/
- request = igb->num_rx_rings + 1;
+ request = igb->num_rx_rings + igb->num_tx_rings;
+ orig = request;
minimum = 2;
IGB_DEBUGLOG_0(igb, "interrupt type: MSI-X");
break;
@@ -3613,15 +3878,24 @@ igb_alloc_intr_handles(igb_t *igb, int intr_type)
}
/*
- * For MSI-X, actual might force us to reduce number of rx rings
+ * For MSI-X, actual might force us to reduce number of tx & rx rings
*/
- if (intr_type == DDI_INTR_TYPE_MSIX) {
- rx_rings = actual - 1;
- if (rx_rings < igb->num_rx_rings) {
+ if ((intr_type == DDI_INTR_TYPE_MSIX) && (orig > actual)) {
+ diff = orig - actual;
+ if (diff < igb->num_tx_rings) {
+ igb_log(igb,
+ "MSI-X vectors force Tx queue number to %d",
+ igb->num_tx_rings - diff);
+ igb->num_tx_rings -= diff;
+ } else {
+ igb_log(igb,
+ "MSI-X vectors force Tx queue number to 1");
+ igb->num_tx_rings = 1;
+
igb_log(igb,
"MSI-X vectors force Rx queue number to %d",
- rx_rings);
- igb->num_rx_rings = rx_rings;
+ actual - 1);
+ igb->num_rx_rings = actual - 1;
}
}
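
The arithmetic above sheds tx vectors first and only falls back to a single tx ring when that is not enough, handing the remaining vectors to rx. A minimal sketch of the same policy, assuming orig = rx rings + tx rings as requested above:

static void
msix_shortfall(int *num_tx, int *num_rx, int orig, int actual)
{
	int diff = orig - actual;

	if (diff <= 0)
		return;			/* all requested vectors granted */
	if (diff < *num_tx) {
		*num_tx -= diff;	/* shed tx vectors only */
	} else {
		*num_tx = 1;		/* tx ring 0 shares vector 0 */
		*num_rx = actual - 1;	/* remaining vectors go to rx */
	}
}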
@@ -3662,6 +3936,7 @@ static int
igb_add_intr_handlers(igb_t *igb)
{
igb_rx_ring_t *rx_ring;
+ igb_tx_ring_t *tx_ring;
int vector;
int rc;
int i;
@@ -3671,14 +3946,17 @@ igb_add_intr_handlers(igb_t *igb)
switch (igb->intr_type) {
case DDI_INTR_TYPE_MSIX:
/* Add interrupt handler for tx + other */
+ tx_ring = &igb->tx_rings[0];
rc = ddi_intr_add_handler(igb->htable[vector],
(ddi_intr_handler_t *)igb_intr_tx_other,
(void *)igb, NULL);
+
if (rc != DDI_SUCCESS) {
igb_log(igb,
"Add tx/other interrupt handler failed: %d", rc);
return (IGB_FAILURE);
}
+ tx_ring->intr_vector = vector;
vector++;
/* Add interrupt handler for each rx ring */
@@ -3704,6 +3982,31 @@ igb_add_intr_handlers(igb_t *igb)
vector++;
}
+
+ /* Add interrupt handler for each tx ring from 2nd ring */
+ for (i = 1; i < igb->num_tx_rings; i++) {
+ tx_ring = &igb->tx_rings[i];
+
+ rc = ddi_intr_add_handler(igb->htable[vector],
+ (ddi_intr_handler_t *)igb_intr_tx,
+ (void *)tx_ring, NULL);
+
+ if (rc != DDI_SUCCESS) {
+ igb_log(igb,
+ "Add tx interrupt handler failed. "
+ "return: %d, tx ring: %d", rc, i);
+ for (vector--; vector >= 0; vector--) {
+ (void) ddi_intr_remove_handler(
+ igb->htable[vector]);
+ }
+ return (IGB_FAILURE);
+ }
+
+ tx_ring->intr_vector = vector;
+
+ vector++;
+ }
+
break;
case DDI_INTR_TYPE_MSI:
@@ -3764,14 +4067,14 @@ igb_setup_adapter_msix(igb_t *igb)
struct e1000_hw *hw = &igb->hw;
/*
- * Set vector for Tx + Other causes
- * NOTE assumption that there is only one of these and it is vector 0
+	 * Set the vector for tx ring 0 and other causes; NOTE the
+	 * assumption that it is vector 0.
*/
vector = 0;
+
igb->eims_mask = E1000_EICR_TX_QUEUE0 | E1000_EICR_OTHER;
E1000_WRITE_REG(hw, E1000_MSIXBM(vector), igb->eims_mask);
-
vector++;
+
for (i = 0; i < igb->num_rx_rings; i++) {
/*
* Set vector for each rx ring
@@ -3787,6 +4090,21 @@ igb_setup_adapter_msix(igb_t *igb)
vector++;
}
+ for (i = 1; i < igb->num_tx_rings; i++) {
+ /*
+ * Set vector for each tx ring from 2nd tx ring
+ */
+ eims = (E1000_EICR_TX_QUEUE0 << i);
+ E1000_WRITE_REG(hw, E1000_MSIXBM(vector), eims);
+
+ /*
+ * Accumulate bits to enable in igb_enable_adapter_interrupts()
+ */
+ igb->eims_mask |= eims;
+
+ vector++;
+ }
+
ASSERT(vector == igb->intr_cnt);
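
Each additional tx ring's bitmap entry is just E1000_EICR_TX_QUEUE0 shifted by the ring index, and every entry is also OR-ed into eims_mask for igb_enable_adapter_interrupts(). A sketch of the accumulated mask (the cause-bit values are passed as parameters since their definitions are outside this hunk):

static uint32_t
build_eims_sketch(int num_rx, int num_tx,
    uint32_t rxq0_bit, uint32_t txq0_bit, uint32_t other_bit)
{
	uint32_t mask = txq0_bit | other_bit;	/* vector 0: tx 0 + other */
	int i;

	for (i = 0; i < num_rx; i++)
		mask |= (rxq0_bit << i);	/* one rx ring per vector */
	for (i = 1; i < num_tx; i++)
		mask |= (txq0_bit << i);	/* extra tx rings */
	return (mask);
}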
/*
diff --git a/usr/src/uts/common/io/igb/igb_osdep.c b/usr/src/uts/common/io/igb/igb_osdep.c
index 9d03c05494..f915edd5ae 100644
--- a/usr/src/uts/common/io/igb/igb_osdep.c
+++ b/usr/src/uts/common/io/igb/igb_osdep.c
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,11 +20,13 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
#include "igb_osdep.h"
#include "igb_api.h"
@@ -114,3 +114,61 @@ e1000_enable_pciex_master(struct e1000_hw *hw)
ctrl &= ~E1000_CTRL_GIO_MASTER_DISABLE;
E1000_WRITE_REG(hw, E1000_CTRL, ctrl);
}
+
+/*
+ * e1000_rar_clear - Clear the RAR register at the given index
+ */
+void
+e1000_rar_clear(struct e1000_hw *hw, uint32_t index)
+{
+	uint32_t rar_high;
+
+	/* Mark the address invalid by clearing the Address Valid (AV) bit */
+ rar_high = ~E1000_RAH_AV;
+
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, ((index << 1) + 1), rar_high);
+ E1000_WRITE_FLUSH(hw);
+}
+
+/*
+ * e1000_rar_set_vmdq - Set the RAR registers for VMDq
+ */
+void
+e1000_rar_set_vmdq(struct e1000_hw *hw, const uint8_t *addr, uint32_t index,
+ uint32_t vmdq_mode, uint8_t qsel)
+{
+ uint32_t rar_low, rar_high;
+
+ /*
+	 * The NIC expects these in little-endian form, so reverse the
+	 * byte order from network order (big endian).
+ */
+
+ rar_low = ((uint32_t)addr[0] | ((uint32_t)addr[1] << 8) |
+ ((uint32_t)addr[2] << 16) | ((uint32_t)addr[3] << 24));
+
+ rar_high = ((uint32_t)addr[4] | ((uint32_t)addr[5] << 8));
+
+ /* Indicate to hardware the Address is Valid. */
+ rar_high |= E1000_RAH_AV;
+
+	/* Set the queue selector based on the VMDq mode */
+ switch (vmdq_mode) {
+ default:
+ case E1000_VMDQ_OFF:
+ break;
+ case E1000_VMDQ_MAC:
+ rar_high |= (qsel << 18);
+ break;
+ case E1000_VMDQ_MAC_RSS:
+ rar_high |= 1 << (18 + qsel);
+		break;
+	}
+
+ /* write to receive address registers */
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, (index << 1), rar_low);
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, ((index << 1) + 1), rar_high);
+ E1000_WRITE_FLUSH(hw);
+}
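
A standalone sketch of the packing done above: the six MAC bytes land little-endian in the RAL/RAH pair, with the Address Valid bit and, in E1000_VMDQ_MAC mode, the queue selector OR-ed into the high word. The RAH_AV value below is an assumption; the bit-18 selector position is taken from the code above:

#include <stdint.h>

#define	RAH_AV	0x80000000u	/* assumed value of E1000_RAH_AV */

static void
pack_rar_sketch(const uint8_t addr[6], uint8_t qsel,
    uint32_t *rar_low, uint32_t *rar_high)
{
	*rar_low = (uint32_t)addr[0] | ((uint32_t)addr[1] << 8) |
	    ((uint32_t)addr[2] << 16) | ((uint32_t)addr[3] << 24);
	*rar_high = (uint32_t)addr[4] | ((uint32_t)addr[5] << 8);
	*rar_high |= RAH_AV;			/* mark the entry valid */
	*rar_high |= (uint32_t)qsel << 18;	/* E1000_VMDQ_MAC selector */
}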
diff --git a/usr/src/uts/common/io/igb/igb_osdep.h b/usr/src/uts/common/io/igb/igb_osdep.h
index 42ba27a2e3..f56f320a1c 100644
--- a/usr/src/uts/common/io/igb/igb_osdep.h
+++ b/usr/src/uts/common/io/igb/igb_osdep.h
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,15 +20,17 @@
*/
/*
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
+ */
+
+/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Use is subject to license terms.
*/
#ifndef _IGB_OSDEP_H
#define _IGB_OSDEP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -96,6 +96,18 @@ extern "C" {
#define IEEE_ESR_1000X_HD_CAPS 0x4000 /* 1000X HD capable */
#define IEEE_ESR_1000X_FD_CAPS 0x8000 /* 1000X FD capable */
+/* VMDq MODE supported by hardware */
+#define E1000_VMDQ_OFF 0
+#define E1000_VMDQ_MAC 1
+#define E1000_VMDQ_MAC_RSS 2
+
+/* VMDq based on packet destination MAC address */
+#define E1000_MRQC_ENABLE_VMDQ_MAC_GROUP 0x00000003
+/* VMDq based on packet destination MAC address and RSS */
+#define E1000_MRQC_ENABLE_VMDQ_MAC_RSS_GROUP 0x00000005
+/* The default queue in each VMDq group */
+#define E1000_VMDQ_MAC_GROUP_DEFAULT_QUEUE 0x100
+
#define E1000_WRITE_FLUSH(a) (void) E1000_READ_REG(a, E1000_STATUS)
#define E1000_WRITE_REG(hw, reg, value) \
diff --git a/usr/src/uts/common/io/igb/igb_rx.c b/usr/src/uts/common/io/igb/igb_rx.c
index ec04dc6b8e..acf15ed35c 100644
--- a/usr/src/uts/common/io/igb/igb_rx.c
+++ b/usr/src/uts/common/io/igb/igb_rx.c
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,11 +20,13 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
#include "igb_sw.h"
@@ -251,6 +251,24 @@ igb_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error)
}
}
+mblk_t *
+igb_rx_ring_poll(void *arg, int bytes)
+{
+ igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)arg;
+ mblk_t *mp = NULL;
+
+ ASSERT(bytes >= 0);
+
+ if (bytes == 0)
+ return (mp);
+
+ mutex_enter(&rx_ring->rx_lock);
+ mp = igb_rx(rx_ring, bytes);
+ mutex_exit(&rx_ring->rx_lock);
+
+ return (mp);
+}
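
igb_rx_ring_poll() is the ring's polling entry: the byte budget replaces IGB_NO_POLL, and igb_rx() stops once the next frame would exceed it. A hedged usage sketch (in practice the mac layer calls this through the registered ring handle; the wrapper below is illustrative only):

static void
poll_once_sketch(igb_rx_ring_t *rx_ring, int budget)
{
	mblk_t *mp;

	/* Drain at most "budget" bytes from the ring. */
	mp = igb_rx_ring_poll(rx_ring, budget);

	if (mp != NULL)
		mac_rx_ring(rx_ring->igb->mac_hdl, rx_ring->ring_handle,
		    mp, rx_ring->ring_gen_num);
}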
+
/*
* igb_rx - Receive the data of one ring
*
@@ -260,7 +278,7 @@ igb_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error)
* passed up to mac_rx().
*/
mblk_t *
-igb_rx(igb_rx_ring_t *rx_ring)
+igb_rx(igb_rx_ring_t *rx_ring, int poll_bytes)
{
union e1000_adv_rx_desc *current_rbd;
rx_control_block_t *current_rcb;
@@ -272,6 +290,7 @@ igb_rx(igb_rx_ring_t *rx_ring)
uint32_t pkt_len;
uint32_t status_error;
uint32_t pkt_num;
+ uint32_t total_bytes;
igb_t *igb = rx_ring->igb;
mblk_head = NULL;
@@ -296,6 +315,7 @@ igb_rx(igb_rx_ring_t *rx_ring)
current_rbd = &rx_ring->rbd_ring[rx_next];
pkt_num = 0;
+ total_bytes = 0;
status_error = current_rbd->wb.upper.status_error;
while (status_error & E1000_RXD_STAT_DD) {
/*
@@ -315,6 +335,14 @@ igb_rx(igb_rx_ring_t *rx_ring)
(status_error & E1000_RXDEXT_STATERR_IPE));
pkt_len = current_rbd->wb.upper.length;
+
+ if ((poll_bytes != IGB_NO_POLL) &&
+ ((pkt_len + total_bytes) > poll_bytes))
+ break;
+
+ IGB_DEBUG_STAT(rx_ring->stat_pkt_cnt);
+ total_bytes += pkt_len;
+
mp = NULL;
/*
* For packets with length more than the copy threshold,
diff --git a/usr/src/uts/common/io/igb/igb_sw.h b/usr/src/uts/common/io/igb/igb_sw.h
index 457c929d1a..a69ba3bb77 100644
--- a/usr/src/uts/common/io/igb/igb_sw.h
+++ b/usr/src/uts/common/io/igb/igb_sw.h
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,15 +20,17 @@
*/
/*
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
+ */
+
+/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Use is subject to license terms.
*/
#ifndef _IGB_SW_H
#define _IGB_SW_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -48,7 +48,7 @@ extern "C" {
#include <sys/modctl.h>
#include <sys/errno.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/vlan.h>
#include <sys/ddi.h>
@@ -88,6 +88,9 @@ extern "C" {
#define IGB_INTR_MSI 2
#define IGB_INTR_LEGACY 3
+#define IGB_NO_POLL -1
+#define IGB_NO_FREE_SLOT -1
+
#define MAX_NUM_UNICAST_ADDRESSES E1000_RAR_ENTRIES
#define MAX_NUM_MULTICAST_ADDRESSES 256
#define MAX_NUM_EITR 10
@@ -97,10 +100,9 @@ extern "C" {
/*
* Maximum values for user configurable parameters
*/
-#define MAX_TX_QUEUE_NUM 4
-#define MAX_RX_QUEUE_NUM 4
#define MAX_TX_RING_SIZE 4096
#define MAX_RX_RING_SIZE 4096
+#define MAX_RX_GROUP_NUM 4
#define MAX_MTU 9000
#define MAX_RX_LIMIT_PER_INTR 4096
@@ -119,10 +121,9 @@ extern "C" {
/*
* Minimum values for user configurable parameters
*/
-#define MIN_TX_QUEUE_NUM 1
-#define MIN_RX_QUEUE_NUM 1
#define MIN_TX_RING_SIZE 64
#define MIN_RX_RING_SIZE 64
+#define MIN_RX_GROUP_NUM 1
#define MIN_MTU ETHERMIN
#define MIN_RX_LIMIT_PER_INTR 16
@@ -140,10 +141,11 @@ extern "C" {
/*
* Default values for user configurable parameters
*/
-#define DEFAULT_TX_QUEUE_NUM 1
-#define DEFAULT_RX_QUEUE_NUM 1
+#define DEFAULT_TX_QUEUE_NUM 4
+#define DEFAULT_RX_QUEUE_NUM 4
#define DEFAULT_TX_RING_SIZE 512
#define DEFAULT_RX_RING_SIZE 512
+#define DEFAULT_RX_GROUP_NUM 1
#define DEFAULT_MTU ETHERMTU
#define DEFAULT_RX_LIMIT_PER_INTR 256
@@ -187,7 +189,6 @@ extern "C" {
#define ATTACH_PROGRESS_ENABLE_INTR 0x1000 /* DDI interrupts enabled */
#define ATTACH_PROGRESS_FMINIT 0x2000 /* FMA initialized */
-
#define PROP_ADV_AUTONEG_CAP "adv_autoneg_cap"
#define PROP_ADV_1000FDX_CAP "adv_1000fdx_cap"
#define PROP_ADV_1000HDX_CAP "adv_1000hdx_cap"
@@ -197,10 +198,10 @@ extern "C" {
#define PROP_ADV_10HDX_CAP "adv_10hdx_cap"
#define PROP_DEFAULT_MTU "default_mtu"
#define PROP_FLOW_CONTROL "flow_control"
-#define PROP_TX_QUEUE_NUM "tx_queue_number"
#define PROP_TX_RING_SIZE "tx_ring_size"
-#define PROP_RX_QUEUE_NUM "rx_queue_number"
#define PROP_RX_RING_SIZE "rx_ring_size"
+#define PROP_MR_ENABLE "mr_enable"
+#define PROP_RX_GROUP_NUM "rx_group_number"
#define PROP_INTR_FORCE "intr_force"
#define PROP_TX_HCKSUM_ENABLE "tx_hcksum_enable"
@@ -410,7 +411,7 @@ typedef union igb_ether_addr {
} reg;
struct {
uint8_t set;
- uint8_t redundant;
+ uint8_t group_index;
uint8_t addr[ETHERADDRL];
} mac;
} igb_ether_addr_t;
@@ -479,6 +480,7 @@ typedef struct rx_control_block {
*/
typedef struct igb_tx_ring {
uint32_t index; /* Ring index */
+ uint32_t intr_vector; /* Interrupt vector index */
/*
* Mutexes
@@ -538,13 +540,14 @@ typedef struct igb_tx_ring {
uint32_t stat_fail_no_tcb;
uint32_t stat_fail_dma_bind;
uint32_t stat_reschedule;
+ uint32_t stat_pkt_cnt;
#endif
/*
* Pointer to the igb struct
*/
struct igb *igb;
-
+ mac_ring_handle_t ring_handle; /* call back ring handle */
} igb_tx_ring_t;
/*
@@ -592,12 +595,24 @@ typedef struct igb_rx_ring {
uint32_t stat_frame_error;
uint32_t stat_cksum_error;
uint32_t stat_exceed_pkt;
+ uint32_t stat_pkt_cnt;
#endif
struct igb *igb; /* Pointer to igb struct */
-
+ mac_ring_handle_t ring_handle; /* call back ring handle */
+ uint32_t group_index; /* group index */
+ uint64_t ring_gen_num;
} igb_rx_ring_t;
+/*
+ * Software Receive Ring Group
+ */
+typedef struct igb_rx_group {
+ uint32_t index; /* Group index */
+ mac_group_handle_t group_handle; /* call back group handle */
+ struct igb *igb; /* Pointer to igb struct */
+} igb_rx_group_t;
+
typedef struct igb {
int instance;
mac_handle_t mac_hdl;
@@ -616,13 +631,18 @@ typedef struct igb {
uint32_t loopback_mode;
uint32_t max_frame_size;
+ uint32_t mr_enable; /* Enable multiple rings */
+ uint32_t vmdq_mode; /* Mode of VMDq */
+
/*
- * Receive Rings
+ * Receive Rings and Groups
*/
igb_rx_ring_t *rx_rings; /* Array of rx rings */
uint32_t num_rx_rings; /* Number of rx rings in use */
uint32_t rx_ring_size; /* Rx descriptor ring size */
uint32_t rx_buf_size; /* Rx buffer size */
+ igb_rx_group_t *rx_groups; /* Array of rx groups */
+ uint32_t num_rx_groups; /* Number of rx groups in use */
/*
* Transmit Rings
@@ -652,6 +672,7 @@ typedef struct igb {
uint_t intr_pri;
ddi_intr_handle_t *htable;
uint32_t eims_mask;
+ uint32_t ims_mask;
kmutex_t gen_lock; /* General lock for device access */
kmutex_t watchdog_lock;
@@ -772,7 +793,8 @@ void igb_free_dma(igb_t *);
int igb_start(igb_t *);
void igb_stop(igb_t *);
int igb_setup_link(igb_t *, boolean_t);
-int igb_unicst_set(igb_t *, const uint8_t *, mac_addr_slot_t);
+int igb_unicst_find(igb_t *, const uint8_t *);
+int igb_unicst_set(igb_t *, const uint8_t *, int);
int igb_multicst_add(igb_t *, const uint8_t *);
int igb_multicst_remove(igb_t *, const uint8_t *);
enum ioc_reply igb_loopback_ioctl(igb_t *, struct iocblk *, mblk_t *);
@@ -795,22 +817,23 @@ int igb_m_unicst(void *, const uint8_t *);
int igb_m_stat(void *, uint_t, uint64_t *);
void igb_m_resources(void *);
void igb_m_ioctl(void *, queue_t *, mblk_t *);
-int igb_m_unicst_add(void *, mac_multi_addr_t *);
-int igb_m_unicst_remove(void *, mac_addr_slot_t);
-int igb_m_unicst_modify(void *, mac_multi_addr_t *);
-int igb_m_unicst_get(void *, mac_multi_addr_t *);
boolean_t igb_m_getcapab(void *, mac_capab_t, void *);
+void igb_fill_ring(void *, mac_ring_type_t, const int, const int,
+ mac_ring_info_t *, mac_ring_handle_t);
+void igb_fill_group(void *arg, mac_ring_type_t, const int,
+ mac_group_info_t *, mac_group_handle_t);
+int igb_rx_ring_intr_enable(mac_intr_handle_t);
+int igb_rx_ring_intr_disable(mac_intr_handle_t);
/*
* Function prototypes in igb_rx.c
*/
-mblk_t *igb_rx(igb_rx_ring_t *);
+mblk_t *igb_rx(igb_rx_ring_t *, int);
void igb_rx_recycle(caddr_t arg);
/*
* Function prototypes in igb_tx.c
*/
-mblk_t *igb_m_tx(void *, mblk_t *);
void igb_free_tcb(tx_control_block_t *);
void igb_put_free_list(igb_tx_ring_t *, link_list_t *);
uint32_t igb_tx_recycle_legacy(igb_tx_ring_t *);
@@ -835,6 +858,8 @@ enum ioc_reply igb_nd_ioctl(igb_t *, queue_t *, mblk_t *, struct iocblk *);
*/
int igb_init_stats(igb_t *);
+mblk_t *igb_rx_ring_poll(void *, int);
+mblk_t *igb_tx_ring_send(void *, mblk_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/io/igb/igb_tx.c b/usr/src/uts/common/io/igb/igb_tx.c
index b3a0090ebe..7b43bbad97 100644
--- a/usr/src/uts/common/io/igb/igb_tx.c
+++ b/usr/src/uts/common/io/igb/igb_tx.c
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,11 +20,13 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
#include "igb_sw.h"
@@ -42,7 +42,7 @@ static tx_control_block_t *igb_get_free_list(igb_tx_ring_t *);
static void igb_get_hcksum_context(mblk_t *, hcksum_context_t *);
static boolean_t igb_check_hcksum_context(igb_tx_ring_t *, hcksum_context_t *);
static void igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *,
- hcksum_context_t *);
+ hcksum_context_t *, uint32_t);
#ifndef IGB_DEBUG
#pragma inline(igb_save_desc)
@@ -51,58 +51,14 @@ static void igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *,
#pragma inline(igb_fill_hcksum_context)
#endif
-/*
- * igb_m_tx
- *
- * The GLDv3 interface to call driver's tx routine to transmit
- * the mblks.
- */
mblk_t *
-igb_m_tx(void *arg, mblk_t *mp)
+igb_tx_ring_send(void *arg, mblk_t *mp)
{
- igb_t *igb = (igb_t *)arg;
- mblk_t *next;
- igb_tx_ring_t *tx_ring;
+ igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg;
- /*
- * If the adapter is suspended, or it is not started, or the link
- * is not up, the mblks are simply dropped.
- */
- if (((igb->igb_state & IGB_SUSPENDED) != 0) ||
- ((igb->igb_state & IGB_STARTED) == 0) ||
- (igb->link_state != LINK_STATE_UP)) {
- /* Free the mblk chain */
- while (mp != NULL) {
- next = mp->b_next;
- mp->b_next = NULL;
-
- freemsg(mp);
- mp = next;
- }
+ ASSERT(tx_ring != NULL);
- return (NULL);
- }
-
- /*
- * Decide which tx ring is used to transmit the packets.
- * This needs to be updated later to fit the new interface
- * of the multiple rings support.
- */
- tx_ring = &igb->tx_rings[0];
-
- while (mp != NULL) {
- next = mp->b_next;
- mp->b_next = NULL;
-
- if (!igb_tx(tx_ring, mp)) {
- mp->b_next = next;
- break;
- }
-
- mp = next;
- }
-
- return (mp);
+ return ((igb_tx(tx_ring, mp)) ? NULL : mp);
}
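
With the per-ring interface the send contract is simple: igb_tx_ring_send() returns NULL when the packet was queued and hands the mblk back when descriptors run out, so the caller holds it until mac_tx_ring_update() signals free space. A hedged sketch of a caller honoring that contract (illustrative; the real caller is the mac layer through the registered ring handle):

static mblk_t *
send_chain_sketch(igb_tx_ring_t *tx_ring, mblk_t *mp)
{
	mblk_t *next;

	while (mp != NULL) {
		next = mp->b_next;
		mp->b_next = NULL;
		if (igb_tx_ring_send(tx_ring, mp) != NULL) {
			mp->b_next = next;	/* ring full: stop here */
			break;
		}
		mp = next;
	}
	return (mp);	/* unsent remainder, if any */
}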
/*
@@ -671,7 +627,7 @@ igb_check_hcksum_context(igb_tx_ring_t *tx_ring, hcksum_context_t *hcksum)
*/
static void
igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *ctx_tbd,
- hcksum_context_t *hcksum)
+ hcksum_context_t *hcksum, uint32_t ring_index)
{
/*
* Fill the context descriptor with the checksum
@@ -708,7 +664,7 @@ igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *ctx_tbd,
}
ctx_tbd->seqnum_seed = 0;
- ctx_tbd->mss_l4len_idx = 0;
+ ctx_tbd->mss_l4len_idx = ring_index << 4;
}
/*
@@ -764,7 +720,8 @@ igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list,
* hardware checksum offload informations.
*/
igb_fill_hcksum_context(
- (struct e1000_adv_tx_context_desc *)tbd, hcksum);
+ (struct e1000_adv_tx_context_desc *)tbd, hcksum,
+ tx_ring->index);
index = NEXT_INDEX(index, 1, tx_ring->ring_size);
desc_num++;
@@ -843,6 +800,7 @@ igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list,
if (hcksum_flags & HCK_PARTIALCKSUM)
first_tbd->read.olinfo_status |=
E1000_TXD_POPTS_TXSM << 8;
+ first_tbd->read.olinfo_status |= tx_ring->index << 4;
}
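
Both writes place the ring index in the 4-bit IDX field (bits 7:4), so a completed descriptor identifies its per-ring checksum context; a one-function sketch of the shared packing:

#include <stdint.h>

/*
 * Sketch: the same ring index goes into bits 7:4 of the context
 * descriptor's mss_l4len_idx and of the data descriptor's
 * olinfo_status, as the two writes above do.
 */
static void
fill_idx_sketch(uint32_t ring_index,
    uint32_t *mss_l4len_idx, uint32_t *olinfo_status)
{
	*mss_l4len_idx |= (ring_index << 4);
	*olinfo_status |= (ring_index << 4);
}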
/*
@@ -853,6 +811,8 @@ igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list,
tbd->read.cmd_type_len |=
E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS;
+ IGB_DEBUG_STAT(tx_ring->stat_pkt_cnt);
+
/*
* Sync the DMA buffer of the tx descriptor ring
*/
diff --git a/usr/src/uts/common/io/ipw/ipw2100.c b/usr/src/uts/common/io/ipw/ipw2100.c
index 3ad59d1051..d1171b5122 100644
--- a/usr/src/uts/common/io/ipw/ipw2100.c
+++ b/usr/src/uts/common/io/ipw/ipw2100.c
@@ -48,7 +48,7 @@
#include <sys/modctl.h>
#include <sys/devops.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <net/if.h>
#include <sys/mac_wifi.h>
#include <sys/varargs.h>
@@ -177,7 +177,6 @@ mac_callbacks_t ipw2100_m_callbacks = {
ipw2100_m_multicst,
ipw2100_m_unicst,
ipw2100_m_tx,
- NULL,
ipw2100_m_ioctl
};
diff --git a/usr/src/uts/common/io/iwh/iwh.c b/usr/src/uts/common/io/iwh/iwh.c
index cce2a98845..1865a7ee5c 100644
--- a/usr/src/uts/common/io/iwh/iwh.c
+++ b/usr/src/uts/common/io/iwh/iwh.c
@@ -48,7 +48,7 @@
#include <sys/modctl.h>
#include <sys/devops.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_wifi.h>
#include <sys/net80211.h>
#include <sys/net80211_proto.h>
@@ -414,7 +414,6 @@ mac_callbacks_t iwh_m_callbacks = {
iwh_m_multicst,
iwh_m_unicst,
iwh_m_tx,
- NULL,
iwh_m_ioctl
};
diff --git a/usr/src/uts/common/io/iwi/ipw2200.c b/usr/src/uts/common/io/iwi/ipw2200.c
index 465c3ea2a7..80633d498f 100644
--- a/usr/src/uts/common/io/iwi/ipw2200.c
+++ b/usr/src/uts/common/io/iwi/ipw2200.c
@@ -48,7 +48,7 @@
#include <sys/modctl.h>
#include <sys/devops.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_wifi.h>
#include <sys/varargs.h>
#include <sys/pci.h>
@@ -207,7 +207,6 @@ mac_callbacks_t ipw2200_m_callbacks = {
ipw2200_m_multicst,
ipw2200_m_unicst,
ipw2200_m_tx,
- NULL,
ipw2200_m_ioctl
};
diff --git a/usr/src/uts/common/io/iwk/iwk2.c b/usr/src/uts/common/io/iwk/iwk2.c
index a0f17f2927..4ec4b774c8 100644
--- a/usr/src/uts/common/io/iwk/iwk2.c
+++ b/usr/src/uts/common/io/iwk/iwk2.c
@@ -48,7 +48,7 @@
#include <sys/modctl.h>
#include <sys/devops.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_wifi.h>
#include <sys/net80211.h>
#include <sys/net80211_proto.h>
@@ -423,7 +423,6 @@ mac_callbacks_t iwk_m_callbacks = {
iwk_m_multicst,
iwk_m_unicst,
iwk_m_tx,
- NULL,
iwk_m_ioctl,
NULL,
NULL,
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe.conf b/usr/src/uts/common/io/ixgbe/ixgbe.conf
index 0e46fe5a0d..215d3d9516 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe.conf
+++ b/usr/src/uts/common/io/ixgbe/ixgbe.conf
@@ -1,19 +1,17 @@
#
# CDDL HEADER START
#
-# Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
-# You can obtain a copy of the license at:
-# http://www.opensolaris.org/os/licensing.
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
-# When using or redistributing this file, you may do so under the
-# License only. No other modification of this header is permitted.
-#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
@@ -21,11 +19,10 @@
# CDDL HEADER END
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms of the CDDL.
+# Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
#
-#
-# ident "%Z%%M% %I% %E% SMI"
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
#
#
# Driver.conf file for Intel 10GbE PCIE NIC Driver (ixgbe)
@@ -45,35 +42,31 @@
# 1 - Receive only
# 2 - Transmit only
# 3 - Receive and transmit
-# default value: 3
+# default value: 0
#
# flow_control = 3;
#
# -------------------- Transmit/Receive Queues --------------------
-# tx/rx queue.
-# tx_queue_number
-# The number of the transmit queues
-# Allowed values: 1 - 32
-# Default value: 1
#
# tx_ring_size
# The number of the transmit descriptors per transmit queue
# Allowed values: 64 - 4096
-# Default value: 512
-#
-# rx_queue_number
-# The number of the receive queues
-# Allowed values: 1 - 64
-# Default value: 1
+# Default value: 1024
#
# rx_ring_size
# The number of the receive descriptors per receive queue
# Allowed values: 64 - 4096
-# Default value: 512
+# Default value: 1024
#
-# Note: The final values of tx_queue_number and rx_queue_number are decided
-# by the number of interrupt vectors obtained by the driver. They could be
-# less than the specified values because of limited interrupt vector number.
+# mr_enable
+# Enable multiple tx queues and rx queues
+# Allowed values: 0 - 1
+# Default value: 1
+#
+# rx_group_number
+# The number of the receive groups
+# Allowed values: 1 - 16
+# Default value: 1
#
# -------- How to set parameters for a particular interface ---------
# The example below shows how to locate the device path and set a parameter
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_common.c b/usr/src/uts/common/io/ixgbe/ixgbe_common.c
index f472cbd290..76e0232ff7 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_common.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_common.c
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,14 +20,16 @@
*/
/*
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
+ */
+
+/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Use is subject to license terms.
*/
/* IntelVersion: 1.159 v2008-03-04 */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "ixgbe_common.h"
#include "ixgbe_api.h"
@@ -1546,27 +1546,11 @@ ixgbe_set_mta(struct ixgbe_hw *hw, u8 *mc_addr)
void
ixgbe_add_mc_addr(struct ixgbe_hw *hw, u8 *mc_addr)
{
- u32 rar_entries = hw->mac.num_rar_entries;
- u32 rar;
-
DEBUGOUT6(" MC Addr =%.2X %.2X %.2X %.2X %.2X %.2X\n",
mc_addr[0], mc_addr[1], mc_addr[2],
mc_addr[3], mc_addr[4], mc_addr[5]);
- /*
- * Place this multicast address in the RAR if there is room,
- * else put it in the MTA
- */
- if (hw->addr_ctrl.rar_used_count < rar_entries) {
- /* use RAR from the end up for multicast */
- rar = rar_entries - hw->addr_ctrl.mc_addr_in_rar_count - 1;
- hw->mac.ops.set_rar(hw, rar, mc_addr, 0, IXGBE_RAH_AV);
- DEBUGOUT1("Added a multicast address to RAR[%d]\n", rar);
- hw->addr_ctrl.rar_used_count++;
- hw->addr_ctrl.mc_addr_in_rar_count++;
- } else {
- ixgbe_set_mta(hw, mc_addr);
- }
+ ixgbe_set_mta(hw, mc_addr);
DEBUGOUT("ixgbe_add_mc_addr Complete\n");
}
@@ -1588,7 +1572,6 @@ ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list,
u32 mc_addr_count, ixgbe_mc_addr_itr next)
{
u32 i;
- u32 rar_entries = hw->mac.num_rar_entries;
u32 vmdq;
/*
@@ -1596,18 +1579,8 @@ ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list,
* use.
*/
hw->addr_ctrl.num_mc_addrs = mc_addr_count;
- hw->addr_ctrl.rar_used_count -= hw->addr_ctrl.mc_addr_in_rar_count;
- hw->addr_ctrl.mc_addr_in_rar_count = 0;
hw->addr_ctrl.mta_in_use = 0;
- /* Zero out the other receive addresses. */
- DEBUGOUT2("Clearing RAR[%d-%d]\n", hw->addr_ctrl.rar_used_count,
- rar_entries - 1);
- for (i = hw->addr_ctrl.rar_used_count; i < rar_entries; i++) {
- IXGBE_WRITE_REG(hw, IXGBE_RAL(i), 0);
- IXGBE_WRITE_REG(hw, IXGBE_RAH(i), 0);
- }
-
/* Clear the MTA */
DEBUGOUT(" Clearing MTA\n");
for (i = 0; i < hw->mac.mcft_size; i++)
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_gld.c b/usr/src/uts/common/io/ixgbe/ixgbe_gld.c
index 78a96bd4ef..b4b3a966fe 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_gld.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_gld.c
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,11 +20,13 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
#include "ixgbe_sw.h"
@@ -103,16 +103,24 @@ ixgbe_m_stat(void *arg, uint_t stat, uint64_t *val)
break;
case MAC_STAT_RBYTES:
- for (i = 0; i < 16; i++)
- ixgbe_ks->tor.value.ui64 +=
+ ixgbe_ks->tor.value.ui64 = 0;
+ for (i = 0; i < 16; i++) {
+ ixgbe_ks->qbrc[i].value.ui64 +=
IXGBE_READ_REG(hw, IXGBE_QBRC(i));
+ ixgbe_ks->tor.value.ui64 +=
+ ixgbe_ks->qbrc[i].value.ui64;
+ }
*val = ixgbe_ks->tor.value.ui64;
break;
case MAC_STAT_OBYTES:
- for (i = 0; i < 16; i++)
- ixgbe_ks->tot.value.ui64 +=
+ ixgbe_ks->tot.value.ui64 = 0;
+ for (i = 0; i < 16; i++) {
+ ixgbe_ks->qbtc[i].value.ui64 +=
IXGBE_READ_REG(hw, IXGBE_QBTC(i));
+ ixgbe_ks->tot.value.ui64 +=
+ ixgbe_ks->qbtc[i].value.ui64;
+ }
*val = ixgbe_ks->tot.value.ui64;
break;
@@ -412,37 +420,6 @@ ixgbe_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
}
/*
- * Set a new device unicast address.
- */
-int
-ixgbe_m_unicst(void *arg, const uint8_t *mac_addr)
-{
- ixgbe_t *ixgbe = (ixgbe_t *)arg;
- int result;
-
- mutex_enter(&ixgbe->gen_lock);
-
- if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
- mutex_exit(&ixgbe->gen_lock);
- return (ECANCELED);
- }
-
- /*
- * Store the new MAC address.
- */
- bcopy(mac_addr, ixgbe->hw.mac.addr, ETHERADDRL);
-
- /*
- * Set MAC address in address slot 0, which is the default address.
- */
- result = ixgbe_unicst_set(ixgbe, mac_addr, 0);
-
- mutex_exit(&ixgbe->gen_lock);
-
- return (result);
-}
-
-/*
* Pass on M_IOCTL messages passed to the DLD, and support
* private IOCTLs for debugging and ndd.
*/
@@ -511,191 +488,6 @@ ixgbe_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
}
}
-
-/*
- * Find an unused address slot, set the address to it, reserve
- * this slot and enable the device to start filtering on the
- * new address.
- */
-int
-ixgbe_m_unicst_add(void *arg, mac_multi_addr_t *maddr)
-{
- ixgbe_t *ixgbe = (ixgbe_t *)arg;
- mac_addr_slot_t slot;
- int err;
-
- mutex_enter(&ixgbe->gen_lock);
-
- if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
- mutex_exit(&ixgbe->gen_lock);
- return (ECANCELED);
- }
-
- if (mac_unicst_verify(ixgbe->mac_hdl,
- maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) {
- mutex_exit(&ixgbe->gen_lock);
- return (EINVAL);
- }
-
- if (ixgbe->unicst_avail == 0) {
- /* no slots available */
- mutex_exit(&ixgbe->gen_lock);
- return (ENOSPC);
- }
-
- /*
- * Primary/default address is in slot 0. The next addresses
- * are the multiple MAC addresses. So multiple MAC address 0
- * is in slot 1, 1 in slot 2, and so on. So the first multiple
- * MAC address resides in slot 1.
- */
- for (slot = 1; slot < ixgbe->unicst_total; slot++) {
- if (ixgbe->unicst_addr[slot].mac.set == 0)
- break;
- }
-
- ASSERT((slot > 0) && (slot < ixgbe->unicst_total));
-
- maddr->mma_slot = slot;
-
- if ((err = ixgbe_unicst_set(ixgbe, maddr->mma_addr, slot)) == 0) {
- ixgbe->unicst_addr[slot].mac.set = 1;
- ixgbe->unicst_avail--;
- }
-
- mutex_exit(&ixgbe->gen_lock);
-
- return (err);
-}
-
-/*
- * Removes a MAC address that was added before.
- */
-int
-ixgbe_m_unicst_remove(void *arg, mac_addr_slot_t slot)
-{
- ixgbe_t *ixgbe = (ixgbe_t *)arg;
- int err;
-
- mutex_enter(&ixgbe->gen_lock);
-
- if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
- mutex_exit(&ixgbe->gen_lock);
- return (ECANCELED);
- }
-
- if ((slot <= 0) || (slot >= ixgbe->unicst_total)) {
- mutex_exit(&ixgbe->gen_lock);
- return (EINVAL);
- }
-
- if (ixgbe->unicst_addr[slot].mac.set == 1) {
- /*
- * Copy the default address to the passed slot
- */
- if ((err = ixgbe_unicst_set(ixgbe,
- ixgbe->unicst_addr[0].mac.addr, slot)) == 0) {
- ixgbe->unicst_addr[slot].mac.set = 0;
- ixgbe->unicst_avail++;
- }
-
- mutex_exit(&ixgbe->gen_lock);
-
- return (err);
- }
-
- mutex_exit(&ixgbe->gen_lock);
-
- return (EINVAL);
-}
-
-/*
- * Modifies the value of an address that has been added before.
- * The new address length and the slot number that was returned
- * in the call to add should be passed in. mma_flags should be
- * set to 0.
- * Returns 0 on success.
- */
-int
-ixgbe_m_unicst_modify(void *arg, mac_multi_addr_t *maddr)
-{
- ixgbe_t *ixgbe = (ixgbe_t *)arg;
- mac_addr_slot_t slot;
- int err;
-
- mutex_enter(&ixgbe->gen_lock);
-
- if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
- mutex_exit(&ixgbe->gen_lock);
- return (ECANCELED);
- }
-
- if (mac_unicst_verify(ixgbe->mac_hdl,
- maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) {
- mutex_exit(&ixgbe->gen_lock);
- return (EINVAL);
- }
-
- slot = maddr->mma_slot;
-
- if ((slot <= 0) || (slot >= ixgbe->unicst_total)) {
- mutex_exit(&ixgbe->gen_lock);
- return (EINVAL);
- }
-
- if (ixgbe->unicst_addr[slot].mac.set == 1) {
- err = ixgbe_unicst_set(ixgbe, maddr->mma_addr, slot);
- mutex_exit(&ixgbe->gen_lock);
- return (err);
- }
-
- mutex_exit(&ixgbe->gen_lock);
-
- return (EINVAL);
-}
-
-/*
- * Get the MAC address and all other information related to
- * the address slot passed in mac_multi_addr_t.
- * mma_flags should be set to 0 in the call.
- * On return, mma_flags can take the following values:
- * 1) MMAC_SLOT_UNUSED
- * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR
- * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR
- * 4) MMAC_SLOT_USED
- */
-int
-ixgbe_m_unicst_get(void *arg, mac_multi_addr_t *maddr)
-{
- ixgbe_t *ixgbe = (ixgbe_t *)arg;
- mac_addr_slot_t slot;
-
- mutex_enter(&ixgbe->gen_lock);
-
- if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
- mutex_exit(&ixgbe->gen_lock);
- return (ECANCELED);
- }
-
- slot = maddr->mma_slot;
-
- if ((slot <= 0) || (slot >= ixgbe->unicst_total)) {
- mutex_exit(&ixgbe->gen_lock);
- return (EINVAL);
- }
- if (ixgbe->unicst_addr[slot].mac.set == 1) {
- bcopy(ixgbe->unicst_addr[slot].mac.addr,
- maddr->mma_addr, ETHERADDRL);
- maddr->mma_flags = MMAC_SLOT_USED;
- } else {
- maddr->mma_flags = MMAC_SLOT_UNUSED;
- }
-
- mutex_exit(&ixgbe->gen_lock);
-
- return (0);
-}
-
/*
* Obtain the MAC's capabilities and associated data from
* the driver.
@@ -732,25 +524,29 @@ ixgbe_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
return (B_FALSE);
}
}
- case MAC_CAPAB_MULTIADDRESS: {
- multiaddress_capab_t *mmacp = cap_data;
-
- /*
- * The number of MAC addresses made available by
- * this capability is one less than the total as
- * the primary address in slot 0 is counted in
- * the total.
- */
- mmacp->maddr_naddr = ixgbe->unicst_total - 1;
- mmacp->maddr_naddrfree = ixgbe->unicst_avail;
- /* No multiple factory addresses, set mma_flag to 0 */
- mmacp->maddr_flag = 0;
- mmacp->maddr_handle = ixgbe;
- mmacp->maddr_add = ixgbe_m_unicst_add;
- mmacp->maddr_remove = ixgbe_m_unicst_remove;
- mmacp->maddr_modify = ixgbe_m_unicst_modify;
- mmacp->maddr_get = ixgbe_m_unicst_get;
- mmacp->maddr_reserve = NULL;
+ case MAC_CAPAB_RINGS: {
+ mac_capab_rings_t *cap_rings = cap_data;
+
+ switch (cap_rings->mr_type) {
+ case MAC_RING_TYPE_RX:
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_rnum = ixgbe->num_rx_rings;
+ cap_rings->mr_gnum = ixgbe->num_rx_groups;
+ cap_rings->mr_rget = ixgbe_fill_ring;
+ cap_rings->mr_gget = ixgbe_fill_group;
+ cap_rings->mr_gaddring = NULL;
+ cap_rings->mr_gremring = NULL;
+ break;
+ case MAC_RING_TYPE_TX:
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_rnum = ixgbe->num_tx_rings;
+ cap_rings->mr_gnum = 0;
+ cap_rings->mr_rget = ixgbe_fill_ring;
+ cap_rings->mr_gget = NULL;
+ break;
+ default:
+ break;
+ }
break;
}
default:
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c
index f7bbcb1ff6..f8acd5fdd5 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c
@@ -61,6 +61,8 @@ static void ixgbe_setup_rx_ring(ixgbe_rx_ring_t *);
static void ixgbe_setup_tx_ring(ixgbe_tx_ring_t *);
static void ixgbe_setup_rss(ixgbe_t *);
static void ixgbe_init_unicst(ixgbe_t *);
+static int ixgbe_unicst_set(ixgbe_t *, const uint8_t *, int);
+static int ixgbe_unicst_find(ixgbe_t *, const uint8_t *);
static void ixgbe_setup_multicst(ixgbe_t *);
static void ixgbe_get_hw_state(ixgbe_t *);
static void ixgbe_get_conf(ixgbe_t *);
@@ -83,7 +85,9 @@ static int ixgbe_alloc_intr_handles(ixgbe_t *, int);
static int ixgbe_add_intr_handlers(ixgbe_t *);
static void ixgbe_map_rxring_to_vector(ixgbe_t *, int, int);
static void ixgbe_map_txring_to_vector(ixgbe_t *, int, int);
-static void ixgbe_set_ivar(ixgbe_t *, uint16_t, uint8_t);
+static void ixgbe_setup_ivar(ixgbe_t *, uint16_t, uint8_t);
+static void ixgbe_enable_ivar(ixgbe_t *, uint16_t);
+static void ixgbe_disable_ivar(ixgbe_t *, uint16_t);
static int ixgbe_map_rings_to_vectors(ixgbe_t *);
static void ixgbe_setup_adapter_vector(ixgbe_t *);
static void ixgbe_rem_intr_handlers(ixgbe_t *);
@@ -92,12 +96,14 @@ static int ixgbe_enable_intrs(ixgbe_t *);
static int ixgbe_disable_intrs(ixgbe_t *);
static uint_t ixgbe_intr_legacy(void *, void *);
static uint_t ixgbe_intr_msi(void *, void *);
-static uint_t ixgbe_intr_rx(void *, void *);
-static uint_t ixgbe_intr_tx_other(void *, void *);
+static uint_t ixgbe_intr_rx_tx(void *, void *);
+static uint_t ixgbe_intr_other(void *, void *);
static void ixgbe_intr_rx_work(ixgbe_rx_ring_t *);
static void ixgbe_intr_tx_work(ixgbe_tx_ring_t *);
static void ixgbe_intr_other_work(ixgbe_t *);
static void ixgbe_get_driver_control(struct ixgbe_hw *);
+static int ixgbe_addmac(void *, const uint8_t *);
+static int ixgbe_remmac(void *, const uint8_t *);
static void ixgbe_release_driver_control(struct ixgbe_hw *);
static int ixgbe_attach(dev_info_t *, ddi_attach_cmd_t);
@@ -188,8 +194,7 @@ static mac_callbacks_t ixgbe_m_callbacks = {
ixgbe_m_stop,
ixgbe_m_promisc,
ixgbe_m_multicst,
- ixgbe_m_unicst,
- ixgbe_m_tx,
+ NULL,
NULL,
ixgbe_m_ioctl,
ixgbe_m_getcapab
@@ -675,6 +680,7 @@ ixgbe_register_mac(ixgbe_t *ixgbe)
mac->m_min_sdu = 0;
mac->m_max_sdu = ixgbe->default_mtu;
mac->m_margin = VLAN_TAGSZ;
+ mac->m_v12n = MAC_VIRT_LEVEL1;
status = mac_register(mac, &ixgbe->mac_hdl);
@@ -765,6 +771,7 @@ static int
ixgbe_init_driver_settings(ixgbe_t *ixgbe)
{
struct ixgbe_hw *hw = &ixgbe->hw;
+ dev_info_t *devinfo = ixgbe->dip;
ixgbe_rx_ring_t *rx_ring;
ixgbe_tx_ring_t *tx_ring;
uint32_t rx_size;
@@ -779,6 +786,11 @@ ixgbe_init_driver_settings(ixgbe_t *ixgbe)
}
/*
+ * Get the system page size
+ */
+ ixgbe->sys_page_size = ddi_ptob(devinfo, (ulong_t)1);
+
+ /*
* Set rx buffer size
*
* The IP header alignment room is counted in the calculation.
@@ -1569,6 +1581,23 @@ ixgbe_alloc_rings(ixgbe_t *ixgbe)
return (IXGBE_FAILURE);
}
+ /*
+ * Allocate memory space for rx ring groups
+ */
+ ixgbe->rx_groups = kmem_zalloc(
+ sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups,
+ KM_NOSLEEP);
+
+ if (ixgbe->rx_groups == NULL) {
+ kmem_free(ixgbe->rx_rings,
+ sizeof (ixgbe_rx_ring_t) * ixgbe->num_rx_rings);
+ kmem_free(ixgbe->tx_rings,
+ sizeof (ixgbe_tx_ring_t) * ixgbe->num_tx_rings);
+ ixgbe->rx_rings = NULL;
+ ixgbe->tx_rings = NULL;
+ return (IXGBE_FAILURE);
+ }
+
return (IXGBE_SUCCESS);
}
@@ -1589,6 +1618,12 @@ ixgbe_free_rings(ixgbe_t *ixgbe)
sizeof (ixgbe_tx_ring_t) * ixgbe->num_tx_rings);
ixgbe->tx_rings = NULL;
}
+
+ if (ixgbe->rx_groups != NULL) {
+ kmem_free(ixgbe->rx_groups,
+ sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups);
+ ixgbe->rx_groups = NULL;
+ }
}
/*
@@ -1693,7 +1728,9 @@ ixgbe_setup_rx(ixgbe_t *ixgbe)
{
ixgbe_rx_ring_t *rx_ring;
struct ixgbe_hw *hw = &ixgbe->hw;
+ ixgbe_rx_group_t *rx_group;
uint32_t reg_val;
+ uint32_t ring_mapping;
int i;
/*
@@ -1723,6 +1760,29 @@ ixgbe_setup_rx(ixgbe_t *ixgbe)
}
/*
+ * Setup rx groups.
+ */
+ for (i = 0; i < ixgbe->num_rx_groups; i++) {
+ rx_group = &ixgbe->rx_groups[i];
+ rx_group->index = i;
+ rx_group->ixgbe = ixgbe;
+ }
+
+ /*
+ * Setup the per-ring statistics mapping.
+ */
+ ring_mapping = 0;
+ for (i = 0; i < ixgbe->num_rx_rings; i++) {
+ ring_mapping |= (i & 0xF) << (8 * (i & 0x3));
+ if ((i & 0x3) == 0x3) {
+ IXGBE_WRITE_REG(hw, IXGBE_RQSMR(i >> 2), ring_mapping);
+ ring_mapping = 0;
+ }
+ }
+ if ((i & 0x3) != 0x3)
+ IXGBE_WRITE_REG(hw, IXGBE_RQSMR(i >> 2), ring_mapping);
+
+ /*
* The Max Frame Size in MHADD will be internally increased by four
* bytes if the packet has a VLAN field, so includes MTU, ethernet
* header and frame check sequence.
@@ -1858,6 +1918,7 @@ ixgbe_setup_tx(ixgbe_t *ixgbe)
struct ixgbe_hw *hw = &ixgbe->hw;
ixgbe_tx_ring_t *tx_ring;
uint32_t reg_val;
+ uint32_t ring_mapping;
int i;
for (i = 0; i < ixgbe->num_tx_rings; i++) {
@@ -1866,6 +1927,20 @@ ixgbe_setup_tx(ixgbe_t *ixgbe)
}
/*
+ * Setup the per-ring statistics mapping.
+ */
+ ring_mapping = 0;
+ for (i = 0; i < ixgbe->num_tx_rings; i++) {
+ ring_mapping |= (i & 0xF) << (8 * (i & 0x3));
+ if ((i & 0x3) == 0x3) {
+ IXGBE_WRITE_REG(hw, IXGBE_TQSMR(i >> 2), ring_mapping);
+ ring_mapping = 0;
+ }
+ }
+ if ((i & 0x3) != 0x3)
+ IXGBE_WRITE_REG(hw, IXGBE_TQSMR(i >> 2), ring_mapping);
+
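Both statistics-mapping loops pack one queue index per byte lane, four per 32-bit RQSMR/TQSMR register, flushing every fourth queue and once more for any partial register. A standalone sketch of the packing for six rings:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t ring_mapping = 0;
	int num_rings = 6;	/* example: one full register + 2 lanes */
	int i;

	for (i = 0; i < num_rings; i++) {
		/* Queue i's 4-bit index goes in byte lane (i & 3). */
		ring_mapping |= (i & 0xF) << (8 * (i & 0x3));
		if ((i & 0x3) == 0x3) {
			printf("QSMR(%d) = 0x%08x\n", i >> 2, ring_mapping);
			ring_mapping = 0;
		}
	}
	if ((i & 0x3) != 0x3)	/* flush the partial register */
		printf("QSMR(%d) = 0x%08x\n", i >> 2, ring_mapping);
	return (0);
}
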
+ /*
* Enable CRC appending and TX padding (for short tx frames)
*/
reg_val = IXGBE_READ_REG(hw, IXGBE_HLREG0);
@@ -1936,13 +2011,13 @@ static void
ixgbe_init_unicst(ixgbe_t *ixgbe)
{
struct ixgbe_hw *hw = &ixgbe->hw;
+ uint8_t *mac_addr;
int slot;
/*
* Here we should consider two situations:
*
- * 1. Chipset is initialized the first time
- * Initialize the multiple unicast addresses, and
- * save the default mac address.
+	 * 1. Chipset is initialized for the first time:
+	 *    clear all the multiple unicast addresses.
*
* 2. Chipset is reset
* Recover the multiple unicast addresses from the
@@ -1953,36 +2028,36 @@ ixgbe_init_unicst(ixgbe_t *ixgbe)
* Initialize the multiple unicast addresses
*/
ixgbe->unicst_total = MAX_NUM_UNICAST_ADDRESSES;
-
- ixgbe->unicst_avail = ixgbe->unicst_total - 1;
-
- bcopy(hw->mac.addr, ixgbe->unicst_addr[0].mac.addr,
- ETHERADDRL);
- ixgbe->unicst_addr[0].mac.set = 1;
-
- for (slot = 1; slot < ixgbe->unicst_total; slot++)
+ ixgbe->unicst_avail = ixgbe->unicst_total;
+ for (slot = 0; slot < ixgbe->unicst_total; slot++) {
+ mac_addr = ixgbe->unicst_addr[slot].mac.addr;
+ bzero(mac_addr, ETHERADDRL);
+ (void) ixgbe_set_rar(hw, slot, mac_addr, NULL, NULL);
ixgbe->unicst_addr[slot].mac.set = 0;
-
+ }
ixgbe->unicst_init = B_TRUE;
} else {
- /*
- * Recover the default mac address
- */
- bcopy(ixgbe->unicst_addr[0].mac.addr, hw->mac.addr,
- ETHERADDRL);
-
/* Re-configure the RAR registers */
- for (slot = 1; slot < ixgbe->unicst_total; slot++)
- (void) ixgbe_set_rar(hw, slot,
- ixgbe->unicst_addr[slot].mac.addr, NULL, NULL);
+ for (slot = 0; slot < ixgbe->unicst_total; slot++) {
+ mac_addr = ixgbe->unicst_addr[slot].mac.addr;
+ if (ixgbe->unicst_addr[slot].mac.set == 1) {
+ (void) ixgbe_set_rar(hw, slot, mac_addr,
+ NULL, IXGBE_RAH_AV);
+ } else {
+ bzero(mac_addr, ETHERADDRL);
+ (void) ixgbe_set_rar(hw, slot, mac_addr,
+ NULL, NULL);
+ }
+ }
}
}
+
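+/*
+ * RAR note: IXGBE_RAH_AV is the Address Valid bit of the Receive
+ * Address High register. Slots with set == 1 are re-enabled with the
+ * bit on reset, while unused slots are zeroed with the bit clear so
+ * they cannot match any incoming frame.
+ */
+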
/*
* ixgbe_unicst_set - Set the unicast address to the specified slot.
*/
int
ixgbe_unicst_set(ixgbe_t *ixgbe, const uint8_t *mac_addr,
- mac_addr_slot_t slot)
+ int slot)
{
struct ixgbe_hw *hw = &ixgbe->hw;
@@ -1996,7 +2071,7 @@ ixgbe_unicst_set(ixgbe_t *ixgbe, const uint8_t *mac_addr,
/*
* Set the unicast address to the RAR register
*/
- (void) ixgbe_set_rar(hw, slot, (uint8_t *)mac_addr, NULL, NULL);
+ (void) ixgbe_set_rar(hw, slot, (uint8_t *)mac_addr, NULL, IXGBE_RAH_AV);
if (ixgbe_check_acc_handle(ixgbe->osdep.reg_handle) != DDI_FM_OK) {
ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
@@ -2007,6 +2082,25 @@ ixgbe_unicst_set(ixgbe_t *ixgbe, const uint8_t *mac_addr,
}
/*
+ * ixgbe_unicst_find - Find the slot for the specified unicast address
+ */
+int
+ixgbe_unicst_find(ixgbe_t *ixgbe, const uint8_t *mac_addr)
+{
+ int slot;
+
+ ASSERT(mutex_owned(&ixgbe->gen_lock));
+
+ for (slot = 0; slot < ixgbe->unicst_total; slot++) {
+ if (bcmp(ixgbe->unicst_addr[slot].mac.addr,
+ mac_addr, ETHERADDRL) == 0)
+ return (slot);
+ }
+
+ return (-1);
+}
+
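+/*
+ * Usage sketch (illustrative): ixgbe_remmac() later in this file calls
+ * ixgbe_unicst_find() under gen_lock to translate a MAC address back
+ * into its RAR slot before clearing that slot via ixgbe_unicst_set().
+ */
+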
+/*
* ixgbe_multicst_add - Add a multicst address.
*/
int
@@ -2153,7 +2247,7 @@ ixgbe_get_conf(ixgbe_t *ixgbe)
* Ethernet flow control configuration
*/
flow_control = ixgbe_get_prop(ixgbe, PROP_FLOW_CONTROL,
- ixgbe_fc_none, 3, ixgbe_fc_full);
+ ixgbe_fc_none, 3, ixgbe_fc_none);
if (flow_control == 3)
flow_control = ixgbe_fc_default;
@@ -2173,10 +2267,25 @@ ixgbe_get_conf(ixgbe_t *ixgbe)
MIN_RX_RING_SIZE, MAX_RX_RING_SIZE, DEFAULT_RX_RING_SIZE);
/*
+ * Multiple groups configuration
+ */
+ ixgbe->num_rx_groups = ixgbe_get_prop(ixgbe, PROP_RX_GROUP_NUM,
+ MIN_RX_GROUP_NUM, MAX_RX_GROUP_NUM, DEFAULT_RX_GROUP_NUM);
+
+ ixgbe->mr_enable = ixgbe_get_prop(ixgbe, PROP_MR_ENABLE,
+ 0, 1, DEFAULT_MR_ENABLE);
+
+ if (ixgbe->mr_enable == B_FALSE) {
+ ixgbe->num_tx_rings = 1;
+ ixgbe->num_rx_rings = 1;
+ ixgbe->num_rx_groups = 1;
+ }
+
+ /*
* Tunable used to force an interrupt type. The only use is
* for testing of the lesser interrupt types.
* 0 = don't force interrupt type
- * 1 = force interrupt type MSIX
+ * 1 = force interrupt type MSI-X
* 2 = force interrupt type MSI
* 3 = force interrupt type Legacy
*/
@@ -2413,6 +2522,7 @@ ixgbe_stall_check(ixgbe_t *ixgbe)
result = B_FALSE;
for (i = 0; i < ixgbe->num_tx_rings; i++) {
tx_ring = &ixgbe->tx_rings[i];
+ tx_ring->tx_recycle(tx_ring);
if (tx_ring->recycle_fail > 0)
tx_ring->stall_watchdog++;
@@ -2872,11 +2982,12 @@ ixgbe_intr_rx_work(ixgbe_rx_ring_t *rx_ring)
mutex_enter(&rx_ring->rx_lock);
- mp = ixgbe_rx(rx_ring);
+ mp = ixgbe_ring_rx(rx_ring, IXGBE_POLL_NULL);
mutex_exit(&rx_ring->rx_lock);
if (mp != NULL)
- mac_rx(rx_ring->ixgbe->mac_hdl, NULL, mp);
+ mac_rx_ring(rx_ring->ixgbe->mac_hdl, rx_ring->ring_handle, mp,
+ rx_ring->ring_gen_num);
}
#pragma inline(ixgbe_intr_tx_work)
@@ -2897,7 +3008,8 @@ ixgbe_intr_tx_work(ixgbe_tx_ring_t *tx_ring)
if (tx_ring->reschedule &&
(tx_ring->tbd_free >= tx_ring->resched_thresh)) {
tx_ring->reschedule = B_FALSE;
- mac_tx_update(tx_ring->ixgbe->mac_hdl);
+ mac_tx_ring_update(tx_ring->ixgbe->mac_hdl,
+ tx_ring->ring_handle);
IXGBE_DEBUG_STAT(tx_ring->stat_reschedule);
}
}
@@ -2943,6 +3055,7 @@ ixgbe_intr_legacy(void *arg1, void *arg2)
ixgbe_t *ixgbe = (ixgbe_t *)arg1;
struct ixgbe_hw *hw = &ixgbe->hw;
ixgbe_tx_ring_t *tx_ring;
+ ixgbe_rx_ring_t *rx_ring;
uint32_t eicr;
mblk_t *mp;
boolean_t tx_reschedule;
@@ -2974,16 +3087,20 @@ ixgbe_intr_legacy(void *arg1, void *arg2)
ASSERT(ixgbe->num_tx_rings == 1);
/*
- * For legacy interrupt, we can't differentiate
- * between tx and rx, so always clean both
+ * For legacy interrupt, rx rings[0] will use RTxQ[0].
*/
- if (eicr & IXGBE_EICR_RTX_QUEUE) {
-
+ if (eicr & 0x1) {
/*
* Clean the rx descriptors
*/
- mp = ixgbe_rx(&ixgbe->rx_rings[0]);
+ rx_ring = &ixgbe->rx_rings[0];
+ mp = ixgbe_ring_rx(rx_ring, IXGBE_POLL_NULL);
+ }
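+
+ /*
+ * Bit note: EICR bits 0 and 1 correspond to the RTxQ[0]/RTxQ[1]
+ * IVAR entries that ixgbe_setup_adapter_vector() programs for the
+ * non-MSI-X case, so rx and tx causes can be told apart even on
+ * the single shared vector.
+ */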
+ /*
+ * For legacy interrupt, tx rings[0] will use RTxQ[1].
+ */
+ if (eicr & 0x2) {
/*
* Recycle the tx descriptors
*/
@@ -3020,11 +3137,12 @@ ixgbe_intr_legacy(void *arg1, void *arg2)
* Do the following work outside of the gen_lock
*/
if (mp != NULL)
- mac_rx(ixgbe->mac_hdl, NULL, mp);
+ mac_rx_ring(rx_ring->ixgbe->mac_hdl, rx_ring->ring_handle, mp,
+ rx_ring->ring_gen_num);
if (tx_reschedule) {
tx_ring->reschedule = B_FALSE;
- mac_tx_update(ixgbe->mac_hdl);
+ mac_tx_ring_update(ixgbe->mac_hdl, tx_ring->ring_handle);
IXGBE_DEBUG_STAT(tx_ring->stat_reschedule);
}
@@ -3055,11 +3173,16 @@ ixgbe_intr_msi(void *arg1, void *arg2)
ASSERT(ixgbe->num_tx_rings == 1);
/*
- * For MSI interrupt, we can't differentiate
- * between tx and rx, so always clean both.
+ * For MSI interrupt, rx rings[0] will use RTxQ[0].
*/
- if (eicr & IXGBE_EICR_RTX_QUEUE) {
+ if (eicr & 0x1) {
ixgbe_intr_rx_work(&ixgbe->rx_rings[0]);
+ }
+
+ /*
+ * For MSI interrupt, tx rings[0] will use RTxQ[1].
+ */
+ if (eicr & 0x2) {
ixgbe_intr_tx_work(&ixgbe->tx_rings[0]);
}
@@ -3071,38 +3194,47 @@ ixgbe_intr_msi(void *arg1, void *arg2)
}
/*
- * ixgbe_intr_rx - Interrupt handler for rx.
+ * ixgbe_intr_rx_tx - Interrupt handler for rx and tx.
*/
static uint_t
-ixgbe_intr_rx(void *arg1, void *arg2)
+ixgbe_intr_rx_tx(void *arg1, void *arg2)
{
_NOTE(ARGUNUSED(arg2));
- ixgbe_ring_vector_t *vect = (ixgbe_ring_vector_t *)arg1;
- ixgbe_t *ixgbe = vect->ixgbe;
- int r_idx;
+ ixgbe_ring_vector_t *vect = (ixgbe_ring_vector_t *)arg1;
+ ixgbe_t *ixgbe = vect->ixgbe;
+ int r_idx = 0;
/*
- * clean each rx ring that has its bit set in the map
+ * Clean each rx ring that has its bit set in the map
*/
r_idx = bt_getlowbit(vect->rx_map, 0, (ixgbe->num_rx_rings - 1));
-
while (r_idx >= 0) {
ixgbe_intr_rx_work(&ixgbe->rx_rings[r_idx]);
r_idx = bt_getlowbit(vect->rx_map, (r_idx + 1),
(ixgbe->num_rx_rings - 1));
}
+ /*
+ * Clean each tx ring that has its bit set in the map
+ */
+ r_idx = bt_getlowbit(vect->tx_map, 0, (ixgbe->num_tx_rings - 1));
+ while (r_idx >= 0) {
+ ixgbe_intr_tx_work(&ixgbe->tx_rings[r_idx]);
+ r_idx = bt_getlowbit(vect->tx_map, (r_idx + 1),
+ (ixgbe->num_tx_rings - 1));
+ }
+
return (DDI_INTR_CLAIMED);
}
/*
- * ixgbe_intr_tx_other - Interrupt handler for both tx and other.
+ * ixgbe_intr_other - Interrupt handler for other.
*
- * Always look for Tx cleanup work. Only look for other work if the right
- * bits are set in the Interrupt Cause Register.
+ * Only look for other work if the right bits are set in the
+ * Interrupt Cause Register.
*/
static uint_t
-ixgbe_intr_tx_other(void *arg1, void *arg2)
+ixgbe_intr_other(void *arg1, void *arg2)
{
_NOTE(ARGUNUSED(arg2));
ixgbe_t *ixgbe = (ixgbe_t *)arg1;
@@ -3112,14 +3244,8 @@ ixgbe_intr_tx_other(void *arg1, void *arg2)
eicr = IXGBE_READ_REG(hw, IXGBE_EICR);
/*
- * Always look for Tx cleanup work. We don't have separate
- * transmit vectors, so we have only one tx ring enabled.
- */
- ASSERT(ixgbe->num_tx_rings == 1);
- ixgbe_intr_tx_work(&ixgbe->tx_rings[0]);
-
- /*
- * Check for "other" causes.
+ * Check the cause bits; only a link-state change (LSC)
+ * is processed here.
*/
if (eicr & IXGBE_EICR_LSC) {
ixgbe_intr_other_work(ixgbe);
@@ -3174,12 +3300,13 @@ ixgbe_alloc_intrs(ixgbe_t *ixgbe)
}
/*
- * MSI-X not used, force rings to 1
+ * MSI-X not used, force rings and groups to 1
*/
ixgbe->num_rx_rings = 1;
+ ixgbe->num_rx_groups = 1;
ixgbe->num_tx_rings = 1;
ixgbe_log(ixgbe,
- "MSI-X not used, force rx and tx queue number to 1");
+ "MSI-X not used, force rings and groups number to 1");
/*
* Install MSI interrupts
@@ -3217,30 +3344,19 @@ ixgbe_alloc_intrs(ixgbe_t *ixgbe)
*
* For legacy and MSI, only 1 handle is needed. For MSI-X,
* if fewer than 2 handles are available, return failure.
- * Upon success, this sets the number of Rx rings to a number that
- * matches the handles available for Rx interrupts.
+ * Upon success, this maps the vectors to rx and tx rings for
+ * interrupts.
*/
static int
ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type)
{
dev_info_t *devinfo;
int request, count, avail, actual;
- int rx_rings, minimum;
+ int minimum;
int rc;
devinfo = ixgbe->dip;
- /*
- * Currently only 1 tx ring is supported. More tx rings
- * will be supported with future enhancement.
- */
- if (ixgbe->num_tx_rings > 1) {
- ixgbe->num_tx_rings = 1;
- ixgbe_log(ixgbe,
- "Use only 1 MSI-X vector for tx, "
- "force tx queue number to 1");
- }
-
switch (intr_type) {
case DDI_INTR_TYPE_FIXED:
request = 1; /* Request 1 legacy interrupt handle */
@@ -3257,11 +3373,11 @@ ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type)
case DDI_INTR_TYPE_MSIX:
/*
* Best number of vectors for the adapter is
- * # rx rings + # tx rings + 1 for other
- * But currently we only support number of vectors of
- * # rx rings + 1 for tx & other
+ * # rx rings + # tx rings + 1 for other.
*/
- request = ixgbe->num_rx_rings + 1;
+ request = ixgbe->num_rx_rings + ixgbe->num_tx_rings + 1;
+ if (request > (IXGBE_MAX_RING_VECTOR + 1))
+ request = IXGBE_MAX_RING_VECTOR + 1;
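+ /*
+ * Worked example (assumed defaults): 8 rx + 8 tx + 1 other = 17
+ * requested vectors, exactly the 82598 cap of
+ * IXGBE_MAX_RING_VECTOR (16) + 1, so no clamping occurs here.
+ */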
minimum = 2;
IXGBE_DEBUGLOG_0(ixgbe, "interrupt type: MSI-X");
break;
@@ -3327,9 +3443,8 @@ ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type)
ixgbe->intr_cnt = actual;
/*
- * Now we know the actual number of vectors. Here we assume that
- * tx and other will share 1 vector and all remaining (must be at
- * least 1 remaining) will be used for rx.
+ * Now we know the actual number of vectors. Here we map the vector
+ * to other, rx rings and tx ring.
*/
if (actual < minimum) {
ixgbe_log(ixgbe, "Insufficient interrupt handles available: %d",
@@ -3338,19 +3453,6 @@ ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type)
}
/*
- * For MSI-X, actual might force us to reduce number of rx rings
- */
- if (intr_type == DDI_INTR_TYPE_MSIX) {
- rx_rings = actual - 1;
- if (rx_rings < ixgbe->num_rx_rings) {
- ixgbe_log(ixgbe,
- "MSI-X vectors force Rx queue number to %d",
- rx_rings);
- ixgbe->num_rx_rings = rx_rings;
- }
- }
-
- /*
* Get priority for first vector, assume remaining are all the same
*/
rc = ddi_intr_get_pri(ixgbe->htable[0], &ixgbe->intr_pri);
@@ -3386,56 +3488,47 @@ alloc_handle_fail:
static int
ixgbe_add_intr_handlers(ixgbe_t *ixgbe)
{
- ixgbe_rx_ring_t *rx_ring;
- int vector;
+ int vector = 0;
int rc;
- int i;
-
- vector = 0;
switch (ixgbe->intr_type) {
case DDI_INTR_TYPE_MSIX:
/*
- * Add interrupt handler for tx + other
- */
- rc = ddi_intr_add_handler(ixgbe->htable[vector],
- (ddi_intr_handler_t *)ixgbe_intr_tx_other,
- (void *)ixgbe, NULL);
- if (rc != DDI_SUCCESS) {
- ixgbe_log(ixgbe,
- "Add tx/other interrupt handler failed: %d", rc);
- return (IXGBE_FAILURE);
- }
- vector++;
-
- /*
- * Add interrupt handler for each rx ring
+ * Add interrupt handlers for the rx and tx rings, using
+ * vectors 0 through (ixgbe->intr_cnt - 2).
*/
- for (i = 0; i < ixgbe->num_rx_rings; i++) {
- rx_ring = &ixgbe->rx_rings[i];
-
+ for (vector = 0; vector < (ixgbe->intr_cnt -1); vector++) {
/*
* install pointer to vect_map[vector]
*/
rc = ddi_intr_add_handler(ixgbe->htable[vector],
- (ddi_intr_handler_t *)ixgbe_intr_rx,
+ (ddi_intr_handler_t *)ixgbe_intr_rx_tx,
(void *)&ixgbe->vect_map[vector], NULL);
if (rc != DDI_SUCCESS) {
ixgbe_log(ixgbe,
"Add rx interrupt handler failed. "
- "return: %d, rx ring: %d", rc, i);
+ "return: %d, vector: %d", rc, vector);
for (vector--; vector >= 0; vector--) {
(void) ddi_intr_remove_handler(
ixgbe->htable[vector]);
}
return (IXGBE_FAILURE);
}
+ }
- rx_ring->intr_vector = vector;
-
- vector++;
+ /*
+ * Add interrupt handler for other: vector[ixgbe->intr_cnt -1]
+ */
+ rc = ddi_intr_add_handler(ixgbe->htable[vector],
+ (ddi_intr_handler_t *)ixgbe_intr_other,
+ (void *)ixgbe, NULL);
+ if (rc != DDI_SUCCESS) {
+ ixgbe_log(ixgbe,
+ "Add other interrupt handler failed: %d", rc);
+ return (IXGBE_FAILURE);
}
+
break;
case DDI_INTR_TYPE_MSI:
@@ -3452,10 +3545,6 @@ ixgbe_add_intr_handlers(ixgbe_t *ixgbe)
return (IXGBE_FAILURE);
}
- rx_ring = &ixgbe->rx_rings[0];
- rx_ring->intr_vector = vector;
-
- vector++;
break;
case DDI_INTR_TYPE_FIXED:
@@ -3472,17 +3561,13 @@ ixgbe_add_intr_handlers(ixgbe_t *ixgbe)
return (IXGBE_FAILURE);
}
- rx_ring = &ixgbe->rx_rings[0];
- rx_ring->intr_vector = vector;
-
- vector++;
break;
default:
return (IXGBE_FAILURE);
}
- ASSERT(vector == ixgbe->intr_cnt);
+ ASSERT(vector == (ixgbe->intr_cnt -1));
return (IXGBE_SUCCESS);
}
@@ -3509,6 +3594,7 @@ ixgbe_map_rxring_to_vector(ixgbe_t *ixgbe, int r_idx, int v_idx)
/*
* Remember bit position
*/
+ ixgbe->rx_rings[r_idx].intr_vector = v_idx;
ixgbe->rx_rings[r_idx].vect_bit = 1 << v_idx;
}
@@ -3534,48 +3620,81 @@ ixgbe_map_txring_to_vector(ixgbe_t *ixgbe, int t_idx, int v_idx)
/*
* Remember bit position
*/
+ ixgbe->tx_rings[t_idx].intr_vector = v_idx;
ixgbe->tx_rings[t_idx].vect_bit = 1 << v_idx;
}
/*
- * ixgbe_set_ivar - Set the given entry in the given interrupt vector
+ * ixgbe_setup_ivar - Set the given entry in the given interrupt vector
* allocation register (IVAR).
*/
static void
-ixgbe_set_ivar(ixgbe_t *ixgbe, uint16_t int_alloc_entry, uint8_t msix_vector)
+ixgbe_setup_ivar(ixgbe_t *ixgbe, uint16_t intr_alloc_entry, uint8_t msix_vector)
{
struct ixgbe_hw *hw = &ixgbe->hw;
u32 ivar, index;
msix_vector |= IXGBE_IVAR_ALLOC_VAL;
- index = (int_alloc_entry >> 2) & 0x1F;
+ index = (intr_alloc_entry >> 2) & 0x1F;
+ ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index));
+ ivar &= ~(0xFF << (8 * (intr_alloc_entry & 0x3)));
+ ivar |= (msix_vector << (8 * (intr_alloc_entry & 0x3)));
+ IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar);
+}
+
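+/*
+ * IVAR layout note: each 32-bit IVAR register holds four 8-bit entries,
+ * so (intr_alloc_entry >> 2) selects the register and the low two bits
+ * select the byte lane that these helpers read-modify-write.
+ */
+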
+/*
+ * ixgbe_enable_ivar - Enable the given entry by setting the VAL bit of
+ * the given interrupt vector allocation register (IVAR).
+ */
+static void
+ixgbe_enable_ivar(ixgbe_t *ixgbe, uint16_t intr_alloc_entry)
+{
+ struct ixgbe_hw *hw = &ixgbe->hw;
+ u32 ivar, index;
+
+ index = (intr_alloc_entry >> 2) & 0x1F;
+ ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index));
+ ivar |= (IXGBE_IVAR_ALLOC_VAL << (8 * (intr_alloc_entry & 0x3)));
+ IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar);
+}
+
+/*
+ * ixgbe_disable_ivar - Disable the given entry by clearing the VAL bit of
+ * the given interrupt vector allocation register (IVAR).
+ */
+static void
+ixgbe_disable_ivar(ixgbe_t *ixgbe, uint16_t intr_alloc_entry)
+{
+ struct ixgbe_hw *hw = &ixgbe->hw;
+ u32 ivar, index;
+
+ index = (intr_alloc_entry >> 2) & 0x1F;
ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index));
- ivar &= ~(0xFF << (8 * (int_alloc_entry & 0x3)));
- ivar |= (msix_vector << (8 * (int_alloc_entry & 0x3)));
+ ivar &= ~(IXGBE_IVAR_ALLOC_VAL << (8 * (intr_alloc_entry & 0x3)));
IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar);
}
/*
* ixgbe_map_rings_to_vectors - Map descriptor rings to interrupt vectors.
*
- * For msi-x, this currently implements only the scheme which is
- * 1 vector for tx + other, 1 vector for each rx ring.
+ * For MSI-X, rx and tx rings are mapped to vectors 0 through
+ * (intr_cnt - 2); the last vector is reserved for the other interrupt.
*/
static int
ixgbe_map_rings_to_vectors(ixgbe_t *ixgbe)
{
int i, vector = 0;
- int vect_remain = ixgbe->intr_cnt;
/* initialize vector map */
bzero(&ixgbe->vect_map, sizeof (ixgbe->vect_map));
/*
- * non-MSI-X case is very simple: all interrupts on vector 0
+ * non-MSI-X case is very simple: rx rings[0] on RTxQ[0],
+ * tx rings[0] on RTxQ[1].
*/
if (ixgbe->intr_type != DDI_INTR_TYPE_MSIX) {
ixgbe_map_rxring_to_vector(ixgbe, 0, 0);
- ixgbe_map_txring_to_vector(ixgbe, 0, 0);
+ ixgbe_map_txring_to_vector(ixgbe, 0, 1);
return (IXGBE_SUCCESS);
}
@@ -3584,16 +3703,19 @@ ixgbe_map_rings_to_vectors(ixgbe_t *ixgbe)
*/
/*
- * Map vector 0 to tx
+ * Map vectors to rx rings
*/
- ixgbe_map_txring_to_vector(ixgbe, 0, vector++);
- vect_remain--;
+ for (i = 0; i < ixgbe->num_rx_rings; i++) {
+ ixgbe_map_rxring_to_vector(ixgbe, i, vector);
+ vector = (vector +1) % (ixgbe->intr_cnt -1);
+ }
/*
- * Map remaining vectors to rx rings
+ * Map vectors to tx rings
*/
- for (i = 0; i < vect_remain; i++) {
- ixgbe_map_rxring_to_vector(ixgbe, i, vector++);
+ for (i = 0; i < ixgbe->num_tx_rings; i++) {
+ ixgbe_map_txring_to_vector(ixgbe, i, vector);
+ vector = (vector +1) % (ixgbe->intr_cnt -1);
}
return (IXGBE_SUCCESS);
@@ -3602,16 +3724,16 @@ ixgbe_map_rings_to_vectors(ixgbe_t *ixgbe)
/*
* ixgbe_setup_adapter_vector - Setup the adapter interrupt vector(s).
*
- * This relies on queue/vector mapping already set up in the
+ * This relies on ring/vector mapping already set up in the
* vect_map[] structures
*/
static void
ixgbe_setup_adapter_vector(ixgbe_t *ixgbe)
{
struct ixgbe_hw *hw = &ixgbe->hw;
- ixgbe_ring_vector_t *vect; /* vector bitmap */
- int r_idx; /* ring index */
- int v_idx; /* vector index */
+ ixgbe_ring_vector_t *vect; /* vector bitmap */
+ int r_idx; /* ring index */
+ int v_idx; /* vector index */
/*
* Clear any previous entries
@@ -3620,9 +3742,20 @@ ixgbe_setup_adapter_vector(ixgbe_t *ixgbe)
IXGBE_WRITE_REG(hw, IXGBE_IVAR(v_idx), 0);
/*
- * "Other" is always on vector 0
+ * For non MSI-X interrupt, rx rings[0] will use RTxQ[0], and
+ * tx rings[0] will use RTxQ[1].
+ */
+ if (ixgbe->intr_type != DDI_INTR_TYPE_MSIX) {
+ ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(0), 0);
+ ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_TX_QUEUE(0), 1);
+ return;
+ }
+
+ /*
+ * For MSI-X interrupt, "Other" is always on last vector.
*/
- ixgbe_set_ivar(ixgbe, IXGBE_IVAR_OTHER_CAUSES_INDEX, 0);
+ ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_OTHER_CAUSES_INDEX,
+ (ixgbe->intr_cnt - 1));
/*
* For each interrupt vector, populate the IVAR table
@@ -3637,7 +3770,7 @@ ixgbe_setup_adapter_vector(ixgbe_t *ixgbe)
(ixgbe->num_rx_rings - 1));
while (r_idx >= 0) {
- ixgbe_set_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx),
+ ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx),
v_idx);
r_idx = bt_getlowbit(vect->rx_map, (r_idx + 1),
(ixgbe->num_rx_rings - 1));
@@ -3650,7 +3783,7 @@ ixgbe_setup_adapter_vector(ixgbe_t *ixgbe)
(ixgbe->num_tx_rings - 1));
while (r_idx >= 0) {
- ixgbe_set_ivar(ixgbe, IXGBE_IVAR_TX_QUEUE(r_idx),
+ ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_TX_QUEUE(r_idx),
v_idx);
r_idx = bt_getlowbit(vect->tx_map, (r_idx + 1),
(ixgbe->num_tx_rings - 1));
@@ -3996,3 +4129,231 @@ ixgbe_fm_ereport(ixgbe_t *ixgbe, char *detail)
FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, NULL);
}
}
+
+static int
+ixgbe_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+{
+ ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)rh;
+
+ mutex_enter(&rx_ring->rx_lock);
+ rx_ring->ring_gen_num = mr_gen_num;
+ mutex_exit(&rx_ring->rx_lock);
+ return (0);
+}
+
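+/*
+ * Note: the generation number saved above is echoed back to the MAC
+ * layer on each mac_rx_ring() call; the framework uses it to discard
+ * packets delivered by an interrupt that raced with a ring restart.
+ */
+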
+/*
+ * Callback funtion for MAC layer to register all rings.
+ */
+/* ARGSUSED */
+void
+ixgbe_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
+ const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+ ixgbe_t *ixgbe = (ixgbe_t *)arg;
+ mac_intr_t *mintr = &infop->mri_intr;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX: {
+ ASSERT(rg_index == 0);
+ ASSERT(ring_index < ixgbe->num_rx_rings);
+
+ ixgbe_rx_ring_t *rx_ring = &ixgbe->rx_rings[ring_index];
+ rx_ring->ring_handle = rh;
+
+ infop->mri_driver = (mac_ring_driver_t)rx_ring;
+ infop->mri_start = ixgbe_ring_start;
+ infop->mri_stop = NULL;
+ infop->mri_poll = ixgbe_ring_rx_poll;
+
+ mintr->mi_handle = (mac_intr_handle_t)rx_ring;
+ mintr->mi_enable = ixgbe_rx_ring_intr_enable;
+ mintr->mi_disable = ixgbe_rx_ring_intr_disable;
+
+ break;
+ }
+ case MAC_RING_TYPE_TX: {
+ ASSERT(rg_index == -1);
+ ASSERT(ring_index < ixgbe->num_tx_rings);
+
+ ixgbe_tx_ring_t *tx_ring = &ixgbe->tx_rings[ring_index];
+ tx_ring->ring_handle = rh;
+
+ infop->mri_driver = (mac_ring_driver_t)tx_ring;
+ infop->mri_start = NULL;
+ infop->mri_stop = NULL;
+ infop->mri_tx = ixgbe_ring_tx;
+
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+/*
+ * Callback function for the MAC layer to register all groups.
+ */
+void
+ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index,
+ mac_group_info_t *infop, mac_group_handle_t gh)
+{
+ ixgbe_t *ixgbe = (ixgbe_t *)arg;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX: {
+ ixgbe_rx_group_t *rx_group;
+
+ rx_group = &ixgbe->rx_groups[index];
+ rx_group->group_handle = gh;
+
+ infop->mgi_driver = (mac_group_driver_t)rx_group;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = ixgbe_addmac;
+ infop->mgi_remmac = ixgbe_remmac;
+ infop->mgi_count = (ixgbe->num_rx_rings / ixgbe->num_rx_groups);
+
+ break;
+ }
+ case MAC_RING_TYPE_TX:
+ break;
+ default:
+ break;
+ }
+}
+
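+/*
+ * Sizing note: with the assumed defaults in ixgbe_sw.h (8 rx rings,
+ * 1 rx group), mgi_count above evaluates to 8 rings per group.
+ */
+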
+/*
+ * Enable the interrupt on the specified rx ring.
+ */
+int
+ixgbe_rx_ring_intr_enable(mac_intr_handle_t intrh)
+{
+ ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)intrh;
+ ixgbe_t *ixgbe = rx_ring->ixgbe;
+ int r_idx = rx_ring->index;
+ int v_idx = rx_ring->intr_vector;
+
+ mutex_enter(&ixgbe->gen_lock);
+ ASSERT(BT_TEST(ixgbe->vect_map[v_idx].rx_map, r_idx) == 0);
+
+ /*
+ * Enable the interrupt by setting the VAL bit of the given
+ * interrupt vector allocation register (IVAR).
+ */
+ ixgbe_enable_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx));
+
+ BT_SET(ixgbe->vect_map[v_idx].rx_map, r_idx);
+ mutex_exit(&ixgbe->gen_lock);
+
+ return (0);
+}
+
+/*
+ * Disable the interrupt on the specified rx ring.
+ */
+int
+ixgbe_rx_ring_intr_disable(mac_intr_handle_t intrh)
+{
+ ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)intrh;
+ ixgbe_t *ixgbe = rx_ring->ixgbe;
+ int r_idx = rx_ring->index;
+ int v_idx = rx_ring->intr_vector;
+
+ mutex_enter(&ixgbe->gen_lock);
+
+ ASSERT(BT_TEST(ixgbe->vect_map[v_idx].rx_map, r_idx) == 1);
+
+ /*
+ * Disable the interrupt by clearing the VAL bit of the given
+ * interrupt vector allocation register (IVAR).
+ */
+ ixgbe_disable_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx));
+
+ BT_CLEAR(ixgbe->vect_map[v_idx].rx_map, r_idx);
+
+ mutex_exit(&ixgbe->gen_lock);
+
+ return (0);
+}
+
+/*
+ * Add a mac address.
+ */
+static int
+ixgbe_addmac(void *arg, const uint8_t *mac_addr)
+{
+ ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)arg;
+ ixgbe_t *ixgbe = rx_group->ixgbe;
+ int slot;
+ int err;
+
+ mutex_enter(&ixgbe->gen_lock);
+
+ if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
+ mutex_exit(&ixgbe->gen_lock);
+ return (ECANCELED);
+ }
+
+ if (ixgbe->unicst_avail == 0) {
+ /* no slots available */
+ mutex_exit(&ixgbe->gen_lock);
+ return (ENOSPC);
+ }
+
+ for (slot = 0; slot < ixgbe->unicst_total; slot++) {
+ if (ixgbe->unicst_addr[slot].mac.set == 0)
+ break;
+ }
+
+ ASSERT((slot >= 0) && (slot < ixgbe->unicst_total));
+
+ if ((err = ixgbe_unicst_set(ixgbe, mac_addr, slot)) == 0) {
+ ixgbe->unicst_addr[slot].mac.set = 1;
+ ixgbe->unicst_avail--;
+ }
+
+ mutex_exit(&ixgbe->gen_lock);
+
+ return (err);
+}
+
+/*
+ * Remove a mac address.
+ */
+static int
+ixgbe_remmac(void *arg, const uint8_t *mac_addr)
+{
+ ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)arg;
+ ixgbe_t *ixgbe = rx_group->ixgbe;
+ int slot;
+ int err;
+
+ mutex_enter(&ixgbe->gen_lock);
+
+ if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
+ mutex_exit(&ixgbe->gen_lock);
+ return (ECANCELED);
+ }
+
+ slot = ixgbe_unicst_find(ixgbe, mac_addr);
+ if (slot == -1) {
+ mutex_exit(&ixgbe->gen_lock);
+ return (EINVAL);
+ }
+
+ if (ixgbe->unicst_addr[slot].mac.set == 0) {
+ mutex_exit(&ixgbe->gen_lock);
+ return (EINVAL);
+ }
+
+ bzero(ixgbe->unicst_addr[slot].mac.addr, ETHERADDRL);
+ if ((err = ixgbe_unicst_set(ixgbe,
+ ixgbe->unicst_addr[slot].mac.addr, slot)) == 0) {
+ ixgbe->unicst_addr[slot].mac.set = 0;
+ ixgbe->unicst_avail++;
+ }
+
+ mutex_exit(&ixgbe->gen_lock);
+
+ return (err);
+}
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_rx.c b/usr/src/uts/common/io/ixgbe/ixgbe_rx.c
index 3f09a4215d..63e42cede2 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_rx.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_rx.c
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,11 +20,13 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
#include "ixgbe_sw.h"
@@ -176,7 +176,10 @@ ixgbe_rx_bind(ixgbe_rx_ring_t *rx_ring, uint32_t index, uint32_t pkt_len)
* DMA buffer, we have to return and use bcopy to
* process the packet.
*/
- if (current_rcb->mp == NULL) {
+ if (current_rcb->mp != NULL) {
+ current_rcb->mp->b_rptr += IPHDR_ALIGN_ROOM;
+ current_rcb->mp->b_wptr += IPHDR_ALIGN_ROOM;
+ } else {
atomic_inc_32(&rx_ring->rcb_free);
return (NULL);
}
@@ -246,7 +249,7 @@ ixgbe_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error)
}
/*
- * ixgbe_rx - Receive the data of one ring.
+ * ixgbe_ring_rx - Receive the data of one ring.
*
* This function goes through the h/w descriptors in one specified rx ring,
* receives the data if the descriptor status shows the data is ready.
@@ -254,7 +257,7 @@ ixgbe_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error)
* passed up to mac_rx().
*/
mblk_t *
-ixgbe_rx(ixgbe_rx_ring_t *rx_ring)
+ixgbe_ring_rx(ixgbe_rx_ring_t *rx_ring, int poll_bytes)
{
union ixgbe_adv_rx_desc *current_rbd;
rx_control_block_t *current_rcb;
@@ -266,6 +269,7 @@ ixgbe_rx(ixgbe_rx_ring_t *rx_ring)
uint32_t pkt_len;
uint32_t status_error;
uint32_t pkt_num;
+ uint32_t received_bytes;
ixgbe_t *ixgbe = rx_ring->ixgbe;
struct ixgbe_hw *hw = &ixgbe->hw;
@@ -289,6 +293,7 @@ ixgbe_rx(ixgbe_rx_ring_t *rx_ring)
rx_next = rx_ring->rbd_next;
current_rbd = &rx_ring->rbd_ring[rx_next];
+ received_bytes = 0;
pkt_num = 0;
status_error = current_rbd->wb.upper.status_error;
while (status_error & IXGBE_RXD_STAT_DD) {
@@ -309,6 +314,13 @@ ixgbe_rx(ixgbe_rx_ring_t *rx_ring)
(status_error & IXGBE_RXDADV_ERR_IPE));
pkt_len = current_rbd->wb.upper.length;
+
+ if ((poll_bytes != IXGBE_POLL_NULL) &&
+ ((received_bytes + pkt_len) > poll_bytes))
+ break;
+
+ received_bytes += pkt_len;
+
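+ /*
+ * Poll note: IXGBE_POLL_NULL (-1) marks interrupt context,
+ * where no byte budget applies; a polling caller passes a
+ * positive budget and the loop stops before exceeding it.
+ */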
mp = NULL;
/*
* For packets with length more than the copy threshold,
@@ -378,3 +390,21 @@ rx_discard:
return (mblk_head);
}
+
+mblk_t *
+ixgbe_ring_rx_poll(void *arg, int n_bytes)
+{
+ ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)arg;
+ mblk_t *mp = NULL;
+
+ ASSERT(n_bytes >= 0);
+
+ if (n_bytes == 0)
+ return (mp);
+
+ mutex_enter(&rx_ring->rx_lock);
+ mp = ixgbe_ring_rx(rx_ring, n_bytes);
+ mutex_exit(&rx_ring->rx_lock);
+
+ return (mp);
+}
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_stat.c b/usr/src/uts/common/io/ixgbe/ixgbe_stat.c
index 776af1fba4..00eccf23a2 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_stat.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_stat.c
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,11 +20,13 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
#include "ixgbe_sw.h"
@@ -87,17 +87,29 @@ ixgbe_update_stats(kstat_t *ks, int rw)
ixgbe_ks->tx_reschedule.value.ui64 +=
ixgbe->tx_rings[i].stat_reschedule;
}
+#endif
/*
* Hardware calculated statistics.
*/
+ ixgbe_ks->gprc.value.ui64 = 0;
+ ixgbe_ks->gptc.value.ui64 = 0;
+ ixgbe_ks->tor.value.ui64 = 0;
+ ixgbe_ks->tot.value.ui64 = 0;
for (i = 0; i < 16; i++) {
- ixgbe_ks->gprc.value.ul += IXGBE_READ_REG(hw, IXGBE_QPRC(i));
- ixgbe_ks->gptc.value.ul += IXGBE_READ_REG(hw, IXGBE_QPTC(i));
- ixgbe_ks->tor.value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBRC(i));
- ixgbe_ks->tot.value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBTC(i));
+ ixgbe_ks->qprc[i].value.ui64 +=
+ IXGBE_READ_REG(hw, IXGBE_QPRC(i));
+ ixgbe_ks->gprc.value.ui64 += ixgbe_ks->qprc[i].value.ui64;
+ ixgbe_ks->qptc[i].value.ui64 +=
+ IXGBE_READ_REG(hw, IXGBE_QPTC(i));
+ ixgbe_ks->gptc.value.ui64 += ixgbe_ks->qptc[i].value.ui64;
+ ixgbe_ks->qbrc[i].value.ui64 +=
+ IXGBE_READ_REG(hw, IXGBE_QBRC(i));
+ ixgbe_ks->tor.value.ui64 += ixgbe_ks->qbrc[i].value.ui64;
+ ixgbe_ks->qbtc[i].value.ui64 +=
+ IXGBE_READ_REG(hw, IXGBE_QBTC(i));
+ ixgbe_ks->tot.value.ui64 += ixgbe_ks->qbtc[i].value.ui64;
}
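+	/*
+	 * Accumulation note (an assumption consistent with the +=
+	 * pattern above): the per-queue QPRC/QPTC/QBRC/QBTC registers
+	 * clear on read, so each read is added into its kstat and the
+	 * gprc/gptc/tor/tot totals are rebuilt from those running sums.
+	 */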
-
/*
* This is a Workaround:
* Currently h/w GORCH, GOTCH, TORH registers are not
@@ -124,7 +136,6 @@ ixgbe_update_stats(kstat_t *ks, int rw)
ixgbe_ks->ptc511.value.ul += IXGBE_READ_REG(hw, IXGBE_PTC511);
ixgbe_ks->ptc1023.value.ul += IXGBE_READ_REG(hw, IXGBE_PTC1023);
ixgbe_ks->ptc1522.value.ul += IXGBE_READ_REG(hw, IXGBE_PTC1522);
-#endif
ixgbe_ks->mspdc.value.ui64 += IXGBE_READ_REG(hw, IXGBE_MSPDC);
for (i = 0; i < 8; i++)
@@ -200,6 +211,7 @@ ixgbe_init_stats(ixgbe_t *ixgbe)
KSTAT_DATA_UINT64);
kstat_named_init(&ixgbe_ks->tx_reschedule, "tx_reschedule",
KSTAT_DATA_UINT64);
+#endif
kstat_named_init(&ixgbe_ks->gprc, "good_pkts_recvd",
KSTAT_DATA_UINT64);
@@ -233,7 +245,138 @@ ixgbe_init_stats(ixgbe_t *ixgbe)
KSTAT_DATA_UINT64);
kstat_named_init(&ixgbe_ks->ptc1522, "pkts_xmitd_(1024-1522b)",
KSTAT_DATA_UINT64);
-#endif
+
+ kstat_named_init(&ixgbe_ks->qprc[0], "queue_pkts_recvd [ 0]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[1], "queue_pkts_recvd [ 1]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[2], "queue_pkts_recvd [ 2]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[3], "queue_pkts_recvd [ 3]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[4], "queue_pkts_recvd [ 4]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[5], "queue_pkts_recvd [ 5]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[6], "queue_pkts_recvd [ 6]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[7], "queue_pkts_recvd [ 7]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[8], "queue_pkts_recvd [ 8]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[9], "queue_pkts_recvd [ 9]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[10], "queue_pkts_recvd [10]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[11], "queue_pkts_recvd [11]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[12], "queue_pkts_recvd [12]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[13], "queue_pkts_recvd [13]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[14], "queue_pkts_recvd [14]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qprc[15], "queue_pkts_recvd [15]",
+ KSTAT_DATA_UINT64);
+
+ kstat_named_init(&ixgbe_ks->qptc[0], "queue_pkts_xmitd [ 0]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[1], "queue_pkts_xmitd [ 1]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[2], "queue_pkts_xmitd [ 2]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[3], "queue_pkts_xmitd [ 3]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[4], "queue_pkts_xmitd [ 4]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[5], "queue_pkts_xmitd [ 5]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[6], "queue_pkts_xmitd [ 6]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[7], "queue_pkts_xmitd [ 7]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[8], "queue_pkts_xmitd [ 8]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[9], "queue_pkts_xmitd [ 9]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[10], "queue_pkts_xmitd [10]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[11], "queue_pkts_xmitd [11]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[12], "queue_pkts_xmitd [12]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[13], "queue_pkts_xmitd [13]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[14], "queue_pkts_xmitd [14]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qptc[15], "queue_pkts_xmitd [15]",
+ KSTAT_DATA_UINT64);
+
+ kstat_named_init(&ixgbe_ks->qbrc[0], "queue_bytes_recvd [ 0]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[1], "queue_bytes_recvd [ 1]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[2], "queue_bytes_recvd [ 2]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[3], "queue_bytes_recvd [ 3]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[4], "queue_bytes_recvd [ 4]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[5], "queue_bytes_recvd [ 5]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[6], "queue_bytes_recvd [ 6]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[7], "queue_bytes_recvd [ 7]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[8], "queue_bytes_recvd [ 8]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[9], "queue_bytes_recvd [ 9]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[10], "queue_bytes_recvd [10]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[11], "queue_bytes_recvd [11]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[12], "queue_bytes_recvd [12]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[13], "queue_bytes_recvd [13]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[14], "queue_bytes_recvd [14]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbrc[15], "queue_bytes_recvd [15]",
+ KSTAT_DATA_UINT64);
+
+ kstat_named_init(&ixgbe_ks->qbtc[0], "queue_bytes_xmitd [ 0]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[1], "queue_bytes_xmitd [ 1]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[2], "queue_bytes_xmitd [ 2]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[3], "queue_bytes_xmitd [ 3]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[4], "queue_bytes_xmitd [ 4]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[5], "queue_bytes_xmitd [ 5]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[6], "queue_bytes_xmitd [ 6]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[7], "queue_bytes_xmitd [ 7]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[8], "queue_bytes_xmitd [ 8]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[9], "queue_bytes_xmitd [ 9]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[10], "queue_bytes_xmitd [10]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[11], "queue_bytes_xmitd [11]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[12], "queue_bytes_xmitd [12]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[13], "queue_bytes_xmitd [13]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[14], "queue_bytes_xmitd [14]",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ixgbe_ks->qbtc[15], "queue_bytes_xmitd [15]",
+ KSTAT_DATA_UINT64);
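+	/*
+	 * Editorial sketch (not in the source): the 64 per-queue inits
+	 * above could equivalently be generated, e.g.
+	 *	for (i = 0; i < 16; i++) {
+	 *		(void) snprintf(name, sizeof (name),
+	 *		    "queue_pkts_recvd [%2d]", i);
+	 *		kstat_named_init(&ixgbe_ks->qprc[i], name,
+	 *		    KSTAT_DATA_UINT64);
+	 *	}
+	 * with a local char name[KSTAT_STRLEN]; kstat_named_init()
+	 * copies the name, so a stack buffer is safe here.
+	 */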
kstat_named_init(&ixgbe_ks->mspdc, "mac_short_packet_discard",
KSTAT_DATA_UINT64);
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h
index 390233fff5..f648c57a18 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,15 +20,17 @@
*/
/*
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
+ */
+
+/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Use is subject to license terms.
*/
#ifndef _IXGBE_SW_H
#define _IXGBE_SW_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -48,7 +48,7 @@ extern "C" {
#include <sys/modctl.h>
#include <sys/errno.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/vlan.h>
#include <sys/ddi.h>
@@ -89,6 +89,8 @@ extern "C" {
#define IXGBE_INTR_MSI 2
#define IXGBE_INTR_LEGACY 3
+#define IXGBE_POLL_NULL -1
+
#define MAX_COOKIE 18
#define MIN_NUM_TX_DESC 2
@@ -102,6 +104,7 @@ extern "C" {
*/
#define MAX_TX_QUEUE_NUM 32
#define MAX_RX_QUEUE_NUM 64
+#define MAX_RX_GROUP_NUM 1
#define MAX_TX_RING_SIZE 4096
#define MAX_RX_RING_SIZE 4096
@@ -121,6 +124,7 @@ extern "C" {
*/
#define MIN_TX_QUEUE_NUM 1
#define MIN_RX_QUEUE_NUM 1
+#define MIN_RX_GROUP_NUM 1
#define MIN_TX_RING_SIZE 64
#define MIN_RX_RING_SIZE 64
@@ -136,17 +140,18 @@ extern "C" {
/*
* Default values for user configurable parameters
*/
-#define DEFAULT_TX_QUEUE_NUM 1
-#define DEFAULT_RX_QUEUE_NUM 1
-#define DEFAULT_TX_RING_SIZE 512
-#define DEFAULT_RX_RING_SIZE 512
+#define DEFAULT_TX_QUEUE_NUM 8
+#define DEFAULT_RX_QUEUE_NUM 8
+#define DEFAULT_RX_GROUP_NUM 1
+#define DEFAULT_TX_RING_SIZE 1024
+#define DEFAULT_RX_RING_SIZE 1024
#define DEFAULT_MTU ETHERMTU
#define DEFAULT_RX_LIMIT_PER_INTR 256
#define DEFAULT_INTR_THROTTLING 200 /* In unit of 256 nsec */
#define DEFAULT_RX_COPY_THRESHOLD 128
#define DEFAULT_TX_COPY_THRESHOLD 512
-#define DEFAULT_TX_RECYCLE_THRESHOLD MAX_COOKIE
+#define DEFAULT_TX_RECYCLE_THRESHOLD (MAX_COOKIE + 1)
#define DEFAULT_TX_OVERLOAD_THRESHOLD MIN_NUM_TX_DESC
#define DEFAULT_TX_RESCHED_THRESHOLD 128
#define DEFAULT_FCRTH 0x20000
@@ -156,6 +161,14 @@ extern "C" {
#define DEFAULT_TX_HCKSUM_ENABLE B_TRUE
#define DEFAULT_RX_HCKSUM_ENABLE B_TRUE
#define DEFAULT_LSO_ENABLE B_TRUE
+#define DEFAULT_MR_ENABLE B_TRUE
#define DEFAULT_TX_HEAD_WB_ENABLE B_TRUE
#define IXGBE_LSO_MAXLEN 65535
@@ -167,11 +180,12 @@ extern "C" {
#define MAX_LINK_DOWN_TIMEOUT 8 /* 8 seconds */
/*
- * limits on msi-x vectors for 82598
+ * Limits on msi-x vectors for 82598
*/
-#define IXGBE_MAX_INTR_VECTOR 18
-#define IXGBE_MAX_OTHER_VECTOR 2
-#define IXGBE_MAX_RING_VECTOR (IXGBE_MAX_INTR_VECTOR - IXGBE_MAX_OTHER_VECTOR)
+#define IXGBE_MAX_INTR_VECTOR 18
+#define IXGBE_MAX_OTHER_VECTOR 1
+#define IXGBE_MAX_TCP_TIMER_VECTOR 1
+#define IXGBE_MAX_RING_VECTOR 16
/*
* Extra register bit masks for 82598
@@ -209,11 +223,13 @@ extern "C" {
#define PROP_TX_RING_SIZE "tx_ring_size"
#define PROP_RX_QUEUE_NUM "rx_queue_number"
#define PROP_RX_RING_SIZE "rx_ring_size"
+#define PROP_RX_GROUP_NUM "rx_group_number"
#define PROP_INTR_FORCE "intr_force"
#define PROP_TX_HCKSUM_ENABLE "tx_hcksum_enable"
#define PROP_RX_HCKSUM_ENABLE "rx_hcksum_enable"
#define PROP_LSO_ENABLE "lso_enable"
+#define PROP_MR_ENABLE "mr_enable"
#define PROP_TX_HEAD_WB_ENABLE "tx_head_wb_enable"
#define PROP_TX_COPY_THRESHOLD "tx_copy_threshold"
#define PROP_TX_RECYCLE_THRESHOLD "tx_recycle_threshold"
@@ -264,9 +280,6 @@ enum ioc_reply {
IOC_REPLY /* OK, just send reply */
};
-#define MBLK_LEN(mp) ((uintptr_t)(mp)->b_wptr - \
- (uintptr_t)(mp)->b_rptr)
-
#define DMA_SYNC(area, flag) ((void) ddi_dma_sync((area)->dma_handle, \
0, 0, (flag)))
@@ -533,13 +546,15 @@ typedef struct ixgbe_tx_ring {
uint32_t stat_fail_no_tcb;
uint32_t stat_fail_dma_bind;
uint32_t stat_reschedule;
+ uint32_t stat_lso_header_fail;
#endif
+ mac_ring_handle_t ring_handle;
+
/*
* Pointer to the ixgbe struct
*/
struct ixgbe *ixgbe;
-
} ixgbe_tx_ring_t;
/*
@@ -590,11 +605,22 @@ typedef struct ixgbe_rx_ring {
uint32_t stat_exceed_pkt;
#endif
- struct ixgbe *ixgbe; /* Pointer to ixgbe struct */
+ mac_ring_handle_t ring_handle;
+ uint64_t ring_gen_num;
+ struct ixgbe *ixgbe; /* Pointer to ixgbe struct */
} ixgbe_rx_ring_t;
/*
+ * Software Receive Ring Group
+ */
+typedef struct ixgbe_rx_group {
+ uint32_t index; /* Group index */
+ mac_group_handle_t group_handle; /* call back group handle */
+ struct ixgbe *ixgbe; /* Pointer to ixgbe struct */
+} ixgbe_rx_group_t;
+
+/*
* structure to map ring cleanup to msi-x vector
*/
typedef struct ixgbe_ring_vector {
@@ -641,6 +667,12 @@ typedef struct ixgbe {
uint32_t rx_buf_size; /* Rx buffer size */
/*
+ * Receive Groups
+ */
+ ixgbe_rx_group_t *rx_groups; /* Array of rx groups */
+ uint32_t num_rx_groups; /* Number of rx groups in use */
+
+ /*
* Transmit Rings
*/
ixgbe_tx_ring_t *tx_rings; /* Array of tx rings */
@@ -651,6 +683,7 @@ typedef struct ixgbe {
boolean_t tx_head_wb_enable; /* Tx head write-back */
boolean_t tx_hcksum_enable; /* Tx h/w cksum offload */
boolean_t lso_enable; /* Large Segment Offload */
+ boolean_t mr_enable; /* Multiple Tx and Rx Ring */
uint32_t tx_copy_thresh; /* Tx copy threshold */
uint32_t tx_recycle_thresh; /* Tx recycle threshold */
uint32_t tx_overload_thresh; /* Tx overload threshold */
@@ -684,6 +717,8 @@ typedef struct ixgbe {
uint32_t mcast_count;
struct ether_addr mcast_table[MAX_NUM_MULTICAST_ADDRESSES];
+ ulong_t sys_page_size;
+
/*
* Kstat definitions
*/
@@ -694,13 +729,11 @@ typedef struct ixgbe {
*/
caddr_t nd_data;
nd_param_t nd_params[PARAM_COUNT];
-
} ixgbe_t;
typedef struct ixgbe_stat {
-
kstat_named_t link_speed; /* Link Speed */
-#ifdef IXGBE_DEBUG
+
kstat_named_t reset_count; /* Reset Count */
kstat_named_t rx_frame_error; /* Rx Error in Packet */
@@ -729,7 +762,11 @@ typedef struct ixgbe_stat {
kstat_named_t ptc511; /* Packets Xmitted (255-511b) */
kstat_named_t ptc1023; /* Packets Xmitted (512-1023b) */
kstat_named_t ptc1522; /* Packets Xmitted (1024-1522b */
-#endif
+ kstat_named_t qprc[16]; /* Queue Packets Received Count */
+ kstat_named_t qptc[16]; /* Queue Packets Transmitted Count */
+ kstat_named_t qbrc[16]; /* Queue Bytes Received Count */
+ kstat_named_t qbtc[16]; /* Queue Bytes Transmitted Count */
+
kstat_named_t crcerrs; /* CRC Error Count */
kstat_named_t illerrc; /* Illegal Byte Error Count */
kstat_named_t errbc; /* Error Byte Count */
@@ -770,7 +807,6 @@ void ixgbe_set_fma_flags(int, int);
int ixgbe_start(ixgbe_t *);
void ixgbe_stop(ixgbe_t *);
int ixgbe_driver_setup_link(ixgbe_t *, boolean_t);
-int ixgbe_unicst_set(ixgbe_t *, const uint8_t *, mac_addr_slot_t);
int ixgbe_multicst_add(ixgbe_t *, const uint8_t *);
int ixgbe_multicst_remove(ixgbe_t *, const uint8_t *);
enum ioc_reply ixgbe_loopback_ioctl(ixgbe_t *, struct iocblk *, mblk_t *);
@@ -783,6 +819,13 @@ int ixgbe_check_acc_handle(ddi_acc_handle_t handle);
int ixgbe_check_dma_handle(ddi_dma_handle_t handle);
void ixgbe_fm_ereport(ixgbe_t *, char *);
+void ixgbe_fill_ring(void *, mac_ring_type_t, const int, const int,
+ mac_ring_info_t *, mac_ring_handle_t);
+void ixgbe_fill_group(void *arg, mac_ring_type_t, const int,
+ mac_group_info_t *, mac_group_handle_t);
+int ixgbe_rx_ring_intr_enable(mac_intr_handle_t);
+int ixgbe_rx_ring_intr_disable(mac_intr_handle_t);
+
/*
* Function prototypes in ixgbe_gld.c
*/
@@ -790,26 +833,22 @@ int ixgbe_m_start(void *);
void ixgbe_m_stop(void *);
int ixgbe_m_promisc(void *, boolean_t);
int ixgbe_m_multicst(void *, boolean_t, const uint8_t *);
-int ixgbe_m_unicst(void *, const uint8_t *);
int ixgbe_m_stat(void *, uint_t, uint64_t *);
void ixgbe_m_resources(void *);
void ixgbe_m_ioctl(void *, queue_t *, mblk_t *);
-int ixgbe_m_unicst_add(void *, mac_multi_addr_t *);
-int ixgbe_m_unicst_remove(void *, mac_addr_slot_t);
-int ixgbe_m_unicst_modify(void *, mac_multi_addr_t *);
-int ixgbe_m_unicst_get(void *, mac_multi_addr_t *);
boolean_t ixgbe_m_getcapab(void *, mac_capab_t, void *);
/*
* Function prototypes in ixgbe_rx.c
*/
-mblk_t *ixgbe_rx(ixgbe_rx_ring_t *);
+mblk_t *ixgbe_ring_rx(ixgbe_rx_ring_t *, int);
void ixgbe_rx_recycle(caddr_t arg);
+mblk_t *ixgbe_ring_rx_poll(void *, int);
/*
* Function prototypes in ixgbe_tx.c
*/
-mblk_t *ixgbe_m_tx(void *, mblk_t *);
+mblk_t *ixgbe_ring_tx(void *, mblk_t *);
void ixgbe_free_tcb(tx_control_block_t *);
void ixgbe_put_free_list(ixgbe_tx_ring_t *, link_list_t *);
uint32_t ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *);
@@ -834,7 +873,6 @@ enum ioc_reply ixgbe_nd_ioctl(ixgbe_t *, queue_t *, mblk_t *, struct iocblk *);
*/
int ixgbe_init_stats(ixgbe_t *);
-
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_tx.c b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
index f2a5d8fa0c..721353c756 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
@@ -1,19 +1,17 @@
/*
* CDDL HEADER START
*
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,15 +20,16 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
#include "ixgbe_sw.h"
-static boolean_t ixgbe_tx(ixgbe_tx_ring_t *, mblk_t *);
static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
uint32_t, boolean_t);
static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
@@ -44,7 +43,7 @@ static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
ixgbe_tx_context_t *);
static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
- ixgbe_tx_context_t *);
+ ixgbe_tx_context_t *, int);
#ifndef IXGBE_DEBUG
#pragma inline(ixgbe_save_desc)
@@ -54,65 +53,9 @@ static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
#endif
/*
- * ixgbe_m_tx
- *
- * The GLDv3 interface to call driver's tx routine to transmit
- * the mblks.
- */
-mblk_t *
-ixgbe_m_tx(void *arg, mblk_t *mp)
-{
- ixgbe_t *ixgbe = (ixgbe_t *)arg;
- mblk_t *next;
- ixgbe_tx_ring_t *tx_ring;
-
- /*
- * If the adapter is suspended, or it is not started, or the link
- * is not up, the mblks are simply dropped.
- */
- if (((ixgbe->ixgbe_state & IXGBE_SUSPENDED) != 0) ||
- ((ixgbe->ixgbe_state & IXGBE_STARTED) == 0) ||
- (ixgbe->link_state != LINK_STATE_UP)) {
- /* Free the mblk chain */
- while (mp != NULL) {
- next = mp->b_next;
- mp->b_next = NULL;
-
- freemsg(mp);
- mp = next;
- }
-
- return (NULL);
- }
-
- /*
- * Decide which tx ring is used to transmit the packets.
- * This needs to be updated later to fit the new interface
- * of the multiple rings support.
- */
- tx_ring = &ixgbe->tx_rings[0];
-
- while (mp != NULL) {
- next = mp->b_next;
- mp->b_next = NULL;
-
- if (!ixgbe_tx(tx_ring, mp)) {
- mp->b_next = next;
- break;
- }
-
- mp = next;
- }
-
- return (mp);
-}
-
-/*
- * ixgbe_tx - Main transmit processing
+ * ixgbe_ring_tx
*
- * Called from ixgbe_m_tx with an mblk ready to transmit. this
- * routine sets up the transmit descriptors and sends data to
- * the wire.
+ * Transmit one mblk through the specified tx ring.
*
* One mblk can consist of several fragments, each fragment
* will be processed with different methods based on the size.
@@ -136,9 +79,10 @@ ixgbe_m_tx(void *arg, mblk_t *mp)
* be used. After the processing, those tx control blocks will
* be put to the work list.
*/
-static boolean_t
-ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
+mblk_t *
+ixgbe_ring_tx(void *arg, mblk_t *mp)
{
+ ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
ixgbe_t *ixgbe = tx_ring->ixgbe;
tx_type_t current_flag, next_flag;
uint32_t current_len, next_len;
@@ -150,11 +94,19 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
tx_control_block_t *tcb;
ixgbe_tx_context_t tx_context, *ctx;
link_list_t pending_list;
+ uint32_t len, hdr_frag_len, hdr_len;
+ uint32_t copy_thresh;
+ mblk_t *new_mp;
+ mblk_t *pre_mp;
+
+ ASSERT(mp->b_next == NULL);
+
+ copy_thresh = tx_ring->copy_thresh;
/* Get the mblk size */
mbsize = 0;
for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
- mbsize += MBLK_LEN(nmp);
+ mbsize += MBLKL(nmp);
}
if (ixgbe->tx_hcksum_enable) {
@@ -166,25 +118,24 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
ctx = &tx_context;
if (ixgbe_get_context(mp, ctx) < 0) {
freemsg(mp);
- return (B_TRUE);
+ return (NULL);
}
/*
* If the mblk size exceeds the max size ixgbe could
- * process, then discard this mblk, and return B_TRUE
+ * process, then discard this mblk, and return NULL.
*/
if ((ctx->lso_flag && ((mbsize - ctx->mac_hdr_len)
> IXGBE_LSO_MAXLEN)) || (!ctx->lso_flag &&
(mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
freemsg(mp);
IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
- return (B_TRUE);
+ return (NULL);
}
} else {
ctx = NULL;
}
-
/*
* Check and recycle tx descriptors.
* The recycle threshold here should be selected carefully
@@ -194,13 +145,13 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
/*
* After the recycling, if the tbd_free is less than the
- * overload_threshold, assert overload, return B_FALSE;
+ * overload_threshold, assert overload, return mp;
* and we need to re-schedule the tx again.
*/
if (tx_ring->tbd_free < tx_ring->overload_thresh) {
tx_ring->reschedule = B_TRUE;
IXGBE_DEBUG_STAT(tx_ring->stat_overload);
- return (B_FALSE);
+ return (mp);
}
/*
@@ -213,12 +164,77 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
desc_num = 0;
desc_total = 0;
+ /*
+ * The software must guarantee that the LSO packet header
+ * (MAC + IP + TCP) fits within one descriptor. Here we
+ * reallocate and refill the header if it is not physically
+ * contiguous in memory.
+ */
+ if ((ctx != NULL) && ctx->lso_flag) {
+ /* find the last fragment of the header */
+ len = MBLKL(mp);
+ ASSERT(len > 0);
+ nmp = mp;
+ pre_mp = NULL;
+ hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
+ while (len < hdr_len) {
+ pre_mp = nmp;
+ nmp = nmp->b_cont;
+ len += MBLKL(nmp);
+ }
+ /*
+ * If the header and the payload are in different mblks,
+ * we simply force the header to be copied into pre-allocated
+ * page-aligned buffer.
+ */
+ if (len == hdr_len)
+ goto adjust_threshold;
+
+ hdr_frag_len = hdr_len - (len - MBLKL(nmp));
+ /*
+ * There are two cases in which we need to reallocate an
+ * mblk for the last header fragment:
+ * 1. the header is in multiple mblks and the last fragment
+ *    shares the same mblk with the payload
+ * 2. the header is in a single mblk shared with the payload
+ *    and the header is physically non-contiguous
+ */
+ if ((nmp != mp) ||
+ (P2NPHASE((uintptr_t)nmp->b_rptr, ixgbe->sys_page_size)
+ < len)) {
+ IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
+ /*
+ * Reallocate an mblk for the last header fragment;
+ * it will be bcopy'd into the pre-allocated
+ * page-aligned buffer.
+ */
+ new_mp = allocb(hdr_frag_len, NULL);
+ if (new_mp == NULL) {
+ /* drop the packet on allocation failure */
+ freemsg(mp);
+ return (NULL);
+ }
+ bcopy(nmp->b_rptr, new_mp->b_rptr, hdr_frag_len);
+ /* link the new header fragment with the other parts */
+ new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len;
+ new_mp->b_cont = nmp;
+ if (pre_mp)
+ pre_mp->b_cont = new_mp;
+ nmp->b_rptr += hdr_frag_len;
+ if (hdr_frag_len == hdr_len)
+ mp = new_mp;
+ }
+adjust_threshold:
+ /*
+ * Adjust the bcopy threshold to guarantee that the
+ * header is processed with bcopy.
+ */
+ if (copy_thresh < hdr_len)
+ copy_thresh = hdr_len;
+ }
+
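+ /*
+ * Worked example (hypothetical lengths): for a 54-byte header
+ * (14 MAC + 20 IP + 20 TCP) whose first mblk carries only 40
+ * bytes, the search above stops in the second mblk and
+ * hdr_frag_len = 54 - 40 = 14 header bytes are pulled into
+ * new_mp, leaving the whole header bcopy-able in one piece.
+ */
+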
current_mp = mp;
- current_len = MBLK_LEN(current_mp);
+ current_len = MBLKL(current_mp);
/*
* Decide which method to use for the first fragment
*/
- current_flag = (current_len <= tx_ring->copy_thresh) ?
+ current_flag = (current_len <= copy_thresh) ?
USE_COPY : USE_DMA;
/*
* If the mblk includes several contiguous small fragments,
@@ -238,7 +254,7 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
while (current_mp) {
next_mp = current_mp->b_cont;
eop = (next_mp == NULL); /* Last fragment of the packet? */
- next_len = eop ? 0: MBLK_LEN(next_mp);
+ next_len = eop ? 0: MBLKL(next_mp);
/*
* When the current fragment is an empty fragment, if
@@ -254,7 +270,7 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
if ((current_len == 0) && (copy_done)) {
current_mp = next_mp;
current_len = next_len;
- current_flag = (current_len <= tx_ring->copy_thresh) ?
+ current_flag = (current_len <= copy_thresh) ?
USE_COPY : USE_DMA;
continue;
}
@@ -302,10 +318,10 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
* copied to the current tx buffer, we need
* to complete the current copy processing.
*/
- next_flag = (next_len > tx_ring->copy_thresh) ?
+ next_flag = (next_len > copy_thresh) ?
USE_DMA: USE_COPY;
copy_done = B_TRUE;
- } else if (next_len > tx_ring->copy_thresh) {
+ } else if (next_len > copy_thresh) {
/*
* The next fragment needs to be processed with
* DMA binding. So the copy processing will be
@@ -329,7 +345,7 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
* Check whether to use bcopy or DMA binding to process
* the next fragment.
*/
- next_flag = (next_len > tx_ring->copy_thresh) ?
+ next_flag = (next_len > copy_thresh) ?
USE_DMA: USE_COPY;
ASSERT(copy_done == B_TRUE);
@@ -367,7 +383,7 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
/*
* If the number of free tx descriptors is not enough for transmit
- * then return failure.
+ * then return mp.
*
* Note: we must put this check under the mutex protection to
* ensure the correctness when multiple threads access it in
@@ -386,7 +402,7 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
mutex_exit(&tx_ring->tx_lock);
- return (B_TRUE);
+ return (NULL);
tx_failure:
/*
@@ -410,7 +426,7 @@ tx_failure:
/* Transmit failed, do not drop the mblk, reschedule the transmit */
tx_ring->reschedule = B_TRUE;
- return (B_FALSE);
+ return (mp);
}
/*
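The boolean_t to mblk_t * conversion above follows the new GLDv3 transmit contract: a NULL return means the packet was consumed, while a non-NULL return hands the unsent mblk back to the MAC layer, which queues it until the ring reports room through mac_tx_ring_update(). A minimal caller-side sketch, assuming a hypothetical enqueue_pending() helper (the real consumer is the MAC scheduling engine in mac_sched.c):

	mblk_t *rest;

	rest = ixgbe_tx(tx_ring, mp);
	if (rest != NULL) {
		/*
		 * The ring ran out of descriptors and set
		 * tx_ring->reschedule; hold on to 'rest' and retry
		 * once mac_tx_ring_update() signals free space.
		 */
		enqueue_pending(rest);	/* hypothetical helper */
	}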
@@ -536,7 +552,9 @@ static int
ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
{
uint32_t start;
- uint32_t flags;
+ uint32_t hckflags;
+ uint32_t lsoflags;
+ uint32_t mss;
uint32_t len;
uint32_t size;
uint32_t offset;
@@ -548,16 +566,16 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
ASSERT(mp != NULL);
- hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &flags);
+ hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &hckflags);
bzero(ctx, sizeof (ixgbe_tx_context_t));
- ctx->hcksum_flags = flags;
- if (flags == 0)
+ if (hckflags == 0)
return (0);
+ ctx->hcksum_flags = hckflags;
- ctx->mss = DB_LSOMSS(mp);
- ctx->lso_flag = (ctx->hcksum_flags & HW_LSO) &&
- (ctx->mss != 0);
+ lso_info_get(mp, &mss, &lsoflags);
+ ctx->mss = mss;
+ ctx->lso_flag = (lsoflags == HW_LSO);
/*
* LSO relies on tx h/w checksum, so here we drop the packet
@@ -582,12 +600,12 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
* in one mblk fragment, so we go through the fragments to parse
* the ether type.
*/
- size = len = MBLK_LEN(mp);
+ size = len = MBLKL(mp);
offset = offsetof(struct ether_header, ether_type);
while (size <= offset) {
mp = mp->b_cont;
ASSERT(mp != NULL);
- len = MBLK_LEN(mp);
+ len = MBLKL(mp);
size += len;
}
pos = mp->b_rptr + offset + len - size;
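This offset walk recurs throughout ixgbe_get_context(); the idiom is captured by the following self-contained sketch, a hypothetical helper that is not part of the driver:

	#include <sys/stream.h>
	#include <sys/strsun.h>		/* MBLKL() */
	#include <sys/debug.h>		/* ASSERT() */

	/*
	 * Return a pointer to the byte at 'offset' within a chained mblk,
	 * assuming the chain covers the offset. Mirrors the inline walks
	 * in ixgbe_get_context().
	 */
	static uint8_t *
	mblk_offset_ptr(mblk_t *mp, size_t offset)
	{
		size_t size = MBLKL(mp);	/* bytes seen so far */
		size_t len = size;		/* current mblk's length */

		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		/* 'offset' lies (offset - (size - len)) bytes into mp */
		return (mp->b_rptr + offset + len - size);
	}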
@@ -601,7 +619,7 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
while (size <= offset) {
mp = mp->b_cont;
ASSERT(mp != NULL);
- len = MBLK_LEN(mp);
+ len = MBLKL(mp);
size += len;
}
pos = mp->b_rptr + offset + len - size;
@@ -613,25 +631,32 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
}
/*
- * Here we assume the IP(V6) header is fully included in
+ * Here we don't assume the IP(V6) header is fully included in
* one mblk fragment.
*/
switch (etype) {
case ETHERTYPE_IP:
- offset = mac_hdr_len;
- while (size <= offset) {
- mp = mp->b_cont;
- ASSERT(mp != NULL);
- len = MBLK_LEN(mp);
- size += len;
- }
- pos = mp->b_rptr + offset + len - size;
-
if (ctx->lso_flag) {
- *((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
- ipha_length))) = 0;
- *((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
- ipha_hdr_checksum))) = 0;
+ offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
+ while (size <= offset) {
+ mp = mp->b_cont;
+ ASSERT(mp != NULL);
+ len = MBLKL(mp);
+ size += len;
+ }
+ pos = mp->b_rptr + offset + len - size;
+ *((uint16_t *)(uintptr_t)(pos)) = 0;
+
+ offset = offsetof(ipha_t, ipha_hdr_checksum) +
+ mac_hdr_len;
+ while (size <= offset) {
+ mp = mp->b_cont;
+ ASSERT(mp != NULL);
+ len = MBLKL(mp);
+ size += len;
+ }
+ pos = mp->b_rptr + offset + len - size;
+ *((uint16_t *)(uintptr_t)(pos)) = 0;
/*
* To perform ixgbe LSO, we also need to fill
@@ -642,14 +667,23 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
*/
}
- l4_proto = *(uint8_t *)(pos + offsetof(ipha_t, ipha_protocol));
+ offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
+ while (size <= offset) {
+ mp = mp->b_cont;
+ ASSERT(mp != NULL);
+ len = MBLKL(mp);
+ size += len;
+ }
+ pos = mp->b_rptr + offset + len - size;
+
+ l4_proto = *(uint8_t *)pos;
break;
case ETHERTYPE_IPV6:
offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
while (size <= offset) {
mp = mp->b_cont;
ASSERT(mp != NULL);
- len = MBLK_LEN(mp);
+ len = MBLKL(mp);
size += len;
}
pos = mp->b_rptr + offset + len - size;
@@ -667,7 +701,7 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
while (size <= offset) {
mp = mp->b_cont;
ASSERT(mp != NULL);
- len = MBLK_LEN(mp);
+ len = MBLKL(mp);
size += len;
}
pos = mp->b_rptr + offset + len - size;
@@ -702,13 +736,14 @@ ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
return (B_FALSE);
/*
- * Compare the checksum data retrieved from the mblk and the
- * stored checksum data of the last context descriptor. The data
- * need to be checked are:
+ * Compare the context data retrieved from the mblk and the
+ * stored data of the last context descriptor. The data that
+ * need to be checked are:
* hcksum_flags
* l4_proto
* mac_hdr_len
* ip_hdr_len
+ * lso_flag
* mss (only checked for LSO)
* l4_hdr_len (only checked for LSO)
* If any one of the above data is changed, a new context descriptor
@@ -716,16 +751,14 @@ ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
*/
last = &tx_ring->tx_context;
- if (ctx->hcksum_flags != 0) {
- if ((ctx->hcksum_flags != last->hcksum_flags) ||
- (ctx->l4_proto != last->l4_proto) ||
- (ctx->mac_hdr_len != last->mac_hdr_len) ||
- (ctx->ip_hdr_len != last->ip_hdr_len) ||
- (ctx->lso_flag && ((ctx->mss != last->mss) ||
- (ctx->l4_hdr_len != last->l4_hdr_len)))) {
-
- return (B_TRUE);
- }
+ if ((ctx->hcksum_flags != last->hcksum_flags) ||
+ (ctx->l4_proto != last->l4_proto) ||
+ (ctx->mac_hdr_len != last->mac_hdr_len) ||
+ (ctx->ip_hdr_len != last->ip_hdr_len) ||
+ (ctx->lso_flag != last->lso_flag) ||
+ (ctx->lso_flag && ((ctx->mss != last->mss) ||
+ (ctx->l4_hdr_len != last->l4_hdr_len)))) {
+ return (B_TRUE);
}
return (B_FALSE);
@@ -738,11 +771,11 @@ ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
*/
static void
ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
- ixgbe_tx_context_t *ctx)
+ ixgbe_tx_context_t *ctx, int ring_index)
{
/*
* Fill the context descriptor with the checksum
- * context information we've got
+ * context information we've got.
*/
ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
@@ -775,12 +808,12 @@ ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
}
ctx_tbd->seqnum_seed = 0;
+ ctx_tbd->mss_l4len_idx = ring_index << 4;
+
if (ctx->lso_flag) {
- ctx_tbd->mss_l4len_idx =
+ ctx_tbd->mss_l4len_idx |=
(ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
(ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
- } else {
- ctx_tbd->mss_l4len_idx = 0;
}
}
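For reference, a sketch of how mss_l4len_idx packs, assuming the conventional 82598/82599 advanced context descriptor layout implied by the shifts used above (ring index at bit 4, L4LEN at bit 8, MSS at bit 16):

	/*
	 *  31            16 15           8 7     4 3     0
	 * +----------------+--------------+-------+-------+
	 * |      MSS       |    L4LEN     |  IDX  | rsvd  |
	 * +----------------+--------------+-------+-------+
	 *
	 * e.g. ring index 3, l4_hdr_len 20, mss 1460:
	 *	(3 << 4) | (20 << IXGBE_ADVTXD_L4LEN_SHIFT) |
	 *	    (1460 << IXGBE_ADVTXD_MSS_SHIFT)
	 */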
@@ -838,7 +871,7 @@ ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
*/
ixgbe_fill_context(
(struct ixgbe_adv_tx_context_desc *)tbd,
- ctx);
+ ctx, tx_ring->index);
index = NEXT_INDEX(index, 1, tx_ring->ring_size);
desc_num++;
@@ -908,6 +941,14 @@ ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
*/
ASSERT(first_tbd != NULL);
first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
+ first_tbd->read.olinfo_status |= (tx_ring->index << 4);
+
+ if (ctx != NULL && ctx->lso_flag) {
+ first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
+ first_tbd->read.olinfo_status |=
+ (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
+ - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
+ }
if (ctx != NULL && ctx->lso_flag) {
first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
@@ -1017,14 +1058,18 @@ ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
* The recycle lock is now entered unconditionally (mutex_enter()
* rather than mutex_tryenter()): recycling must make progress so
* that a pending transmit reschedule can be triggered.
*/
- if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
- return (0);
+ mutex_enter(&tx_ring->recycle_lock);
ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
if (tx_ring->tbd_free == tx_ring->ring_size) {
tx_ring->recycle_fail = 0;
tx_ring->stall_watchdog = 0;
+ if (tx_ring->reschedule) {
+ tx_ring->reschedule = B_FALSE;
+ mac_tx_ring_update(tx_ring->ixgbe->mac_hdl,
+ tx_ring->ring_handle);
+ }
mutex_exit(&tx_ring->recycle_lock);
return (0);
}
@@ -1108,6 +1153,12 @@ ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
*/
atomic_add_32(&tx_ring->tbd_free, desc_num);
+ if ((tx_ring->tbd_free >= tx_ring->resched_thresh) &&
+ (tx_ring->reschedule)) {
+ tx_ring->reschedule = B_FALSE;
+ mac_tx_ring_update(tx_ring->ixgbe->mac_hdl,
+ tx_ring->ring_handle);
+ }
mutex_exit(&tx_ring->recycle_lock);
/*
@@ -1152,14 +1203,18 @@ ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
* The recycle lock is now entered unconditionally (mutex_enter()
* rather than mutex_tryenter()): recycling must make progress so
* that a pending transmit reschedule can be triggered.
*/
- if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
- return (0);
+ mutex_enter(&tx_ring->recycle_lock);
ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
if (tx_ring->tbd_free == tx_ring->ring_size) {
tx_ring->recycle_fail = 0;
tx_ring->stall_watchdog = 0;
+ if (tx_ring->reschedule) {
+ tx_ring->reschedule = B_FALSE;
+ mac_tx_ring_update(tx_ring->ixgbe->mac_hdl,
+ tx_ring->ring_handle);
+ }
mutex_exit(&tx_ring->recycle_lock);
return (0);
}
@@ -1245,6 +1300,12 @@ ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
*/
atomic_add_32(&tx_ring->tbd_free, desc_num);
+ if ((tx_ring->tbd_free >= tx_ring->resched_thresh) &&
+ (tx_ring->reschedule)) {
+ tx_ring->reschedule = B_FALSE;
+ mac_tx_ring_update(tx_ring->ixgbe->mac_hdl,
+ tx_ring->ring_handle);
+ }
mutex_exit(&tx_ring->recycle_lock);
/*
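Both recycle paths now complete the same flow-control handshake with the send side. A condensed sketch of the two halves, simplified from the code above:

	/*
	 * Send side (ixgbe_tx), under tx_lock:
	 *
	 *	if (tx_ring->tbd_free < desc_total) {
	 *		tx_ring->reschedule = B_TRUE;
	 *		return (mp);		(mac layer queues mp)
	 *	}
	 *
	 * Recycle side, under recycle_lock:
	 *
	 *	atomic_add_32(&tx_ring->tbd_free, desc_num);
	 *	if (tx_ring->tbd_free >= tx_ring->resched_thresh &&
	 *	    tx_ring->reschedule) {
	 *		tx_ring->reschedule = B_FALSE;
	 *		mac_tx_ring_update(tx_ring->ixgbe->mac_hdl,
	 *		    tx_ring->ring_handle);
	 *	}
	 */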
diff --git a/usr/src/uts/common/io/mac/README b/usr/src/uts/common/io/mac/README
new file mode 100644
index 0000000000..744c9842c3
--- /dev/null
+++ b/usr/src/uts/common/io/mac/README
@@ -0,0 +1,80 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#
+
+This README describes the organization of the files and subdirectories
+that make up the misc/mac module.
+
+Changes to the sources should follow the layout and naming conventions
+adopted herein.
+
+Each functional component of the mac module is implemented in a separate
+source file. The external interfaces are declared in header files delivered
+under <sys>. The internal data structures and definitions are declared
+in header files internal to this directory.
+
+. Client Interface
+ This is the kernel programming interface for accessing L2 services as
+ a consumer.
+ . mac_client.c
+ . sys/mac_client.h: APIs intended for external MAC consumers
+ . sys/mac_client_priv.h: APIs for GLDv3 components only (dld,
+ dls, aggr, vnic, etc).
+ . mac_client_impl.h: Internals.
+
+. Provider Interface
+ This is the GLDv3 kernel driver interface. Functions and data structures
+ are used by L2 drivers to provide services to MAC consumers.
+ . mac_provider.c
+ . sys/mac_provider.h
+
+. MAC Type Plugins
+ The GLDv3 L2 supports multiple types of media control. Each type is
+ implemented as a plugin delivered in a separate file under the
+ plugin/ directory.
+ Add a new file to the plugin/ directory for introducing a new MAC type.
+
+. Core Component
+ - Scheduling Engine:
+ . mac_datapath_setup.c: Control path for the scheduler.
+ . mac_soft_ring.c,
+ mac_soft_ring.h: Fanout Soft Rings.
+ . mac_sched.c: Data path
+ . mac_bcast.c: Data path and switching for broadcast and
+ multicast packets.
+ . mac_stat.c: Statistics
+
+ - Classification Engine
+ . mac_flow.c: Flows and software classification.
+
+ - NIC Resources Management
+ . mac.c (this file also has other miscellanea)
+
+. Misc
+ . mac.c
+ . mac_util.c
+ . mac_ndd.c
+
diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c
index a7c472bfb2..1ee6d36cd6 100644
--- a/usr/src/uts/common/io/mac/mac.c
+++ b/usr/src/uts/common/io/mac/mac.c
@@ -24,9 +24,246 @@
* Use is subject to license terms.
*/
-
/*
* MAC Services Module
+ *
+ * The GLDv3 framework locking - The MAC layer
+ * --------------------------------------------
+ *
+ * The MAC layer is central to the GLD framework and can provide the locking
+ * framework needed for itself and for the use of MAC clients. MAC end points
+ * are fairly disjoint and don't share a lot of state. So a coarse-grained
+ * multi-threading scheme is to single-thread all create/modify/delete or set
+ * type of control operations on a per mac end point basis, while allowing
+ * data threads to run concurrently.
+ *
+ * Control operations (set) that modify a mac end point are always serialized on
+ * a per mac end point basis; we have at most one such thread per mac end point
+ * at a time.
+ *
+ * All other operations that are not serialized are essentially multi-threaded.
+ * An example is a control operation (get), like reading statistics, that may
+ * not care about reading values atomically, or data threads sending or
+ * receiving data. Mostly these types of operations don't modify the control
+ * state. Any state these operations care about is protected using
+ * traditional locks.
+ *
+ * The perimeter only serializes serial operations. It does not imply there
+ * aren't any other concurrent operations. However a serialized operation may
+ * sometimes need to make sure it is the only thread. In this case it needs
+ * to use reference counting mechanisms to cv_wait until any current data
+ * threads are done.
+ *
+ * The mac layer itself does not hold any locks across a call to another layer.
+ * The perimeter is however held across a down call to the driver to make the
+ * whole control operation atomic with respect to other control operations.
+ * Also the data path and get type control operations may proceed concurrently.
+ * These operations synchronize with the single serial operation on a given mac
+ * end point using regular locks. The perimeter ensures that conflicting
+ * operations like say a mac_multicast_add and a mac_multicast_remove on the
+ * same mac end point don't interfere with each other and also ensures that the
+ * changes in the mac layer and the call to the underlying driver to say add a
+ * multicast address are done atomically without interference from a thread
+ * trying to delete the same address.
+ *
+ * For example, consider
+ * mac_multicst_add()
+ * {
+ * mac_perimeter_enter(); serialize all control operations
+ *
+ * grab list lock protect against access by data threads
+ * add to list
+ * drop list lock
+ *
+ * call driver's mi_multicst
+ *
+ * mac_perimeter_exit();
+ * }
+ *
+ * To lessen the number of serialization locks and simplify the lock hierarchy,
+ * we serialize all the control operations on a per mac end point by using a
+ * single serialization lock called the perimeter. We allow recursive entry into
+ * the perimeter to facilitate use of this mechanism by both the mac client and
+ * the MAC layer itself.
+ *
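A minimal sketch of a client serializing a block of control operations this way (hypothetical client code; the enter/exit interfaces appear later in this file):

	mac_perim_handle_t mph;

	mac_perim_enter_by_mh(mh, &mph);
	/*
	 * A sequence of mac calls made here, e.g. a multicast add
	 * followed by a driver property update, is atomic with respect
	 * to all other control operations on this mac end point.
	 * Re-entry by the same thread is allowed and counted.
	 */
	mac_perim_exit(mph);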
+ * MAC client means an entity that does an operation on a mac handle
+ * obtained from a mac_open/mac_client_open. Similarly MAC driver means
+ * an entity that does an operation on a mac handle obtained from a
+ * mac_register. An entity could be both client and driver but on different
+ * handles eg. aggr. and should only make the corresponding mac interface calls
+ * i.e. mac driver interface or mac client interface as appropriate for that
+ * mac handle.
+ *
+ * General rules.
+ * -------------
+ *
+ * R1. The lock order of upcall threads is naturally opposite to downcall
+ * threads. Hence upcalls must not hold any locks across layers for fear of
+ * recursive lock enter and lock order violation. This applies to all layers.
+ *
+ * R2. The perimeter is just another lock. Since it is held in the down
+ * direction, acquiring the perimeter in an upcall is prohibited as it would
+ * cause a deadlock. This applies to all layers.
+ *
+ * Note that upcalls that need to grab the mac perimeter (for example
+ * mac_notify upcalls) can still achieve that by posting the request to a
+ * thread, which can then grab all the required perimeters and locks in the
+ * right global order. Note that in the above example the mac layer itself
+ * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
+ * to the client must do that. Please see the aggr code for an example.
+ *
+ * MAC client rules
+ * ----------------
+ *
+ * R3. A MAC client may use the MAC provided perimeter facility to serialize
+ * control operations on a per mac end point basis. It does this by acquiring
+ * and holding the perimeter across a sequence of calls to the mac layer.
+ * This ensures atomicity across the entire block of mac calls. In this
+ * model the MAC client must not hold any client locks across the calls to
+ * the mac layer. This model is the preferred solution.
+ *
+ * R4. However if a MAC client has a lot of global state across all mac end
+ * points the per mac end point serialization may not be sufficient. In this
+ * case the client may choose to use global locks or use its own serialization.
+ * To avoid deadlocks, these client layer locks held across the mac calls
+ * in the control path must never be acquired by the data path for the reason
+ * mentioned below.
+ *
+ * (Assume that a control operation that holds a client lock blocks in the
+ * mac layer waiting for upcall reference counts to drop to zero. If an upcall
+ * data thread that holds this reference count, tries to acquire the same
+ * client lock subsequently it will deadlock).
+ *
+ * A MAC client may follow either the R3 model or the R4 model, but can't
+ * mix both. In the former, the hierarchy is Perim -> client locks, but in
+ * the latter it is client locks -> Perim.
+ *
+ * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
+ * context since they may block while trying to acquire the perimeter.
+ * In addition some calls may block waiting for upcall refcnts to come down to
+ * zero.
+ *
+ * R6. MAC clients must make sure that they are single threaded and all threads
+ * from the top (in particular data threads) have finished before calling
+ * mac_client_close. The MAC framework does not track the number of client
+ * threads using the mac client handle. Also mac clients must make sure
+ * they have undone all the control operations before calling mac_client_close.
+ * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
+ * mac_unicast_add/mac_multicast_add.
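The R6 teardown ordering, sketched with argument lists elided (illustrative only):

	/*
	 *	mac_multicast_remove(...);	undo every mac_multicast_add
	 *	mac_unicast_remove(...);	undo every mac_unicast_add
	 *	<drain and join all client data threads>
	 *	mac_client_close(...);		safe only now: the framework
	 *					does not track client threads
	 */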
+ *
+ * MAC framework rules
+ * -------------------
+ *
+ * R7. The mac layer itself must not hold any mac layer locks (except the mac
+ * perimeter) across a call to any other layer from the mac layer. The call to
+ * any other layer could be via mi_* entry points, classifier entry points into
+ * the driver or via upcall pointers into layers above. The mac perimeter may
+ * be acquired or held only in the down direction, for e.g. when calling into
+ * an mi_* driver entry point to provide atomicity of the operation.
+ *
+ * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
+ * mac driver interfaces, the MAC layer must provide a cut out for control
+ * interfaces like upcall notifications and start them in a separate thread.
+ *
+ * R9. Note that locking order also implies a plumbing order. For example
+ * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
+ * to plumb in any other order must be failed at mac_open time, otherwise it
+ * could lead to deadlocks due to inverse locking order.
+ *
+ * R10. MAC driver interfaces must not block since the driver could call them
+ * in interrupt context.
+ *
+ * R11. Walkers must preferably not hold any locks while calling walker
+ * callbacks. Instead these can operate on reference counts. In simple
+ * callbacks it may be ok to hold a lock and call the callbacks, but this is
+ * harder to maintain in the general case of arbitrary callbacks.
+ *
+ * R12. The MAC layer must protect upcall notification callbacks using reference
+ * counts rather than holding locks across the callbacks.
+ *
+ * R13. Given the variety of drivers, it is preferable if the MAC layer can make
+ * sure that any pointers (such as mac ring pointers) it passes to the driver
+ * remain valid until mac unregister time. Currently the mac layer achieves
+ * this by using generation numbers for rings and freeing the mac rings only
+ * at unregister time. The MAC layer must provide a layer of indirection and
+ * must not expose underlying driver rings or driver data structures/pointers
+ * directly to MAC clients.
+ *
+ * MAC driver rules
+ * ----------------
+ *
+ * R14. It would be preferable if MAC drivers don't hold any locks across any
+ * mac call. However at a minimum they must not hold any locks across data
+ * upcalls. They must also make sure that all references to mac data structures
+ * are cleaned up and that it is single threaded at mac_unregister time.
+ *
+ * R15. MAC driver interfaces don't block and so the action may be done
+ * asynchronously in a separate thread as for example handling notifications.
+ * The driver must not assume that the action is complete when the call
+ * returns.
+ *
+ * R16. Drivers must maintain a generation number per Rx ring, and pass it
+ * back to mac_rx_ring(); They are expected to increment the generation
+ * number whenever the ring's stop routine is invoked.
+ * See comments in mac_rx_ring();
+ *
+ * R17. Similarly, mi_stop is another synchronization point and the driver must
+ * ensure that all upcalls are done and there won't be any future upcall
+ * before returning from mi_stop.
+ *
+ * R18. The driver may assume that all set/modify control operations via
+ * the mi_* entry points are single threaded on a per mac end point.
+ *
+ * Lock and Perimeter hierarchy scenarios
+ * ---------------------------------------
+ *
+ * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
+ *
+ * ft_lock -> fe_lock [mac_flow_lookup]
+ *
+ * mi_rw_lock -> fe_lock [mac_bcast_send]
+ *
+ * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
+ *
+ * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
+ *
+ * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
+ *
+ * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
+ * client to driver. In the case of clients that explicitly use the
+ * mac-provided perimeter mechanism for their serialization, the hierarchy is
+ * Perimeter -> mac layer locks, since the client never holds any locks across
+ * the mac calls. In the case of clients that use their own locks the hierarchy
+ * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
+ * calls mac_perim_enter/exit in this case.
+ *
+ * Subflow creation rules
+ * ---------------------------
+ * o If a user-specified cpulist is present on both the underlying link and
+ * the flows, each flow's cpulist must be a subset of the underlying link's.
+ * o If a user-specified fanout mode is present on both link and flow, the
+ * subflow fanout count has to be less than or equal to that of the
+ * underlying link. The cpu-bindings for the subflows will be a subset of
+ * those of the underlying link.
+ * o If no cpulist is specified on either the underlying link or the flow,
+ * the underlying link relies on a MAC tunable to provide out-of-the-box
+ * fanout. The subflow will have no cpulist (the subflow will be unbound).
+ * o If no cpulist is specified on the underlying link, a subflow can carry
+ * either a user-specified cpulist or fanout count. The cpu-bindings for
+ * the subflow need not be a subset of those of the underlying link.
+ * o If the underlying link carries either a user-specified cpulist or
+ * fanout mode and the subflow is unspecified, the subflow will be created
+ * unbound.
+ * o While creating unbound subflows, bandwidth mode changes attempt to
+ * figure out an appropriate fanout count. In such cases the fanout count
+ * will override the unbound cpu-binding behavior.
+ * o In addition, while cycling between flow and link properties, we impose
+ * the restriction that if a link has a subflow with user-specified
+ * attributes, we will not allow changing the link property. The
+ * administrator needs to reset all the user-specified properties for the
+ * subflows before attempting a link property change.
+ * Some of the above rules can be overridden by specifying additional command
+ * line options while creating or modifying link or subflow properties.
*/
#include <sys/types.h>
@@ -39,11 +276,13 @@
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
-#include <sys/dls.h>
#include <sys/modhash.h>
-#include <sys/vlan.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
#include <sys/mac_impl.h>
+#include <sys/mac.h>
+#include <sys/dls.h>
#include <sys/dld.h>
#include <sys/modctl.h>
#include <sys/fs/dv_node.h>
@@ -52,20 +291,45 @@
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/atomic.h>
+#include <sys/bitmap.h>
+#include <sys/sdt.h>
+#include <sys/mac_flow.h>
+#include <sys/ddi_intr_impl.h>
+#include <sys/disp.h>
#include <sys/sdt.h>
+#include <sys/vnic.h>
+#include <sys/vnic_impl.h>
+#include <sys/vlan.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <sys/exacct.h>
+#include <sys/exacct_impl.h>
#include <inet/nd.h>
#include <sys/ethernet.h>
#define IMPL_HASHSZ 67 /* prime */
-static kmem_cache_t *i_mac_impl_cachep;
-static mod_hash_t *i_mac_impl_hash;
+kmem_cache_t *i_mac_impl_cachep;
+mod_hash_t *i_mac_impl_hash;
krwlock_t i_mac_impl_lock;
uint_t i_mac_impl_count;
-static kmem_cache_t *mac_vnic_tx_cache;
+static kmem_cache_t *mac_ring_cache;
static id_space_t *minor_ids;
static uint32_t minor_count;
+/*
+ * Logging stuff. Perhaps mac_logging_interval could be broken into
+ * mac_flow_log_interval and mac_link_log_interval if we want to be
+ * able to schedule them differently.
+ */
+uint_t mac_logging_interval;
+boolean_t mac_flow_log_enable;
+boolean_t mac_link_log_enable;
+timeout_id_t mac_logging_timer;
+
+/* for debugging, see MAC_DBG_PRT() in mac_impl.h */
+int mac_dbg = 0;
+
#define MACTYPE_KMODDIR "mac"
#define MACTYPE_HASHSZ 67
static mod_hash_t *i_mactype_hash;
@@ -75,295 +339,75 @@ static mod_hash_t *i_mactype_hash;
*/
static kmutex_t i_mactype_lock;
-static void i_mac_notify_thread(void *);
-static mblk_t *mac_vnic_tx(void *, mblk_t *);
-static mblk_t *mac_vnic_txloop(void *, mblk_t *);
-static void mac_register_priv_prop(mac_impl_t *, mac_priv_prop_t *, uint_t);
-static void mac_unregister_priv_prop(mac_impl_t *);
-
/*
- * Private functions.
+ * mac_tx_percpu_cnt
+ *
+ * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
+ * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
+ * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
+ * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
*/
-
-/*ARGSUSED*/
-static int
-i_mac_constructor(void *buf, void *arg, int kmflag)
-{
- mac_impl_t *mip = buf;
-
- bzero(buf, sizeof (mac_impl_t));
-
- mip->mi_linkstate = LINK_STATE_UNKNOWN;
-
- rw_init(&mip->mi_state_lock, NULL, RW_DRIVER, NULL);
- rw_init(&mip->mi_gen_lock, NULL, RW_DRIVER, NULL);
- rw_init(&mip->mi_data_lock, NULL, RW_DRIVER, NULL);
- rw_init(&mip->mi_notify_lock, NULL, RW_DRIVER, NULL);
- rw_init(&mip->mi_rx_lock, NULL, RW_DRIVER, NULL);
- rw_init(&mip->mi_tx_lock, NULL, RW_DRIVER, NULL);
- rw_init(&mip->mi_resource_lock, NULL, RW_DRIVER, NULL);
- mutex_init(&mip->mi_activelink_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&mip->mi_notify_bits_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&mip->mi_notify_cv, NULL, CV_DRIVER, NULL);
- mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&mip->mi_rx_cv, NULL, CV_DRIVER, NULL);
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-i_mac_destructor(void *buf, void *arg)
-{
- mac_impl_t *mip = buf;
-
- ASSERT(mip->mi_ref == 0);
- ASSERT(!mip->mi_exclusive);
- ASSERT(mip->mi_active == 0);
- ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
- ASSERT(mip->mi_devpromisc == 0);
- ASSERT(mip->mi_promisc == 0);
- ASSERT(mip->mi_mmap == NULL);
- ASSERT(mip->mi_mmrp == NULL);
- ASSERT(mip->mi_mnfp == NULL);
- ASSERT(mip->mi_resource_add == NULL);
- ASSERT(mip->mi_ksp == NULL);
- ASSERT(mip->mi_kstat_count == 0);
- ASSERT(mip->mi_notify_bits == 0);
- ASSERT(mip->mi_notify_thread == NULL);
-
- rw_destroy(&mip->mi_gen_lock);
- rw_destroy(&mip->mi_state_lock);
- rw_destroy(&mip->mi_data_lock);
- rw_destroy(&mip->mi_notify_lock);
- rw_destroy(&mip->mi_rx_lock);
- rw_destroy(&mip->mi_tx_lock);
- rw_destroy(&mip->mi_resource_lock);
- mutex_destroy(&mip->mi_activelink_lock);
- mutex_destroy(&mip->mi_notify_bits_lock);
- cv_destroy(&mip->mi_notify_cv);
- mutex_destroy(&mip->mi_lock);
- cv_destroy(&mip->mi_rx_cv);
-}
+int mac_tx_percpu_cnt;
+int mac_tx_percpu_cnt_max = 128;
+
+static int i_mac_constructor(void *, void *, int);
+static void i_mac_destructor(void *, void *);
+static int i_mac_ring_ctor(void *, void *, int);
+static void i_mac_ring_dtor(void *, void *);
+static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
+void mac_tx_client_flush(mac_client_impl_t *);
+void mac_tx_client_block(mac_client_impl_t *);
+static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
+static int mac_start_group_and_rings(mac_group_t *);
+static void mac_stop_group_and_rings(mac_group_t *);
/*
- * mac_vnic_tx_t kmem cache support functions.
+ * Module initialization functions.
*/
-/* ARGSUSED */
-static int
-i_mac_vnic_tx_ctor(void *buf, void *arg, int mkflag)
-{
- mac_vnic_tx_t *vnic_tx = buf;
-
- bzero(buf, sizeof (mac_vnic_tx_t));
- mutex_init(&vnic_tx->mv_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&vnic_tx->mv_cv, NULL, CV_DRIVER, NULL);
- return (0);
-}
-
-/* ARGSUSED */
-static void
-i_mac_vnic_tx_dtor(void *buf, void *arg)
-{
- mac_vnic_tx_t *vnic_tx = buf;
-
- ASSERT(vnic_tx->mv_refs == 0);
- mutex_destroy(&vnic_tx->mv_lock);
- cv_destroy(&vnic_tx->mv_cv);
-}
-
-static void
-i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
+void
+mac_init(void)
{
- rw_enter(&i_mac_impl_lock, RW_READER);
- if (mip->mi_disabled)
- goto exit;
-
- /*
- * Guard against incorrect notifications. (Running a newer
- * mac client against an older implementation?)
- */
- if (type >= MAC_NNOTE)
- goto exit;
+ mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
+ boot_max_ncpus);
- mutex_enter(&mip->mi_notify_bits_lock);
- mip->mi_notify_bits |= (1 << type);
- cv_broadcast(&mip->mi_notify_cv);
- mutex_exit(&mip->mi_notify_bits_lock);
+ /* Upper bound is mac_tx_percpu_cnt_max */
+ if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
+ mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;
-exit:
- rw_exit(&i_mac_impl_lock);
-}
+ if (mac_tx_percpu_cnt < 1) {
+ /* Someone set mac_tx_percpu_cnt_max to 0 or less */
+ mac_tx_percpu_cnt = 1;
+ }
-static void
-i_mac_log_link_state(mac_impl_t *mip)
-{
+ ASSERT(mac_tx_percpu_cnt >= 1);
+ mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
/*
- * If no change, then it is not interesting.
+ * Make it of the form 2**N - 1 in the range
+ * [0 .. mac_tx_percpu_cnt_max - 1]
*/
- if (mip->mi_lastlinkstate == mip->mi_linkstate)
- return;
-
- switch (mip->mi_linkstate) {
- case LINK_STATE_UP:
- if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
- char det[200];
-
- mip->mi_type->mt_ops.mtops_link_details(det,
- sizeof (det), (mac_handle_t)mip, mip->mi_pdata);
-
- cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
- } else {
- cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
- }
- break;
-
- case LINK_STATE_DOWN:
- /*
- * Only transitions from UP to DOWN are interesting
- */
- if (mip->mi_lastlinkstate != LINK_STATE_UNKNOWN)
- cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
- break;
-
- case LINK_STATE_UNKNOWN:
- /*
- * This case is normally not interesting.
- */
- break;
- }
- mip->mi_lastlinkstate = mip->mi_linkstate;
-}
-
-static void
-i_mac_notify_thread(void *arg)
-{
- mac_impl_t *mip = arg;
- callb_cpr_t cprinfo;
-
- CALLB_CPR_INIT(&cprinfo, &mip->mi_notify_bits_lock, callb_generic_cpr,
- "i_mac_notify_thread");
-
- mutex_enter(&mip->mi_notify_bits_lock);
- for (;;) {
- uint32_t bits;
- uint32_t type;
-
- bits = mip->mi_notify_bits;
- if (bits == 0) {
- CALLB_CPR_SAFE_BEGIN(&cprinfo);
- cv_wait(&mip->mi_notify_cv, &mip->mi_notify_bits_lock);
- CALLB_CPR_SAFE_END(&cprinfo, &mip->mi_notify_bits_lock);
- continue;
- }
- mip->mi_notify_bits = 0;
-
- if ((bits & (1 << MAC_NNOTE)) != 0) {
- /* request to quit */
- ASSERT(mip->mi_disabled);
- break;
- }
-
- mutex_exit(&mip->mi_notify_bits_lock);
-
- /*
- * Log link changes.
- */
- if ((bits & (1 << MAC_NOTE_LINK)) != 0)
- i_mac_log_link_state(mip);
-
- /*
- * Do notification callbacks for each notification type.
- */
- for (type = 0; type < MAC_NNOTE; type++) {
- mac_notify_fn_t *mnfp;
-
- if ((bits & (1 << type)) == 0) {
- continue;
- }
-
- /*
- * Walk the list of notifications.
- */
- rw_enter(&mip->mi_notify_lock, RW_READER);
- for (mnfp = mip->mi_mnfp; mnfp != NULL;
- mnfp = mnfp->mnf_nextp) {
-
- mnfp->mnf_fn(mnfp->mnf_arg, type);
- }
- rw_exit(&mip->mi_notify_lock);
- }
-
- mutex_enter(&mip->mi_notify_bits_lock);
- }
-
- mip->mi_notify_thread = NULL;
- cv_broadcast(&mip->mi_notify_cv);
-
- CALLB_CPR_EXIT(&cprinfo);
-
- thread_exit();
-}
-
-static mactype_t *
-i_mactype_getplugin(const char *pname)
-{
- mactype_t *mtype = NULL;
- boolean_t tried_modload = B_FALSE;
-
- mutex_enter(&i_mactype_lock);
+ mac_tx_percpu_cnt--;
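A worked instance of the sizing logic, assuming boot_max_ncpus of 12 and the default mac_tx_percpu_cnt_max of 128:

	/*
	 *	mac_tx_percpu_cnt = 12		(from boot_max_ncpus)
	 *	capped at 128			-> 12
	 *	1 << highbit(12 - 1)		-> 16  (round up to 2**N)
	 *	mac_tx_percpu_cnt--		-> 15  (2**N - 1, usable as
	 *					       an index mask)
	 */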
-find_registered_mactype:
- if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
- (mod_hash_val_t *)&mtype) != 0) {
- if (!tried_modload) {
- /*
- * If the plugin has not yet been loaded, then
- * attempt to load it now. If modload() succeeds,
- * the plugin should have registered using
- * mactype_register(), in which case we can go back
- * and attempt to find it again.
- */
- if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
- tried_modload = B_TRUE;
- goto find_registered_mactype;
- }
- }
- } else {
- /*
- * Note that there's no danger that the plugin we've loaded
- * could be unloaded between the modload() step and the
- * reference count bump here, as we're holding
- * i_mactype_lock, which mactype_unregister() also holds.
- */
- atomic_inc_32(&mtype->mt_ref);
- }
-
- mutex_exit(&i_mactype_lock);
- return (mtype);
-}
-
-/*
- * Module initialization functions.
- */
-
-void
-mac_init(void)
-{
i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
NULL, NULL, NULL, 0);
ASSERT(i_mac_impl_cachep != NULL);
- mac_vnic_tx_cache = kmem_cache_create("mac_vnic_tx_cache",
- sizeof (mac_vnic_tx_t), 0, i_mac_vnic_tx_ctor, i_mac_vnic_tx_dtor,
- NULL, NULL, NULL, 0);
- ASSERT(mac_vnic_tx_cache != NULL);
+ mac_ring_cache = kmem_cache_create("mac_ring_cache",
+ sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
+ NULL, NULL, 0);
+ ASSERT(mac_ring_cache != NULL);
i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);
+
+ mac_flow_init();
+ mac_soft_ring_init();
+ mac_bcast_init();
+ mac_client_init();
+
i_mac_impl_count = 0;
i_mactype_hash = mod_hash_create_extended("mactype_hash",
@@ -380,6 +424,12 @@ mac_init(void)
minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1, MAXMIN32);
ASSERT(minor_ids != NULL);
minor_count = 0;
+
+ /* Let's default to 20 seconds */
+ mac_logging_interval = 20;
+ mac_flow_log_enable = B_FALSE;
+ mac_link_log_enable = B_FALSE;
+ mac_logging_timer = 0;
}
int
@@ -389,567 +439,701 @@ mac_fini(void)
return (EBUSY);
id_space_destroy(minor_ids);
+ mac_flow_fini();
mod_hash_destroy_hash(i_mac_impl_hash);
rw_destroy(&i_mac_impl_lock);
- kmem_cache_destroy(i_mac_impl_cachep);
- kmem_cache_destroy(mac_vnic_tx_cache);
+ mac_client_fini();
+ kmem_cache_destroy(mac_ring_cache);
mod_hash_destroy_hash(i_mactype_hash);
+ mac_soft_ring_finish();
return (0);
}
-/*
- * Client functions.
- */
-
-static int
-mac_hold(const char *macname, mac_impl_t **pmip)
+void
+mac_init_ops(struct dev_ops *ops, const char *name)
{
- mac_impl_t *mip;
- int err;
-
- /*
- * Check the device name length to make sure it won't overflow our
- * buffer.
- */
- if (strlen(macname) >= MAXNAMELEN)
- return (EINVAL);
-
- /*
- * Look up its entry in the global hash table.
- */
- rw_enter(&i_mac_impl_lock, RW_WRITER);
- err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
- (mod_hash_val_t *)&mip);
+ dld_init_ops(ops, name);
+}
- if (err != 0) {
- rw_exit(&i_mac_impl_lock);
- return (ENOENT);
- }
+void
+mac_fini_ops(struct dev_ops *ops)
+{
+ dld_fini_ops(ops);
+}
- if (mip->mi_disabled) {
- rw_exit(&i_mac_impl_lock);
- return (ENOENT);
- }
+/*ARGSUSED*/
+static int
+i_mac_constructor(void *buf, void *arg, int kmflag)
+{
+ mac_impl_t *mip = buf;
- if (mip->mi_exclusive) {
- rw_exit(&i_mac_impl_lock);
- return (EBUSY);
- }
+ bzero(buf, sizeof (mac_impl_t));
- mip->mi_ref++;
- rw_exit(&i_mac_impl_lock);
+ mip->mi_linkstate = LINK_STATE_UNKNOWN;
+ mip->mi_nclients = 0;
- *pmip = mip;
+ mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL);
+ rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
+ mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
+ cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
+ mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
+ cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
return (0);
}
+/*ARGSUSED*/
static void
-mac_rele(mac_impl_t *mip)
+i_mac_destructor(void *buf, void *arg)
{
- rw_enter(&i_mac_impl_lock, RW_WRITER);
- ASSERT(mip->mi_ref != 0);
- if (--mip->mi_ref == 0)
- ASSERT(!mip->mi_activelink);
- rw_exit(&i_mac_impl_lock);
-}
+ mac_impl_t *mip = buf;
+ mac_cb_info_t *mcbi;
-int
-mac_hold_exclusive(mac_handle_t mh)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ ASSERT(mip->mi_ref == 0);
+ ASSERT(mip->mi_active == 0);
+ ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
+ ASSERT(mip->mi_devpromisc == 0);
+ ASSERT(mip->mi_promisc == 0);
+ ASSERT(mip->mi_ksp == NULL);
+ ASSERT(mip->mi_kstat_count == 0);
+ ASSERT(mip->mi_nclients == 0);
+ ASSERT(mip->mi_nactiveclients == 0);
+ ASSERT(mip->mi_state_flags == 0);
+ ASSERT(mip->mi_factory_addr == NULL);
+ ASSERT(mip->mi_factory_addr_num == 0);
+ ASSERT(mip->mi_default_tx_ring == NULL);
+
+ mcbi = &mip->mi_notify_cb_info;
+ ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
+ ASSERT(mip->mi_notify_bits == 0);
+ ASSERT(mip->mi_notify_thread == NULL);
+ ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
+ mcbi->mcbi_lockp = NULL;
- /*
- * Look up its entry in the global hash table.
- */
- rw_enter(&i_mac_impl_lock, RW_WRITER);
- if (mip->mi_disabled) {
- rw_exit(&i_mac_impl_lock);
- return (ENOENT);
- }
+ mcbi = &mip->mi_promisc_cb_info;
+ ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
+ ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
+ mcbi->mcbi_lockp = NULL;
- if (mip->mi_ref != 0) {
- rw_exit(&i_mac_impl_lock);
- return (EBUSY);
- }
+ ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
+ ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
- ASSERT(!mip->mi_exclusive);
+ mutex_destroy(&mip->mi_lock);
+ rw_destroy(&mip->mi_rw_lock);
- mip->mi_ref++;
- mip->mi_exclusive = B_TRUE;
- rw_exit(&i_mac_impl_lock);
+ mutex_destroy(&mip->mi_promisc_lock);
+ cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
+ mutex_destroy(&mip->mi_notify_lock);
+ cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
+ mutex_destroy(&mip->mi_ring_lock);
+}
+
+/* ARGSUSED */
+static int
+i_mac_ring_ctor(void *buf, void *arg, int kmflag)
+{
+ mac_ring_t *ring = (mac_ring_t *)buf;
+
+ bzero(ring, sizeof (mac_ring_t));
+ cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
+ ring->mr_state = MR_FREE;
return (0);
}
+/* ARGSUSED */
+static void
+i_mac_ring_dtor(void *buf, void *arg)
+{
+ mac_ring_t *ring = (mac_ring_t *)buf;
+
+ cv_destroy(&ring->mr_cv);
+ mutex_destroy(&ring->mr_lock);
+}
+
+/*
+ * Common functions to do mac callback addition and deletion. Currently this is
+ * used by promisc callbacks and notify callbacks. List addition and deletion
+ * need to take care of list walkers. List walkers in general, can't hold list
+ * locks and make upcall callbacks due to potential lock order and recursive
+ * reentry issues. Instead list walkers increment the list walker count to mark
+ * the presence of a walker thread. Addition can be carefully done to ensure
+ * that the list walker always sees either the old list or the new list.
+ * However the deletion can't be done while the walker is active, instead the
+ * deleting thread simply marks the entry as logically deleted. The last walker
+ * physically deletes and frees up the logically deleted entries when the walk
+ * is complete.
+ */
void
-mac_rele_exclusive(mac_handle_t mh)
+mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
+ mac_cb_t *mcb_elem)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_cb_t *p;
+ mac_cb_t **pp;
+
+ /* Verify it is not already in the list */
+ for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
+ if (p == mcb_elem)
+ break;
+ }
+ VERIFY(p == NULL);
/*
- * Look up its entry in the global hash table.
+ * Add it to the head of the callback list. The membar ensures that
+ * the following list pointer manipulations reach global visibility
+ * in exactly the program order below.
*/
- rw_enter(&i_mac_impl_lock, RW_WRITER);
- ASSERT(mip->mi_ref == 1 && mip->mi_exclusive);
- mip->mi_ref--;
- mip->mi_exclusive = B_FALSE;
- rw_exit(&i_mac_impl_lock);
+ ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+
+ mcb_elem->mcb_nextp = *mcb_head;
+ membar_producer();
+ *mcb_head = mcb_elem;
}
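For context, a sketch of the walker side of this protocol, which the addition and removal code must coexist with (hypothetical generic walker; the promisc walker below adds a second-list step):

	mutex_enter(mcbi->mcbi_lockp);
	mcbi->mcbi_walker_cnt++;	/* announce the walker */
	mutex_exit(mcbi->mcbi_lockp);

	for (mcb = *mcb_head; mcb != NULL; mcb = mcb->mcb_nextp) {
		if (mcb->mcb_flags & MCB_CONDEMNED)
			continue;	/* logically deleted entry */
		/* make the upcall here with no list lock held */
	}

	mutex_enter(mcbi->mcbi_lockp);
	if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) {
		/* last walker: unlink and free condemned entries */
		mac_callback_free(mac_callback_walker_cleanup(mcbi,
		    mcb_head));
		cv_broadcast(&mcbi->mcbi_cv);
	}
	mutex_exit(mcbi->mcbi_lockp);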
-int
-mac_open(const char *macname, mac_handle_t *mhp)
+/*
+ * Mark the entry as logically deleted. If there aren't any walkers, unlink
+ * it from the list. In either case return the corresponding status.
+ */
+boolean_t
+mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
+ mac_cb_t *mcb_elem)
{
- mac_impl_t *mip;
- int err;
+ mac_cb_t *p;
+ mac_cb_t **pp;
+ ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
/*
- * Look up its entry in the global hash table.
+ * Search the callback list for the entry to be removed
*/
- if ((err = mac_hold(macname, &mip)) != 0)
- return (err);
+ for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
+ if (p == mcb_elem)
+ break;
+ }
+ VERIFY(p != NULL);
/*
- * Hold the dip associated to the MAC to prevent it from being
- * detached. For a softmac, its underlying dip is held by the
- * mi_open() callback.
- *
- * This is done to be more tolerant with some defective drivers,
- * which incorrectly handle mac_unregister() failure in their
- * xxx_detach() routine. For example, some drivers ignore the
- * failure of mac_unregister() and free all resources that
- * that are needed for data transmition.
+ * If there are walkers, just mark it as deleted; the last walker
+ * will remove it from the list and free it.
*/
- e_ddi_hold_devi(mip->mi_dip);
+ if (mcbi->mcbi_walker_cnt != 0) {
+ p->mcb_flags |= MCB_CONDEMNED;
+ mcbi->mcbi_del_cnt++;
+ return (B_FALSE);
+ }
- rw_enter(&mip->mi_gen_lock, RW_WRITER);
+ ASSERT(mcbi->mcbi_del_cnt == 0);
+ *pp = p->mcb_nextp;
+ p->mcb_nextp = NULL;
+ return (B_TRUE);
+}
- if ((mip->mi_oref != 0) ||
- !(mip->mi_callbacks->mc_callbacks & MC_OPEN)) {
- goto done;
+/*
+ * Wait for all pending callback removals to be completed
+ */
+void
+mac_callback_remove_wait(mac_cb_info_t *mcbi)
+{
+ ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+ while (mcbi->mcbi_del_cnt != 0) {
+ DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
+ cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
}
+}
- /*
- * Note that we do not hold i_mac_impl_lock when calling the
- * mc_open() callback function to avoid deadlock with the
- * i_mac_notify() function.
- */
- if ((err = mip->mi_open(mip->mi_driver)) != 0) {
- rw_exit(&mip->mi_gen_lock);
- ddi_release_devi(mip->mi_dip);
- mac_rele(mip);
- return (err);
+/*
+ * The last mac callback walker does the cleanup. Walk the list and unlink
+ * all the logically deleted entries and construct a temporary list of
+ * removed entries. Return the list of removed entries to the caller.
+ */
+mac_cb_t *
+mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
+{
+ mac_cb_t *p;
+ mac_cb_t **pp;
+ mac_cb_t *rmlist = NULL; /* List of removed elements */
+ int cnt = 0;
+
+ ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+ ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
+
+ pp = mcb_head;
+ while (*pp != NULL) {
+ if ((*pp)->mcb_flags & MCB_CONDEMNED) {
+ p = *pp;
+ *pp = p->mcb_nextp;
+ p->mcb_nextp = rmlist;
+ rmlist = p;
+ cnt++;
+ continue;
+ }
+ pp = &(*pp)->mcb_nextp;
}
-done:
- mip->mi_oref++;
- rw_exit(&mip->mi_gen_lock);
- *mhp = (mac_handle_t)mip;
- return (0);
+ ASSERT(mcbi->mcbi_del_cnt == cnt);
+ mcbi->mcbi_del_cnt = 0;
+ return (rmlist);
}
-int
-mac_open_by_linkid(datalink_id_t linkid, mac_handle_t *mhp)
+boolean_t
+mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
- dls_dl_handle_t dlh;
- int err;
-
- if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0)
- return (err);
+ mac_cb_t *mcb;
- if (dls_devnet_vid(dlh) != VLAN_ID_NONE) {
- err = EINVAL;
- goto done;
+ /* Search the list for the entry */
+ for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
+ if (mcb == mcb_elem)
+ return (B_TRUE);
}
- dls_devnet_prop_task_wait(dlh);
-
- err = mac_open(dls_devnet_mac(dlh), mhp);
-
-done:
- dls_devnet_rele_tmp(dlh);
- return (err);
+ return (B_FALSE);
}
-int
-mac_open_by_linkname(const char *link, mac_handle_t *mhp)
+boolean_t
+mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
- datalink_id_t linkid;
- int err;
+ boolean_t found;
- if ((err = dls_mgmt_get_linkid(link, &linkid)) != 0)
- return (err);
- return (mac_open_by_linkid(linkid, mhp));
+ mutex_enter(mcbi->mcbi_lockp);
+ found = mac_callback_lookup(mcb_headp, mcb_elem);
+ mutex_exit(mcbi->mcbi_lockp);
+
+ return (found);
}
+/* Free the list of removed callbacks */
void
-mac_close(mac_handle_t mh)
+mac_callback_free(mac_cb_t *rmlist)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_cb_t *mcb;
+ mac_cb_t *mcb_next;
- rw_enter(&mip->mi_gen_lock, RW_WRITER);
-
- ASSERT(mip->mi_oref != 0);
- if (--mip->mi_oref == 0) {
- if ((mip->mi_callbacks->mc_callbacks & MC_CLOSE))
- mip->mi_close(mip->mi_driver);
+ for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
+ mcb_next = mcb->mcb_nextp;
+ kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
}
- rw_exit(&mip->mi_gen_lock);
-
- ddi_release_devi(mip->mi_dip);
- mac_rele(mip);
}
-const mac_info_t *
-mac_info(mac_handle_t mh)
+/*
+ * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
+ * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
+ * is only a single shared total walker count, and an entry can't be physically
+ * unlinked if a walker is active on either list. The last walker does this
+ * cleanup of logically deleted entries.
+ */
+void
+i_mac_promisc_walker_cleanup(mac_impl_t *mip)
{
- return (&((mac_impl_t *)mh)->mi_info);
-}
+ mac_cb_t *rmlist;
+ mac_cb_t *mcb;
+ mac_cb_t *mcb_next;
+ mac_promisc_impl_t *mpip;
-dev_info_t *
-mac_devinfo_get(mac_handle_t mh)
-{
- return (((mac_impl_t *)mh)->mi_dip);
+ /*
+ * Construct a temporary list of deleted callbacks by walking
+ * the mi_promisc_list. Then for each entry in the temporary list,
+ * remove it from the mci_promisc_list and free the entry.
+ */
+ rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
+ &mip->mi_promisc_list);
+
+ for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
+ mcb_next = mcb->mcb_nextp;
+ mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
+ VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
+ &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
+ mcb->mcb_flags = 0;
+ mcb->mcb_nextp = NULL;
+ kmem_cache_free(mac_promisc_impl_cache, mpip);
+ }
}
-const char *
-mac_name(mac_handle_t mh)
+void
+i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
{
- return (((mac_impl_t *)mh)->mi_name);
-}
+ mac_cb_info_t *mcbi;
-minor_t
-mac_minor(mac_handle_t mh)
-{
- return (((mac_impl_t *)mh)->mi_minor);
+ /*
+ * Signal the notify thread even after mi_ref has become zero and
+ * mi_disabled is set. The synchronization with the notify thread
+ * happens in mac_unregister and that implies the driver must make
+ * sure it is single-threaded (with respect to mac calls) and that
+ * all pending mac calls have returned before it calls mac_unregister.
+ */
+ rw_enter(&i_mac_impl_lock, RW_READER);
+ if (mip->mi_state_flags & MIS_DISABLED)
+ goto exit;
+
+ /*
+ * Guard against incorrect notifications. (Running a newer
+ * mac client against an older implementation?)
+ */
+ if (type >= MAC_NNOTE)
+ goto exit;
+
+ mcbi = &mip->mi_notify_cb_info;
+ mutex_enter(mcbi->mcbi_lockp);
+ mip->mi_notify_bits |= (1 << type);
+ cv_broadcast(&mcbi->mcbi_cv);
+ mutex_exit(mcbi->mcbi_lockp);
+
+exit:
+ rw_exit(&i_mac_impl_lock);
}
-uint64_t
-mac_stat_get(mac_handle_t mh, uint_t stat)
+/*
+ * Mac serialization primitives. Please see the block comment at the
+ * top of the file.
+ */
+void
+i_mac_perim_enter(mac_impl_t *mip)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- uint64_t val;
- int ret;
-
- /*
- * The range of stat determines where it is maintained. Stat
- * values from 0 up to (but not including) MAC_STAT_MIN are
- * mainteined by the mac module itself. Everything else is
- * maintained by the driver.
- */
- if (stat < MAC_STAT_MIN) {
- /* These stats are maintained by the mac module itself. */
- switch (stat) {
- case MAC_STAT_LINK_STATE:
- return (mip->mi_linkstate);
- case MAC_STAT_LINK_UP:
- return (mip->mi_linkstate == LINK_STATE_UP);
- case MAC_STAT_PROMISC:
- return (mip->mi_devpromisc != 0);
- default:
- ASSERT(B_FALSE);
- }
- }
+ mac_client_impl_t *mcip;
- /*
- * Call the driver to get the given statistic.
- */
- ret = mip->mi_getstat(mip->mi_driver, stat, &val);
- if (ret != 0) {
+ if (mip->mi_state_flags & MIS_IS_VNIC) {
/*
- * The driver doesn't support this statistic. Get the
- * statistic's default value.
+ * This is a VNIC. Use the lower mac since that is what
+ * we want to serialize on.
*/
- val = mac_stat_default(mip, stat);
+ mcip = mac_vnic_lower(mip);
+ mip = mcip->mci_mip;
+ }
+
+ mutex_enter(&mip->mi_perim_lock);
+ if (mip->mi_perim_owner == curthread) {
+ mip->mi_perim_ocnt++;
+ mutex_exit(&mip->mi_perim_lock);
+ return;
}
- return (val);
+
+ while (mip->mi_perim_owner != NULL)
+ cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);
+
+ mip->mi_perim_owner = curthread;
+ ASSERT(mip->mi_perim_ocnt == 0);
+ mip->mi_perim_ocnt++;
+#ifdef DEBUG
+ mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
+ MAC_PERIM_STACK_DEPTH);
+#endif
+ mutex_exit(&mip->mi_perim_lock);
}
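The owner count is what makes the perimeter recursive; traced as a sketch:

	/*
	 *	i_mac_perim_enter(mip);		owner = curthread, ocnt 0 -> 1
	 *	i_mac_perim_enter(mip);		same owner, ocnt 1 -> 2
	 *	ASSERT(mac_perim_held(mh));	holds at any depth
	 *	i_mac_perim_exit(mip);		ocnt 2 -> 1
	 *	i_mac_perim_exit(mip);		ocnt 1 -> 0, owner cleared,
	 *					cv_signal() wakes one waiter
	 */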
int
-mac_start(mac_handle_t mh)
+i_mac_perim_enter_nowait(mac_impl_t *mip)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- int err;
+ /*
+ * The vnic is a special case, since the serialization is done based
+ * on the lower mac. If the lower mac is busy, it does not imply the
+ * vnic can't be unregistered. But in the case of other drivers,
+ * a busy perimeter or open mac handles implies that the mac is busy
+ * and can't be unregistered.
+ */
+ if (mip->mi_state_flags & MIS_IS_VNIC) {
+ i_mac_perim_enter(mip);
+ return (0);
+ }
- ASSERT(mip->mi_start != NULL);
+ mutex_enter(&mip->mi_perim_lock);
+ if (mip->mi_perim_owner != NULL) {
+ mutex_exit(&mip->mi_perim_lock);
+ return (EBUSY);
+ }
+ ASSERT(mip->mi_perim_ocnt == 0);
+ mip->mi_perim_owner = curthread;
+ mip->mi_perim_ocnt++;
+ mutex_exit(&mip->mi_perim_lock);
- rw_enter(&(mip->mi_state_lock), RW_WRITER);
+ return (0);
+}
- /*
- * Check whether the device is already started.
- */
- if (mip->mi_active++ != 0) {
+void
+i_mac_perim_exit(mac_impl_t *mip)
+{
+ mac_client_impl_t *mcip;
+
+ if (mip->mi_state_flags & MIS_IS_VNIC) {
/*
- * It's already started so there's nothing more to do.
+ * This is a VNIC. Return the lower mac since that is what
+ * we want to serialize on.
*/
- err = 0;
- goto done;
+ mcip = mac_vnic_lower(mip);
+ mip = mcip->mci_mip;
}
- /*
- * Start the device.
- */
- if ((err = mip->mi_start(mip->mi_driver)) != 0)
- --mip->mi_active;
+ ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);
-done:
- rw_exit(&(mip->mi_state_lock));
- return (err);
+ mutex_enter(&mip->mi_perim_lock);
+ if (--mip->mi_perim_ocnt == 0) {
+ mip->mi_perim_owner = NULL;
+ cv_signal(&mip->mi_perim_cv);
+ }
+ mutex_exit(&mip->mi_perim_lock);
}
-void
-mac_stop(mac_handle_t mh)
+/*
+ * Returns whether the current thread holds the mac perimeter. Used in making
+ * assertions.
+ */
+boolean_t
+mac_perim_held(mac_handle_t mh)
{
mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_client_impl_t *mcip;
- ASSERT(mip->mi_stop != NULL);
-
- rw_enter(&(mip->mi_state_lock), RW_WRITER);
-
- /*
- * Check whether the device is still needed.
- */
- ASSERT(mip->mi_active != 0);
- if (--mip->mi_active != 0) {
+ if (mip->mi_state_flags & MIS_IS_VNIC) {
/*
- * It's still needed so there's nothing more to do.
+ * This is a VNIC. Use the lower mac since that is what
+ * we want to serialize on.
*/
- goto done;
+ mcip = mac_vnic_lower(mip);
+ mip = mcip->mci_mip;
}
+ return (mip->mi_perim_owner == curthread);
+}
+/*
+ * mac client interfaces to enter the mac perimeter of a mac end point, given
+ * its mac handle, or macname or linkid.
+ */
+void
+mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ i_mac_perim_enter(mip);
/*
- * Stop the device.
+ * The mac_perim_handle_t returned encodes the 'mip' and whether a
+ * mac_open has been done internally while entering the perimeter.
+ * This information is used in mac_perim_exit.
*/
- mip->mi_stop(mip->mi_driver);
-
-done:
- rw_exit(&(mip->mi_state_lock));
+ MAC_ENCODE_MPH(*mphp, mip, 0);
}
int
-mac_multicst_add(mac_handle_t mh, const uint8_t *addr)
+mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_multicst_addr_t **pp;
- mac_multicst_addr_t *p;
- int err;
-
- ASSERT(mip->mi_multicst != NULL);
+ int err;
+ mac_handle_t mh;
- /*
- * Verify the address.
- */
- if ((err = mip->mi_type->mt_ops.mtops_multicst_verify(addr,
- mip->mi_pdata)) != 0) {
+ if ((err = mac_open(name, &mh)) != 0)
return (err);
- }
- /*
- * Check whether the given address is already enabled.
- */
- rw_enter(&(mip->mi_data_lock), RW_WRITER);
- for (pp = &(mip->mi_mmap); (p = *pp) != NULL; pp = &(p->mma_nextp)) {
- if (bcmp(p->mma_addr, addr, mip->mi_type->mt_addr_length) ==
- 0) {
- /*
- * The address is already enabled so just bump the
- * reference count.
- */
- p->mma_ref++;
- err = 0;
- goto done;
- }
- }
+ mac_perim_enter_by_mh(mh, mphp);
+ MAC_ENCODE_MPH(*mphp, mh, 1);
+ return (0);
+}
- /*
- * Allocate a new list entry.
- */
- if ((p = kmem_zalloc(sizeof (mac_multicst_addr_t),
- KM_NOSLEEP)) == NULL) {
- err = ENOMEM;
- goto done;
- }
+int
+mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
+{
+ int err;
+ mac_handle_t mh;
- /*
- * Enable a new multicast address.
- */
- if ((err = mip->mi_multicst(mip->mi_driver, B_TRUE, addr)) != 0) {
- kmem_free(p, sizeof (mac_multicst_addr_t));
- goto done;
- }
+ if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
+ return (err);
- /*
- * Add the address to the list of enabled addresses.
- */
- bcopy(addr, p->mma_addr, mip->mi_type->mt_addr_length);
- p->mma_ref++;
- *pp = p;
+ mac_perim_enter_by_mh(mh, mphp);
+ MAC_ENCODE_MPH(*mphp, mh, 1);
+ return (0);
+}
-done:
- rw_exit(&(mip->mi_data_lock));
- return (err);
+void
+mac_perim_exit(mac_perim_handle_t mph)
+{
+ mac_impl_t *mip;
+ boolean_t need_close;
+
+ MAC_DECODE_MPH(mph, mip, need_close);
+ i_mac_perim_exit(mip);
+ if (need_close)
+ mac_close((mac_handle_t)mip);
}
int
-mac_multicst_remove(mac_handle_t mh, const uint8_t *addr)
+mac_hold(const char *macname, mac_impl_t **pmip)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_multicst_addr_t **pp;
- mac_multicst_addr_t *p;
- int err;
+ mac_impl_t *mip;
+ int err;
- ASSERT(mip->mi_multicst != NULL);
+ /*
+ * Check the device name length to make sure it won't overflow our
+ * buffer.
+ */
+ if (strlen(macname) >= MAXNAMELEN)
+ return (EINVAL);
/*
- * Find the entry in the list for the given address.
+ * Look up its entry in the global hash table.
*/
- rw_enter(&(mip->mi_data_lock), RW_WRITER);
- for (pp = &(mip->mi_mmap); (p = *pp) != NULL; pp = &(p->mma_nextp)) {
- if (bcmp(p->mma_addr, addr, mip->mi_type->mt_addr_length) ==
- 0) {
- if (--p->mma_ref == 0)
- break;
+ rw_enter(&i_mac_impl_lock, RW_WRITER);
+ err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
+ (mod_hash_val_t *)&mip);
- /*
- * There is still a reference to this address so
- * there's nothing more to do.
- */
- err = 0;
- goto done;
- }
+ if (err != 0) {
+ rw_exit(&i_mac_impl_lock);
+ return (ENOENT);
}
- /*
- * We did not find an entry for the given address so it is not
- * currently enabled.
- */
- if (p == NULL) {
- err = ENOENT;
- goto done;
+ if (mip->mi_state_flags & MIS_DISABLED) {
+ rw_exit(&i_mac_impl_lock);
+ return (ENOENT);
}
- ASSERT(p->mma_ref == 0);
- /*
- * Disable the multicast address.
- */
- if ((err = mip->mi_multicst(mip->mi_driver, B_FALSE, addr)) != 0) {
- p->mma_ref++;
- goto done;
+ if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
+ rw_exit(&i_mac_impl_lock);
+ return (EBUSY);
}
- /*
- * Remove it from the list.
- */
- *pp = p->mma_nextp;
- kmem_free(p, sizeof (mac_multicst_addr_t));
+ mip->mi_ref++;
+ rw_exit(&i_mac_impl_lock);
-done:
- rw_exit(&(mip->mi_data_lock));
- return (err);
+ *pmip = mip;
+ return (0);
}
-/*
- * mac_unicst_verify: Verifies the passed address. It fails
- * if the passed address is a group address or has incorrect length.
- */
-boolean_t
-mac_unicst_verify(mac_handle_t mh, const uint8_t *addr, uint_t len)
+void
+mac_rele(mac_impl_t *mip)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
-
- /*
- * Verify the address.
- */
- if ((len != mip->mi_type->mt_addr_length) ||
- (mip->mi_type->mt_ops.mtops_unicst_verify(addr,
- mip->mi_pdata)) != 0) {
- return (B_FALSE);
- } else {
- return (B_TRUE);
+ rw_enter(&i_mac_impl_lock, RW_WRITER);
+ ASSERT(mip->mi_ref != 0);
+ if (--mip->mi_ref == 0) {
+ ASSERT(mip->mi_nactiveclients == 0 &&
+ !(mip->mi_state_flags & MIS_EXCLUSIVE));
}
+ rw_exit(&i_mac_impl_lock);
}
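A hypothetical caller of the hold/release pair above (example_with_mac is
illustrative, not part of the patch); the reference taken by mac_hold() keeps
the mac_impl_t from being disabled or exclusively held underneath the caller:

static int
example_with_mac(const char *macname)
{
	mac_impl_t	*mip;
	int		err;

	/* Fails with EINVAL, ENOENT, or EBUSY as described above. */
	if ((err = mac_hold(macname, &mip)) != 0)
		return (err);

	/* ... operate on mip while the reference pins it ... */

	mac_rele(mip);
	return (0);
}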
+/*
+ * This function is called only by mac_client_open.
+ */
int
-mac_unicst_set(mac_handle_t mh, const uint8_t *addr)
+mac_start(mac_impl_t *mip)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- int err;
- boolean_t notify = B_FALSE;
-
- ASSERT(mip->mi_unicst != NULL);
+ int err = 0;
- /*
- * Verify the address.
- */
- if ((err = mip->mi_type->mt_ops.mtops_unicst_verify(addr,
- mip->mi_pdata)) != 0) {
- return (err);
- }
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ ASSERT(mip->mi_start != NULL);
/*
- * Program the new unicast address.
+ * Check whether the device is already started.
*/
- rw_enter(&(mip->mi_data_lock), RW_WRITER);
+ if (mip->mi_active++ == 0) {
+ mac_ring_t *ring = NULL;
- /*
- * If address doesn't change, do nothing.
- * This check is necessary otherwise it may call into mac_unicst_set
- * recursively.
- */
- if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0)
- goto done;
+ /*
+ * Start the device.
+ */
+ err = mip->mi_start(mip->mi_driver);
+ if (err != 0) {
+ mip->mi_active--;
+ return (err);
+ }
- if ((err = mip->mi_unicst(mip->mi_driver, addr)) != 0)
- goto done;
+ /*
+ * Start the default tx ring.
+ */
+ if (mip->mi_default_tx_ring != NULL) {
- /*
- * Save the address and flag that we need to send a notification.
- */
- bcopy(addr, mip->mi_addr, mip->mi_type->mt_addr_length);
- notify = B_TRUE;
+ ring = (mac_ring_t *)mip->mi_default_tx_ring;
+ err = mac_start_ring(ring);
+ if (err != 0) {
+ mip->mi_active--;
+ return (err);
+ }
+ ring->mr_state = MR_INUSE;
+ }
-done:
- rw_exit(&(mip->mi_data_lock));
+ if (mip->mi_rx_groups != NULL) {
+ /*
+ * Start the default ring, since it will be needed
+ * to receive broadcast and multicast traffic for
+ * both primary and non-primary MAC clients.
+ */
+ mac_group_t *grp = &mip->mi_rx_groups[0];
- if (notify)
- i_mac_notify(mip, MAC_NOTE_UNICST);
+ ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
+ err = mac_start_group_and_rings(grp);
+ if (err != 0) {
+ mip->mi_active--;
+ if (ring != NULL) {
+ mac_stop_ring(ring);
+ ring->mr_state = MR_FREE;
+ }
+ return (err);
+ }
+ mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED);
+ }
+ }
return (err);
}
+/*
+ * This function is called only by mac_client_close.
+ */
void
-mac_unicst_get(mac_handle_t mh, uint8_t *addr)
+mac_stop(mac_impl_t *mip)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ ASSERT(mip->mi_stop != NULL);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
/*
- * Copy out the current unicast source address.
+ * Check whether the device is still needed.
*/
- rw_enter(&(mip->mi_data_lock), RW_READER);
- bcopy(mip->mi_addr, addr, mip->mi_type->mt_addr_length);
- rw_exit(&(mip->mi_data_lock));
-}
+ ASSERT(mip->mi_active != 0);
+ if (--mip->mi_active == 0) {
+ if (mip->mi_rx_groups != NULL) {
+ /*
+ * There should be no more active clients since the
+ * MAC is being stopped. Stop the default RX group
+ * and transition it back to registered state.
+ */
+ mac_group_t *grp = &mip->mi_rx_groups[0];
-void
-mac_dest_get(mac_handle_t mh, uint8_t *addr)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
+			/*
+			 * When clients are torn down, the groups are
+			 * released via mac_release_rx_group(), which knows
+			 * that the default group is always in started mode
+			 * since broadcast uses it. So we can assert that
+			 * there are no clients (since mac_bcast_add doesn't
+			 * register itself as a client) and that the group
+			 * is in SHARED state.
+			 */
+ ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
+ ASSERT(MAC_RX_GROUP_NO_CLIENT(grp) &&
+ mip->mi_nactiveclients == 0);
+ mac_stop_group_and_rings(grp);
+ mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
+ }
- /*
- * Copy out the current destination address.
- */
- rw_enter(&(mip->mi_data_lock), RW_READER);
- bcopy(mip->mi_dstaddr, addr, mip->mi_type->mt_addr_length);
- rw_exit(&(mip->mi_data_lock));
+ if (mip->mi_default_tx_ring != NULL) {
+ mac_ring_t *ring;
+
+ ring = (mac_ring_t *)mip->mi_default_tx_ring;
+ mac_stop_ring(ring);
+ ring->mr_state = MR_FREE;
+ }
+
+ /*
+ * Stop the device.
+ */
+ mip->mi_stop(mip->mi_driver);
+ }
}
int
-mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
+i_mac_promisc_set(mac_impl_t *mip, boolean_t on, mac_promisc_type_t ptype)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
int err = 0;
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
ASSERT(mip->mi_setpromisc != NULL);
ASSERT(ptype == MAC_DEVPROMISC || ptype == MAC_PROMISC);
@@ -958,7 +1142,6 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
* For details on the distinction between "device promiscuous mode"
* and "MAC promiscuous mode", see PSARC/2005/289.
*/
- rw_enter(&(mip->mi_data_lock), RW_WRITER);
if (on) {
/*
* Enable promiscuous mode on the device if not yet enabled.
@@ -967,7 +1150,7 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
if (err != 0) {
mip->mi_devpromisc--;
- goto done;
+ return (err);
}
i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
}
@@ -978,10 +1161,9 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
if (ptype == MAC_PROMISC && mip->mi_promisc++ == 0)
i_mac_notify(mip, MAC_NOTE_PROMISC);
} else {
- if (mip->mi_devpromisc == 0) {
- err = EPROTO;
- goto done;
- }
+ if (mip->mi_devpromisc == 0)
+ return (EPROTO);
+
/*
* Disable promiscuous mode on the device if this is the last
* enabling.
@@ -990,7 +1172,7 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
if (err != 0) {
mip->mi_devpromisc++;
- goto done;
+ return (err);
}
i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
}
@@ -1003,11 +1185,27 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
i_mac_notify(mip, MAC_NOTE_PROMISC);
}
-done:
- rw_exit(&(mip->mi_data_lock));
- return (err);
+ return (0);
}
+int
+mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ int rv;
+
+ i_mac_perim_enter(mip);
+ rv = i_mac_promisc_set(mip, on, ptype);
+ i_mac_perim_exit(mip);
+
+ return (rv);
+}
+
+/*
+ * The promiscuity state can change at any time. If the caller needs to take
+ * actions that are atomic with the promiscuity state, then the caller needs
+ * to bracket the entire sequence with mac_perim_enter/exit.
+ */
boolean_t
mac_promisc_get(mac_handle_t mh, mac_promisc_type_t ptype)
{
@@ -1024,1296 +1222,1162 @@ mac_promisc_get(mac_handle_t mh, mac_promisc_type_t ptype)
return (mip->mi_promisc != 0);
}
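A hypothetical sketch of the bracketing the comment above calls for
(example_promisc_atomic is illustrative only): a caller that must act
atomically with the promiscuity state holds the perimeter across both the
check and the dependent action:

static void
example_promisc_atomic(mac_handle_t mh)
{
	mac_perim_handle_t	mph;

	mac_perim_enter_by_mh(mh, &mph);
	if (mac_promisc_get(mh, MAC_PROMISC)) {
		/* ... actions that rely on promisc staying enabled ... */
	}
	mac_perim_exit(mph);
}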
+/*
+ * Invoked at MAC instance attach time to initialize the list
+ * of factory MAC addresses supported by a MAC instance. This function
+ * builds a local cache in the mac_impl_t for the MAC addresses
+ * supported by the underlying hardware. The MAC clients themselves
+ * use the mac_addr_factory*() functions to query and reserve
+ * factory MAC addresses.
+ */
void
-mac_sdu_get(mac_handle_t mh, uint_t *min_sdu, uint_t *max_sdu)
+mac_addr_factory_init(mac_impl_t *mip)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_capab_multifactaddr_t capab;
+ uint8_t *addr;
+ int i;
- if (min_sdu != NULL)
- *min_sdu = mip->mi_sdu_min;
- if (max_sdu != NULL)
- *max_sdu = mip->mi_sdu_max;
-}
-
-void
-mac_resources(mac_handle_t mh)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ /*
+ * First round to see how many factory MAC addresses are available.
+ */
+ bzero(&capab, sizeof (capab));
+ if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
+ &capab) || (capab.mcm_naddr == 0)) {
+ /*
+ * The MAC instance doesn't support multiple factory
+		 * MAC addresses; we're done here.
+ */
+ return;
+ }
/*
- * If the driver supports resource registration, call the driver to
- * ask it to register its resources.
+ * Allocate the space and get all the factory addresses.
*/
- if (mip->mi_callbacks->mc_callbacks & MC_RESOURCES)
- mip->mi_resources(mip->mi_driver);
+ addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
+ capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);
+
+ mip->mi_factory_addr_num = capab.mcm_naddr;
+ mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
+ sizeof (mac_factory_addr_t), KM_SLEEP);
+
+ for (i = 0; i < capab.mcm_naddr; i++) {
+ bcopy(addr + i * MAXMACADDRLEN,
+ mip->mi_factory_addr[i].mfa_addr,
+ mip->mi_type->mt_addr_length);
+ mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
+ }
+
+ kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
}
void
-mac_ioctl(mac_handle_t mh, queue_t *wq, mblk_t *bp)
+mac_addr_factory_fini(mac_impl_t *mip)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- int cmd = ((struct iocblk *)bp->b_rptr)->ioc_cmd;
-
- if ((cmd == ND_GET && (mip->mi_callbacks->mc_callbacks & MC_GETPROP)) ||
- (cmd == ND_SET && (mip->mi_callbacks->mc_callbacks & MC_SETPROP))) {
- /*
- * If ndd props were registered, call them.
- * Note that ndd ioctls are Obsolete
- */
- mac_ndd_ioctl(mip, wq, bp);
+ if (mip->mi_factory_addr == NULL) {
+ ASSERT(mip->mi_factory_addr_num == 0);
return;
}
- /*
- * Call the driver to handle the ioctl. The driver may not support
- * any ioctls, in which case we reply with a NAK on its behalf.
- */
- if (mip->mi_callbacks->mc_callbacks & MC_IOCTL)
- mip->mi_ioctl(mip->mi_driver, wq, bp);
- else
- miocnak(wq, bp, 0, EINVAL);
+ kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
+ sizeof (mac_factory_addr_t));
+
+ mip->mi_factory_addr = NULL;
+ mip->mi_factory_addr_num = 0;
}
-const mac_txinfo_t *
-mac_do_tx_get(mac_handle_t mh, boolean_t is_vnic)
+/*
+ * Reserve a factory MAC address. If *slot is set to -1, the function
+ * attempts to reserve any of the available factory MAC addresses and
+ * returns the reserved slot id. If no slots are available, the function
+ * returns ENOSPC. If *slot is not set to -1, the function reserves
+ * the specified slot if it is available, or returns EBUSY if the slot
+ * is already used. Returns ENOTSUP if the underlying MAC does not
+ * support multiple factory addresses. If the slot number is not -1 but
+ * is invalid, returns EINVAL.
+ */
+int
+mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_txinfo_t *mtp;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ int i, ret = 0;
+ i_mac_perim_enter(mip);
/*
- * Grab the lock to prevent us from racing with MAC_PROMISC being
- * changed. This is sufficient since MAC clients are careful to always
- * call mac_txloop_add() prior to enabling MAC_PROMISC, and to disable
- * MAC_PROMISC prior to calling mac_txloop_remove().
+ * Protect against concurrent readers that may need a self-consistent
+ * view of the factory addresses
*/
- rw_enter(&mip->mi_tx_lock, RW_READER);
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
- if (mac_promisc_get(mh, MAC_PROMISC)) {
- ASSERT(mip->mi_mtfp != NULL);
- if (mip->mi_vnic_present && !is_vnic) {
- mtp = &mip->mi_vnic_txloopinfo;
- } else {
- mtp = &mip->mi_txloopinfo;
+ if (mip->mi_factory_addr_num == 0) {
+ ret = ENOTSUP;
+ goto bail;
+ }
+
+ if (*slot != -1) {
+ /* check the specified slot */
+ if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
+ ret = EINVAL;
+ goto bail;
+ }
+ if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
+ ret = EBUSY;
+ goto bail;
}
} else {
- if (mip->mi_vnic_present && !is_vnic) {
- mtp = &mip->mi_vnic_txinfo;
- } else {
- /*
- * Note that we cannot ASSERT() that mip->mi_mtfp is
- * NULL, because to satisfy the above ASSERT(), we
- * have to disable MAC_PROMISC prior to calling
- * mac_txloop_remove().
- */
- mtp = &mip->mi_txinfo;
+ /* pick the next available slot */
+ for (i = 0; i < mip->mi_factory_addr_num; i++) {
+ if (!mip->mi_factory_addr[i].mfa_in_use)
+ break;
+ }
+
+ if (i == mip->mi_factory_addr_num) {
+ ret = ENOSPC;
+ goto bail;
}
+ *slot = i+1;
}
- rw_exit(&mip->mi_tx_lock);
- return (mtp);
-}
+ mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
+ mip->mi_factory_addr[*slot-1].mfa_client = mcip;
-/*
- * Invoked by VNIC to obtain the transmit entry point.
- */
-const mac_txinfo_t *
-mac_vnic_tx_get(mac_handle_t mh)
-{
- return (mac_do_tx_get(mh, B_TRUE));
+bail:
+ rw_exit(&mip->mi_rw_lock);
+ i_mac_perim_exit(mip);
+ return (ret);
}
/*
- * Invoked by any non-VNIC client to obtain the transmit entry point.
- * If a VNIC is present, the VNIC transmit function provided by the VNIC
- * will be returned to the MAC client.
+ * Release the specified factory MAC address slot.
*/
-const mac_txinfo_t *
-mac_tx_get(mac_handle_t mh)
-{
- return (mac_do_tx_get(mh, B_FALSE));
-}
-
-link_state_t
-mac_link_get(mac_handle_t mh)
-{
- return (((mac_impl_t *)mh)->mi_linkstate);
-}
-
-mac_notify_handle_t
-mac_notify_add(mac_handle_t mh, mac_notify_t notify, void *arg)
+void
+mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_notify_fn_t *mnfp;
-
- mnfp = kmem_zalloc(sizeof (mac_notify_fn_t), KM_SLEEP);
- mnfp->mnf_fn = notify;
- mnfp->mnf_arg = arg;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ i_mac_perim_enter(mip);
/*
- * Add it to the head of the 'notify' callback list.
+ * Protect against concurrent readers that may need a self-consistent
+ * view of the factory addresses
*/
- rw_enter(&mip->mi_notify_lock, RW_WRITER);
- mnfp->mnf_nextp = mip->mi_mnfp;
- mip->mi_mnfp = mnfp;
- rw_exit(&mip->mi_notify_lock);
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
- return ((mac_notify_handle_t)mnfp);
+ ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
+ ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
+
+ mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
+
+ rw_exit(&mip->mi_rw_lock);
+ i_mac_perim_exit(mip);
}
+/*
+ * Stores in mac_addr the value of the specified MAC address slot, and in
+ * addr_len its length; the slot number must be valid for the MAC. If the
+ * slot is in use and client_name is non-NULL, the client's name is copied
+ * out; the caller must provide a buffer of at least MAXNAMELEN bytes.
+ */
void
-mac_notify_remove(mac_handle_t mh, mac_notify_handle_t mnh)
+mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
+ uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_notify_fn_t *mnfp = (mac_notify_fn_t *)mnh;
- mac_notify_fn_t **pp;
- mac_notify_fn_t *p;
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ boolean_t in_use;
+
+ ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
/*
- * Search the 'notify' callback list for the function closure.
+ * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
+ * and mi_rw_lock
*/
- rw_enter(&mip->mi_notify_lock, RW_WRITER);
- for (pp = &(mip->mi_mnfp); (p = *pp) != NULL;
- pp = &(p->mnf_nextp)) {
- if (p == mnfp)
- break;
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+ bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
+ *addr_len = mip->mi_type->mt_addr_length;
+ in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
+ if (in_use && client_name != NULL) {
+ bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
+ client_name, MAXNAMELEN);
}
- ASSERT(p != NULL);
+ if (in_use_arg != NULL)
+ *in_use_arg = in_use;
+ rw_exit(&mip->mi_rw_lock);
+}
- /*
- * Remove it from the list.
- */
- *pp = p->mnf_nextp;
- rw_exit(&mip->mi_notify_lock);
+/*
+ * Returns the number of factory MAC addresses (in addition to the
+ * primary MAC address), or 0 if the underlying MAC doesn't support
+ * that feature.
+ */
+uint_t
+mac_addr_factory_num(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
- /*
- * Free it.
- */
- kmem_free(mnfp, sizeof (mac_notify_fn_t));
+ return (mip->mi_factory_addr_num);
}
+
void
-mac_notify(mac_handle_t mh)
+mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_notify_type_t type;
+ mac_ring_t *ring;
- for (type = 0; type < MAC_NNOTE; type++)
- i_mac_notify(mip, type);
+ for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
+ ring->mr_flag &= ~flag;
}
/*
- * Register a receive function for this mac.
- * More information on this function's interaction with mac_rx()
- * can be found atop mac_rx().
+ * The following mac_hwrings_xxx() functions are private mac client functions
+ * used by the aggr driver to access and control the underlying HW Rx group
+ * and rings. In this case, the aggr driver has exclusive control of the
+ * underlying HW Rx group/rings; it calls the following functions to
+ * start/stop the HW Rx rings, disable/enable polling, add/remove MAC
+ * addresses, or set up the Rx callback.
*/
-mac_rx_handle_t
-mac_do_rx_add(mac_handle_t mh, mac_rx_t rx, void *arg, boolean_t is_active)
+/* ARGSUSED */
+static void
+mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
+ mblk_t *mp_chain, boolean_t loopback)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_rx_fn_t *mrfp;
+ mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+ mac_direct_rx_t proc;
+ void *arg1;
+ mac_resource_handle_t arg2;
- mrfp = kmem_zalloc(sizeof (mac_rx_fn_t), KM_SLEEP);
- mrfp->mrf_fn = rx;
- mrfp->mrf_arg = arg;
- mrfp->mrf_active = is_active;
+ proc = srs_rx->sr_func;
+ arg1 = srs_rx->sr_arg1;
+ arg2 = mac_srs->srs_mrh;
- /*
- * Add it to the head of the 'rx' callback list.
- */
- rw_enter(&(mip->mi_rx_lock), RW_WRITER);
+ proc(arg1, arg2, mp_chain, NULL);
+}
+
+/*
+ * This function is called to get the list of HW rings that are reserved by
+ * an exclusive mac client.
+ *
+ * Return value: the number of HW rings.
+ */
+int
+mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
+ mac_ring_handle_t *hwrh)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ flow_entry_t *flent = mcip->mci_flent;
+ mac_group_t *grp = flent->fe_rx_ring_group;
+ mac_ring_t *ring;
+ int cnt = 0;
/*
- * mac_rx() will only call callbacks that are marked inuse.
+	 * The mac client did not reserve any RX group; return directly.
+ * This is probably because the underlying MAC does not support
+ * any RX groups.
*/
- mrfp->mrf_inuse = B_TRUE;
- mrfp->mrf_nextp = mip->mi_mrfp;
+ *hwgh = NULL;
+ if (grp == NULL)
+ return (0);
/*
- * mac_rx() could be traversing the remainder of the list
- * and miss the new callback we're adding here. This is not a problem
- * because we do not guarantee the callback to take effect immediately
- * after mac_rx_add() returns.
+ * This RX group must be reserved by this mac client.
*/
- mip->mi_mrfp = mrfp;
- rw_exit(&(mip->mi_rx_lock));
+ ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
+ (mch == (mac_client_handle_t)(MAC_RX_GROUP_ONLY_CLIENT(grp))));
- return ((mac_rx_handle_t)mrfp);
+ for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next) {
+ ASSERT(cnt < MAX_RINGS_PER_GROUP);
+ hwrh[cnt++] = (mac_ring_handle_t)ring;
+ }
+ *hwgh = (mac_group_handle_t)grp;
+ return (cnt);
}
-mac_rx_handle_t
-mac_rx_add(mac_handle_t mh, mac_rx_t rx, void *arg)
+/*
+ * Set up the RX callback of the mac client that exclusively controls the
+ * HW ring.
+ */
+void
+mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh)
{
- return (mac_do_rx_add(mh, rx, arg, B_FALSE));
+ mac_ring_t *hw_ring = (mac_ring_t *)hwrh;
+ mac_soft_ring_set_t *mac_srs = hw_ring->mr_srs;
+
+ mac_srs->srs_mrh = prh;
+ mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
}
-mac_rx_handle_t
-mac_active_rx_add(mac_handle_t mh, mac_rx_t rx, void *arg)
+void
+mac_hwring_teardown(mac_ring_handle_t hwrh)
{
- return (mac_do_rx_add(mh, rx, arg, B_TRUE));
+ mac_ring_t *hw_ring = (mac_ring_t *)hwrh;
+ mac_soft_ring_set_t *mac_srs = hw_ring->mr_srs;
+
+ mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
+ mac_srs->srs_mrh = NULL;
}
-/*
- * Unregister a receive function for this mac.
- * This function does not block if wait is B_FALSE. This is useful
- * for clients who call mac_rx_remove() from a non-blockable context.
- * More information on this function's interaction with mac_rx()
- * can be found atop mac_rx().
- */
-void
-mac_rx_remove(mac_handle_t mh, mac_rx_handle_t mrh, boolean_t wait)
+int
+mac_hwring_disable_intr(mac_ring_handle_t rh)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_rx_fn_t *mrfp = (mac_rx_fn_t *)mrh;
- mac_rx_fn_t **pp;
- mac_rx_fn_t *p;
+ mac_ring_t *rr_ring = (mac_ring_t *)rh;
+ mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
- /*
- * Search the 'rx' callback list for the function closure.
- */
- rw_enter(&mip->mi_rx_lock, RW_WRITER);
- for (pp = &(mip->mi_mrfp); (p = *pp) != NULL; pp = &(p->mrf_nextp)) {
- if (p == mrfp)
- break;
- }
- ASSERT(p != NULL);
+ return (intr->mi_disable(intr->mi_handle));
+}
- /*
- * If mac_rx() is running, mark callback for deletion
- * and return (if wait is false), or wait until mac_rx()
- * exits (if wait is true).
- */
- if (mip->mi_rx_ref > 0) {
- DTRACE_PROBE1(defer_delete, mac_impl_t *, mip);
- p->mrf_inuse = B_FALSE;
- mutex_enter(&mip->mi_lock);
- mip->mi_rx_removed++;
- mutex_exit(&mip->mi_lock);
+int
+mac_hwring_enable_intr(mac_ring_handle_t rh)
+{
+ mac_ring_t *rr_ring = (mac_ring_t *)rh;
+ mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
- rw_exit(&mip->mi_rx_lock);
- if (wait)
- mac_rx_remove_wait(mh);
- return;
- }
+ return (intr->mi_enable(intr->mi_handle));
+}
+
+int
+mac_hwring_start(mac_ring_handle_t rh)
+{
+ mac_ring_t *rr_ring = (mac_ring_t *)rh;
- /* Remove it from the list. */
- *pp = p->mrf_nextp;
- kmem_free(mrfp, sizeof (mac_rx_fn_t));
- rw_exit(&mip->mi_rx_lock);
+ MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
+ return (0);
}
-/*
- * Wait for all pending callback removals to be completed by mac_rx().
- * Note that if we call mac_rx_remove() immediately before this, there is no
- * guarantee we would wait *only* on the callback that we specified.
- * mac_rx_remove() could have been called by other threads and we would have
- * to wait for other marked callbacks to be removed as well.
- */
void
-mac_rx_remove_wait(mac_handle_t mh)
+mac_hwring_stop(mac_ring_handle_t rh)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_ring_t *rr_ring = (mac_ring_t *)rh;
- mutex_enter(&mip->mi_lock);
- while (mip->mi_rx_removed > 0) {
- DTRACE_PROBE1(need_wait, mac_impl_t *, mip);
- cv_wait(&mip->mi_rx_cv, &mip->mi_lock);
- }
- mutex_exit(&mip->mi_lock);
+ mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
}
-mac_txloop_handle_t
-mac_txloop_add(mac_handle_t mh, mac_txloop_t tx, void *arg)
+mblk_t *
+mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_txloop_fn_t *mtfp;
+ mac_ring_t *rr_ring = (mac_ring_t *)rh;
+ mac_ring_info_t *info = &rr_ring->mr_info;
- mtfp = kmem_zalloc(sizeof (mac_txloop_fn_t), KM_SLEEP);
- mtfp->mtf_fn = tx;
- mtfp->mtf_arg = arg;
+ return (info->mri_poll(info->mri_driver, bytes_to_pickup));
+}
- /*
- * Add it to the head of the 'tx' callback list.
- */
- rw_enter(&(mip->mi_tx_lock), RW_WRITER);
- mtfp->mtf_nextp = mip->mi_mtfp;
- mip->mi_mtfp = mtfp;
- rw_exit(&(mip->mi_tx_lock));
+int
+mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
+{
+ mac_group_t *group = (mac_group_t *)gh;
- return ((mac_txloop_handle_t)mtfp);
+ return (mac_group_addmac(group, addr));
+}
+
+int
+mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
+{
+ mac_group_t *group = (mac_group_t *)gh;
+
+ return (mac_group_remmac(group, addr));
}
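A hypothetical sketch of how an exclusive client such as aggr might drive the
mac_hwrings_*() interfaces above (example_aggr_rings is illustrative only):
fetch the reserved rings, hook up the Rx callback, then start each ring:

static int
example_aggr_rings(mac_client_handle_t mch, mac_resource_handle_t prh)
{
	mac_ring_handle_t	hwrh[MAX_RINGS_PER_GROUP];
	mac_group_handle_t	hwgh;
	int			i, cnt;

	cnt = mac_hwrings_get(mch, &hwgh, hwrh);
	if (hwgh == NULL)
		return (0);	/* no RX group was reserved */

	for (i = 0; i < cnt; i++) {
		mac_hwring_setup(hwrh[i], prh);
		(void) mac_hwring_start(hwrh[i]);
	}
	return (cnt);
}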
/*
- * Unregister a transmit function for this mac. This removes the function
- * from the list of transmit functions for this mac.
+ * Set the RX group to be shared/reserved. Note that the group must be
+ * started/stopped outside of this function.
*/
void
-mac_txloop_remove(mac_handle_t mh, mac_txloop_handle_t mth)
+mac_set_rx_group_state(mac_group_t *grp, mac_group_state_t state)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_txloop_fn_t *mtfp = (mac_txloop_fn_t *)mth;
- mac_txloop_fn_t **pp;
- mac_txloop_fn_t *p;
-
/*
- * Search the 'tx' callback list for the function.
+ * If there is no change in the group state, just return.
*/
- rw_enter(&(mip->mi_tx_lock), RW_WRITER);
- for (pp = &(mip->mi_mtfp); (p = *pp) != NULL; pp = &(p->mtf_nextp)) {
- if (p == mtfp)
- break;
+ if (grp->mrg_state == state)
+ return;
+
+ switch (state) {
+ case MAC_GROUP_STATE_RESERVED:
+ /*
+ * Successfully reserved the group.
+ *
+ * Given that there is an exclusive client controlling this
+ * group, we enable the group level polling when available,
+		 * so that SRSs get to turn on/off individual rings they're
+ * assigned to.
+ */
+ ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
+
+ if (GROUP_INTR_DISABLE_FUNC(grp) != NULL)
+ GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
+
+ break;
+
+ case MAC_GROUP_STATE_SHARED:
+ /*
+ * Set all rings of this group to software classified.
+ * If the group has an overriding interrupt, then re-enable it.
+ */
+ ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
+
+ if (GROUP_INTR_ENABLE_FUNC(grp) != NULL)
+ GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
+
+ /* The ring is not available for reservations any more */
+ break;
+
+ case MAC_GROUP_STATE_REGISTERED:
+ /* Also callable from mac_register, perim is not held */
+ break;
+
+ default:
+ ASSERT(B_FALSE);
+ break;
}
- ASSERT(p != NULL);
- /* Remove it from the list. */
- *pp = p->mtf_nextp;
- kmem_free(mtfp, sizeof (mac_txloop_fn_t));
- rw_exit(&(mip->mi_tx_lock));
+ grp->mrg_state = state;
}
-void
-mac_resource_set(mac_handle_t mh, mac_resource_add_t add, void *arg)
+/*
+ * Quiesce future hardware classified packets for the specified Rx ring
+ */
+static void
+mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
-
- /*
- * Update the 'resource_add' callbacks.
- */
- rw_enter(&(mip->mi_resource_lock), RW_WRITER);
- mip->mi_resource_add = add;
- mip->mi_resource_add_arg = arg;
- rw_exit(&(mip->mi_resource_lock));
+ ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
+ ASSERT(ring_flag == MR_CONDEMNED || ring_flag == MR_QUIESCE);
+
+ mutex_enter(&rx_ring->mr_lock);
+ rx_ring->mr_flag |= ring_flag;
+ while (rx_ring->mr_refcnt != 0)
+ cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
+ mutex_exit(&rx_ring->mr_lock);
}
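For context, an assumed sketch of the matching hold/release on the Rx upcall
path that mac_rx_ring_quiesce() synchronizes against (the example_* helpers
are illustrative, not the actual mac macros): upcall threads take a reference
on the ring before delivering a chain, and back off once the ring is marked:

static boolean_t
example_rx_ring_hold(mac_ring_t *mr)
{
	mutex_enter(&mr->mr_lock);
	if (mr->mr_flag & (MR_CONDEMNED | MR_QUIESCE)) {
		/* Ring is being quiesced; the caller drops the chain. */
		mutex_exit(&mr->mr_lock);
		return (B_FALSE);
	}
	mr->mr_refcnt++;
	mutex_exit(&mr->mr_lock);
	return (B_TRUE);
}

static void
example_rx_ring_rele(mac_ring_t *mr)
{
	mutex_enter(&mr->mr_lock);
	if (--mr->mr_refcnt == 0 &&
	    (mr->mr_flag & (MR_CONDEMNED | MR_QUIESCE)))
		cv_broadcast(&mr->mr_cv);	/* wake the quiescing thread */
	mutex_exit(&mr->mr_lock);
}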
/*
- * Driver support functions.
+ * Please see mac_tx for details about the per cpu locking scheme
*/
-
-mac_register_t *
-mac_alloc(uint_t mac_version)
+static void
+mac_tx_lock_all(mac_client_impl_t *mcip)
{
- mac_register_t *mregp;
+ int i;
- /*
- * Make sure there isn't a version mismatch between the driver and
- * the framework. In the future, if multiple versions are
- * supported, this check could become more sophisticated.
- */
- if (mac_version != MAC_VERSION)
- return (NULL);
-
- mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
- mregp->m_version = mac_version;
- return (mregp);
+ for (i = 0; i <= mac_tx_percpu_cnt; i++)
+ mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}
-void
-mac_free(mac_register_t *mregp)
+static void
+mac_tx_unlock_all(mac_client_impl_t *mcip)
{
- kmem_free(mregp, sizeof (mac_register_t));
+ int i;
+
+ for (i = mac_tx_percpu_cnt; i >= 0; i--)
+ mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}
-/*
- * Allocate a minor number.
- */
-minor_t
-mac_minor_hold(boolean_t sleep)
+static void
+mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
{
- minor_t minor;
+ int i;
- /*
- * Grab a value from the arena.
- */
- atomic_add_32(&minor_count, 1);
+ for (i = mac_tx_percpu_cnt; i > 0; i--)
+ mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
+}
- if (sleep)
- minor = (uint_t)id_alloc(minor_ids);
- else
- minor = (uint_t)id_alloc_nosleep(minor_ids);
+static int
+mac_tx_sum_refcnt(mac_client_impl_t *mcip)
+{
+ int i;
+ int refcnt = 0;
- if (minor == 0) {
- atomic_add_32(&minor_count, -1);
- return (0);
- }
+ for (i = 0; i <= mac_tx_percpu_cnt; i++)
+ refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
- return (minor);
+ return (refcnt);
}
/*
- * Release a previously allocated minor number.
+ * Stop future Tx packets coming down from the client in preparation for
+ * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
+ * of rings between clients
*/
void
-mac_minor_rele(minor_t minor)
+mac_tx_client_block(mac_client_impl_t *mcip)
{
- /*
- * Return the value to the arena.
- */
- id_free(minor_ids, minor);
- atomic_add_32(&minor_count, -1);
+ mac_tx_lock_all(mcip);
+ mcip->mci_tx_flag |= MCI_TX_QUIESCE;
+ while (mac_tx_sum_refcnt(mcip) != 0) {
+ mac_tx_unlock_allbutzero(mcip);
+ cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
+ mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
+ mac_tx_lock_all(mcip);
+ }
+ mac_tx_unlock_all(mcip);
}
-uint32_t
-mac_no_notification(mac_handle_t mh)
+void
+mac_tx_client_unblock(mac_client_impl_t *mcip)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- return (mip->mi_unsup_note);
+ mac_tx_lock_all(mcip);
+ mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
+ mac_tx_unlock_all(mcip);
}
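An assumed, simplified sketch of the sender-side counting that
mac_tx_client_block() synchronizes against (example_tx_enter is illustrative;
the real fast path lives in mac_tx): each transmitter bumps a per-CPU refcnt
under that CPU's lock and backs off once MCI_TX_QUIESCE is set. This assumes
mac_tx_percpu_cnt is a power-of-two count minus one, usable as a mask, as the
loops above imply:

static boolean_t
example_tx_enter(mac_client_impl_t *mcip)
{
	int	idx = CPU->cpu_seqid & mac_tx_percpu_cnt;

	mutex_enter(&mcip->mci_tx_pcpu[idx].pcpu_tx_lock);
	if (mcip->mci_tx_flag & MCI_TX_QUIESCE) {
		/* Tx is blocked; the caller must drop or defer. */
		mutex_exit(&mcip->mci_tx_pcpu[idx].pcpu_tx_lock);
		return (B_FALSE);
	}
	mcip->mci_tx_pcpu[idx].pcpu_tx_refcnt++;
	mutex_exit(&mcip->mci_tx_pcpu[idx].pcpu_tx_lock);
	return (B_TRUE);
}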
-boolean_t
-mac_is_legacy(mac_handle_t mh)
+/*
+ * Wait for an SRS to quiesce. The SRS worker will signal us when the
+ * quiesce is done.
+ */
+static void
+mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- return (mip->mi_legacy);
+ mutex_enter(&srs->srs_lock);
+ while (!(srs->srs_state & srs_flag))
+ cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
+ mutex_exit(&srs->srs_lock);
}
/*
- * mac_register() is how drivers register new MACs with the GLDv3
- * framework. The mregp argument is allocated by drivers using the
- * mac_alloc() function, and can be freed using mac_free() immediately upon
- * return from mac_register(). Upon success (0 return value), the mhp
- * opaque pointer becomes the driver's handle to its MAC interface, and is
- * the argument to all other mac module entry points.
+ * Quiescing an Rx SRS is achieved by the following sequence. The protocol
+ * works bottom up by cutting off packet flow from the bottommost point in the
+ * mac, then the SRS, and then the soft rings. There are 2 use cases of this
+ * mechanism. One is a temporary quiesce of the SRS, such as while changing
+ * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
+ * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
+ * for the SRS and MR flags. In the former case the threads pause waiting for
+ * a restart, while in the latter case the threads exit. The Tx SRS teardown
+ * is also mostly similar to the above.
+ *
+ * 1. Stop future hardware classified packets at the lowest level in the mac.
+ * Remove any hardware classification rule (CONDEMNED case) and mark the
+ * rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
+ * from increasing. Upcalls from the driver that come through hardware
+ * classification will be dropped in mac_rx from now on. Then we wait for
+ * the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
+ * sure there aren't any upcall threads from the driver through hardware
+ * classification. In the case of SRS teardown we also remove the
+ * classification rule in the driver.
+ *
+ * 2. Stop future software classified packets by marking the flow entry with
+ * FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
+ * increasing. We also remove the flow entry from the table in the latter
+ * case. Then wait for the fe_refcnt to reach an appropriate quiescent value
+ * that indicates there aren't any active threads using that flow entry.
+ *
+ * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
+ * SRS worker thread, and the soft ring threads are quiesced in sequence
+ * with the SRS worker thread serving as a master controller. This
+ *    mechanism is explained in mac_srs_worker_quiesce().
+ *
+ * The restart mechanism to reactivate the SRS and softrings is explained
+ * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
+ * restart sequence.
*/
-int
-mac_register(mac_register_t *mregp, mac_handle_t *mhp)
-{
- mac_impl_t *mip;
- mactype_t *mtype;
- int err = EINVAL;
- struct devnames *dnp = NULL;
- uint_t instance;
- boolean_t style1_created = B_FALSE;
- boolean_t style2_created = B_FALSE;
- mac_capab_legacy_t legacy;
- char *driver;
- minor_t minor = 0;
-
- /* Find the required MAC-Type plugin. */
- if ((mtype = i_mactype_getplugin(mregp->m_type_ident)) == NULL)
- return (EINVAL);
+void
+mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
+{
+ flow_entry_t *flent = srs->srs_flent;
+ uint_t mr_flag, srs_done_flag;
- /* Create a mac_impl_t to represent this MAC. */
- mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
+ ASSERT(!(srs->srs_type & SRST_TX));
- /*
- * The mac is not ready for open yet.
- */
- mip->mi_disabled = B_TRUE;
-
- /*
- * When a mac is registered, the m_instance field can be set to:
- *
- * 0: Get the mac's instance number from m_dip.
- * This is usually used for physical device dips.
- *
- * [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
- * For example, when an aggregation is created with the key option,
- * "key" will be used as the instance number.
- *
- * -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
- * This is often used when a MAC of a virtual link is registered
- * (e.g., aggregation when "key" is not specified, or vnic).
- *
- * Note that the instance number is used to derive the mi_minor field
- * of mac_impl_t, which will then be used to derive the name of kstats
- * and the devfs nodes. The first 2 cases are needed to preserve
- * backward compatibility.
- */
- switch (mregp->m_instance) {
- case 0:
- instance = ddi_get_instance(mregp->m_dip);
- break;
- case ((uint_t)-1):
- minor = mac_minor_hold(B_TRUE);
- if (minor == 0) {
- err = ENOSPC;
- goto fail;
- }
- instance = minor - 1;
- break;
- default:
- instance = mregp->m_instance;
- if (instance >= MAC_MAX_MINOR) {
- err = EINVAL;
- goto fail;
- }
- break;
+ if (srs_quiesce_flag == SRS_CONDEMNED) {
+ mr_flag = MR_CONDEMNED;
+ srs_done_flag = SRS_CONDEMNED_DONE;
+ if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
+ mac_srs_client_poll_disable(srs->srs_mcip, srs);
+ } else {
+ ASSERT(srs_quiesce_flag == SRS_QUIESCE);
+ mr_flag = MR_QUIESCE;
+ srs_done_flag = SRS_QUIESCE_DONE;
+ if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
+ mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
}
- mip->mi_minor = (minor_t)(instance + 1);
- mip->mi_dip = mregp->m_dip;
-
- driver = (char *)ddi_driver_name(mip->mi_dip);
-
- /* Construct the MAC name as <drvname><instance> */
- (void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d",
- driver, instance);
-
- mip->mi_driver = mregp->m_driver;
-
- mip->mi_type = mtype;
- mip->mi_margin = mregp->m_margin;
- mip->mi_info.mi_media = mtype->mt_type;
- mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
- if (mregp->m_max_sdu <= mregp->m_min_sdu)
- goto fail;
- mip->mi_sdu_min = mregp->m_min_sdu;
- mip->mi_sdu_max = mregp->m_max_sdu;
- mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
- /*
- * If the media supports a broadcast address, cache a pointer to it
- * in the mac_info_t so that upper layers can use it.
- */
- mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
-
- /*
- * Copy the unicast source address into the mac_info_t, but only if
- * the MAC-Type defines a non-zero address length. We need to
- * handle MAC-Types that have an address length of 0
- * (point-to-point protocol MACs for example).
- */
- if (mip->mi_type->mt_addr_length > 0) {
- if (mregp->m_src_addr == NULL)
- goto fail;
- mip->mi_info.mi_unicst_addr =
- kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
- bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
- mip->mi_type->mt_addr_length);
-
+ if (srs->srs_ring != NULL) {
+ mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
+ } else {
/*
- * Copy the fixed 'factory' MAC address from the immutable
- * info. This is taken to be the MAC address currently in
- * use.
+ * SRS is driven by software classification. In case
+ * of CONDEMNED, the top level teardown functions will
+ * deal with flow removal.
*/
- bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
- mip->mi_type->mt_addr_length);
- /* Copy the destination address if one is provided. */
- if (mregp->m_dst_addr != NULL) {
- bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
- mip->mi_type->mt_addr_length);
+ if (srs_quiesce_flag != SRS_CONDEMNED) {
+ FLOW_MARK(flent, FE_QUIESCE);
+ mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
}
- } else if (mregp->m_src_addr != NULL) {
- goto fail;
}
/*
- * The format of the m_pdata is specific to the plugin. It is
- * passed in as an argument to all of the plugin callbacks. The
- * driver can update this information by calling
- * mac_pdata_update().
+ * Signal the SRS to quiesce itself, and then cv_wait for the
+ * SRS quiesce to complete. The SRS worker thread will wake us
+ * up when the quiesce is complete
*/
- if (mregp->m_pdata != NULL) {
- /*
- * Verify that the plugin supports MAC plugin data and that
- * the supplied data is valid.
- */
- if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
- goto fail;
- if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
- mregp->m_pdata_size)) {
- goto fail;
- }
- mip->mi_pdata = kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
- bcopy(mregp->m_pdata, mip->mi_pdata, mregp->m_pdata_size);
- mip->mi_pdata_size = mregp->m_pdata_size;
- }
+ mac_srs_signal(srs, srs_quiesce_flag);
+ mac_srs_quiesce_wait(srs, srs_done_flag);
+}
- /*
- * Register the private properties.
- */
- mac_register_priv_prop(mip, mregp->m_priv_props,
- mregp->m_priv_prop_count);
+/*
+ * Remove an SRS.
+ */
+void
+mac_rx_srs_remove(mac_soft_ring_set_t *srs)
+{
+ flow_entry_t *flent = srs->srs_flent;
+ int i;
+ mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
/*
- * Stash the driver callbacks into the mac_impl_t, but first sanity
- * check to make sure all mandatory callbacks are set.
+ * Locate and remove our entry in the fe_rx_srs[] array, and
+ * adjust the fe_rx_srs array entries and array count by
+ * moving the last entry into the vacated spot.
*/
- if (mregp->m_callbacks->mc_getstat == NULL ||
- mregp->m_callbacks->mc_start == NULL ||
- mregp->m_callbacks->mc_stop == NULL ||
- mregp->m_callbacks->mc_setpromisc == NULL ||
- mregp->m_callbacks->mc_multicst == NULL ||
- mregp->m_callbacks->mc_unicst == NULL ||
- mregp->m_callbacks->mc_tx == NULL) {
- goto fail;
+ mutex_enter(&flent->fe_lock);
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ if (flent->fe_rx_srs[i] == srs)
+ break;
}
- mip->mi_callbacks = mregp->m_callbacks;
- /*
- * Set up the possible transmit routines.
- */
- mip->mi_txinfo.mt_fn = mip->mi_tx;
- mip->mi_txinfo.mt_arg = mip->mi_driver;
+ ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
+ if (i != flent->fe_rx_srs_cnt - 1) {
+ flent->fe_rx_srs[i] =
+ flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
+ i = flent->fe_rx_srs_cnt - 1;
+ }
- mip->mi_legacy = mac_capab_get((mac_handle_t)mip,
- MAC_CAPAB_LEGACY, &legacy);
+ flent->fe_rx_srs[i] = NULL;
+ flent->fe_rx_srs_cnt--;
+ mutex_exit(&flent->fe_lock);
- if (mip->mi_legacy) {
- /*
- * Legacy device. Messages being sent will be looped back
- * by the underlying driver. Therefore the txloop function
- * pointer is the same as the tx function pointer.
- */
- mip->mi_txloopinfo.mt_fn = mip->mi_txinfo.mt_fn;
- mip->mi_txloopinfo.mt_arg = mip->mi_txinfo.mt_arg;
- mip->mi_unsup_note = legacy.ml_unsup_note;
- mip->mi_phy_dev = legacy.ml_dev;
- } else {
- /*
- * Normal device. The framework needs to do the loopback.
- */
- mip->mi_txloopinfo.mt_fn = mac_txloop;
- mip->mi_txloopinfo.mt_arg = mip;
- mip->mi_unsup_note = 0;
- mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
- ddi_get_instance(mip->mi_dip) + 1);
- }
+ mac_srs_free(srs);
+}
- mip->mi_vnic_txinfo.mt_fn = mac_vnic_tx;
- mip->mi_vnic_txinfo.mt_arg = mip;
+static void
+mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
+{
+ mutex_enter(&srs->srs_lock);
+ srs->srs_state &= ~flag;
+ mutex_exit(&srs->srs_lock);
+}
+
+void
+mac_rx_srs_restart(mac_soft_ring_set_t *srs)
+{
+ flow_entry_t *flent = srs->srs_flent;
+ mac_ring_t *mr;
- mip->mi_vnic_txloopinfo.mt_fn = mac_vnic_txloop;
- mip->mi_vnic_txloopinfo.mt_arg = mip;
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
+ ASSERT((srs->srs_type & SRST_TX) == 0);
/*
- * Allocate a notification thread.
+	 * This handles a change in the number of SRSs between the quiesce
+	 * and restart operations of a flow.
*/
- mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
- mip, 0, &p0, TS_RUN, minclsyspri);
- if (mip->mi_notify_thread == NULL)
- goto fail;
+ if (!SRS_QUIESCED(srs))
+ return;
/*
- * Initialize the kstats for this device.
+	 * Signal the SRS to restart itself. Wait for the restart to complete.
+ * Note that we only restart the SRS if it is not marked as
+ * permanently quiesced.
*/
- mac_stat_create(mip);
-
-
- /* set the gldv3 flag in dn_flags */
- dnp = &devnamesp[ddi_driver_major(mip->mi_dip)];
- LOCK_DEV_OPS(&dnp->dn_lock);
- dnp->dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER);
- UNLOCK_DEV_OPS(&dnp->dn_lock);
-
- if (mip->mi_minor < MAC_MAX_MINOR + 1) {
- /* Create a style-2 DLPI device */
- if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0,
- DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS)
- goto fail;
- style2_created = B_TRUE;
+ if (!SRS_QUIESCED_PERMANENT(srs)) {
+ mac_srs_signal(srs, SRS_RESTART);
+ mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
+ mac_srs_clear_flag(srs, SRS_RESTART_DONE);
- /* Create a style-1 DLPI device */
- if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR,
- mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS)
- goto fail;
- style1_created = B_TRUE;
+ mac_srs_client_poll_restart(srs->srs_mcip, srs);
}
- rw_enter(&i_mac_impl_lock, RW_WRITER);
- if (mod_hash_insert(i_mac_impl_hash,
- (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) {
-
- rw_exit(&i_mac_impl_lock);
- err = EEXIST;
- goto fail;
+ /* Finally clear the flags to let the packets in */
+ mr = srs->srs_ring;
+ if (mr != NULL) {
+ MAC_RING_UNMARK(mr, MR_QUIESCE);
+ /* In case the ring was stopped, safely restart it */
+ (void) mac_start_ring(mr);
+ } else {
+ FLOW_UNMARK(flent, FE_QUIESCE);
}
+}
- DTRACE_PROBE2(mac__register, struct devnames *, dnp,
- (mac_impl_t *), mip);
-
- /*
- * Mark the MAC to be ready for open.
- */
- mip->mi_disabled = B_FALSE;
-
- rw_exit(&i_mac_impl_lock);
-
- atomic_inc_32(&i_mac_impl_count);
+/*
+ * Temporary quiesce of a flow and associated Rx SRS.
+ * Please see block comment above mac_rx_classify_flow_rem.
+ */
+/* ARGSUSED */
+int
+mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
+{
+ int i;
- cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
- *mhp = (mac_handle_t)mip;
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
+ SRS_QUIESCE);
+ }
return (0);
+}
-fail:
- if (style1_created)
- ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
-
- if (style2_created)
- ddi_remove_minor_node(mip->mi_dip, driver);
+/*
+ * Restart a flow and associated Rx SRS that has been quiesced temporarily.
+ * Please see block comment above mac_rx_classify_flow_rem.
+ */
+/* ARGSUSED */
+int
+mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
+{
+ int i;
- /* clean up notification thread */
- if (mip->mi_notify_thread != NULL) {
- mutex_enter(&mip->mi_notify_bits_lock);
- mip->mi_notify_bits = (1 << MAC_NNOTE);
- cv_broadcast(&mip->mi_notify_cv);
- while (mip->mi_notify_bits != 0)
- cv_wait(&mip->mi_notify_cv, &mip->mi_notify_bits_lock);
- mutex_exit(&mip->mi_notify_bits_lock);
- }
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++)
+ mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
- if (mip->mi_info.mi_unicst_addr != NULL) {
- kmem_free(mip->mi_info.mi_unicst_addr,
- mip->mi_type->mt_addr_length);
- mip->mi_info.mi_unicst_addr = NULL;
- }
+ return (0);
+}
- mac_stat_destroy(mip);
+void
+mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ flow_entry_t *flent = mcip->mci_flent;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_soft_ring_set_t *mac_srs;
+ int i;
- if (mip->mi_type != NULL) {
- atomic_dec_32(&mip->mi_type->mt_ref);
- mip->mi_type = NULL;
- }
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
- if (mip->mi_pdata != NULL) {
- kmem_free(mip->mi_pdata, mip->mi_pdata_size);
- mip->mi_pdata = NULL;
- mip->mi_pdata_size = 0;
- }
+ if (flent == NULL)
+ return;
- if (minor != 0) {
- ASSERT(minor > MAC_MAX_MINOR);
- mac_minor_rele(minor);
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ mac_srs = flent->fe_rx_srs[i];
+ mutex_enter(&mac_srs->srs_lock);
+ if (on)
+ mac_srs->srs_state |= SRS_QUIESCE_PERM;
+ else
+ mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
+ mutex_exit(&mac_srs->srs_lock);
}
-
- mac_unregister_priv_prop(mip);
-
- kmem_cache_free(i_mac_impl_cachep, mip);
- return (err);
}
-int
-mac_disable(mac_handle_t mh)
+void
+mac_rx_client_quiesce(mac_client_handle_t mch)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
- /*
- * See if there are any other references to this mac_t (e.g., VLAN's).
- * If not, set mi_disabled to prevent any new VLAN's from being
- * created while we're destroying this mac.
- */
- rw_enter(&i_mac_impl_lock, RW_WRITER);
- if (mip->mi_ref > 0) {
- rw_exit(&i_mac_impl_lock);
- return (EBUSY);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ if (MCIP_DATAPATH_SETUP(mcip)) {
+ (void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
+ NULL);
+ (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+ mac_rx_classify_flow_quiesce, NULL);
}
- mip->mi_disabled = B_TRUE;
- rw_exit(&i_mac_impl_lock);
- return (0);
}
-int
-mac_unregister(mac_handle_t mh)
+void
+mac_rx_client_restart(mac_client_handle_t mch)
{
- int err;
- mac_impl_t *mip = (mac_impl_t *)mh;
- mod_hash_val_t val;
- mac_multicst_addr_t *p, *nextp;
- mac_margin_req_t *mmr, *nextmmr;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
- /*
- * See if there are any other references to this mac_t (e.g., VLAN's).
- * If not, set mi_disabled to prevent any new VLAN's from being
- * created while we're destroying this mac. Once mac_disable() returns
- * 0, the rest of mac_unregister() stuff should continue without
- * returning an error.
- */
- if (!mip->mi_disabled) {
- if ((err = mac_disable(mh)) != 0)
- return (err);
- }
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
- /*
- * Clean up notification thread (wait for it to exit).
- */
- mutex_enter(&mip->mi_notify_bits_lock);
- mip->mi_notify_bits = (1 << MAC_NNOTE);
- cv_broadcast(&mip->mi_notify_cv);
- while (mip->mi_notify_bits != 0)
- cv_wait(&mip->mi_notify_cv, &mip->mi_notify_bits_lock);
- mutex_exit(&mip->mi_notify_bits_lock);
-
- if (mip->mi_minor < MAC_MAX_MINOR + 1) {
- ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
- ddi_remove_minor_node(mip->mi_dip,
- (char *)ddi_driver_name(mip->mi_dip));
+ if (MCIP_DATAPATH_SETUP(mcip)) {
+ (void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
+ (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+ mac_rx_classify_flow_restart, NULL);
}
+}
- ASSERT(!mip->mi_activelink);
-
- mac_stat_destroy(mip);
-
- rw_enter(&i_mac_impl_lock, RW_WRITER);
- (void) mod_hash_remove(i_mac_impl_hash,
- (mod_hash_key_t)mip->mi_name, &val);
- ASSERT(mip == (mac_impl_t *)val);
+/*
+ * This function only quiesces the Tx SRS and softring worker threads. Callers
+ * need to make sure that there aren't any mac client threads doing current or
+ * future transmits in the mac before calling this function.
+ */
+void
+mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
+{
+ mac_client_impl_t *mcip = srs->srs_mcip;
- ASSERT(i_mac_impl_count > 0);
- atomic_dec_32(&i_mac_impl_count);
- rw_exit(&i_mac_impl_lock);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
- if (mip->mi_pdata != NULL)
- kmem_free(mip->mi_pdata, mip->mi_pdata_size);
- mip->mi_pdata = NULL;
- mip->mi_pdata_size = 0;
+ ASSERT(srs->srs_type & SRST_TX);
+ ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
+ srs_quiesce_flag == SRS_QUIESCE);
/*
- * Free the list of multicast addresses.
+ * Signal the SRS to quiesce itself, and then cv_wait for the
+ * SRS quiesce to complete. The SRS worker thread will wake us
+ * up when the quiesce is complete
*/
- for (p = mip->mi_mmap; p != NULL; p = nextp) {
- nextp = p->mma_nextp;
- kmem_free(p, sizeof (mac_multicst_addr_t));
- }
- mip->mi_mmap = NULL;
+ mac_srs_signal(srs, srs_quiesce_flag);
+ mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
+ SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
+}
+void
+mac_tx_srs_restart(mac_soft_ring_set_t *srs)
+{
/*
- * Free the list of margin request.
+ * Resizing the fanout could result in creation of new SRSs.
+	 * They may not necessarily be in the quiesced state, in which
+	 * case they need not be restarted.
*/
- for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
- nextmmr = mmr->mmr_nextp;
- kmem_free(mmr, sizeof (mac_margin_req_t));
- }
- mip->mi_mmrp = NULL;
-
- mip->mi_linkstate = LINK_STATE_UNKNOWN;
- kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
- mip->mi_info.mi_unicst_addr = NULL;
-
- atomic_dec_32(&mip->mi_type->mt_ref);
- mip->mi_type = NULL;
-
- if (mip->mi_minor > MAC_MAX_MINOR)
- mac_minor_rele(mip->mi_minor);
-
- mac_unregister_priv_prop(mip);
-
- cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
-
- kmem_cache_free(i_mac_impl_cachep, mip);
+ if (!SRS_QUIESCED(srs))
+ return;
- return (0);
+ mac_srs_signal(srs, SRS_RESTART);
+ mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
+ mac_srs_clear_flag(srs, SRS_RESTART_DONE);
}
/*
- * To avoid potential deadlocks, mac_rx() releases mi_rx_lock
- * before invoking its list of upcalls. This introduces races with
- * mac_rx_remove() and mac_rx_add(), who can potentially modify the
- * upcall list while mi_rx_lock is not being held. The race with
- * mac_rx_remove() is handled by incrementing mi_rx_ref upon entering
- * mac_rx(); a non-zero mi_rx_ref would tell mac_rx_remove()
- * to not modify the list but instead mark an upcall for deletion.
- * before mac_rx() exits, mi_rx_ref is decremented and if it
- * is 0, the marked upcalls will be removed from the list and freed.
- * The race with mac_rx_add() is harmless because mac_rx_add() only
- * prepends to the list and since mac_rx() saves the list head
- * before releasing mi_rx_lock, any prepended upcall won't be seen
- * until the next packet chain arrives.
- *
- * To minimize lock contention between multiple parallel invocations
- * of mac_rx(), mi_rx_lock is acquired as a READER lock. The
- * use of atomic operations ensures the sanity of mi_rx_ref. mi_rx_lock
- * will be upgraded to WRITER mode when there are marked upcalls to be
- * cleaned.
+ * Temporary quiesce of a flow and its associated Tx SRS.
+ * Please see block comment above mac_rx_srs_quiesce.
*/
-static void
-mac_do_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain,
- boolean_t active_only)
+/* ARGSUSED */
+int
+mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mblk_t *bp = mp_chain;
- mac_rx_fn_t *mrfp;
-
/*
- * Call all registered receive functions.
+ * The fe_tx_srs is null for a subflow on an interface that is
+ * not plumbed
*/
- rw_enter(&mip->mi_rx_lock, RW_READER);
- if ((mrfp = mip->mi_mrfp) == NULL) {
- /* There are no registered receive functions. */
- freemsgchain(bp);
- rw_exit(&mip->mi_rx_lock);
- return;
- }
- atomic_inc_32(&mip->mi_rx_ref);
- rw_exit(&mip->mi_rx_lock);
+ if (flent->fe_tx_srs != NULL)
+ mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
+ return (0);
+}
+/* ARGSUSED */
+int
+mac_tx_flow_restart(flow_entry_t *flent, void *arg)
+{
/*
- * Call registered receive functions.
+ * The fe_tx_srs is null for a subflow on an interface that is
+ * not plumbed
*/
- do {
- mblk_t *recv_bp;
-
- if (active_only && !mrfp->mrf_active) {
- mrfp = mrfp->mrf_nextp;
- if (mrfp == NULL) {
- /*
- * We hit the last receiver, but it's not
- * active.
- */
- freemsgchain(bp);
- }
- continue;
- }
-
- recv_bp = (mrfp->mrf_nextp != NULL) ? copymsgchain(bp) : bp;
- if (recv_bp != NULL) {
- if (mrfp->mrf_inuse) {
- /*
- * Send bp itself and keep the copy.
- * If there's only one active receiver,
- * it should get the original message,
- * tagged with the hardware checksum flags.
- */
- mrfp->mrf_fn(mrfp->mrf_arg, mrh, bp);
- bp = recv_bp;
- } else {
- freemsgchain(recv_bp);
- }
- }
-
- mrfp = mrfp->mrf_nextp;
- } while (mrfp != NULL);
+ if (flent->fe_tx_srs != NULL)
+ mac_tx_srs_restart(flent->fe_tx_srs);
+ return (0);
+}
- rw_enter(&mip->mi_rx_lock, RW_READER);
- if (atomic_dec_32_nv(&mip->mi_rx_ref) == 0 && mip->mi_rx_removed > 0) {
- mac_rx_fn_t **pp, *p;
- uint32_t cnt = 0;
+void
+mac_tx_client_quiesce(mac_client_impl_t *mcip, uint_t srs_quiesce_flag)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
- DTRACE_PROBE1(delete_callbacks, mac_impl_t *, mip);
+ mac_tx_client_block(mcip);
+ if (MCIP_TX_SRS(mcip) != NULL) {
+ mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
+ (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+ mac_tx_flow_quiesce, NULL);
+ }
+}
- /*
- * Need to become exclusive before doing cleanup
- */
- if (rw_tryupgrade(&mip->mi_rx_lock) == 0) {
- rw_exit(&mip->mi_rx_lock);
- rw_enter(&mip->mi_rx_lock, RW_WRITER);
- }
+void
+mac_tx_client_restart(mac_client_impl_t *mcip)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
- /*
- * We return if another thread has already entered and cleaned
- * up the list.
- */
- if (mip->mi_rx_ref > 0 || mip->mi_rx_removed == 0) {
- rw_exit(&mip->mi_rx_lock);
- return;
- }
+ mac_tx_client_unblock(mcip);
+ if (MCIP_TX_SRS(mcip) != NULL) {
+ mac_tx_srs_restart(MCIP_TX_SRS(mcip));
+ (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+ mac_tx_flow_restart, NULL);
+ }
+}
- /*
- * Free removed callbacks.
- */
- pp = &mip->mi_mrfp;
- while (*pp != NULL) {
- if (!(*pp)->mrf_inuse) {
- p = *pp;
- *pp = (*pp)->mrf_nextp;
- kmem_free(p, sizeof (*p));
- cnt++;
- continue;
- }
- pp = &(*pp)->mrf_nextp;
- }
+void
+mac_tx_client_flush(mac_client_impl_t *mcip)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
- /*
- * Wake up mac_rx_remove_wait()
- */
- mutex_enter(&mip->mi_lock);
- ASSERT(mip->mi_rx_removed == cnt);
- mip->mi_rx_removed = 0;
- cv_broadcast(&mip->mi_rx_cv);
- mutex_exit(&mip->mi_lock);
- }
- rw_exit(&mip->mi_rx_lock);
+ mac_tx_client_quiesce(mcip, SRS_QUIESCE);
+ mac_tx_client_restart(mcip);
}
void
-mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
+mac_client_quiesce(mac_client_impl_t *mcip)
{
- mac_do_rx(mh, mrh, mp_chain, B_FALSE);
+ mac_rx_client_quiesce((mac_client_handle_t)mcip);
+ mac_tx_client_quiesce(mcip, SRS_QUIESCE);
}
-/*
- * Send a packet chain up to the receive callbacks which declared
- * themselves as being active.
- */
void
-mac_active_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp_chain)
+mac_client_restart(mac_client_impl_t *mcip)
{
- mac_do_rx(arg, mrh, mp_chain, B_TRUE);
+ mac_rx_client_restart((mac_client_handle_t)mcip);
+ mac_tx_client_restart(mcip);
}
/*
- * Function passed to the active client sharing a VNIC. This function
- * is returned by mac_tx_get() when a VNIC is present. It invokes
- * the VNIC transmit entry point which was specified by the VNIC when
- * it called mac_vnic_set(). The VNIC transmit entry point will
- * pass the packets to the local VNICs and/or to the underlying VNICs
- * if needed.
+ * Allocate a minor number.
*/
-static mblk_t *
-mac_vnic_tx(void *arg, mblk_t *mp)
+minor_t
+mac_minor_hold(boolean_t sleep)
{
- mac_impl_t *mip = arg;
- mac_txinfo_t *mtfp;
- mac_vnic_tx_t *mvt;
+ minor_t minor;
/*
- * There is a race between the notification of the VNIC
- * addition and removal, and the processing of the VNIC notification
- * by the MAC client. During this window, it is possible for
- * an active MAC client to contine invoking mac_vnic_tx() while
- * the VNIC has already been removed. So we cannot assume
- * that mi_vnic_present will always be true when mac_vnic_tx()
- * is invoked.
+ * Grab a value from the arena.
*/
- rw_enter(&mip->mi_tx_lock, RW_READER);
- if (!mip->mi_vnic_present) {
- rw_exit(&mip->mi_tx_lock);
- freemsgchain(mp);
- return (NULL);
- }
+ atomic_add_32(&minor_count, 1);
- ASSERT(mip->mi_vnic_tx != NULL);
- mvt = mip->mi_vnic_tx;
- MAC_VNIC_TXINFO_REFHOLD(mvt);
- rw_exit(&mip->mi_tx_lock);
+ if (sleep)
+ minor = (uint_t)id_alloc(minor_ids);
+ else
+ minor = (uint_t)id_alloc_nosleep(minor_ids);
- mtfp = &mvt->mv_txinfo;
- mtfp->mt_fn(mtfp->mt_arg, mp);
+ if (minor == 0) {
+ atomic_add_32(&minor_count, -1);
+ return (0);
+ }
- MAC_VNIC_TXINFO_REFRELE(mvt);
- return (NULL);
+ return (minor);
}
/*
- * Transmit function -- ONLY used when there are registered loopback listeners.
+ * Release a previously allocated minor number.
*/
-mblk_t *
-mac_do_txloop(void *arg, mblk_t *bp, boolean_t call_vnic)
+void
+mac_minor_rele(minor_t minor)
{
- mac_impl_t *mip = arg;
- mac_txloop_fn_t *mtfp;
- mblk_t *loop_bp, *resid_bp, *next_bp;
-
- if (call_vnic) {
- /*
- * In promiscous mode, a copy of the sent packet will
- * be sent to the client's promiscous receive entry
- * points via mac_vnic_tx()->
- * mac_active_rx_promisc()->mac_rx_default().
- */
- return (mac_vnic_tx(arg, bp));
- }
-
- while (bp != NULL) {
- next_bp = bp->b_next;
- bp->b_next = NULL;
-
- if ((loop_bp = copymsg(bp)) == NULL)
- goto noresources;
-
- if ((resid_bp = mip->mi_tx(mip->mi_driver, bp)) != NULL) {
- ASSERT(resid_bp == bp);
- freemsg(loop_bp);
- goto noresources;
- }
-
- rw_enter(&mip->mi_tx_lock, RW_READER);
- mtfp = mip->mi_mtfp;
- while (mtfp != NULL && loop_bp != NULL) {
- bp = loop_bp;
-
- /* XXX counter bump if copymsg() fails? */
- if (mtfp->mtf_nextp != NULL)
- loop_bp = copymsg(bp);
- else
- loop_bp = NULL;
-
- mtfp->mtf_fn(mtfp->mtf_arg, bp);
- mtfp = mtfp->mtf_nextp;
- }
- rw_exit(&mip->mi_tx_lock);
-
- /*
- * It's possible we've raced with the disabling of promiscuous
- * mode, in which case we can discard our copy.
- */
- if (loop_bp != NULL)
- freemsg(loop_bp);
-
- bp = next_bp;
- }
-
- return (NULL);
-
-noresources:
- bp->b_next = next_bp;
- return (bp);
+ /*
+ * Return the value to the arena.
+ */
+ id_free(minor_ids, minor);
+ atomic_add_32(&minor_count, -1);
}
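[Editor's note: a minimal usage sketch of the minor-number arena, assuming a hypothetical cloning-open path that needs a transient minor (error handling trimmed):

    /* Hypothetical: grab a minor without sleeping; fail the open if exhausted. */
    minor_t minor;

    if ((minor = mac_minor_hold(B_FALSE)) == 0)
    	return (ENOMEM);

    /* ... use the minor for the clone device ... */

    mac_minor_rele(minor);	/* return it to the arena when done */
]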
-mblk_t *
-mac_txloop(void *arg, mblk_t *bp)
+uint32_t
+mac_no_notification(mac_handle_t mh)
{
- return (mac_do_txloop(arg, bp, B_FALSE));
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ return (mip->mi_unsup_note);
}
-static mblk_t *
-mac_vnic_txloop(void *arg, mblk_t *bp)
+/*
+ * Prevent any new opens of this mac in preparation for unregister
+ */
+int
+i_mac_disable(mac_impl_t *mip)
{
- return (mac_do_txloop(arg, bp, B_TRUE));
-}
+ mac_client_impl_t *mcip;
-void
-mac_link_update(mac_handle_t mh, link_state_t link)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ rw_enter(&i_mac_impl_lock, RW_WRITER);
+ if (mip->mi_state_flags & MIS_DISABLED) {
+ /* Already disabled, return success */
+ rw_exit(&i_mac_impl_lock);
+ return (0);
+ }
+ /*
+	 * See if there are any other references to this mac_t (e.g., VLANs).
+	 * If so, return failure. If all the other checks below pass, then
+	 * set mi_disabled atomically under the i_mac_impl_lock to prevent
+	 * any new VLANs from being created or new mac client opens of this
+	 * mac endpoint.
+ */
+ if (mip->mi_ref > 0) {
+ rw_exit(&i_mac_impl_lock);
+ return (EBUSY);
+ }
/*
- * Save the link state.
+	 * MAC clients must delete all multicast groups they join before
+	 * closing. Broadcast groups are reference counted; the last client
+	 * to delete a group will wait until the group is physically
+	 * deleted. Since all clients have closed this mac endpoint,
+	 * mi_bcast_ngrps must be zero at this point.
*/
- mip->mi_linkstate = link;
+ ASSERT(mip->mi_bcast_ngrps == 0);
/*
- * Send a MAC_NOTE_LINK notification.
+	 * Don't tear this down while it still has flows.
+	 * All other code guarantees that no flows are added to a disabled
+	 * mac, therefore it is sufficient to check the flow table
+	 * only here.
*/
- i_mac_notify(mip, MAC_NOTE_LINK);
+ mcip = mac_primary_client_handle(mip);
+ if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
+ rw_exit(&i_mac_impl_lock);
+ return (ENOTEMPTY);
+ }
+
+ mip->mi_state_flags |= MIS_DISABLED;
+ rw_exit(&i_mac_impl_lock);
+ return (0);
}
-void
-mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
+int
+mac_disable_nowait(mac_handle_t mh)
{
mac_impl_t *mip = (mac_impl_t *)mh;
+ int err;
- if (mip->mi_type->mt_addr_length == 0)
- return;
+ if ((err = i_mac_perim_enter_nowait(mip)) != 0)
+ return (err);
+ err = i_mac_disable(mip);
+ i_mac_perim_exit(mip);
+ return (err);
+}
- /*
- * If the address has not changed, do nothing.
- */
- if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0)
- return;
+int
+mac_disable(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ int err;
- /*
- * Save the address.
- */
- bcopy(addr, mip->mi_addr, mip->mi_type->mt_addr_length);
+ i_mac_perim_enter(mip);
+ err = i_mac_disable(mip);
+ i_mac_perim_exit(mip);
/*
- * Send a MAC_NOTE_UNICST notification.
+ * Clean up notification thread and wait for it to exit.
*/
- i_mac_notify(mip, MAC_NOTE_UNICST);
-}
+ if (err == 0)
+ i_mac_notify_exit(mip);
-void
-mac_tx_update(mac_handle_t mh)
-{
- /*
- * Send a MAC_NOTE_TX notification.
- */
- i_mac_notify((mac_impl_t *)mh, MAC_NOTE_TX);
+ return (err);
}
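[Editor's note: for illustration, a driver detach path would typically call this before tearing down, along these lines (the surrounding detach logic is hypothetical; mac_unregister() is the usual follow-up):

    /* Hypothetical detach fragment: refuse to detach while busy. */
    if (mac_disable(mh) != 0)
    	return (DDI_FAILURE);	/* clients or flows still present */

    (void) mac_unregister(mh);
]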
-void
-mac_resource_update(mac_handle_t mh)
+/*
+ * Called when the MAC instance has a non-empty flow table, to de-multiplex
+ * incoming packets to the right flow.
+ * The MAC's rw lock is assumed held as a READER.
+ */
+/* ARGSUSED */
+static mblk_t *
+mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
{
+ flow_entry_t *flent = NULL;
+ uint_t flags = FLOW_INBOUND;
+ int err;
+
/*
- * Send a MAC_NOTE_RESOURCE notification.
+ * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
+ * to mac_flow_lookup() so that the VLAN packets can be successfully
+ * passed to the non-VLAN aggregation flows.
+ *
+ * Note that there is possibly a race between this and
+ * mac_unicast_remove/add() and VLAN packets could be incorrectly
+ * classified to non-VLAN flows of non-aggregation mac clients. These
+	 * VLAN packets will then be filtered out by the mac module.
*/
- i_mac_notify((mac_impl_t *)mh, MAC_NOTE_RESOURCE);
-}
-
-mac_resource_handle_t
-mac_resource_add(mac_handle_t mh, mac_resource_t *mrp)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_resource_handle_t mrh;
- mac_resource_add_t add;
- void *arg;
-
- rw_enter(&mip->mi_resource_lock, RW_READER);
- add = mip->mi_resource_add;
- arg = mip->mi_resource_add_arg;
+ if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
+ flags |= FLOW_IGNORE_VLAN;
- if (add != NULL)
- mrh = add(arg, mrp);
- else
- mrh = NULL;
- rw_exit(&mip->mi_resource_lock);
+ err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
+ if (err != 0) {
+ /* no registered receive function */
+ return (mp);
+ } else {
+ mac_client_impl_t *mcip;
- return (mrh);
+ /*
+		 * This flent might just be an additional one on the MAC client,
+		 * i.e., for classification purposes (different fdesc); however,
+		 * the resources (SRS, etc.) are in the mci_flent, so if
+		 * this isn't the mci_flent, we need to get it.
+ */
+ if ((mcip = flent->fe_mcip) != NULL &&
+ mcip->mci_flent != flent) {
+ FLOW_REFRELE(flent);
+ flent = mcip->mci_flent;
+ FLOW_TRY_REFHOLD(flent, err);
+ if (err != 0)
+ return (mp);
+ }
+ (flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
+ B_FALSE);
+ FLOW_REFRELE(flent);
+ }
+ return (NULL);
}
-int
-mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
+mblk_t *
+mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
{
mac_impl_t *mip = (mac_impl_t *)mh;
+ mblk_t *bp, *bp1, **bpp, *list = NULL;
/*
- * Verify that the plugin supports MAC plugin data and that the
- * supplied data is valid.
+ * We walk the chain and attempt to classify each packet.
+	 * The packets that couldn't be classified will be returned
+	 * to the caller.
*/
- if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
- return (EINVAL);
- if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
- return (EINVAL);
+ bp = mp_chain;
+ bpp = &list;
+ while (bp != NULL) {
+ bp1 = bp;
+ bp = bp->b_next;
+ bp1->b_next = NULL;
- if (mip->mi_pdata != NULL)
- kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+ if (mac_rx_classify(mip, mrh, bp1) != NULL) {
+ *bpp = bp1;
+ bpp = &bp1->b_next;
+ }
+ }
+ return (list);
+}
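[Editor's note: a caller of mac_rx_flow() owns whatever comes back unclassified; a minimal sketch of the expected handling (the drop policy shown is illustrative):

    /* Hypothetical: classify a chain; free what didn't match any flow. */
    mblk_t *rest = mac_rx_flow(mh, mrh, mp_chain);

    if (rest != NULL)
    	freemsgchain(rest);	/* or hand off to a default delivery path */
]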
- mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
- bcopy(mac_pdata, mip->mi_pdata, dsize);
- mip->mi_pdata_size = dsize;
+static int
+mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
+{
+ mac_ring_handle_t ring = arg;
- /*
- * Since the MAC plugin data is used to construct MAC headers that
- * were cached in fast-path headers, we need to flush fast-path
- * information for links associated with this mac.
- */
- i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
+ if (flent->fe_tx_srs)
+ mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
return (0);
}
void
-mac_multicst_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
- boolean_t add)
+i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_multicst_addr_t *p;
+ mac_client_impl_t *cclient;
+ mac_soft_ring_set_t *mac_srs;
/*
- * If no specific refresh function was given then default to the
- * driver's m_multicst entry point.
+ * After grabbing the mi_rw_lock, the list of clients can't change.
+	 * If there are any clients, mi_disabled must be B_FALSE and can't
+	 * get set while those clients exist. If there aren't any clients, we
+ * don't do anything. In any case the mip has to be valid. The driver
+ * must make sure that it goes single threaded (with respect to mac
+ * calls) and wait for all pending mac calls to finish before calling
+ * mac_unregister.
*/
- if (refresh == NULL) {
- refresh = mip->mi_multicst;
- arg = mip->mi_driver;
+ rw_enter(&i_mac_impl_lock, RW_READER);
+ if (mip->mi_state_flags & MIS_DISABLED) {
+ rw_exit(&i_mac_impl_lock);
+ return;
}
- ASSERT(refresh != NULL);
/*
- * Walk the multicast address list and call the refresh function for
- * each address.
+ * Get MAC tx srs from walking mac_client_handle list.
*/
- rw_enter(&(mip->mi_data_lock), RW_READER);
- for (p = mip->mi_mmap; p != NULL; p = p->mma_nextp)
- refresh(arg, add, p->mma_addr);
- rw_exit(&(mip->mi_data_lock));
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+ for (cclient = mip->mi_clients_list; cclient != NULL;
+ cclient = cclient->mci_client_next) {
+ if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL)
+ mac_tx_srs_wakeup(mac_srs, ring);
+ if (!FLOW_TAB_EMPTY(cclient->mci_subflow_tab)) {
+ (void) mac_flow_walk_nolock(cclient->mci_subflow_tab,
+ mac_tx_flow_srs_wakeup, ring);
+ }
+ }
+ rw_exit(&mip->mi_rw_lock);
+ rw_exit(&i_mac_impl_lock);
}
+/* ARGSUSED */
void
-mac_unicst_refresh(mac_handle_t mh, mac_unicst_t refresh, void *arg)
+mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
+ boolean_t add)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ i_mac_perim_enter((mac_impl_t *)mh);
/*
* If no specific refresh function was given then default to the
- * driver's mi_unicst entry point.
+ * driver's m_multicst entry point.
*/
if (refresh == NULL) {
- refresh = mip->mi_unicst;
+ refresh = mip->mi_multicst;
arg = mip->mi_driver;
}
- ASSERT(refresh != NULL);
- /*
- * Call the refresh function with the current unicast address.
- */
- refresh(arg, mip->mi_addr);
+ mac_bcast_refresh(mip, refresh, arg, add);
+ i_mac_perim_exit((mac_impl_t *)mh);
}
void
@@ -2352,7 +2416,7 @@ mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
mac_margin_req_t **pp, *p;
int err = 0;
- rw_enter(&(mip->mi_data_lock), RW_WRITER);
+ rw_enter(&(mip->mi_rw_lock), RW_WRITER);
if (current)
*marginp = mip->mi_margin;
@@ -2369,7 +2433,7 @@ mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
* Check whether the given margin is already in the list. If so,
* bump the reference count.
*/
- for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
+ for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
if (p->mmr_margin == *marginp) {
/*
* The margin requested is already in the list,
@@ -2383,18 +2447,14 @@ mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
}
- if ((p = kmem_zalloc(sizeof (mac_margin_req_t), KM_NOSLEEP)) == NULL) {
- err = ENOMEM;
- goto done;
- }
-
+ p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
p->mmr_margin = *marginp;
p->mmr_ref++;
p->mmr_nextp = *pp;
*pp = p;
done:
- rw_exit(&(mip->mi_data_lock));
+ rw_exit(&(mip->mi_rw_lock));
return (err);
}
@@ -2409,7 +2469,7 @@ mac_margin_remove(mac_handle_t mh, uint32_t margin)
mac_margin_req_t **pp, *p;
int err = 0;
- rw_enter(&(mip->mi_data_lock), RW_WRITER);
+ rw_enter(&(mip->mi_rw_lock), RW_WRITER);
/*
* Find the entry in the list for the given margin.
*/
@@ -2442,30 +2502,17 @@ mac_margin_remove(mac_handle_t mh, uint32_t margin)
*pp = p->mmr_nextp;
kmem_free(p, sizeof (mac_margin_req_t));
done:
- rw_exit(&(mip->mi_data_lock));
+ rw_exit(&(mip->mi_rw_lock));
return (err);
}
-/*
- * The mac client requests to get the mac's current margin value.
- */
-void
-mac_margin_get(mac_handle_t mh, uint32_t *marginp)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
-
- rw_enter(&(mip->mi_data_lock), RW_READER);
- *marginp = mip->mi_margin;
- rw_exit(&(mip->mi_data_lock));
-}
-
boolean_t
mac_margin_update(mac_handle_t mh, uint32_t margin)
{
mac_impl_t *mip = (mac_impl_t *)mh;
uint32_t margin_needed = 0;
- rw_enter(&(mip->mi_data_lock), RW_WRITER);
+ rw_enter(&(mip->mi_rw_lock), RW_WRITER);
if (mip->mi_mmrp != NULL)
margin_needed = mip->mi_mmrp->mmr_margin;
@@ -2473,7 +2520,7 @@ mac_margin_update(mac_handle_t mh, uint32_t margin)
if (margin_needed <= margin)
mip->mi_margin = margin;
- rw_exit(&(mip->mi_data_lock));
+ rw_exit(&(mip->mi_rw_lock));
if (margin_needed <= margin)
i_mac_notify(mip, MAC_NOTE_MARGIN);
@@ -2481,287 +2528,48 @@ mac_margin_update(mac_handle_t mh, uint32_t margin)
return (margin_needed <= margin);
}
-boolean_t
-mac_do_active_set(mac_handle_t mh, boolean_t shareable)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
-
- mutex_enter(&mip->mi_activelink_lock);
- if (mip->mi_activelink) {
- mutex_exit(&mip->mi_activelink_lock);
- return (B_FALSE);
- }
- mip->mi_activelink = B_TRUE;
- mip->mi_shareable = shareable;
- mutex_exit(&mip->mi_activelink_lock);
- return (B_TRUE);
-}
-
/*
- * Called by MAC clients. By default, active MAC clients cannot
- * share the NIC with VNICs.
+ * MAC Type Plugin functions.
*/
-boolean_t
-mac_active_set(mac_handle_t mh)
-{
- return (mac_do_active_set(mh, B_FALSE));
-}
-/*
- * Called by MAC clients which can share the NIC with VNICS, e.g. DLS.
- */
-boolean_t
-mac_active_shareable_set(mac_handle_t mh)
+mactype_t *
+mactype_getplugin(const char *pname)
{
- return (mac_do_active_set(mh, B_TRUE));
-}
-
-void
-mac_active_clear(mac_handle_t mh)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
-
- mutex_enter(&mip->mi_activelink_lock);
- ASSERT(mip->mi_activelink);
- mip->mi_activelink = B_FALSE;
- mutex_exit(&mip->mi_activelink_lock);
-}
-
-boolean_t
-mac_vnic_set(mac_handle_t mh, mac_txinfo_t *tx_info, mac_getcapab_t getcapab_fn,
- void *getcapab_arg)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_vnic_tx_t *vnic_tx;
+ mactype_t *mtype = NULL;
+ boolean_t tried_modload = B_FALSE;
- mutex_enter(&mip->mi_activelink_lock);
- rw_enter(&mip->mi_tx_lock, RW_WRITER);
- ASSERT(!mip->mi_vnic_present);
+ mutex_enter(&i_mactype_lock);
- if (mip->mi_activelink && !mip->mi_shareable) {
+find_registered_mactype:
+ if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
+ (mod_hash_val_t *)&mtype) != 0) {
+ if (!tried_modload) {
+ /*
+ * If the plugin has not yet been loaded, then
+ * attempt to load it now. If modload() succeeds,
+ * the plugin should have registered using
+ * mactype_register(), in which case we can go back
+ * and attempt to find it again.
+ */
+ if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
+ tried_modload = B_TRUE;
+ goto find_registered_mactype;
+ }
+ }
+ } else {
/*
- * The NIC is already used by an active client which cannot
- * share it with VNICs.
+ * Note that there's no danger that the plugin we've loaded
+ * could be unloaded between the modload() step and the
+ * reference count bump here, as we're holding
+ * i_mactype_lock, which mactype_unregister() also holds.
*/
- rw_exit(&mip->mi_tx_lock);
- mutex_exit(&mip->mi_activelink_lock);
- return (B_FALSE);
- }
-
- vnic_tx = kmem_cache_alloc(mac_vnic_tx_cache, KM_SLEEP);
- vnic_tx->mv_refs = 0;
- vnic_tx->mv_txinfo = *tx_info;
- vnic_tx->mv_clearing = B_FALSE;
-
- mip->mi_vnic_present = B_TRUE;
- mip->mi_vnic_tx = vnic_tx;
- mip->mi_vnic_getcapab_fn = getcapab_fn;
- mip->mi_vnic_getcapab_arg = getcapab_arg;
- rw_exit(&mip->mi_tx_lock);
- mutex_exit(&mip->mi_activelink_lock);
-
- i_mac_notify(mip, MAC_NOTE_VNIC);
- return (B_TRUE);
-}
-
-void
-mac_vnic_clear(mac_handle_t mh)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_vnic_tx_t *vnic_tx;
-
- rw_enter(&mip->mi_tx_lock, RW_WRITER);
- ASSERT(mip->mi_vnic_present);
- mip->mi_vnic_present = B_FALSE;
- /*
- * Setting mi_vnic_tx to NULL here under the lock guarantees
- * that no new references to the current VNIC transmit structure
- * will be taken by mac_vnic_tx(). This is a necessary condition
- * for safely waiting for the reference count to drop to
- * zero below.
- */
- vnic_tx = mip->mi_vnic_tx;
- mip->mi_vnic_tx = NULL;
- mip->mi_vnic_getcapab_fn = NULL;
- mip->mi_vnic_getcapab_arg = NULL;
- rw_exit(&mip->mi_tx_lock);
-
- i_mac_notify(mip, MAC_NOTE_VNIC);
-
- /*
- * Wait for all TX calls referencing the VNIC transmit
- * entry point that was removed to complete.
- */
- mutex_enter(&vnic_tx->mv_lock);
- vnic_tx->mv_clearing = B_TRUE;
- while (vnic_tx->mv_refs > 0)
- cv_wait(&vnic_tx->mv_cv, &vnic_tx->mv_lock);
- mutex_exit(&vnic_tx->mv_lock);
- kmem_cache_free(mac_vnic_tx_cache, vnic_tx);
-}
-
-/*
- * mac_info_get() is used for retrieving the mac_info when a DL_INFO_REQ is
- * issued before a DL_ATTACH_REQ. we walk the i_mac_impl_hash table and find
- * the first mac_impl_t with a matching driver name; then we copy its mac_info_t
- * to the caller. we do all this with i_mac_impl_lock held so the mac_impl_t
- * cannot disappear while we are accessing it.
- */
-typedef struct i_mac_info_state_s {
- const char *mi_name;
- mac_info_t *mi_infop;
-} i_mac_info_state_t;
-
-/*ARGSUSED*/
-static uint_t
-i_mac_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
-{
- i_mac_info_state_t *statep = arg;
- mac_impl_t *mip = (mac_impl_t *)val;
-
- if (mip->mi_disabled)
- return (MH_WALK_CONTINUE);
-
- if (strcmp(statep->mi_name,
- ddi_driver_name(mip->mi_dip)) != 0)
- return (MH_WALK_CONTINUE);
-
- statep->mi_infop = &mip->mi_info;
- return (MH_WALK_TERMINATE);
-}
-
-boolean_t
-mac_info_get(const char *name, mac_info_t *minfop)
-{
- i_mac_info_state_t state;
-
- rw_enter(&i_mac_impl_lock, RW_READER);
- state.mi_name = name;
- state.mi_infop = NULL;
- mod_hash_walk(i_mac_impl_hash, i_mac_info_walker, &state);
- if (state.mi_infop == NULL) {
- rw_exit(&i_mac_impl_lock);
- return (B_FALSE);
- }
- *minfop = *state.mi_infop;
- rw_exit(&i_mac_impl_lock);
- return (B_TRUE);
-}
-
-boolean_t
-mac_do_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data,
- boolean_t is_vnic)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
-
- if (!is_vnic) {
- rw_enter(&mip->mi_tx_lock, RW_READER);
- if (mip->mi_vnic_present) {
- boolean_t rv;
-
- rv = mip->mi_vnic_getcapab_fn(mip->mi_vnic_getcapab_arg,
- cap, cap_data);
- rw_exit(&mip->mi_tx_lock);
- return (rv);
- }
- rw_exit(&mip->mi_tx_lock);
- }
-
- if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB)
- return (mip->mi_getcapab(mip->mi_driver, cap, cap_data));
- else
- return (B_FALSE);
-}
-
-boolean_t
-mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
-{
- return (mac_do_capab_get(mh, cap, cap_data, B_FALSE));
-}
-
-boolean_t
-mac_vnic_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
-{
- return (mac_do_capab_get(mh, cap, cap_data, B_TRUE));
-}
-
-boolean_t
-mac_sap_verify(mac_handle_t mh, uint32_t sap, uint32_t *bind_sap)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
- return (mip->mi_type->mt_ops.mtops_sap_verify(sap, bind_sap,
- mip->mi_pdata));
-}
-
-mblk_t *
-mac_header(mac_handle_t mh, const uint8_t *daddr, uint32_t sap, mblk_t *payload,
- size_t extra_len)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
- return (mip->mi_type->mt_ops.mtops_header(mip->mi_addr, daddr, sap,
- mip->mi_pdata, payload, extra_len));
-}
-
-int
-mac_header_info(mac_handle_t mh, mblk_t *mp, mac_header_info_t *mhip)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
- return (mip->mi_type->mt_ops.mtops_header_info(mp, mip->mi_pdata,
- mhip));
-}
-
-mblk_t *
-mac_header_cook(mac_handle_t mh, mblk_t *mp)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
- if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_COOK) {
- if (DB_REF(mp) > 1) {
- mblk_t *newmp = copymsg(mp);
- if (newmp == NULL)
- return (NULL);
- freemsg(mp);
- mp = newmp;
- }
- return (mip->mi_type->mt_ops.mtops_header_cook(mp,
- mip->mi_pdata));
- }
- return (mp);
-}
-
-mblk_t *
-mac_header_uncook(mac_handle_t mh, mblk_t *mp)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
- if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_UNCOOK) {
- if (DB_REF(mp) > 1) {
- mblk_t *newmp = copymsg(mp);
- if (newmp == NULL)
- return (NULL);
- freemsg(mp);
- mp = newmp;
- }
- return (mip->mi_type->mt_ops.mtops_header_uncook(mp,
- mip->mi_pdata));
+ atomic_inc_32(&mtype->mt_ref);
}
- return (mp);
-}
-
-void
-mac_init_ops(struct dev_ops *ops, const char *name)
-{
- dld_init_ops(ops, name);
-}
-void
-mac_fini_ops(struct dev_ops *ops)
-{
- dld_fini_ops(ops);
+ mutex_exit(&i_mactype_lock);
+ return (mtype);
}
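[Editor's note: for illustration, a registration path could resolve a plugin by name roughly as follows (the caller context and error handling are assumptions; MAC_PLUGIN_IDENT_ETHER is the standard Ethernet plugin identifier):

    /* Hypothetical: resolve the Ethernet MAC-type plugin, bail if missing. */
    mactype_t *mtype;

    if ((mtype = mactype_getplugin(MAC_PLUGIN_IDENT_ETHER)) == NULL)
    	return (EINVAL);	/* not registered and modload() failed */
]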
-/*
- * MAC Type Plugin functions.
- */
-
mactype_register_t *
mactype_alloc(uint_t mactype_version)
{
@@ -2878,19 +2686,70 @@ done:
return (err);
}
+/*
+ * Returns TRUE when the specified property is intended for the MAC framework,
+ * as opposed to a driver-defined property.
+ */
+static boolean_t
+mac_is_macprop(mac_prop_t *macprop)
+{
+ switch (macprop->mp_id) {
+ case MAC_PROP_MAXBW:
+ case MAC_PROP_PRIO:
+ case MAC_PROP_BIND_CPU:
+ return (B_TRUE);
+ default:
+ return (B_FALSE);
+ }
+}
+
+/*
+ * mac_set_prop() sets MAC or hardware driver properties:
+ * MAC properties include maxbw, priority, and the CPU binding list. Driver
+ * properties are private to the hardware, such as MTU, speed, etc.
+ * If the property is a driver property, mac_set_prop() calls the driver's
+ * callback function to set it.
+ * If the property is a MAC property, mac_set_prop() invokes
+ * mac_set_resources(), which caches the property value in the mac_impl_t and
+ * may call mac_client_set_resource() to update the property value of the
+ * primary MAC client, if it exists.
+ */
int
mac_set_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize)
{
int err = ENOTSUP;
mac_impl_t *mip = (mac_impl_t *)mh;
+ ASSERT(MAC_PERIM_HELD(mh));
+
+ /* If it is mac property, call mac_set_resources() */
+ if (mac_is_macprop(macprop)) {
+ mac_resource_props_t mrp;
+
+ if (valsize < sizeof (mac_resource_props_t))
+ return (EINVAL);
+ bzero(&mrp, sizeof (mac_resource_props_t));
+ bcopy(val, &mrp, sizeof (mrp));
+ return (mac_set_resources(mh, &mrp));
+ }
+ /* For driver properties, call driver's callback */
if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
macprop->mp_name, macprop->mp_id, valsize, val);
}
+
return (err);
}
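[Editor's note: as an illustration of the MAC-property branch above, a caller could apply a bandwidth cap roughly as follows (a sketch; the perimeter entry is elided, and the MRP_MAXBW mask usage is an assumption about mac_resource_props_t):

    /* Hypothetical: request a 100 Mbps maxbw limit via the MAC property path. */
    mac_prop_t prop;
    mac_resource_props_t mrp;
    int err;

    bzero(&prop, sizeof (prop));
    bzero(&mrp, sizeof (mrp));
    mrp.mrp_mask = MRP_MAXBW;		/* assumed mask flag for maxbw */
    mrp.mrp_maxbw = 100000000ULL;	/* bits per second */

    prop.mp_id = MAC_PROP_MAXBW;
    err = mac_set_prop(mh, &prop, &mrp, sizeof (mrp));
]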
+/*
+ * mac_get_prop() gets mac or hardware driver properties.
+ *
+ * If the property is a driver property, mac_get_prop() calls the driver's
+ * callback function to get it.
+ * If the property is a MAC property, mac_get_prop() invokes
+ * mac_get_resources(), which returns the value cached in the mac_impl_t.
+ */
int
mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize,
uint_t *perm)
@@ -2900,6 +2759,18 @@ mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize,
uint32_t sdu;
link_state_t link_state;
+ /* If mac property, read from cache */
+ if (mac_is_macprop(macprop)) {
+ mac_resource_props_t mrp;
+
+ if (valsize < sizeof (mac_resource_props_t))
+ return (EINVAL);
+ bzero(&mrp, sizeof (mac_resource_props_t));
+ mac_get_resources(mh, &mrp);
+ bcopy(&mrp, val, sizeof (mac_resource_props_t));
+ return (0);
+ }
+
switch (macprop->mp_id) {
case MAC_PROP_MTU:
if (valsize < sizeof (sdu))
@@ -2932,7 +2803,9 @@ mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize,
return (0);
default:
break;
}
+ /* If driver property, request from driver */
if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) {
err = mip->mi_callbacks->mc_getprop(mip->mi_driver,
macprop->mp_name, macprop->mp_id, macprop->mp_flags,
@@ -2941,21 +2814,7 @@ mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize,
return (err);
}
-int
-mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
-
- if (sdu_max <= mip->mi_sdu_min)
- return (EINVAL);
- mip->mi_sdu_max = sdu_max;
-
- /* Send a MAC_NOTE_SDU_SIZE notification. */
- i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
- return (0);
-}
-
-static void
+void
mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
{
mac_priv_prop_t *mpriv;
@@ -2969,7 +2828,7 @@ mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
mip->mi_priv_prop_count = nprop;
}
-static void
+void
mac_unregister_priv_prop(mac_impl_t *mip)
{
mac_priv_prop_t *mpriv;
@@ -2981,3 +2840,2283 @@ mac_unregister_priv_prop(mac_impl_t *mip)
}
mip->mi_priv_prop_count = 0;
}
+
+/*
+ * mac_ring_t 'mr' management. Some rogue drivers may access the ring
+ * structure (by invoking mac_rx()) even after processing mac_stop_ring().
+ * In such cases, if MAC freed the ring structure after mac_stop_ring(), any
+ * illegal access to the ring structure coming from the driver would panic
+ * the system. In order to protect the system from such inadvertent access,
+ * we maintain a cache of rings in the mac_impl_t after they get freed up.
+ * When packets are received on freed rings, MAC (through the generation
+ * count mechanism) will drop such packets.
+ */
+static mac_ring_t *
+mac_ring_alloc(mac_impl_t *mip, mac_capab_rings_t *cap_rings)
+{
+ mac_ring_t *ring;
+
+ if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
+ mutex_enter(&mip->mi_ring_lock);
+ if (mip->mi_ring_freelist != NULL) {
+ ring = mip->mi_ring_freelist;
+ mip->mi_ring_freelist = ring->mr_next;
+ bzero(ring, sizeof (mac_ring_t));
+ } else {
+ ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
+ }
+ mutex_exit(&mip->mi_ring_lock);
+ } else {
+ ring = kmem_zalloc(sizeof (mac_ring_t), KM_SLEEP);
+ }
+ ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
+ return (ring);
+}
+
+static void
+mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
+{
+ if (ring->mr_type == MAC_RING_TYPE_RX) {
+ mutex_enter(&mip->mi_ring_lock);
+ ring->mr_state = MR_FREE;
+ ring->mr_flag = 0;
+ ring->mr_next = mip->mi_ring_freelist;
+ mip->mi_ring_freelist = ring;
+ mutex_exit(&mip->mi_ring_lock);
+ } else {
+ kmem_free(ring, sizeof (mac_ring_t));
+ }
+}
+
+static void
+mac_ring_freeall(mac_impl_t *mip)
+{
+ mac_ring_t *ring_next;
+ mutex_enter(&mip->mi_ring_lock);
+ mac_ring_t *ring = mip->mi_ring_freelist;
+ while (ring != NULL) {
+ ring_next = ring->mr_next;
+ kmem_cache_free(mac_ring_cache, ring);
+ ring = ring_next;
+ }
+ mip->mi_ring_freelist = NULL;
+ mutex_exit(&mip->mi_ring_lock);
+}
+
+int
+mac_start_ring(mac_ring_t *ring)
+{
+ int rv = 0;
+
+ if (ring->mr_start != NULL)
+ rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
+
+ return (rv);
+}
+
+void
+mac_stop_ring(mac_ring_t *ring)
+{
+ if (ring->mr_stop != NULL)
+ ring->mr_stop(ring->mr_driver);
+
+ /*
+ * Increment the ring generation number for this ring.
+ */
+ ring->mr_gen_num++;
+}
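[Editor's note: the bumped generation number is what lets the RX path reject late packets from a stopped ring: the driver hands back the mr_gen_num it was started with, and the framework compares it against the ring's current value. A hedged sketch of that comparison (the helper and its exact place in the RX path are illustrative; the uint64_t type of mr_gen_num is assumed):

    /*
     * Hypothetical: a packet is stale if it carries a generation number
     * that differs from the ring's current one, i.e. the ring was stopped
     * (and possibly restarted) after the driver queued the packet.
     */
    static boolean_t
    my_ring_gen_valid(mac_ring_t *ring, uint64_t gen_num)
    {
    	return (gen_num == ring->mr_gen_num);
    }
]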
+
+int
+mac_start_group(mac_group_t *group)
+{
+ int rv = 0;
+
+ if (group->mrg_start != NULL)
+ rv = group->mrg_start(group->mrg_driver);
+
+ return (rv);
+}
+
+void
+mac_stop_group(mac_group_t *group)
+{
+ if (group->mrg_stop != NULL)
+ group->mrg_stop(group->mrg_driver);
+}
+
+/*
+ * Called from mac_start() on the default Rx group. Broadcast and multicast
+ * packets are received only on the default group. Hence the default group
+ * needs to be up even if the primary client is not up, for the other groups
+ * to be functional. We do this by calling this function at mac_start time
+ * itself. However the broadcast packets that are received can't make their
+ * way beyond mac_rx until a mac client creates a broadcast flow.
+ */
+static int
+mac_start_group_and_rings(mac_group_t *group)
+{
+ mac_ring_t *ring;
+ int rv = 0;
+
+ ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
+ if ((rv = mac_start_group(group)) != 0)
+ return (rv);
+
+ for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
+ ASSERT(ring->mr_state == MR_FREE);
+ if ((rv = mac_start_ring(ring)) != 0)
+ goto error;
+ ring->mr_state = MR_INUSE;
+ ring->mr_classify_type = MAC_SW_CLASSIFIER;
+ }
+ return (0);
+
+error:
+ mac_stop_group_and_rings(group);
+ return (rv);
+}
+
+/* Called from mac_stop on the default Rx group */
+static void
+mac_stop_group_and_rings(mac_group_t *group)
+{
+ mac_ring_t *ring;
+
+ for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
+ if (ring->mr_state != MR_FREE) {
+ mac_stop_ring(ring);
+ ring->mr_state = MR_FREE;
+ ring->mr_flag = 0;
+ ring->mr_classify_type = MAC_NO_CLASSIFIER;
+ }
+ }
+ mac_stop_group(group);
+}
+
+
+static mac_ring_t *
+mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
+ mac_capab_rings_t *cap_rings)
+{
+ mac_ring_t *ring;
+ mac_ring_info_t ring_info;
+
+ ring = mac_ring_alloc(mip, cap_rings);
+
+ /* Prepare basic information of ring */
+ ring->mr_index = index;
+ ring->mr_type = group->mrg_type;
+ ring->mr_gh = (mac_group_handle_t)group;
+
+ /* Insert the new ring to the list. */
+ ring->mr_next = group->mrg_rings;
+ group->mrg_rings = ring;
+
+ /* Zero to reuse the info data structure */
+ bzero(&ring_info, sizeof (ring_info));
+
+ /* Query ring information from driver */
+ cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
+ index, &ring_info, (mac_ring_handle_t)ring);
+
+ ring->mr_info = ring_info;
+
+ /* Update ring's status */
+ ring->mr_state = MR_FREE;
+ ring->mr_flag = 0;
+
+ /* Update the ring count of the group */
+ group->mrg_cur_count++;
+ return (ring);
+}
+
+/*
+ * Rings are chained together for easy regrouping.
+ */
+static void
+mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
+ mac_capab_rings_t *cap_rings)
+{
+ int index;
+
+ /*
+	 * Initialize all ring members of this group. A size of zero will not
+ * enter the loop, so it's safe for initializing an empty group.
+ */
+ for (index = size - 1; index >= 0; index--)
+ (void) mac_init_ring(mip, group, index, cap_rings);
+}
+
+int
+mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
+{
+ mac_capab_rings_t *cap_rings;
+ mac_group_t *group, *groups;
+ mac_group_info_t group_info;
+ uint_t group_free = 0;
+ uint_t ring_left;
+ mac_ring_t *ring;
+ int g, err = 0;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX:
+ ASSERT(mip->mi_rx_groups == NULL);
+
+ cap_rings = &mip->mi_rx_rings_cap;
+ cap_rings->mr_type = MAC_RING_TYPE_RX;
+ break;
+ case MAC_RING_TYPE_TX:
+ ASSERT(mip->mi_tx_groups == NULL);
+
+ cap_rings = &mip->mi_tx_rings_cap;
+ cap_rings->mr_type = MAC_RING_TYPE_TX;
+ break;
+ default:
+ ASSERT(B_FALSE);
+ }
+
+ if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS,
+ cap_rings))
+ return (0);
+
+ /*
+ * Allocate a contiguous buffer for all groups.
+ */
+ groups = kmem_zalloc(sizeof (mac_group_t) * (cap_rings->mr_gnum + 1),
+ KM_SLEEP);
+
+ ring_left = cap_rings->mr_rnum;
+
+ /*
+ * Get all ring groups if any, and get their ring members
+ * if any.
+ */
+ for (g = 0; g < cap_rings->mr_gnum; g++) {
+ group = groups + g;
+
+ /* Prepare basic information of the group */
+ group->mrg_index = g;
+ group->mrg_type = rtype;
+ group->mrg_state = MAC_GROUP_STATE_UNINIT;
+ group->mrg_mh = (mac_handle_t)mip;
+ group->mrg_next = group + 1;
+
+ /* Zero to reuse the info data structure */
+ bzero(&group_info, sizeof (group_info));
+
+ /* Query group information from driver */
+ cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
+ (mac_group_handle_t)group);
+
+ switch (cap_rings->mr_group_type) {
+ case MAC_GROUP_TYPE_DYNAMIC:
+ if (cap_rings->mr_gaddring == NULL ||
+ cap_rings->mr_gremring == NULL) {
+ DTRACE_PROBE3(
+ mac__init__rings_no_addremring,
+ char *, mip->mi_name,
+ mac_group_add_ring_t,
+ cap_rings->mr_gaddring,
+ mac_group_add_ring_t,
+ cap_rings->mr_gremring);
+ err = EINVAL;
+ goto bail;
+ }
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX:
+ /*
+ * The first RX group must have non-zero
+				 * The first RX group must have a non-zero
+				 * number of rings, and the following groups
+				 * must have zero rings.
+ if (g == 0 && group_info.mgi_count == 0) {
+ DTRACE_PROBE1(
+ mac__init__rings__rx__def__zero,
+ char *, mip->mi_name);
+ err = EINVAL;
+ goto bail;
+ }
+ if (g > 0 && group_info.mgi_count != 0) {
+ DTRACE_PROBE3(
+ mac__init__rings__rx__nonzero,
+ char *, mip->mi_name,
+ int, g, int, group_info.mgi_count);
+ err = EINVAL;
+ goto bail;
+ }
+ break;
+ case MAC_RING_TYPE_TX:
+ /*
+ * All TX ring groups must have zero rings.
+ */
+ if (group_info.mgi_count != 0) {
+ DTRACE_PROBE3(
+ mac__init__rings__tx__nonzero,
+ char *, mip->mi_name,
+ int, g, int, group_info.mgi_count);
+ err = EINVAL;
+ goto bail;
+ }
+ break;
+ }
+ break;
+ case MAC_GROUP_TYPE_STATIC:
+ /*
+ * Note that an empty group is allowed, e.g., an aggr
+ * would start with an empty group.
+ */
+ break;
+ default:
+ /* unknown group type */
+ DTRACE_PROBE2(mac__init__rings__unknown__type,
+ char *, mip->mi_name,
+ int, cap_rings->mr_group_type);
+ err = EINVAL;
+ goto bail;
+ }
+
+		/*
+		 * The driver must register group_info.mgi_addmac()/mgi_remmac()
+		 * for RX groups in order to support multiple MAC addresses.
+		 */
+		if (rtype == MAC_RING_TYPE_RX) {
+			if ((group_info.mgi_addmac == NULL) ||
+			    (group_info.mgi_remmac == NULL)) {
+				err = EINVAL;
+				goto bail;
+			}
+		}
+
+ /* Cache driver-supplied information */
+ group->mrg_info = group_info;
+
+ /* Update the group's status and group count. */
+ mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
+ group_free++;
+
+ group->mrg_rings = NULL;
+ group->mrg_cur_count = 0;
+ mac_init_group(mip, group, group_info.mgi_count, cap_rings);
+ ring_left -= group_info.mgi_count;
+
+		/* The current group size should be equal to the default value */
+ ASSERT(group->mrg_cur_count == group_info.mgi_count);
+ }
+
+ /* Build up a dummy group for free resources as a pool */
+ group = groups + cap_rings->mr_gnum;
+
+ /* Prepare basic information of the group */
+ group->mrg_index = -1;
+ group->mrg_type = rtype;
+ group->mrg_state = MAC_GROUP_STATE_UNINIT;
+ group->mrg_mh = (mac_handle_t)mip;
+ group->mrg_next = NULL;
+
+ /*
+	 * If there are ungrouped rings, initialize the dummy group with the
+	 * remaining rings.
+ */
+ if (ring_left != 0) {
+ group->mrg_rings = NULL;
+ group->mrg_cur_count = 0;
+ mac_init_group(mip, group, ring_left, cap_rings);
+
+ /* The current group size should be equal to ring_left */
+ ASSERT(group->mrg_cur_count == ring_left);
+
+ ring_left = 0;
+
+ /* Update this group's status */
+ mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
+ } else
+ group->mrg_rings = NULL;
+
+ ASSERT(ring_left == 0);
+
+bail:
+ /* Cache other important information to finalize the initialization */
+ switch (rtype) {
+ case MAC_RING_TYPE_RX:
+ mip->mi_rx_group_type = cap_rings->mr_group_type;
+ mip->mi_rx_group_count = cap_rings->mr_gnum;
+ mip->mi_rx_groups = groups;
+ break;
+ case MAC_RING_TYPE_TX:
+ mip->mi_tx_group_type = cap_rings->mr_group_type;
+ mip->mi_tx_group_count = cap_rings->mr_gnum;
+ mip->mi_tx_group_free = group_free;
+ mip->mi_tx_groups = groups;
+
+ /*
+ * Ring 0 is used as the default one and it could be assigned
+ * to a client as well.
+ */
+ group = groups + cap_rings->mr_gnum;
+ ring = group->mrg_rings;
+ while ((ring->mr_index != 0) && (ring->mr_next != NULL))
+ ring = ring->mr_next;
+ ASSERT(ring->mr_index == 0);
+ mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
+ break;
+ default:
+ ASSERT(B_FALSE);
+ }
+
+ if (err != 0)
+ mac_free_rings(mip, rtype);
+
+ return (err);
+}
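[Editor's note: on the driver side, mr_rget is expected to fill in the mac_ring_info_t for the ring being initialized above. A hedged sketch of such a callback, with the signature inferred from the call in mac_init_ring() (the driver-private types and the exact set of mri_* fields populated are assumptions):

    /* Hypothetical ring-info callback a driver registers as mr_rget. */
    static void
    mydrv_fill_ring(void *arg, mac_ring_type_t rtype, const int group_index,
        const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
    {
    	mydrv_t *dp = arg;				/* hypothetical softc */
    	mydrv_ring_t *rp = &dp->md_rings[ring_index];	/* hypothetical ring */

    	rp->mdr_handle = rh;		/* keep the framework's handle */
    	infop->mri_driver = (mac_ring_driver_t)rp;
    	infop->mri_start = mydrv_ring_start;	/* hypothetical callbacks */
    	infop->mri_stop = mydrv_ring_stop;
    	if (rtype == MAC_RING_TYPE_TX)
    		infop->mri_tx = mydrv_ring_tx;
    }
]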
+
+/*
+ * Called to free all ring groups of a particular type. It is assumed that all
+ * groups have been released by their clients.
+ */
+void
+mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
+{
+ mac_group_t *group, *groups;
+ uint_t group_count;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX:
+ if (mip->mi_rx_groups == NULL)
+ return;
+
+ groups = mip->mi_rx_groups;
+ group_count = mip->mi_rx_group_count;
+
+ mip->mi_rx_groups = NULL;
+ mip->mi_rx_group_count = 0;
+ break;
+ case MAC_RING_TYPE_TX:
+ ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
+
+ if (mip->mi_tx_groups == NULL)
+ return;
+
+ groups = mip->mi_tx_groups;
+ group_count = mip->mi_tx_group_count;
+
+ mip->mi_tx_groups = NULL;
+ mip->mi_tx_group_count = 0;
+ mip->mi_tx_group_free = 0;
+ mip->mi_default_tx_ring = NULL;
+ break;
+ default:
+ ASSERT(B_FALSE);
+ }
+
+ for (group = groups; group != NULL; group = group->mrg_next) {
+ mac_ring_t *ring;
+
+ if (group->mrg_cur_count == 0)
+ continue;
+
+ ASSERT(group->mrg_rings != NULL);
+
+ while ((ring = group->mrg_rings) != NULL) {
+ group->mrg_rings = ring->mr_next;
+ mac_ring_free(mip, ring);
+ }
+ }
+
+ /* Free all the cached rings */
+ mac_ring_freeall(mip);
+	/* Free the block of group data structures */
+ kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
+}
+
+/*
+ * Associate a MAC address with a receive group.
+ *
+ * The return value of this function should always be checked properly, because
+ * any type of failure could cause unexpected results. A MAC address can be
+ * added to or removed from a group only after the group has been reserved.
+ * Ideally, a successful reservation is always followed by a call to
+ * mac_group_addmac() to steer the desired traffic. Failure to add a unicast
+ * MAC address doesn't always imply that the group is functioning abnormally.
+ *
+ * Currently this function is called everywhere, and it reflects assumptions
+ * about MAC addresses in the implementation. CR 6735196.
+ */
+int
+mac_group_addmac(mac_group_t *group, const uint8_t *addr)
+{
+ ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
+ ASSERT(group->mrg_info.mgi_addmac != NULL);
+
+ return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
+}
+
+/*
+ * Remove the association between MAC address and receive group.
+ */
+int
+mac_group_remmac(mac_group_t *group, const uint8_t *addr)
+{
+ ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
+ ASSERT(group->mrg_info.mgi_remmac != NULL);
+
+ return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
+}
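[Editor's note: as the comment above mac_group_addmac() stresses, the return value must be checked; a minimal sketch of the intended pairing (the caller context and rollback policy are illustrative):

    /* Hypothetical: steer traffic for addr to a reserved group. */
    int err;

    if ((err = mac_group_addmac(group, addr)) != 0)
    	return (err);	/* e.g. hardware address slots exhausted */

    /* ... later, when the address is no longer needed ... */
    (void) mac_group_remmac(group, addr);
]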
+
+/*
+ * Release a ring in use by marking it MR_FREE.
+ * Any other client may reserve it for its use.
+ */
+void
+mac_release_tx_ring(mac_ring_handle_t rh)
+{
+ mac_ring_t *ring = (mac_ring_t *)rh;
+ mac_group_t *group = (mac_group_t *)ring->mr_gh;
+ mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ ASSERT(ring->mr_state != MR_FREE);
+
+ /*
+ * Default tx ring will be released by mac_stop().
+ */
+ if (rh == mip->mi_default_tx_ring)
+ return;
+
+ mac_stop_ring(ring);
+
+ ring->mr_state = MR_FREE;
+ ring->mr_flag = 0;
+}
+
+/*
+ * Send packets through a selected tx ring.
+ */
+mblk_t *
+mac_ring_tx(mac_ring_handle_t rh, mblk_t *mp)
+{
+ mac_ring_t *ring = (mac_ring_t *)rh;
+ mac_ring_info_t *info = &ring->mr_info;
+
+ ASSERT(ring->mr_type == MAC_RING_TYPE_TX);
+ ASSERT(ring->mr_state >= MR_INUSE);
+ ASSERT(info->mri_tx != NULL);
+
+ return (info->mri_tx(info->mri_driver, mp));
+}
+
+/*
+ * Find a ring from its index.
+ */
+mac_ring_t *
+mac_find_ring(mac_group_t *group, int index)
+{
+ mac_ring_t *ring = group->mrg_rings;
+
+ for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
+ if (ring->mr_index == index)
+ break;
+
+ return (ring);
+}
+/*
+ * Add a ring to an existing group.
+ *
+ * The ring must be either passed directly (for example if the ring
+ * movement is initiated by the framework), or specified through a driver
+ * index (for example when the ring is added by the driver).
+ *
+ * The caller needs to call mac_perim_enter() before calling this function.
+ */
+int
+i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
+{
+ mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+ mac_capab_rings_t *cap_rings;
+ boolean_t driver_call = (ring == NULL);
+ mac_group_type_t group_type;
+ int ret = 0;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ switch (group->mrg_type) {
+ case MAC_RING_TYPE_RX:
+ cap_rings = &mip->mi_rx_rings_cap;
+ group_type = mip->mi_rx_group_type;
+ break;
+ case MAC_RING_TYPE_TX:
+ cap_rings = &mip->mi_tx_rings_cap;
+ group_type = mip->mi_tx_group_type;
+ break;
+ default:
+ ASSERT(B_FALSE);
+ }
+
+ /*
+ * There should be no ring with the same ring index in the target
+ * group.
+ */
+ ASSERT(mac_find_ring(group, driver_call ? index : ring->mr_index) ==
+ NULL);
+
+ if (driver_call) {
+ /*
+ * The function is called as a result of a request from
+ * a driver to add a ring to an existing group, for example
+ * from the aggregation driver. Allocate a new mac_ring_t
+ * for that ring.
+ */
+ ring = mac_init_ring(mip, group, index, cap_rings);
+ ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
+ } else {
+ /*
+ * The function is called as a result of a MAC layer request
+ * to add a ring to an existing group. In this case the
+ * ring is being moved between groups, which requires
+ * the underlying driver to support dynamic grouping,
+ * and the mac_ring_t already exists.
+ */
+ ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
+ ASSERT(cap_rings->mr_gaddring != NULL);
+ ASSERT(ring->mr_gh == NULL);
+ }
+
+ /*
+	 * At this point the ring should not be in use, and it should be
+	 * of the right type for the target group.
+ */
+ ASSERT(ring->mr_state < MR_INUSE);
+ ASSERT(ring->mr_srs == NULL);
+ ASSERT(ring->mr_type == group->mrg_type);
+
+ if (!driver_call) {
+ /*
+		 * Add the driver-level hardware ring if the process was not
+		 * initiated by the driver, and the target group is backed by
+		 * a driver-level group.
+ */
+ if (group->mrg_driver != NULL) {
+ cap_rings->mr_gaddring(group->mrg_driver,
+ ring->mr_driver, ring->mr_type);
+ }
+
+ /*
+		 * Insert the ring ahead of the existing rings.
+ */
+ ring->mr_next = group->mrg_rings;
+ group->mrg_rings = ring;
+ ring->mr_gh = (mac_group_handle_t)group;
+ group->mrg_cur_count++;
+ }
+
+ /*
+ * If the group has not been actively used, we're done.
+ */
+ if (group->mrg_index != -1 &&
+ group->mrg_state < MAC_GROUP_STATE_RESERVED)
+ return (0);
+
+ /*
+ * Set up SRS/SR according to the ring type.
+ */
+ switch (ring->mr_type) {
+ case MAC_RING_TYPE_RX:
+ /*
+		 * Set up an SRS on top of the new ring if the group is
+		 * reserved for someone's exclusive use.
+ */
+ if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
+ flow_entry_t *flent;
+ mac_client_impl_t *mcip;
+
+ mcip = MAC_RX_GROUP_ONLY_CLIENT(group);
+ ASSERT(mcip != NULL);
+ flent = mcip->mci_flent;
+ ASSERT(flent->fe_rx_srs_cnt > 0);
+ mac_srs_group_setup(mcip, flent, group, SRST_LINK);
+ }
+ break;
+ case MAC_RING_TYPE_TX:
+ /*
+ * For TX this function is only invoked during the
+ * initial creation of a group when a share is
+ * associated with a MAC client. So the datapath is not
+ * yet setup, and will be setup later after the
+ * group has been reserved and populated.
+ */
+ break;
+ default:
+ ASSERT(B_FALSE);
+ }
+
+ /*
+	 * Start the ring if needed. On failure, undo the grouping action.
+ */
+ if ((ret = mac_start_ring(ring)) != 0) {
+ if (ring->mr_type == MAC_RING_TYPE_RX) {
+ if (ring->mr_srs != NULL) {
+ mac_rx_srs_remove(ring->mr_srs);
+ ring->mr_srs = NULL;
+ }
+ }
+ if (!driver_call) {
+ cap_rings->mr_gremring(group->mrg_driver,
+ ring->mr_driver, ring->mr_type);
+ }
+ group->mrg_cur_count--;
+ group->mrg_rings = ring->mr_next;
+
+ ring->mr_gh = NULL;
+
+ if (driver_call)
+ mac_ring_free(mip, ring);
+
+ return (ret);
+ }
+
+ /*
+ * Update the ring's state.
+ */
+ ring->mr_state = MR_INUSE;
+ MAC_RING_UNMARK(ring, MR_INCIPIENT);
+ return (0);
+}
+
+/*
+ * Remove a ring from its current group. MAC internal function for dynamic
+ * grouping.
+ *
+ * The caller needs to call mac_perim_enter() before calling this function.
+ */
+void
+i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
+ boolean_t driver_call)
+{
+ mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+ mac_capab_rings_t *cap_rings = NULL;
+ mac_group_type_t group_type;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ ASSERT(mac_find_ring(group, ring->mr_index) == ring);
+ ASSERT((mac_group_t *)ring->mr_gh == group);
+ ASSERT(ring->mr_type == group->mrg_type);
+
+ switch (ring->mr_type) {
+ case MAC_RING_TYPE_RX:
+ group_type = mip->mi_rx_group_type;
+ cap_rings = &mip->mi_rx_rings_cap;
+
+ if (group->mrg_state >= MAC_GROUP_STATE_RESERVED)
+ mac_stop_ring(ring);
+
+ /*
+ * Only hardware classified packets hold a reference to the
+ * ring all the way up the Rx path. mac_rx_srs_remove()
+ * will take care of quiescing the Rx path and removing the
+ * SRS. The software classified path neither holds a reference
+ * nor any association with the ring in mac_rx.
+ */
+ if (ring->mr_srs != NULL) {
+ mac_rx_srs_remove(ring->mr_srs);
+ ring->mr_srs = NULL;
+ }
+ ring->mr_state = MR_FREE;
+ ring->mr_flag = 0;
+
+ break;
+ case MAC_RING_TYPE_TX:
+ /*
+ * For TX this function is only invoked in two
+ * cases:
+ *
+ * 1) In the case of a failure during the
+ * initial creation of a group when a share is
+ * associated with a MAC client. So the SRS is not
+ * yet setup, and will be setup later after the
+ * group has been reserved and populated.
+ *
+ * 2) From mac_release_tx_group() when freeing
+ * a TX SRS.
+ *
+ * In both cases the SRS and its soft rings are
+ * already quiesced.
+ */
+ ASSERT(!driver_call);
+ group_type = mip->mi_tx_group_type;
+ cap_rings = &mip->mi_tx_rings_cap;
+ break;
+ default:
+ ASSERT(B_FALSE);
+ }
+
+ /*
+ * Remove the ring from the group.
+ */
+ if (ring == group->mrg_rings)
+ group->mrg_rings = ring->mr_next;
+ else {
+ mac_ring_t *pre;
+
+ pre = group->mrg_rings;
+ while (pre->mr_next != ring)
+ pre = pre->mr_next;
+ pre->mr_next = ring->mr_next;
+ }
+ group->mrg_cur_count--;
+
+ if (!driver_call) {
+ ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
+ ASSERT(cap_rings->mr_gremring != NULL);
+
+ /*
+ * Remove the driver level hardware ring.
+ */
+ if (group->mrg_driver != NULL) {
+ cap_rings->mr_gremring(group->mrg_driver,
+ ring->mr_driver, ring->mr_type);
+ }
+ }
+
+ ring->mr_gh = NULL;
+ if (driver_call) {
+ mac_ring_free(mip, ring);
+ } else {
+ ring->mr_state = MR_FREE;
+ ring->mr_flag = 0;
+ }
+}
+
+/*
+ * Move a ring to the target group. If needed, remove the ring from the group
+ * that it currently belongs to.
+ *
+ * The caller needs to enter the MAC's perimeter by calling mac_perim_enter().
+ */
+static int
+mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
+{
+ mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
+ int rv;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ ASSERT(d_group != NULL);
+ ASSERT(s_group->mrg_mh == d_group->mrg_mh);
+
+ if (s_group == d_group)
+ return (0);
+
+ /*
+ * Remove it from current group first.
+ */
+ if (s_group != NULL)
+ i_mac_group_rem_ring(s_group, ring, B_FALSE);
+
+ /*
+ * Add it to the new group.
+ */
+ rv = i_mac_group_add_ring(d_group, ring, 0);
+ if (rv != 0) {
+ /*
+		 * Failed to add the ring to the destination group; try to put
+		 * it back in the source group. If that also fails, the ring is
+		 * stuck in limbo, so log a message.
+ */
+ if (i_mac_group_add_ring(s_group, ring, 0)) {
+ cmn_err(CE_WARN, "%s: failed to move ring %p\n",
+ mip->mi_name, (void *)ring);
+ }
+ }
+
+ return (rv);
+}
+
+/*
+ * Find a MAC address according to its value.
+ */
+mac_address_t *
+mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
+{
+ mac_address_t *map;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
+ if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
+ break;
+ }
+
+ return (map);
+}
+
+/*
+ * Check whether the MAC address is shared by multiple clients.
+ */
+boolean_t
+mac_check_macaddr_shared(mac_address_t *map)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
+
+ return (map->ma_nusers > 1);
+}
+
+/*
+ * Enable a MAC address by enabling promiscuous mode.
+ */
+static int
+mac_add_macaddr_promisc(mac_impl_t *mip, mac_group_t *group)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /*
+	 * The current interface only allows enabling promiscuous mode on the
+	 * default group. Note that mip->mi_rx_groups might be NULL.
+ */
+ ASSERT(group == mip->mi_rx_groups);
+
+ if (group == mip->mi_rx_groups)
+ return (i_mac_promisc_set(mip, B_TRUE, MAC_DEVPROMISC));
+ else
+ return (ENOTSUP);
+}
+
+/*
+ * Remove a MAC address that was added by enabling promiscuous mode.
+ */
+static int
+mac_remove_macaddr_promisc(mac_impl_t *mip, mac_group_t *group)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ ASSERT(group == mip->mi_rx_groups);
+
+ return (i_mac_promisc_set(mip, B_FALSE, MAC_DEVPROMISC));
+}
+
+/*
+ * Remove the specified MAC address from the MAC address list and free it.
+ */
+static void
+mac_free_macaddr(mac_address_t *map)
+{
+ mac_impl_t *mip = map->ma_mip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ ASSERT(mip->mi_addresses != NULL);
+
+ map = mac_find_macaddr(mip, map->ma_addr);
+
+ ASSERT(map != NULL);
+ ASSERT(map->ma_nusers == 0);
+
+ if (map == mip->mi_addresses) {
+ mip->mi_addresses = map->ma_next;
+ } else {
+ mac_address_t *pre;
+
+ pre = mip->mi_addresses;
+ while (pre->ma_next != map)
+ pre = pre->ma_next;
+ pre->ma_next = map->ma_next;
+ }
+
+ kmem_free(map, sizeof (mac_address_t));
+}
+
+/*
+ * Add a MAC address reference for a client. If the desired MAC address
+ * exists, add a reference to it. Otherwise, add the new address by adding
+ * it to a reserved group or by setting promiscuous mode. Won't try a
+ * different group if the given group is non-NULL, so the caller must
+ * explicitly share the default group when needed.
+ *
+ * Note that the primary MAC address is initialized at registration time, so
+ * adding it to the default group only requires activating it if its reference
+ * count is still zero. Also, some drivers may not have advertised the RINGS
+ * capability.
+ */
+int
+mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr)
+{
+ mac_address_t *map;
+ int err = 0;
+ boolean_t allocated_map = B_FALSE;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ map = mac_find_macaddr(mip, mac_addr);
+
+ /*
+	 * If the new MAC address has not been added yet, allocate a new
+	 * entry and set it up.
+ */
+ if (map == NULL) {
+ map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
+ map->ma_len = mip->mi_type->mt_addr_length;
+ bcopy(mac_addr, map->ma_addr, map->ma_len);
+ map->ma_nusers = 0;
+ map->ma_group = group;
+ map->ma_mip = mip;
+
+ /* add the new MAC address to the head of the address list */
+ map->ma_next = mip->mi_addresses;
+ mip->mi_addresses = map;
+
+ allocated_map = B_TRUE;
+ }
+
+ ASSERT(map->ma_group == group);
+
+ /*
+ * If the MAC address is already in use, simply account for the
+ * new client.
+ */
+ if (map->ma_nusers++ > 0)
+ return (0);
+
+ /*
+ * Activate this MAC address by adding it to the reserved group.
+ */
+ if (group != NULL) {
+ err = mac_group_addmac(group, (const uint8_t *)mac_addr);
+ if (err == 0) {
+ map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+ return (0);
+ }
+ }
+
+ /*
+ * Try promiscuous mode. Note that rx_groups could be NULL, so we
+ * need to handle drivers that don't advertise the RINGS capability.
+ */
+ if (group == mip->mi_rx_groups) {
+ /*
+ * For drivers that don't advertise RINGS capability, do
+ * nothing for the primary address.
+ */
+ if ((group == NULL) &&
+ (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
+ map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+ return (0);
+ }
+
+ /*
+ * Enable promiscuous mode in order to receive traffic
+ * to the new MAC address.
+ */
+ err = mac_add_macaddr_promisc(mip, group);
+ if (err == 0) {
+ map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
+ return (0);
+ }
+ }
+
+ /*
+ * Free the MAC address that could not be added. Don't free
+	 * a pre-existing address; it could have been the entry
+ * for the primary MAC address which was pre-allocated by
+ * mac_init_macaddr(), and which must remain on the list.
+ */
+ map->ma_nusers--;
+ if (allocated_map)
+ mac_free_macaddr(map);
+ return (err);
+}
+
+/*
+ * Remove a reference to a MAC address. This may cause the MAC address to be
+ * removed from its associated group, or promiscuous mode to be turned off.
+ * The caller needs to handle the failure properly.
+ */
+int
+mac_remove_macaddr(mac_address_t *map)
+{
+ mac_impl_t *mip = map->ma_mip;
+ int err = 0;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
+
+ /*
+ * If it's not the last client using this MAC address, only update
+	 * the MAC client reference count.
+ */
+ if (--map->ma_nusers > 0)
+ return (0);
+
+ /*
+ * The MAC address is no longer used by any MAC client, so remove
+ * it from its associated group, or turn off promiscuous mode
+ * if it was enabled for the MAC address.
+ */
+ switch (map->ma_type) {
+ case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
+ /*
+ * Don't free the preset primary address for drivers that
+ * don't advertise RINGS capability.
+ */
+ if (map->ma_group == NULL)
+ return (0);
+
+ err = mac_group_remmac(map->ma_group, map->ma_addr);
+ break;
+ case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
+ err = mac_remove_macaddr_promisc(mip, map->ma_group);
+ break;
+ default:
+ ASSERT(B_FALSE);
+ }
+
+ if (err != 0)
+ return (err);
+
+ /*
+	 * We created the entry for the primary MAC address at registration, so
+	 * we won't free it here; mac_fini_macaddr() will take care of it.
+ */
+ if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
+ mac_free_macaddr(map);
+
+ return (0);
+}
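[Editor's note: together, mac_add_macaddr() and mac_remove_macaddr() behave like a reference count on the address; a sketch of the expected pairing (the caller context is illustrative, and the MAC perimeter must be held as both functions ASSERT):

    /* Hypothetical: take, then drop, a client reference on an address. */
    mac_address_t *map;
    int err;

    if ((err = mac_add_macaddr(mip, group, addr)) != 0)
    	return (err);	/* could not classify or go promiscuous */

    /* ... the datapath uses the address ... */

    if ((map = mac_find_macaddr(mip, addr)) != NULL)
    	(void) mac_remove_macaddr(map);
]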
+
+/*
+ * Update an existing MAC address. The caller needs to make sure that the new
+ * value is not already in use.
+ */
+int
+mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
+{
+ mac_impl_t *mip = map->ma_mip;
+ int err = 0;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
+
+ switch (map->ma_type) {
+ case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
+ /*
+ * Update the primary address for drivers that are not
+ * RINGS capable.
+ */
+ if (map->ma_group == NULL) {
+ err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
+ mac_addr);
+ if (err != 0)
+ return (err);
+ break;
+ }
+
+ /*
+ * If this MAC address is not currently in use,
+ * simply break out and update the value.
+ */
+ if (map->ma_nusers == 0)
+ break;
+
+ /*
+ * Need to replace the MAC address associated with a group.
+ */
+ err = mac_group_remmac(map->ma_group, map->ma_addr);
+ if (err != 0)
+ return (err);
+
+ err = mac_group_addmac(map->ma_group, mac_addr);
+
+ /*
+		 * Failure hints at a hardware error. The MAC layer needs an
+		 * error notification facility to handle this. For now, simply
+		 * try to restore the old value.
+ */
+ if (err != 0)
+ (void) mac_group_addmac(map->ma_group, map->ma_addr);
+
+ break;
+ case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
+ /*
+ * Need to do nothing more if in promiscuous mode.
+ */
+ break;
+ default:
+ ASSERT(B_FALSE);
+ }
+
+ /*
+ * Successfully replaced the MAC address.
+ */
+ if (err == 0)
+ bcopy(mac_addr, map->ma_addr, map->ma_len);
+
+ return (err);
+}
+
+/*
+ * Freshen the MAC address with a new value. The caller must have updated the
+ * hardware MAC address before calling this function.
+ * This function is meant to handle the MAC address change notifications
+ * from underlying drivers.
+ */
+void
+mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
+{
+ mac_impl_t *mip = map->ma_mip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
+
+ /*
+ * Freshen the MAC address with new value.
+ */
+ bcopy(mac_addr, map->ma_addr, map->ma_len);
+ bcopy(mac_addr, mip->mi_addr, map->ma_len);
+
+ /*
+ * Update all MAC clients that share this MAC address.
+ */
+ mac_unicast_update_clients(mip, map);
+}
+
+/*
+ * Set up the primary MAC address.
+ */
+void
+mac_init_macaddr(mac_impl_t *mip)
+{
+ mac_address_t *map;
+
+ /*
+	 * The reference count stays at zero until the address is
+	 * actually activated.
+ */
+ map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
+ map->ma_len = mip->mi_type->mt_addr_length;
+ bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
+
+ /*
+	 * If the driver advertises the RINGS capability, it shouldn't have
+	 * initialized its primary MAC address. For other drivers, including
+	 * VNIC, the primary address must work after registration.
+ */
+ if (mip->mi_rx_groups == NULL)
+ map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+
+ /*
+	 * In the current design, the primary MAC address is reserved
+	 * for the default group.
+ */
+ map->ma_group = mip->mi_rx_groups;
+ map->ma_mip = mip;
+
+ mip->mi_addresses = map;
+}
+
+/*
+ * Clean up the primary MAC address. Note, only one primary MAC address
+ * is allowed. All other MAC addresses must have been freed appropriately.
+ */
+void
+mac_fini_macaddr(mac_impl_t *mip)
+{
+ mac_address_t *map = mip->mi_addresses;
+
+ /* there should be exactly one entry left on the list */
+ ASSERT(map != NULL);
+ ASSERT(map->ma_nusers == 0);
+ ASSERT(map->ma_next == NULL);
+
+ kmem_free(map, sizeof (mac_address_t));
+ mip->mi_addresses = NULL;
+}
+
+/*
+ * Logging related functions.
+ */
+
+/* Write the Flow description to the log file */
+int
+mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
+{
+ flow_desc_t *fdesc;
+ mac_resource_props_t *mrp;
+ net_desc_t ndesc;
+
+ bzero(&ndesc, sizeof (net_desc_t));
+
+ /*
+ * Grab the fe_lock to see a self-consistent fe_flow_desc.
+	 * Updates to the fe_flow_desc are done under the fe_lock.
+ */
+ mutex_enter(&flent->fe_lock);
+ fdesc = &flent->fe_flow_desc;
+ mrp = &flent->fe_resource_props;
+
+ ndesc.nd_name = flent->fe_flow_name;
+ ndesc.nd_devname = mcip->mci_name;
+ bcopy(fdesc->fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
+ bcopy(fdesc->fd_dst_mac, ndesc.nd_edest, ETHERADDRL);
+ ndesc.nd_sap = htonl(fdesc->fd_sap);
+ ndesc.nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
+ ndesc.nd_bw_limit = mrp->mrp_maxbw;
+ if (ndesc.nd_isv4) {
+ ndesc.nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
+ ndesc.nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
+ } else {
+ bcopy(&fdesc->fd_local_addr, ndesc.nd_saddr, IPV6_ADDR_LEN);
+ bcopy(&fdesc->fd_remote_addr, ndesc.nd_daddr, IPV6_ADDR_LEN);
+ }
+ ndesc.nd_sport = htons(fdesc->fd_local_port);
+ ndesc.nd_dport = htons(fdesc->fd_remote_port);
+ ndesc.nd_protocol = (uint8_t)fdesc->fd_protocol;
+ mutex_exit(&flent->fe_lock);
+
+ return (exacct_commit_netinfo((void *)&ndesc, EX_NET_FLDESC_REC));
+}
+
+/* Write the Flow statistics to the log file */
+int
+mac_write_flow_stats(flow_entry_t *flent)
+{
+ flow_stats_t *fl_stats;
+ net_stat_t nstat;
+
+ fl_stats = &flent->fe_flowstats;
+ nstat.ns_name = flent->fe_flow_name;
+ nstat.ns_ibytes = fl_stats->fs_rbytes;
+ nstat.ns_obytes = fl_stats->fs_obytes;
+ nstat.ns_ipackets = fl_stats->fs_ipackets;
+ nstat.ns_opackets = fl_stats->fs_opackets;
+ nstat.ns_ierrors = fl_stats->fs_ierrors;
+ nstat.ns_oerrors = fl_stats->fs_oerrors;
+
+ return (exacct_commit_netinfo((void *)&nstat, EX_NET_FLSTAT_REC));
+}
+
+/* Write the Link Description to the log file */
+int
+mac_write_link_desc(mac_client_impl_t *mcip)
+{
+ net_desc_t ndesc;
+ flow_entry_t *flent = mcip->mci_flent;
+
+ bzero(&ndesc, sizeof (net_desc_t));
+
+ ndesc.nd_name = mcip->mci_name;
+ ndesc.nd_devname = mcip->mci_name;
+ ndesc.nd_isv4 = B_TRUE;
+ /*
+ * Grab the fe_lock to see a self-consistent fe_flow_desc.
+ * Updates to the fe_flow_desc are done under the fe_lock
+ * after removing the flent from the flow table.
+ */
+ mutex_enter(&flent->fe_lock);
+ bcopy(flent->fe_flow_desc.fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
+ mutex_exit(&flent->fe_lock);
+
+ return (exacct_commit_netinfo((void *)&ndesc, EX_NET_LNDESC_REC));
+}
+
+/* Write the Link statistics to the log file */
+int
+mac_write_link_stats(mac_client_impl_t *mcip)
+{
+ net_stat_t nstat;
+
+ nstat.ns_name = mcip->mci_name;
+ nstat.ns_ibytes = mcip->mci_stat_ibytes;
+ nstat.ns_obytes = mcip->mci_stat_obytes;
+ nstat.ns_ipackets = mcip->mci_stat_ipackets;
+ nstat.ns_opackets = mcip->mci_stat_opackets;
+ nstat.ns_ierrors = mcip->mci_stat_ierrors;
+ nstat.ns_oerrors = mcip->mci_stat_oerrors;
+
+ return (exacct_commit_netinfo((void *)&nstat, EX_NET_LNSTAT_REC));
+}
+
+/*
+ * For a given flow, if the description has not been logged before, do it now.
+ * If it is a VNIC, then we have collected information about it from the MAC
+ * table, so skip it.
+ */
+/*ARGSUSED*/
+static int
+mac_log_flowinfo(flow_entry_t *flent, void *args)
+{
+ mac_client_impl_t *mcip = flent->fe_mcip;
+
+ if (mcip == NULL)
+ return (0);
+
+ /*
+	 * If the name starts with "vnic", and fe_user_generated is true
+	 * (to exclude the mcast and active flow entries created implicitly
+	 * for a vnic), it is a VNIC flow; i.e. vnic1 is a VNIC flow, while
+	 * vnic/bge1/mcast1 and vnic/bge1/active are not.
+ */
+ if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
+ (flent->fe_type & FLOW_USER) != 0) {
+ return (0);
+ }
+
+ if (!flent->fe_desc_logged) {
+ /*
+		 * We don't return an error because we want to continue the
+		 * walk in case this is the last walk, which means we
+ * need to reset fe_desc_logged in all the flows.
+ */
+ if (mac_write_flow_desc(flent, mcip) != 0)
+ return (0);
+ flent->fe_desc_logged = B_TRUE;
+ }
+
+ /*
+ * Regardless of the error, we want to proceed in case we have to
+ * reset fe_desc_logged.
+ */
+ (void) mac_write_flow_stats(flent);
+
+ if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
+ flent->fe_desc_logged = B_FALSE;
+
+ return (0);
+}
+
+typedef struct i_mac_log_state_s {
+ boolean_t mi_last;
+ int mi_fenable;
+ int mi_lenable;
+} i_mac_log_state_t;
+
+/*
+ * Walk the mac_impl_ts and log the description for each mac client of this
+ * mac, if it hasn't already been done. Additionally, log statistics for
+ * the link. Walk the flow table and log information for each flow as well.
+ * If it is the last walk (mi_last), then we turn off MCIS_DESC_LOGGED (and
+ * also fe_desc_logged, if flow logging is on) since we want to log the
+ * description if and when logging is restarted.
+ */
+/*ARGSUSED*/
+static uint_t
+i_mac_log_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+{
+ mac_impl_t *mip = (mac_impl_t *)val;
+ i_mac_log_state_t *lstate = (i_mac_log_state_t *)arg;
+ int ret;
+ mac_client_impl_t *mcip;
+
+ /*
+ * Only walk the client list for NIC and etherstub
+ */
+ if ((mip->mi_state_flags & MIS_DISABLED) ||
+ ((mip->mi_state_flags & MIS_IS_VNIC) &&
+ (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL)))
+ return (MH_WALK_CONTINUE);
+
+ for (mcip = mip->mi_clients_list; mcip != NULL;
+ mcip = mcip->mci_client_next) {
+ if (!MCIP_DATAPATH_SETUP(mcip))
+ continue;
+ if (lstate->mi_lenable) {
+ if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
+ ret = mac_write_link_desc(mcip);
+ if (ret != 0) {
+ /*
+ * We can't terminate it if this is the last
+					 * We can't terminate the walk if this is the
+					 * last walk, else there might be some links
+					 * with MCIS_DESC_LOGGED set, which means
+ * time logging is started (similarly for the
+ * flows within such links). We can continue
+ * without walking the flow table (i.e. to
+ * set fe_desc_logged to false) because we
+ * won't have written any flow stuff for this
+ * link as we haven't logged the link itself.
+ */
+ if (lstate->mi_last)
+ return (MH_WALK_CONTINUE);
+ else
+ return (MH_WALK_TERMINATE);
+ }
+ mcip->mci_state_flags |= MCIS_DESC_LOGGED;
+ }
+ }
+
+ if (mac_write_link_stats(mcip) != 0 && !lstate->mi_last)
+ return (MH_WALK_TERMINATE);
+
+ if (lstate->mi_last)
+ mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
+
+ if (lstate->mi_fenable) {
+ if (mcip->mci_subflow_tab != NULL) {
+ (void) mac_flow_walk(mcip->mci_subflow_tab,
+ mac_log_flowinfo, mip);
+ }
+ }
+ }
+ return (MH_WALK_CONTINUE);
+}
+
+/*
+ * The timer callback that runs every mac_logging_interval seconds and logs
+ * link and/or flow information.
+ */
+/* ARGSUSED */
+void
+mac_log_linkinfo(void *arg)
+{
+ i_mac_log_state_t lstate;
+
+ rw_enter(&i_mac_impl_lock, RW_READER);
+ if (!mac_flow_log_enable && !mac_link_log_enable) {
+ rw_exit(&i_mac_impl_lock);
+ return;
+ }
+ lstate.mi_fenable = mac_flow_log_enable;
+ lstate.mi_lenable = mac_link_log_enable;
+ lstate.mi_last = B_FALSE;
+ rw_exit(&i_mac_impl_lock);
+
+ mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
+
+ rw_enter(&i_mac_impl_lock, RW_WRITER);
+ if (mac_flow_log_enable || mac_link_log_enable) {
+ mac_logging_timer = timeout(mac_log_linkinfo, NULL,
+ SEC_TO_TICK(mac_logging_interval));
+ }
+ rw_exit(&i_mac_impl_lock);
+}
+
+/*
+ * Start the logging timer.
+ */
+void
+mac_start_logusage(mac_logtype_t type, uint_t interval)
+{
+ rw_enter(&i_mac_impl_lock, RW_WRITER);
+ switch (type) {
+ case MAC_LOGTYPE_FLOW:
+ if (mac_flow_log_enable) {
+ rw_exit(&i_mac_impl_lock);
+ return;
+ }
+ mac_flow_log_enable = B_TRUE;
+ /* FALLTHRU */
+ case MAC_LOGTYPE_LINK:
+ if (mac_link_log_enable) {
+ rw_exit(&i_mac_impl_lock);
+ return;
+ }
+ mac_link_log_enable = B_TRUE;
+ break;
+ default:
+ ASSERT(0);
+ }
+ mac_logging_interval = interval;
+ rw_exit(&i_mac_impl_lock);
+ mac_log_linkinfo(NULL);
+}
+
+/*
+ * Stop the logging timer if both Link and Flow logging are turned off.
+ */
+void
+mac_stop_logusage(mac_logtype_t type)
+{
+ i_mac_log_state_t lstate;
+
+ rw_enter(&i_mac_impl_lock, RW_WRITER);
+ lstate.mi_fenable = mac_flow_log_enable;
+ lstate.mi_lenable = mac_link_log_enable;
+
+ /* Last walk */
+ lstate.mi_last = B_TRUE;
+
+ switch (type) {
+ case MAC_LOGTYPE_FLOW:
+ if (lstate.mi_fenable) {
+ ASSERT(mac_link_log_enable);
+ mac_flow_log_enable = B_FALSE;
+ mac_link_log_enable = B_FALSE;
+ break;
+ }
+ /* FALLTHRU */
+ case MAC_LOGTYPE_LINK:
+ if (!lstate.mi_lenable || mac_flow_log_enable) {
+ rw_exit(&i_mac_impl_lock);
+ return;
+ }
+ mac_link_log_enable = B_FALSE;
+ break;
+ default:
+ ASSERT(0);
+ }
+ rw_exit(&i_mac_impl_lock);
+ (void) untimeout(mac_logging_timer);
+ mac_logging_timer = 0;
+
+ /* Last walk */
+ mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
+}
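+
+/*
+ * Usage sketch (the 20 second interval is illustrative; extended
+ * accounting is the expected caller):
+ *
+ *	mac_start_logusage(MAC_LOGTYPE_FLOW, 20);	// arm the timer
+ *	...
+ *	mac_stop_logusage(MAC_LOGTYPE_FLOW);		// last walk, then
+ *							// cancel the timer
+ */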
+
+/*
+ * Walk the rx and tx SRS/SRs for a flow and update the priority value.
+ */
+void
+mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
+{
+ pri_t pri;
+ int count;
+ mac_soft_ring_set_t *mac_srs;
+
+ if (flent->fe_rx_srs_cnt <= 0)
+ return;
+
+ if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
+ SRST_FLOW) {
+ pri = FLOW_PRIORITY(mcip->mci_min_pri,
+ mcip->mci_max_pri,
+ flent->fe_resource_props.mrp_priority);
+ } else {
+ pri = mcip->mci_max_pri;
+ }
+
+ for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
+ mac_srs = flent->fe_rx_srs[count];
+ mac_update_srs_priority(mac_srs, pri);
+ }
+ /*
+ * If we have a Tx SRS, we need to modify all the threads associated
+ * with it.
+ */
+ if (flent->fe_tx_srs != NULL)
+ mac_update_srs_priority(flent->fe_tx_srs, pri);
+}
+
+/*
+ * RX and TX rings are reserved according to different semantics depending
+ * on the requests from the MAC clients and the type of rings:
+ *
+ * On the Tx side, by default we reserve individual rings, independently of
+ * the groups.
+ *
+ * On the Rx side, the reservation is at the granularity of the group
+ * of rings, and is used for v12n level 1 only. It has a special case for
+ * the primary client.
+ *
+ * If a share is allocated to a MAC client, we allocate a TX group and an
+ * RX group to the client, and assign TX rings and RX rings to these
+ * groups according to information gathered from the driver through
+ * the share capability.
+ *
+ * The foreseeable evolution of Rx rings is to handle v12n level 2 and
+ * higher, allocating individual rings out of a group and programming the
+ * hw classifier based on IP addresses or higher-level criteria.
+ */
+
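+/*
+ * For example (a simplified sketch, error handling elided), a client
+ * that was allocated a share ends up with both group types:
+ *
+ *	rx_grp = mac_reserve_rx_group(mcip, mac_addr,
+ *	    MAC_RX_RESERVE_NONDEFAULT);
+ *	tx_grp = mac_reserve_tx_group(mip, mcip->mci_share);
+ */
+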
+/*
+ * mac_reserve_tx_ring()
+ * Reserve an unused ring by marking it with the MR_INUSE state.
+ * Once reserved, the ring is ready to function.
+ *
+ * Notes for Hybrid I/O:
+ *
+ * If a specific ring is needed, it is specified through the desired_ring
+ * argument. Otherwise that argument is set to NULL.
+ * If the desired ring was previously allocated to another client, this
+ * function swaps it with a new ring from the group of unassigned rings.
+ */
+mac_ring_t *
+mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
+{
+ mac_group_t *group;
+ mac_ring_t *ring;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ if (mip->mi_tx_groups == NULL)
+ return (NULL);
+
+ /*
+ * Find an available ring and start it before changing its status.
+ * The unassigned rings are at the end of the mi_tx_groups
+ * array.
+ */
+ group = mip->mi_tx_groups + mip->mi_tx_group_count;
+
+ for (ring = group->mrg_rings; ring != NULL;
+ ring = ring->mr_next) {
+ if (desired_ring == NULL) {
+ if (ring->mr_state == MR_FREE)
+ /* wanted any free ring and found one */
+ break;
+ } else {
+ mac_ring_t *sring;
+ mac_client_impl_t *client;
+ mac_soft_ring_set_t *srs;
+
+ if (ring != desired_ring)
+				/* a specific ring was requested and this one isn't it */
+ continue;
+
+ if (ring->mr_state == MR_FREE)
+ break;
+
+ /*
+ * Found the desired ring but it's already in use.
+ * Swap it with a new ring.
+ */
+
+ /* find the client which owns that ring */
+ for (client = mip->mi_clients_list; client != NULL;
+ client = client->mci_client_next) {
+ srs = MCIP_TX_SRS(client);
+ if (srs != NULL && mac_tx_srs_ring_present(srs,
+ desired_ring)) {
+ /* found our ring */
+ break;
+ }
+ }
+ ASSERT(client != NULL);
+
+ /*
+ * Note that we cannot simply invoke the group
+ * add/rem routines since the client doesn't have a
+ * TX group. So we need to instead add/remove
+ * the rings from the SRS.
+ */
+ ASSERT(client->mci_share == NULL);
+
+			/* first quiesce the client */
+ mac_tx_client_quiesce(client, SRS_QUIESCE);
+
+ /* give a new ring to the client... */
+ sring = mac_reserve_tx_ring(mip, NULL);
+ if (sring != NULL) {
+ /*
+				 * A replacement ring was found; add it
+				 * to the client's SRS. Had no ring been
+				 * available on this MAC instance, the
+				 * client would simply fall back to the
+				 * shared TX ring.
+ *
+ * XXX if the user required the client
+ * to have a hardware transmit ring,
+ * we need to ensure we don't remove
+ * the last ring from the client.
+				 * In that case look for a replacement
+ * ring from a client which does not
+ * require a hardware ring, we could
+ * add an argument to
+ * mac_reserve_tx_ring() which causes
+ * it to take a ring from such a client
+ * even if the desired ring is NULL.
+ * This will have to be done as part
+ * of the fix for CR 6758935. If that still
+ * fails, i.e. if all rings are allocated
+ * to clients which require rings, then
+ * cleanly fail the operation.
+ */
+ mac_tx_srs_add_ring(srs, sring);
+ }
+
+ /* ... in exchange for our desired ring */
+ mac_tx_srs_del_ring(srs, desired_ring);
+
+ /* restart the client */
+ mac_tx_client_restart(client);
+
+ break;
+ }
+ }
+
+ if (ring != NULL) {
+ if (mac_start_ring(ring) != 0)
+ return (NULL);
+ ring->mr_state = MR_INUSE;
+ }
+
+ return (ring);
+}
+
+/*
+ * Minimum number of rings to leave in the default RX group when allocating
+ * rings to new clients.
+ */
+static uint_t mac_min_rx_default_rings = 1;
+
+/*
+ * Populate a zero-ring group with rings. If the share is non-NULL,
+ * the rings are chosen according to that share.
+ * Invoked after allocating a new RX or TX group through
+ * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
+ * Returns zero on success, an errno otherwise.
+ */
+int
+i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
+ mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share)
+{
+ mac_ring_t **rings, *tmp_ring[1], *ring;
+ uint_t nrings;
+ int rv, i, j;
+
+ ASSERT(mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC &&
+ mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
+ ASSERT(new_group->mrg_cur_count == 0);
+
+ /*
+ * First find the rings to allocate to the group.
+ */
+ if (share != NULL) {
+ /* get rings through ms_squery() */
+ mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
+ ASSERT(nrings != 0);
+ rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
+ KM_SLEEP);
+ mip->mi_share_capab.ms_squery(share, ring_type,
+ (mac_ring_handle_t *)rings, &nrings);
+ } else {
+		/* TX allocation always comes with a share, so this must be RX */
+ ASSERT(ring_type == MAC_RING_TYPE_RX);
+ /*
+		 * Pick one ring from the default group.
+		 *
+		 * For now pick the second ring; the first ring, at index 0,
+		 * must stay in the default group since it is the ring
+		 * that carries the multicast traffic.
+ * We need a better way for a driver to indicate this,
+ * for example a per-ring flag.
+ */
+ for (ring = src_group->mrg_rings; ring != NULL;
+ ring = ring->mr_next) {
+ if (ring->mr_index != 0)
+ break;
+ }
+ ASSERT(ring != NULL);
+ nrings = 1;
+ tmp_ring[0] = ring;
+ rings = tmp_ring;
+ }
+
+ switch (ring_type) {
+ case MAC_RING_TYPE_RX:
+ if (src_group->mrg_cur_count - nrings <
+ mac_min_rx_default_rings) {
+ /* we ran out of rings */
+ return (ENOSPC);
+ }
+
+ /* move receive rings to new group */
+ for (i = 0; i < nrings; i++) {
+ rv = mac_group_mov_ring(mip, new_group, rings[i]);
+ if (rv != 0) {
+ /* move rings back on failure */
+ for (j = 0; j < i; j++) {
+ (void) mac_group_mov_ring(mip,
+ src_group, rings[j]);
+ }
+ return (rv);
+ }
+ }
+ break;
+
+ case MAC_RING_TYPE_TX: {
+ mac_ring_t *tmp_ring;
+
+ /* move the TX rings to the new group */
+ ASSERT(src_group == NULL);
+ for (i = 0; i < nrings; i++) {
+ /* get the desired ring */
+ tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
+ ASSERT(tmp_ring == rings[i]);
+ rv = mac_group_mov_ring(mip, new_group, rings[i]);
+ if (rv != 0) {
+ /* cleanup on failure */
+ for (j = 0; j < i; j++) {
+ (void) mac_group_mov_ring(mip,
+ mip->mi_tx_groups +
+ mip->mi_tx_group_count, rings[j]);
+ }
+ }
+ }
+ break;
+ }
+ }
+
+ if (share != NULL) {
+ /* add group to share */
+ mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
+ /* free temporary array of rings */
+ kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
+ }
+
+ return (0);
+}
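+
+/*
+ * Two representative call patterns for the function above (sketch):
+ *
+ *	// RX without a share: move a ring out of the default group
+ *	err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
+ *	    &mip->mi_rx_groups[0], new_grp, NULL);
+ *
+ *	// TX with a share: the rings are obtained through ms_squery()
+ *	err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX,
+ *	    NULL, new_grp, share);
+ */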
+
+void
+mac_rx_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
+{
+ mac_grp_client_t *mgcp;
+
+ for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
+ if (mgcp->mgc_client == mcip)
+ break;
+ }
+
+ VERIFY(mgcp == NULL);
+
+ mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
+ mgcp->mgc_client = mcip;
+ mgcp->mgc_next = grp->mrg_clients;
+ grp->mrg_clients = mgcp;
+}
+
+void
+mac_rx_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
+{
+ mac_grp_client_t *mgcp, **pprev;
+
+ for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
+ pprev = &mgcp->mgc_next, mgcp = *pprev) {
+ if (mgcp->mgc_client == mcip)
+ break;
+ }
+
+ ASSERT(mgcp != NULL);
+
+ *pprev = mgcp->mgc_next;
+ kmem_free(mgcp, sizeof (mac_grp_client_t));
+}
+
+/*
+ * mac_reserve_rx_group()
+ *
+ * Finds an available group and exclusively reserves it for a client.
+ * The group is chosen to suit the flow's resource controls (bandwidth and
+ * fanout requirements) and the address type.
+ * If the requestor is the primary MAC then return the group with the
+ * largest number of rings, otherwise the default ring when available.
+ */
+mac_group_t *
+mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr,
+ mac_rx_group_reserve_type_t rtype)
+{
+ mac_share_handle_t share = mcip->mci_share;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_group_t *grp = NULL;
+ int i, start, loopcount;
+ int err;
+ mac_address_t *map;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /* Check if a group already has this mac address (case of VLANs) */
+ if ((map = mac_find_macaddr(mip, mac_addr)) != NULL)
+ return (map->ma_group);
+
+ if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0 ||
+ rtype == MAC_RX_NO_RESERVE)
+ return (NULL);
+
+	/*
+	 * Try to exclusively reserve an RX group.
+	 *
+	 * Flows requiring SW_RING always go to the default group
+	 * (until we can explicitly call out default groups (CR 6695600),
+	 * we assume that the default group is always at position zero).
+	 *
+	 * For flows requiring HW_DEFAULT_RING (unicast flow of the primary
+	 * client), try to reserve the default RX group only.
+	 *
+	 * For flows requiring HW_RING (unicast flows of other clients), try
+	 * to reserve a non-default RX group first, then the default group.
+	 */
+ switch (rtype) {
+ case MAC_RX_RESERVE_DEFAULT:
+ start = 0;
+ loopcount = 1;
+ break;
+ case MAC_RX_RESERVE_NONDEFAULT:
+ start = 1;
+ loopcount = mip->mi_rx_group_count;
+ }
+
+ for (i = start; i < start + loopcount; i++) {
+ grp = &mip->mi_rx_groups[i % mip->mi_rx_group_count];
+
+ DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
+ int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
+
+ /*
+ * Check to see whether this mac client is the only client
+ * on this RX group. If not, we cannot exclusively reserve
+ * this RX group.
+ */
+ if (!MAC_RX_GROUP_NO_CLIENT(grp) &&
+ (MAC_RX_GROUP_ONLY_CLIENT(grp) != mcip)) {
+ continue;
+ }
+
+ /*
+ * This group could already be SHARED by other multicast
+		 * flows on this client. In that case the group has
+		 * already been started.
+ */
+ ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
+
+ if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
+ (mac_start_group(grp) != 0)) {
+ continue;
+ }
+
+ if ((i % mip->mi_rx_group_count) == 0 ||
+ mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
+ break;
+ }
+
+ ASSERT(grp->mrg_cur_count == 0);
+
+ /*
+ * Populate the group. Rings should be taken
+ * from the default group at position 0 for now.
+ */
+
+ err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
+ &mip->mi_rx_groups[0], grp, share);
+ if (err == 0)
+ break;
+
+ DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
+ mip->mi_name, int, grp->mrg_index, int, err);
+
+ /*
+ * It's a dynamic group but the grouping operation failed.
+ */
+ mac_stop_group(grp);
+ }
+
+ if (i == start + loopcount)
+ return (NULL);
+
+ ASSERT(grp != NULL);
+
+ DTRACE_PROBE2(rx__group__reserved,
+ char *, mip->mi_name, int, grp->mrg_index);
+ return (grp);
+}
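+
+/*
+ * A reserved group is given back once its last client is removed,
+ * e.g. (hypothetical teardown path):
+ *
+ *	mac_rx_group_remove_client(grp, mcip);
+ *	if (MAC_RX_GROUP_NO_CLIENT(grp))
+ *		mac_release_rx_group(mcip, grp);
+ */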
+
+/*
+ * mac_release_rx_group()
+ *
+ * This is called when there are no clients left for the group.
+ * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
+ * and if it is a non-default group, the shares are removed and
+ * all rings are assigned back to the default group.
+ */
+void
+mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_ring_t *ring;
+
+ ASSERT(group != &mip->mi_rx_groups[0]);
+
+ /*
+ * This is the case where there are no clients left. Any
+	 * SRS etc. on this group have also been quiesced.
+ */
+ for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
+ if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
+ ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
+ /*
+ * Remove the SRS associated with the HW ring.
+ * As a result, polling will be disabled.
+ */
+ ring->mr_srs = NULL;
+ }
+ ASSERT(ring->mr_state == MR_INUSE);
+ mac_stop_ring(ring);
+ ring->mr_state = MR_FREE;
+ ring->mr_flag = 0;
+ }
+
+ /* remove group from share */
+ if (mcip->mci_share != NULL) {
+ mip->mi_share_capab.ms_sremove(mcip->mci_share,
+ group->mrg_driver);
+ }
+
+ if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
+ mac_ring_t *ring;
+
+ /*
+		 * Rings were dynamically allocated to the group.
+		 * Move them back to the default group.
+ */
+ while ((ring = group->mrg_rings) != NULL) {
+ (void) mac_group_mov_ring(mip,
+ &mip->mi_rx_groups[0], ring);
+ }
+ }
+ mac_stop_group(group);
+ /*
+ * Possible improvement: See if we can assign the group just released
+	 * to another client of the mip.
+ */
+}
+
+/*
+ * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
+ * when a share was allocated to the client.
+ */
+mac_group_t *
+mac_reserve_tx_group(mac_impl_t *mip, mac_share_handle_t share)
+{
+ mac_group_t *grp;
+ int rv, i;
+
+ /*
+ * TX groups are currently allocated only to MAC clients
+ * which are associated with a share. Since we have a fixed
+	 * number of shares and groups, and we already successfully
+ * allocated a share, find an available TX group.
+ */
+ ASSERT(share != NULL);
+ ASSERT(mip->mi_tx_group_free > 0);
+
+ for (i = 0; i < mip->mi_tx_group_count; i++) {
+ grp = &mip->mi_tx_groups[i];
+
+ if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
+ (grp->mrg_state == MAC_GROUP_STATE_UNINIT))
+ continue;
+
+ rv = mac_start_group(grp);
+ ASSERT(rv == 0);
+
+ grp->mrg_state = MAC_GROUP_STATE_RESERVED;
+ break;
+ }
+
+ ASSERT(grp != NULL);
+
+ /*
+ * Populate the group. Rings should be taken from the group
+ * of unassigned rings, which is past the array of TX
+	 * groups advertised by the driver.
+ */
+ rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, NULL,
+ grp, share);
+ if (rv != 0) {
+ DTRACE_PROBE3(tx__group__reserve__alloc__rings,
+ char *, mip->mi_name, int, grp->mrg_index, int, rv);
+
+ mac_stop_group(grp);
+ grp->mrg_state = MAC_GROUP_STATE_UNINIT;
+
+ return (NULL);
+ }
+
+ mip->mi_tx_group_free--;
+
+ return (grp);
+}
+
+void
+mac_release_tx_group(mac_impl_t *mip, mac_group_t *grp)
+{
+ mac_client_impl_t *mcip = grp->mrg_tx_client;
+ mac_share_handle_t share = mcip->mci_share;
+ mac_ring_t *ring;
+
+ ASSERT(mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
+ ASSERT(share != NULL);
+ ASSERT(grp->mrg_state == MAC_GROUP_STATE_RESERVED);
+
+ mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
+ while ((ring = grp->mrg_rings) != NULL) {
+ /* move the ring back to the pool */
+ (void) mac_group_mov_ring(mip, mip->mi_tx_groups +
+ mip->mi_tx_group_count, ring);
+ }
+ mac_stop_group(grp);
+ mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
+ grp->mrg_tx_client = NULL;
+ mip->mi_tx_group_free++;
+}
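+
+/*
+ * TX group lifecycle for a share-backed client (sketch):
+ *
+ *	grp = mac_reserve_tx_group(mip, share);	// mi_tx_group_free--
+ *	...
+ *	mac_release_tx_group(mip, grp);		// rings go back to the
+ *						// unassigned pool and
+ *						// mi_tx_group_free++
+ */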
+
+/*
+ * This is a one-time control path activity initiated by the client (IP).
+ * The mac perimeter protects against other simultaneous control activities,
+ * for example an ioctl that attempts to change the degree of fanout and
+ * increase or decrease the number of softrings associated with this Tx SRS.
+ */
+static mac_tx_notify_cb_t *
+mac_client_tx_notify_add(mac_client_impl_t *mcip,
+ mac_tx_notify_t notify, void *arg)
+{
+ mac_cb_info_t *mcbi;
+ mac_tx_notify_cb_t *mtnfp;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+ mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
+ mtnfp->mtnf_fn = notify;
+ mtnfp->mtnf_arg = arg;
+ mtnfp->mtnf_link.mcb_objp = mtnfp;
+ mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
+ mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
+
+ mcbi = &mcip->mci_tx_notify_cb_info;
+ mutex_enter(mcbi->mcbi_lockp);
+ mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
+ mutex_exit(mcbi->mcbi_lockp);
+ return (mtnfp);
+}
+
+static void
+mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
+{
+ mac_cb_info_t *mcbi;
+ mac_cb_t **cblist;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+ if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
+ &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
+ cmn_err(CE_WARN,
+ "mac_client_tx_notify_remove: callback not "
+ "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
+ return;
+ }
+
+ mcbi = &mcip->mci_tx_notify_cb_info;
+ cblist = &mcip->mci_tx_notify_cb_list;
+ mutex_enter(mcbi->mcbi_lockp);
+ if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
+ kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
+ else
+ mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
+ mutex_exit(mcbi->mcbi_lockp);
+}
+
+/*
+ * mac_client_tx_notify():
+ * Called to add or remove a flow control callback routine.
+ */
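+/*
+ * Usage sketch (my_tx_notify() and my_arg are hypothetical):
+ *
+ *	mac_tx_notify_handle_t h;
+ *
+ *	// a non-NULL callback registers it
+ *	h = mac_client_tx_notify(mch, my_tx_notify, my_arg);
+ *
+ *	// a NULL callback removes it; the handle is passed as 'ptr'
+ *	(void) mac_client_tx_notify(mch, NULL, (void *)h);
+ */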
+mac_tx_notify_handle_t
+mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
+ void *ptr)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_tx_notify_cb_t *mtnfp = NULL;
+
+ i_mac_perim_enter(mcip->mci_mip);
+
+ if (callb_func != NULL) {
+ /* Add a notify callback */
+ mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
+ } else {
+ mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
+ }
+ i_mac_perim_exit(mcip->mci_mip);
+
+ return ((mac_tx_notify_handle_t)mtnfp);
+}
diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c
new file mode 100644
index 0000000000..5fd2a6ef55
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_bcast.c
@@ -0,0 +1,668 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/list.h>
+#include <sys/kmem.h>
+#include <sys/stream.h>
+#include <sys/modctl.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/atomic.h>
+#include <sys/stat.h>
+#include <sys/modhash.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/sdt.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_flow_impl.h>
+
+/*
+ * Broadcast and multicast traffic must be distributed to the MAC clients
+ * that are defined on top of the same MAC. The set of
+ * destinations to which a multicast packet must be sent is a subset
+ * of all MAC clients defined on top of the MAC. A MAC client can be a
+ * member of more than one such subset.
+ *
+ * To accommodate these requirements, we introduce broadcast groups.
+ * A broadcast group is associated with a broadcast or multicast
+ * address. The members of a broadcast group consist of the MAC clients
+ * that should receive copies of packets sent to the address
+ * associated with the group, and are defined on top of the
+ * same MAC.
+ *
+ * The broadcast groups defined on top of a MAC are chained,
+ * hanging off the mac_impl_t. The broadcast group IDs are
+ * globally unique (tracked by mac_bcast_id).
+ */
+
+/*
+ * The same MAC client may be added for different <addr,vid> tuples;
+ * we maintain a ref count of the number of times it has been added,
+ * so that the client is deleted from the group only when the count
+ * drops to zero.
+ */
+typedef struct mac_bcast_grp_mcip_s {
+ mac_client_impl_t *mgb_client;
+ int mgb_client_ref;
+} mac_bcast_grp_mcip_t;
+
+typedef struct mac_bcast_grp_s { /* Protected by */
+ struct mac_bcast_grp_s *mbg_next; /* SL */
+ void *mbg_addr; /* SL */
+ uint16_t mbg_vid; /* SL */
+ mac_impl_t *mbg_mac_impl; /* WO */
+ mac_addrtype_t mbg_addrtype; /* WO */
+ flow_entry_t *mbg_flow_ent; /* WO */
+ mac_bcast_grp_mcip_t *mbg_clients; /* mi_rw_lock */
+ uint_t mbg_nclients; /* mi_rw_lock */
+ uint_t mbg_nclients_alloc; /* SL */
+ uint64_t mbg_clients_gen; /* mi_rw_lock */
+ uint32_t mbg_id; /* atomic */
+} mac_bcast_grp_t;
+
+static kmem_cache_t *mac_bcast_grp_cache;
+static uint32_t mac_bcast_id = 0;
+
+void
+mac_bcast_init(void)
+{
+ mac_bcast_grp_cache = kmem_cache_create("mac_bcast_grp_cache",
+ sizeof (mac_bcast_grp_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+mac_bcast_fini(void)
+{
+ kmem_cache_destroy(mac_bcast_grp_cache);
+}
+
+mac_impl_t *
+mac_bcast_grp_mip(void *grp)
+{
+ mac_bcast_grp_t *bcast_grp = grp;
+
+ return (bcast_grp->mbg_mac_impl);
+}
+
+/*
+ * Free the specific broadcast group. Invoked when the last reference
+ * to the group is released.
+ */
+void
+mac_bcast_grp_free(void *bcast_grp)
+{
+ mac_bcast_grp_t *grp = bcast_grp;
+ mac_impl_t *mip = grp->mbg_mac_impl;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
+ /*
+ * The address is a multicast address, have the
+ * underlying NIC leave the multicast group.
+ */
+ (void) mip->mi_multicst(mip->mi_driver, B_FALSE, grp->mbg_addr);
+ }
+
+ ASSERT(grp->mbg_addr != NULL);
+ kmem_free(grp->mbg_addr, mip->mi_type->mt_addr_length);
+ kmem_free(grp->mbg_clients,
+ grp->mbg_nclients_alloc * sizeof (mac_bcast_grp_mcip_t));
+ mip->mi_bcast_ngrps--;
+ kmem_cache_free(mac_bcast_grp_cache, grp);
+}
+
+/*
+ * arg1: broadcast group
+ * arg2: sender MAC client if it is being sent by a MAC client,
+ * NULL if it was received from the wire.
+ */
+void
+mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
+{
+ mac_bcast_grp_t *grp = arg1;
+ mac_client_impl_t *src_mcip = arg2, *dst_mcip;
+ mac_impl_t *mip = grp->mbg_mac_impl;
+ uint64_t gen;
+ uint_t i;
+ mblk_t *mp_chain1;
+ flow_entry_t *flent;
+ int err;
+
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+
+ /*
+ * Pass a copy of the mp chain to every MAC client except the sender
+ * MAC client, if the packet was not received from the underlying NIC.
+ *
+ * The broadcast group lock should not be held across calls to
+ * the flow's callback function, since the same group could
+ * potentially be accessed from the same context. When the lock
+ * is reacquired, changes to the broadcast group while the lock
+ * was released are caught using a generation counter incremented
+ * each time the list of MAC clients associated with the broadcast
+ * group is changed.
+ */
+ for (i = 0; i < grp->mbg_nclients_alloc; i++) {
+ dst_mcip = grp->mbg_clients[i].mgb_client;
+ if (dst_mcip == NULL)
+ continue;
+ flent = dst_mcip->mci_flent;
+ if (flent == NULL || dst_mcip == src_mcip) {
+ /*
+ * Don't send a copy of the packet back to
+ * its sender.
+ */
+ continue;
+ }
+
+ /*
+ * It is important to hold a reference on the
+ * flow_ent here.
+ */
+ if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL)
+ break;
+ /*
+ * Fix the checksum for packets originating
+ * from the local machine.
+ */
+ if ((src_mcip != NULL) &&
+ (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL)
+ break;
+
+ FLOW_TRY_REFHOLD(flent, err);
+ if (err != 0) {
+ freemsgchain(mp_chain1);
+ continue;
+ }
+
+ gen = grp->mbg_clients_gen;
+
+ rw_exit(&mip->mi_rw_lock);
+
+ DTRACE_PROBE4(mac__bcast__send__to, mac_client_impl_t *,
+ src_mcip, flow_fn_t, dst_mcip->mci_flent->fe_cb_fn,
+ void *, dst_mcip->mci_flent->fe_cb_arg1,
+ void *, dst_mcip->mci_flent->fe_cb_arg2);
+
+ (dst_mcip->mci_flent->fe_cb_fn)(dst_mcip->mci_flent->fe_cb_arg1,
+ dst_mcip->mci_flent->fe_cb_arg2, mp_chain1, is_loopback);
+ FLOW_REFRELE(flent);
+
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+
+ /* update stats */
+ if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST)
+ dst_mcip->mci_stat_multircv++;
+ else
+ dst_mcip->mci_stat_brdcstrcv++;
+
+ if (grp->mbg_clients_gen != gen) {
+ /*
+ * The list of MAC clients associated with the group
+ * was changed while the lock was released.
+ * Give up on the current packet.
+ */
+ rw_exit(&mip->mi_rw_lock);
+ freemsgchain(mp_chain);
+ return;
+ }
+ }
+ rw_exit(&mip->mi_rw_lock);
+
+ if (src_mcip != NULL) {
+ /*
+ * The packet was sent from one of the MAC clients,
+ * so we need to send a copy of the packet to the
+ * underlying NIC so that it can be sent on the wire.
+ */
+ mblk_t *rest;
+
+ src_mcip->mci_stat_multixmt++;
+ src_mcip->mci_stat_brdcstxmt++;
+
+ rest = MAC_RING_TX_DEFAULT(mip, mp_chain);
+ if (rest != NULL)
+ freemsgchain(rest);
+ } else {
+ freemsgchain(mp_chain);
+ }
+}
+
+/*
+ * Add the specified MAC client to the group corresponding to the specified
+ * broadcast or multicast address.
+ * Return 0 on success, or an errno value on failure.
+ */
+int
+mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid,
+ mac_addrtype_t addrtype)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_bcast_grp_t *grp = NULL, **last_grp;
+ size_t addr_len = mip->mi_type->mt_addr_length;
+ int rc = 0;
+ int i, index = -1;
+ mac_mcast_addrs_t *mci_maddr = NULL;
+ mac_mcast_addrs_t *mi_maddr = NULL;
+ mac_mcast_addrs_t **last_maddr;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ ASSERT(addrtype == MAC_ADDRTYPE_MULTICAST ||
+ addrtype == MAC_ADDRTYPE_BROADCAST);
+
+ /* The list is protected by the perimeter */
+ last_grp = &mip->mi_bcast_grp;
+ for (grp = *last_grp; grp != NULL;
+ last_grp = &grp->mbg_next, grp = grp->mbg_next) {
+ if (bcmp(grp->mbg_addr, addr, addr_len) == 0 &&
+ grp->mbg_vid == vid)
+ break;
+ }
+
+ if (grp == NULL) {
+ /*
+		 * The group does not yet exist; create it.
+ */
+ flow_desc_t flow_desc;
+ char flow_name[MAXFLOWNAME];
+
+ grp = kmem_cache_alloc(mac_bcast_grp_cache, KM_SLEEP);
+ bzero(grp, sizeof (mac_bcast_grp_t));
+ grp->mbg_next = NULL;
+ grp->mbg_mac_impl = mip;
+
+ DTRACE_PROBE1(mac__bcast__add__new__group, mac_bcast_grp_t *,
+ grp);
+
+ grp->mbg_addr = kmem_zalloc(addr_len, KM_SLEEP);
+ bcopy(addr, grp->mbg_addr, addr_len);
+ grp->mbg_addrtype = addrtype;
+ grp->mbg_vid = vid;
+
+ /*
+ * Add a new flow to the underlying MAC.
+ */
+ bzero(&flow_desc, sizeof (flow_desc));
+ bcopy(addr, &flow_desc.fd_dst_mac, addr_len);
+ flow_desc.fd_mac_len = (uint32_t)addr_len;
+
+ flow_desc.fd_mask = FLOW_LINK_DST;
+ if (vid != 0) {
+ flow_desc.fd_vid = vid;
+ flow_desc.fd_mask |= FLOW_LINK_VID;
+ }
+
+ grp->mbg_id = atomic_add_32_nv(&mac_bcast_id, 1);
+ (void) sprintf(flow_name,
+ "mac/%s/mcast%d", mip->mi_name, grp->mbg_id);
+
+ rc = mac_flow_create(&flow_desc, NULL, flow_name,
+ grp, FLOW_MCAST, &grp->mbg_flow_ent);
+ if (rc != 0) {
+ kmem_free(grp->mbg_addr, addr_len);
+ kmem_cache_free(mac_bcast_grp_cache, grp);
+ return (rc);
+ }
+ grp->mbg_flow_ent->fe_mbg = grp;
+ mip->mi_bcast_ngrps++;
+
+ /*
+ * Initial creation reference on the flow. This is released
+		 * in the corresponding delete action, mac_bcast_delete().
+ */
+ FLOW_REFHOLD(grp->mbg_flow_ent);
+
+ /*
+		 * When a multicast or broadcast packet is received
+ * by the underlying NIC, mac_rx_classify() will invoke
+ * mac_bcast_send() with arg2=NULL, which will cause
+ * mac_bcast_send() to send a copy of the packet(s)
+ * to every MAC client opened on top of the underlying MAC.
+ *
+ * When the mac_bcast_send() function is invoked from
+ * the transmit path of a MAC client, it will specify the
+ * transmitting MAC client as the arg2 value, which will
+ * allow mac_bcast_send() to skip that MAC client and not
+ * send it a copy of the packet.
+ *
+ * We program the classifier to dispatch matching broadcast
+ * packets to mac_bcast_send().
+ */
+
+ grp->mbg_flow_ent->fe_cb_fn = mac_bcast_send;
+ grp->mbg_flow_ent->fe_cb_arg1 = grp;
+ grp->mbg_flow_ent->fe_cb_arg2 = NULL;
+
+ rc = mac_flow_add(mip->mi_flow_tab, grp->mbg_flow_ent);
+ if (rc != 0) {
+ FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
+ return (rc);
+ }
+
+ /*
+ * For multicast addresses, have the underlying MAC
+		 * join the corresponding multicast group.
+ */
+ if (addrtype == MAC_ADDRTYPE_MULTICAST) {
+ rc = mip->mi_multicst(mip->mi_driver, B_TRUE, addr);
+ if (rc != 0) {
+ mac_flow_remove(mip->mi_flow_tab,
+ grp->mbg_flow_ent, B_FALSE);
+ mac_flow_wait(grp->mbg_flow_ent,
+ FLOW_DRIVER_UPCALL);
+ FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
+ return (rc);
+ }
+ }
+
+ *last_grp = grp;
+ }
+
+ ASSERT(grp->mbg_addrtype == addrtype);
+
+ /*
+ * Add the MAC client to the list of MAC clients associated
+ * with the group.
+ */
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
+ if (addrtype == MAC_ADDRTYPE_MULTICAST) {
+ /*
+ * We maintain a separate list for each MAC client. Get
+		 * We maintain a separate list for each MAC client. Get
+		 * the entry, or add one if it is not present.
+ last_maddr = &mcip->mci_mcast_addrs;
+ for (mci_maddr = *last_maddr; mci_maddr != NULL;
+ last_maddr = &mci_maddr->mma_next,
+ mci_maddr = mci_maddr->mma_next) {
+ if (bcmp(mci_maddr->mma_addr, addr, addr_len) == 0)
+ break;
+ }
+ if (mci_maddr == NULL) {
+ mci_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+ KM_SLEEP);
+ bcopy(addr, mci_maddr->mma_addr, addr_len);
+ *last_maddr = mci_maddr;
+ }
+ mci_maddr->mma_ref++;
+
+ /*
+ * In case of a driver (say aggr), we also need this
+ * information on a per MAC instance basis.
+ */
+ last_maddr = &mip->mi_mcast_addrs;
+ for (mi_maddr = *last_maddr; mi_maddr != NULL;
+ last_maddr = &mi_maddr->mma_next,
+ mi_maddr = mi_maddr->mma_next) {
+ if (bcmp(mi_maddr->mma_addr, addr, addr_len) == 0)
+ break;
+ }
+ if (mi_maddr == NULL) {
+ mi_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+ KM_SLEEP);
+ bcopy(addr, mi_maddr->mma_addr, addr_len);
+ *last_maddr = mi_maddr;
+ }
+ mi_maddr->mma_ref++;
+ }
+ for (i = 0; i < grp->mbg_nclients_alloc; i++) {
+ /*
+ * The MAC client was already added, say when we have
+ * different unicast addresses with the same vid.
+ * Just increment the ref and we are done.
+ */
+ if (grp->mbg_clients[i].mgb_client == mcip) {
+ grp->mbg_clients[i].mgb_client_ref++;
+ goto add_done;
+ } else if (grp->mbg_clients[i].mgb_client == NULL &&
+ index == -1) {
+ index = i;
+ }
+ }
+ if (grp->mbg_nclients_alloc == grp->mbg_nclients) {
+ mac_bcast_grp_mcip_t *new_clients;
+ uint_t new_size = grp->mbg_nclients+1;
+
+ new_clients = kmem_zalloc(new_size *
+ sizeof (mac_bcast_grp_mcip_t), KM_SLEEP);
+
+ if (grp->mbg_nclients > 0) {
+ ASSERT(grp->mbg_clients != NULL);
+ bcopy(grp->mbg_clients, new_clients, grp->mbg_nclients *
+ sizeof (mac_bcast_grp_mcip_t));
+ kmem_free(grp->mbg_clients, grp->mbg_nclients *
+ sizeof (mac_bcast_grp_mcip_t));
+ }
+
+ grp->mbg_clients = new_clients;
+ grp->mbg_nclients_alloc = new_size;
+ index = new_size - 1;
+ }
+
+ ASSERT(index != -1);
+ grp->mbg_clients[index].mgb_client = mcip;
+ grp->mbg_clients[index].mgb_client_ref = 1;
+ grp->mbg_nclients++;
+ /*
+ * Since we're adding to the list of MAC clients using that group,
+ * kick the generation count, which will allow mac_bcast_send()
+ * to detect that condition after re-acquiring the lock.
+ */
+ grp->mbg_clients_gen++;
+add_done:
+ rw_exit(&mip->mi_rw_lock);
+
+ return (0);
+}
+
+/*
+ * Remove the specified MAC client from the group corresponding to
+ * the specified broadcast or multicast address.
+ *
+ * Note: mac_bcast_delete() calls mac_flow_remove() and mac_flow_wait(),
+ * which may cv_wait for fe_refcnt to drop to 0. So this function
+ * should not be called from interrupt or STREAMS context.
+ */
+void
+mac_bcast_delete(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_bcast_grp_t *grp = NULL, **prev;
+ size_t addr_len = mip->mi_type->mt_addr_length;
+ flow_entry_t *flent;
+ uint_t i;
+ mac_mcast_addrs_t *maddr = NULL;
+ mac_mcast_addrs_t **mprev;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /* find the broadcast group. The list is protected by the perimeter */
+ prev = &mip->mi_bcast_grp;
+ for (grp = mip->mi_bcast_grp; grp != NULL; prev = &grp->mbg_next,
+ grp = grp->mbg_next) {
+ if (bcmp(grp->mbg_addr, addr, addr_len) == 0 &&
+ grp->mbg_vid == vid)
+ break;
+ }
+ ASSERT(grp != NULL);
+
+ /*
+ * Remove the MAC client from the list of MAC clients associated
+ * with that broadcast group.
+ *
+ * We mark the mbg_clients[] location corresponding to the removed MAC
+ * client NULL and reuse that location when we add a new MAC client.
+ */
+
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
+
+ for (i = 0; i < grp->mbg_nclients_alloc; i++) {
+ if (grp->mbg_clients[i].mgb_client == mcip)
+ break;
+ }
+
+ ASSERT(i < grp->mbg_nclients_alloc);
+ /*
+	 * If there are more references to this MAC client, we let it
+	 * remain until the count drops to 0.
+ */
+ if (--grp->mbg_clients[i].mgb_client_ref > 0)
+ goto update_maddr;
+
+ grp->mbg_clients[i].mgb_client = NULL;
+ grp->mbg_clients[i].mgb_client_ref = 0;
+
+ /*
+ * Since we're removing from the list of MAC clients using that group,
+ * kick the generation count, which will allow mac_bcast_send()
+ * to detect that condition.
+ */
+ grp->mbg_clients_gen++;
+
+ if (--grp->mbg_nclients == 0) {
+ /*
+ * The last MAC client of the group was just removed.
+ * Unlink the current group from the list of groups
+ * defined on top of the underlying NIC. The group
+ * structure will stay around until the last reference
+ * is dropped.
+ */
+ *prev = grp->mbg_next;
+ }
+update_maddr:
+ if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
+ mprev = &mcip->mci_mcast_addrs;
+ for (maddr = mcip->mci_mcast_addrs; maddr != NULL;
+ mprev = &maddr->mma_next, maddr = maddr->mma_next) {
+ if (bcmp(grp->mbg_addr, maddr->mma_addr,
+ mip->mi_type->mt_addr_length) == 0)
+ break;
+ }
+ ASSERT(maddr != NULL);
+ if (--maddr->mma_ref == 0) {
+ *mprev = maddr->mma_next;
+ maddr->mma_next = NULL;
+ kmem_free(maddr, sizeof (mac_mcast_addrs_t));
+ }
+
+ mprev = &mip->mi_mcast_addrs;
+ for (maddr = mip->mi_mcast_addrs; maddr != NULL;
+ mprev = &maddr->mma_next, maddr = maddr->mma_next) {
+ if (bcmp(grp->mbg_addr, maddr->mma_addr,
+ mip->mi_type->mt_addr_length) == 0)
+ break;
+ }
+ ASSERT(maddr != NULL);
+ if (--maddr->mma_ref == 0) {
+ *mprev = maddr->mma_next;
+ maddr->mma_next = NULL;
+ kmem_free(maddr, sizeof (mac_mcast_addrs_t));
+ }
+ }
+ rw_exit(&mip->mi_rw_lock);
+
+ /*
+ * If the group itself is being removed, remove the
+ * corresponding flow from the underlying NIC.
+ */
+ flent = grp->mbg_flow_ent;
+ if (grp->mbg_nclients == 0) {
+ mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
+ mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+ FLOW_FINAL_REFRELE(flent);
+ }
+}
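+
+/*
+ * mac_bcast_add()/mac_bcast_delete() calls are reference counted and
+ * must be balanced, e.g. (sketch, error handling elided):
+ *
+ *	(void) mac_bcast_add(mcip, addr, vid, MAC_ADDRTYPE_MULTICAST);
+ *	(void) mac_bcast_add(mcip, addr, vid, MAC_ADDRTYPE_MULTICAST);
+ *	mac_bcast_delete(mcip, addr, vid);	// still a group member
+ *	mac_bcast_delete(mcip, addr, vid);	// last ref: leaves the
+ *						// group, flow is removed
+ */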
+
+/*
+ * This is called by a driver, such as aggr, when a port is added or removed,
+ * to add or remove that port to/from all the multicast addresses for the aggr.
+ */
+void
+mac_bcast_refresh(mac_impl_t *mip, mac_multicst_t refresh_fn, void *arg,
+ boolean_t add)
+{
+ mac_mcast_addrs_t *grp, *next;
+
+ ASSERT(refresh_fn != NULL);
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /*
+ * Walk the multicast address list and call the refresh function for
+ * each address.
+ */
+
+ for (grp = mip->mi_mcast_addrs; grp != NULL; grp = next) {
+ /*
+ * Save the next pointer just in case the refresh
+ * function's action causes the group entry to be
+ * freed.
+ * We won't be adding to this list as part of the
+ * refresh.
+ */
+ next = grp->mma_next;
+ refresh_fn(arg, add, grp->mma_addr);
+ }
+}
+
+/*
+ * Walk the MAC client's multicast address list and add/remove the addr/vid
+ * ('arg' is 'flent') for each of the addresses.
+ */
+void
+mac_client_bcast_refresh(mac_client_impl_t *mcip, mac_multicst_t refresh_fn,
+ void *arg, boolean_t add)
+{
+ mac_mcast_addrs_t *grp, *next;
+ mac_impl_t *mip = mcip->mci_mip;
+
+ ASSERT(refresh_fn != NULL);
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ /*
+ * Walk the multicast address list and call the refresh function for
+ * each address.
+ * Broadcast addresses are not added or removed through the multicast
+ * entry points, so don't include them as part of the refresh.
+ */
+ for (grp = mcip->mci_mcast_addrs; grp != NULL; grp = next) {
+ /*
+ * Save the next pointer just in case the refresh
+ * function's action causes the group entry to be
+ * freed.
+ * We won't be adding to this list as part of the
+ * refresh.
+ */
+ next = grp->mma_next;
+ refresh_fn(arg, add, grp->mma_addr);
+ }
+}
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
new file mode 100644
index 0000000000..bd6b552e67
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -0,0 +1,3763 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * - General Introduction:
+ *
+ * This file contains the implementation of the MAC client kernel
+ * API and related code. The MAC client API allows a kernel module
+ * to gain access to a MAC instance (physical NIC, link aggregation, etc).
+ * It allows a MAC client to associate itself with a MAC address,
+ * VLANs, callback functions for data traffic and for promiscuous mode.
+ * The MAC client API is also used to specify the properties associated
+ * with a MAC client, such as bandwidth limits, priority, CPUs, etc.
+ * These properties are further used to determine the hardware resources
+ * to allocate to the various MAC clients.
+ *
+ * - Primary MAC clients:
+ *
+ * The MAC client API refers to "primary MAC clients". A primary MAC
+ * client is a client which "owns" the primary MAC address of
+ * the underlying MAC instance. The primary MAC address is called out
+ * since it is associated with specific semantics: the primary MAC
+ * address is the MAC address which is assigned to the IP interface
+ * when it is plumbed, and the primary MAC address is assigned
+ * to VLAN data-links. The primary address of a MAC instance can
+ * also change dynamically from under the MAC client, for example
+ * as a result of a change of state of a link aggregation. In that
+ * case the MAC layer automatically updates all data-structures which
+ * refer to the current value of the primary MAC address. Typical
+ * primary MAC clients are dls, aggr, and xnb. A typical non-primary
+ * MAC client is the vnic driver.
+ *
+ * - Virtual Switching:
+ *
+ * The MAC layer implements a virtual switch between the MAC clients
+ * (primary and non-primary) defined on top of the same underlying
+ * NIC (physical, link aggregation, etc). The virtual switch is
+ * VLAN-aware, i.e. it allows multiple MAC clients to be members
+ * of one or more VLANs, and the virtual switch will distribute
+ * multicast tagged packets only to the member of the corresponding
+ * VLANs.
+ *
+ * - Upper vs Lower MAC:
+ *
+ * Creating a VNIC on top of a MAC instance effectively causes
+ * two MAC instances to be layered on top of each other, one for
+ * the VNIC(s), one for the underlying MAC instance (physical NIC,
+ * link aggregation, etc). In the code below we refer to the
+ * underlying NIC as the "lower MAC", and we refer to VNICs as
+ * the "upper MAC".
+ *
+ * - Pass-through for VNICs:
+ *
+ * When VNICs are created on top of an underlying MAC, this causes
+ * a layering of two MAC instances. Since the lower MAC already
+ * does the switching and demultiplexing to its MAC clients, the
+ * upper MAC would simply have to pass packets to the layer below
+ * or above it, which would introduce overhead. In order to avoid
+ * this overhead, the MAC layer implements a pass-through mechanism
+ * for VNICs. When a VNIC opens the lower MAC instance, it saves
+ * the MAC client handle it obtains from the MAC layer. When a MAC
+ * client opens a VNIC (upper MAC), the MAC layer detects that
+ * the MAC being opened is a VNIC, and gets the MAC client handle
+ * that the VNIC driver obtained from the lower MAC. This exchange
+ * is done through a private capability between the MAC layer
+ * and the VNIC driver. The upper MAC then returns that handle
+ * directly to its MAC client. Any operation done by the upper
+ * MAC client is now done on the lower MAC client handle, which
+ * allows the VNIC driver to be completely bypassed for the
+ * performance sensitive data-path.
+ *
+ */
+
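+/*
+ * The pass-through described above essentially reduces to the
+ * following in the client open path (simplified sketch; see
+ * mac_primary_client_handle() below for the actual usage):
+ *
+ *	if (mip->mi_state_flags & MIS_IS_VNIC)
+ *		mcip = mac_vnic_lower(mip);	// use the lower handle
+ */
+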
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/id_space.h>
+#include <sys/esunddi.h>
+#include <sys/stat.h>
+#include <sys/mkdev.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/dlpi.h>
+#include <sys/modhash.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/dls.h>
+#include <sys/dld.h>
+#include <sys/modctl.h>
+#include <sys/fs/dv_node.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/callb.h>
+#include <sys/cpuvar.h>
+#include <sys/atomic.h>
+#include <sys/sdt.h>
+#include <sys/mac_flow.h>
+#include <sys/ddi_intr_impl.h>
+#include <sys/disp.h>
+#include <sys/sdt.h>
+#include <sys/vnic.h>
+#include <sys/vnic_impl.h>
+#include <sys/vlan.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <sys/exacct.h>
+#include <sys/exacct_impl.h>
+#include <inet/nd.h>
+#include <sys/ethernet.h>
+
+kmem_cache_t *mac_client_impl_cache;
+kmem_cache_t *mac_promisc_impl_cache;
+
+static boolean_t mac_client_single_rcvr(mac_client_impl_t *);
+static flow_entry_t *mac_client_swap_mciflent(mac_client_impl_t *);
+static flow_entry_t *mac_client_get_flow(mac_client_impl_t *,
+ mac_unicast_impl_t *);
+static void mac_client_remove_flow_from_list(mac_client_impl_t *,
+ flow_entry_t *);
+static void mac_client_add_to_flow_list(mac_client_impl_t *, flow_entry_t *);
+static void mac_rename_flow_names(mac_client_impl_t *, const char *);
+static void mac_virtual_link_update(mac_impl_t *);
+
+/* ARGSUSED */
+static int
+i_mac_client_impl_ctor(void *buf, void *arg, int kmflag)
+{
+ int i;
+ mac_client_impl_t *mcip = buf;
+
+ bzero(buf, MAC_CLIENT_IMPL_SIZE);
+ mutex_init(&mcip->mci_tx_cb_lock, NULL, MUTEX_DRIVER, NULL);
+ mcip->mci_tx_notify_cb_info.mcbi_lockp = &mcip->mci_tx_cb_lock;
+
+ ASSERT(mac_tx_percpu_cnt >= 0);
+ for (i = 0; i <= mac_tx_percpu_cnt; i++) {
+ mutex_init(&mcip->mci_tx_pcpu[i].pcpu_tx_lock, NULL,
+ MUTEX_DRIVER, NULL);
+ }
+ cv_init(&mcip->mci_tx_cv, NULL, CV_DRIVER, NULL);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+i_mac_client_impl_dtor(void *buf, void *arg)
+{
+ int i;
+ mac_client_impl_t *mcip = buf;
+
+ ASSERT(mcip->mci_promisc_list == NULL);
+ ASSERT(mcip->mci_unicast_list == NULL);
+ ASSERT(mcip->mci_state_flags == 0);
+ ASSERT(mcip->mci_tx_flag == 0);
+
+ mutex_destroy(&mcip->mci_tx_cb_lock);
+
+ ASSERT(mac_tx_percpu_cnt >= 0);
+ for (i = 0; i <= mac_tx_percpu_cnt; i++) {
+ ASSERT(mcip->mci_tx_pcpu[i].pcpu_tx_refcnt == 0);
+ mutex_destroy(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
+ }
+ cv_destroy(&mcip->mci_tx_cv);
+}
+
+/* ARGSUSED */
+static int
+i_mac_promisc_impl_ctor(void *buf, void *arg, int kmflag)
+{
+ mac_promisc_impl_t *mpip = buf;
+
+ bzero(buf, sizeof (mac_promisc_impl_t));
+ mpip->mpi_mci_link.mcb_objp = buf;
+ mpip->mpi_mci_link.mcb_objsize = sizeof (mac_promisc_impl_t);
+ mpip->mpi_mi_link.mcb_objp = buf;
+ mpip->mpi_mi_link.mcb_objsize = sizeof (mac_promisc_impl_t);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+i_mac_promisc_impl_dtor(void *buf, void *arg)
+{
+ mac_promisc_impl_t *mpip = buf;
+
+ ASSERT(mpip->mpi_mci_link.mcb_objp != NULL);
+ ASSERT(mpip->mpi_mci_link.mcb_objsize == sizeof (mac_promisc_impl_t));
+ ASSERT(mpip->mpi_mi_link.mcb_objp == mpip->mpi_mci_link.mcb_objp);
+ ASSERT(mpip->mpi_mi_link.mcb_objsize == sizeof (mac_promisc_impl_t));
+
+ mpip->mpi_mci_link.mcb_objp = NULL;
+ mpip->mpi_mci_link.mcb_objsize = 0;
+ mpip->mpi_mi_link.mcb_objp = NULL;
+ mpip->mpi_mi_link.mcb_objsize = 0;
+
+ ASSERT(mpip->mpi_mci_link.mcb_flags == 0);
+ mpip->mpi_mci_link.mcb_objsize = 0;
+}
+
+void
+mac_client_init(void)
+{
+ ASSERT(mac_tx_percpu_cnt >= 0);
+
+ mac_client_impl_cache = kmem_cache_create("mac_client_impl_cache",
+ MAC_CLIENT_IMPL_SIZE, 0, i_mac_client_impl_ctor,
+ i_mac_client_impl_dtor, NULL, NULL, NULL, 0);
+ ASSERT(mac_client_impl_cache != NULL);
+
+ mac_promisc_impl_cache = kmem_cache_create("mac_promisc_impl_cache",
+ sizeof (mac_promisc_impl_t), 0, i_mac_promisc_impl_ctor,
+ i_mac_promisc_impl_dtor, NULL, NULL, NULL, 0);
+ ASSERT(mac_promisc_impl_cache != NULL);
+}
+
+void
+mac_client_fini(void)
+{
+ kmem_cache_destroy(mac_client_impl_cache);
+ kmem_cache_destroy(mac_promisc_impl_cache);
+}
+
+/*
+ * Return the lower MAC client handle from the VNIC driver for the
+ * specified VNIC MAC instance.
+ */
+mac_client_impl_t *
+mac_vnic_lower(mac_impl_t *mip)
+{
+ mac_capab_vnic_t cap;
+ mac_client_impl_t *mcip;
+
+ VERIFY(i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, &cap));
+ mcip = cap.mcv_mac_client_handle(cap.mcv_arg);
+
+ return (mcip);
+}
+
+/*
+ * Return the MAC client handle of the primary MAC client for the
+ * specified MAC instance, or NULL otherwise.
+ */
+mac_client_impl_t *
+mac_primary_client_handle(mac_impl_t *mip)
+{
+ mac_client_impl_t *mcip;
+
+ if (mip->mi_state_flags & MIS_IS_VNIC)
+ return (mac_vnic_lower(mip));
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ for (mcip = mip->mi_clients_list; mcip != NULL;
+ mcip = mcip->mci_client_next) {
+ if (MCIP_DATAPATH_SETUP(mcip) && mac_is_primary_client(mcip))
+ return (mcip);
+ }
+ return (NULL);
+}
+
+/*
+ * Open a MAC specified by its MAC name.
+ */
+int
+mac_open(const char *macname, mac_handle_t *mhp)
+{
+ mac_impl_t *mip;
+ int err;
+
+ /*
+ * Look up its entry in the global hash table.
+ */
+ if ((err = mac_hold(macname, &mip)) != 0)
+ return (err);
+
+ /*
+ * Hold the dip associated to the MAC to prevent it from being
+ * detached. For a softmac, its underlying dip is held by the
+ * mi_open() callback.
+ *
+ * This is done to be more tolerant with some defective drivers,
+ * which incorrectly handle mac_unregister() failure in their
+ * xxx_detach() routine. For example, some drivers ignore the
+ * failure of mac_unregister() and free all resources that
+	 * are needed for data transmission.
+ */
+ e_ddi_hold_devi(mip->mi_dip);
+
+ if (!(mip->mi_callbacks->mc_callbacks & MC_OPEN)) {
+ *mhp = (mac_handle_t)mip;
+ return (0);
+ }
+
+ /*
+ * The mac perimeter is used in both mac_open and mac_close by the
+ * framework to single thread the MC_OPEN/MC_CLOSE of drivers.
+ */
+ i_mac_perim_enter(mip);
+ mip->mi_oref++;
+ if (mip->mi_oref != 1 || ((err = mip->mi_open(mip->mi_driver)) == 0)) {
+ *mhp = (mac_handle_t)mip;
+ i_mac_perim_exit(mip);
+ return (0);
+ }
+ mip->mi_oref--;
+ ddi_release_devi(mip->mi_dip);
+ mac_rele(mip);
+ i_mac_perim_exit(mip);
+ return (err);
+}
+
+/*
+ * Open a MAC specified by its linkid.
+ */
+int
+mac_open_by_linkid(datalink_id_t linkid, mac_handle_t *mhp)
+{
+ dls_dl_handle_t dlh;
+ int err;
+
+ if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0)
+ return (err);
+
+ dls_devnet_prop_task_wait(dlh);
+
+ err = mac_open(dls_devnet_mac(dlh), mhp);
+
+ dls_devnet_rele_tmp(dlh);
+ return (err);
+}
+
+/*
+ * Open a MAC specified by its link name.
+ */
+int
+mac_open_by_linkname(const char *link, mac_handle_t *mhp)
+{
+ datalink_id_t linkid;
+ int err;
+
+ if ((err = dls_mgmt_get_linkid(link, &linkid)) != 0)
+ return (err);
+ return (mac_open_by_linkid(linkid, mhp));
+}
+
+/*
+ * Close the specified MAC.
+ */
+void
+mac_close(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ i_mac_perim_enter(mip);
+ /*
+ * The mac perimeter is used in both mac_open and mac_close by the
+ * framework to single thread the MC_OPEN/MC_CLOSE of drivers.
+ */
+ if (mip->mi_callbacks->mc_callbacks & MC_OPEN) {
+ ASSERT(mip->mi_oref != 0);
+ if (--mip->mi_oref == 0) {
+ if ((mip->mi_callbacks->mc_callbacks & MC_CLOSE))
+ mip->mi_close(mip->mi_driver);
+ }
+ }
+ i_mac_perim_exit(mip);
+ ddi_release_devi(mip->mi_dip);
+ mac_rele(mip);
+}
+
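+/*
+ * Example: how an in-kernel consumer might bracket its use of a MAC
+ * instance with the open/close entry points above. This is only an
+ * illustrative sketch of the calling convention; the link name and
+ * the function itself are hypothetical.
+ */
+static int
+example_mac_open_close(void)
+{
+	mac_handle_t mh;
+	int err;
+
+	if ((err = mac_open_by_linkname("net0", &mh)) != 0)
+		return (err);
+
+	/* ... use the MAC instance through mh ... */
+
+	mac_close(mh);
+	return (0);
+}
+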
+/*
+ * Misc utility functions to retrieve various information about a MAC
+ * instance or a MAC client.
+ */
+
+const mac_info_t *
+mac_info(mac_handle_t mh)
+{
+ return (&((mac_impl_t *)mh)->mi_info);
+}
+
+dev_info_t *
+mac_devinfo_get(mac_handle_t mh)
+{
+ return (((mac_impl_t *)mh)->mi_dip);
+}
+
+const char *
+mac_name(mac_handle_t mh)
+{
+ return (((mac_impl_t *)mh)->mi_name);
+}
+
+char *
+mac_client_name(mac_client_handle_t mch)
+{
+ return (((mac_client_impl_t *)mch)->mci_name);
+}
+
+minor_t
+mac_minor(mac_handle_t mh)
+{
+ return (((mac_impl_t *)mh)->mi_minor);
+}
+
+/*
+ * Return the VID associated with a MAC client. This function should
+ * be called for clients which are associated with only one VID.
+ */
+uint16_t
+mac_client_vid(mac_client_handle_t mch)
+{
+ uint16_t vid = VLAN_ID_NONE;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ flow_desc_t flow_desc;
+
+ if (mcip->mci_nflents == 0)
+ return (vid);
+
+ ASSERT(MCIP_DATAPATH_SETUP(mcip) && mac_client_single_rcvr(mcip));
+
+ mac_flow_get_desc(mcip->mci_flent, &flow_desc);
+ if ((flow_desc.fd_mask & FLOW_LINK_VID) != 0)
+ vid = flow_desc.fd_vid;
+
+ return (vid);
+}
+
+/*
+ * Return the link speed associated with the specified MAC client.
+ *
+ * The link speed of a MAC client is equal to the smallest value of
+ * 1) the current link speed of the underlying NIC, or
+ * 2) the bandwidth limit set for the MAC client.
+ *
+ * Note that the bandwidth limit can be higher than the speed
+ * of the underlying NIC. This is allowed to avoid spurious
+ * administrative action failures or artificially lowering the
+ * bandwidth limit of a link that may have temporarily lowered
+ * its link speed due to a hardware problem or administrator action.
+ */
+static uint64_t
+mac_client_ifspeed(mac_client_impl_t *mcip)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ uint64_t nic_speed;
+
+ nic_speed = mac_stat_get((mac_handle_t)mip, MAC_STAT_IFSPEED);
+
+ if (nic_speed == 0) {
+ return (0);
+ } else {
+ uint64_t policy_limit = (uint64_t)-1;
+
+ if (MCIP_RESOURCE_PROPS_MASK(mcip) & MRP_MAXBW)
+ policy_limit = MCIP_RESOURCE_PROPS_MAXBW(mcip);
+
+ return (MIN(policy_limit, nic_speed));
+ }
+}
+
+/*
+ * Return the link state of the specified client. If there is more
+ * than one client of the underlying mac_impl_t, the link state
+ * will always be UP regardless of the link state of the underlying
+ * mac_impl_t. This is needed to allow the MAC clients to continue
+ * to communicate with each other even when the physical link of
+ * their mac_impl_t is down.
+ */
+static uint64_t
+mac_client_link_state(mac_client_impl_t *mcip)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ uint16_t vid;
+ mac_client_impl_t *mci_list;
+ mac_unicast_impl_t *mui_list, *oth_mui_list;
+
+ /*
+ * Returns LINK_STATE_UP if there are other MAC clients defined on
+	 * mac_impl_t which share the same VLAN ID as that of mcip. Note
+	 * that if 'mcip' has more than one VID, we match ANY one of its
+	 * VIDs against the other MAC clients' VIDs and return
+	 * LINK_STATE_UP.
+ */
+ rw_enter(&mcip->mci_rw_lock, RW_READER);
+ for (mui_list = mcip->mci_unicast_list; mui_list != NULL;
+ mui_list = mui_list->mui_next) {
+ vid = mui_list->mui_vid;
+ for (mci_list = mip->mi_clients_list; mci_list != NULL;
+ mci_list = mci_list->mci_client_next) {
+ if (mci_list == mcip)
+ continue;
+ for (oth_mui_list = mci_list->mci_unicast_list;
+ oth_mui_list != NULL; oth_mui_list = oth_mui_list->
+ mui_next) {
+ if (vid == oth_mui_list->mui_vid) {
+ rw_exit(&mcip->mci_rw_lock);
+ return (LINK_STATE_UP);
+ }
+ }
+ }
+ }
+ rw_exit(&mcip->mci_rw_lock);
+
+ return (mac_stat_get((mac_handle_t)mip, MAC_STAT_LINK_STATE));
+}
+
+/*
+ * Return the statistics of a MAC client. These statistics are different
+ * from the statistics of the underlying MAC, which are returned by
+ * mac_stat_get().
+ */
+uint64_t
+mac_client_stat_get(mac_client_handle_t mch, uint_t stat)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ uint64_t val;
+
+ switch (stat) {
+ case MAC_STAT_LINK_STATE:
+ val = mac_client_link_state(mcip);
+ break;
+ case MAC_STAT_LINK_UP:
+ val = (mac_client_link_state(mcip) == LINK_STATE_UP);
+ break;
+ case MAC_STAT_PROMISC:
+ val = mac_stat_get((mac_handle_t)mip, MAC_STAT_PROMISC);
+ break;
+ case MAC_STAT_IFSPEED:
+ val = mac_client_ifspeed(mcip);
+ break;
+ case MAC_STAT_MULTIRCV:
+ val = mcip->mci_stat_multircv;
+ break;
+ case MAC_STAT_BRDCSTRCV:
+ val = mcip->mci_stat_brdcstrcv;
+ break;
+ case MAC_STAT_MULTIXMT:
+ val = mcip->mci_stat_multixmt;
+ break;
+ case MAC_STAT_BRDCSTXMT:
+ val = mcip->mci_stat_brdcstxmt;
+ break;
+ case MAC_STAT_OBYTES:
+ val = mcip->mci_stat_obytes;
+ break;
+ case MAC_STAT_OPACKETS:
+ val = mcip->mci_stat_opackets;
+ break;
+ case MAC_STAT_OERRORS:
+ val = mcip->mci_stat_oerrors;
+ break;
+ case MAC_STAT_IPACKETS:
+ val = mcip->mci_stat_ipackets;
+ break;
+ case MAC_STAT_RBYTES:
+ val = mcip->mci_stat_ibytes;
+ break;
+ case MAC_STAT_IERRORS:
+ val = mcip->mci_stat_ierrors;
+ break;
+ default:
+ val = mac_stat_default(mip, stat);
+ break;
+ }
+
+ return (val);
+}
+
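+/*
+ * Example: querying a few client statistics through
+ * mac_client_stat_get(). A hypothetical observer (say, a kstat update
+ * routine) could do the following with a client handle obtained from
+ * mac_client_open(); the probe name is made up.
+ */
+static void
+example_client_stats(mac_client_handle_t mch)
+{
+	boolean_t up;
+	uint64_t speed, opackets;
+
+	up = (mac_client_stat_get(mch, MAC_STAT_LINK_UP) != 0);
+	speed = mac_client_stat_get(mch, MAC_STAT_IFSPEED);
+	opackets = mac_client_stat_get(mch, MAC_STAT_OPACKETS);
+
+	DTRACE_PROBE3(example__client__stats, boolean_t, up,
+	    uint64_t, speed, uint64_t, opackets);
+}
+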
+/*
+ * Return the statistics of the specified MAC instance.
+ */
+uint64_t
+mac_stat_get(mac_handle_t mh, uint_t stat)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ uint64_t val;
+ int ret;
+
+ /*
+ * The range of stat determines where it is maintained. Stat
+ * values from 0 up to (but not including) MAC_STAT_MIN are
+	 * maintained by the mac module itself. Everything else is
+ * maintained by the driver.
+ *
+ * If the mac_impl_t being queried corresponds to a VNIC,
+ * the stats need to be queried from the lower MAC client
+ * corresponding to the VNIC. (The mac_link_update()
+ * invoked by the driver to the lower MAC causes the *lower
+ * MAC* to update its mi_linkstate, and send a notification
+ * to its MAC clients. Due to the VNIC passthrough,
+ * these notifications are sent to the upper MAC clients
+ * of the VNIC directly, and the upper mac_impl_t of the VNIC
+	 * does not have a valid mi_linkstate.)
+ */
+ if (stat < MAC_STAT_MIN && !(mip->mi_state_flags & MIS_IS_VNIC)) {
+ /* these stats are maintained by the mac module itself */
+ switch (stat) {
+ case MAC_STAT_LINK_STATE:
+ return (mip->mi_linkstate);
+ case MAC_STAT_LINK_UP:
+ return (mip->mi_linkstate == LINK_STATE_UP);
+ case MAC_STAT_PROMISC:
+ return (mip->mi_devpromisc != 0);
+ default:
+ ASSERT(B_FALSE);
+ }
+ }
+
+ /*
+ * Call the driver to get the given statistic.
+ */
+ ret = mip->mi_getstat(mip->mi_driver, stat, &val);
+ if (ret != 0) {
+ /*
+ * The driver doesn't support this statistic. Get the
+ * statistic's default value.
+ */
+ val = mac_stat_default(mip, stat);
+ }
+ return (val);
+}
+
+/*
+ * Utility function which returns the VID associated with a flow entry.
+ */
+uint16_t
+i_mac_flow_vid(flow_entry_t *flent)
+{
+ flow_desc_t flow_desc;
+
+ mac_flow_get_desc(flent, &flow_desc);
+
+ if ((flow_desc.fd_mask & FLOW_LINK_VID) != 0)
+ return (flow_desc.fd_vid);
+ return (VLAN_ID_NONE);
+}
+
+/*
+ * Verify the validity of the specified unicast MAC address. Returns B_TRUE
+ * if the address is valid, B_FALSE otherwise (multicast address, or incorrect
+ * length).
+ */
+boolean_t
+mac_unicst_verify(mac_handle_t mh, const uint8_t *addr, uint_t len)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ /*
+ * Verify the address. No lock is needed since mi_type and plugin
+ * details don't change after mac_register().
+ */
+ if ((len != mip->mi_type->mt_addr_length) ||
+ (mip->mi_type->mt_ops.mtops_unicst_verify(addr,
+ mip->mi_pdata)) != 0) {
+ return (B_FALSE);
+ } else {
+ return (B_TRUE);
+ }
+}
+
+void
+mac_sdu_get(mac_handle_t mh, uint_t *min_sdu, uint_t *max_sdu)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ if (min_sdu != NULL)
+ *min_sdu = mip->mi_sdu_min;
+ if (max_sdu != NULL)
+ *max_sdu = mip->mi_sdu_max;
+}
+
+/*
+ * Update the MAC unicast address of the specified client's flows. Currently
+ * only one unicast MAC address is allowed per client.
+ */
+static void
+mac_unicast_update_client_flow(mac_client_impl_t *mcip)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ flow_entry_t *flent = mcip->mci_flent;
+ mac_address_t *map = mcip->mci_unicast;
+ flow_desc_t flow_desc;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ ASSERT(flent != NULL);
+
+ mac_flow_get_desc(flent, &flow_desc);
+ ASSERT(flow_desc.fd_mask & FLOW_LINK_DST);
+
+ bcopy(map->ma_addr, flow_desc.fd_dst_mac, map->ma_len);
+ mac_flow_set_desc(flent, &flow_desc);
+
+ /*
+ * A MAC client could have one MAC address but multiple
+ * VLANs. In that case update the flow entries corresponding
+ * to all VLANs of the MAC client.
+ */
+ for (flent = mcip->mci_flent_list; flent != NULL;
+ flent = flent->fe_client_next) {
+ mac_flow_get_desc(flent, &flow_desc);
+ if (!(flent->fe_type & FLOW_PRIMARY_MAC ||
+ flent->fe_type & FLOW_VNIC_MAC))
+ continue;
+
+ bcopy(map->ma_addr, flow_desc.fd_dst_mac, map->ma_len);
+ mac_flow_set_desc(flent, &flow_desc);
+ }
+}
+
+/*
+ * Update all clients that share the same unicast address.
+ */
+void
+mac_unicast_update_clients(mac_impl_t *mip, mac_address_t *map)
+{
+ mac_client_impl_t *mcip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /*
+ * Find all clients that share the same unicast MAC address and update
+ * them appropriately.
+ */
+ for (mcip = mip->mi_clients_list; mcip != NULL;
+ mcip = mcip->mci_client_next) {
+ /*
+ * Ignore clients that don't share this MAC address.
+ */
+ if (map != mcip->mci_unicast)
+ continue;
+
+ /*
+		 * Update each client that shares this unicast MAC address.
+ */
+ mac_unicast_update_client_flow(mcip);
+ }
+}
+
+/*
+ * Update the unicast MAC address of the specified VNIC MAC client.
+ *
+ * Check whether the operation is valid. Any of following cases should fail:
+ *
+ * 1. It's a VLAN type of VNIC.
+ * 2. The new value is the current "primary" MAC address.
+ * 3. The current MAC address is shared with other clients.
+ * 4. The new MAC address has been used. This case will be valid when
+ * client migration is fully supported.
+ */
+int
+mac_vnic_unicast_set(mac_client_handle_t mch, const uint8_t *addr)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_address_t *map = mcip->mci_unicast;
+ int err;
+
+ ASSERT(!(mip->mi_state_flags & MIS_IS_VNIC));
+ ASSERT(mcip->mci_state_flags & MCIS_IS_VNIC);
+ ASSERT(mcip->mci_flags != MAC_CLIENT_FLAGS_PRIMARY);
+
+ i_mac_perim_enter(mip);
+
+ /*
+ * If this is a VLAN type of VNIC, it's using "primary" MAC address
+ * of the underlying interface. Must fail here. Refer to case 1 above.
+ */
+ if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0) {
+ i_mac_perim_exit(mip);
+ return (ENOTSUP);
+ }
+
+ /*
+ * If the new address is the "primary" one, must fail. Refer to
+ * case 2 above.
+ */
+ if (bcmp(addr, mip->mi_addr, map->ma_len) == 0) {
+ i_mac_perim_exit(mip);
+ return (EACCES);
+ }
+
+ /*
+ * If the address is shared by multiple clients, must fail. Refer
+ * to case 3 above.
+ */
+ if (mac_check_macaddr_shared(map)) {
+ i_mac_perim_exit(mip);
+ return (EBUSY);
+ }
+
+ /*
+ * If the new address has been used, must fail for now. Refer to
+ * case 4 above.
+ */
+ if (mac_find_macaddr(mip, (uint8_t *)addr) != NULL) {
+ i_mac_perim_exit(mip);
+ return (ENOTSUP);
+ }
+
+ /*
+ * Update the MAC address.
+ */
+ err = mac_update_macaddr(map, (uint8_t *)addr);
+
+ if (err != 0) {
+ i_mac_perim_exit(mip);
+ return (err);
+ }
+
+ /*
+ * Update all flows of this MAC client.
+ */
+ mac_unicast_update_client_flow(mcip);
+
+ i_mac_perim_exit(mip);
+ return (0);
+}
+
+/*
+ * Program the new primary unicast address of the specified MAC.
+ *
+ * Function mac_update_macaddr() takes care of the different types of
+ * underlying MAC. If the underlying MAC is a VNIC, the VNIC driver must
+ * have registered an mi_unicst() entry point that indirectly calls
+ * mac_vnic_unicast_set(), which takes care of updating the MAC address
+ * of the corresponding MAC client.
+ *
+ * This is the only interface that allows a client to update the "primary"
+ * MAC address of the underlying MAC. The new value must not already be
+ * in use by another client.
+ */
+int
+mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_address_t *map;
+ int err;
+
+ /* verify the address validity */
+ if (!mac_unicst_verify(mh, addr, mip->mi_type->mt_addr_length))
+ return (EINVAL);
+
+ i_mac_perim_enter(mip);
+
+ /*
+ * If the new value is the same as the current primary address value,
+ * there's nothing to do.
+ */
+ if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) {
+ i_mac_perim_exit(mip);
+ return (0);
+ }
+
+	if (mac_find_macaddr(mip, (uint8_t *)addr) != NULL) {
+ i_mac_perim_exit(mip);
+ return (EBUSY);
+ }
+
+ map = mac_find_macaddr(mip, mip->mi_addr);
+ ASSERT(map != NULL);
+
+ /*
+ * Update the MAC address.
+ */
+ if (mip->mi_state_flags & MIS_IS_AGGR) {
+ mac_capab_aggr_t aggr_cap;
+
+ /*
+		 * If the mac is an aggregation then, in addition to
+		 * programming the unicast address, aggr must be informed
+		 * of this primary unicast address change so that it can
+		 * switch its mac address policy to user-specified.
+ */
+ ASSERT(map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED);
+ VERIFY(i_mac_capab_get(mh, MAC_CAPAB_AGGR, &aggr_cap));
+ err = aggr_cap.mca_unicst(mip->mi_driver, addr);
+ if (err == 0)
+ bcopy(addr, map->ma_addr, map->ma_len);
+ } else {
+ err = mac_update_macaddr(map, (uint8_t *)addr);
+ }
+
+ if (err != 0) {
+ i_mac_perim_exit(mip);
+ return (err);
+ }
+
+ mac_unicast_update_clients(mip, map);
+
+ /*
+ * Save the new primary MAC address in mac_impl_t.
+ */
+ bcopy(addr, mip->mi_addr, mip->mi_type->mt_addr_length);
+
+ i_mac_perim_exit(mip);
+
+ if (err == 0)
+ i_mac_notify(mip, MAC_NOTE_UNICST);
+
+ return (err);
+}
+
+/*
+ * Return the current primary MAC address of the specified MAC.
+ */
+void
+mac_unicast_primary_get(mac_handle_t mh, uint8_t *addr)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+ bcopy(mip->mi_addr, addr, mip->mi_type->mt_addr_length);
+ rw_exit(&mip->mi_rw_lock);
+}
+
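+/*
+ * Example: reading the current primary address before programming a
+ * new one. A hypothetical sketch which assumes Ethernet addressing
+ * (ETHERADDRL bytes); mi_type->mt_addr_length is the authoritative
+ * length for other media.
+ */
+static int
+example_primary_update(mac_handle_t mh, const uint8_t *new_addr)
+{
+	uint8_t cur[ETHERADDRL];
+
+	mac_unicast_primary_get(mh, cur);
+	if (bcmp(cur, new_addr, ETHERADDRL) == 0)
+		return (0);		/* nothing to do */
+	return (mac_unicast_primary_set(mh, new_addr));
+}
+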
+/*
+ * Return information about the use of the primary MAC address of the
+ * specified MAC instance:
+ *
+ * - if client_name is non-NULL, it must point to a string of at
+ * least MAXNAMELEN bytes, and will be set to the name of the MAC
+ * client which uses the primary MAC address.
+ *
+ * - if in_use is non-NULL, used to return whether the primary MAC
+ * address is currently in use.
+ */
+void
+mac_unicast_primary_info(mac_handle_t mh, char *client_name, boolean_t *in_use)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_client_impl_t *cur_client;
+
+ if (in_use != NULL)
+ *in_use = B_FALSE;
+ if (client_name != NULL)
+ bzero(client_name, MAXNAMELEN);
+
+ /*
+	 * The mi_rw_lock is used so that threads that don't hold the
+	 * mac perimeter can still get a consistent view of the
+	 * mi_clients_list. Threads that modify the list must hold both
+	 * the mac perimeter and mi_rw_lock (as RW_WRITER).
+ */
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+ for (cur_client = mip->mi_clients_list; cur_client != NULL;
+ cur_client = cur_client->mci_client_next) {
+ if (mac_is_primary_client(cur_client) ||
+ (mip->mi_state_flags & MIS_IS_VNIC)) {
+ rw_exit(&mip->mi_rw_lock);
+ if (in_use != NULL)
+ *in_use = B_TRUE;
+ if (client_name != NULL) {
+ bcopy(cur_client->mci_name, client_name,
+ MAXNAMELEN);
+ }
+ return;
+ }
+ }
+ rw_exit(&mip->mi_rw_lock);
+}
+
+/*
+ * Add the specified MAC client to the list of clients which opened
+ * the specified MAC.
+ */
+static void
+mac_client_add(mac_client_impl_t *mcip)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /* add VNIC to the front of the list */
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
+ mcip->mci_client_next = mip->mi_clients_list;
+ mip->mi_clients_list = mcip;
+ mip->mi_nclients++;
+ rw_exit(&mip->mi_rw_lock);
+}
+
+/*
+ * Remove the specified MAC client from the list of clients which opened
+ * the specified MAC.
+ */
+static void
+mac_client_remove(mac_client_impl_t *mcip)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_client_impl_t **prev, *cclient;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
+ prev = &mip->mi_clients_list;
+ cclient = *prev;
+ while (cclient != NULL && cclient != mcip) {
+ prev = &cclient->mci_client_next;
+ cclient = *prev;
+ }
+ ASSERT(cclient != NULL);
+ *prev = cclient->mci_client_next;
+ mip->mi_nclients--;
+ rw_exit(&mip->mi_rw_lock);
+}
+
+static mac_unicast_impl_t *
+mac_client_find_vid(mac_client_impl_t *mcip, uint16_t vid)
+{
+ mac_unicast_impl_t *muip = mcip->mci_unicast_list;
+
+ while ((muip != NULL) && (muip->mui_vid != vid))
+ muip = muip->mui_next;
+
+ return (muip);
+}
+
+/*
+ * Return whether the specified (MAC address, VID) tuple is already used by
+ * one of the MAC clients associated with the specified MAC.
+ */
+static boolean_t
+mac_addr_in_use(mac_impl_t *mip, uint8_t *mac_addr, uint16_t vid)
+{
+ mac_client_impl_t *client;
+ mac_address_t *map;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ for (client = mip->mi_clients_list; client != NULL;
+ client = client->mci_client_next) {
+
+ /*
+		 * Ignore clients that don't have a unicast address.
+ */
+ if (client->mci_unicast_list == NULL)
+ continue;
+
+ map = client->mci_unicast;
+
+ if ((bcmp(mac_addr, map->ma_addr, map->ma_len) == 0) &&
+ (mac_client_find_vid(client, vid) != NULL)) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Generate a random MAC address. The MAC address prefix is
+ * stored in the array pointed to by mac_addr, and its length, in bytes,
+ * is specified by prefix_len. The remaining low-order bytes
+ * after the prefix are generated randomly and stored after the prefix
+ * in the mac_addr array.
+ */
+int
+mac_addr_random(mac_client_handle_t mch, uint_t prefix_len,
+ uint8_t *mac_addr, mac_diag_t *diag)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ size_t addr_len = mip->mi_type->mt_addr_length;
+
+ if (prefix_len >= addr_len) {
+ *diag = MAC_DIAG_MACPREFIXLEN_INVALID;
+ return (EINVAL);
+ }
+
+ /* check the prefix value */
+ if (prefix_len > 0) {
+ bzero(mac_addr + prefix_len, addr_len - prefix_len);
+ if (!mac_unicst_verify((mac_handle_t)mip, mac_addr,
+ addr_len)) {
+ *diag = MAC_DIAG_MACPREFIX_INVALID;
+ return (EINVAL);
+ }
+ }
+
+ /* generate the MAC address */
+ if (prefix_len < addr_len) {
+ (void) random_get_pseudo_bytes(mac_addr +
+ prefix_len, addr_len - prefix_len);
+ }
+
+ *diag = 0;
+ return (0);
+}
+
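+/*
+ * Example: generating a random MAC address under a fixed three-byte
+ * prefix. A hypothetical sketch; the locally-administered prefix below
+ * is illustrative only, and the buffer is assumed to be at least
+ * ETHERADDRL bytes.
+ */
+static int
+example_random_mac(mac_client_handle_t mch, uint8_t *addr)
+{
+	mac_diag_t diag;
+
+	addr[0] = 0x02;		/* locally administered, unicast */
+	addr[1] = 0x08;
+	addr[2] = 0x20;
+
+	/* mac_addr_random() fills in the remaining bytes */
+	return (mac_addr_random(mch, 3, addr, &diag));
+}
+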
+/*
+ * Set the priority range for this MAC client. This will be used to
+ * determine the absolute priority for the threads created for this
+ * MAC client using the specified "low", "medium" and "high" level.
+ * This will also be used for any subflows on this MAC client.
+ */
+#define MAC_CLIENT_SET_PRIORITY_RANGE(mcip, pri) { \
+ (mcip)->mci_min_pri = FLOW_MIN_PRIORITY(MINCLSYSPRI, \
+ MAXCLSYSPRI, (pri)); \
+ (mcip)->mci_max_pri = FLOW_MAX_PRIORITY(MINCLSYSPRI, \
+ MAXCLSYSPRI, (mcip)->mci_min_pri); \
+ }
+
+/*
+ * MAC client open entry point. Return a new MAC client handle. Each
+ * MAC client is associated with a name, specified through the 'name'
+ * argument.
+ */
+int
+mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name,
+ uint16_t flags)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_client_impl_t *mcip;
+ int err = 0;
+ boolean_t share_desired =
+ ((flags & MAC_OPEN_FLAGS_SHARES_DESIRED) != 0);
+ boolean_t no_hwrings = ((flags & MAC_OPEN_FLAGS_NO_HWRINGS) != 0);
+ boolean_t req_hwrings = ((flags & MAC_OPEN_FLAGS_REQ_HWRINGS) != 0);
+ flow_entry_t *flent = NULL;
+
+ *mchp = NULL;
+ if (share_desired && no_hwrings) {
+ /* can't have shares but no hardware rings */
+ return (EINVAL);
+ }
+
+ i_mac_perim_enter(mip);
+
+ if (mip->mi_state_flags & MIS_IS_VNIC) {
+ /*
+ * The underlying MAC is a VNIC. Return the MAC client
+ * handle of the lower MAC which was obtained by
+ * the VNIC driver when it did its mac_client_open().
+ */
+
+ mcip = mac_vnic_lower(mip);
+ /*
+ * If there are multiple MAC clients of the VNIC, they
+ * all share the same underlying MAC client handle.
+ */
+ if ((flags & MAC_OPEN_FLAGS_TAG_DISABLE) != 0)
+ mcip->mci_state_flags |= MCIS_TAG_DISABLE;
+
+ if ((flags & MAC_OPEN_FLAGS_STRIP_DISABLE) != 0)
+ mcip->mci_state_flags |= MCIS_STRIP_DISABLE;
+
+ if ((flags & MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK) != 0)
+ mcip->mci_state_flags |= MCIS_DISABLE_TX_VID_CHECK;
+
+ /*
+ * Note that multiple mac clients share the same mcip in
+ * this case.
+ */
+ if (flags & MAC_OPEN_FLAGS_EXCLUSIVE)
+ mcip->mci_state_flags |= MCIS_EXCLUSIVE;
+
+ mip->mi_clients_list = mcip;
+ i_mac_perim_exit(mip);
+ *mchp = (mac_client_handle_t)mcip;
+ return (err);
+ }
+
+ mcip = kmem_cache_alloc(mac_client_impl_cache, KM_SLEEP);
+
+ mcip->mci_mip = mip;
+ mcip->mci_upper_mip = NULL;
+ mcip->mci_rx_fn = mac_pkt_drop;
+ mcip->mci_rx_arg = NULL;
+ mcip->mci_direct_rx_fn = NULL;
+ mcip->mci_direct_rx_arg = NULL;
+
+ if ((flags & MAC_OPEN_FLAGS_IS_VNIC) != 0)
+ mcip->mci_state_flags |= MCIS_IS_VNIC;
+
+ if ((flags & MAC_OPEN_FLAGS_EXCLUSIVE) != 0)
+ mcip->mci_state_flags |= MCIS_EXCLUSIVE;
+
+ if ((flags & MAC_OPEN_FLAGS_IS_AGGR_PORT) != 0)
+ mcip->mci_state_flags |= MCIS_IS_AGGR_PORT;
+
+ if ((flags & MAC_OPEN_FLAGS_TAG_DISABLE) != 0)
+ mcip->mci_state_flags |= MCIS_TAG_DISABLE;
+
+ if ((flags & MAC_OPEN_FLAGS_STRIP_DISABLE) != 0)
+ mcip->mci_state_flags |= MCIS_STRIP_DISABLE;
+
+ if ((flags & MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK) != 0)
+ mcip->mci_state_flags |= MCIS_DISABLE_TX_VID_CHECK;
+
+ if ((flags & MAC_OPEN_FLAGS_USE_DATALINK_NAME) != 0) {
+ datalink_id_t linkid;
+
+ ASSERT(name == NULL);
+ if ((err = dls_devnet_macname2linkid(mip->mi_name,
+ &linkid)) != 0) {
+ goto done;
+ }
+ if ((err = dls_mgmt_get_linkinfo(linkid, mcip->mci_name, NULL,
+ NULL, NULL)) != 0) {
+ /*
+ * Use mac name if dlmgmtd is not available.
+ */
+ if (err == EBADF) {
+ (void) strlcpy(mcip->mci_name, mip->mi_name,
+ sizeof (mcip->mci_name));
+ err = 0;
+ } else {
+ goto done;
+ }
+ }
+ mcip->mci_state_flags |= MCIS_USE_DATALINK_NAME;
+ } else {
+ ASSERT(name != NULL);
+		if (strlen(name) >= MAXNAMELEN) {
+ err = EINVAL;
+ goto done;
+ }
+ (void) strlcpy(mcip->mci_name, name, sizeof (mcip->mci_name));
+ }
+ /* the subflow table will be created dynamically */
+ mcip->mci_subflow_tab = NULL;
+ mcip->mci_stat_multircv = 0;
+ mcip->mci_stat_brdcstrcv = 0;
+ mcip->mci_stat_multixmt = 0;
+ mcip->mci_stat_brdcstxmt = 0;
+
+ mcip->mci_stat_obytes = 0;
+ mcip->mci_stat_opackets = 0;
+ mcip->mci_stat_oerrors = 0;
+ mcip->mci_stat_ibytes = 0;
+ mcip->mci_stat_ipackets = 0;
+ mcip->mci_stat_ierrors = 0;
+
+ /* Create an initial flow */
+
+ err = mac_flow_create(NULL, NULL, mcip->mci_name, NULL,
+ mcip->mci_state_flags & MCIS_IS_VNIC ? FLOW_VNIC_MAC :
+ FLOW_PRIMARY_MAC, &flent);
+ if (err != 0)
+ goto done;
+ mcip->mci_flent = flent;
+ FLOW_MARK(flent, FE_MC_NO_DATAPATH);
+ flent->fe_mcip = mcip;
+ /*
+ * Place initial creation reference on the flow. This reference
+ * is released in the corresponding delete action viz.
+ * mac_unicast_remove after waiting for all transient refs to
+	 * go away. The wait happens in mac_flow_wait.
+ */
+ FLOW_REFHOLD(flent);
+
+ /*
+ * Do this ahead of the mac_bcast_add() below so that the mi_nclients
+ * will have the right value for mac_rx_srs_setup().
+ */
+ mac_client_add(mcip);
+
+ mcip->mci_no_hwrings = no_hwrings;
+ mcip->mci_req_hwrings = req_hwrings;
+ mcip->mci_share = NULL;
+ if (share_desired) {
+ ASSERT(!no_hwrings);
+ i_mac_share_alloc(mcip);
+ }
+
+ DTRACE_PROBE2(mac__client__open__allocated, mac_impl_t *,
+ mcip->mci_mip, mac_client_impl_t *, mcip);
+ *mchp = (mac_client_handle_t)mcip;
+
+ i_mac_perim_exit(mip);
+ return (0);
+
+done:
+ i_mac_perim_exit(mip);
+ mcip->mci_state_flags = 0;
+ mcip->mci_tx_flag = 0;
+ kmem_cache_free(mac_client_impl_cache, mcip);
+ return (err);
+}
+
+/*
+ * Close the specified MAC client handle.
+ */
+void
+mac_client_close(mac_client_handle_t mch, uint16_t flags)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ flow_entry_t *flent;
+
+ i_mac_perim_enter(mip);
+
+ if (flags & MAC_CLOSE_FLAGS_EXCLUSIVE)
+ mcip->mci_state_flags &= ~MCIS_EXCLUSIVE;
+
+ if ((mcip->mci_state_flags & MCIS_IS_VNIC) &&
+ !(flags & MAC_CLOSE_FLAGS_IS_VNIC)) {
+ /*
+ * This is an upper VNIC client initiated operation.
+ * The lower MAC client will be closed by the VNIC driver
+ * when the VNIC is deleted.
+ */
+
+ /*
+ * Clear the flags set when the upper client initiated
+ * open.
+ */
+ mcip->mci_state_flags &= ~(MCIS_TAG_DISABLE |
+ MCIS_STRIP_DISABLE | MCIS_DISABLE_TX_VID_CHECK);
+
+ i_mac_perim_exit(mip);
+ return;
+ }
+
+ /*
+ * Remove the flent associated with the MAC client
+ */
+ flent = mcip->mci_flent;
+ mcip->mci_flent = NULL;
+ FLOW_FINAL_REFRELE(flent);
+
+ /*
+ * MAC clients must remove the unicast addresses and promisc callbacks
+ * they added before issuing a mac_client_close().
+ */
+ ASSERT(mcip->mci_unicast_list == NULL);
+ ASSERT(mcip->mci_promisc_list == NULL);
+ ASSERT(mcip->mci_tx_notify_cb_list == NULL);
+
+ i_mac_share_free(mcip);
+
+ mac_client_remove(mcip);
+
+ i_mac_perim_exit(mip);
+ mcip->mci_subflow_tab = NULL;
+ mcip->mci_state_flags = 0;
+ mcip->mci_tx_flag = 0;
+ kmem_cache_free(mac_client_impl_cache, mch);
+}
+
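+/*
+ * Example: typical MAC client lifecycle using the entry points above.
+ * A hypothetical client opens the MAC by link name, opens a client
+ * handle on it, and tears both down in reverse order. The names are
+ * made up and error handling is abbreviated.
+ */
+static int
+example_client_lifecycle(const char *linkname)
+{
+	mac_handle_t mh;
+	mac_client_handle_t mch;
+	int err;
+
+	if ((err = mac_open_by_linkname(linkname, &mh)) != 0)
+		return (err);
+
+	if ((err = mac_client_open(mh, &mch, "example_client", 0)) != 0) {
+		mac_close(mh);
+		return (err);
+	}
+
+	/* ... add a unicast address, set a receive callback, etc. ... */
+
+	mac_client_close(mch, 0);
+	mac_close(mh);
+	return (0);
+}
+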
+/*
+ * Enable bypass for the specified MAC client.
+ */
+boolean_t
+mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /*
+	 * If the mac_client is a VLAN, or the native media is non-Ethernet,
+	 * we should not do DLS bypass and instead let the packets go via the
+	 * default mac_rx_deliver route so the vlan header can be stripped etc.
+ */
+ if (mcip->mci_nvids > 0 ||
+ mip->mi_info.mi_nativemedia != DL_ETHER)
+ return (B_FALSE);
+
+ /*
+ * These are not accessed directly in the data path, and hence
+ * don't need any protection
+ */
+ mcip->mci_direct_rx_fn = rx_fn;
+ mcip->mci_direct_rx_arg = arg1;
+ mcip->mci_state_flags |= MCIS_CLIENT_POLL_CAPABLE;
+ return (B_TRUE);
+}
+
+/*
+ * Set the receive callback for the specified MAC client. There can be
+ * at most one such callback per MAC client.
+ */
+void
+mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+
+ /*
+ * Instead of adding an extra set of locks and refcnts in
+ * the datapath at the mac client boundary, we temporarily quiesce
+ * the SRS and related entities. We then change the receive function
+ * without interference from any receive data thread and then reenable
+ * the data flow subsequently.
+ */
+ i_mac_perim_enter(mip);
+ mac_rx_client_quiesce(mch);
+
+ mcip->mci_rx_fn = rx_fn;
+ mcip->mci_rx_arg = arg;
+ mac_rx_client_restart(mch);
+ i_mac_perim_exit(mip);
+}
+
+/*
+ * Reset the receive callback for the specified MAC client.
+ */
+void
+mac_rx_clear(mac_client_handle_t mch)
+{
+ mac_rx_set(mch, mac_pkt_drop, NULL);
+}
+
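+/*
+ * Example: installing and clearing a receive callback. A hypothetical
+ * sketch; the mac_rx_t prototype used here (arg, resource handle,
+ * chain, loopback flag) is assumed from the framework headers, and the
+ * callback simply discards whatever it receives.
+ */
+/* ARGSUSED */
+static void
+example_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+    boolean_t loopback)
+{
+	freemsgchain(mp);
+}
+
+static void
+example_rx_install(mac_client_handle_t mch)
+{
+	mac_rx_set(mch, example_rx_cb, NULL);
+
+	/* ... traffic is delivered to example_rx_cb() ... */
+
+	mac_rx_clear(mch);
+}
+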
+/*
+ * Walk the MAC client's subflow table and update the flows' priority values.
+ */
+static int
+mac_update_subflow_priority_cb(flow_entry_t *flent, void *arg)
+{
+ mac_flow_update_priority(arg, flent);
+ return (0);
+}
+
+void
+mac_update_subflow_priority(mac_client_impl_t *mcip)
+{
+ (void) mac_flow_walk(mcip->mci_subflow_tab,
+ mac_update_subflow_priority_cb, mcip);
+}
+
+/*
+ * When the MAC client is being brought up (i.e. we do a unicast_add) we need
+ * to initialize the cpu and resource control structure in the
+ * mac_client_impl_t from the mac_impl_t (i.e. if there are any cached
+ * properties before the flow entry for the unicast address was created).
+ */
+int
+mac_resource_ctl_set(mac_client_handle_t mch, mac_resource_props_t *mrp)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = (mac_impl_t *)mcip->mci_mip;
+ int err = 0;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ err = mac_validate_props(mrp);
+ if (err != 0)
+ return (err);
+
+ mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip), B_FALSE);
+ if (MCIP_DATAPATH_SETUP(mcip)) {
+ /*
+ * We have to set this prior to calling mac_flow_modify.
+ */
+ if (mrp->mrp_mask & MRP_PRIORITY) {
+ if (mrp->mrp_priority == MPL_RESET) {
+ MAC_CLIENT_SET_PRIORITY_RANGE(mcip,
+ MPL_LINK_DEFAULT);
+ } else {
+ MAC_CLIENT_SET_PRIORITY_RANGE(mcip,
+ mrp->mrp_priority);
+ }
+ }
+
+ mac_flow_modify(mip->mi_flow_tab, mcip->mci_flent, mrp);
+ if (mrp->mrp_mask & MRP_PRIORITY)
+ mac_update_subflow_priority(mcip);
+ return (0);
+ }
+ return (0);
+}
+
+void
+mac_resource_ctl_get(mac_client_handle_t mch, mac_resource_props_t *mrp)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
+
+ bcopy(mcip_mrp, mrp, sizeof (mac_resource_props_t));
+}
+
+static int
+mac_unicast_flow_create(mac_client_impl_t *mcip, uint8_t *mac_addr,
+ uint16_t vid, boolean_t is_primary, boolean_t first_flow,
+ flow_entry_t **flent, mac_resource_props_t *mrp)
+{
+ mac_impl_t *mip = (mac_impl_t *)mcip->mci_mip;
+ flow_desc_t flow_desc;
+ char flowname[MAXFLOWNAME];
+ int err;
+ uint_t flent_flags;
+
+ /*
+ * First unicast address being added, create a new flow
+ * for that MAC client.
+ */
+ bzero(&flow_desc, sizeof (flow_desc));
+
+ flow_desc.fd_mac_len = mip->mi_type->mt_addr_length;
+ bcopy(mac_addr, flow_desc.fd_dst_mac, flow_desc.fd_mac_len);
+ flow_desc.fd_mask = FLOW_LINK_DST;
+ if (vid != 0) {
+ flow_desc.fd_vid = vid;
+ flow_desc.fd_mask |= FLOW_LINK_VID;
+ }
+
+ /*
+ * XXX-nicolas. For now I'm keeping the FLOW_PRIMARY_MAC
+ * and FLOW_VNIC. Even though they're a hack inherited
+ * from the SRS code, we'll keep them for now. They're currently
+ * consumed by mac_datapath_setup() to create the SRS.
+ * That code should be eventually moved out of
+ * mac_datapath_setup() and moved to a mac_srs_create()
+ * function of some sort to keep things clean.
+ *
+ * Also, there's no reason why the SRS for the primary MAC
+ * client should be different than any other MAC client. Until
+ * this is cleaned-up, we support only one MAC unicast address
+ * per client.
+ *
+ * We set FLOW_PRIMARY_MAC for the primary MAC address,
+ * FLOW_VNIC for everything else.
+ */
+ if (is_primary)
+ flent_flags = FLOW_PRIMARY_MAC;
+ else
+ flent_flags = FLOW_VNIC_MAC;
+
+ /*
+ * For the first flow we use the mac client's name - mci_name, for
+ * subsequent ones we just create a name with the vid. This is
+ * so that we can add these flows to the same flow table. This is
+ * fine as the flow name (except for the one with the mac client's
+ * name) is not visible. When the first flow is removed, we just replace
+ * its fdesc with another from the list, so we will still retain the
+ * flent with the MAC client's flow name.
+ */
+ if (first_flow) {
+ bcopy(mcip->mci_name, flowname, MAXFLOWNAME);
+ } else {
+ (void) sprintf(flowname, "%s%u", mcip->mci_name, vid);
+ flent_flags = FLOW_NO_STATS;
+ }
+
+ if ((err = mac_flow_create(&flow_desc, mrp, flowname, NULL,
+ flent_flags, flent)) != 0)
+ return (err);
+
+ FLOW_MARK(*flent, FE_INCIPIENT);
+ (*flent)->fe_mcip = mcip;
+
+ /*
+ * Place initial creation reference on the flow. This reference
+ * is released in the corresponding delete action viz.
+ * mac_unicast_remove after waiting for all transient refs to
+	 * go away. The wait happens in mac_flow_wait.
+ * We have already held the reference in mac_client_open().
+ */
+ if (!first_flow)
+ FLOW_REFHOLD(*flent);
+ return (0);
+}
+
+/* Refresh the multicast grouping for this VID. */
+int
+mac_client_update_mcast(void *arg, boolean_t add, const uint8_t *addrp)
+{
+ flow_entry_t *flent = arg;
+ mac_client_impl_t *mcip = flent->fe_mcip;
+ uint16_t vid;
+ flow_desc_t flow_desc;
+
+ mac_flow_get_desc(flent, &flow_desc);
+ vid = (flow_desc.fd_mask & FLOW_LINK_VID) != 0 ?
+ flow_desc.fd_vid : VLAN_ID_NONE;
+
+ /*
+ * We don't call mac_multicast_add()/mac_multicast_remove() as
+ * we want to add/remove for this specific vid.
+ */
+ if (add) {
+ return (mac_bcast_add(mcip, addrp, vid,
+ MAC_ADDRTYPE_MULTICAST));
+ } else {
+ mac_bcast_delete(mcip, addrp, vid);
+ return (0);
+ }
+}
+
+/*
+ * Add a new unicast address to the MAC client.
+ *
+ * The MAC address can be specified either by value, or the MAC client
+ * can specify that it wants to use the primary MAC address of the
+ * underlying MAC. See the introductory comments at the beginning
+ * of this file for more information on primary MAC addresses.
+ *
+ * Note also that the tuple (MAC address, VID) must be unique
+ * for the MAC clients defined on top of the same underlying MAC
+ * instance, unless MAC_UNICAST_NODUPCHECK is specified.
+ */
+int
+i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
+ mac_unicast_handle_t *mah, uint16_t vid, mac_diag_t *diag)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_unicast_impl_t *muip;
+ flow_entry_t *flent;
+ int err;
+ uint_t mac_len = mip->mi_type->mt_addr_length;
+ boolean_t check_dups = !(flags & MAC_UNICAST_NODUPCHECK);
+ boolean_t is_primary = (flags & MAC_UNICAST_PRIMARY);
+ boolean_t is_vnic_primary = flags & MAC_UNICAST_VNIC_PRIMARY;
+ boolean_t bcast_added = B_FALSE;
+ boolean_t nactiveclients_added = B_FALSE;
+ boolean_t mac_started = B_FALSE;
+ mac_resource_props_t mrp;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	/* when VID is non-zero, the underlying MAC cannot be a VNIC */
+ ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != 0)));
+
+ /*
+ * Check whether it's the primary client and flag it.
+ */
+ if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && vid == 0)
+ mcip->mci_flags |= MAC_CLIENT_FLAGS_PRIMARY;
+
+ /*
+ * is_vnic_primary is true when we come here as a VLAN VNIC
+ * which uses the primary mac client's address but with a non-zero
+ * VID. In this case the MAC address is not specified by an upper
+ * MAC client.
+ */
+ if ((mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary &&
+ !is_vnic_primary) {
+ /*
+ * The address is being set by the upper MAC client
+ * of a VNIC. The MAC address was already set by the
+ * VNIC driver during VNIC creation.
+ *
+ * Note: a VNIC has only one MAC address. We return
+ * the MAC unicast address handle of the lower MAC client
+ * corresponding to the VNIC. We allocate a new entry
+ * which is flagged appropriately, so that mac_unicast_remove()
+ * doesn't attempt to free the original entry that
+ * was allocated by the VNIC driver.
+ */
+ ASSERT(mcip->mci_unicast != NULL);
+
+ /*
+ * Ensure that the primary unicast address of the VNIC
+ * is added only once.
+ */
+ if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY)
+ return (EBUSY);
+
+ mcip->mci_flags |= MAC_CLIENT_FLAGS_VNIC_PRIMARY;
+
+ /*
+ * Create a handle for vid 0.
+ */
+ ASSERT(vid == 0);
+ muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP);
+ muip->mui_vid = vid;
+ *mah = (mac_unicast_handle_t)muip;
+ return (0);
+ }
+
+ /* primary MAC clients cannot be opened on top of anchor VNICs */
+ if ((is_vnic_primary || is_primary) &&
+ i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
+ return (ENXIO);
+ }
+
+ /*
+ * Return EBUSY if:
+ * - this is an exclusive active mac client and there already exist
+ * active mac clients, or
+	 * - there already exists an exclusively active mac client.
+ */
+	if (((mcip->mci_state_flags & MCIS_EXCLUSIVE) &&
+	    (mip->mi_nactiveclients != 0)) || (mip->mi_state_flags &
+	    MIS_EXCLUSIVE)) {
+ return (EBUSY);
+ }
+
+ if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
+ mip->mi_state_flags |= MIS_EXCLUSIVE;
+
+ bzero(&mrp, sizeof (mac_resource_props_t));
+ if (is_primary && !(mcip->mci_state_flags & MCIS_IS_VNIC)) {
+ /*
+ * Apply the property cached in the mac_impl_t to the primary
+		 * mac client. If the mac client is a VNIC, its properties
+		 * were already set in the mcip when the VNIC was created.
+ */
+ mac_get_resources((mac_handle_t)mip, &mrp);
+ (void) mac_client_set_resources(mch, &mrp);
+ } else if (mcip->mci_state_flags & MCIS_IS_VNIC) {
+ bcopy(MCIP_RESOURCE_PROPS(mcip), &mrp,
+ sizeof (mac_resource_props_t));
+ }
+
+ muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP);
+ muip->mui_vid = vid;
+
+ if (is_primary || is_vnic_primary) {
+ mac_addr = mip->mi_addr;
+ check_dups = B_TRUE;
+ } else {
+
+ /*
+		 * Verify the validity of the specified MAC address value.
+ */
+ if (!mac_unicst_verify((mac_handle_t)mip, mac_addr, mac_len)) {
+ *diag = MAC_DIAG_MACADDR_INVALID;
+ err = EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Make sure that the specified MAC address is different
+ * than the unicast MAC address of the underlying NIC.
+ */
+ if (check_dups && bcmp(mip->mi_addr, mac_addr, mac_len) == 0) {
+ *diag = MAC_DIAG_MACADDR_NIC;
+ err = EINVAL;
+ goto bail;
+ }
+ }
+
+ /*
+ * Make sure the MAC address is not already used by
+ * another MAC client defined on top of the same
+ * underlying NIC.
+	 * xxx-venu mac_unicast_add doesn't seem to be called
+	 * with MAC_UNICAST_NODUPCHECK currently; if it does
+ * get called we need to do mac_addr_in_use() just
+ * to check for addr_in_use till 6697876 is fixed.
+ */
+ if (check_dups && mac_addr_in_use(mip, mac_addr, vid)) {
+ *diag = MAC_DIAG_MACADDR_INUSE;
+ err = EEXIST;
+ goto bail;
+ }
+
+ if ((err = mac_start(mip)) != 0)
+ goto bail;
+
+ mac_started = B_TRUE;
+
+ /* add the MAC client to the broadcast address group by default */
+ if (mip->mi_type->mt_brdcst_addr != NULL) {
+ err = mac_bcast_add(mcip, mip->mi_type->mt_brdcst_addr, vid,
+ MAC_ADDRTYPE_BROADCAST);
+ if (err != 0)
+ goto bail;
+ bcast_added = B_TRUE;
+ }
+ flent = mcip->mci_flent;
+ ASSERT(flent != NULL);
+ /* We are configuring the unicast flow now */
+ if (!MCIP_DATAPATH_SETUP(mcip)) {
+
+ MAC_CLIENT_SET_PRIORITY_RANGE(mcip,
+ (mrp.mrp_mask & MRP_PRIORITY) ? mrp.mrp_priority :
+ MPL_LINK_DEFAULT);
+
+ if ((err = mac_unicast_flow_create(mcip, mac_addr, vid,
+ is_primary || is_vnic_primary, B_TRUE, &flent, &mrp)) != 0)
+ goto bail;
+
+ mip->mi_nactiveclients++;
+ nactiveclients_added = B_TRUE;
+ /*
+ * This will allocate the RX ring group if possible for the
+ * flow and program the software classifier as needed.
+ */
+ if ((err = mac_datapath_setup(mcip, flent, SRST_LINK)) != 0)
+ goto bail;
+
+ /*
+ * The unicast MAC address must have been added successfully.
+ */
+ ASSERT(mcip->mci_unicast != NULL);
+ } else {
+ mac_address_t *map = mcip->mci_unicast;
+
+ /*
+		 * A unicast flow already exists for that MAC client;
+		 * this flow must be for the same MAC address but with a
+		 * different VID. It has been checked by mac_addr_in_use().
+		 *
+		 * We will use the SRS etc. from the mci_flent. Note that
+		 * we don't need to create a kstat for this as, except for
+		 * the fdesc, everything will be used from the 1st flent.
+ */
+
+ if (bcmp(mac_addr, map->ma_addr, map->ma_len) != 0) {
+ err = EINVAL;
+ goto bail;
+ }
+
+ if ((err = mac_unicast_flow_create(mcip, mac_addr, vid,
+ is_primary || is_vnic_primary, B_FALSE, &flent, NULL)) != 0)
+ goto bail;
+
+ if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0) {
+ FLOW_FINAL_REFRELE(flent);
+ goto bail;
+ }
+
+ /* update the multicast group for this vid */
+ mac_client_bcast_refresh(mcip, mac_client_update_mcast,
+ (void *)flent, B_TRUE);
+ }
+
+ /* populate the shared MAC address */
+ muip->mui_map = mcip->mci_unicast;
+
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+ muip->mui_next = mcip->mci_unicast_list;
+ mcip->mci_unicast_list = muip;
+ rw_exit(&mcip->mci_rw_lock);
+
+ *mah = (mac_unicast_handle_t)muip;
+
+ /* add it to the flow list of this mcip */
+ mac_client_add_to_flow_list(mcip, flent);
+
+ /*
+ * Trigger a renegotiation of the capabilities when the number of
+ * active clients changes from 1 to 2, since some of the capabilities
+ * might have to be disabled. Also send a MAC_NOTE_LINK notification
+ * to all the MAC clients whenever physical link is DOWN.
+ */
+ if (mip->mi_nactiveclients == 2) {
+ mac_capab_update((mac_handle_t)mip);
+ mac_virtual_link_update(mip);
+ }
+ /*
+ * Now that the setup is complete, clear the INCIPIENT flag.
+ * The flag was set to avoid incoming packets seeing inconsistent
+ * structures while the setup was in progress. Clear the mci_tx_flag
+	 * by calling mac_tx_client_unblock. It is possible that
+ * mac_unicast_remove was called prior to this mac_unicast_add which
+ * could have set the MCI_TX_QUIESCE flag.
+ */
+ if (flent->fe_rx_ring_group != NULL)
+ mac_rx_group_unmark(flent->fe_rx_ring_group, MR_INCIPIENT);
+ FLOW_UNMARK(flent, FE_INCIPIENT);
+ FLOW_UNMARK(flent, FE_MC_NO_DATAPATH);
+ mac_tx_client_unblock(mcip);
+ return (0);
+bail:
+ if (bcast_added)
+ mac_bcast_delete(mcip, mip->mi_type->mt_brdcst_addr, vid);
+ if (mac_started)
+ mac_stop(mip);
+
+ if (nactiveclients_added)
+ mip->mi_nactiveclients--;
+ if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
+ mip->mi_state_flags &= ~MIS_EXCLUSIVE;
+ kmem_free(muip, sizeof (mac_unicast_impl_t));
+ return (err);
+}
+
+int
+mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
+ mac_unicast_handle_t *mah, uint16_t vid, mac_diag_t *diag)
+{
+ mac_impl_t *mip = ((mac_client_impl_t *)mch)->mci_mip;
+	int err;
+
+ i_mac_perim_enter(mip);
+ err = i_mac_unicast_add(mch, mac_addr, flags, mah, vid, diag);
+ i_mac_perim_exit(mip);
+
+ return (err);
+}
+
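+/*
+ * Example: adding the primary unicast address on VID 0 and removing it
+ * again. A hypothetical sketch; the diag value gives a finer-grained
+ * failure reason than the errno alone, and the cmn_err() reporting is
+ * purely illustrative.
+ */
+static int
+example_unicast_add_remove(mac_client_handle_t mch)
+{
+	mac_unicast_handle_t mah;
+	mac_diag_t diag;
+	int err;
+
+	err = mac_unicast_add(mch, NULL, MAC_UNICAST_PRIMARY, &mah, 0, &diag);
+	if (err != 0) {
+		cmn_err(CE_NOTE, "!unicast add failed: err %d diag %d",
+		    err, (int)diag);
+		return (err);
+	}
+
+	/* ... the datapath is set up; send and receive traffic ... */
+
+	return (mac_unicast_remove(mch, mah));
+}
+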
+/*
+ * Add the primary MAC address to the MAC client. This is a convenience
+ * function which can be called by primary MAC clients which do not
+ * need to specify any other additional flags.
+ *
+ * It's called in one of following situations:
+ * * dls as the primary MAC client
+ * * aggr as an exclusive client
+ * * by VNIC's client
+ */
+int
+mac_unicast_primary_add(mac_client_handle_t mch, mac_unicast_handle_t *mah,
+ mac_diag_t *diag)
+{
+ return (mac_unicast_add(mch, NULL, MAC_UNICAST_PRIMARY, mah, 0, diag));
+}
+
+/*
+ * Remove a MAC address which was previously added by mac_unicast_add().
+ */
+int
+mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_unicast_impl_t *muip = (mac_unicast_impl_t *)mah;
+ mac_unicast_impl_t *pre;
+ mac_impl_t *mip = mcip->mci_mip;
+ flow_entry_t *flent;
+
+ i_mac_perim_enter(mip);
+ if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY) {
+ /*
+		 * Call made by the upper MAC client of a VNIC.
+ * There's nothing much to do, the unicast address will
+ * be removed by the VNIC driver when the VNIC is deleted,
+ * but let's ensure that all our transmit is done before
+ * the client does a mac_client_stop lest it trigger an
+ * assert in the driver.
+ */
+ ASSERT(muip->mui_vid == 0);
+
+ mac_tx_client_flush(mcip);
+ mcip->mci_flags &= ~MAC_CLIENT_FLAGS_VNIC_PRIMARY;
+
+ kmem_free(muip, sizeof (mac_unicast_impl_t));
+ i_mac_perim_exit(mip);
+ return (0);
+ }
+
+ ASSERT(muip != NULL);
+
+ /*
+ * Remove the VID from the list of client's VIDs.
+ */
+ pre = mcip->mci_unicast_list;
+ if (muip == pre)
+ mcip->mci_unicast_list = muip->mui_next;
+ else {
+ while ((pre->mui_next != NULL) && (pre->mui_next != muip))
+ pre = pre->mui_next;
+ ASSERT(pre->mui_next == muip);
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+ pre->mui_next = muip->mui_next;
+ rw_exit(&mcip->mci_rw_lock);
+ }
+
+ if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && muip->mui_vid == 0)
+ mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY;
+
+ /*
+ * This MAC client is shared, so we will just remove the flent
+ * corresponding to the address being removed. We don't invoke
+ * mac_rx_classify_flow_rem() since the additional flow is
+ * not associated with its own separate set of SRS and rings,
+ * and these constructs are still needed for the remaining flows.
+ */
+ if (!mac_client_single_rcvr(mcip)) {
+ flent = mac_client_get_flow(mcip, muip);
+ ASSERT(flent != NULL);
+
+ /*
+ * The first one is disappearing, need to make sure
+ * we replace it with another from the list of
+ * shared clients.
+ */
+ if (flent == mcip->mci_flent)
+ flent = mac_client_swap_mciflent(mcip);
+ mac_client_remove_flow_from_list(mcip, flent);
+ mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
+ mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+
+ /*
+ * The multicast groups that were added by the client so
+		 * far must be removed from the broadcast domain corresponding
+ * to the VID being removed.
+ */
+ mac_client_bcast_refresh(mcip, mac_client_update_mcast,
+ (void *)flent, B_FALSE);
+
+ if (mip->mi_type->mt_brdcst_addr != NULL) {
+ mac_bcast_delete(mcip, mip->mi_type->mt_brdcst_addr,
+ muip->mui_vid);
+ }
+ mac_stop(mip);
+ FLOW_FINAL_REFRELE(flent);
+ i_mac_perim_exit(mip);
+ return (0);
+ }
+
+ mip->mi_nactiveclients--;
+
+ /* Tear down the Data path */
+ mac_datapath_teardown(mcip, mcip->mci_flent, SRST_LINK);
+
+ /*
+ * Prevent any future access to the flow entry through the mci_flent
+ * pointer by setting the mci_flent to NULL. Access to mci_flent in
+ * mac_bcast_send is also under mi_rw_lock.
+ */
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
+ flent = mcip->mci_flent;
+ mac_client_remove_flow_from_list(mcip, flent);
+
+ if (mcip->mci_state_flags & MCIS_DESC_LOGGED)
+ mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
+
+ /*
+ * This is the last unicast address being removed and there shouldn't
+ * be any outbound data threads at this point coming down from mac
+ * clients. We have waited for the data threads to finish before
+ * starting dld_str_detach. Non-data threads must access TX SRS
+ * under mi_rw_lock.
+ */
+ rw_exit(&mip->mi_rw_lock);
+
+ /*
+ * Update the multicast group for this vid.
+ */
+ mac_client_bcast_refresh(mcip, mac_client_update_mcast, (void *)flent,
+ B_FALSE);
+
+ /*
+ * Don't use FLOW_MARK with FE_MC_NO_DATAPATH, as the flow might
+ * contain other flags, such as FE_CONDEMNED, which we need to
+	 * clear. We don't call mac_flow_cleanup() for this unicast
+	 * flow as we have already cleaned up the SRSs etc. (via the
+	 * teardown path). We just clear the stats and reset the initial
+	 * callback function; the rest will be set when we call
+	 * mac_flow_create, if at all.
+ */
+ mutex_enter(&flent->fe_lock);
+ ASSERT(flent->fe_refcnt == 1 && flent->fe_mbg == NULL &&
+ flent->fe_tx_srs == NULL && flent->fe_rx_srs_cnt == 0);
+ flent->fe_flags = FE_MC_NO_DATAPATH;
+ flow_stat_destroy(flent);
+
+ /* Initialize the receiver function to a safe routine */
+ flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_cb_arg1 = NULL;
+ flent->fe_cb_arg2 = NULL;
+
+ flent->fe_index = -1;
+ mutex_exit(&flent->fe_lock);
+
+ if (mip->mi_type->mt_brdcst_addr != NULL) {
+ mac_bcast_delete(mcip, mip->mi_type->mt_brdcst_addr,
+ muip->mui_vid);
+ }
+
+ if (mip->mi_nactiveclients == 1) {
+ mac_capab_update((mac_handle_t)mip);
+ mac_virtual_link_update(mip);
+ }
+ if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
+ mip->mi_state_flags &= ~MIS_EXCLUSIVE;
+
+ mac_stop(mip);
+
+ i_mac_perim_exit(mip);
+ kmem_free(muip, sizeof (mac_unicast_impl_t));
+ return (0);
+}
+
+/*
+ * Multicast add function invoked by MAC clients.
+ */
+int
+mac_multicast_add(mac_client_handle_t mch, const uint8_t *addr)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ flow_entry_t *flent = mcip->mci_flent_list;
+ flow_entry_t *prev_fe = NULL;
+ uint16_t vid;
+ int err = 0;
+
+ /* Verify the address is a valid multicast address */
+ if ((err = mip->mi_type->mt_ops.mtops_multicst_verify(addr,
+ mip->mi_pdata)) != 0)
+ return (err);
+
+ i_mac_perim_enter(mip);
+ while (flent != NULL) {
+ vid = i_mac_flow_vid(flent);
+
+ err = mac_bcast_add((mac_client_impl_t *)mch, addr, vid,
+ MAC_ADDRTYPE_MULTICAST);
+ if (err != 0)
+ break;
+ prev_fe = flent;
+ flent = flent->fe_client_next;
+ }
+
+ /*
+ * If we failed adding, then undo all, rather than partial
+ * success.
+ */
+ if (flent != NULL && prev_fe != NULL) {
+ flent = mcip->mci_flent_list;
+ while (flent != prev_fe->fe_client_next) {
+ vid = i_mac_flow_vid(flent);
+ mac_bcast_delete((mac_client_impl_t *)mch, addr, vid);
+ flent = flent->fe_client_next;
+ }
+ }
+ i_mac_perim_exit(mip);
+ return (err);
+}
+
+/*
+ * Multicast delete function invoked by MAC clients.
+ */
+void
+mac_multicast_remove(mac_client_handle_t mch, const uint8_t *addr)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ flow_entry_t *flent;
+ uint16_t vid;
+
+ i_mac_perim_enter(mip);
+ for (flent = mcip->mci_flent_list; flent != NULL;
+ flent = flent->fe_client_next) {
+ vid = i_mac_flow_vid(flent);
+ mac_bcast_delete((mac_client_impl_t *)mch, addr, vid);
+ }
+ i_mac_perim_exit(mip);
+}
+
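+/*
+ * Example: joining and leaving a multicast group. A hypothetical
+ * sketch; the all-hosts group address below is illustrative, and
+ * mac_multicast_add() verifies that the address really is multicast
+ * for the MAC's media type.
+ */
+static int
+example_multicast(mac_client_handle_t mch)
+{
+	static const uint8_t grp[ETHERADDRL] =
+	    { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };
+	int err;
+
+	if ((err = mac_multicast_add(mch, grp)) != 0)
+		return (err);
+
+	/* ... receive traffic addressed to the group ... */
+
+	mac_multicast_remove(mch, grp);
+	return (0);
+}
+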
+/*
+ * When a MAC client desires to capture packets on an interface,
+ * it registers a promiscuous call back with mac_promisc_add().
+ * There are three types of promiscuous callbacks:
+ *
+ * * MAC_CLIENT_PROMISC_ALL
+ * Captures all packets sent and received by the MAC client,
+ * the physical interface, as well as all other MAC clients
+ * defined on top of the same MAC.
+ *
+ * * MAC_CLIENT_PROMISC_FILTERED
+ * Captures all packets sent and received by the MAC client,
+ *   plus all multicast traffic sent and received by the physical
+ * interface and the other MAC clients.
+ *
+ * * MAC_CLIENT_PROMISC_MULTI
+ * Captures all broadcast and multicast packets sent and
+ * received by the MAC clients as well as the physical interface.
+ *
+ * In all cases, the underlying MAC is put in promiscuous mode.
+ */
+int
+mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type,
+ mac_rx_t fn, void *arg, mac_promisc_handle_t *mphp, uint16_t flags)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_promisc_impl_t *mpip;
+ mac_cb_info_t *mcbi;
+ int rc;
+
+ i_mac_perim_enter(mip);
+
+ if ((rc = mac_start(mip)) != 0) {
+ i_mac_perim_exit(mip);
+ return (rc);
+ }
+
+ if ((mcip->mci_state_flags & MCIS_IS_VNIC) &&
+ type == MAC_CLIENT_PROMISC_ALL) {
+ /*
+ * The function is being invoked by the upper MAC client
+ * of a VNIC. The VNIC should only see the traffic
+ * it is entitled to.
+ */
+ type = MAC_CLIENT_PROMISC_FILTERED;
+ }
+
+ /*
+ * Turn on promiscuous mode for the underlying NIC.
+ * This is needed even for filtered callbacks which
+ * expect to receive all multicast traffic on the wire.
+ *
+ * Physical promiscuous mode should not be turned on if
+ * MAC_PROMISC_FLAGS_NO_PHYS is set.
+ */
+ if ((flags & MAC_PROMISC_FLAGS_NO_PHYS) == 0) {
+ if ((rc = i_mac_promisc_set(mip, B_TRUE, MAC_DEVPROMISC))
+ != 0) {
+ mac_stop(mip);
+ i_mac_perim_exit(mip);
+ return (rc);
+ }
+ }
+
+ mpip = kmem_cache_alloc(mac_promisc_impl_cache, KM_SLEEP);
+
+ mpip->mpi_type = type;
+ mpip->mpi_fn = fn;
+ mpip->mpi_arg = arg;
+ mpip->mpi_mcip = mcip;
+ mpip->mpi_no_tx_loop = ((flags & MAC_PROMISC_FLAGS_NO_TX_LOOP) != 0);
+ mpip->mpi_no_phys = ((flags & MAC_PROMISC_FLAGS_NO_PHYS) != 0);
+
+ mcbi = &mip->mi_promisc_cb_info;
+ mutex_enter(mcbi->mcbi_lockp);
+
+ mac_callback_add(&mip->mi_promisc_cb_info, &mcip->mci_promisc_list,
+ &mpip->mpi_mci_link);
+ mac_callback_add(&mip->mi_promisc_cb_info, &mip->mi_promisc_list,
+ &mpip->mpi_mi_link);
+
+ mutex_exit(mcbi->mcbi_lockp);
+
+ *mphp = (mac_promisc_handle_t)mpip;
+ i_mac_perim_exit(mip);
+ return (0);
+}
+
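+/*
+ * Example: snooping multicast/broadcast traffic without forcing the
+ * NIC into physical promiscuous mode. A hypothetical sketch which
+ * reuses the example_rx_cb() receive callback sketched earlier.
+ */
+static int
+example_promisc(mac_client_handle_t mch)
+{
+	mac_promisc_handle_t mph;
+	int err;
+
+	err = mac_promisc_add(mch, MAC_CLIENT_PROMISC_MULTI,
+	    example_rx_cb, NULL, &mph, MAC_PROMISC_FLAGS_NO_PHYS);
+	if (err != 0)
+		return (err);
+
+	/* ... observe multicast and broadcast traffic ... */
+
+	return (mac_promisc_remove(mph));
+}
+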
+/*
+ * Remove a promiscuous callback previously added through mac_promisc_add().
+ */
+int
+mac_promisc_remove(mac_promisc_handle_t mph)
+{
+ mac_promisc_impl_t *mpip = (mac_promisc_impl_t *)mph;
+ mac_client_impl_t *mcip = mpip->mpi_mcip;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_cb_info_t *mcbi;
+ int rc = 0;
+
+ i_mac_perim_enter(mip);
+
+ /*
+ * Even if the device can't be reset into normal mode, we still
+ * need to clear the client promisc callbacks. The client may want
+ * to close the mac end point and we can't have stale callbacks.
+ */
+ if (!(mpip->mpi_no_phys)) {
+ rc = mac_promisc_set((mac_handle_t)mip, B_FALSE,
+ MAC_DEVPROMISC);
+ if (rc != 0)
+ goto done;
+ }
+ mcbi = &mip->mi_promisc_cb_info;
+ mutex_enter(mcbi->mcbi_lockp);
+ if (mac_callback_remove(mcbi, &mip->mi_promisc_list,
+ &mpip->mpi_mi_link)) {
+ VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
+ &mcip->mci_promisc_list, &mpip->mpi_mci_link));
+ kmem_cache_free(mac_promisc_impl_cache, mpip);
+ } else {
+ mac_callback_remove_wait(&mip->mi_promisc_cb_info);
+ }
+ mutex_exit(mcbi->mcbi_lockp);
+ mac_stop(mip);
+
+done:
+ i_mac_perim_exit(mip);
+ return (rc);
+}
+
+/*
+ * Reference count the number of active Tx threads. MCI_TX_QUIESCE indicates
+ * that a control operation wants to quiesce the Tx data flow, in which case
+ * we return an error. Holding any of the per cpu locks ensures that the
+ * mci_tx_flag won't change.
+ *
+ * 'CPU' must be accessed just once and used to compute the index into the
+ * percpu array, and that index must be used for the entire duration of the
+ * packet send operation. Note that the thread may be preempted and run on
+ * another cpu any time and so we can't use 'CPU' more than once for the
+ * operation.
+ */
+#define MAC_TX_TRY_HOLD(mcip, mytx, error) \
+{ \
+ (error) = 0; \
+ (mytx) = &(mcip)->mci_tx_pcpu[CPU->cpu_seqid & mac_tx_percpu_cnt]; \
+ mutex_enter(&(mytx)->pcpu_tx_lock); \
+ if (!((mcip)->mci_tx_flag & MCI_TX_QUIESCE)) { \
+ (mytx)->pcpu_tx_refcnt++; \
+ } else { \
+ (error) = -1; \
+ } \
+ mutex_exit(&(mytx)->pcpu_tx_lock); \
+}
+
+/*
+ * Release the reference. If needed, signal any control operation waiting
+ * for Tx quiescence. The wait and signal are always done using the
+ * mci_tx_pcpu[0]'s lock
+ */
+#define MAC_TX_RELE(mcip, mytx) { \
+ mutex_enter(&(mytx)->pcpu_tx_lock); \
+ if (--(mytx)->pcpu_tx_refcnt == 0 && \
+ (mcip)->mci_tx_flag & MCI_TX_QUIESCE) { \
+ mutex_exit(&(mytx)->pcpu_tx_lock); \
+ mutex_enter(&(mcip)->mci_tx_pcpu[0].pcpu_tx_lock); \
+ cv_signal(&(mcip)->mci_tx_cv); \
+ mutex_exit(&(mcip)->mci_tx_pcpu[0].pcpu_tx_lock); \
+ } else { \
+ mutex_exit(&(mytx)->pcpu_tx_lock); \
+ } \
+}
+
+/*
+ * Bump the count of the number of active Tx threads. This is maintained as
+ * a per-CPU counter. On (CMT kind of) machines with a large number of CPUs,
+ * a single mci_tx_lock may become contended. However a count of the total
+ * number of Tx threads per client is needed in order to quiesce the Tx side
+ * prior to reassigning a Tx ring dynamically to another client. The thread
+ * that needs to quiesce the Tx traffic grabs all the percpu locks and checks
+ * the sum of the individual percpu refcnts. Each Tx data thread only grabs
+ * its own percpu lock and increments its own refcnt.
+ */
+void *
+mac_tx_hold(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_tx_percpu_t *mytx;
+ int error;
+
+ MAC_TX_TRY_HOLD(mcip, mytx, error);
+ return (error == 0 ? (void *)mytx : NULL);
+}
+
+void
+mac_tx_rele(mac_client_handle_t mch, void *mytx_handle)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_tx_percpu_t *mytx = mytx_handle;
+
+ MAC_TX_RELE(mcip, mytx)
+}
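+
+/*
+ * Editor's sketch (illustrative only): the per-CPU hold/release scheme above
+ * can be modelled in isolation. The standalone userland program below
+ * (pthreads; all names hypothetical) mimics it: data threads take only
+ * their own slot's lock, while the quiescing thread takes every slot lock,
+ * sets the quiesce flag and sums the per-slot refcnts.
+ */
+#ifdef MAC_TX_PCPU_MODEL	/* standalone userland demo; never compiled here */
+#include <pthread.h>
+
+#define	NSLOTS		4
+#define	XX_QUIESCE	0x1
+
+typedef struct {
+	pthread_mutex_t	lock;
+	int		refcnt;
+} slot_t;
+
+static slot_t slots[NSLOTS];
+static int xx_flags;		/* read under any slot lock, set under all */
+
+static void
+slots_init(void)
+{
+	int i;
+
+	for (i = 0; i < NSLOTS; i++)
+		(void) pthread_mutex_init(&slots[i].lock, NULL);
+}
+
+/* Data path: analogous to MAC_TX_TRY_HOLD; lock one slot only. */
+static int
+tx_hold(int cpu)
+{
+	slot_t *s = &slots[cpu % NSLOTS];
+	int ok;
+
+	(void) pthread_mutex_lock(&s->lock);
+	if ((ok = !(xx_flags & XX_QUIESCE)) != 0)
+		s->refcnt++;
+	(void) pthread_mutex_unlock(&s->lock);
+	return (ok);
+}
+
+/* Analogous to MAC_TX_RELE, without the cv_signal plumbing. */
+static void
+tx_rele(int cpu)
+{
+	slot_t *s = &slots[cpu % NSLOTS];
+
+	(void) pthread_mutex_lock(&s->lock);
+	s->refcnt--;
+	(void) pthread_mutex_unlock(&s->lock);
+}
+
+/* Control path: grab all slot locks, then check the summed refcnt. */
+static int
+tx_quiesce_try(void)
+{
+	int i, sum = 0;
+
+	for (i = 0; i < NSLOTS; i++)
+		(void) pthread_mutex_lock(&slots[i].lock);
+	xx_flags |= XX_QUIESCE;
+	for (i = 0; i < NSLOTS; i++)
+		sum += slots[i].refcnt;
+	for (i = NSLOTS - 1; i >= 0; i--)
+		(void) pthread_mutex_unlock(&slots[i].lock);
+	return (sum == 0);	/* quiesced iff no Tx thread holds a ref */
+}
+#endif	/* MAC_TX_PCPU_MODEL */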
+
+/*
+ * Send function invoked by MAC clients.
+ */
+mac_tx_cookie_t
+mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint,
+ uint16_t flag, mblk_t **ret_mp)
+{
+ mac_tx_cookie_t cookie;
+ int error;
+ mac_tx_percpu_t *mytx;
+ mac_soft_ring_set_t *srs;
+ flow_entry_t *flent;
+ boolean_t is_subflow = B_FALSE;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_srs_tx_t *srs_tx;
+
+ /*
+ * Check whether the active Tx threads count is bumped already.
+ */
+ if (!(flag & MAC_TX_NO_HOLD)) {
+ MAC_TX_TRY_HOLD(mcip, mytx, error);
+ if (error != 0) {
+ freemsgchain(mp_chain);
+ return (NULL);
+ }
+ }
+
+ if (mcip->mci_subflow_tab != NULL &&
+ mcip->mci_subflow_tab->ft_flow_count > 0 &&
+ mac_flow_lookup(mcip->mci_subflow_tab, mp_chain,
+ FLOW_OUTBOUND, &flent) == 0) {
+		/*
+		 * The main assumption here is that in the event we get
+		 * a chain, all the packets will be classified to the
+		 * same flow/SRS. If this changes for any reason, the
+		 * following logic should change as well. The fanout_hint
+		 * presumably relies on the same assumption.
+		 */
+ ASSERT(flent != NULL);
+ is_subflow = B_TRUE;
+ } else {
+ flent = mcip->mci_flent;
+ }
+
+ srs = flent->fe_tx_srs;
+ srs_tx = &srs->srs_tx;
+ if (srs_tx->st_mode == SRS_TX_DEFAULT &&
+ (srs->srs_state & SRS_ENQUEUED) == 0 &&
+ mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL &&
+ mp_chain->b_next == NULL) {
+ uint64_t obytes;
+
+		/*
+		 * Since dls always opens the underlying MAC, an nclients
+		 * value of 1 means that the only active client is dls
+		 * itself, acting as the primary client of the MAC instance.
+		 * Since dls will not send tagged packets in that case, and
+		 * dls is trusted to send packets only for its allowed
+		 * VLAN(s), the VLAN tag insertion and check are required
+		 * only when nclients is greater than 1.
+		 */
+ if (mip->mi_nclients > 1) {
+ if (MAC_VID_CHECK_NEEDED(mcip)) {
+ int err = 0;
+
+ MAC_VID_CHECK(mcip, mp_chain, err);
+ if (err != 0) {
+ freemsg(mp_chain);
+ mcip->mci_stat_oerrors++;
+ goto done;
+ }
+ }
+ if (MAC_TAG_NEEDED(mcip)) {
+ mp_chain = mac_add_vlan_tag(mp_chain, 0,
+ mac_client_vid(mch));
+ if (mp_chain == NULL) {
+ mcip->mci_stat_oerrors++;
+ goto done;
+ }
+ }
+ }
+
+ obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) :
+ msgdsize(mp_chain));
+
+ MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip);
+
+ if (mp_chain == NULL) {
+ cookie = NULL;
+ mcip->mci_stat_obytes += obytes;
+ mcip->mci_stat_opackets += 1;
+ if ((srs->srs_type & SRST_FLOW) != 0) {
+ FLOW_STAT_UPDATE(flent, obytes, obytes);
+ FLOW_STAT_UPDATE(flent, opackets, 1);
+ }
+ } else {
+ mutex_enter(&srs->srs_lock);
+ cookie = mac_tx_srs_no_desc(srs, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&srs->srs_lock);
+ }
+ } else {
+ cookie = srs_tx->st_func(srs, mp_chain, hint, flag, ret_mp);
+ }
+
+done:
+ if (is_subflow)
+ FLOW_REFRELE(flent);
+
+ if (!(flag & MAC_TX_NO_HOLD))
+ MAC_TX_RELE(mcip, mytx);
+
+ return (cookie);
+}
+
+/*
+ * mac_tx_is_flow_blocked
+ *
+ * Given a cookie, return whether the ring identified by the cookie is
+ * flow-controlled (this is not implemented yet). If NULL is passed in
+ * place of a cookie, find out whether any of the underlying rings
+ * belonging to the SRS is flow-controlled and return that status.
+ */
+/* ARGSUSED */
+boolean_t
+mac_tx_is_flow_blocked(mac_client_handle_t mch, mac_tx_cookie_t cookie)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_soft_ring_set_t *mac_srs = MCIP_TX_SRS(mcip);
+ mac_soft_ring_t *sringp;
+ boolean_t blocked = B_FALSE;
+ int i;
+
+	/*
+	 * On etherstubs, there won't be a Tx SRS or an Rx
+	 * SRS. In fact, there won't even be a flow_entry.
+	 */
+ if (mac_srs == NULL)
+ return (B_FALSE);
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (mac_srs->srs_tx.st_mode == SRS_TX_FANOUT) {
+ for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
+ sringp = mac_srs->srs_oth_soft_rings[i];
+ mutex_enter(&sringp->s_ring_lock);
+ if (sringp->s_ring_state & S_RING_TX_HIWAT) {
+ blocked = B_TRUE;
+ mutex_exit(&sringp->s_ring_lock);
+ break;
+ }
+ mutex_exit(&sringp->s_ring_lock);
+ }
+ } else {
+ blocked = (mac_srs->srs_state & SRS_TX_HIWAT);
+ }
+ mutex_exit(&mac_srs->srs_lock);
+ return (blocked);
+}
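+
+/*
+ * Editor's sketch (illustrative only): a hypothetical client combining
+ * mac_tx() with mac_tx_is_flow_blocked(). The xx_* names are invented;
+ * the MAC_TX_NO_ENQUEUE behavior (unsent packets handed back via ret_mp)
+ * is an assumption about the client interface, not verified in this file.
+ */
+#ifdef MAC_TX_EXAMPLE	/* sketch only; never compiled */
+static void
+xx_send(mac_client_handle_t mch, mblk_t *mp)
+{
+	mac_tx_cookie_t cookie;
+	mblk_t *unsent = NULL;
+
+	/* Ask mac_tx() not to queue internally; hand leftovers back. */
+	cookie = mac_tx(mch, mp, 0, MAC_TX_NO_ENQUEUE, &unsent);
+	if (cookie != NULL && mac_tx_is_flow_blocked(mch, cookie)) {
+		/*
+		 * Flow controlled: a real client would stash 'unsent'
+		 * and wait for a Tx update notification before retrying.
+		 */
+	}
+	if (unsent != NULL)
+		freemsgchain(unsent);
+}
+#endif	/* MAC_TX_EXAMPLE */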
+
+/*
+ * Check if the MAC client is the primary MAC client.
+ */
+boolean_t
+mac_is_primary_client(mac_client_impl_t *mcip)
+{
+ return (mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY);
+}
+
+void
+mac_ioctl(mac_handle_t mh, queue_t *wq, mblk_t *bp)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ int cmd = ((struct iocblk *)bp->b_rptr)->ioc_cmd;
+
+ if ((cmd == ND_GET && (mip->mi_callbacks->mc_callbacks & MC_GETPROP)) ||
+ (cmd == ND_SET && (mip->mi_callbacks->mc_callbacks & MC_SETPROP))) {
+		/*
+		 * If ndd properties were registered, hand the ioctl to
+		 * the ndd handler. Note that ndd ioctls are obsolete.
+		 */
+ mac_ndd_ioctl(mip, wq, bp);
+ return;
+ }
+
+ /*
+ * Call the driver to handle the ioctl. The driver may not support
+ * any ioctls, in which case we reply with a NAK on its behalf.
+ */
+ if (mip->mi_callbacks->mc_callbacks & MC_IOCTL)
+ mip->mi_ioctl(mip->mi_driver, wq, bp);
+ else
+ miocnak(wq, bp, 0, EINVAL);
+}
+
+/*
+ * Return the link state of the specified MAC instance.
+ */
+link_state_t
+mac_link_get(mac_handle_t mh)
+{
+ return (((mac_impl_t *)mh)->mi_linkstate);
+}
+
+/*
+ * Add a mac client specified notification callback. Please see the comments
+ * above mac_callback_add() for general information about mac callback
+ * addition/deletion in the presence of mac callback list walkers.
+ */
+mac_notify_handle_t
+mac_notify_add(mac_handle_t mh, mac_notify_t notify_fn, void *arg)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_notify_cb_t *mncb;
+ mac_cb_info_t *mcbi;
+
+ /*
+ * Allocate a notify callback structure, fill in the details and
+ * use the mac callback list manipulation functions to chain into
+ * the list of callbacks.
+ */
+ mncb = kmem_zalloc(sizeof (mac_notify_cb_t), KM_SLEEP);
+ mncb->mncb_fn = notify_fn;
+ mncb->mncb_arg = arg;
+ mncb->mncb_mip = mip;
+ mncb->mncb_link.mcb_objp = mncb;
+ mncb->mncb_link.mcb_objsize = sizeof (mac_notify_cb_t);
+ mncb->mncb_link.mcb_flags = MCB_NOTIFY_CB_T;
+
+ mcbi = &mip->mi_notify_cb_info;
+
+ i_mac_perim_enter(mip);
+ mutex_enter(mcbi->mcbi_lockp);
+
+ mac_callback_add(&mip->mi_notify_cb_info, &mip->mi_notify_cb_list,
+ &mncb->mncb_link);
+
+ mutex_exit(mcbi->mcbi_lockp);
+ i_mac_perim_exit(mip);
+ return ((mac_notify_handle_t)mncb);
+}
+
+void
+mac_notify_remove_wait(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_cb_info_t *mcbi = &mip->mi_notify_cb_info;
+
+ mutex_enter(mcbi->mcbi_lockp);
+ mac_callback_remove_wait(&mip->mi_notify_cb_info);
+ mutex_exit(mcbi->mcbi_lockp);
+}
+
+/*
+ * Remove a mac client specified notification callback
+ */
+int
+mac_notify_remove(mac_notify_handle_t mnh, boolean_t wait)
+{
+ mac_notify_cb_t *mncb = (mac_notify_cb_t *)mnh;
+ mac_impl_t *mip = mncb->mncb_mip;
+ mac_cb_info_t *mcbi;
+ int err = 0;
+
+ mcbi = &mip->mi_notify_cb_info;
+
+ i_mac_perim_enter(mip);
+ mutex_enter(mcbi->mcbi_lockp);
+
+ ASSERT(mncb->mncb_link.mcb_objp == mncb);
+ /*
+	 * If there aren't any list walkers, the remove succeeds
+	 * inline; otherwise we wait for the deferred remove to complete.
+ */
+ if (mac_callback_remove(&mip->mi_notify_cb_info,
+ &mip->mi_notify_cb_list, &mncb->mncb_link)) {
+ kmem_free(mncb, sizeof (mac_notify_cb_t));
+ } else {
+ err = EBUSY;
+ }
+
+ mutex_exit(mcbi->mcbi_lockp);
+ i_mac_perim_exit(mip);
+
+ /*
+ * If we failed to remove the notification callback and "wait" is set
+ * to be B_TRUE, wait for the callback to finish after we exit the
+ * mac perimeter.
+ */
+ if (err != 0 && wait) {
+ mac_notify_remove_wait((mac_handle_t)mip);
+ return (0);
+ }
+
+ return (err);
+}
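+
+/*
+ * Editor's sketch (illustrative only): registering and removing a link
+ * notification callback with the functions above. The xx_* names are
+ * hypothetical, and the mac_notify_t signature of (void *, mac_notify_type_t)
+ * is assumed from how mncb_fn/mncb_arg are stored above.
+ */
+#ifdef MAC_NOTIFY_EXAMPLE	/* sketch only; never compiled */
+static void
+xx_notify(void *arg, mac_notify_type_t type)
+{
+	if (type == MAC_NOTE_LINK) {
+		/* Re-read the link state, e.g. via mac_link_get(). */
+	}
+}
+
+static mac_notify_handle_t
+xx_watch_link(mac_handle_t mh, void *arg)
+{
+	return (mac_notify_add(mh, xx_notify, arg));
+}
+
+static void
+xx_unwatch_link(mac_notify_handle_t mnh)
+{
+	/* wait == B_TRUE: block until any walker drains the callback */
+	(void) mac_notify_remove(mnh, B_TRUE);
+}
+#endif	/* MAC_NOTIFY_EXAMPLE */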
+
+/*
+ * Associate resource management callbacks with the specified MAC
+ * client.
+ */
+void
+mac_resource_set_common(mac_client_handle_t mch, mac_resource_add_t add,
+ mac_resource_remove_t remove, mac_resource_quiesce_t quiesce,
+ mac_resource_restart_t restart, mac_resource_bind_t bind,
+ void *arg)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+ mcip->mci_resource_add = add;
+ mcip->mci_resource_remove = remove;
+ mcip->mci_resource_quiesce = quiesce;
+ mcip->mci_resource_restart = restart;
+ mcip->mci_resource_bind = bind;
+ mcip->mci_resource_arg = arg;
+
+ if (arg == NULL)
+ mcip->mci_state_flags &= ~MCIS_CLIENT_POLL_CAPABLE;
+}
+
+void
+mac_resource_set(mac_client_handle_t mch, mac_resource_add_t add, void *arg)
+{
+ /* update the 'resource_add' callback */
+ mac_resource_set_common(mch, add, NULL, NULL, NULL, NULL, arg);
+}
+
+/*
+ * Set up the client resources and enable the polling interface over all the
+ * SRSes and soft rings of the client.
+ */
+void
+mac_client_poll_enable(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_soft_ring_set_t *mac_srs;
+ flow_entry_t *flent;
+ int i;
+
+ flent = mcip->mci_flent;
+ ASSERT(flent != NULL);
+
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
+ ASSERT(mac_srs->srs_mcip == mcip);
+ mac_srs_client_poll_enable(mcip, mac_srs);
+ }
+}
+
+/*
+ * Tear down the client resources and disable the polling interface over all
+ * the SRSes and soft rings of the client.
+ */
+void
+mac_client_poll_disable(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_soft_ring_set_t *mac_srs;
+ flow_entry_t *flent;
+ int i;
+
+ flent = mcip->mci_flent;
+ ASSERT(flent != NULL);
+
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
+ ASSERT(mac_srs->srs_mcip == mcip);
+ mac_srs_client_poll_disable(mcip, mac_srs);
+ }
+}
+
+/*
+ * Associate the CPUs specified by the given property with a MAC client.
+ */
+int
+mac_cpu_set(mac_client_handle_t mch, mac_resource_props_t *mrp)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ int err = 0;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ if ((err = mac_validate_props(mrp)) != 0)
+ return (err);
+
+ if (MCIP_DATAPATH_SETUP(mcip))
+ mac_flow_modify(mip->mi_flow_tab, mcip->mci_flent, mrp);
+
+ mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip), B_FALSE);
+ return (0);
+}
+
+/*
+ * Apply the specified properties to the specified MAC client.
+ */
+int
+mac_client_set_resources(mac_client_handle_t mch, mac_resource_props_t *mrp)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ int err = 0;
+
+ i_mac_perim_enter(mip);
+
+ if ((mrp->mrp_mask & MRP_MAXBW) || (mrp->mrp_mask & MRP_PRIORITY)) {
+ err = mac_resource_ctl_set(mch, mrp);
+ if (err != 0) {
+ i_mac_perim_exit(mip);
+ return (err);
+ }
+ }
+
+ if (mrp->mrp_mask & MRP_CPUS)
+ err = mac_cpu_set(mch, mrp);
+
+ i_mac_perim_exit(mip);
+ return (err);
+}
+
+/*
+ * Return the properties currently associated with the specified MAC client.
+ */
+void
+mac_client_get_resources(mac_client_handle_t mch, mac_resource_props_t *mrp)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
+
+ bcopy(mcip_mrp, mrp, sizeof (mac_resource_props_t));
+}
+
+/*
+ * Pass a copy of the specified packet to the promiscuous callbacks
+ * of the specified MAC.
+ *
+ * If sender is NULL, the function is being invoked for a packet chain
+ * received from the wire. If sender is non-NULL, it points to
+ * the MAC client from which the packet is being sent.
+ *
+ * The packets are distributed to the promiscuous callbacks as follows:
+ *
+ * - all packets are sent to the MAC_CLIENT_PROMISC_ALL callbacks
+ * - all broadcast and multicast packets are sent to the
+ *   MAC_CLIENT_PROMISC_FILTERED and MAC_CLIENT_PROMISC_MULTI callbacks.
+ *
+ * The unicast packets of MAC_CLIENT_PROMISC_FILTERED callbacks are dispatched
+ * after classification by mac_rx_deliver().
+ */
+
+static void
+mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
+ boolean_t loopback)
+{
+ mblk_t *mp_copy;
+
+ mp_copy = copymsg(mp);
+ if (mp_copy == NULL)
+ return;
+ mp_copy->b_next = NULL;
+
+ mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback);
+}
+
+/*
+ * Return the VID of a packet. Zero if the packet is not tagged.
+ */
+static uint16_t
+mac_ether_vid(mblk_t *mp)
+{
+ struct ether_header *eth = (struct ether_header *)mp->b_rptr;
+
+ if (ntohs(eth->ether_type) == ETHERTYPE_VLAN) {
+ struct ether_vlan_header *t_evhp =
+ (struct ether_vlan_header *)mp->b_rptr;
+ return (VLAN_ID(ntohs(t_evhp->ether_tci)));
+ }
+
+ return (0);
+}
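+
+/*
+ * Editor's sketch (illustrative only): the VID extraction above can be
+ * exercised in isolation. This standalone userland program parses the same
+ * 802.1Q layout -- a 0x8100 TPID at offset 12 followed by a 16-bit TCI
+ * whose low 12 bits are the VID. All names are local to the sketch.
+ */
+#ifdef MAC_VID_DEMO	/* standalone userland demo; never compiled here */
+#include <arpa/inet.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#define	XX_ETHERTYPE_VLAN	0x8100
+#define	XX_VLAN_ID_MASK		0x0fff
+
+/* Return the 802.1Q VID of a raw Ethernet frame, or 0 if untagged. */
+static uint16_t
+frame_vid(const uint8_t *frame, size_t len)
+{
+	uint16_t tpid, tci;
+
+	if (len < 18)	/* dst(6) + src(6) + tpid(2) + tci(2) + type(2) */
+		return (0);
+	(void) memcpy(&tpid, frame + 12, sizeof (tpid));
+	if (ntohs(tpid) != XX_ETHERTYPE_VLAN)
+		return (0);
+	(void) memcpy(&tci, frame + 14, sizeof (tci));
+	return (ntohs(tci) & XX_VLAN_ID_MASK);
+}
+
+int
+main(void)
+{
+	uint8_t frame[18] = { 0 };
+
+	frame[12] = 0x81; frame[13] = 0x00;	/* TPID 0x8100 */
+	frame[14] = 0x20; frame[15] = 0x05;	/* TCI: PCP 1, VID 5 */
+	(void) printf("vid = %u\n", frame_vid(frame, sizeof (frame)));
+	return (0);
+}
+#endif	/* MAC_VID_DEMO */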
+
+/*
+ * Return whether the specified packet contains a multicast or broadcast
+ * destination MAC address.
+ */
+static boolean_t
+mac_is_mcast(mac_impl_t *mip, mblk_t *mp)
+{
+ mac_header_info_t hdr_info;
+
+ if (mac_header_info((mac_handle_t)mip, mp, &hdr_info) != 0)
+ return (B_FALSE);
+ return ((hdr_info.mhi_dsttype == MAC_ADDRTYPE_BROADCAST) ||
+ (hdr_info.mhi_dsttype == MAC_ADDRTYPE_MULTICAST));
+}
+
+/*
+ * Send a copy of an mblk chain to the MAC clients of the specified MAC.
+ * "sender" points to the sender MAC client for outbound packets, and
+ * is set to NULL for inbound packets.
+ */
+void
+mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
+ mac_client_impl_t *sender)
+{
+ mac_promisc_impl_t *mpip;
+ mac_cb_t *mcb;
+ mblk_t *mp;
+ boolean_t is_mcast, is_sender;
+
+ MAC_PROMISC_WALKER_INC(mip);
+ for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
+ is_mcast = mac_is_mcast(mip, mp);
+ /* send packet to interested callbacks */
+ for (mcb = mip->mi_promisc_list; mcb != NULL;
+ mcb = mcb->mcb_nextp) {
+ mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
+ is_sender = (mpip->mpi_mcip == sender);
+
+ if (is_sender && mpip->mpi_no_tx_loop)
+ /*
+ * The sender doesn't want to receive
+ * copies of the packets it sends.
+ */
+ continue;
+
+			/*
+			 * For an ethernet MAC, don't dispatch a multicast
+			 * packet to a non-PROMISC_ALL callback unless the VID
+			 * of the packet matches the VID of the client.
+			 */
+ if (is_mcast &&
+ mpip->mpi_type != MAC_CLIENT_PROMISC_ALL &&
+ !mac_client_check_flow_vid(mpip->mpi_mcip,
+ mac_ether_vid(mp)))
+ continue;
+
+ if (is_sender ||
+ mpip->mpi_type == MAC_CLIENT_PROMISC_ALL ||
+ is_mcast)
+ mac_promisc_dispatch_one(mpip, mp, is_sender);
+ }
+ }
+ MAC_PROMISC_WALKER_DCR(mip);
+}
+
+void
+mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_promisc_impl_t *mpip;
+ boolean_t is_mcast;
+ mblk_t *mp;
+ mac_cb_t *mcb;
+
+ /*
+ * The unicast packets for the MAC client still
+ * need to be delivered to the MAC_CLIENT_PROMISC_FILTERED
+ * promiscuous callbacks. The broadcast and multicast
+ * packets were delivered from mac_rx().
+ */
+ MAC_PROMISC_WALKER_INC(mip);
+ for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
+ is_mcast = mac_is_mcast(mip, mp);
+ for (mcb = mcip->mci_promisc_list; mcb != NULL;
+ mcb = mcb->mcb_nextp) {
+ mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
+ if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED &&
+ !is_mcast) {
+ mac_promisc_dispatch_one(mpip, mp, B_FALSE);
+ }
+ }
+ }
+ MAC_PROMISC_WALKER_DCR(mip);
+}
+
+/*
+ * Return the margin value currently assigned to the specified MAC instance.
+ */
+void
+mac_margin_get(mac_handle_t mh, uint32_t *marginp)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ rw_enter(&(mip->mi_rw_lock), RW_READER);
+ *marginp = mip->mi_margin;
+ rw_exit(&(mip->mi_rw_lock));
+}
+
+/*
+ * mac_info_get() is used for retrieving the mac_info when a DL_INFO_REQ is
+ * issued before a DL_ATTACH_REQ. We walk the i_mac_impl_hash table and find
+ * the first mac_impl_t with a matching driver name; then we copy its mac_info_t
+ * to the caller. We do all this with i_mac_impl_lock held so the mac_impl_t
+ * cannot disappear while we are accessing it.
+ */
+typedef struct i_mac_info_state_s {
+ const char *mi_name;
+ mac_info_t *mi_infop;
+} i_mac_info_state_t;
+
+/*ARGSUSED*/
+static uint_t
+i_mac_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+{
+ i_mac_info_state_t *statep = arg;
+ mac_impl_t *mip = (mac_impl_t *)val;
+
+ if (mip->mi_state_flags & MIS_DISABLED)
+ return (MH_WALK_CONTINUE);
+
+ if (strcmp(statep->mi_name,
+ ddi_driver_name(mip->mi_dip)) != 0)
+ return (MH_WALK_CONTINUE);
+
+ statep->mi_infop = &mip->mi_info;
+ return (MH_WALK_TERMINATE);
+}
+
+boolean_t
+mac_info_get(const char *name, mac_info_t *minfop)
+{
+ i_mac_info_state_t state;
+
+ rw_enter(&i_mac_impl_lock, RW_READER);
+ state.mi_name = name;
+ state.mi_infop = NULL;
+ mod_hash_walk(i_mac_impl_hash, i_mac_info_walker, &state);
+ if (state.mi_infop == NULL) {
+ rw_exit(&i_mac_impl_lock);
+ return (B_FALSE);
+ }
+ *minfop = *state.mi_infop;
+ rw_exit(&i_mac_impl_lock);
+ return (B_TRUE);
+}
+
+/*
+ * Callers that need the capabilities the MAC layer itself cares about, such
+ * as rings, factory MAC addresses, or whether the MAC is a VNIC, should
+ * invoke this function directly.
+ */
+boolean_t
+i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB)
+ return (mip->mi_getcapab(mip->mi_driver, cap, cap_data));
+ else
+ return (B_FALSE);
+}
+
+/*
+ * Capability query function. If the number of active mac clients is greater
+ * than 1, only a limited set of capabilities can be advertised to the caller,
+ * regardless of whether the driver supports them. Otherwise, we query the
+ * driver for the capability.
+ */
+boolean_t
+mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ /*
+ * if mi_nactiveclients > 1, only MAC_CAPAB_HCKSUM,
+ * MAC_CAPAB_NO_NATIVEVLAN, MAC_CAPAB_NO_ZCOPY can be advertised.
+ */
+ if (mip->mi_nactiveclients > 1) {
+ switch (cap) {
+ case MAC_CAPAB_HCKSUM:
+ return (i_mac_capab_get(mh, cap, cap_data));
+ case MAC_CAPAB_NO_NATIVEVLAN:
+ case MAC_CAPAB_NO_ZCOPY:
+ return (B_TRUE);
+ default:
+ return (B_FALSE);
+ }
+ }
+
+ /* else get capab from driver */
+ return (i_mac_capab_get(mh, cap, cap_data));
+}
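+
+/*
+ * Editor's sketch (illustrative only): querying a capability through
+ * mac_capab_get(). The xx_ name is hypothetical, and the assumption that
+ * MAC_CAPAB_HCKSUM fills in a uint32_t of checksum flags follows the usual
+ * mc_getcapab convention rather than anything verified in this file.
+ */
+#ifdef MAC_CAPAB_EXAMPLE	/* sketch only; never compiled */
+static boolean_t
+xx_hw_cksum_ok(mac_handle_t mh)
+{
+	uint32_t hcksum_flags = 0;
+
+	/* With more than one active client, only a limited set passes. */
+	return (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &hcksum_flags));
+}
+#endif	/* MAC_CAPAB_EXAMPLE */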
+
+boolean_t
+mac_sap_verify(mac_handle_t mh, uint32_t sap, uint32_t *bind_sap)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ return (mip->mi_type->mt_ops.mtops_sap_verify(sap, bind_sap,
+ mip->mi_pdata));
+}
+
+mblk_t *
+mac_header(mac_handle_t mh, const uint8_t *daddr, uint32_t sap, mblk_t *payload,
+ size_t extra_len)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ return (mip->mi_type->mt_ops.mtops_header(mip->mi_addr, daddr, sap,
+ mip->mi_pdata, payload, extra_len));
+}
+
+int
+mac_header_info(mac_handle_t mh, mblk_t *mp, mac_header_info_t *mhip)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ return (mip->mi_type->mt_ops.mtops_header_info(mp, mip->mi_pdata,
+ mhip));
+}
+
+mblk_t *
+mac_header_cook(mac_handle_t mh, mblk_t *mp)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_COOK) {
+ if (DB_REF(mp) > 1) {
+ mblk_t *newmp = copymsg(mp);
+ if (newmp == NULL)
+ return (NULL);
+ freemsg(mp);
+ mp = newmp;
+ }
+ return (mip->mi_type->mt_ops.mtops_header_cook(mp,
+ mip->mi_pdata));
+ }
+ return (mp);
+}
+
+mblk_t *
+mac_header_uncook(mac_handle_t mh, mblk_t *mp)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_UNCOOK) {
+ if (DB_REF(mp) > 1) {
+ mblk_t *newmp = copymsg(mp);
+ if (newmp == NULL)
+ return (NULL);
+ freemsg(mp);
+ mp = newmp;
+ }
+ return (mip->mi_type->mt_ops.mtops_header_uncook(mp,
+ mip->mi_pdata));
+ }
+ return (mp);
+}
+
+uint_t
+mac_addr_len(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ return (mip->mi_type->mt_addr_length);
+}
+
+/* True if a MAC is a VNIC */
+boolean_t
+mac_is_vnic(mac_handle_t mh)
+{
+ return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC);
+}
+
+mac_handle_t
+mac_get_lower_mac_handle(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ ASSERT(mac_is_vnic(mh));
+ return (((vnic_t *)mip->mi_driver)->vn_lower_mh);
+}
+
+void
+mac_update_resources(mac_resource_props_t *nmrp, mac_resource_props_t *cmrp,
+ boolean_t is_user_flow)
+{
+ if (nmrp != NULL && cmrp != NULL) {
+ if (nmrp->mrp_mask & MRP_PRIORITY) {
+ if (nmrp->mrp_priority == MPL_RESET) {
+ cmrp->mrp_mask &= ~MRP_PRIORITY;
+ if (is_user_flow) {
+ cmrp->mrp_priority =
+ MPL_SUBFLOW_DEFAULT;
+ } else {
+ cmrp->mrp_priority = MPL_LINK_DEFAULT;
+ }
+ } else {
+ cmrp->mrp_mask |= MRP_PRIORITY;
+ cmrp->mrp_priority = nmrp->mrp_priority;
+ }
+ }
+ if (nmrp->mrp_mask & MRP_MAXBW) {
+ cmrp->mrp_maxbw = nmrp->mrp_maxbw;
+ if (nmrp->mrp_maxbw == MRP_MAXBW_RESETVAL)
+ cmrp->mrp_mask &= ~MRP_MAXBW;
+ else
+ cmrp->mrp_mask |= MRP_MAXBW;
+ }
+ if (nmrp->mrp_mask & MRP_CPUS)
+ MAC_COPY_CPUS(nmrp, cmrp);
+ }
+}
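+
+/*
+ * Editor's note, worked example for the merge above: if the new props set
+ * MRP_MAXBW with mrp_maxbw == MRP_MAXBW_RESETVAL, the cached copy keeps the
+ * value but clears MRP_MAXBW from its mask, so the bandwidth limit reverts
+ * to "unset"; any other value both caches the limit and sets the mask bit.
+ * MRP_PRIORITY behaves the same way via MPL_RESET.
+ */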
+
+/*
+ * i_mac_set_resources:
+ *
+ * This routine associates properties with the primary MAC client of
+ * the specified MAC instance.
+ * - Cache the properties in mac_impl_t
+ * - Apply the properties to the primary MAC client, if one exists
+ */
+int
+i_mac_set_resources(mac_handle_t mh, mac_resource_props_t *mrp)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_client_impl_t *mcip;
+ int err = 0;
+ mac_resource_props_t tmrp;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ err = mac_validate_props(mrp);
+ if (err != 0)
+ return (err);
+
+ /*
+ * Since bind_cpu may be modified by mac_client_set_resources()
+ * we use a copy of bind_cpu and finally cache bind_cpu in mip.
+ * This allows us to cache only user edits in mip.
+ */
+ bcopy(mrp, &tmrp, sizeof (mac_resource_props_t));
+ mcip = mac_primary_client_handle(mip);
+ if (mcip != NULL) {
+ err =
+ mac_client_set_resources((mac_client_handle_t)mcip, &tmrp);
+ }
+ /* if mac_client_set_resources failed, do not update the values */
+ if (err == 0)
+ mac_update_resources(mrp, &mip->mi_resource_props, B_FALSE);
+ return (err);
+}
+
+int
+mac_set_resources(mac_handle_t mh, mac_resource_props_t *mrp)
+{
+ int err;
+
+ i_mac_perim_enter((mac_impl_t *)mh);
+ err = i_mac_set_resources(mh, mrp);
+ i_mac_perim_exit((mac_impl_t *)mh);
+ return (err);
+}
+
+/*
+ * Get the properties cached for the specified MAC instance.
+ */
+void
+mac_get_resources(mac_handle_t mh, mac_resource_props_t *mrp)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_client_impl_t *mcip;
+
+ if (mip->mi_state_flags & MIS_IS_VNIC) {
+ mcip = mac_primary_client_handle(mip);
+ if (mcip != NULL) {
+ mac_client_get_resources((mac_client_handle_t)mcip,
+ mrp);
+ return;
+ }
+ }
+ bcopy(&mip->mi_resource_props, mrp, sizeof (mac_resource_props_t));
+}
+
+/*
+ * Rename a mac client, its flow, and the kstat.
+ */
+int
+mac_rename_primary(mac_handle_t mh, const char *new_name)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_client_impl_t *cur_clnt = NULL;
+ flow_entry_t *fep;
+
+ i_mac_perim_enter(mip);
+
+ /*
+ * VNICs: we need to change the sys flow name and
+ * the associated flow kstat.
+ */
+ if (mip->mi_state_flags & MIS_IS_VNIC) {
+ ASSERT(new_name != NULL);
+ mac_rename_flow_names(mac_vnic_lower(mip), new_name);
+ goto done;
+ }
+ /*
+ * This mac may itself be an aggr link, or it may have some client
+ * which is an aggr port. For both cases, we need to change the
+ * aggr port's mac client name, its flow name and the associated flow
+ * kstat.
+ */
+ if (mip->mi_state_flags & MIS_IS_AGGR) {
+ mac_capab_aggr_t aggr_cap;
+ mac_rename_fn_t rename_fn;
+ boolean_t ret;
+
+ ASSERT(new_name != NULL);
+ ret = i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR,
+ (void *)(&aggr_cap));
+ ASSERT(ret == B_TRUE);
+ rename_fn = aggr_cap.mca_rename_fn;
+ rename_fn(new_name, mip->mi_driver);
+ /*
+ * The aggr's client name and kstat flow name will be
+ * updated below, i.e. via mac_rename_flow_names.
+ */
+ }
+
+ for (cur_clnt = mip->mi_clients_list; cur_clnt != NULL;
+ cur_clnt = cur_clnt->mci_client_next) {
+ if (cur_clnt->mci_state_flags & MCIS_IS_AGGR_PORT) {
+ if (new_name != NULL) {
+ char *str_st = cur_clnt->mci_name;
+ char *str_del = strchr(str_st, '-');
+
+ ASSERT(str_del != NULL);
+ bzero(str_del + 1, MAXNAMELEN -
+ (str_del - str_st + 1));
+ bcopy(new_name, str_del + 1,
+ strlen(new_name));
+ }
+ fep = cur_clnt->mci_flent;
+ mac_rename_flow(fep, cur_clnt->mci_name);
+ break;
+ } else if (new_name != NULL &&
+ cur_clnt->mci_state_flags & MCIS_USE_DATALINK_NAME) {
+ mac_rename_flow_names(cur_clnt, new_name);
+ break;
+ }
+ }
+
+done:
+ i_mac_perim_exit(mip);
+ return (0);
+}
+
+/*
+ * Rename the MAC client's flow names
+ */
+static void
+mac_rename_flow_names(mac_client_impl_t *mcip, const char *new_name)
+{
+ flow_entry_t *flent;
+ uint16_t vid;
+ char flowname[MAXFLOWNAME];
+ mac_impl_t *mip = mcip->mci_mip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /*
+ * Use mi_rw_lock to ensure that threads not in the mac perimeter
+ * see a self-consistent value for mci_name
+ */
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
+ (void) strlcpy(mcip->mci_name, new_name, sizeof (mcip->mci_name));
+ rw_exit(&mip->mi_rw_lock);
+
+ mac_rename_flow(mcip->mci_flent, new_name);
+
+ if (mcip->mci_nflents == 1)
+ return;
+
+ /*
+ * We have to rename all the others too, no stats to destroy for
+ * these.
+ */
+ for (flent = mcip->mci_flent_list; flent != NULL;
+ flent = flent->fe_client_next) {
+ if (flent != mcip->mci_flent) {
+ vid = i_mac_flow_vid(flent);
+ (void) sprintf(flowname, "%s%u", new_name, vid);
+ mac_flow_set_name(flent, flowname);
+ }
+ }
+}
+
+/*
+ * Add a flow to the MAC client's flow list, i.e. the list of MAC/VID tuples
+ * defined for the specified MAC client.
+ */
+static void
+mac_client_add_to_flow_list(mac_client_impl_t *mcip, flow_entry_t *flent)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ /*
+ * The promisc Rx data path walks the mci_flent_list. Protect by
+	 * using mci_rw_lock.
+ */
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+
+ /* Add it to the head */
+ flent->fe_client_next = mcip->mci_flent_list;
+ mcip->mci_flent_list = flent;
+ mcip->mci_nflents++;
+
+ /*
+	 * Keep track of the number of non-zero VIDs per MAC
+ * client to avoid figuring it out in the data-path.
+ */
+ if (i_mac_flow_vid(flent) != VLAN_ID_NONE)
+ mcip->mci_nvids++;
+
+ rw_exit(&mcip->mci_rw_lock);
+}
+
+/*
+ * Remove a flow entry from the MAC client's list.
+ */
+static void
+mac_client_remove_flow_from_list(mac_client_impl_t *mcip, flow_entry_t *flent)
+{
+ flow_entry_t *fe = mcip->mci_flent_list;
+ flow_entry_t *prev_fe = NULL;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ /*
+ * The promisc Rx data path walks the mci_flent_list. Protect by
+ * using mci_rw_lock
+ */
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+ while ((fe != NULL) && (fe != flent)) {
+ prev_fe = fe;
+ fe = fe->fe_client_next;
+ }
+
+ /* XXX should be an ASSERT */
+ if (fe != NULL) {
+ if (prev_fe == NULL) {
+ /* Deleting the first node */
+ mcip->mci_flent_list = fe->fe_client_next;
+ } else {
+ prev_fe->fe_client_next = fe->fe_client_next;
+ }
+ mcip->mci_nflents--;
+
+ if (i_mac_flow_vid(flent) != VLAN_ID_NONE)
+ mcip->mci_nvids--;
+ }
+ rw_exit(&mcip->mci_rw_lock);
+}
+
+/*
+ * Check if the given VID belongs to this MAC client.
+ */
+boolean_t
+mac_client_check_flow_vid(mac_client_impl_t *mcip, uint16_t vid)
+{
+ flow_entry_t *flent;
+ uint16_t mci_vid;
+
+ /* The mci_flent_list is protected by mci_rw_lock */
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+ for (flent = mcip->mci_flent_list; flent != NULL;
+ flent = flent->fe_client_next) {
+ mci_vid = i_mac_flow_vid(flent);
+ if (vid == mci_vid) {
+ rw_exit(&mcip->mci_rw_lock);
+ return (B_TRUE);
+ }
+ }
+ rw_exit(&mcip->mci_rw_lock);
+ return (B_FALSE);
+}
+
+/*
+ * Get the flow entry for the specified <MAC addr, VID> tuple.
+ */
+static flow_entry_t *
+mac_client_get_flow(mac_client_impl_t *mcip, mac_unicast_impl_t *muip)
+{
+ mac_address_t *map = mcip->mci_unicast;
+ flow_entry_t *flent;
+ uint16_t vid;
+ flow_desc_t flow_desc;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+ mac_flow_get_desc(mcip->mci_flent, &flow_desc);
+ if (bcmp(flow_desc.fd_dst_mac, map->ma_addr, map->ma_len) != 0)
+ return (NULL);
+
+ for (flent = mcip->mci_flent_list; flent != NULL;
+ flent = flent->fe_client_next) {
+ vid = i_mac_flow_vid(flent);
+ if (vid == muip->mui_vid) {
+ return (flent);
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Since mci_flent has the SRSs, when we want to remove it, we replace
+ * the flow_desc_t in mci_flent with that of an existing flent and then
+ * remove that flent instead of mci_flent.
+ */
+static flow_entry_t *
+mac_client_swap_mciflent(mac_client_impl_t *mcip)
+{
+ flow_entry_t *flent = mcip->mci_flent;
+ flow_tab_t *ft = flent->fe_flow_tab;
+ flow_entry_t *flent1;
+ flow_desc_t fl_desc;
+ char fl_name[MAXFLOWNAME];
+ int err;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ ASSERT(mcip->mci_nflents > 1);
+
+ /* get the next flent following the primary flent */
+ flent1 = mcip->mci_flent_list->fe_client_next;
+ ASSERT(flent1 != NULL && flent1->fe_flow_tab == ft);
+
+ /*
+ * Remove the flent from the flow table before updating the
+ * flow descriptor as the hash depends on the flow descriptor.
+ * This also helps incoming packet classification avoid having
+ * to grab fe_lock. Access to fe_flow_desc of a flent not in the
+ * flow table is done under the fe_lock so that log or stat functions
+ * see a self-consistent fe_flow_desc. The name and desc are specific
+ * to a flow, the rest are shared by all the clients, including
+ * resource control etc.
+ */
+ mac_flow_remove(ft, flent, B_TRUE);
+ mac_flow_remove(ft, flent1, B_TRUE);
+
+ bcopy(&flent->fe_flow_desc, &fl_desc, sizeof (flow_desc_t));
+ bcopy(flent->fe_flow_name, fl_name, MAXFLOWNAME);
+
+ /* update the primary flow entry */
+ mutex_enter(&flent->fe_lock);
+ bcopy(&flent1->fe_flow_desc, &flent->fe_flow_desc,
+ sizeof (flow_desc_t));
+ bcopy(&flent1->fe_flow_name, &flent->fe_flow_name, MAXFLOWNAME);
+ mutex_exit(&flent->fe_lock);
+
+ /* update the flow entry that is to be freed */
+ mutex_enter(&flent1->fe_lock);
+ bcopy(&fl_desc, &flent1->fe_flow_desc, sizeof (flow_desc_t));
+ bcopy(fl_name, &flent1->fe_flow_name, MAXFLOWNAME);
+ mutex_exit(&flent1->fe_lock);
+
+ /* now reinsert the flow entries in the table */
+ err = mac_flow_add(ft, flent);
+ ASSERT(err == 0);
+
+ err = mac_flow_add(ft, flent1);
+ ASSERT(err == 0);
+
+ return (flent1);
+}
+
+/*
+ * Return whether there is only one flow entry associated with this
+ * MAC client.
+ */
+static boolean_t
+mac_client_single_rcvr(mac_client_impl_t *mcip)
+{
+ return (mcip->mci_nflents == 1);
+}
+
+int
+mac_validate_props(mac_resource_props_t *mrp)
+{
+ if (mrp == NULL)
+ return (0);
+
+ if (mrp->mrp_mask & MRP_PRIORITY) {
+ mac_priority_level_t pri = mrp->mrp_priority;
+
+ if (pri < MPL_LOW || pri > MPL_RESET)
+ return (EINVAL);
+ }
+
+ if (mrp->mrp_mask & MRP_MAXBW) {
+ uint64_t maxbw = mrp->mrp_maxbw;
+
+ if (maxbw < MRP_MAXBW_MINVAL && maxbw != 0)
+ return (EINVAL);
+ }
+ if (mrp->mrp_mask & MRP_CPUS) {
+ int i;
+ mac_cpu_mode_t fanout;
+
+ if (mrp->mrp_ncpus > ncpus || mrp->mrp_ncpus > MAX_SR_FANOUT)
+ return (EINVAL);
+
+ for (i = 0; i < mrp->mrp_ncpus; i++) {
+ cpu_t *cp;
+ int rv;
+
+ mutex_enter(&cpu_lock);
+ cp = cpu_get(mrp->mrp_cpu[i]);
+ if (cp != NULL)
+ rv = cpu_is_online(cp);
+ else
+ rv = 0;
+ mutex_exit(&cpu_lock);
+ if (rv == 0)
+ return (EINVAL);
+ }
+
+ fanout = mrp->mrp_fanout_mode;
+ if (fanout < 0 || fanout > MCM_CPUS)
+ return (EINVAL);
+ }
+ return (0);
+}
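+
+/*
+ * Editor's sketch (illustrative only): driving mac_validate_props()
+ * directly. The xx_ name is hypothetical; the mrp_* fields are exactly
+ * the ones the validation code above inspects.
+ */
+#ifdef MAC_PROPS_EXAMPLE	/* sketch only; never compiled */
+static int
+xx_check_two_cpu_binding(void)
+{
+	mac_resource_props_t mrp;
+
+	bzero(&mrp, sizeof (mrp));
+	mrp.mrp_mask = MRP_CPUS;
+	mrp.mrp_ncpus = 2;
+	mrp.mrp_cpu[0] = 0;
+	mrp.mrp_cpu[1] = 1;
+	mrp.mrp_fanout_mode = MCM_CPUS;
+
+	/* EINVAL if either CPU is offline or the fanout mode is bogus. */
+	return (mac_validate_props(&mrp));
+}
+#endif	/* MAC_PROPS_EXAMPLE */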
+
+/*
+ * Send a MAC_NOTE_LINK notification to all the MAC clients whenever the
+ * underlying physical link is down. This is to allow MAC clients to
+ * communicate with other clients.
+ */
+void
+mac_virtual_link_update(mac_impl_t *mip)
+{
+ if (mip->mi_linkstate != LINK_STATE_UP)
+ i_mac_notify(mip, MAC_NOTE_LINK);
+}
+
+/*
+ * For clients that have a pass-thru MAC, e.g. VNIC, we set the VNIC's
+ * mac handle in the client.
+ */
+void
+mac_set_upper_mac(mac_client_handle_t mch, mac_handle_t mh)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+ mcip->mci_upper_mip = (mac_impl_t *)mh;
+}
+
+/*
+ * Mark the mac as being used exclusively by the single mac client that is
+ * doing some control operation on this mac. No further opens of this mac
+ * will be allowed until this client calls mac_unmark_exclusive. The mac
+ * client calling this function must already be in the mac perimeter
+ */
+int
+mac_mark_exclusive(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ ASSERT(MAC_PERIM_HELD(mh));
+ /*
+ * Look up its entry in the global hash table.
+ */
+ rw_enter(&i_mac_impl_lock, RW_WRITER);
+ if (mip->mi_state_flags & MIS_DISABLED) {
+ rw_exit(&i_mac_impl_lock);
+ return (ENOENT);
+ }
+
+ /*
+ * A reference to mac is held even if the link is not plumbed.
+ * In i_dls_link_create() we open the MAC interface and hold the
+ * reference. There is an additional reference for the mac_open
+ * done in acquiring the mac perimeter
+ */
+ if (mip->mi_ref != 2) {
+ rw_exit(&i_mac_impl_lock);
+ return (EBUSY);
+ }
+
+ ASSERT(!(mip->mi_state_flags & MIS_EXCLUSIVE_HELD));
+ mip->mi_state_flags |= MIS_EXCLUSIVE_HELD;
+ rw_exit(&i_mac_impl_lock);
+ return (0);
+}
+
+void
+mac_unmark_exclusive(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ ASSERT(MAC_PERIM_HELD(mh));
+
+ rw_enter(&i_mac_impl_lock, RW_WRITER);
+ /* 1 for the creation and another for the perimeter */
+ ASSERT(mip->mi_ref == 2 && (mip->mi_state_flags & MIS_EXCLUSIVE_HELD));
+ mip->mi_state_flags &= ~MIS_EXCLUSIVE_HELD;
+ rw_exit(&i_mac_impl_lock);
+}
+
+/*
+ * Set the MTU for the specified device. The function returns EBUSY if
+ * another MAC client prevents the caller from becoming the exclusive client.
+ * Returns EAGAIN if the MAC instance is started.
+ */
+int
+mac_set_mtu(mac_handle_t mh, uint_t new_mtu, uint_t *old_mtu_arg)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ uint_t old_mtu;
+ int rv;
+ boolean_t exclusive = B_FALSE;
+
+ i_mac_perim_enter(mip);
+
+ if ((mip->mi_callbacks->mc_callbacks & MC_SETPROP) == 0 ||
+ (mip->mi_callbacks->mc_callbacks & MC_GETPROP) == 0) {
+ rv = ENOTSUP;
+ goto bail;
+ }
+
+ if ((rv = mac_mark_exclusive(mh)) != 0)
+ goto bail;
+ exclusive = B_TRUE;
+
+ if (mip->mi_active > 0) {
+ /*
+		 * The MAC instance is started, for example due to the
+		 * presence of a promiscuous client. Fail the operation
+ * since the MAC's MTU cannot be changed while the NIC
+ * is started.
+ */
+ rv = EAGAIN;
+ goto bail;
+ }
+
+ mac_sdu_get(mh, NULL, &old_mtu);
+
+ if (old_mtu != new_mtu) {
+ rv = mip->mi_callbacks->mc_setprop(mip->mi_driver,
+ "mtu", MAC_PROP_MTU, sizeof (uint_t), &new_mtu);
+ }
+
+bail:
+ if (exclusive)
+ mac_unmark_exclusive(mh);
+ i_mac_perim_exit(mip);
+
+ if (rv == 0 && old_mtu_arg != NULL)
+ *old_mtu_arg = old_mtu;
+ return (rv);
+}
+
+void
+mac_get_hwgrp_info(mac_handle_t mh, int grp_index, uint_t *grp_num,
+ uint_t *n_rings, uint_t *type, uint_t *n_clnts, char *clnts_name)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_grp_client_t *mcip;
+ uint_t i = 0, index = 0;
+
+ /* Revisit when we implement fully dynamic group allocation */
+ ASSERT(grp_index >= 0 && grp_index < mip->mi_rx_group_count);
+
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+ *grp_num = mip->mi_rx_groups[grp_index].mrg_index;
+ *type = mip->mi_rx_groups[grp_index].mrg_type;
+ *n_rings = mip->mi_rx_groups[grp_index].mrg_cur_count;
+ for (mcip = mip->mi_rx_groups[grp_index].mrg_clients; mcip != NULL;
+ mcip = mcip->mgc_next) {
+ int name_len = strlen(mcip->mgc_client->mci_name);
+
+ /*
+ * MAXCLIENTNAMELEN is the buffer size reserved for client
+ * names.
+		 * XXXX Formatting the client name string needs to be moved
+		 * to user land when fixing the size of dhi_clnts in
+		 * dld_hwgrpinfo_t. We should use n_clients * client_name for
+		 * dhi_clnts instead of MAXCLIENTNAMELEN.
+ */
+ if (index + name_len >= MAXCLIENTNAMELEN) {
+ index = MAXCLIENTNAMELEN;
+ break;
+ }
+ bcopy(mcip->mgc_client->mci_name, &(clnts_name[index]),
+ name_len);
+ index += name_len;
+ clnts_name[index++] = ',';
+ i++;
+ }
+
+ /* Get rid of the last , */
+ if (index > 0)
+ clnts_name[index - 1] = '\0';
+ *n_clnts = i;
+ rw_exit(&mip->mi_rw_lock);
+}
+
+uint_t
+mac_hwgrp_num(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ return (mip->mi_rx_group_count);
+}
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
new file mode 100644
index 0000000000..f265e53f13
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -0,0 +1,3347 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ip_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
+
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/mac_flow_impl.h>
+
+static void mac_srs_soft_rings_signal(mac_soft_ring_set_t *, uint_t);
+static void mac_srs_update_fanout_list(mac_soft_ring_set_t *);
+static void mac_srs_poll_unbind(mac_soft_ring_set_t *);
+static void mac_srs_worker_unbind(mac_soft_ring_set_t *);
+static void mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *, uint_t);
+
+static int mac_srs_cpu_setup(cpu_setup_t, int, void *);
+static void mac_srs_worker_bind(mac_soft_ring_set_t *, processorid_t);
+static void mac_srs_poll_bind(mac_soft_ring_set_t *, processorid_t);
+static void mac_srs_threads_unbind(mac_soft_ring_set_t *);
+static void mac_srs_add_glist(mac_soft_ring_set_t *);
+static void mac_srs_remove_glist(mac_soft_ring_set_t *);
+static void mac_srs_fanout_list_free(mac_soft_ring_set_t *);
+static void mac_soft_ring_remove(mac_soft_ring_set_t *, mac_soft_ring_t *);
+
+static int mac_compute_soft_ring_count(flow_entry_t *, int);
+static void mac_walk_srs_and_bind(int);
+static void mac_walk_srs_and_unbind(int);
+
+extern mac_group_t *mac_reserve_rx_group(mac_client_impl_t *, uint8_t *,
+ mac_rx_group_reserve_type_t);
+extern void mac_release_rx_group(mac_client_impl_t *, mac_group_t *);
+
+extern boolean_t mac_latency_optimize;
+
+static kmem_cache_t *mac_srs_cache;
+kmem_cache_t *mac_soft_ring_cache;
+
+/*
+ * The duration in msec we wait before signalling the soft ring
+ * worker thread in case packets get queued.
+ */
+static uint32_t mac_soft_ring_worker_wait = 0;
+
+/*
+ * Need to set mac_soft_ring_max_q_cnt based on bandwidth and perhaps latency.
+ * Large values could end up consuming a lot of system memory and cause a
+ * system hang.
+ */
+static int mac_soft_ring_max_q_cnt = 1024;
+static int mac_soft_ring_min_q_cnt = 256;
+static int mac_soft_ring_poll_thres = 16;
+
+/*
+ * Default number of Tx rings to be assigned to a MAC client.
+ * If fewer than 'mac_tx_ring_count' Tx rings are available, then
+ * as many as are available will be assigned to the newly created MAC client.
+ * If no Tx rings are available, then the MAC client(s) will be assigned the
+ * default Tx ring, which can be shared among multiple MAC clients.
+ */
+static uint32_t mac_tx_ring_count = 8;
+static boolean_t mac_tx_serialize = B_FALSE;
+
+/*
+ * mac_tx_srs_hiwat is the queue depth threshold at which callers of
+ * mac_tx() will be notified of the flow control condition.
+ *
+ * TCP does not honor the flow control condition sent up by mac_tx().
+ * Thus provision is made for TCP to allow more packets to be queued
+ * in the SRS, up to a maximum of mac_tx_srs_max_q_cnt.
+ *
+ * Note that mac_tx_srs_hiwat is always less than
+ * mac_tx_srs_max_q_cnt.
+ */
+static uint32_t mac_tx_srs_max_q_cnt = 100000;
+static uint32_t mac_tx_srs_hiwat = 1000;
+
+/*
+ * mac_rx_soft_ring_count, mac_rx_soft_ring_10gig_count:
+ *
+ * Global tunables that determine the number of soft rings to be used for
+ * fanning out incoming traffic on a link. These counts will be used only
+ * when no explicit set of CPUs was assigned to the data-links.
+ *
+ * mac_rx_soft_ring_count tunable will come into effect only if
+ * mac_soft_ring_enable is set. mac_soft_ring_enable is turned on by
+ * default only for sun4v platforms.
+ *
+ * mac_rx_soft_ring_10gig_count will come into effect if you are running on a
+ * 10Gbps link and is not dependent upon mac_soft_ring_enable.
+ *
+ * The number of soft rings for fanout for a link or a flow is determined
+ * by mac_compute_soft_ring_count() routine. This routine will take into
+ * account mac_soft_ring_enable, mac_rx_soft_ring_count and
+ * mac_rx_soft_ring_10gig_count to determine the soft ring count for a link.
+ *
+ * If a bandwidth is specified, the determination of the number of soft
+ * rings is based on the specified bandwidth, CPU speed, and number of CPUs in
+ * the system.
+ */
+static uint_t mac_rx_soft_ring_count = 8;
+static uint_t mac_rx_soft_ring_10gig_count = 8;
+
+/*
+ * Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added
+ * to mac_srs_g_list and mac_srs_g_lock protects mac_srs_g_list. The
+ * list is used to walk the list of all MAC threads when a CPU is
+ * coming online or going offline.
+ */
+static mac_soft_ring_set_t *mac_srs_g_list = NULL;
+static krwlock_t mac_srs_g_lock;
+
+/*
+ * Whether the SRS threads should be bound, or not.
+ */
+static boolean_t mac_srs_thread_bind = B_TRUE;
+
+/*
+ * CPU to fall back to, used by mac_next_bind_cpu().
+ */
+static processorid_t srs_bind_cpu = 0;
+
+/*
+ * Possible setting for soft_ring_process_flag is
+ * 0 or ST_RING_WORKER_ONLY.
+ */
+static int soft_ring_process_flag = ST_RING_WORKER_ONLY;
+
+/*
+ * If CPU bindings are specified by the user, then the Tx SRS and its soft
+ * rings should also be bound to the CPUs specified by the user. The
+ * CPUs for Tx bindings are at the end of the CPU list provided by
+ * the user. If enough CPUs are not available (for Tx and Rx
+ * SRSes), then the CPUs are shared by both Tx and Rx SRSes.
+ */
+#define BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp) { \
+ processorid_t cpuid; \
+ int i, j; \
+ mac_soft_ring_t *softring; \
+ \
+ cpuid = mrp->mrp_cpu[mrp->mrp_ncpus - 1]; \
+ mac_srs_worker_bind(mac_tx_srs, cpuid); \
+ if (TX_MULTI_RING_MODE(mac_tx_srs)) { \
+ j = mrp->mrp_ncpus - 1; \
+ for (i = 0; \
+ i < mac_tx_srs->srs_oth_ring_count; i++, j--) { \
+ if (j < 0) \
+ j = mrp->mrp_ncpus - 1; \
+ cpuid = mrp->mrp_cpu[j]; \
+ softring = mac_tx_srs->srs_oth_soft_rings[i]; \
+ (void) mac_soft_ring_bind(softring, cpuid); \
+ } \
+ } \
+}
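+
+/*
+ * Editor's note, worked example: with mrp_ncpus = 3 (CPU ids c0, c1, c2)
+ * and 5 Tx soft rings, the macro above binds the Tx SRS worker to c2 and
+ * then walks j backwards with wrap-around, binding the soft rings to
+ * c2, c1, c0, c2 and c1 respectively.
+ */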
+
+/* INIT and FINI ROUTINES */
+
+void
+mac_soft_ring_init(void)
+{
+ mac_soft_ring_cache = kmem_cache_create("mac_soft_ring_cache",
+ sizeof (mac_soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
+
+ mac_srs_cache = kmem_cache_create("mac_srs_cache",
+ sizeof (mac_soft_ring_set_t),
+ 64, NULL, NULL, NULL, NULL, NULL, 0);
+
+ rw_init(&mac_srs_g_lock, NULL, RW_DEFAULT, NULL);
+ mutex_enter(&cpu_lock);
+ register_cpu_setup_func(mac_srs_cpu_setup, NULL);
+ mutex_exit(&cpu_lock);
+}
+
+void
+mac_soft_ring_finish(void)
+{
+ mutex_enter(&cpu_lock);
+ unregister_cpu_setup_func(mac_srs_cpu_setup, NULL);
+ mutex_exit(&cpu_lock);
+ rw_destroy(&mac_srs_g_lock);
+ kmem_cache_destroy(mac_soft_ring_cache);
+ kmem_cache_destroy(mac_srs_cache);
+}
+
+static void
+mac_srs_soft_rings_free(mac_soft_ring_set_t *mac_srs, boolean_t release_tx_ring)
+{
+ mac_soft_ring_t *softring, *next, *head;
+
+ /*
+	 * Synchronize with mac_walk_srs_and_bind/unbind, which are callbacks from
+ * DR. The callbacks from DR are called with cpu_lock held, and hence
+ * can't wait to grab the mac perimeter. The soft ring list is hence
+ * protected for read access by srs_lock. Changing the soft ring list
+ * needs the mac perimeter and the srs_lock.
+ */
+ mutex_enter(&mac_srs->srs_lock);
+
+ head = mac_srs->srs_soft_ring_head;
+ mac_srs->srs_soft_ring_head = NULL;
+ mac_srs->srs_soft_ring_tail = NULL;
+ mac_srs->srs_soft_ring_count = 0;
+
+ mutex_exit(&mac_srs->srs_lock);
+
+ for (softring = head; softring != NULL; softring = next) {
+ next = softring->s_ring_next;
+ mac_soft_ring_free(softring, release_tx_ring);
+ }
+}
+
+static void
+mac_srs_add_glist(mac_soft_ring_set_t *mac_srs)
+{
+ ASSERT(mac_srs->srs_next == NULL && mac_srs->srs_prev == NULL);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));
+
+ rw_enter(&mac_srs_g_lock, RW_WRITER);
+ mutex_enter(&mac_srs->srs_lock);
+
+ ASSERT((mac_srs->srs_state & SRS_IN_GLIST) == 0);
+
+ if (mac_srs_g_list == NULL) {
+ mac_srs_g_list = mac_srs;
+ } else {
+ mac_srs->srs_next = mac_srs_g_list;
+ mac_srs_g_list->srs_prev = mac_srs;
+ mac_srs->srs_prev = NULL;
+ mac_srs_g_list = mac_srs;
+ }
+ mac_srs->srs_state |= SRS_IN_GLIST;
+
+ mutex_exit(&mac_srs->srs_lock);
+ rw_exit(&mac_srs_g_lock);
+}
+
+static void
+mac_srs_remove_glist(mac_soft_ring_set_t *mac_srs)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));
+
+ rw_enter(&mac_srs_g_lock, RW_WRITER);
+ mutex_enter(&mac_srs->srs_lock);
+
+ ASSERT((mac_srs->srs_state & SRS_IN_GLIST) != 0);
+
+ if (mac_srs == mac_srs_g_list) {
+ mac_srs_g_list = mac_srs->srs_next;
+ if (mac_srs_g_list != NULL)
+ mac_srs_g_list->srs_prev = NULL;
+ } else {
+ mac_srs->srs_prev->srs_next = mac_srs->srs_next;
+ if (mac_srs->srs_next != NULL)
+ mac_srs->srs_next->srs_prev = mac_srs->srs_prev;
+ }
+ mac_srs->srs_state &= ~SRS_IN_GLIST;
+
+ mutex_exit(&mac_srs->srs_lock);
+ rw_exit(&mac_srs_g_lock);
+}
+
+/* POLLING SETUP AND TEAR DOWN ROUTINES */
+
+/*
+ * mac_srs_client_poll_quiesce and mac_srs_client_poll_restart
+ *
+ * These routines are used to call back into the upper layer
+ * (primarily TCP squeue) to stop polling the soft rings or
+ * restart polling.
+ */
+void
+mac_srs_client_poll_quiesce(mac_client_impl_t *mcip,
+ mac_soft_ring_set_t *mac_srs)
+{
+ mac_soft_ring_t *softring;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+ if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
+ ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
+ return;
+ }
+
+ for (softring = mac_srs->srs_soft_ring_head;
+ softring != NULL; softring = softring->s_ring_next) {
+ if ((softring->s_ring_type & ST_RING_TCP) &&
+ (softring->s_ring_rx_arg2 != NULL)) {
+ mcip->mci_resource_quiesce(mcip->mci_resource_arg,
+ softring->s_ring_rx_arg2);
+ }
+ }
+}
+
+void
+mac_srs_client_poll_restart(mac_client_impl_t *mcip,
+ mac_soft_ring_set_t *mac_srs)
+{
+ mac_soft_ring_t *softring;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+ if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
+ ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
+ return;
+ }
+
+ for (softring = mac_srs->srs_soft_ring_head;
+ softring != NULL; softring = softring->s_ring_next) {
+ if ((softring->s_ring_type & ST_RING_TCP) &&
+ (softring->s_ring_rx_arg2 != NULL)) {
+ mcip->mci_resource_restart(mcip->mci_resource_arg,
+ softring->s_ring_rx_arg2);
+ }
+ }
+}
+
+/*
+ * Register the given SRS and associated soft rings with the consumer and
+ * enable the polling interface used by the consumer (i.e., IP) over this
+ * SRS and the associated soft rings.
+ */
+void
+mac_srs_client_poll_enable(mac_client_impl_t *mcip,
+ mac_soft_ring_set_t *mac_srs)
+{
+ mac_rx_fifo_t mrf;
+ mac_soft_ring_t *softring;
+
+ ASSERT(mac_srs->srs_mcip == mcip);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+ if (!(mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE))
+ return;
+
+ bzero(&mrf, sizeof (mac_rx_fifo_t));
+ mrf.mrf_type = MAC_RX_FIFO;
+
+	/*
+	 * An SRS is capable of acting as a soft ring in cases
+	 * where no fanout is needed. This is the case for userland
+	 * flows.
+	 */
+ if (mac_srs->srs_type & SRST_NO_SOFT_RINGS)
+ return;
+
+ mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
+ mrf.mrf_intr_enable = (mac_intr_enable_t)mac_soft_ring_intr_enable;
+ mrf.mrf_intr_disable = (mac_intr_disable_t)mac_soft_ring_intr_disable;
+ mac_srs->srs_type |= SRST_CLIENT_POLL_ENABLED;
+
+ softring = mac_srs->srs_soft_ring_head;
+ while (softring != NULL) {
+ if (softring->s_ring_type & (ST_RING_TCP | ST_RING_UDP)) {
+ /*
+ * TCP and UDP support DLS bypass. Squeue polling
+ * support implies DLS bypass since the squeue poll
+ * path does not have DLS processing.
+ */
+ mac_soft_ring_dls_bypass(softring,
+ mcip->mci_direct_rx_fn, mcip->mci_direct_rx_arg);
+ }
+ /*
+ * Non-TCP protocols don't support squeues. Hence we don't
+ * make any ring addition callbacks for non-TCP rings
+ */
+ if (!(softring->s_ring_type & ST_RING_TCP)) {
+ softring->s_ring_rx_arg2 = NULL;
+ softring = softring->s_ring_next;
+ continue;
+ }
+ mrf.mrf_rx_arg = softring;
+ mrf.mrf_intr_handle = (mac_intr_handle_t)softring;
+ mrf.mrf_cpu_id = softring->s_ring_cpuid;
+ mrf.mrf_flow_priority = mac_srs->srs_pri;
+
+ softring->s_ring_rx_arg2 = mcip->mci_resource_add(
+ mcip->mci_resource_arg, (mac_resource_t *)&mrf);
+
+ softring = softring->s_ring_next;
+ }
+}
+
+/*
+ * Unregister the given SRS and associated soft rings from the consumer and
+ * disable the polling interface used by the consumer (i.e., IP) over this
+ * SRS and the associated soft rings.
+ */
+void
+mac_srs_client_poll_disable(mac_client_impl_t *mcip,
+ mac_soft_ring_set_t *mac_srs)
+{
+ mac_soft_ring_t *softring;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+	/*
+	 * An SRS is capable of acting as a soft ring in cases
+	 * where no protocol fanout is needed. This is the case
+	 * for userland flows. Nothing to do here.
+	 */
+ if (mac_srs->srs_type & SRST_NO_SOFT_RINGS)
+ return;
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
+ ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+ mac_srs->srs_type &= ~(SRST_CLIENT_POLL_ENABLED | SRST_DLS_BYPASS);
+ mutex_exit(&mac_srs->srs_lock);
+
+ /*
+ * DLS bypass is now disabled in the case of both TCP and UDP.
+ * Reset the soft ring callbacks to the standard 'mac_rx_deliver'
+ * callback. In addition, in the case of TCP, invoke IP's callback
+ * for ring removal.
+ */
+ for (softring = mac_srs->srs_soft_ring_head;
+ softring != NULL; softring = softring->s_ring_next) {
+ if (!(softring->s_ring_type & (ST_RING_UDP | ST_RING_TCP)))
+ continue;
+
+ if ((softring->s_ring_type & ST_RING_TCP) &&
+ softring->s_ring_rx_arg2 != NULL) {
+ mcip->mci_resource_remove(mcip->mci_resource_arg,
+ softring->s_ring_rx_arg2);
+ }
+
+ mutex_enter(&softring->s_ring_lock);
+ while (softring->s_ring_state & S_RING_PROC) {
+ softring->s_ring_state |= S_RING_CLIENT_WAIT;
+ cv_wait(&softring->s_ring_client_cv,
+ &softring->s_ring_lock);
+ }
+ softring->s_ring_state &= ~S_RING_CLIENT_WAIT;
+ softring->s_ring_rx_arg2 = NULL;
+ softring->s_ring_rx_func = mac_rx_deliver;
+ softring->s_ring_rx_arg1 = mcip;
+ mutex_exit(&softring->s_ring_lock);
+ }
+}
+
+/*
+ * Enable or disable poll capability of the SRS on the underlying Rx ring.
+ *
+ * There is a need to enable or disable the poll capability of an SRS over an
+ * Rx ring depending on the number of mac clients sharing the ring and also
+ * whether user flows are configured on it. However, the poll state is
+ * actively manipulated by the SRS worker and poll threads, and uncoordinated
+ * changes to the underlying capability by yet another thread can surprise
+ * them, leading to assert failures. Instead, we quiesce the SRS, make the
+ * changes, and then
+ * restart the SRS.
+ */
+static void
+mac_srs_poll_state_change(mac_soft_ring_set_t *mac_srs,
+ boolean_t turn_off_poll_capab, mac_rx_func_t rx_func)
+{
+ boolean_t need_restart = B_FALSE;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+ mac_ring_t *ring;
+
+ if (!SRS_QUIESCED(mac_srs)) {
+ mac_rx_srs_quiesce(mac_srs, SRS_QUIESCE);
+ need_restart = B_TRUE;
+ }
+
+ ring = mac_srs->srs_ring;
+ if ((ring != NULL) &&
+ (ring->mr_classify_type == MAC_HW_CLASSIFIER)) {
+ if (turn_off_poll_capab)
+ mac_srs->srs_state &= ~SRS_POLLING_CAPAB;
+ else
+ mac_srs->srs_state |= SRS_POLLING_CAPAB;
+ }
+ srs_rx->sr_lower_proc = rx_func;
+
+ if (need_restart)
+ mac_rx_srs_restart(mac_srs);
+}
+
+/* CPU RECONFIGURATION AND FANOUT COMPUTATION ROUTINES */
+
+/*
+ * Return the next CPU to be used to bind a MAC kernel thread.
+ */
+static processorid_t
+mac_next_bind_cpu(void)
+{
+ static processorid_t srs_curr_cpu = -1;
+ cpu_t *cp;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ srs_curr_cpu++;
+ cp = cpu_get(srs_curr_cpu);
+ if (cp == NULL || !cpu_is_online(cp))
+ srs_curr_cpu = srs_bind_cpu;
+
+ return (srs_curr_cpu);
+}
+
+/* ARGSUSED */
+static int
+mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg)
+{
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ switch (what) {
+ case CPU_CONFIG:
+ case CPU_ON:
+ case CPU_CPUPART_IN:
+ mac_walk_srs_and_bind(id);
+ break;
+
+ case CPU_UNCONFIG:
+ case CPU_OFF:
+ case CPU_CPUPART_OUT:
+ mac_walk_srs_and_unbind(id);
+ break;
+
+ default:
+ break;
+ }
+ return (0);
+}
+
+/*
+ * mac_compute_soft_ring_count():
+ *
+ * This routine computes the number of soft rings needed to handle incoming
+ * load given a flow_entry.
+ *
+ * The routine does the following:
+ * 1) soft rings will be created if mac_soft_ring_enable is set.
+ * 2) If the underlying link is a 10Gbps link, then soft rings will be
+ * created even if mac_soft_ring_enable is not set. The number of soft
+ * rings, so created, will equal mac_rx_soft_ring_10gig_count.
+ * 3) On a sun4v platform (i.e., mac_soft_ring_enable is set), 2 times the
+ * mac_rx_soft_ring_10gig_count number of soft rings will be created for a
+ * 10Gbps link.
+ *
+ * If a bandwidth limit is specified, the number that gets computed is
+ * dependent upon CPU speed, the number of Rx rings configured, and
+ * the bandwidth limit.
+ * If more Rx rings are available, fewer soft rings are needed.
+ *
+ * mac_use_bw_heuristic is another "hidden" variable that can be used to
+ * override the default use of soft ring count computation. Depending upon
+ * the usefulness of it, mac_use_bw_heuristic can later be made into a
+ * data-link property or removed altogether.
+ *
+ * TODO: Cleanup and tighten some of the assumptions.
+ */
+boolean_t mac_use_bw_heuristic = B_TRUE;
+static int
+mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt)
+{
+ uint64_t cpu_speed, bw = 0;
+ int srings = 0;
+ boolean_t bw_enabled = B_FALSE;
+
+ ASSERT(!(flent->fe_type & FLOW_USER));
+ if (flent->fe_resource_props.mrp_mask & MRP_MAXBW &&
+ mac_use_bw_heuristic) {
+ /* bandwidth enabled */
+ bw_enabled = B_TRUE;
+ bw = flent->fe_resource_props.mrp_maxbw;
+ }
+ if (!bw_enabled) {
+ /* No bandwidth enabled */
+ if (mac_soft_ring_enable)
+ srings = mac_rx_soft_ring_count;
+
+ /* Is this a 10Gig link? */
+ flent->fe_nic_speed = mac_client_stat_get(flent->fe_mcip,
+ MAC_STAT_IFSPEED);
+ /* convert to Mbps */
+ if (((flent->fe_nic_speed)/1000000) > 1000 &&
+ mac_rx_soft_ring_10gig_count > 0) {
+ /* This is a 10Gig link */
+ srings = mac_rx_soft_ring_10gig_count;
+ /*
+ * Use 2 times mac_rx_soft_ring_10gig_count for
+ * sun4v systems.
+ */
+ if (mac_soft_ring_enable)
+ srings = srings * 2;
+ }
+ } else {
+ /*
+ * Soft ring computation using CPU speed and specified
+ * bandwidth limit.
+ */
+ /* Assumption: all CPUs have the same frequency */
+ cpu_speed = (uint64_t)CPU->cpu_type_info.pi_clock;
+
+ /* cpu_speed is in MHz; make bw in units of Mbps. */
+ bw = bw/1000000;
+
+ if (bw >= 1000) {
+ /*
+ * bw is greater than or equal to 1Gbps.
+ * The number of soft rings required is a function
+ * of bandwidth and CPU speed. To keep this simple,
+ * let's use this rule: 1GHz CPU can handle 1Gbps.
+ * If bw is less than 1 Gbps, then there is no need
+ * for soft rings. Assumption is that CPU speeds
+ * (on modern systems) are at least 1GHz.
+ */
+ srings = bw/cpu_speed;
+ if (srings <= 1 && mac_soft_ring_enable) {
+ /*
+ * Give at least 2 soft rings
+ * for sun4v systems
+ */
+ srings = 2;
+ }
+ }
+ }
+ /*
+ * If the flent has multiple Rx SRSs, then each SRS need not
+ * have that many soft rings on top of it. The number of
+ * soft rings for each Rx SRS is found by dividing srings by
+ * rx_srs_cnt.
+ */
+ if (rx_srs_cnt > 1) {
+ int remainder;
+
+ remainder = srings%rx_srs_cnt;
+ srings = srings/rx_srs_cnt;
+ if (remainder != 0)
+ srings++;
+ /*
+ * Fanning out to 1 soft ring is not very useful.
+ * Set it to 0 as well; mac_srs_fanout_init()
+ * will then create a single soft ring
+ * for proto fanout.
+ */
+ if (srings == 1)
+ srings = 0;
+ }
+ /* Do some more massaging */
+ srings = min(srings, ncpus);
+ srings = min(srings, MAX_SR_FANOUT);
+ return (srings);
+}
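+
+/*
+ * Worked example (hypothetical numbers, mac_soft_ring_enable not set):
+ * a bandwidth limit that converts to bw == 4000 Mbps on 2000 MHz CPUs
+ * gives srings = 4000/2000 = 2. With rx_srs_cnt == 2, each SRS then
+ * gets 2/2 = 1 soft ring, which is collapsed to 0 so that
+ * mac_srs_fanout_init() creates a single soft ring for proto fanout.
+ */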
+
+/*
+ * Assignment of user specified CPUs to a link.
+ *
+ * Minimum CPUs required to get an optimal assignment:
+ * For each Rx SRS, at least two CPUs are needed if the mac_latency_optimize
+ * flag is set -- one for polling, one for the fanout soft ring.
+ * If mac_latency_optimize is not set, then 3 CPUs are needed -- one
+ * for polling, one for the SRS worker thread and one for the fanout soft ring.
+ *
+ * The number of CPUs needed for the Tx side equals the number of Tx rings
+ * the link is using.
+ *
+ * mac_flow_user_cpu_init() categorizes the CPU assignment depending
+ * upon the number of CPUs in 3 different buckets.
+ *
+ * In the first bucket, the most optimal case is handled. The user has
+ * passed enough CPUs and every thread gets its own CPU.
+ *
+ * The second and third are the sub-optimal cases. Enough CPUs are not
+ * available.
+ *
+ * The second bucket handles the case where at least one distinct CPU
+ * is available for each of the Rx rings (Rx SRSes) and Tx rings (Tx
+ * SRS or soft rings).
+ *
+ * In the third case (worst case scenario), specified CPU count is less
+ * than the Rx rings configured for the link. In this case, we round
+ * robin the CPUs among the Rx SRSes and Tx SRS/soft rings.
+ */
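+/*
+ * For illustration (hypothetical counts): with 2 h/w Rx SRSes, 2 Tx
+ * rings and mac_latency_optimize unset, the first bucket needs
+ * 2 * 3 + 2 = 8 CPUs, the second needs 2 + 2 = 4 CPUs, and anything
+ * less falls into the round-robin bucket.
+ */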
+static void
+mac_flow_user_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp)
+{
+ mac_soft_ring_set_t *rx_srs, *tx_srs;
+ int i, srs_cnt;
+ mac_cpus_t *srs_cpu;
+ int no_of_cpus, cpu_cnt;
+ int rx_srs_cnt, reqd_rx_cpu_cnt;
+ int fanout_cpu_cnt, reqd_tx_cpu_cnt;
+ int reqd_poll_worker_cnt, fanout_cnt_per_srs;
+
+ ASSERT(mrp->mrp_fanout_mode == MCM_CPUS);
+ /*
+ * The check for mrp_ncpus to be within limits for
+ * the user specified case was done earlier and if
+ * not within limits, an error would have been
+ * returned to the user.
+ */
+ ASSERT(mrp->mrp_ncpus > 0 && mrp->mrp_ncpus <= MAX_SR_FANOUT);
+
+ no_of_cpus = mrp->mrp_ncpus;
+
+ if (mrp->mrp_intr_cpu != -1) {
+ /*
+ * The interrupt has been retargeted. The poll
+ * thread needs to be bound to the interrupt
+ * CPU. Presently only fixed interrupts
+ * are retargeted; MSI-X interrupts aren't.
+ *
+ * Find where in the list the intr
+ * CPU is and swap it with the first one.
+ * We will be using the first CPU in the
+ * list for poll.
+ */
+ for (i = 0; i < no_of_cpus; i++) {
+ if (mrp->mrp_cpu[i] == mrp->mrp_intr_cpu)
+ break;
+ }
+ mrp->mrp_cpu[i] = mrp->mrp_cpu[0];
+ mrp->mrp_cpu[0] = mrp->mrp_intr_cpu;
+ }
+
+ /*
+ * Requirements:
+ * The number of CPUs that each Rx ring needs is dependent
+ * upon mac_latency_optimize flag.
+ * 1) If set, at least 2 CPUs are needed -- one for
+ * polling, one for fanout soft ring.
+ * 2) If not set, then at least 3 CPUs are needed -- one
+ * for polling, one for srs worker thread, and one for
+ * fanout soft ring.
+ */
+ rx_srs_cnt = (flent->fe_rx_srs_cnt > 1) ?
+ (flent->fe_rx_srs_cnt - 1) : flent->fe_rx_srs_cnt;
+ reqd_rx_cpu_cnt = mac_latency_optimize ?
+ (rx_srs_cnt * 2) : (rx_srs_cnt * 3);
+
+ /* How many CPUs are needed for Tx side? */
+ tx_srs = flent->fe_tx_srs;
+ reqd_tx_cpu_cnt = TX_MULTI_RING_MODE(tx_srs) ?
+ tx_srs->srs_oth_ring_count : 1;
+
+ /* CPUs needed for Rx SRSes poll and worker threads */
+ reqd_poll_worker_cnt = mac_latency_optimize ?
+ rx_srs_cnt : rx_srs_cnt * 2;
+
+ /* Has the user provided enough CPUs? */
+ if (no_of_cpus >= (reqd_rx_cpu_cnt + reqd_tx_cpu_cnt)) {
+ /*
+ * Best case scenario. There are enough CPUs. All
+ * Rx rings will get their own set of CPUs, and the
+ * Tx soft rings will get their own.
+ */
+ /*
+ * fanout_cpu_cnt is the number of CPUs available
+ * for Rx side fanout soft rings.
+ */
+ fanout_cpu_cnt = no_of_cpus -
+ reqd_poll_worker_cnt - reqd_tx_cpu_cnt;
+
+ /*
+ * Divide fanout_cpu_cnt by rx_srs_cnt to find
+ * out how many fanout soft rings each Rx SRS
+ * can have.
+ */
+ fanout_cnt_per_srs = fanout_cpu_cnt/rx_srs_cnt;
+
+ /* Do the assignment for the default Rx ring */
+ cpu_cnt = 0;
+ rx_srs = flent->fe_rx_srs[0];
+ ASSERT(rx_srs->srs_ring == NULL);
+ if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
+ rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
+ srs_cpu = &rx_srs->srs_cpu;
+ srs_cpu->mc_ncpus = no_of_cpus;
+ bcopy(mrp->mrp_cpu,
+ srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
+ srs_cpu->mc_fanout_cnt = fanout_cnt_per_srs;
+ srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
+ srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
+ srs_cpu->mc_workerid = srs_cpu->mc_pollid;
+ if (!mac_latency_optimize)
+ srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt++];
+ for (i = 0; i < fanout_cnt_per_srs; i++)
+ srs_cpu->mc_fanout_cpus[i] = mrp->mrp_cpu[cpu_cnt++];
+
+ /* Do the assignment for h/w Rx SRSes */
+ if (flent->fe_rx_srs_cnt > 1) {
+ cpu_cnt = 0;
+ for (srs_cnt = 1;
+ srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
+ rx_srs = flent->fe_rx_srs[srs_cnt];
+ ASSERT(rx_srs->srs_ring != NULL);
+ if (rx_srs->srs_fanout_state ==
+ SRS_FANOUT_INIT) {
+ rx_srs->srs_fanout_state =
+ SRS_FANOUT_REINIT;
+ }
+ srs_cpu = &rx_srs->srs_cpu;
+ srs_cpu->mc_ncpus = no_of_cpus;
+ bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
+ sizeof (srs_cpu->mc_cpus));
+ srs_cpu->mc_fanout_cnt = fanout_cnt_per_srs;
+ /* The first CPU in the list is the intr CPU */
+ srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
+ srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
+ srs_cpu->mc_workerid = srs_cpu->mc_pollid;
+ if (!mac_latency_optimize) {
+ srs_cpu->mc_workerid =
+ mrp->mrp_cpu[cpu_cnt++];
+ }
+ for (i = 0; i < fanout_cnt_per_srs; i++) {
+ srs_cpu->mc_fanout_cpus[i] =
+ mrp->mrp_cpu[cpu_cnt++];
+ }
+ ASSERT(cpu_cnt <= no_of_cpus);
+ }
+ }
+ return;
+ }
+
+ /*
+ * Sub-optimal case.
+ * We have the following information:
+ * no_of_cpus - no. of CPUs the user passed.
+ * rx_srs_cnt - no. of Rx rings.
+ * reqd_rx_cpu_cnt = mac_latency_optimize?rx_srs_cnt*2:rx_srs_cnt*3
+ * reqd_tx_cpu_cnt - no. of CPUs reqd. for the Tx side.
+ * reqd_poll_worker_cnt = mac_latency_optimize?rx_srs_cnt:rx_srs_cnt*2
+ */
+ /*
+ * If we bind the Rx fanout soft rings to the same CPUs
+ * as poll/worker, would that be enough?
+ */
+ if (no_of_cpus >= (rx_srs_cnt + reqd_tx_cpu_cnt)) {
+ boolean_t worker_assign = B_FALSE;
+
+ /*
+ * If mac_latency_optimize is not set, are there
+ * enough CPUs to assign a CPU for worker also?
+ */
+ if (no_of_cpus >= (reqd_poll_worker_cnt + reqd_tx_cpu_cnt))
+ worker_assign = B_TRUE;
+ /*
+ * The zeroth Rx SRS is the default Rx ring. It is not
+ * associated with an h/w Rx ring.
+ */
+ rx_srs = flent->fe_rx_srs[0];
+ ASSERT(rx_srs->srs_ring == NULL);
+ if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
+ rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
+ cpu_cnt = 0;
+ srs_cpu = &rx_srs->srs_cpu;
+ srs_cpu->mc_ncpus = no_of_cpus;
+ bcopy(mrp->mrp_cpu,
+ srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
+ srs_cpu->mc_fanout_cnt = 1;
+ srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
+ srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
+ srs_cpu->mc_workerid = srs_cpu->mc_pollid;
+ if (!mac_latency_optimize && worker_assign)
+ srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt++];
+ srs_cpu->mc_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt];
+
+ /* Do CPU bindings for SRSes having h/w Rx rings */
+ if (flent->fe_rx_srs_cnt > 1) {
+ cpu_cnt = 0;
+ for (srs_cnt = 1;
+ srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
+ rx_srs = flent->fe_rx_srs[srs_cnt];
+ ASSERT(rx_srs->srs_ring != NULL);
+ if (rx_srs->srs_fanout_state ==
+ SRS_FANOUT_INIT) {
+ rx_srs->srs_fanout_state =
+ SRS_FANOUT_REINIT;
+ }
+ srs_cpu = &rx_srs->srs_cpu;
+ srs_cpu->mc_ncpus = no_of_cpus;
+ bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
+ sizeof (srs_cpu->mc_cpus));
+ srs_cpu->mc_pollid =
+ mrp->mrp_cpu[cpu_cnt];
+ srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
+ srs_cpu->mc_workerid = srs_cpu->mc_pollid;
+ if (!mac_latency_optimize && worker_assign) {
+ srs_cpu->mc_workerid =
+ mrp->mrp_cpu[++cpu_cnt];
+ }
+ srs_cpu->mc_fanout_cnt = 1;
+ srs_cpu->mc_fanout_cpus[0] =
+ mrp->mrp_cpu[cpu_cnt];
+ cpu_cnt++;
+ ASSERT(cpu_cnt <= no_of_cpus);
+ }
+ }
+ return;
+ }
+
+ /*
+ * Real sub-optimal case. Not enough CPUs for poll and
+ * Tx soft rings. Do a round-robin assignment where
+ * each Rx SRS will get the same CPU for poll, worker
+ * and fanout soft ring.
+ */
+ cpu_cnt = 0;
+ for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
+ rx_srs = flent->fe_rx_srs[srs_cnt];
+ srs_cpu = &rx_srs->srs_cpu;
+ if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
+ rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
+ srs_cpu->mc_ncpus = no_of_cpus;
+ bcopy(mrp->mrp_cpu,
+ srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
+ srs_cpu->mc_fanout_cnt = 1;
+ srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt];
+ srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
+ srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt];
+ srs_cpu->mc_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt];
+ if (++cpu_cnt >= no_of_cpus)
+ cpu_cnt = 0;
+ }
+}
+
+/*
+ * mac_flow_cpu_init():
+ *
+ * Each SRS has a mac_cpu_t structure, srs_cpu. This routine fills in
+ * the CPU binding information in srs_cpu for all Rx SRSes associated
+ * with a flent.
+ */
+static void
+mac_flow_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp)
+{
+ mac_soft_ring_set_t *rx_srs;
+ processorid_t cpuid;
+ int j, srs_cnt, soft_ring_cnt = 0;
+ mac_cpus_t *srs_cpu;
+
+ if (mrp->mrp_mask & MRP_CPUS_USERSPEC) {
+ mac_flow_user_cpu_init(flent, mrp);
+ } else {
+ /*
+ * Compute the number of soft rings needed on top of each Rx
+ * SRS. "rx_srs_cnt - 1" is the number of Rx SRSes
+ * associated with h/w Rx rings. The soft ring count computed
+ * for each h/w Rx SRS is also applied to the
+ * software-classified Rx SRS. The first Rx SRS in fe_rx_srs[]
+ * is the software-classified Rx SRS.
+ */
+ soft_ring_cnt = mac_compute_soft_ring_count(flent,
+ flent->fe_rx_srs_cnt - 1);
+ if (soft_ring_cnt == 0) {
+ /*
+ * Even when soft_ring_cnt is 0, we still need
+ * to create a soft ring for TCP, UDP and
+ * OTHER. So set it to 1.
+ */
+ soft_ring_cnt = 1;
+ }
+ for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
+ rx_srs = flent->fe_rx_srs[srs_cnt];
+ srs_cpu = &rx_srs->srs_cpu;
+ if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT) {
+ if (soft_ring_cnt == srs_cpu->mc_fanout_cnt)
+ continue;
+ rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
+ }
+ srs_cpu->mc_ncpus = soft_ring_cnt;
+ srs_cpu->mc_fanout_cnt = soft_ring_cnt;
+ mutex_enter(&cpu_lock);
+ for (j = 0; j < soft_ring_cnt; j++) {
+ cpuid = mac_next_bind_cpu();
+ srs_cpu->mc_cpus[j] = cpuid;
+ srs_cpu->mc_fanout_cpus[j] = cpuid;
+ }
+ cpuid = mac_next_bind_cpu();
+ srs_cpu->mc_pollid = cpuid;
+ /* increment ncpus to account for polling cpu */
+ srs_cpu->mc_ncpus++;
+ srs_cpu->mc_cpus[j++] = cpuid;
+ if (!mac_latency_optimize) {
+ cpuid = mac_next_bind_cpu();
+ srs_cpu->mc_ncpus++;
+ srs_cpu->mc_cpus[j++] = cpuid;
+ }
+ srs_cpu->mc_workerid = cpuid;
+ mutex_exit(&cpu_lock);
+ }
+ }
+}
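+
+/*
+ * For example (hypothetical): with soft_ring_cnt == 2 and
+ * mac_latency_optimize unset, each Rx SRS ends up with 4 CPU
+ * bindings -- two fanout CPUs, one poll CPU and one worker CPU --
+ * handed out round-robin by mac_next_bind_cpu().
+ */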
+
+/*
+ * DATAPATH SETUP ROUTINES
+ * (setup SRS and set/update FANOUT, B/W and PRIORITY)
+ */
+
+static void
+mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs)
+{
+ mac_srs->srs_tcp_soft_rings = (mac_soft_ring_t **)
+ kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP);
+ mac_srs->srs_udp_soft_rings = (mac_soft_ring_t **)
+ kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP);
+ mac_srs->srs_oth_soft_rings = (mac_soft_ring_t **)
+ kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP);
+}
+
+static void
+mac_srs_worker_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
+{
+ cpu_t *cp;
+ boolean_t clear = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ if (!mac_srs_thread_bind)
+ return;
+
+ cp = cpu_get(cpuid);
+ if (cp == NULL || !cpu_is_online(cp))
+ return;
+
+ mutex_enter(&mac_srs->srs_lock);
+ mac_srs->srs_state |= SRS_WORKER_BOUND;
+ if (mac_srs->srs_worker_cpuid != -1)
+ clear = B_TRUE;
+ mac_srs->srs_worker_cpuid = cpuid;
+ mutex_exit(&mac_srs->srs_lock);
+
+ if (clear)
+ thread_affinity_clear(mac_srs->srs_worker);
+
+ thread_affinity_set(mac_srs->srs_worker, cpuid);
+ DTRACE_PROBE1(worker__CPU, processorid_t, cpuid);
+}
+
+static void
+mac_srs_poll_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
+{
+ cpu_t *cp;
+ boolean_t clear = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ if (!mac_srs_thread_bind || mac_srs->srs_poll_thr == NULL)
+ return;
+
+ cp = cpu_get(cpuid);
+ if (cp == NULL || !cpu_is_online(cp))
+ return;
+
+ mutex_enter(&mac_srs->srs_lock);
+ mac_srs->srs_state |= SRS_POLL_BOUND;
+ if (mac_srs->srs_poll_cpuid != -1)
+ clear = B_TRUE;
+ mac_srs->srs_poll_cpuid = cpuid;
+ mutex_exit(&mac_srs->srs_lock);
+
+ if (clear)
+ thread_affinity_clear(mac_srs->srs_poll_thr);
+
+ thread_affinity_set(mac_srs->srs_poll_thr, cpuid);
+ DTRACE_PROBE1(poll__CPU, processorid_t, cpuid);
+}
+
+/*
+ * When a CPU comes back online, bind the MAC kernel threads which
+ * were previously bound to that CPU, and had to be unbound because
+ * the CPU was going away.
+ *
+ * These functions are called with cpu_lock held and hence we can't
+ * cv_wait to grab the mac perimeter. Since these functions walk the soft
+ * ring list of an SRS without being in the perimeter, the list itself
+ * is protected by the SRS lock.
+ */
+static void
+mac_walk_srs_and_bind(int cpuid)
+{
+ mac_soft_ring_set_t *mac_srs;
+ mac_soft_ring_t *soft_ring;
+
+ rw_enter(&mac_srs_g_lock, RW_READER);
+
+ if ((mac_srs = mac_srs_g_list) == NULL)
+ goto done;
+
+ for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) {
+ if (mac_srs->srs_worker_cpuid == -1 &&
+ mac_srs->srs_worker_cpuid_save == cpuid) {
+ mac_srs->srs_worker_cpuid_save = -1;
+ mac_srs_worker_bind(mac_srs, cpuid);
+ }
+
+ if (!(mac_srs->srs_type & SRST_TX)) {
+ if (mac_srs->srs_poll_cpuid == -1 &&
+ mac_srs->srs_poll_cpuid_save == cpuid) {
+ mac_srs->srs_poll_cpuid_save = -1;
+ mac_srs_poll_bind(mac_srs, cpuid);
+ }
+ }
+
+ /* Next tackle the soft rings associated with the srs */
+ mutex_enter(&mac_srs->srs_lock);
+ for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL;
+ soft_ring = soft_ring->s_ring_next) {
+ if (soft_ring->s_ring_cpuid == -1 &&
+ soft_ring->s_ring_cpuid_save == cpuid) {
+ soft_ring->s_ring_cpuid_save = -1;
+ (void) mac_soft_ring_bind(soft_ring, cpuid);
+ }
+ }
+ mutex_exit(&mac_srs->srs_lock);
+ }
+done:
+ rw_exit(&mac_srs_g_lock);
+}
+
+/*
+ * Change the priority of the SRS's poll and worker thread. Additionally,
+ * update the priority of the worker threads for the SRS's soft rings.
+ * Need to modify any associated squeue threads.
+ */
+void
+mac_update_srs_priority(mac_soft_ring_set_t *mac_srs, pri_t prival)
+{
+ mac_soft_ring_t *ringp;
+
+ mac_srs->srs_pri = prival;
+ thread_lock(mac_srs->srs_worker);
+ (void) thread_change_pri(mac_srs->srs_worker, mac_srs->srs_pri, 0);
+ thread_unlock(mac_srs->srs_worker);
+ if (mac_srs->srs_poll_thr != NULL) {
+ thread_lock(mac_srs->srs_poll_thr);
+ (void) thread_change_pri(mac_srs->srs_poll_thr,
+ mac_srs->srs_pri, 0);
+ thread_unlock(mac_srs->srs_poll_thr);
+ }
+ if ((ringp = mac_srs->srs_soft_ring_head) == NULL)
+ return;
+ while (ringp != mac_srs->srs_soft_ring_tail) {
+ thread_lock(ringp->s_ring_worker);
+ (void) thread_change_pri(ringp->s_ring_worker,
+ mac_srs->srs_pri, 0);
+ thread_unlock(ringp->s_ring_worker);
+ ringp = ringp->s_ring_next;
+ }
+ ASSERT(ringp == mac_srs->srs_soft_ring_tail);
+ thread_lock(ringp->s_ring_worker);
+ (void) thread_change_pri(ringp->s_ring_worker, mac_srs->srs_pri, 0);
+ thread_unlock(ringp->s_ring_worker);
+}
+
+/*
+ * Change the receive bandwidth limit.
+ */
+static void
+mac_rx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp)
+{
+ mac_soft_ring_t *softring;
+
+ mutex_enter(&srs->srs_lock);
+ mutex_enter(&srs->srs_bw->mac_bw_lock);
+
+ if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
+ /* Reset bandwidth limit */
+ if (srs->srs_type & SRST_BW_CONTROL) {
+ softring = srs->srs_soft_ring_head;
+ while (softring != NULL) {
+ softring->s_ring_type &= ~ST_RING_BW_CTL;
+ softring = softring->s_ring_next;
+ }
+ srs->srs_type &= ~SRST_BW_CONTROL;
+ srs->srs_drain_func = mac_rx_srs_drain;
+ }
+ } else {
+ /* Set/Modify bandwidth limit */
+ srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
+ /*
+ * Give twice the queuing capability before
+ * dropping packets. The unit is bytes/tick.
+ */
+ srs->srs_bw->mac_bw_drop_threshold =
+ srs->srs_bw->mac_bw_limit << 1;
+ if (!(srs->srs_type & SRST_BW_CONTROL)) {
+ softring = srs->srs_soft_ring_head;
+ while (softring != NULL) {
+ softring->s_ring_type |= ST_RING_BW_CTL;
+ softring = softring->s_ring_next;
+ }
+ srs->srs_type |= SRST_BW_CONTROL;
+ srs->srs_drain_func = mac_rx_srs_drain_bw;
+ }
+ }
+ mutex_exit(&srs->srs_bw->mac_bw_lock);
+ mutex_exit(&srs->srs_lock);
+}
+
+/* Change the transmit bandwidth limit */
+static void
+mac_tx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp)
+{
+ mac_srs_tx_t *srs_tx = &srs->srs_tx;
+ uint32_t tx_mode;
+ mac_impl_t *mip = srs->srs_mcip->mci_mip;
+
+ mutex_enter(&srs->srs_lock);
+ mutex_enter(&srs->srs_bw->mac_bw_lock);
+
+ tx_mode = srs_tx->st_mode;
+
+ if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
+ /* Reset bandwidth limit */
+ if (tx_mode == SRS_TX_BW) {
+ if (mac_tx_serialize ||
+ (mip->mi_v12n_level & MAC_VIRT_SERIALIZE)) {
+ srs_tx->st_mode = SRS_TX_SERIALIZE;
+ } else {
+ srs_tx->st_mode = SRS_TX_DEFAULT;
+ }
+ } else if (tx_mode == SRS_TX_BW_FANOUT) {
+ srs_tx->st_mode = SRS_TX_FANOUT;
+ }
+ srs->srs_type &= ~SRST_BW_CONTROL;
+ } else {
+ /* Set/Modify bandwidth limit */
+ srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
+ /*
+ * Give twice the queuing capability before
+ * dropping packets. The unit is bytes/tick.
+ */
+ srs->srs_bw->mac_bw_drop_threshold =
+ srs->srs_bw->mac_bw_limit << 1;
+ srs->srs_type |= SRST_BW_CONTROL;
+ if (tx_mode != SRS_TX_BW &&
+ tx_mode != SRS_TX_BW_FANOUT) {
+ if (tx_mode == SRS_TX_SERIALIZE ||
+ tx_mode == SRS_TX_DEFAULT) {
+ srs_tx->st_mode = SRS_TX_BW;
+ } else if (tx_mode == SRS_TX_FANOUT) {
+ srs_tx->st_mode = SRS_TX_BW_FANOUT;
+ } else {
+ ASSERT(0);
+ }
+ }
+ }
+ srs_tx->st_func = mac_tx_get_func(srs_tx->st_mode);
+ mutex_exit(&srs->srs_bw->mac_bw_lock);
+ mutex_exit(&srs->srs_lock);
+}
+
+/*
+ * The uber function that deals with any update to bandwidth limits.
+ */
+void
+mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp)
+{
+ int count;
+
+ for (count = 0; count < flent->fe_rx_srs_cnt; count++)
+ mac_rx_srs_update_bwlimit(flent->fe_rx_srs[count], mrp);
+ mac_tx_srs_update_bwlimit(flent->fe_tx_srs, mrp);
+}
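+
+/*
+ * To illustrate the arithmetic (assuming FLOW_BYTES_PER_TICK()
+ * converts the limit to a byte count per clock tick and hz == 100):
+ * a 1 Gbps limit works out to 125 MB/s, i.e. a mac_bw_limit of about
+ * 1.25 MB per tick, with mac_bw_drop_threshold at twice that,
+ * 2.5 MB per tick.
+ */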
+
+void
+mac_srs_change_upcall(void *arg, mac_direct_rx_t rx_func, void *rx_arg1)
+{
+ mac_soft_ring_set_t *mac_srs = arg;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+ mac_soft_ring_t *softring;
+
+ mutex_enter(&mac_srs->srs_lock);
+ ASSERT((mac_srs->srs_type & SRST_TX) == 0);
+ srs_rx->sr_func = rx_func;
+ srs_rx->sr_arg1 = rx_arg1;
+
+ softring = mac_srs->srs_soft_ring_head;
+ while (softring != NULL) {
+ mutex_enter(&softring->s_ring_lock);
+ softring->s_ring_rx_func = rx_func;
+ softring->s_ring_rx_arg1 = rx_arg1;
+ mutex_exit(&softring->s_ring_lock);
+ softring = softring->s_ring_next;
+ }
+
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * When the first sub-flow is added to a link, we disable polling on the
+ * link and also modify the entry point to mac_rx_srs_subflow_process.
+ * (Polling is disabled because, with the subflow added, accounting
+ * for polling needs additional logic. It is assumed that when a subflow is
+ * added, we can take some hit as a result of disabling polling rather than
+ * adding more complexity - if this becomes a perf. issue we need to
+ * re-evaluate this logic.) When the last subflow is removed, we turn
+ * polling back on and also reset the entry point to mac_rx_srs_process.
+ *
+ * In the future, if there are multiple SRSes, we can simply
+ * take one and give it to the flow rather than disabling polling and
+ * resetting the entry point.
+ */
+void
+mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable)
+{
+ flow_entry_t *flent = mcip->mci_flent;
+ int i;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_rx_func_t rx_func;
+ uint_t rx_srs_cnt;
+ boolean_t enable_classifier;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ enable_classifier = !FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && enable;
+
+ rx_func = enable_classifier ? mac_rx_srs_subflow_process :
+ mac_rx_srs_process;
+
+ /*
+ * If receive function has already been configured correctly for
+ * current subflow configuration, do nothing.
+ */
+ if (flent->fe_cb_fn == (flow_fn_t)rx_func)
+ return;
+
+ rx_srs_cnt = flent->fe_rx_srs_cnt;
+ for (i = 0; i < rx_srs_cnt; i++) {
+ ASSERT(flent->fe_rx_srs[i] != NULL);
+ mac_srs_poll_state_change(flent->fe_rx_srs[i],
+ enable_classifier, rx_func);
+ }
+
+ /*
+ * Change the S/W classifier so that we can land in the
+ * correct processing function with the correct argument.
+ * If all subflows have been removed, we can revert to
+ * mac_rx_srs_process, else we need mac_rx_srs_subflow_process.
+ */
+ mutex_enter(&flent->fe_lock);
+ flent->fe_cb_fn = (flow_fn_t)rx_func;
+ flent->fe_cb_arg1 = (void *)mip;
+ flent->fe_cb_arg2 = flent->fe_rx_srs[0];
+ mutex_exit(&flent->fe_lock);
+}
+
+static void
+mac_srs_update_fanout_list(mac_soft_ring_set_t *mac_srs)
+{
+ int tcp_count = 0;
+ int udp_count = 0;
+ int oth_count = 0;
+ mac_soft_ring_t *softring;
+
+ softring = mac_srs->srs_soft_ring_head;
+ if (softring == NULL) {
+ ASSERT(mac_srs->srs_soft_ring_count == 0);
+ mac_srs->srs_tcp_ring_count = 0;
+ mac_srs->srs_udp_ring_count = 0;
+ mac_srs->srs_oth_ring_count = 0;
+ return;
+ }
+
+ softring = mac_srs->srs_soft_ring_head;
+ tcp_count = udp_count = oth_count = 0;
+
+ while (softring != NULL) {
+ if (softring->s_ring_type & ST_RING_TCP)
+ mac_srs->srs_tcp_soft_rings[tcp_count++] = softring;
+ else if (softring->s_ring_type & ST_RING_UDP)
+ mac_srs->srs_udp_soft_rings[udp_count++] = softring;
+ else
+ mac_srs->srs_oth_soft_rings[oth_count++] = softring;
+ softring = softring->s_ring_next;
+ }
+
+ ASSERT(mac_srs->srs_soft_ring_count ==
+ (tcp_count + udp_count + oth_count));
+
+ mac_srs->srs_tcp_ring_count = tcp_count;
+ mac_srs->srs_udp_ring_count = udp_count;
+ mac_srs->srs_oth_ring_count = oth_count;
+}
+
+void
+mac_srs_create_proto_softrings(int id, void *flent, uint16_t type,
+ pri_t pri, mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs,
+ processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2, boolean_t set_bypass)
+{
+ mac_soft_ring_t *softring;
+ mac_rx_fifo_t mrf;
+
+ bzero(&mrf, sizeof (mac_rx_fifo_t));
+ mrf.mrf_type = MAC_RX_FIFO;
+ mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
+ mrf.mrf_intr_enable =
+ (mac_intr_enable_t)mac_soft_ring_intr_enable;
+ mrf.mrf_intr_disable =
+ (mac_intr_disable_t)mac_soft_ring_intr_disable;
+ mrf.mrf_flow_priority = pri;
+
+ softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
+ (void *)flent, (type|ST_RING_TCP), pri, mcip, mac_srs,
+ cpuid, rx_func, x_arg1, x_arg2);
+ softring->s_ring_rx_arg2 = NULL;
+
+ /*
+ * TCP and UDP support DLS bypass. In addition, the TCP
+ * squeue can also poll its corresponding soft ring.
+ */
+ if (set_bypass && (mcip->mci_resource_arg != NULL)) {
+ mac_soft_ring_dls_bypass(softring,
+ mcip->mci_direct_rx_fn,
+ mcip->mci_direct_rx_arg);
+
+ mrf.mrf_rx_arg = softring;
+ mrf.mrf_intr_handle = (mac_intr_handle_t)softring;
+
+ /*
+ * Make a call in IP to get a TCP squeue assigned to
+ * this softring to maintain full CPU locality through
+ * the stack and allow the squeue to be able to poll
+ * the softring so the flow control can be pushed
+ * all the way to H/W.
+ */
+ softring->s_ring_rx_arg2 =
+ mcip->mci_resource_add((void *)mcip->mci_resource_arg,
+ (mac_resource_t *)&mrf);
+ }
+
+ /*
+ * Non-TCP protocols don't support squeues. Hence we
+ * don't make any ring addition callbacks for non-TCP
+ * rings. Now create the UDP softring and allow it to
+ * bypass the DLS layer.
+ */
+ softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
+ (void *)flent, (type|ST_RING_UDP), pri, mcip, mac_srs,
+ cpuid, rx_func, x_arg1, x_arg2);
+ softring->s_ring_rx_arg2 = NULL;
+
+ if (set_bypass && (mcip->mci_resource_arg != NULL)) {
+ mac_soft_ring_dls_bypass(softring,
+ mcip->mci_direct_rx_fn,
+ mcip->mci_direct_rx_arg);
+ }
+
+ /* Create the OTH softring, which has to go through the DLS */
+ softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
+ (void *)flent, (type|ST_RING_OTH), pri, mcip, mac_srs,
+ cpuid, rx_func, x_arg1, x_arg2);
+ softring->s_ring_rx_arg2 = NULL;
+}
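+
+/*
+ * Note that each call to mac_srs_create_proto_softrings() creates one
+ * TCP, one UDP and one OTH softring, so a fanout count of N yields
+ * 3 * N softrings per SRS (e.g., a hypothetical fanout of 4 yields 12).
+ */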
+
+/*
+ * This routine associates a CPU or a set of CPUs to process incoming
+ * traffic from a mac client. If multiple CPUs are specified, then
+ * as many soft rings are created, with each soft ring worker thread
+ * bound to a CPU in the set. Each soft ring in turn will be
+ * associated with a squeue and the squeue will be moved to the
+ * same CPU as the soft ring's.
+ */
+static void
+mac_srs_fanout_modify(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2, mac_soft_ring_set_t *mac_rx_srs,
+ mac_soft_ring_set_t *mac_tx_srs)
+{
+ mac_soft_ring_t *softring;
+ uint32_t soft_ring_flag = soft_ring_process_flag;
+ processorid_t cpuid = -1;
+ boolean_t user_specified;
+ int i, srings_present, new_fanout_cnt;
+ mac_cpus_t *srs_cpu;
+
+ user_specified = mrp->mrp_mask & MRP_CPUS_USERSPEC;
+ /* fanout state is REINIT. Set it back to INIT */
+ ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_REINIT);
+ mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;
+
+ /* how many are present right now */
+ srings_present = mac_rx_srs->srs_tcp_ring_count;
+ /* new request */
+ srs_cpu = &mac_rx_srs->srs_cpu;
+ new_fanout_cnt = srs_cpu->mc_fanout_cnt;
+
+ mutex_enter(&mac_rx_srs->srs_lock);
+ if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
+ soft_ring_flag |= ST_RING_BW_CTL;
+ mutex_exit(&mac_rx_srs->srs_lock);
+
+ if (new_fanout_cnt > srings_present) {
+ /* soft rings increased */
+ mutex_enter(&mac_rx_srs->srs_lock);
+ mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
+ mutex_exit(&mac_rx_srs->srs_lock);
+
+ for (i = mac_rx_srs->srs_tcp_ring_count;
+ i < new_fanout_cnt; i++) {
+ /*
+ * Create the protocol softrings and set the
+ * DLS bypass where possible.
+ */
+ mac_srs_create_proto_softrings(i,
+ (void *)flent, soft_ring_flag,
+ mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
+ rx_func, x_arg1, x_arg2, B_TRUE);
+ }
+ mac_srs_update_fanout_list(mac_rx_srs);
+ } else if (new_fanout_cnt < srings_present) {
+ /* soft rings decreased */
+ if (new_fanout_cnt == 1) {
+ mutex_enter(&mac_rx_srs->srs_lock);
+ mac_rx_srs->srs_type &= ~SRST_FANOUT_SRC_IP;
+ ASSERT(mac_rx_srs->srs_type & SRST_FANOUT_PROTO);
+ mutex_exit(&mac_rx_srs->srs_lock);
+ }
+ /* Get rid of extra soft rings */
+ for (i = new_fanout_cnt;
+ i < mac_rx_srs->srs_tcp_ring_count; i++) {
+ softring = mac_rx_srs->srs_tcp_soft_rings[i];
+ if (softring->s_ring_rx_arg2 != NULL) {
+ mcip->mci_resource_remove(
+ (void *)mcip->mci_resource_arg,
+ softring->s_ring_rx_arg2);
+ }
+ mac_soft_ring_remove(mac_rx_srs,
+ mac_rx_srs->srs_tcp_soft_rings[i]);
+ mac_soft_ring_remove(mac_rx_srs,
+ mac_rx_srs->srs_udp_soft_rings[i]);
+ mac_soft_ring_remove(mac_rx_srs,
+ mac_rx_srs->srs_oth_soft_rings[i]);
+ }
+ mac_srs_update_fanout_list(mac_rx_srs);
+ }
+
+ ASSERT(new_fanout_cnt == mac_rx_srs->srs_tcp_ring_count);
+ mutex_enter(&cpu_lock);
+ for (i = 0; i < mac_rx_srs->srs_tcp_ring_count; i++) {
+ cpuid = srs_cpu->mc_fanout_cpus[i];
+ (void) mac_soft_ring_bind(mac_rx_srs->srs_udp_soft_rings[i],
+ cpuid);
+ (void) mac_soft_ring_bind(mac_rx_srs->srs_oth_soft_rings[i],
+ cpuid);
+ (void) mac_soft_ring_bind(mac_rx_srs->srs_tcp_soft_rings[i],
+ cpuid);
+ softring = mac_rx_srs->srs_tcp_soft_rings[i];
+ if (softring->s_ring_rx_arg2 != NULL) {
+ mcip->mci_resource_bind((void *)mcip->mci_resource_arg,
+ softring->s_ring_rx_arg2, cpuid);
+ }
+ }
+
+ mac_srs_worker_bind(mac_rx_srs, srs_cpu->mc_workerid);
+ mac_srs_poll_bind(mac_rx_srs, srs_cpu->mc_pollid);
+
+ /*
+ * Bind the Tx SRS and soft ring threads too. Let's bind the Tx
+ * SRS to the last CPU in the mrp list.
+ */
+ if (mac_tx_srs != NULL && user_specified) {
+ BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
+ }
+ mutex_exit(&cpu_lock);
+}
+
+/*
+ * Bind SRS threads and soft rings to CPUs/create fanout list.
+ */
+void
+mac_srs_fanout_init(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2, mac_soft_ring_set_t *mac_rx_srs,
+ mac_soft_ring_set_t *mac_tx_srs)
+{
+ int i;
+ processorid_t cpuid, worker_cpuid, poll_cpuid;
+ uint32_t soft_ring_flag = soft_ring_process_flag;
+ int soft_ring_cnt;
+ boolean_t user_specified = B_FALSE;
+ mac_cpus_t *srs_cpu = &mac_rx_srs->srs_cpu;
+
+ /*
+ * Remove the no soft ring flag and we will adjust it
+ * appropriately further down.
+ */
+ mutex_enter(&mac_rx_srs->srs_lock);
+ mac_rx_srs->srs_type &= ~SRST_NO_SOFT_RINGS;
+ mutex_exit(&mac_rx_srs->srs_lock);
+
+ ASSERT(mac_rx_srs->srs_soft_ring_head == NULL);
+
+ if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
+ soft_ring_flag |= ST_RING_BW_CTL;
+
+ ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_UNINIT);
+ mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;
+ user_specified = mrp->mrp_mask & MRP_CPUS_USERSPEC;
+ /*
+ * Ring count can be 0 if no fanout is required and no CPUs
+ * were specified. Leave the SRS worker and poll thread
+ * unbound.
+ */
+ ASSERT(mrp != NULL);
+ soft_ring_cnt = srs_cpu->mc_fanout_cnt;
+
+ /* Step 1: srs_cpu holds the list of CPUs the threads must bind to */
+ if (soft_ring_cnt > 0) {
+ mutex_enter(&cpu_lock);
+ for (i = 0; i < soft_ring_cnt; i++) {
+ cpuid = srs_cpu->mc_fanout_cpus[i];
+ /* Create the protocol softrings */
+ mac_srs_create_proto_softrings(i, (void *)flent,
+ soft_ring_flag, mac_rx_srs->srs_pri,
+ mcip, mac_rx_srs, cpuid, rx_func,
+ x_arg1, x_arg2, B_FALSE);
+ }
+ worker_cpuid = srs_cpu->mc_workerid;
+ poll_cpuid = srs_cpu->mc_pollid;
+ mac_srs_worker_bind(mac_rx_srs, worker_cpuid);
+ mac_srs_poll_bind(mac_rx_srs, poll_cpuid);
+
+ /*
+ * Bind the Tx SRS and soft ring threads too.
+ * Let's bind the Tx SRS to the last CPU in
+ * the mrp list.
+ */
+ if (mac_tx_srs == NULL) {
+ mutex_exit(&cpu_lock);
+ goto alldone;
+ }
+
+ if (user_specified) {
+ BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
+ }
+ mutex_exit(&cpu_lock);
+ } else {
+ mutex_enter(&cpu_lock);
+ /*
+ * For a subflow, mrp_workerid and mrp_pollid
+ * are not set.
+ */
+ mac_srs_worker_bind(mac_rx_srs, mrp->mrp_workerid);
+ mac_srs_poll_bind(mac_rx_srs, mrp->mrp_pollid);
+ mutex_exit(&cpu_lock);
+ goto no_softrings;
+ }
+
+alldone:
+ if (soft_ring_cnt > 1)
+ mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
+ mac_srs_update_fanout_list(mac_rx_srs);
+ mac_srs_client_poll_enable(mcip, mac_rx_srs);
+ return;
+
+no_softrings:
+ if (mac_rx_srs->srs_type & SRST_FANOUT_PROTO) {
+ mutex_enter(&cpu_lock);
+ cpuid = mac_next_bind_cpu();
+ /* Create the protocol softrings */
+ mac_srs_create_proto_softrings(0, (void *)flent,
+ soft_ring_flag, mac_rx_srs->srs_pri,
+ mcip, mac_rx_srs, cpuid, rx_func,
+ x_arg1, x_arg2, B_FALSE);
+ mutex_exit(&cpu_lock);
+ } else {
+ /*
+ * This is the case when there is no fanout which is
+ * true for subflows.
+ */
+ mac_rx_srs->srs_type |= SRST_NO_SOFT_RINGS;
+ }
+ mac_srs_update_fanout_list(mac_rx_srs);
+ mac_srs_client_poll_enable(mcip, mac_rx_srs);
+}
+
+/*
+ * mac_fanout_setup:
+ *
+ * Calls mac_srs_fanout_init() or modify() depending upon whether
+ * the SRS is getting initialized or re-initialized.
+ */
+void
+mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2)
+{
+ mac_soft_ring_set_t *mac_rx_srs, *mac_tx_srs;
+ int i, rx_srs_cnt;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ /*
+ * This is an aggregation port. Fanout will be setup
+ * over the aggregation itself.
+ */
+ if (mcip->mci_state_flags & MCIS_IS_AGGR_PORT)
+ return;
+
+ mac_rx_srs = flent->fe_rx_srs[0];
+ /*
+ * Set up the fanout on the tx side only once, with the
+ * first rx SRS. The CPU binding, fanout, and bandwidth
+ * criteria are common to both RX and TX, so
+ * initializing them along side avoids redundant code.
+ */
+ mac_tx_srs = flent->fe_tx_srs;
+ rx_srs_cnt = flent->fe_rx_srs_cnt;
+
+ /* No fanout for subflows */
+ if (flent->fe_type & FLOW_USER) {
+ mac_srs_fanout_init(mcip, flent, mrp, rx_func,
+ x_arg1, x_arg2, mac_rx_srs, mac_tx_srs);
+ return;
+ }
+
+ mac_flow_cpu_init(flent, mrp);
+
+ /*
+ * Set up fanout for both SW (0th SRS) and HW classified
+ * SRS (the rest of Rx SRSs in flent).
+ */
+ for (i = 0; i < rx_srs_cnt; i++) {
+ mac_rx_srs = flent->fe_rx_srs[i];
+ if (i != 0)
+ mac_tx_srs = NULL;
+ switch (mac_rx_srs->srs_fanout_state) {
+ case SRS_FANOUT_UNINIT:
+ mac_srs_fanout_init(mcip, flent, mrp, rx_func,
+ x_arg1, x_arg2, mac_rx_srs, mac_tx_srs);
+ break;
+ case SRS_FANOUT_INIT:
+ break;
+ case SRS_FANOUT_REINIT:
+ mac_rx_srs_quiesce(mac_rx_srs, SRS_QUIESCE);
+ mac_srs_fanout_modify(mcip, flent, mrp, rx_func,
+ x_arg1, x_arg2, mac_rx_srs, mac_tx_srs);
+ mac_rx_srs_restart(mac_rx_srs);
+ break;
+ default:
+ VERIFY(mac_rx_srs->srs_fanout_state <=
+ SRS_FANOUT_REINIT);
+ break;
+ }
+ }
+}
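+
+/*
+ * For example, on the first call for a flent each Rx SRS is in
+ * SRS_FANOUT_UNINIT state and gets mac_srs_fanout_init(); a later
+ * CPU/fanout property change marks it SRS_FANOUT_REINIT, so the SRS
+ * is quiesced, reworked via mac_srs_fanout_modify() and restarted.
+ */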
+
+/*
+ * mac_create_soft_ring_set:
+ *
+ * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is
+ * SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side
+ * processing is created.
+ *
+ * Details on Rx SRS:
+ * Create a SRS and also add the necessary soft rings for TCP and
+ * non-TCP based on fanout type and count specified.
+ *
+ * mac_soft_ring_fanout, mac_srs_fanout_modify (?),
+ * mac_soft_ring_stop_workers, mac_soft_ring_set_destroy, etc. need
+ * to be heavily modified.
+ *
+ * mi_soft_ring_list_size, mi_soft_ring_size, etc. need to disappear.
+ */
+mac_soft_ring_set_t *
+mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type,
+ mac_direct_rx_t rx_func, void *x_arg1, mac_resource_handle_t x_arg2,
+ mac_ring_t *ring)
+{
+ mac_soft_ring_set_t *mac_srs;
+ mac_srs_rx_t *srs_rx;
+ mac_srs_tx_t *srs_tx;
+ mac_bw_ctl_t *mac_bw;
+ mac_resource_props_t *mrp;
+ boolean_t is_tx_srs = ((srs_type & SRST_TX) != 0);
+
+ mac_srs = kmem_cache_alloc(mac_srs_cache, KM_SLEEP);
+ bzero(mac_srs, sizeof (mac_soft_ring_set_t));
+ srs_rx = &mac_srs->srs_rx;
+ srs_tx = &mac_srs->srs_tx;
+
+ mutex_enter(&flent->fe_lock);
+
+ /*
+ * Get the bandwidth control structure from the flent. Get
+ * rid of any residual values in the control structure for
+ * the tx bw struct and also for the rx, if the rx srs is
+ * the 1st one being brought up (the rx bw ctl struct may
+ * be shared by multiple SRSs)
+ */
+ if (is_tx_srs) {
+ mac_srs->srs_bw = &flent->fe_tx_bw;
+ bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t));
+ flent->fe_tx_srs = mac_srs;
+ } else {
+ /*
+ * The bw counter (stored in the flent) is shared
+ * by SRS's within an rx group.
+ */
+ mac_srs->srs_bw = &flent->fe_rx_bw;
+ /* First rx SRS, clear the bw structure */
+ if (flent->fe_rx_srs_cnt == 0)
+ bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t));
+ ASSERT(flent->fe_rx_srs_cnt < MAX_RINGS_PER_GROUP);
+ flent->fe_rx_srs[flent->fe_rx_srs_cnt] = mac_srs;
+ flent->fe_rx_srs_cnt++;
+ }
+ mac_srs->srs_flent = flent;
+ mutex_exit(&flent->fe_lock);
+
+ mac_srs->srs_state = 0;
+ mac_srs->srs_type = (srs_type | SRST_NO_SOFT_RINGS);
+ mac_srs->srs_worker_cpuid = mac_srs->srs_worker_cpuid_save = -1;
+ mac_srs->srs_poll_cpuid = mac_srs->srs_poll_cpuid_save = -1;
+ mac_srs_fanout_list_alloc(mac_srs);
+
+ /*
+ * For a flow we use the underlying MAC client's priority range with
+ * the priority value to find an absolute priority value. For a MAC
+ * client we use the MAC client's maximum priority as the value.
+ */
+ mrp = &flent->fe_effective_props;
+ if ((mac_srs->srs_type & SRST_FLOW) != 0) {
+ mac_srs->srs_pri = FLOW_PRIORITY(mcip->mci_min_pri,
+ mcip->mci_max_pri, mrp->mrp_priority);
+ } else {
+ mac_srs->srs_pri = mcip->mci_max_pri;
+ }
+ mac_srs->srs_mcip = mcip;
+ /*
+ * We need to insert the SRS in the global list before
+ * binding the SRS and SR threads. Otherwise there is
+ * a small window where the cpu reconfig callbacks
+ * may miss the SRS in the list walk and DR could fail
+ * as there are bound threads.
+ */
+ mac_srs_add_glist(mac_srs);
+
+ /* Initialize bw limit */
+ if ((mrp->mrp_mask & MRP_MAXBW) != 0) {
+ mac_srs->srs_drain_func = mac_rx_srs_drain_bw;
+
+ mac_bw = mac_srs->srs_bw;
+ mutex_enter(&mac_bw->mac_bw_lock);
+ mac_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
+
+ /*
+ * Give twice the queuing capability before
+ * dropping packets. The unit is bytes/tick.
+ */
+ mac_bw->mac_bw_drop_threshold = mac_bw->mac_bw_limit << 1;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ mac_srs->srs_type |= SRST_BW_CONTROL;
+ } else {
+ mac_srs->srs_drain_func = mac_rx_srs_drain;
+ }
+
+ /*
+ * We use the following policy to control Receive
+ * Side Dynamic Polling:
+ * 1) We switch to poll mode anytime the processing thread causes
+ * a backlog to build up in SRS and its associated Soft Rings
+ * (sr_poll_pkt_cnt > 0).
+ * 2) As long as the backlog stays under the low water mark
+ * (sr_lowat), we poll the H/W for more packets.
+ * 3) If the backlog (sr_poll_pkt_cnt) exceeds low water mark, we
+ * stay in poll mode but don't poll the H/W for more packets.
+ * 4) Anytime in polling mode, if we poll the H/W for packets and
+ * find nothing while we still have an existing backlog
+ * (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll
+ * the H/W for packets anymore (let the polling thread go to sleep).
+ * 5) Once the backlog is relieved (packets are processed) we re-enable
+ * polling (by signalling the poll thread) only when the backlog
+ * dips below sr_poll_thres.
+ * 6) sr_hiwat is used exclusively when we are not polling capable
+ * and is used to decide when to drop packets so the SRS queue
+ * length doesn't grow infinitely.
+ */
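+ /*
+ * For instance (hypothetical values): if sr_lowat is 256 and
+ * sr_poll_thres is 128, a backlog of 200 keeps us polling the
+ * H/W (rule 2); at 300 we stay in poll mode without polling the
+ * H/W (rule 3); and the poll thread is re-signalled only once the
+ * backlog drains below 128 (rule 5).
+ */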
+ if (!is_tx_srs) {
+ srs_rx->sr_hiwat = mac_soft_ring_max_q_cnt;
+ /* Low water mark needs to be less than high water mark */
+ srs_rx->sr_lowat = mac_soft_ring_min_q_cnt <=
+ mac_soft_ring_max_q_cnt ? mac_soft_ring_min_q_cnt :
+ (mac_soft_ring_max_q_cnt >> 2);
+ /* Poll threshold needs to be half of the low water mark or less */
+ srs_rx->sr_poll_thres = mac_soft_ring_poll_thres <=
+ (srs_rx->sr_lowat >> 1) ? mac_soft_ring_poll_thres :
+ (srs_rx->sr_lowat >> 1);
+ if (mac_latency_optimize)
+ mac_srs->srs_state |= SRS_LATENCY_OPT;
+ }
+
+ mac_srs->srs_worker = thread_create(NULL, 0,
+ mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri);
+
+ if (is_tx_srs) {
+ /* Handle everything about Tx SRS and return */
+ mac_srs->srs_drain_func = mac_tx_srs_drain;
+ srs_tx->st_max_q_cnt = mac_tx_srs_max_q_cnt;
+ srs_tx->st_hiwat =
+ (mac_tx_srs_hiwat > mac_tx_srs_max_q_cnt) ?
+ mac_tx_srs_max_q_cnt : mac_tx_srs_hiwat;
+ srs_tx->st_arg1 = x_arg1;
+ srs_tx->st_arg2 = x_arg2;
+ return (mac_srs);
+ }
+
+ if ((srs_type & SRST_FLOW) != 0 ||
+ FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+ srs_rx->sr_lower_proc = mac_rx_srs_process;
+ else
+ srs_rx->sr_lower_proc = mac_rx_srs_subflow_process;
+
+ srs_rx->sr_func = rx_func;
+ srs_rx->sr_arg1 = x_arg1;
+ srs_rx->sr_arg2 = x_arg2;
+
+ if (ring != NULL) {
+ /* Is the mac_srs created over the RX default group? */
+ if (ring->mr_gh == (mac_group_handle_t)
+ (&mcip->mci_mip->mi_rx_groups[0]))
+ mac_srs->srs_type |= SRST_DEFAULT_GRP;
+
+ mac_srs->srs_ring = ring;
+ ring->mr_srs = mac_srs;
+ ring->mr_classify_type = MAC_HW_CLASSIFIER;
+ ring->mr_flag |= MR_INCIPIENT;
+
+ if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+ mac_srs->srs_state |= SRS_POLLING_CAPAB;
+
+ mac_srs->srs_poll_thr = thread_create(NULL, 0,
+ mac_rx_srs_poll_ring, mac_srs, 0, &p0, TS_RUN,
+ mac_srs->srs_pri);
+ }
+ return (mac_srs);
+}
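+
+/*
+ * Note: only a ring-backed Rx SRS gets a poll thread (created above);
+ * a Tx SRS returns early with just its worker thread, and a s/w
+ * classified Rx SRS (ring == NULL) also runs without a poll thread.
+ */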
+
+/*
+ * Figure out the number of soft rings required. It depends on
+ * whether protocol fanout is required (for LINKs), whether global settings
+ * require us to do fanout for performance (based on mac_soft_ring_enable),
+ * or whether the user has specifically requested fanout.
+ */
+static uint32_t
+mac_find_fanout(flow_entry_t *flent, uint32_t link_type)
+{
+ uint32_t fanout_type;
+ mac_resource_props_t *mrp = &flent->fe_effective_props;
+
+ /* no fanout for subflows */
+ switch (link_type) {
+ case SRST_FLOW:
+ fanout_type = SRST_NO_SOFT_RINGS;
+ break;
+ case SRST_LINK:
+ fanout_type = SRST_FANOUT_PROTO;
+ break;
+ }
+
+ /* A primary NIC/link is being plumbed */
+ if (flent->fe_type & FLOW_PRIMARY_MAC) {
+ if (mac_soft_ring_enable && mac_rx_soft_ring_count > 1) {
+ fanout_type |= SRST_FANOUT_SRC_IP;
+ }
+ } else if (flent->fe_type & FLOW_VNIC) {
+ /* A VNIC is being created */
+ if (mrp != NULL && mrp->mrp_ncpus > 0) {
+ fanout_type |= SRST_FANOUT_SRC_IP;
+ }
+ }
+
+ return (fanout_type);
+}
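+
+/*
+ * For example, a primary NIC with mac_soft_ring_enable set and
+ * mac_rx_soft_ring_count > 1 gets SRST_FANOUT_PROTO |
+ * SRST_FANOUT_SRC_IP, while a subflow (SRST_FLOW) always gets
+ * SRST_NO_SOFT_RINGS.
+ */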
+
+/*
+ * Change a group from h/w to s/w classification.
+ */
+static void
+mac_rx_switch_grp_to_sw(mac_group_t *group)
+{
+ mac_ring_t *ring;
+ mac_soft_ring_set_t *mac_srs;
+
+ for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
+ if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
+ /*
+ * Remove the SRS associated with the HW ring.
+ * As a result, polling will be disabled.
+ */
+ mac_srs = ring->mr_srs;
+ ASSERT(mac_srs != NULL);
+ mac_rx_srs_remove(mac_srs);
+ ring->mr_srs = NULL;
+ }
+
+ if (ring->mr_state != MR_INUSE)
+ (void) mac_start_ring(ring);
+ /*
+ * We need to perform SW classification
+ * for packets landing in these rings
+ */
+ ring->mr_state = MR_INUSE;
+ ring->mr_flag = 0;
+ ring->mr_classify_type = MAC_SW_CLASSIFIER;
+ }
+}
+
+/*
+ * Create the Rx SRS for S/W classifier and for each ring in the
+ * group (if exclusive group). Also create the Tx SRS.
+ */
+void
+mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_group_t *group, uint32_t link_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_soft_ring_set_t *mac_srs;
+ mac_soft_ring_set_t *tx_srs = NULL;
+ mac_ring_t *ring;
+ uint32_t fanout_type;
+ boolean_t created_srs = B_FALSE;
+
+ fanout_type = mac_find_fanout(flent, link_type);
+
+ /* Create the SRS for S/W classification if none exists */
+ if (flent->fe_rx_srs[0] == NULL) {
+ ASSERT(flent->fe_rx_srs_cnt == 0);
+ /* Setup the Rx SRS */
+ mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type,
+ mac_rx_deliver, mcip, NULL, NULL);
+
+ mutex_enter(&flent->fe_lock);
+ flent->fe_cb_fn = (flow_fn_t)mac_srs->srs_rx.sr_lower_proc;
+ flent->fe_cb_arg1 = (void *)mip;
+ flent->fe_cb_arg2 = (void *)mac_srs;
+ mutex_exit(&flent->fe_lock);
+
+ /* Setup the Tx SRS as well */
+ ASSERT(flent->fe_tx_srs == NULL);
+ tx_srs = mac_srs_create(mcip, flent, SRST_TX | link_type,
+ NULL, mcip, NULL, NULL);
+
+ if (mcip->mci_share != NULL) {
+ mac_srs_tx_t *tx = &tx_srs->srs_tx;
+ ASSERT(!mcip->mci_no_hwrings);
+ /*
+ * A share requires a dedicated TX group.
+ * mac_reserve_tx_group() does the work needed to
+ * allocate a new group and populate that group
+ * with rings according to the driver requirements
+ * and limitations.
+ */
+ tx->st_group =
+ mac_reserve_tx_group(mip, mcip->mci_share);
+ ASSERT(tx->st_group != NULL);
+ tx->st_group->mrg_tx_client = mcip;
+ }
+ mac_tx_srs_setup(mcip, flent, link_type);
+ created_srs = B_TRUE;
+ }
+
+ if (group == NULL) {
+ if (created_srs) {
+ mac_fanout_setup(mcip, flent,
+ MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver,
+ mcip, NULL);
+ }
+ return;
+ }
+
+ /*
+ * Fanout for the default SRS is done when the default SRS is created
+ * above. As each ring is added to the group, we set up the
+ * SRS and fanout to it.
+ */
+ switch (group->mrg_state) {
+ case MAC_GROUP_STATE_RESERVED:
+ /*
+ * The group is exclusively ours. Create a SRS
+ * for each ring in the group and allow the
+ * individual SRS to dynamically poll their
+ * Rx ring. Do this only if the client is not
+ * a VLAN MAC client since for VLAN we do
+ * s/w classification for the VID check.
+ */
+ if (i_mac_flow_vid(mcip->mci_flent) != VLAN_ID_NONE)
+ break;
+ for (ring = group->mrg_rings; ring != NULL;
+ ring = ring->mr_next) {
+ switch (ring->mr_state) {
+ case MR_INUSE:
+ case MR_FREE:
+ if (ring->mr_srs != NULL)
+ break;
+ if (ring->mr_state != MR_INUSE)
+ (void) mac_start_ring(ring);
+
+ ring->mr_state = MR_INUSE;
+
+ mac_srs = mac_srs_create(mcip, flent,
+ fanout_type | link_type,
+ mac_rx_deliver, mcip, NULL, ring);
+ if (mip->mi_v12n_level & MAC_VIRT_SERIALIZE) {
+ mac_srs->srs_rx.sr_enqueue_always =
+ B_TRUE;
+ }
+ break;
+ default:
+ cmn_err(CE_PANIC, "srs_setup: mcip = %p "
+ "trying to add UNKNOWN ring = %p\n",
+ (void *)mcip, (void *)ring);
+ break;
+ }
+ }
+ break;
+ case MAC_GROUP_STATE_SHARED:
+ /*
+ * Set all rings of this group to software classified.
+ *
+ * If the group is currently RESERVED, the existing mac client
+ * (the only client on this group) is using this group
+ * exclusively. In that case we need to disable polling on
+ * the rings of the group (if it was enabled), and free the
+ * SRS associated with the rings.
+ */
+ mac_rx_switch_grp_to_sw(group);
+ break;
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+ mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
+ mac_rx_deliver, mcip, NULL);
+}
+
+void
+mac_srs_group_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t link_type)
+{
+ mac_soft_ring_set_t *mac_srs;
+ mac_soft_ring_set_t *tx_srs;
+ mac_srs_tx_t *tx;
+ int i;
+
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ mac_srs = flent->fe_rx_srs[i];
+ mac_rx_srs_quiesce(mac_srs, SRS_CONDEMNED);
+ /*
+ * Deal with all fanout tear down etc.
+ */
+ mac_srs_free(mac_srs);
+ flent->fe_rx_srs[i] = NULL;
+ }
+ flent->fe_rx_srs_cnt = 0;
+
+ tx_srs = flent->fe_tx_srs;
+ tx = &tx_srs->srs_tx;
+ switch (link_type) {
+ case SRST_FLOW:
+ /*
+ * For flows, we need to work with passed
+ * flent to find the Rx/Tx SRS.
+ */
+ mac_tx_srs_quiesce(tx_srs, SRS_CONDEMNED);
+ break;
+ case SRST_LINK:
+ mac_tx_client_quiesce(mcip, SRS_CONDEMNED);
+ /*
+ * Release the TX resources. First the TX group, if any
+ * was assigned to the MAC client, which will cause the
+ * TX rings to be moved back to the pool. Then free the
+ * rings themselves.
+ */
+ if (tx->st_group != NULL) {
+ mac_release_tx_group(tx_srs->srs_mcip->mci_mip,
+ tx->st_group);
+ tx->st_group = NULL;
+ }
+ if (tx->st_arg2 != NULL) {
+ ASSERT(tx_srs->srs_type & SRST_TX);
+ mac_release_tx_ring(tx->st_arg2);
+ }
+ break;
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+ mac_srs_free(tx_srs);
+ flent->fe_tx_srs = NULL;
+}
+
+/*
+ * This is the group state machine. The state of an Rx group is given by
+ * the following table. The default group and its rings are started in
+ * mac_start itself and the default group stays in SHARED state until
+ * mac_stop, at which time the group and rings are stopped and it
+ * reverts to the REGISTERED state.
+ *
+ * Typically this function is called on a group after adding or removing a
+ * client from it, to find out what should be the new state of the group.
+ * If the new state is RESERVED, then the client that owns this group
+ * exclusively is also returned. Note that adding or removing a client from
+ * a group could also impact the default group and the caller needs to
+ * evaluate the effect on the default group.
+ *
+ * Group type # of clients mi_nactiveclients Group State
+ * in the group
+ *
+ * Non-default 0 N.A. REGISTERED
+ * Non-default 1 N.A. RESERVED
+ * Non-default > 1 N.A. SHARED
+ *
+ * Default 0 N.A. SHARED
+ * Default 1 1 RESERVED
+ * Default 1 > 1 SHARED
+ * Default > 1 N.A. SHARED
+ */
+mac_group_state_t
+mac_rx_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip)
+{
+ mac_impl_t *mip = (mac_impl_t *)grp->mrg_mh;
+
+ *group_only_mcip = NULL;
+
+ /* Non-default group */
+
+ if (grp != mip->mi_rx_groups) {
+ if (MAC_RX_GROUP_NO_CLIENT(grp))
+ return (MAC_GROUP_STATE_REGISTERED);
+
+ *group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(grp);
+ if (*group_only_mcip != NULL)
+ return (MAC_GROUP_STATE_RESERVED);
+
+ return (MAC_GROUP_STATE_SHARED);
+ }
+
+ /* Default group */
+
+ if (MAC_RX_GROUP_NO_CLIENT(grp) || mip->mi_nactiveclients != 1)
+ return (MAC_GROUP_STATE_SHARED);
+
+ *group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(grp);
+ ASSERT(*group_only_mcip != NULL);
+ return (MAC_GROUP_STATE_RESERVED);
+}
+
+/*
+ * OVERVIEW NOTES FOR DATAPATH
+ * ===========================
+ *
+ * Create an SRS and setup the corresponding flow function and args.
+ * Add a classification rule for the flow specified by 'flent' and program
+ * the hardware classifier when applicable.
+ *
+ * Rx ring assignment, SRS, polling and B/W enforcement
+ * ----------------------------------------------------
+ *
+ * We try to use H/W classification on the NIC and assign traffic for a
+ * MAC address to a particular Rx ring. There is a 1-1 mapping
+ * between a SRS and a Rx ring. The SRS (short for soft ring set)
+ * dynamically switches the underlying Rx ring between interrupt
+ * and polling mode and enforces any specified B/W control.
+ *
+ * There is always a SRS created and tied to each H/W and S/W rule.
+ * Whenever we create a H/W rule, we always add the same rule to the
+ * S/W classifier and tie a SRS to it.
+ *
+ * In case a B/W control is specified, it is broken into bytes
+ * per tick and as soon as the quota for a tick is exhausted,
+ * the underlying Rx ring is forced into poll mode for the remaining
+ * tick. The SRS poll thread only polls for bytes that are
+ * allowed to come in the SRS. We typically let 4x the configured
+ * B/W worth of packets to come in the SRS (to prevent unnecessary
+ * drops due to bursts) but only process the specified amount.
+ *
+ * A Link (primary NIC, VNIC, VLAN or aggr) can have 1 or more
+ * Rx rings (and corresponding SRSs) assigned to it. The SRS
+ * in turn can have softrings to do protocol level fanout or
+ * softrings to do S/W based fanout or both. In case the NIC
+ * has no Rx rings, we do S/W classification to respective SRS.
+ * The S/W classification rule is always setup and ready. This
+ * allows the MAC layer to reassign Rx rings whenever needed
+ * but packets still continue to flow via the default path and
+ * get S/W classified to the correct SRS.
+ *
+ * In other cases where a NIC or VNIC is plumbed, our goal is to use
+ * the H/W classifier and get two Rx rings assigned for the Link. One
+ * for TCP and one for UDP|SCTP. The respective SRSes still do the
+ * polling on the Rx ring. For a Link that is plumbed for IP, there
+ * is a TCP squeue which also does polling and can control
+ * the Rx ring directly (where the SRS is just a pass-through). For
+ * the following cases, the SRS does the polling underneath.
+ * 1) non IP based Links (Links which are not plumbed via ifconfig)
+ * and paths which have no IP squeues (UDP & SCTP)
+ * 2) If B/W control is specified on the Link
+ * 3) If S/W fanout is specified
+ *
+ * Note1: As of the current implementation, we try to assign only 1 Rx
+ * ring per Link and more than 1 Rx ring for the primary Link for
+ * H/W based fanout. We always create the following softrings per SRS:
+ * 1) TCP softring which is polled by TCP squeue where possible
+ * (and also bypasses DLS)
+ * 2) UDP/SCTP based which bypasses DLS
+ * 3) OTH softring which goes via DLS (currently deals with IPv6
+ * and non-TCP/UDP/SCTP IPv4 packets).
+ *
+ * It is necessary to create 3 softrings since SRS has to poll
+ * the single Rx ring underneath and enforce any link level B/W
+ * control (we can't switch the Rx ring in poll mode just based
+ * on TCP squeue if the same Rx ring is sharing UDP and other
+ * traffic as well). Once polling is done and any Link level B/W
+ * control is specified, the packets are assigned to respective
+ * softring based on protocol. Since TCP has an IP based squeue
+ * which benefits from polling, we separate TCP packets into
+ * their own softring which can be polled by the IP squeue. We need
+ * to separate out UDP/SCTP to the UDP softring since it can bypass
+ * the DLS layer, which has significant performance advantages, and we
+ * need a softring (OTH) for the rest.
+ *
+ * ToDo: The 3 softrings for protocol are needed only till we can
+ * get rid of DLS from datapath, make IPv4 and IPv6 paths
+ * symmetric (deal with mac_header_info for v6 and polling for
+ * IPv4 TCP - ip_accept_tcp is IPv4 specific although squeues
+ * are generic), and bring SAP based classification to the MAC layer.
+ *
+ * H/W and S/W based fanout and multiple Rx rings per Link
+ * -------------------------------------------------------
+ *
+ * In case fanout is requested (or determined automatically based
+ * on Link speed and processor speed), we try to assign multiple
+ * Rx rings per Link with their respective SRS. In this case
+ * the NIC should be capable of fanning out incoming packets between
+ * the assigned Rx rings (H/W based fanout). All the SRSes
+ * individually switch their Rx ring between interrupt and polling
+ * mode but share a common B/W control counter in case a Link
+ * level B/W is specified.
+ *
+ * If S/W based fanout is specified in lieu of H/W based fanout,
+ * the Link SRS creates the specified number of softrings for
+ * each protocol (TCP, UDP, OTH). Incoming packets are fanned
+ * out to the correct softring based on their protocol and
+ * protocol specific hash function.
+ *
+ * Primary and non primary MAC clients
+ * -----------------------------------
+ *
+ * The NICs, VNICs, Vlans, and Aggrs are typically termed as Links
+ * and are a Layer 2 construct.
+ *
+ * Primary NIC:
+ * The Link that owns the primary MAC address and typically
+ * is used as the data NIC in non virtualized cases. As such
+ * H/W resources are preferentially given to the primary NIC. As
+ * far as code is concerned, there is no difference in the
+ * primary NIC vs VNICs. They are all treated as Links.
+ * At the very first call to mac_unicast_add() we program the S/W
+ * classifier for the primary MAC address, get a soft ring set
+ * (and soft rings based on 'ip_soft_ring_cnt')
+ * and a Rx ring assigned for polling to get enabled.
+ * When IP gets plumbed and negotiates polling, we can
+ * let squeue do the polling on TCP softring.
+ *
+ * VNICs:
+ * Same as any other Link. As long as the H/W resource assignments
+ * are equal, the data path and setup for all Links are the same.
+ *
+ * Flows:
+ * Can be configured on Links. They have their own SRS and the
+ * S/W classifier is programmed appropriately based on the flow.
+ * The flows typically deal with layer 3 and above and
+ * create a soft ring set specific to the flow. The receive
+ * side function is switched from mac_rx_srs_process to
+ * mac_rx_srs_subflow_process, which first tries to assign the
+ * packet to the appropriate flow SRS and, failing that, assigns it
+ * to the link SRS. This allows us to avoid the layered approach
+ * which gets complex.
+ *
+ * By the time mac_datapath_setup() completes, we already have the
+ * soft ring sets, Rx rings, soft rings, etc. figured out and both H/W
+ * and S/W classifiers programmed. IP is not plumbed yet (and might
+ * never be for Virtual Machines guest OS path). When IP is plumbed
+ * (for both NIC and VNIC), we do a capability negotiation for polling
+ * and upcall functions etc.
+ *
+ * Rx ring Assignment NOTES
+ * ------------------------
+ *
+ * For NICs which have only 1 Rx ring (we treat NICs with no Rx rings
+ * as NICs with a single default ring), we assign the only ring to the
+ * primary Link as MAC_RX_HW_DEFAULT_RING. The primary Link SRS can do
+ * polling on it as long as it is the only link in use, and we compare
+ * the MAC address for unicast packets before accepting an incoming
+ * packet (there is no need for S/W classification in this case). We
+ * disable polling on the only ring the moment a 2nd link gets created
+ * (polling remains enabled even when broadcast and multicast flows
+ * are created).
+ *
+ * If the NIC has more than 1 Rx ring, we assign the default ring (the
+ * 1st ring) to deal with broadcast, multicast and traffic for other
+ * NICs which needs S/W classification. We assign the primary MAC
+ * address to another ring by specifying a classification rule that
+ * directs the primary unicast MAC address to the selected ring. The
+ * primary Link (and its SRS) can continue to poll the assigned Rx
+ * ring at all times independently.
+ *
+ * Right now we just assign MAC_RX_HW_DEFAULT_RING to note that it is
+ * the primary NIC; later we will check how many Rx rings we have and
+ * whether we can get a non default Rx ring for the primary MAC.
+ *
+ * Note: In the future, if no fanout is specified, we will try to
+ * assign 2 Rx rings to the primary Link, with primary MAC address +
+ * TCP going to one ring and primary MAC address + UDP|SCTP going to
+ * the other ring. Any remaining traffic for the primary MAC address
+ * can go to the default Rx ring and get S/W classified. This way the
+ * respective SRSes don't need to do proto fanout, don't need to have
+ * softrings at all, and can poll their respective Rx rings.
+ *
+ * As an optimization, when a new NIC or VNIC is created, we can get
+ * only one Rx ring and make it a TCP specific Rx ring and use the
+ * H/W default Rx ring for the rest (this Rx ring is never polled).
+ */
+int
+mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t link_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_group_t *group = NULL;
+ mac_group_t *default_group;
+ int err;
+ uint8_t *mac_addr;
+ mac_rx_group_reserve_type_t rtype = MAC_RX_RESERVE_NONDEFAULT;
+ mac_group_state_t next_state;
+ mac_client_impl_t *group_only_mcip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ switch (link_type) {
+ case SRST_FLOW:
+ mac_srs_group_setup(mcip, flent, NULL, link_type);
+ return (0);
+
+ case SRST_LINK:
+ mac_addr = flent->fe_flow_desc.fd_dst_mac;
+
+ /* Check if we need to reserve the default group */
+ if (flent->fe_type & FLOW_PRIMARY_MAC)
+ rtype = MAC_RX_RESERVE_DEFAULT;
+
+ if (!mcip->mci_no_hwrings) {
+ /*
+ * Check to see if we can get an exclusive group for
+ * this mac address or if there already exists a
+ * group that has this mac address (case of VLANs).
+ * If no groups are available, use the default group.
+ */
+ group = mac_reserve_rx_group(mcip, mac_addr, rtype);
+ }
+
+ if (group == NULL) {
+ if (mcip->mci_req_hwrings)
+ return (ENOSPC);
+ group = &mip->mi_rx_groups[0];
+ }
+
+ /*
+ * Some NICs don't support any Rx rings, so there may not
+ * even be a default group.
+ */
+ if (group != NULL) {
+ flent->fe_rx_ring_group = group;
+ /*
+ * Add the client to the group. This could cause
+ * either this group to move to the shared state or
+ * cause the default group to move to the shared state.
+ * The actions on this group are done here, while the
+ * actions on the default group are postponed to
+ * the end of this function.
+ */
+ mac_rx_group_add_client(group, mcip);
+ next_state = mac_rx_group_next_state(group,
+ &group_only_mcip);
+
+ ASSERT((next_state == MAC_GROUP_STATE_RESERVED &&
+ mcip == group_only_mcip) ||
+ (next_state == MAC_GROUP_STATE_SHARED &&
+ group_only_mcip == NULL));
+
+ mac_set_rx_group_state(group, next_state);
+ }
+
+ /*
+		 * Set up the Rx and Tx SRSes. If we got a pristine group
+		 * exclusively above, mac_srs_group_setup would simply create
+		 * the required SRSes. If we ended up sharing a previously
+		 * reserved group, mac_srs_group_setup would also dismantle
+		 * the SRSes of the previously exclusive group.
+ */
+ mac_srs_group_setup(mcip, flent, group, link_type);
+
+		/* Program the S/W Classifier */
+ if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0)
+ goto setup_failed;
+
+ /* Program the H/W Classifier */
+ if ((err = mac_add_macaddr(mip, group, mac_addr)) != 0)
+ goto setup_failed;
+ mcip->mci_unicast = mac_find_macaddr(mip, mac_addr);
+ ASSERT(mcip->mci_unicast != NULL);
+ break;
+
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+
+ /*
+ * All broadcast and multicast traffic is received only on the default
+ * group. If we have setup the datapath for a non-default group above
+ * then move the default group to shared state to allow distribution of
+ * incoming broadcast traffic to the other groups and dismantle the
+ * SRSes over the default group.
+ */
+ if (group != NULL) {
+ if (group != mip->mi_rx_groups) {
+ default_group = mip->mi_rx_groups;
+ if (default_group->mrg_state ==
+ MAC_GROUP_STATE_RESERVED) {
+ group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(
+ default_group);
+ ASSERT(group_only_mcip != NULL &&
+ mip->mi_nactiveclients > 1);
+
+ mac_set_rx_group_state(default_group,
+ MAC_GROUP_STATE_SHARED);
+ mac_srs_group_setup(group_only_mcip,
+ group_only_mcip->mci_flent,
+ default_group, SRST_LINK);
+ }
+ ASSERT(default_group->mrg_state ==
+ MAC_GROUP_STATE_SHARED);
+ }
+ /*
+ * If we get an exclusive group for a VLAN MAC client we
+ * need to take the s/w path to make the additional check for
+ * the vid. Disable polling and set it to s/w classification.
+ */
+ if (group->mrg_state == MAC_GROUP_STATE_RESERVED &&
+ i_mac_flow_vid(mcip->mci_flent) != VLAN_ID_NONE) {
+ mac_rx_switch_grp_to_sw(group);
+ }
+ }
+ return (0);
+
+setup_failed:
+ mac_datapath_teardown(mcip, flent, link_type);
+ return (err);
+}
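+
+/*
+ * A minimal sketch of the per-protocol softring fanout described in the
+ * big block comment above: TCP, UDP/SCTP and other packets go to separate
+ * softring lists, indexed by a flow hash. The srs_*_soft_rings arrays are
+ * the same ones freed in mac_srs_fanout_list_free() below; the protocol
+ * and hash arguments, and this helper itself, are illustrative assumptions
+ * rather than the actual datapath code.
+ */
+static mac_soft_ring_t *
+mac_srs_proto_fanout_sketch(mac_soft_ring_set_t *mac_srs, uint8_t proto,
+    uint32_t hash, uint_t fanout_cnt)
+{
+	/* TCP gets its own softrings so the IP squeue can poll them */
+	if (proto == IPPROTO_TCP)
+		return (mac_srs->srs_tcp_soft_rings[hash % fanout_cnt]);
+
+	/* UDP and SCTP can bypass DLS, hence a separate softring list */
+	if (proto == IPPROTO_UDP || proto == IPPROTO_SCTP)
+		return (mac_srs->srs_udp_soft_rings[hash % fanout_cnt]);
+
+	/* Everything else lands on the OTH softrings */
+	return (mac_srs->srs_oth_soft_rings[hash % fanout_cnt]);
+}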
+
+void
+mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t link_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_group_t *group = NULL;
+ mac_client_impl_t *grp_only_mcip;
+ flow_entry_t *group_only_flent;
+ mac_group_t *default_group;
+ boolean_t check_default_group = B_FALSE;
+ mac_group_state_t next_state;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ switch (link_type) {
+ case SRST_FLOW:
+ mac_srs_group_teardown(mcip, flent, SRST_FLOW);
+ return;
+
+ case SRST_LINK:
+ /* Stop sending packets */
+ mac_tx_client_block(mcip);
+
+ /* Stop the packets coming from the H/W */
+ if (mcip->mci_unicast != NULL) {
+ int err;
+ err = mac_remove_macaddr(mcip->mci_unicast);
+ if (err != 0) {
+ cmn_err(CE_WARN, "%s: failed to remove a MAC"
+ " address because of error 0x%x",
+ mip->mi_name, err);
+ }
+ mcip->mci_unicast = NULL;
+ }
+
+ /* Stop the packets coming from the S/W classifier */
+ mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
+ mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+
+ /* Now quiesce and destroy all SRS and soft rings */
+ mac_srs_group_teardown(mcip, flent, SRST_LINK);
+ ASSERT((mcip->mci_flent == flent) &&
+ (flent->fe_next == NULL));
+
+ /*
+ * Release our hold on the group as well. We need
+ * to check if the shared group has only one client
+ * left who can use it exclusively. Also, if we
+ * were the last client, release the group.
+ */
+ group = flent->fe_rx_ring_group;
+ if (group != NULL) {
+ mac_rx_group_remove_client(group, mcip);
+ next_state = mac_rx_group_next_state(group,
+ &grp_only_mcip);
+ if (next_state == MAC_GROUP_STATE_RESERVED) {
+ /*
+ * Only one client left on this RX group.
+ */
+ ASSERT(grp_only_mcip != NULL);
+ mac_set_rx_group_state(group,
+ MAC_GROUP_STATE_RESERVED);
+ group_only_flent = grp_only_mcip->mci_flent;
+
+ /*
+ * The only remaining client has exclusive
+ * access on the group. Allow it to
+ * dynamically poll the H/W rings etc.
+ */
+ mac_srs_group_setup(grp_only_mcip,
+ group_only_flent, group, SRST_LINK);
+ mac_rx_group_unmark(group, MR_INCIPIENT);
+ } else if (next_state == MAC_GROUP_STATE_REGISTERED) {
+ /*
+ * This is a non-default group being freed up.
+ * We need to reevaluate the default group
+ * to see if the primary client can get
+ * exclusive access to the default group.
+ */
+ ASSERT(group != mip->mi_rx_groups);
+ mac_release_rx_group(mcip, group);
+ mac_set_rx_group_state(group,
+ MAC_GROUP_STATE_REGISTERED);
+ check_default_group = B_TRUE;
+ } else {
+ ASSERT(next_state == MAC_GROUP_STATE_SHARED);
+ mac_set_rx_group_state(group,
+ MAC_GROUP_STATE_SHARED);
+ mac_rx_group_unmark(group, MR_CONDEMNED);
+ }
+ flent->fe_rx_ring_group = NULL;
+ }
+ break;
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+
+ /*
+ * The mac client using the default group gets exclusive access to the
+ * default group if and only if it is the sole client on the entire
+ * mip. If so, set the group state to reserved, and set up the SRSes
+ * over the default group.
+ */
+ if (check_default_group) {
+ default_group = mip->mi_rx_groups;
+ ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED);
+ next_state = mac_rx_group_next_state(default_group,
+ &grp_only_mcip);
+ if (next_state == MAC_GROUP_STATE_RESERVED) {
+ ASSERT(grp_only_mcip != NULL &&
+ mip->mi_nactiveclients == 1);
+ mac_set_rx_group_state(default_group,
+ MAC_GROUP_STATE_RESERVED);
+ mac_srs_group_setup(grp_only_mcip,
+ grp_only_mcip->mci_flent,
+ default_group, SRST_LINK);
+ }
+ }
+}
+
+/* DATAPATH TEAR DOWN ROUTINES (SRS and FANOUT teardown) */
+
+static void
+mac_srs_fanout_list_free(mac_soft_ring_set_t *mac_srs)
+{
+ ASSERT(mac_srs->srs_tcp_soft_rings != NULL);
+ kmem_free(mac_srs->srs_tcp_soft_rings,
+ sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
+ mac_srs->srs_tcp_soft_rings = NULL;
+ ASSERT(mac_srs->srs_udp_soft_rings != NULL);
+ kmem_free(mac_srs->srs_udp_soft_rings,
+ sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
+ mac_srs->srs_udp_soft_rings = NULL;
+ ASSERT(mac_srs->srs_oth_soft_rings != NULL);
+ kmem_free(mac_srs->srs_oth_soft_rings,
+ sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
+ mac_srs->srs_oth_soft_rings = NULL;
+}
+
+/*
+ * An RX SRS is attached to at most one mac_ring.
+ * A TX SRS has no rings.
+ */
+static void
+mac_srs_ring_free(mac_soft_ring_set_t *mac_srs)
+{
+ mac_client_impl_t *mcip;
+ mac_ring_t *ring;
+ flow_entry_t *flent;
+
+ ring = mac_srs->srs_ring;
+ if (mac_srs->srs_type & SRST_TX) {
+ ASSERT(ring == NULL);
+ return;
+ }
+
+ if (ring == NULL)
+ return;
+
+ /*
+	 * Broadcast flows don't have a client impl association, but since
+	 * they use only soft rings, they never reach this point with a ring.
+ */
+ flent = mac_srs->srs_flent;
+ mcip = flent->fe_mcip;
+ ASSERT(mcip != NULL);
+
+ ring->mr_classify_type = MAC_NO_CLASSIFIER;
+ ring->mr_srs = NULL;
+}
+
+/*
+ * Physical unlink and free of the data structures happen below. This is
+ * driven from mac_flow_destroy(), on the last refrele of a flow.
+ *
+ * Assumes an Rx SRS is 1-1 mapped with a ring.
+ */
+void
+mac_srs_free(mac_soft_ring_set_t *mac_srs)
+{
+ ASSERT(mac_srs->srs_mcip == NULL ||
+ MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));
+ ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE |
+ SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE));
+
+ mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE);
+ mac_srs_ring_free(mac_srs);
+ mac_srs_soft_rings_free(mac_srs, B_TRUE);
+ mac_srs_fanout_list_free(mac_srs);
+
+ mac_srs->srs_bw = NULL;
+ kmem_cache_free(mac_srs_cache, mac_srs);
+}
+
+static void
+mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *mac_srs, uint_t s_ring_flag)
+{
+ mac_soft_ring_t *softring;
+
+ ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
+
+ mac_srs_soft_rings_signal(mac_srs, s_ring_flag);
+ if (s_ring_flag == S_RING_CONDEMNED) {
+ while (mac_srs->srs_soft_ring_condemned_count !=
+ mac_srs->srs_soft_ring_count)
+ cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
+ } else {
+ while (mac_srs->srs_soft_ring_quiesced_count !=
+ mac_srs->srs_soft_ring_count)
+ cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
+ }
+ mutex_exit(&mac_srs->srs_lock);
+
+ for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
+ softring = softring->s_ring_next)
+ (void) untimeout(softring->s_ring_tid);
+
+ (void) untimeout(mac_srs->srs_tid);
+
+ mutex_enter(&mac_srs->srs_lock);
+}
+
+/*
+ * The block comment above mac_rx_classify_flow_state_change explains the
+ * background. At this point upcalls from the driver (both hardware classified
+ * and software classified) have been cut off. We now need to quiesce the
+ * SRS worker, poll, and softring threads. The SRS worker thread serves as
+ * the master controller. The steps involved are described below in the function
+ */
+void
+mac_srs_worker_quiesce(mac_soft_ring_set_t *mac_srs)
+{
+ uint_t s_ring_flag;
+ uint_t srs_poll_wait_flag;
+
+ ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
+ ASSERT(mac_srs->srs_state & (SRS_CONDEMNED | SRS_QUIESCE));
+
+ if (mac_srs->srs_state & SRS_CONDEMNED) {
+ s_ring_flag = S_RING_CONDEMNED;
+ srs_poll_wait_flag = SRS_POLL_THR_EXITED;
+ } else {
+ s_ring_flag = S_RING_QUIESCE;
+ srs_poll_wait_flag = SRS_POLL_THR_QUIESCED;
+ }
+
+ /*
+	 * In the case of an Rx SRS, wait till the poll thread is done.
+ */
+ if ((mac_srs->srs_type & SRST_TX) == 0 &&
+ mac_srs->srs_poll_thr != NULL) {
+ while (!(mac_srs->srs_state & srs_poll_wait_flag))
+ cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
+
+ /*
+ * Turn off polling as part of the quiesce operation.
+ */
+ MAC_SRS_POLLING_OFF(mac_srs);
+ mac_srs->srs_state &= ~(SRS_POLLING | SRS_GET_PKTS);
+ }
+
+ /*
+ * Then signal the soft ring worker threads to quiesce or quit
+ * as needed and then wait till that happens.
+ */
+ mac_srs_soft_rings_quiesce(mac_srs, s_ring_flag);
+
+ if (mac_srs->srs_state & SRS_CONDEMNED)
+ mac_srs->srs_state |= (SRS_QUIESCE_DONE | SRS_CONDEMNED_DONE);
+ else
+ mac_srs->srs_state |= SRS_QUIESCE_DONE;
+ cv_signal(&mac_srs->srs_quiesce_done_cv);
+}
+
+/*
+ * Signal an SRS to start a temporary quiesce, or permanent removal, or restart
+ * a quiesced SRS by setting the appropriate flags and signaling the SRS worker
+ * or poll thread. This function is internal to the quiescing logic and is
+ * called from the higher level SRS quiesce, flow quiesce, or client
+ * quiesce functions.
+ */
+void
+mac_srs_signal(mac_soft_ring_set_t *mac_srs, uint_t srs_flag)
+{
+ mac_ring_t *ring;
+
+ ring = mac_srs->srs_ring;
+ ASSERT(ring == NULL || ring->mr_refcnt == 0);
+
+ if (srs_flag == SRS_CONDEMNED) {
+ /*
+ * The SRS is going away. We need to unbind the SRS and SR
+ * threads before removing from the global SRS list. Otherwise
+ * there is a small window where the cpu reconfig callbacks
+ * may miss the SRS in the list walk and DR could fail since
+ * there are still bound threads.
+ */
+ mac_srs_threads_unbind(mac_srs);
+ mac_srs_remove_glist(mac_srs);
+ }
+ /*
+ * Wakeup the SRS worker and poll threads.
+ */
+ mutex_enter(&mac_srs->srs_lock);
+ mac_srs->srs_state |= srs_flag;
+ cv_signal(&mac_srs->srs_async);
+ cv_signal(&mac_srs->srs_cv);
+ mutex_exit(&mac_srs->srs_lock);
+}
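+
+/*
+ * Sketch of how a higher level caller drives a temporary quiesce with the
+ * primitives above: signal the SRS, then wait on srs_quiesce_done_cv for
+ * SRS_QUIESCE_DONE to be set by the worker. This mirrors the flow and
+ * client quiesce callers; the function itself is an illustrative
+ * assumption, not part of this file.
+ */
+static void
+mac_srs_quiesce_wait_sketch(mac_soft_ring_set_t *mac_srs)
+{
+	/* Wake the worker/poll threads and ask for a temporary quiesce */
+	mac_srs_signal(mac_srs, SRS_QUIESCE);
+
+	mutex_enter(&mac_srs->srs_lock);
+	while (!(mac_srs->srs_state & SRS_QUIESCE_DONE))
+		cv_wait(&mac_srs->srs_quiesce_done_cv, &mac_srs->srs_lock);
+	mutex_exit(&mac_srs->srs_lock);
+}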
+
+/*
+ * In the Rx side, the quiescing is done bottom up. After the Rx upcalls
+ * from the driver are done, then the Rx SRS is quiesced and only then can
+ * we signal the soft rings. Thus this function can't be called arbitrarily
+ * without satisfying the prerequisites. On the Tx side, the threads from
+ * top need to quiesced, then the Tx SRS and only then can we signal the
+ * Tx soft rings.
+ */
+static void
+mac_srs_soft_rings_signal(mac_soft_ring_set_t *mac_srs, uint_t sr_flag)
+{
+ mac_soft_ring_t *softring;
+
+ for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
+ softring = softring->s_ring_next)
+ mac_soft_ring_signal(softring, sr_flag);
+}
+
+/*
+ * The block comment above mac_rx_classify_flow_state_change explains the
+ * background. At this point the SRS is quiesced and we need to restart the
+ * SRS worker, poll, and softring threads. The SRS worker thread serves as
+ * the master controller. The steps involved are described below in the
+ * function.
+ */
+void
+mac_srs_worker_restart(mac_soft_ring_set_t *mac_srs)
+{
+ boolean_t iam_rx_srs;
+ mac_soft_ring_t *softring;
+
+ ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
+ if ((mac_srs->srs_type & SRST_TX) != 0) {
+ iam_rx_srs = B_FALSE;
+ ASSERT((mac_srs->srs_state &
+ (SRS_POLL_THR_QUIESCED | SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
+ (SRS_QUIESCE_DONE | SRS_QUIESCE));
+ } else {
+ iam_rx_srs = B_TRUE;
+ ASSERT((mac_srs->srs_state &
+ (SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
+ (SRS_QUIESCE_DONE | SRS_QUIESCE));
+ if (mac_srs->srs_poll_thr != NULL) {
+ ASSERT((mac_srs->srs_state & SRS_POLL_THR_QUIESCED) ==
+ SRS_POLL_THR_QUIESCED);
+ }
+ }
+
+ /*
+ * Signal any quiesced soft ring workers to restart and wait for the
+	 * soft ring quiesced count to come down to zero.
+ */
+ if (mac_srs->srs_soft_ring_quiesced_count != 0) {
+ for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
+ softring = softring->s_ring_next) {
+ if (!(softring->s_ring_state & S_RING_QUIESCE))
+ continue;
+ mac_soft_ring_signal(softring, S_RING_RESTART);
+ }
+ while (mac_srs->srs_soft_ring_quiesced_count != 0)
+ cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
+ }
+
+ mac_srs->srs_state &= ~(SRS_QUIESCE_DONE | SRS_QUIESCE | SRS_RESTART);
+ if (iam_rx_srs && mac_srs->srs_poll_thr != NULL) {
+ /*
+ * Signal the poll thread and ask it to restart. Wait till it
+ * actually restarts and the SRS_POLL_THR_QUIESCED flag gets
+ * cleared.
+ */
+ mac_srs->srs_state |= SRS_POLL_THR_RESTART;
+ cv_signal(&mac_srs->srs_cv);
+ while (mac_srs->srs_state & SRS_POLL_THR_QUIESCED)
+ cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
+ ASSERT(!(mac_srs->srs_state & SRS_POLL_THR_RESTART));
+ }
+ /* Wake up any waiter waiting for the restart to complete */
+ mac_srs->srs_state |= SRS_RESTART_DONE;
+ cv_signal(&mac_srs->srs_quiesce_done_cv);
+}
+
+static void
+mac_srs_worker_unbind(mac_soft_ring_set_t *mac_srs)
+{
+ mutex_enter(&mac_srs->srs_lock);
+ if (!(mac_srs->srs_state & SRS_WORKER_BOUND)) {
+ ASSERT(mac_srs->srs_worker_cpuid == -1);
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+
+ mac_srs->srs_worker_cpuid = -1;
+ mac_srs->srs_state &= ~SRS_WORKER_BOUND;
+ thread_affinity_clear(mac_srs->srs_worker);
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+static void
+mac_srs_poll_unbind(mac_soft_ring_set_t *mac_srs)
+{
+ mutex_enter(&mac_srs->srs_lock);
+ if (mac_srs->srs_poll_thr == NULL ||
+ (mac_srs->srs_state & SRS_POLL_BOUND) == 0) {
+ ASSERT(mac_srs->srs_poll_cpuid == -1);
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+
+ mac_srs->srs_poll_cpuid = -1;
+ mac_srs->srs_state &= ~SRS_POLL_BOUND;
+ thread_affinity_clear(mac_srs->srs_poll_thr);
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+static void
+mac_srs_threads_unbind(mac_soft_ring_set_t *mac_srs)
+{
+ mac_soft_ring_t *soft_ring;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));
+
+ mutex_enter(&cpu_lock);
+ mac_srs_worker_unbind(mac_srs);
+ if (!(mac_srs->srs_type & SRST_TX))
+ mac_srs_poll_unbind(mac_srs);
+
+ for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL;
+ soft_ring = soft_ring->s_ring_next) {
+ mac_soft_ring_unbind(soft_ring);
+ }
+ mutex_exit(&cpu_lock);
+}
+
+/*
+ * When a CPU is going away, unbind all MAC threads which are bound
+ * to that CPU. The affinity of the thread to the CPU is saved to allow
+ * the thread to be rebound to the CPU if it comes back online.
+ */
+static void
+mac_walk_srs_and_unbind(int cpuid)
+{
+ mac_soft_ring_set_t *mac_srs;
+ mac_soft_ring_t *soft_ring;
+
+ rw_enter(&mac_srs_g_lock, RW_READER);
+
+ if ((mac_srs = mac_srs_g_list) == NULL)
+ goto done;
+
+ for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) {
+ if (mac_srs->srs_worker_cpuid == cpuid) {
+ mac_srs->srs_worker_cpuid_save = cpuid;
+ mac_srs_worker_unbind(mac_srs);
+ }
+
+ if (!(mac_srs->srs_type & SRST_TX)) {
+ if (mac_srs->srs_poll_cpuid == cpuid) {
+ mac_srs->srs_poll_cpuid_save = cpuid;
+ mac_srs_poll_unbind(mac_srs);
+ }
+ }
+
+ /* Next tackle the soft rings associated with the srs */
+ mutex_enter(&mac_srs->srs_lock);
+ for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL;
+ soft_ring = soft_ring->s_ring_next) {
+ if (soft_ring->s_ring_cpuid == cpuid) {
+ soft_ring->s_ring_cpuid_save = cpuid;
+ mac_soft_ring_unbind(soft_ring);
+ }
+ }
+ mutex_exit(&mac_srs->srs_lock);
+ }
+done:
+ rw_exit(&mac_srs_g_lock);
+}
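+
+/*
+ * Sketch of the inverse of the walk above: when a saved CPU comes back
+ * online, a worker thread could be rebound to it using the saved cpuid.
+ * thread_affinity_set() is the counterpart of the thread_affinity_clear()
+ * calls in the unbind routines; this function is an illustrative
+ * assumption (the real rebind logic is not in this excerpt).
+ */
+static void
+mac_srs_worker_rebind_sketch(mac_soft_ring_set_t *mac_srs)
+{
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	mutex_enter(&mac_srs->srs_lock);
+	if (mac_srs->srs_worker_cpuid_save != -1) {
+		/* Restore the binding that mac_walk_srs_and_unbind() saved */
+		mac_srs->srs_worker_cpuid = mac_srs->srs_worker_cpuid_save;
+		mac_srs->srs_worker_cpuid_save = -1;
+		thread_affinity_set(mac_srs->srs_worker,
+		    mac_srs->srs_worker_cpuid);
+		mac_srs->srs_state |= SRS_WORKER_BOUND;
+	}
+	mutex_exit(&mac_srs->srs_lock);
+}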
+
+/* TX SETUP and TEARDOWN ROUTINES */
+
+/*
+ * XXXHIO need to make sure the two mac_tx_srs_{add,del}_ring()
+ * handle the case where the number of rings is one. I.e. there is
+ * a ring pointed to by mac_srs->srs_tx_arg2.
+ */
+void
+mac_tx_srs_add_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring)
+{
+ mac_client_impl_t *mcip = mac_srs->srs_mcip;
+ mac_soft_ring_t *soft_ring;
+ int count = mac_srs->srs_oth_ring_count;
+
+ ASSERT(mac_srs->srs_state & SRS_QUIESCE);
+ soft_ring = mac_soft_ring_create(count, 0, NULL,
+ (ST_RING_OTH | ST_RING_TX), maxclsyspri, mcip, mac_srs, -1,
+ NULL, mcip, (mac_resource_handle_t)tx_ring);
+ mac_srs->srs_oth_ring_count++;
+ /*
+	 * Put this soft ring in quiesce mode too, so that when we
+	 * restart, all soft rings in the SRS are in the same state.
+ */
+ mac_soft_ring_signal(soft_ring, S_RING_QUIESCE);
+}
+
+static void
+mac_soft_ring_remove(mac_soft_ring_set_t *mac_srs, mac_soft_ring_t *softring)
+{
+ int sringcnt;
+
+ mutex_enter(&mac_srs->srs_lock);
+ sringcnt = mac_srs->srs_soft_ring_count;
+ ASSERT(sringcnt > 0);
+ mac_soft_ring_signal(softring, S_RING_CONDEMNED);
+
+ ASSERT(mac_srs->srs_soft_ring_condemned_count == 0);
+ while (mac_srs->srs_soft_ring_condemned_count != 1)
+ cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
+
+ if (softring == mac_srs->srs_soft_ring_head) {
+ mac_srs->srs_soft_ring_head = softring->s_ring_next;
+ if (mac_srs->srs_soft_ring_head != NULL) {
+ mac_srs->srs_soft_ring_head->s_ring_prev = NULL;
+ } else {
+ mac_srs->srs_soft_ring_tail = NULL;
+ }
+ } else {
+ softring->s_ring_prev->s_ring_next =
+ softring->s_ring_next;
+ if (softring->s_ring_next != NULL) {
+ softring->s_ring_next->s_ring_prev =
+ softring->s_ring_prev;
+ } else {
+ mac_srs->srs_soft_ring_tail =
+ softring->s_ring_prev;
+ }
+ }
+ mac_srs->srs_soft_ring_count--;
+
+ mac_srs->srs_soft_ring_condemned_count--;
+ mutex_exit(&mac_srs->srs_lock);
+
+ mac_soft_ring_free(softring, B_FALSE);
+}
+
+void
+mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring)
+{
+ int i;
+ mac_soft_ring_t *soft_ring, *remove_sring;
+
+ mutex_enter(&mac_srs->srs_lock);
+ for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
+ soft_ring = mac_srs->srs_oth_soft_rings[i];
+ if (soft_ring->s_ring_tx_arg2 == tx_ring)
+ break;
+ }
+ mutex_exit(&mac_srs->srs_lock);
+ ASSERT(i < mac_srs->srs_oth_ring_count);
+ remove_sring = soft_ring;
+ mac_soft_ring_remove(mac_srs, remove_sring);
+ mac_srs_update_fanout_list(mac_srs);
+}
+
+/*
+ * mac_tx_srs_setup():
+ *
+ * Used to set up Tx rings. If no free Tx ring is available, the default
+ * Tx ring is used.
+ */
+void
+mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t srs_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_soft_ring_set_t *tx_srs;
+ int i, tx_ring_count = 0, tx_rings_reserved;
+ mac_ring_handle_t *tx_ring = NULL;
+ uint32_t soft_ring_type;
+ mac_group_t *grp = NULL;
+ mac_ring_t *ring;
+ mac_srs_tx_t *tx;
+ boolean_t serialize = B_FALSE;
+
+ tx_srs = flent->fe_tx_srs;
+ tx = &tx_srs->srs_tx;
+
+ if (tx->st_group != NULL) {
+ grp = tx->st_group;
+ tx_ring_count = grp->mrg_cur_count;
+ } else {
+ tx_ring_count = mac_tx_ring_count;
+ }
+
+ if (tx_ring_count != 0) {
+ tx_ring = kmem_zalloc(sizeof (mac_ring_handle_t) *
+ tx_ring_count, KM_SLEEP);
+ }
+
+ /*
+ * Just use the default ring for now. We need to use
+ * the underlying link's ring set instead of the underlying
+ * NIC's.
+ */
+ if (srs_type == SRST_FLOW || mcip->mci_no_hwrings)
+ goto use_default_ring;
+
+ if (mcip->mci_share != NULL)
+ ring = grp->mrg_rings;
+ /*
+	 * An attempt is made to reserve 'tx_ring_count' Tx rings.
+	 * If tx_ring_count is 0, the default Tx ring is used. If it is
+	 * 1, an attempt is made to reserve one Tx ring. In both cases,
+	 * the ring information is stored in the Tx SRS. If multiple Tx
+	 * rings are specified, then each Tx ring will have a Tx-side
+	 * soft ring. All these soft rings hang off the Tx SRS.
+ */
+ for (i = 0, tx_rings_reserved = 0;
+ i < tx_ring_count; i++, tx_rings_reserved++) {
+ if (mcip->mci_share != NULL) {
+ /*
+ * The ring was already chosen and associated
+ * with the TX group. Save it in the new
+ * array to keep as much of the code below common
+ * between the share and non-share cases.
+ */
+ ASSERT(ring != NULL);
+ tx_ring[i] = (mac_ring_handle_t)ring;
+ ring = ring->mr_next;
+ } else {
+ tx_ring[i] =
+ (mac_ring_handle_t)mac_reserve_tx_ring(mip, NULL);
+ if (tx_ring[i] == NULL)
+ break;
+ }
+ }
+ if (mac_tx_serialize || (mip->mi_v12n_level & MAC_VIRT_SERIALIZE))
+ serialize = B_TRUE;
+ /*
+ * Did we get the requested number of tx rings?
+ * There are 3 actions we can take depending upon the number
+ * of tx_rings we got.
+ * 1) If we got none, then hook up the tx_srs with the
+ * default ring.
+ * 2) If we got one, then get the tx_ring from the soft ring,
+ * save it in SRS and free up the soft ring.
+ * 3) If we got more than 1, then do the tx fanout among the
+ * rings we obtained.
+ */
+ switch (tx_rings_reserved) {
+ case 1:
+ /*
+ * No need to allocate Tx soft rings. Tx-side soft
+ * rings are for Tx fanout case. Just use Tx SRS.
+ */
+ /* FALLTHRU */
+
+ case 0:
+use_default_ring:
+ if (tx_rings_reserved == 0)
+ tx->st_arg2 = (void *)mip->mi_default_tx_ring;
+ else
+ tx->st_arg2 = (void *)tx_ring[0];
+ /* For ring_count of 0 or 1, set the tx_mode and return */
+ if (tx_srs->srs_type & SRST_BW_CONTROL)
+ tx->st_mode = SRS_TX_BW;
+ else if (serialize)
+ tx->st_mode = SRS_TX_SERIALIZE;
+ else
+ tx->st_mode = SRS_TX_DEFAULT;
+ break;
+
+ default:
+ /*
+ * We got multiple Tx rings for Tx fanout.
+ *
+ * cpuid of -1 is passed. This creates an unbound
+ * worker thread. Instead the code should get CPU
+ * binding information and pass that to
+ * mac_soft_ring_create(). This needs to be done
+ * in conjunction with Rx-side soft ring
+ * bindings.
+ */
+ soft_ring_type = ST_RING_OTH | ST_RING_TX;
+ if (tx_srs->srs_type & SRST_BW_CONTROL) {
+ tx->st_mode = SRS_TX_BW_FANOUT;
+ } else {
+ tx->st_mode = SRS_TX_FANOUT;
+ if (serialize)
+ soft_ring_type |= ST_RING_WORKER_ONLY;
+ }
+ for (i = 0; i < tx_rings_reserved; i++) {
+ (void) mac_soft_ring_create(i, 0, NULL, soft_ring_type,
+ maxclsyspri, mcip, tx_srs, -1, NULL, mcip,
+ (mac_resource_handle_t)tx_ring[i]);
+ }
+ mac_srs_update_fanout_list(tx_srs);
+ }
+ tx->st_func = mac_tx_get_func(tx->st_mode);
+
+ DTRACE_PROBE3(tx__srs___setup__return, mac_soft_ring_set_t *, tx_srs,
+ int, tx->st_mode, int, tx_srs->srs_oth_ring_count);
+
+ if (tx_ring_count != 0) {
+ kmem_free(tx_ring,
+ sizeof (mac_ring_handle_t) * tx_ring_count);
+ }
+}
+
+/*
+ * Walk through the list of mac clients for the MAC.
+ * For each active mac client, recompute the number of soft rings
+ * associated with it, but only if the current speed is different
+ * from the speed that was previously used for the soft ring
+ * computation. If the cable is disconnected while the NIC is started,
+ * we would get a notification with speed set to 0. We do not recompute
+ * in that case.
+ */
+void
+mac_fanout_recompute(mac_impl_t *mip)
+{
+ mac_client_impl_t *mcip;
+ uint64_t ifspeed;
+ mac_resource_props_t *mcip_mrp;
+
+ i_mac_perim_enter(mip);
+ ASSERT(!(mip->mi_state_flags & MIS_IS_VNIC));
+
+ if (mip->mi_linkstate != LINK_STATE_UP) {
+ i_mac_perim_exit(mip);
+ return;
+ }
+
+ for (mcip = mip->mi_clients_list; mcip != NULL;
+ mcip = mcip->mci_client_next) {
+ if (!MCIP_DATAPATH_SETUP(mcip))
+ continue;
+
+ ifspeed = mac_client_stat_get(mcip->mci_flent->fe_mcip,
+ MAC_STAT_IFSPEED);
+ if ((ifspeed != 0) &&
+ (ifspeed != mcip->mci_flent->fe_nic_speed)) {
+ mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
+ mac_fanout_setup(mcip, mcip->mci_flent,
+ mcip_mrp, mac_rx_deliver, mcip, NULL);
+ }
+ }
+ i_mac_perim_exit(mip);
+}
diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c
new file mode 100644
index 0000000000..f4c2113f61
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_flow.c
@@ -0,0 +1,2373 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/strsun.h>
+#include <sys/sdt.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/dls.h>
+#include <sys/dls_impl.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/ethernet.h>
+#include <sys/vlan.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet/sctp.h>
+
+/* global flow table, will be a per exclusive-zone table later */
+static mod_hash_t *flow_hash;
+static krwlock_t flow_tab_lock;
+
+static kmem_cache_t *flow_cache;
+static kmem_cache_t *flow_tab_cache;
+static flow_ops_t flow_l2_ops;
+
+typedef struct {
+ const char *fs_name;
+ uint_t fs_offset;
+} flow_stats_info_t;
+
+#define FS_OFF(f) (offsetof(flow_stats_t, f))
+static flow_stats_info_t flow_stats_list[] = {
+ {"rbytes", FS_OFF(fs_rbytes)},
+ {"ipackets", FS_OFF(fs_ipackets)},
+ {"ierrors", FS_OFF(fs_ierrors)},
+ {"obytes", FS_OFF(fs_obytes)},
+ {"opackets", FS_OFF(fs_opackets)},
+ {"oerrors", FS_OFF(fs_oerrors)}
+};
+#define FS_SIZE (sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
+
+/*
+ * Returns the flow table info for a given flow mask; a NULL return
+ * means the mask is not legal.
+ */
+static flow_tab_info_t *mac_flow_tab_info_get(flow_mask_t);
+
+static void
+flow_stat_init(kstat_named_t *knp)
+{
+ int i;
+
+ for (i = 0; i < FS_SIZE; i++, knp++) {
+ kstat_named_init(knp, flow_stats_list[i].fs_name,
+ KSTAT_DATA_UINT64);
+ }
+}
+
+static int
+flow_stat_update(kstat_t *ksp, int rw)
+{
+ flow_entry_t *fep = ksp->ks_private;
+ flow_stats_t *fsp = &fep->fe_flowstats;
+ kstat_named_t *knp = ksp->ks_data;
+ uint64_t *statp;
+ zoneid_t zid;
+ int i;
+
+ if (rw != KSTAT_READ)
+ return (EACCES);
+
+ zid = getzoneid();
+ if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) {
+ for (i = 0; i < FS_SIZE; i++, knp++)
+ knp->value.ui64 = 0;
+
+ return (0);
+ }
+
+ for (i = 0; i < FS_SIZE; i++, knp++) {
+ statp = (uint64_t *)
+ ((uchar_t *)fsp + flow_stats_list[i].fs_offset);
+
+ knp->value.ui64 = *statp;
+ }
+ return (0);
+}
+
+static void
+flow_stat_create(flow_entry_t *fep)
+{
+ kstat_t *ksp;
+ kstat_named_t *knp;
+ uint_t nstats = FS_SIZE;
+
+ ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow",
+ KSTAT_TYPE_NAMED, nstats, 0);
+ if (ksp == NULL)
+ return;
+
+ ksp->ks_update = flow_stat_update;
+ ksp->ks_private = fep;
+ fep->fe_ksp = ksp;
+
+ knp = (kstat_named_t *)ksp->ks_data;
+ flow_stat_init(knp);
+ kstat_install(ksp);
+}
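+
+/*
+ * Usage note: the kstats created above live under module "unix" with
+ * class "flow", named after the flow, so they should be observable from
+ * userland with, e.g., kstat(1M) ("kstat -m unix -c flow"). This is an
+ * illustration of the interface, not something this file depends on.
+ */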
+
+void
+flow_stat_destroy(flow_entry_t *fep)
+{
+ if (fep->fe_ksp != NULL) {
+ kstat_delete(fep->fe_ksp);
+ fep->fe_ksp = NULL;
+ }
+}
+
+/*
+ * Initialize the flow table
+ */
+void
+mac_flow_init()
+{
+ flow_cache = kmem_cache_create("flow_entry_cache",
+ sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ flow_tab_cache = kmem_cache_create("flow_tab_cache",
+ sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ flow_hash = mod_hash_create_extended("flow_hash",
+ 100, mod_hash_null_keydtor, mod_hash_null_valdtor,
+ mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
+ rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * Cleanup and release the flow table
+ */
+void
+mac_flow_fini()
+{
+ kmem_cache_destroy(flow_cache);
+ kmem_cache_destroy(flow_tab_cache);
+ mod_hash_destroy_hash(flow_hash);
+ rw_destroy(&flow_tab_lock);
+}
+
+/*
+ * mac_flow_create(): create a flow_entry_t.
+ */
+int
+mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
+ void *client_cookie, uint_t type, flow_entry_t **flentp)
+{
+ flow_entry_t *flent = *flentp;
+ int err = 0;
+
+ if (mrp != NULL) {
+ err = mac_validate_props(mrp);
+ if (err != 0)
+ return (err);
+ }
+
+ if (flent == NULL) {
+ flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
+ bzero(flent, sizeof (*flent));
+ mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Initialize the receiver function to a safe routine */
+ flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_index = -1;
+ }
+ (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME);
+
+ /* This is an initial flow, will be configured later */
+ if (fd == NULL) {
+ *flentp = flent;
+ return (0);
+ }
+
+ flent->fe_client_cookie = client_cookie;
+ flent->fe_type = type;
+
+ /*
+	 * As flow creation is only allowed in the global zone, this will
+	 * always set fe_zoneid to GLOBAL_ZONEID; dls_add_flow() will
+	 * later set the right value.
+ */
+ flent->fe_zoneid = getzoneid();
+
+ /* Save flow desc */
+ bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
+
+ if (mrp != NULL) {
+ /*
+ * We have already set fe_resource_props for a Link.
+ */
+ if (type & FLOW_USER) {
+ bcopy(mrp, &flent->fe_resource_props,
+ sizeof (mac_resource_props_t));
+ }
+ /*
+ * The effective resource list should reflect the priority
+ * that we set implicitly.
+ */
+ if (!(mrp->mrp_mask & MRP_PRIORITY))
+ mrp->mrp_mask |= MRP_PRIORITY;
+ if (type & FLOW_USER)
+ mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
+ else
+ mrp->mrp_priority = MPL_LINK_DEFAULT;
+ bcopy(mrp, &flent->fe_effective_props,
+ sizeof (mac_resource_props_t));
+ }
+ flow_stat_create(flent);
+
+ *flentp = flent;
+ return (0);
+}
+
+/*
+ * Validate flow entry and add it to a flow table.
+ */
+int
+mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
+{
+ flow_entry_t **headp, **p;
+ flow_ops_t *ops = &ft->ft_ops;
+ flow_mask_t mask;
+ uint32_t index;
+ int err;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+
+ /*
+ * Check for invalid bits in mask.
+ */
+ mask = flent->fe_flow_desc.fd_mask;
+ if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
+ return (EOPNOTSUPP);
+
+ /*
+ * Validate flent.
+ */
+ if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
+ DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
+ flow_entry_t *, flent, int, err);
+ return (err);
+ }
+
+ /*
+	 * The flent is valid. Now calculate its hash and insert it
+	 * into the hash table.
+ */
+ index = ops->fo_hash_fe(ft, flent);
+
+ /*
+ * We do not need a lock up until now because we were
+ * not accessing the flow table.
+ */
+ rw_enter(&ft->ft_lock, RW_WRITER);
+ headp = &ft->ft_table[index];
+
+ /*
+ * Check for duplicate flow.
+ */
+ for (p = headp; *p != NULL; p = &(*p)->fe_next) {
+ if ((*p)->fe_flow_desc.fd_mask !=
+ flent->fe_flow_desc.fd_mask)
+ continue;
+
+ if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
+ rw_exit(&ft->ft_lock);
+ DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
+ flow_entry_t *, flent, int, err);
+ return (EALREADY);
+ }
+ }
+
+ /*
+ * Insert flow to hash list.
+ */
+ err = ops->fo_insert_fe(ft, headp, flent);
+ if (err != 0) {
+ rw_exit(&ft->ft_lock);
+ DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
+ flow_entry_t *, flent, int, err);
+ return (err);
+ }
+
+ /*
+ * Save the hash index so it can be used by mac_flow_remove().
+ */
+ flent->fe_index = (int)index;
+
+ /*
+ * Save the flow tab back reference.
+ */
+ flent->fe_flow_tab = ft;
+ FLOW_MARK(flent, FE_FLOW_TAB);
+ ft->ft_flow_count++;
+ rw_exit(&ft->ft_lock);
+ return (0);
+}
+
+/*
+ * Remove a flow from a mac client's subflow table
+ */
+void
+mac_flow_rem_subflow(flow_entry_t *flent)
+{
+ flow_tab_t *ft = flent->fe_flow_tab;
+ mac_client_impl_t *mcip = ft->ft_mcip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+
+ mac_flow_remove(ft, flent, B_FALSE);
+ if (flent->fe_mcip == NULL) {
+ /*
+ * The interface is not yet plumbed and mac_client_flow_add
+ * was not done.
+ */
+ if (FLOW_TAB_EMPTY(ft)) {
+ mac_flow_tab_destroy(ft);
+ mcip->mci_subflow_tab = NULL;
+ }
+ return;
+ }
+ mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+ mac_link_flow_clean((mac_client_handle_t)mcip, flent);
+}
+
+/*
+ * Add a flow to a mac client's subflow table and instantiate the flow
+ * in the mac by creating the associated SRSs etc.
+ */
+int
+mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
+ boolean_t instantiate_flow)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ flow_tab_info_t *ftinfo;
+ flow_mask_t mask;
+ flow_tab_t *ft;
+ int err;
+ boolean_t ft_created = B_FALSE;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+ /*
+ * If the subflow table exists already just add the new subflow
+ * to the existing table, else we create a new subflow table below.
+ */
+ ft = mcip->mci_subflow_tab;
+ if (ft == NULL) {
+ mask = flent->fe_flow_desc.fd_mask;
+ /*
+ * Try to create a new table and then add the subflow to the
+		 * newly created subflow table.
+ */
+ if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL)
+ return (EOPNOTSUPP);
+
+ mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
+ mcip->mci_mip, &ft);
+ ft_created = B_TRUE;
+ }
+
+ err = mac_flow_add(ft, flent);
+ if (err != 0) {
+ if (ft_created)
+ mac_flow_tab_destroy(ft);
+ return (err);
+ }
+
+ if (instantiate_flow) {
+ /* Now activate the flow by creating its SRSs */
+ ASSERT(MCIP_DATAPATH_SETUP(mcip));
+ err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
+ if (err != 0) {
+ mac_flow_remove(ft, flent, B_FALSE);
+ if (ft_created)
+ mac_flow_tab_destroy(ft);
+ return (err);
+ }
+ } else {
+ FLOW_MARK(flent, FE_UF_NO_DATAPATH);
+ }
+ if (ft_created) {
+ ASSERT(mcip->mci_subflow_tab == NULL);
+ ft->ft_mcip = mcip;
+ mcip->mci_subflow_tab = ft;
+ if (instantiate_flow)
+ mac_client_update_classifier(mcip, B_TRUE);
+ }
+ return (0);
+}
+
+/*
+ * Remove flow entry from flow table.
+ */
+void
+mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
+{
+ flow_entry_t **fp;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+ if (!(flent->fe_flags & FE_FLOW_TAB))
+ return;
+
+ rw_enter(&ft->ft_lock, RW_WRITER);
+ /*
+ * If this is a permanent removal from the flow table, mark it
+ * CONDEMNED to prevent future references. If this is a temporary
+	 * removal from the table, say to update the flow descriptor, then
+	 * we don't mark it CONDEMNED.
+ */
+ if (!temp)
+ FLOW_MARK(flent, FE_CONDEMNED);
+ /*
+ * Locate the specified flent.
+ */
+ fp = &ft->ft_table[flent->fe_index];
+ while (*fp != flent)
+ fp = &(*fp)->fe_next;
+
+ /*
+ * The flent must exist. Otherwise it's a bug.
+ */
+ ASSERT(fp != NULL);
+ *fp = flent->fe_next;
+ flent->fe_next = NULL;
+
+ /*
+ * Reset fe_index to -1 so any attempt to call mac_flow_remove()
+ * on a flent that is supposed to be in the table (FE_FLOW_TAB)
+ * will panic.
+ */
+ flent->fe_index = -1;
+ FLOW_UNMARK(flent, FE_FLOW_TAB);
+ ft->ft_flow_count--;
+ rw_exit(&ft->ft_lock);
+}
+
+/*
+ * This is the flow lookup routine used by the mac sw classifier engine.
+ */
+int
+mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
+{
+ flow_state_t s;
+ flow_entry_t *flent;
+ flow_ops_t *ops = &ft->ft_ops;
+ boolean_t retried = B_FALSE;
+ int i, err;
+
+ s.fs_flags = flags;
+ s.fs_mp = mp;
+retry:
+
+ /*
+ * Walk the list of predeclared accept functions.
+ * Each of these would accumulate enough state to allow the next
+ * accept routine to make progress.
+ */
+ for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
+ if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
+ /*
+ * ENOBUFS indicates that the mp could be too short
+ * and may need a pullup.
+ */
+ if (err != ENOBUFS || retried)
+ return (err);
+
+ /*
+ * Don't modify the mblk if there are references to it.
+ * Also, there is no point pulling up if b_cont is NULL.
+ */
+ if (DB_REF(mp) > 1 || mp->b_cont == NULL ||
+ pullupmsg(mp, -1) == 0)
+ return (EINVAL);
+
+ retried = B_TRUE;
+ DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
+ flow_state_t *, &s);
+ goto retry;
+ }
+ }
+
+ /*
+ * The packet is considered sane. We may now attempt to
+ * find the corresponding flent.
+ */
+ rw_enter(&ft->ft_lock, RW_READER);
+ flent = ft->ft_table[ops->fo_hash(ft, &s)];
+ for (; flent != NULL; flent = flent->fe_next) {
+ if (flent->fe_match(ft, flent, &s)) {
+ FLOW_TRY_REFHOLD(flent, err);
+ if (err != 0)
+ continue;
+ *flentp = flent;
+ rw_exit(&ft->ft_lock);
+ return (0);
+ }
+ }
+ rw_exit(&ft->ft_lock);
+ return (ENOENT);
+}
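+
+/*
+ * Sketch of the calling convention for mac_flow_lookup(): a successful
+ * lookup returns a flent held via FLOW_TRY_REFHOLD(), which the data
+ * thread releases once the packet has been handed to the flow's callback.
+ * The function below is illustrative only; the real callers are the SRS
+ * receive routines, and FLOW_INBOUND, FLOW_REFRELE and the fe_cb_arg
+ * fields are assumed here rather than shown in this excerpt.
+ */
+static int
+mac_flow_lookup_usage_sketch(flow_tab_t *ft, mblk_t *mp)
+{
+	flow_entry_t *flent;
+	int err;
+
+	if ((err = mac_flow_lookup(ft, mp, FLOW_INBOUND, &flent)) != 0)
+		return (err);	/* e.g. ENOENT: fall back to the link SRS */
+
+	/* Hand the packet to the flow's receive callback */
+	flent->fe_cb_fn(flent->fe_cb_arg1, flent->fe_cb_arg2, mp, B_FALSE);
+	FLOW_REFRELE(flent);
+	return (0);
+}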
+
+/*
+ * Walk flow table.
+ * The caller is assumed to have proper perimeter protection.
+ */
+int
+mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
+ void *arg)
+{
+ int err, i, cnt = 0;
+ flow_entry_t *flent;
+
+ if (ft == NULL)
+ return (0);
+
+ for (i = 0; i < ft->ft_size; i++) {
+ for (flent = ft->ft_table[i]; flent != NULL;
+ flent = flent->fe_next) {
+ cnt++;
+ err = (*fn)(flent, arg);
+ if (err != 0)
+ return (err);
+ }
+ }
+ VERIFY(cnt == ft->ft_flow_count);
+ return (0);
+}
+
+/*
+ * Same as the above except that the flow table's ft_lock is taken for
+ * protection here.
+ */
+int
+mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
+ void *arg)
+{
+ int err;
+
+ if (ft == NULL)
+ return (0);
+
+ rw_enter(&ft->ft_lock, RW_WRITER);
+ err = mac_flow_walk_nolock(ft, fn, arg);
+ rw_exit(&ft->ft_lock);
+ return (err);
+}
+
+static boolean_t mac_flow_clean(flow_entry_t *);
+
+/*
+ * Destroy a flow entry. Called when the last reference on a flow is released.
+ */
+void
+mac_flow_destroy(flow_entry_t *flent)
+{
+ ASSERT(flent->fe_refcnt == 0);
+
+ if ((flent->fe_type & FLOW_USER) != 0) {
+ ASSERT(mac_flow_clean(flent));
+ } else {
+ mac_flow_cleanup(flent);
+ }
+
+ mutex_destroy(&flent->fe_lock);
+ cv_destroy(&flent->fe_cv);
+ flow_stat_destroy(flent);
+ kmem_cache_free(flow_cache, flent);
+}
+
+/*
+ * XXX eric
+ * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
+ * mac_link_flow_modify() should really be moved/reworked into the
+ * two functions below. This would consolidate all the mac property
+ * checking in one place. I'm leaving this alone for now since it's
+ * out of scope of the new flows work.
+ */
+/* ARGSUSED */
+uint32_t
+mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
+{
+ uint32_t changed_mask = 0;
+ mac_resource_props_t *fmrp = &flent->fe_effective_props;
+ int i;
+
+ if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
+ (fmrp->mrp_maxbw != mrp->mrp_maxbw)) {
+ changed_mask |= MRP_MAXBW;
+ fmrp->mrp_maxbw = mrp->mrp_maxbw;
+ if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
+ fmrp->mrp_mask &= ~MRP_MAXBW;
+ } else {
+ fmrp->mrp_mask |= MRP_MAXBW;
+ }
+ }
+
+ if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
+ if (fmrp->mrp_priority != mrp->mrp_priority)
+ changed_mask |= MRP_PRIORITY;
+ if (mrp->mrp_priority == MPL_RESET) {
+ fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
+ fmrp->mrp_mask &= ~MRP_PRIORITY;
+ } else {
+ fmrp->mrp_priority = mrp->mrp_priority;
+ fmrp->mrp_mask |= MRP_PRIORITY;
+ }
+ }
+
+ /* modify fanout */
+ if ((mrp->mrp_mask & MRP_CPUS) != 0) {
+ if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
+ (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
+ for (i = 0; i < mrp->mrp_ncpus; i++) {
+ if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
+ break;
+ }
+ if (i == mrp->mrp_ncpus) {
+ /*
+ * The new set of cpus passed is exactly
+ * the same as the existing set.
+ */
+ return (changed_mask);
+ }
+ }
+ changed_mask |= MRP_CPUS;
+ MAC_COPY_CPUS(mrp, fmrp);
+ }
+ return (changed_mask);
+}
+
+void
+mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
+{
+ uint32_t changed_mask;
+ mac_client_impl_t *mcip = flent->fe_mcip;
+ mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
+
+ ASSERT(flent != NULL);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+
+ rw_enter(&ft->ft_lock, RW_WRITER);
+
+ /* Update the cached values inside the subflow entry */
+ changed_mask = mac_flow_modify_props(flent, mrp);
+ rw_exit(&ft->ft_lock);
+ /*
+ * Push the changed parameters to the scheduling code in the
+ * SRS's, to take effect right away.
+ */
+ if (changed_mask & MRP_MAXBW) {
+ mac_srs_update_bwlimit(flent, mrp);
+ /*
+		 * If the bandwidth is changed, we may have to change
+		 * the number of soft rings to be used for fanout.
+		 * Call mac_fanout_setup() if MAC_BIND_CPU
+		 * is not set and there is no user supplied cpu
+		 * info. This applies only to links at this time.
+ */
+ if (!(flent->fe_type & FLOW_USER) &&
+ !(changed_mask & MRP_CPUS) &&
+ !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
+ mac_fanout_setup(mcip, flent, mcip_mrp,
+ mac_rx_deliver, mcip, NULL);
+ }
+ }
+ if (mrp->mrp_mask & MRP_PRIORITY)
+ mac_flow_update_priority(mcip, flent);
+
+ if (changed_mask & MRP_CPUS)
+ mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL);
+}
+
+/*
+ * This function waits for a certain condition to be met and is generally
+ * used before a destructive or quiescing operation.
+ */
+void
+mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
+{
+ mutex_enter(&flent->fe_lock);
+ flent->fe_flags |= FE_WAITER;
+
+ switch (event) {
+ case FLOW_DRIVER_UPCALL:
+ /*
+ * We want to make sure the driver upcalls have finished before
+ * we signal the Rx SRS worker to quit.
+ */
+ while (flent->fe_refcnt != 1)
+ cv_wait(&flent->fe_cv, &flent->fe_lock);
+ break;
+
+ case FLOW_USER_REF:
+ /*
+ * Wait for the fe_user_refcnt to drop to 0. The flow has
+ * been removed from the global flow hash.
+ */
+ ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
+ while (flent->fe_user_refcnt != 0)
+ cv_wait(&flent->fe_cv, &flent->fe_lock);
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+ flent->fe_flags &= ~FE_WAITER;
+ mutex_exit(&flent->fe_lock);
+}
+
+static boolean_t
+mac_flow_clean(flow_entry_t *flent)
+{
+ ASSERT(flent->fe_next == NULL);
+ ASSERT(flent->fe_tx_srs == NULL);
+ ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
+ ASSERT(flent->fe_mbg == NULL);
+
+ return (B_TRUE);
+}
+
+void
+mac_flow_cleanup(flow_entry_t *flent)
+{
+ if ((flent->fe_type & FLOW_USER) == 0) {
+ ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
+ (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
+ ASSERT(flent->fe_refcnt == 0);
+ } else {
+ ASSERT(flent->fe_refcnt == 1);
+ }
+
+ if (flent->fe_mbg != NULL) {
+ ASSERT(flent->fe_tx_srs == NULL);
+ /* This is a multicast or broadcast flow entry */
+ mac_bcast_grp_free(flent->fe_mbg);
+ flent->fe_mbg = NULL;
+ }
+
+ if (flent->fe_tx_srs != NULL) {
+ ASSERT(flent->fe_mbg == NULL);
+ mac_srs_free(flent->fe_tx_srs);
+ flent->fe_tx_srs = NULL;
+ }
+
+ /*
+	 * In the normal case fe_rx_srs_cnt is 1. However, in the error
+	 * case when mac_unicast_add fails, we may not have set up any
+	 * SRS, in which case fe_rx_srs_cnt will be zero.
+ */
+ if (flent->fe_rx_srs_cnt != 0) {
+ ASSERT(flent->fe_rx_srs_cnt == 1);
+ mac_srs_free(flent->fe_rx_srs[0]);
+ flent->fe_rx_srs[0] = NULL;
+ flent->fe_rx_srs_cnt = 0;
+ }
+ ASSERT(flent->fe_rx_srs[0] == NULL);
+}
+
+void
+mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
+{
+ /*
+ * Grab the fe_lock to see a self-consistent fe_flow_desc.
+ * Updates to the fe_flow_desc happen under the fe_lock
+	 * after removing the flent from the flow table.
+ */
+ mutex_enter(&flent->fe_lock);
+ bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
+ mutex_exit(&flent->fe_lock);
+}
+
+/*
+ * Update a field of a flow entry. The mac perimeter ensures that
+ * this is the only thread doing a modify operation on this mac end point.
+ * So the flow table can't change or disappear. The ft_lock protects access
+ * to the flow entry, and holding the lock ensures that there isn't any thread
+ * accessing the flow entry or attempting a flow table lookup. However
+ * data threads that are using the flow entry based on the old descriptor
+ * will continue to use the flow entry. If strong coherence is required
+ * then the flow will have to be quiesced before the descriptor can be
+ * changed.
+ */
+void
+mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
+{
+ flow_tab_t *ft = flent->fe_flow_tab;
+ flow_desc_t old_desc;
+ int err;
+
+	if (ft == NULL) {
+		/*
+		 * The flow hasn't yet been inserted into the table,
+		 * so only the caller knows about this flow; however, for
+		 * uniformity we grab the fe_lock here. There is no table
+		 * to update, so we are done.
+		 */
+		mutex_enter(&flent->fe_lock);
+		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
+		mutex_exit(&flent->fe_lock);
+		return;
+	}
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+
+ /*
+	 * We need to remove the flow entry from the table and reinsert
+	 * it, into a potentially different hash line. The hash depends
+	 * on the new descriptor fields. However, access to fe_flow_desc
+	 * itself is always under the fe_lock. This helps log and stat
+	 * functions see a self-consistent fe_flow_desc.
+ */
+ mac_flow_remove(ft, flent, B_TRUE);
+ old_desc = flent->fe_flow_desc;
+
+ mutex_enter(&flent->fe_lock);
+ bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
+ mutex_exit(&flent->fe_lock);
+
+ if (mac_flow_add(ft, flent) != 0) {
+ /*
+		 * The add failed, say due to an invalid flow descriptor.
+		 * Undo the update.
+ */
+ flent->fe_flow_desc = old_desc;
+ err = mac_flow_add(ft, flent);
+ ASSERT(err == 0);
+ }
+}
+
+void
+mac_flow_set_name(flow_entry_t *flent, const char *name)
+{
+ flow_tab_t *ft = flent->fe_flow_tab;
+
+ if (ft == NULL) {
+ /*
+ * The flow hasn't yet been inserted into the table,
+ * so only the caller knows about this flow
+ */
+ (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME);
+ } else {
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+ }
+
+ mutex_enter(&flent->fe_lock);
+ (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME);
+ mutex_exit(&flent->fe_lock);
+}
+
+/*
+ * Return the client-private cookie that was associated with
+ * the flow when it was created.
+ */
+void *
+mac_flow_get_client_cookie(flow_entry_t *flent)
+{
+ return (flent->fe_client_cookie);
+}
+
+/*
+ * Forward declarations.
+ */
+static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *);
+static int flow_l2_accept(flow_tab_t *, flow_state_t *);
+static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *);
+static int flow_ether_accept(flow_tab_t *, flow_state_t *);
+
+/*
+ * Create flow table.
+ */
+void
+mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
+ mac_impl_t *mip, flow_tab_t **ftp)
+{
+ flow_tab_t *ft;
+ flow_ops_t *new_ops;
+
+ ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
+ bzero(ft, sizeof (*ft));
+
+ ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
+
+ /*
+ * We make a copy of the ops vector instead of just pointing to it
+ * because we might want to customize the ops vector on a per table
+ * basis (e.g. for optimization).
+ */
+ new_ops = &ft->ft_ops;
+ bcopy(ops, new_ops, sizeof (*ops));
+ ft->ft_mask = mask;
+ ft->ft_size = size;
+ ft->ft_mip = mip;
+
+ /*
+ * Optimization for DL_ETHER media.
+ */
+ if (mip->mi_info.mi_nativemedia == DL_ETHER) {
+ if (new_ops->fo_hash == flow_l2_hash)
+ new_ops->fo_hash = flow_ether_hash;
+
+ if (new_ops->fo_accept[0] == flow_l2_accept)
+ new_ops->fo_accept[0] = flow_ether_accept;
+
+ }
+ *ftp = ft;
+}
+
+void
+mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
+{
+ mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
+ 1024, mip, ftp);
+}
+
+/*
+ * Destroy flow table.
+ */
+void
+mac_flow_tab_destroy(flow_tab_t *ft)
+{
+ if (ft == NULL)
+ return;
+
+ ASSERT(ft->ft_flow_count == 0);
+ kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
+ bzero(ft, sizeof (*ft));
+ kmem_cache_free(flow_tab_cache, ft);
+}
+
+/*
+ * Add a new flow entry to the global flow hash table
+ */
+int
+mac_flow_hash_add(flow_entry_t *flent)
+{
+ int err;
+
+ rw_enter(&flow_tab_lock, RW_WRITER);
+ err = mod_hash_insert(flow_hash,
+ (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
+ if (err != 0) {
+ rw_exit(&flow_tab_lock);
+ return (EEXIST);
+ }
+ /* Mark as inserted into the global flow hash table */
+ FLOW_MARK(flent, FE_G_FLOW_HASH);
+ rw_exit(&flow_tab_lock);
+ return (err);
+}
+
+/*
+ * Remove a flow entry from the global flow hash table
+ */
+void
+mac_flow_hash_remove(flow_entry_t *flent)
+{
+ mod_hash_val_t val;
+
+ rw_enter(&flow_tab_lock, RW_WRITER);
+ VERIFY(mod_hash_remove(flow_hash,
+ (mod_hash_key_t)flent->fe_flow_name, &val) == 0);
+
+ /* Clear the mark that says inserted into the global flow hash table */
+ FLOW_UNMARK(flent, FE_G_FLOW_HASH);
+ rw_exit(&flow_tab_lock);
+}
+
+/*
+ * Retrieve a flow entry from the global flow hash table.
+ */
+int
+mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
+{
+ int err;
+ flow_entry_t *flent;
+
+ rw_enter(&flow_tab_lock, RW_READER);
+ err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
+ (mod_hash_val_t *)&flent);
+ if (err != 0) {
+ rw_exit(&flow_tab_lock);
+ return (ENOENT);
+ }
+ ASSERT(flent != NULL);
+ FLOW_USER_REFHOLD(flent);
+ rw_exit(&flow_tab_lock);
+
+ *flentp = flent;
+ return (0);
+}
+
+/*
+ * Initialize or release mac client flows by walking the subflow table.
+ * These are typically invoked during plumb/unplumb of links.
+ */
+
+static int
+mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
+{
+ mac_client_impl_t *mcip = arg;
+
+ if (mac_link_flow_init(arg, flent) != 0) {
+ cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
+ flent->fe_flow_name, mcip->mci_name);
+ } else {
+ FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
+ }
+ return (0);
+}
+
+void
+mac_link_init_flows(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+ (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+ mac_link_init_flows_cb, mcip);
+ /*
+	 * If the mac client had subflow(s) configured before the plumb,
+	 * change the receive function to mac_rx_srs_subflow_process and,
+	 * in the case of hardware classification, disable polling.
+ */
+ mac_client_update_classifier(mcip, B_TRUE);
+}
+
+boolean_t
+mac_link_has_flows(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+ if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static int
+mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
+{
+ FLOW_MARK(flent, FE_UF_NO_DATAPATH);
+ mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+ mac_link_flow_clean(arg, flent);
+ return (0);
+}
+
+void
+mac_link_release_flows(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+ /*
+ * Change the mci_flent callback back to mac_rx_srs_process()
+ * because flows are about to be deactivated.
+ */
+ mac_client_update_classifier(mcip, B_FALSE);
+ (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+ mac_link_release_flows_cb, mcip);
+}
+
+void
+mac_rename_flow(flow_entry_t *fep, const char *new_name)
+{
+ mac_flow_set_name(fep, new_name);
+ if (fep->fe_ksp != NULL) {
+ flow_stat_destroy(fep);
+ flow_stat_create(fep);
+ }
+}
+
+/*
+ * mac_link_flow_init()
+ * Internal flow interface used for allocating SRSs and related
+ * data structures. Not meant to be used by mac clients.
+ */
+int
+mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ int err;
+
+ ASSERT(mch != NULL);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
+ return (err);
+
+ sub_flow->fe_mcip = mcip;
+
+ return (0);
+}
+
+/*
+ * mac_link_flow_add()
+ * Used by flowadm(1M) or kernel mac clients for creating flows.
+ */
+int
+mac_link_flow_add(datalink_id_t linkid, char *flow_name,
+ flow_desc_t *flow_desc, mac_resource_props_t *mrp)
+{
+ flow_entry_t *flent = NULL;
+ int err;
+ dls_dl_handle_t dlh;
+ dls_link_t *dlp;
+ boolean_t link_held = B_FALSE;
+ boolean_t hash_added = B_FALSE;
+ mac_perim_handle_t mph;
+
+ err = mac_flow_lookup_byname(flow_name, &flent);
+ if (err == 0) {
+ FLOW_USER_REFRELE(flent);
+ return (EEXIST);
+ }
+
+ /*
+ * First create a flow entry given the description provided
+ * by the caller.
+ */
+ err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
+ FLOW_USER | FLOW_OTHER, &flent);
+
+ if (err != 0)
+ return (err);
+
+ /*
+	 * We now have a local variable referencing this flow, so we need
+	 * to hold it. We'll release the hold before returning.
+	 * Every failure path between here and the return undoes any
+	 * action that may have internally held the flow, so the final
+	 * REFRELE ensures a clean freeing of resources.
+ */
+ FLOW_REFHOLD(flent);
+
+ flent->fe_link_id = linkid;
+ FLOW_MARK(flent, FE_INCIPIENT);
+
+ err = mac_perim_enter_by_linkid(linkid, &mph);
+ if (err != 0) {
+ FLOW_FINAL_REFRELE(flent);
+ return (err);
+ }
+
+ /*
+ * dls will eventually be merged with mac so it's ok
+ * to call dls' internal functions.
+ */
+ err = dls_devnet_hold_link(linkid, &dlh, &dlp);
+ if (err != 0)
+ goto bail;
+
+ link_held = B_TRUE;
+
+ /*
+	 * Add the flow to the global flow table. This table should
+	 * eventually be per exclusive zone, so that each zone can have
+	 * its own flow namespace; RFE 6625651 tracks that work.
+	 */
+ if ((err = mac_flow_hash_add(flent)) != 0)
+ goto bail;
+
+ hash_added = B_TRUE;
+
+ /*
+	 * Do not allow flows to be configured on an anchor VNIC.
+ */
+ if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
+ err = ENOTSUP;
+ goto bail;
+ }
+
+ /*
+	 * Save the zoneid of the underlying link in the flow entry;
+	 * this prevents a non-global zone from reading the global
+	 * zone's statistics.
+ */
+ flent->fe_zoneid = dlp->dl_zid;
+
+ /*
+ * Add the subflow to the subflow table. Also instantiate the flow
+ * in the mac if there is an active DLS user. The dl_mah is set when
+ * dls_active_set() is called, typically during interface plumb.
+ */
+ err = mac_flow_add_subflow(dlp->dl_mch, flent, dlp->dl_mah != NULL);
+ if (err != 0)
+ goto bail;
+
+ FLOW_UNMARK(flent, FE_INCIPIENT);
+ dls_devnet_rele_link(dlh, dlp);
+ mac_perim_exit(mph);
+ return (0);
+
+bail:
+ if (hash_added)
+ mac_flow_hash_remove(flent);
+
+ if (link_held)
+ dls_devnet_rele_link(dlh, dlp);
+
+ /*
+ * Wait for any transient global flow hash refs to clear
+ * and then release the creation reference on the flow
+ */
+ mac_flow_wait(flent, FLOW_USER_REF);
+ FLOW_FINAL_REFRELE(flent);
+ mac_perim_exit(mph);
+ return (err);
+}
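+
+/*
+ * An illustrative sketch (not part of this file) of a kernel consumer
+ * creating a local-port TCP flow with a bandwidth cap via the interface
+ * above.  The descriptor mask and fields match what
+ * flow_transport_accept_fe() below validates; the linkid, flow name and
+ * the mrp_mask/mrp_maxbw resource fields are assumptions for the
+ * example, and error handling is elided.
+ *
+ *	flow_desc_t		fd;
+ *	mac_resource_props_t	mrp;
+ *	int			err;
+ *
+ *	bzero(&fd, sizeof (fd));
+ *	fd.fd_mask = FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL;
+ *	fd.fd_protocol = IPPROTO_TCP;
+ *	fd.fd_local_port = htons(80);		(kept in network order)
+ *
+ *	bzero(&mrp, sizeof (mrp));
+ *	mrp.mrp_mask = MRP_MAXBW;
+ *	mrp.mrp_maxbw = 100000000;		(assuming bits per second)
+ *
+ *	err = mac_link_flow_add(linkid, "http-flow", &fd, &mrp);
+ */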
+
+/*
+ * mac_link_flow_clean()
+ * Internal flow interface used for freeing SRSs and related
+ * data structures. Not meant to be used by mac clients.
+ */
+void
+mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ boolean_t last_subflow;
+
+ ASSERT(mch != NULL);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /*
+	 * This subflow entry may not have been fully initialized by
+	 * mac_link_flow_init(). If so, simply return.
+ */
+ if (sub_flow->fe_mcip == NULL)
+ return;
+
+ last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
+ /*
+ * Tear down the data path
+ */
+ mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
+ sub_flow->fe_mcip = NULL;
+
+ /*
+ * Delete the SRSs associated with this subflow. If this is being
+ * driven by flowadm(1M) then the subflow will be deleted by
+ * dls_rem_flow. However if this is a result of the interface being
+ * unplumbed then the subflow itself won't be deleted.
+ */
+ mac_flow_cleanup(sub_flow);
+
+ /*
+	 * If all the subflows are gone, re-enable what we disabled when
+	 * the first subflow was added, such as polling.
+ */
+ if (last_subflow) {
+ /*
+ * The subflow table itself is not protected by any locks or
+ * refcnts. Hence quiesce the client upfront before clearing
+ * mci_subflow_tab.
+ */
+ mac_client_quiesce(mcip);
+ mac_client_update_classifier(mcip, B_FALSE);
+ mac_flow_tab_destroy(mcip->mci_subflow_tab);
+ mcip->mci_subflow_tab = NULL;
+ mac_client_restart(mcip);
+ }
+}
+
+/*
+ * mac_link_flow_remove()
+ *	Used by flowadm(1M) or kernel mac clients for removing flows.
+ */
+int
+mac_link_flow_remove(char *flow_name)
+{
+ flow_entry_t *flent;
+ mac_perim_handle_t mph;
+ int err;
+ datalink_id_t linkid;
+
+ err = mac_flow_lookup_byname(flow_name, &flent);
+ if (err != 0)
+ return (err);
+
+ linkid = flent->fe_link_id;
+ FLOW_USER_REFRELE(flent);
+
+ /*
+ * The perim must be acquired before acquiring any other references
+	 * to maintain the lock and perimeter hierarchy; note the
+	 * FLOW_USER_REFRELE above.
+ */
+ err = mac_perim_enter_by_linkid(linkid, &mph);
+ if (err != 0)
+ return (err);
+
+ /*
+	 * Look up the flow a second time, because a concurrent thread
+	 * may have removed it while we were waiting to enter the
+	 * link's perimeter.
+ */
+ err = mac_flow_lookup_byname(flow_name, &flent);
+ if (err != 0) {
+ mac_perim_exit(mph);
+ return (err);
+ }
+ FLOW_USER_REFRELE(flent);
+
+ /*
+ * Remove the flow from the subflow table and deactivate the flow
+	 * by quiescing and removing its SRSs.
+ */
+ mac_flow_rem_subflow(flent);
+
+ /*
+ * Finally, remove the flow from the global table.
+ */
+ mac_flow_hash_remove(flent);
+
+ /*
+ * Wait for any transient global flow hash refs to clear
+ * and then release the creation reference on the flow
+ */
+ mac_flow_wait(flent, FLOW_USER_REF);
+ FLOW_FINAL_REFRELE(flent);
+
+ mac_perim_exit(mph);
+
+ return (0);
+}
+
+/*
+ * mac_link_flow_modify()
+ * Modifies the properties of a flow identified by its name.
+ */
+int
+mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
+{
+ flow_entry_t *flent;
+ mac_client_impl_t *mcip;
+ int err = 0;
+ mac_perim_handle_t mph;
+ datalink_id_t linkid;
+ flow_tab_t *flow_tab;
+
+ err = mac_validate_props(mrp);
+ if (err != 0)
+ return (err);
+
+ err = mac_flow_lookup_byname(flow_name, &flent);
+ if (err != 0)
+ return (err);
+
+ linkid = flent->fe_link_id;
+ FLOW_USER_REFRELE(flent);
+
+ /*
+ * The perim must be acquired before acquiring any other references
+	 * to maintain the lock and perimeter hierarchy; note the
+	 * FLOW_USER_REFRELE above.
+ */
+ err = mac_perim_enter_by_linkid(linkid, &mph);
+ if (err != 0)
+ return (err);
+
+ /*
+	 * Look up the flow a second time, because a concurrent thread
+	 * may have removed it while we were waiting to enter the
+	 * link's perimeter.
+ */
+ err = mac_flow_lookup_byname(flow_name, &flent);
+ if (err != 0) {
+ mac_perim_exit(mph);
+ return (err);
+ }
+ FLOW_USER_REFRELE(flent);
+
+ /*
+ * If this flow is attached to a MAC client, then pass the request
+ * along to the client.
+ * Otherwise, just update the cached values.
+ */
+ mcip = flent->fe_mcip;
+ mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
+ if (mcip != NULL) {
+ if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
+ err = ENOENT;
+ } else {
+ mac_flow_modify(flow_tab, flent, mrp);
+ }
+ } else {
+ (void) mac_flow_modify_props(flent, mrp);
+ }
+
+done:
+ mac_perim_exit(mph);
+ return (err);
+}
+
+/*
+ * State structure and misc functions used by mac_link_flow_walk().
+ */
+typedef struct {
+ int (*ws_func)(mac_flowinfo_t *, void *);
+ void *ws_arg;
+} flow_walk_state_t;
+
+static void
+mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
+{
+ (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, MAXNAMELEN);
+ finfop->fi_link_id = flent->fe_link_id;
+ finfop->fi_flow_desc = flent->fe_flow_desc;
+ finfop->fi_resource_props = flent->fe_resource_props;
+}
+
+static int
+mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
+{
+ flow_walk_state_t *statep = arg;
+ mac_flowinfo_t finfo;
+
+ mac_link_flowinfo_copy(&finfo, flent);
+ return (statep->ws_func(&finfo, statep->ws_arg));
+}
+
+/*
+ * mac_link_flow_walk()
+ * Invokes callback 'func' for all flows belonging to the specified link.
+ */
+int
+mac_link_flow_walk(datalink_id_t linkid,
+ int (*func)(mac_flowinfo_t *, void *), void *arg)
+{
+ mac_client_impl_t *mcip;
+ mac_perim_handle_t mph;
+ flow_walk_state_t state;
+ dls_dl_handle_t dlh;
+ dls_link_t *dlp;
+ int err;
+
+ err = mac_perim_enter_by_linkid(linkid, &mph);
+ if (err != 0)
+ return (err);
+
+ err = dls_devnet_hold_link(linkid, &dlh, &dlp);
+ if (err != 0) {
+ mac_perim_exit(mph);
+ return (err);
+ }
+
+ mcip = (mac_client_impl_t *)dlp->dl_mch;
+ state.ws_func = func;
+ state.ws_arg = arg;
+
+ err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
+ mac_link_flow_walk_cb, &state);
+
+ dls_devnet_rele_link(dlh, dlp);
+ mac_perim_exit(mph);
+ return (err);
+}
+
+/*
+ * mac_link_flow_info()
+ * Retrieves information about a specific flow.
+ */
+int
+mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
+{
+ flow_entry_t *flent;
+ int err;
+
+ err = mac_flow_lookup_byname(flow_name, &flent);
+ if (err != 0)
+ return (err);
+
+ mac_link_flowinfo_copy(finfo, flent);
+ FLOW_USER_REFRELE(flent);
+ return (0);
+}
+
+#define HASH_MAC_VID(a, v, s) \
+ ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))
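+
+/*
+ * A hand-worked example of HASH_MAC_VID(): for dest MAC
+ * 00:11:22:33:44:55, VID 5 and a 1024-bucket table, the index is
+ * ((0x33 + 0x44 + 0x55) ^ 5) % 1024 = (0xcc ^ 0x5) % 1024 = 201.
+ * Only the last three octets feed the hash; the leading OUI octets
+ * carry little entropy.
+ */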
+
+#define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))
+
+/* ARGSUSED */
+static boolean_t
+flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+ flow_l2info_t *l2 = &s->fs_l2info;
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ return (l2->l2_vid == fd->fd_vid &&
+ bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
+}
+
+/*
+ * Layer 2 hash function.
+ * Must be paired with flow_l2_accept() within a set of flow_ops
+ * because it assumes the dest address is already extracted.
+ */
+static uint32_t
+flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
+{
+ flow_l2info_t *l2 = &s->fs_l2info;
+
+ return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
+}
+
+/*
+ * This is the generic layer 2 accept function.
+ * It makes use of mac_header_info() to extract the header length,
+ * SAP, VLAN ID and destination address.
+ */
+static int
+flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
+{
+ boolean_t is_ether;
+ flow_l2info_t *l2 = &s->fs_l2info;
+ mac_header_info_t mhi;
+ int err;
+
+ is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
+ if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
+ s->fs_mp, &mhi)) != 0) {
+ if (err == EINVAL)
+ err = ENOBUFS;
+
+ return (err);
+ }
+
+ l2->l2_start = s->fs_mp->b_rptr;
+ l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
+
+ if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
+ ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
+ struct ether_vlan_header *evhp =
+ (struct ether_vlan_header *)l2->l2_start;
+
+ if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
+ return (ENOBUFS);
+
+ l2->l2_sap = ntohs(evhp->ether_type);
+ l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
+ l2->l2_hdrsize = sizeof (*evhp);
+ } else {
+ l2->l2_sap = mhi.mhi_bindsap;
+ l2->l2_vid = 0;
+ l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
+ }
+ return (0);
+}
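+
+/*
+ * A hand-worked example of the VLAN parse above: for a tagged frame
+ * whose TCI is 0xa005, the priority is 5 (top three bits), CFI is 0,
+ * and VLAN_ID(0xa005) = 0xa005 & 0xfff = 5.  l2_sap then comes from
+ * the inner ether_type, and l2_hdrsize is the 18-byte
+ * sizeof (struct ether_vlan_header) rather than the untagged 14.
+ */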
+
+/*
+ * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
+ * accept(). The notable difference is that the dest address is now
+ * extracted by hash() rather than by accept(). This saves a few memory
+ * references for flow tables that do not care about MAC addresses.
+ */
+static uint32_t
+flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
+{
+ flow_l2info_t *l2 = &s->fs_l2info;
+ struct ether_vlan_header *evhp;
+
+ evhp = (struct ether_vlan_header *)l2->l2_start;
+ l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
+ return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
+}
+
+/* ARGSUSED */
+static int
+flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
+{
+ flow_l2info_t *l2 = &s->fs_l2info;
+ struct ether_vlan_header *evhp;
+ uint16_t sap;
+
+ evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
+ l2->l2_start = (uchar_t *)evhp;
+
+ if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
+ return (ENOBUFS);
+
+ if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
+ ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
+ if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
+ return (ENOBUFS);
+
+ l2->l2_sap = ntohs(evhp->ether_type);
+ l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
+ l2->l2_hdrsize = sizeof (struct ether_vlan_header);
+ } else {
+ l2->l2_sap = sap;
+ l2->l2_vid = 0;
+ l2->l2_hdrsize = sizeof (struct ether_header);
+ }
+ return (0);
+}
+
+/*
+ * Validates a layer 2 flow entry.
+ */
+static int
+flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+ int i;
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ /*
+ * Dest address is mandatory.
+ */
+ if ((fd->fd_mask & FLOW_LINK_DST) == 0)
+ return (EINVAL);
+
+ for (i = 0; i < fd->fd_mac_len; i++) {
+ if (fd->fd_dst_mac[i] != 0)
+ break;
+ }
+ if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL)
+ return (EINVAL);
+
+ if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
+ /*
+ * VLAN flows are only supported over ethernet macs.
+ */
+ if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
+ return (EINVAL);
+
+ if (fd->fd_vid == 0)
+ return (EINVAL);
+	}
+ flent->fe_match = flow_l2_match;
+ return (0);
+}
+
+/*
+ * Calculates hash index of flow entry.
+ */
+static uint32_t
+flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
+ return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
+}
+
+/*
+ * This is used for duplicate flow checking.
+ */
+/* ARGSUSED */
+static boolean_t
+flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
+{
+ flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
+
+ ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
+ return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
+ fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
+}
+
+/*
+ * Generic flow entry insertion function.
+ * Used by flow tables that do not have ordering requirements.
+ */
+/* ARGSUSED */
+static int
+flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
+ flow_entry_t *flent)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+
+ if (*headp != NULL) {
+ ASSERT(flent->fe_next == NULL);
+ flent->fe_next = *headp;
+ }
+ *headp = flent;
+ return (0);
+}
+
+/*
+ * IP version independent DSField matching function.
+ */
+/* ARGSUSED */
+static boolean_t
+flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ switch (l3info->l3_version) {
+ case IPV4_VERSION: {
+ ipha_t *ipha = (ipha_t *)l3info->l3_start;
+
+ return ((ipha->ipha_type_of_service &
+ fd->fd_dsfield_mask) == fd->fd_dsfield);
+ }
+ case IPV6_VERSION: {
+ ip6_t *ip6h = (ip6_t *)l3info->l3_start;
+
+ return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
+ fd->fd_dsfield_mask) == fd->fd_dsfield);
+ }
+ default:
+ return (B_FALSE);
+ }
+}
+
+/*
+ * IP v4 and v6 address matching.
+ * The netmask only needs to be applied to the packet, not to the
+ * flow_desc, since fd_local_addr/fd_remote_addr are premasked subnets.
+ */
+
+/* ARGSUSED */
+static boolean_t
+flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+ flow_desc_t *fd = &flent->fe_flow_desc;
+ ipha_t *ipha = (ipha_t *)l3info->l3_start;
+ in_addr_t addr;
+
+ addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
+ if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
+ return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
+ V4_PART_OF_V6(fd->fd_local_addr));
+ }
+ return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
+ V4_PART_OF_V6(fd->fd_remote_addr));
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+ flow_desc_t *fd = &flent->fe_flow_desc;
+ ip6_t *ip6h = (ip6_t *)l3info->l3_start;
+ in6_addr_t *addrp;
+
+ addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
+ if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
+ return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
+ fd->fd_local_addr));
+ }
+ return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ return (l3info->l3_protocol == fd->fd_protocol);
+}
+
+static uint32_t
+flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+ flow_mask_t mask = ft->ft_mask;
+
+ if ((mask & FLOW_IP_LOCAL) != 0) {
+ l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
+ } else if ((mask & FLOW_IP_REMOTE) != 0) {
+ l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
+ } else if ((mask & FLOW_IP_DSFIELD) != 0) {
+ /*
+ * DSField flents are arranged as a single list.
+ */
+ return (0);
+ }
+ /*
+ * IP addr flents are hashed into two lists, v4 or v6.
+ */
+ ASSERT(ft->ft_size >= 2);
+ return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
+}
+
+static uint32_t
+flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+
+ return (l3info->l3_protocol % ft->ft_size);
+}
+
+/* ARGSUSED */
+static int
+flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
+{
+ flow_l2info_t *l2info = &s->fs_l2info;
+ flow_l3info_t *l3info = &s->fs_l3info;
+ uint16_t sap = l2info->l2_sap;
+ uchar_t *l3_start;
+
+ l3info->l3_start = l3_start = l2info->l2_start + l2info->l2_hdrsize;
+ if (!OK_32PTR(l3_start))
+ return (EINVAL);
+
+ switch (sap) {
+ case ETHERTYPE_IP: {
+ ipha_t *ipha = (ipha_t *)l3_start;
+
+ if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
+ return (ENOBUFS);
+
+ l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
+ l3info->l3_protocol = ipha->ipha_protocol;
+ l3info->l3_version = IPV4_VERSION;
+ l3info->l3_fragmented =
+ IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
+ break;
+ }
+ case ETHERTYPE_IPV6: {
+ ip6_t *ip6h = (ip6_t *)l3_start;
+ uint16_t ip6_hdrlen;
+ uint8_t nexthdr;
+
+ if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
+ &nexthdr)) {
+ return (ENOBUFS);
+ }
+ l3info->l3_hdrsize = ip6_hdrlen;
+ l3info->l3_protocol = nexthdr;
+ l3info->l3_version = IPV6_VERSION;
+ l3info->l3_fragmented = B_FALSE;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ switch (fd->fd_protocol) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_SCTP:
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ flent->fe_match = flow_ip_proto_match;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+/* ARGSUSED */
+static int
+flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+ flow_desc_t *fd = &flent->fe_flow_desc;
+ flow_mask_t mask;
+ uint8_t version;
+ in6_addr_t *addr, *netmask;
+
+ /*
+	 * DSField does not require an IP version.
+ */
+ if (fd->fd_mask == FLOW_IP_DSFIELD) {
+ if (fd->fd_dsfield_mask == 0)
+ return (EINVAL);
+
+ flent->fe_match = flow_ip_dsfield_match;
+ return (0);
+ }
+
+ /*
+ * IP addresses must come with a version to avoid ambiguity.
+ */
+ if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
+ return (EINVAL);
+
+ version = fd->fd_ipversion;
+ if (version != IPV4_VERSION && version != IPV6_VERSION)
+ return (EINVAL);
+
+ mask = fd->fd_mask & ~FLOW_IP_VERSION;
+ switch (mask) {
+ case FLOW_IP_LOCAL:
+ addr = &fd->fd_local_addr;
+ netmask = &fd->fd_local_netmask;
+ break;
+ case FLOW_IP_REMOTE:
+ addr = &fd->fd_remote_addr;
+ netmask = &fd->fd_remote_netmask;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /*
+	 * Apply the netmask to the specified address.
+ */
+ V6_MASK_COPY(*addr, *netmask, *addr);
+ if (version == IPV4_VERSION) {
+ ipaddr_t v4addr = V4_PART_OF_V6((*addr));
+ ipaddr_t v4mask = V4_PART_OF_V6((*netmask));
+
+ if (v4addr == 0 || v4mask == 0)
+ return (EINVAL);
+ flent->fe_match = flow_ip_v4_match;
+ } else {
+ if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
+ IN6_IS_ADDR_UNSPECIFIED(netmask))
+ return (EINVAL);
+ flent->fe_match = flow_ip_v6_match;
+ }
+ return (0);
+}
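+
+/*
+ * A sketch of the premasking above: a creator specifying local address
+ * 192.168.1.77 with netmask 255.255.255.0 ends up with
+ * fd_local_addr == 192.168.1.0 after V6_MASK_COPY(), so that
+ * flow_ip_v4_match() can compare (packet address & netmask) against
+ * the stored subnet directly, without masking the descriptor at
+ * match time.
+ */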
+
+static uint32_t
+flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ return (fd->fd_protocol % ft->ft_size);
+}
+
+static uint32_t
+flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ /*
+ * DSField flents are arranged as a single list.
+ */
+ if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
+ return (0);
+
+ /*
+ * IP addr flents are hashed into two lists, v4 or v6.
+ */
+ ASSERT(ft->ft_size >= 2);
+ return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
+{
+ flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
+
+ return (fd1->fd_protocol == fd2->fd_protocol);
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
+{
+ flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
+ in6_addr_t *a1, *m1, *a2, *m2;
+
+ ASSERT(fd1->fd_mask == fd2->fd_mask);
+ if (fd1->fd_mask == FLOW_IP_DSFIELD) {
+ return (fd1->fd_dsfield == fd2->fd_dsfield &&
+ fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
+ }
+
+ /*
+ * flow_ip_accept_fe() already validated the version.
+ */
+ ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
+ if (fd1->fd_ipversion != fd2->fd_ipversion)
+ return (B_FALSE);
+
+ switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
+ case FLOW_IP_LOCAL:
+ a1 = &fd1->fd_local_addr;
+ m1 = &fd1->fd_local_netmask;
+ a2 = &fd2->fd_local_addr;
+ m2 = &fd2->fd_local_netmask;
+ break;
+ case FLOW_IP_REMOTE:
+ a1 = &fd1->fd_remote_addr;
+ m1 = &fd1->fd_remote_netmask;
+ a2 = &fd2->fd_remote_addr;
+ m2 = &fd2->fd_remote_netmask;
+ break;
+ default:
+ /*
+ * This is unreachable given the checks in
+ * flow_ip_accept_fe().
+ */
+ return (B_FALSE);
+ }
+
+ if (fd1->fd_ipversion == IPV4_VERSION) {
+ return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
+ V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
+
+ } else {
+ return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
+ IN6_ARE_ADDR_EQUAL(m1, m2));
+ }
+}
+
+static int
+flow_ip_mask2plen(in6_addr_t *v6mask)
+{
+ int bits;
+ int plen = IPV6_ABITS;
+ int i;
+
+ for (i = 3; i >= 0; i--) {
+ if (v6mask->s6_addr32[i] == 0) {
+ plen -= 32;
+ continue;
+ }
+ bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
+ if (bits == 0)
+ break;
+ plen -= bits;
+ }
+ return (plen);
+}
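+
+/*
+ * Hand-worked examples of the computation above (which assumes a
+ * contiguous netmask), starting from plen = 128:
+ *  - v6 mask ffff:ffff:ffff:ffff:: -- words 3 and 2 are zero (-64),
+ *    word 1 is all-ones so ffs() == 1 and bits == 0, and we stop at
+ *    plen == 64.
+ *  - v4 mask 255.255.255.0, stored in word 3 as 0xffffff00 --
+ *    ffs() == 9 so bits == 8 (-8), then words 2..0 are zero (-96),
+ *    giving plen == 24, the expected v4 prefix length.
+ */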
+
+/* ARGSUSED */
+static int
+flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
+ flow_entry_t *flent)
+{
+ flow_entry_t **p = headp;
+ flow_desc_t *fd0, *fd;
+ in6_addr_t *m0, *m;
+ int plen0, plen;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+
+ /*
+ * No special ordering needed for dsfield.
+ */
+ fd0 = &flent->fe_flow_desc;
+ if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
+ if (*p != NULL) {
+ ASSERT(flent->fe_next == NULL);
+ flent->fe_next = *p;
+ }
+ *p = flent;
+ return (0);
+ }
+
+ /*
+ * IP address flows are arranged in descending prefix length order.
+ */
+ m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
+ &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
+ plen0 = flow_ip_mask2plen(m0);
+ ASSERT(plen0 != 0);
+
+ for (; *p != NULL; p = &(*p)->fe_next) {
+ fd = &(*p)->fe_flow_desc;
+
+ /*
+ * Normally a dsfield flent shouldn't end up on the same
+ * list as an IP address because flow tables are (for now)
+ * disjoint. If we decide to support both IP and dsfield
+ * in the same table in the future, this check will allow
+ * for that.
+ */
+ if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
+ continue;
+
+ /*
+ * We also allow for the mixing of local and remote address
+ * flents within one list.
+ */
+ m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
+ &fd->fd_local_netmask : &fd->fd_remote_netmask;
+ plen = flow_ip_mask2plen(m);
+
+ if (plen <= plen0)
+ break;
+ }
+ if (*p != NULL) {
+ ASSERT(flent->fe_next == NULL);
+ flent->fe_next = *p;
+ }
+ *p = flent;
+ return (0);
+}
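+
+/*
+ * For example: inserting flows over 10.1.0.0/16, 10.1.2.0/24 and
+ * 10.1.2.0/28 in any order always yields the list /28 -> /24 -> /16.
+ * Lookups walk the list from the head, so the most specific
+ * (longest-prefix) flow is matched first whenever a packet matches
+ * more than one entry.
+ */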
+
+/*
+ * Transport layer protocol and port matching functions.
+ */
+
+/* ARGSUSED */
+static boolean_t
+flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+ flow_l4info_t *l4info = &s->fs_l4info;
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ return (fd->fd_protocol == l3info->l3_protocol &&
+ fd->fd_local_port == l4info->l4_hash_port);
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+ flow_l4info_t *l4info = &s->fs_l4info;
+ flow_desc_t *fd = &flent->fe_flow_desc;
+
+ return (fd->fd_protocol == l3info->l3_protocol &&
+ fd->fd_remote_port == l4info->l4_hash_port);
+}
+
+/*
+ * Transport hash function.
+ * Since we only support either local or remote port flows,
+ * we only need to extract one of the ports to be used for
+ * matching.
+ */
+static uint32_t
+flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+ flow_l4info_t *l4info = &s->fs_l4info;
+ uint8_t proto = l3info->l3_protocol;
+ boolean_t dst_or_src;
+
+ if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
+ dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
+ } else {
+ dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
+ }
+
+ l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
+ l4info->l4_src_port;
+
+ return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
+}
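+
+/*
+ * A hand-computed example of the hash above (ignoring byte order):
+ * for an inbound TCP segment to local port 80 in a 1024-bucket
+ * FLOW_ULP_PORT_LOCAL table, l4_hash_port is the destination port,
+ * so the bucket is (80 ^ (IPPROTO_TCP << 4)) % 1024 =
+ * (0x50 ^ 0x60) % 1024 = 48.
+ */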
+
+/*
+ * Unlike other accept() functions above, we do not need to get the header
+ * size because this is our highest layer so far. If we want to support
+ * other higher-layer protocols, we would need to save the l4_hdrsize
+ * in the code below.
+ */
+
+/* ARGSUSED */
+static int
+flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
+{
+ flow_l3info_t *l3info = &s->fs_l3info;
+ flow_l4info_t *l4info = &s->fs_l4info;
+ uint8_t proto = l3info->l3_protocol;
+ uchar_t *l4_start;
+
+ l4info->l4_start = l4_start = l3info->l3_start + l3info->l3_hdrsize;
+ if (!OK_32PTR(l4_start))
+ return (EINVAL);
+
+ if (l3info->l3_fragmented == B_TRUE)
+ return (EINVAL);
+
+ switch (proto) {
+ case IPPROTO_TCP: {
+ struct tcphdr *tcph = (struct tcphdr *)l4_start;
+
+ if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
+ return (ENOBUFS);
+
+ l4info->l4_src_port = tcph->th_sport;
+ l4info->l4_dst_port = tcph->th_dport;
+ break;
+ }
+ case IPPROTO_UDP: {
+ struct udphdr *udph = (struct udphdr *)l4_start;
+
+ if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
+ return (ENOBUFS);
+
+ l4info->l4_src_port = udph->uh_sport;
+ l4info->l4_dst_port = udph->uh_dport;
+ break;
+ }
+ case IPPROTO_SCTP: {
+ sctp_hdr_t *sctph = (sctp_hdr_t *)l4_start;
+
+ if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
+ return (ENOBUFS);
+
+ l4info->l4_src_port = sctph->sh_sport;
+ l4info->l4_dst_port = sctph->sh_dport;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * Validates transport flow entry.
+ * The protocol field must be present.
+ */
+
+/* ARGSUSED */
+static int
+flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+ flow_desc_t *fd = &flent->fe_flow_desc;
+ flow_mask_t mask = fd->fd_mask;
+
+ if ((mask & FLOW_IP_PROTOCOL) == 0)
+ return (EINVAL);
+
+ switch (fd->fd_protocol) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_SCTP:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ switch (mask & ~FLOW_IP_PROTOCOL) {
+ case FLOW_ULP_PORT_LOCAL:
+ if (fd->fd_local_port == 0)
+ return (EINVAL);
+
+ flent->fe_match = flow_transport_lport_match;
+ break;
+ case FLOW_ULP_PORT_REMOTE:
+ if (fd->fd_remote_port == 0)
+ return (EINVAL);
+
+ flent->fe_match = flow_transport_rport_match;
+ break;
+ case 0:
+ /*
+		 * A transport-only flow conflicts with our table type.
+ */
+ return (EOPNOTSUPP);
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static uint32_t
+flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+ flow_desc_t *fd = &flent->fe_flow_desc;
+ uint16_t port = 0;
+
+ port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
+ fd->fd_local_port : fd->fd_remote_port;
+
+ return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
+{
+ flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
+
+ if (fd1->fd_protocol != fd2->fd_protocol)
+ return (B_FALSE);
+
+ if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
+ return (fd1->fd_local_port == fd2->fd_local_port);
+
+ return (fd1->fd_remote_port == fd2->fd_remote_port);
+}
+
+static flow_ops_t flow_l2_ops = {
+ flow_l2_accept_fe,
+ flow_l2_hash_fe,
+ flow_l2_match_fe,
+ flow_generic_insert_fe,
+ flow_l2_hash,
+ {flow_l2_accept}
+};
+
+static flow_ops_t flow_ip_ops = {
+ flow_ip_accept_fe,
+ flow_ip_hash_fe,
+ flow_ip_match_fe,
+ flow_ip_insert_fe,
+ flow_ip_hash,
+ {flow_l2_accept, flow_ip_accept}
+};
+
+static flow_ops_t flow_ip_proto_ops = {
+ flow_ip_proto_accept_fe,
+ flow_ip_proto_hash_fe,
+ flow_ip_proto_match_fe,
+ flow_generic_insert_fe,
+ flow_ip_proto_hash,
+ {flow_l2_accept, flow_ip_accept}
+};
+
+static flow_ops_t flow_transport_ops = {
+ flow_transport_accept_fe,
+ flow_transport_hash_fe,
+ flow_transport_match_fe,
+ flow_generic_insert_fe,
+ flow_transport_hash,
+ {flow_l2_accept, flow_ip_accept, flow_transport_accept}
+};
+
+static flow_tab_info_t flow_tab_info_list[] = {
+ {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
+ {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
+ {&flow_ip_ops, FLOW_IP_DSFIELD, 1},
+ {&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
+ {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
+};
+
+#define FLOW_MAX_TAB_INFO \
+ ((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
+
+static flow_tab_info_t *
+mac_flow_tab_info_get(flow_mask_t mask)
+{
+ int i;
+
+ for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
+ if (mask == flow_tab_info_list[i].fti_mask)
+ return (&flow_tab_info_list[i]);
+ }
+ return (NULL);
+}
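+
+/*
+ * For example, a descriptor mask of FLOW_IP_VERSION | FLOW_IP_LOCAL
+ * maps to flow_ip_ops with a 2-bucket table (one bucket per IP
+ * version), while FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL maps to
+ * flow_transport_ops with 1024 buckets.  Any other mask (e.g.
+ * FLOW_ULP_PORT_LOCAL alone) gets NULL back, letting callers reject
+ * unsupported flow descriptors.
+ */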
diff --git a/usr/src/uts/common/io/mac/mac_hio.c b/usr/src/uts/common/io/mac/mac_hio.c
new file mode 100644
index 0000000000..d930506ae7
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_hio.c
@@ -0,0 +1,182 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * MAC Hybrid I/O related code.
+ */
+
+#include <sys/types.h>
+#include <sys/sdt.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+
+
+/*
+ * Return the number of shares supported by the specified MAC.
+ */
+int
+mac_share_capable(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ return (mip->mi_share_capab.ms_snum);
+}
+
+
+/*
+ * Allocate a share to the specified MAC client. Called when
+ * mac_client_open() is invoked with MAC_OPEN_FLAGS_SHARES_DESIRED set.
+ */
+void
+i_mac_share_alloc(mac_client_impl_t *mcip)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ int rv;
+
+ i_mac_perim_enter(mip);
+
+ ASSERT(mcip->mci_share == NULL);
+
+ if (mac_share_capable((mac_handle_t)mcip->mci_mip) == 0) {
+ DTRACE_PROBE1(i__mac__share__alloc__not__sup,
+ mac_client_impl_t *, mcip);
+ i_mac_perim_exit(mip);
+ return;
+ }
+
+ rv = mip->mi_share_capab.ms_salloc(mip->mi_share_capab.ms_handle,
+ &mcip->mci_share);
+ DTRACE_PROBE3(i__mac__share__alloc, mac_client_impl_t *, mcip,
+ int, rv, mac_share_handle_t, mcip->mci_share);
+
+ mcip->mci_share_bound = B_FALSE;
+
+ i_mac_perim_exit(mip);
+}
+
+
+/*
+ * Free a share previously allocated through i_mac_share_alloc().
+ * Safely handles the case when no shares were allocated to the MAC client.
+ */
+void
+i_mac_share_free(mac_client_impl_t *mcip)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+
+ i_mac_perim_enter(mip);
+
+	/* MAC clients are required to unbind their shares before freeing them */
+ ASSERT(!mcip->mci_share_bound);
+
+ if (mcip->mci_share == NULL) {
+ i_mac_perim_exit(mip);
+ return;
+ }
+
+ mip->mi_share_capab.ms_sfree(mcip->mci_share);
+ i_mac_perim_exit(mip);
+}
+
+
+/*
+ * Bind a share. After this operation the rings that were associated
+ * with the MAC client are mapped directly into the corresponding
+ * guest domain.
+ */
+int
+mac_share_bind(mac_client_handle_t mch, uint64_t cookie, uint64_t *rcookie)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ int rv;
+
+ i_mac_perim_enter(mip);
+
+ if (mcip->mci_share == NULL) {
+ i_mac_perim_exit(mip);
+ return (ENOTSUP);
+ }
+
+ ASSERT(!mcip->mci_share_bound);
+
+ /*
+	 * Temporarily suspend the TX traffic for that client to make sure
+	 * there are no in-flight packets in a transmit ring that is
+	 * being bound to another domain.
+ */
+ mac_tx_client_quiesce(mcip, SRS_QUIESCE);
+
+ /*
+ * For the receive path, no traffic will be sent up through
+ * the rings to the IO domain. For TX, we need to ensure
+	 * that traffic sent by the MAC client is sent through
+	 * the default ring.
+	 *
+	 * For TX, XXX will ensure that packets are sent through the
+ * default ring if the share of the MAC client is bound.
+ */
+
+ rv = mip->mi_share_capab.ms_sbind(mcip->mci_share, cookie, rcookie);
+ if (rv == 0)
+ mcip->mci_share_bound = B_TRUE;
+
+ /*
+ * Resume TX traffic for the MAC client. Since mci_share_bound is set
+ * to B_TRUE, mac_tx_send() will not send traffic to individual TX
+ * rings until the share is unbound.
+ */
+ mac_tx_client_restart(mcip);
+
+ i_mac_perim_exit(mip);
+
+ return (rv);
+}
+
+
+/*
+ * Unbind a share.
+ */
+void
+mac_share_unbind(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+
+ i_mac_perim_enter(mip);
+
+ if (mcip->mci_share == NULL) {
+ i_mac_perim_exit(mip);
+ return;
+ }
+
+ mip->mi_share_capab.ms_sunbind(mcip->mci_share);
+
+ mcip->mci_share_bound = B_FALSE;
+
+ i_mac_perim_exit(mip);
+}
diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c
new file mode 100644
index 0000000000..714fb79afb
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_provider.c
@@ -0,0 +1,1031 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/id_space.h>
+#include <sys/esunddi.h>
+#include <sys/stat.h>
+#include <sys/mkdev.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/dlpi.h>
+#include <sys/modhash.h>
+#include <sys/mac.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/modctl.h>
+#include <sys/fs/dv_node.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/callb.h>
+#include <sys/cpuvar.h>
+#include <sys/atomic.h>
+#include <sys/sdt.h>
+#include <sys/mac_flow.h>
+#include <sys/ddi_intr_impl.h>
+#include <sys/disp.h>
+#include <sys/sdt.h>
+
+/*
+ * MAC Provider Interface.
+ *
+ * Interface for GLDv3 compatible NIC drivers.
+ */
+
+static void i_mac_notify_thread(void *);
+
+typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *);
+
+typedef struct mac_notify_default_cb_s {
+ mac_notify_type_t mac_notify_type;
+ mac_notify_default_cb_fn_t mac_notify_cb_fn;
+} mac_notify_default_cb_t;
+
+mac_notify_default_cb_t mac_notify_cb_list[] = {
+ { MAC_NOTE_LINK, mac_fanout_recompute},
+ { MAC_NOTE_PROMISC, NULL},
+ { MAC_NOTE_UNICST, NULL},
+ { MAC_NOTE_TX, NULL},
+ { MAC_NOTE_RESOURCE, NULL},
+ { MAC_NOTE_DEVPROMISC, NULL},
+ { MAC_NOTE_FASTPATH_FLUSH, NULL},
+ { MAC_NOTE_SDU_SIZE, NULL},
+ { MAC_NOTE_MARGIN, NULL},
+ { MAC_NOTE_CAPAB_CHG, NULL},
+ { MAC_NNOTE, NULL},
+};
+
+/*
+ * Driver support functions.
+ */
+
+/* REGISTRATION */
+
+mac_register_t *
+mac_alloc(uint_t mac_version)
+{
+ mac_register_t *mregp;
+
+ /*
+ * Make sure there isn't a version mismatch between the driver and
+ * the framework. In the future, if multiple versions are
+ * supported, this check could become more sophisticated.
+ */
+ if (mac_version != MAC_VERSION)
+ return (NULL);
+
+ mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
+ mregp->m_version = mac_version;
+ return (mregp);
+}
+
+void
+mac_free(mac_register_t *mregp)
+{
+ kmem_free(mregp, sizeof (mac_register_t));
+}
+
+/*
+ * mac_register() is how drivers register new MACs with the GLDv3
+ * framework. The mregp argument is allocated by drivers using the
+ * mac_alloc() function, and can be freed using mac_free() immediately upon
+ * return from mac_register(). Upon success (0 return value), the mhp
+ * opaque pointer becomes the driver's handle to its MAC interface, and is
+ * the argument to all other mac module entry points.
+ */
+/* ARGSUSED */
+int
+mac_register(mac_register_t *mregp, mac_handle_t *mhp)
+{
+ mac_impl_t *mip;
+ mactype_t *mtype;
+ int err = EINVAL;
+ struct devnames *dnp = NULL;
+ uint_t instance;
+ boolean_t style1_created = B_FALSE;
+ boolean_t style2_created = B_FALSE;
+ mac_capab_legacy_t legacy;
+ char *driver;
+ minor_t minor = 0;
+
+ /* Find the required MAC-Type plugin. */
+ if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL)
+ return (EINVAL);
+
+ /* Create a mac_impl_t to represent this MAC. */
+ mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
+
+ /*
+ * The mac is not ready for open yet.
+ */
+ mip->mi_state_flags |= MIS_DISABLED;
+
+ /*
+ * When a mac is registered, the m_instance field can be set to:
+ *
+ * 0: Get the mac's instance number from m_dip.
+ * This is usually used for physical device dips.
+ *
+ * [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
+ * For example, when an aggregation is created with the key option,
+ * "key" will be used as the instance number.
+ *
+ * -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
+ * This is often used when a MAC of a virtual link is registered
+ * (e.g., aggregation when "key" is not specified, or vnic).
+ *
+ * Note that the instance number is used to derive the mi_minor field
+ * of mac_impl_t, which will then be used to derive the name of kstats
+ * and the devfs nodes. The first 2 cases are needed to preserve
+ * backward compatibility.
+ */
+ switch (mregp->m_instance) {
+ case 0:
+ instance = ddi_get_instance(mregp->m_dip);
+ break;
+ case ((uint_t)-1):
+ minor = mac_minor_hold(B_TRUE);
+ if (minor == 0) {
+ err = ENOSPC;
+ goto fail;
+ }
+ instance = minor - 1;
+ break;
+ default:
+ instance = mregp->m_instance;
+ if (instance >= MAC_MAX_MINOR) {
+ err = EINVAL;
+ goto fail;
+ }
+ break;
+ }
+
+ mip->mi_minor = (minor_t)(instance + 1);
+ mip->mi_dip = mregp->m_dip;
+ mip->mi_clients_list = NULL;
+ mip->mi_nclients = 0;
+
+ driver = (char *)ddi_driver_name(mip->mi_dip);
+
+ /* Construct the MAC name as <drvname><instance> */
+ (void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d",
+ driver, instance);
+
+ mip->mi_driver = mregp->m_driver;
+
+ mip->mi_type = mtype;
+ mip->mi_margin = mregp->m_margin;
+ mip->mi_info.mi_media = mtype->mt_type;
+ mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
+ if (mregp->m_max_sdu <= mregp->m_min_sdu)
+ goto fail;
+ mip->mi_sdu_min = mregp->m_min_sdu;
+ mip->mi_sdu_max = mregp->m_max_sdu;
+ mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
+ /*
+ * If the media supports a broadcast address, cache a pointer to it
+ * in the mac_info_t so that upper layers can use it.
+ */
+ mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
+
+ mip->mi_v12n_level = mregp->m_v12n;
+
+ /*
+ * Copy the unicast source address into the mac_info_t, but only if
+ * the MAC-Type defines a non-zero address length. We need to
+ * handle MAC-Types that have an address length of 0
+ * (point-to-point protocol MACs for example).
+ */
+ if (mip->mi_type->mt_addr_length > 0) {
+ if (mregp->m_src_addr == NULL)
+ goto fail;
+ mip->mi_info.mi_unicst_addr =
+ kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
+ bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
+ mip->mi_type->mt_addr_length);
+
+ /*
+ * Copy the fixed 'factory' MAC address from the immutable
+ * info. This is taken to be the MAC address currently in
+ * use.
+ */
+ bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
+ mip->mi_type->mt_addr_length);
+
+ /*
+		 * At this point, we should set up the classification
+		 * rules etc., but we delay it until mac_open() so that
+		 * resource discovery has taken place and we know
+		 * someone wants to use the device. Otherwise, memory
+		 * gets allocated for Rx ring structures even during
+		 * probe.
+ */
+
+ /* Copy the destination address if one is provided. */
+ if (mregp->m_dst_addr != NULL) {
+ bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
+ mip->mi_type->mt_addr_length);
+ }
+ } else if (mregp->m_src_addr != NULL) {
+ goto fail;
+ }
+
+ /*
+ * The format of the m_pdata is specific to the plugin. It is
+ * passed in as an argument to all of the plugin callbacks. The
+ * driver can update this information by calling
+ * mac_pdata_update().
+ */
+ if (mregp->m_pdata != NULL) {
+ /*
+ * Verify that the plugin supports MAC plugin data and that
+ * the supplied data is valid.
+ */
+ if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
+ goto fail;
+ if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
+ mregp->m_pdata_size)) {
+ goto fail;
+ }
+ mip->mi_pdata = kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
+ bcopy(mregp->m_pdata, mip->mi_pdata, mregp->m_pdata_size);
+ mip->mi_pdata_size = mregp->m_pdata_size;
+ }
+
+ /*
+ * Register the private properties.
+ */
+ mac_register_priv_prop(mip, mregp->m_priv_props,
+ mregp->m_priv_prop_count);
+
+ /*
+ * Stash the driver callbacks into the mac_impl_t, but first sanity
+ * check to make sure all mandatory callbacks are set.
+ */
+ if (mregp->m_callbacks->mc_getstat == NULL ||
+ mregp->m_callbacks->mc_start == NULL ||
+ mregp->m_callbacks->mc_stop == NULL ||
+ mregp->m_callbacks->mc_setpromisc == NULL ||
+ mregp->m_callbacks->mc_multicst == NULL) {
+ goto fail;
+ }
+ mip->mi_callbacks = mregp->m_callbacks;
+
+ if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY, &legacy))
+ mip->mi_state_flags |= MIS_LEGACY;
+
+ if (mip->mi_state_flags & MIS_LEGACY) {
+ mip->mi_unsup_note = legacy.ml_unsup_note;
+ mip->mi_phy_dev = legacy.ml_dev;
+ } else {
+ mip->mi_unsup_note = 0;
+ mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
+ ddi_get_instance(mip->mi_dip) + 1);
+ }
+
+ /*
+	 * Create the notification thread. thread_create() blocks for
+	 * memory if needed; it never fails.
+ */
+ mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
+ mip, 0, &p0, TS_RUN, minclsyspri);
+
+ /*
+ * Initialize the capabilities
+ */
+
+ if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
+ mip->mi_state_flags |= MIS_IS_VNIC;
+
+ if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
+ mip->mi_state_flags |= MIS_IS_AGGR;
+
+ mac_addr_factory_init(mip);
+
+ /*
+	 * Enforce the registered virtualization level.
+ */
+ if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) {
+ if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 ||
+ mac_init_rings(mip, MAC_RING_TYPE_TX) != 0)
+ goto fail;
+
+ /*
+ * The driver needs to register at least rx rings for this
+ * virtualization level.
+ */
+ if (mip->mi_rx_groups == NULL)
+ goto fail;
+ }
+
+ /*
+ * The driver must set mc_unicst entry point to NULL when it advertises
+ * CAP_RINGS for rx groups.
+ */
+ if (mip->mi_rx_groups != NULL) {
+ if (mregp->m_callbacks->mc_unicst != NULL)
+ goto fail;
+ } else {
+ if (mregp->m_callbacks->mc_unicst == NULL)
+ goto fail;
+ }
+
+ /*
+ * The driver must set mc_tx entry point to NULL when it advertises
+ * CAP_RINGS for tx rings.
+ */
+ if (mip->mi_tx_groups != NULL) {
+ if (mregp->m_callbacks->mc_tx != NULL)
+ goto fail;
+ } else {
+ if (mregp->m_callbacks->mc_tx == NULL)
+ goto fail;
+ }
+
+ /*
+ * Initialize MAC addresses. Must be called after mac_init_rings().
+ */
+ mac_init_macaddr(mip);
+
+ mip->mi_share_capab.ms_snum = 0;
+ if (mip->mi_v12n_level & MAC_VIRT_HIO) {
+ (void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES,
+ &mip->mi_share_capab);
+ }
+
+ /*
+ * Initialize the kstats for this device.
+ */
+ mac_stat_create(mip);
+
+ /* Zero out any properties. */
+ bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t));
+
+ /* set the gldv3 flag in dn_flags */
+ dnp = &devnamesp[ddi_driver_major(mip->mi_dip)];
+ LOCK_DEV_OPS(&dnp->dn_lock);
+ dnp->dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER);
+ UNLOCK_DEV_OPS(&dnp->dn_lock);
+
+ if (mip->mi_minor < MAC_MAX_MINOR + 1) {
+ /* Create a style-2 DLPI device */
+ if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0,
+ DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS)
+ goto fail;
+ style2_created = B_TRUE;
+
+ /* Create a style-1 DLPI device */
+ if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR,
+ mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS)
+ goto fail;
+ style1_created = B_TRUE;
+ }
+
+ mac_flow_l2tab_create(mip, &mip->mi_flow_tab);
+
+ rw_enter(&i_mac_impl_lock, RW_WRITER);
+ if (mod_hash_insert(i_mac_impl_hash,
+ (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) {
+ rw_exit(&i_mac_impl_lock);
+ err = EEXIST;
+ goto fail;
+ }
+
+ DTRACE_PROBE2(mac__register, struct devnames *, dnp,
+ (mac_impl_t *), mip);
+
+ /*
+ * Mark the MAC to be ready for open.
+ */
+ mip->mi_state_flags &= ~MIS_DISABLED;
+ rw_exit(&i_mac_impl_lock);
+
+ atomic_inc_32(&i_mac_impl_count);
+
+ cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
+ *mhp = (mac_handle_t)mip;
+ return (0);
+
+fail:
+ if (style1_created)
+ ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
+
+ if (style2_created)
+ ddi_remove_minor_node(mip->mi_dip, driver);
+
+ mac_addr_factory_fini(mip);
+
+ /* Clean up registered MAC addresses */
+ mac_fini_macaddr(mip);
+
+ /* Clean up registered rings */
+ mac_free_rings(mip, MAC_RING_TYPE_RX);
+ mac_free_rings(mip, MAC_RING_TYPE_TX);
+
+ /* Clean up notification thread */
+ if (mip->mi_notify_thread != NULL)
+ i_mac_notify_exit(mip);
+
+ if (mip->mi_info.mi_unicst_addr != NULL) {
+ kmem_free(mip->mi_info.mi_unicst_addr,
+ mip->mi_type->mt_addr_length);
+ mip->mi_info.mi_unicst_addr = NULL;
+ }
+
+ mac_stat_destroy(mip);
+
+ if (mip->mi_type != NULL) {
+ atomic_dec_32(&mip->mi_type->mt_ref);
+ mip->mi_type = NULL;
+ }
+
+ if (mip->mi_pdata != NULL) {
+ kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+ mip->mi_pdata = NULL;
+ mip->mi_pdata_size = 0;
+ }
+
+ if (minor != 0) {
+ ASSERT(minor > MAC_MAX_MINOR);
+ mac_minor_rele(minor);
+ }
+
+ mac_unregister_priv_prop(mip);
+
+ kmem_cache_free(i_mac_impl_cachep, mip);
+ return (err);
+}
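+
+/*
+ * A minimal registration sketch from a driver's attach(9E) routine.
+ * This is illustrative only: the xxp soft state, xx_m_callbacks and
+ * xx_macaddr names are hypothetical, and error handling is elided.
+ *
+ *	mac_register_t	*mregp;
+ *	int		err;
+ *
+ *	if ((mregp = mac_alloc(MAC_VERSION)) == NULL)
+ *		return (DDI_FAILURE);
+ *	mregp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+ *	mregp->m_driver = xxp;
+ *	mregp->m_dip = dip;
+ *	mregp->m_src_addr = xxp->xx_macaddr;
+ *	mregp->m_callbacks = &xx_m_callbacks;
+ *	mregp->m_min_sdu = 0;
+ *	mregp->m_max_sdu = ETHERMTU;
+ *	mregp->m_margin = VLAN_TAGSZ;
+ *	err = mac_register(mregp, &xxp->xx_mh);
+ *	mac_free(mregp);
+ */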
+
+/*
+ * Unregister from the GLDv3 framework
+ */
+int
+mac_unregister(mac_handle_t mh)
+{
+ int err;
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mod_hash_val_t val;
+ mac_margin_req_t *mmr, *nextmmr;
+
+ /* Fail the unregister if there are any open references to this mac. */
+ if ((err = mac_disable_nowait(mh)) != 0)
+ return (err);
+
+ /*
+ * Clean up notification thread and wait for it to exit.
+ */
+ i_mac_notify_exit(mip);
+
+ i_mac_perim_enter(mip);
+
+ if (mip->mi_minor < MAC_MAX_MINOR + 1) {
+ ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
+ ddi_remove_minor_node(mip->mi_dip,
+ (char *)ddi_driver_name(mip->mi_dip));
+ }
+
+ ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
+ MIS_EXCLUSIVE));
+
+ mac_stat_destroy(mip);
+
+ (void) mod_hash_remove(i_mac_impl_hash,
+ (mod_hash_key_t)mip->mi_name, &val);
+ ASSERT(mip == (mac_impl_t *)val);
+
+ ASSERT(i_mac_impl_count > 0);
+ atomic_dec_32(&i_mac_impl_count);
+
+ if (mip->mi_pdata != NULL)
+ kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+ mip->mi_pdata = NULL;
+ mip->mi_pdata_size = 0;
+
+ /*
+	 * Free the list of margin requests.
+ */
+ for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
+ nextmmr = mmr->mmr_nextp;
+ kmem_free(mmr, sizeof (mac_margin_req_t));
+ }
+ mip->mi_mmrp = NULL;
+
+ mip->mi_linkstate = LINK_STATE_UNKNOWN;
+ kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
+ mip->mi_info.mi_unicst_addr = NULL;
+
+ atomic_dec_32(&mip->mi_type->mt_ref);
+ mip->mi_type = NULL;
+
+ /*
+ * Free the primary MAC address.
+ */
+ mac_fini_macaddr(mip);
+
+ /*
+ * free all rings
+ */
+ mac_free_rings(mip, MAC_RING_TYPE_RX);
+ mac_free_rings(mip, MAC_RING_TYPE_TX);
+
+ mac_addr_factory_fini(mip);
+
+ bzero(mip->mi_addr, MAXMACADDRLEN);
+ bzero(mip->mi_dstaddr, MAXMACADDRLEN);
+
+ /* and the flows */
+ mac_flow_tab_destroy(mip->mi_flow_tab);
+ mip->mi_flow_tab = NULL;
+
+ if (mip->mi_minor > MAC_MAX_MINOR)
+ mac_minor_rele(mip->mi_minor);
+
+ cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
+
+ /*
+ * Reset the perim related fields to default values before
+ * kmem_cache_free
+ */
+ i_mac_perim_exit(mip);
+ mip->mi_state_flags = 0;
+
+ mac_unregister_priv_prop(mip);
+ kmem_cache_free(i_mac_impl_cachep, mip);
+
+ return (0);
+}
+
+/* DATA RECEPTION */
+
+/*
+ * This function is invoked for packets received by the MAC driver in
+ * interrupt context. The ring generation number provided by the driver
+ * is matched with the ring generation number held by the MAC layer. If
+ * they do not match, the packets are considered stale, coming from an
+ * older assignment of the ring, and are dropped.
+ */
+void
+mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
+ uint64_t mr_gen_num)
+{
+ mac_ring_t *mr = (mac_ring_t *)mrh;
+
+ if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
+ DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
+ mr->mr_gen_num, uint64_t, mr_gen_num);
+ freemsgchain(mp_chain);
+ return;
+ }
+ mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
+}
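+
+/*
+ * A sketch of the expected driver usage (the xx_ names are
+ * hypothetical): the driver saves the generation number given to it
+ * when the ring is started and hands it back on every receive, so that
+ * packets from a stale ring assignment are detected and dropped above.
+ *
+ *	ring->xx_gen_num = mr_gen_num;		(in the ring start
+ *						 entry point)
+ *	...
+ *	mac_rx_ring(xxp->xx_mh, ring->xx_rh, mp_chain,
+ *	    ring->xx_gen_num);			(rx interrupt path)
+ */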
+
+/*
+ * This function is invoked for each packet received by the underlying
+ * driver.
+ */
+void
+mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_ring_t *mr = (mac_ring_t *)mrh;
+ mac_soft_ring_set_t *mac_srs;
+ mblk_t *bp = mp_chain;
+ boolean_t hw_classified = B_FALSE;
+
+ /*
+ * If there are any promiscuous mode callbacks defined for
+ * this MAC, pass them a copy if appropriate.
+ */
+ if (mip->mi_promisc_list != NULL)
+ mac_promisc_dispatch(mip, mp_chain, NULL);
+
+ if (mr != NULL) {
+ /*
+ * If the SRS teardown has started, just return. The 'mr'
+ * continues to be valid until the driver unregisters the mac.
+ * Hardware classified packets will not make their way up
+ * beyond this point once the teardown has started. The driver
+ * is never passed a pointer to a flow entry or SRS or any
+ * structure that can be freed much before mac_unregister.
+ */
+ mutex_enter(&mr->mr_lock);
+ if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
+ (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
+ mutex_exit(&mr->mr_lock);
+ freemsgchain(mp_chain);
+ return;
+ }
+ if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
+ hw_classified = B_TRUE;
+ MR_REFHOLD_LOCKED(mr);
+ }
+ mutex_exit(&mr->mr_lock);
+
+ /*
+ * We check if an SRS is controlling this ring.
+ * If so, we can directly call the srs_lower_proc
+		 * routine; otherwise we need to go through
+		 * mac_rx_classify() to reach the right place.
+ */
+ if (hw_classified) {
+ mac_srs = mr->mr_srs;
+ /*
+ * This is supposed to be the fast path.
+			 * All packets received through here were steered by
+ * the hardware classifier, and share the same
+ * MAC header info.
+ */
+ mac_srs->srs_rx.sr_lower_proc(mh,
+ (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
+ MR_REFRELE(mr);
+ return;
+ }
+ /* We'll fall through to software classification */
+ }
+
+ if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
+ if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
+ return;
+ }
+
+ freemsgchain(bp);
+}
+
+/* DATA TRANSMISSION */
+
+/*
+ * A driver's notification to resume transmission, in case of a provider
+ * without TX rings.
+ */
+void
+mac_tx_update(mac_handle_t mh)
+{
+ /*
+	 * Walk the list of MAC clients (mac_client_handle)
+	 * and notify their TX SRSes that transmission may resume.
+ */
+ i_mac_tx_srs_notify((mac_impl_t *)mh, NULL);
+}
+
+/*
+ * A driver's notification to resume transmission on the specified TX ring.
+ */
+void
+mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
+{
+ i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
+}
+
+/* LINK STATE */
+/*
+ * Notify the MAC layer about a link state change
+ */
+void
+mac_link_update(mac_handle_t mh, link_state_t link)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ /*
+ * Save the link state.
+ */
+ mip->mi_linkstate = link;
+
+ /*
+ * Send a MAC_NOTE_LINK notification.
+ */
+ i_mac_notify(mip, MAC_NOTE_LINK);
+}
+
+/* OTHER CONTROL INFORMATION */
+
+/*
+ * A driver notified us that its primary MAC address has changed.
+ */
+void
+mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ if (mip->mi_type->mt_addr_length == 0)
+ return;
+
+ i_mac_perim_enter(mip);
+ /*
+ * If address doesn't change, do nothing.
+ */
+ if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) {
+ i_mac_perim_exit(mip);
+ return;
+ }
+
+ /*
+ * Freshen the MAC address value and update all MAC clients that
+ * share this MAC address.
+ */
+ mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
+ (uint8_t *)addr);
+
+ i_mac_perim_exit(mip);
+
+ /*
+ * Send a MAC_NOTE_UNICST notification.
+ */
+ i_mac_notify(mip, MAC_NOTE_UNICST);
+}
+
+/*
+ * The provider's hardware resources (e.g. ring grouping) have changed.
+ * Notify the MAC framework to trigger a re-negotiation of the capabilities.
+ */
+void
+mac_resource_update(mac_handle_t mh)
+{
+ /*
+ * Send a MAC_NOTE_RESOURCE notification.
+ */
+ i_mac_notify((mac_impl_t *)mh, MAC_NOTE_RESOURCE);
+}
+
+/*
+ * MAC plugin information changed.
+ */
+int
+mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ /*
+ * Verify that the plugin supports MAC plugin data and that the
+ * supplied data is valid.
+ */
+ if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
+ return (EINVAL);
+ if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
+ return (EINVAL);
+
+ if (mip->mi_pdata != NULL)
+ kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+
+ mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
+ bcopy(mac_pdata, mip->mi_pdata, dsize);
+ mip->mi_pdata_size = dsize;
+
+ /*
+ * Since the MAC plugin data is used to construct MAC headers that
+	 * are cached in fast-path headers, we need to flush the fast-path
+ * information for links associated with this mac.
+ */
+ i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
+ return (0);
+}
+
+/*
+ * Invoked by driver as well as the framework to notify its capability change.
+ */
+void
+mac_capab_update(mac_handle_t mh)
+{
+ /* Send MAC_NOTE_CAPAB_CHG notification */
+ i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
+}
+
+int
+mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ if (sdu_max <= mip->mi_sdu_min)
+ return (EINVAL);
+ mip->mi_sdu_max = sdu_max;
+
+ /* Send a MAC_NOTE_SDU_SIZE notification. */
+ i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
+ return (0);
+}
+
+/* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */
+
+/*
+ * Log a link state change and record the new state in the
+ * mac_impl structure.
+ */
+static void
+i_mac_log_link_state(mac_impl_t *mip)
+{
+ /*
+ * If no change, then it is not interesting.
+ */
+ if (mip->mi_lastlinkstate == mip->mi_linkstate)
+ return;
+
+ switch (mip->mi_linkstate) {
+ case LINK_STATE_UP:
+ if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
+ char det[200];
+
+ mip->mi_type->mt_ops.mtops_link_details(det,
+ sizeof (det), (mac_handle_t)mip, mip->mi_pdata);
+
+ cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
+ } else {
+ cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
+ }
+ break;
+
+ case LINK_STATE_DOWN:
+ /*
+ * Only transitions from UP to DOWN are interesting
+ */
+ if (mip->mi_lastlinkstate != LINK_STATE_UNKNOWN)
+ cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
+ break;
+
+ case LINK_STATE_UNKNOWN:
+ /*
+ * This case is normally not interesting.
+ */
+ break;
+ }
+ mip->mi_lastlinkstate = mip->mi_linkstate;
+}
+
+/*
+ * Main routine for the callback notifications thread
+ */
+static void
+i_mac_notify_thread(void *arg)
+{
+ mac_impl_t *mip = arg;
+ callb_cpr_t cprinfo;
+ mac_cb_t *mcb;
+ mac_cb_info_t *mcbi;
+ mac_notify_cb_t *mncb;
+
+ mcbi = &mip->mi_notify_cb_info;
+ CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr,
+ "i_mac_notify_thread");
+
+ mutex_enter(mcbi->mcbi_lockp);
+
+ for (;;) {
+ uint32_t bits;
+ uint32_t type;
+
+ bits = mip->mi_notify_bits;
+ if (bits == 0) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
+ CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp);
+ continue;
+ }
+ mip->mi_notify_bits = 0;
+ if ((bits & (1 << MAC_NNOTE)) != 0) {
+ /* request to quit */
+ ASSERT(mip->mi_state_flags & MIS_DISABLED);
+ break;
+ }
+
+ mutex_exit(mcbi->mcbi_lockp);
+
+ /*
+ * Log link changes.
+ */
+ if ((bits & (1 << MAC_NOTE_LINK)) != 0)
+ i_mac_log_link_state(mip);
+
+ /*
+ * Do notification callbacks for each notification type.
+ */
+ for (type = 0; type < MAC_NNOTE; type++) {
+ if ((bits & (1 << type)) == 0) {
+ continue;
+ }
+
+ if (mac_notify_cb_list[type].mac_notify_cb_fn)
+ mac_notify_cb_list[type].mac_notify_cb_fn(mip);
+
+ /*
+			 * Walk the list of notification callbacks.
+ */
+ MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info);
+ for (mcb = mip->mi_notify_cb_list; mcb != NULL;
+ mcb = mcb->mcb_nextp) {
+ mncb = (mac_notify_cb_t *)mcb->mcb_objp;
+ mncb->mncb_fn(mncb->mncb_arg, type);
+ }
+ MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info,
+ &mip->mi_notify_cb_list);
+ }
+
+ mutex_enter(mcbi->mcbi_lockp);
+ }
+
+ mip->mi_state_flags |= MIS_NOTIFY_DONE;
+ cv_broadcast(&mcbi->mcbi_cv);
+
+ /* CALLB_CPR_EXIT drops the lock */
+ CALLB_CPR_EXIT(&cprinfo);
+ thread_exit();
+}
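+
+/*
+ * For reference, the producer side of the handshake above is tiny. As
+ * i_mac_notify_exit() below illustrates for the quit request, a sender is
+ * assumed to record the pending notification in mi_notify_bits under
+ * mcbi_lockp and signal mcbi_cv, roughly:
+ *
+ *	mutex_enter(mcbi->mcbi_lockp);
+ *	mip->mi_notify_bits |= (1 << type);
+ *	cv_broadcast(&mcbi->mcbi_cv);
+ *	mutex_exit(mcbi->mcbi_lockp);
+ */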
+
+/*
+ * Signal the i_mac_notify_thread, asking it to quit,
+ * then wait until it is done.
+ */
+void
+i_mac_notify_exit(mac_impl_t *mip)
+{
+ mac_cb_info_t *mcbi;
+
+ mcbi = &mip->mi_notify_cb_info;
+
+ mutex_enter(mcbi->mcbi_lockp);
+ mip->mi_notify_bits = (1 << MAC_NNOTE);
+ cv_broadcast(&mcbi->mcbi_cv);
+
+ while ((mip->mi_notify_thread != NULL) &&
+ !(mip->mi_state_flags & MIS_NOTIFY_DONE)) {
+ cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
+ }
+
+ /* Necessary clean up before doing kmem_cache_free */
+ mip->mi_state_flags &= ~MIS_NOTIFY_DONE;
+ mip->mi_notify_bits = 0;
+ mip->mi_notify_thread = NULL;
+ mutex_exit(mcbi->mcbi_lockp);
+}
+
+/*
+ * Entry point invoked by drivers to dynamically add a ring to an
+ * existing group.
+ */
+int
+mac_group_add_ring(mac_group_handle_t gh, int index)
+{
+ mac_group_t *group = (mac_group_t *)gh;
+ mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+ int ret;
+
+ i_mac_perim_enter(mip);
+
+ /*
+ * Only RX rings can be added or removed by drivers currently.
+ */
+ ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
+
+ ret = i_mac_group_add_ring(group, NULL, index);
+
+ i_mac_perim_exit(mip);
+
+ return (ret);
+}
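+
+/*
+ * Illustrative sketch (not part of this changeset): a driver that brings
+ * a new Rx ring online at runtime could add it to a group it exported at
+ * registration time and back out on failure; the xxp names and the
+ * xx_disable_ring() helper are hypothetical.
+ *
+ *	if (mac_group_add_ring(xxp->xx_rx_group_handle, ring_index) != 0)
+ *		xx_disable_ring(xxp, ring_index);
+ */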
+
+/*
+ * Entry point invoked by drivers to dynamically remove a ring
+ * from an existing group. The specified ring handle must no longer
+ * be used by the driver after a call to this function.
+ */
+void
+mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh)
+{
+ mac_group_t *group = (mac_group_t *)gh;
+ mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+
+ i_mac_perim_enter(mip);
+
+ /*
+ * Only RX rings can be added or removed by drivers currently.
+ */
+ ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
+
+ i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE);
+
+ i_mac_perim_exit(mip);
+}
diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c
new file mode 100644
index 0000000000..290366f5d2
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_sched.c
@@ -0,0 +1,3819 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ip_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
+#include <inet/ip6.h>
+
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/mac_flow_impl.h>
+
+static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
+ uintptr_t, uint16_t, mblk_t **);
+static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
+ uintptr_t, uint16_t, mblk_t **);
+static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
+ uintptr_t, uint16_t, mblk_t **);
+static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
+ uintptr_t, uint16_t, mblk_t **);
+
+typedef struct mac_tx_mode_s {
+ mac_tx_srs_mode_t mac_tx_mode;
+ mac_tx_func_t mac_tx_func;
+} mac_tx_mode_t;
+
+/*
+ * There are five modes of operation on the Tx side. These modes get set
+ * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
+ * none of the other modes are user configurable. They are selected by
+ * the system depending on whether the link (or flow) has multiple Tx
+ * rings, a configured bandwidth, etc.
+ */
+mac_tx_mode_t mac_tx_mode_list[] = {
+ {SRS_TX_DEFAULT, mac_tx_single_ring_mode},
+ {SRS_TX_SERIALIZE, mac_tx_serializer_mode},
+ {SRS_TX_FANOUT, mac_tx_fanout_mode},
+ {SRS_TX_BW, mac_tx_bw_mode},
+ {SRS_TX_BW_FANOUT, mac_tx_bw_mode}
+};
+
+/*
+ * Soft Ring Set (SRS) - The Run time code that deals with
+ * dynamic polling from the hardware, bandwidth enforcement,
+ * fanout etc.
+ *
+ * We try to use H/W classification on the NIC and assign traffic for
+ * a MAC address to a particular Rx ring or ring group. There is a
+ * 1-1 mapping between an SRS and an Rx ring. The SRS dynamically
+ * switches the underlying Rx ring between interrupt and
+ * polling mode and enforces any specified B/W control.
+ *
+ * There is always an SRS created and tied to each H/W and S/W rule.
+ * Whenever we create a H/W rule, we always add the same rule to the
+ * S/W classifier and tie an SRS to it.
+ *
+ * In case B/W control is specified, it is broken into bytes
+ * per tick, and as soon as the quota for a tick is exhausted,
+ * the underlying Rx ring is forced into poll mode for the remainder
+ * of the tick. The SRS poll thread only polls for bytes that are
+ * allowed to come into the SRS. We typically let 4x the configured
+ * B/W worth of packets come into the SRS (to prevent unnecessary
+ * drops due to bursts) but only process the specified amount.
+ *
+ * A MAC client (e.g. a VNIC or aggr) can have 1 or more
+ * Rx rings (and corresponding SRSs) assigned to it. The SRS
+ * in turn can have softrings to do protocol level fanout or
+ * softrings to do S/W based fanout or both. In case the NIC
+ * has no Rx rings, we do S/W classification to the respective SRS.
+ * The S/W classification rule is always set up and ready. This
+ * allows the MAC layer to reassign Rx rings whenever needed,
+ * while packets still continue to flow via the default path and
+ * get S/W classified to the correct SRS.
+ *
+ * SRSs are used on both the Tx and Rx sides. They use the same
+ * data structure, but the processing routines have slightly different
+ * semantics because the Rx side needs to do dynamic
+ * polling, etc.
+ *
+ * Dynamic Polling Notes
+ * =====================
+ *
+ * Each soft ring set is capable of switching its Rx ring between
+ * interrupt and poll mode and actively 'polls' for packets in
+ * poll mode. If the SRS is implementing a B/W limit, it makes
+ * sure that only the max allowed packets are pulled in poll mode
+ * and switches to poll mode as soon as the B/W limit is exceeded.
+ * As such, there is no extra overhead in implementing B/W limits.
+ *
+ * In poll mode, it's better to keep the pipeline going, where the
+ * SRS worker thread keeps processing packets and the poll thread
+ * keeps bringing in more packets (especially if they get to run
+ * on different CPUs). This also avoids the overhead associated
+ * with excessive signalling (on NUMA machines, this can be
+ * pretty devastating). The exception is the latency optimized case,
+ * where the worker thread does no work and the interrupt and poll
+ * threads are allowed to do their own drain.
+ *
+ * We use the following policy to control Dynamic Polling:
+ * 1) We switch to poll mode anytime the processing
+ *    thread causes a backlog to build up in the SRS and
+ *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
+ * 2) As long as the backlog stays under the low water
+ *    mark (sr_lowat), we poll the H/W for more packets.
+ * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
+ *    water mark, we stay in poll mode but don't poll
+ *    the H/W for more packets.
+ * 4) Anytime in polling mode, if we poll the H/W for
+ *    packets and find nothing while we have an existing
+ *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
+ *    mode but don't poll the H/W for packets anymore
+ *    (let the polling thread go to sleep).
+ * 5) Once the backlog is relieved (packets are processed),
+ *    we re-enable polling (by signalling the poll thread)
+ *    only when the backlog dips below sr_poll_thres.
+ * 6) sr_hiwat is used exclusively when we are not
+ *    polling capable; it is used to decide when to
+ *    drop packets so the SRS queue length doesn't grow
+ *    unbounded.
+ *
+ * NOTE: Also see the block level comment on top of mac_soft_ring.c
+ */
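+
+/*
+ * The dynamic polling policy above boils down to a small decision
+ * procedure; this is only a simplified sketch (the real transitions are
+ * spread across mac_rx_srs_drain() and mac_rx_srs_poll_ring() below):
+ *
+ *	- backlog builds up (sr_poll_pkt_cnt > 0): switch to poll mode
+ *	- backlog under sr_lowat: keep polling the H/W for more packets
+ *	- backlog at or over sr_lowat: stay in poll mode but stop polling
+ *	- poll returns nothing and a backlog remains: poll thread sleeps
+ *	- backlog dips below sr_poll_thres: signal the poll thread again
+ */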
+
+/*
+ * mac_latency_optimize
+ *
+ * Controls whether the poll thread can process packets inline
+ * or must let the SRS worker thread do the processing. This applies
+ * when the SRS is not already being processed. For latency sensitive
+ * traffic, this needs to be B_TRUE to allow inline processing. For
+ * throughput under load, this should be B_FALSE.
+ *
+ * This tunable (and other similar ones) should be rolled into a link
+ * or flow specific workload hint that can be set using dladm
+ * linkprop (instead of multiple such tunables).
+ */
+boolean_t mac_latency_optimize = B_TRUE;
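+
+/*
+ * Until such a linkprop hint exists, this tunable can presumably be
+ * overridden the usual way for kernel globals, e.g. in /etc/system
+ * (assuming the module is named mac):
+ *
+ *	set mac:mac_latency_optimize = 0
+ */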
+
+/*
+ * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
+ *
+ * Queue an mblk or chain on the soft ring set and increment the
+ * local count (srs_count) for the SRS as well as the shared counter
+ * (sr_poll_pkt_cnt - shared between the SRS and its soft rings
+ * to track the total unprocessed packets so that polling works
+ * correctly).
+ *
+ * The size (total bytes queued) counters are incremented only
+ * if we are doing B/W control.
+ */
+#define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
+ ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
+ if ((mac_srs)->srs_last != NULL) \
+ (mac_srs)->srs_last->b_next = (head); \
+ else \
+ (mac_srs)->srs_first = (head); \
+ (mac_srs)->srs_last = (tail); \
+ (mac_srs)->srs_count += count; \
+}
+
+#define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
+ mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
+ \
+ MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
+ srs_rx->sr_poll_pkt_cnt += count; \
+ ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \
+ if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
+ (mac_srs)->srs_size += (sz); \
+ mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \
+ (mac_srs)->srs_bw->mac_bw_sz += (sz); \
+ mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \
+ } \
+}
+
+#define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
+ mac_srs->srs_state |= SRS_ENQUEUED; \
+ MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
+ if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
+ (mac_srs)->srs_size += (sz); \
+ (mac_srs)->srs_bw->mac_bw_sz += (sz); \
+ } \
+}
+
+/*
+ * Macros to turn polling on
+ */
+#define MAC_SRS_POLLING_ON(mac_srs) { \
+ ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
+ if (((mac_srs)->srs_state & \
+ (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \
+ (mac_srs)->srs_state |= SRS_POLLING; \
+ (void) mac_hwring_disable_intr((mac_ring_handle_t) \
+ (mac_srs)->srs_ring); \
+ (mac_srs)->srs_rx.sr_poll_on++; \
+ } \
+}
+
+#define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \
+ ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
+ if (((mac_srs)->srs_state & \
+ (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \
+ (SRS_POLLING_CAPAB|SRS_WORKER)) { \
+ (mac_srs)->srs_state |= SRS_POLLING; \
+ (void) mac_hwring_disable_intr((mac_ring_handle_t) \
+ (mac_srs)->srs_ring); \
+ (mac_srs)->srs_rx.sr_worker_poll_on++; \
+ } \
+}
+
+/*
+ * MAC_SRS_POLL_RING
+ *
+ * Signal the SRS poll thread to poll the underlying H/W ring,
+ * provided it isn't already polling (in which case SRS_GET_PKTS
+ * would already be set).
+ *
+ * The poll thread gets to run only from mac_rx_srs_drain() and only
+ * if the drain is being done by the worker thread.
+ */
+#define MAC_SRS_POLL_RING(mac_srs) { \
+ mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
+ \
+ ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
+ srs_rx->sr_poll_thr_sig++; \
+ if (((mac_srs)->srs_state & \
+ (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \
+ (SRS_WORKER|SRS_POLLING_CAPAB)) { \
+ (mac_srs)->srs_state |= SRS_GET_PKTS; \
+ cv_signal(&(mac_srs)->srs_cv); \
+ } else { \
+ srs_rx->sr_poll_thr_busy++; \
+ } \
+}
+
+/*
+ * MAC_SRS_CHECK_BW_CONTROL
+ *
+ * Check to see if the next tick has started so we can reset the
+ * SRS_BW_ENFORCED flag and allow more packets to come into the
+ * system.
+ */
+#define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \
+ ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
+ ASSERT(((mac_srs)->srs_type & SRST_TX) || \
+ MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \
+ if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) { \
+ (mac_srs)->srs_bw->mac_bw_curr_time = lbolt; \
+ (mac_srs)->srs_bw->mac_bw_used = 0; \
+ if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \
+ (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
+ } \
+}
+
+/*
+ * MAC_SRS_WORKER_WAKEUP
+ *
+ * Wake up the SRS worker thread to process the queue, as long as
+ * no one else is processing the queue. If we are optimizing for
+ * latency, we wake up the worker thread immediately; otherwise we
+ * wait mac_srs_worker_wakeup_ticks before the worker thread is
+ * woken up.
+ */
+int mac_srs_worker_wakeup_ticks = 0;
+#define MAC_SRS_WORKER_WAKEUP(mac_srs) { \
+ ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
+ if (!((mac_srs)->srs_state & SRS_PROC) && \
+ (mac_srs)->srs_tid == NULL) { \
+ if (mac_latency_optimize || \
+ (mac_srs_worker_wakeup_ticks == 0)) \
+ cv_signal(&(mac_srs)->srs_async); \
+ else \
+ (mac_srs)->srs_tid = \
+ timeout(mac_srs_fire, (mac_srs), \
+ mac_srs_worker_wakeup_ticks); \
+ } \
+}
+
+#define TX_SINGLE_RING_MODE(mac_srs) \
+ ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \
+ (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \
+ (mac_srs)->srs_tx.st_mode == SRS_TX_BW)
+
+#define TX_BANDWIDTH_MODE(mac_srs) \
+ ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \
+ (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)
+
+#define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \
+ uint_t hash, indx; \
+ hash = HASH_HINT(hint); \
+ indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \
+ softring = mac_srs->srs_oth_soft_rings[indx]; \
+ (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \
+}
+
+/*
+ * MAC_TX_SRS_BLOCK
+ *
+ * Always called from the mac_tx_srs_drain() function. SRS_TX_BLOCKED
+ * will be set only if st_woken_up is B_FALSE. If st_woken_up is
+ * B_TRUE, it indicates that the wakeup arrived before we grabbed
+ * srs_lock to set SRS_TX_BLOCKED. We then need to attempt to
+ * transmit again, and leaving SRS_TX_BLOCKED clear does exactly
+ * that.
+ */
+#define MAC_TX_SRS_BLOCK(srs, mp) { \
+ ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \
+ if ((srs)->srs_tx.st_woken_up) { \
+ (srs)->srs_tx.st_woken_up = B_FALSE; \
+ } else { \
+ ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \
+ (srs)->srs_state |= SRS_TX_BLOCKED; \
+ (srs)->srs_tx.st_blocked_cnt++; \
+ } \
+}
+
+/*
+ * MAC_TX_SRS_TEST_HIWAT
+ *
+ * Called before queueing a packet onto a Tx SRS to test and set
+ * SRS_TX_HIWAT if srs_count exceeds st_hiwat.
+ */
+#define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \
+ boolean_t enqueue = 1; \
+ \
+ if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \
+ /* \
+ * flow-controlled. Store srs in cookie so that it \
+ * can be returned as mac_tx_cookie_t to client \
+ */ \
+ (srs)->srs_state |= SRS_TX_HIWAT; \
+ cookie = (mac_tx_cookie_t)srs; \
+ (srs)->srs_tx.st_hiwat_cnt++; \
+ if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \
+ /* increment freed stats */ \
+ (srs)->srs_tx.st_drop_count += cnt; \
+ /* \
+ * b_prev may be set to the fanout hint \
+ * hence can't use freemsg directly \
+ */ \
+ mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \
+ DTRACE_PROBE1(tx_queued_hiwat, \
+ mac_soft_ring_set_t *, srs); \
+ enqueue = 0; \
+ } \
+ } \
+ if (enqueue) \
+ MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \
+}
+
+/* Some utility macros */
+#define MAC_SRS_BW_LOCK(srs) \
+ if (!(srs->srs_type & SRST_TX)) \
+ mutex_enter(&srs->srs_bw->mac_bw_lock);
+
+#define MAC_SRS_BW_UNLOCK(srs) \
+ if (!(srs->srs_type & SRST_TX)) \
+ mutex_exit(&srs->srs_bw->mac_bw_lock);
+
+#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
+ mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
+ /* increment freed stats */ \
+ mac_srs->srs_tx.st_drop_count++; \
+ cookie = (mac_tx_cookie_t)srs; \
+}
+
+#define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
+ mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \
+ cookie = (mac_tx_cookie_t)srs; \
+ *ret_mp = mp_chain; \
+}
+
+/*
+ * Drop the Rx packet; the caller advances to the next one in the chain.
+ */
+static void
+mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
+{
+ mac_srs_rx_t *srs_rx = &srs->srs_rx;
+
+ ASSERT(mp->b_next == NULL);
+ mutex_enter(&srs->srs_lock);
+ MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
+ MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
+ mutex_exit(&srs->srs_lock);
+
+ srs_rx->sr_drop_count++;
+ freemsg(mp);
+}
+
+/* DATAPATH RUNTIME ROUTINES */
+
+/*
+ * mac_srs_fire
+ *
+ * Timer callback routine for waking up the SRS worker thread.
+ */
+static void
+mac_srs_fire(void *arg)
+{
+ mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (mac_srs->srs_tid == 0) {
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+
+ mac_srs->srs_tid = 0;
+ if (!(mac_srs->srs_state & SRS_PROC))
+ cv_signal(&mac_srs->srs_async);
+
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * 'hint' is the fanout hint (a uint64_t) supplied by the TCP/IP stack;
+ * it is used on the Tx path.
+ */
+#define HASH_HINT(hint) (((hint) << 17) | ((hint) >> 16))
+
+/*
+ * Hash based on the source address and the port information.
+ */
+#define HASH_ADDR(src, ports) \
+ (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
+ ((ports) >> 8) ^ (ports))
+
+#define COMPUTE_INDEX(key, sz) (key % sz)
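+
+/*
+ * Worked example of the fanout computation above: HASH_ADDR() XORs the
+ * (byte-order adjusted) source address with the 32-bit local/remote port
+ * word shifted right by 24, 16 and 8 bits as well as unshifted, and
+ * COMPUTE_INDEX() then reduces the hash modulo the softring count. With
+ * 4 softrings, every packet of a given connection produces the same
+ * (hash % 4) and therefore stays ordered on a single softring.
+ */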
+
+#define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
+ if ((tail) != NULL) { \
+ ASSERT((tail)->b_next == NULL); \
+ (tail)->b_next = (mp); \
+ } else { \
+ ASSERT((head) == NULL); \
+ (head) = (mp); \
+ } \
+ (tail) = (mp); \
+ (cnt)++; \
+ if ((bw_ctl)) \
+ (sz) += (sz0); \
+}
+
+#define MAC_FANOUT_DEFAULT 0
+#define MAC_FANOUT_RND_ROBIN 1
+int mac_fanout_type = MAC_FANOUT_DEFAULT;
+
+#define MAX_SR_TYPES 3
+/* fanout types for port based hashing */
+enum pkt_type {
+ V4_TCP = 0,
+ V4_UDP,
+ OTH,
+ UNDEF
+};
+
+/*
+ * In general we do port based hashing to spread traffic over the different
+ * softrings. The tunable below allows that behavior to be overridden:
+ * setting it to B_TRUE causes fanout based on the source IPv6 address. The
+ * same fanout is also applied to IPv6 packets carrying multiple optional
+ * headers and other uncommon packet types.
+ */
+boolean_t mac_src_ipv6_fanout = B_FALSE;
+
+/*
+ * Pair of local and remote ports in the transport header
+ */
+#define PORTS_SIZE 4
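+
+/*
+ * Illustrative layout: the 4 bytes at the start of the transport header
+ * are read as a single uint32_t spanning both 16-bit ports, effectively:
+ *
+ *	struct {
+ *		uint16_t	src_port;
+ *		uint16_t	dst_port;
+ *	};
+ *
+ * which is why PORTS_SIZE is 4 and one *(uint32_t *) dereference is
+ * enough in the fanout routines below.
+ */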
+
+/*
+ * mac_rx_srs_proto_fanout
+ *
+ * This routine delivers packets destined for an SRS into one of the
+ * protocol soft rings.
+ *
+ * Given a chain of packets, we need to split it up into multiple sub-chains,
+ * each destined for the TCP, UDP or OTH soft ring. Instead of entering
+ * the soft ring one packet at a time, we want to enter it in the form of a
+ * chain; otherwise we get a start/stop behaviour where the worker thread
+ * goes to sleep and then the next packet comes in, forcing it to wake up, etc.
+ */
+static void
+mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
+{
+ struct ether_header *ehp;
+ uint16_t etype;
+ ipha_t *ipha;
+ mac_soft_ring_t *softring;
+ size_t ether_hlen;
+ mblk_t *mp;
+ mblk_t *headmp[MAX_SR_TYPES];
+ mblk_t *tailmp[MAX_SR_TYPES];
+ int cnt[MAX_SR_TYPES];
+ size_t sz[MAX_SR_TYPES];
+ size_t sz1;
+ boolean_t bw_ctl = B_FALSE;
+ boolean_t hw_classified;
+ boolean_t dls_bypass = B_TRUE;
+ enum pkt_type type;
+ mac_client_impl_t *mcip = mac_srs->srs_mcip;
+ struct ether_vlan_header *evhp;
+
+ if (mac_srs->srs_type & SRST_BW_CONTROL)
+ bw_ctl = B_TRUE;
+
+ /*
+	 * If we don't have an Rx ring, S/W classification has already done
+	 * its job and it's a packet meant for us. If we were polling on
+	 * the default ring (i.e. there was a ring assigned to this SRS),
+	 * then we need to make sure that the MAC address really belongs
+	 * to us.
+ */
+ hw_classified = mac_srs->srs_ring != NULL &&
+ mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
+
+ /*
+	 * Special clients (e.g. VLAN, non-Ethernet, etc.) need DLS
+ * processing in the Rx path. SRST_DLS_BYPASS will be clear for
+ * such SRSs.
+ */
+ if (!(mac_srs->srs_type & SRST_DLS_BYPASS))
+ dls_bypass = B_FALSE;
+
+ bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
+ bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
+ bzero(cnt, MAX_SR_TYPES * sizeof (int));
+ bzero(sz, MAX_SR_TYPES * sizeof (size_t));
+
+ /*
+	 * We got a chain from the SRS that we need to send to the soft rings.
+	 * Since the squeues for the TCP & IPv4 sap poll their soft rings (for
+	 * performance reasons), we need to separate out v4_tcp and v4_udp;
+	 * the rest goes into other.
+ */
+ while (head != NULL) {
+ mp = head;
+ head = head->b_next;
+ mp->b_next = NULL;
+
+ type = OTH;
+ sz1 = msgdsize(mp);
+
+ if (!dls_bypass) {
+ mac_impl_t *mip = mcip->mci_mip;
+
+ ehp = (struct ether_header *)mp->b_rptr;
+
+ /*
+ * For VLAN packets, if the VLAN id doesn't belong
+ * to this client, we drop the packet.
+ */
+ if (mip->mi_info.mi_nativemedia == DL_ETHER &&
+ ntohs(ehp->ether_type) == VLAN_TPID) {
+ /*
+ * LINTED: cast may result in improper
+ * alignment
+ */
+ evhp = (struct ether_vlan_header *)ehp;
+ if (!mac_client_check_flow_vid(mcip,
+ VLAN_ID(ntohs(evhp->ether_tci)))) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+ }
+ FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
+ cnt[type], bw_ctl, sz[type], sz1, mp);
+ continue;
+ }
+
+ /*
+ * At this point we can be sure the packet at least
+ * has an ether header.
+ */
+ if (sz1 < sizeof (struct ether_header)) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+ /* LINTED: cast may result in improper alignment */
+ ehp = (struct ether_header *)mp->b_rptr;
+
+ /*
+ * Determine if this is a VLAN or non-VLAN packet.
+ */
+ if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) {
+ /* LINTED: cast may result in improper alignment */
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ etype = ntohs(evhp->ether_type);
+ ether_hlen = sizeof (struct ether_vlan_header);
+ /*
+ * Check if the VID of the packet, if any, belongs
+ * to this client.
+ */
+ if (!mac_client_check_flow_vid(mcip,
+ VLAN_ID(ntohs(evhp->ether_tci)))) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+ } else {
+ ether_hlen = sizeof (struct ether_header);
+ }
+
+ if (etype == ETHERTYPE_IP) {
+ /*
+ * If we are H/W classified, but we have promisc
+ * on, then we need to check for the unicast address.
+ */
+ if (hw_classified && mcip->mci_promisc_list != NULL) {
+ mac_address_t *map;
+
+ rw_enter(&mcip->mci_rw_lock, RW_READER);
+ map = mcip->mci_unicast;
+ if (bcmp(&ehp->ether_dhost, map->ma_addr,
+ map->ma_len) == 0)
+ type = UNDEF;
+ rw_exit(&mcip->mci_rw_lock);
+ } else if (((((uint8_t *)&ehp->ether_dhost)[0] &
+ 0x01) == 0)) {
+ type = UNDEF;
+ }
+ }
+
+ /*
+ * This needs to become a contract with the driver for
+ * the fast path.
+ *
+ * In the normal case the packet will have at least the L2
+ * header and the IP + Transport header in the same mblk.
+ * This is usually the case when the NIC driver sends up
+ * the packet. This is also true when the stack generates
+ * a packet that is looped back and when the stack uses the
+ * fastpath mechanism. The normal case is optimized for
+ * performance and may bypass DLS. All other cases go through
+ * the 'OTH' type path without DLS bypass.
+ */
+
+ /* LINTED: cast may result in improper alignment */
+ ipha = (ipha_t *)(mp->b_rptr + ether_hlen);
+ if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
+ type = OTH;
+
+ if (type == OTH) {
+ FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
+ cnt[type], bw_ctl, sz[type], sz1, mp);
+ continue;
+ }
+
+ ASSERT(type == UNDEF);
+ /*
+ * We look for at least 4 bytes past the IP header to get
+ * the port information. If we get an IP fragment, we don't
+ * have the port information, and we use just the protocol
+ * information.
+ */
+ switch (ipha->ipha_protocol) {
+ case IPPROTO_TCP:
+ type = V4_TCP;
+ mp->b_rptr += ether_hlen;
+ break;
+ case IPPROTO_UDP:
+ type = V4_UDP;
+ mp->b_rptr += ether_hlen;
+ break;
+ default:
+ type = OTH;
+ break;
+ }
+
+ ASSERT(type != UNDEF);
+
+ FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
+ bw_ctl, sz[type], sz1, mp);
+ }
+
+ for (type = V4_TCP; type < UNDEF; type++) {
+ if (headmp[type] != NULL) {
+ ASSERT(tailmp[type]->b_next == NULL);
+ switch (type) {
+ case V4_TCP:
+ softring = mac_srs->srs_tcp_soft_rings[0];
+ break;
+ case V4_UDP:
+ softring = mac_srs->srs_udp_soft_rings[0];
+ break;
+ case OTH:
+ softring = mac_srs->srs_oth_soft_rings[0];
+ }
+ mac_rx_soft_ring_process(mac_srs->srs_mcip, softring,
+ headmp[type], tailmp[type], cnt[type], sz[type]);
+ }
+ }
+}
+
+int fanout_unalligned = 0;
+
+/*
+ * mac_rx_srs_long_fanout
+ *
+ * The fanout routine for IPv6
+ */
+static int
+mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
+ uint16_t etype, enum pkt_type *type, uint_t *indx)
+{
+ ip6_t *ip6h;
+ uint8_t *whereptr;
+ uint_t hash;
+ uint16_t remlen;
+ uint8_t nexthdr;
+ uint16_t hdr_len;
+
+ if (etype == ETHERTYPE_IPV6) {
+ boolean_t modifiable = B_TRUE;
+
+ ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
+
+ ip6h = (ip6_t *)(mp->b_rptr + sizeof (struct ether_header));
+ if ((unsigned char *)ip6h == mp->b_wptr) {
+ /*
+ * The first mblk_t only includes the ethernet header.
+ * Note that it is safe to change the mp pointer here,
+ * as the subsequent operation does not assume mp
+ * points to the start of the ethernet header.
+ */
+ mp = mp->b_cont;
+
+ /*
+ * Make sure ip6h holds the full ip6_t structure.
+ */
+ if (mp == NULL)
+ return (-1);
+
+ if (MBLKL(mp) < IPV6_HDR_LEN) {
+ modifiable = (DB_REF(mp) == 1);
+
+ if (modifiable &&
+ !pullupmsg(mp, IPV6_HDR_LEN)) {
+ return (-1);
+ }
+ }
+
+ ip6h = (ip6_t *)mp->b_rptr;
+ }
+
+ if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
+ ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
+ /*
+			 * If either ip6h is not aligned, or ip6h does not
+			 * hold the complete ip6_t structure (a pullupmsg()
+			 * is not an option since it would result in an
+			 * unaligned ip6h), fan out to the default ring. Note
+			 * that this may cause packet reordering.
+ */
+ *indx = 0;
+ *type = OTH;
+ fanout_unalligned++;
+ return (0);
+ }
+
+ remlen = ntohs(ip6h->ip6_plen);
+ nexthdr = ip6h->ip6_nxt;
+
+ if (remlen < MIN_EHDR_LEN)
+ return (-1);
+ /*
+		 * Do source based fanout if the mac_src_ipv6_fanout tunable
+		 * is set to B_TRUE, or when mac_ip_hdr_length_v6() fails
+		 * because of malformed packets or because mblks need to be
+		 * concatenated using pullupmsg().
+ */
+ if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h,
+ &hdr_len, &nexthdr)) {
+ goto src_based_fanout;
+ }
+ whereptr = (uint8_t *)ip6h + hdr_len;
+
+		/* If the transport is one of the below, we do port based fanout */
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_SCTP:
+ case IPPROTO_ESP:
+ /*
+			 * If the ports in the transport header are not part
+			 * of this mblk, do src_based_fanout instead of
+			 * calling pullupmsg().
+ */
+ if (mp->b_cont != NULL &&
+ whereptr + PORTS_SIZE > mp->b_wptr) {
+ goto src_based_fanout;
+ }
+ break;
+ default:
+ break;
+ }
+
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
+ *(uint32_t *)whereptr);
+ *indx = COMPUTE_INDEX(hash,
+ mac_srs->srs_tcp_ring_count);
+ *type = OTH;
+ break;
+
+ case IPPROTO_UDP:
+ case IPPROTO_SCTP:
+ case IPPROTO_ESP:
+ if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
+ hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
+ *(uint32_t *)whereptr);
+ *indx = COMPUTE_INDEX(hash,
+ mac_srs->srs_udp_ring_count);
+ } else {
+ *indx = mac_srs->srs_ind %
+ mac_srs->srs_udp_ring_count;
+ mac_srs->srs_ind++;
+ }
+ *type = OTH;
+ break;
+
+		/* For all other protocols, do source based fanout */
+ default:
+ goto src_based_fanout;
+ }
+ } else {
+ *indx = 0;
+ *type = OTH;
+ }
+ return (0);
+
+src_based_fanout:
+ hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
+ *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
+ *type = OTH;
+ return (0);
+}
+
+/*
+ * mac_rx_srs_fanout
+ *
+ * This routine delivers packets destined for an SRS into a soft ring member
+ * of the set.
+ *
+ * Given a chain of packets, we need to split it up into multiple sub-chains,
+ * each destined for one of the TCP, UDP or OTH soft rings. Instead of
+ * entering the soft ring one packet at a time, we want to enter it in the
+ * form of a chain; otherwise we get a start/stop behaviour where the worker
+ * thread goes to sleep and then the next packet comes in, forcing it to wake
+ * up, etc.
+ *
+ * Note:
+ * Since we know the maximum possible fanout, we create a 2D array
+ * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
+ * variables so that we can enter the softrings with chains. We need
+ * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
+ * for each packet would be expensive). If we ever want to have the
+ * ability to have unlimited fanout, we should probably declare a head,
+ * tail, cnt and sz with each soft ring (a data struct which contains a
+ * softring along with these members) and create an array of this uber
+ * struct so we don't have to do kmem_alloc.
+ */
+int fanout_oth1 = 0;
+int fanout_oth2 = 0;
+int fanout_oth3 = 0;
+int fanout_oth4 = 0;
+int fanout_oth5 = 0;
+
+static void
+mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
+{
+ struct ether_header *ehp;
+ uint16_t etype;
+ ipha_t *ipha;
+ uint_t indx;
+ int ports_offset = -1;
+ int ipha_len;
+ uint_t hash;
+ mac_soft_ring_t *softring;
+ size_t ether_hlen;
+ uint16_t frag_offset_flags;
+ mblk_t *mp;
+ mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
+ mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
+ int cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
+ size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT];
+ size_t sz1;
+ boolean_t bw_ctl = B_FALSE;
+ boolean_t hw_classified;
+ boolean_t dls_bypass = B_TRUE;
+ int i;
+ int fanout_cnt;
+ enum pkt_type type;
+ mac_client_impl_t *mcip = mac_srs->srs_mcip;
+ struct ether_vlan_header *evhp;
+
+ if (mac_srs->srs_type & SRST_BW_CONTROL)
+ bw_ctl = B_TRUE;
+
+ /*
+	 * If we don't have an Rx ring, S/W classification has already done
+	 * its job and it's a packet meant for us. If we were polling on
+	 * the default ring (i.e. there was a ring assigned to this SRS),
+	 * then we need to make sure that the MAC address really belongs
+	 * to us.
+ */
+ hw_classified = mac_srs->srs_ring != NULL &&
+ mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
+
+ /*
+	 * Special clients (e.g. VLAN, non-Ethernet, etc.) need DLS
+ * processing in the Rx path. SRST_DLS_BYPASS will be clear for
+ * such SRSs.
+ */
+ if (!(mac_srs->srs_type & SRST_DLS_BYPASS))
+ dls_bypass = B_FALSE;
+
+ /*
+	 * Since the softrings are never destroyed and we always
+	 * create an equal number of softrings for TCP, UDP and the rest,
+	 * it's OK to check one of them for the count and use it without
+	 * any lock. In the future, if soft rings get destroyed because
+	 * of a reduction in fanout, we will need to ensure that happens
+	 * behind the SRS_PROC.
+ */
+ fanout_cnt = mac_srs->srs_tcp_ring_count;
+
+ bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
+ bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
+ bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
+ bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
+
+ /*
+	 * We got a chain from the SRS that we need to send to the soft rings.
+	 * Since the squeues for the TCP & IPv4 sap poll their soft rings (for
+	 * performance reasons), we need to separate out v4_tcp and v4_udp;
+	 * the rest goes into other.
+ */
+ while (head != NULL) {
+ mp = head;
+ head = head->b_next;
+ mp->b_next = NULL;
+
+ type = OTH;
+ sz1 = msgdsize(mp);
+
+ if (!dls_bypass) {
+ mac_impl_t *mip = mcip->mci_mip;
+
+ indx = 0;
+ if (mip->mi_info.mi_nativemedia == DL_ETHER) {
+ ehp = (struct ether_header *)mp->b_rptr;
+ etype = ntohs(ehp->ether_type);
+ /*
+ * For VLAN packets, if the VLAN id doesn't
+ * belong to this client, we drop the packet.
+ */
+ if (etype == VLAN_TPID) {
+ /*
+ * LINTED: cast may result in improper
+ * alignment
+ */
+ evhp = (struct ether_vlan_header *)
+ mp->b_rptr;
+ if (!mac_client_check_flow_vid(mcip,
+ VLAN_ID(ntohs(evhp->ether_tci)))) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+ }
+ if (mac_rx_srs_long_fanout(mac_srs, mp, etype,
+ &type, &indx) == -1) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+ }
+
+ FANOUT_ENQUEUE_MP(headmp[type][indx],
+ tailmp[type][indx], cnt[type][indx], bw_ctl,
+ sz[type][indx], sz1, mp);
+ continue;
+ }
+
+ /*
+ * At this point we can be sure the packet at least
+ * has an ether header. On the outbound side, GLD/stack
+ * ensure this. On the inbound side, the driver needs
+ * to ensure this.
+ */
+ if (sz1 < sizeof (struct ether_header)) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+ /* LINTED: cast may result in improper alignment */
+ ehp = (struct ether_header *)mp->b_rptr;
+
+ /*
+ * Determine if this is a VLAN or non-VLAN packet.
+ */
+ if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) {
+ /* LINTED: cast may result in improper alignment */
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ etype = ntohs(evhp->ether_type);
+ ether_hlen = sizeof (struct ether_vlan_header);
+ /*
+ * Check if the VID of the packet, if any, belongs
+ * to this client.
+ */
+ if (!mac_client_check_flow_vid(mcip,
+ VLAN_ID(ntohs(evhp->ether_tci)))) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+ } else {
+ ether_hlen = sizeof (struct ether_header);
+ }
+
+ /*
+ * If we are using the default Rx ring where H/W or S/W
+ * classification has not happened, we need to verify if
+ * this unicast packet really belongs to us.
+ */
+ if (etype == ETHERTYPE_IP) {
+ /*
+ * If we are H/W classified, but we have promisc
+ * on, then we need to check for the unicast address.
+ */
+ if (hw_classified && mcip->mci_promisc_list != NULL) {
+ mac_address_t *map;
+
+ rw_enter(&mcip->mci_rw_lock, RW_READER);
+ map = mcip->mci_unicast;
+ if (bcmp(&ehp->ether_dhost, map->ma_addr,
+ map->ma_len) == 0)
+ type = UNDEF;
+ rw_exit(&mcip->mci_rw_lock);
+ } else if (((((uint8_t *)&ehp->ether_dhost)[0] &
+ 0x01) == 0)) {
+ type = UNDEF;
+ }
+ }
+
+ /*
+ * This needs to become a contract with the driver for
+ * the fast path.
+ */
+
+ /* LINTED: cast may result in improper alignment */
+ ipha = (ipha_t *)(mp->b_rptr + ether_hlen);
+ if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
+ type = OTH;
+ fanout_oth1++;
+ }
+
+ if (type != OTH) {
+ switch (ipha->ipha_protocol) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_SCTP:
+ case IPPROTO_ESP:
+ ipha_len = IPH_HDR_LENGTH(ipha);
+ if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
+ mp->b_wptr) {
+ type = OTH;
+ break;
+ }
+ frag_offset_flags =
+ ntohs(ipha->ipha_fragment_offset_and_flags);
+ if ((frag_offset_flags &
+ (IPH_MF | IPH_OFFSET)) != 0) {
+ type = OTH;
+ fanout_oth3++;
+ break;
+ }
+ ports_offset = ether_hlen + ipha_len;
+ break;
+ default:
+ type = OTH;
+ fanout_oth4++;
+ break;
+ }
+ }
+
+ if (type == OTH) {
+ if (mac_rx_srs_long_fanout(mac_srs, mp, etype,
+ &type, &indx) == -1) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+
+ FANOUT_ENQUEUE_MP(headmp[type][indx],
+ tailmp[type][indx], cnt[type][indx], bw_ctl,
+ sz[type][indx], sz1, mp);
+ continue;
+ }
+
+ ASSERT(type == UNDEF);
+
+ /*
+ * XXX-Sunay: We should hold srs_lock since ring_count
+ * below can change. But if we are always called from
+ * mac_rx_srs_drain and SRS_PROC is set, then we can
+ * enforce that ring_count can't be changed i.e.
+ * to change fanout type or ring count, the calling
+ * thread needs to be behind SRS_PROC.
+ */
+ switch (ipha->ipha_protocol) {
+ case IPPROTO_TCP:
+ /*
+			 * Note that for ESP, we fan out on the SPI, which is
+			 * at the same offset as the 2x16-bit ports. So it is
+			 * clumped along with TCP, UDP and SCTP.
+ */
+ hash = HASH_ADDR(ipha->ipha_src,
+ *(uint32_t *)(mp->b_rptr + ports_offset));
+ indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
+ type = V4_TCP;
+ mp->b_rptr += ether_hlen;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_SCTP:
+ case IPPROTO_ESP:
+ if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
+ hash = HASH_ADDR(ipha->ipha_src,
+ *(uint32_t *)(mp->b_rptr + ports_offset));
+ indx = COMPUTE_INDEX(hash,
+ mac_srs->srs_udp_ring_count);
+ } else {
+ indx = mac_srs->srs_ind %
+ mac_srs->srs_udp_ring_count;
+ mac_srs->srs_ind++;
+ }
+ type = V4_UDP;
+ mp->b_rptr += ether_hlen;
+ break;
+ }
+
+ ASSERT(type != UNDEF);
+
+ FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
+ cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
+ }
+
+ for (type = V4_TCP; type < UNDEF; type++) {
+ for (i = 0; i < fanout_cnt; i++) {
+ if (headmp[type][i] != NULL) {
+ ASSERT(tailmp[type][i]->b_next == NULL);
+ switch (type) {
+ case V4_TCP:
+ softring =
+ mac_srs->srs_tcp_soft_rings[i];
+ break;
+ case V4_UDP:
+ softring =
+ mac_srs->srs_udp_soft_rings[i];
+ break;
+ case OTH:
+ softring =
+ mac_srs->srs_oth_soft_rings[i];
+ break;
+ }
+ mac_rx_soft_ring_process(mac_srs->srs_mcip,
+ softring, headmp[type][i], tailmp[type][i],
+ cnt[type][i], sz[type][i]);
+ }
+ }
+ }
+}
+
+#define SRS_BYTES_TO_PICKUP 150000
+ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;
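+
+/*
+ * Like other globals in this file, max_bytes_to_pickup can presumably be
+ * patched on a live system for experimentation with mdb(1) in kernel
+ * read/write mode (the value below, 0x493e0 = 300000, is just an example):
+ *
+ *	# echo 'max_bytes_to_pickup/Z 0x493e0' | mdb -kw
+ */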
+
+/*
+ * mac_rx_srs_poll_ring
+ *
+ * The SRS poll thread uses this routine to poll the underlying hardware
+ * Rx ring and pick up a chain of packets. It can process that chain inline
+ * if mac_latency_optimize is set (the default) or signal the SRS worker
+ * thread to do the remaining processing.
+ *
+ * Since packets come into the system via the interrupt or poll path, we also
+ * update the stats and deal with promiscuous clients here.
+ */
+void
+mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
+{
+ kmutex_t *lock = &mac_srs->srs_lock;
+ kcondvar_t *async = &mac_srs->srs_cv;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+ mblk_t *head, *tail, *mp;
+ callb_cpr_t cprinfo;
+ ssize_t bytes_to_pickup;
+ size_t sz;
+ int count;
+ mac_client_impl_t *smcip;
+
+ CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
+ mutex_enter(lock);
+
+start:
+ for (;;) {
+ if (mac_srs->srs_state & SRS_PAUSE)
+ goto done;
+
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(async, lock);
+ CALLB_CPR_SAFE_END(&cprinfo, lock);
+
+ if (mac_srs->srs_state & SRS_PAUSE)
+ goto done;
+
+check_again:
+ if (mac_srs->srs_type & SRST_BW_CONTROL) {
+ /*
+			 * We pick as many bytes as we are allowed to queue.
+			 * It's possible that we will exceed the total
+			 * packets queued in case this SRS is part of an
+			 * Rx ring group, since > 1 poll thread can be pulling
+			 * up to the max allowed packets at the same time,
+			 * but that should be OK.
+ */
+ mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
+ bytes_to_pickup =
+ mac_srs->srs_bw->mac_bw_drop_threshold -
+ mac_srs->srs_bw->mac_bw_sz;
+ /*
+			 * We shouldn't have been signalled if we
+			 * have 0 or fewer bytes to pick up, but since
+			 * some of the byte accounting is driver
+			 * dependent, we do the safety check.
+ */
+ if (bytes_to_pickup < 0)
+ bytes_to_pickup = 0;
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ } else {
+ /*
+			 * TODO: Need to change the polling API
+			 * to add a packet count and a flag which
+			 * tells the driver whether we want packets
+			 * based on a count, or bytes, or all the
+			 * packets queued in the driver/HW. This
+			 * way, we never have to check the limits
+			 * on the poll path. We truly let only as many
+			 * packets enter the system as we are willing
+			 * to process or queue.
+ *
+ * Something along the lines of
+ * pkts_to_pickup = mac_soft_ring_max_q_cnt -
+ * mac_srs->srs_poll_pkt_cnt
+ */
+
+ /*
+ * Since we are not doing B/W control, pick
+ * as many packets as allowed.
+ */
+ bytes_to_pickup = max_bytes_to_pickup;
+ }
+
+ /* Poll the underlying Hardware */
+ mutex_exit(lock);
+ head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
+ mutex_enter(lock);
+
+ ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
+ SRS_POLL_THR_OWNER);
+
+ mp = tail = head;
+ count = 0;
+ sz = 0;
+ while (mp != NULL) {
+ tail = mp;
+ sz += msgdsize(mp);
+ mp = mp->b_next;
+ count++;
+ }
+
+ if (head != NULL) {
+ tail->b_next = NULL;
+ smcip = mac_srs->srs_mcip;
+
+ if ((mac_srs->srs_type & SRST_FLOW) ||
+ (smcip == NULL)) {
+ FLOW_STAT_UPDATE(mac_srs->srs_flent,
+ rbytes, sz);
+ FLOW_STAT_UPDATE(mac_srs->srs_flent,
+ ipackets, count);
+ }
+
+ /*
+ * If there are any promiscuous mode callbacks
+ * defined for this MAC client, pass them a copy
+ * if appropriate and also update the counters.
+ */
+ if (smcip != NULL) {
+ smcip->mci_stat_ibytes += sz;
+ smcip->mci_stat_ipackets += count;
+
+ if (smcip->mci_mip->mi_promisc_list != NULL) {
+ mutex_exit(lock);
+ mac_promisc_dispatch(smcip->mci_mip,
+ head, NULL);
+ mutex_enter(lock);
+ }
+ }
+ if (mac_srs->srs_type & SRST_BW_CONTROL) {
+ mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
+ mac_srs->srs_bw->mac_bw_polled += sz;
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ }
+ srs_rx->sr_poll_count += count;
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
+ count, sz);
+ if (count <= 10)
+ srs_rx->sr_chain_cnt_undr10++;
+ else if (count > 10 && count <= 50)
+ srs_rx->sr_chain_cnt_10to50++;
+ else
+ srs_rx->sr_chain_cnt_over50++;
+ }
+
+ /*
+		 * We are guaranteed that SRS_PROC will be set if we
+		 * are here. Also, the poll thread gets to run only if
+		 * the drain was being done by a worker thread, although
+		 * it's possible that the worker thread is still running
+		 * and the poll thread was sent down to keep the pipeline
+		 * going instead of doing a complete drain and then
+		 * trying to poll the NIC.
+		 *
+		 * So we need to check the SRS_WORKER flag to make sure
+		 * that the worker thread is not processing the queue
+		 * in parallel with us. The flags and conditions are
+		 * protected by the srs_lock to prevent any race. We
+		 * ensure that we don't drop the srs_lock from now
+		 * till the end, and similarly we don't drop the srs_lock
+		 * in mac_rx_srs_drain() till the similar condition checks
+		 * are complete. mac_rx_srs_drain() needs to ensure
+		 * that the SRS_WORKER flag remains set as long as it's
+		 * processing the queue.
+ */
+ if (!(mac_srs->srs_state & SRS_WORKER) &&
+ (mac_srs->srs_first != NULL)) {
+ /*
+			 * We have packets to process and the worker thread
+			 * is not running. Check to see if the poll thread is
+			 * allowed to process; let it do the processing only
+			 * if it picked up some packets from the NIC, otherwise
+			 * wake up the worker thread.
+ */
+ if ((mac_srs->srs_state & SRS_LATENCY_OPT) &&
+ (head != NULL)) {
+ mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
+ if (srs_rx->sr_poll_pkt_cnt <=
+ srs_rx->sr_lowat) {
+ srs_rx->sr_poll_again++;
+ goto check_again;
+ } else {
+ /*
+					 * We are already above the low water
+					 * mark, so stay in polling mode but
+					 * don't poll. Once we dip below
+					 * the polling threshold, the processing
+					 * thread (soft ring) will signal us
+					 * to poll again (MAC_UPDATE_SRS_COUNT)
+ */
+ srs_rx->sr_poll_drain_no_poll++;
+ mac_srs->srs_state &=
+ ~(SRS_PROC|SRS_GET_PKTS);
+ /*
+					 * In the B/W control case, it's
+					 * possible that the backlog built up
+					 * because the B/W limit was reached
+					 * and packets are queued only in the
+					 * SRS. In this case, we should
+					 * schedule the worker thread
+ * since no one else will wake us up.
+ */
+ if ((mac_srs->srs_type &
+ SRST_BW_CONTROL) &&
+ (mac_srs->srs_tid == NULL)) {
+ mac_srs->srs_tid =
+ timeout(mac_srs_fire,
+ mac_srs, 1);
+ srs_rx->sr_poll_worker_wakeup++;
+ }
+ }
+ } else {
+ /*
+				 * Wake up the worker thread for more processing.
+ * We optimize for throughput in this case.
+ */
+ mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
+ MAC_SRS_WORKER_WAKEUP(mac_srs);
+ srs_rx->sr_poll_sig_worker++;
+ }
+ } else if ((mac_srs->srs_first == NULL) &&
+ !(mac_srs->srs_state & SRS_WORKER)) {
+ /*
+			 * There is nothing queued in the SRS and
+			 * no worker thread is running. Plus we
+			 * didn't get anything from the H/W
+			 * either (head == NULL).
+ */
+ ASSERT(head == NULL);
+ mac_srs->srs_state &=
+ ~(SRS_PROC|SRS_GET_PKTS);
+
+ /*
+			 * If we have packets in the soft rings, don't allow
+			 * more packets to come into this SRS by keeping the
+			 * interrupts off but not polling the H/W. The
+			 * poll thread will get signalled as soon as
+			 * sr_poll_pkt_cnt dips below the poll threshold.
+ */
+ if (srs_rx->sr_poll_pkt_cnt == 0) {
+ srs_rx->sr_poll_intr_enable++;
+ MAC_SRS_POLLING_OFF(mac_srs);
+ } else {
+ /*
+				 * We know nothing is queued in the SRS
+				 * since we are here after checking that
+				 * srs_first is NULL. The backlog
+				 * is entirely due to packets queued
+				 * in the soft rings, which will wake us up
+ * and get the interface out of polling
+ * mode once the backlog dips below
+ * sr_poll_thres.
+ */
+ srs_rx->sr_poll_no_poll++;
+ }
+ } else {
+ /*
+			 * The worker thread is already running, so
+			 * there is nothing much to do. If polling
+			 * was enabled, the worker thread will deal
+			 * with it.
+ */
+ mac_srs->srs_state &= ~SRS_GET_PKTS;
+ srs_rx->sr_poll_goto_sleep++;
+ }
+ }
+done:
+ mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
+ cv_signal(&mac_srs->srs_async);
+ /*
+	 * If this is a temporary quiesce, then wait for the restart signal
+ * from the srs worker. Then clear the flags and signal the srs worker
+ * to ensure a positive handshake and go back to start.
+ */
+ while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
+ cv_wait(async, lock);
+ if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
+ ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
+ mac_srs->srs_state &=
+ ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
+ cv_signal(&mac_srs->srs_async);
+ goto start;
+ } else {
+ mac_srs->srs_state |= SRS_POLL_THR_EXITED;
+ cv_signal(&mac_srs->srs_async);
+ CALLB_CPR_EXIT(&cprinfo);
+ thread_exit();
+ }
+}
+
+/*
+ * mac_srs_pick_chain
+ *
+ * In the bandwidth control case, check how many packets can be processed
+ * and return them in a sub-chain.
+ */
+static mblk_t *
+mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
+ size_t *chain_sz, int *chain_cnt)
+{
+ mblk_t *head = NULL;
+ mblk_t *tail = NULL;
+ size_t sz;
+ size_t tsz = 0;
+ int cnt = 0;
+ mblk_t *mp;
+
+ ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
+ mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
+ if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
+ mac_srs->srs_bw->mac_bw_limit) ||
+ (mac_srs->srs_bw->mac_bw_limit == 0)) {
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ head = mac_srs->srs_first;
+ mac_srs->srs_first = NULL;
+ *chain_tail = mac_srs->srs_last;
+ mac_srs->srs_last = NULL;
+ *chain_sz = mac_srs->srs_size;
+ *chain_cnt = mac_srs->srs_count;
+ mac_srs->srs_count = 0;
+ mac_srs->srs_size = 0;
+ return (head);
+ }
+
+ /*
+ * Can't clear the entire backlog.
+	 * Need to find out how many packets to pick.
+ */
+ ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
+ while ((mp = mac_srs->srs_first) != NULL) {
+ sz = msgdsize(mp);
+ if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
+ mac_srs->srs_bw->mac_bw_limit) {
+ if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
+ mac_srs->srs_bw->mac_bw_state |=
+ SRS_BW_ENFORCED;
+ break;
+ }
+
+ /*
+		 * The size and count are decremented by the softrings
+		 * when they send the packet up, so that polling works
+		 * properly.
+ */
+ tsz += sz;
+ cnt++;
+ mac_srs->srs_count--;
+ mac_srs->srs_size -= sz;
+ if (tail != NULL)
+ tail->b_next = mp;
+ else
+ head = mp;
+ tail = mp;
+ mac_srs->srs_first = mac_srs->srs_first->b_next;
+ }
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ if (mac_srs->srs_first == NULL)
+ mac_srs->srs_last = NULL;
+
+ if (tail != NULL)
+ tail->b_next = NULL;
+ *chain_tail = tail;
+ *chain_cnt = cnt;
+ *chain_sz = tsz;
+
+ return (head);
+}
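+
+/*
+ * Worked example of the accounting above (values are only illustrative,
+ * assuming 1000 ticks/sec): with mac_bw_limit at 12500 bytes per tick
+ * (roughly 100 Mbps) and mac_bw_used already at 10000 for the current
+ * tick, a queue of 1500-byte packets yields exactly one packet picked
+ * (0 + 1500 + 10000 <= 12500, but 1500 + 1500 + 10000 exceeds it), after
+ * which SRS_BW_ENFORCED is set and the rest stays queued for a later tick.
+ */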
+
+/*
+ * mac_rx_srs_drain
+ *
+ * The SRS drain routine. Gets to run to clear the queue. Any thread
+ * (worker, interrupt, poll) can call this based on the processing model.
+ * The first thing we do is disable interrupts if possible and then
+ * drain the queue. We also try to poll the underlying hardware if
+ * there is a dedicated hardware Rx ring assigned to this SRS.
+ *
+ * There is an equivalent drain routine for bandwidth control mode,
+ * mac_rx_srs_drain_bw. There is some code duplication between the two
+ * routines, but they are highly performance sensitive and are easier
+ * to read/debug if they stay separate. Any code changes here might
+ * also apply to mac_rx_srs_drain_bw as well.
+ */
+void
+mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
+{
+ mblk_t *head;
+ mblk_t *tail;
+ timeout_id_t tid;
+ int cnt = 0;
+ mac_client_impl_t *mcip = mac_srs->srs_mcip;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+
+ ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
+ ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
+again:
+ /* If we are blanked i.e. can't do upcalls, then we are done */
+ if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
+ ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
+ (mac_srs->srs_state & SRS_PAUSE));
+ goto out;
+ }
+
+ if (mac_srs->srs_first == NULL)
+ goto out;
+
+ head = mac_srs->srs_first;
+ mac_srs->srs_first = NULL;
+ tail = mac_srs->srs_last;
+ mac_srs->srs_last = NULL;
+ cnt = mac_srs->srs_count;
+ mac_srs->srs_count = 0;
+
+ ASSERT(head != NULL);
+ ASSERT(tail != NULL);
+
+ if ((tid = mac_srs->srs_tid) != 0)
+ mac_srs->srs_tid = 0;
+
+ mac_srs->srs_state |= (SRS_PROC|proc_type);
+
+ /* Switch to polling mode */
+ MAC_SRS_WORKER_POLLING_ON(mac_srs);
+ if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
+ MAC_SRS_POLL_RING(mac_srs);
+ /*
+ * mcip is NULL for broadcast and multicast flows. The promisc
+ * callbacks for broadcast and multicast packets are delivered from
+	 * mac_rx(), so we don't need to worry about that case in this path.
+ */
+ if (mcip != NULL && mcip->mci_promisc_list != NULL) {
+ mutex_exit(&mac_srs->srs_lock);
+ mac_promisc_client_dispatch(mcip, head);
+ mutex_enter(&mac_srs->srs_lock);
+ }
+
+ /*
+	 * Check if the SRS itself is doing the processing.
+	 * This direct path does not apply when subflows are present; in that
+	 * case, packets need to be dispatched to a soft ring according to the
+	 * flow's bandwidth and other resource constraints.
+ */
+ if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
+ mac_direct_rx_t proc;
+ void *arg1;
+ mac_resource_handle_t arg2;
+
+ /*
+		 * This is the case when an Rx ring is directly
+ * assigned and we have a fully classified
+ * protocol chain. We can deal with it in
+ * one shot.
+ */
+ proc = srs_rx->sr_func;
+ arg1 = srs_rx->sr_arg1;
+ arg2 = srs_rx->sr_arg2;
+
+ mac_srs->srs_state |= SRS_CLIENT_PROC;
+ mutex_exit(&mac_srs->srs_lock);
+ if (tid != 0) {
+ (void) untimeout(tid);
+ tid = 0;
+ }
+
+ proc(arg1, arg2, head, NULL);
+ /*
+		 * Decrement the size and count right here
+		 * since the packets have been processed.
+ */
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+ if (mac_srs->srs_state & SRS_CLIENT_WAIT)
+ cv_signal(&mac_srs->srs_client_cv);
+ mac_srs->srs_state &= ~SRS_CLIENT_PROC;
+ } else {
+ /* Some kind of softrings based fanout is required */
+ mutex_exit(&mac_srs->srs_lock);
+ if (tid != 0) {
+ (void) untimeout(tid);
+ tid = 0;
+ }
+
+ /*
+ * Since the fanout routines can deal with chains,
+ * shoot the entire chain up.
+ */
+ if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
+ mac_rx_srs_fanout(mac_srs, head);
+ else
+ mac_rx_srs_proto_fanout(mac_srs, head);
+ mutex_enter(&mac_srs->srs_lock);
+ }
+
+ /*
+	 * Send the poll thread to pick up any packets that arrived
+	 * so far. This also serves as the last check in case
+	 * nothing else is queued in the SRS. The poll thread
+	 * is signalled only when the drain was done
+	 * by the worker thread and SRS_WORKER is set. The
+	 * worker thread can run in parallel as long as the
+	 * SRS_WORKER flag is set. When we have nothing else to
+	 * process, we can exit while leaving SRS_PROC set,
+	 * which gives the poll thread control to process and
+	 * clean up once it returns from the NIC.
+ *
+ * If we have nothing else to process, we need to
+ * ensure that we keep holding the srs_lock till
+ * all the checks below are done and control is
+ * handed to the poll thread if it was running.
+ */
+ if (mac_srs->srs_first != NULL) {
+ if (proc_type == SRS_WORKER) {
+ if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
+ MAC_SRS_POLL_RING(mac_srs);
+ srs_rx->sr_drain_again++;
+ goto again;
+ } else {
+ srs_rx->sr_drain_worker_sig++;
+ cv_signal(&mac_srs->srs_async);
+ }
+ }
+
+out:
+
+ if (mac_srs->srs_state & SRS_GET_PKTS) {
+ /*
+		 * The poll thread is already running. Leave
+		 * SRS_PROC set and hand over control to the
+		 * poll thread.
+ */
+ mac_srs->srs_state &= ~proc_type;
+ srs_rx->sr_drain_poll_running++;
+ return;
+ }
+
+ /*
+	 * Even if there are no packets queued in the SRS, we
+	 * need to make sure that the shared counter is
+	 * clear and any associated softrings have cleared
+	 * all the backlog. Otherwise, leave the interface
+	 * in polling mode and the poll thread will get
+	 * signalled once the count goes down to zero.
+	 *
+	 * If someone is already draining the queue (SRS_PROC is
+	 * set) when sr_poll_pkt_cnt goes down to zero,
+	 * then it means that drain is already running and we
+	 * will turn off polling at that time if there is
+	 * no backlog.
+	 *
+	 * As long as there are packets queued either
+	 * in the soft ring set or its soft rings, we will leave
+	 * the interface in polling mode (even if the drain
+	 * was done by the interrupt thread). We signal
+	 * the poll thread as well if we have dipped below
+	 * the low water mark.
+	 *
+	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
+	 * since that turns polling on only for the worker thread.
+	 * It's not worth turning polling on for the interrupt
+	 * thread (since the NIC will not issue another interrupt)
+	 * unless a backlog builds up.
+ */
+ if ((srs_rx->sr_poll_pkt_cnt > 0) &&
+ (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
+ mac_srs->srs_state &= ~(SRS_PROC|proc_type);
+ srs_rx->sr_drain_keep_polling++;
+ MAC_SRS_POLLING_ON(mac_srs);
+ if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
+ MAC_SRS_POLL_RING(mac_srs);
+ return;
+ }
+
+ /* Nothing else to do. Get out of poll mode */
+ MAC_SRS_POLLING_OFF(mac_srs);
+ mac_srs->srs_state &= ~(SRS_PROC|proc_type);
+ srs_rx->sr_drain_finish_intr++;
+}
+
+/*
+ * mac_rx_srs_drain_bw
+ *
+ * The SRS B/W drain routine. Gets to run to clear the queue. Any thread
+ * (worker, interrupt, poll) can call this based on the processing model.
+ * The first thing we do is disable interrupts if possible and then
+ * drain the queue. We also try to poll the underlying hardware if
+ * there is a dedicated hardware Rx ring assigned to this SRS.
+ *
+ * There is an equivalent drain routine for non-bandwidth-control mode,
+ * mac_rx_srs_drain. There is some code duplication between the two
+ * routines, but they are highly performance sensitive and are easier
+ * to read/debug if they stay separate. Any code changes here might
+ * also apply to mac_rx_srs_drain as well.
+ */
+void
+mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
+{
+ mblk_t *head;
+ mblk_t *tail;
+ timeout_id_t tid;
+ size_t sz = 0;
+ int cnt = 0;
+ mac_client_impl_t *mcip = mac_srs->srs_mcip;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+
+ ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
+ ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
+again:
+ /* Check if we are doing B/W control */
+ mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = 0;
+ if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
+ mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
+ } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ goto done;
+ } else if (mac_srs->srs_bw->mac_bw_used >
+ mac_srs->srs_bw->mac_bw_limit) {
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ goto done;
+ }
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+
+	/* If we are blanked, i.e. can't do upcalls, then we are done */
+ if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
+ ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
+ (mac_srs->srs_state & SRS_PAUSE));
+ goto done;
+ }
+
+ sz = 0;
+ cnt = 0;
+ if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
+ /*
+ * We couldn't pick up a single packet.
+ */
+ mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
+ if ((mac_srs->srs_bw->mac_bw_used == 0) &&
+ (mac_srs->srs_size != 0) &&
+ !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
+ /*
+ * Seems like configured B/W doesn't
+ * even allow processing of 1 packet
+ * per tick.
+ *
+ * XXX: raise the limit to processing
+ * at least 1 packet per tick.
+ */
+ mac_srs->srs_bw->mac_bw_limit +=
+ mac_srs->srs_bw->mac_bw_limit;
+ mac_srs->srs_bw->mac_bw_drop_threshold +=
+ mac_srs->srs_bw->mac_bw_drop_threshold;
+ cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
+ "raised B/W limit to %d since not even a "
+ "single packet can be processed per "
+ "tick %d\n", (void *)mac_srs,
+ (int)mac_srs->srs_bw->mac_bw_limit,
+ (int)msgdsize(mac_srs->srs_first));
+ }
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ goto done;
+ }
+
+ ASSERT(head != NULL);
+ ASSERT(tail != NULL);
+
+ /* zero bandwidth: drop all and return to interrupt mode */
+ mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
+ if (mac_srs->srs_bw->mac_bw_limit == 0) {
+ srs_rx->sr_drop_count += cnt;
+ ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
+ mac_srs->srs_bw->mac_bw_sz -= sz;
+ mac_srs->srs_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ mac_pkt_drop(NULL, NULL, head, B_FALSE);
+ goto leave_poll;
+ } else {
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ }
+
+ /*
+ * We can continue processing the queue.
+ * We need to figure out if there is a fanout needed or
+ * we can just process this here.
+ */
+
+ if ((tid = mac_srs->srs_tid) != 0)
+ mac_srs->srs_tid = 0;
+
+ mac_srs->srs_state |= (SRS_PROC|proc_type);
+ MAC_SRS_WORKER_POLLING_ON(mac_srs);
+
+ /*
+ * mcip is NULL for broadcast and multicast flows. The promisc
+ * callbacks for broadcast and multicast packets are delivered from
+ * mac_rx() and we don't need to worry about that case in this path
+ */
+ if (mcip != NULL && mcip->mci_promisc_list != NULL) {
+ mutex_exit(&mac_srs->srs_lock);
+ mac_promisc_client_dispatch(mcip, head);
+ mutex_enter(&mac_srs->srs_lock);
+ }
+
+ /*
+ * Check if SRS itself is doing the processing
+ * This direct path does not apply when subflows are present. In this
+ * case, packets need to be dispatched to a soft ring according to the
+	 * flow's bandwidth and other resource constraints.
+ */
+ if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
+ mac_direct_rx_t proc;
+ void *arg1;
+ mac_resource_handle_t arg2;
+
+ /*
+ * This is the case when a Rx is directly
+ * assigned and we have a fully classified
+ * protocol chain. We can deal with it in
+ * one shot.
+ */
+ proc = srs_rx->sr_func;
+ arg1 = srs_rx->sr_arg1;
+ arg2 = srs_rx->sr_arg2;
+
+ mac_srs->srs_state |= SRS_CLIENT_PROC;
+ mutex_exit(&mac_srs->srs_lock);
+ if (tid != 0) {
+ (void) untimeout(tid);
+ tid = 0;
+ }
+
+ proc(arg1, arg2, head, NULL);
+ /*
+		 * Decrement the size and count here itself
+ * since the packet has been processed.
+ */
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+ MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+
+ if (mac_srs->srs_state & SRS_CLIENT_WAIT)
+ cv_signal(&mac_srs->srs_client_cv);
+ mac_srs->srs_state &= ~SRS_CLIENT_PROC;
+ } else {
+ /* Some kind of softrings based fanout is required */
+ mutex_exit(&mac_srs->srs_lock);
+ if (tid != 0) {
+ (void) untimeout(tid);
+ tid = 0;
+ }
+
+ /*
+ * Since the fanout routines can deal with chains,
+ * shoot the entire chain up.
+ */
+ if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
+ mac_rx_srs_fanout(mac_srs, head);
+ else
+ mac_rx_srs_proto_fanout(mac_srs, head);
+ mutex_enter(&mac_srs->srs_lock);
+ }
+
+ /*
+ * Send the poll thread to pick up any packets arrived
+ * so far. This also serves as the last check in case
+ * nothing else is queued in the SRS. The poll thread
+ * is signalled only in the case the drain was done
+ * by the worker thread and SRS_WORKER is set. The
+ * worker thread can run in parallel as long as the
+	 * SRS_WORKER flag is set. If we have nothing else to
+ * process, we can exit while leaving SRS_PROC set
+ * which gives the poll thread control to process and
+ * cleanup once it returns from the NIC.
+ *
+ * If we have nothing else to process, we need to
+ * ensure that we keep holding the srs_lock till
+ * all the checks below are done and control is
+ * handed to the poll thread if it was running.
+ */
+ mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
+ if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
+ if (mac_srs->srs_first != NULL) {
+ if (proc_type == SRS_WORKER) {
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ if (srs_rx->sr_poll_pkt_cnt <=
+ srs_rx->sr_lowat)
+ MAC_SRS_POLL_RING(mac_srs);
+ goto again;
+ } else {
+ cv_signal(&mac_srs->srs_async);
+ }
+ }
+ }
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+
+done:
+
+ if (mac_srs->srs_state & SRS_GET_PKTS) {
+ /*
+ * Poll thread is already running. Leave the
+		 * SRS_PROC set and hand over the control to
+ * poll thread.
+ */
+ mac_srs->srs_state &= ~proc_type;
+ return;
+ }
+
+ /*
+ * If we can't process packets because we have exceeded
+ * B/W limit for this tick, just set the timeout
+ * and leave.
+ *
+ * Even if there are no packets queued in SRS, we
+ * need to make sure that the shared counter is
+ * clear and any associated softrings have cleared
+ * all the backlog. Otherwise, leave the interface
+ * in polling mode and the poll thread will get
+ * signalled once the count goes down to zero.
+ *
+ * If someone is already draining the queue (SRS_PROC is
+ * set) when the srs_poll_pkt_cnt goes down to zero,
+ * then it means that drain is already running and we
+ * will turn off polling at that time if there is
+ * no backlog. As long as there are packets queued either
+	 * in the soft ring set or its soft rings, we will leave
+ * the interface in polling mode.
+ */
+ mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
+ if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
+ ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
+ (srs_rx->sr_poll_pkt_cnt > 0))) {
+ MAC_SRS_POLLING_ON(mac_srs);
+ mac_srs->srs_state &= ~(SRS_PROC|proc_type);
+ if ((mac_srs->srs_first != NULL) &&
+ (mac_srs->srs_tid == NULL))
+ mac_srs->srs_tid = timeout(mac_srs_fire,
+ mac_srs, 1);
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+ return;
+ }
+ mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
+
+leave_poll:
+
+ /* Nothing else to do. Get out of poll mode */
+ MAC_SRS_POLLING_OFF(mac_srs);
+ mac_srs->srs_state &= ~(SRS_PROC|proc_type);
+}
+
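+/*
+ * A minimal, compiled-out sketch of the per-tick accounting used by the
+ * B/W drain path above. It assumes a monotonic tick counter in place of
+ * lbolt; the struct and function names are illustrative only and are not
+ * part of the MAC layer.
+ */
+#if 0
+#include <stdbool.h>
+#include <stddef.h>
+
+struct bw_ctl {
+	long	bw_curr_tick;	/* tick of the current accounting window */
+	size_t	bw_used;	/* bytes accounted during this tick */
+	size_t	bw_limit;	/* bytes allowed per tick */
+	bool	bw_enforced;	/* quota exhausted for this tick */
+};
+
+/* Returns true if 'sz' more bytes may be processed during this tick. */
+static bool
+bw_check(struct bw_ctl *bw, long now_tick, size_t sz)
+{
+	if (bw->bw_curr_tick != now_tick) {
+		/* A new tick: reset usage and lift enforcement. */
+		bw->bw_curr_tick = now_tick;
+		bw->bw_used = 0;
+		bw->bw_enforced = false;
+	} else if (bw->bw_enforced || bw->bw_used > bw->bw_limit) {
+		/* Quota exhausted; the caller must wait for the next tick. */
+		bw->bw_enforced = true;
+		return (false);
+	}
+	bw->bw_used += sz;
+	return (true);
+}
+#endif
+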
+/*
+ * mac_srs_worker
+ *
+ * The SRS worker routine. Drains the queue when no one else is
+ * processing it.
+ */
+void
+mac_srs_worker(mac_soft_ring_set_t *mac_srs)
+{
+ kmutex_t *lock = &mac_srs->srs_lock;
+ kcondvar_t *async = &mac_srs->srs_async;
+ callb_cpr_t cprinfo;
+ boolean_t bw_ctl_flag;
+
+ CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
+ mutex_enter(lock);
+
+start:
+ for (;;) {
+ bw_ctl_flag = B_FALSE;
+ if (mac_srs->srs_type & SRST_BW_CONTROL) {
+ MAC_SRS_BW_LOCK(mac_srs);
+ MAC_SRS_CHECK_BW_CONTROL(mac_srs);
+ if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
+ bw_ctl_flag = B_TRUE;
+ MAC_SRS_BW_UNLOCK(mac_srs);
+ }
+ /*
+ * The SRS_BW_ENFORCED flag may change since we have dropped
+ * the mac_bw_lock. However the drain function can handle both
+ * a drainable SRS or a bandwidth controlled SRS, and the
+ * effect of scheduling a timeout is to wakeup the worker
+ * thread which in turn will call the drain function. Since
+		 * we release the srs_lock atomically only in the cv_wait, there
+		 * isn't a fear of waiting forever.
+ */
+ while (((mac_srs->srs_state & SRS_PROC) ||
+ (mac_srs->srs_first == NULL) || bw_ctl_flag ||
+ (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
+ !(mac_srs->srs_state & SRS_PAUSE)) {
+ /*
+ * If we have packets queued and we are here
+ * because B/W control is in place, we better
+ * schedule the worker wakeup after 1 tick
+ * to see if bandwidth control can be relaxed.
+ */
+ if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
+ /*
+ * We need to ensure that a timer is already
+ * scheduled or we force schedule one for
+ * later so that we can continue processing
+ * after this quanta is over.
+ */
+ mac_srs->srs_tid = timeout(mac_srs_fire,
+ mac_srs, 1);
+ }
+wait:
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(async, lock);
+ CALLB_CPR_SAFE_END(&cprinfo, lock);
+
+ if (mac_srs->srs_state & SRS_PAUSE)
+ goto done;
+ if (mac_srs->srs_state & SRS_PROC)
+ goto wait;
+
+ if (mac_srs->srs_first != NULL &&
+ mac_srs->srs_type & SRST_BW_CONTROL) {
+ MAC_SRS_BW_LOCK(mac_srs);
+ if (mac_srs->srs_bw->mac_bw_state &
+ SRS_BW_ENFORCED) {
+ MAC_SRS_CHECK_BW_CONTROL(mac_srs);
+ }
+ bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
+ SRS_BW_ENFORCED;
+ MAC_SRS_BW_UNLOCK(mac_srs);
+ }
+ }
+
+ if (mac_srs->srs_state & SRS_PAUSE)
+ goto done;
+ mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
+ }
+done:
+ /*
+ * The Rx SRS quiesce logic first cuts off packet supply to the SRS
+ * from both hard and soft classifications and waits for such threads
+ * to finish before signaling the worker. So at this point the only
+ * thread left that could be competing with the worker is the poll
+ * thread. In the case of Tx, there shouldn't be any thread holding
+ * SRS_PROC at this point.
+ */
+ if (!(mac_srs->srs_state & SRS_PROC)) {
+ mac_srs->srs_state |= SRS_PROC;
+ } else {
+ ASSERT((mac_srs->srs_type & SRST_TX) == 0);
+ /*
+ * Poll thread still owns the SRS and is still running
+ */
+ ASSERT((mac_srs->srs_poll_thr == NULL) ||
+ ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
+ SRS_POLL_THR_OWNER));
+ }
+ mac_srs_worker_quiesce(mac_srs);
+ /*
+ * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
+ * of the quiesce operation
+ */
+ while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
+ cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
+
+ if (mac_srs->srs_state & SRS_RESTART) {
+ ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
+ mac_srs_worker_restart(mac_srs);
+ mac_srs->srs_state &= ~SRS_PROC;
+ goto start;
+ }
+
+ if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
+ mac_srs_worker_quiesce(mac_srs);
+
+ mac_srs->srs_state &= ~SRS_PROC;
+ /* The macro drops the srs_lock */
+ CALLB_CPR_EXIT(&cprinfo);
+ thread_exit();
+}
+
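+/*
+ * A minimal, compiled-out pthreads sketch of the worker wait loop above,
+ * assuming a plain mutex/condvar pair in place of the kernel cv/CPR
+ * machinery; all names are illustrative only.
+ */
+#if 0
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+struct worker_srs {
+	pthread_mutex_t	lock;
+	pthread_cond_t	async;
+	bool		proc;	/* another thread is draining */
+	bool		paused;	/* quiesce requested */
+	void		*first;	/* head of the packet queue */
+};
+
+static void
+drain(struct worker_srs *s)
+{
+	s->first = NULL;	/* placeholder: consume the queue */
+}
+
+static void *
+worker(void *arg)
+{
+	struct worker_srs *s = arg;
+
+	pthread_mutex_lock(&s->lock);
+	for (;;) {
+		/* Sleep while the queue is empty or someone else drains. */
+		while ((s->proc || s->first == NULL) && !s->paused)
+			pthread_cond_wait(&s->async, &s->lock);
+		if (s->paused)
+			break;
+		drain(s);	/* called and returns with the lock held */
+	}
+	pthread_mutex_unlock(&s->lock);
+	return (NULL);
+}
+#endif
+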
+/*
+ * mac_rx_srs_subflow_process
+ *
+ * Receive side routine called from interrupt path when there are
+ * sub flows present on this SRS.
+ */
+/* ARGSUSED */
+void
+mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
+ mblk_t *mp_chain, boolean_t loopback)
+{
+ flow_entry_t *flent = NULL;
+ flow_entry_t *prev_flent = NULL;
+ mblk_t *mp = NULL;
+ mblk_t *tail = NULL;
+ mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs;
+ mac_client_impl_t *mcip;
+
+ mcip = mac_srs->srs_mcip;
+ ASSERT(mcip != NULL);
+
+ /*
+ * We need to determine the SRS for every packet
+ * by walking the flow table, if we don't get any,
+ * then we proceed using the SRS we came with.
+ */
+ mp = tail = mp_chain;
+ while (mp != NULL) {
+
+ /*
+		 * We will increment the stats for the matching subflow
+		 * when we get the bytes/pkt count for the classified packets
+ * later in mac_rx_srs_process.
+ */
+ (void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
+ FLOW_INBOUND, &flent);
+
+ if (mp == mp_chain || flent == prev_flent) {
+ if (prev_flent != NULL)
+ FLOW_REFRELE(prev_flent);
+ prev_flent = flent;
+ flent = NULL;
+ tail = mp;
+ mp = mp->b_next;
+ continue;
+ }
+ tail->b_next = NULL;
+ /*
+		 * A null flent indicates that this is for the mac_srs itself.
+ * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
+ */
+ if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
+ mac_rx_srs_process(arg,
+ (mac_resource_handle_t)mac_srs, mp_chain,
+ loopback);
+ } else {
+ (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
+ prev_flent->fe_cb_arg2, mp_chain, loopback);
+ FLOW_REFRELE(prev_flent);
+ }
+ prev_flent = flent;
+ flent = NULL;
+ mp_chain = mp;
+ tail = mp;
+ mp = mp->b_next;
+ }
+ /* Last chain */
+ ASSERT(mp_chain != NULL);
+ if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
+ mac_rx_srs_process(arg,
+ (mac_resource_handle_t)mac_srs, mp_chain, loopback);
+ } else {
+ (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
+ prev_flent->fe_cb_arg2, mp_chain, loopback);
+ FLOW_REFRELE(prev_flent);
+ }
+}
+
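+/*
+ * A minimal, compiled-out sketch of the chain-splitting idea above:
+ * walk a singly linked chain and hand off maximal runs that share the
+ * same classification. The pkt type and the deliver() callback are
+ * illustrative placeholders, not MAC interfaces.
+ */
+#if 0
+#include <stddef.h>
+
+struct pkt {
+	struct pkt	*next;
+	int		klass;	/* classification result */
+};
+
+static void
+deliver(struct pkt *run, int klass)
+{
+	(void) run;		/* placeholder for the per-flow callback */
+	(void) klass;
+}
+
+static void
+split_by_class(struct pkt *chain)
+{
+	while (chain != NULL) {
+		struct pkt *tail = chain;
+		struct pkt *rest;
+
+		/* Extend the run while the classification stays the same. */
+		while (tail->next != NULL &&
+		    tail->next->klass == chain->klass)
+			tail = tail->next;
+
+		rest = tail->next;
+		tail->next = NULL;	/* terminate this run */
+		deliver(chain, chain->klass);
+		chain = rest;
+	}
+}
+#endif
+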
+/*
+ * mac_rx_srs_process
+ *
+ * Receive side routine called from the interrupt path.
+ *
+ * loopback is set to force a context switch on the loopback
+ * path between MAC clients.
+ */
+/* ARGSUSED */
+void
+mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
+ boolean_t loopback)
+{
+ mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs;
+ mblk_t *mp, *tail, *head;
+ int count = 0;
+ int count1;
+ size_t sz = 0;
+ size_t chain_sz, sz1;
+ mac_bw_ctl_t *mac_bw;
+ mac_client_impl_t *smcip;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+
+ /*
+ * Set the tail, count and sz. We set the sz irrespective
+ * of whether we are doing B/W control or not for the
+ * purpose of updating the stats.
+ */
+ mp = tail = mp_chain;
+ while (mp != NULL) {
+ tail = mp;
+ count++;
+ sz += msgdsize(mp);
+ mp = mp->b_next;
+ }
+
+ mutex_enter(&mac_srs->srs_lock);
+ smcip = mac_srs->srs_mcip;
+
+ if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
+ FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
+ FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
+ }
+ if (smcip != NULL) {
+ smcip->mci_stat_ibytes += sz;
+ smcip->mci_stat_ipackets += count;
+ }
+
+ /*
+	 * If the SRS is already being processed; has been blanked;
+	 * can be processed by the worker thread only; or the B/W limit
+	 * has been reached, then queue the chain and check if the
+	 * worker thread needs to be awakened.
+ */
+ if (mac_srs->srs_type & SRST_BW_CONTROL) {
+ mac_bw = mac_srs->srs_bw;
+ ASSERT(mac_bw != NULL);
+ mutex_enter(&mac_bw->mac_bw_lock);
+ /* Count the packets and bytes via interrupt */
+ srs_rx->sr_intr_count += count;
+ mac_bw->mac_bw_intr += sz;
+ if (mac_bw->mac_bw_limit == 0) {
+ /* zero bandwidth: drop all */
+ srs_rx->sr_drop_count += count;
+ mac_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ mutex_exit(&mac_srs->srs_lock);
+ mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ return;
+ } else {
+ if ((mac_bw->mac_bw_sz + sz) <=
+ mac_bw->mac_bw_drop_threshold) {
+ mutex_exit(&mac_bw->mac_bw_lock);
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
+ tail, count, sz);
+ } else {
+ mp = mp_chain;
+ chain_sz = 0;
+ count1 = 0;
+ tail = NULL;
+ head = NULL;
+ while (mp != NULL) {
+ sz1 = msgdsize(mp);
+ if (mac_bw->mac_bw_sz + chain_sz + sz1 >
+ mac_bw->mac_bw_drop_threshold)
+ break;
+ chain_sz += sz1;
+ count1++;
+ tail = mp;
+ mp = mp->b_next;
+ }
+ mutex_exit(&mac_bw->mac_bw_lock);
+ if (tail != NULL) {
+ head = tail->b_next;
+ tail->b_next = NULL;
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, count1, chain_sz);
+ sz -= chain_sz;
+ count -= count1;
+ } else {
+ /* Can't pick up any */
+ head = mp_chain;
+ }
+ if (head != NULL) {
+ /* Drop any packet over the threshold */
+ srs_rx->sr_drop_count += count;
+ mutex_enter(&mac_bw->mac_bw_lock);
+ mac_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ freemsgchain(head);
+ }
+ }
+ MAC_SRS_WORKER_WAKEUP(mac_srs);
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+ }
+
+ /*
+ * If the total number of packets queued in the SRS and
+ * its associated soft rings exceeds the max allowed,
+ * then drop the chain. If we are polling capable, this
+ * shouldn't be happening.
+ */
+ if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
+ (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
+ mac_bw = mac_srs->srs_bw;
+ srs_rx->sr_drop_count += count;
+ mutex_enter(&mac_bw->mac_bw_lock);
+ mac_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ freemsgchain(mp_chain);
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
+ /* Count the packets entering via interrupt path */
+ srs_rx->sr_intr_count += count;
+
+ if (!(mac_srs->srs_state & SRS_PROC)) {
+ /*
+ * If we are coming via loopback or if we are not
+ * optimizing for latency, we should signal the
+ * worker thread.
+ */
+ if (loopback || ((count > 1) &&
+ !(mac_srs->srs_state & SRS_LATENCY_OPT))) {
+ /*
+			 * For loopback, we need to let the worker take
+ * over as we don't want to continue in the same
+ * thread even if we can. This could lead to stack
+ * overflows and may also end up using
+ * resources (cpu) incorrectly.
+ */
+ cv_signal(&mac_srs->srs_async);
+ } else {
+ /*
+ * Seems like no one is processing the SRS and
+ * there is no backlog. We also inline process
+			 * our packet if it's a single packet in the non
+			 * latency optimized case (in the latency optimized
+ * case, we inline process chains of any size).
+ */
+ mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
+ }
+ }
+ mutex_exit(&mac_srs->srs_lock);
+}
+
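+/*
+ * A minimal, compiled-out sketch of the drop-threshold trimming done in
+ * mac_rx_srs_process() above: accept the prefix of a chain that fits
+ * under a byte threshold and free the remainder. The pkt type and
+ * free_chain() stand in for mblk_t and freemsgchain() and are
+ * illustrative only.
+ */
+#if 0
+#include <stddef.h>
+
+struct pkt {
+	struct pkt	*next;
+	size_t		size;
+};
+
+static void
+free_chain(struct pkt *p)
+{
+	while (p != NULL) {	/* placeholder: a real version frees p */
+		struct pkt *n = p->next;
+		p = n;
+	}
+}
+
+/* Returns the accepted prefix; frees whatever exceeds the threshold. */
+static struct pkt *
+trim_to_threshold(struct pkt *chain, size_t queued, size_t threshold)
+{
+	struct pkt *p = chain, *tail = NULL;
+
+	while (p != NULL && queued + p->size <= threshold) {
+		queued += p->size;
+		tail = p;
+		p = p->next;
+	}
+	if (tail == NULL) {
+		free_chain(chain);	/* nothing fits: drop it all */
+		return (NULL);
+	}
+	tail->next = NULL;		/* cut the accepted prefix */
+	free_chain(p);			/* drop the overflow */
+	return (chain);
+}
+#endif
+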
+/* TX SIDE ROUTINES (RUNTIME) */
+
+/*
+ * mac_tx_srs_no_desc
+ *
+ * This routine is called by Tx single ring default mode
+ * when Tx ring runs out of descs.
+ */
+mac_tx_cookie_t
+mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uint16_t flag, mblk_t **ret_mp)
+{
+ mac_tx_cookie_t cookie = NULL;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+ boolean_t wakeup_worker = B_TRUE;
+ uint32_t tx_mode = srs_tx->st_mode;
+ int cnt, sz;
+ mblk_t *tail;
+
+ ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
+ if (flag & MAC_DROP_ON_NO_DESC) {
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ } else {
+ if (mac_srs->srs_first != NULL)
+ wakeup_worker = B_FALSE;
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (flag & MAC_TX_NO_ENQUEUE) {
+ /*
+ * If TX_QUEUED is not set, queue the
+ * packet and let mac_tx_srs_drain()
+ * set the TX_BLOCKED bit for the
+ * reasons explained above. Otherwise,
+ * return the mblks.
+ */
+ if (wakeup_worker) {
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ } else {
+ MAC_TX_SET_NO_ENQUEUE(mac_srs,
+ mp_chain, ret_mp, cookie);
+ }
+ } else {
+ MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
+ tail, cnt, sz, cookie);
+ }
+ if (wakeup_worker)
+ cv_signal(&mac_srs->srs_async);
+ }
+ return (cookie);
+}
+
+/*
+ * mac_tx_srs_enqueue
+ *
+ * This routine is called when Tx SRS is operating in either serializer
+ * or bandwidth mode. In serializer mode, a packet will get enqueued
+ * when a thread cannot enter SRS exclusively. In bandwidth mode,
+ * packets get queued if the allowed byte-count limit for a tick is
+ * exceeded. The action taken when MAC_DROP_ON_NO_DESC or
+ * MAC_TX_NO_ENQUEUE is set is different than when operating in either
+ * the default mode or fanout mode. Here packets get dropped or
+ * returned to the caller only after a hi-watermark worth of data
+ * is queued.
+ */
+static mac_tx_cookie_t
+mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
+{
+ mac_tx_cookie_t cookie = NULL;
+ int cnt, sz;
+ mblk_t *tail;
+ boolean_t wakeup_worker = B_TRUE;
+
+ if (mac_srs->srs_first != NULL)
+ wakeup_worker = B_FALSE;
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (flag & MAC_DROP_ON_NO_DESC) {
+ if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ } else {
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ }
+ } else if (flag & MAC_TX_NO_ENQUEUE) {
+ if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
+ (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
+ MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
+ ret_mp, cookie);
+ } else {
+ mp_chain->b_prev = (mblk_t *)fanout_hint;
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ }
+ } else {
+ /*
+ * If you are BW_ENFORCED, just enqueue the
+ * packet. srs_worker will drain it at the
+ * prescribed rate. Before enqueueing, save
+ * the fanout hint.
+ */
+ mp_chain->b_prev = (mblk_t *)fanout_hint;
+ MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
+ tail, cnt, sz, cookie);
+ }
+ if (wakeup_worker)
+ cv_signal(&mac_srs->srs_async);
+ return (cookie);
+}
+
+/*
+ * There are five tx modes:
+ *
+ * 1) Default mode (SRS_TX_DEFAULT)
+ * 2) Serialization mode (SRS_TX_SERIALIZE)
+ * 3) Fanout mode (SRS_TX_FANOUT)
+ * 4) Bandwidth mode (SRS_TX_BW)
+ * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
+ *
+ * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
+ * based on the number of Tx rings requested for an SRS and whether
+ * bandwidth control is requested or not.
+ *
+ * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
+ * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
+ * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
+ * When flow-control is relieved, the srs_worker drains the queued
+ * packets and informs blocked clients to restart sending packets.
+ *
+ * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
+ *
+ * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
+ * Tx rings. Each Tx ring will have a soft ring associated with it.
+ * These soft rings will be hung off the Tx SRS. Queueing if it happens
+ * due to lack of Tx desc will be in individual soft ring (and not srs)
+ * associated with Tx ring.
+ *
+ * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
+ * only if bw is available. Otherwise the packets will be queued in
+ * SRS. If fanout to multiple Tx rings is configured, the packets will
+ * be fanned out among the soft rings associated with the Tx rings.
+ *
+ * Three flags are used in srs_state to indicate flow control
+ * conditions: SRS_TX_BLOCKED, SRS_TX_HIWAT and SRS_TX_WAKEUP_CLIENT.
+ * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
+ * driver below.
+ * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
+ * and flow-control pressure is applied back to clients. The clients expect
+ * wakeup when flow-control is relieved.
+ * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and the
+ * mblk gets returned to the client either due to lack of Tx descs or due
+ * to bw control reasons. The clients expect a wakeup when the condition
+ * is relieved.
+ *
+ * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
+ * some clients set the following values too: MAC_DROP_ON_NO_DESC,
+ * MAC_TX_NO_ENQUEUE
+ * Mac clients that do not want packets to be enqueued in the mac layer set
+ * the MAC_DROP_ON_NO_DESC flag. The packets won't be queued in the Tx SRS or
+ * Tx soft rings but instead get dropped when the NIC runs out of descs. The
+ * behaviour of this flag is different when the Tx is running in serializer
+ * or bandwidth mode. Under these (serializer, bandwidth) modes, packets
+ * get dropped when the Tx high watermark is reached.
+ * There are some mac clients like vsw, aggr that want the mblks to be
+ * returned back to clients instead of being queued in Tx SRS (or Tx soft
+ * rings) under flow-control (i.e., out of desc or exceeding bw limits)
+ * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
+ * In the default and Tx fanout mode, the un-transmitted mblks will be
+ * returned back to the clients when the driver runs out of Tx descs.
+ * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
+ * soft ring) so that the clients can be woken up when Tx descs become
+ * available. When running in serializer or bandwidth mode,
+ * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
+ */
+
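+/*
+ * A minimal, compiled-out sketch of the mode-to-function dispatch that
+ * mac_tx_get_func() below relies on, assuming a table indexed by mode.
+ * The stub entries are illustrative; they do not reflect the actual
+ * mac_tx_mode_list contents.
+ */
+#if 0
+enum tx_mode {
+	TX_DEFAULT, TX_SERIALIZE, TX_FANOUT, TX_BW, TX_BW_FANOUT
+};
+
+typedef int (*tx_func_t)(void *srs, void *chain);
+
+static int
+tx_stub(void *srs, void *chain)
+{
+	(void) srs;		/* placeholder for the real senders */
+	(void) chain;
+	return (0);
+}
+
+static const tx_func_t tx_mode_list[] = {
+	[TX_DEFAULT]	= tx_stub,	/* single-ring pass-thru */
+	[TX_SERIALIZE]	= tx_stub,	/* serialized sender */
+	[TX_FANOUT]	= tx_stub,	/* hash fanout to soft rings */
+	[TX_BW]		= tx_stub,	/* bandwidth-checked sender */
+	[TX_BW_FANOUT]	= tx_stub,	/* bandwidth plus fanout */
+};
+
+static tx_func_t
+tx_get_func(enum tx_mode mode)
+{
+	return (tx_mode_list[mode]);
+}
+#endif
+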
+mac_tx_func_t
+mac_tx_get_func(uint32_t mode)
+{
+ return (mac_tx_mode_list[mode].mac_tx_func);
+}
+
+/* ARGSUSED */
+static mac_tx_cookie_t
+mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_tx_cookie_t cookie = NULL;
+
+ ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
+
+ /* Regular case with a single Tx ring */
+ /*
+ * SRS_TX_BLOCKED is set when underlying NIC runs
+ * out of Tx descs and messages start getting
+ * queued. It won't get reset until
+ * tx_srs_drain() completely drains out the
+ * messages.
+ */
+ if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
+ /* Tx descs/resources not available */
+ mutex_enter(&mac_srs->srs_lock);
+ if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
+ cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ /*
+ * While we were computing mblk count, the
+ * flow control condition got relieved.
+ * Continue with the transmission.
+ */
+ mutex_exit(&mac_srs->srs_lock);
+ }
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ mp_chain, (is_subflow ? &stats : NULL));
+
+ /*
+ * Multiple threads could be here sending packets.
+ * Under such conditions, it is not possible to
+	 * atomically set the SRS_TX_BLOCKED bit to indicate
+ * out of tx desc condition. To atomically set
+ * this, we queue the returned packet and do
+ * the setting of SRS_TX_BLOCKED in
+ * mac_tx_srs_drain().
+ */
+ if (mp_chain != NULL) {
+ mutex_enter(&mac_srs->srs_lock);
+ cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+
+ if (is_subflow)
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+
+ return (NULL);
+}
+
+/*
+ * mac_tx_serialize_mode
+ *
+ * This is an experimental mode implemented as per the request of PAE.
+ * In this mode, all callers attempting to send a packet to the NIC
+ * will get serialized. Only one thread at any time will access the
+ * NIC to send the packet out.
+ */
+/* ARGSUSED */
+static mac_tx_cookie_t
+mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_tx_cookie_t cookie = NULL;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ /* Single ring, serialize below */
+ ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
+ mutex_enter(&mac_srs->srs_lock);
+ if ((mac_srs->srs_first != NULL) ||
+ (mac_srs->srs_state & SRS_PROC)) {
+ /*
+ * In serialization mode, queue all packets until
+ * TX_HIWAT is set.
+ * If drop bit is set, drop if TX_HIWAT is set.
+ * If no_enqueue is set, still enqueue until hiwat
+ * is set and return mblks after TX_HIWAT is set.
+ */
+ cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
+ flag, NULL, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ /*
+ * No packets queued, nothing on proc and no flow
+ * control condition. Fast-path, ok. Do inline
+ * processing.
+ */
+ mac_srs->srs_state |= SRS_PROC;
+ mutex_exit(&mac_srs->srs_lock);
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ mp_chain, (is_subflow ? &stats : NULL));
+
+ mutex_enter(&mac_srs->srs_lock);
+ mac_srs->srs_state &= ~SRS_PROC;
+ if (mp_chain != NULL) {
+ cookie = mac_tx_srs_enqueue(mac_srs,
+ mp_chain, flag, NULL, ret_mp);
+ }
+ if (mac_srs->srs_first != NULL) {
+ /*
+		 * We processed our packet inline and new
+		 * packets got queued while we were
+		 * processing. Wake up the srs worker.
+ */
+ cv_signal(&mac_srs->srs_async);
+ }
+ mutex_exit(&mac_srs->srs_lock);
+
+ if (is_subflow && cookie == NULL)
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+
+ return (cookie);
+}
+
+/*
+ * mac_tx_fanout_mode
+ *
+ * In this mode, the SRS will have access to multiple Tx rings to send
+ * the packet out. The fanout hint that is passed as an argument is
+ * used to find an appropriate ring to fanout the traffic. Each Tx
+ * ring, in turn, will have a soft ring associated with it. If a Tx
+ * ring runs out of Tx descs, the returned packets will be queued in
+ * the soft ring associated with that Tx ring. The srs itself will not
+ * queue any packets.
+ */
+static mac_tx_cookie_t
+mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ mac_soft_ring_t *softring;
+ uint_t indx, hash;
+
+ ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT);
+ hash = HASH_HINT(fanout_hint);
+ indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
+ softring = mac_srs->srs_oth_soft_rings[indx];
+ return (mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp));
+}
+
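+/*
+ * A minimal, compiled-out sketch of the hint-to-ring selection above,
+ * assuming a simple 64-bit integer mix in place of the HASH_HINT and
+ * COMPUTE_INDEX macros; the mixing constants are illustrative only.
+ */
+#if 0
+#include <stdint.h>
+
+/* ring_count must be nonzero. */
+static unsigned
+pick_ring(uintptr_t fanout_hint, unsigned ring_count)
+{
+	uint64_t h = (uint64_t)fanout_hint;
+
+	/* Mix the hint so that nearby values spread across rings. */
+	h ^= h >> 33;
+	h *= 0xff51afd7ed558ccdULL;
+	h ^= h >> 33;
+	return ((unsigned)(h % ring_count));
+}
+#endif
+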
+/*
+ * mac_tx_bw_mode
+ *
+ * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
+ * only if bw is available. Otherwise the packets will be queued in
+ * SRS. If the SRS has multiple Tx rings, then packets will get fanned
+ * out among the Tx rings.
+ */
+static mac_tx_cookie_t
+mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ int cnt, sz;
+ mblk_t *tail;
+ mac_tx_cookie_t cookie = NULL;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ ASSERT(TX_BANDWIDTH_MODE(mac_srs));
+ ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
+ mutex_enter(&mac_srs->srs_lock);
+ if (mac_srs->srs_bw->mac_bw_limit == 0) {
+ /* zero bandwidth: drop all */
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ } else if ((mac_srs->srs_first != NULL) ||
+ (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
+ cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
+ fanout_hint, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = 0;
+ } else if (mac_srs->srs_bw->mac_bw_used >
+ mac_srs->srs_bw->mac_bw_limit) {
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ /*
+ * Wakeup worker thread. Note that worker
+ * thread has to be woken up so that it
+ * can fire up the timer to be woken up
+ * on the next tick. Also once
+ * BW_ENFORCED is set, it can only be
+ * reset by srs_worker thread. Until then
+ * all packets will get queued up in SRS
+		 * and hence this code path won't be
+ * entered until BW_ENFORCED is reset.
+ */
+ cv_signal(&mac_srs->srs_async);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+
+ mac_srs->srs_bw->mac_bw_used += sz;
+ mutex_exit(&mac_srs->srs_lock);
+
+ if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
+ mac_soft_ring_t *softring;
+ uint_t indx, hash;
+
+ hash = HASH_HINT(fanout_hint);
+ indx = COMPUTE_INDEX(hash,
+ mac_srs->srs_oth_ring_count);
+ softring = mac_srs->srs_oth_soft_rings[indx];
+ return (mac_tx_soft_ring_process(softring, mp_chain, flag,
+ ret_mp));
+ } else {
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ mp_chain, (is_subflow ? &stats : NULL));
+
+ if (mp_chain != NULL) {
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (mac_srs->srs_bw->mac_bw_used > sz)
+ mac_srs->srs_bw->mac_bw_used -= sz;
+ else
+ mac_srs->srs_bw->mac_bw_used = 0;
+ cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
+ fanout_hint, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ if (is_subflow)
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+
+ return (NULL);
+ }
+}
+
+/* ARGSUSED */
+void
+mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
+{
+ mblk_t *head, *tail;
+ size_t sz;
+ uint32_t tx_mode;
+ uint_t saved_pkt_count;
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ saved_pkt_count = 0;
+ ASSERT(mutex_owned(&mac_srs->srs_lock));
+ ASSERT(!(mac_srs->srs_state & SRS_PROC));
+
+ mac_srs->srs_state |= SRS_PROC;
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+ tx_mode = srs_tx->st_mode;
+ if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
+ if (mac_srs->srs_first != NULL) {
+ head = mac_srs->srs_first;
+ tail = mac_srs->srs_last;
+ saved_pkt_count = mac_srs->srs_count;
+ mac_srs->srs_first = NULL;
+ mac_srs->srs_last = NULL;
+ mac_srs->srs_count = 0;
+ mutex_exit(&mac_srs->srs_lock);
+
+ head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ head, &stats);
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (head != NULL) {
+ /* Device out of tx desc, set block */
+ if (head->b_next == NULL)
+ VERIFY(head == tail);
+ tail->b_next = mac_srs->srs_first;
+ mac_srs->srs_first = head;
+ mac_srs->srs_count +=
+ (saved_pkt_count - stats.ts_opackets);
+ if (mac_srs->srs_last == NULL)
+ mac_srs->srs_last = tail;
+ MAC_TX_SRS_BLOCK(mac_srs, head);
+ } else {
+ srs_tx->st_woken_up = B_FALSE;
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(
+ mac_srs->srs_flent, &stats);
+ }
+ }
+ }
+ } else if (tx_mode == SRS_TX_BW) {
+ /*
+ * We are here because the timer fired and we have some data
+		 * to transmit. Also mac_tx_srs_worker should have reset
+		 * the SRS_BW_ENFORCED flag.
+ */
+ ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
+ head = tail = mac_srs->srs_first;
+ while (mac_srs->srs_first != NULL) {
+ tail = mac_srs->srs_first;
+ tail->b_prev = NULL;
+ mac_srs->srs_first = tail->b_next;
+ if (mac_srs->srs_first == NULL)
+ mac_srs->srs_last = NULL;
+ mac_srs->srs_count--;
+ sz = msgdsize(tail);
+ mac_srs->srs_size -= sz;
+ saved_pkt_count++;
+ MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
+
+ if (mac_srs->srs_bw->mac_bw_used <
+ mac_srs->srs_bw->mac_bw_limit)
+ continue;
+
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = sz;
+ continue;
+ }
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ break;
+ }
+
+ ASSERT((head == NULL && tail == NULL) ||
+ (head != NULL && tail != NULL));
+ if (tail != NULL) {
+ tail->b_next = NULL;
+ mutex_exit(&mac_srs->srs_lock);
+
+ head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ head, &stats);
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (head != NULL) {
+ uint_t size_sent;
+
+ /* Device out of tx desc, set block */
+ if (head->b_next == NULL)
+ VERIFY(head == tail);
+ tail->b_next = mac_srs->srs_first;
+ mac_srs->srs_first = head;
+ mac_srs->srs_count +=
+ (saved_pkt_count - stats.ts_opackets);
+ if (mac_srs->srs_last == NULL)
+ mac_srs->srs_last = tail;
+ size_sent = sz - stats.ts_obytes;
+ mac_srs->srs_size += size_sent;
+ mac_srs->srs_bw->mac_bw_sz += size_sent;
+ if (mac_srs->srs_bw->mac_bw_used > size_sent) {
+ mac_srs->srs_bw->mac_bw_used -=
+ size_sent;
+ } else {
+ mac_srs->srs_bw->mac_bw_used = 0;
+ }
+ MAC_TX_SRS_BLOCK(mac_srs, head);
+ } else {
+ srs_tx->st_woken_up = B_FALSE;
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(
+ mac_srs->srs_flent, &stats);
+ }
+ }
+ }
+ } else if (tx_mode == SRS_TX_BW_FANOUT) {
+ mblk_t *prev;
+ mac_soft_ring_t *softring;
+ uint64_t hint;
+
+ /*
+ * We are here because the timer fired and we
+		 * have some quota to transmit.
+ */
+ prev = NULL;
+ head = tail = mac_srs->srs_first;
+ while (mac_srs->srs_first != NULL) {
+ tail = mac_srs->srs_first;
+ mac_srs->srs_first = tail->b_next;
+ if (mac_srs->srs_first == NULL)
+ mac_srs->srs_last = NULL;
+ mac_srs->srs_count--;
+ sz = msgdsize(tail);
+ mac_srs->srs_size -= sz;
+ mac_srs->srs_bw->mac_bw_used += sz;
+ if (prev == NULL)
+ hint = (ulong_t)tail->b_prev;
+ if (hint != (ulong_t)tail->b_prev) {
+ prev->b_next = NULL;
+ mutex_exit(&mac_srs->srs_lock);
+ TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
+ head = tail;
+ hint = (ulong_t)tail->b_prev;
+ mutex_enter(&mac_srs->srs_lock);
+ }
+
+ prev = tail;
+ tail->b_prev = NULL;
+ if (mac_srs->srs_bw->mac_bw_used <
+ mac_srs->srs_bw->mac_bw_limit)
+ continue;
+
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = 0;
+ continue;
+ }
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ break;
+ }
+ ASSERT((head == NULL && tail == NULL) ||
+ (head != NULL && tail != NULL));
+ if (tail != NULL) {
+ tail->b_next = NULL;
+ mutex_exit(&mac_srs->srs_lock);
+ TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
+ mutex_enter(&mac_srs->srs_lock);
+ }
+ }
+ /*
+ * SRS_TX_FANOUT case not considered here because packets
+ * won't be queued in the SRS for this case. Packets will
+ * be sent directly to soft rings underneath and if there
+ * is any queueing at all, it would be in Tx side soft
+ * rings.
+ */
+
+ /*
+ * When srs_count becomes 0, reset SRS_TX_HIWAT and
+ * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
+ */
+ if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
+ (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
+ mac_tx_notify_cb_t *mtnfp;
+ mac_cb_t *mcb;
+ mac_client_impl_t *mcip = mac_srs->srs_mcip;
+ boolean_t wakeup_required = B_FALSE;
+
+ if (mac_srs->srs_state &
+ (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
+ wakeup_required = B_TRUE;
+ }
+ mac_srs->srs_state &= ~(SRS_TX_HIWAT |
+ SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
+ mutex_exit(&mac_srs->srs_lock);
+ if (wakeup_required) {
+ /* Wakeup callback registered clients */
+ MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
+ for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
+ mcb = mcb->mcb_nextp) {
+ mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
+ mtnfp->mtnf_fn(mtnfp->mtnf_arg,
+ (mac_tx_cookie_t)mac_srs);
+ }
+ MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
+ &mcip->mci_tx_notify_cb_list);
+ /*
+ * If the client is not the primary MAC client, then we
+			 * need to send the notification to the client's upper
+ * MAC, i.e. mci_upper_mip.
+ */
+ mac_tx_notify(mcip->mci_upper_mip != NULL ?
+ mcip->mci_upper_mip : mcip->mci_mip);
+ }
+ mutex_enter(&mac_srs->srs_lock);
+ }
+ mac_srs->srs_state &= ~SRS_PROC;
+}
+
+/*
+ * Given a packet, get the flow_entry that identifies the flow
+ * to which that packet belongs. The flow_entry will contain
+ * the transmit function to be used to send the packet. If the
+ * function returns NULL, the packet should be sent using the
+ * underlying NIC.
+ */
+static flow_entry_t *
+mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
+{
+ flow_entry_t *flent = NULL;
+ mac_client_impl_t *mcip;
+ int err;
+
+ /*
+ * Do classification on the packet.
+ */
+ err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
+ if (err != 0)
+ return (NULL);
+
+ /*
+ * This flent might just be an additional one on the MAC client,
+ * i.e. for classification purposes (different fdesc), however
+	 * the resources, SRS et al., are in the mci_flent, so if
+ * this isn't the mci_flent, we need to get it.
+ */
+ if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
+ FLOW_REFRELE(flent);
+ flent = mcip->mci_flent;
+ FLOW_TRY_REFHOLD(flent, err);
+ if (err != 0)
+ return (NULL);
+ }
+
+ return (flent);
+}
+
+/*
+ * This macro is only meant to be used by mac_tx_send().
+ */
+#define CHECK_VID_AND_ADD_TAG(mp) { \
+ if (vid_check) { \
+ int err = 0; \
+ \
+ MAC_VID_CHECK(src_mcip, (mp), err); \
+ if (err != 0) { \
+ freemsg((mp)); \
+ (mp) = next; \
+ oerrors++; \
+ continue; \
+ } \
+ } \
+ if (add_tag) { \
+ (mp) = mac_add_vlan_tag((mp), 0, vid); \
+ if ((mp) == NULL) { \
+ (mp) = next; \
+ oerrors++; \
+ continue; \
+ } \
+ } \
+}
+
+mblk_t *
+mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
+ mac_tx_stats_t *stats)
+{
+ mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = src_mcip->mci_mip;
+ uint_t obytes = 0, opackets = 0, oerrors = 0;
+ mblk_t *mp = NULL, *next;
+ boolean_t vid_check, add_tag;
+ uint16_t vid = 0;
+
+ if (mip->mi_nclients > 1) {
+ vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
+ add_tag = MAC_TAG_NEEDED(src_mcip);
+ if (add_tag)
+ vid = mac_client_vid(mch);
+ } else {
+ ASSERT(mip->mi_nclients == 1);
+ vid_check = add_tag = B_FALSE;
+ }
+
+ /*
+	 * Fastpath: if there's only one client and there are no
+ * multicast listeners, we simply send the packet down to the
+ * underlying NIC.
+ */
+ if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) {
+ DTRACE_PROBE2(fastpath,
+ mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);
+
+ mp = mp_chain;
+ while (mp != NULL) {
+ next = mp->b_next;
+ mp->b_next = NULL;
+ opackets++;
+ obytes += (mp->b_cont == NULL ? MBLKL(mp) :
+ msgdsize(mp));
+
+ CHECK_VID_AND_ADD_TAG(mp);
+ MAC_TX(mip, ring, mp, src_mcip);
+
+ /*
+ * If the driver is out of descriptors and does a
+ * partial send it will return a chain of unsent
+ * mblks. Adjust the accounting stats.
+ */
+ if (mp != NULL) {
+ opackets--;
+ obytes -= msgdsize(mp);
+ mp->b_next = next;
+ break;
+ }
+ mp = next;
+ }
+ goto done;
+ }
+
+ /*
+ * No fastpath, we either have more than one MAC client
+ * defined on top of the same MAC, or one or more MAC
+ * client promiscuous callbacks.
+ */
+ DTRACE_PROBE3(slowpath, mac_client_impl_t *,
+ src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
+
+ if (mip->mi_promisc_list != NULL)
+ mac_promisc_dispatch(mip, mp_chain, src_mcip);
+
+ mp = mp_chain;
+ while (mp != NULL) {
+ flow_entry_t *dst_flow_ent;
+ void *flow_cookie;
+ size_t pkt_size;
+ mblk_t *mp1;
+
+ next = mp->b_next;
+ mp->b_next = NULL;
+ opackets++;
+ pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
+ obytes += pkt_size;
+ CHECK_VID_AND_ADD_TAG(mp);
+
+ /*
+ * Find the destination.
+ */
+ dst_flow_ent = mac_tx_classify(mip, mp);
+
+ if (dst_flow_ent != NULL) {
+ size_t hdrsize;
+ int err = 0;
+
+ if (mip->mi_info.mi_nativemedia == DL_ETHER) {
+ struct ether_vlan_header *evhp =
+ (struct ether_vlan_header *)mp->b_rptr;
+
+ if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
+ hdrsize = sizeof (*evhp);
+ else
+ hdrsize = sizeof (struct ether_header);
+ } else {
+ mac_header_info_t mhi;
+
+ err = mac_header_info((mac_handle_t)mip,
+ mp, &mhi);
+ if (err == 0)
+ hdrsize = mhi.mhi_hdrsize;
+ }
+
+ /*
+ * Got a matching flow. It's either another
+ * MAC client, or a broadcast/multicast flow.
+ * Make sure the packet size is within the
+			 * allowed size. If not, drop the packet and
+			 * move to the next packet.
+ */
+ if (err != 0 ||
+ (pkt_size - hdrsize) > mip->mi_sdu_max) {
+ oerrors++;
+ DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
+ mblk_t *, mp);
+ freemsg(mp);
+ mp = next;
+ FLOW_REFRELE(dst_flow_ent);
+ continue;
+ }
+ flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
+ if (flow_cookie != NULL) {
+ /*
+ * The vnic_bcast_send function expects
+ * to receive the sender MAC client
+ * as value for arg2.
+ */
+ mac_bcast_send(flow_cookie, src_mcip, mp,
+ B_TRUE);
+ } else {
+ /*
+ * loopback the packet to a
+ * local MAC client. We force a context
+ * switch if both source and destination
+ * MAC clients are used by IP, i.e. bypass
+ * is set.
+ */
+ boolean_t do_switch;
+ mac_client_impl_t *dst_mcip =
+ dst_flow_ent->fe_mcip;
+
+ do_switch = ((src_mcip->mci_state_flags &
+ dst_mcip->mci_state_flags &
+ MCIS_CLIENT_POLL_CAPABLE) != 0);
+
+ if ((mp1 = mac_fix_cksum(mp)) != NULL) {
+ (dst_flow_ent->fe_cb_fn)(
+ dst_flow_ent->fe_cb_arg1,
+ dst_flow_ent->fe_cb_arg2,
+ mp1, do_switch);
+ }
+ }
+ FLOW_REFRELE(dst_flow_ent);
+ } else {
+ /*
+ * Unknown destination, send via the underlying
+ * NIC.
+ */
+ MAC_TX(mip, ring, mp, src_mcip);
+ if (mp != NULL) {
+ /*
+ * Adjust for the last packet that
+ * could not be transmitted
+ */
+ opackets--;
+ obytes -= pkt_size;
+ mp->b_next = next;
+ break;
+ }
+ }
+ mp = next;
+ }
+
+done:
+ src_mcip->mci_stat_obytes += obytes;
+ src_mcip->mci_stat_opackets += opackets;
+ src_mcip->mci_stat_oerrors += oerrors;
+
+ if (stats != NULL) {
+ stats->ts_opackets = opackets;
+ stats->ts_obytes = obytes;
+ stats->ts_oerrors = oerrors;
+ }
+ return (mp);
+}
+
+/*
+ * mac_tx_srs_ring_present
+ *
+ * Returns whether the specified ring is part of the specified SRS.
+ */
+boolean_t
+mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
+{
+ int i;
+ mac_soft_ring_t *soft_ring;
+
+ if (srs->srs_tx.st_arg2 == tx_ring)
+ return (B_TRUE);
+
+ for (i = 0; i < srs->srs_oth_ring_count; i++) {
+ soft_ring = srs->srs_oth_soft_rings[i];
+ if (soft_ring->s_ring_tx_arg2 == tx_ring)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * mac_tx_srs_wakeup
+ *
+ * Called when Tx desc become available. Wakeup the appropriate worker
+ * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
+ * state field.
+ */
+void
+mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
+{
+ int i;
+ mac_soft_ring_t *sringp;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (TX_SINGLE_RING_MODE(mac_srs)) {
+ if (srs_tx->st_arg2 == ring &&
+ mac_srs->srs_state & SRS_TX_BLOCKED) {
+ mac_srs->srs_state &= ~SRS_TX_BLOCKED;
+ srs_tx->st_unblocked_cnt++;
+ cv_signal(&mac_srs->srs_async);
+ }
+ /*
+ * A wakeup can come before tx_srs_drain() could
+		 * grab the srs lock and set SRS_TX_BLOCKED. So
+		 * always set the woken_up flag when we come here.
+ */
+ srs_tx->st_woken_up = B_TRUE;
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+
+ /* If you are here, it is for FANOUT or BW_FANOUT case */
+ ASSERT(TX_MULTI_RING_MODE(mac_srs));
+ for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
+ sringp = mac_srs->srs_oth_soft_rings[i];
+ mutex_enter(&sringp->s_ring_lock);
+ if (sringp->s_ring_tx_arg2 == ring) {
+ if (sringp->s_ring_state & S_RING_BLOCK) {
+ sringp->s_ring_state &= ~S_RING_BLOCK;
+ sringp->s_ring_unblocked_cnt++;
+ cv_signal(&sringp->s_ring_async);
+ }
+ sringp->s_ring_tx_woken_up = B_TRUE;
+ }
+ mutex_exit(&sringp->s_ring_lock);
+ }
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
+ * the blocked clients again.
+ */
+void
+mac_tx_notify(mac_impl_t *mip)
+{
+ i_mac_notify(mip, MAC_NOTE_TX);
+}
+
+/*
+ * RX SOFTRING RELATED FUNCTIONS
+ *
+ * These functions really belong in mac_soft_ring.c and are here for
+ * a short period.
+ */
+
+#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \
+ /* \
+ * Enqueue our mblk chain. \
+ */ \
+ ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \
+ \
+ if ((ringp)->s_ring_last != NULL) \
+ (ringp)->s_ring_last->b_next = (mp); \
+ else \
+ (ringp)->s_ring_first = (mp); \
+ (ringp)->s_ring_last = (tail); \
+ (ringp)->s_ring_count += (cnt); \
+ ASSERT((ringp)->s_ring_count > 0); \
+ if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \
+ (ringp)->s_ring_size += sz; \
+ } \
+}
+
+/*
+ * Default entry point to deliver a packet chain to a MAC client.
+ * If the MAC client has flows, do the classification with these
+ * flows as well.
+ */
+/* ARGSUSED */
+void
+mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
+ mac_header_info_t *arg3)
+{
+ mac_client_impl_t *mcip = arg1;
+
+ if (mcip->mci_nvids == 1 &&
+ !(mcip->mci_state_flags & MCIS_TAG_DISABLE)) {
+ /*
+ * If the client has exactly one VID associated with it
+ * and striping of VLAN header is not disabled,
+ * remove the VLAN tag from the packet before
+ * passing it on to the client's receive callback.
+ * Note that this needs to be done after we dispatch
+ * the packet to the promiscuous listeners of the
+ * client, since they expect to see the whole
+ * frame including the VLAN headers.
+ */
+ mp_chain = mac_strip_vlan_tag_chain(mp_chain);
+ }
+
+ mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
+}
+
+/*
+ * mac_rx_soft_ring_process
+ *
+ * Process a chain for a given soft ring. If the number of packets queued
+ * in the SRS and its associated soft rings (including this one) is
+ * very small (tracked by srs_poll_pkt_cnt), then allow the entering
+ * thread (interrupt or poll thread) to do inline processing. This
+ * helps keep the latency down under low load.
+ *
+ * The proc and arg for each mblk is already stored in the mblk in
+ * appropriate places.
+ */
+/* ARGSUSED */
+void
+mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
+ mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
+{
+ mac_direct_rx_t proc;
+ void *arg1;
+ mac_resource_handle_t arg2;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+ ASSERT(ringp != NULL);
+ ASSERT(mp_chain != NULL);
+ ASSERT(tail != NULL);
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_total_inpkt += cnt;
+ if ((ringp->s_ring_type & ST_RING_ANY) ||
+ ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
+ !mac_srs->srs_rx.sr_enqueue_always)) {
+ /* If on processor or blanking on, then enqueue and return */
+ if (ringp->s_ring_state & S_RING_BLANK ||
+ ringp->s_ring_state & S_RING_PROC) {
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ proc = ringp->s_ring_rx_func;
+ arg1 = ringp->s_ring_rx_arg1;
+ arg2 = ringp->s_ring_rx_arg2;
+ /*
+ * See if anything is already queued. If we are the
+ * first packet, do inline processing else queue the
+ * packet and do the drain.
+ */
+ if (ringp->s_ring_first == NULL) {
+ /*
+ * Fast-path, ok to process and nothing queued.
+ */
+ ringp->s_ring_run = curthread;
+ ringp->s_ring_state |= (S_RING_PROC);
+
+ mutex_exit(&ringp->s_ring_lock);
+
+ /*
+			 * We have a chain of just one packet, so
+			 * take this fast path.
+ */
+ ASSERT(mp_chain->b_next == NULL);
+
+ (*proc)(arg1, arg2, mp_chain, NULL);
+
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+ /*
+ * If we have a soft ring set which is doing
+ * bandwidth control, we need to decrement
+			 * srs_size and count so that the SRS can have an
+			 * accurate idea of the real data
+ * queued between SRS and its soft rings. We
+ * decrement the counters only when the packet
+ * gets processed by both SRS and the soft ring.
+ */
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+ MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+ mutex_exit(&mac_srs->srs_lock);
+
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_run = NULL;
+ ringp->s_ring_state &= ~S_RING_PROC;
+ if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
+ cv_signal(&ringp->s_ring_client_cv);
+
+ if ((ringp->s_ring_first == NULL) ||
+ (ringp->s_ring_state & S_RING_BLANK)) {
+ /*
+ * We processed inline our packet and
+ * nothing new has arrived or our
+ * receiver doesn't want to receive
+ * any packets. We are done.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+ } else {
+ SOFT_RING_ENQUEUE_CHAIN(ringp,
+ mp_chain, tail, cnt, sz);
+ }
+
+ /*
+ * We are here because either we couldn't do inline
+ * processing (because something was already
+ * queued), or we had a chain of more than one
+ * packet, or something else arrived after we were
+ * done with inline processing.
+ */
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ ASSERT(ringp->s_ring_first != NULL);
+
+ ringp->s_ring_drain_func(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ } else {
+ /* ST_RING_WORKER_ONLY case */
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
+ mac_soft_ring_worker_wakeup(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+ }
+}
+
+/*
+ * TX SOFTRING RELATED FUNCTIONS
+ *
+ * These functions really belong in mac_soft_ring.c and are here for
+ * a short period.
+ */
+
+#define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \
+ ringp->s_ring_state |= S_RING_ENQUEUED; \
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \
+}
+
+/*
+ * mac_tx_sring_enqueue
+ *
+ * When we are out of transmit descriptors and we already have a
+ * queue that exceeds hiwat (or the client called us with
+ * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
+ * soft ring pointer as the opaque cookie so that the client can
+ * enable flow control.
+ */
+static mac_tx_cookie_t
+mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
+ mblk_t **ret_mp)
+{
+ int cnt;
+ size_t sz;
+ mblk_t *tail;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+ mac_tx_cookie_t cookie = NULL;
+ boolean_t wakeup_worker = B_TRUE;
+
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (flag & MAC_DROP_ON_NO_DESC) {
+ mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ /* increment freed stats */
+ ringp->s_ring_drops += cnt;
+ cookie = (mac_tx_cookie_t)ringp;
+ } else {
+ if (ringp->s_ring_first != NULL)
+ wakeup_worker = B_FALSE;
+
+ if (flag & MAC_TX_NO_ENQUEUE) {
+ /*
+ * If QUEUED is not set, queue the packet
+ * and let mac_tx_soft_ring_drain() set
+ * the TX_BLOCKED bit for the reasons
+ * explained above. Otherwise, return the
+ * mblks.
+ */
+ if (wakeup_worker) {
+ TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
+ mp_chain, tail, cnt, sz);
+ } else {
+ ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
+ cookie = (mac_tx_cookie_t)ringp;
+ *ret_mp = mp_chain;
+ }
+ } else {
+ boolean_t enqueue = B_TRUE;
+
+ if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
+ /*
+ * flow-controlled. Store ringp in cookie
+ * so that it can be returned as
+ * mac_tx_cookie_t to client
+ */
+ ringp->s_ring_state |= S_RING_TX_HIWAT;
+ cookie = (mac_tx_cookie_t)ringp;
+ ringp->s_ring_hiwat_cnt++;
+ if (ringp->s_ring_count >
+ ringp->s_ring_tx_max_q_cnt) {
+ /* increment freed stats */
+ ringp->s_ring_drops += cnt;
+ /*
+ * b_prev may be set to the fanout hint
+ * hence can't use freemsg directly
+ */
+ mac_pkt_drop(NULL, NULL,
+ mp_chain, B_FALSE);
+ DTRACE_PROBE1(tx_queued_hiwat,
+ mac_soft_ring_t *, ringp);
+ enqueue = B_FALSE;
+ }
+ }
+ if (enqueue) {
+ TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
+ tail, cnt, sz);
+ }
+ }
+ if (wakeup_worker)
+ cv_signal(&ringp->s_ring_async);
+ }
+ return (cookie);
+}
+
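+/*
+ * A minimal, compiled-out sketch of the watermark policy applied above:
+ * enqueue until the high watermark, signal flow control past it, and
+ * drop once the hard queue cap is exceeded. Names and thresholds are
+ * illustrative only.
+ */
+#if 0
+enum tx_verdict { TX_ENQUEUE, TX_FLOW_CTL, TX_DROP };
+
+static enum tx_verdict
+queue_policy(unsigned count, unsigned hiwat, unsigned max_q_cnt)
+{
+	if (count <= hiwat)
+		return (TX_ENQUEUE);	/* normal case: just queue */
+	if (count <= max_q_cnt)
+		return (TX_FLOW_CTL);	/* queue, but push back on client */
+	return (TX_DROP);		/* hard cap exceeded: free the chain */
+}
+#endif
+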
+
+/*
+ * mac_tx_soft_ring_process
+ *
+ * This routine is called when fanning out outgoing traffic among
+ * multiple Tx rings.
+ * Note that a soft ring is associated with a h/w Tx ring.
+ */
+mac_tx_cookie_t
+mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
+ uint16_t flag, mblk_t **ret_mp)
+{
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+ int cnt;
+ size_t sz;
+ mblk_t *tail;
+ mac_tx_cookie_t cookie = NULL;
+
+ ASSERT(ringp != NULL);
+ ASSERT(mp_chain != NULL);
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+ /*
+	 * Only two modes can come here: either
+	 * SRS_TX_BW_FANOUT or SRS_TX_FANOUT.
+ */
+ ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
+ mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
+
+ if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
+ /* Serialization mode */
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
+ cookie = mac_tx_sring_enqueue(ringp, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
+ if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
+ /*
+ * If ring is blocked due to lack of Tx
+ * descs, just return. Worker thread
+ * will get scheduled when Tx desc's
+ * become available.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ mac_soft_ring_worker_wakeup(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ } else {
+ /* Default fanout mode */
+ /*
+		 * S_RING_BLOCK is set when the underlying NIC runs
+ * out of Tx descs and messages start getting
+ * queued. It won't get reset until
+ * tx_srs_drain() completely drains out the
+ * messages.
+ */
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+
+ if (ringp->s_ring_state & S_RING_ENQUEUED) {
+ /* Tx descs/resources not available */
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_state & S_RING_ENQUEUED) {
+ cookie = mac_tx_sring_enqueue(ringp, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ /*
+ * While we were computing mblk count, the
+ * flow control condition got relieved.
+ * Continue with the transmission.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ }
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
+ ringp->s_ring_tx_arg2, mp_chain,
+ (is_subflow ? &stats : NULL));
+
+ /*
+ * Multiple threads could be here sending packets.
+ * Under such conditions, it is not possible to
+ * atomically set the S_RING_BLOCK bit to indicate
+ * an out-of-Tx-descriptor condition. To set it
+ * atomically, we queue the returned packet and
+ * set S_RING_BLOCK in
+ * mac_tx_soft_ring_drain().
+ */
+ if (mp_chain != NULL) {
+ mutex_enter(&ringp->s_ring_lock);
+ cookie =
+ mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+ }
+ return (NULL);
+ }
+}
diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c
new file mode 100644
index 0000000000..ff6991ada2
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_soft_ring.c
@@ -0,0 +1,732 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * General Soft rings - Simulating Rx rings in S/W.
+ *
+ * A soft ring is a data abstraction containing a queue and a worker
+ * thread and represents a hardware Rx ring in software. Each soft
+ * ring set can have a collection of soft rings for separating
+ * L3/L4 specific traffic (IPv4 from IPv6 or TCP from UDP) or for
+ * allowing a higher degree of parallelism by sending traffic to
+ * one of the soft rings for a SRS (using a hash on src IP or port).
+ * Each soft ring worker thread can be bound to a different CPU,
+ * allowing the processing for each soft ring to happen in parallel
+ * and independently of the others.
+ *
+ * Protocol soft rings:
+ *
+ * Each SRS has at a minimum 3 softrings: one each for IPv4 TCP,
+ * IPv4 UDP, and the rest (OTH, for IPv6 and everything else). The
+ * SRS does dynamic polling and enforces link level bandwidth but
+ * it does so for all traffic (IPv4 and IPv6 and all protocols) on
+ * that link. However, each protocol layer wants a different
+ * behaviour. For instance IPv4 TCP has per CPU squeues which
+ * enforce their own polling and flow control so IPv4 TCP traffic
+ * needs to go to a separate soft ring which can be polled by the
+ * TCP squeue. It also allows the TCP squeue to push back flow control
+ * all the way to the NIC hardware (if it puts its corresponding soft
+ * ring in the poll mode and soft ring queue builds up, the
+ * shared srs_poll_pkt_cnt goes up and SRS automatically stops
+ * more packets from entering the system).
+ *
+ * Similarly, UDP benefits from a DLS bypass and packet chaining,
+ * so sending it to a separate soft ring is desired. All the rest of
+ * the traffic (including IPv6) is sent to the OTH softring. The IPv6
+ * traffic currently goes through the OTH softring and via DLS because
+ * it needs more processing. Irrespective of the sap
+ * (IPv4 or IPv6) or the transport, the dynamic polling, B/W enforcement,
+ * cpu assignment, fanout, etc. apply to all traffic since they
+ * are implemented by the SRS, which is agnostic to sap or transport.
+ *
+ * Fanout soft rings:
+ *
+ * On a multithreaded system, we can assign more CPUs and multithread
+ * the stack by creating a soft ring per CPU and spreading traffic
+ * based on a hash computed on src IP etc. Since we still need to
+ * keep the protocol separation, we create a set of 3 soft rings per
+ * CPU (specified by cpu list or degree of fanout).
+ *
+ * NOTE: See the block level comment on top of mac_sched.c
+ */
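+
+/*
+ * Illustrative sketch (fanout_pick() is hypothetical and not defined
+ * anywhere in this file): fanout conceptually picks one of the
+ * per-CPU soft rings by hashing invariant header fields, e.g.
+ *
+ *	uint_t
+ *	fanout_pick(uint32_t src_ip, uint16_t src_port, uint_t nrings)
+ *	{
+ *		return ((src_ip ^ src_port) % nrings);
+ *	}
+ *
+ * so that all packets of a flow land on the same soft ring (and CPU)
+ * while distinct flows spread across the set.
+ */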
+
+#include <sys/types.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ip_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
+
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/mac_flow_impl.h>
+
+static void mac_rx_soft_ring_drain(mac_soft_ring_t *);
+static void mac_soft_ring_fire(void *);
+static void mac_soft_ring_worker(mac_soft_ring_t *);
+static void mac_tx_soft_ring_drain(mac_soft_ring_t *);
+
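+/*
+ * Tunables for the Tx soft ring queue: once a ring's queue grows past
+ * s_ring_tx_hiwat the client is flow-controlled (a mac_tx_cookie_t is
+ * returned to it); past s_ring_tx_max_q_cnt further packets are
+ * dropped.
+ */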
+uint32_t mac_tx_soft_ring_max_q_cnt = 100000;
+uint32_t mac_tx_soft_ring_hiwat = 1000;
+
+extern kmem_cache_t *mac_soft_ring_cache;
+
+#define ADD_SOFTRING_TO_SET(mac_srs, softring) { \
+ if (mac_srs->srs_soft_ring_head == NULL) { \
+ mac_srs->srs_soft_ring_head = softring; \
+ mac_srs->srs_soft_ring_tail = softring; \
+ } else { \
+ /* ADD to the list */ \
+ softring->s_ring_prev = \
+ mac_srs->srs_soft_ring_tail; \
+ mac_srs->srs_soft_ring_tail->s_ring_next = softring; \
+ mac_srs->srs_soft_ring_tail = softring; \
+ } \
+ mac_srs->srs_soft_ring_count++; \
+}
+
+/*
+ * mac_soft_ring_worker_wakeup
+ *
+ * Wake up the soft ring worker thread to process the queue as long
+ * as no one else is processing it and the upper layer (client) is still
+ * ready to receive packets.
+ */
+void
+mac_soft_ring_worker_wakeup(mac_soft_ring_t *ringp)
+{
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ if (!(ringp->s_ring_state & S_RING_PROC) &&
+ !(ringp->s_ring_state & S_RING_BLANK) &&
+ (ringp->s_ring_tid == NULL)) {
+ if (ringp->s_ring_wait != 0) {
+ ringp->s_ring_tid =
+ timeout(mac_soft_ring_fire, ringp,
+ ringp->s_ring_wait);
+ } else {
+ /* Schedule the worker thread. */
+ cv_signal(&ringp->s_ring_async);
+ }
+ }
+}
+
+/*
+ * mac_soft_ring_create
+ *
+ * Create a soft ring, do the necessary setup and bind the worker
+ * thread to the assigned CPU.
+ */
+mac_soft_ring_t *
+mac_soft_ring_create(int id, clock_t wait, void *flent, uint16_t type,
+ pri_t pri, mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs,
+ processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2)
+{
+ mac_soft_ring_t *ringp;
+ char name[64];
+
+ bzero(name, 64);
+ ringp = kmem_cache_alloc(mac_soft_ring_cache, KM_SLEEP);
+
+ if (type & ST_RING_TCP) {
+ (void) snprintf(name, sizeof (name),
+ "mac_tcp_soft_ring_%d_%p", id, mac_srs);
+ } else if (type & ST_RING_UDP) {
+ (void) snprintf(name, sizeof (name),
+ "mac_udp_soft_ring_%d_%p", id, mac_srs);
+ } else {
+ (void) snprintf(name, sizeof (name),
+ "mac_oth_soft_ring_%d_%p", id, mac_srs);
+ }
+
+ bzero(ringp, sizeof (mac_soft_ring_t));
+ (void) strncpy(ringp->s_ring_name, name, S_RING_NAMELEN + 1);
+ ringp->s_ring_name[S_RING_NAMELEN] = '\0';
+ mutex_init(&ringp->s_ring_lock, NULL, MUTEX_DEFAULT, NULL);
+ ringp->s_ring_notify_cb_info.mcbi_lockp = &ringp->s_ring_lock;
+
+ ringp->s_ring_type = type;
+ ringp->s_ring_wait = MSEC_TO_TICK(wait);
+ ringp->s_ring_mcip = mcip;
+ ringp->s_ring_set = mac_srs;
+ ringp->s_ring_flent = flent;
+
+ /*
+ * Protect against access from DR callbacks (mac_walk_srs_bind/unbind)
+ * which can't grab the mac perimeter
+ */
+ mutex_enter(&mac_srs->srs_lock);
+ ADD_SOFTRING_TO_SET(mac_srs, ringp);
+ mutex_exit(&mac_srs->srs_lock);
+
+ /*
+ * Set the bind CPU to -1 to indicate that
+ * no thread affinity is set.
+ */
+ ringp->s_ring_cpuid = ringp->s_ring_cpuid_save = -1;
+ ringp->s_ring_worker = thread_create(NULL, 0,
+ mac_soft_ring_worker, ringp, 0, &p0, TS_RUN, pri);
+ if (type & ST_RING_TX) {
+ ringp->s_ring_drain_func = mac_tx_soft_ring_drain;
+ ringp->s_ring_tx_arg1 = x_arg1;
+ ringp->s_ring_tx_arg2 = x_arg2;
+ ringp->s_ring_tx_max_q_cnt = mac_tx_soft_ring_max_q_cnt;
+ ringp->s_ring_tx_hiwat =
+ (mac_tx_soft_ring_hiwat > mac_tx_soft_ring_max_q_cnt) ?
+ mac_tx_soft_ring_max_q_cnt : mac_tx_soft_ring_hiwat;
+ } else {
+ ringp->s_ring_drain_func = mac_rx_soft_ring_drain;
+ ringp->s_ring_rx_func = rx_func;
+ ringp->s_ring_rx_arg1 = x_arg1;
+ ringp->s_ring_rx_arg2 = x_arg2;
+ }
+ if (cpuid != -1)
+ (void) mac_soft_ring_bind(ringp, cpuid);
+
+ return (ringp);
+}
+
+/*
+ * mac_soft_ring_free
+ *
+ * Free the soft ring once we are done with it.
+ */
+void
+mac_soft_ring_free(mac_soft_ring_t *softring, boolean_t release_tx_ring)
+{
+ ASSERT((softring->s_ring_state &
+ (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) ==
+ (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE));
+ mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE);
+ if (release_tx_ring && softring->s_ring_tx_arg2 != NULL) {
+ ASSERT(softring->s_ring_type & ST_RING_TX);
+ mac_release_tx_ring(softring->s_ring_tx_arg2);
+ }
+ if (softring->s_ring_ksp)
+ kstat_delete(softring->s_ring_ksp);
+ mac_callback_free(softring->s_ring_notify_cb_list);
+ kmem_cache_free(mac_soft_ring_cache, softring);
+}
+
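+/*
+ * Tunable: set mac_soft_ring_thread_bind to 0 to prevent soft ring
+ * worker threads from being bound to CPUs.
+ */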
+int mac_soft_ring_thread_bind = 1;
+
+/*
+ * mac_soft_ring_bind
+ *
+ * Bind a soft ring worker thread to the supplied CPU.
+ */
+cpu_t *
+mac_soft_ring_bind(mac_soft_ring_t *ringp, processorid_t cpuid)
+{
+ cpu_t *cp;
+ boolean_t clear = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ if (mac_soft_ring_thread_bind == 0) {
+ DTRACE_PROBE1(mac__soft__ring__no__cpu__bound,
+ mac_soft_ring_t *, ringp);
+ return (NULL);
+ }
+
+ cp = cpu_get(cpuid);
+ if (cp == NULL || !cpu_is_online(cp))
+ return (NULL);
+
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_state |= S_RING_BOUND;
+ if (ringp->s_ring_cpuid != -1)
+ clear = B_TRUE;
+ ringp->s_ring_cpuid = cpuid;
+ mutex_exit(&ringp->s_ring_lock);
+
+ if (clear)
+ thread_affinity_clear(ringp->s_ring_worker);
+
+ DTRACE_PROBE2(mac__soft__ring__cpu__bound, mac_soft_ring_t *,
+ ringp, processorid_t, cpuid);
+
+ thread_affinity_set(ringp->s_ring_worker, cpuid);
+
+ return (cp);
+}
+
+/*
+ * mac_soft_ring_unbind
+ *
+ * Unbind a soft ring worker thread.
+ */
+void
+mac_soft_ring_unbind(mac_soft_ring_t *ringp)
+{
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (!(ringp->s_ring_state & S_RING_BOUND)) {
+ ASSERT(ringp->s_ring_cpuid == -1);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ ringp->s_ring_cpuid = -1;
+ ringp->s_ring_state &= ~S_RING_BOUND;
+ thread_affinity_clear(ringp->s_ring_worker);
+ mutex_exit(&ringp->s_ring_lock);
+}
+
+/*
+ * PRIVATE FUNCTIONS
+ */
+
+static void
+mac_soft_ring_fire(void *arg)
+{
+ mac_soft_ring_t *ringp = arg;
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_tid == 0) {
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ ringp->s_ring_tid = 0;
+
+ if (!(ringp->s_ring_state & S_RING_PROC)) {
+ cv_signal(&ringp->s_ring_async);
+ }
+ mutex_exit(&ringp->s_ring_lock);
+}
+
+/*
+ * mac_rx_soft_ring_drain
+ *
+ * Called when the worker thread model (ST_RING_WORKER_ONLY) of
+ * processing incoming packets is used. s_ring_first contains the
+ * queued packets. s_ring_rx_func contains the upper level (client)
+ * routine to which the packets are destined, and
+ * s_ring_rx_arg1/s_ring_rx_arg2 are the cookies meant for the client.
+ */
+/* ARGSUSED */
+static void
+mac_rx_soft_ring_drain(mac_soft_ring_t *ringp)
+{
+ mblk_t *mp;
+ void *arg1;
+ mac_resource_handle_t arg2;
+ timeout_id_t tid;
+ mac_direct_rx_t proc;
+ size_t sz;
+ int cnt;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+ ringp->s_ring_run = curthread;
+ ASSERT(mutex_owned(&ringp->s_ring_lock));
+ ASSERT(!(ringp->s_ring_state & S_RING_PROC));
+
+ if ((tid = ringp->s_ring_tid) != 0)
+ ringp->s_ring_tid = 0;
+
+ ringp->s_ring_state |= S_RING_PROC;
+
+ proc = ringp->s_ring_rx_func;
+ arg1 = ringp->s_ring_rx_arg1;
+ arg2 = ringp->s_ring_rx_arg2;
+
+ while ((ringp->s_ring_first != NULL) &&
+ !(ringp->s_ring_state & S_RING_PAUSE)) {
+ mp = ringp->s_ring_first;
+ ringp->s_ring_first = NULL;
+ ringp->s_ring_last = NULL;
+ cnt = ringp->s_ring_count;
+ ringp->s_ring_count = 0;
+ sz = ringp->s_ring_size;
+ ringp->s_ring_size = 0;
+ mutex_exit(&ringp->s_ring_lock);
+
+ if (tid != 0) {
+ (void) untimeout(tid);
+ tid = 0;
+ }
+
+ (*proc)(arg1, arg2, mp, NULL);
+
+ /*
+ * If we have a soft ring set which is doing
+ * bandwidth control, we need to decrement its
+ * srs_size so it can have an accurate idea of
+ * how much data is really queued between the SRS
+ * and its soft rings. We decrement the size for a
+ * packet only when it gets processed by both the
+ * SRS and the soft ring.
+ */
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+ MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+ mutex_exit(&mac_srs->srs_lock);
+
+ mutex_enter(&ringp->s_ring_lock);
+ }
+ ringp->s_ring_state &= ~S_RING_PROC;
+ if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
+ cv_signal(&ringp->s_ring_client_cv);
+ ringp->s_ring_run = NULL;
+}
+
+/*
+ * mac_soft_ring_worker
+ *
+ * The soft ring worker routine to process any queued packets. In
+ * the normal case, the worker thread is bound to a CPU. If the soft
+ * ring is dealing with TCP packets, then the worker thread will
+ * be bound to the same CPU as the TCP squeue.
+ */
+static void
+mac_soft_ring_worker(mac_soft_ring_t *ringp)
+{
+ kmutex_t *lock = &ringp->s_ring_lock;
+ kcondvar_t *async = &ringp->s_ring_async;
+ mac_soft_ring_set_t *srs = ringp->s_ring_set;
+ callb_cpr_t cprinfo;
+
+ CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_soft_ring");
+ mutex_enter(lock);
+start:
+ for (;;) {
+ while (((ringp->s_ring_first == NULL ||
+ (ringp->s_ring_state & S_RING_BLOCK)) &&
+ !(ringp->s_ring_state & S_RING_PAUSE)) ||
+ (ringp->s_ring_state & S_RING_PROC)) {
+
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(async, lock);
+ CALLB_CPR_SAFE_END(&cprinfo, lock);
+ }
+
+ /*
+ * Either we have work to do, or we have been asked to
+ * shut down temporarily or permanently.
+ */
+ if (ringp->s_ring_state & S_RING_PAUSE)
+ goto done;
+
+ ringp->s_ring_drain_func(ringp);
+ }
+done:
+ mutex_exit(lock);
+ mutex_enter(&srs->srs_lock);
+ mutex_enter(lock);
+
+ ringp->s_ring_state |= S_RING_QUIESCE_DONE;
+ if (!(ringp->s_ring_state & S_RING_CONDEMNED)) {
+ srs->srs_soft_ring_quiesced_count++;
+ cv_broadcast(&srs->srs_async);
+ mutex_exit(&srs->srs_lock);
+ while (!(ringp->s_ring_state &
+ (S_RING_RESTART | S_RING_CONDEMNED)))
+ cv_wait(&ringp->s_ring_async, &ringp->s_ring_lock);
+ mutex_exit(lock);
+ mutex_enter(&srs->srs_lock);
+ mutex_enter(lock);
+ srs->srs_soft_ring_quiesced_count--;
+ if (ringp->s_ring_state & S_RING_RESTART) {
+ ASSERT(!(ringp->s_ring_state & S_RING_CONDEMNED));
+ ringp->s_ring_state &= ~(S_RING_RESTART |
+ S_RING_QUIESCE | S_RING_QUIESCE_DONE);
+ cv_broadcast(&srs->srs_async);
+ mutex_exit(&srs->srs_lock);
+ goto start;
+ }
+ }
+ ASSERT(ringp->s_ring_state & S_RING_CONDEMNED);
+ ringp->s_ring_state |= S_RING_CONDEMNED_DONE;
+ CALLB_CPR_EXIT(&cprinfo);
+ srs->srs_soft_ring_condemned_count++;
+ cv_broadcast(&srs->srs_async);
+ mutex_exit(&srs->srs_lock);
+ thread_exit();
+}
+
+/*
+ * mac_soft_ring_intr_enable and mac_soft_ring_intr_disable
+ *
+ * These functions are called by the client to toggle the sending of
+ * packets to it. The client gets the names of these routines and the
+ * corresponding cookie (pointing to the softring) during capability
+ * negotiation at setup time.
+ *
+ * Enabling allows the processing thread to send packets to the
+ * client, while disabling does the opposite.
+ */
+void
+mac_soft_ring_intr_enable(void *arg)
+{
+ mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg;
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_state &= ~S_RING_BLANK;
+ if (ringp->s_ring_first != NULL)
+ mac_soft_ring_worker_wakeup(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+}
+
+void
+mac_soft_ring_intr_disable(void *arg)
+{
+ mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg;
+ /*
+ * Stop the worker thread from sending packets upstream.
+ * The squeue will poll the soft ring when it needs packets.
+ */
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_state |= S_RING_BLANK;
+ mutex_exit(&ringp->s_ring_lock);
+}
+
+/*
+ * mac_soft_ring_poll
+ *
+ * This routine is called by the client to poll for packets from
+ * the soft ring. The function name and the cookie corresponding to
+ * the soft ring are exchanged during capability negotiation at
+ * setup time.
+ */
+mblk_t *
+mac_soft_ring_poll(mac_soft_ring_t *ringp, int bytes_to_pickup)
+{
+ mblk_t *head, *tail;
+ mblk_t *mp;
+ size_t sz = 0;
+ int cnt = 0;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+ ASSERT(mac_srs != NULL);
+
+ mutex_enter(&ringp->s_ring_lock);
+ head = tail = mp = ringp->s_ring_first;
+ if (head == NULL) {
+ mutex_exit(&ringp->s_ring_lock);
+ return (NULL);
+ }
+
+ if (ringp->s_ring_size <= bytes_to_pickup) {
+ head = ringp->s_ring_first;
+ ringp->s_ring_first = NULL;
+ ringp->s_ring_last = NULL;
+ cnt = ringp->s_ring_count;
+ ringp->s_ring_count = 0;
+ sz = ringp->s_ring_size;
+ ringp->s_ring_size = 0;
+ } else {
+ while (mp && sz <= bytes_to_pickup) {
+ sz += msgdsize(mp);
+ cnt++;
+ tail = mp;
+ mp = mp->b_next;
+ }
+ ringp->s_ring_count -= cnt;
+ ringp->s_ring_size -= sz;
+ tail->b_next = NULL;
+ if (mp == NULL) {
+ ringp->s_ring_first = NULL;
+ ringp->s_ring_last = NULL;
+ ASSERT(ringp->s_ring_count == 0);
+ } else {
+ ringp->s_ring_first = mp;
+ }
+ }
+
+ mutex_exit(&ringp->s_ring_lock);
+ /*
+ * Update the shared count and size counters so
+ * that the SRS has an accurate idea of the queued packets.
+ */
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+ MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+ mutex_exit(&mac_srs->srs_lock);
+ return (head);
+}
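+
+/*
+ * Illustrative sketch of how a client might drive the poll protocol
+ * with the routines above (the caller below is hypothetical; the
+ * entry points and the softring cookie are exchanged during
+ * capability negotiation). The client first blanks the ring, then
+ * polls it dry, then re-enables asynchronous delivery:
+ *
+ *	mac_soft_ring_intr_disable(cookie);
+ *	while ((chain = mac_soft_ring_poll(cookie, bytes)) != NULL)
+ *		consume(chain);
+ *	mac_soft_ring_intr_enable(cookie);
+ */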
+
+/*
+ * mac_soft_ring_dls_bypass
+ *
+ * Enable the direct client (IP) callback function from the softrings.
+ * Callers need to make sure they don't need any DLS layer processing.
+ */
+void
+mac_soft_ring_dls_bypass(void *arg, mac_direct_rx_t rx_func, void *rx_arg1)
+{
+ mac_soft_ring_t *softring = arg;
+ mac_soft_ring_set_t *srs;
+
+ ASSERT(rx_func != NULL);
+
+ mutex_enter(&softring->s_ring_lock);
+ softring->s_ring_rx_func = rx_func;
+ softring->s_ring_rx_arg1 = rx_arg1;
+ mutex_exit(&softring->s_ring_lock);
+
+ srs = softring->s_ring_set;
+ mutex_enter(&srs->srs_lock);
+ srs->srs_type |= SRST_DLS_BYPASS;
+ mutex_exit(&srs->srs_lock);
+}
+
+/*
+ * mac_soft_ring_signal
+ *
+ * Typically used to set the soft ring state to QUIESCE, CONDEMNED, or
+ * RESTART.
+ *
+ * On the Rx side, the quiescing is done bottom up. After the Rx upcalls
+ * from the driver are done, the Rx SRS is quiesced and only then can
+ * we signal the soft rings. Thus this function can't be called arbitrarily
+ * without satisfying the prerequisites. On the Tx side, the threads from
+ * the top need to be quiesced, then the Tx SRS, and only then can we
+ * signal the Tx soft rings.
+ */
+void
+mac_soft_ring_signal(mac_soft_ring_t *softring, uint_t sr_flag)
+{
+ mutex_enter(&softring->s_ring_lock);
+ softring->s_ring_state |= sr_flag;
+ cv_signal(&softring->s_ring_async);
+ mutex_exit(&softring->s_ring_lock);
+}
+
+/*
+ * mac_tx_soft_ring_drain
+ *
+ * The transmit side drain routine in case the soft ring was being
+ * used to transmit packets.
+ */
+static void
+mac_tx_soft_ring_drain(mac_soft_ring_t *ringp)
+{
+ mblk_t *mp;
+ void *arg1;
+ void *arg2;
+ mblk_t *tail;
+ uint_t saved_pkt_count, saved_size;
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+ saved_pkt_count = saved_size = 0;
+ ringp->s_ring_run = curthread;
+ ASSERT(mutex_owned(&ringp->s_ring_lock));
+ ASSERT(!(ringp->s_ring_state & S_RING_PROC));
+
+ ringp->s_ring_state |= S_RING_PROC;
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+ arg1 = ringp->s_ring_tx_arg1;
+ arg2 = ringp->s_ring_tx_arg2;
+
+ while (ringp->s_ring_first != NULL) {
+ mp = ringp->s_ring_first;
+ tail = ringp->s_ring_last;
+ saved_pkt_count = ringp->s_ring_count;
+ saved_size = ringp->s_ring_size;
+ ringp->s_ring_first = NULL;
+ ringp->s_ring_last = NULL;
+ ringp->s_ring_count = 0;
+ ringp->s_ring_size = 0;
+ mutex_exit(&ringp->s_ring_lock);
+
+ mp = mac_tx_send(arg1, arg2, mp, &stats);
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (mp != NULL) {
+ /* Device out of tx desc, set block */
+ tail->b_next = ringp->s_ring_first;
+ ringp->s_ring_first = mp;
+ ringp->s_ring_count +=
+ (saved_pkt_count - stats.ts_opackets);
+ ringp->s_ring_size += (saved_size - stats.ts_obytes);
+ if (ringp->s_ring_last == NULL)
+ ringp->s_ring_last = tail;
+
+ if (ringp->s_ring_tx_woken_up) {
+ ringp->s_ring_tx_woken_up = B_FALSE;
+ } else {
+ ringp->s_ring_state |= S_RING_BLOCK;
+ ringp->s_ring_blocked_cnt++;
+ }
+
+ ringp->s_ring_state &= ~S_RING_PROC;
+ ringp->s_ring_run = NULL;
+ return;
+ } else {
+ ringp->s_ring_tx_woken_up = B_FALSE;
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(
+ mac_srs->srs_flent, &stats);
+ }
+ }
+ }
+
+ if (ringp->s_ring_count == 0 && ringp->s_ring_state &
+ (S_RING_TX_HIWAT | S_RING_WAKEUP_CLIENT | S_RING_ENQUEUED)) {
+ mac_tx_notify_cb_t *mtnfp;
+ mac_cb_t *mcb;
+ mac_client_impl_t *mcip = ringp->s_ring_mcip;
+ boolean_t wakeup_required = B_FALSE;
+
+ if (ringp->s_ring_state &
+ (S_RING_TX_HIWAT|S_RING_WAKEUP_CLIENT)) {
+ wakeup_required = B_TRUE;
+ }
+ ringp->s_ring_state &=
+ ~(S_RING_TX_HIWAT | S_RING_WAKEUP_CLIENT | S_RING_ENQUEUED);
+ mutex_exit(&ringp->s_ring_lock);
+ if (wakeup_required) {
+ /* Wakeup callback registered clients */
+ MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
+ for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
+ mcb = mcb->mcb_nextp) {
+ mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
+ mtnfp->mtnf_fn(mtnfp->mtnf_arg,
+ (mac_tx_cookie_t)ringp);
+ }
+ MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
+ &mcip->mci_tx_notify_cb_list);
+ /*
+ * If the client is not the primary MAC client, then we
+ * need to send the notification to the client's upper
+ * MAC, i.e. mci_upper_mip.
+ */
+ mac_tx_notify(mcip->mci_upper_mip != NULL ?
+ mcip->mci_upper_mip : mcip->mci_mip);
+ }
+ mutex_enter(&ringp->s_ring_lock);
+ }
+ ringp->s_ring_state &= ~S_RING_PROC;
+ ringp->s_ring_run = NULL;
+}
diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c
new file mode 100644
index 0000000000..1615060736
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_util.c
@@ -0,0 +1,823 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * MAC Services Module - misc utilities
+ */
+
+#include <sys/types.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <sys/pattr.h>
+#include <sys/pci_tools.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <sys/vtrace.h>
+#include <sys/dlpi.h>
+#include <sys/sunndi.h>
+
+/*
+ * Copy an mblk, preserving its hardware checksum flags.
+ */
+static mblk_t *
+mac_copymsg_cksum(mblk_t *mp)
+{
+ mblk_t *mp1;
+ uint32_t start, stuff, end, value, flags;
+
+ mp1 = copymsg(mp);
+ if (mp1 == NULL)
+ return (NULL);
+
+ hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
+ (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
+ flags, KM_NOSLEEP);
+
+ return (mp1);
+}
+
+/*
+ * Copy an mblk chain, preserving the hardware checksum flags of the
+ * individual mblks.
+ */
+mblk_t *
+mac_copymsgchain_cksum(mblk_t *mp)
+{
+ mblk_t *nmp = NULL;
+ mblk_t **nmpp = &nmp;
+
+ for (; mp != NULL; mp = mp->b_next) {
+ if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
+ freemsgchain(nmp);
+ return (NULL);
+ }
+
+ nmpp = &((*nmpp)->b_next);
+ }
+
+ return (nmp);
+}
+
+/*
+ * Process the specified mblk chain for proper handling of hardware
+ * checksum offload. This routine is invoked for loopback traffic
+ * between MAC clients.
+ * The function handles a NULL mblk chain passed as an argument.
+ */
+mblk_t *
+mac_fix_cksum(mblk_t *mp_chain)
+{
+ mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
+ uint32_t flags, start, stuff, end, value;
+
+ for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
+ uint16_t len;
+ uint32_t offset;
+ struct ether_header *ehp;
+ uint16_t sap;
+
+ hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
+ &flags);
+ if (flags == 0)
+ continue;
+
+ /*
+ * Since the processing of checksum offload for loopback
+ * traffic requires modification of the packet contents,
+ * ensure that we are always modifying our own copy.
+ */
+ if (DB_REF(mp) > 1) {
+ mp1 = copymsg(mp);
+ if (mp1 == NULL)
+ continue;
+ mp1->b_next = mp->b_next;
+ mp->b_next = NULL;
+ freemsg(mp);
+ if (prev != NULL)
+ prev->b_next = mp1;
+ else
+ new_chain = mp1;
+ mp = mp1;
+ }
+
+ /*
+ * Ethernet, and optionally VLAN header.
+ */
+ /* LINTED: improper alignment cast */
+ ehp = (struct ether_header *)mp->b_rptr;
+ if (ntohs(ehp->ether_type) == VLAN_TPID) {
+ struct ether_vlan_header *evhp;
+
+ ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
+ /* LINTED: improper alignment cast */
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ sap = ntohs(evhp->ether_type);
+ offset = sizeof (struct ether_vlan_header);
+ } else {
+ sap = ntohs(ehp->ether_type);
+ offset = sizeof (struct ether_header);
+ }
+
+ if (MBLKL(mp) <= offset) {
+ offset -= MBLKL(mp);
+ if (mp->b_cont == NULL) {
+ /* corrupted packet, skip it */
+ if (prev != NULL)
+ prev->b_next = mp->b_next;
+ else
+ new_chain = mp->b_next;
+ mp1 = mp->b_next;
+ mp->b_next = NULL;
+ freemsg(mp);
+ mp = mp1;
+ continue;
+ }
+ mp = mp->b_cont;
+ }
+
+ if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
+ ipha_t *ipha = NULL;
+
+ /*
+ * In order to compute the full and header
+ * checksums, we need to find and parse
+ * the IP and/or ULP headers.
+ */
+
+ sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
+
+ /*
+ * IP header.
+ */
+ if (sap != ETHERTYPE_IP)
+ continue;
+
+ ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
+ /* LINTED: improper alignment cast */
+ ipha = (ipha_t *)(mp->b_rptr + offset);
+
+ if (flags & HCK_FULLCKSUM) {
+ ipaddr_t src, dst;
+ uint32_t cksum;
+ uint16_t *up;
+ uint8_t proto;
+
+ /*
+ * Pointer to checksum field in ULP header.
+ */
+ proto = ipha->ipha_protocol;
+ ASSERT(ipha->ipha_version_and_hdr_length ==
+ IP_SIMPLE_HDR_VERSION);
+ if (proto == IPPROTO_TCP) {
+ /* LINTED: improper alignment cast */
+ up = IPH_TCPH_CHECKSUMP(ipha,
+ IP_SIMPLE_HDR_LENGTH);
+ } else {
+ ASSERT(proto == IPPROTO_UDP);
+ /* LINTED: improper alignment cast */
+ up = IPH_UDPH_CHECKSUMP(ipha,
+ IP_SIMPLE_HDR_LENGTH);
+ }
+
+ /*
+ * Pseudo-header checksum.
+ */
+ src = ipha->ipha_src;
+ dst = ipha->ipha_dst;
+ len = ntohs(ipha->ipha_length) -
+ IP_SIMPLE_HDR_LENGTH;
+
+ cksum = (dst >> 16) + (dst & 0xFFFF) +
+ (src >> 16) + (src & 0xFFFF);
+ cksum += htons(len);
+
+ /*
+ * The checksum value stored in the packet needs
+ * to be correct. Compute it here.
+ */
+ *up = 0;
+ cksum += (((proto) == IPPROTO_UDP) ?
+ IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
+ cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
+ offset, cksum);
+ *(up) = (uint16_t)(cksum ? cksum : ~cksum);
+
+ flags |= HCK_FULLCKSUM_OK;
+ value = 0xffff;
+ }
+
+ if (flags & HCK_IPV4_HDRCKSUM) {
+ ASSERT(ipha != NULL);
+ ipha->ipha_hdr_checksum =
+ (uint16_t)ip_csum_hdr(ipha);
+ }
+ }
+
+ if (flags & HCK_PARTIALCKSUM) {
+ uint16_t *up, partial, cksum;
+ uchar_t *ipp; /* ptr to beginning of IP header */
+
+ if (mp->b_cont != NULL) {
+ mblk_t *mp1;
+
+ mp1 = msgpullup(mp, offset + end);
+ if (mp1 == NULL)
+ continue;
+ mp1->b_next = mp->b_next;
+ mp->b_next = NULL;
+ freemsg(mp);
+ if (prev != NULL)
+ prev->b_next = mp1;
+ else
+ new_chain = mp1;
+ mp = mp1;
+ }
+
+ ipp = mp->b_rptr + offset;
+ /* LINTED: cast may result in improper alignment */
+ up = (uint16_t *)((uchar_t *)ipp + stuff);
+ partial = *up;
+ *up = 0;
+
+ cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
+ end - start, partial);
+ cksum = ~cksum;
+ *up = cksum ? cksum : ~cksum;
+
+ /*
+ * Since we already computed the whole checksum,
+ * indicate to the stack that it has already
+ * been verified by the hardware.
+ */
+ flags &= ~HCK_PARTIALCKSUM;
+ flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
+ value = 0xffff;
+ }
+
+ (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
+ value, flags, KM_NOSLEEP);
+ }
+
+ return (new_chain);
+}
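+
+/*
+ * Note on the HCK_FULLCKSUM path above: the pseudo-header sum folds in
+ * the 16-bit halves of the source and destination addresses, the ULP
+ * length and a protocol constant (IP_TCP_CSUM_COMP or
+ * IP_UDP_CSUM_COMP); IP_CSUM() then adds in the ULP bytes and reduces
+ * the carries to a 16-bit one's-complement result, with a result of
+ * zero stored as 0xffff.
+ */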
+
+/*
+ * Add VLAN tag to the specified mblk.
+ */
+mblk_t *
+mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
+{
+ mblk_t *hmp;
+ struct ether_vlan_header *evhp;
+ struct ether_header *ehp;
+ uint32_t start, stuff, end, value, flags;
+
+ ASSERT(pri != 0 || vid != 0);
+
+ /*
+ * Allocate an mblk for the new tagged ethernet header,
+ * and copy the MAC addresses and ethertype from the
+ * original header.
+ */
+
+ hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
+ if (hmp == NULL) {
+ freemsg(mp);
+ return (NULL);
+ }
+
+ evhp = (struct ether_vlan_header *)hmp->b_rptr;
+ ehp = (struct ether_header *)mp->b_rptr;
+
+ bcopy(ehp, evhp, (ETHERADDRL * 2));
+ evhp->ether_type = ehp->ether_type;
+ evhp->ether_tpid = htons(ETHERTYPE_VLAN);
+
+ hmp->b_wptr += sizeof (struct ether_vlan_header);
+ mp->b_rptr += sizeof (struct ether_header);
+
+ /*
+ * Free the original message if it's now empty. Link the
+ * rest of the messages to the header message.
+ */
+ hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
+ (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
+ KM_NOSLEEP);
+ if (MBLKL(mp) == 0) {
+ hmp->b_cont = mp->b_cont;
+ freeb(mp);
+ } else {
+ hmp->b_cont = mp;
+ }
+ ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
+
+ /*
+ * Initialize the new TCI (Tag Control Information).
+ */
+ evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
+
+ return (hmp);
+}
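+
+/*
+ * Note: VLAN_TCI() packs the 16-bit Tag Control Information as 3 bits
+ * of priority, 1 CFI bit and a 12-bit VID. For example, pri 5 and
+ * vid 100 yield a TCI of (5 << 13) | 100, i.e. 0xa064, stored in
+ * network byte order via htons().
+ */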
+
+/*
+ * Adds a VLAN tag with the specified VID and priority to each mblk of
+ * the specified chain.
+ */
+mblk_t *
+mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
+{
+ mblk_t *next_mp, **prev, *mp;
+
+ mp = mp_chain;
+ prev = &mp_chain;
+
+ while (mp != NULL) {
+ next_mp = mp->b_next;
+ mp->b_next = NULL;
+ if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
+ freemsgchain(next_mp);
+ break;
+ }
+ *prev = mp;
+ prev = &mp->b_next;
+ mp = mp->b_next = next_mp;
+ }
+
+ return (mp_chain);
+}
+
+/*
+ * Strip VLAN tag
+ */
+mblk_t *
+mac_strip_vlan_tag(mblk_t *mp)
+{
+ mblk_t *newmp;
+ struct ether_vlan_header *evhp;
+
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
+ ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
+
+ if (DB_REF(mp) > 1) {
+ newmp = copymsg(mp);
+ if (newmp == NULL)
+ return (NULL);
+ freemsg(mp);
+ mp = newmp;
+ }
+
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+
+ ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
+ mp->b_rptr += VLAN_TAGSZ;
+ }
+ return (mp);
+}
+
+/*
+ * Strip VLAN tag from each mblk of the chain.
+ */
+mblk_t *
+mac_strip_vlan_tag_chain(mblk_t *mp_chain)
+{
+ mblk_t *mp, *next_mp, **prev;
+
+ mp = mp_chain;
+ prev = &mp_chain;
+
+ while (mp != NULL) {
+ next_mp = mp->b_next;
+ mp->b_next = NULL;
+ if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
+ freemsgchain(next_mp);
+ break;
+ }
+ *prev = mp;
+ prev = &mp->b_next;
+ mp = mp->b_next = next_mp;
+ }
+
+ return (mp_chain);
+}
+
+/*
+ * Default callback function. Used when the datapath is not yet initialized.
+ */
+/* ARGSUSED */
+void
+mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
+ boolean_t loopback)
+{
+ mblk_t *mp1 = mp;
+
+ while (mp1 != NULL) {
+ mp1->b_prev = NULL;
+ mp1->b_queue = NULL;
+ mp1 = mp1->b_next;
+ }
+ freemsgchain(mp);
+}
+
+/*
+ * Determines the IPv6 header length accounting for all the optional IPv6
+ * headers (hop-by-hop, destination, routing and fragment). The header length
+ * and next header value (a transport header) are captured.
+ *
+ * Returns B_FALSE if all the IP headers are not in the same mblk;
+ * otherwise returns B_TRUE.
+ */
+boolean_t
+mac_ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length,
+ uint8_t *next_hdr)
+{
+ uint16_t length;
+ uint_t ehdrlen;
+ uint8_t *whereptr;
+ uint8_t *endptr;
+ uint8_t *nexthdrp;
+ ip6_dest_t *desthdr;
+ ip6_rthdr_t *rthdr;
+ ip6_frag_t *fraghdr;
+
+ endptr = mp->b_wptr;
+ if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
+ return (B_FALSE);
+ ASSERT((IPH_HDR_VERSION(ip6h) & ~IP_FORWARD_PROG_BIT) == IPV6_VERSION);
+ length = IPV6_HDR_LEN;
+ whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
+
+ nexthdrp = &ip6h->ip6_nxt;
+ while (whereptr < endptr) {
+ /* Is there enough left for len + nexthdr? */
+ if (whereptr + MIN_EHDR_LEN > endptr)
+ break;
+
+ switch (*nexthdrp) {
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_DSTOPTS:
+ /* Assumes the headers are identical for hbh and dst */
+ desthdr = (ip6_dest_t *)whereptr;
+ ehdrlen = 8 * (desthdr->ip6d_len + 1);
+ if ((uchar_t *)desthdr + ehdrlen > endptr)
+ return (B_FALSE);
+ nexthdrp = &desthdr->ip6d_nxt;
+ break;
+ case IPPROTO_ROUTING:
+ rthdr = (ip6_rthdr_t *)whereptr;
+ ehdrlen = 8 * (rthdr->ip6r_len + 1);
+ if ((uchar_t *)rthdr + ehdrlen > endptr)
+ return (B_FALSE);
+ nexthdrp = &rthdr->ip6r_nxt;
+ break;
+ case IPPROTO_FRAGMENT:
+ fraghdr = (ip6_frag_t *)whereptr;
+ ehdrlen = sizeof (ip6_frag_t);
+ if ((uchar_t *)&fraghdr[1] > endptr)
+ return (B_FALSE);
+ nexthdrp = &fraghdr->ip6f_nxt;
+ break;
+ case IPPROTO_NONE:
+ /* No next header means we're finished */
+ default:
+ *hdr_length = length;
+ *next_hdr = *nexthdrp;
+ return (B_TRUE);
+ }
+ length += ehdrlen;
+ whereptr += ehdrlen;
+ *hdr_length = length;
+ *next_hdr = *nexthdrp;
+ }
+ switch (*nexthdrp) {
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_FRAGMENT:
+ /*
+ * If any known extension headers are still to be processed,
+ * the packet is malformed (or at least all the IP headers are
+ * not in the same mblk, and that should never happen).
+ */
+ return (B_FALSE);
+
+ default:
+ /*
+ * If we get here, we know that all of the IP headers were in
+ * the same mblk, even if the ULP header is in the next mblk.
+ */
+ *hdr_length = length;
+ *next_hdr = *nexthdrp;
+ return (B_TRUE);
+ }
+}
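+
+/*
+ * Illustrative sketch of a hypothetical caller of
+ * mac_ip_hdr_length_v6():
+ *
+ *	uint16_t hdrlen;
+ *	uint8_t nexthdr;
+ *
+ *	if (mac_ip_hdr_length_v6(mp, ip6h, &hdrlen, &nexthdr))
+ *		process_ulp(mp, hdrlen, nexthdr);
+ *
+ * where, on success, the transport header starts hdrlen bytes into
+ * the IPv6 header and nexthdr holds the ULP (e.g. IPPROTO_TCP);
+ * process_ulp() here is hypothetical.
+ */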
+
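+/*
+ * State used when retargeting a device interrupt: the interrupt
+ * number (ino), the CPU it is bound to, and the /devices paths of the
+ * NIC and of the nexus exporting the interrupt-control minor node.
+ */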
+typedef struct mac_dladm_intr {
+ int ino;
+ int cpu_id;
+ char driver_path[MAXPATHLEN];
+ char nexus_path[MAXPATHLEN];
+} mac_dladm_intr_t;
+
+/* Bind the interrupt to cpu_num */
+static int
+mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int ino)
+{
+ pcitool_intr_set_t iset;
+ int err;
+
+ iset.ino = ino;
+ iset.cpu_id = cpu_num;
+ iset.user_version = PCITOOL_VERSION;
+ err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
+ kcred, NULL);
+
+ return (err);
+}
+
+/*
+ * Search the interrupt information; iget_p is filled in with the info to match.
+ */
+static boolean_t
+mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
+{
+ int i;
+ char driver_path[2 * MAXPATHLEN];
+
+ for (i = 0; i < iget_p->num_devs; i++) {
+ (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
+ (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
+ ":%s%d", iget_p->dev[i].driver_name,
+ iget_p->dev[i].dev_inst);
+ /* Match the device path against the one we are searching for */
+ if (strcmp(driver_path, dln->driver_path) == 0) {
+ dln->ino = iget_p->ino;
+ dln->cpu_id = iget_p->cpu_id;
+ return (B_TRUE);
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Get information about ino, i.e. whether this is the interrupt for
+ * our device, where it is bound, etc.
+ */
+static boolean_t
+mac_get_single_intr(ldi_handle_t lh, int ino, mac_dladm_intr_t *dln)
+{
+ pcitool_intr_get_t *iget_p;
+ int ipsz;
+ int nipsz;
+ int err;
+ uint8_t inum;
+
+ /*
+ * Check if SLEEP is OK, i.e. if we could come here in response to
+ * changing the fanout due to some callback from the driver, say
+ * when the link speed changes.
+ */
+ ipsz = PCITOOL_IGET_SIZE(0);
+ iget_p = kmem_zalloc(ipsz, KM_SLEEP);
+
+ iget_p->num_devs_ret = 0;
+ iget_p->user_version = PCITOOL_VERSION;
+ iget_p->ino = ino;
+
+ err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
+ FKIOCTL, kcred, NULL);
+ if (err != 0) {
+ kmem_free(iget_p, ipsz);
+ return (B_FALSE);
+ }
+ if (iget_p->num_devs == 0) {
+ kmem_free(iget_p, ipsz);
+ return (B_FALSE);
+ }
+ inum = iget_p->num_devs;
+ if (iget_p->num_devs_ret < iget_p->num_devs) {
+ /* Reallocate */
+ nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
+
+ kmem_free(iget_p, ipsz);
+ ipsz = nipsz;
+ iget_p = kmem_zalloc(ipsz, KM_SLEEP);
+
+ iget_p->num_devs_ret = inum;
+ iget_p->ino = ino;
+ iget_p->user_version = PCITOOL_VERSION;
+ err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
+ FKIOCTL, kcred, NULL);
+ if (err != 0) {
+ kmem_free(iget_p, ipsz);
+ return (B_FALSE);
+ }
+ /* defensive */
+ if (iget_p->num_devs != iget_p->num_devs_ret) {
+ kmem_free(iget_p, ipsz);
+ return (B_FALSE);
+ }
+ }
+
+ if (mac_search_intrinfo(iget_p, dln)) {
+ kmem_free(iget_p, ipsz);
+ return (B_TRUE);
+ }
+ kmem_free(iget_p, ipsz);
+ return (B_FALSE);
+}
+
+/*
+ * Get the interrupts and check each one to see if it is for our device.
+ * Returns 0 if the device's interrupt is already bound to cpuid, 1 if
+ * it is bound to some other CPU, and -1 if no matching interrupt was
+ * found (or the info ioctl failed).
+ */
+static int
+mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
+{
+ pcitool_intr_info_t intr_info;
+ int err;
+ int ino;
+
+ err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
+ FKIOCTL, kcred, NULL);
+ if (err != 0)
+ return (-1);
+
+ for (ino = 0; ino < intr_info.num_intr; ino++) {
+ if (mac_get_single_intr(lh, ino, dln)) {
+ if (dln->cpu_id == cpuid)
+ return (0);
+ return (1);
+ }
+ }
+ return (-1);
+}
+
+/*
+ * Obtain the nexus parent node info for mdip.
+ */
+static dev_info_t *
+mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
+{
+ struct dev_info *tdip = (struct dev_info *)mdip;
+ struct ddi_minor_data *minordata;
+ int circ;
+ dev_info_t *pdip;
+ char pathname[MAXPATHLEN];
+
+ while (tdip != NULL) {
+ ndi_devi_enter((dev_info_t *)tdip, &circ);
+ for (minordata = tdip->devi_minor; minordata != NULL;
+ minordata = minordata->next) {
+ if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
+ strlen(DDI_NT_INTRCTL)) == 0) {
+ pdip = minordata->dip;
+ (void) ddi_pathname(pdip, pathname);
+ (void) snprintf(dln->nexus_path, MAXPATHLEN,
+ "/devices%s:intr", pathname);
+ (void) ddi_pathname_minor(minordata, pathname);
+ ndi_devi_exit((dev_info_t *)tdip, circ);
+ return (pdip);
+ }
+ }
+ ndi_devi_exit((dev_info_t *)tdip, circ);
+ tdip = tdip->devi_parent;
+ }
+ return (NULL);
+}
+
+/*
+ * For a primary MAC client, if the user has set a list of CPUs or
+ * we have obtained it implicitly, we try to retarget the interrupt
+ * for that device to one of the CPUs in the list.
+ * We assign the interrupt to the same CPU as the poll thread.
+ */
+static boolean_t
+mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
+{
+ ldi_handle_t lh = NULL;
+ ldi_ident_t li = NULL;
+ int err;
+ int ret;
+ mac_dladm_intr_t dln;
+ dev_info_t *dip;
+ struct ddi_minor_data *minordata;
+
+ dln.nexus_path[0] = '\0';
+ dln.driver_path[0] = '\0';
+
+ minordata = ((struct dev_info *)mdip)->devi_minor;
+ while (minordata != NULL) {
+ if (minordata->type == DDM_MINOR)
+ break;
+ minordata = minordata->next;
+ }
+ if (minordata == NULL)
+ return (B_FALSE);
+
+ (void) ddi_pathname_minor(minordata, dln.driver_path);
+
+ dip = mac_get_nexus_node(mdip, &dln);
+ /* defensive */
+ if (dip == NULL)
+ return (B_FALSE);
+
+ err = ldi_ident_from_major(ddi_driver_major(dip), &li);
+ if (err != 0)
+ return (B_FALSE);
+
+ err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
+ if (err != 0)
+ return (B_FALSE);
+
+ ret = mac_validate_intr(lh, &dln, cpuid);
+ if (ret < 0) {
+ (void) ldi_close(lh, FREAD|FWRITE, kcred);
+ return (B_FALSE);
+ }
+ /* cmn_note? */
+ if (ret != 0)
+ if ((err = (mac_set_intr(lh, cpuid, dln.ino))) != 0) {
+ (void) ldi_close(lh, FREAD|FWRITE, kcred);
+ return (B_FALSE);
+ }
+ (void) ldi_close(lh, FREAD|FWRITE, kcred);
+ return (B_TRUE);
+}
+
+void
+mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
+{
+ dev_info_t *mdip = (dev_info_t *)arg;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_resource_props_t *mrp;
+ mac_perim_handle_t mph;
+
+ if (cpuid == -1 || !mac_check_interrupt_binding(mdip, cpuid))
+ return;
+
+ mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
+ mrp = MCIP_RESOURCE_PROPS(mcip);
+ mrp->mrp_intr_cpu = cpuid;
+ mac_perim_exit(mph);
+}
+
+int32_t
+mac_client_intr_cpu(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_cpus_t *srs_cpu;
+ mac_soft_ring_set_t *rx_srs;
+ flow_entry_t *flent = mcip->mci_flent;
+ mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
+
+ /*
+ * Check if we need to retarget the interrupt. We do this only
+ * for the primary MAC client, and only if it owns the only
+ * exclusive ring in the group.
+ */
+ if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
+ rx_srs = flent->fe_rx_srs[1];
+ srs_cpu = &rx_srs->srs_cpu;
+ if (mrp->mrp_intr_cpu == srs_cpu->mc_pollid)
+ return (-1);
+ return (srs_cpu->mc_pollid);
+ }
+ return (-1);
+}
+
+void *
+mac_get_devinfo(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ return ((void *)mip->mi_dip);
+}
diff --git a/usr/src/uts/common/io/mac/plugins/mac_ether.c b/usr/src/uts/common/io/mac/plugins/mac_ether.c
index f4cf08eb66..abaab66add 100644
--- a/usr/src/uts/common/io/mac/plugins/mac_ether.c
+++ b/usr/src/uts/common/io/mac/plugins/mac_ether.c
@@ -30,9 +30,8 @@
#include <sys/types.h>
#include <sys/modctl.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/dld_impl.h>
#include <sys/mac_ether.h>
-#include <sys/dls.h>
#include <sys/ethernet.h>
#include <sys/byteorder.h>
#include <sys/strsun.h>
diff --git a/usr/src/uts/common/io/mac/plugins/mac_wifi.c b/usr/src/uts/common/io/mac/plugins/mac_wifi.c
index 668d7dbda1..fb45c8ef1c 100644
--- a/usr/src/uts/common/io/mac/plugins/mac_wifi.c
+++ b/usr/src/uts/common/io/mac/plugins/mac_wifi.c
@@ -32,9 +32,8 @@
#include <sys/types.h>
#include <sys/modctl.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/dld_impl.h>
#include <sys/mac_wifi.h>
-#include <sys/dls.h>
#include <sys/ethernet.h>
#include <sys/byteorder.h>
#include <sys/strsun.h>
diff --git a/usr/src/uts/common/io/mxfe/mxfe.c b/usr/src/uts/common/io/mxfe/mxfe.c
index 9470ac6b6b..044274acbf 100644
--- a/usr/src/uts/common/io/mxfe/mxfe.c
+++ b/usr/src/uts/common/io/mxfe/mxfe.c
@@ -177,7 +177,6 @@ static mac_callbacks_t mxfe_m_callbacks = {
mxfe_m_multicst,
mxfe_m_unicst,
mxfe_m_tx,
- NULL, /* mc_resources */
NULL, /* mc_ioctl */
NULL, /* mc_getcapab */
NULL, /* mc_open */
diff --git a/usr/src/uts/common/io/mxfe/mxfeimpl.h b/usr/src/uts/common/io/mxfe/mxfeimpl.h
index c1bc8ab265..d5742eeceb 100644
--- a/usr/src/uts/common/io/mxfe/mxfeimpl.h
+++ b/usr/src/uts/common/io/mxfe/mxfeimpl.h
@@ -36,14 +36,14 @@
#ifndef _MXFEIMPL_H
#define _MXFEIMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This entire file is private to the MXFE driver.
*/
#ifdef _KERNEL
+#include <sys/mac_provider.h>
+
/*
* Compile time tunables.
*/
diff --git a/usr/src/uts/common/io/net80211/net80211.c b/usr/src/uts/common/io/net80211/net80211.c
index 4b74943c85..fd49066fcc 100644
--- a/usr/src/uts/common/io/net80211/net80211.c
+++ b/usr/src/uts/common/io/net80211/net80211.c
@@ -35,8 +35,6 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* IEEE 802.11 generic handler
*/
@@ -47,6 +45,7 @@
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/door.h>
+#include <sys/mac_provider.h>
#include "net80211_impl.h"
uint32_t ieee80211_debug = 0x0; /* debug msg flags */
diff --git a/usr/src/uts/common/io/net80211/net80211_input.c b/usr/src/uts/common/io/net80211/net80211_input.c
index ca948788d0..eb95149ea6 100644
--- a/usr/src/uts/common/io/net80211/net80211_input.c
+++ b/usr/src/uts/common/io/net80211/net80211_input.c
@@ -39,6 +39,7 @@
* Process received frame
*/
+#include <sys/mac_provider.h>
#include <sys/byteorder.h>
#include <sys/strsun.h>
#include "net80211_impl.h"
diff --git a/usr/src/uts/common/io/net80211/net80211_ioctl.c b/usr/src/uts/common/io/net80211/net80211_ioctl.c
index 8e905971ff..44935e0979 100644
--- a/usr/src/uts/common/io/net80211/net80211_ioctl.c
+++ b/usr/src/uts/common/io/net80211/net80211_ioctl.c
@@ -41,7 +41,7 @@
#include <inet/nd.h>
#include <inet/mi.h>
#include <sys/note.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <inet/wifi_ioctl.h>
#include "net80211_impl.h"
diff --git a/usr/src/uts/common/io/nge/nge.h b/usr/src/uts/common/io/nge/nge.h
index 430df8b83b..2944c6b820 100644
--- a/usr/src/uts/common/io/nge/nge.h
+++ b/usr/src/uts/common/io/nge/nge.h
@@ -61,7 +61,7 @@ extern "C" {
#include <sys/ddi.h>
#include <sys/sunddi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
/*
diff --git a/usr/src/uts/common/io/nge/nge_main.c b/usr/src/uts/common/io/nge/nge_main.c
index 7ea4165779..f7b22f86e6 100644
--- a/usr/src/uts/common/io/nge/nge_main.c
+++ b/usr/src/uts/common/io/nge/nge_main.c
@@ -196,7 +196,6 @@ static mac_callbacks_t nge_m_callbacks = {
nge_m_multicst,
nge_m_unicst,
nge_m_tx,
- NULL,
nge_m_ioctl,
nge_m_getcapab,
NULL,
@@ -2137,12 +2136,6 @@ nge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
return (B_FALSE);
break;
}
- case MAC_CAPAB_POLL:
- /*
- * There's nothing for us to fill in, simply returning
- * B_TRUE, stating that we support polling is sufficient.
- */
- break;
default:
return (B_FALSE);
}
diff --git a/usr/src/uts/common/io/ntxn/unm_nic.h b/usr/src/uts/common/io/ntxn/unm_nic.h
index 6c8232757f..e23c385ce5 100644
--- a/usr/src/uts/common/io/ntxn/unm_nic.h
+++ b/usr/src/uts/common/io/ntxn/unm_nic.h
@@ -54,7 +54,7 @@
#include <inet/mi.h>
#include <inet/nd.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/miiregs.h> /* by fjlite out of intel */
diff --git a/usr/src/uts/common/io/ntxn/unm_nic_main.c b/usr/src/uts/common/io/ntxn/unm_nic_main.c
index b7e0c5832d..3db781fc8f 100644
--- a/usr/src/uts/common/io/ntxn/unm_nic_main.c
+++ b/usr/src/uts/common/io/ntxn/unm_nic_main.c
@@ -2513,9 +2513,6 @@ ntxn_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM);
}
break;
-
- case MAC_CAPAB_POLL:
- case MAC_CAPAB_MULTIADDRESS:
default:
return (B_FALSE);
}
@@ -2534,7 +2531,6 @@ static mac_callbacks_t ntxn_m_callbacks = {
ntxn_m_multicst,
ntxn_m_unicst,
ntxn_m_tx,
- NULL, /* mc_resources */
ntxn_m_ioctl,
ntxn_m_getcapab,
NULL, /* mc_open */
diff --git a/usr/src/uts/common/io/nxge/nxge_fzc.c b/usr/src/uts/common/io/nxge/nxge_fzc.c
index 91b5712895..3831d77eed 100644
--- a/usr/src/uts/common/io/nxge/nxge_fzc.c
+++ b/usr/src/uts/common/io/nxge/nxge_fzc.c
@@ -942,15 +942,18 @@ nxge_fzc_rdc_tbl_unbind(p_nxge_t nxge, int rdc_tbl)
NXGE_DEBUG_MSG((nxge, DMA_CTL, "==> nxge_fzc_rdc_tbl_unbind(%d)",
rdc_tbl));
+ MUTEX_ENTER(&nhd->lock);
table = &nhd->rdc_tbl[rdc_tbl];
if (table->nxge != (uintptr_t)nxge) {
NXGE_ERROR_MSG((nxge, DMA_CTL,
"nxge_fzc_rdc_tbl_unbind(%d): func%d not owner",
nxge->function_num, rdc_tbl));
+ MUTEX_EXIT(&nhd->lock);
return (EINVAL);
} else {
bzero(table, sizeof (*table));
}
+ MUTEX_EXIT(&nhd->lock);
NXGE_DEBUG_MSG((nxge, DMA_CTL, "<== nxge_fzc_rdc_tbl_unbind(%d)",
rdc_tbl));
diff --git a/usr/src/uts/common/io/nxge/nxge_hcall.s b/usr/src/uts/common/io/nxge/nxge_hcall.s
index c9f82b52df..56c85945b5 100644
--- a/usr/src/uts/common/io/nxge/nxge_hcall.s
+++ b/usr/src/uts/common/io/nxge/nxge_hcall.s
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Hypervisor calls called by niu leaf driver.
*/
@@ -34,6 +32,8 @@
#include <sys/hypervisor_api.h>
#include <sys/nxge/nxge_impl.h>
+#if defined(sun4v)
+
/*
* NIU HV API v1.0 definitions
*/
@@ -518,3 +518,5 @@ hv_niu_vrrx_set_ino(uint32_t cookie, uint64_t vridx, uint32_t ino)
SET_SIZE(hv_niu_vrtx_param_set)
#endif /* lint || __lint */
+
+#endif /*defined(sun4v)*/
diff --git a/usr/src/uts/common/io/nxge/nxge_hio.c b/usr/src/uts/common/io/nxge/nxge_hio.c
index f4aa20706d..2b9a972fec 100644
--- a/usr/src/uts/common/io/nxge/nxge_hio.c
+++ b/usr/src/uts/common/io/nxge/nxge_hio.c
@@ -34,6 +34,7 @@
*
*/
+#include <sys/mac_provider.h>
#include <sys/nxge/nxge_impl.h>
#include <sys/nxge/nxge_fzc.h>
#include <sys/nxge/nxge_rxdma.h>
@@ -49,7 +50,9 @@
extern npi_status_t npi_rxdma_dump_rdc_table(npi_handle_t, uint8_t);
/* The following function may be found in nxge_main.c */
-extern int nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot);
+extern int nxge_m_mmac_remove(void *arg, int slot);
+extern int nxge_m_mmac_add_g(void *arg, const uint8_t *maddr, int rdctbl,
+ boolean_t usetbl);
/* The following function may be found in nxge_[t|r]xdma.c */
extern npi_status_t nxge_txdma_channel_disable(nxge_t *, int);
@@ -129,6 +132,7 @@ int
nxge_hio_init(nxge_t *nxge)
{
nxge_hio_data_t *nhd;
+ int i;
nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
if (nhd == 0) {
@@ -137,6 +141,31 @@ nxge_hio_init(nxge_t *nxge)
nxge->nxge_hw_p->hio = (uintptr_t)nhd;
}
+ /*
+ * Initialize share and ring group structures.
+ */
+ for (i = 0; i < NXGE_MAX_TDCS; i++)
+ nxge->tdc_is_shared[i] = B_FALSE;
+
+ for (i = 0; i < NXGE_MAX_TDC_GROUPS; i++) {
+ nxge->tx_hio_groups[i].ghandle = NULL;
+ nxge->tx_hio_groups[i].nxgep = nxge;
+ nxge->tx_hio_groups[i].type = MAC_RING_TYPE_TX;
+ nxge->tx_hio_groups[i].gindex = 0;
+ nxge->tx_hio_groups[i].sindex = 0;
+ }
+
+ for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) {
+ nxge->rx_hio_groups[i].ghandle = NULL;
+ nxge->rx_hio_groups[i].nxgep = nxge;
+ nxge->rx_hio_groups[i].type = MAC_RING_TYPE_RX;
+ nxge->rx_hio_groups[i].gindex = 0;
+ nxge->rx_hio_groups[i].sindex = 0;
+ nxge->rx_hio_groups[i].started = B_FALSE;
+ nxge->rx_hio_groups[i].rdctbl = -1;
+ nxge->rx_hio_groups[i].n_mac_addrs = 0;
+ }
+
nhd->hio.ldoms = B_FALSE;
return (NXGE_OK);
@@ -400,7 +429,7 @@ nxge_grp_dc_add(
NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_grp_dc_add"));
- if (group == NULL)
+ if (group == 0)
return (0);
switch (type) {
@@ -424,7 +453,6 @@ nxge_grp_dc_add(
default:
NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL,
"nxge_grp_dc_add: unknown type channel(%d)", channel));
- return (NXGE_ERROR);
}
NXGE_DEBUG_MSG((nxge, HIO_CTL,
@@ -540,9 +568,6 @@ nxge_grp_dc_remove(
MUTEX_ENTER(&nhd->lock);
set = dc->type == VP_BOUND_TX ? &nxge->tx_set : &nxge->rx_set;
- if (isLDOMs(nxge) && ((1 << channel) && set->shared.map)) {
- NXGE_DC_RESET(group->map, channel);
- }
/* Remove the DC from its group. */
if (nxge_grp_dc_unlink(nxge, group, channel) != dc) {
@@ -663,7 +688,10 @@ nxge_grp_dc_append(
* Any domain
*/
nxge_hio_dc_t *
-nxge_grp_dc_unlink(nxge_t *nxge, nxge_grp_t *group, int channel)
+nxge_grp_dc_unlink(
+ nxge_t *nxge,
+ nxge_grp_t *group,
+ int channel)
{
nxge_hio_dc_t *current, *previous;
@@ -699,6 +727,7 @@ nxge_grp_dc_unlink(nxge_t *nxge, nxge_grp_t *group, int channel)
current->next = 0;
current->group = 0;
+ NXGE_DC_RESET(group->map, channel);
group->count--;
}
@@ -914,15 +943,14 @@ nxge_ddi_perror(
* Local prototypes
*/
static nxge_hio_vr_t *nxge_hio_vr_share(nxge_t *);
-
-static int nxge_hio_dc_share(nxge_t *, nxge_hio_vr_t *, mac_ring_type_t);
static void nxge_hio_unshare(nxge_hio_vr_t *);
-static int nxge_hio_addres(nxge_hio_vr_t *, mac_ring_type_t, int);
+static int nxge_hio_addres(nxge_hio_vr_t *, mac_ring_type_t, uint64_t *);
static void nxge_hio_remres(nxge_hio_vr_t *, mac_ring_type_t, res_map_t);
-static void nxge_hio_tdc_unshare(nxge_t *nxge, int channel);
-static void nxge_hio_rdc_unshare(nxge_t *nxge, int channel);
+static void nxge_hio_tdc_unshare(nxge_t *nxge, int dev_grpid, int channel);
+static void nxge_hio_rdc_unshare(nxge_t *nxge, int dev_grpid, int channel);
+static int nxge_hio_dc_share(nxge_t *, nxge_hio_vr_t *, mac_ring_type_t, int);
static void nxge_hio_dc_unshare(nxge_t *, nxge_hio_vr_t *,
mac_ring_type_t, int);
@@ -967,6 +995,28 @@ nxge_hio_init(
}
}
+ /*
+ * Initialize share and ring group structures.
+ */
+ for (i = 0; i < NXGE_MAX_TDC_GROUPS; i++) {
+ nxge->tx_hio_groups[i].ghandle = NULL;
+ nxge->tx_hio_groups[i].nxgep = nxge;
+ nxge->tx_hio_groups[i].type = MAC_RING_TYPE_TX;
+ nxge->tx_hio_groups[i].gindex = 0;
+ nxge->tx_hio_groups[i].sindex = 0;
+ }
+
+ for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) {
+ nxge->rx_hio_groups[i].ghandle = NULL;
+ nxge->rx_hio_groups[i].nxgep = nxge;
+ nxge->rx_hio_groups[i].type = MAC_RING_TYPE_RX;
+ nxge->rx_hio_groups[i].gindex = 0;
+ nxge->rx_hio_groups[i].sindex = 0;
+ nxge->rx_hio_groups[i].started = B_FALSE;
+ nxge->rx_hio_groups[i].rdctbl = -1;
+ nxge->rx_hio_groups[i].n_mac_addrs = 0;
+ }
+
if (!isLDOMs(nxge)) {
nhd->hio.ldoms = B_FALSE;
return (NXGE_OK);
@@ -983,22 +1033,15 @@ nxge_hio_init(
nhd->vrs = NXGE_VR_SR_MAX - 2;
/*
- * Initialize tdc share state, shares and ring group structures.
+ * Initialize the share structures.
*/
for (i = 0; i < NXGE_MAX_TDCS; i++)
nxge->tdc_is_shared[i] = B_FALSE;
- for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) {
- nxge->rx_hio_groups[i].ghandle = NULL;
- nxge->rx_hio_groups[i].nxgep = nxge;
- nxge->rx_hio_groups[i].gindex = 0;
- nxge->rx_hio_groups[i].sindex = 0;
- }
-
for (i = 0; i < NXGE_VR_SR_MAX; i++) {
nxge->shares[i].nxgep = nxge;
nxge->shares[i].index = 0;
- nxge->shares[i].vrp = (void *)NULL;
+ nxge->shares[i].vrp = NULL;
nxge->shares[i].tmap = 0;
nxge->shares[i].rmap = 0;
nxge->shares[i].rxgroup = 0;
@@ -1033,77 +1076,251 @@ nxge_hio_init(
return (0);
}
+#endif /* defined(sun4v) */
+
+static int
+nxge_hio_group_mac_add(nxge_t *nxge, nxge_ring_group_t *g,
+ const uint8_t *macaddr)
+{
+ int rv;
+ nxge_rdc_grp_t *group;
+
+ mutex_enter(nxge->genlock);
+
+ /*
+ * Initialize the NXGE RDC table data structure.
+ */
+ group = &nxge->pt_config.rdc_grps[g->rdctbl];
+ if (!group->flag) {
+ group->port = NXGE_GET_PORT_NUM(nxge->function_num);
+ group->config_method = RDC_TABLE_ENTRY_METHOD_REP;
+ group->flag = B_TRUE; /* This group has been configured. */
+ }
+
+ mutex_exit(nxge->genlock);
+
+ /*
+ * Add the MAC address.
+ */
+ if ((rv = nxge_m_mmac_add_g((void *)nxge, macaddr,
+ g->rdctbl, B_TRUE)) != 0) {
+ return (rv);
+ }
+
+ mutex_enter(nxge->genlock);
+ g->n_mac_addrs++;
+ mutex_exit(nxge->genlock);
+ return (0);
+}
static int
nxge_hio_add_mac(void *arg, const uint8_t *mac_addr)
{
- nxge_rx_ring_group_t *rxgroup = (nxge_rx_ring_group_t *)arg;
- p_nxge_t nxge = rxgroup->nxgep;
- int group = rxgroup->gindex;
- int rv, sindex;
+ nxge_ring_group_t *group = (nxge_ring_group_t *)arg;
+ p_nxge_t nxge = group->nxgep;
+ int rv;
nxge_hio_vr_t *vr; /* The Virtualization Region */
- sindex = nxge->rx_hio_groups[group].sindex;
- vr = (nxge_hio_vr_t *)nxge->shares[sindex].vrp;
+ ASSERT(group->type == MAC_RING_TYPE_RX);
+
+ mutex_enter(nxge->genlock);
/*
- * Program the mac address for the group/share.
+ * If the group is associated with a VR, then only one
+ * address may be assigned to the group.
*/
- if ((rv = nxge_hio_hostinfo_init(nxge, vr,
- (ether_addr_t *)mac_addr)) != 0) {
+ vr = (nxge_hio_vr_t *)nxge->shares[group->sindex].vrp;
+ if ((vr != NULL) && (group->n_mac_addrs)) {
+ mutex_exit(nxge->genlock);
+ return (ENOSPC);
+ }
+
+ mutex_exit(nxge->genlock);
+
+ /*
+ * Program the mac address for the group.
+ */
+ if ((rv = nxge_hio_group_mac_add(nxge, group,
+ mac_addr)) != 0) {
return (rv);
}
return (0);
}
+static int
+find_mac_slot(nxge_mmac_t *mmac_info, const uint8_t *mac_addr)
+{
+ int i;
+ for (i = 0; i <= mmac_info->num_mmac; i++) {
+ if (memcmp(mmac_info->mac_pool[i].addr, mac_addr,
+ ETHERADDRL) == 0) {
+ return (i);
+ }
+ }
+ return (-1);
+}
+
/* ARGSUSED */
static int
nxge_hio_rem_mac(void *arg, const uint8_t *mac_addr)
{
- nxge_rx_ring_group_t *rxgroup = (nxge_rx_ring_group_t *)arg;
- p_nxge_t nxge = rxgroup->nxgep;
- int group = rxgroup->gindex;
- int sindex;
- nxge_hio_vr_t *vr; /* The Virtualization Region */
+ nxge_ring_group_t *group = (nxge_ring_group_t *)arg;
+ p_nxge_t nxge = group->nxgep;
+ nxge_mmac_t *mmac_info;
+ int rv, slot;
+
+ ASSERT(group->type == MAC_RING_TYPE_RX);
+
+ mutex_enter(nxge->genlock);
+
+ mmac_info = &nxge->nxge_mmac_info;
+ slot = find_mac_slot(mmac_info, mac_addr);
+ if (slot < 0) {
+ mutex_exit(nxge->genlock);
+ return (EINVAL);
+ }
+
+ mutex_exit(nxge->genlock);
+
+ /*
+	 * Remove the mac address for the group.
+ */
+ if ((rv = nxge_m_mmac_remove(nxge, slot)) != 0) {
+ return (rv);
+ }
+
+ mutex_enter(nxge->genlock);
+ group->n_mac_addrs--;
+ mutex_exit(nxge->genlock);
+
+ return (0);
+}
- sindex = nxge->rx_hio_groups[group].sindex;
- vr = (nxge_hio_vr_t *)nxge->shares[sindex].vrp;
+static int
+nxge_hio_group_start(mac_group_driver_t gdriver)
+{
+ nxge_ring_group_t *group = (nxge_ring_group_t *)gdriver;
+ int rdctbl;
+ int dev_gindex;
+
+ ASSERT(group->type == MAC_RING_TYPE_RX);
+
+#ifdef later
+ ASSERT(group->nxgep->nxge_mac_state == NXGE_MAC_STARTED);
+#endif
+ if (group->nxgep->nxge_mac_state != NXGE_MAC_STARTED)
+ return (ENXIO);
+
+ mutex_enter(group->nxgep->genlock);
+ dev_gindex = group->nxgep->pt_config.hw_config.def_mac_rxdma_grpid +
+ group->gindex;
/*
- * Remove the mac address for the group/share.
+ * Get an rdc table for this group.
+ * Group ID is given by the caller, and that's the group it needs
+ * to bind to. The default group is already bound when the driver
+ * was attached.
+ *
+	 * For Group 0, its RDC table was allocated at attach time;
+	 * no need to allocate a new table.
*/
- nxge_hio_hostinfo_uninit(nxge, vr);
+ if (group->gindex != 0) {
+ rdctbl = nxge_fzc_rdc_tbl_bind(group->nxgep,
+ dev_gindex, B_TRUE);
+ if (rdctbl < 0) {
+ mutex_exit(group->nxgep->genlock);
+ return (rdctbl);
+ }
+ } else {
+ rdctbl = group->nxgep->pt_config.hw_config.def_mac_rxdma_grpid;
+ }
+
+ group->rdctbl = rdctbl;
+
+ (void) nxge_init_fzc_rdc_tbl(group->nxgep, rdctbl);
+
+ group->started = B_TRUE;
+ mutex_exit(group->nxgep->genlock);
return (0);
}
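The table selection above reduces to: group 0 reuses the table bound at attach time, and any other group binds a fresh table for its device group id. A sketch of that decision, with bind_tbl() a hypothetical stand-in for nxge_fzc_rdc_tbl_bind():

	static int
	sketch_pick_rdctbl(int gindex, int def_tbl, int (*bind_tbl)(int dev_gindex))
	{
		if (gindex == 0)
			return (def_tbl);	/* bound at attach time */

		/* a negative return propagates the bind failure */
		return (bind_tbl(def_tbl + gindex));
	}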
+static void
+nxge_hio_group_stop(mac_group_driver_t gdriver)
+{
+ nxge_ring_group_t *group = (nxge_ring_group_t *)gdriver;
+
+ ASSERT(group->type == MAC_RING_TYPE_RX);
+
+ mutex_enter(group->nxgep->genlock);
+ group->started = B_FALSE;
+
+ /*
+ * Unbind the RDC table previously bound for this group.
+ *
+ * Since RDC table for group 0 was allocated at attach
+ * time, no need to unbind the table here.
+ */
+ if (group->gindex != 0)
+ (void) nxge_fzc_rdc_tbl_unbind(group->nxgep, group->rdctbl);
+
+ mutex_exit(group->nxgep->genlock);
+}
+
/* ARGSUSED */
void
-nxge_hio_group_get(void *arg, mac_ring_type_t type, int group,
+nxge_hio_group_get(void *arg, mac_ring_type_t type, int groupid,
mac_group_info_t *infop, mac_group_handle_t ghdl)
{
- p_nxge_t nxgep = (p_nxge_t)arg;
- nxge_rx_ring_group_t *rxgroup;
+ p_nxge_t nxgep = (p_nxge_t)arg;
+ nxge_ring_group_t *group;
+ int dev_gindex;
switch (type) {
case MAC_RING_TYPE_RX:
- rxgroup = &nxgep->rx_hio_groups[group];
- rxgroup->gindex = group;
-
- infop->mrg_driver = (mac_group_driver_t)rxgroup;
- infop->mrg_start = NULL;
- infop->mrg_stop = NULL;
- infop->mrg_addmac = nxge_hio_add_mac;
- infop->mrg_remmac = nxge_hio_rem_mac;
- infop->mrg_count = NXGE_HIO_SHARE_MAX_CHANNELS;
+ group = &nxgep->rx_hio_groups[groupid];
+ group->nxgep = nxgep;
+ group->ghandle = ghdl;
+ group->gindex = groupid;
+ group->sindex = 0; /* not yet bound to a share */
+
+ dev_gindex = nxgep->pt_config.hw_config.def_mac_rxdma_grpid +
+ groupid;
+
+ infop->mgi_driver = (mac_group_driver_t)group;
+ infop->mgi_start = nxge_hio_group_start;
+ infop->mgi_stop = nxge_hio_group_stop;
+ infop->mgi_addmac = nxge_hio_add_mac;
+ infop->mgi_remmac = nxge_hio_rem_mac;
+ infop->mgi_count =
+ nxgep->pt_config.rdc_grps[dev_gindex].max_rdcs;
break;
case MAC_RING_TYPE_TX:
+ /*
+ * 'groupid' for TX should be incremented by one since
+ * the default group (groupid 0) is not known by the MAC layer
+ */
+ group = &nxgep->tx_hio_groups[groupid + 1];
+ group->nxgep = nxgep;
+ group->ghandle = ghdl;
+ group->gindex = groupid + 1;
+ group->sindex = 0; /* not yet bound to a share */
+
+ infop->mgi_driver = (mac_group_driver_t)group;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = NULL; /* not needed */
+ infop->mgi_remmac = NULL; /* not needed */
+ /* no rings associated with group initially */
+ infop->mgi_count = 0;
break;
}
}
+#if defined(sun4v)
+
int
nxge_hio_share_assign(
nxge_t *nxge,
@@ -1126,7 +1343,6 @@ nxge_hio_share_assign(
NXGE_ERROR_MSG((nxge, HIO_CTL,
"nxge_hio_share_assign: "
"vr->assign() returned %d", hv_rv));
- nxge_hio_unshare(vr);
return (-EIO);
}
@@ -1189,7 +1405,7 @@ nxge_hio_share_assign(
return (0);
}
-int
+void
nxge_hio_share_unassign(
nxge_hio_vr_t *vr)
{
@@ -1237,23 +1453,15 @@ nxge_hio_share_unassign(
vr->cookie, hv_rv));
}
}
-
- return (0);
}
int
-nxge_hio_share_alloc(void *arg, uint64_t cookie, uint64_t *rcookie,
- mac_share_handle_t *shandle)
+nxge_hio_share_alloc(void *arg, mac_share_handle_t *shandle)
{
- p_nxge_t nxge = (p_nxge_t)arg;
- nxge_rx_ring_group_t *rxgroup;
- nxge_share_handle_t *shp;
-
- nxge_hio_vr_t *vr; /* The Virtualization Region */
- uint64_t rmap, tmap;
- int rdctbl, rv;
-
- nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
+ p_nxge_t nxge = (p_nxge_t)arg;
+ nxge_share_handle_t *shp;
+ nxge_hio_vr_t *vr; /* The Virtualization Region */
+ nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_share"));
@@ -1269,65 +1477,257 @@ nxge_hio_share_alloc(void *arg, uint64_t cookie, uint64_t *rcookie,
if ((vr = nxge_hio_vr_share(nxge)) == 0)
return (EAGAIN);
+ shp = &nxge->shares[vr->region];
+ shp->nxgep = nxge;
+ shp->index = vr->region;
+ shp->vrp = (void *)vr;
+ shp->tmap = shp->rmap = 0; /* to be assigned by ms_sbind */
+ shp->rxgroup = 0; /* to be assigned by ms_sadd */
+ shp->active = B_FALSE; /* not bound yet */
+
+ *shandle = (mac_share_handle_t)shp;
+
+ NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_share"));
+ return (0);
+}
+
+
+void
+nxge_hio_share_free(mac_share_handle_t shandle)
+{
+ nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle;
+ nxge_hio_vr_t *vr;
+
/*
- * Get an RDC group for us to use.
+ * Clear internal handle state.
*/
- if ((rdctbl = nxge_hio_hostinfo_get_rdc_table(nxge)) < 0) {
- nxge_hio_unshare(vr);
- return (EBUSY);
+ vr = shp->vrp;
+ shp->vrp = (void *)NULL;
+ shp->index = 0;
+ shp->tmap = 0;
+ shp->rmap = 0;
+ shp->rxgroup = 0;
+ shp->active = B_FALSE;
+
+ /*
+ * Free VR resource.
+ */
+ nxge_hio_unshare(vr);
+}
+
+
+void
+nxge_hio_share_query(mac_share_handle_t shandle, mac_ring_type_t type,
+ mac_ring_handle_t *rings, uint_t *n_rings)
+{
+ nxge_t *nxge;
+ nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle;
+ nxge_ring_handle_t *rh;
+ uint32_t offset;
+
+ nxge = shp->nxgep;
+
+ switch (type) {
+ case MAC_RING_TYPE_RX:
+ rh = nxge->rx_ring_handles;
+ offset = nxge->pt_config.hw_config.start_rdc;
+ break;
+
+ case MAC_RING_TYPE_TX:
+ rh = nxge->tx_ring_handles;
+ offset = nxge->pt_config.hw_config.tdc.start;
+ break;
}
- vr->rdc_tbl = (uint8_t)rdctbl;
+
+ /*
+ * In version 1.0, we may only give a VR 2 RDCs/TDCs. Not only that,
+ * but the HV has statically assigned the channels like so:
+ * VR0: RDC0 & RDC1
+ * VR1: RDC2 & RDC3, etc.
+ * The TDCs are assigned in exactly the same way.
+ */
+ if (rings != NULL) {
+ rings[0] = rh[(shp->index * 2) - offset].ring_handle;
+ rings[1] = rh[(shp->index * 2 + 1) - offset].ring_handle;
+ }
+ if (n_rings != NULL) {
+ *n_rings = 2;
+ }
+}
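The arithmetic above follows directly from the static assignment: share region r owns channels 2r and 2r+1, and the port's first channel (offset) converts a device channel into an index into the ring-handle array. With offset 0, region 1 yields handles 2 and 3; with offset 8, region 5 also yields handles 2 and 3. A worked sketch:

	static void
	sketch_vr_ring_indexes(int region, int offset, int idx[2])
	{
		/* region r owns device channels 2r and 2r+1 */
		idx[0] = (region * 2) - offset;
		idx[1] = (region * 2 + 1) - offset;
	}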
+
+int
+nxge_hio_share_add_group(mac_share_handle_t shandle,
+ mac_group_driver_t ghandle)
+{
+ nxge_t *nxge;
+ nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle;
+ nxge_ring_group_t *rg = (nxge_ring_group_t *)ghandle;
+ nxge_hio_vr_t *vr; /* The Virtualization Region */
+ nxge_grp_t *group;
+ int i;
+
+ if (rg->sindex != 0) {
+ /* the group is already bound to a share */
+ return (EALREADY);
+ }
+
+ nxge = rg->nxgep;
+ vr = shp->vrp;
+
+ switch (rg->type) {
+ case MAC_RING_TYPE_RX:
+ /*
+ * Make sure that the group has the right rings associated
+ * for the share. In version 1.0, we may only give a VR
+ * 2 RDCs. Not only that, but the HV has statically
+ * assigned the channels like so:
+ * VR0: RDC0 & RDC1
+ * VR1: RDC2 & RDC3, etc.
+ */
+ group = nxge->rx_set.group[rg->gindex];
+
+ if (group->count > 2) {
+ /* a share can have at most 2 rings */
+ return (EINVAL);
+ }
+
+ for (i = 0; i < NXGE_MAX_RDCS; i++) {
+ if (group->map & (1 << i)) {
+ if ((i != shp->index * 2) &&
+ (i != (shp->index * 2 + 1))) {
+ /*
+					 * An attempt was made to bind a
+					 * group with invalid rings to
+					 * this share.
+ */
+ return (EINVAL);
+ }
+ }
+ }
+
+ rg->sindex = vr->region;
+ vr->rdc_tbl = rg->rdctbl;
+ shp->rxgroup = vr->rdc_tbl;
+ break;
+
+ case MAC_RING_TYPE_TX:
+ /*
+ * Make sure that the group has the right rings associated
+ * for the share. In version 1.0, we may only give a VR
+ * 2 TDCs. Not only that, but the HV has statically
+ * assigned the channels like so:
+ * VR0: TDC0 & TDC1
+ * VR1: TDC2 & TDC3, etc.
+ */
+ group = nxge->tx_set.group[rg->gindex];
+
+ if (group->count > 2) {
+ /* a share can have at most 2 rings */
+ return (EINVAL);
+ }
+
+ for (i = 0; i < NXGE_MAX_TDCS; i++) {
+ if (group->map & (1 << i)) {
+ if ((i != shp->index * 2) &&
+ (i != (shp->index * 2 + 1))) {
+ /*
+					 * An attempt was made to bind a
+					 * group with invalid rings to
+					 * this share.
+ */
+ return (EINVAL);
+ }
+ }
+ }
+
+ vr->tdc_tbl = nxge->pt_config.hw_config.def_mac_txdma_grpid +
+ rg->gindex;
+ rg->sindex = vr->region;
+ break;
+ }
+ return (0);
+}
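Both arms of the switch run the same bitmap check: every channel set in the group's map must be one of the two channels the share's region statically owns. A condensed sketch of that validation (the uint32_t map is a simplified stand-in for the group's channel map, assuming max_dcs <= 32):

	#include <stdint.h>

	static int
	sketch_map_valid(uint32_t map, int region, int max_dcs)
	{
		int i;

		for (i = 0; i < max_dcs; i++) {
			if ((map & (1u << i)) &&
			    (i != region * 2) && (i != region * 2 + 1))
				return (0);	/* channel outside the share */
		}
		return (1);
	}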
+
+int
+nxge_hio_share_rem_group(mac_share_handle_t shandle,
+ mac_group_driver_t ghandle)
+{
+ nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle;
+ nxge_ring_group_t *group = (nxge_ring_group_t *)ghandle;
+ nxge_hio_vr_t *vr; /* The Virtualization Region */
+ int rv = 0;
+
+ vr = shp->vrp;
+
+ switch (group->type) {
+ case MAC_RING_TYPE_RX:
+ group->sindex = 0;
+ vr->rdc_tbl = 0;
+ shp->rxgroup = 0;
+ break;
+
+ case MAC_RING_TYPE_TX:
+ group->sindex = 0;
+ vr->tdc_tbl = 0;
+ break;
+ }
+
+ return (rv);
+}
+
+int
+nxge_hio_share_bind(mac_share_handle_t shandle, uint64_t cookie,
+ uint64_t *rcookie)
+{
+ nxge_t *nxge;
+ nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle;
+ nxge_hio_vr_t *vr;
+ uint64_t rmap, tmap, hv_rmap, hv_tmap;
+ int rv;
+
+ nxge = shp->nxgep;
+ vr = (nxge_hio_vr_t *)shp->vrp;
/*
* Add resources to the share.
+ * For each DMA channel associated with the VR, bind its resources
+ * to the VR.
*/
tmap = 0;
- rv = nxge_hio_addres(vr, MAC_RING_TYPE_TX,
- NXGE_HIO_SHARE_MAX_CHANNELS);
+ rv = nxge_hio_addres(vr, MAC_RING_TYPE_TX, &tmap);
if (rv != 0) {
- nxge_hio_unshare(vr);
return (rv);
}
rmap = 0;
- rv = nxge_hio_addres(vr, MAC_RING_TYPE_RX,
- NXGE_HIO_SHARE_MAX_CHANNELS);
+ rv = nxge_hio_addres(vr, MAC_RING_TYPE_RX, &rmap);
if (rv != 0) {
nxge_hio_remres(vr, MAC_RING_TYPE_TX, tmap);
- nxge_hio_unshare(vr);
return (rv);
}
- if ((rv = nxge_hio_share_assign(nxge, cookie, &tmap, &rmap, vr))) {
- nxge_hio_remres(vr, MAC_RING_TYPE_RX, tmap);
+ /*
+ * Ask the Hypervisor to set up the VR and allocate slots for
+	 * each ring associated with the VR.
+ */
+ hv_tmap = hv_rmap = 0;
+ if ((rv = nxge_hio_share_assign(nxge, cookie,
+ &hv_tmap, &hv_rmap, vr))) {
nxge_hio_remres(vr, MAC_RING_TYPE_TX, tmap);
- nxge_hio_unshare(vr);
+ nxge_hio_remres(vr, MAC_RING_TYPE_RX, rmap);
return (rv);
}
- rxgroup = &nxge->rx_hio_groups[vr->rdc_tbl];
- rxgroup->gindex = vr->rdc_tbl;
- rxgroup->sindex = vr->region;
-
- shp = &nxge->shares[vr->region];
- shp->index = vr->region;
- shp->vrp = (void *)vr;
- shp->tmap = tmap;
- shp->rmap = rmap;
- shp->rxgroup = vr->rdc_tbl;
shp->active = B_TRUE;
+ shp->tmap = hv_tmap;
+ shp->rmap = hv_rmap;
/* high 32 bits are cfg_hdl and low 32 bits are HV cookie */
*rcookie = (((uint64_t)nxge->niu_cfg_hdl) << 32) | vr->cookie;
- *shandle = (mac_share_handle_t)shp;
-
- NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_share"));
return (0);
}
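The rcookie returned above packs two 32-bit values into one 64-bit word, as the trailing comment notes. A small pack/unpack sketch of that layout:

	#include <stdint.h>

	static uint64_t
	sketch_pack_rcookie(uint32_t cfg_hdl, uint32_t hv_cookie)
	{
		/* high 32 bits: cfg_hdl; low 32 bits: HV cookie */
		return (((uint64_t)cfg_hdl << 32) | hv_cookie);
	}

	static void
	sketch_unpack_rcookie(uint64_t rcookie, uint32_t *cfg_hdl,
	    uint32_t *hv_cookie)
	{
		*cfg_hdl = (uint32_t)(rcookie >> 32);
		*hv_cookie = (uint32_t)(rcookie & 0xffffffffULL);
	}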
void
-nxge_hio_share_free(mac_share_handle_t shandle)
+nxge_hio_share_unbind(mac_share_handle_t shandle)
{
nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle;
@@ -1335,52 +1735,15 @@ nxge_hio_share_free(mac_share_handle_t shandle)
* First, unassign the VR (take it back),
* so we can enable interrupts again.
*/
- (void) nxge_hio_share_unassign(shp->vrp);
+ nxge_hio_share_unassign(shp->vrp);
/*
* Free Ring Resources for TX and RX
*/
nxge_hio_remres(shp->vrp, MAC_RING_TYPE_TX, shp->tmap);
nxge_hio_remres(shp->vrp, MAC_RING_TYPE_RX, shp->rmap);
-
- /*
- * Free VR resource.
- */
- nxge_hio_unshare(shp->vrp);
-
- /*
- * Clear internal handle state.
- */
- shp->index = 0;
- shp->vrp = (void *)NULL;
- shp->tmap = 0;
- shp->rmap = 0;
- shp->rxgroup = 0;
- shp->active = B_FALSE;
}
-void
-nxge_hio_share_query(mac_share_handle_t shandle, mac_ring_type_t type,
- uint32_t *rmin, uint32_t *rmax, uint64_t *rmap, uint64_t *gnum)
-{
- nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle;
-
- switch (type) {
- case MAC_RING_TYPE_RX:
- *rmin = NXGE_HIO_SHARE_MIN_CHANNELS;
- *rmax = NXGE_HIO_SHARE_MAX_CHANNELS;
- *rmap = shp->rmap;
- *gnum = shp->rxgroup;
- break;
-
- case MAC_RING_TYPE_TX:
- *rmin = NXGE_HIO_SHARE_MIN_CHANNELS;
- *rmax = NXGE_HIO_SHARE_MAX_CHANNELS;
- *rmap = shp->tmap;
- *gnum = 0;
- break;
- }
-}
/*
* nxge_hio_vr_share
@@ -1474,7 +1837,11 @@ nxge_hio_unshare(
*
* nxge_hio_hostinfo_uninit(nxge, vr);
*/
- (void) nxge_fzc_rdc_tbl_unbind(nxge, vr->rdc_tbl);
+
+ /*
+ * XXX: This is done by ms_sremove?
+ * (void) nxge_fzc_rdc_tbl_unbind(nxge, vr->rdc_tbl);
+ */
nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
@@ -1495,23 +1862,53 @@ int
nxge_hio_addres(
nxge_hio_vr_t *vr,
mac_ring_type_t type,
- int count)
+ uint64_t *map)
{
- nxge_t *nxge = (nxge_t *)vr->nxge;
- int i;
+ nxge_t *nxge = (nxge_t *)vr->nxge;
+ nxge_grp_t *group;
+ int groupid;
+ int i;
+ int max_dcs;
NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_addres"));
if (!nxge)
return (EINVAL);
- for (i = 0; i < count; i++) {
- int rv;
- if ((rv = nxge_hio_dc_share(nxge, vr, type)) < 0) {
- if (i == 0) /* Couldn't get even one DC. */
- return (-rv);
- else
- break;
+ /*
+ * For each ring associated with the group, add the resources
+ * to the group and bind.
+ */
+ max_dcs = (type == MAC_RING_TYPE_TX) ? NXGE_MAX_TDCS : NXGE_MAX_RDCS;
+ if (type == MAC_RING_TYPE_TX) {
+		/* set->group is an array of groups indexed by a port group id */
+ groupid = vr->tdc_tbl -
+ nxge->pt_config.hw_config.def_mac_txdma_grpid;
+ group = nxge->tx_set.group[groupid];
+ } else {
+		/* set->group is an array of groups indexed by a port group id */
+ groupid = vr->rdc_tbl -
+ nxge->pt_config.hw_config.def_mac_rxdma_grpid;
+ group = nxge->rx_set.group[groupid];
+ }
+
+ if (group->map == 0) {
+ NXGE_DEBUG_MSG((nxge, HIO_CTL, "There is no rings associated "
+ "with this VR"));
+ return (EINVAL);
+ }
+
+ for (i = 0; i < max_dcs; i++) {
+ if (group->map & (1 << i)) {
+ int rv;
+
+ if ((rv = nxge_hio_dc_share(nxge, vr, type, i)) < 0) {
+ if (*map == 0) /* Couldn't get even one DC. */
+ return (-rv);
+ else
+ break;
+ }
+ *map |= (1 << i);
}
}
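The loop above is best-effort: it records each successfully shared channel in *map and fails outright only when not a single DC could be shared. A sketch of the same control flow, with share_one() a hypothetical stand-in for nxge_hio_dc_share():

	#include <stdint.h>

	static int
	sketch_addres(uint32_t group_map, int max_dcs,
	    int (*share_one)(int channel), uint64_t *map)
	{
		int i, rv;

		for (i = 0; i < max_dcs; i++) {
			if (!(group_map & (1u << i)))
				continue;
			if ((rv = share_one(i)) < 0) {
				if (*map == 0)	/* couldn't get even one DC */
					return (-rv);
				break;		/* keep what we have */
			}
			*map |= (1ULL << i);
		}
		return (0);
	}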
@@ -1538,6 +1935,10 @@ nxge_hio_remres(
NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_remres(%lx)", res_map));
+ /*
+ * For each ring bound to the group, remove the DMA resources
+ * from the group and unbind.
+ */
group = (type == MAC_RING_TYPE_TX ? &vr->tx_group : &vr->rx_group);
while (group->dc) {
nxge_hio_dc_t *dc = group->dc;
@@ -1628,12 +2029,11 @@ nxge_hio_tdc_share(
nxge->tdc_is_shared[channel] = B_TRUE;
MUTEX_EXIT(&nhd->lock);
-
if (nxge_intr_remove(nxge, VP_BOUND_TX, channel) != NXGE_OK) {
NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_hio_tdc_share: "
"Failed to remove interrupt for TxDMA channel %d",
channel));
- return (NXGE_ERROR);
+ return (-EINVAL);
}
/* Disable TxDMA A.9.6.10 */
@@ -1698,13 +2098,9 @@ nxge_hio_rdc_share(
nxge_hio_vr_t *vr,
int channel)
{
- nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
- nxge_hw_pt_cfg_t *hardware = &nxge->pt_config.hw_config;
nxge_grp_set_t *set = &nxge->rx_set;
nxge_rdc_grp_t *rdc_grp;
- int current, last;
-
NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_rdc_share"));
/* Disable interrupts. */
@@ -1739,21 +2135,6 @@ nxge_hio_rdc_share(
nxge_grp_dc_remove(nxge, VP_BOUND_RX, channel);
/*
- * We have to reconfigure the RDC table(s)
- * to which this channel belongs.
- */
- current = hardware->def_mac_rxdma_grpid;
- last = current + hardware->max_rdc_grpids;
- for (; current < last; current++) {
- if (nhd->rdc_tbl[current].nxge == (uintptr_t)nxge) {
- rdc_grp = &nxge->pt_config.rdc_grps[current];
- rdc_grp->map = set->owned.map;
- rdc_grp->max_rdcs--;
- (void) nxge_init_fzc_rdc_tbl(nxge, current);
- }
- }
-
- /*
* The guest domain will reconfigure the RDC later.
*
* But in the meantime, we must re-enable the Rx MAC so
@@ -1791,12 +2172,6 @@ nxge_hio_rdc_share(
}
NXGE_DC_SET(rdc_grp->map, channel);
- if (nxge_init_fzc_rdc_tbl(nxge, vr->rdc_tbl) != NXGE_OK) {
- NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL,
- "nxge_hio_rdc_share: nxge_init_fzc_rdc_tbl failed"));
- return (-EIO);
- }
-
NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_rdc_share"));
return (0);
@@ -1811,8 +2186,7 @@ nxge_hio_rdc_share(
* nxge
* vr The VR that <channel> will belong to.
* type Tx or Rx.
- * res_map The resource map used by the caller, which we will
- * update if successful.
+ * channel Channel to share
*
* Notes:
*
@@ -1823,59 +2197,17 @@ int
nxge_hio_dc_share(
nxge_t *nxge,
nxge_hio_vr_t *vr,
- mac_ring_type_t type)
+ mac_ring_type_t type,
+ int channel)
{
nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
- nxge_hw_pt_cfg_t *hardware;
nxge_hio_dc_t *dc;
- int channel, limit;
-
- nxge_grp_set_t *set;
nxge_grp_t *group;
-
int slot;
NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_dc_share(%cdc %d",
type == MAC_RING_TYPE_TX ? 't' : 'r', channel));
- /*
- * In version 1.0, we may only give a VR 2 RDCs or TDCs.
- * Not only that, but the HV has statically assigned the
- * channels like so:
- * VR0: RDC0 & RDC1
- * VR1: RDC2 & RDC3, etc.
- * The TDCs are assigned in exactly the same way.
- *
- * So, for example
- * hardware->start_rdc + vr->region * 2;
- * VR1: hardware->start_rdc + 1 * 2;
- * VR3: hardware->start_rdc + 3 * 2;
- * If start_rdc is 0, we end up with 2 or 6.
- * If start_rdc is 8, we end up with 10 or 14.
- */
-
- set = (type == MAC_RING_TYPE_TX ? &nxge->tx_set : &nxge->rx_set);
- hardware = &nxge->pt_config.hw_config;
-
- // This code is still NIU-specific (assuming only 2 ports)
- channel = hardware->start_rdc + (vr->region % 4) * 2;
- limit = channel + 2;
-
- MUTEX_ENTER(&nhd->lock);
- for (; channel < limit; channel++) {
- if ((1 << channel) & set->owned.map) {
- break;
- }
- }
-
- if (channel == limit) {
- MUTEX_EXIT(&nhd->lock);
- NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL,
- "nxge_hio_dc_share: there are no channels to share"));
- return (-EIO);
- }
-
- MUTEX_EXIT(&nhd->lock);
/* -------------------------------------------------- */
slot = (type == MAC_RING_TYPE_TX) ?
@@ -1884,9 +2216,9 @@ nxge_hio_dc_share(
if (slot < 0) {
if (type == MAC_RING_TYPE_RX) {
- nxge_hio_rdc_unshare(nxge, channel);
+ nxge_hio_rdc_unshare(nxge, vr->rdc_tbl, channel);
} else {
- nxge_hio_tdc_unshare(nxge, channel);
+ nxge_hio_tdc_unshare(nxge, vr->tdc_tbl, channel);
}
return (slot);
}
@@ -1912,7 +2244,6 @@ nxge_hio_dc_share(
group = (type == MAC_RING_TYPE_TX ? &vr->tx_group : &vr->rx_group);
dc->group = group;
-
/* Initialize <group>, if necessary */
if (group->count == 0) {
group->nxge = nxge;
@@ -1952,16 +2283,21 @@ nxge_hio_dc_share(
void
nxge_hio_tdc_unshare(
nxge_t *nxge,
+ int dev_grpid,
int channel)
{
nxge_grp_set_t *set = &nxge->tx_set;
- nxge_grp_t *group = set->group[0];
+ nxge_grp_t *group;
+ int grpid;
NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_tdc_unshare"));
NXGE_DC_RESET(set->shared.map, channel);
set->shared.count--;
+ grpid = dev_grpid - nxge->pt_config.hw_config.def_mac_txdma_grpid;
+ group = set->group[grpid];
+
if ((nxge_grp_dc_add(nxge, group, VP_BOUND_TX, channel))) {
NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_hio_tdc_unshare: "
"Failed to initialize TxDMA channel %d", channel));
@@ -1994,14 +2330,12 @@ nxge_hio_tdc_unshare(
void
nxge_hio_rdc_unshare(
nxge_t *nxge,
+ int dev_grpid,
int channel)
{
- nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
- nxge_hw_pt_cfg_t *hardware = &nxge->pt_config.hw_config;
-
- nxge_grp_set_t *set = &nxge->rx_set;
- nxge_grp_t *group = set->group[0];
- int current, last;
+ nxge_grp_set_t *set = &nxge->rx_set;
+ nxge_grp_t *group;
+ int grpid;
NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_rdc_unshare"));
@@ -2024,6 +2358,9 @@ nxge_hio_rdc_unshare(
NXGE_DC_RESET(set->shared.map, channel);
set->shared.count--;
+ grpid = dev_grpid - nxge->pt_config.hw_config.def_mac_rxdma_grpid;
+ group = set->group[grpid];
+
/*
* Assert RST: RXDMA_CFIG1[30] = 1
*
@@ -2035,7 +2372,7 @@ nxge_hio_rdc_unshare(
/* Be sure to re-enable the RX MAC. */
if (nxge_rx_mac_enable(nxge) != NXGE_OK) {
NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL,
- "nxge_hio_rdc_unshare: Rx MAC still disabled"));
+ "nxge_hio_rdc_share: Rx MAC still disabled"));
}
NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_hio_rdc_unshare: "
"Failed to initialize RxDMA channel %d", channel));
@@ -2043,27 +2380,11 @@ nxge_hio_rdc_unshare(
}
/*
- * We have to reconfigure the RDC table(s)
- * to which this channel once again belongs.
- */
- current = hardware->def_mac_rxdma_grpid;
- last = current + hardware->max_rdc_grpids;
- for (; current < last; current++) {
- if (nhd->rdc_tbl[current].nxge == (uintptr_t)nxge) {
- nxge_rdc_grp_t *group;
- group = &nxge->pt_config.rdc_grps[current];
- group->map = set->owned.map;
- group->max_rdcs++;
- (void) nxge_init_fzc_rdc_tbl(nxge, current);
- }
- }
-
- /*
* Enable RxMAC = A.9.2.10
*/
if (nxge_rx_mac_enable(nxge) != NXGE_OK) {
NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL,
- "nxge_hio_rdc_unshare: Rx MAC still disabled"));
+ "nxge_hio_rdc_share: Rx MAC still disabled"));
return;
}
@@ -2120,9 +2441,9 @@ nxge_hio_dc_unshare(
dc->cookie = 0;
if (type == MAC_RING_TYPE_RX) {
- nxge_hio_rdc_unshare(nxge, channel);
+ nxge_hio_rdc_unshare(nxge, vr->rdc_tbl, channel);
} else {
- nxge_hio_tdc_unshare(nxge, channel);
+ nxge_hio_tdc_unshare(nxge, vr->tdc_tbl, channel);
}
NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_dc_unshare"));
diff --git a/usr/src/uts/common/io/nxge/nxge_hio_guest.c b/usr/src/uts/common/io/nxge/nxge_hio_guest.c
index 5fbcbfdfe1..5517b9ceee 100644
--- a/usr/src/uts/common/io/nxge/nxge_hio_guest.c
+++ b/usr/src/uts/common/io/nxge/nxge_hio_guest.c
@@ -208,7 +208,6 @@ static void nxge_check_guest_state(nxge_hio_vr_t *);
* Guest domain
*/
/* ARGSUSED */
-
int
nxge_hio_vr_add(nxge_t *nxge)
{
@@ -249,7 +248,7 @@ nxge_hio_vr_add(nxge_t *nxge)
return (NXGE_ERROR);
}
- cookie = (uint32_t)reg_val[0];
+ cookie = (uint32_t)(reg_val[0]);
ddi_prop_free(reg_val);
fp = &nhd->hio.vr;
@@ -521,11 +520,17 @@ res_map_parse(
*/
if (type == NXGE_TRANSMIT_GROUP) {
nxge_dma_pt_cfg_t *port = &nxge->pt_config;
+ nxge_tdc_grp_t *tdc_grp = &nxge->pt_config.tdc_grps[0];
hardware->tdc.start = first;
hardware->tdc.count = count;
hardware->tdc.owned = count;
+ tdc_grp->start_tdc = first;
+ tdc_grp->max_tdcs = (uint8_t)count;
+ tdc_grp->grp_index = group->index;
+ tdc_grp->map = slots;
+
group->map = slots;
/*
@@ -944,7 +949,6 @@ nxge_check_guest_state(
NXGE_DEBUG_MSG((nxge, SYSERR_CTL, "==> nxge_check_guest_state"));
MUTEX_ENTER(nxge->genlock);
-
nxge->nxge_timerid = 0;
if (nxge->nxge_mac_state == NXGE_MAC_STARTED) {
diff --git a/usr/src/uts/common/io/nxge/nxge_hv.c b/usr/src/uts/common/io/nxge/nxge_hv.c
index a454b3ee72..1a42fcd9a7 100644
--- a/usr/src/uts/common/io/nxge/nxge_hv.c
+++ b/usr/src/uts/common/io/nxge/nxge_hv.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* nxge_hv.c
*
@@ -37,6 +35,8 @@
#include <sys/nxge/nxge_impl.h>
#include <sys/nxge/nxge_hio.h>
+#if defined(sun4v)
+
void
nxge_hio_hv_init(nxge_t *nxge)
{
@@ -79,3 +79,5 @@ nxge_hio_hv_init(nxge_t *nxge)
rx->getinfo = &hv_niu_vrrx_getinfo;
}
+
+#endif /* defined(sun4v) */
diff --git a/usr/src/uts/common/io/nxge/nxge_hw.c b/usr/src/uts/common/io/nxge/nxge_hw.c
index 4a6cbbea6d..5513ce4f4e 100644
--- a/usr/src/uts/common/io/nxge/nxge_hw.c
+++ b/usr/src/uts/common/io/nxge/nxge_hw.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/nxge/nxge_impl.h>
/*
@@ -221,7 +219,6 @@ nxge_intr(void *arg1, void *arg2)
NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr(%d): #ldvs %d "
" #intrs %d", i, nldvs, nintrs));
/* Get this group's flag bits. */
- t_ldgp->interrupted = B_FALSE;
rs = npi_ldsv_ldfs_get(handle, t_ldgp->ldg,
&vector0, &vector1, &vector2);
if (rs) {
@@ -235,7 +232,6 @@ nxge_intr(void *arg1, void *arg2)
NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr: "
"vector0 0x%llx vector1 0x%llx vector2 0x%llx",
vector0, vector1, vector2));
- t_ldgp->interrupted = B_TRUE;
nldvs = t_ldgp->nldvs;
for (j = 0; j < nldvs; j++, t_ldvp++) {
/*
@@ -261,12 +257,10 @@ nxge_intr(void *arg1, void *arg2)
t_ldgp = ldgp;
for (i = 0; i < nintrs; i++, t_ldgp++) {
/* rearm group interrupts */
- if (t_ldgp->interrupted) {
- NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr: arm "
- "group %d", t_ldgp->ldg));
- (void) npi_intr_ldg_mgmt_set(handle, t_ldgp->ldg,
- t_ldgp->arm, t_ldgp->ldg_timer);
- }
+ NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr: arm "
+ "group %d", t_ldgp->ldg));
+ (void) npi_intr_ldg_mgmt_set(handle, t_ldgp->ldg,
+ t_ldgp->arm, t_ldgp->ldg_timer);
}
NXGE_DEBUG_MSG((nxgep, INT_CTL, "<== nxge_intr: serviced 0x%x",
diff --git a/usr/src/uts/common/io/nxge/nxge_mac.c b/usr/src/uts/common/io/nxge/nxge_mac.c
index d009bdbd98..8ca60cf7a7 100644
--- a/usr/src/uts/common/io/nxge/nxge_mac.c
+++ b/usr/src/uts/common/io/nxge/nxge_mac.c
@@ -46,13 +46,6 @@ extern uint32_t nxge_lb_dbg;
extern boolean_t nxge_jumbo_enable;
extern uint32_t nxge_jumbo_mtu;
- /* The following functions may be found in nxge_main.c */
-extern void nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot,
- boolean_t factory);
-extern int nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr);
-extern int nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot);
-extern int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr,
- mac_addr_slot_t slot, uint8_t rdctbl);
typedef enum {
CHECK_LINK_RESCHEDULE,
@@ -3040,160 +3033,6 @@ fail:
return (NXGE_ERROR | rs);
}
-int
-nxge_hio_hostinfo_get_rdc_table(p_nxge_t nxgep)
-{
- int rdc_tbl;
-
- /*
- * Get an RDC table (version 0).
- */
- if ((rdc_tbl = nxge_fzc_rdc_tbl_bind(nxgep, -1, B_FALSE)) < 0) {
- NXGE_ERROR_MSG((nxgep, OBP_CTL,
- "nxge_hio_hostinfo_get_rdc_table: "
- "there are no free RDC tables!"));
- return (EBUSY);
- }
-
- return (rdc_tbl);
-}
-
-/*
- * nxge_hio_hostinfo_init
- *
- * Initialize an alternate MAC address, and bind a macrdctbln to it.
- *
- * Arguments:
- * nxge
- * vr The Virtualization Region
- * macaddr The alternate MAC address
- *
- * Notes:
- * 1. Find & bind an RDC table to <nxge>.
- * 2. Program an alternate MAC address (<macaddr>).
- * 3. Bind the RDC table to <macaddr>.
- *
- * Context:
- * Service domain
- *
- * Side Effects:
- * nxge->class_config.mac_host_info[slot].rdctbl
- * vr->slot & vr->altmac
- *
- */
-int
-nxge_hio_hostinfo_init(nxge_t *nxge, nxge_hio_vr_t *vr, ether_addr_t *macaddr)
-{
- int slot, error;
- uint8_t rdc_tbl;
- nxge_mmac_t *mmac_info;
- nxge_rdc_grp_t *group;
- uint8_t *addr = (uint8_t *)macaddr;
-
- mutex_enter(nxge->genlock);
-
- rdc_tbl = (uint8_t)vr->rdc_tbl;
-
- /* Initialize the NXGE RDC table data structure. */
- group = &nxge->pt_config.rdc_grps[rdc_tbl];
- group->port = NXGE_GET_PORT_NUM(nxge->function_num);
- group->config_method = RDC_TABLE_ENTRY_METHOD_REP;
- group->flag = 1; /* This group has been configured. */
-
- mmac_info = &nxge->nxge_mmac_info;
-
- /*
- * Are there free slots.
- */
- if (mmac_info->naddrfree == 0) {
- mutex_exit(nxge->genlock);
- return (ENOSPC);
- }
-
- /*
- * Find a slot for the VR to use for Hybrid I/O.
- */
- if (mmac_info->num_factory_mmac < mmac_info->num_mmac) {
- for (slot = mmac_info->num_factory_mmac + 1;
- slot <= mmac_info->num_mmac; slot++) {
- if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED))
- break;
- }
- if (slot > mmac_info->num_mmac) {
- for (slot = 1; slot <= mmac_info->num_factory_mmac;
- slot++) {
- if (!(mmac_info->mac_pool[slot].flags
- & MMAC_SLOT_USED))
- break;
- }
- }
- } else {
- for (slot = 1; slot <= mmac_info->num_mmac; slot++) {
- if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED))
- break;
- }
- }
- ASSERT(slot <= mmac_info->num_mmac);
- vr->slot = slot;
-
- error = nxge_altmac_set(nxge, addr, slot, rdc_tbl);
- if (error != 0) {
- mutex_exit(nxge->genlock);
- return (EIO);
- }
-
- bcopy(macaddr, vr->altmac, sizeof (vr->altmac));
-
- /*
- * Update mmac
- */
- bcopy(addr, mmac_info->mac_pool[vr->slot].addr, ETHERADDRL);
- mmac_info->mac_pool[vr->slot].flags |= MMAC_SLOT_USED;
- mmac_info->mac_pool[vr->slot].flags &= ~MMAC_VENDOR_ADDR;
- mmac_info->naddrfree--;
- nxge_mmac_kstat_update(nxge, vr->slot, B_FALSE);
-
- mutex_exit(nxge->genlock);
- return (0);
-}
-
-/*
- * nxge_hio_hostinfo_uninit
- *
- * Uninitialize an alternate MAC address.
- *
- * Arguments:
- * nxge
- * vr The Virtualization Region
- *
- * Notes:
- * Remove the VR's alternate MAC address.
- *
- * Context:
- * Service domain
- *
- * Side Effects:
- * nxge->class_config.mac_host_info[slot].rdctbl
- *
- */
-void
-nxge_hio_hostinfo_uninit(nxge_t *nxge, nxge_hio_vr_t *vr)
-{
- nxge_class_pt_cfg_t *class;
- uint8_t addrn;
-
- addrn = vr->slot - 1;
- (void) npi_mac_altaddr_disable(nxge->npi_handle,
- nxge->mac.portnum, addrn);
-
- /* Set this variable to its default. */
- class = (p_nxge_class_pt_cfg_t)&nxge->class_config;
- class->mac_host_info[addrn].rdctbl =
- nxge->pt_config.hw_config.def_mac_rxdma_grpid;
-
- (void) nxge_m_mmac_remove(nxge, vr->slot);
- vr->slot = -1;
-}
/* Initialize the RxMAC sub-block */
diff --git a/usr/src/uts/common/io/nxge/nxge_main.c b/usr/src/uts/common/io/nxge/nxge_main.c
index ca2ca6b30b..9b20c438f4 100644
--- a/usr/src/uts/common/io/nxge/nxge_main.c
+++ b/usr/src/uts/common/io/nxge/nxge_main.c
@@ -117,14 +117,6 @@ nxge_tx_mode_t nxge_tx_scheme = NXGE_USE_SERIAL;
#define NXGE_LSO_MAXLEN 65535
uint32_t nxge_lso_max = NXGE_LSO_MAXLEN;
-/*
- * Debugging flags:
- * nxge_no_tx_lb : transmit load balancing
- * nxge_tx_lb_policy: 0 - TCP port (default)
- * 3 - DEST MAC
- */
-uint32_t nxge_no_tx_lb = 0;
-uint32_t nxge_tx_lb_policy = NXGE_TX_LB_TCPUDP;
/*
* Add tunable to reduce the amount of time spent in the
@@ -208,8 +200,7 @@ static void nxge_remove_hard_properties(p_nxge_t);
/*
* These two functions are required by nxge_hio.c
*/
-extern int nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr);
-extern int nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot);
+extern int nxge_m_mmac_remove(void *arg, int slot);
extern void nxge_grp_cleanup(p_nxge_t nxge);
static nxge_status_t nxge_setup_system_dma_pages(p_nxge_t);
@@ -224,9 +215,7 @@ static void nxge_test_map_regs(p_nxge_t nxgep);
#endif
static nxge_status_t nxge_add_intrs(p_nxge_t nxgep);
-static nxge_status_t nxge_add_soft_intrs(p_nxge_t nxgep);
static void nxge_remove_intrs(p_nxge_t nxgep);
-static void nxge_remove_soft_intrs(p_nxge_t nxgep);
static nxge_status_t nxge_add_intrs_adv(p_nxge_t nxgep);
static nxge_status_t nxge_add_intrs_adv_type(p_nxge_t, uint32_t);
@@ -284,20 +273,19 @@ extern int nxge_param_set_mac(p_nxge_t, queue_t *, mblk_t *,
*/
static int nxge_m_start(void *);
static void nxge_m_stop(void *);
-static int nxge_m_unicst(void *, const uint8_t *);
static int nxge_m_multicst(void *, boolean_t, const uint8_t *);
static int nxge_m_promisc(void *, boolean_t);
static void nxge_m_ioctl(void *, queue_t *, mblk_t *);
-static void nxge_m_resources(void *);
-mblk_t *nxge_m_tx(void *arg, mblk_t *);
static nxge_status_t nxge_mac_register(p_nxge_t);
-int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr,
- mac_addr_slot_t slot, uint8_t rdctbl);
-void nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot,
+static int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr,
+ int slot, int rdctbl, boolean_t usetbl);
+void nxge_mmac_kstat_update(p_nxge_t nxgep, int slot,
boolean_t factory);
-static int nxge_m_mmac_reserve(void *arg, mac_multi_addr_t *maddr);
-static int nxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr);
-static int nxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr);
+#if defined(sun4v)
+extern mblk_t *nxge_m_tx(void *arg, mblk_t *mp);
+#endif
+
+static void nxge_m_getfactaddr(void *, uint_t, uint8_t *);
static boolean_t nxge_m_getcapab(void *, mac_capab_t, void *);
static int nxge_m_setprop(void *, const char *, mac_prop_id_t,
uint_t, const void *);
@@ -308,6 +296,12 @@ static int nxge_set_priv_prop(nxge_t *, const char *, uint_t,
static int nxge_get_priv_prop(nxge_t *, const char *, uint_t, uint_t,
void *, uint_t *);
static int nxge_get_def_val(nxge_t *, mac_prop_id_t, uint_t, void *);
+static void nxge_fill_ring(void *, mac_ring_type_t, const int, const int,
+ mac_ring_info_t *, mac_ring_handle_t);
+static void nxge_group_add_ring(mac_group_driver_t, mac_ring_driver_t,
+ mac_ring_type_t);
+static void nxge_group_rem_ring(mac_group_driver_t, mac_ring_driver_t,
+ mac_ring_type_t);
static void nxge_niu_peu_reset(p_nxge_t nxgep);
static void nxge_set_pci_replay_timeout(nxge_t *);
@@ -336,15 +330,11 @@ mac_priv_prop_t nxge_priv_props[] = {
#define NXGE_MAX_PRIV_PROPS \
(sizeof (nxge_priv_props)/sizeof (mac_priv_prop_t))
-#define NXGE_M_CALLBACK_FLAGS\
- (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP)
-
-
#define NXGE_NEPTUNE_MAGIC 0x4E584745UL
#define MAX_DUMP_SZ 256
#define NXGE_M_CALLBACK_FLAGS \
- (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP)
+ (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP)
mac_callbacks_t nxge_m_callbacks = {
NXGE_M_CALLBACK_FLAGS,
@@ -353,9 +343,8 @@ mac_callbacks_t nxge_m_callbacks = {
nxge_m_stop,
nxge_m_promisc,
nxge_m_multicst,
- nxge_m_unicst,
- nxge_m_tx,
- nxge_m_resources,
+ NULL,
+ NULL,
nxge_m_ioctl,
nxge_m_getcapab,
NULL,
@@ -631,6 +620,11 @@ nxge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
if (nxgep->niu_type != N2_NIU) {
nxge_set_pci_replay_timeout(nxgep);
}
+#if defined(sun4v)
+ if (isLDOMguest(nxgep)) {
+ nxge_m_callbacks.mc_tx = nxge_m_tx;
+ }
+#endif
#if defined(sun4v)
/* This is required by nxge_hio_init(), which follows. */
@@ -847,13 +841,6 @@ nxge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
goto nxge_attach_fail;
}
- status = nxge_add_soft_intrs(nxgep);
- if (status != DDI_SUCCESS) {
- NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
- "add_soft_intr failed"));
- goto nxge_attach_fail;
- }
-
/* If a guest, register with vio_net instead. */
if ((status = nxge_mac_register(nxgep)) != NXGE_OK) {
NXGE_DEBUG_MSG((nxgep, DDI_CTL,
@@ -1032,9 +1019,6 @@ nxge_unattach(p_nxge_t nxgep)
*/
nxge_remove_intrs(nxgep);
- /* remove soft interrups */
- nxge_remove_soft_intrs(nxgep);
-
/*
* Stop the device and free resources.
*/
@@ -3742,6 +3726,20 @@ nxge_m_start_exit:
return (0);
}
+
+static boolean_t
+nxge_check_groups_stopped(p_nxge_t nxgep)
+{
+ int i;
+
+ for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) {
+ if (nxgep->rx_hio_groups[i].started)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
/*
* nxge_m_stop(): stop transmitting and receiving.
*/
@@ -3749,9 +3747,21 @@ static void
nxge_m_stop(void *arg)
{
p_nxge_t nxgep = (p_nxge_t)arg;
+ boolean_t groups_stopped;
NXGE_DEBUG_MSG((nxgep, NXGE_CTL, "==> nxge_m_stop"));
+ groups_stopped = nxge_check_groups_stopped(nxgep);
+#ifdef later
+	ASSERT(groups_stopped == B_TRUE);
+#endif
+
+ if (!groups_stopped) {
+ cmn_err(CE_WARN, "nxge(%d): groups are not stopped!\n",
+ nxgep->instance);
+ return;
+ }
+
MUTEX_ENTER(nxgep->genlock);
nxgep->nxge_mac_state = NXGE_MAC_STOPPING;
@@ -3770,26 +3780,6 @@ nxge_m_stop(void *arg)
}
static int
-nxge_m_unicst(void *arg, const uint8_t *macaddr)
-{
- p_nxge_t nxgep = (p_nxge_t)arg;
- struct ether_addr addrp;
-
- NXGE_DEBUG_MSG((nxgep, MAC_CTL, "==> nxge_m_unicst"));
-
- bcopy(macaddr, (uint8_t *)&addrp, ETHERADDRL);
- if (nxge_set_mac_addr(nxgep, &addrp)) {
- NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL,
- "<== nxge_m_unicst: set unitcast failed"));
- return (EINVAL);
- }
-
- NXGE_DEBUG_MSG((nxgep, MAC_CTL, "<== nxge_m_unicst"));
-
- return (0);
-}
-
-static int
nxge_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
{
p_nxge_t nxgep = (p_nxge_t)arg;
@@ -3942,77 +3932,8 @@ nxge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
extern void nxge_rx_hw_blank(void *arg, time_t ticks, uint_t count);
-static void
-nxge_m_resources(void *arg)
-{
- p_nxge_t nxgep = arg;
- mac_rx_fifo_t mrf;
-
- nxge_grp_set_t *set = &nxgep->rx_set;
- uint8_t rdc;
-
- rx_rcr_ring_t *ring;
-
- NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_m_resources"));
-
- MUTEX_ENTER(nxgep->genlock);
-
- if (set->owned.map == 0) {
- NXGE_ERROR_MSG((NULL, NXGE_ERR_CTL,
- "nxge_m_resources: no receive resources"));
- goto nxge_m_resources_exit;
- }
-
- /*
- * CR 6492541 Check to see if the drv_state has been initialized,
- * if not * call nxge_init().
- */
- if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) {
- if (nxge_init(nxgep) != NXGE_OK)
- goto nxge_m_resources_exit;
- }
-
- mrf.mrf_type = MAC_RX_FIFO;
- mrf.mrf_blank = nxge_rx_hw_blank;
- mrf.mrf_arg = (void *)nxgep;
-
- mrf.mrf_normal_blank_time = 128;
- mrf.mrf_normal_pkt_count = 8;
-
- /*
- * Export our receive resources to the MAC layer.
- */
- for (rdc = 0; rdc < NXGE_MAX_RDCS; rdc++) {
- if ((1 << rdc) & set->owned.map) {
- ring = nxgep->rx_rcr_rings->rcr_rings[rdc];
- if (ring == 0) {
- /*
- * This is a big deal only if we are
- * *not* in an LDOMs environment.
- */
- if (nxgep->environs == SOLARIS_DOMAIN) {
- cmn_err(CE_NOTE,
- "==> nxge_m_resources: "
- "ring %d == 0", rdc);
- }
- continue;
- }
- ring->rcr_mac_handle = mac_resource_add
- (nxgep->mach, (mac_resource_t *)&mrf);
-
- NXGE_DEBUG_MSG((nxgep, NXGE_CTL,
- "==> nxge_m_resources: RDC %d RCR %p MAC handle %p",
- rdc, ring, ring->rcr_mac_handle));
- }
- }
-
-nxge_m_resources_exit:
- MUTEX_EXIT(nxgep->genlock);
- NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_m_resources"));
-}
-
void
-nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, boolean_t factory)
+nxge_mmac_kstat_update(p_nxge_t nxgep, int slot, boolean_t factory)
{
p_nxge_mmac_stats_t mmac_stats;
int i;
@@ -4040,9 +3961,9 @@ nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, boolean_t factory)
/*
* nxge_altmac_set() -- Set an alternate MAC address
*/
-int
-nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot,
- uint8_t rdctbl)
+static int
+nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, int slot,
+ int rdctbl, boolean_t usetbl)
{
uint8_t addrn;
uint8_t portn;
@@ -4050,6 +3971,7 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot,
hostinfo_t mac_rdc;
p_nxge_class_pt_cfg_t clscfgp;
+
altmac.w2 = ((uint16_t)maddr[0] << 8) | ((uint16_t)maddr[1] & 0x0ff);
altmac.w1 = ((uint16_t)maddr[2] << 8) | ((uint16_t)maddr[3] & 0x0ff);
altmac.w0 = ((uint16_t)maddr[4] << 8) | ((uint16_t)maddr[5] & 0x0ff);
@@ -4057,8 +3979,8 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot,
portn = nxgep->mac.portnum;
addrn = (uint8_t)slot - 1;
- if (npi_mac_altaddr_entry(nxgep->npi_handle, OP_SET, portn,
- addrn, &altmac) != NPI_SUCCESS)
+ if (npi_mac_altaddr_entry(nxgep->npi_handle, OP_SET,
+ nxgep->function_num, addrn, &altmac) != NPI_SUCCESS)
return (EIO);
/*
@@ -4067,8 +3989,11 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot,
*/
clscfgp = (p_nxge_class_pt_cfg_t)&nxgep->class_config;
mac_rdc.value = 0;
- clscfgp->mac_host_info[addrn].rdctbl = rdctbl;
- mac_rdc.bits.w0.rdc_tbl_num = rdctbl;
+ if (usetbl)
+ mac_rdc.bits.w0.rdc_tbl_num = rdctbl;
+ else
+ mac_rdc.bits.w0.rdc_tbl_num =
+ clscfgp->mac_host_info[addrn].rdctbl;
mac_rdc.bits.w0.mac_pref = clscfgp->mac_host_info[addrn].mpr_npr;
if (npi_mac_hostinfo_entry(nxgep->npi_handle, OP_SET,
@@ -4088,22 +4013,25 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot,
else
addrn = (uint8_t)slot;
- if (npi_mac_altaddr_enable(nxgep->npi_handle, portn, addrn)
- != NPI_SUCCESS)
+ if (npi_mac_altaddr_enable(nxgep->npi_handle,
+ nxgep->function_num, addrn) != NPI_SUCCESS) {
return (EIO);
+ }
+
return (0);
}
/*
- * nxeg_m_mmac_add() - find an unused address slot, set the address
+ * nxge_m_mmac_add_g() - find an unused address slot, set the address
* value to the one specified, enable the port to start filtering on
* the new MAC address. Returns 0 on success.
*/
int
-nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr)
+nxge_m_mmac_add_g(void *arg, const uint8_t *maddr, int rdctbl,
+ boolean_t usetbl)
{
p_nxge_t nxgep = arg;
- mac_addr_slot_t slot;
+ int slot;
nxge_mmac_t *mmac_info;
int err;
nxge_status_t status;
@@ -4127,16 +4055,10 @@ nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr)
mutex_exit(nxgep->genlock);
return (ENOSPC);
}
- if (!mac_unicst_verify(nxgep->mach, maddr->mma_addr,
- maddr->mma_addrlen)) {
- mutex_exit(nxgep->genlock);
- return (EINVAL);
- }
+
/*
* Search for the first available slot. Because naddrfree
* is not zero, we are guaranteed to find one.
- * Slot 0 is for unique (primary) MAC. The first alternate
- * MAC slot is slot 1.
* Each of the first two ports of Neptune has 16 alternate
* MAC slots but only the first 7 (of 15) slots have assigned factory
* MAC addresses. We first search among the slots without bundled
@@ -4146,131 +4068,26 @@ nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr)
* But the slot could be used by factory MAC again after calling
* nxge_m_mmac_remove and nxge_m_mmac_reserve.
*/
- if (mmac_info->num_factory_mmac < mmac_info->num_mmac) {
- for (slot = mmac_info->num_factory_mmac + 1;
- slot <= mmac_info->num_mmac; slot++) {
- if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED))
- break;
- }
- if (slot > mmac_info->num_mmac) {
- for (slot = 1; slot <= mmac_info->num_factory_mmac;
- slot++) {
- if (!(mmac_info->mac_pool[slot].flags
- & MMAC_SLOT_USED))
- break;
- }
- }
- } else {
- for (slot = 1; slot <= mmac_info->num_mmac; slot++) {
- if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED))
- break;
- }
+ for (slot = 0; slot <= mmac_info->num_mmac; slot++) {
+ if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED))
+ break;
}
+
ASSERT(slot <= mmac_info->num_mmac);
- /*
- * def_mac_rxdma_grpid is the default rdc table for the port.
- */
- if ((err = nxge_altmac_set(nxgep, maddr->mma_addr, slot,
- nxgep->pt_config.hw_config.def_mac_rxdma_grpid)) != 0) {
+ if ((err = nxge_altmac_set(nxgep, (uint8_t *)maddr, slot, rdctbl,
+ usetbl)) != 0) {
mutex_exit(nxgep->genlock);
return (err);
}
- bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, ETHERADDRL);
+ bcopy(maddr, mmac_info->mac_pool[slot].addr, ETHERADDRL);
mmac_info->mac_pool[slot].flags |= MMAC_SLOT_USED;
mmac_info->mac_pool[slot].flags &= ~MMAC_VENDOR_ADDR;
mmac_info->naddrfree--;
nxge_mmac_kstat_update(nxgep, slot, B_FALSE);
- maddr->mma_slot = slot;
-
- mutex_exit(nxgep->genlock);
- return (0);
-}
-
-/*
- * This function reserves an unused slot and programs the slot and the HW
- * with a factory mac address.
- */
-static int
-nxge_m_mmac_reserve(void *arg, mac_multi_addr_t *maddr)
-{
- p_nxge_t nxgep = arg;
- mac_addr_slot_t slot;
- nxge_mmac_t *mmac_info;
- int err;
- nxge_status_t status;
-
- mutex_enter(nxgep->genlock);
-
- /*
- * Make sure that nxge is initialized, if _start() has
- * not been called.
- */
- if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) {
- status = nxge_init(nxgep);
- if (status != NXGE_OK) {
- mutex_exit(nxgep->genlock);
- return (ENXIO);
- }
- }
-
- mmac_info = &nxgep->nxge_mmac_info;
- if (mmac_info->naddrfree == 0) {
- mutex_exit(nxgep->genlock);
- return (ENOSPC);
- }
-
- slot = maddr->mma_slot;
- if (slot == -1) { /* -1: Take the first available slot */
- for (slot = 1; slot <= mmac_info->num_factory_mmac; slot++) {
- if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED))
- break;
- }
- if (slot > mmac_info->num_factory_mmac) {
- mutex_exit(nxgep->genlock);
- return (ENOSPC);
- }
- }
- if (slot < 1 || slot > mmac_info->num_factory_mmac) {
- /*
- * Do not support factory MAC at a slot greater than
- * num_factory_mmac even when there are available factory
- * MAC addresses because the alternate MACs are bundled with
- * slot[1] through slot[num_factory_mmac]
- */
- mutex_exit(nxgep->genlock);
- return (EINVAL);
- }
- if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) {
- mutex_exit(nxgep->genlock);
- return (EBUSY);
- }
- /* Verify the address to be reserved */
- if (!mac_unicst_verify(nxgep->mach,
- mmac_info->factory_mac_pool[slot], ETHERADDRL)) {
- mutex_exit(nxgep->genlock);
- return (EINVAL);
- }
- if (err = nxge_altmac_set(nxgep,
- mmac_info->factory_mac_pool[slot], slot,
- nxgep->pt_config.hw_config.def_mac_rxdma_grpid)) {
- mutex_exit(nxgep->genlock);
- return (err);
- }
- bcopy(mmac_info->factory_mac_pool[slot], maddr->mma_addr, ETHERADDRL);
- mmac_info->mac_pool[slot].flags |= MMAC_SLOT_USED | MMAC_VENDOR_ADDR;
- mmac_info->naddrfree--;
-
- nxge_mmac_kstat_update(nxgep, slot, B_TRUE);
mutex_exit(nxgep->genlock);
-
- /* Pass info back to the caller */
- maddr->mma_slot = slot;
- maddr->mma_addrlen = ETHERADDRL;
- maddr->mma_flags = MMAC_SLOT_USED | MMAC_VENDOR_ADDR;
-
return (0);
}
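With the factory/non-factory distinction gone, the slot search above collapses to a first-fit scan over the MMAC_SLOT_USED flag; naddrfree != 0 guarantees a hit. A simplified sketch (the flags array stands in for the driver's mac_pool):

	#define	SKETCH_SLOT_USED	0x1

	static int
	sketch_first_free_slot(const unsigned int *flags, int nslots)
	{
		int slot;

		for (slot = 0; slot < nslots; slot++) {
			if (!(flags[slot] & SKETCH_SLOT_USED))
				return (slot);	/* first unused slot wins */
		}
		return (-1);	/* unreachable when naddrfree != 0 */
	}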
@@ -4279,7 +4096,7 @@ nxge_m_mmac_reserve(void *arg, mac_multi_addr_t *maddr)
* the mac address anymore.
*/
int
-nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot)
+nxge_m_mmac_remove(void *arg, int slot)
{
p_nxge_t nxgep = arg;
nxge_mmac_t *mmac_info;
@@ -4350,141 +4167,37 @@ nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot)
}
/*
- * Modify a mac address added by nxge_m_mmac_add or nxge_m_mmac_reserve().
- */
-static int
-nxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr)
-{
- p_nxge_t nxgep = arg;
- mac_addr_slot_t slot;
- nxge_mmac_t *mmac_info;
- int err = 0;
- nxge_status_t status;
-
- if (!mac_unicst_verify(nxgep->mach, maddr->mma_addr,
- maddr->mma_addrlen))
- return (EINVAL);
-
- slot = maddr->mma_slot;
-
- mutex_enter(nxgep->genlock);
-
- /*
- * Make sure that nxge is initialized, if _start() has
- * not been called.
- */
- if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) {
- status = nxge_init(nxgep);
- if (status != NXGE_OK) {
- mutex_exit(nxgep->genlock);
- return (ENXIO);
- }
- }
-
- mmac_info = &nxgep->nxge_mmac_info;
- if (slot < 1 || slot > mmac_info->num_mmac) {
- mutex_exit(nxgep->genlock);
- return (EINVAL);
- }
- if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) {
- if ((err = nxge_altmac_set(nxgep,
- maddr->mma_addr, slot,
- nxgep->pt_config.hw_config.def_mac_rxdma_grpid)) != 0) {
- bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr,
- ETHERADDRL);
- /*
- * Assume that the MAC passed down from the caller
- * is not a factory MAC address (The user should
- * call mmac_remove followed by mmac_reserve if
- * he wants to use the factory MAC for this slot).
- */
- mmac_info->mac_pool[slot].flags &= ~MMAC_VENDOR_ADDR;
- nxge_mmac_kstat_update(nxgep, slot, B_FALSE);
- }
- } else {
- err = EINVAL;
- }
- mutex_exit(nxgep->genlock);
- return (err);
-}
-
-/*
- * nxge_m_mmac_get() - Get the MAC address and other information
- * related to the slot. mma_flags should be set to 0 in the call.
- * Note: although kstat shows MAC address as zero when a slot is
- * not used, Crossbow expects nxge_m_mmac_get to copy factory MAC
- * to the caller as long as the slot is not using a user MAC address.
- * The following table shows the rules,
- *
- * USED VENDOR mma_addr
- * ------------------------------------------------------------
- * (1) Slot uses a user MAC: yes no user MAC
- * (2) Slot uses a factory MAC: yes yes factory MAC
- * (3) Slot is not used but is
- * factory MAC capable: no yes factory MAC
- * (4) Slot is not used and is
- * not factory MAC capable: no no 0
- * ------------------------------------------------------------
+ * The callback to query all the factory addresses. naddr must be the same as
+ * the number of factory addresses (returned by MAC_CAPAB_MULTIFACTADDR), and
+ * mcm_addr is the space allocated to keep all the addresses, whose size is
+ * naddr * MAXMACADDRLEN.
*/
-static int
-nxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr)
+static void
+nxge_m_getfactaddr(void *arg, uint_t naddr, uint8_t *addr)
{
- nxge_t *nxgep = arg;
- mac_addr_slot_t slot;
- nxge_mmac_t *mmac_info;
- nxge_status_t status;
-
- slot = maddr->mma_slot;
+ nxge_t *nxgep = arg;
+ nxge_mmac_t *mmac_info;
+ int i;
mutex_enter(nxgep->genlock);
- /*
- * Make sure that nxge is initialized, if _start() has
- * not been called.
- */
- if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) {
- status = nxge_init(nxgep);
- if (status != NXGE_OK) {
- mutex_exit(nxgep->genlock);
- return (ENXIO);
- }
- }
-
mmac_info = &nxgep->nxge_mmac_info;
+ ASSERT(naddr == mmac_info->num_factory_mmac);
- if (slot < 1 || slot > mmac_info->num_mmac) {
- mutex_exit(nxgep->genlock);
- return (EINVAL);
+ for (i = 0; i < naddr; i++) {
+ bcopy(mmac_info->factory_mac_pool[i + 1],
+ addr + i * MAXMACADDRLEN, ETHERADDRL);
}
- maddr->mma_flags = 0;
- if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)
- maddr->mma_flags |= MMAC_SLOT_USED;
- if (mmac_info->mac_pool[slot].flags & MMAC_VENDOR_ADDR) {
- maddr->mma_flags |= MMAC_VENDOR_ADDR;
- bcopy(mmac_info->factory_mac_pool[slot],
- maddr->mma_addr, ETHERADDRL);
- maddr->mma_addrlen = ETHERADDRL;
- } else {
- if (maddr->mma_flags & MMAC_SLOT_USED) {
- bcopy(mmac_info->mac_pool[slot].addr,
- maddr->mma_addr, ETHERADDRL);
- maddr->mma_addrlen = ETHERADDRL;
- } else {
- bzero(maddr->mma_addr, ETHERADDRL);
- maddr->mma_addrlen = 0;
- }
- }
mutex_exit(nxgep->genlock);
- return (0);
}
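The copy-out loop above writes each 6-byte factory address at a MAXMACADDRLEN stride, starting from pool slot 1. A user-space sketch of the stride arithmetic (the 20-byte stride is an assumption standing in for MAXMACADDRLEN):

	#include <string.h>

	#define	SKETCH_ADDRLEN	6	/* ETHERADDRL */
	#define	SKETCH_STRIDE	20	/* stand-in for MAXMACADDRLEN */

	static void
	sketch_getfactaddr(unsigned char *dst, unsigned int naddr,
	    unsigned char pool[][SKETCH_ADDRLEN])
	{
		unsigned int i;

		/* factory addresses live in pool slots 1..naddr */
		for (i = 0; i < naddr; i++)
			memcpy(dst + i * SKETCH_STRIDE, pool[i + 1],
			    SKETCH_ADDRLEN);
	}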
+
static boolean_t
nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
nxge_t *nxgep = arg;
uint32_t *txflags = cap_data;
- multiaddress_capab_t *mmacp = cap_data;
switch (cap) {
case MAC_CAPAB_HCKSUM:
@@ -4495,33 +4208,15 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
}
break;
- case MAC_CAPAB_POLL:
- /*
- * There's nothing for us to fill in, simply returning
- * B_TRUE stating that we support polling is sufficient.
- */
- break;
+ case MAC_CAPAB_MULTIFACTADDR: {
+ mac_capab_multifactaddr_t *mfacp = cap_data;
- case MAC_CAPAB_MULTIADDRESS:
- mmacp = (multiaddress_capab_t *)cap_data;
mutex_enter(nxgep->genlock);
-
- mmacp->maddr_naddr = nxgep->nxge_mmac_info.num_mmac;
- mmacp->maddr_naddrfree = nxgep->nxge_mmac_info.naddrfree;
- mmacp->maddr_flag = 0; /* 0 is required by PSARC2006/265 */
- /*
- * maddr_handle is driver's private data, passed back to
- * entry point functions as arg.
- */
- mmacp->maddr_handle = nxgep;
- mmacp->maddr_add = nxge_m_mmac_add;
- mmacp->maddr_remove = nxge_m_mmac_remove;
- mmacp->maddr_modify = nxge_m_mmac_modify;
- mmacp->maddr_get = nxge_m_mmac_get;
- mmacp->maddr_reserve = nxge_m_mmac_reserve;
-
+ mfacp->mcm_naddr = nxgep->nxge_mmac_info.num_factory_mmac;
+ mfacp->mcm_getaddr = nxge_m_getfactaddr;
mutex_exit(nxgep->genlock);
break;
+ }
case MAC_CAPAB_LSO: {
mac_capab_lso_t *cap_lso = cap_data;
@@ -4541,39 +4236,49 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
}
}
-#if defined(sun4v)
case MAC_CAPAB_RINGS: {
- mac_capab_rings_t *mrings = (mac_capab_rings_t *)cap_data;
-
- /*
- * Only the service domain driver responds to
- * this capability request.
- */
- if (isLDOMservice(nxgep)) {
- mrings->mr_handle = (void *)nxgep;
+ mac_capab_rings_t *cap_rings = cap_data;
+ p_nxge_hw_pt_cfg_t p_cfgp = &nxgep->pt_config.hw_config;
- /*
- * No dynamic allocation of groups and
- * rings at this time. Shares dictate the
- * configuration.
- */
- mrings->mr_gadd_ring = NULL;
- mrings->mr_grem_ring = NULL;
- mrings->mr_rget = NULL;
- mrings->mr_gget = nxge_hio_group_get;
-
- if (mrings->mr_type == MAC_RING_TYPE_RX) {
- mrings->mr_rnum = 8; /* XXX */
- mrings->mr_gnum = 6; /* XXX */
+ mutex_enter(nxgep->genlock);
+ if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC;
+ cap_rings->mr_rnum = p_cfgp->max_rdcs;
+ cap_rings->mr_rget = nxge_fill_ring;
+ cap_rings->mr_gnum = p_cfgp->max_rdc_grpids;
+ cap_rings->mr_gget = nxge_hio_group_get;
+ cap_rings->mr_gaddring = nxge_group_add_ring;
+ cap_rings->mr_gremring = nxge_group_rem_ring;
+
+ NXGE_DEBUG_MSG((nxgep, RX_CTL,
+ "==> nxge_m_getcapab: rx nrings[%d] ngroups[%d]",
+ p_cfgp->max_rdcs, p_cfgp->max_rdc_grpids));
+ } else {
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC;
+ cap_rings->mr_rnum = p_cfgp->tdc.count;
+ cap_rings->mr_rget = nxge_fill_ring;
+ if (isLDOMservice(nxgep)) {
+ /* share capable */
+				/* Do not report the default group: hence -1 */
+ cap_rings->mr_gnum =
+ NXGE_MAX_TDC_GROUPS / nxgep->nports - 1;
} else {
- mrings->mr_rnum = 8; /* XXX */
- mrings->mr_gnum = 0; /* XXX */
+ cap_rings->mr_gnum = 0;
}
- } else
- return (B_FALSE);
+
+ cap_rings->mr_gget = nxge_hio_group_get;
+ cap_rings->mr_gaddring = nxge_group_add_ring;
+ cap_rings->mr_gremring = nxge_group_rem_ring;
+
+ NXGE_DEBUG_MSG((nxgep, TX_CTL,
+ "==> nxge_m_getcapab: tx rings # of rings %d",
+ p_cfgp->tdc.count));
+ }
+ mutex_exit(nxgep->genlock);
break;
}
+#if defined(sun4v)
case MAC_CAPAB_SHARES: {
mac_capab_share_t *mshares = (mac_capab_share_t *)cap_data;
@@ -4581,16 +4286,22 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
* Only the service domain driver responds to
* this capability request.
*/
+ mutex_enter(nxgep->genlock);
if (isLDOMservice(nxgep)) {
mshares->ms_snum = 3;
mshares->ms_handle = (void *)nxgep;
mshares->ms_salloc = nxge_hio_share_alloc;
mshares->ms_sfree = nxge_hio_share_free;
- mshares->ms_sadd = NULL;
- mshares->ms_sremove = NULL;
+ mshares->ms_sadd = nxge_hio_share_add_group;
+ mshares->ms_sremove = nxge_hio_share_rem_group;
mshares->ms_squery = nxge_hio_share_query;
- } else
+ mshares->ms_sbind = nxge_hio_share_bind;
+ mshares->ms_sunbind = nxge_hio_share_unbind;
+ mutex_exit(nxgep->genlock);
+ } else {
+ mutex_exit(nxgep->genlock);
return (B_FALSE);
+ }
break;
}
#endif
@@ -5160,12 +4871,6 @@ nxge_set_priv_prop(p_nxge_t nxgep, const char *pr_name, uint_t pr_valsize,
}
if (strcmp(pr_name, "_soft_lso_enable") == 0) {
- if (nxgep->nxge_mac_state == NXGE_MAC_STARTED) {
- NXGE_DEBUG_MSG((nxgep, NXGE_CTL,
- "==> nxge_set_priv_prop: name %s (busy)", pr_name));
- err = EBUSY;
- return (err);
- }
if (pr_val == NULL) {
NXGE_DEBUG_MSG((nxgep, NXGE_CTL,
"==> nxge_set_priv_prop: name %s (null)", pr_name));
@@ -5695,6 +5400,290 @@ _info(struct modinfo *modinfop)
}
/*ARGSUSED*/
+static int
+nxge_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num)
+{
+ p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver;
+ p_nxge_t nxgep = rhp->nxgep;
+ uint32_t channel;
+ p_tx_ring_t ring;
+
+ channel = nxgep->pt_config.hw_config.tdc.start + rhp->index;
+ ring = nxgep->tx_rings->rings[channel];
+
+ MUTEX_ENTER(&ring->lock);
+ ring->tx_ring_handle = rhp->ring_handle;
+ MUTEX_EXIT(&ring->lock);
+
+ return (0);
+}
+
+static void
+nxge_tx_ring_stop(mac_ring_driver_t rdriver)
+{
+ p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver;
+ p_nxge_t nxgep = rhp->nxgep;
+ uint32_t channel;
+ p_tx_ring_t ring;
+
+ channel = nxgep->pt_config.hw_config.tdc.start + rhp->index;
+ ring = nxgep->tx_rings->rings[channel];
+
+ MUTEX_ENTER(&ring->lock);
+ ring->tx_ring_handle = (mac_ring_handle_t)NULL;
+ MUTEX_EXIT(&ring->lock);
+}
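The start/stop pair above simply publishes and retracts the MAC-layer ring handle under the ring lock, so the send path sees either a valid handle or NULL. A pthread analogue of the pattern:

	#include <pthread.h>
	#include <stddef.h>

	static pthread_mutex_t sketch_ring_lock = PTHREAD_MUTEX_INITIALIZER;
	static void *sketch_ring_handle;	/* consulted by the send path */

	static void
	sketch_ring_start(void *handle)
	{
		pthread_mutex_lock(&sketch_ring_lock);
		sketch_ring_handle = handle;	/* publish */
		pthread_mutex_unlock(&sketch_ring_lock);
	}

	static void
	sketch_ring_stop(void)
	{
		pthread_mutex_lock(&sketch_ring_lock);
		sketch_ring_handle = NULL;	/* retract */
		pthread_mutex_unlock(&sketch_ring_lock);
	}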
+
+static int
+nxge_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num)
+{
+ p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver;
+ p_nxge_t nxgep = rhp->nxgep;
+ uint32_t channel;
+ p_rx_rcr_ring_t ring;
+ int i;
+
+ channel = nxgep->pt_config.hw_config.start_rdc + rhp->index;
+ ring = nxgep->rx_rcr_rings->rcr_rings[channel];
+
+ MUTEX_ENTER(&ring->lock);
+
+ if (nxgep->rx_channel_started[channel] == B_TRUE) {
+ MUTEX_EXIT(&ring->lock);
+ return (0);
+ }
+
+ /* set rcr_ring */
+ for (i = 0; i < nxgep->ldgvp->maxldvs; i++) {
+ if ((nxgep->ldgvp->ldvp[i].is_rxdma == 1) &&
+ (nxgep->ldgvp->ldvp[i].channel == channel)) {
+ ring->ldvp = &nxgep->ldgvp->ldvp[i];
+ ring->ldgp = nxgep->ldgvp->ldvp[i].ldgp;
+ }
+ }
+
+ nxgep->rx_channel_started[channel] = B_TRUE;
+ ring->rcr_mac_handle = rhp->ring_handle;
+ ring->rcr_gen_num = mr_gen_num;
+ MUTEX_EXIT(&ring->lock);
+
+ return (0);
+}
+
+static void
+nxge_rx_ring_stop(mac_ring_driver_t rdriver)
+{
+ p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver;
+ p_nxge_t nxgep = rhp->nxgep;
+ uint32_t channel;
+ p_rx_rcr_ring_t ring;
+
+ channel = nxgep->pt_config.hw_config.start_rdc + rhp->index;
+ ring = nxgep->rx_rcr_rings->rcr_rings[channel];
+
+ MUTEX_ENTER(&ring->lock);
+ nxgep->rx_channel_started[channel] = B_FALSE;
+ ring->rcr_mac_handle = NULL;
+ MUTEX_EXIT(&ring->lock);
+}
+
+/*
+ * Callback function for the MAC layer to register all rings.
+ */
+static void
+nxge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
+ const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+ p_nxge_t nxgep = (p_nxge_t)arg;
+ p_nxge_hw_pt_cfg_t p_cfgp = &nxgep->pt_config.hw_config;
+
+ NXGE_DEBUG_MSG((nxgep, TX_CTL,
+ "==> nxge_fill_ring 0x%x index %d", rtype, index));
+
+ switch (rtype) {
+ case MAC_RING_TYPE_TX: {
+ p_nxge_ring_handle_t rhandlep;
+
+ NXGE_DEBUG_MSG((nxgep, TX_CTL,
+ "==> nxge_fill_ring (TX) 0x%x index %d ntdcs %d",
+ rtype, index, p_cfgp->tdc.count));
+
+ ASSERT((index >= 0) && (index < p_cfgp->tdc.count));
+ rhandlep = &nxgep->tx_ring_handles[index];
+ rhandlep->nxgep = nxgep;
+ rhandlep->index = index;
+ rhandlep->ring_handle = rh;
+
+ infop->mri_driver = (mac_ring_driver_t)rhandlep;
+ infop->mri_start = nxge_tx_ring_start;
+ infop->mri_stop = nxge_tx_ring_stop;
+ infop->mri_tx = nxge_tx_ring_send;
+
+ break;
+ }
+ case MAC_RING_TYPE_RX: {
+ p_nxge_ring_handle_t rhandlep;
+ int nxge_rindex;
+ mac_intr_t nxge_mac_intr;
+
+ NXGE_DEBUG_MSG((nxgep, RX_CTL,
+ "==> nxge_fill_ring (RX) 0x%x index %d nrdcs %d",
+ rtype, index, p_cfgp->max_rdcs));
+
+ /*
+ * 'index' is the ring index within the group.
+ * Find the ring index in the nxge instance.
+ */
+ nxge_rindex = nxge_get_rxring_index(nxgep, rg_index, index);
+
+ ASSERT((nxge_rindex >= 0) && (nxge_rindex < p_cfgp->max_rdcs));
+ rhandlep = &nxgep->rx_ring_handles[nxge_rindex];
+ rhandlep->nxgep = nxgep;
+ rhandlep->index = nxge_rindex;
+ rhandlep->ring_handle = rh;
+
+ /*
+ * Entrypoint to enable interrupt (disable poll) and
+ * disable interrupt (enable poll).
+ */
+ nxge_mac_intr.mi_handle = (mac_intr_handle_t)rhandlep;
+ nxge_mac_intr.mi_enable = (mac_intr_enable_t)nxge_disable_poll;
+ nxge_mac_intr.mi_disable = (mac_intr_disable_t)nxge_enable_poll;
+ infop->mri_driver = (mac_ring_driver_t)rhandlep;
+ infop->mri_start = nxge_rx_ring_start;
+ infop->mri_stop = nxge_rx_ring_stop;
+ infop->mri_intr = nxge_mac_intr; /* ??? */
+ infop->mri_poll = nxge_rx_poll;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ NXGE_DEBUG_MSG((nxgep, DDI_CTL, "<== nxge_fill_ring 0x%x",
+ rtype));
+}
+
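+/*
+ * Note the inversion above: the MAC layer's mi_enable entry point
+ * (enable interrupts) maps to nxge_disable_poll(), and mi_disable
+ * (disable interrupts) maps to nxge_enable_poll(), since interrupt
+ * mode and poll mode are mutually exclusive.
+ */
+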
+static void
+nxge_group_add_ring(mac_group_driver_t gh, mac_ring_driver_t rh,
+ mac_ring_type_t type)
+{
+ nxge_ring_group_t *rgroup = (nxge_ring_group_t *)gh;
+ nxge_ring_handle_t *rhandle = (nxge_ring_handle_t *)rh;
+ nxge_t *nxge;
+ nxge_grp_t *grp;
+ nxge_rdc_grp_t *rdc_grp;
+ uint16_t channel; /* device-wise ring id */
+ int dev_gindex;
+ int rv;
+
+ nxge = rgroup->nxgep;
+
+ switch (type) {
+ case MAC_RING_TYPE_TX:
+ /*
+ * nxge_grp_dc_add takes a channel number which is a
+ * "devise" ring ID.
+ */
+ channel = nxge->pt_config.hw_config.tdc.start + rhandle->index;
+
+ /*
+ * Remove the ring from the default group
+ */
+ if (rgroup->gindex != 0) {
+ (void) nxge_grp_dc_remove(nxge, VP_BOUND_TX, channel);
+ }
+
+ /*
+ * nxge->tx_set.group[] is an array of groups indexed by
+ * a "port" group ID.
+ */
+ grp = nxge->tx_set.group[rgroup->gindex];
+ rv = nxge_grp_dc_add(nxge, grp, VP_BOUND_TX, channel);
+ if (rv != 0) {
+ NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL,
+ "nxge_group_add_ring: nxge_grp_dc_add failed"));
+ }
+ break;
+
+ case MAC_RING_TYPE_RX:
+ /*
+ * nxge->rx_set.group[] is an array of groups indexed by
+ * a "port" group ID.
+ */
+ grp = nxge->rx_set.group[rgroup->gindex];
+
+ dev_gindex = nxge->pt_config.hw_config.def_mac_rxdma_grpid +
+ rgroup->gindex;
+ rdc_grp = &nxge->pt_config.rdc_grps[dev_gindex];
+
+ /*
+ * nxge_grp_dc_add takes a channel number which is a
+ * "devise" ring ID.
+ */
+ channel = nxge->pt_config.hw_config.start_rdc + rhandle->index;
+ rv = nxge_grp_dc_add(nxge, grp, VP_BOUND_RX, channel);
+ if (rv != 0) {
+ NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL,
+ "nxge_group_add_ring: nxge_grp_dc_add failed"));
+ }
+
+ rdc_grp->map |= (1 << channel);
+ rdc_grp->max_rdcs++;
+
+ (void) nxge_init_fzc_rdc_tbl(nxge, rgroup->rdctbl);
+ break;
+ }
+}
+
+static void
+nxge_group_rem_ring(mac_group_driver_t gh, mac_ring_driver_t rh,
+ mac_ring_type_t type)
+{
+ nxge_ring_group_t *rgroup = (nxge_ring_group_t *)gh;
+ nxge_ring_handle_t *rhandle = (nxge_ring_handle_t *)rh;
+ nxge_t *nxge;
+ uint16_t channel; /* device-wise ring id */
+ nxge_rdc_grp_t *rdc_grp;
+ int dev_gindex;
+
+ nxge = rgroup->nxgep;
+
+ switch (type) {
+ case MAC_RING_TYPE_TX:
+ dev_gindex = nxge->pt_config.hw_config.def_mac_txdma_grpid +
+ rgroup->gindex;
+ channel = nxge->pt_config.hw_config.tdc.start + rhandle->index;
+ nxge_grp_dc_remove(nxge, VP_BOUND_TX, channel);
+
+ /*
+ * Add the ring back to the default group
+ */
+ if (rgroup->gindex != 0) {
+ nxge_grp_t *grp;
+ grp = nxge->tx_set.group[0];
+ (void) nxge_grp_dc_add(nxge, grp, VP_BOUND_TX, channel);
+ }
+ break;
+
+ case MAC_RING_TYPE_RX:
+ dev_gindex = nxge->pt_config.hw_config.def_mac_rxdma_grpid +
+ rgroup->gindex;
+ rdc_grp = &nxge->pt_config.rdc_grps[dev_gindex];
+ channel = rdc_grp->start_rdc + rhandle->index;
+ nxge_grp_dc_remove(nxge, VP_BOUND_RX, channel);
+
+ rdc_grp->map &= ~(1 << channel);
+ rdc_grp->max_rdcs--;
+
+ (void) nxge_init_fzc_rdc_tbl(nxge, rgroup->rdctbl);
+ break;
+ }
+}
+
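+/*
+ * nxge_group_add_ring() and nxge_group_rem_ring() are symmetric: a TX
+ * ring added to a non-default group is first removed from the default
+ * group (index 0), and is moved back there when it is later removed
+ * from that group.
+ */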
+
+/*ARGSUSED*/
static nxge_status_t
nxge_add_intrs(p_nxge_t nxgep)
{
@@ -5818,33 +5807,6 @@ nxge_add_intrs(p_nxge_t nxgep)
return (status);
}
-/*ARGSUSED*/
-static nxge_status_t
-nxge_add_soft_intrs(p_nxge_t nxgep)
-{
-
- int ddi_status = DDI_SUCCESS;
- nxge_status_t status = NXGE_OK;
-
- NXGE_DEBUG_MSG((nxgep, DDI_CTL, "==> nxge_add_soft_intrs"));
-
- nxgep->resched_id = NULL;
- nxgep->resched_running = B_FALSE;
- ddi_status = ddi_add_softintr(nxgep->dip, DDI_SOFTINT_LOW,
- &nxgep->resched_id,
- NULL, NULL, nxge_reschedule, (caddr_t)nxgep);
- if (ddi_status != DDI_SUCCESS) {
- NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, "<== nxge_add_soft_intrs: "
- "ddi_add_softintrs failed: status 0x%08x",
- ddi_status));
- return (NXGE_ERROR | NXGE_DDI_FAILED);
- }
-
- NXGE_DEBUG_MSG((nxgep, DDI_CTL, "<== nxge_ddi_add_soft_intrs"));
-
- return (status);
-}
-
static nxge_status_t
nxge_add_intrs_adv(p_nxge_t nxgep)
{
@@ -6277,21 +6239,6 @@ nxge_remove_intrs(p_nxge_t nxgep)
/*ARGSUSED*/
static void
-nxge_remove_soft_intrs(p_nxge_t nxgep)
-{
- NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_remove_soft_intrs"));
- if (nxgep->resched_id) {
- ddi_remove_softintr(nxgep->resched_id);
- NXGE_DEBUG_MSG((nxgep, INT_CTL,
- "==> nxge_remove_soft_intrs: removed"));
- nxgep->resched_id = NULL;
- }
-
- NXGE_DEBUG_MSG((nxgep, INT_CTL, "<== nxge_remove_soft_intrs"));
-}
-
-/*ARGSUSED*/
-static void
nxge_intrs_enable(p_nxge_t nxgep)
{
p_nxge_intr_t intrp;
@@ -6389,6 +6336,7 @@ nxge_mac_register(p_nxge_t nxgep)
macp->m_margin = VLAN_TAGSZ;
macp->m_priv_props = nxge_priv_props;
macp->m_priv_prop_count = NXGE_MAX_PRIV_PROPS;
+ macp->m_v12n = MAC_VIRT_HIO | MAC_VIRT_LEVEL1 | MAC_VIRT_SERIALIZE;
NXGE_DEBUG_MSG((nxgep, MAC_CTL,
"==> nxge_mac_register: instance %d "
@@ -6941,7 +6889,7 @@ nxge_niu_peu_reset(p_nxge_t nxgep)
static void
nxge_set_pci_replay_timeout(p_nxge_t nxgep)
{
- p_dev_regs_t dev_regs;
+ p_dev_regs_t dev_regs;
uint32_t value;
NXGE_DEBUG_MSG((nxgep, DDI_CTL, "==> nxge_set_pci_replay_timeout"));
diff --git a/usr/src/uts/common/io/nxge/nxge_ndd.c b/usr/src/uts/common/io/nxge/nxge_ndd.c
index 90c8128428..38bf3d5969 100644
--- a/usr/src/uts/common/io/nxge/nxge_ndd.c
+++ b/usr/src/uts/common/io/nxge/nxge_ndd.c
@@ -980,15 +980,13 @@ nxge_param_get_txdma_info(p_nxge_t nxgep, queue_t *q, p_mblk_t mp, caddr_t cp)
mp->b_cont = np;
print_len = 0;
- ((mblk_t *)np)->b_wptr += print_len;
- buf_len -= print_len;
print_len = snprintf((char *)((mblk_t *)np)->b_wptr, buf_len,
"TDC\t HW TDC\t\n");
((mblk_t *)np)->b_wptr += print_len;
buf_len -= print_len;
set = &nxgep->tx_set;
- for (tdc = 0; tdc < NXGE_MAX_RDCS; tdc++) {
+ for (tdc = 0; tdc < NXGE_MAX_TDCS; tdc++) {
if ((1 << tdc) & set->owned.map) {
print_len = snprintf((char *)((mblk_t *)np)->b_wptr,
buf_len, "%d\n", tdc);
diff --git a/usr/src/uts/common/io/nxge/nxge_rxdma.c b/usr/src/uts/common/io/nxge/nxge_rxdma.c
index e0e81491c6..8aeb88f7c5 100644
--- a/usr/src/uts/common/io/nxge/nxge_rxdma.c
+++ b/usr/src/uts/common/io/nxge/nxge_rxdma.c
@@ -39,6 +39,13 @@
(rdc + nxgep->pt_config.hw_config.start_rdc)
/*
+ * XXX: This is a tunable to limit the number of packets each interrupt
+ * handles. 0 (default) means that each interrupt takes as many packets
+ * as it finds.
+ */
+extern int nxge_max_intr_pkts;
+
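+/*
+ * For example, a hypothetical /etc/system entry (standard
+ * "set driver:variable" tunable syntax) that would cap each interrupt
+ * at eight packets:
+ *
+ *	set nxge:nxge_max_intr_pkts = 8
+ */
+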
+/*
* Globals: tunable parameters (/etc/system or adb)
*
*/
@@ -115,7 +122,7 @@ nxge_status_t nxge_disable_rxdma_channel(p_nxge_t, uint16_t);
static p_rx_msg_t nxge_allocb(size_t, uint32_t, p_nxge_dma_common_t);
static void nxge_freeb(p_rx_msg_t);
-static void nxge_rx_pkts_vring(p_nxge_t, uint_t, rx_dma_ctl_stat_t);
+static mblk_t *nxge_rx_pkts_vring(p_nxge_t, uint_t, rx_dma_ctl_stat_t);
static nxge_status_t nxge_rx_err_evnts(p_nxge_t, int, rx_dma_ctl_stat_t);
static nxge_status_t nxge_rxdma_handle_port_errors(p_nxge_t,
@@ -137,8 +144,10 @@ nxge_status_t
nxge_init_rxdma_channels(p_nxge_t nxgep)
{
nxge_grp_set_t *set = &nxgep->rx_set;
- int i, count, rdc, channel;
+ int i, count, channel;
nxge_grp_t *group;
+ dc_map_t map;
+ int dev_gindex;
NXGE_DEBUG_MSG((nxgep, MEM2_CTL, "==> nxge_init_rxdma_channels"));
@@ -158,9 +167,11 @@ nxge_init_rxdma_channels(p_nxge_t nxgep)
for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) {
if ((1 << i) & set->lg.map) {
group = set->group[i];
-
+ dev_gindex =
+ nxgep->pt_config.hw_config.def_mac_rxdma_grpid + i;
+ map = nxgep->pt_config.rdc_grps[dev_gindex].map;
for (channel = 0; channel < NXGE_MAX_RDCS; channel++) {
- if ((1 << channel) & group->map) {
+ if ((1 << channel) & map) {
if ((nxge_grp_dc_add(nxgep,
group, VP_BOUND_RX, channel)))
goto init_rxdma_channels_exit;
@@ -178,15 +189,16 @@ init_rxdma_channels_exit:
for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) {
if ((1 << i) & set->lg.map) {
group = set->group[i];
-
- for (rdc = 0; rdc < NXGE_MAX_RDCS; rdc++) {
- if ((1 << rdc) & group->map) {
+ dev_gindex =
+ nxgep->pt_config.hw_config.def_mac_rxdma_grpid + i;
+ map = nxgep->pt_config.rdc_grps[dev_gindex].map;
+ for (channel = 0; channel < NXGE_MAX_RDCS; channel++) {
+ if ((1 << channel) & map) {
nxge_grp_dc_remove(nxgep,
- VP_BOUND_RX, rdc);
+ VP_BOUND_RX, channel);
}
}
}
-
if (++count == set->lg.count)
break;
}
@@ -1175,35 +1187,6 @@ nxge_rxdma_regs_dump(p_nxge_t nxgep, int rdc)
"<== nxge_rxdma_regs_dump: rdc rdc %d", rdc));
}
-void
-nxge_rxdma_stop(p_nxge_t nxgep)
-{
- NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rxdma_stop"));
-
- (void) nxge_link_monitor(nxgep, LINK_MONITOR_STOP);
- (void) nxge_rx_mac_disable(nxgep);
- (void) nxge_rxdma_hw_mode(nxgep, NXGE_DMA_STOP);
- NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rxdma_stop"));
-}
-
-void
-nxge_rxdma_stop_reinit(p_nxge_t nxgep)
-{
- NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rxdma_stop_reinit"));
-
- (void) nxge_rxdma_stop(nxgep);
- (void) nxge_uninit_rxdma_channels(nxgep);
- (void) nxge_init_rxdma_channels(nxgep);
-
-#ifndef AXIS_DEBUG_LB
- (void) nxge_xcvr_init(nxgep);
- (void) nxge_link_monitor(nxgep, LINK_MONITOR_START);
-#endif
- (void) nxge_rx_mac_enable(nxgep);
-
- NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rxdma_stop_reinit"));
-}
-
nxge_status_t
nxge_rxdma_hw_mode(p_nxge_t nxgep, boolean_t enable)
{
@@ -1438,11 +1421,53 @@ nxge_rxdma_fixup_channel_fail:
NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rxdma_fixup_channel"));
}
-/* ARGSUSED */
+/*
+ * Convert an absolute RDC number to a Receive Buffer Ring index. That is,
+ * map <channel> to an index into nxgep->rx_rbr_rings.
+ * (device ring index -> port ring index)
+ */
int
nxge_rxdma_get_ring_index(p_nxge_t nxgep, uint16_t channel)
{
- return (channel);
+ int i, ndmas;
+ uint16_t rdc;
+ p_rx_rbr_rings_t rx_rbr_rings;
+ p_rx_rbr_ring_t *rbr_rings;
+
+ NXGE_DEBUG_MSG((nxgep, RX_CTL,
+ "==> nxge_rxdma_get_ring_index: channel %d", channel));
+
+ rx_rbr_rings = nxgep->rx_rbr_rings;
+ if (rx_rbr_rings == NULL) {
+ NXGE_DEBUG_MSG((nxgep, RX_CTL,
+ "<== nxge_rxdma_get_ring_index: NULL ring pointer"));
+ return (-1);
+ }
+ ndmas = rx_rbr_rings->ndmas;
+ if (!ndmas) {
+ NXGE_DEBUG_MSG((nxgep, RX_CTL,
+ "<== nxge_rxdma_get_ring_index: no channel"));
+ return (-1);
+ }
+
+ NXGE_DEBUG_MSG((nxgep, RX_CTL,
+ "==> nxge_rxdma_get_ring_index (ndmas %d)", ndmas));
+
+ rbr_rings = rx_rbr_rings->rbr_rings;
+ for (i = 0; i < ndmas; i++) {
+ rdc = rbr_rings[i]->rdc;
+ if (channel == rdc) {
+ NXGE_DEBUG_MSG((nxgep, RX_CTL,
+ "==> nxge_rxdma_get_rbr_ring: channel %d "
+ "(index %d) ring %d", channel, i, rbr_rings[i]));
+ return (i);
+ }
+ }
+
+ NXGE_DEBUG_MSG((nxgep, RX_CTL,
+ "<== nxge_rxdma_get_rbr_ring_index: not found"));
+
+ return (-1);
}
p_rx_rbr_ring_t
@@ -1792,11 +1817,12 @@ nxge_rx_intr(void *arg1, void *arg2)
uint8_t channel;
npi_handle_t handle;
rx_dma_ctl_stat_t cs;
+ p_rx_rcr_ring_t rcr_ring;
+ mblk_t *mp;
#ifdef NXGE_DEBUG
rxdma_cfig1_t cfg;
#endif
- uint_t serviced = DDI_INTR_UNCLAIMED;
if (ldvp == NULL) {
NXGE_DEBUG_MSG((NULL, INT_CTL,
@@ -1826,11 +1852,37 @@ nxge_rx_intr(void *arg1, void *arg2)
* receive dma channel.
*/
handle = NXGE_DEV_NPI_HANDLE(nxgep);
+
+ rcr_ring = nxgep->rx_rcr_rings->rcr_rings[ldvp->vdma_index];
+
+ /*
+ * The RCR ring lock must be held while packets are being
+ * processed and while the hardware registers are being read or
+ * written, to prevent races among the interrupt thread, the
+ * polling thread, and the setting of the poll_flag; such races
+ * can cause fatal errors (e.g. the rcrincon bit getting set).
+ */
+ MUTEX_ENTER(&rcr_ring->lock);
+
/*
* Get the control and status for this channel.
*/
channel = ldvp->channel;
ldgp = ldvp->ldgp;
+
+ if (!isLDOMguest(nxgep)) {
+ if (!nxgep->rx_channel_started[channel]) {
+ NXGE_DEBUG_MSG((nxgep, INT_CTL,
+ "<== nxge_rx_intr: channel is not started"));
+ MUTEX_EXIT(&rcr_ring->lock);
+ return (DDI_INTR_CLAIMED);
+ }
+ }
+
+ ASSERT(rcr_ring->ldgp == ldgp);
+ ASSERT(rcr_ring->ldvp == ldvp);
+
RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, channel, &cs.value);
NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_intr:channel %d "
@@ -1840,15 +1892,13 @@ nxge_rx_intr(void *arg1, void *arg2)
cs.bits.hdw.rcrto,
cs.bits.hdw.rcrthres));
- nxge_rx_pkts_vring(nxgep, ldvp->vdma_index, cs);
- serviced = DDI_INTR_CLAIMED;
+ mp = nxge_rx_pkts_vring(nxgep, ldvp->vdma_index, cs);
/* error events. */
if (cs.value & RX_DMA_CTL_STAT_ERROR) {
(void) nxge_rx_err_evnts(nxgep, channel, cs);
}
-nxge_intr_exit:
/*
* Enable the mailbox update interrupt if we want
* to use mailbox. We probably don't need to use
@@ -1856,40 +1906,82 @@ nxge_intr_exit:
* Also write 1 to rcrthres and rcrto to clear
* these two edge triggered bits.
*/
-
cs.value &= RX_DMA_CTL_STAT_WR1C;
- cs.bits.hdw.mex = 1;
+ cs.bits.hdw.mex = rcr_ring->poll_flag ? 0 : 1;
RXDMA_REG_WRITE64(handle, RX_DMA_CTL_STAT_REG, channel,
cs.value);
/*
- * Rearm this logical group if this is a single device
- * group.
+ * If the polling mode is enabled, disable the interrupt.
*/
- if (ldgp->nldvs == 1) {
- ldgimgm_t mgm;
- mgm.value = 0;
- mgm.bits.ldw.arm = 1;
- mgm.bits.ldw.timer = ldgp->ldg_timer;
- if (isLDOMguest(nxgep)) {
- nxge_hio_ldgimgn(nxgep, ldgp);
- } else {
+ if (rcr_ring->poll_flag) {
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_rx_intr: rdc %d ldgp $%p ldvp $%p "
+ "(disabling interrupts)", channel, ldgp, ldvp));
+ /*
+ * Disarm this logical group if this is a single device
+ * group.
+ */
+ if (ldgp->nldvs == 1) {
+ ldgimgm_t mgm;
+ mgm.value = 0;
+ mgm.bits.ldw.arm = 0;
NXGE_REG_WR64(handle,
- LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg),
- mgm.value);
+ LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value);
+ }
+ } else {
+ /*
+ * Rearm this logical group if this is a single device group.
+ */
+ if (ldgp->nldvs == 1) {
+ if (isLDOMguest(nxgep)) {
+ nxge_hio_ldgimgn(nxgep, ldgp);
+ } else {
+ ldgimgm_t mgm;
+
+ mgm.value = 0;
+ mgm.bits.ldw.arm = 1;
+ mgm.bits.ldw.timer = ldgp->ldg_timer;
+
+ NXGE_REG_WR64(handle,
+ LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg),
+ mgm.value);
+ }
}
+
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_rx_intr: rdc %d ldgp $%p "
+ "exiting ISR (and call mac_rx_ring)", channel, ldgp));
}
+ MUTEX_EXIT(&rcr_ring->lock);
- NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_intr: serviced %d",
- serviced));
- return (serviced);
+ if (mp) {
+ if (!isLDOMguest(nxgep))
+ mac_rx_ring(nxgep->mach, rcr_ring->rcr_mac_handle, mp,
+ rcr_ring->rcr_gen_num);
+#if defined(sun4v)
+ else { /* isLDOMguest(nxgep) */
+ nxge_hio_data_t *nhd = (nxge_hio_data_t *)
+ nxgep->nxge_hw_p->hio;
+ nx_vio_fp_t *vio = &nhd->hio.vio;
+
+ if (vio->cb.vio_net_rx_cb) {
+ (*vio->cb.vio_net_rx_cb)
+ (nxgep->hio_vr->vhp, mp);
+ }
+ }
+#endif
+ }
+ NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_intr: DDI_INTR_CLAIMED"));
+ return (DDI_INTR_CLAIMED);
}
/*
* Process the packets received in the specified logical device
* and pass up a chain of message blocks to the upper layer.
+ * The RCR ring lock must be held before calling this function.
*/
-static void
+static mblk_t *
nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs)
{
p_mblk_t mp;
@@ -1897,15 +1989,14 @@ nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs)
NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_pkts_vring"));
rcrp = nxgep->rx_rcr_rings->rcr_rings[vindex];
- if (rcrp->poll_flag) {
- /* It is in the poll mode */
- return;
- }
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_rx_pkts_vring: (calling nxge_rx_pkts)rdc %d "
+ "rcr_mac_handle $%p ", rcrp->rdc, rcrp->rcr_mac_handle));
if ((mp = nxge_rx_pkts(nxgep, rcrp, cs, -1)) == NULL) {
NXGE_DEBUG_MSG((nxgep, RX_CTL,
"<== nxge_rx_pkts_vring: no mp"));
- return;
+ return (NULL);
}
NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_pkts_vring: $%p",
@@ -1947,21 +2038,11 @@ nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs)
mp->b_next->b_wptr - mp->b_next->b_rptr)));
}
#endif
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "<== nxge_rx_pkts_vring: returning rdc %d rcr_mac_handle $%p ",
+ rcrp->rdc, rcrp->rcr_mac_handle));
- if (!isLDOMguest(nxgep))
- mac_rx(nxgep->mach, rcrp->rcr_mac_handle, mp);
-#if defined(sun4v)
- else { /* isLDOMguest(nxgep) */
- nxge_hio_data_t *nhd = (nxge_hio_data_t *)
- nxgep->nxge_hw_p->hio;
- nx_vio_fp_t *vio = &nhd->hio.vio;
-
- if (vio->cb.vio_net_rx_cb) {
- (*vio->cb.vio_net_rx_cb)
- (nxgep->hio_vr->vhp, mp);
- }
- }
-#endif
+ return (mp);
}
@@ -1978,6 +2059,7 @@ nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs)
* a hardware control status register will be updated with the number of
 * packets that were removed from the hardware queue.
*
+ * The RCR ring lock is held when entering this function.
*/
static mblk_t *
nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs,
@@ -1998,7 +2080,7 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs,
npi_status_t rs = NPI_SUCCESS;
#endif
- NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_pkts: "
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, "==> nxge_rx_pkts: "
"channel %d", rcr_p->rdc));
if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) {
@@ -2032,7 +2114,7 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs,
if (!qlen) {
- NXGE_DEBUG_MSG((nxgep, RX_CTL,
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
"==> nxge_rx_pkts:rcr channel %d "
"qlen %d (no pkts)", channel, qlen));
@@ -2140,6 +2222,13 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs,
(totallen >= bytes_to_pickup)) {
break;
}
+
+ /* limit the number of packets processed by each interrupt */
+ if (!(rcr_p->poll_flag)) {
+ if (npkt_read == nxge_max_intr_pkts) {
+ break;
+ }
+ }
}
rcr_p->rcr_desc_rd_head_pp = rcr_desc_rd_head_pp;
@@ -2174,7 +2263,9 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs,
* read.
*/
- NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_pkts"));
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, "<== nxge_rx_pkts: return "
+ "channel %d", rcr_p->rdc));
+
return (head_mp);
}
@@ -2280,7 +2371,7 @@ nxge_receive_packet(p_nxge_t nxgep,
}
/*
- * Sofware workaround for BMAC hardware limitation that allows
+ * Software workaround for BMAC hardware limitation that allows
* maxframe size of 1526, instead of 1522 for non-jumbo and 0x2406
* instead of 0x2400 for jumbo.
*/
@@ -2318,7 +2409,6 @@ nxge_receive_packet(p_nxge_t nxgep,
hdr_size));
}
- MUTEX_ENTER(&rcr_p->lock);
MUTEX_ENTER(&rx_rbr_p->lock);
NXGE_DEBUG_MSG((nxgep, RX_CTL,
@@ -2344,7 +2434,6 @@ nxge_receive_packet(p_nxge_t nxgep,
if (status != NXGE_OK) {
MUTEX_EXIT(&rx_rbr_p->lock);
- MUTEX_EXIT(&rcr_p->lock);
NXGE_DEBUG_MSG((nxgep, RX_CTL,
"<== nxge_receive_packet: found vaddr failed %d",
status));
@@ -2392,7 +2481,6 @@ nxge_receive_packet(p_nxge_t nxgep,
break;
default:
MUTEX_EXIT(&rx_rbr_p->lock);
- MUTEX_EXIT(&rcr_p->lock);
return;
}
@@ -2558,7 +2646,6 @@ nxge_receive_packet(p_nxge_t nxgep,
}
MUTEX_EXIT(&rx_rbr_p->lock);
- MUTEX_EXIT(&rcr_p->lock);
nxge_freeb(rx_msg_p);
return;
}
@@ -2643,7 +2730,6 @@ nxge_receive_packet(p_nxge_t nxgep,
rx_msg_p->free = B_TRUE;
}
MUTEX_EXIT(&rx_rbr_p->lock);
- MUTEX_EXIT(&rcr_p->lock);
nxge_freeb(rx_msg_p);
return;
}
@@ -2657,7 +2743,6 @@ nxge_receive_packet(p_nxge_t nxgep,
rcr_p->rcvd_pkt_bytes = bytes_read;
MUTEX_EXIT(&rx_rbr_p->lock);
- MUTEX_EXIT(&rcr_p->lock);
if (rx_msg_p->free && rx_msg_p->rx_use_bcopy) {
atomic_inc_32(&rx_msg_p->ref_cnt);
@@ -2682,8 +2767,6 @@ nxge_receive_packet(p_nxge_t nxgep,
if (is_valid && !multi) {
/*
- * Update hardware checksuming.
- *
* If the checksum flag nxge_chksum_offload
* is 1, TCP and UDP packets can be sent
* up with good checksum. If the checksum flag
@@ -2727,6 +2810,177 @@ nxge_receive_packet(p_nxge_t nxgep,
*multi_p, nmp, *mp, *mp_cont));
}
+/*
+ * Enable polling for a ring. The ring's interrupt is disarmed when
+ * the next nxge interrupt arrives (see nxge_rx_intr).
+ */
+int
+nxge_enable_poll(void *arg)
+{
+ p_nxge_ring_handle_t ring_handle = (p_nxge_ring_handle_t)arg;
+ p_rx_rcr_ring_t ringp;
+ p_nxge_t nxgep;
+ p_nxge_ldg_t ldgp;
+ uint32_t channel;
+
+ if (ring_handle == NULL) {
+ return (0);
+ }
+
+ nxgep = ring_handle->nxgep;
+ channel = nxgep->pt_config.hw_config.start_rdc + ring_handle->index;
+ ringp = nxgep->rx_rcr_rings->rcr_rings[channel];
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_enable_poll: rdc %d ", ringp->rdc));
+ ldgp = ringp->ldgp;
+ if (ldgp == NULL) {
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_enable_poll: rdc %d NULL ldgp: no change",
+ ringp->rdc));
+ return (0);
+ }
+
+ MUTEX_ENTER(&ringp->lock);
+ /* enable polling */
+ if (ringp->poll_flag == 0) {
+ ringp->poll_flag = 1;
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_enable_poll: rdc %d set poll flag to 1",
+ ringp->rdc));
+ }
+
+ MUTEX_EXIT(&ringp->lock);
+ return (0);
+}
+/*
+ * Disable polling for a ring and enable its interrupt.
+ */
+int
+nxge_disable_poll(void *arg)
+{
+ p_nxge_ring_handle_t ring_handle = (p_nxge_ring_handle_t)arg;
+ p_rx_rcr_ring_t ringp;
+ p_nxge_t nxgep;
+ uint32_t channel;
+
+ if (ring_handle == NULL) {
+ return (0);
+ }
+
+ nxgep = ring_handle->nxgep;
+ channel = nxgep->pt_config.hw_config.start_rdc + ring_handle->index;
+ ringp = nxgep->rx_rcr_rings->rcr_rings[channel];
+
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_disable_poll: rdc %d poll_flag %d", ringp->rdc));
+
+ MUTEX_ENTER(&ringp->lock);
+
+ /* disable polling: enable interrupt */
+ if (ringp->poll_flag) {
+ npi_handle_t handle;
+ rx_dma_ctl_stat_t cs;
+ uint8_t channel;
+ p_nxge_ldg_t ldgp;
+
+ /*
+ * Get the control and status for this channel.
+ */
+ handle = NXGE_DEV_NPI_HANDLE(nxgep);
+ channel = ringp->rdc;
+ RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG,
+ channel, &cs.value);
+
+ /*
+ * Enable mailbox update
+ * Since packets were not read and the hardware uses
+ * bits pktread and ptrread to update the queue
+ * length, we need to set both bits to 0.
+ */
+ cs.bits.ldw.pktread = 0;
+ cs.bits.ldw.ptrread = 0;
+ cs.bits.hdw.mex = 1;
+ RXDMA_REG_WRITE64(handle, RX_DMA_CTL_STAT_REG, channel,
+ cs.value);
+
+ /*
+ * Rearm this logical group if this is a single device
+ * group.
+ */
+ ldgp = ringp->ldgp;
+ if (ldgp == NULL) {
+ ringp->poll_flag = 0;
+ MUTEX_EXIT(&ringp->lock);
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_disable_poll: no ldgp rdc %d "
+ "(still set poll to 0", ringp->rdc));
+ return (0);
+ }
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_disable_poll: rdc %d ldgp $%p (enable intr)",
+ ringp->rdc, ldgp));
+ if (ldgp->nldvs == 1) {
+ ldgimgm_t mgm;
+ mgm.value = 0;
+ mgm.bits.ldw.arm = 1;
+ mgm.bits.ldw.timer = ldgp->ldg_timer;
+ NXGE_REG_WR64(handle,
+ LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value);
+ }
+ ringp->poll_flag = 0;
+ }
+
+ MUTEX_EXIT(&ringp->lock);
+ return (0);
+}
+
+/*
+ * Poll up to 'bytes_to_pickup' bytes of messages from the rx ring.
+ */
+mblk_t *
+nxge_rx_poll(void *arg, int bytes_to_pickup)
+{
+ p_nxge_ring_handle_t ring_handle = (p_nxge_ring_handle_t)arg;
+ p_rx_rcr_ring_t rcr_p;
+ p_nxge_t nxgep;
+ npi_handle_t handle;
+ rx_dma_ctl_stat_t cs;
+ mblk_t *mblk;
+ p_nxge_ldv_t ldvp;
+ uint32_t channel;
+
+ nxgep = ring_handle->nxgep;
+
+ /*
+ * Get the control and status for this channel.
+ */
+ handle = NXGE_DEV_NPI_HANDLE(nxgep);
+ channel = nxgep->pt_config.hw_config.start_rdc + ring_handle->index;
+ rcr_p = nxgep->rx_rcr_rings->rcr_rings[channel];
+ MUTEX_ENTER(&rcr_p->lock);
+ ASSERT(rcr_p->poll_flag == 1);
+
+ RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, rcr_p->rdc, &cs.value);
+
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "==> nxge_rx_poll: calling nxge_rx_pkts: rdc %d poll_flag %d",
+ rcr_p->rdc, rcr_p->poll_flag));
+ mblk = nxge_rx_pkts(nxgep, rcr_p, cs, bytes_to_pickup);
+
+ ldvp = rcr_p->ldvp;
+ /* error events. */
+ if (ldvp && (cs.value & RX_DMA_CTL_STAT_ERROR)) {
+ (void) nxge_rx_err_evnts(nxgep, ldvp->vdma_index, cs);
+ }
+
+ MUTEX_EXIT(&rcr_p->lock);
+
+ NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
+ "<== nxge_rx_poll: rdc %d mblk $%p", rcr_p->rdc, mblk));
+ return (mblk);
+}
+
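+/*
+ * A conceptual sketch (not driver code; deliver() and budget are
+ * illustrative names) of how the MAC layer drives the poll-mode entry
+ * points registered in nxge_fill_ring():
+ *
+ *	(*mi_disable)(mi_handle);		enter poll mode
+ *	while ((mp = (*mri_poll)(mri_driver, budget)) != NULL)
+ *		deliver(mp);			consume the polled chain
+ *	(*mi_enable)(mi_handle);		back to interrupt mode
+ */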
+
/*ARGSUSED*/
static nxge_status_t
nxge_rx_err_evnts(p_nxge_t nxgep, int channel, rx_dma_ctl_stat_t cs)
@@ -4231,6 +4485,7 @@ nxge_rxdma_stop_channel(p_nxge_t nxgep, uint16_t channel)
* Make sure channel is disabled.
*/
status = nxge_disable_rxdma_channel(nxgep, channel);
+
if (status != NXGE_OK) {
NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL,
" nxge_rxdma_stop_channel: "
diff --git a/usr/src/uts/common/io/nxge/nxge_send.c b/usr/src/uts/common/io/nxge/nxge_send.c
index 7e656c9072..2b21d22a1c 100644
--- a/usr/src/uts/common/io/nxge/nxge_send.c
+++ b/usr/src/uts/common/io/nxge/nxge_send.c
@@ -40,8 +40,6 @@ static void nxge_hcksum_retrieve(mblk_t *,
uint32_t *, uint32_t *);
static uint32_t nxge_csgen(uint16_t *, int);
-extern void nxge_txdma_freemsg_task(p_tx_ring_t ringp);
-
extern uint32_t nxge_reclaim_pending;
extern uint32_t nxge_bcopy_thresh;
extern uint32_t nxge_dvma_thresh;
@@ -51,18 +49,116 @@ extern uint32_t nxge_tx_intr_thres;
extern uint32_t nxge_tx_max_gathers;
extern uint32_t nxge_tx_tiny_pack;
extern uint32_t nxge_tx_use_bcopy;
-extern uint32_t nxge_tx_lb_policy;
-extern uint32_t nxge_no_tx_lb;
extern nxge_tx_mode_t nxge_tx_scheme;
uint32_t nxge_lso_kick_cnt = 2;
-typedef struct _mac_tx_hint {
- uint16_t sap;
- uint16_t vid;
- void *hash;
-} mac_tx_hint_t, *p_mac_tx_hint_t;
-int nxge_tx_lb_ring_1(p_mblk_t, uint32_t, p_mac_tx_hint_t);
+void
+nxge_tx_ring_task(void *arg)
+{
+ p_tx_ring_t ring = (p_tx_ring_t)arg;
+
+ MUTEX_ENTER(&ring->lock);
+ (void) nxge_txdma_reclaim(ring->nxgep, ring, 0);
+ MUTEX_EXIT(&ring->lock);
+
+ if (!isLDOMguest(ring->nxgep) && !ring->tx_ring_offline)
+ mac_tx_ring_update(ring->nxgep->mach, ring->tx_ring_handle);
+#if defined(sun4v)
+ else {
+ nxge_hio_data_t *nhd =
+ (nxge_hio_data_t *)ring->nxgep->nxge_hw_p->hio;
+ nx_vio_fp_t *vio = &nhd->hio.vio;
+
+ /* Call back vnet. */
+ if (vio->cb.vio_net_tx_update) {
+ (*vio->cb.vio_net_tx_update)(ring->nxgep->hio_vr->vhp);
+ }
+ }
+#endif
+}
+
+static void
+nxge_tx_ring_dispatch(p_tx_ring_t ring)
+{
+ /*
+ * Kick the ring task to reclaim some buffers.
+ */
+ (void) ddi_taskq_dispatch(ring->taskq,
+ nxge_tx_ring_task, (void *)ring, DDI_SLEEP);
+}
+
+mblk_t *
+nxge_tx_ring_send(void *arg, mblk_t *mp)
+{
+ p_nxge_ring_handle_t nrhp = (p_nxge_ring_handle_t)arg;
+ p_nxge_t nxgep;
+ p_tx_ring_t tx_ring_p;
+ int status, channel;
+
+ ASSERT(nrhp != NULL);
+ nxgep = nrhp->nxgep;
+ channel = nxgep->pt_config.hw_config.tdc.start + nrhp->index;
+ tx_ring_p = nxgep->tx_rings->rings[channel];
+
+ ASSERT(nxgep == tx_ring_p->nxgep);
+
+#ifdef DEBUG
+ if (isLDOMservice(nxgep)) {
+ ASSERT(!tx_ring_p->tx_ring_offline);
+ }
+#endif
+
+ status = nxge_start(nxgep, tx_ring_p, mp);
+ if (status) {
+ nxge_tx_ring_dispatch(tx_ring_p);
+ return (mp);
+ }
+
+ return ((mblk_t *)NULL);
+}
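+
+/*
+ * Returning the original mblk from nxge_tx_ring_send() tells the MAC
+ * layer that the ring is out of descriptors; the dispatched ring task
+ * later calls mac_tx_ring_update() (see nxge_tx_ring_task) so that
+ * transmission can resume.
+ */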
+
+#if defined(sun4v)
+
+/*
+ * nxge_m_tx() is needed for Hybrid I/O operation of the vnet in
+ * the guest domain. See CR 6778758 for the long-term solution.
+ */
+
+mblk_t *
+nxge_m_tx(void *arg, mblk_t *mp)
+{
+ p_nxge_t nxgep = (p_nxge_t)arg;
+ mblk_t *next;
+ p_tx_ring_t tx_ring_p;
+ int status;
+
+ NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_m_tx"));
+
+ /*
+ * Get the default ring handle.
+ */
+ tx_ring_p = nxgep->tx_rings->rings[0];
+
+ while (mp != NULL) {
+ next = mp->b_next;
+ mp->b_next = NULL;
+
+ status = nxge_start(nxgep, tx_ring_p, mp);
+ if (status != 0) {
+ mp->b_next = next;
+ nxge_tx_ring_dispatch(tx_ring_p);
+ return (mp);
+ }
+
+ mp = next;
+ }
+
+ NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_m_tx"));
+ return ((mblk_t *)NULL);
+}
+
+#endif
int
nxge_start(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, p_mblk_t mp)
@@ -305,8 +401,6 @@ start_again:
tx_ring_p->tdc));
goto nxge_start_fail_lso;
} else {
- boolean_t skip_sched = B_FALSE;
-
cas32((uint32_t *)&tx_ring_p->queueing, 0, 1);
tdc_stats->tx_no_desc++;
@@ -316,16 +410,10 @@ start_again:
(void) atomic_swap_32(
&tx_ring_p->tx_ring_offline,
NXGE_TX_RING_OFFLINED);
- skip_sched = B_TRUE;
}
}
MUTEX_EXIT(&tx_ring_p->lock);
- if (nxgep->resched_needed &&
- !nxgep->resched_running && !skip_sched) {
- nxgep->resched_running = B_TRUE;
- ddi_trigger_softintr(nxgep->resched_id);
- }
status = 1;
goto nxge_start_fail1;
}
@@ -1012,10 +1100,7 @@ nxge_start_control_header_only:
MUTEX_EXIT(&tx_ring_p->lock);
- nxge_txdma_freemsg_task(tx_ring_p);
-
NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_start"));
-
return (status);
nxge_start_fail_lso:
@@ -1105,8 +1190,6 @@ nxge_start_fail2:
tx_ring_p->tx_wrap_mask);
}
-
- nxgep->resched_needed = B_TRUE;
}
if (isLDOMservice(nxgep)) {
@@ -1123,300 +1206,9 @@ nxge_start_fail1:
/* Add FMA to check the access handle nxge_hregh */
NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_start"));
-
- return (status);
-}
-
-int
-nxge_serial_tx(mblk_t *mp, void *arg)
-{
- p_tx_ring_t tx_ring_p = (p_tx_ring_t)arg;
- p_nxge_t nxgep = tx_ring_p->nxgep;
- int status = 0;
-
- if (isLDOMservice(nxgep)) {
- if (tx_ring_p->tx_ring_offline) {
- freemsg(mp);
- return (status);
- }
- }
-
- status = nxge_start(nxgep, tx_ring_p, mp);
return (status);
}
-boolean_t
-nxge_send(p_nxge_t nxgep, mblk_t *mp, p_mac_tx_hint_t hp)
-{
- p_tx_ring_t *tx_rings;
- uint8_t ring_index;
- p_tx_ring_t tx_ring_p;
- nxge_grp_t *group;
-
- NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_send"));
-
- ASSERT(mp->b_next == NULL);
-
- group = nxgep->tx_set.group[0]; /* The default group */
- ring_index = nxge_tx_lb_ring_1(mp, group->count, hp);
-
- tx_rings = nxgep->tx_rings->rings;
- tx_ring_p = tx_rings[group->legend[ring_index]];
-
- if (isLDOMservice(nxgep)) {
- if (tx_ring_p->tx_ring_offline) {
- /*
- * OFFLINE means that it is in the process of being
- * shared - that is, it has been claimed by the HIO
- * code, but hasn't been unlinked from <group> yet.
- * So in this case use the first TDC, which always
- * belongs to the service domain and can't be shared.
- */
- ring_index = 0;
- tx_ring_p = tx_rings[group->legend[ring_index]];
- }
- }
-
- NXGE_DEBUG_MSG((nxgep, TX_CTL, "count %d, tx_rings[%d] = %p",
- (int)group->count, group->legend[ring_index], tx_ring_p));
-
- switch (nxge_tx_scheme) {
- case NXGE_USE_START:
- if (nxge_start(nxgep, tx_ring_p, mp)) {
- NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_send: failed "
- "ring index %d", ring_index));
- return (B_FALSE);
- }
- break;
-
- case NXGE_USE_SERIAL:
- default:
- nxge_serialize_enter(tx_ring_p->serial, mp);
- break;
- }
-
- NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_send: ring index %d",
- ring_index));
-
- return (B_TRUE);
-}
-
-/*
- * nxge_m_tx() - send a chain of packets
- */
-mblk_t *
-nxge_m_tx(void *arg, mblk_t *mp)
-{
- p_nxge_t nxgep = (p_nxge_t)arg;
- mblk_t *next;
- mac_tx_hint_t hint;
-
- NXGE_DEBUG_MSG((nxgep, DDI_CTL, "==> nxge_m_tx"));
-
- if ((!(nxgep->drv_state & STATE_HW_INITIALIZED)) ||
- (nxgep->nxge_mac_state != NXGE_MAC_STARTED)) {
- NXGE_DEBUG_MSG((nxgep, DDI_CTL,
- "==> nxge_m_tx: hardware not initialized"));
- NXGE_DEBUG_MSG((nxgep, DDI_CTL,
- "<== nxge_m_tx"));
- freemsgchain(mp);
- mp = NULL;
- return (mp);
- }
-
- hint.hash = NULL;
- hint.vid = 0;
- hint.sap = 0;
-
- while (mp != NULL) {
- next = mp->b_next;
- mp->b_next = NULL;
-
- /*
- * Until Nemo tx resource works, the mac driver
- * does the load balancing based on TCP port,
- * or CPU. For debugging, we use a system
- * configurable parameter.
- */
- if (!nxge_send(nxgep, mp, &hint)) {
- mp->b_next = next;
- break;
- }
-
- mp = next;
-
- NXGE_DEBUG_MSG((NULL, TX_CTL,
- "==> nxge_m_tx: (go back to loop) mp $%p next $%p",
- mp, next));
- }
-
- NXGE_DEBUG_MSG((nxgep, DDI_CTL, "<== nxge_m_tx"));
- return (mp);
-}
-
-int
-nxge_tx_lb_ring_1(p_mblk_t mp, uint32_t maxtdcs, p_mac_tx_hint_t hp)
-{
- uint8_t ring_index = 0;
- uint8_t *tcp_port;
- p_mblk_t nmp;
- size_t mblk_len;
- size_t iph_len;
- size_t hdrs_size;
- uint8_t hdrs_buf[sizeof (struct ether_vlan_header) +
- IP_MAX_HDR_LENGTH + sizeof (uint32_t)];
- /*
- * allocate space big enough to cover
- * the max ip header length and the first
- * 4 bytes of the TCP/IP header.
- */
-
- boolean_t qos = B_FALSE;
- ushort_t eth_type;
- size_t eth_hdr_size;
-
- NXGE_DEBUG_MSG((NULL, TX_CTL, "==> nxge_tx_lb_ring"));
-
- if (hp->vid) {
- qos = B_TRUE;
- }
- switch (nxge_tx_lb_policy) {
- case NXGE_TX_LB_TCPUDP: /* default IPv4 TCP/UDP */
- default:
- tcp_port = mp->b_rptr;
- eth_type = ntohs(((struct ether_header *)tcp_port)->ether_type);
- if (eth_type == VLAN_ETHERTYPE) {
- eth_type = ntohs(((struct ether_vlan_header *)
- tcp_port)->ether_type);
- eth_hdr_size = sizeof (struct ether_vlan_header);
- } else {
- eth_hdr_size = sizeof (struct ether_header);
- }
-
- if (!nxge_no_tx_lb && !qos && eth_type == ETHERTYPE_IP) {
- nmp = mp;
- mblk_len = MBLKL(nmp);
- tcp_port = NULL;
- if (mblk_len > eth_hdr_size + sizeof (uint8_t)) {
- tcp_port = nmp->b_rptr + eth_hdr_size;
- mblk_len -= eth_hdr_size;
- iph_len = ((*tcp_port) & 0x0f) << 2;
- if (mblk_len > (iph_len + sizeof (uint32_t))) {
- tcp_port = nmp->b_rptr;
- } else {
- tcp_port = NULL;
- }
- }
- if (tcp_port == NULL) {
- hdrs_size = 0;
- while ((nmp) && (hdrs_size <
- sizeof (hdrs_buf))) {
- mblk_len = MBLKL(nmp);
- if (mblk_len >=
- (sizeof (hdrs_buf) - hdrs_size))
- mblk_len = sizeof (hdrs_buf) -
- hdrs_size;
- bcopy(nmp->b_rptr,
- &hdrs_buf[hdrs_size], mblk_len);
- hdrs_size += mblk_len;
- nmp = nmp->b_cont;
- }
- tcp_port = hdrs_buf;
- }
- tcp_port += eth_hdr_size;
- if (!(tcp_port[6] & 0x3f) && !(tcp_port[7] & 0xff)) {
- switch (tcp_port[9]) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_ESP:
- tcp_port += ((*tcp_port) & 0x0f) << 2;
- ring_index =
- ((tcp_port[0] ^
- tcp_port[1] ^
- tcp_port[2] ^
- tcp_port[3]) % maxtdcs);
- break;
-
- case IPPROTO_AH:
- /* SPI starts at the 4th byte */
- tcp_port += ((*tcp_port) & 0x0f) << 2;
- ring_index =
- ((tcp_port[4] ^
- tcp_port[5] ^
- tcp_port[6] ^
- tcp_port[7]) % maxtdcs);
- break;
-
- default:
- ring_index = tcp_port[19] % maxtdcs;
- break;
- }
- } else { /* fragmented packet */
- ring_index = tcp_port[19] % maxtdcs;
- }
- } else {
- ring_index = mp->b_band % maxtdcs;
- }
- break;
-
- case NXGE_TX_LB_HASH:
- if (hp->hash) {
-#if defined(__i386)
- ring_index = ((uint32_t)(hp->hash) % maxtdcs);
-#else
- ring_index = ((uint64_t)(hp->hash) % maxtdcs);
-#endif
- } else {
- ring_index = mp->b_band % maxtdcs;
- }
- break;
-
- case NXGE_TX_LB_DEST_MAC: /* Use destination MAC address */
- tcp_port = mp->b_rptr;
- ring_index = tcp_port[5] % maxtdcs;
- break;
- }
-
- NXGE_DEBUG_MSG((NULL, TX_CTL, "<== nxge_tx_lb_ring"));
-
- return (ring_index);
-}
-
-uint_t
-nxge_reschedule(caddr_t arg)
-{
- p_nxge_t nxgep;
-
- nxgep = (p_nxge_t)arg;
-
- NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_reschedule"));
-
- if (nxgep->nxge_mac_state == NXGE_MAC_STARTED &&
- nxgep->resched_needed) {
- if (!isLDOMguest(nxgep))
- mac_tx_update(nxgep->mach);
-#if defined(sun4v)
- else { /* isLDOMguest(nxgep) */
- nxge_hio_data_t *nhd = (nxge_hio_data_t *)
- nxgep->nxge_hw_p->hio;
- nx_vio_fp_t *vio = &nhd->hio.vio;
-
- /* Call back vnet. */
- if (vio->cb.vio_net_tx_update) {
- (*vio->cb.vio_net_tx_update)
- (nxgep->hio_vr->vhp);
- }
- }
-#endif
- nxgep->resched_needed = B_FALSE;
- nxgep->resched_running = B_FALSE;
- }
-
- NXGE_DEBUG_MSG((NULL, TX_CTL, "<== nxge_reschedule"));
- return (DDI_INTR_CLAIMED);
-}
-
-
/* Software LSO starts here */
static void
nxge_hcksum_retrieve(mblk_t *mp,
diff --git a/usr/src/uts/common/io/nxge/nxge_txdma.c b/usr/src/uts/common/io/nxge/nxge_txdma.c
index 892c7bb65a..766e900da7 100644
--- a/usr/src/uts/common/io/nxge/nxge_txdma.c
+++ b/usr/src/uts/common/io/nxge/nxge_txdma.c
@@ -31,7 +31,7 @@
#include <sys/llc1.h>
uint32_t nxge_reclaim_pending = TXDMA_RECLAIM_PENDING_DEFAULT;
-uint32_t nxge_tx_minfree = 32;
+uint32_t nxge_tx_minfree = 64;
uint32_t nxge_tx_intr_thres = 0;
uint32_t nxge_tx_max_gathers = TX_MAX_GATHER_POINTERS;
uint32_t nxge_tx_tiny_pack = 1;
@@ -53,9 +53,7 @@ extern ddi_device_acc_attr_t nxge_dev_buf_dma_acc_attr;
extern ddi_dma_attr_t nxge_desc_dma_attr;
extern ddi_dma_attr_t nxge_tx_dma_attr;
-extern int nxge_serial_tx(mblk_t *mp, void *arg);
-
-void nxge_txdma_freemsg_task(p_tx_ring_t tx_ring_p);
+extern void nxge_tx_ring_task(void *arg);
static nxge_status_t nxge_map_txdma(p_nxge_t, int);
@@ -97,22 +95,25 @@ nxge_init_txdma_channels(p_nxge_t nxgep)
nxge_grp_set_t *set = &nxgep->tx_set;
int i, tdc, count;
nxge_grp_t *group;
+ dc_map_t map;
+ int dev_gindex;
NXGE_DEBUG_MSG((nxgep, MEM2_CTL, "==> nxge_init_txdma_channels"));
for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) {
if ((1 << i) & set->lg.map) {
group = set->group[i];
-
+ dev_gindex =
+ nxgep->pt_config.hw_config.def_mac_txdma_grpid + i;
+ map = nxgep->pt_config.tdc_grps[dev_gindex].map;
for (tdc = 0; tdc < NXGE_MAX_TDCS; tdc++) {
- if ((1 << tdc) & group->map) {
- if ((nxge_grp_dc_add(nxgep, group,
- VP_BOUND_TX, tdc)))
+ if ((1 << tdc) & map) {
+ if ((nxge_grp_dc_add(nxgep,
+ group, VP_BOUND_TX, tdc)))
goto init_txdma_channels_exit;
}
}
}
-
if (++count == set->lg.count)
break;
}
@@ -124,21 +125,22 @@ init_txdma_channels_exit:
for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) {
if ((1 << i) & set->lg.map) {
group = set->group[i];
-
+ dev_gindex =
+ nxgep->pt_config.hw_config.def_mac_txdma_grpid + i;
+ map = nxgep->pt_config.tdc_grps[dev_gindex].map;
for (tdc = 0; tdc < NXGE_MAX_TDCS; tdc++) {
- if ((1 << tdc) & group->map) {
+ if ((1 << tdc) & map) {
nxge_grp_dc_remove(nxgep,
VP_BOUND_TX, tdc);
}
}
}
-
if (++count == set->lg.count)
break;
}
- NXGE_DEBUG_MSG((nxgep, MEM2_CTL, "<== nxge_init_txdma_channels"));
return (NXGE_ERROR);
+
}
nxge_status_t
@@ -890,44 +892,6 @@ nxge_tx_pkt_nmblocks(p_mblk_t mp, int *tot_xfer_len_p)
return (nmblks);
}
-static void
-nxge_txdma_freemsg_list_add(p_tx_ring_t tx_ring_p, p_tx_msg_t msgp)
-{
- MUTEX_ENTER(&tx_ring_p->freelock);
- if (tx_ring_p->tx_free_list_p != NULL)
- msgp->nextp = tx_ring_p->tx_free_list_p;
- tx_ring_p->tx_free_list_p = msgp;
- MUTEX_EXIT(&tx_ring_p->freelock);
-}
-
-/*
- * void
- * nxge_txdma_freemsg_task() -- walk the list of messages to be
- * freed and free the messages.
- */
-void
-nxge_txdma_freemsg_task(p_tx_ring_t tx_ring_p)
-{
- p_tx_msg_t msgp, nextp;
-
- if (tx_ring_p->tx_free_list_p != NULL) {
- MUTEX_ENTER(&tx_ring_p->freelock);
- msgp = tx_ring_p->tx_free_list_p;
- tx_ring_p->tx_free_list_p = (p_tx_msg_t)NULL;
- MUTEX_EXIT(&tx_ring_p->freelock);
-
- while (msgp != NULL) {
- nextp = msgp->nextp;
- if (msgp->tx_message != NULL) {
- freemsg(msgp->tx_message);
- msgp->tx_message = NULL;
- }
- msgp->nextp = NULL;
- msgp = nextp;
- }
- }
-}
-
boolean_t
nxge_txdma_reclaim(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, int nmblks)
{
@@ -947,7 +911,7 @@ nxge_txdma_reclaim(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, int nmblks)
uint16_t head_index, tail_index;
uint8_t tdc;
boolean_t head_wrap, tail_wrap;
- p_nxge_tx_ring_stats_t tdc_stats;
+ p_nxge_tx_ring_stats_t tdc_stats;
int rc;
NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_txdma_reclaim"));
@@ -1093,13 +1057,12 @@ nxge_txdma_reclaim(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, int nmblks)
}
NXGE_DEBUG_MSG((nxgep, TX_CTL,
"==> nxge_txdma_reclaim: count packets"));
-
/*
* count a chained packet only once.
*/
if (tx_msg_p->tx_message != NULL) {
- nxge_txdma_freemsg_list_add(tx_ring_p,
- tx_msg_p);
+ freemsg(tx_msg_p->tx_message);
+ tx_msg_p->tx_message = NULL;
}
tx_msg_p->flags.dma_type = USE_NONE;
@@ -1223,13 +1186,7 @@ nxge_tx_intr(void *arg1, void *arg2)
"status 0x%08x (mk bit set, calling reclaim)",
channel, vindex, rs));
- MUTEX_ENTER(&tx_ring_p->lock);
- (void) nxge_txdma_reclaim(nxgep, tx_rings[vindex], 0);
- MUTEX_EXIT(&tx_ring_p->lock);
-
- nxge_txdma_freemsg_task(tx_ring_p);
-
- mac_tx_update(nxgep->mach);
+ nxge_tx_ring_task((void *)tx_ring_p);
}
/*
@@ -1596,7 +1553,6 @@ nxge_txdma_fixup_channel(p_nxge_t nxgep, p_tx_ring_t ring_p, uint16_t channel)
ring_p->ring_kick_tail.value = 0;
ring_p->descs_pending = 0;
MUTEX_EXIT(&ring_p->lock);
- nxge_txdma_freemsg_task(ring_p);
NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_txdma_fixup_channel"));
}
@@ -1831,7 +1787,6 @@ nxge_txdma_channel_hung(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, uint16_t channel)
tail_wrap = tx_ring_p->wr_index_wrap;
tx_rd_index = tx_ring_p->rd_index;
MUTEX_EXIT(&tx_ring_p->lock);
- nxge_txdma_freemsg_task(tx_ring_p);
NXGE_DEBUG_MSG((nxgep, TX_CTL,
"==> nxge_txdma_channel_hung: tdc %d tx_rd_index %d "
@@ -2010,8 +1965,6 @@ nxge_txdma_fixup_hung_channel(p_nxge_t nxgep, p_tx_ring_t ring_p,
(void) nxge_txdma_reclaim(nxgep, ring_p, 0);
MUTEX_EXIT(&ring_p->lock);
- nxge_txdma_freemsg_task(ring_p);
-
handle = NXGE_DEV_NPI_HANDLE(nxgep);
/*
* Stop the dma channel waits for the stop done.
@@ -2072,10 +2025,8 @@ nxge_reclaim_rings(p_nxge_t nxgep)
NXGE_DEBUG_MSG((nxgep, TX_CTL,
"==> nxge_reclaim_rings: TDC %d", tdc));
MUTEX_ENTER(&ring->lock);
- (void) nxge_txdma_reclaim(nxgep, ring, tdc);
+ (void) nxge_txdma_reclaim(nxgep, ring, 0);
MUTEX_EXIT(&ring->lock);
-
- nxge_txdma_freemsg_task(ring);
}
}
}
@@ -2580,6 +2531,7 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel,
int i, j, index;
uint32_t size, bsize;
uint32_t nblocks, nmsgs;
+ char qname[TASKQ_NAMELEN];
NXGE_DEBUG_MSG((nxgep, MEM3_CTL,
"==> nxge_map_txdma_channel_buf_ring"));
@@ -2611,14 +2563,19 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel,
KMEM_ZALLOC(sizeof (tx_ring_t), KM_SLEEP);
MUTEX_INIT(&tx_ring_p->lock, NULL, MUTEX_DRIVER,
(void *)nxgep->interrupt_cookie);
- MUTEX_INIT(&tx_ring_p->freelock, NULL, MUTEX_DRIVER,
- (void *)nxgep->interrupt_cookie);
(void) atomic_swap_32(&tx_ring_p->tx_ring_offline, NXGE_TX_RING_ONLINE);
tx_ring_p->tx_ring_busy = B_FALSE;
tx_ring_p->nxgep = nxgep;
- tx_ring_p->serial = nxge_serialize_create(nmsgs,
- nxge_serial_tx, tx_ring_p);
+ tx_ring_p->tx_ring_handle = (mac_ring_handle_t)NULL;
+ (void) snprintf(qname, TASKQ_NAMELEN, "tx_%d_%d",
+ nxgep->instance, channel);
+ tx_ring_p->taskq = ddi_taskq_create(nxgep->dip, qname, 1,
+ TASKQ_DEFAULTPRI, 0);
+ if (tx_ring_p->taskq == NULL) {
+ goto nxge_map_txdma_channel_buf_ring_fail1;
+ }
+
/*
* Allocate transmit message rings and handles for packets
* not to be copied to premapped buffers.
@@ -2683,7 +2640,6 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel,
for (j = 0; j < nblocks; j++) {
tx_msg_ring[index].buf_dma_handle = tx_buf_dma_handle;
- tx_msg_ring[index].nextp = NULL;
dmap = &tx_msg_ring[index++].buf_dma;
#ifdef TX_MEM_DEBUG
NXGE_DEBUG_MSG((nxgep, MEM3_CTL,
@@ -2705,9 +2661,9 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel,
goto nxge_map_txdma_channel_buf_ring_exit;
nxge_map_txdma_channel_buf_ring_fail1:
- if (tx_ring_p->serial) {
- nxge_serialize_destroy(tx_ring_p->serial);
- tx_ring_p->serial = NULL;
+ if (tx_ring_p->taskq) {
+ ddi_taskq_destroy(tx_ring_p->taskq);
+ tx_ring_p->taskq = NULL;
}
index--;
@@ -2716,8 +2672,6 @@ nxge_map_txdma_channel_buf_ring_fail1:
ddi_dma_free_handle(&tx_msg_ring[index].dma_handle);
}
}
-
- MUTEX_DESTROY(&tx_ring_p->freelock);
MUTEX_DESTROY(&tx_ring_p->lock);
KMEM_FREE(tx_msg_ring, size);
KMEM_FREE(tx_ring_p, sizeof (tx_ring_t));
@@ -2783,12 +2737,11 @@ nxge_unmap_txdma_channel_buf_ring(p_nxge_t nxgep, p_tx_ring_t tx_ring_p)
MUTEX_EXIT(&tx_ring_p->lock);
- if (tx_ring_p->serial) {
- nxge_serialize_destroy(tx_ring_p->serial);
- tx_ring_p->serial = NULL;
+ if (tx_ring_p->taskq) {
+ ddi_taskq_destroy(tx_ring_p->taskq);
+ tx_ring_p->taskq = NULL;
}
- MUTEX_DESTROY(&tx_ring_p->freelock);
MUTEX_DESTROY(&tx_ring_p->lock);
KMEM_FREE(tx_msg_ring, sizeof (tx_msg_t) * tx_ring_p->tx_ring_size);
KMEM_FREE(tx_ring_p, sizeof (tx_ring_t));
@@ -3408,8 +3361,6 @@ nxge_txdma_fatal_err_recover(
if (status != NXGE_OK)
goto fail;
- nxge_txdma_freemsg_task(tx_ring_p);
-
NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL,
"Recovery Successful, TxDMAChannel#%d Restored",
channel));
@@ -3420,8 +3371,6 @@ nxge_txdma_fatal_err_recover(
fail:
MUTEX_EXIT(&tx_ring_p->lock);
- nxge_txdma_freemsg_task(tx_ring_p);
-
NXGE_DEBUG_MSG((nxgep, TX_CTL,
"nxge_txdma_fatal_err_recover (channel %d): "
"failed to recover this txdma channel", channel));
@@ -3519,7 +3468,6 @@ nxge_tx_port_fatal_err_recover(p_nxge_t nxgep)
tx_ring_t *ring = nxgep->tx_rings->rings[tdc];
if (ring) {
(void) nxge_txdma_reclaim(nxgep, ring, 0);
- nxge_txdma_freemsg_task(ring);
}
}
}
diff --git a/usr/src/uts/common/io/nxge/nxge_virtual.c b/usr/src/uts/common/io/nxge/nxge_virtual.c
index 818f8451c2..2498f77e90 100644
--- a/usr/src/uts/common/io/nxge/nxge_virtual.c
+++ b/usr/src/uts/common/io/nxge/nxge_virtual.c
@@ -77,6 +77,12 @@ extern uint32_t nxge_rbr_spare_size;
extern npi_status_t npi_mac_altaddr_disable(npi_handle_t, uint8_t, uint8_t);
+/*
+ * XXX: Used temporarily to specify the number of packets each interrupt
+ * processes. By default, the number of packets processed per interrupt is 1.
+ */
+int nxge_max_intr_pkts;
+
static uint8_t p2_tx_fair[2] = {12, 12};
static uint8_t p2_tx_equal[2] = {12, 12};
static uint8_t p4_tx_fair[4] = {6, 6, 6, 6};
@@ -783,7 +789,7 @@ nxge_update_txdma_properties(p_nxge_t nxgep, config_token_t token,
int ddi_status = DDI_SUCCESS;
int num_ports = nxgep->nports;
int port, bits, j;
- uint8_t start_tdc = 0, num_tdc = 0;
+ uint8_t start_tdc, num_tdc = 0;
p_nxge_param_t param_arr;
uint32_t tdc_bitmap[MAX_SIBLINGS];
int custom_start_tdc[MAX_SIBLINGS];
@@ -1616,6 +1622,14 @@ nxge_get_config_properties(p_nxge_t nxgep)
}
/*
+ * XXX: read the config file to determine the number of packets
+ * to process per interrupt.
+ */
+ nxge_max_intr_pkts = ddi_getprop(DDI_DEV_T_ANY, nxgep->dip,
+ DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "max_intr_pkts", 1);
+
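+ /*
+ * For example, a hypothetical driver.conf (nxge.conf) entry, in
+ * standard driver.conf syntax, that would process up to eight
+ * packets per interrupt:
+ *
+ *	max_intr_pkts = 8;
+ */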
+
+ /*
* Get info on how many ports Neptune card has.
*/
nxgep->nports = nxge_get_nports(nxgep);
@@ -1806,12 +1820,12 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep)
return (NXGE_DDI_FAILED);
}
- p_cfgp->tdc.count = nxgep->max_tdcs = ndmas;
+ p_cfgp->tdc.count = ndmas;
p_cfgp->tdc.owned = p_cfgp->tdc.count;
NXGE_DEBUG_MSG((nxgep, OBP_CTL, "==> nxge_use_default_dma_config_n2: "
- "p_cfgp 0x%llx max_tdcs %d nxgep->max_tdcs %d start %d",
- p_cfgp, p_cfgp->tdc.count, nxgep->max_tdcs, p_cfgp->tdc.start));
+ "p_cfgp 0x%llx max_tdcs %d start %d",
+ p_cfgp, p_cfgp->tdc.count, p_cfgp->tdc.start));
/* Receive DMA */
ndmas = NXGE_RDMA_PER_NIU_PORT;
@@ -1834,12 +1848,11 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep)
return (NXGE_DDI_FAILED);
}
- p_cfgp->max_rdcs = nxgep->max_rdcs = ndmas;
+ p_cfgp->max_rdcs = ndmas;
nxgep->rdc_mask = (ndmas - 1);
/* Hypervisor: rdc # and group # use the same # !! */
p_cfgp->max_grpids = p_cfgp->max_rdcs + p_cfgp->tdc.owned;
- p_cfgp->start_grpid = 0;
p_cfgp->mif_ldvid = p_cfgp->mac_ldvid = p_cfgp->ser_ldvid = 0;
if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, nxgep->dip, 0,
@@ -1909,13 +1922,12 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep)
p_cfgp->max_ldgs = p_cfgp->max_grpids;
NXGE_DEBUG_MSG((nxgep, OBP_CTL,
- "==> nxge_use_default_dma_config_n2: "
- "p_cfgp 0x%llx max_rdcs %d nxgep->max_rdcs %d max_grpids %d"
- "start_grpid %d macid %d mifid %d serrid %d",
- p_cfgp, p_cfgp->max_rdcs, nxgep->max_rdcs, p_cfgp->max_grpids,
- p_cfgp->start_grpid,
+ "==> nxge_use_default_dma_config_n2: p_cfgp 0x%llx max_rdcs %d "
+ "max_grpids %d macid %d mifid %d serrid %d",
+ p_cfgp, p_cfgp->max_rdcs, p_cfgp->max_grpids,
p_cfgp->mac_ldvid, p_cfgp->mif_ldvid, p_cfgp->ser_ldvid));
+
NXGE_DEBUG_MSG((nxgep, OBP_CTL, "==> nxge_use_default_dma_config_n2: "
"p_cfgp p%p start_ldg %d nxgep->max_ldgs %d",
p_cfgp, p_cfgp->start_ldg, p_cfgp->max_ldgs));
@@ -1923,12 +1935,14 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep)
/*
* RDC groups and the beginning RDC group assigned to this function.
*/
- p_cfgp->max_rdc_grpids = 1;
- p_cfgp->def_mac_rxdma_grpid = (nxgep->function_num * 1);
-
- if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind
- (nxgep, p_cfgp->def_mac_rxdma_grpid, B_TRUE))
- >= NXGE_MAX_RDC_GRPS) {
+ p_cfgp->max_rdc_grpids = NXGE_MAX_RDC_GROUPS / nxgep->nports;
+ p_cfgp->def_mac_rxdma_grpid =
+ nxgep->function_num * NXGE_MAX_RDC_GROUPS / nxgep->nports;
+ p_cfgp->def_mac_txdma_grpid =
+ nxgep->function_num * NXGE_MAX_TDC_GROUPS / nxgep->nports;
+
+ if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind(nxgep,
+ p_cfgp->def_mac_rxdma_grpid, B_TRUE)) >= NXGE_MAX_RDC_GRPS) {
NXGE_ERROR_MSG((nxgep, CFG_CTL,
"nxge_use_default_dma_config_n2(): "
"nxge_fzc_rdc_tbl_bind failed"));
@@ -2060,11 +2074,10 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep)
prop, tx_ndmas);
}
- p_cfgp->tdc.count = nxgep->max_tdcs = tx_ndmas;
+ p_cfgp->tdc.count = tx_ndmas;
p_cfgp->tdc.owned = p_cfgp->tdc.count;
NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_use_cfg_dma_config: "
- "p_cfgp 0x%llx max_tdcs %d nxgep->max_tdcs %d",
- p_cfgp, p_cfgp->tdc.count, nxgep->max_tdcs));
+ "p_cfgp 0x%llx max_tdcs %d", p_cfgp, p_cfgp->tdc.count));
prop = param_arr[param_rxdma_channels_begin].fcode_name;
@@ -2149,44 +2162,23 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep)
prop, rx_ndmas);
}
- p_cfgp->max_rdcs = nxgep->max_rdcs = rx_ndmas;
+ p_cfgp->max_rdcs = rx_ndmas;
- prop = param_arr[param_rdc_grps_start].fcode_name;
- if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, 0, prop,
- &prop_val, &prop_len) == DDI_PROP_SUCCESS) {
- p_cfgp->def_mac_rxdma_grpid = *prop_val;
- ddi_prop_free(prop_val);
- if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind
- (nxgep, p_cfgp->def_mac_rxdma_grpid, B_TRUE))
- >= NXGE_MAX_RDC_GRPS) {
- NXGE_ERROR_MSG((nxgep, CFG_CTL,
- "nxge_use_cfg_dma_config(): "
- "nxge_fzc_rdc_tbl_bind failed"));
- cmn_err(CE_CONT, "nxge%d: group not available!\n",
- nxgep->instance);
- goto nxge_use_cfg_dma_config_exit;
- }
+ /*
+ * RDC groups and the beginning RDC group assigned to this function.
+ * XXX: this may be wrong if prop value is used.
+ */
+ p_cfgp->def_mac_rxdma_grpid =
+ nxgep->function_num * NXGE_MAX_RDC_GROUPS / nxgep->nports;
+ p_cfgp->def_mac_txdma_grpid =
+ nxgep->function_num * NXGE_MAX_TDC_GROUPS / nxgep->nports;
- NXGE_DEBUG_MSG((nxgep, CFG_CTL,
- "==> nxge_use_default_dma_config: "
- "use property " "start_grpid %d ",
- p_cfgp->start_grpid));
- } else {
- p_cfgp->def_mac_rxdma_grpid = nxgep->function_num;
- if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind(
- nxgep, p_cfgp->def_mac_rxdma_grpid, B_TRUE)) >=
- NXGE_MAX_RDC_GRPS) {
- cmn_err(CE_CONT, "nxge%d: group not available!\n",
- nxgep->instance);
- goto nxge_use_cfg_dma_config_exit;
- }
- (void) ddi_prop_update_int(DDI_DEV_T_NONE, nxgep->dip,
- prop, p_cfgp->def_mac_rxdma_grpid);
- NXGE_DEBUG_MSG((nxgep, CFG_CTL,
- "==> nxge_use_default_dma_config: "
- "use default "
- "start_grpid %d (same as function #)",
- p_cfgp->start_grpid));
+ if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind(nxgep,
+ p_cfgp->def_mac_rxdma_grpid, B_TRUE)) >= NXGE_MAX_RDC_GRPS) {
+ NXGE_ERROR_MSG((nxgep, CFG_CTL,
+ "nxge_use_default_dma_config2(): "
+ "nxge_fzc_rdc_tbl_bind failed"));
+ goto nxge_use_cfg_dma_config_exit;
}
prop = param_arr[param_rx_rdc_grps].fcode_name;
@@ -2195,7 +2187,7 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep)
nrxgp = *prop_val;
ddi_prop_free(prop_val);
} else {
- nrxgp = 1;
+ nrxgp = NXGE_MAX_RDC_GRPS / nxgep->nports;
(void) ddi_prop_update_int(DDI_DEV_T_NONE, nxgep->dip,
prop, nrxgp);
NXGE_DEBUG_MSG((nxgep, CFG_CTL,
@@ -2203,7 +2195,6 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep)
"num_rdc_grpid not found: use def:# of "
"rdc groups %d\n", nrxgp));
}
-
p_cfgp->max_rdc_grpids = nrxgp;
/*
@@ -2213,10 +2204,9 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep)
p_cfgp->max_ldgs = NXGE_LDGRP_PER_4PORTS;
NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_use_default_dma_config: "
- "p_cfgp 0x%llx max_rdcs %d nxgep->max_rdcs %d max_grpids %d"
- "start_grpid %d",
- p_cfgp, p_cfgp->max_rdcs, nxgep->max_rdcs, p_cfgp->max_grpids,
- p_cfgp->start_grpid));
+ "p_cfgp 0x%llx max_rdcs %d max_grpids %d default_grpid %d",
+ p_cfgp, p_cfgp->max_rdcs, p_cfgp->max_grpids,
+ p_cfgp->def_mac_rxdma_grpid));
NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_use_cfg_dma_config: "
"p_cfgp 0x%016llx start_ldg %d nxgep->max_ldgs %d "
@@ -2264,7 +2254,7 @@ nxge_get_logical_props(p_nxge_t nxgep)
(void) memset(port, 0, sizeof (*port));
- port->mac_port = 0; /* := function number */
+ port->mac_port = nxgep->function_num; /* := function number */
/*
* alloc_buf_size:
@@ -2300,8 +2290,9 @@ nxge_get_logical_props(p_nxge_t nxgep)
group = &port->rdc_grps[0];
- group->flag = 1; /* configured */
+ group->flag = B_TRUE; /* configured */
group->config_method = RDC_TABLE_ENTRY_METHOD_REP;
+ group->port = NXGE_GET_PORT_NUM(nxgep->function_num);
/* HIO futures: this is still an open question. */
hardware->max_macs = 1;
@@ -2407,129 +2398,138 @@ nxge_set_rdc_intr_property(p_nxge_t nxgep)
static void
nxge_set_hw_dma_config(p_nxge_t nxgep)
{
- int i, ndmas, ngrps, bitmap, end, st_rdc;
- int32_t status;
- uint8_t rdcs_per_grp;
- p_nxge_dma_pt_cfg_t p_dma_cfgp;
- p_nxge_hw_pt_cfg_t p_cfgp;
- p_nxge_rdc_grp_t rdc_grp_p;
- int rdcgrp_cfg = CFG_NOT_SPECIFIED, rx_quick_cfg;
- char *prop, *prop_val;
- p_nxge_param_t param_arr;
- config_token_t token;
- nxge_grp_t *group;
+ int i, j, ngrps, bitmap, end, st_rdc;
+ p_nxge_dma_pt_cfg_t p_dma_cfgp;
+ p_nxge_hw_pt_cfg_t p_cfgp;
+ p_nxge_rdc_grp_t rdc_grp_p;
+ p_nxge_tdc_grp_t tdc_grp_p;
+ nxge_grp_t *group;
+ uint8_t nrdcs;
+ dc_map_t map = 0;
NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_set_hw_dma_config"));
p_dma_cfgp = (p_nxge_dma_pt_cfg_t)&nxgep->pt_config;
p_cfgp = (p_nxge_hw_pt_cfg_t)&p_dma_cfgp->hw_config;
- rdc_grp_p = p_dma_cfgp->rdc_grps;
+ switch (nxgep->niu_type) {
+ case NEPTUNE_4_1GC:
+ case NEPTUNE_2_10GF_2_1GC:
+ case NEPTUNE_1_10GF_3_1GC:
+ case NEPTUNE_1_1GC_1_10GF_2_1GC:
+ case NEPTUNE_2_10GF_2_1GRF:
+ default:
+ ngrps = 2;
+ break;
+ case NEPTUNE_2_10GF:
+ case NEPTUNE_2_1GRF:
+ case N2_NIU:
+ ngrps = 4;
+ break;
+ }
+
+ /*
+ * Setup TDC groups
+ */
bitmap = 0;
end = p_cfgp->tdc.start + p_cfgp->tdc.owned;
- p_dma_cfgp->tx_dma_map = 0;
for (i = p_cfgp->tdc.start; i < end; i++) {
bitmap |= (1 << i);
}
nxgep->tx_set.owned.map |= bitmap; /* Owned, & not shared. */
+ nxgep->tx_set.owned.count = p_cfgp->tdc.owned;
+ p_dma_cfgp->tx_dma_map = bitmap;
- group = (nxge_grp_t *)nxge_grp_add(nxgep, NXGE_TRANSMIT_GROUP);
- group->map = bitmap;
+ for (i = 0; i < ngrps; i++) {
+ group = (nxge_grp_t *)nxge_grp_add(nxgep,
+ NXGE_TRANSMIT_GROUP);
+ tdc_grp_p = &p_dma_cfgp->tdc_grps[
+ p_cfgp->def_mac_txdma_grpid + i];
+ if (i == 0)
+ tdc_grp_p->map = bitmap;
+ else
+ tdc_grp_p->map = 0;
+ /* no ring is associated with a group initially */
+ tdc_grp_p->start_tdc = 0;
+ tdc_grp_p->max_tdcs = 0;
+ tdc_grp_p->grp_index = group->index;
+ }
- p_dma_cfgp->tx_dma_map = bitmap;
- param_arr = nxgep->param_arr;
+ for (i = 0; i < NXGE_MAX_RDCS; i++) {
+ nxgep->rx_channel_started[i] = B_FALSE;
+ }
- /* Assume RDCs are evenly distributed */
- rx_quick_cfg = param_arr[param_rx_quick_cfg].value;
- switch (rx_quick_cfg) {
- case CFG_NOT_SPECIFIED:
- prop = "rxdma-grp-cfg";
- status = ddi_prop_lookup_string(DDI_DEV_T_NONE,
- nxgep->dip, 0, prop, (char **)&prop_val);
- if (status != DDI_PROP_SUCCESS) {
- NXGE_DEBUG_MSG((nxgep, CFG_CTL,
- " property %s not found", prop));
- rdcgrp_cfg = CFG_L3_DISTRIBUTE;
- } else {
- token = nxge_get_config_token(prop_val);
- switch (token) {
- case L2_CLASSIFY:
+ /*
+ * Setup RDC groups
+ */
+ st_rdc = p_cfgp->start_rdc;
+ for (i = 0; i < ngrps; i++) {
+ /*
+ * All rings are associated with the default group initially
+ */
+ if (i == 0) {
+ /* default group */
+ switch (nxgep->niu_type) {
+ case NEPTUNE_4_1GC:
+ nrdcs = rx_4_1G[nxgep->function_num];
+ break;
+ case N2_NIU:
+ case NEPTUNE_2_10GF:
+ nrdcs = rx_2_10G[nxgep->function_num];
+ break;
+ case NEPTUNE_2_10GF_2_1GC:
+ nrdcs = rx_2_10G_2_1G[nxgep->function_num];
break;
- case CLASSIFY:
- case L3_CLASSIFY:
- case L3_DISTRIBUTE:
- case L3_TCAM:
- rdcgrp_cfg = CFG_L3_DISTRIBUTE;
+ case NEPTUNE_1_10GF_3_1GC:
+ nrdcs = rx_1_10G_3_1G[nxgep->function_num];
+ break;
+ case NEPTUNE_1_1GC_1_10GF_2_1GC:
+ nrdcs = rx_1_1G_1_10G_2_1G[nxgep->function_num];
break;
default:
- rdcgrp_cfg = CFG_L3_DISTRIBUTE;
+ switch (nxgep->platform_type) {
+ case P_NEPTUNE_ALONSO:
+ nrdcs =
+ rx_2_10G_2_1G[nxgep->function_num];
+ break;
+ default:
+ nrdcs = rx_4_1G[nxgep->function_num];
+ break;
+ }
break;
}
- ddi_prop_free(prop_val);
+ } else {
+ nrdcs = 0;
}
- break;
- case CFG_L3_WEB:
- case CFG_L3_DISTRIBUTE:
- case CFG_L2_CLASSIFY:
- case CFG_L3_TCAM:
- rdcgrp_cfg = rx_quick_cfg;
- break;
- default:
- rdcgrp_cfg = CFG_L3_DISTRIBUTE;
- break;
- }
-
- st_rdc = p_cfgp->start_rdc;
-
- switch (rdcgrp_cfg) {
- case CFG_L3_DISTRIBUTE:
- case CFG_L3_WEB:
- case CFG_L3_TCAM:
- ndmas = p_cfgp->max_rdcs;
- ngrps = 1;
- rdcs_per_grp = ndmas / ngrps;
- break;
- case CFG_L2_CLASSIFY:
- ndmas = p_cfgp->max_rdcs / 2;
- if (p_cfgp->max_rdcs < 2)
- ndmas = 1;
- ngrps = 1;
- rdcs_per_grp = ndmas / ngrps;
- break;
- default:
- ngrps = p_cfgp->max_rdc_grpids;
- ndmas = p_cfgp->max_rdcs;
- rdcs_per_grp = ndmas / ngrps;
- break;
- }
-
- for (i = 0; i < ngrps; i++) {
- uint8_t count = rdcs_per_grp;
- dc_map_t map = 0;
rdc_grp_p = &p_dma_cfgp->rdc_grps[
p_cfgp->def_mac_rxdma_grpid + i];
- rdc_grp_p->start_rdc = st_rdc + i * rdcs_per_grp;
- rdc_grp_p->max_rdcs = rdcs_per_grp;
+ rdc_grp_p->start_rdc = st_rdc;
+ rdc_grp_p->max_rdcs = nrdcs;
rdc_grp_p->def_rdc = rdc_grp_p->start_rdc;
/* default to: 0, 1, 2, 3, ...., 0, 1, 2, 3.... */
- while (count) {
- map |= (1 << count);
- count--;
- }
- map >>= 1; /* In case <start_rdc> is zero (0) */
- map <<= rdc_grp_p->start_rdc;
+ if (nrdcs != 0) {
+ for (j = 0; j < nrdcs; j++) {
+ map |= (1 << j);
+ }
+ map <<= rdc_grp_p->start_rdc;
+ } else
+ map = 0;
rdc_grp_p->map = map;
nxgep->rx_set.owned.map |= map; /* Owned, & not shared. */
+ nxgep->rx_set.owned.count = nrdcs;
group = (nxge_grp_t *)nxge_grp_add(nxgep, NXGE_RECEIVE_GROUP);
- group->map = rdc_grp_p->map;
rdc_grp_p->config_method = RDC_TABLE_ENTRY_METHOD_SEQ;
- rdc_grp_p->flag = 1; /* This group has been configured. */
+ rdc_grp_p->flag = B_TRUE; /* This group has been configured. */
+ rdc_grp_p->grp_index = group->index;
+ rdc_grp_p->port = NXGE_GET_PORT_NUM(nxgep->function_num);
+
+ map = 0;
}
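
For concreteness, the bitmap construction in the new RDC-group loop above
can be checked with a small standalone program; the nrdcs and start_rdc
values here are illustrative, not taken from any real configuration:

	#include <stdio.h>

	int
	main(void)
	{
		unsigned int map = 0;
		int j, nrdcs = 4, start_rdc = 8;	/* illustrative */

		for (j = 0; j < nrdcs; j++)
			map |= (1 << j);	/* sets bits 0..3 */
		map <<= start_rdc;		/* -> RDCs 8..11 */
		(void) printf("map = 0x%x\n", map);	/* prints 0xf00 */
		return (0);
	}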
@@ -2742,7 +2742,7 @@ nxge_set_hw_mac_class_config(p_nxge_t nxgep)
" id %d grp %d",
mac_map->param_id, mac_map->map_to));
mac_host_info[mac_map->param_id].mpr_npr =
- mac_map->pref;
+ p_cfgp->mac_pref;
mac_host_info[mac_map->param_id].rdctbl =
mac_map->map_to +
p_cfgp->def_mac_rxdma_grpid;
@@ -2967,16 +2967,12 @@ nxge_ldgv_init_n2(p_nxge_t nxgep, int *navail_p, int *nrequired_p)
}
/*
- * Port0 uses the HW based syserr interrupt, and port1 uses the
- * SW based syserr interrupt. There is only one syserr and the
- * function zero device gets it.
+ * HW based syserr interrupt for port0, and SW based syserr interrupt
+ * for port1
*/
if (own_sys_err && p_cfgp->ser_ldvid) {
ldv = p_cfgp->ser_ldvid;
/*
- * Port0 - HW based: use an intr vector
- */
- /*
* Unmask the system interrupt states.
*/
(void) nxge_fzc_sys_err_mask_set(nxgep, SYS_ERR_SMX_MASK |
@@ -2999,8 +2995,8 @@ nxge_ldgv_init_n2(p_nxge_t nxgep, int *navail_p, int *nrequired_p)
nldvs++;
} else {
/*
- * Port1 - SW based: allocate the ldv for the syserr since
- * the vector should not be consumed for port1
+ * SW based: allocate the ldv for the syserr since the vector
+ * should not be consumed for port1
*/
sysldvp = KMEM_ZALLOC(sizeof (nxge_ldv_t), KM_SLEEP);
sysldvp->use_timer = B_TRUE;
@@ -3010,9 +3006,10 @@ nxge_ldgv_init_n2(p_nxge_t nxgep, int *navail_p, int *nrequired_p)
sysldvp->ldv_ldf_masks = 0;
sysldvp->nxgep = nxgep;
ldgvp->ldvp_syserr = sysldvp;
- ldgvp->ldvp_syserr_allocated = B_TRUE;
+ ldgvp->ldvp_syserr_alloced = B_TRUE;
}
+
NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_ldgv_init_n2: "
"(before rx) func %d nldvs %d navail %d nrequired %d",
func, nldvs, *navail_p, *nrequired_p));
@@ -3326,7 +3323,7 @@ nxge_ldgv_uninit(p_nxge_t nxgep)
"no logical group configured."));
return (NXGE_OK);
}
- if (ldgvp->ldvp_syserr_allocated == B_TRUE) {
+ if (ldgvp->ldvp_syserr_alloced == B_TRUE) {
KMEM_FREE(ldgvp->ldvp_syserr, sizeof (nxge_ldv_t));
}
if (ldgvp->ldgp) {
@@ -3925,3 +3922,29 @@ nxge_init_mmac(p_nxge_t nxgep, boolean_t compute_addrs)
nxgep->statsp->mmac_stats.mmac_max_cnt = mmac_info->num_mmac;
nxgep->statsp->mmac_stats.mmac_avail_cnt = mmac_info->num_mmac;
}
+
+/*
+ * Convert an RDC group index into a port ring index. That is, map
+ * <groupid> to an index into nxgep->rx_ring_handles.
+ * (group ring index -> port ring index)
+ */
+int
+nxge_get_rxring_index(p_nxge_t nxgep, int groupid, int ringidx)
+{
+ int i;
+ int index = 0;
+ p_nxge_rdc_grp_t rdc_grp_p;
+ p_nxge_dma_pt_cfg_t p_dma_cfgp;
+ p_nxge_hw_pt_cfg_t p_cfgp;
+
+ p_dma_cfgp = &nxgep->pt_config;
+ p_cfgp = &p_dma_cfgp->hw_config;
+
+ for (i = 0; i < groupid; i++) {
+ rdc_grp_p =
+ &p_dma_cfgp->rdc_grps[p_cfgp->def_mac_rxdma_grpid + i];
+ index += rdc_grp_p->max_rdcs;
+ }
+
+ return (index + ringidx);
+}
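
The mapping above is a simple prefix sum over the preceding groups'
max_rdcs values. A toy model of the same arithmetic (the group sizes
are made up for illustration):

	#include <stdio.h>

	/*
	 * Toy model of nxge_get_rxring_index(): the port ring index is
	 * the number of rings owned by all earlier groups, plus ringidx.
	 */
	static int
	toy_rxring_index(const int *grp_sizes, int groupid, int ringidx)
	{
		int i, index = 0;

		for (i = 0; i < groupid; i++)
			index += grp_sizes[i];
		return (index + ringidx);
	}

	int
	main(void)
	{
		int sizes[] = { 4, 2 };	/* hypothetical max_rdcs values */

		/* Ring 1 of group 1 -> port ring 4 + 1 = 5. */
		(void) printf("%d\n", toy_rxring_index(sizes, 1, 1));
		return (0);
	}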
diff --git a/usr/src/uts/common/io/pcan/pcan.c b/usr/src/uts/common/io/pcan/pcan.c
index b5b0604831..498a9eea60 100644
--- a/usr/src/uts/common/io/pcan/pcan.c
+++ b/usr/src/uts/common/io/pcan/pcan.c
@@ -46,7 +46,7 @@
#include <sys/pccard.h>
#include <sys/pci.h>
#include <sys/policy.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/stream.h>
#include <inet/common.h>
#include <inet/nd.h>
@@ -104,7 +104,6 @@ mac_callbacks_t pcan_m_callbacks = {
pcan_sdmulti,
pcan_saddr,
pcan_tx,
- NULL,
pcan_ioctl
};
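
The same one-line deletion recurs in the pcwl, rtw, rum, and ural hunks
below: Crossbow removes the mc_resources slot from mac_callbacks_t, so
every positional initializer shrinks by one entry. A toy sketch of the
struct change (the field names follow the mac(9E) callback table as I
understand it; treat the exact layout as an assumption):

	/* Pre-Crossbow: mc_resources sat between mc_tx and mc_ioctl. */
	typedef struct {
		unsigned int	mc_callbacks;
		void		(*mc_tx)(void);
		void		(*mc_resources)(void);	/* removed slot */
		void		(*mc_ioctl)(void);
	} old_callbacks_t;

	/*
	 * Post-Crossbow: the framework owns rx resources, so the
	 * per-driver hook (and its NULL placeholder) goes away.
	 */
	typedef struct {
		unsigned int	mc_callbacks;
		void		(*mc_tx)(void);
		void		(*mc_ioctl)(void);
	} new_callbacks_t;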
diff --git a/usr/src/uts/common/io/pcwl/pcwl.c b/usr/src/uts/common/io/pcwl/pcwl.c
index f8d0cd2c4b..a2bad90c68 100644
--- a/usr/src/uts/common/io/pcwl/pcwl.c
+++ b/usr/src/uts/common/io/pcwl/pcwl.c
@@ -46,7 +46,7 @@
#include <sys/pccard.h>
#include <sys/pci.h>
#include <sys/policy.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/stream.h>
#include <inet/common.h>
#include <inet/nd.h>
@@ -89,7 +89,6 @@ mac_callbacks_t pcwl_m_callbacks = {
pcwl_sdmulti,
pcwl_saddr,
pcwl_tx,
- NULL,
pcwl_ioctl
};
diff --git a/usr/src/uts/common/io/ral/rt2560.c b/usr/src/uts/common/io/ral/rt2560.c
index d1473e1972..e6feee3ff4 100644
--- a/usr/src/uts/common/io/ral/rt2560.c
+++ b/usr/src/uts/common/io/ral/rt2560.c
@@ -43,7 +43,7 @@
#include <sys/modctl.h>
#include <sys/devops.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_wifi.h>
#include <sys/net80211.h>
#include <sys/net80211_proto.h>
@@ -196,7 +196,6 @@ static mac_callbacks_t rt2560_m_callbacks = {
rt2560_m_multicst,
rt2560_m_unicst,
rt2560_m_tx,
- NULL, /* mc_resources; */
rt2560_m_ioctl,
NULL, /* mc_getcapab */
NULL,
diff --git a/usr/src/uts/common/io/rge/rge.h b/usr/src/uts/common/io/rge/rge.h
index 4cab63b289..4a58da1c92 100755..100644
--- a/usr/src/uts/common/io/rge/rge.h
+++ b/usr/src/uts/common/io/rge/rge.h
@@ -26,8 +26,6 @@
#ifndef _RGE_H
#define _RGE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -59,7 +57,7 @@ extern "C" {
#include <sys/ddi.h>
#include <sys/sunddi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
/*
@@ -430,7 +428,6 @@ typedef struct rge {
uint32_t rf_next; /* current free buf index */
uint32_t rc_next; /* current recycle buf index */
uint32_t rx_free; /* number of rx free buf */
- mac_resource_handle_t handle;
/* used for send */
rge_bd_t *tx_ring;
@@ -705,7 +702,7 @@ void rge_chip_init(rge_t *rgep);
void rge_chip_start(rge_t *rgep);
void rge_chip_stop(rge_t *rgep, boolean_t fault);
void rge_chip_sync(rge_t *rgep, enum rge_sync_op todo);
-void rge_chip_blank(void *arg, time_t ticks, uint_t count);
+void rge_chip_blank(void *arg, time_t ticks, uint_t count, int flag);
void rge_tx_trigger(rge_t *rgep);
void rge_hw_stats_dump(rge_t *rgep);
uint_t rge_intr(caddr_t arg1, caddr_t arg2);
diff --git a/usr/src/uts/common/io/rge/rge_chip.c b/usr/src/uts/common/io/rge/rge_chip.c
index 6210fc25fc..c509e01ebb 100644
--- a/usr/src/uts/common/io/rge/rge_chip.c
+++ b/usr/src/uts/common/io/rge/rge_chip.c
@@ -1258,11 +1258,12 @@ rge_chip_sync(rge_t *rgep, enum rge_sync_op todo)
}
}
-void rge_chip_blank(void *arg, time_t ticks, uint_t count);
+void rge_chip_blank(void *arg, time_t ticks, uint_t count, int flag);
#pragma no_inline(rge_chip_blank)
+/* ARGSUSED */
void
-rge_chip_blank(void *arg, time_t ticks, uint_t count)
+rge_chip_blank(void *arg, time_t ticks, uint_t count, int flag)
{
_NOTE(ARGUNUSED(arg, ticks, count));
}
diff --git a/usr/src/uts/common/io/rge/rge_main.c b/usr/src/uts/common/io/rge/rge_main.c
index c473a86b7f..ab9ed63203 100644
--- a/usr/src/uts/common/io/rge/rge_main.c
+++ b/usr/src/uts/common/io/rge/rge_main.c
@@ -109,11 +109,10 @@ static void rge_m_stop(void *);
static int rge_m_promisc(void *, boolean_t);
static int rge_m_multicst(void *, boolean_t, const uint8_t *);
static int rge_m_unicst(void *, const uint8_t *);
-static void rge_m_resources(void *);
static void rge_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t rge_m_getcapab(void *, mac_capab_t, void *);
-#define RGE_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB)
+#define RGE_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB)
static mac_callbacks_t rge_m_callbacks = {
RGE_M_CALLBACK_FLAGS,
@@ -124,7 +123,6 @@ static mac_callbacks_t rge_m_callbacks = {
rge_m_multicst,
rge_m_unicst,
rge_m_tx,
- rge_m_resources,
rge_m_ioctl,
rge_m_getcapab
};
@@ -1249,28 +1247,6 @@ rge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
}
}
-static void
-rge_m_resources(void *arg)
-{
- rge_t *rgep = arg;
- mac_rx_fifo_t mrf;
-
- mutex_enter(rgep->genlock);
-
- /*
- * Register Rx rings as resources and save mac
- * resource id for future reference
- */
- mrf.mrf_type = MAC_RX_FIFO;
- mrf.mrf_blank = rge_chip_blank;
- mrf.mrf_arg = (void *)rgep;
- mrf.mrf_normal_blank_time = RGE_RX_INT_TIME;
- mrf.mrf_normal_pkt_count = RGE_RX_INT_PKTS;
- rgep->handle = mac_resource_add(rgep->mh, (mac_resource_t *)&mrf);
-
- mutex_exit(rgep->genlock);
-}
-
/* ARGSUSED */
static boolean_t
rge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
@@ -1302,12 +1278,6 @@ rge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
}
break;
}
- case MAC_CAPAB_POLL:
- /*
- * There's nothing for us to fill in, simply returning
- * B_TRUE stating that we support polling is sufficient.
- */
- break;
default:
return (B_FALSE);
}
diff --git a/usr/src/uts/common/io/rge/rge_rxtx.c b/usr/src/uts/common/io/rge/rge_rxtx.c
index 301b023e5a..09d23825d3 100755..100644
--- a/usr/src/uts/common/io/rge/rge_rxtx.c
+++ b/usr/src/uts/common/io/rge/rge_rxtx.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "rge.h"
#define U32TOPTR(x) ((void *)(uintptr_t)(uint32_t)(x))
@@ -369,7 +367,7 @@ rge_receive(rge_t *rgep)
mutex_exit(rgep->rx_lock);
if (mp != NULL)
- mac_rx(rgep->mh, rgep->handle, mp);
+ mac_rx(rgep->mh, NULL, mp);
}
diff --git a/usr/src/uts/common/io/rtw/rtw.c b/usr/src/uts/common/io/rtw/rtw.c
index 1b99f01099..fa471c83a8 100644
--- a/usr/src/uts/common/io/rtw/rtw.c
+++ b/usr/src/uts/common/io/rtw/rtw.c
@@ -54,7 +54,7 @@
#include <sys/sunddi.h>
#include <sys/pci.h>
#include <sys/errno.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <sys/list.h>
@@ -178,7 +178,6 @@ static mac_callbacks_t rtw_m_callbacks = {
rtw_m_multicst,
rtw_m_unicst,
rtw_m_tx,
- NULL,
rtw_m_ioctl,
NULL, /* mc_getcapab */
NULL,
diff --git a/usr/src/uts/common/io/rum/rum.c b/usr/src/uts/common/io/rum/rum.c
index 8b09c53171..6c61cbbebd 100644
--- a/usr/src/uts/common/io/rum/rum.c
+++ b/usr/src/uts/common/io/rum/rum.c
@@ -43,7 +43,7 @@
#include <sys/modctl.h>
#include <sys/devops.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_wifi.h>
#include <sys/net80211.h>
#include <sys/net80211_proto.h>
@@ -291,7 +291,6 @@ static mac_callbacks_t rum_m_callbacks = {
rum_m_multicst,
rum_m_unicst,
rum_m_tx,
- NULL, /* mc_resources; */
rum_m_ioctl,
NULL, /* mc_getcapab */
NULL,
diff --git a/usr/src/uts/common/io/sfe/sfe_util.c b/usr/src/uts/common/io/sfe/sfe_util.c
index 0d8f736d15..fdee7b6d2f 100644
--- a/usr/src/uts/common/io/sfe/sfe_util.c
+++ b/usr/src/uts/common/io/sfe/sfe_util.c
@@ -32,6 +32,11 @@
*/
/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
* System Header files.
*/
#include <sys/types.h>
@@ -1958,7 +1963,7 @@ next:
* send up received packets
*/
mutex_exit(&dp->intrlock);
- mac_rx(dp->mh, dp->mac_rx_ring_ha, rx_head);
+ mac_rx(dp->mh, NULL, rx_head);
mutex_enter(&dp->intrlock);
}
@@ -4050,11 +4055,10 @@ static int gem_m_setpromisc(void *, boolean_t);
static int gem_m_multicst(void *, boolean_t, const uint8_t *);
static int gem_m_unicst(void *, const uint8_t *);
static mblk_t *gem_m_tx(void *, mblk_t *);
-static void gem_m_resources(void *);
static void gem_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t gem_m_getcapab(void *, mac_capab_t, void *);
-#define GEM_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB)
+#define GEM_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB)
static mac_callbacks_t gem_m_callbacks = {
GEM_M_CALLBACK_FLAGS,
@@ -4065,7 +4069,6 @@ static mac_callbacks_t gem_m_callbacks = {
gem_m_multicst,
gem_m_unicst,
gem_m_tx,
- gem_m_resources,
gem_m_ioctl,
gem_m_getcapab,
};
@@ -4590,45 +4593,6 @@ gem_m_tx(void *arg, mblk_t *mp)
}
static void
-gem_set_coalease(void *arg, time_t ticks, uint_t count)
-{
- struct gem_dev *dp = arg;
- DPRINTF(1, (CE_CONT, "%s: %s: ticks:%d count:%d",
- dp->name, __func__, ticks, count));
-
- mutex_enter(&dp->intrlock);
- dp->poll_pkt_delay = min(count, dp->gc.gc_rx_ring_size/2);
- mutex_exit(&dp->intrlock);
-}
-
-static void
-gem_m_resources(void *arg)
-{
- struct gem_dev *dp = arg;
- mac_rx_fifo_t mrf;
-
- DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
-
- mutex_enter(&dp->intrlock);
- mutex_enter(&dp->xmitlock);
-
- /*
- * Register Rx rings as resources and save mac
- * resource id for future reference
- */
- mrf.mrf_type = MAC_RX_FIFO;
- mrf.mrf_blank = gem_set_coalease;
- mrf.mrf_arg = (void *)dp;
- mrf.mrf_normal_blank_time = 1; /* in uS */
- mrf.mrf_normal_pkt_count = dp->poll_pkt_delay;
-
- dp->mac_rx_ring_ha = mac_resource_add(dp->mh, (mac_resource_t *)&mrf);
-
- mutex_exit(&dp->xmitlock);
- mutex_exit(&dp->intrlock);
-}
-
-static void
gem_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
{
DPRINTF(0, (CE_CONT, "!%s: %s: called",
@@ -4637,18 +4601,11 @@ gem_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
gem_mac_ioctl((struct gem_dev *)arg, wq, mp);
}
+/* ARGSUSED */
static boolean_t
gem_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
- boolean_t ret;
-
- ret = B_FALSE;
- switch (cap) {
- case MAC_CAPAB_POLL:
- ret = B_TRUE;
- break;
- }
- return (ret);
+ return (B_FALSE);
}
static void
diff --git a/usr/src/uts/common/io/sfe/sfe_util.h b/usr/src/uts/common/io/sfe/sfe_util.h
index 576a3d5d08..6c8ca8fea4 100644
--- a/usr/src/uts/common/io/sfe/sfe_util.h
+++ b/usr/src/uts/common/io/sfe/sfe_util.h
@@ -31,9 +31,14 @@
* DAMAGE.
*/
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
#ifndef _SFE_UTIL_H_
#define _SFE_UTIL_H_
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
/*
diff --git a/usr/src/uts/common/io/softmac/softmac_ctl.c b/usr/src/uts/common/io/softmac/softmac_ctl.c
index b1b8cd4f42..99c665aae6 100644
--- a/usr/src/uts/common/io/softmac/softmac_ctl.c
+++ b/usr/src/uts/common/io/softmac/softmac_ctl.c
@@ -23,9 +23,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/stropts.h>
+#include <sys/strsubr.h>
+#include <sys/callb.h>
#include <sys/softmac_impl.h>
int
@@ -192,11 +192,9 @@ softmac_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
}
static void
-softmac_process_notify_ind(queue_t *rq, mblk_t *mp)
+softmac_process_notify_ind(softmac_t *softmac, mblk_t *mp)
{
- softmac_lower_t *slp = rq->q_ptr;
dl_notify_ind_t *dlnip = (dl_notify_ind_t *)mp->b_rptr;
- softmac_t *softmac = slp->sl_softmac;
uint_t addroff, addrlen;
ASSERT(dlnip->dl_primitive == DL_NOTIFY_IND);
@@ -231,6 +229,73 @@ softmac_process_notify_ind(queue_t *rq, mblk_t *mp)
freemsg(mp);
}
+void
+softmac_notify_thread(void *arg)
+{
+ softmac_t *softmac = arg;
+ callb_cpr_t cprinfo;
+
+ CALLB_CPR_INIT(&cprinfo, &softmac->smac_mutex, callb_generic_cpr,
+ "softmac_notify_thread");
+
+ mutex_enter(&softmac->smac_mutex);
+
+ /*
+ * Quit the thread if smac_mh is unregistered.
+ */
+ while (softmac->smac_mh != NULL &&
+ !(softmac->smac_flags & SOFTMAC_NOTIFY_QUIT)) {
+ mblk_t *mp, *nextmp;
+
+ if ((mp = softmac->smac_notify_head) == NULL) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&softmac->smac_cv, &softmac->smac_mutex);
+ CALLB_CPR_SAFE_END(&cprinfo, &softmac->smac_mutex);
+ continue;
+ }
+
+ softmac->smac_notify_head = softmac->smac_notify_tail = NULL;
+ mutex_exit(&softmac->smac_mutex);
+
+ while (mp != NULL) {
+ nextmp = mp->b_next;
+ mp->b_next = NULL;
+ softmac_process_notify_ind(softmac, mp);
+ mp = nextmp;
+ }
+ mutex_enter(&softmac->smac_mutex);
+ }
+
+ /*
+ * The softmac is being destroyed; simply free all of the DL_NOTIFY_IND
+ * messages left in the queue which did not have the chance to be
+ * processed.
+ */
+ freemsgchain(softmac->smac_notify_head);
+ softmac->smac_notify_head = softmac->smac_notify_tail = NULL;
+ softmac->smac_notify_thread = NULL;
+ cv_broadcast(&softmac->smac_cv);
+ CALLB_CPR_EXIT(&cprinfo);
+ thread_exit();
+}
+
+static void
+softmac_enqueue_notify_ind(queue_t *rq, mblk_t *mp)
+{
+ softmac_lower_t *slp = rq->q_ptr;
+ softmac_t *softmac = slp->sl_softmac;
+
+ mutex_enter(&softmac->smac_mutex);
+ if (softmac->smac_notify_tail == NULL) {
+ softmac->smac_notify_head = softmac->smac_notify_tail = mp;
+ } else {
+ softmac->smac_notify_tail->b_next = mp;
+ softmac->smac_notify_tail = mp;
+ }
+ cv_broadcast(&softmac->smac_cv);
+ mutex_exit(&softmac->smac_mutex);
+}
+
static void
softmac_process_dlpi(softmac_lower_t *slp, mblk_t *mp, uint_t minlen,
t_uscalar_t reqprim)
@@ -295,7 +360,29 @@ softmac_rput_process_proto(queue_t *rq, mblk_t *mp)
if (len < DL_NOTIFY_IND_SIZE)
goto runt;
- softmac_process_notify_ind(rq, mp);
+ /*
+ * Enqueue all the DL_NOTIFY_IND messages and process them
+ * in a separate thread to avoid deadlock. Here is an
+ * example of the deadlock scenario:
+ *
+ * Thread A: mac_promisc_set()->softmac_m_promisc()
+ *
+ * The softmac driver waits for the ACK of the
+ * DL_PROMISC_PHYS request with the MAC perimeter;
+ *
+ * Thread B:
+ *
+ * The driver handles the DL_PROMISC_PHYS request. Before
+ * it sends back the ACK, it could first send a
+ * DL_NOTE_PROMISC_ON_PHYS notification.
+ *
+ * Since DL_NOTIFY_IND could eventually cause softmac to call
+ * mac_xxx_update(), which requires the MAC perimeter, this would
+ * cause a deadlock between the two threads. Enqueuing the
+ * DL_NOTIFY_IND message and deferring its processing
+ * avoids the potential deadlock.
+ */
+ softmac_enqueue_notify_ind(rq, mp);
return;
case DL_NOTIFY_ACK:
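
The enqueue-and-defer pattern added above (softmac_enqueue_notify_ind()
plus softmac_notify_thread()) can be sketched in userland terms with
POSIX threads; all names below are illustrative, not softmac symbols:

	#include <pthread.h>
	#include <stdlib.h>

	typedef struct msg { struct msg *next; } msg_t;

	static pthread_mutex_t	qlock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t	qcv = PTHREAD_COND_INITIALIZER;
	static msg_t		*qhead, *qtail;
	static int		qquit;

	/* Receive path: append and signal; never takes the "perimeter". */
	static void
	enqueue(msg_t *mp)
	{
		(void) pthread_mutex_lock(&qlock);
		mp->next = NULL;
		if (qtail == NULL) {
			qhead = qtail = mp;
		} else {
			qtail->next = mp;
			qtail = mp;
		}
		(void) pthread_cond_broadcast(&qcv);
		(void) pthread_mutex_unlock(&qlock);
	}

	/*
	 * Worker: drains the list outside qlock, so it may safely take
	 * heavyweight locks while processing each message.
	 */
	static void *
	worker(void *arg)
	{
		(void) pthread_mutex_lock(&qlock);
		while (!qquit) {
			msg_t *mp = qhead;

			if (mp == NULL) {
				(void) pthread_cond_wait(&qcv, &qlock);
				continue;
			}
			qhead = qtail = NULL;
			(void) pthread_mutex_unlock(&qlock);
			while (mp != NULL) {
				msg_t *next = mp->next;
				free(mp);	/* stand-in for processing */
				mp = next;
			}
			(void) pthread_mutex_lock(&qlock);
		}
		(void) pthread_mutex_unlock(&qlock);
		return (arg);
	}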
diff --git a/usr/src/uts/common/io/softmac/softmac_dev.c b/usr/src/uts/common/io/softmac/softmac_dev.c
index 3d2164e782..f548df055d 100644
--- a/usr/src/uts/common/io/softmac/softmac_dev.c
+++ b/usr/src/uts/common/io/softmac/softmac_dev.c
@@ -222,11 +222,6 @@ softmac_close(queue_t *rq)
slp->sl_softmac = NULL;
slp->sl_lh = NULL;
- /*
- * slp->sl_handle could be non-NULL if it is in the aggregation.
- */
- slp->sl_handle = (mac_resource_handle_t)NULL;
-
ASSERT(slp->sl_ack_mp == NULL);
ASSERT(slp->sl_ctl_inprogress == B_FALSE);
ASSERT(slp->sl_pending_prim == DL_PRIM_INVAL);
@@ -266,6 +261,16 @@ softmac_rput(queue_t *rq, mblk_t *mp)
}
/*
+ * If this message is looped back from the legacy devices,
+ * drop it, as the Nemo framework is responsible for
+ * looping it back via the mac_txloop() function.
+ */
+ if (mp->b_flag & MSGNOLOOP) {
+ freemsg(mp);
+ return;
+ }
+
+ /*
* This is the most common case.
*/
if (DB_REF(mp) == 1) {
@@ -276,7 +281,7 @@ softmac_rput(queue_t *rq, mblk_t *mp)
* is reset to NULL when DL_CAPAB_POLL is
* disabled.
*/
- mac_rx(slp->sl_softmac->smac_mh, slp->sl_handle, mp);
+ mac_rx(slp->sl_softmac->smac_mh, NULL, mp);
return;
} else {
softmac_rput_process_data(slp, mp);
diff --git a/usr/src/uts/common/io/softmac/softmac_main.c b/usr/src/uts/common/io/softmac/softmac_main.c
index d325e3b4c6..0187cf8a28 100644
--- a/usr/src/uts/common/io/softmac/softmac_main.c
+++ b/usr/src/uts/common/io/softmac/softmac_main.c
@@ -44,6 +44,8 @@
#include <sys/file.h>
#include <sys/cred.h>
#include <sys/dlpi.h>
+#include <sys/mac_provider.h>
+#include <sys/disp.h>
#include <sys/sunndi.h>
#include <sys/modhash.h>
#include <sys/stropts.h>
@@ -53,11 +55,19 @@
#include <sys/softmac.h>
#include <sys/dls.h>
+/* Used as a parameter to the mod hash walk of softmac structures */
+typedef struct {
+ softmac_t *smw_softmac;
+ boolean_t smw_retry;
+} softmac_walk_t;
+
/*
* Softmac hash table including softmacs for both style-2 and style-1 devices.
*/
static krwlock_t softmac_hash_lock;
static mod_hash_t *softmac_hash;
+static kmutex_t smac_global_lock;
+static kcondvar_t smac_global_cv;
#define SOFTMAC_HASHSZ 64
@@ -71,7 +81,7 @@ static void softmac_m_close(void *);
static boolean_t softmac_m_getcapab(void *, mac_capab_t, void *);
#define SOFTMAC_M_CALLBACK_FLAGS \
- (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_OPEN | MC_CLOSE)
+ (MC_IOCTL | MC_GETCAPAB | MC_OPEN | MC_CLOSE)
static mac_callbacks_t softmac_m_callbacks = {
SOFTMAC_M_CALLBACK_FLAGS,
@@ -82,7 +92,6 @@ static mac_callbacks_t softmac_m_callbacks = {
softmac_m_multicst,
softmac_m_unicst,
softmac_m_tx,
- softmac_m_resources,
softmac_m_ioctl,
softmac_m_getcapab,
softmac_m_open,
@@ -97,6 +106,8 @@ softmac_init()
mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
rw_init(&softmac_hash_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&smac_global_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&smac_global_cv, NULL, CV_DRIVER, NULL);
}
void
@@ -104,6 +115,8 @@ softmac_fini()
{
rw_destroy(&softmac_hash_lock);
mod_hash_destroy_hash(softmac_hash);
+ mutex_destroy(&smac_global_lock);
+ cv_destroy(&smac_global_cv);
}
/* ARGSUSED */
@@ -128,7 +141,8 @@ softmac_busy()
}
/*
- * This function is called for each minor node during the post-attach of
+ *
+ * softmac_create() is called for each minor node during the post-attach of
* each DDI_NT_NET device instance. Note that it is possible that a device
* instance has two minor nodes (DLPI style-1 and style-2), so that for that
* specific device, softmac_create() could be called twice.
@@ -139,7 +153,99 @@ softmac_busy()
* For each minor node of a legacy device, a taskq is started to finish
* softmac_mac_register(), which will finish the rest of work (see comments
* above softmac_mac_register()).
+ *
+ * softmac state machine
+ * --------------------------------------------------------------------------
+ * OLD STATE EVENT NEW STATE
+ * --------------------------------------------------------------------------
+ * UNINIT attach of 1st minor node ATTACH_INPROG
+ * okcnt = 0 net_postattach -> softmac_create okcnt = 1
+ *
+ * ATTACH_INPROG attach of 2nd minor node (GLDv3) ATTACH_DONE
+ * okcnt = 1 net_postattach -> softmac_create okcnt = 2
+ *
+ * ATTACH_INPROG attach of 2nd minor node (legacy) ATTACH_INPROG
+ * okcnt = 1 net_postattach -> softmac_create okcnt = 2
+ * schedule softmac_mac_register
+ *
+ * ATTACH_INPROG legacy device node ATTACH_DONE
+ * okcnt = 2 softmac_mac_register okcnt = 2
+ *
+ * ATTACH_DONE detach of 1st minor node DETACH_INPROG
+ * okcnt = 2 (success) okcnt = 1
+ *
+ * DETACH_INPROG detach of 2nd minor node UNINIT (or free)
+ * okcnt = 1 (success) okcnt = 0
+ *
+ * ATTACH_DONE detach failure state unchanged
+ * DETACH_INPROG left = okcnt
+ *
+ * DETACH_INPROG reattach ATTACH_INPROG
+ * okcnt = 0,1 net_postattach -> softmac_create
+ *
+ * ATTACH_DONE reattach ATTACH_DONE
+ * left != 0 net_postattach -> softmac_create left = 0
+ *
+ * Abbreviation notes:
+ * states have SOFTMAC_ prefix,
+ * okcnt - softmac_attach_okcnt,
+ * left - softmac_attached_left
*/
+
+#ifdef DEBUG
+void
+softmac_state_verify(softmac_t *softmac)
+{
+ ASSERT(MUTEX_HELD(&softmac->smac_mutex));
+
+ /*
+ * There are at most 2 minor nodes, one per DLPI style
+ */
+ ASSERT(softmac->smac_cnt <= 2 && softmac->smac_attachok_cnt <= 2);
+
+ /*
+ * The smac_attachok_cnt represents the number of attaches, i.e. the
+ * number of times net_postattach -> softmac_create() has been called
+ * for a device instance.
+ */
+ ASSERT(softmac->smac_attachok_cnt == SMAC_NONZERO_NODECNT(softmac));
+
+ /*
+ * softmac_create (or softmac_mac_register) -> softmac_create_datalink
+ * happens only after all minor nodes have been attached
+ */
+ ASSERT(softmac->smac_state != SOFTMAC_ATTACH_DONE ||
+ softmac->smac_attachok_cnt == softmac->smac_cnt);
+
+ if (softmac->smac_attachok_cnt == 0) {
+ ASSERT(softmac->smac_state == SOFTMAC_UNINIT);
+ ASSERT(softmac->smac_mh == NULL);
+ } else if (softmac->smac_attachok_cnt < softmac->smac_cnt) {
+ ASSERT(softmac->smac_state == SOFTMAC_ATTACH_INPROG ||
+ softmac->smac_state == SOFTMAC_DETACH_INPROG);
+ ASSERT(softmac->smac_mh == NULL);
+ } else {
+ /*
+ * In the stable condition the state would be
+ * SOFTMAC_ATTACH_DONE. But there is a small transient window
+ * in softmac_destroy where we change the state to
+ * SOFTMAC_DETACH_INPROG and drop the lock before doing
+ * the link destroy
+ */
+ ASSERT(softmac->smac_attachok_cnt == softmac->smac_cnt);
+ ASSERT(softmac->smac_state != SOFTMAC_UNINIT);
+ }
+ if (softmac->smac_mh != NULL)
+ ASSERT(softmac->smac_attachok_cnt == softmac->smac_cnt);
+}
+#endif
+
+#ifdef DEBUG
+#define SOFTMAC_STATE_VERIFY(softmac) softmac_state_verify(softmac)
+#else
+#define SOFTMAC_STATE_VERIFY(softmac)
+#endif
+
int
softmac_create(dev_info_t *dip, dev_t dev)
{
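
The state machine table above maps onto a small state enum; a toy
rendering for readability (the real definitions live in softmac_impl.h
and may differ):

	typedef enum {
		SOFTMAC_UNINIT,		/* no minor node attached yet */
		SOFTMAC_ATTACH_INPROG,	/* some, not all, nodes attached */
		SOFTMAC_ATTACH_DONE,	/* all nodes attached */
		SOFTMAC_DETACH_INPROG	/* first node detached */
	} softmac_state_t;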
@@ -181,9 +287,7 @@ softmac_create(dev_info_t *dip, dev_t dev)
softmac = kmem_zalloc(sizeof (softmac_t), KM_SLEEP);
mutex_init(&softmac->smac_mutex, NULL, MUTEX_DRIVER, NULL);
cv_init(&softmac->smac_cv, NULL, CV_DRIVER, NULL);
- rw_init(&softmac->smac_lock, NULL, RW_DRIVER, NULL);
(void) strlcpy(softmac->smac_devname, devname, MAXNAMELEN);
-
/*
* Insert the softmac into the hash table.
*/
@@ -191,9 +295,15 @@ softmac_create(dev_info_t *dip, dev_t dev)
(mod_hash_key_t)softmac->smac_devname,
(mod_hash_val_t)softmac);
ASSERT(err == 0);
+ mutex_enter(&smac_global_lock);
+ cv_broadcast(&smac_global_cv);
+ mutex_exit(&smac_global_lock);
}
mutex_enter(&softmac->smac_mutex);
+ SOFTMAC_STATE_VERIFY(softmac);
+ if (softmac->smac_state != SOFTMAC_ATTACH_DONE)
+ softmac->smac_state = SOFTMAC_ATTACH_INPROG;
if (softmac->smac_attachok_cnt == 0) {
/*
* Initialize the softmac if this is the post-attach of the
@@ -231,45 +341,26 @@ softmac_create(dev_info_t *dip, dev_t dev)
index = (getmajor(dev) == ddi_name_to_major("clone"));
if (softmac->smac_softmac[index] != NULL) {
/*
- * This is possible if the post_attach() is called:
- *
- * a. after pre_detach() fails.
- *
- * b. for a new round of reattachment. Note that DACF will not
- * call pre_detach() for successfully post_attached minor
- * nodes even when the post-attach failed after all.
- *
- * Both seem to be defects in the DACF framework. To work
- * around it and only clear the SOFTMAC_ATTACH_DONE flag for
- * the b case, a smac_attached_left field is used to tell
- * the two cases apart.
+ * This is possible if the post_attach() is called after
+ * pre_detach() fails. This seems to be a defect of the DACF
+ * framework. We work around it by using a smac_attached_left
+ * field that tracks this case.
*/
- ASSERT(softmac->smac_attachok_cnt != 0);
-
- if (softmac->smac_attached_left != 0)
- /* case a */
- softmac->smac_attached_left--;
- else if (softmac->smac_attachok_cnt != softmac->smac_cnt) {
- /* case b */
- softmac->smac_flags &= ~SOFTMAC_ATTACH_DONE;
- }
+ ASSERT(softmac->smac_attached_left != 0);
+ softmac->smac_attached_left--;
mutex_exit(&softmac->smac_mutex);
rw_exit(&softmac_hash_lock);
return (0);
+
}
mutex_exit(&softmac->smac_mutex);
rw_exit(&softmac_hash_lock);
- /*
- * No lock is needed for access this softmac pointer, as pre-detach and
- * post-attach won't happen at the same time.
- */
- mutex_enter(&softmac->smac_mutex);
-
softmac_dev = kmem_zalloc(sizeof (softmac_dev_t), KM_SLEEP);
softmac_dev->sd_dev = dev;
- softmac->smac_softmac[index] = softmac_dev;
+ mutex_enter(&softmac->smac_mutex);
+ softmac->smac_softmac[index] = softmac_dev;
/*
* Continue to register the mac and create the datalink only when all
* the minor nodes are attached.
@@ -281,18 +372,22 @@ softmac_create(dev_info_t *dip, dev_t dev)
/*
* All of the minor nodes have been attached; start a taskq
- * to do the rest of the work. We use a taskq instead of of
+ * to do the rest of the work. We use a taskq instead of
* doing the work here because:
*
- * - We could be called as a result of an open() system call
- * where spec_open() already SLOCKED the snode. Using a taskq
- * sidesteps the risk that our ldi_open_by_dev() call would
- * deadlock trying to set SLOCKED on the snode again.
+ * We could be called as a result of an open() system call
+ * where spec_open() already SLOCKED the snode. Using a taskq
+ * sidesteps the risk that our ldi_open_by_dev() call would
+ * deadlock trying to set SLOCKED on the snode again.
*
- * - The devfs design requires no interruptible function calls
- * in the device post-attach routine, but we need to make an
- * (interruptible) upcall. Using a taskq to make the upcall
- * sidesteps this.
+ * The devfs design requires that downcalls not use any
+ * interruptible cv_wait, which happens when we do door upcalls.
+ * Otherwise the downcalls, which may be holding devfs resources,
+ * may cause a deadlock if the thread is stopped. We also need to make
+ * sure these downcalls into softmac_create or softmac_destroy
+ * don't cv_wait on any devfs-related condition. Thus softmac_destroy
+ * returns EBUSY if the asynchronous threads started in softmac_create
+ * haven't finished.
*/
ASSERT(softmac->smac_taskq == NULL);
softmac->smac_taskq = taskq_dispatch(system_taskq,
@@ -331,7 +426,6 @@ softmac_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
* simply return B_TRUE if we support it.
*/
case MAC_CAPAB_NO_ZCOPY:
- case MAC_CAPAB_POLL:
case MAC_CAPAB_NO_NATIVEVLAN:
default:
break;
@@ -396,8 +490,6 @@ softmac_create_datalink(softmac_t *softmac)
datalink_id_t linkid = DATALINK_INVALID_LINKID;
int err;
- ASSERT(MUTEX_HELD(&softmac->smac_mutex));
-
/*
* Inform dlmgmtd of this link so that softmac_hold_device() is able
* to know the existence of this link. If this failed with EBADF,
@@ -429,8 +521,11 @@ softmac_create_datalink(softmac_t *softmac)
return (err);
}
- if (linkid == DATALINK_INVALID_LINKID)
+ if (linkid == DATALINK_INVALID_LINKID) {
+ mutex_enter(&softmac->smac_mutex);
softmac->smac_flags |= SOFTMAC_NEED_RECREATE;
+ mutex_exit(&softmac->smac_mutex);
+ }
return (0);
}
@@ -453,6 +548,8 @@ softmac_create_task(void *arg)
mutex_enter(&softmac->smac_mutex);
softmac->smac_media = (mac_info(mh))->mi_nativemedia;
softmac->smac_mh = mh;
+ softmac->smac_taskq = NULL;
+ mutex_exit(&softmac->smac_mutex);
/*
* We can safely release the reference on the mac because
@@ -467,10 +564,13 @@ softmac_create_task(void *arg)
*/
err = softmac_create_datalink(softmac);
+ mutex_enter(&softmac->smac_mutex);
done:
- ASSERT(!(softmac->smac_flags & SOFTMAC_ATTACH_DONE));
- softmac->smac_flags |= SOFTMAC_ATTACH_DONE;
- softmac->smac_attacherr = err;
+ if (err != 0) {
+ softmac->smac_mh = NULL;
+ softmac->smac_attacherr = err;
+ }
+ softmac->smac_state = SOFTMAC_ATTACH_DONE;
softmac->smac_taskq = NULL;
cv_broadcast(&softmac->smac_cv);
mutex_exit(&softmac->smac_mutex);
@@ -498,6 +598,8 @@ softmac_mac_register(softmac_t *softmac)
* as softmac_destroy() will wait until this function is called.
*/
ASSERT(softmac != NULL);
+ ASSERT(softmac->smac_state == SOFTMAC_ATTACH_INPROG &&
+ softmac->smac_attachok_cnt == softmac->smac_cnt);
if ((err = ldi_ident_from_dip(softmac_dip, &li)) != 0) {
mutex_enter(&softmac->smac_mutex);
@@ -617,11 +719,9 @@ softmac_mac_register(softmac_t *softmac)
* dl_bind() because some drivers return DL_ERROR_ACK if the
* stream is not bound. It is also before mac_register(), so
* we don't need any lock protection here.
- *
- * Softmac always supports POLL.
*/
softmac->smac_capab_flags =
- (MAC_CAPAB_POLL | MAC_CAPAB_NO_ZCOPY | MAC_CAPAB_LEGACY);
+ (MAC_CAPAB_NO_ZCOPY | MAC_CAPAB_LEGACY);
softmac->smac_no_capability_req = B_FALSE;
if (softmac_fill_capab(lh, softmac) != 0)
@@ -714,6 +814,7 @@ softmac_mac_register(softmac_t *softmac)
goto done;
}
}
+ mutex_exit(&softmac->smac_mutex);
/*
* Try to create the datalink for this softmac.
@@ -724,10 +825,21 @@ softmac_mac_register(softmac_t *softmac)
softmac->smac_mh = NULL;
}
}
+ /*
+ * If successful, create the thread that handles the DL_NOTIFY_IND
+ * messages from the lower stream.
+ */
+ if (softmac->smac_mh != NULL) {
+ softmac->smac_notify_thread = thread_create(NULL, 0,
+ softmac_notify_thread, softmac, 0, &p0,
+ TS_RUN, minclsyspri);
+ }
+ mutex_enter(&softmac->smac_mutex);
done:
- ASSERT(!(softmac->smac_flags & SOFTMAC_ATTACH_DONE));
- softmac->smac_flags |= SOFTMAC_ATTACH_DONE;
+ ASSERT(softmac->smac_state == SOFTMAC_ATTACH_INPROG &&
+ softmac->smac_attachok_cnt == softmac->smac_cnt);
+ softmac->smac_state = SOFTMAC_ATTACH_DONE;
softmac->smac_attacherr = err;
softmac->smac_taskq = NULL;
cv_broadcast(&softmac->smac_cv);
@@ -743,24 +855,37 @@ softmac_destroy(dev_info_t *dip, dev_t dev)
int index;
int ppa, err;
datalink_id_t linkid;
+ mac_handle_t smac_mh;
+ uint32_t smac_flags;
ppa = ddi_get_instance(dip);
(void) snprintf(devname, MAXNAMELEN, "%s%d", ddi_driver_name(dip), ppa);
- rw_enter(&softmac_hash_lock, RW_WRITER);
+ /*
+ * We are called only from the predetach entry point. The DACF
+ * framework ensures there can't be a concurrent postattach call
+ * for the same softmac. The softmac found out from the modhash
+ * below can't vanish beneath us since this is the only place where
+ * it is deleted.
+ */
err = mod_hash_find(softmac_hash, (mod_hash_key_t)devname,
(mod_hash_val_t *)&softmac);
ASSERT(err == 0);
mutex_enter(&softmac->smac_mutex);
+ SOFTMAC_STATE_VERIFY(softmac);
/*
* Fail the predetach routine if this softmac is in-use.
+ * Make sure these downcalls into softmac_create or softmac_destroy
+ * don't cv_wait on any devfs-related condition. Thus softmac_destroy
+ * returns EBUSY if the asynchronous thread started in softmac_create
+ * hasn't finished.
*/
- if (softmac->smac_hold_cnt != 0) {
+ if ((softmac->smac_hold_cnt != 0) ||
+ (softmac->smac_state == SOFTMAC_ATTACH_INPROG)) {
softmac->smac_attached_left = softmac->smac_attachok_cnt;
mutex_exit(&softmac->smac_mutex);
- rw_exit(&softmac_hash_lock);
return (EBUSY);
}
@@ -772,78 +897,106 @@ softmac_destroy(dev_info_t *dip, dev_t dev)
*/
if (softmac->smac_attached_left != 0) {
mutex_exit(&softmac->smac_mutex);
- rw_exit(&softmac_hash_lock);
return (EBUSY);
}
- if (softmac->smac_attachok_cnt != softmac->smac_cnt)
- goto done;
-
- /*
- * This is the detach for the first minor node. Wait until all the
- * minor nodes are attached.
- */
- while (!(softmac->smac_flags & SOFTMAC_ATTACH_DONE))
- cv_wait(&softmac->smac_cv, &softmac->smac_mutex);
+ smac_mh = softmac->smac_mh;
+ smac_flags = softmac->smac_flags;
+ softmac->smac_state = SOFTMAC_DETACH_INPROG;
+ mutex_exit(&softmac->smac_mutex);
- if (softmac->smac_mh != NULL) {
- if (!(softmac->smac_flags & SOFTMAC_NOSUPP)) {
- if ((err = dls_devnet_destroy(softmac->smac_mh,
- &linkid)) != 0) {
- goto done;
+ if (smac_mh != NULL) {
+ /*
+ * This is the first minor node that is being detached for this
+ * softmac.
+ */
+ ASSERT(softmac->smac_attachok_cnt == softmac->smac_cnt);
+ if (!(smac_flags & SOFTMAC_NOSUPP)) {
+ if ((err = dls_devnet_destroy(smac_mh, &linkid,
+ B_FALSE)) != 0) {
+ goto error;
}
}
/*
* If softmac_mac_register() succeeds in registering the mac
* of the legacy device, unregister it.
*/
- if (!(softmac->smac_flags & (SOFTMAC_GLDV3 | SOFTMAC_NOSUPP))) {
- if ((err = mac_unregister(softmac->smac_mh)) != 0) {
- (void) dls_devnet_create(softmac->smac_mh,
- linkid);
- goto done;
+ if (!(smac_flags & (SOFTMAC_GLDV3 | SOFTMAC_NOSUPP))) {
+ if ((err = mac_disable_nowait(smac_mh)) != 0) {
+ (void) dls_devnet_create(smac_mh, linkid);
+ goto error;
}
+ /*
+ * Ask softmac_notify_thread to quit, and wait for
+ * that to be done.
+ */
+ mutex_enter(&softmac->smac_mutex);
+ softmac->smac_flags |= SOFTMAC_NOTIFY_QUIT;
+ cv_broadcast(&softmac->smac_cv);
+ while (softmac->smac_notify_thread != NULL) {
+ cv_wait(&softmac->smac_cv,
+ &softmac->smac_mutex);
+ }
+ mutex_exit(&softmac->smac_mutex);
+ VERIFY(mac_unregister(smac_mh) == 0);
}
softmac->smac_mh = NULL;
}
- softmac->smac_flags &= ~SOFTMAC_ATTACH_DONE;
-done:
- if (err == 0) {
- /*
- * Free softmac_dev
- */
- index = (getmajor(dev) == ddi_name_to_major("clone"));
- softmac_dev = softmac->smac_softmac[index];
- ASSERT(softmac_dev != NULL);
- softmac->smac_softmac[index] = NULL;
- kmem_free(softmac_dev, sizeof (softmac_dev_t));
-
- if (--softmac->smac_attachok_cnt == 0) {
- mod_hash_val_t hashval;
-
- err = mod_hash_remove(softmac_hash,
- (mod_hash_key_t)devname,
- (mod_hash_val_t *)&hashval);
- ASSERT(err == 0);
+ /*
+ * Free softmac_dev
+ */
+ rw_enter(&softmac_hash_lock, RW_WRITER);
+ mutex_enter(&softmac->smac_mutex);
+ ASSERT(softmac->smac_state == SOFTMAC_DETACH_INPROG &&
+ softmac->smac_attachok_cnt != 0);
+ softmac->smac_mh = NULL;
+ index = (getmajor(dev) == ddi_name_to_major("clone"));
+ softmac_dev = softmac->smac_softmac[index];
+ ASSERT(softmac_dev != NULL);
+ softmac->smac_softmac[index] = NULL;
+ kmem_free(softmac_dev, sizeof (softmac_dev_t));
+
+ if (--softmac->smac_attachok_cnt == 0) {
+ mod_hash_val_t hashval;
+
+ softmac->smac_state = SOFTMAC_UNINIT;
+ if (softmac->smac_hold_cnt != 0) {
+ /*
+ * Someone did a softmac_hold_device while we dropped
+ * Someone did a softmac_hold_device() while we dropped
+ * the locks. Leave the softmac itself intact; it
+ * will be reused by the reattach.
mutex_exit(&softmac->smac_mutex);
rw_exit(&softmac_hash_lock);
-
- ASSERT(softmac->smac_taskq == NULL);
- ASSERT(!(softmac->smac_flags & SOFTMAC_ATTACH_DONE));
- mutex_destroy(&softmac->smac_mutex);
- cv_destroy(&softmac->smac_cv);
- rw_destroy(&softmac->smac_lock);
- kmem_free(softmac, sizeof (softmac_t));
return (0);
}
- } else {
- softmac->smac_attached_left = softmac->smac_attachok_cnt;
- }
+ ASSERT(softmac->smac_taskq == NULL);
+ err = mod_hash_remove(softmac_hash,
+ (mod_hash_key_t)devname,
+ (mod_hash_val_t *)&hashval);
+ ASSERT(err == 0);
+
+ mutex_exit(&softmac->smac_mutex);
+ rw_exit(&softmac_hash_lock);
+
+ mutex_destroy(&softmac->smac_mutex);
+ cv_destroy(&softmac->smac_cv);
+ kmem_free(softmac, sizeof (softmac_t));
+ return (0);
+ }
mutex_exit(&softmac->smac_mutex);
rw_exit(&softmac_hash_lock);
+ return (0);
+
+error:
+ mutex_enter(&softmac->smac_mutex);
+ softmac->smac_attached_left = softmac->smac_attachok_cnt;
+ softmac->smac_state = SOFTMAC_ATTACH_DONE;
+ cv_broadcast(&softmac->smac_cv);
+ mutex_exit(&softmac->smac_mutex);
return (err);
}
@@ -863,17 +1016,33 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
softmac_t *softmac = (softmac_t *)val;
datalink_id_t linkid;
int err;
-
- ASSERT(RW_READ_HELD(&softmac_hash_lock));
+ softmac_walk_t *smwp = arg;
/*
- * Wait for softmac_create() and softmac_mac_register() to exit.
+ * The framework itself must not hold any locks across calls to the
+ * mac perimeter. Thus this function does not call any framework
+ * function that needs to grab the mac perimeter.
*/
+ ASSERT(RW_READ_HELD(&softmac_hash_lock));
+
+ smwp->smw_retry = B_FALSE;
mutex_enter(&softmac->smac_mutex);
- while (!(softmac->smac_flags & SOFTMAC_ATTACH_DONE))
- cv_wait(&softmac->smac_cv, &softmac->smac_mutex);
+ SOFTMAC_STATE_VERIFY(softmac);
+ if (softmac->smac_state == SOFTMAC_ATTACH_INPROG) {
+ /*
+ * Wait till softmac_create or softmac_mac_register finishes.
+ * Hold the softmac to ensure it stays around. The wait itself
+ * is done in the caller, since we need to drop all locks
+ * including the mod hash's internal lock before calling
+ * cv_wait.
+ */
+ smwp->smw_retry = B_TRUE;
+ smwp->smw_softmac = softmac;
+ softmac->smac_hold_cnt++;
+ return (MH_WALK_TERMINATE);
+ }
- if ((softmac->smac_attacherr != 0) ||
+ if ((softmac->smac_state != SOFTMAC_ATTACH_DONE) ||
!(softmac->smac_flags & SOFTMAC_NEED_RECREATE)) {
mutex_exit(&softmac->smac_mutex);
return (MH_WALK_CONTINUE);
@@ -918,13 +1087,30 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
void
softmac_recreate()
{
+ softmac_walk_t smw;
+ softmac_t *softmac;
+
/*
* Walk through the softmac_hash table. Request to create the
* [link name, linkid] mapping if we failed to do so.
*/
- rw_enter(&softmac_hash_lock, RW_READER);
- mod_hash_walk(softmac_hash, softmac_mac_recreate, NULL);
- rw_exit(&softmac_hash_lock);
+ do {
+ smw.smw_retry = B_FALSE;
+ rw_enter(&softmac_hash_lock, RW_READER);
+ mod_hash_walk(softmac_hash, softmac_mac_recreate, &smw);
+ rw_exit(&softmac_hash_lock);
+ if (smw.smw_retry) {
+ /*
+ * softmac_create or softmac_mac_register hasn't yet
+ * finished and the softmac is not yet in the
+ * SOFTMAC_ATTACH_DONE state.
+ */
+ softmac = smw.smw_softmac;
+ cv_wait(&softmac->smac_cv, &softmac->smac_mutex);
+ softmac->smac_hold_cnt--;
+ mutex_exit(&softmac->smac_mutex);
+ }
+ } while (smw.smw_retry);
}
/* ARGSUSED */
@@ -1064,20 +1250,14 @@ softmac_m_open(void *arg)
softmac_lower_t *slp;
int err;
- rw_enter(&softmac->smac_lock, RW_READER);
- if (softmac->smac_state == SOFTMAC_READY)
- goto done;
- rw_exit(&softmac->smac_lock);
+ ASSERT(MAC_PERIM_HELD(softmac->smac_mh));
+ ASSERT(softmac->smac_lower_state == SOFTMAC_INITIALIZED);
if ((err = softmac_lower_setup(softmac, &slp)) != 0)
return (err);
- rw_enter(&softmac->smac_lock, RW_WRITER);
- ASSERT(softmac->smac_state == SOFTMAC_INITIALIZED);
softmac->smac_lower = slp;
- softmac->smac_state = SOFTMAC_READY;
-done:
- rw_exit(&softmac->smac_lock);
+ softmac->smac_lower_state = SOFTMAC_READY;
return (0);
}
@@ -1087,7 +1267,8 @@ softmac_m_close(void *arg)
softmac_t *softmac = arg;
softmac_lower_t *slp;
- rw_enter(&softmac->smac_lock, RW_WRITER);
+ ASSERT(MAC_PERIM_HELD(softmac->smac_mh));
+ ASSERT(softmac->smac_lower_state == SOFTMAC_READY);
slp = softmac->smac_lower;
ASSERT(slp != NULL);
@@ -1095,9 +1276,8 @@ softmac_m_close(void *arg)
* Note that slp is destroyed when lh is closed.
*/
(void) ldi_close(slp->sl_lh, FREAD|FWRITE, kcred);
- softmac->smac_state = SOFTMAC_INITIALIZED;
+ softmac->smac_lower_state = SOFTMAC_INITIALIZED;
softmac->smac_lower = NULL;
- rw_exit(&softmac->smac_lock);
}
int
@@ -1146,7 +1326,10 @@ again:
* be recreated when device fails to detach (as this device
* is held).
*/
+ mutex_enter(&smac_global_lock);
rw_exit(&softmac_hash_lock);
+ cv_wait(&smac_global_cv, &smac_global_lock);
+ mutex_exit(&smac_global_lock);
goto again;
}
@@ -1155,17 +1338,16 @@ again:
*/
mutex_enter(&softmac->smac_mutex);
softmac->smac_hold_cnt++;
- mutex_exit(&softmac->smac_mutex);
-
rw_exit(&softmac_hash_lock);
/*
* Wait till the device is fully attached.
*/
- mutex_enter(&softmac->smac_mutex);
- while (!(softmac->smac_flags & SOFTMAC_ATTACH_DONE))
+ while (softmac->smac_state != SOFTMAC_ATTACH_DONE)
cv_wait(&softmac->smac_cv, &softmac->smac_mutex);
+ SOFTMAC_STATE_VERIFY(softmac);
+
if ((err = softmac->smac_attacherr) != 0)
softmac->smac_hold_cnt--;
else
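
The softmac_recreate() loop above is an instance of a common idiom: a
mod_hash walk cannot cv_wait while the hash's internal locks are held,
so the walker records the busy entry, takes a hold on it, terminates
the walk, and the caller waits and then retries the whole walk. A
hedged skeleton of the idiom (types and helpers are placeholders):

	#include <stdbool.h>

	typedef struct entry entry_t;	/* placeholder entry type */

	typedef struct {
		bool	w_retry;
		entry_t	*w_busy;	/* held entry to wait on */
	} walk_t;

	void
	recreate_all(void)
	{
		walk_t w;

		do {
			w.w_retry = false;
			/*
			 * Lock the table and walk its entries. A busy
			 * entry sets w.w_retry, stores itself in
			 * w.w_busy with a hold, and terminates the
			 * walk early; then unlock the table.
			 */
			if (w.w_retry) {
				/*
				 * cv_wait on w.w_busy's condition, then
				 * drop the hold taken by the walker.
				 */
			}
		} while (w.w_retry);
	}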
diff --git a/usr/src/uts/common/io/softmac/softmac_pkt.c b/usr/src/uts/common/io/softmac/softmac_pkt.c
index 3587fa515a..4b8d7e3049 100644
--- a/usr/src/uts/common/io/softmac/softmac_pkt.c
+++ b/usr/src/uts/common/io/softmac/softmac_pkt.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/strsubr.h>
#include <inet/led.h>
#include <sys/softmac_impl.h>
@@ -69,40 +67,6 @@ softmac_m_tx(void *arg, mblk_t *mp)
return (mp);
}
-/*ARGSUSED*/
-static void
-softmac_blank(void *arg, time_t ticks, uint_t count)
-{
-}
-
-void
-softmac_m_resources(void *arg)
-{
- softmac_t *softmac = arg;
- softmac_lower_t *slp = softmac->smac_lower;
- mac_rx_fifo_t mrf;
-
- ASSERT((softmac->smac_state == SOFTMAC_READY) && (slp != NULL));
-
- /*
- * Register rx resources and save resource handle for future reference.
- * Note that the mac_resources() function must be called when the lower
- * stream is plumbed.
- */
-
- mutex_enter(&slp->sl_mutex);
-
- mrf.mrf_type = MAC_RX_FIFO;
- mrf.mrf_blank = softmac_blank;
- mrf.mrf_arg = slp;
- mrf.mrf_normal_blank_time = SOFTMAC_BLANK_TICKS;
- mrf.mrf_normal_pkt_count = SOFTMAC_BLANK_PKT_COUNT;
-
- slp->sl_handle =
- mac_resource_add(softmac->smac_mh, (mac_resource_t *)&mrf);
-
- mutex_exit(&slp->sl_mutex);
-}
void
softmac_rput_process_data(softmac_lower_t *slp, mblk_t *mp)
@@ -125,7 +89,7 @@ softmac_rput_process_data(softmac_lower_t *slp, mblk_t *mp)
mp = tmp;
}
- mac_rx(slp->sl_softmac->smac_mh, slp->sl_handle, mp);
+ mac_rx(slp->sl_softmac->smac_mh, NULL, mp);
return;
failed:
diff --git a/usr/src/uts/common/io/strplumb.c b/usr/src/uts/common/io/strplumb.c
index ffb7753e09..27b9cc8843 100644
--- a/usr/src/uts/common/io/strplumb.c
+++ b/usr/src/uts/common/io/strplumb.c
@@ -69,7 +69,7 @@
#include <sys/ddi_implfuncs.h>
#include <sys/dld.h>
-#include <sys/mac.h>
+#include <sys/mac_client.h>
/*
* Debug Macros
diff --git a/usr/src/uts/common/io/ural/ural.c b/usr/src/uts/common/io/ural/ural.c
index 5b54d54935..b474dd8c2c 100644
--- a/usr/src/uts/common/io/ural/ural.c
+++ b/usr/src/uts/common/io/ural/ural.c
@@ -43,7 +43,7 @@
#include <sys/modctl.h>
#include <sys/devops.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_wifi.h>
#include <sys/net80211.h>
#include <sys/net80211_proto.h>
@@ -295,7 +295,6 @@ static mac_callbacks_t ural_m_callbacks = {
ural_m_multicst,
ural_m_unicst,
ural_m_tx,
- NULL, /* mc_resources; */
ural_m_ioctl,
NULL, /* mc_getcapab */
NULL,
diff --git a/usr/src/uts/common/io/vnic/vnic_bcast.c b/usr/src/uts/common/io/vnic/vnic_bcast.c
deleted file mode 100644
index 28ba800fd5..0000000000
--- a/usr/src/uts/common/io/vnic/vnic_bcast.c
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-#include <sys/conf.h>
-#include <sys/cmn_err.h>
-#include <sys/list.h>
-#include <sys/kmem.h>
-#include <sys/stream.h>
-#include <sys/modctl.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/atomic.h>
-#include <sys/stat.h>
-#include <sys/modhash.h>
-#include <sys/strsubr.h>
-#include <sys/strsun.h>
-#include <sys/mac.h>
-#include <sys/vnic.h>
-#include <sys/vnic_impl.h>
-
-/*
- * Broadcast and multicast traffic must be distributed to the VNICs
- * that are defined on top of the same underlying NIC. The set of
- * destinations to which a multicast packet must be sent is a subset
- * of all VNICs defined on top of the same NIC. A VNIC can be member
- * of more than one such subset.
- *
- * To accomodate these requirements, we introduce broadcast groups.
- * A broadcast group is associated with a broadcast or multicast
- * address. The members of a broadcast group consist of the VNICs
- * that should received copies of packets sent to the address
- * associated with the group, and are defined on top of the
- * same underlying NIC. The underlying NIC is always implicetely
- * part of the group.
- *
- * The broadcast groups defined on top of a underlying NIC are chained,
- * hanging off vnic_mac_t structures.
- */
-
-typedef struct vnic_bcast_grp_s {
- struct vnic_bcast_grp_s *vbg_next;
- uint_t vbg_refs;
- void *vbg_addr;
- vnic_mac_t *vbg_vnic_mac;
- mac_addrtype_t vbg_addrtype;
- vnic_flow_t *vbg_flow_ent;
- vnic_t **vbg_vnics;
- uint_t vbg_nvnics;
- uint_t vbg_nvnics_alloc;
- uint64_t vbg_vnics_gen;
-} vnic_bcast_grp_t;
-
-#define VNIC_BCAST_GRP_REFHOLD(grp) { \
- atomic_add_32(&(grp)->vbg_refs, 1); \
- ASSERT((grp)->vbg_refs != 0); \
-}
-
-#define VNIC_BCAST_GRP_REFRELE(grp) { \
- ASSERT((grp)->vbg_refs != 0); \
- membar_exit(); \
- if (atomic_add_32_nv(&(grp)->vbg_refs, -1) == 0) \
- vnic_bcast_grp_free(grp); \
-}
-
-static kmem_cache_t *vnic_bcast_grp_cache;
-
-void
-vnic_bcast_init(void)
-{
- vnic_bcast_grp_cache = kmem_cache_create("vnic_bcast_grp_cache",
- sizeof (vnic_bcast_grp_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-vnic_bcast_fini(void)
-{
- kmem_cache_destroy(vnic_bcast_grp_cache);
-}
-
-/*
- * Free the specific broadcast group. Invoked when the last reference
- * to the group is released.
- */
-static void
-vnic_bcast_grp_free(vnic_bcast_grp_t *grp)
-{
- vnic_mac_t *vnic_mac = grp->vbg_vnic_mac;
-
- if (grp->vbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
- /*
- * The address is a multicast address, have the
- * underlying NIC leave the multicast group.
- */
- (void) mac_multicst_remove(vnic_mac->va_mh, grp->vbg_addr);
- }
-
- ASSERT(grp->vbg_addr != NULL);
- kmem_free(grp->vbg_addr, grp->vbg_vnic_mac->va_addr_len);
-
- ASSERT(grp->vbg_vnics != NULL);
- kmem_free(grp->vbg_vnics, grp->vbg_nvnics_alloc * sizeof (vnic_t *));
-
- kmem_cache_free(vnic_bcast_grp_cache, grp);
-}
-
-void
-vnic_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain)
-{
- vnic_bcast_grp_t *grp = arg1;
- vnic_t *sender_vnic = arg2, *vnic;
- const vnic_flow_fn_info_t *fn_info;
- krwlock_t *grp_lock = &grp->vbg_vnic_mac->va_bcast_grp_lock;
- uint64_t gen;
- uint_t i;
- mblk_t *mp_chain1;
- vnic_mac_t *vnic_mac;
-
- VNIC_BCAST_GRP_REFHOLD(grp);
- rw_enter(grp_lock, RW_READER);
-
- if (grp->vbg_nvnics == 0)
- goto bail;
- vnic_mac = grp->vbg_vnics[0]->vn_vnic_mac;
-
- /*
- * Pass a copy of the mp chain to every VNIC except the sender
- * VNIC, if the packet was not received from the underlying NIC.
- *
- * The broadcast group lock across calls to the flow's callback
- * function, since the same group could potentially be accessed
- * from the same context. When the lock is reacquired, changes
- * to the broadcast group while the lock was released
- * are caught using a generation counter incremented each time
- * the list of VNICs associated with the broadcast group
- * is changed.
- */
- for (i = 0; i < grp->vbg_nvnics; i++) {
- vnic = grp->vbg_vnics[i];
- if (vnic == sender_vnic)
- continue;
-
- /*
- * If this consumer is in promiscuous mode then it
- * will have already seen a copy of the packet.
- */
- if (vnic->vn_promisc)
- continue;
- /*
- * It is important to hold a reference on the
- * flow_ent here. vnic_dev_delete() may be waiting
- * to delete the vnic after removing it from grp.
- */
- if ((mp_chain1 = vnic_copymsgchain_cksum(mp_chain)) == NULL)
- break;
- /*
- * Fix the checksum for packets originating
- * from the local machine.
- */
- if ((sender_vnic != NULL) &&
- ((mp_chain1 = vnic_fix_cksum(mp_chain1)) == NULL))
- break;
- VNIC_FLOW_REFHOLD(vnic->vn_flow_ent);
- fn_info = vnic_classifier_get_fn_info(vnic->vn_flow_ent);
- gen = grp->vbg_vnics_gen;
- rw_exit(grp_lock);
- (fn_info->ff_fn)(fn_info->ff_arg1, fn_info->ff_arg2, mp_chain1);
- VNIC_FLOW_REFRELE(vnic->vn_flow_ent);
- rw_enter(grp_lock, RW_READER);
-
- /* update stats */
- if (grp->vbg_addrtype == MAC_ADDRTYPE_MULTICAST)
- vnic->vn_stat_multircv++;
- else
- vnic->vn_stat_brdcstrcv++;
-
- if (grp->vbg_vnics_gen != gen) {
- /*
- * The list of VNICs associated with the group
- * was changed while the lock was released.
- * Give up on the current packet.
- */
- freemsgchain(mp_chain);
- goto bail;
- }
- }
-
- if (sender_vnic != NULL) {
- /*
- * The packet was sent from one of the VNICs
- * (vnic_active_tx()), or from the active MAC
- * (vnic_active_tx()). In both cases, we need to send
- * a copy of the packet to the underlying NIC so that
- * it can be sent on the wire.
- */
- const mac_txinfo_t *mtp = vnic_mac->va_txinfo;
- mblk_t *rest;
-
- if ((mp_chain1 = vnic_copymsgchain_cksum(mp_chain)) != NULL) {
- rw_exit(grp_lock);
- rest = mtp->mt_fn(mtp->mt_arg, mp_chain1);
- rw_enter(grp_lock, RW_READER);
- if (rest != NULL)
- freemsgchain(rest);
- }
- }
-
- if ((sender_vnic != (vnic_t *)-1) && (sender_vnic != NULL)) {
- /*
- * Called while sending a packet from one of the VNICs.
- * Make sure the active interface gets its copy.
- */
- mp_chain1 = (sender_vnic != NULL) ? vnic_fix_cksum(mp_chain) :
- mp_chain;
- if (mp_chain1 != NULL) {
- rw_exit(grp_lock);
- mac_active_rx(vnic_mac->va_mh, NULL, mp_chain1);
- rw_enter(grp_lock, RW_READER);
- }
- } else {
- freemsgchain(mp_chain);
- }
-bail:
- rw_exit(grp_lock);
- VNIC_BCAST_GRP_REFRELE(grp);
-}
-
-/*
- * Add the specified VNIC to the group corresponding to the specified
- * broadcast or multicast address.
- * Return 0 on success, or an errno value on failure.
- */
-int
-vnic_bcast_add(vnic_t *vnic, const uint8_t *addr, mac_addrtype_t addrtype)
-{
- vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
- vnic_bcast_grp_t *grp = NULL, **last_grp;
- int rc = 0;
-
- ASSERT(addrtype == MAC_ADDRTYPE_MULTICAST ||
- addrtype == MAC_ADDRTYPE_BROADCAST);
-
- rw_enter(&vnic_mac->va_bcast_grp_lock, RW_WRITER);
-
- /*
- * Does a group with the specified broadcast address already
- * exist for the underlying NIC?
- */
- last_grp = &vnic_mac->va_bcast_grp;
- for (grp = *last_grp; grp != NULL;
- last_grp = &grp->vbg_next, grp = grp->vbg_next) {
- if (bcmp(grp->vbg_addr, addr, vnic_mac->va_addr_len) == 0)
- break;
- }
-
- if (grp == NULL) {
- /*
- * The group does not yet exist, create it.
- */
- grp = kmem_cache_alloc(vnic_bcast_grp_cache, KM_SLEEP);
- bzero(grp, sizeof (vnic_bcast_grp_t));
- grp->vbg_next = NULL;
- ASSERT(grp->vbg_refs == 0);
- grp->vbg_vnic_mac = vnic_mac;
-
- grp->vbg_addr = kmem_zalloc(vnic_mac->va_addr_len, KM_SLEEP);
- bcopy(addr, grp->vbg_addr, vnic_mac->va_addr_len);
- grp->vbg_addrtype = addrtype;
-
- /*
- * Add a new flow for the broadcast address.
- */
- grp->vbg_flow_ent = vnic_classifier_flow_create(
- vnic_mac->va_addr_len, (uchar_t *)addr, grp, B_FALSE,
- KM_NOSLEEP);
- if (grp->vbg_flow_ent == NULL) {
- rc = ENOMEM;
- goto bail;
- }
-
- /*
- * When the multicast and broadcast packet is received
- * by the underlying NIC, mac_rx_classify() will invoke
- * vnic_bcast_send() with arg2=NULL, which will cause
- * vnic_bcast_send() to send a copy of the packet(s)
- * to every VNIC defined on top of the underlying MAC.
- *
- * When the vnic_bcast_send() function is invoked from
- * the VNIC transmit path, it will specify the transmitting
- * VNIC as the arg2 value, which will allow vnic_bcast_send()
- * to skip that VNIC and not send it a copy of the packet.
- *
- * We program the classifier to dispatch matching broadcast
- * packets to vnic_bcast_send().
- * We need a ring allocated for this bcast flow, so that
- * later snooping of the underlying MAC uses the same scheme
- * of intercepting the ring's receiver to mac_rx_promisc().
- * For the economy of hardware resources, we command the MAC
- * classifier to use a soft ring for these broadcast and
- * multicast flows.
- */
- vnic_classifier_flow_add(vnic_mac, grp->vbg_flow_ent,
- vnic_bcast_send, grp, NULL);
-
- /*
- * For multicast addresses, have the underlying MAC
- * join the corresponsing multicast group.
- */
- if ((addrtype == MAC_ADDRTYPE_MULTICAST) &&
- ((rc = mac_multicst_add(vnic_mac->va_mh, addr)) != 0)) {
- vnic_classifier_flow_remove(vnic->vn_vnic_mac,
- grp->vbg_flow_ent);
- vnic_classifier_flow_destroy(grp->vbg_flow_ent);
- goto bail;
- }
-
- *last_grp = grp;
- }
-
- /*
- * Add the VNIC to the list of VNICs associated with the group.
- */
- if (grp->vbg_nvnics_alloc == grp->vbg_nvnics) {
- vnic_t **new_vnics;
- uint_t new_size = grp->vbg_nvnics+1;
-
- new_vnics = kmem_zalloc(new_size * sizeof (vnic_t *),
- KM_SLEEP);
-
- if (grp->vbg_nvnics) {
- ASSERT(grp->vbg_vnics != NULL);
- bcopy(grp->vbg_vnics, new_vnics, grp->vbg_nvnics *
- sizeof (vnic_t *));
- kmem_free(grp->vbg_vnics, grp->vbg_nvnics *
- sizeof (vnic_t *));
- }
-
- grp->vbg_vnics = new_vnics;
- grp->vbg_nvnics_alloc = new_size;
- }
-
- grp->vbg_vnics[grp->vbg_nvnics++] = vnic;
-
- /*
- * Since we're adding to the list of VNICs using that group,
- * kick the generation count, which will allow vnic_bcast_send()
- * to detect that condition.
- */
- grp->vbg_vnics_gen++;
-
- VNIC_BCAST_GRP_REFHOLD(grp);
-
-bail:
- if (rc != 0 && grp != NULL) {
- kmem_free(grp->vbg_addr, vnic_mac->va_addr_len);
- kmem_cache_free(vnic_bcast_grp_cache, grp);
- }
-
- rw_exit(&vnic->vn_vnic_mac->va_bcast_grp_lock);
- return (rc);
-}
-
-/*
- * Remove the specified VNIC from the group corresponding to
- * the specified broadcast or multicast address.
- *
- * Note: vnic_bcast_delete() calls net_remove_flow() which
- * will call cv_wait for fe_refcnt to drop to 0. So this function
- * should not be called from interrupt or STREAMS context. The only
- * callers are vnic_dev_delete() and vnic_m_multicst() (both of
- * which are called from taskq thread context).
- */
-void
-vnic_bcast_delete(vnic_t *vnic, const uint8_t *addr)
-{
- vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
- vnic_bcast_grp_t *grp, **prev;
- uint_t i;
- boolean_t removing_grp = B_FALSE;
-
- rw_enter(&vnic_mac->va_bcast_grp_lock, RW_WRITER);
-
- /* find the broadcast group */
- prev = &vnic_mac->va_bcast_grp;
- for (grp = vnic_mac->va_bcast_grp; grp != NULL; prev = &grp->vbg_next,
- grp = grp->vbg_next) {
- if (bcmp(grp->vbg_addr, addr, vnic_mac->va_addr_len) == 0)
- break;
- }
- ASSERT(grp != NULL);
-
- /*
- * Remove the VNIC from the list of VNICs associated with that
- * broadcast group.
- *
-	 * We keep the vbg_vnics[] array compact by replacing
-	 * the removed vnic with the last non-NULL element in that array.
- */
-
- for (i = 0; i < grp->vbg_nvnics; i++) {
- if (grp->vbg_vnics[i] == vnic)
- break;
- }
-
- ASSERT(i < grp->vbg_nvnics);
-
- if (i == (grp->vbg_nvnics-1)) {
- grp->vbg_vnics[i] = NULL;
- } else {
- grp->vbg_vnics[i] = grp->vbg_vnics[grp->vbg_nvnics-1];
- grp->vbg_vnics[grp->vbg_nvnics-1] = NULL;
- }
-
- /*
- * Since we're removing from the list of VNICs using that group,
- * kick the generation count, which will allow vnic_bcast_send()
- * to detect that condition.
- */
- grp->vbg_vnics_gen++;
-
- if (--grp->vbg_nvnics == 0) {
- /*
- * Unlink the current group from the list of groups
- * defined on top of the underlying NIC. The group
- * structure will stay around until the last reference
- * is dropped.
- */
- *prev = grp->vbg_next;
- removing_grp = B_TRUE;
- }
-
- rw_exit(&vnic->vn_vnic_mac->va_bcast_grp_lock);
-
- /*
- * If the group itself is being removed, remove the
- * corresponding flow from the underlying NIC.
- */
- if (removing_grp) {
- vnic_classifier_flow_remove(vnic->vn_vnic_mac,
- grp->vbg_flow_ent);
- vnic_classifier_flow_destroy(grp->vbg_flow_ent);
- }
-
- VNIC_BCAST_GRP_REFRELE(grp);
-}
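The deleted vnic_bcast.c above leans on a generation count (vbg_vnics_gen): vnic_bcast_send() has to drop the group lock around each per-VNIC delivery, so adds and deletes can race with a walk in progress. Below is a minimal sketch of that pattern, with hypothetical names (group_t, member_rx); it illustrates the technique and is not the driver's code.

	#include <sys/types.h>
	#include <sys/stream.h>
	#include <sys/ksynch.h>

	typedef struct member member_t;
	extern void member_rx(member_t *, mblk_t *);	/* hypothetical callback */

	typedef struct group {
		krwlock_t	g_lock;
		uint64_t	g_gen;		/* bumped on every add/delete */
		uint_t		g_nmembers;
		member_t	**g_members;
	} group_t;

	static void
	group_send(group_t *g, mblk_t *mp)
	{
		uint64_t gen;
		uint_t i;

		rw_enter(&g->g_lock, RW_READER);
		gen = g->g_gen;
		for (i = 0; i < g->g_nmembers; i++) {
			mblk_t *copy = copymsg(mp);

			if (copy == NULL)
				continue;
			/* drop the lock across the potentially blocking callback */
			rw_exit(&g->g_lock);
			member_rx(g->g_members[i], copy);
			rw_enter(&g->g_lock, RW_READER);
			if (g->g_gen != gen)
				break;	/* membership changed under us; stop */
		}
		rw_exit(&g->g_lock);
		freemsg(mp);
	}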
diff --git a/usr/src/uts/common/io/vnic/vnic_cl.c b/usr/src/uts/common/io/vnic/vnic_cl.c
deleted file mode 100644
index b7939f141d..0000000000
--- a/usr/src/uts/common/io/vnic/vnic_cl.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/vnic.h>
-#include <sys/vnic_impl.h>
-
-/*
- * Virtual Network Interface Card (VNIC) classification.
- *
- * The VNIC implements a software classifier which is used to steer
- * traffic (locally and externally generated) to the appropriate VNIC
- * based on MAC addresses.
- */
-
-static kmem_cache_t *vnic_flow_cache;
-static kmem_cache_t *vnic_flow_tab_cache;
-
-static void vnic_classifier_rx(void *, mac_resource_handle_t, mblk_t *);
-
-/* ARGSUSED */
-static int
-vnic_classifier_flow_tab_ctor(void *buf, void *arg, int km_flag)
-{
- vnic_flow_tab_t *flow_tab = buf;
-
- bzero(flow_tab, sizeof (vnic_flow_tab_t));
- rw_init(&flow_tab->vt_lock, NULL, RW_DRIVER, NULL);
- return (0);
-}
-
-/* ARGSUSED */
-static void
-vnic_classifier_flow_tab_dtor(void *buf, void *arg)
-{
- vnic_flow_tab_t *flow_tab = buf;
-
- rw_destroy(&flow_tab->vt_lock);
-}
-
-/* ARGSUSED */
-static int
-vnic_classifier_flow_ctor(void *buf, void *arg, int km_flag)
-{
- vnic_flow_t *flow = buf;
-
- bzero(flow, sizeof (vnic_flow_t));
- mutex_init(&flow->vf_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&flow->vf_cv, NULL, CV_DRIVER, NULL);
- return (0);
-}
-
-/* ARGSUSED */
-static void
-vnic_classifier_flow_dtor(void *buf, void *arg)
-{
- vnic_flow_t *flow = buf;
-
- ASSERT(flow->vf_refs == 0);
- mutex_destroy(&flow->vf_lock);
- cv_destroy(&flow->vf_cv);
-}
-
-void
-vnic_classifier_init(void)
-{
- vnic_flow_cache = kmem_cache_create("vnic_flow_cache",
- sizeof (vnic_flow_t), 0, vnic_classifier_flow_ctor,
- vnic_classifier_flow_dtor, NULL, NULL, NULL, 0);
- vnic_flow_tab_cache = kmem_cache_create("vnic_flow_tab_cache",
- sizeof (vnic_flow_tab_t), 0, vnic_classifier_flow_tab_ctor,
- vnic_classifier_flow_tab_dtor, NULL, NULL, NULL, 0);
-}
-
-void
-vnic_classifier_fini(void)
-{
- kmem_cache_destroy(vnic_flow_cache);
- kmem_cache_destroy(vnic_flow_tab_cache);
-}
-
-int
-vnic_classifier_flow_tab_init(vnic_mac_t *vnic_mac, uint_t mac_len,
- int km_flag)
-{
- vnic_mac->va_flow_tab = kmem_cache_alloc(vnic_flow_tab_cache, km_flag);
- if (vnic_mac->va_flow_tab == NULL)
- return (ENOMEM);
- vnic_mac->va_rx_hdl = mac_rx_add(vnic_mac->va_mh, vnic_classifier_rx,
- vnic_mac);
- vnic_mac->va_flow_tab->vt_addr_len = mac_len;
- return (0);
-}
-
-void
-vnic_classifier_flow_tab_fini(vnic_mac_t *vnic_mac)
-{
- vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab;
-
- ASSERT(flow_tab->vt_flow_list == NULL);
- mac_rx_remove(vnic_mac->va_mh, vnic_mac->va_rx_hdl, B_TRUE);
- kmem_cache_free(vnic_flow_tab_cache, flow_tab);
- vnic_mac->va_flow_tab = NULL;
-}
-
-vnic_flow_t *
-vnic_classifier_flow_create(uint_t mac_len, uchar_t *mac_addr,
- void *flow_cookie, boolean_t is_active, int km_flag)
-{
- vnic_flow_t *flow;
-
- ASSERT(mac_len <= MAXMACADDRLEN);
-
- if ((flow = kmem_cache_alloc(vnic_flow_cache, km_flag)) == NULL)
- return (NULL);
-
- flow->vf_addr_len = mac_len;
- flow->vf_cookie = flow_cookie;
- flow->vf_clearing = B_FALSE;
- flow->vf_is_active = is_active;
- bcopy(mac_addr, flow->vf_addr, mac_len);
- return (flow);
-}
-
-void
-vnic_classifier_flow_destroy(vnic_flow_t *flow)
-{
- kmem_cache_free(vnic_flow_cache, flow);
-}
-
-void
-vnic_classifier_flow_add(vnic_mac_t *vnic_mac, vnic_flow_t *flow,
- vnic_rx_fn_t rx_fn, void *rx_arg1, void *rx_arg2)
-{
- vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab;
- vnic_flow_t **cur_flow;
-
- ASSERT(flow->vf_addr_len == flow_tab->vt_addr_len);
-
- /* initialize the flow structure */
- flow->vf_fn_info.ff_fn = rx_fn;
- flow->vf_fn_info.ff_arg1 = rx_arg1;
- flow->vf_fn_info.ff_arg2 = rx_arg2;
-
- /* add to the flow table */
- rw_enter(&flow_tab->vt_lock, RW_WRITER);
- for (cur_flow = &flow_tab->vt_flow_list;
- *cur_flow != NULL;
- cur_flow = &(*cur_flow)->vf_next)
- ;
- *cur_flow = flow;
- flow->vf_next = NULL;
- rw_exit(&flow_tab->vt_lock);
-}
-
-void
-vnic_classifier_flow_remove(vnic_mac_t *vnic_mac, vnic_flow_t *flow)
-{
- vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab;
- vnic_flow_t **prev, *cur;
-
- /* unlink from list */
- rw_enter(&flow_tab->vt_lock, RW_WRITER);
- prev = &flow_tab->vt_flow_list;
- for (cur = *prev; cur != NULL && cur != flow;
- prev = &cur->vf_next, cur = cur->vf_next)
- ;
- *prev = cur->vf_next;
- rw_exit(&flow_tab->vt_lock);
-
- /* wait for all references to the flow to go away */
- mutex_enter(&flow->vf_lock);
- flow->vf_clearing = B_TRUE;
- while (flow->vf_refs > 0)
- cv_wait(&flow->vf_cv, &flow->vf_lock);
- mutex_exit(&flow->vf_lock);
-}
-
-void
-vnic_classifier_flow_update_addr(vnic_flow_t *flow, uchar_t *mac_addr)
-{
- bcopy(mac_addr, flow->vf_addr, flow->vf_addr_len);
-}
-
-void
-vnic_classifier_flow_update_fn(vnic_flow_t *flow, vnic_rx_fn_t fn,
- void *arg1, void *arg2)
-{
- flow->vf_fn_info.ff_fn = fn;
- flow->vf_fn_info.ff_arg1 = arg1;
- flow->vf_fn_info.ff_arg2 = arg2;
-}
-
-vnic_flow_t *
-vnic_classifier_get_flow(vnic_mac_t *vnic_mac, mblk_t *mp)
-{
- vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab;
- vnic_flow_t *flow;
- mac_header_info_t hdr_info;
-
- if (mac_header_info(vnic_mac->va_mh, mp, &hdr_info) != 0)
- return (NULL);
-
- rw_enter(&flow_tab->vt_lock, RW_READER);
- for (flow = flow_tab->vt_flow_list; flow != NULL;
- flow = flow->vf_next) {
- if (bcmp(hdr_info.mhi_daddr, flow->vf_addr,
- flow_tab->vt_addr_len) == 0) {
- VNIC_FLOW_REFHOLD(flow);
- break;
- }
- }
- rw_exit(&flow_tab->vt_lock);
- return (flow);
-}
-
-void *
-vnic_classifier_get_client_cookie(vnic_flow_t *flow)
-{
- return (flow->vf_cookie);
-}
-
-vnic_flow_fn_info_t *
-vnic_classifier_get_fn_info(vnic_flow_t *flow)
-{
- return (&flow->vf_fn_info);
-}
-
-boolean_t
-vnic_classifier_is_active(vnic_flow_t *flow)
-{
- return (flow->vf_is_active);
-}
-
-/*
- * Receive function registered with the MAC layer. Classifies
- * the packets, i.e. finds the flows matching the packets passed
- * as argument, and invokes the callback functions associated with
- * these flows.
- */
-/*ARGSUSED*/
-static void
-vnic_classifier_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
-{
- vnic_mac_t *vnic_mac = arg;
- vnic_flow_t *flow;
- mblk_t *next_mp;
- const vnic_flow_fn_info_t *fn_info;
-
- while (mp != NULL) {
- next_mp = mp->b_next;
- mp->b_next = NULL;
-
- vnic_promisc_rx(vnic_mac, NULL, mp);
-
- flow = vnic_classifier_get_flow(vnic_mac, mp);
- if (flow == NULL) {
- freemsg(mp);
- } else {
- if (flow->vf_is_active) {
- /*
- * Inbound packets are delivered to the
- * active MAC through mac_rx() of the
-				 * NIC.
- */
- freemsg(mp);
- } else {
- vnic_t *vnic;
-
- fn_info = vnic_classifier_get_fn_info(flow);
-
- /*
- * If the vnic to which we would
- * deliver this packet is in
- * promiscuous mode then it already
- * received the packet via
- * vnic_promisc_rx().
- *
- * XXX assumes that ff_arg2 is a
- * vnic_t pointer if it is non-NULL
- * (currently always true).
- */
- vnic = (vnic_t *)fn_info->ff_arg2;
- if ((vnic != NULL) && vnic->vn_promisc) {
- freemsg(mp);
- } else {
- (fn_info->ff_fn)(fn_info->ff_arg1,
- fn_info->ff_arg2, mp);
- }
- }
- VNIC_FLOW_REFRELE(flow);
- }
- mp = next_mp;
- }
-}
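The classifier deleted above never frees a flow that a receive path might still be dispatching through: VNIC_FLOW_REFHOLD/REFRELE count readers, and vnic_classifier_flow_remove() unlinks the flow, sets vf_clearing, and cv_waits for the count to reach zero. That wait is also why vnic_bcast_delete() must not run in interrupt or STREAMS context. A hedged sketch of the refcount-plus-condvar teardown pattern, with hypothetical names rather than the driver's macros:

	#include <sys/types.h>
	#include <sys/ksynch.h>
	#include <sys/debug.h>

	typedef struct flow {
		kmutex_t	f_lock;
		kcondvar_t	f_cv;
		uint32_t	f_refs;
		boolean_t	f_clearing;
	} flow_t;

	static void
	flow_refrele(flow_t *f)
	{
		mutex_enter(&f->f_lock);
		ASSERT(f->f_refs > 0);
		if (--f->f_refs == 0 && f->f_clearing)
			cv_signal(&f->f_cv);	/* wake the thread in flow_quiesce() */
		mutex_exit(&f->f_lock);
	}

	static void
	flow_quiesce(flow_t *f)
	{
		/* caller has already unlinked f from the lookup table */
		mutex_enter(&f->f_lock);
		f->f_clearing = B_TRUE;
		while (f->f_refs > 0)
			cv_wait(&f->f_cv, &f->f_lock);
		mutex_exit(&f->f_lock);
		/* no reader can find or still hold f; it may now be freed */
	}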
diff --git a/usr/src/uts/common/io/vnic/vnic_ctl.c b/usr/src/uts/common/io/vnic/vnic_ctl.c
index a2873c9601..d4f5554949 100644
--- a/usr/src/uts/common/io/vnic/vnic_ctl.c
+++ b/usr/src/uts/common/io/vnic/vnic_ctl.c
@@ -31,62 +31,35 @@
#include <sys/modctl.h>
#include <sys/vnic.h>
#include <sys/vnic_impl.h>
-#include <inet/common.h>
+#include <sys/priv_names.h>
/* module description */
-#define VNIC_LINKINFO "VNIC MAC"
+#define VNIC_LINKINFO "Virtual NIC"
/* device info ptr, only one for instance 0 */
static dev_info_t *vnic_dip = NULL;
static int vnic_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int vnic_attach(dev_info_t *, ddi_attach_cmd_t);
static int vnic_detach(dev_info_t *, ddi_detach_cmd_t);
-static dld_ioc_func_t vnic_ioc_create, vnic_ioc_modify, vnic_ioc_delete,
- vnic_ioc_info;
+
+static int vnic_ioc_create(void *, intptr_t, int, cred_t *, int *);
+static int vnic_ioc_delete(void *, intptr_t, int, cred_t *, int *);
+static int vnic_ioc_info(void *, intptr_t, int, cred_t *, int *);
+static int vnic_ioc_modify(void *, intptr_t, int, cred_t *, int *);
static dld_ioc_info_t vnic_ioc_list[] = {
- {VNIC_IOC_CREATE, DLDCOPYIN | DLDDLCONFIG, sizeof (vnic_ioc_create_t),
- vnic_ioc_create},
- {VNIC_IOC_DELETE, DLDCOPYIN | DLDDLCONFIG, sizeof (vnic_ioc_delete_t),
- vnic_ioc_delete},
+ {VNIC_IOC_CREATE, DLDCOPYINOUT, sizeof (vnic_ioc_create_t),
+ vnic_ioc_create, {PRIV_SYS_DL_CONFIG}},
+ {VNIC_IOC_DELETE, DLDCOPYIN, sizeof (vnic_ioc_delete_t),
+ vnic_ioc_delete, {PRIV_SYS_DL_CONFIG}},
{VNIC_IOC_INFO, DLDCOPYINOUT, sizeof (vnic_ioc_info_t),
- vnic_ioc_info},
- {VNIC_IOC_MODIFY, DLDCOPYIN | DLDDLCONFIG, sizeof (vnic_ioc_modify_t),
- vnic_ioc_modify}
-};
-
-static struct cb_ops vnic_cb_ops = {
- nulldev, /* open */
- nulldev, /* close */
- nulldev, /* strategy */
- nulldev, /* print */
- nodev, /* dump */
- nodev, /* read */
- nodev, /* write */
- nodev, /* ioctl */
- nodev, /* devmap */
- nodev, /* mmap */
- nodev, /* segmap */
- nochpoll, /* poll */
- ddi_prop_op, /* cb_prop_op */
- 0, /* streamtab */
- D_MP /* Driver compatibility flag */
+ vnic_ioc_info, {NULL}},
+ {VNIC_IOC_MODIFY, DLDCOPYIN, sizeof (vnic_ioc_modify_t),
+ vnic_ioc_modify, {PRIV_SYS_DL_CONFIG}},
};
-static struct dev_ops vnic_dev_ops = {
- DEVO_REV, /* devo_rev */
- 0, /* refcnt */
- vnic_getinfo, /* get_dev_info */
- nulldev, /* identify */
- nulldev, /* probe */
- vnic_attach, /* attach */
- vnic_detach, /* detach */
- nodev, /* reset */
- &vnic_cb_ops, /* driver operations */
- NULL, /* bus operations */
- nodev, /* dev power */
- ddi_quiesce_not_supported, /* dev quiesce */
-};
+DDI_DEFINE_STREAM_OPS(vnic_dev_ops, nulldev, nulldev, vnic_attach, vnic_detach,
+ nodev, vnic_getinfo, D_MP, NULL, ddi_quiesce_not_supported);
static struct modldrv vnic_modldrv = {
&mod_driverops, /* Type of module. This one is a driver */
@@ -95,30 +68,32 @@ static struct modldrv vnic_modldrv = {
};
static struct modlinkage modlinkage = {
- MODREV_1,
- &vnic_modldrv,
- NULL
+ MODREV_1, &vnic_modldrv, NULL
};
int
_init(void)
{
- int err;
+ int status;
mac_init_ops(&vnic_dev_ops, "vnic");
- if ((err = mod_install(&modlinkage)) != 0)
+ status = mod_install(&modlinkage);
+ if (status != DDI_SUCCESS)
mac_fini_ops(&vnic_dev_ops);
- return (err);
+
+ return (status);
}
int
_fini(void)
{
- int err;
+ int status;
- if ((err = mod_remove(&modlinkage)) == 0)
+ status = mod_remove(&modlinkage);
+ if (status == DDI_SUCCESS)
mac_fini_ops(&vnic_dev_ops);
- return (err);
+
+ return (status);
}
int
@@ -131,16 +106,12 @@ static void
vnic_init(void)
{
vnic_dev_init();
- vnic_bcast_init();
- vnic_classifier_init();
}
static void
vnic_fini(void)
{
vnic_dev_fini();
- vnic_bcast_fini();
- vnic_classifier_fini();
}
dev_info_t *
@@ -159,7 +130,7 @@ vnic_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
*result = vnic_dip;
return (DDI_SUCCESS);
case DDI_INFO_DEVT2INSTANCE:
- *result = 0;
+ *result = NULL;
return (DDI_SUCCESS);
}
return (DDI_FAILURE);
@@ -174,14 +145,12 @@ vnic_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
/* we only allow instance 0 to attach */
return (DDI_FAILURE);
}
-
if (dld_ioc_register(VNIC_IOC, vnic_ioc_list,
DLDIOCCNT(vnic_ioc_list)) != 0)
return (DDI_FAILURE);
vnic_dip = dip;
vnic_init();
-
return (DDI_SUCCESS);
case DDI_RESUME:
@@ -208,7 +177,6 @@ vnic_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
vnic_dip = NULL;
vnic_fini();
dld_ioc_unregister(VNIC_IOC);
-
return (DDI_SUCCESS);
case DDI_SUSPEND:
@@ -220,129 +188,135 @@ vnic_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
}
/*
- * Process a VNIC_IOC_CREATE request.
+ * Process a VNICIOC_CREATE request.
*/
/* ARGSUSED */
static int
-vnic_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred)
+vnic_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
vnic_ioc_create_t *create_arg = karg;
- int mac_len;
+ int err = 0, mac_len = 0, mac_slot;
uchar_t mac_addr[MAXMACADDRLEN];
- datalink_id_t vnic_id, linkid;
+ uint_t mac_prefix_len;
vnic_mac_addr_type_t mac_addr_type;
-
- /*
- * VNIC link id
- */
- vnic_id = create_arg->vc_vnic_id;
-
- /*
- * Linkid of the link the VNIC is defined on top of.
- */
- linkid = create_arg->vc_link_id;
+ vnic_ioc_diag_t diag = VNIC_IOC_DIAG_NONE;
+ boolean_t is_anchor = create_arg->vc_flags & VNIC_IOC_CREATE_ANCHOR;
/* MAC address */
mac_addr_type = create_arg->vc_mac_addr_type;
- mac_len = create_arg->vc_mac_len;
+
+ if (is_anchor)
+ goto create;
switch (mac_addr_type) {
case VNIC_MAC_ADDR_TYPE_FIXED:
+ mac_len = create_arg->vc_mac_len;
+ /*
+ * Sanity check the MAC address length. vnic_dev_create()
+ * will perform additional checks to ensure that the
+ * address is a valid unicast address of the appropriate
+ * length.
+ */
+ if (mac_len == 0 || mac_len > MAXMACADDRLEN) {
+ err = EINVAL;
+ diag = VNIC_IOC_DIAG_MACADDRLEN_INVALID;
+ goto bail;
+ }
+ bcopy(create_arg->vc_mac_addr, mac_addr, MAXMACADDRLEN);
+ break;
+ case VNIC_MAC_ADDR_TYPE_FACTORY:
+ mac_slot = create_arg->vc_mac_slot;
+ /* sanity check the specified slot number */
+ if (mac_slot < 0 && mac_slot != -1) {
+ err = EINVAL;
+ diag = VNIC_IOC_DIAG_MACFACTORYSLOTINVALID;
+ goto bail;
+ }
+ break;
+ case VNIC_MAC_ADDR_TYPE_AUTO:
+ mac_slot = -1;
+ /* FALLTHROUGH */
+ case VNIC_MAC_ADDR_TYPE_RANDOM:
+ mac_prefix_len = create_arg->vc_mac_prefix_len;
+ if (mac_prefix_len > MAXMACADDRLEN) {
+ err = EINVAL;
+ diag = VNIC_IOC_DIAG_MACPREFIXLEN_INVALID;
+ goto bail;
+ }
+ mac_len = create_arg->vc_mac_len;
+ if (mac_len > MAXMACADDRLEN) {
+ err = EINVAL;
+ diag = VNIC_IOC_DIAG_MACADDRLEN_INVALID;
+ goto bail;
+ }
bcopy(create_arg->vc_mac_addr, mac_addr, MAXMACADDRLEN);
break;
+ case VNIC_MAC_ADDR_TYPE_PRIMARY:
+ /*
+ * We will get the primary address when we add this
+ * client
+ */
+ break;
default:
- return (ENOTSUP);
+ err = ENOTSUP;
+ goto bail;
}
- return (vnic_dev_create(vnic_id, linkid, mac_len, mac_addr));
-}
+create:
+ err = vnic_dev_create(create_arg->vc_vnic_id, create_arg->vc_link_id,
+ &mac_addr_type, &mac_len, mac_addr, &mac_slot, mac_prefix_len,
+ create_arg->vc_vid, &create_arg->vc_resource_props,
+ create_arg->vc_flags, &diag);
+ if (err != 0)
+ goto bail;
-/* ARGSUSED */
-static int
-vnic_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred)
-{
- vnic_ioc_modify_t *modify_arg = karg;
- datalink_id_t vnic_id;
- uint_t modify_mask;
- vnic_mac_addr_type_t mac_addr_type;
- uint_t mac_len;
- uchar_t mac_addr[MAXMACADDRLEN];
+ create_arg->vc_mac_addr_type = mac_addr_type;
- vnic_id = modify_arg->vm_vnic_id;
- modify_mask = modify_arg->vm_modify_mask;
+ if (is_anchor)
+ goto bail;
- if (modify_mask & VNIC_IOC_MODIFY_ADDR) {
- mac_addr_type = modify_arg->vm_mac_addr_type;
- mac_len = modify_arg->vm_mac_len;
- bcopy(modify_arg->vm_mac_addr, mac_addr, MAXMACADDRLEN);
+ switch (mac_addr_type) {
+ case VNIC_MAC_ADDR_TYPE_FACTORY:
+ create_arg->vc_mac_slot = mac_slot;
+ break;
+ case VNIC_MAC_ADDR_TYPE_RANDOM:
+ bcopy(mac_addr, create_arg->vc_mac_addr, MAXMACADDRLEN);
+ create_arg->vc_mac_len = mac_len;
+ break;
}
- return (vnic_dev_modify(vnic_id, modify_mask, mac_addr_type,
- mac_len, mac_addr));
+bail:
+ create_arg->vc_diag = diag;
+ create_arg->vc_status = err;
+ return (err);
}
/* ARGSUSED */
static int
-vnic_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred)
+vnic_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
- vnic_ioc_delete_t *delete_arg = karg;
+ vnic_ioc_modify_t *modify_arg = karg;
- return (vnic_dev_delete(delete_arg->vd_vnic_id));
+ return (vnic_dev_modify(modify_arg->vm_vnic_id,
+ modify_arg->vm_modify_mask, modify_arg->vm_mac_addr_type,
+ modify_arg->vm_mac_len, modify_arg->vm_mac_addr,
+ modify_arg->vm_mac_slot, &modify_arg->vm_resource_props));
}
-typedef struct vnic_ioc_info_state {
- uint32_t bytes_left;
- uchar_t *where;
- int mode;
-} vnic_ioc_info_state_t;
-
+/* ARGSUSED */
static int
-vnic_ioc_info_new_vnic(void *arg, datalink_id_t id,
- vnic_mac_addr_type_t addr_type, uint_t mac_len, uint8_t *mac_addr,
- datalink_id_t linkid)
+vnic_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
- vnic_ioc_info_state_t *state = arg;
- /*LINTED*/
- vnic_ioc_info_vnic_t *vn = (vnic_ioc_info_vnic_t *)state->where;
-
- if (state->bytes_left < sizeof (*vn))
- return (ENOSPC);
-
- vn->vn_vnic_id = id;
- vn->vn_link_id = linkid;
- vn->vn_mac_addr_type = addr_type;
- vn->vn_mac_len = mac_len;
- if (ddi_copyout(mac_addr, &(vn->vn_mac_addr), mac_len,
- state->mode) != 0)
- return (EFAULT);
-
- state->where += sizeof (*vn);
- state->bytes_left -= sizeof (*vn);
+ vnic_ioc_delete_t *delete_arg = karg;
- return (0);
+ return (vnic_dev_delete(delete_arg->vd_vnic_id, 0));
}
/* ARGSUSED */
static int
-vnic_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred)
+vnic_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
- vnic_ioc_info_t *info_argp = karg;
- uint32_t nvnics;
- datalink_id_t vnic_id, linkid;
- vnic_ioc_info_state_t state;
-
- /*
- * ID of the vnic to return or vnic device.
- * If zero, the call returns information
- * regarding all vnics currently defined.
- */
- vnic_id = info_argp->vi_vnic_id;
- linkid = info_argp->vi_linkid;
-
- state.bytes_left = info_argp->vi_size;
- state.where = (uchar_t *)(arg + sizeof (vnic_ioc_info_t));
- state.mode = mode;
-
- return (vnic_info(&nvnics, vnic_id, linkid, &state,
- vnic_ioc_info_new_vnic));
+ vnic_ioc_info_t *info_arg = karg;
+
+ return (vnic_info(&info_arg->vi_info));
}
diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c
index 7d98003a17..b76ddf678f 100644
--- a/usr/src/uts/common/io/vnic/vnic_dev.c
+++ b/usr/src/uts/common/io/vnic/vnic_dev.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
@@ -43,35 +41,50 @@
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/mac.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#include <sys/dls.h>
#include <sys/pattr.h>
+#include <sys/time.h>
+#include <sys/vlan.h>
#include <sys/vnic.h>
#include <sys/vnic_impl.h>
-#include <sys/gld.h>
-#include <inet/ip.h>
+#include <sys/mac_flow_impl.h>
#include <inet/ip_impl.h>
+/*
+ * Note that for best performance, the VNIC is a passthrough design.
+ * Each VNIC corresponds to a MAC client of the underlying MAC (lower MAC).
+ * This MAC client is opened by the VNIC driver at VNIC creation,
+ * and closed when the VNIC is deleted.
+ * When a MAC client of the VNIC itself opens a VNIC, the MAC layer
+ * (upper MAC) detects that the MAC being opened is a VNIC. Instead
+ * of allocating a new MAC client, it asks the VNIC driver to return
+ * the lower MAC client handle associated with the VNIC, and that handle
+ * is returned to the upper MAC client directly. This gives upper
+ * MAC clients of the VNIC direct access to the lower MAC client
+ * for both the control path and the data path.
+ *
+ * Due to this passthrough, some of the entry points exported by the
+ * VNIC driver are never directly invoked. These entry points include
+ * vnic_m_start, vnic_m_stop, vnic_m_promisc, vnic_m_multicst, etc.
+ */
+
static int vnic_m_start(void *);
static void vnic_m_stop(void *);
static int vnic_m_promisc(void *, boolean_t);
static int vnic_m_multicst(void *, boolean_t, const uint8_t *);
static int vnic_m_unicst(void *, const uint8_t *);
static int vnic_m_stat(void *, uint_t, uint64_t *);
-static void vnic_m_resources(void *);
+static void vnic_m_ioctl(void *, queue_t *, mblk_t *);
static mblk_t *vnic_m_tx(void *, mblk_t *);
static boolean_t vnic_m_capab_get(void *, mac_capab_t, void *);
-static void vnic_mac_free(vnic_mac_t *);
-static uint_t vnic_info_walker(mod_hash_key_t, mod_hash_val_t *, void *);
static void vnic_notify_cb(void *, mac_notify_type_t);
-static int vnic_modify_mac_addr(vnic_t *, uint_t, uchar_t *);
-static mblk_t *vnic_active_tx(void *, mblk_t *);
-static int vnic_promisc_set(vnic_t *, boolean_t);
static kmem_cache_t *vnic_cache;
-static kmem_cache_t *vnic_mac_cache;
static krwlock_t vnic_lock;
-static kmutex_t vnic_mac_lock;
static uint_t vnic_count;
/* hash of VNICs (vnic_t's), keyed by VNIC id */
@@ -79,39 +92,7 @@ static mod_hash_t *vnic_hash;
#define VNIC_HASHSZ 64
#define VNIC_HASH_KEY(vnic_id) ((mod_hash_key_t)(uintptr_t)vnic_id)
-/*
- * Hash of underlying open MACs (vnic_mac_t's), keyed by the string
- * "<device name><instance number>/<port number>".
- */
-static mod_hash_t *vnic_mac_hash;
-#define VNIC_MAC_HASHSZ 64
-
-#define VNIC_MAC_REFHOLD(va) { \
- ASSERT(MUTEX_HELD(&vnic_mac_lock)); \
- (va)->va_refs++; \
- ASSERT((va)->va_refs != 0); \
-}
-
-#define VNIC_MAC_REFRELE(va) { \
- ASSERT(MUTEX_HELD(&vnic_mac_lock)); \
- ASSERT((va)->va_refs != 0); \
- if (--((va)->va_refs) == 0) \
- vnic_mac_free(va); \
-}
-
-static uchar_t vnic_brdcst_mac[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
-
-/* used by vnic_walker */
-typedef struct vnic_info_state {
- datalink_id_t vs_vnic_id;
- datalink_id_t vs_linkid;
- boolean_t vs_vnic_found;
- vnic_info_new_vnic_fn_t vs_new_vnic_fn;
- void *vs_fn_arg;
- int vs_rc;
-} vnic_info_state_t;
-
-#define VNIC_M_CALLBACK_FLAGS (MC_RESOURCES | MC_GETCAPAB)
+#define VNIC_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB)
static mac_callbacks_t vnic_m_callbacks = {
VNIC_M_CALLBACK_FLAGS,
@@ -122,54 +103,21 @@ static mac_callbacks_t vnic_m_callbacks = {
vnic_m_multicst,
vnic_m_unicst,
vnic_m_tx,
- vnic_m_resources,
- NULL, /* m_ioctl */
+ vnic_m_ioctl,
vnic_m_capab_get
};
-/* ARGSUSED */
-static int
-vnic_mac_ctor(void *buf, void *arg, int kmflag)
-{
- vnic_mac_t *vnic_mac = buf;
-
- bzero(vnic_mac, sizeof (vnic_mac_t));
- rw_init(&vnic_mac->va_bcast_grp_lock, NULL, RW_DRIVER, NULL);
- rw_init(&vnic_mac->va_promisc_lock, NULL, RW_DRIVER, NULL);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-vnic_mac_dtor(void *buf, void *arg)
-{
- vnic_mac_t *vnic_mac = buf;
-
- rw_destroy(&vnic_mac->va_promisc_lock);
- rw_destroy(&vnic_mac->va_bcast_grp_lock);
-}
-
void
vnic_dev_init(void)
{
vnic_cache = kmem_cache_create("vnic_cache",
sizeof (vnic_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
- vnic_mac_cache = kmem_cache_create("vnic_mac_cache",
- sizeof (vnic_mac_t), 0, vnic_mac_ctor, vnic_mac_dtor,
- NULL, NULL, NULL, 0);
-
vnic_hash = mod_hash_create_idhash("vnic_hash",
VNIC_HASHSZ, mod_hash_null_valdtor);
- vnic_mac_hash = mod_hash_create_idhash("vnic_mac_hash",
- VNIC_MAC_HASHSZ, mod_hash_null_valdtor);
-
rw_init(&vnic_lock, NULL, RW_DEFAULT, NULL);
- mutex_init(&vnic_mac_lock, NULL, MUTEX_DEFAULT, NULL);
-
vnic_count = 0;
}
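The simplified vnic_dev_init() keeps a single id-keyed mod_hash, so vnic_dev_delete() and vnic_dev_modify() can find a VNIC by datalink id with one probe. A small sketch of that usage, assuming the usual kernel headers; names are illustrative:

	#include <sys/types.h>
	#include <sys/modhash.h>

	static mod_hash_t *example_hash;

	static void
	example_init(void)
	{
		example_hash = mod_hash_create_idhash("example_hash", 64,
		    mod_hash_null_valdtor);
	}

	static int
	example_insert(uint32_t id, void *val)	/* datalink ids are 32-bit */
	{
		return (mod_hash_insert(example_hash,
		    (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)val));
	}

	static void *
	example_lookup(uint32_t id)
	{
		mod_hash_val_t val;

		if (mod_hash_find(example_hash,
		    (mod_hash_key_t)(uintptr_t)id, &val) != 0)
			return (NULL);
		return ((void *)val);
	}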
@@ -178,11 +126,8 @@ vnic_dev_fini(void)
{
ASSERT(vnic_count == 0);
- mutex_destroy(&vnic_mac_lock);
rw_destroy(&vnic_lock);
- mod_hash_destroy_idhash(vnic_mac_hash);
mod_hash_destroy_idhash(vnic_hash);
- kmem_cache_destroy(vnic_mac_cache);
kmem_cache_destroy(vnic_cache);
}
@@ -192,526 +137,162 @@ vnic_dev_count(void)
return (vnic_count);
}
-static int
-vnic_mac_open(datalink_id_t linkid, vnic_mac_t **vmp)
-{
- int err;
- vnic_mac_t *vnic_mac = NULL;
- const mac_info_t *mip;
-
- *vmp = NULL;
-
- mutex_enter(&vnic_mac_lock);
-
- err = mod_hash_find(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid,
- (mod_hash_val_t *)&vnic_mac);
- if (err == 0) {
- /* this MAC is already opened, increment reference count */
- VNIC_MAC_REFHOLD(vnic_mac);
- mutex_exit(&vnic_mac_lock);
- *vmp = vnic_mac;
- return (0);
- }
-
- vnic_mac = kmem_cache_alloc(vnic_mac_cache, KM_SLEEP);
- if ((err = mac_open_by_linkid(linkid, &vnic_mac->va_mh)) != 0) {
- vnic_mac->va_mh = NULL;
- goto bail;
- }
-
- /*
- * For now, we do not support VNICs over legacy drivers. This will
- * soon be changed.
- */
- if (mac_is_legacy(vnic_mac->va_mh)) {
- err = ENOTSUP;
- goto bail;
- }
-
- /* only ethernet support, for now */
- mip = mac_info(vnic_mac->va_mh);
- if (mip->mi_media != DL_ETHER) {
- err = ENOTSUP;
- goto bail;
- }
- if (mip->mi_media != mip->mi_nativemedia) {
- err = ENOTSUP;
- goto bail;
- }
-
- vnic_mac->va_linkid = linkid;
-
- /* add entry to hash table */
- err = mod_hash_insert(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid,
- (mod_hash_val_t)vnic_mac);
- ASSERT(err == 0);
-
- /* initialize the flow table associated with lower MAC */
- vnic_mac->va_addr_len = ETHERADDRL;
- (void) vnic_classifier_flow_tab_init(vnic_mac, vnic_mac->va_addr_len,
- KM_SLEEP);
-
- vnic_mac->va_txinfo = mac_vnic_tx_get(vnic_mac->va_mh);
- vnic_mac->va_notify_hdl = mac_notify_add(vnic_mac->va_mh,
- vnic_notify_cb, vnic_mac);
-
- VNIC_MAC_REFHOLD(vnic_mac);
- *vmp = vnic_mac;
- mutex_exit(&vnic_mac_lock);
- return (0);
-
-bail:
- if (vnic_mac != NULL) {
- if (vnic_mac->va_mh != NULL)
- mac_close(vnic_mac->va_mh);
- kmem_cache_free(vnic_mac_cache, vnic_mac);
+static vnic_ioc_diag_t
+vnic_mac2vnic_diag(mac_diag_t diag)
+{
+ switch (diag) {
+ case MAC_DIAG_MACADDR_NIC:
+ return (VNIC_IOC_DIAG_MACADDR_NIC);
+ case MAC_DIAG_MACADDR_INUSE:
+ return (VNIC_IOC_DIAG_MACADDR_INUSE);
+ case MAC_DIAG_MACADDR_INVALID:
+ return (VNIC_IOC_DIAG_MACADDR_INVALID);
+ case MAC_DIAG_MACADDRLEN_INVALID:
+ return (VNIC_IOC_DIAG_MACADDRLEN_INVALID);
+ case MAC_DIAG_MACFACTORYSLOTINVALID:
+ return (VNIC_IOC_DIAG_MACFACTORYSLOTINVALID);
+ case MAC_DIAG_MACFACTORYSLOTUSED:
+ return (VNIC_IOC_DIAG_MACFACTORYSLOTUSED);
+ case MAC_DIAG_MACFACTORYSLOTALLUSED:
+ return (VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED);
+ case MAC_DIAG_MACFACTORYNOTSUP:
+ return (VNIC_IOC_DIAG_MACFACTORYNOTSUP);
+ case MAC_DIAG_MACPREFIX_INVALID:
+ return (VNIC_IOC_DIAG_MACPREFIX_INVALID);
+ case MAC_DIAG_MACPREFIXLEN_INVALID:
+ return (VNIC_IOC_DIAG_MACPREFIXLEN_INVALID);
+ case MAC_DIAG_MACNO_HWRINGS:
+ return (VNIC_IOC_DIAG_NO_HWRINGS);
+ default:
+ return (VNIC_IOC_DIAG_NONE);
}
- mutex_exit(&vnic_mac_lock);
- return (err);
}
-/*
- * Create a new flow for the active MAC client sharing the NIC
- * with the VNICs. This allows the unicast packets for that NIC
- * to be classified and passed up to the active MAC client. It
- * also allows packets sent from a VNIC to the active link to
- * be classified by the VNIC transmit function and delivered via
- * the MAC module locally. Returns B_TRUE on success, B_FALSE on
- * failure.
- */
static int
-vnic_init_active_rx(vnic_mac_t *vnic_mac)
-{
- uchar_t nic_mac_addr[MAXMACADDRLEN];
-
- if (vnic_mac->va_active_flow != NULL)
- return (B_TRUE);
-
- mac_unicst_get(vnic_mac->va_mh, nic_mac_addr);
-
- vnic_mac->va_active_flow = vnic_classifier_flow_create(
- vnic_mac->va_addr_len, nic_mac_addr, NULL, B_TRUE, KM_SLEEP);
-
- vnic_classifier_flow_add(vnic_mac, vnic_mac->va_active_flow,
- (vnic_rx_fn_t)mac_active_rx, vnic_mac->va_mh, NULL);
- return (B_TRUE);
-}
-
-static void
-vnic_fini_active_rx(vnic_mac_t *vnic_mac)
-{
- if (vnic_mac->va_active_flow == NULL)
- return;
-
- vnic_classifier_flow_remove(vnic_mac, vnic_mac->va_active_flow);
- vnic_classifier_flow_destroy(vnic_mac->va_active_flow);
- vnic_mac->va_active_flow = NULL;
-}
-
-static void
-vnic_update_active_rx(vnic_mac_t *vnic_mac)
-{
- if (vnic_mac->va_active_flow == NULL)
- return;
-
- vnic_fini_active_rx(vnic_mac);
- (void) vnic_init_active_rx(vnic_mac);
-}
-
-/*
- * Copy an mblk, preserving its hardware checksum flags.
- */
-mblk_t *
-vnic_copymsg_cksum(mblk_t *mp)
-{
- mblk_t *mp1;
- uint32_t start, stuff, end, value, flags;
-
- mp1 = copymsg(mp);
- if (mp1 == NULL)
- return (NULL);
-
- hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
- (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
- flags, KM_NOSLEEP);
-
- return (mp1);
-}
-
-/*
- * Copy an mblk chain, preserving the hardware checksum flags of the
- * individual mblks.
- */
-mblk_t *
-vnic_copymsgchain_cksum(mblk_t *mp)
-{
- mblk_t *nmp = NULL;
- mblk_t **nmpp = &nmp;
-
- for (; mp != NULL; mp = mp->b_next) {
- if ((*nmpp = vnic_copymsg_cksum(mp)) == NULL) {
- freemsgchain(nmp);
- return (NULL);
- }
-
- nmpp = &((*nmpp)->b_next);
- }
-
- return (nmp);
-}
-
-
-/*
- * Process the specified mblk chain for proper handling of hardware
- * checksum offload. This routine is invoked for loopback VNIC traffic.
- * The function handles a NULL mblk chain passed as argument.
- */
-mblk_t *
-vnic_fix_cksum(mblk_t *mp_chain)
+vnic_unicast_add(vnic_t *vnic, vnic_mac_addr_type_t vnic_addr_type,
+ int *addr_slot, uint_t prefix_len, int *addr_len_ptr_arg,
+ uint8_t *mac_addr_arg, uint16_t flags, vnic_ioc_diag_t *diag,
+ uint16_t vid)
{
- mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
- uint32_t flags, start, stuff, end, value;
-
- for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
- uint16_t len;
- uint32_t offset;
- struct ether_header *ehp;
- uint16_t sap;
+ mac_diag_t mac_diag;
+ uint16_t mac_flags = 0;
+ int err;
+ uint_t addr_len;
- hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
- &flags);
- if (flags == 0)
- continue;
+ if (flags & VNIC_IOC_CREATE_NODUPCHECK)
+ mac_flags |= MAC_UNICAST_NODUPCHECK;
+ switch (vnic_addr_type) {
+ case VNIC_MAC_ADDR_TYPE_FIXED:
/*
- * Since the processing of checksum offload for loopback
- * traffic requires modification of the packet contents,
-		 * ensure that we are always modifying our own copy.
+ * The MAC address value to assign to the VNIC
+ * is already provided in mac_addr_arg. addr_len_ptr_arg
+ * already contains the MAC address length.
*/
- if (DB_REF(mp) > 1) {
- mp1 = copymsg(mp);
- if (mp1 == NULL)
- continue;
- mp1->b_next = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- if (prev != NULL)
- prev->b_next = mp1;
- else
- new_chain = mp1;
- mp = mp1;
- }
+ break;
+ case VNIC_MAC_ADDR_TYPE_RANDOM:
/*
- * Ethernet, and optionally VLAN header.
+ * Random MAC address. There are two sub-cases:
+ *
+ * 1 - If mac_len == 0, a new MAC address is generated.
+		 * The length of the MAC address to be generated depends
+ * on the type of MAC used. The prefix to use for the MAC
+ * address is stored in the most significant bytes
+ * of the mac_addr argument, and its length is specified
+ * by the mac_prefix_len argument. This prefix can
+		 * correspond to an IEEE OUI in the case of Ethernet,
+ * for example.
+ *
+ * 2 - If mac_len > 0, the address was already picked
+ * randomly, and is now passed back during VNIC
+ * re-creation. The mac_addr argument contains the MAC
+ * address that was generated. We distinguish this
+ * case from the fixed MAC address case, since we
+ * want the user consumers to know, when they query
+ * the list of VNICs, that a VNIC was assigned a
+		 * random MAC address vs. being assigned a fixed address
+ * specified by the user.
*/
- /*LINTED*/
- ehp = (struct ether_header *)mp->b_rptr;
- if (ntohs(ehp->ether_type) == VLAN_TPID) {
- struct ether_vlan_header *evhp;
-
- ASSERT(MBLKL(mp) >=
- sizeof (struct ether_vlan_header));
- /*LINTED*/
- evhp = (struct ether_vlan_header *)mp->b_rptr;
- sap = ntohs(evhp->ether_type);
- offset = sizeof (struct ether_vlan_header);
- } else {
- sap = ntohs(ehp->ether_type);
- offset = sizeof (struct ether_header);
- }
- if (MBLKL(mp) <= offset) {
- offset -= MBLKL(mp);
- if (mp->b_cont == NULL) {
- /* corrupted packet, skip it */
- if (prev != NULL)
- prev->b_next = mp->b_next;
- else
- new_chain = mp->b_next;
- mp1 = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- mp = mp1;
- continue;
- }
- mp = mp->b_cont;
- }
-
- if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
- ipha_t *ipha = NULL;
-
- /*
- * In order to compute the full and header
- * checksums, we need to find and parse
- * the IP and/or ULP headers.
- */
-
- sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
-
- /*
- * IP header.
- */
- if (sap != ETHERTYPE_IP)
- continue;
-
- ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
- /*LINTED*/
- ipha = (ipha_t *)(mp->b_rptr + offset);
-
- if (flags & HCK_FULLCKSUM) {
- ipaddr_t src, dst;
- uint32_t cksum;
- uint16_t *up;
- uint8_t proto;
-
- /*
- * Pointer to checksum field in ULP header.
- */
- proto = ipha->ipha_protocol;
- ASSERT(ipha->ipha_version_and_hdr_length ==
- IP_SIMPLE_HDR_VERSION);
- if (proto == IPPROTO_TCP) {
- /*LINTED*/
- up = IPH_TCPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- } else {
- ASSERT(proto == IPPROTO_UDP);
- /*LINTED*/
- up = IPH_UDPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- }
-
- /*
- * Pseudo-header checksum.
- */
- src = ipha->ipha_src;
- dst = ipha->ipha_dst;
- len = ntohs(ipha->ipha_length) -
- IP_SIMPLE_HDR_LENGTH;
-
- cksum = (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
- cksum += htons(len);
-
- /*
- * The checksum value stored in the packet needs
- * to be correct. Compute it here.
- */
- *up = 0;
- cksum += (((proto) == IPPROTO_UDP) ?
- IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
- cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
- offset, cksum);
- *(up) = (uint16_t)(cksum ? cksum : ~cksum);
-
- flags |= HCK_FULLCKSUM_OK;
- value = 0xffff;
- }
-
- if (flags & HCK_IPV4_HDRCKSUM) {
- ASSERT(ipha != NULL);
- ipha->ipha_hdr_checksum =
- (uint16_t)ip_csum_hdr(ipha);
- }
- }
-
- if (flags & HCK_PARTIALCKSUM) {
- uint16_t *up, partial, cksum;
- uchar_t *ipp; /* ptr to beginning of IP header */
-
- if (mp->b_cont != NULL) {
- mblk_t *mp1;
-
- mp1 = msgpullup(mp, offset + end);
- if (mp1 == NULL)
- continue;
- mp1->b_next = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- if (prev != NULL)
- prev->b_next = mp1;
- else
- new_chain = mp1;
- mp = mp1;
- }
-
- ipp = mp->b_rptr + offset;
- /*LINTED*/
- up = (uint16_t *)((uchar_t *)ipp + stuff);
- partial = *up;
- *up = 0;
-
- cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
- end - start, partial);
- cksum = ~cksum;
- *up = cksum ? cksum : ~cksum;
+ /*
+ * If it's a pre-generated address, we're done. mac_addr_arg
+ * and addr_len_ptr_arg already contain the MAC address
+ * value and length.
+ */
+ if (*addr_len_ptr_arg > 0)
+ break;
- /*
- * Since we already computed the whole checksum,
- * indicate to the stack that it has already
- * been verified by the hardware.
- */
- flags &= ~HCK_PARTIALCKSUM;
- flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
- value = 0xffff;
+ /* generate a new random MAC address */
+ if ((err = mac_addr_random(vnic->vn_mch,
+ prefix_len, mac_addr_arg, &mac_diag)) != 0) {
+ *diag = vnic_mac2vnic_diag(mac_diag);
+ return (err);
}
+ *addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh);
+ break;
- (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
- value, flags, KM_NOSLEEP);
- }
-
- return (new_chain);
-}
-
-static void
-vnic_mac_close(vnic_mac_t *vnic_mac)
-{
- mutex_enter(&vnic_mac_lock);
- VNIC_MAC_REFRELE(vnic_mac);
- mutex_exit(&vnic_mac_lock);
-}
-
-static void
-vnic_mac_free(vnic_mac_t *vnic_mac)
-{
- mod_hash_val_t val;
-
- ASSERT(MUTEX_HELD(&vnic_mac_lock));
- vnic_fini_active_rx(vnic_mac);
- mac_notify_remove(vnic_mac->va_mh, vnic_mac->va_notify_hdl);
- if (vnic_mac->va_mac_set) {
- vnic_mac->va_mac_set = B_FALSE;
- mac_vnic_clear(vnic_mac->va_mh);
- }
- vnic_classifier_flow_tab_fini(vnic_mac);
- mac_close(vnic_mac->va_mh);
-
- (void) mod_hash_remove(vnic_mac_hash,
- (mod_hash_key_t)(uintptr_t)vnic_mac->va_linkid, &val);
- ASSERT(vnic_mac == (vnic_mac_t *)val);
-
- kmem_cache_free(vnic_mac_cache, vnic_mac);
-}
-
-/*
- * Initial VNIC receive routine. Invoked for packets that are steered
- * to a VNIC but the VNIC has not been started yet.
- */
-/* ARGSUSED */
-static void
-vnic_rx_initial(void *arg1, void *arg2, mblk_t *mp_chain)
-{
- vnic_t *vnic = arg1;
- mblk_t *mp;
-
- /* update stats */
- for (mp = mp_chain; mp != NULL; mp = mp->b_next)
- vnic->vn_stat_ierrors++;
- freemsgchain(mp_chain);
-}
-
-/*
- * VNIC receive routine invoked after the classifier for the VNIC
- * has been initialized and the VNIC has been started.
- */
-/* ARGSUSED */
-void
-vnic_rx(void *arg1, void *arg2, mblk_t *mp_chain)
-{
- vnic_t *vnic = arg1;
- mblk_t *mp;
-
- /* update stats */
- for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
- vnic->vn_stat_ipackets++;
- vnic->vn_stat_rbytes += msgdsize(mp);
- }
-
- /* pass packet up */
- mac_rx(vnic->vn_mh, NULL, mp_chain);
-}
-
-/*
- * Routine to create a MAC-based VNIC. Adds the passed MAC address
- * to an unused slot in the NIC if one is available. Otherwise it
- * sets the NIC in promiscuous mode and assigns the MAC address to
- * a Rx ring if available or a soft ring.
- */
-static int
-vnic_add_unicstaddr(vnic_t *vnic, mac_multi_addr_t *maddr)
-{
- vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
- int err;
-
- if (mac_unicst_verify(vnic_mac->va_mh, maddr->mma_addr,
- maddr->mma_addrlen) == B_FALSE)
- return (EINVAL);
-
- if (mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_MULTIADDRESS,
- &(vnic->vn_mma_capab))) {
- if (vnic->vn_maddr_naddrfree == 0) {
- /*
- * No free address slots available.
- * Enable promiscuous mode.
- */
- goto set_promisc;
+ case VNIC_MAC_ADDR_TYPE_FACTORY:
+ err = mac_addr_factory_reserve(vnic->vn_mch, addr_slot);
+ if (err != 0) {
+ if (err == EINVAL)
+ *diag = VNIC_IOC_DIAG_MACFACTORYSLOTINVALID;
+ if (err == EBUSY)
+ *diag = VNIC_IOC_DIAG_MACFACTORYSLOTUSED;
+ if (err == ENOSPC)
+ *diag = VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED;
+ return (err);
}
- err = vnic->vn_maddr_add(vnic->vn_maddr_handle, maddr);
- if (err != 0) {
- if (err == ENOSPC) {
- /*
- * There was a race to add addresses
- * with other multiple address consumers,
- * and we lost out. Use promisc mode.
- */
- goto set_promisc;
- }
+ mac_addr_factory_value(vnic->vn_lower_mh, *addr_slot,
+ mac_addr_arg, &addr_len, NULL, NULL);
+ *addr_len_ptr_arg = addr_len;
+ break;
- return (err);
+ case VNIC_MAC_ADDR_TYPE_AUTO:
+ /* first try to allocate a factory MAC address */
+ err = mac_addr_factory_reserve(vnic->vn_mch, addr_slot);
+ if (err == 0) {
+ mac_addr_factory_value(vnic->vn_lower_mh, *addr_slot,
+ mac_addr_arg, &addr_len, NULL, NULL);
+ vnic_addr_type = VNIC_MAC_ADDR_TYPE_FACTORY;
+ *addr_len_ptr_arg = addr_len;
+ break;
}
- vnic->vn_slot_id = maddr->mma_slot;
- vnic->vn_multi_mac = B_TRUE;
- } else {
/*
- * Either multiple MAC address support is not
- * available or all available addresses have
- * been used up.
+		 * Allocating a factory MAC address failed; generate a
+ * random MAC address instead.
*/
- set_promisc:
- if ((err = mac_promisc_set(vnic_mac->va_mh, B_TRUE,
- MAC_DEVPROMISC)) != 0) {
+ if ((err = mac_addr_random(vnic->vn_mch,
+ prefix_len, mac_addr_arg, &mac_diag)) != 0) {
+ *diag = vnic_mac2vnic_diag(mac_diag);
return (err);
}
-
- vnic->vn_promisc_mac = B_TRUE;
+ *addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh);
+ vnic_addr_type = VNIC_MAC_ADDR_TYPE_RANDOM;
+ break;
+ case VNIC_MAC_ADDR_TYPE_PRIMARY:
+ /*
+		 * We get the address here since we copy it into the
+ * vnic's vn_addr.
+ */
+ mac_unicast_primary_get(vnic->vn_lower_mh, mac_addr_arg);
+ *addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh);
+ mac_flags |= MAC_UNICAST_VNIC_PRIMARY;
+ break;
}
- return (err);
-}
-/*
- * VNIC is getting deleted. Remove the MAC address from the slot.
- * If promiscuous mode was being used, then unset the promiscuous mode.
- */
-static int
-vnic_remove_unicstaddr(vnic_t *vnic)
-{
- vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
- int err;
-
- if (vnic->vn_multi_mac) {
- ASSERT(vnic->vn_promisc_mac == B_FALSE);
- err = vnic->vn_maddr_remove(vnic->vn_maddr_handle,
- vnic->vn_slot_id);
- vnic->vn_multi_mac = B_FALSE;
- }
+ vnic->vn_addr_type = vnic_addr_type;
- if (vnic->vn_promisc_mac) {
- ASSERT(vnic->vn_multi_mac == B_FALSE);
- err = mac_promisc_set(vnic_mac->va_mh, B_FALSE, MAC_DEVPROMISC);
- vnic->vn_promisc_mac = B_FALSE;
+ err = mac_unicast_add(vnic->vn_mch, mac_addr_arg, mac_flags,
+ &vnic->vn_muh, vid, &mac_diag);
+ if (err != 0) {
+ if (vnic_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) {
+ /* release factory MAC address */
+ mac_addr_factory_release(vnic->vn_mch, *addr_slot);
+ }
+ *diag = vnic_mac2vnic_diag(mac_diag);
}
return (err);
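The VNIC_MAC_ADDR_TYPE_RANDOM comment above describes a "prefix plus random fill" layout: the caller supplies the most significant prefix_len bytes (an IEEE OUI, say) and the remainder is generated. The real work is done by mac_addr_random(); the sketch below is only a plausible reconstruction of that layout, with gen_random_mac() a hypothetical helper, not the MAC layer's implementation:

	#include <sys/types.h>
	#include <sys/systm.h>
	#include <sys/debug.h>
	#include <sys/random.h>
	#include <sys/ethernet.h>

	static void
	gen_random_mac(const uint8_t *prefix, uint_t prefix_len,
	    uint8_t addr[ETHERADDRL])
	{
		ASSERT(prefix_len <= ETHERADDRL);
		/* caller-supplied prefix in the most significant bytes */
		bcopy(prefix, addr, prefix_len);
		/* fill the remainder with pseudo-random bytes */
		(void) random_get_pseudo_bytes(addr + prefix_len,
		    ETHERADDRL - prefix_len);
		if (prefix_len == 0) {
			/* no prefix given: synthesize a sane first octet */
			addr[0] &= ~0x01;	/* unicast */
			addr[0] |= 0x02;	/* locally administered */
		}
	}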
@@ -721,21 +302,23 @@ vnic_remove_unicstaddr(vnic_t *vnic)
* Create a new VNIC upon request from administrator.
* Returns 0 on success, an errno on failure.
*/
+/* ARGSUSED */
int
-vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len,
- uchar_t *mac_addr)
+vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
+ vnic_mac_addr_type_t *vnic_addr_type, int *mac_len, uchar_t *mac_addr,
+ int *mac_slot, uint_t mac_prefix_len, uint16_t vid,
+ mac_resource_props_t *mrp, uint32_t flags, vnic_ioc_diag_t *diag)
{
- vnic_t *vnic = NULL;
+ vnic_t *vnic;
mac_register_t *mac;
int err;
- vnic_mac_t *vnic_mac;
- mac_multi_addr_t maddr;
- mac_txinfo_t tx_info;
+ boolean_t is_anchor = ((flags & VNIC_IOC_CREATE_ANCHOR) != 0);
+ char vnic_name[MAXNAMELEN];
+ const mac_info_t *minfop;
+ uint32_t req_hwgrp_flag = ((flags & VNIC_IOC_CREATE_REQ_HWRINGS) != 0) ?
+ MAC_OPEN_FLAGS_REQ_HWRINGS : 0;
- if (mac_len != ETHERADDRL) {
- /* currently only ethernet NICs are supported */
- return (EINVAL);
- }
+ *diag = VNIC_IOC_DIAG_NONE;
rw_enter(&vnic_lock, RW_WRITER);
@@ -753,36 +336,86 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len,
return (ENOMEM);
}
- /* open underlying MAC */
- err = vnic_mac_open(linkid, &vnic_mac);
- if (err != 0) {
- kmem_cache_free(vnic_cache, vnic);
- rw_exit(&vnic_lock);
- return (err);
- }
-
bzero(vnic, sizeof (*vnic));
- vnic->vn_id = vnic_id;
- vnic->vn_vnic_mac = vnic_mac;
+ vnic->vn_id = vnic_id;
+ vnic->vn_link_id = linkid;
vnic->vn_started = B_FALSE;
- vnic->vn_promisc = B_FALSE;
- vnic->vn_multi_mac = B_FALSE;
- vnic->vn_bcast_grp = B_FALSE;
-
- /* set the VNIC MAC address */
- maddr.mma_addrlen = mac_len;
- maddr.mma_slot = 0;
- maddr.mma_flags = 0;
- bcopy(mac_addr, maddr.mma_addr, mac_len);
- if ((err = vnic_add_unicstaddr(vnic, &maddr)) != 0)
- goto bail;
- bcopy(mac_addr, vnic->vn_addr, mac_len);
- /* set the initial VNIC capabilities */
- if (!mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_HCKSUM,
- &vnic->vn_hcksum_txflags))
- vnic->vn_hcksum_txflags = 0;
+ if (!is_anchor) {
+ if (linkid == DATALINK_INVALID_LINKID) {
+ err = EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Open the lower MAC and assign its initial bandwidth and
+ * MAC address. We do this here during VNIC creation and
+ * do not wait until the upper MAC client open so that we
+ * can validate the VNIC creation parameters (bandwidth,
+ * MAC address, etc) and reserve a factory MAC address if
+ * one was requested.
+ */
+ err = mac_open_by_linkid(linkid, &vnic->vn_lower_mh);
+ if (err != 0)
+ goto bail;
+
+ /*
+ * VNIC(vlan) over VNICs(vlans) is not supported.
+ */
+ if (mac_is_vnic(vnic->vn_lower_mh)) {
+ err = EINVAL;
+ goto bail;
+ }
+
+ /* only ethernet support for now */
+ minfop = mac_info(vnic->vn_lower_mh);
+ if (minfop->mi_nativemedia != DL_ETHER) {
+ err = ENOTSUP;
+ goto bail;
+ }
+
+ (void) dls_mgmt_get_linkinfo(vnic_id, vnic_name, NULL, NULL,
+ NULL);
+ err = mac_client_open(vnic->vn_lower_mh, &vnic->vn_mch,
+ vnic_name, MAC_OPEN_FLAGS_IS_VNIC | req_hwgrp_flag);
+ if (err != 0)
+ goto bail;
+
+ if (mrp != NULL) {
+ err = mac_client_set_resources(vnic->vn_mch, mrp);
+ if (err != 0)
+ goto bail;
+ }
+ /* assign a MAC address to the VNIC */
+
+ err = vnic_unicast_add(vnic, *vnic_addr_type, mac_slot,
+ mac_prefix_len, mac_len, mac_addr, flags, diag, vid);
+ if (err != 0) {
+ vnic->vn_muh = NULL;
+ if (diag != NULL && req_hwgrp_flag != 0)
+ *diag = VNIC_IOC_DIAG_NO_HWRINGS;
+ goto bail;
+ }
+
+ /* register to receive notification from underlying MAC */
+ vnic->vn_mnh = mac_notify_add(vnic->vn_lower_mh, vnic_notify_cb,
+ vnic);
+
+ *vnic_addr_type = vnic->vn_addr_type;
+ vnic->vn_addr_len = *mac_len;
+ vnic->vn_vid = vid;
+
+ bcopy(mac_addr, vnic->vn_addr, vnic->vn_addr_len);
+
+ if (vnic->vn_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY)
+ vnic->vn_slot_id = *mac_slot;
+
+ /* set the initial VNIC capabilities */
+ if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_HCKSUM,
+ &vnic->vn_hcksum_txflags))
+ vnic->vn_hcksum_txflags = 0;
+ }
/* register with the MAC module */
if ((mac = mac_alloc(MAC_VERSION)) == NULL)
@@ -795,27 +428,61 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len,
mac->m_src_addr = vnic->vn_addr;
mac->m_callbacks = &vnic_m_callbacks;
- mac_sdu_get(vnic_mac->va_mh, &mac->m_min_sdu, &mac->m_max_sdu);
+ if (!is_anchor) {
+ /*
+		 * If this is a VNIC-based VLAN, then we check the
+ * margin unless it has been created with the force
+ * flag. If we are configuring a VLAN over an etherstub,
+ * we don't check the margin even if force is not set.
+ */
+ if (vid == 0 || (flags & VNIC_IOC_CREATE_FORCE) != 0) {
+ if (vid != VLAN_ID_NONE)
+ vnic->vn_force = B_TRUE;
+ /*
+ * As the current margin size of the underlying mac is
+ * used to determine the margin size of the VNIC
+ * itself, request the underlying mac not to change
+ * to a smaller margin size.
+ */
+ err = mac_margin_add(vnic->vn_lower_mh,
+ &vnic->vn_margin, B_TRUE);
+ ASSERT(err == 0);
+ } else {
+ vnic->vn_margin = VLAN_TAGSZ;
+ err = mac_margin_add(vnic->vn_lower_mh,
+ &vnic->vn_margin, B_FALSE);
+ if (err != 0) {
+ mac_free(mac);
+ if (diag != NULL)
+ *diag = VNIC_IOC_DIAG_MACMARGIN_INVALID;
+ goto bail;
+ }
+ }
+
+ mac_sdu_get(vnic->vn_lower_mh, &mac->m_min_sdu,
+ &mac->m_max_sdu);
+ } else {
+ vnic->vn_margin = VLAN_TAGSZ;
+ mac->m_min_sdu = 0;
+ mac->m_max_sdu = 9000;
+ }
- /*
- * As the current margin size of the underlying mac is used to
- * determine the margin size of the VNIC itself, request the
- * underlying mac not to change to a smaller margin size.
- */
- err = mac_margin_add(vnic_mac->va_mh, &(vnic->vn_margin), B_TRUE);
- if (err != 0)
- goto bail;
mac->m_margin = vnic->vn_margin;
+
err = mac_register(mac, &vnic->vn_mh);
mac_free(mac);
if (err != 0) {
- VERIFY(mac_margin_remove(vnic_mac->va_mh,
+ VERIFY(is_anchor || mac_margin_remove(vnic->vn_lower_mh,
vnic->vn_margin) == 0);
goto bail;
}
+ /* Set the VNIC's MAC in the client */
+ if (!is_anchor)
+ mac_set_upper_mac(vnic->vn_mch, vnic->vn_mh);
+
if ((err = dls_devnet_create(vnic->vn_mh, vnic->vn_id)) != 0) {
- VERIFY(mac_margin_remove(vnic_mac->va_mh,
+ VERIFY(is_anchor || mac_margin_remove(vnic->vn_lower_mh,
vnic->vn_margin) == 0);
(void) mac_unregister(vnic->vn_mh);
goto bail;
@@ -829,69 +496,22 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len,
rw_exit(&vnic_lock);
- /* Create a flow, initialized with the MAC address of the VNIC */
- if ((vnic->vn_flow_ent = vnic_classifier_flow_create(mac_len, mac_addr,
- NULL, B_FALSE, KM_SLEEP)) == NULL) {
- (void) vnic_dev_delete(vnic_id);
- vnic = NULL;
- err = ENOMEM;
- goto bail_unlocked;
- }
-
- vnic_classifier_flow_add(vnic_mac, vnic->vn_flow_ent, vnic_rx_initial,
- vnic, vnic);
-
- /* setup VNIC to receive broadcast packets */
- err = vnic_bcast_add(vnic, vnic_brdcst_mac, MAC_ADDRTYPE_BROADCAST);
- if (err != 0) {
- (void) vnic_dev_delete(vnic_id);
- vnic = NULL;
- goto bail_unlocked;
- }
- vnic->vn_bcast_grp = B_TRUE;
-
- mutex_enter(&vnic_mac_lock);
- if (!vnic_mac->va_mac_set) {
- /*
- * We want to MAC layer to call the VNIC tx outbound
- * routine, so that local broadcast packets sent by
- * the active interface sharing the underlying NIC (if
- * any), can be broadcast to every VNIC.
- */
- tx_info.mt_fn = vnic_active_tx;
- tx_info.mt_arg = vnic_mac;
- if (!mac_vnic_set(vnic_mac->va_mh, &tx_info,
- vnic_m_capab_get, vnic)) {
- mutex_exit(&vnic_mac_lock);
- (void) vnic_dev_delete(vnic_id);
- vnic = NULL;
- err = EBUSY;
- goto bail_unlocked;
- }
- vnic_mac->va_mac_set = B_TRUE;
- }
- mutex_exit(&vnic_mac_lock);
-
- /* allow passing packets to NIC's active MAC client */
- if (!vnic_init_active_rx(vnic_mac)) {
- (void) vnic_dev_delete(vnic_id);
- vnic = NULL;
- err = ENOMEM;
- goto bail_unlocked;
- }
-
return (0);
bail:
- (void) vnic_remove_unicstaddr(vnic);
- vnic_mac_close(vnic_mac);
rw_exit(&vnic_lock);
-
-bail_unlocked:
- if (vnic != NULL) {
- kmem_cache_free(vnic_cache, vnic);
+ if (!is_anchor) {
+ if (vnic->vn_mnh != NULL)
+ (void) mac_notify_remove(vnic->vn_mnh, B_TRUE);
+ if (vnic->vn_muh != NULL)
+ (void) mac_unicast_remove(vnic->vn_mch, vnic->vn_muh);
+ if (vnic->vn_mch != NULL)
+ mac_client_close(vnic->vn_mch, MAC_CLOSE_FLAGS_IS_VNIC);
+ if (vnic->vn_lower_mh != NULL)
+ mac_close(vnic->vn_lower_mh);
}
+ kmem_cache_free(vnic_cache, vnic);
return (err);
}
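The bail path of the new vnic_dev_create() releases only what was actually acquired, in reverse order of acquisition, using each NULL-initialized handle (vn_mnh, vn_muh, vn_mch, vn_lower_mh) as its own "was it obtained" flag. A generic sketch of that unwind idiom, with hypothetical res_open()/res_close() resources:

	#include <sys/types.h>
	#include <sys/errno.h>

	typedef struct res res_t;
	extern int res_open(res_t **);		/* hypothetical acquire */
	extern void res_close(res_t *);		/* hypothetical release */

	static int
	setup_chain(res_t **ap, res_t **bp)
	{
		res_t *a = NULL, *b = NULL;	/* NULL means "not acquired" */
		int err;

		if ((err = res_open(&a)) != 0)
			goto bail;
		if ((err = res_open(&b)) != 0)
			goto bail;
		*ap = a;
		*bp = b;
		return (0);

	bail:
		/* release in reverse order; skip what was never obtained */
		if (b != NULL)
			res_close(b);
		if (a != NULL)
			res_close(a);
		return (err);
	}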
@@ -901,11 +521,10 @@ bail_unlocked:
/* ARGSUSED */
int
vnic_dev_modify(datalink_id_t vnic_id, uint_t modify_mask,
- vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr)
+ vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr,
+ uint_t mac_slot, mac_resource_props_t *mrp)
{
vnic_t *vnic = NULL;
- int rv = 0;
- boolean_t notify_mac_addr = B_FALSE;
rw_enter(&vnic_lock, RW_WRITER);
@@ -915,29 +534,19 @@ vnic_dev_modify(datalink_id_t vnic_id, uint_t modify_mask,
return (ENOENT);
}
- if (modify_mask & VNIC_IOC_MODIFY_ADDR) {
- rv = vnic_modify_mac_addr(vnic, mac_len, mac_addr);
- if (rv == 0)
- notify_mac_addr = B_TRUE;
- }
-
rw_exit(&vnic_lock);
- if (notify_mac_addr)
- mac_unicst_update(vnic->vn_mh, mac_addr);
-
- return (rv);
+ return (0);
}
+/* ARGSUSED */
int
-vnic_dev_delete(datalink_id_t vnic_id)
+vnic_dev_delete(datalink_id_t vnic_id, uint32_t flags)
{
vnic_t *vnic = NULL;
mod_hash_val_t val;
- vnic_flow_t *flent;
datalink_id_t tmpid;
int rc;
- vnic_mac_t *vnic_mac;
rw_enter(&vnic_lock, RW_WRITER);
@@ -947,7 +556,7 @@ vnic_dev_delete(datalink_id_t vnic_id)
return (ENOENT);
}
- if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid)) != 0) {
+ if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid, B_TRUE)) != 0) {
rw_exit(&vnic_lock);
return (rc);
}
@@ -957,317 +566,136 @@ vnic_dev_delete(datalink_id_t vnic_id)
/*
* We cannot unregister the MAC yet. Unregistering would
* free up mac_impl_t which should not happen at this time.
- * Packets could be entering vnic_rx() through the
- * flow entry and so mac_impl_t cannot be NULL. So disable
- * mac_impl_t by calling mac_disable(). This will prevent any
- * new claims on mac_impl_t.
+ * So disable mac_impl_t by calling mac_disable(). This will prevent
+ * any new claims on mac_impl_t.
*/
- if (mac_disable(vnic->vn_mh) != 0) {
+ if ((rc = mac_disable(vnic->vn_mh)) != 0) {
(void) dls_devnet_create(vnic->vn_mh, vnic_id);
rw_exit(&vnic_lock);
- return (EBUSY);
+ return (rc);
}
(void) mod_hash_remove(vnic_hash, VNIC_HASH_KEY(vnic_id), &val);
ASSERT(vnic == (vnic_t *)val);
-
- if (vnic->vn_bcast_grp)
- (void) vnic_bcast_delete(vnic, vnic_brdcst_mac);
-
- flent = vnic->vn_flow_ent;
- if (flent != NULL) {
- /*
- * vnic_classifier_flow_destroy() ensures that the
- * flow is no longer used.
- */
- vnic_classifier_flow_remove(vnic->vn_vnic_mac, flent);
- vnic_classifier_flow_destroy(flent);
- }
-
- rc = mac_margin_remove(vnic->vn_vnic_mac->va_mh, vnic->vn_margin);
- ASSERT(rc == 0);
- rc = mac_unregister(vnic->vn_mh);
- ASSERT(rc == 0);
- (void) vnic_remove_unicstaddr(vnic);
- vnic_mac = vnic->vn_vnic_mac;
- kmem_cache_free(vnic_cache, vnic);
vnic_count--;
rw_exit(&vnic_lock);
- vnic_mac_close(vnic_mac);
- return (0);
-}
-
-/*
- * For the specified packet chain, return a sub-chain to be sent
- * and the transmit function to be used to send the packet. Also
- * return a pointer to the sub-chain of packets that should
- * be re-classified. If the function returns NULL, the packet
- * should be sent using the underlying NIC.
- */
-static vnic_flow_t *
-vnic_classify(vnic_mac_t *vnic_mac, mblk_t *mp, mblk_t **mp_chain_rest)
-{
- vnic_flow_t *flow_ent;
-
- /* one packet at a time */
- *mp_chain_rest = mp->b_next;
- mp->b_next = NULL;
-
- /* do classification on the packet */
- flow_ent = vnic_classifier_get_flow(vnic_mac, mp);
- return (flow_ent);
-}
-
-/*
- * Send a packet chain to a local VNIC or an active MAC client.
- */
-static void
-vnic_local_tx(vnic_mac_t *vnic_mac, vnic_flow_t *flow_ent, mblk_t *mp_chain)
-{
- mblk_t *mp1;
- const vnic_flow_fn_info_t *fn_info;
- vnic_t *vnic;
-
- if (!vnic_classifier_is_active(flow_ent) &&
- mac_promisc_get(vnic_mac->va_mh, MAC_PROMISC)) {
- /*
-	 * If the MAC is in promiscuous mode,
-	 * send a copy to the active client.
- */
- if ((mp1 = vnic_copymsgchain_cksum(mp_chain)) == NULL)
- goto sendit;
- if ((mp1 = vnic_fix_cksum(mp1)) == NULL)
- goto sendit;
- mac_active_rx(vnic_mac->va_mh, NULL, mp1);
- }
-sendit:
- fn_info = vnic_classifier_get_fn_info(flow_ent);
/*
- * If the vnic to which we would deliver this packet is in
- * promiscuous mode then it already received the packet via
- * vnic_promisc_rx().
- *
- * XXX assumes that ff_arg2 is a vnic_t pointer if it is
- * non-NULL (currently always true).
+	 * XXX-nicolas: we shouldn't have a void cast here; if it's
+ * expected that the function will never fail, then we should
+ * have an ASSERT().
*/
- vnic = (vnic_t *)fn_info->ff_arg2;
- if ((vnic != NULL) && vnic->vn_promisc)
- freemsg(mp_chain);
- else if ((mp1 = vnic_fix_cksum(mp_chain)) != NULL)
- (fn_info->ff_fn)(fn_info->ff_arg1, fn_info->ff_arg2, mp1);
-}
+ (void) mac_unregister(vnic->vn_mh);
-/*
- * This function is invoked when a MAC client needs to send a packet
- * to a NIC which is shared by VNICs. It is passed to the MAC layer
- * by a call to mac_vnic_set() when the NIC is opened, and is returned
- * to MAC clients by mac_tx_get() when VNICs are present.
- */
-mblk_t *
-vnic_active_tx(void *arg, mblk_t *mp_chain)
-{
- vnic_mac_t *vnic_mac = arg;
- mblk_t *mp, *extra_mp = NULL;
- vnic_flow_t *flow_ent;
- void *flow_cookie;
- const mac_txinfo_t *mtp = vnic_mac->va_txinfo;
-
- for (mp = mp_chain; mp != NULL; mp = extra_mp) {
- mblk_t *next;
-
- next = mp->b_next;
- mp->b_next = NULL;
-
- vnic_promisc_rx(vnic_mac, (vnic_t *)-1, mp);
-
- flow_ent = vnic_classify(vnic_mac, mp, &extra_mp);
- ASSERT(extra_mp == NULL);
- extra_mp = next;
-
- if (flow_ent != NULL) {
- flow_cookie = vnic_classifier_get_client_cookie(
- flow_ent);
- if (flow_cookie != NULL) {
- /*
- * Send a copy to every VNIC defined on the
- * interface, as well as the underlying MAC.
- */
- vnic_bcast_send(flow_cookie, (vnic_t *)-1, mp);
- } else {
- /*
- * loopback the packet to a local VNIC or
- * an active MAC client.
- */
- vnic_local_tx(vnic_mac, flow_ent, mp);
- }
- VNIC_FLOW_REFRELE(flow_ent);
- mp_chain = NULL;
- } else {
- /*
- * Non-VNIC destination, send via the underlying
- * NIC. In order to avoid a recursive call
- * to this function, we ensured that mtp points
- * to the unerlying NIC transmit function
- * by inilizating through mac_vnic_tx_get().
- */
- mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
- if (mp_chain != NULL)
- break;
+ if (vnic->vn_lower_mh != NULL) {
+ /*
+		 * Check whether the MAC address for the VNIC was obtained
+		 * from the factory MAC addresses; if so, release it.
+ */
+ if (vnic->vn_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) {
+ (void) mac_addr_factory_release(vnic->vn_mch,
+ vnic->vn_slot_id);
}
+ (void) mac_margin_remove(vnic->vn_lower_mh, vnic->vn_margin);
+ (void) mac_notify_remove(vnic->vn_mnh, B_TRUE);
+ (void) mac_unicast_remove(vnic->vn_mch, vnic->vn_muh);
+ mac_client_close(vnic->vn_mch, MAC_CLOSE_FLAGS_IS_VNIC);
+ mac_close(vnic->vn_lower_mh);
}
- if ((mp_chain != NULL) && (extra_mp != NULL)) {
- ASSERT(mp_chain->b_next == NULL);
- mp_chain->b_next = extra_mp;
- }
- return (mp_chain);
+ kmem_cache_free(vnic_cache, vnic);
+ return (0);
}
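
The reworked delete path hinges on ordering: the devnet node goes away first (and is recreated on failure), mac_disable() fences the mac_impl_t before anything is freed, and mac_unregister() runs only after the VNIC leaves the hash. A minimal sketch of that skeleton, using only the interfaces visible in this hunk (hash bookkeeping and the lower-MAC teardown elided):

    /* sketch only; all names taken from the hunk above */
    static int
    vnic_delete_sketch(vnic_t *vnic, datalink_id_t vnic_id)
    {
        datalink_id_t tmpid;
        int rc;

        /* 1. remove the /dev node; it can be recreated on error */
        if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid, B_TRUE)) != 0)
            return (rc);

        /* 2. block any new claims on the mac_impl_t */
        if ((rc = mac_disable(vnic->vn_mh)) != 0) {
            /* roll back step 1 so the link stays usable */
            (void) dls_devnet_create(vnic->vn_mh, vnic_id);
            return (rc);
        }

        /* 3. only now is it safe to unregister and free */
        (void) mac_unregister(vnic->vn_mh);
        return (0);
    }
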
-/*
- * VNIC transmit function.
- */
+/* ARGSUSED */
mblk_t *
vnic_m_tx(void *arg, mblk_t *mp_chain)
{
- vnic_t *vnic = arg;
- vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
- mblk_t *mp, *extra_mp = NULL;
- vnic_flow_t *flow_ent;
- void *flow_cookie;
-
/*
- * Update stats.
+	 * This function could be invoked for an anchor VNIC when sending
+	 * broadcast and multicast packets, and for unicast packets that
+	 * did not match any known local destination.
*/
- for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
- vnic->vn_stat_opackets++;
- vnic->vn_stat_obytes += msgdsize(mp);
- }
-
- for (mp = mp_chain; mp != NULL; mp = extra_mp) {
- mblk_t *next;
-
- next = mp->b_next;
- mp->b_next = NULL;
-
- vnic_promisc_rx(vnic->vn_vnic_mac, vnic, mp);
-
- flow_ent = vnic_classify(vnic->vn_vnic_mac, mp, &extra_mp);
- ASSERT(extra_mp == NULL);
- extra_mp = next;
-
- if (flow_ent != NULL) {
- flow_cookie = vnic_classifier_get_client_cookie(
- flow_ent);
- if (flow_cookie != NULL) {
- /*
- * The vnic_bcast_send function expects
- * to receive the sender VNIC as value
- * for arg2.
- */
- vnic_bcast_send(flow_cookie, vnic, mp);
- } else {
- /*
- * loopback the packet to a local VNIC or
- * an active MAC client.
- */
- vnic_local_tx(vnic_mac, flow_ent, mp);
- }
- VNIC_FLOW_REFRELE(flow_ent);
- mp_chain = NULL;
- } else {
- /*
- * Non-local destination, send via the underlying
- * NIC.
- */
- const mac_txinfo_t *mtp = vnic->vn_txinfo;
- mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
- if (mp_chain != NULL)
- break;
- }
- }
-
- /* update stats to account for unsent packets */
- for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
- vnic->vn_stat_opackets--;
- vnic->vn_stat_obytes -= msgdsize(mp);
- vnic->vn_stat_oerrors++;
- /*
- * link back in the last portion not counted due to bandwidth
- * control.
- */
- if (mp->b_next == NULL) {
- mp->b_next = extra_mp;
- break;
- }
- }
-
- return (mp_chain);
+ freemsgchain(mp_chain);
+ return (NULL);
}
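
With Crossbow, transmit for ordinary VNICs evidently goes through the lower MAC client directly, so vnic_m_tx() is reached only for anchor VNICs, which have no lower NIC to forward to; the chain is simply dropped. freemsgchain() is the chain-aware form of freemsg(); its effect, unrolled as a sketch:

    mblk_t *mp, *next;

    /* equivalent unrolled form of freemsgchain(mp_chain) */
    for (mp = mp_chain; mp != NULL; mp = next) {
        next = mp->b_next;          /* follow the b_next linkage */
        mp->b_next = NULL;
        freemsg(mp);
    }
    /* returning NULL tells the caller nothing is left to retry */
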
-/* ARGSUSED */
+/*ARGSUSED*/
static void
-vnic_m_resources(void *arg)
+vnic_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
- /* no resources to advertise */
+ miocnak(q, mp, 0, ENOTSUP);
}
+/*
+ * This entry point cannot be passed through, since it is invoked
+ * for the per-VNIC kstats, which must be exported independently
+ * of the existence of VNIC MAC clients.
+ */
static int
vnic_m_stat(void *arg, uint_t stat, uint64_t *val)
{
vnic_t *vnic = arg;
int rval = 0;
- rw_enter(&vnic_lock, RW_READER);
+ if (vnic->vn_lower_mh == NULL) {
+ /*
+		 * It's an anchor VNIC, which has no statistics
+		 * of its own.
+ */
+ return (ENOTSUP);
+ }
+
+ /*
+	 * ENOTSUP must be reported for unsupported stats; the VNIC
+	 * driver reports only a subset of the stats that would be
+	 * returned by a real piece of hardware.
+ */
switch (stat) {
- case ETHER_STAT_LINK_DUPLEX:
- *val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
- ETHER_STAT_LINK_DUPLEX);
- break;
+ case MAC_STAT_LINK_STATE:
+ case MAC_STAT_LINK_UP:
+ case MAC_STAT_PROMISC:
case MAC_STAT_IFSPEED:
- *val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
- MAC_STAT_IFSPEED);
- break;
case MAC_STAT_MULTIRCV:
- *val = vnic->vn_stat_multircv;
- break;
- case MAC_STAT_BRDCSTRCV:
- *val = vnic->vn_stat_brdcstrcv;
- break;
case MAC_STAT_MULTIXMT:
- *val = vnic->vn_stat_multixmt;
- break;
+ case MAC_STAT_BRDCSTRCV:
case MAC_STAT_BRDCSTXMT:
- *val = vnic->vn_stat_brdcstxmt;
- break;
+ case MAC_STAT_OPACKETS:
+ case MAC_STAT_OBYTES:
case MAC_STAT_IERRORS:
- *val = vnic->vn_stat_ierrors;
- break;
case MAC_STAT_OERRORS:
- *val = vnic->vn_stat_oerrors;
- break;
case MAC_STAT_RBYTES:
- *val = vnic->vn_stat_rbytes;
- break;
case MAC_STAT_IPACKETS:
- *val = vnic->vn_stat_ipackets;
- break;
- case MAC_STAT_OBYTES:
- *val = vnic->vn_stat_obytes;
- break;
- case MAC_STAT_OPACKETS:
- *val = vnic->vn_stat_opackets;
+ *val = mac_client_stat_get(vnic->vn_mch, stat);
break;
default:
rval = ENOTSUP;
}
- rw_exit(&vnic_lock);
return (rval);
}
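
The grouped case labels act as an allow-list: every stat the VNIC supports falls through to one mac_client_stat_get() call, and anything else returns ENOTSUP so the framework does not report bogus zeroes. The pattern, reduced to two stats:

    switch (stat) {
    case MAC_STAT_RBYTES:           /* fall through: same delegation */
    case MAC_STAT_OBYTES:
        *val = mac_client_stat_get(vnic->vn_mch, stat);
        return (0);
    default:
        return (ENOTSUP);           /* not a supported stat */
    }
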
/*
+ * Invoked by the upper MAC to retrieve the lower MAC client handle
+ * corresponding to a VNIC. A pointer to this function is obtained
+ * by the upper MAC via capability query.
+ *
+ * XXX-nicolas Note: this currently causes all VNIC MAC clients to
+ * receive the same MAC client handle for the same VNIC. This is ok
+ * as long as we have only one VNIC MAC client which sends and
+ * receives data, but we don't currently enforce this at the MAC layer.
+ */
+static void *
+vnic_mac_client_handle(void *vnic_arg)
+{
+ vnic_t *vnic = vnic_arg;
+
+ return (vnic->vn_mch);
+}
+
+
+/*
* Return information about the specified capability.
*/
/* ARGSUSED */
@@ -1277,8 +705,6 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
vnic_t *vnic = arg;
switch (cap) {
- case MAC_CAPAB_POLL:
- return (B_TRUE);
case MAC_CAPAB_HCKSUM: {
uint32_t *hcksum_txflags = cap_data;
@@ -1287,331 +713,129 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
HCKSUM_INET_PARTIAL);
break;
}
+ case MAC_CAPAB_VNIC: {
+ mac_capab_vnic_t *vnic_capab = cap_data;
+
+ if (vnic->vn_lower_mh == NULL) {
+ /*
+			 * It's an anchor VNIC; there is no underlying
+			 * NIC or MAC client handle.
+ */
+ return (B_FALSE);
+ }
+
+ if (vnic_capab != NULL) {
+ vnic_capab->mcv_arg = vnic;
+ vnic_capab->mcv_mac_client_handle =
+ vnic_mac_client_handle;
+ }
+ break;
+ }
+ case MAC_CAPAB_ANCHOR_VNIC: {
+		/* since it's an anchor VNIC there is no lower MAC handle */
+ if (vnic->vn_lower_mh == NULL) {
+ ASSERT(vnic->vn_link_id == 0);
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+ }
+ case MAC_CAPAB_NO_NATIVEVLAN:
+ case MAC_CAPAB_NO_ZCOPY:
+ return (B_TRUE);
default:
return (B_FALSE);
}
return (B_TRUE);
}
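
A hypothetical consumer-side view of MAC_CAPAB_VNIC (the entry point is called directly here purely for illustration; a real upper MAC would reach it through the framework's capability query): the capability returns an argument cookie plus a function pointer that resolves the lower MAC client handle.

    mac_capab_vnic_t vnic_capab;
    void *lower_mch;

    if (vnic_m_capab_get(vnic, MAC_CAPAB_VNIC, &vnic_capab)) {
        /* resolve the lower MAC client handle for this VNIC */
        lower_mch = vnic_capab.mcv_mac_client_handle(vnic_capab.mcv_arg);
        /* ... send and receive through lower_mch ... */
    }
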
+/* ARGSUSED */
static int
vnic_m_start(void *arg)
{
- vnic_t *vnic = arg;
- mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh;
- int rc;
-
- rc = mac_start(lower_mh);
- if (rc != 0)
- return (rc);
-
- vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx, vnic, vnic);
return (0);
}
+/* ARGSUSED */
static void
vnic_m_stop(void *arg)
{
- vnic_t *vnic = arg;
- mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh;
-
- vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx_initial,
- vnic, vnic);
- mac_stop(lower_mh);
}
/* ARGSUSED */
static int
vnic_m_promisc(void *arg, boolean_t on)
{
- vnic_t *vnic = arg;
-
- return (vnic_promisc_set(vnic, on));
+ return (0);
}
+/* ARGSUSED */
static int
vnic_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
{
- vnic_t *vnic = arg;
- int rc = 0;
-
- if (add)
- rc = vnic_bcast_add(vnic, addrp, MAC_ADDRTYPE_MULTICAST);
- else
- vnic_bcast_delete(vnic, addrp);
-
- return (rc);
+ return (0);
}
static int
-vnic_m_unicst(void *arg, const uint8_t *mac_addr)
+vnic_m_unicst(void *arg, const uint8_t *macaddr)
{
vnic_t *vnic = arg;
- vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
- int rv;
- rw_enter(&vnic_lock, RW_WRITER);
- rv = vnic_modify_mac_addr(vnic, vnic_mac->va_addr_len,
- (uchar_t *)mac_addr);
- rw_exit(&vnic_lock);
-
- if (rv == 0)
- mac_unicst_update(vnic->vn_mh, mac_addr);
- return (0);
+ return (mac_vnic_unicast_set(vnic->vn_mch, macaddr));
}
int
-vnic_info(uint_t *nvnics, datalink_id_t vnic_id, datalink_id_t linkid,
- void *fn_arg, vnic_info_new_vnic_fn_t new_vnic_fn)
-{
- vnic_info_state_t state;
- int rc = 0;
-
- rw_enter(&vnic_lock, RW_READER);
-
- *nvnics = vnic_count;
-
- bzero(&state, sizeof (state));
- state.vs_vnic_id = vnic_id;
- state.vs_linkid = linkid;
- state.vs_new_vnic_fn = new_vnic_fn;
- state.vs_fn_arg = fn_arg;
-
- mod_hash_walk(vnic_hash, vnic_info_walker, &state);
-
- if ((rc = state.vs_rc) == 0 && vnic_id != DATALINK_ALL_LINKID &&
- !state.vs_vnic_found)
- rc = ENOENT;
-
- rw_exit(&vnic_lock);
- return (rc);
-}
-
-/*
- * Walker invoked when building a list of vnics that must be passed
- * up to user space.
- */
-/*ARGSUSED*/
-static uint_t
-vnic_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
-{
- vnic_t *vnic;
- vnic_info_state_t *state = arg;
-
- if (state->vs_rc != 0)
- return (MH_WALK_TERMINATE); /* terminate walk */
-
- vnic = (vnic_t *)val;
-
- if (state->vs_vnic_id != DATALINK_ALL_LINKID &&
- vnic->vn_id != state->vs_vnic_id) {
- goto bail;
- }
-
- state->vs_vnic_found = B_TRUE;
-
- state->vs_rc = state->vs_new_vnic_fn(state->vs_fn_arg,
- vnic->vn_id, vnic->vn_addr_type, vnic->vn_vnic_mac->va_addr_len,
- vnic->vn_addr, vnic->vn_vnic_mac->va_linkid);
-bail:
- return ((state->vs_rc == 0) ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
-}
-
-/*
- * vnic_notify_cb() and vnic_notify_walker() below are used to
- * process events received from an underlying NIC and, if needed,
- * forward these events to the VNICs defined on top of that NIC.
- */
-
-typedef struct vnic_notify_state {
- mac_notify_type_t vo_type;
- vnic_mac_t *vo_vnic_mac;
-} vnic_notify_state_t;
-
-/* ARGSUSED */
-static uint_t
-vnic_notify_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+vnic_info(vnic_info_t *info)
{
- vnic_t *vnic = (vnic_t *)val;
- vnic_notify_state_t *state = arg;
+ vnic_t *vnic;
+ int err;
- /* ignore VNICs that don't use the specified underlying MAC */
- if (vnic->vn_vnic_mac != state->vo_vnic_mac)
- return (MH_WALK_CONTINUE);
+ rw_enter(&vnic_lock, RW_WRITER);
- switch (state->vo_type) {
- case MAC_NOTE_TX:
- mac_tx_update(vnic->vn_mh);
- break;
- case MAC_NOTE_LINK:
- /*
- * The VNIC link state must be up regardless of
- * the link state of the underlying NIC to maintain
- * connectivity between VNICs on the same host.
- */
- mac_link_update(vnic->vn_mh, LINK_STATE_UP);
- break;
- case MAC_NOTE_UNICST:
- vnic_update_active_rx(vnic->vn_vnic_mac);
- break;
- case MAC_NOTE_VNIC:
- /* only for clients which share a NIC with a VNIC */
- break;
- case MAC_NOTE_PROMISC:
- mutex_enter(&vnic_mac_lock);
- vnic->vn_vnic_mac->va_txinfo = mac_vnic_tx_get(
- vnic->vn_vnic_mac->va_mh);
- mutex_exit(&vnic_mac_lock);
- break;
+ err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(info->vn_vnic_id),
+ (mod_hash_val_t *)&vnic);
+ if (err != 0) {
+ rw_exit(&vnic_lock);
+ return (ENOENT);
}
- return (MH_WALK_CONTINUE);
-}
-
-static void
-vnic_notify_cb(void *arg, mac_notify_type_t type)
-{
- vnic_mac_t *vnic = arg;
- vnic_notify_state_t state;
+ info->vn_link_id = vnic->vn_link_id;
+ info->vn_mac_addr_type = vnic->vn_addr_type;
+ info->vn_mac_len = vnic->vn_addr_len;
+ bcopy(vnic->vn_addr, info->vn_mac_addr, MAXMACADDRLEN);
+ info->vn_mac_slot = vnic->vn_slot_id;
+ info->vn_mac_prefix_len = 0;
+ info->vn_vid = vnic->vn_vid;
+ info->vn_force = vnic->vn_force;
- state.vo_type = type;
- state.vo_vnic_mac = vnic;
+ bzero(&info->vn_resource_props, sizeof (mac_resource_props_t));
+ if (vnic->vn_mch != NULL)
+ mac_resource_ctl_get(vnic->vn_mch, &info->vn_resource_props);
- rw_enter(&vnic_lock, RW_READER);
- mod_hash_walk(vnic_hash, vnic_notify_walker, &state);
rw_exit(&vnic_lock);
-}
-
-static int
-vnic_modify_mac_addr(vnic_t *vnic, uint_t mac_len, uchar_t *mac_addr)
-{
- vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
- vnic_flow_t *vnic_flow = vnic->vn_flow_ent;
-
- ASSERT(RW_WRITE_HELD(&vnic_lock));
-
- if (mac_len != vnic_mac->va_addr_len)
- return (EINVAL);
-
- vnic_classifier_flow_update_addr(vnic_flow, mac_addr);
return (0);
}
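
vnic_info() is the replacement single-VNIC query: the hash lookup and every field copy happen under vnic_lock, so vnic_dev_delete() cannot free the vnic_t mid-copy. The core idiom, reduced:

    rw_enter(&vnic_lock, RW_WRITER);
    if (mod_hash_find(vnic_hash, VNIC_HASH_KEY(info->vn_vnic_id),
        (mod_hash_val_t *)&vnic) != 0) {
        rw_exit(&vnic_lock);
        return (ENOENT);            /* no such VNIC */
    }
    /* ... copy fields out of *vnic while the lock is held ... */
    rw_exit(&vnic_lock);
    return (0);
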
-static int
-vnic_promisc_set(vnic_t *vnic, boolean_t on)
-{
- vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
- int r = -1;
-
- if (vnic->vn_promisc == on)
- return (0);
-
- if (on) {
- if ((r = mac_promisc_set(vnic_mac->va_mh, B_TRUE,
- MAC_DEVPROMISC)) != 0) {
- return (r);
- }
-
- rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER);
- vnic->vn_promisc_next = vnic_mac->va_promisc;
- vnic_mac->va_promisc = vnic;
- vnic_mac->va_promisc_gen++;
-
- vnic->vn_promisc = B_TRUE;
- rw_exit(&vnic_mac->va_promisc_lock);
-
- return (0);
- } else {
- vnic_t *loop, *prev = NULL;
-
- rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER);
- loop = vnic_mac->va_promisc;
-
- while ((loop != NULL) && (loop != vnic)) {
- prev = loop;
- loop = loop->vn_promisc_next;
- }
-
- if ((loop != NULL) &&
- ((r = mac_promisc_set(vnic_mac->va_mh, B_FALSE,
- MAC_DEVPROMISC)) == 0)) {
- if (prev != NULL)
- prev->vn_promisc_next = loop->vn_promisc_next;
- else
- vnic_mac->va_promisc = loop->vn_promisc_next;
- vnic_mac->va_promisc_gen++;
-
- vnic->vn_promisc = B_FALSE;
- }
- rw_exit(&vnic_mac->va_promisc_lock);
-
- return (r);
- }
-}
-
-void
-vnic_promisc_rx(vnic_mac_t *vnic_mac, vnic_t *sender, mblk_t *mp)
+static void
+vnic_notify_cb(void *arg, mac_notify_type_t type)
{
- vnic_t *loop;
- vnic_flow_t *flow;
- const vnic_flow_fn_info_t *fn_info;
- mac_header_info_t hdr_info;
- boolean_t dst_must_match = B_TRUE;
-
- ASSERT(mp->b_next == NULL);
-
- rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
- if (vnic_mac->va_promisc == NULL)
- goto done;
-
- if (mac_header_info(vnic_mac->va_mh, mp, &hdr_info) != 0)
- goto done;
+ vnic_t *vnic = arg;
/*
- * If this is broadcast or multicast then the destination
- * address need not match for us to deliver it.
+	 * Only the VLAN VNIC needs to be notified of primary MAC
+	 * address changes.
*/
- if ((hdr_info.mhi_dsttype == MAC_ADDRTYPE_BROADCAST) ||
- (hdr_info.mhi_dsttype == MAC_ADDRTYPE_MULTICAST))
- dst_must_match = B_FALSE;
-
- for (loop = vnic_mac->va_promisc;
- loop != NULL;
- loop = loop->vn_promisc_next) {
- if (loop == sender)
- continue;
-
- if (dst_must_match &&
- (bcmp(hdr_info.mhi_daddr, loop->vn_addr,
- sizeof (loop->vn_addr)) != 0))
- continue;
-
- flow = loop->vn_flow_ent;
- ASSERT(flow != NULL);
-
- if (!flow->vf_is_active) {
- mblk_t *copy;
- uint64_t gen;
-
- if ((copy = vnic_copymsg_cksum(mp)) == NULL)
- break;
- if ((sender != NULL) &&
- ((copy = vnic_fix_cksum(copy)) == NULL))
- break;
-
- VNIC_FLOW_REFHOLD(flow);
- gen = vnic_mac->va_promisc_gen;
- rw_exit(&vnic_mac->va_promisc_lock);
-
- fn_info = vnic_classifier_get_fn_info(flow);
- (fn_info->ff_fn)(fn_info->ff_arg1,
- fn_info->ff_arg2, copy);
-
- VNIC_FLOW_REFRELE(flow);
- rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
- if (vnic_mac->va_promisc_gen != gen)
- break;
- }
+ if (vnic->vn_addr_type != VNIC_MAC_ADDR_TYPE_PRIMARY)
+ return;
+
+ switch (type) {
+ case MAC_NOTE_UNICST:
+		/* fetch the new primary unicast MAC address */
+ mac_unicast_primary_get(vnic->vn_lower_mh, vnic->vn_addr);
+
+		/* notify the upper-layer MAC about the MAC address change */
+ mac_unicst_update(vnic->vn_mh, (const uint8_t *)vnic->vn_addr);
+ break;
+ default:
+ break;
}
-done:
- rw_exit(&vnic_mac->va_promisc_lock);
}
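
vnic_notify_cb() is the other half of the vn_mnh notify handle that vnic_dev_delete() tears down above. A sketch of the registration side, assuming a mac_notify_add() counterpart to the mac_notify_remove() call seen earlier (the exact signature is not shown in this change):

    /* assumed registration call; signature hedged */
    vnic->vn_mnh = mac_notify_add(vnic->vn_lower_mh, vnic_notify_cb, vnic);

    /* ... later, in vnic_dev_delete(), as shown earlier ... */
    (void) mac_notify_remove(vnic->vn_mnh, B_TRUE);
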
diff --git a/usr/src/uts/common/io/wpi/wpi.c b/usr/src/uts/common/io/wpi/wpi.c
index 00878f64ce..bd817f22c5 100644
--- a/usr/src/uts/common/io/wpi/wpi.c
+++ b/usr/src/uts/common/io/wpi/wpi.c
@@ -42,7 +42,7 @@
#include <sys/modctl.h>
#include <sys/devops.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_wifi.h>
#include <sys/net80211.h>
#include <sys/net80211_proto.h>
@@ -371,7 +371,6 @@ mac_callbacks_t wpi_m_callbacks = {
wpi_m_multicst,
wpi_m_unicst,
wpi_m_tx,
- NULL,
wpi_m_ioctl,
NULL,
NULL,
diff --git a/usr/src/uts/common/io/xge/drv/xge.c b/usr/src/uts/common/io/xge/drv/xge.c
index c41f82d706..6ee52f4262 100644
--- a/usr/src/uts/common/io/xge/drv/xge.c
+++ b/usr/src/uts/common/io/xge/drv/xge.c
@@ -65,34 +65,6 @@ ddi_device_acc_attr_t xge_dev_attr = {
ddi_device_acc_attr_t *p_xge_dev_attr = &xge_dev_attr;
/*
- * xge_event
- *
- * This function called by HAL to notify upper layer that some any
- * event been produced.
- */
-void
-xge_event(xge_queue_item_t *item)
-{
- xgell_fifo_t *fifo = item->context;
- xgelldev_t *lldev = fifo->lldev;
-
- switch (item->event_type) {
- case XGELL_EVENT_RESCHED_NEEDED:
- if (lldev->is_initialized) {
- if (xge_hal_channel_dtr_count(fifo->channelh)
- >= XGELL_TX_LEVEL_HIGH) {
- mac_tx_update(lldev->mh);
- xge_debug_osdep(XGE_TRACE, "%s",
- "mac_tx_update happened!");
- }
- }
- break;
- default:
- break;
- }
-}
-
-/*
* xgell_callback_crit_err
*
* This function called by HAL on Serious Error event. XGE_HAL_EVENT_SERR.
@@ -139,18 +111,6 @@ xge_xpak_alarm_log(void *userdata, xge_hal_xpak_alarm_type_e type)
}
/*
- * xge_queue_produce context
- */
-static void
-xge_callback_event_queued(xge_hal_device_h devh, int event_type)
-{
- if (event_type == XGELL_EVENT_RESCHED_NEEDED) {
- (void) taskq_dispatch(system_taskq, xge_device_poll_now, devh,
- TQ_NOSLEEP);
- }
-}
-
-/*
* xge_driver_init_hal
*
* To initialize HAL portion of driver.
@@ -167,8 +127,8 @@ xge_driver_init_hal(void)
uld_callbacks.link_up = xgell_callback_link_up;
uld_callbacks.link_down = xgell_callback_link_down;
uld_callbacks.crit_err = xge_callback_crit_err;
- uld_callbacks.event = xge_event;
- uld_callbacks.event_queued = xge_callback_event_queued;
+ uld_callbacks.event = NULL;
+ uld_callbacks.event_queued = NULL;
uld_callbacks.before_device_poll = NULL;
uld_callbacks.after_device_poll = NULL;
uld_callbacks.sched_timer = NULL;
@@ -241,7 +201,6 @@ _info(struct modinfo *pModinfo)
return (mod_info(&modlinkage, pModinfo));
}
-/* ARGSUSED */
/*
* xge_isr
* @arg: pointer to device private strucutre(hldev)
@@ -249,6 +208,7 @@ _info(struct modinfo *pModinfo)
* This is the ISR scheduled by the OS to indicate to the
* driver that the receive/transmit operation is completed.
*/
+/* ARGSUSED */
static uint_t
xge_isr(caddr_t arg0, caddr_t arg1)
{
@@ -308,262 +268,263 @@ xge_ring_msix_isr(caddr_t arg0, caddr_t arg1)
* Configure single ring
*/
static void
-xge_ring_config(dev_info_t *dev_info,
- xge_hal_device_config_t *device_config, int num)
+xge_ring_config(dev_info_t *dev_info, xge_hal_device_config_t *device_config,
+ int index)
{
char msg[MSG_SIZE];
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_configured", num);
- device_config->ring.queue[num].configured =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_configured", index);
+ device_config->ring.queue[index].configured =
ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS,
- msg, num < XGELL_MAX_RING_DEFAULT ? 1 : 0);
+ msg, index < XGELL_RX_RING_NUM_MAX ? 1 : 0);
/* no point to configure it further if unconfigured */
- if (!device_config->ring.queue[num].configured)
+ if (!device_config->ring.queue[index].configured)
return;
#if defined(__sparc)
- device_config->ring.queue[num].no_snoop_bits = 1;
+ device_config->ring.queue[index].no_snoop_bits = 1;
#endif
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max", num);
- device_config->ring.queue[num].max =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max", index);
+ device_config->ring.queue[index].max =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_USE_HARDCODE);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_initial", num);
- device_config->ring.queue[num].initial =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_initial", index);
+ device_config->ring.queue[index].initial =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_USE_HARDCODE);
- if (device_config->ring.queue[num].initial ==
+ if (device_config->ring.queue[index].initial ==
XGE_HAL_DEFAULT_USE_HARDCODE) {
- if (device_config->mtu > XGE_HAL_DEFAULT_MTU) {
- device_config->ring.queue[num].initial =
- device_config->ring.queue[num].max =
- XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_J;
- } else {
- device_config->ring.queue[num].initial =
- device_config->ring.queue[num].max =
- XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_N;
- }
+ device_config->ring.queue[index].initial =
+ device_config->ring.queue[index].max =
+ XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS;
}
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_buffer_mode", num);
- device_config->ring.queue[num].buffer_mode =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_buffer_mode", index);
+ device_config->ring.queue[index].buffer_mode =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_RING_QUEUE_BUFFER_MODE_DEFAULT);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_dram_size_mb", num);
- device_config->ring.queue[num].dram_size_mb =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_dram_size_mb", index);
+ device_config->ring.queue[index].dram_size_mb =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_USE_HARDCODE);
(void) xge_os_snprintf(msg, MSG_SIZE,
- "ring%d_backoff_interval_us", num);
- device_config->ring.queue[num].backoff_interval_us =
+ "ring%d_backoff_interval_us", index);
+ device_config->ring.queue[index].backoff_interval_us =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_BACKOFF_INTERVAL_US);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max_frm_len", num);
- device_config->ring.queue[num].max_frm_len =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max_frm_len", index);
+ device_config->ring.queue[index].max_frm_len =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_RING_USE_MTU);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_priority", num);
- device_config->ring.queue[num].priority =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_priority", index);
+ device_config->ring.queue[index].priority =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_RING_PRIORITY);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_a", num);
- device_config->ring.queue[num].rti.urange_a =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_a", index);
+ device_config->ring.queue[index].rti.urange_a =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_RX_URANGE_A);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_a", num);
- device_config->ring.queue[num].rti.ufc_a =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_a", index);
+ device_config->ring.queue[index].rti.ufc_a =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_RX_UFC_A);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_b", num);
- device_config->ring.queue[num].rti.urange_b =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_b", index);
+ device_config->ring.queue[index].rti.urange_b =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_RX_URANGE_B);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_b", num);
- device_config->ring.queue[num].rti.ufc_b =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_b", index);
+ device_config->ring.queue[index].rti.ufc_b =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
device_config->mtu > XGE_HAL_DEFAULT_MTU ?
XGE_HAL_DEFAULT_RX_UFC_B_J:
XGE_HAL_DEFAULT_RX_UFC_B_N);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_c", num);
- device_config->ring.queue[num].rti.urange_c =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_c", index);
+ device_config->ring.queue[index].rti.urange_c =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_RX_URANGE_C);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_c", num);
- device_config->ring.queue[num].rti.ufc_c =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_c", index);
+ device_config->ring.queue[index].rti.ufc_c =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
device_config->mtu > XGE_HAL_DEFAULT_MTU ?
XGE_HAL_DEFAULT_RX_UFC_C_J:
XGE_HAL_DEFAULT_RX_UFC_C_N);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_d", num);
- device_config->ring.queue[num].rti.ufc_d =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_d", index);
+ device_config->ring.queue[index].rti.ufc_d =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_RX_UFC_D);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_val", num);
- device_config->ring.queue[num].rti.timer_val_us =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_val", index);
+ device_config->ring.queue[index].rti.timer_val_us =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_RX_TIMER_VAL);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_ac_en", num);
- device_config->ring.queue[num].rti.timer_ac_en =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_ac_en", index);
+ device_config->ring.queue[index].rti.timer_ac_en =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_RX_TIMER_AC_EN);
- (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_indicate_max_pkts", num);
- device_config->ring.queue[num].indicate_max_pkts =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_indicate_max_pkts",
+ index);
+ device_config->ring.queue[index].indicate_max_pkts =
ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
(device_config->bimodal_interrupts ?
XGE_HAL_DEFAULT_INDICATE_MAX_PKTS_B :
XGE_HAL_DEFAULT_INDICATE_MAX_PKTS_N));
- if (device_config->ring.queue[num].configured) {
- /* enable RTH steering by default */
- device_config->ring.queue[num].rth_en = 1;
- device_config->rth_en = XGE_HAL_RTH_ENABLE;
- device_config->rth_bucket_size = XGE_HAL_MAX_RTH_BUCKET_SIZE;
- device_config->rth_spdm_en = XGE_HAL_RTH_SPDM_DISABLE;
- device_config->rth_spdm_use_l4 = XGE_HAL_RTH_SPDM_USE_L4;
- }
+ /*
+	 * Enable RTH steering on this ring if RTH is enabled globally
+ */
+ if (device_config->rth_en == XGE_HAL_RTH_ENABLE)
+ device_config->ring.queue[index].rth_en = 1;
}
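
Every tunable in xge_ring_config() follows the same idiom: compose a per-ring property name with xge_os_snprintf(), then let ddi_prop_get_int() fall back to the compiled-in default when the .conf file does not override it. Reduced to one property:

    char msg[MSG_SIZE];

    /* a "ring3_max" entry in the .conf file overrides ring 3 only */
    (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max", index);
    device_config->ring.queue[index].max = ddi_prop_get_int(DDI_DEV_T_ANY,
        dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE);
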
/*
* Configure single fifo
*/
static void
-xge_fifo_config(dev_info_t *dev_info,
- xge_hal_device_config_t *device_config, int num)
+xge_fifo_config(dev_info_t *dev_info, xge_hal_device_config_t *device_config,
+ int index)
{
char msg[MSG_SIZE];
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_configured", num);
- device_config->fifo.queue[num].configured =
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_configured", index);
+ device_config->fifo.queue[index].configured =
ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS,
- msg, num < XGELL_MAX_FIFO_DEFAULT ? 1 : 0);
+ msg, index < XGELL_TX_RING_NUM_MAX ? 1 : 0);
/* no point to configure it further */
- if (!device_config->fifo.queue[num].configured)
+ if (!device_config->fifo.queue[index].configured)
return;
#if defined(__sparc)
- device_config->fifo.queue[num].no_snoop_bits = 1;
+ device_config->fifo.queue[index].no_snoop_bits = 1;
#endif
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_max", num);
- device_config->fifo.queue[num].max = ddi_prop_get_int(DDI_DEV_T_ANY,
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_max", index);
+ device_config->fifo.queue[index].max = ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_USE_HARDCODE);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_initial", num);
- device_config->fifo.queue[num].initial = ddi_prop_get_int(DDI_DEV_T_ANY,
- dev_info, DDI_PROP_DONTPASS, msg,
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_initial", index);
+ device_config->fifo.queue[index].initial =
+ ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_USE_HARDCODE);
- if (device_config->fifo.queue[num].initial ==
+#if 0
+ if (device_config->fifo.queue[index].initial ==
XGE_HAL_DEFAULT_USE_HARDCODE) {
if (device_config->mtu > XGE_HAL_DEFAULT_MTU) {
- device_config->fifo.queue[num].initial =
- device_config->fifo.queue[num].max =
+ device_config->fifo.queue[index].initial =
+ device_config->fifo.queue[index].max =
XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_J;
} else {
- device_config->fifo.queue[num].initial =
- device_config->fifo.queue[num].max =
+ device_config->fifo.queue[index].initial =
+ device_config->fifo.queue[index].max =
XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_N;
}
}
+#else
+ if (device_config->fifo.queue[index].initial ==
+ XGE_HAL_DEFAULT_USE_HARDCODE) {
+ device_config->fifo.queue[index].max =
+ device_config->fifo.queue[index].initial =
+ XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_A;
+ }
+#endif
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_intr", num);
- device_config->fifo.queue[num].intr = ddi_prop_get_int(DDI_DEV_T_ANY,
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_intr", index);
+ device_config->fifo.queue[index].intr = ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_FIFO_QUEUE_INTR);
/*
* TTI 0 configuration
*/
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_enable", num);
- device_config->fifo.queue[num].tti[num].enabled = ddi_prop_get_int(
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_enable", index);
+ device_config->fifo.queue[index].tti[index].enabled = ddi_prop_get_int(
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, 1);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_a", num);
- device_config->fifo.queue[num].tti[num].urange_a = ddi_prop_get_int(
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_a", index);
+ device_config->fifo.queue[index].tti[index].urange_a = ddi_prop_get_int(
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_URANGE_A);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_a", num);
- device_config->fifo.queue[num].tti[num].ufc_a = ddi_prop_get_int(
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_a", index);
+ device_config->fifo.queue[index].tti[index].ufc_a = ddi_prop_get_int(
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_UFC_A);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_b", num);
- device_config->fifo.queue[num].tti[num].urange_b = ddi_prop_get_int(
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_b", index);
+ device_config->fifo.queue[index].tti[index].urange_b = ddi_prop_get_int(
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_URANGE_B);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_b", num);
- device_config->fifo.queue[num].tti[num].ufc_b = ddi_prop_get_int(
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_b", index);
+ device_config->fifo.queue[index].tti[index].ufc_b = ddi_prop_get_int(
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_UFC_B);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_c", num);
- device_config->fifo.queue[num].tti[num].urange_c = ddi_prop_get_int(
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_c", index);
+ device_config->fifo.queue[index].tti[index].urange_c = ddi_prop_get_int(
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_URANGE_C);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_c", num);
- device_config->fifo.queue[num].tti[num].ufc_c = ddi_prop_get_int(
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_c", index);
+ device_config->fifo.queue[index].tti[index].ufc_c = ddi_prop_get_int(
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_UFC_C);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_d", num);
- device_config->fifo.queue[num].tti[num].ufc_d = ddi_prop_get_int(
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_d", index);
+ device_config->fifo.queue[index].tti[index].ufc_d = ddi_prop_get_int(
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_UFC_D);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_timer_ac_en", num);
- device_config->fifo.queue[num].tti[num].timer_ac_en = ddi_prop_get_int(
- DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_timer_ac_en", index);
+ device_config->fifo.queue[index].tti[index].timer_ac_en =
+ ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_TIMER_AC_EN);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_val", num);
- device_config->fifo.queue[num].tti[num].timer_val_us = ddi_prop_get_int(
- DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_val", index);
+ device_config->fifo.queue[index].tti[index].timer_val_us =
+ ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_TIMER_VAL);
- (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_ci_en", num);
- device_config->fifo.queue[num].tti[num].timer_ci_en = ddi_prop_get_int(
- DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
+ (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_ci_en", index);
+ device_config->fifo.queue[index].tti[index].timer_ci_en =
+ ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg,
XGE_HAL_DEFAULT_TX_TIMER_CI_EN);
}
@@ -577,11 +538,57 @@ xge_fifo_config(dev_info_t *dev_info,
*/
static void
xge_configuration_init(dev_info_t *dev_info,
- xge_hal_device_config_t *device_config, xgell_config_t *ll_config)
+ xge_hal_device_config_t *device_config, xgell_config_t *xgell_config)
{
int i, rings_configured = 0, fifos_configured = 0;
/*
+ * Initialize link layer configuration first
+ */
+ xgell_config->rx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info,
+ DDI_PROP_DONTPASS, "rx_dma_lowat", XGELL_RX_DMA_LOWAT);
+ xgell_config->rx_pkt_burst = ddi_prop_get_int(DDI_DEV_T_ANY,
+ dev_info, DDI_PROP_DONTPASS, "rx_pkt_burst", XGELL_RX_PKT_BURST);
+ xgell_config->tx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info,
+ DDI_PROP_DONTPASS, "tx_dma_lowat", XGELL_TX_DMA_LOWAT);
+ xgell_config->lso_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info,
+ DDI_PROP_DONTPASS, "lso_enable", XGELL_CONF_ENABLE_BY_DEFAULT);
+ xgell_config->msix_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info,
+ DDI_PROP_DONTPASS, "msix_enable", XGELL_CONF_ENABLE_BY_DEFAULT);
+
+ xgell_config->grouping = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info,
+ DDI_PROP_DONTPASS, "grouping", XGELL_CONF_GROUP_POLICY_DEFAULT);
+
+ switch (xgell_config->grouping) {
+ case XGELL_CONF_GROUP_POLICY_VIRT:
+ /*
+ * Enable layer 2 steering for better virtualization
+ */
+ device_config->rth_en = XGE_HAL_RTH_DISABLE;
+ device_config->rts_mac_en = XGE_HAL_RTS_MAC_ENABLE;
+ break;
+ case XGELL_CONF_GROUP_POLICY_PERF:
+ /*
+ * Configure layer 4 RTH to hashing inbound traffic
+ */
+ device_config->rth_en = XGE_HAL_RTH_ENABLE;
+ device_config->rth_bucket_size = XGE_HAL_MAX_RTH_BUCKET_SIZE;
+ device_config->rth_spdm_en = XGE_HAL_RTH_SPDM_DISABLE;
+ device_config->rth_spdm_use_l4 = XGE_HAL_RTH_SPDM_USE_L4;
+
+ device_config->rts_mac_en = XGE_HAL_RTS_MAC_DISABLE;
+ break;
+ case XGELL_CONF_GROUP_POLICY_BASIC:
+ default:
+ /*
+ * Disable both RTS and RTH for single ring configuration
+ */
+ device_config->rth_en = XGE_HAL_RTH_DISABLE;
+ device_config->rts_mac_en = XGE_HAL_RTS_MAC_DISABLE;
+ break;
+ }
+
+ /*
* Initialize common properties
*/
device_config->mtu = ddi_prop_get_int(DDI_DEV_T_ANY,
@@ -634,12 +641,6 @@ xge_configuration_init(dev_info_t *dev_info,
XGE_HAL_DEFAULT_BIMODAL_TIMER_HI_US);
/*
- * MSI-X switch
- */
- ll_config->msix_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info,
- DDI_PROP_DONTPASS, "msix_enable", XGELL_CONF_ENABLE_BY_DEFAULT);
-
- /*
* Go through all possibly configured rings. Each ring could be
* configured individually. To enable/disable specific ring, just
* set ring->configured = [1|0].
@@ -740,30 +741,20 @@ xge_configuration_init(dev_info_t *dev_info,
XGE_HAL_DEFAULT_LRO_FRM_LEN);
/*
- * Initialize link layer configuration
+	 * Initialize the remaining link layer configuration
*/
- ll_config->rx_buffer_total = ddi_prop_get_int(DDI_DEV_T_ANY,
+ xgell_config->rx_buffer_total = ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, "rx_buffer_total",
- device_config->ring.queue[XGELL_RING_MAIN_QID].initial *
+ device_config->ring.queue[XGELL_RX_RING_MAIN].initial *
XGELL_RX_BUFFER_TOTAL);
- ll_config->rx_buffer_total += XGELL_RX_BUFFER_RECYCLE_CACHE;
- ll_config->rx_buffer_post_hiwat = ddi_prop_get_int(DDI_DEV_T_ANY,
+ xgell_config->rx_buffer_total += XGELL_RX_BUFFER_RECYCLE_CACHE;
+ xgell_config->rx_buffer_post_hiwat = ddi_prop_get_int(DDI_DEV_T_ANY,
dev_info, DDI_PROP_DONTPASS, "rx_buffer_post_hiwat",
- device_config->ring.queue[XGELL_RING_MAIN_QID].initial *
+ device_config->ring.queue[XGELL_RX_RING_MAIN].initial *
XGELL_RX_BUFFER_POST_HIWAT);
- ll_config->rx_buffer_post_hiwat += XGELL_RX_BUFFER_RECYCLE_CACHE;
- ll_config->rx_pkt_burst = ddi_prop_get_int(DDI_DEV_T_ANY,
- dev_info, DDI_PROP_DONTPASS, "rx_pkt_burst",
- XGELL_RX_PKT_BURST);
- ll_config->rx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info,
- DDI_PROP_DONTPASS, "rx_dma_lowat", XGELL_RX_DMA_LOWAT);
- ll_config->tx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info,
- DDI_PROP_DONTPASS, "tx_dma_lowat", XGELL_TX_DMA_LOWAT);
- ll_config->lso_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info,
- DDI_PROP_DONTPASS, "lso_enable", XGELL_CONF_ENABLE_BY_DEFAULT);
+ xgell_config->rx_buffer_post_hiwat += XGELL_RX_BUFFER_RECYCLE_CACHE;
}
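
Taken together with the ring and group sizing in xge_attach() below, the grouping property selects the hardware steering mode; every value in this summary comes from the two switch statements in this change:

    grouping policy                  rth_en    rts_mac_en  rx/tx rings  rx groups
    XGELL_CONF_GROUP_POLICY_VIRT     disable   enable      8 / 8        8 (one per ring)
    XGELL_CONF_GROUP_POLICY_PERF     enable    disable     8 / 8        1
    XGELL_CONF_GROUP_POLICY_BASIC    disable   disable     1 / 1        1
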
-
/*
* xge_alloc_intrs:
*
@@ -847,6 +838,7 @@ _err_exit2:
}
_err_exit1:
kmem_free(lldev->intr_table, lldev->intr_table_size);
+ lldev->intr_table = NULL;
_err_exit0:
if (lldev->intr_type == DDI_INTR_TYPE_MSIX)
(void) ddi_prop_remove(DDI_DEV_T_NONE, dip, "#msix-request");
@@ -869,6 +861,7 @@ xge_free_intrs(xgelldev_t *lldev)
(void) ddi_intr_free(lldev->intr_table[i]);
}
kmem_free(lldev->intr_table, lldev->intr_table_size);
+ lldev->intr_table = NULL;
if (lldev->intr_type == DDI_INTR_TYPE_MSIX)
(void) ddi_prop_remove(DDI_DEV_T_NONE, dip, "#msix-request");
@@ -889,9 +882,10 @@ xge_add_intrs(xgelldev_t *lldev)
xge_hal_fifo_config_t *fifo_conf = &hal_conf->fifo;
xge_list_t *item;
int msix_idx = 1; /* 0 by default is reserved for Alarms. */
- xge_hal_channel_t *assigned[XGELL_MAX_RING_DEFAULT +
- XGELL_MAX_FIFO_DEFAULT + 1];
+ xge_hal_channel_t *assigned[XGELL_RX_RING_NUM_MAX +
+ XGELL_TX_RING_NUM_MAX + 1];
+ xge_assert(lldev->intr_table != NULL);
switch (lldev->intr_type) {
case DDI_INTR_TYPE_FIXED:
ret = ddi_intr_add_handler(lldev->intr_table[0],
@@ -1054,6 +1048,8 @@ xge_rem_intrs(xgelldev_t *lldev)
{
int i;
+ xge_assert(lldev->intr_table != NULL);
+
/* Call ddi_intr_remove_handler() */
for (i = 0; i < lldev->intr_cnt; i++) {
(void) ddi_intr_remove_handler(lldev->intr_table[i]);
@@ -1079,11 +1075,11 @@ static int
xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd)
{
xgelldev_t *ll;
+ xgell_config_t *xgell_config;
xge_hal_device_config_t *device_config;
xge_hal_device_t *hldev;
xge_hal_device_attr_t attr;
xge_hal_status_e status;
- xgell_config_t ll_config;
int ret, intr_types, i;
xge_debug_osdep(XGE_TRACE, "XGE_ATTACH cmd %d", cmd);
@@ -1104,10 +1100,13 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd)
goto _exit0;
}
+ xgell_config = kmem_zalloc(sizeof (xgell_config_t), KM_SLEEP);
device_config = kmem_zalloc(sizeof (xge_hal_device_config_t), KM_SLEEP);
- /* Init device_config by lookup up properties from .conf file */
- xge_configuration_init(dev_info, device_config, &ll_config);
+ /*
+ * Initialize all configurations
+ */
+ xge_configuration_init(dev_info, device_config, xgell_config);
/* Determine which types of interrupts supported */
ret = ddi_intr_get_supported_types(dev_info, &intr_types);
@@ -1161,7 +1160,34 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd)
goto _exit3;
}
- if (ll_config.msix_enable && intr_types & DDI_INTR_TYPE_MSIX) {
+ /*
+ * Init multiple rings configuration
+ */
+ switch (xgell_config->grouping) {
+ case XGELL_CONF_GROUP_POLICY_VIRT:
+ ll->init_rx_rings = XGELL_RX_RING_NUM_MAX; /* 8 */
+ ll->init_tx_rings = XGELL_TX_RING_NUM_MAX; /* 8 */
+ ll->init_rx_groups = ll->init_rx_rings;
+ break;
+ case XGELL_CONF_GROUP_POLICY_PERF:
+ ll->init_rx_rings = XGELL_RX_RING_NUM_MAX; /* 8 */
+ ll->init_tx_rings = XGELL_TX_RING_NUM_MAX; /* 8 */
+ ll->init_rx_groups = 1;
+ break;
+ case XGELL_CONF_GROUP_POLICY_BASIC:
+ ll->init_rx_rings = XGELL_RX_RING_NUM_MIN; /* 1 */
+ ll->init_tx_rings = XGELL_TX_RING_NUM_MIN; /* 1 */
+ ll->init_rx_groups = ll->init_rx_rings;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ /*
+ * Init MSI-X configuration
+ */
+ if (xgell_config->msix_enable && intr_types & DDI_INTR_TYPE_MSIX) {
ll->intr_type = DDI_INTR_TYPE_MSIX;
ll->intr_cnt = 1;
for (i = 0; i < XGE_HAL_MAX_FIFO_NUM; i++)
@@ -1175,9 +1201,12 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd)
ll->intr_cnt = 1;
}
+ /*
+ * Allocate interrupt(s)
+ */
while ((ret = xge_alloc_intrs(ll)) != DDI_SUCCESS) {
if (ll->intr_type == DDI_INTR_TYPE_MSIX) {
- ll_config.msix_enable = 0;
+ xgell_config->msix_enable = 0;
ll->intr_type = DDI_INTR_TYPE_FIXED;
ll->intr_cnt = 1;
device_config->intr_mode = XGE_HAL_INTR_MODE_IRQLINE;
@@ -1231,7 +1260,7 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd)
goto _exit4;
/* allocate and register Link Layer */
- ret = xgell_device_register(ll, &ll_config);
+ ret = xgell_device_register(ll, xgell_config);
if (ret != DDI_SUCCESS) {
goto _exit5;
}
@@ -1240,6 +1269,7 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd)
xge_hal_device_private_set(hldev, ll);
kmem_free(device_config, sizeof (xge_hal_device_config_t));
+ kmem_free(xgell_config, sizeof (xgell_config_t));
return (DDI_SUCCESS);
@@ -1263,6 +1293,7 @@ _exit1:
ddi_regs_map_free(&attr.regh0);
_exit0a:
kmem_free(device_config, sizeof (xge_hal_device_config_t));
+ kmem_free(xgell_config, sizeof (xgell_config_t));
_exit0:
return (ret);
}
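
Both configuration structures are now kmem_zalloc()'d and freed on every exit path; xgell_config_t in particular replaces the old on-stack ll_config, presumably to keep xge_attach()'s kernel stack footprint small. The pairing, in outline:

    xgell_config = kmem_zalloc(sizeof (xgell_config_t), KM_SLEEP);
    /* ... xge_configuration_init(), xgell_device_register() ... */
    kmem_free(xgell_config, sizeof (xgell_config_t)); /* success and error paths */
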
@@ -1298,7 +1329,7 @@ xge_quiesce(dev_info_t *dev_info)
* This function is called by OS when the system is about
* to shutdown or when the super user tries to unload
* the driver. This function frees all the memory allocated
- * during xge_attch() and also unregisters the Xframe
+ * during xge_attach() and also unregisters the Xframe
* device instance from the GLD framework.
*/
static int
diff --git a/usr/src/uts/common/io/xge/drv/xge_osdep.h b/usr/src/uts/common/io/xge/drv/xge_osdep.h
index 18923972ee..4b09b0f983 100644
--- a/usr/src/uts/common/io/xge/drv/xge_osdep.h
+++ b/usr/src/uts/common/io/xge/drv/xge_osdep.h
@@ -37,8 +37,6 @@
#ifndef _SYS_XGE_OSDEP_H
#define _SYS_XGE_OSDEP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/varargs.h>
diff --git a/usr/src/uts/common/io/xge/drv/xgell.c b/usr/src/uts/common/io/xge/drv/xgell.c
index 85db35ddcc..4ec1117750 100644
--- a/usr/src/uts/common/io/xge/drv/xgell.c
+++ b/usr/src/uts/common/io/xge/drv/xgell.c
@@ -24,10 +24,8 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
- * Copyright (c) 2002-2005 Neterion, Inc.
+ * Copyright (c) 2002-2008 Neterion, Inc.
* All right Reserved.
*
* FileName : xgell.c
@@ -100,9 +98,7 @@ static int xgell_m_start(void *);
static void xgell_m_stop(void *);
static int xgell_m_promisc(void *, boolean_t);
static int xgell_m_multicst(void *, boolean_t, const uint8_t *);
-static int xgell_m_unicst(void *, const uint8_t *);
static void xgell_m_ioctl(void *, queue_t *, mblk_t *);
-static mblk_t *xgell_m_tx(void *, mblk_t *);
static boolean_t xgell_m_getcapab(void *, mac_capab_t, void *);
#define XGELL_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB)
@@ -114,8 +110,7 @@ static mac_callbacks_t xgell_m_callbacks = {
xgell_m_stop,
xgell_m_promisc,
xgell_m_multicst,
- xgell_m_unicst,
- xgell_m_tx,
+ NULL,
NULL,
xgell_m_ioctl,
xgell_m_getcapab
@@ -124,7 +119,7 @@ static mac_callbacks_t xgell_m_callbacks = {
/*
* xge_device_poll
*
- * Cyclic should call me every 1s. xge_callback_event_queued should call me
+ * Timeout should call me every 1s. xge_callback_event_queued should call me
* when HAL hope event was rescheduled.
*/
/*ARGSUSED*/
@@ -194,32 +189,34 @@ xgell_callback_link_down(void *userdata)
* xgell_rx_buffer_replenish_all
*
* To replenish all freed dtr(s) with buffers in free pool. It's called by
- * xgell_rx_buffer_recycle() or xgell_rx_1b_compl().
+ * xgell_rx_buffer_recycle() or xgell_rx_1b_callback().
* Must be called with pool_lock held.
*/
static void
-xgell_rx_buffer_replenish_all(xgell_ring_t *ring)
+xgell_rx_buffer_replenish_all(xgell_rx_ring_t *ring)
{
+ xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool;
xge_hal_dtr_h dtr;
xgell_rx_buffer_t *rx_buffer;
xgell_rxd_priv_t *rxd_priv;
- xge_assert(mutex_owned(&ring->bf_pool.pool_lock));
+ xge_assert(mutex_owned(&bf_pool->pool_lock));
+
+ while ((bf_pool->free > 0) &&
+ (xge_hal_ring_dtr_reserve(ring->channelh, &dtr) == XGE_HAL_OK)) {
+ xge_assert(bf_pool->head);
- while ((ring->bf_pool.free > 0) &&
- (xge_hal_ring_dtr_reserve(ring->channelh, &dtr) ==
- XGE_HAL_OK)) {
- rx_buffer = ring->bf_pool.head;
- ring->bf_pool.head = rx_buffer->next;
- ring->bf_pool.free--;
+ rx_buffer = bf_pool->head;
+
+ bf_pool->head = rx_buffer->next;
+ bf_pool->free--;
- xge_assert(rx_buffer);
xge_assert(rx_buffer->dma_addr);
rxd_priv = (xgell_rxd_priv_t *)
xge_hal_ring_dtr_private(ring->channelh, dtr);
xge_hal_ring_dtr_1b_set(dtr, rx_buffer->dma_addr,
- ring->bf_pool.size);
+ bf_pool->size);
rxd_priv->rx_buffer = rx_buffer;
xge_hal_ring_dtr_post(ring->channelh, dtr);
@@ -235,15 +232,16 @@ xgell_rx_buffer_replenish_all(xgell_ring_t *ring)
static void
xgell_rx_buffer_release(xgell_rx_buffer_t *rx_buffer)
{
- xgell_ring_t *ring = rx_buffer->ring;
+ xgell_rx_ring_t *ring = rx_buffer->ring;
+ xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool;
- xge_assert(mutex_owned(&ring->bf_pool.pool_lock));
+ xge_assert(mutex_owned(&bf_pool->pool_lock));
/* Put the buffer back to pool */
- rx_buffer->next = ring->bf_pool.head;
- ring->bf_pool.head = rx_buffer;
+ rx_buffer->next = bf_pool->head;
+ bf_pool->head = rx_buffer;
- ring->bf_pool.free++;
+ bf_pool->free++;
}
/*
@@ -266,7 +264,7 @@ static void
xgell_rx_buffer_recycle(char *arg)
{
xgell_rx_buffer_t *rx_buffer = (xgell_rx_buffer_t *)arg;
- xgell_ring_t *ring = rx_buffer->ring;
+ xgell_rx_ring_t *ring = rx_buffer->ring;
xgelldev_t *lldev = ring->lldev;
xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool;
@@ -282,18 +280,17 @@ xgell_rx_buffer_recycle(char *arg)
* Before finding a good way to set this hiwat, just always call to
* replenish_all. *TODO*
*/
- if ((lldev->is_initialized != 0) &&
+ if ((lldev->is_initialized != 0) && (ring->live) &&
(bf_pool->recycle >= XGELL_RX_BUFFER_RECYCLE_CACHE)) {
- if (mutex_tryenter(&bf_pool->pool_lock)) {
- bf_pool->recycle_tail->next = bf_pool->head;
- bf_pool->head = bf_pool->recycle_head;
- bf_pool->recycle_head = bf_pool->recycle_tail = NULL;
- bf_pool->post -= bf_pool->recycle;
- bf_pool->free += bf_pool->recycle;
- bf_pool->recycle = 0;
- xgell_rx_buffer_replenish_all(ring);
- mutex_exit(&bf_pool->pool_lock);
- }
+ mutex_enter(&bf_pool->pool_lock);
+ bf_pool->recycle_tail->next = bf_pool->head;
+ bf_pool->head = bf_pool->recycle_head;
+ bf_pool->recycle_head = bf_pool->recycle_tail = NULL;
+ bf_pool->post -= bf_pool->recycle;
+ bf_pool->free += bf_pool->recycle;
+ bf_pool->recycle = 0;
+ xgell_rx_buffer_replenish_all(ring);
+ mutex_exit(&bf_pool->pool_lock);
}
mutex_exit(&bf_pool->recycle_lock);
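
The recycle path takes its locks in a single fixed order (recycle_lock, then pool_lock, per this hunk), so replacing the old best-effort mutex_tryenter() with a blocking mutex_enter() cannot introduce a deadlock; the added ring->live test keeps the splice from touching a ring that is shutting down. The locking shape:

    mutex_enter(&bf_pool->recycle_lock);
    mutex_enter(&bf_pool->pool_lock);   /* nested, fixed order */
    /* splice the recycle list into the free list; replenish descriptors */
    mutex_exit(&bf_pool->pool_lock);
    mutex_exit(&bf_pool->recycle_lock);
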
@@ -306,8 +303,10 @@ xgell_rx_buffer_recycle(char *arg)
* Return NULL if failed.
*/
static xgell_rx_buffer_t *
-xgell_rx_buffer_alloc(xgell_ring_t *ring)
+xgell_rx_buffer_alloc(xgell_rx_ring_t *ring)
{
+ xgelldev_t *lldev = ring->lldev;
+ xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool;
xge_hal_device_t *hldev;
void *vaddr;
ddi_dma_handle_t dma_handle;
@@ -318,7 +317,6 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring)
size_t real_size;
extern ddi_device_acc_attr_t *p_xge_dev_attr;
xgell_rx_buffer_t *rx_buffer;
- xgelldev_t *lldev = ring->lldev;
hldev = (xge_hal_device_t *)lldev->devh;
@@ -330,7 +328,7 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring)
}
/* reserve some space at the end of the buffer for recycling */
- if (ddi_dma_mem_alloc(dma_handle, HEADROOM + ring->bf_pool.size +
+ if (ddi_dma_mem_alloc(dma_handle, HEADROOM + bf_pool->size +
sizeof (xgell_rx_buffer_t), p_xge_dev_attr, DDI_DMA_STREAMING,
DDI_DMA_SLEEP, 0, (caddr_t *)&vaddr, &real_size, &dma_acch) !=
DDI_SUCCESS) {
@@ -339,7 +337,7 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring)
goto mem_failed;
}
- if (HEADROOM + ring->bf_pool.size + sizeof (xgell_rx_buffer_t) >
+ if (HEADROOM + bf_pool->size + sizeof (xgell_rx_buffer_t) >
real_size) {
xge_debug_ll(XGE_ERR, "%s%d: can not allocate DMA-able memory",
XGELL_IFNAME, lldev->instance);
@@ -347,14 +345,14 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring)
}
if (ddi_dma_addr_bind_handle(dma_handle, NULL, (char *)vaddr + HEADROOM,
- ring->bf_pool.size, DDI_DMA_READ | DDI_DMA_STREAMING,
+ bf_pool->size, DDI_DMA_READ | DDI_DMA_STREAMING,
DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_SUCCESS) {
xge_debug_ll(XGE_ERR, "%s%d: out of mapping for mblk",
XGELL_IFNAME, lldev->instance);
goto bind_failed;
}
- if (ncookies != 1 || dma_cookie.dmac_size < ring->bf_pool.size) {
+ if (ncookies != 1 || dma_cookie.dmac_size < bf_pool->size) {
xge_debug_ll(XGE_ERR, "%s%d: can not handle partial DMA",
XGELL_IFNAME, lldev->instance);
goto check_failed;
@@ -393,64 +391,77 @@ handle_failed:
* Destroy buffer pool. If there is still any buffer hold by upper layer,
* recorded by bf_pool.post, return DDI_FAILURE to reject to be unloaded.
*/
-static int
-xgell_rx_destroy_buffer_pool(xgell_ring_t *ring)
+static boolean_t
+xgell_rx_destroy_buffer_pool(xgell_rx_ring_t *ring)
{
+ xgelldev_t *lldev = ring->lldev;
+ xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool;
xgell_rx_buffer_t *rx_buffer;
ddi_dma_handle_t dma_handle;
ddi_acc_handle_t dma_acch;
- xgelldev_t *lldev = ring->lldev;
int i;
- if (ring->bf_pool.recycle > 0) {
- ring->bf_pool.recycle_tail->next = ring->bf_pool.head;
- ring->bf_pool.head = ring->bf_pool.recycle_head;
- ring->bf_pool.recycle_tail =
- ring->bf_pool.recycle_head = NULL;
- ring->bf_pool.post -= ring->bf_pool.recycle;
- ring->bf_pool.free += ring->bf_pool.recycle;
- ring->bf_pool.recycle = 0;
+ /*
+	 * If the pool has already been destroyed, just return B_TRUE
+ */
+ if (!bf_pool->live)
+ return (B_TRUE);
+
+ mutex_enter(&bf_pool->recycle_lock);
+ if (bf_pool->recycle > 0) {
+ mutex_enter(&bf_pool->pool_lock);
+ bf_pool->recycle_tail->next = bf_pool->head;
+ bf_pool->head = bf_pool->recycle_head;
+ bf_pool->recycle_tail = bf_pool->recycle_head = NULL;
+ bf_pool->post -= bf_pool->recycle;
+ bf_pool->free += bf_pool->recycle;
+ bf_pool->recycle = 0;
+ mutex_exit(&bf_pool->pool_lock);
}
+ mutex_exit(&bf_pool->recycle_lock);
/*
* If there is any posted buffer, the driver should reject to be
* detached. Need notice upper layer to release them.
*/
- if (ring->bf_pool.post != 0) {
+ if (bf_pool->post != 0) {
xge_debug_ll(XGE_ERR,
"%s%d has some buffers not be recycled, try later!",
XGELL_IFNAME, lldev->instance);
- return (DDI_FAILURE);
+ return (B_FALSE);
}
/*
- * Relase buffers one by one.
+ * Release buffers one by one.
*/
- for (i = ring->bf_pool.total; i > 0; i--) {
- rx_buffer = ring->bf_pool.head;
+ for (i = bf_pool->total; i > 0; i--) {
+ rx_buffer = bf_pool->head;
xge_assert(rx_buffer != NULL);
- ring->bf_pool.head = rx_buffer->next;
+ bf_pool->head = rx_buffer->next;
dma_handle = rx_buffer->dma_handle;
dma_acch = rx_buffer->dma_acch;
if (ddi_dma_unbind_handle(dma_handle) != DDI_SUCCESS) {
- xge_debug_ll(XGE_ERR, "%s",
- "failed to unbind DMA handle!");
- ring->bf_pool.head = rx_buffer;
- return (DDI_FAILURE);
+ xge_debug_ll(XGE_ERR, "failed to unbind DMA handle!");
+ bf_pool->head = rx_buffer;
+ return (B_FALSE);
}
ddi_dma_mem_free(&dma_acch);
ddi_dma_free_handle(&dma_handle);
- ring->bf_pool.total--;
- ring->bf_pool.free--;
+ bf_pool->total--;
+ bf_pool->free--;
}
- mutex_destroy(&ring->bf_pool.recycle_lock);
- mutex_destroy(&ring->bf_pool.pool_lock);
- return (DDI_SUCCESS);
+ xge_assert(!mutex_owned(&bf_pool->pool_lock));
+
+ mutex_destroy(&bf_pool->recycle_lock);
+ mutex_destroy(&bf_pool->pool_lock);
+ bf_pool->live = B_FALSE;
+
+ return (B_TRUE);
}
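
The new bf_pool->live flag makes pool teardown idempotent and lets creation be skipped when the pool already exists; both entry points reduce to the same guard:

    /* destroy: nothing to do if the pool is already gone */
    if (!bf_pool->live)
        return (B_TRUE);
    /* ... tear down, then ... */
    bf_pool->live = B_FALSE;

    /* create: nothing to do if the pool is already up */
    if (bf_pool->live)
        return (B_TRUE);
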
/*
@@ -458,29 +469,34 @@ xgell_rx_destroy_buffer_pool(xgell_ring_t *ring)
*
* Initialize RX buffer pool for all RX rings. Refer to rx_buffer_pool_t.
*/
-static int
-xgell_rx_create_buffer_pool(xgell_ring_t *ring)
+static boolean_t
+xgell_rx_create_buffer_pool(xgell_rx_ring_t *ring)
{
+ xgelldev_t *lldev = ring->lldev;
+ xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool;
xge_hal_device_t *hldev;
xgell_rx_buffer_t *rx_buffer;
- xgelldev_t *lldev = ring->lldev;
int i;
+ if (bf_pool->live)
+ return (B_TRUE);
+
hldev = (xge_hal_device_t *)lldev->devh;
- ring->bf_pool.total = 0;
- ring->bf_pool.size = XGELL_MAX_FRAME_SIZE(hldev);
- ring->bf_pool.head = NULL;
- ring->bf_pool.free = 0;
- ring->bf_pool.post = 0;
- ring->bf_pool.post_hiwat = lldev->config.rx_buffer_post_hiwat;
- ring->bf_pool.recycle = 0;
- ring->bf_pool.recycle_head = NULL;
- ring->bf_pool.recycle_tail = NULL;
-
- mutex_init(&ring->bf_pool.pool_lock, NULL, MUTEX_DRIVER,
+ bf_pool->total = 0;
+ bf_pool->size = XGELL_MAX_FRAME_SIZE(hldev);
+ bf_pool->head = NULL;
+ bf_pool->free = 0;
+ bf_pool->post = 0;
+ bf_pool->post_hiwat = lldev->config.rx_buffer_post_hiwat;
+ bf_pool->recycle = 0;
+ bf_pool->recycle_head = NULL;
+ bf_pool->recycle_tail = NULL;
+ bf_pool->live = B_TRUE;
+
+ mutex_init(&bf_pool->pool_lock, NULL, MUTEX_DRIVER,
DDI_INTR_PRI(hldev->irqh));
- mutex_init(&ring->bf_pool.recycle_lock, NULL, MUTEX_DRIVER,
+ mutex_init(&bf_pool->recycle_lock, NULL, MUTEX_DRIVER,
DDI_INTR_PRI(hldev->irqh));
/*
@@ -491,17 +507,17 @@ xgell_rx_create_buffer_pool(xgell_ring_t *ring)
for (i = 0; i < lldev->config.rx_buffer_total; i++) {
if ((rx_buffer = xgell_rx_buffer_alloc(ring)) == NULL) {
(void) xgell_rx_destroy_buffer_pool(ring);
- return (DDI_FAILURE);
+ return (B_FALSE);
}
- rx_buffer->next = ring->bf_pool.head;
- ring->bf_pool.head = rx_buffer;
+ rx_buffer->next = bf_pool->head;
+ bf_pool->head = rx_buffer;
- ring->bf_pool.total++;
- ring->bf_pool.free++;
+ bf_pool->total++;
+ bf_pool->free++;
}
- return (DDI_SUCCESS);
+ return (B_TRUE);
}
/*
@@ -514,23 +530,26 @@ xge_hal_status_e
xgell_rx_dtr_replenish(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, int index,
void *userdata, xge_hal_channel_reopen_e reopen)
{
- xgell_ring_t *ring = userdata;
+ xgell_rx_ring_t *ring = userdata;
+ xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool;
xgell_rx_buffer_t *rx_buffer;
xgell_rxd_priv_t *rxd_priv;
- if (ring->bf_pool.head == NULL) {
- xge_debug_ll(XGE_ERR, "%s", "no more available rx DMA buffer!");
+ mutex_enter(&bf_pool->pool_lock);
+ if (bf_pool->head == NULL) {
+ xge_debug_ll(XGE_ERR, "no more available rx DMA buffer!");
return (XGE_HAL_FAIL);
}
- rx_buffer = ring->bf_pool.head;
- ring->bf_pool.head = rx_buffer->next;
- ring->bf_pool.free--;
-
+ rx_buffer = bf_pool->head;
xge_assert(rx_buffer);
xge_assert(rx_buffer->dma_addr);
+ bf_pool->head = rx_buffer->next;
+ bf_pool->free--;
+ mutex_exit(&bf_pool->pool_lock);
+
rxd_priv = (xgell_rxd_priv_t *)xge_hal_ring_dtr_private(channelh, dtr);
- xge_hal_ring_dtr_1b_set(dtr, rx_buffer->dma_addr, ring->bf_pool.size);
+ xge_hal_ring_dtr_1b_set(dtr, rx_buffer->dma_addr, bf_pool->size);
rxd_priv->rx_buffer = rx_buffer;
@@ -637,9 +656,10 @@ xgell_rx_hcksum_assoc(mblk_t *mp, char *vaddr, int pkt_length,
* new message and copy the payload in.
*/
static mblk_t *
-xgell_rx_1b_msg_alloc(xgelldev_t *lldev, xgell_rx_buffer_t *rx_buffer,
+xgell_rx_1b_msg_alloc(xgell_rx_ring_t *ring, xgell_rx_buffer_t *rx_buffer,
int pkt_length, xge_hal_dtr_info_t *ext_info, boolean_t *copyit)
{
+ xgelldev_t *lldev = ring->lldev;
mblk_t *mp;
char *vaddr;
@@ -676,24 +696,25 @@ xgell_rx_1b_msg_alloc(xgelldev_t *lldev, xgell_rx_buffer_t *rx_buffer,
}
/*
- * xgell_rx_1b_compl
+ * xgell_rx_1b_callback
*
* If the interrupt is because of a received frame or if the receive ring
* contains fresh as yet un-processed frames, this function is called.
*/
static xge_hal_status_e
-xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code,
+xgell_rx_1b_callback(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code,
void *userdata)
{
- xgell_ring_t *ring = (xgell_ring_t *)userdata;
+ xgell_rx_ring_t *ring = (xgell_rx_ring_t *)userdata;
xgelldev_t *lldev = ring->lldev;
xgell_rx_buffer_t *rx_buffer;
mblk_t *mp_head = NULL;
mblk_t *mp_end = NULL;
int pkt_burst = 0;
- mutex_enter(&ring->bf_pool.pool_lock);
+ xge_debug_ll(XGE_TRACE, "xgell_rx_1b_callback on ring %d", ring->index);
+ mutex_enter(&ring->bf_pool.pool_lock);
do {
int pkt_length;
dma_addr_t dma_data;
@@ -744,7 +765,7 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code,
copyit = B_FALSE;
}
- mp = xgell_rx_1b_msg_alloc(lldev, rx_buffer, pkt_length,
+ mp = xgell_rx_1b_msg_alloc(ring, rx_buffer, pkt_length,
&ext_info, &copyit);
xge_hal_ring_dtr_free(channelh, dtr);
@@ -771,8 +792,10 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code,
* Associate cksum_flags per packet type and h/w
* cksum flags.
*/
- xgell_rx_hcksum_assoc(mp, (char *)rx_buffer->vaddr +
- HEADROOM, pkt_length, &ext_info);
+ xgell_rx_hcksum_assoc(mp, (char *)rx_buffer->vaddr + HEADROOM,
+ pkt_length, &ext_info);
+
+ ring->received_bytes += pkt_length;
if (mp_head == NULL) {
mp_head = mp;
@@ -782,6 +805,26 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code,
mp_end = mp;
}
+ /*
+ * Inline implementation of the polling path.
+ */
+ if ((ring->poll_mp == NULL) && (ring->poll_bytes > 0)) {
+ ring->poll_mp = mp_head;
+ }
+ if (ring->poll_mp != NULL) {
+ if ((ring->poll_bytes -= pkt_length) <= 0) {
+ /* have polled enough packets. */
+ break;
+ } else {
+ /* continue polling packets. */
+ continue;
+ }
+ }
+
+ /*
+ * We're not in polling mode, so try to chain more messages
+ * or send the chain up according to pkt_burst.
+ */
if (++pkt_burst < lldev->config.rx_pkt_burst)
continue;
@@ -791,8 +834,8 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code,
}
mutex_exit(&ring->bf_pool.pool_lock);
if (mp_head != NULL) {
- mac_rx(lldev->mh, ((xgell_ring_t *)userdata)->handle,
- mp_head);
+ mac_rx_ring(lldev->mh, ring->ring_handle, mp_head,
+ ring->ring_gen_num);
}
mp_head = mp_end = NULL;
pkt_burst = 0;
@@ -807,13 +850,39 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code,
xgell_rx_buffer_replenish_all(ring);
mutex_exit(&ring->bf_pool.pool_lock);
- if (mp_head != NULL) {
- mac_rx(lldev->mh, ((xgell_ring_t *)userdata)->handle, mp_head);
+ /*
+ * If we're not in a polling cycle, deliver the chain with
+ * mac_rx_ring(); otherwise just return, leaving the packets
+ * chained to ring->poll_mp.
+ */
+ if ((ring->poll_mp == NULL) && (mp_head != NULL)) {
+ mac_rx_ring(lldev->mh, ring->ring_handle, mp_head,
+ ring->ring_gen_num);
}
return (XGE_HAL_OK);
}
+mblk_t *
+xgell_rx_poll(void *arg, int bytes_to_pickup)
+{
+ xgell_rx_ring_t *ring = (xgell_rx_ring_t *)arg;
+ int got_rx = 0;
+ mblk_t *mp;
+
+ xge_debug_ll(XGE_TRACE, "xgell_rx_poll on ring %d", ring->index);
+
+ ring->poll_mp = NULL;
+ ring->poll_bytes = bytes_to_pickup;
+ (void) xge_hal_device_poll_rx_channel(ring->channelh, &got_rx);
+
+ mp = ring->poll_mp;
+ ring->poll_bytes = -1;
+ ring->polled_bytes += got_rx;
+ ring->poll_mp = NULL;
+
+ return (mp);
+}
+
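Together with the interrupt enable/disable entry points below, xgell_rx_poll()
completes the poll-mode handshake with the MAC layer. A minimal sketch of one
poll cycle, using only entry points defined in this patch (the caller context
and the 64KB byte budget are illustrative):

	/* MAC layer takes the ring out of interrupt mode. */
	(void) xgell_rx_ring_intr_disable((mac_intr_handle_t)ring);

	/* Pick up at most 64KB of chained packets from the ring. */
	mblk_t *mp = xgell_rx_poll(ring, 65536);

	/* ... process the chain ... */

	/* Return to interrupt mode once the backlog is drained. */
	(void) xgell_rx_ring_intr_enable((mac_intr_handle_t)ring);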
/*
* xgell_xmit_compl
*
@@ -826,8 +895,8 @@ static xge_hal_status_e
xgell_xmit_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code,
void *userdata)
{
- xgell_fifo_t *fifo = (xgell_fifo_t *)userdata;
- xgelldev_t *lldev = fifo->lldev;
+ xgell_tx_ring_t *ring = userdata;
+ xgelldev_t *lldev = ring->lldev;
do {
xgell_txd_priv_t *txd_priv = ((xgell_txd_priv_t *)
@@ -861,58 +930,36 @@ xgell_xmit_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code,
txd_priv->mblk = NULL;
}
- lldev->resched_avail++;
-
} while (xge_hal_fifo_dtr_next_completed(channelh, &dtr, &t_code) ==
XGE_HAL_OK);
- if (lldev->resched_retry &&
- xge_queue_produce_context(xge_hal_device_queue(lldev->devh),
- XGELL_EVENT_RESCHED_NEEDED, fifo) == XGE_QUEUE_OK) {
- xge_debug_ll(XGE_TRACE, "%s%d: IRQ produced event for queue %d",
- XGELL_IFNAME, lldev->instance,
- ((xge_hal_channel_t *)channelh)->post_qid);
- lldev->resched_send = lldev->resched_avail;
- lldev->resched_retry = 0;
- }
+ if (ring->need_resched)
+ mac_tx_ring_update(lldev->mh, ring->ring_handle);
return (XGE_HAL_OK);
}
-/*
- * xgell_send
- * @hldev: pointer to xge_hal_device_t strucutre
- * @mblk: pointer to network buffer, i.e. mblk_t structure
- *
- * Called by the xgell_m_tx to transmit the packet to the XFRAME firmware.
- * A pointer to an M_DATA message that contains the packet is passed to
- * this routine.
- */
-static boolean_t
-xgell_send(xgelldev_t *lldev, mblk_t *mp)
+mblk_t *
+xgell_ring_tx(void *arg, mblk_t *mp)
{
+ xgell_tx_ring_t *ring = (xgell_tx_ring_t *)arg;
mblk_t *bp;
- boolean_t retry;
+ xgelldev_t *lldev = ring->lldev;
xge_hal_device_t *hldev = lldev->devh;
xge_hal_status_e status;
xge_hal_dtr_h dtr;
xgell_txd_priv_t *txd_priv;
uint32_t hckflags;
+ uint32_t lsoflags;
uint32_t mss;
int handle_cnt, frag_cnt, ret, i, copied;
boolean_t used_copy;
- xgell_fifo_t *fifo;
- xge_hal_channel_h fifo_channel;
_begin:
- retry = B_FALSE;
handle_cnt = frag_cnt = 0;
if (!lldev->is_initialized || lldev->in_reset)
- return (B_FALSE);
-
- fifo = &lldev->fifos[0];
- fifo_channel = fifo->channelh;
+ return (mp);
/*
* If the free Tx dtrs count reaches the lower threshold,
@@ -921,23 +968,17 @@ _begin:
* gld through gld_sched call, when the free dtrs count exceeds
* the higher threshold.
*/
- if (xge_hal_channel_dtr_count(fifo_channel)
+ if (xge_hal_channel_dtr_count(ring->channelh)
<= XGELL_TX_LEVEL_LOW) {
- if (++fifo->level_low > XGELL_TX_LEVEL_CHECK) {
- xge_debug_ll(XGE_TRACE, "%s%d: queue %d: err on xmit,"
- "free descriptors count at low threshold %d",
- XGELL_IFNAME, lldev->instance,
- ((xge_hal_channel_t *)fifo_channel)->post_qid,
- XGELL_TX_LEVEL_LOW);
- fifo->level_low = 0;
- retry = B_TRUE;
- goto _exit;
- }
- } else {
- fifo->level_low = 0;
+ xge_debug_ll(XGE_TRACE, "%s%d: queue %d: err on xmit,"
+ "free descriptors count at low threshold %d",
+ XGELL_IFNAME, lldev->instance,
+ ((xge_hal_channel_t *)ring->channelh)->post_qid,
+ XGELL_TX_LEVEL_LOW);
+ goto _exit;
}
- status = xge_hal_fifo_dtr_reserve(fifo_channel, &dtr);
+ status = xge_hal_fifo_dtr_reserve(ring->channelh, &dtr);
if (status != XGE_HAL_OK) {
switch (status) {
case XGE_HAL_INF_CHANNEL_IS_NOT_READY:
@@ -945,19 +986,17 @@ _begin:
"%s%d: channel %d is not ready.", XGELL_IFNAME,
lldev->instance,
((xge_hal_channel_t *)
- fifo_channel)->post_qid);
- retry = B_TRUE;
+ ring->channelh)->post_qid);
goto _exit;
case XGE_HAL_INF_OUT_OF_DESCRIPTORS:
xge_debug_ll(XGE_TRACE, "%s%d: queue %d: error in xmit,"
" out of descriptors.", XGELL_IFNAME,
lldev->instance,
((xge_hal_channel_t *)
- fifo_channel)->post_qid);
- retry = B_TRUE;
+ ring->channelh)->post_qid);
goto _exit;
default:
- return (B_FALSE);
+ return (mp);
}
}
@@ -1002,6 +1041,8 @@ _begin:
continue;
}
+ ring->sent_bytes += mblen;
+
/*
* Check the message length to decide to DMA or bcopy() data
* to tx descriptor(s).
@@ -1009,7 +1050,7 @@ _begin:
if (mblen < lldev->config.tx_dma_lowat &&
(copied + mblen) < lldev->tx_copied_max) {
xge_hal_status_e rc;
- rc = xge_hal_fifo_dtr_buffer_append(fifo_channel,
+ rc = xge_hal_fifo_dtr_buffer_append(ring->channelh,
dtr, bp->b_rptr, mblen);
if (rc == XGE_HAL_OK) {
used_copy = B_TRUE;
@@ -1017,11 +1058,11 @@ _begin:
continue;
} else if (used_copy) {
xge_hal_fifo_dtr_buffer_finalize(
- fifo_channel, dtr, frag_cnt++);
+ ring->channelh, dtr, frag_cnt++);
used_copy = B_FALSE;
}
} else if (used_copy) {
- xge_hal_fifo_dtr_buffer_finalize(fifo_channel,
+ xge_hal_fifo_dtr_buffer_finalize(ring->channelh,
dtr, frag_cnt++);
used_copy = B_FALSE;
}
@@ -1075,7 +1116,7 @@ _begin:
/* setup the descriptors for this data buffer */
while (ncookies) {
- xge_hal_fifo_dtr_buffer_set(fifo_channel, dtr,
+ xge_hal_fifo_dtr_buffer_set(ring->channelh, dtr,
frag_cnt++, dma_cookie.dmac_laddress,
dma_cookie.dmac_size);
if (--ncookies) {
@@ -1108,7 +1149,7 @@ _begin:
/* finalize unfinished copies */
if (used_copy) {
- xge_hal_fifo_dtr_buffer_finalize(fifo_channel, dtr,
+ xge_hal_fifo_dtr_buffer_finalize(ring->channelh, dtr,
frag_cnt++);
}
@@ -1118,11 +1159,14 @@ _begin:
* If LSO is required, just call xge_hal_fifo_dtr_mss_set(dtr, mss) to
* do all necessary work.
*/
- hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, &mss, &hckflags);
- if ((hckflags & HW_LSO) && (mss != 0)) {
+ lso_info_get(mp, &mss, &lsoflags);
+
+ if (lsoflags & HW_LSO) {
+ xge_assert((mss != 0) && (mss <= XGE_HAL_DEFAULT_MTU));
xge_hal_fifo_dtr_mss_set(dtr, mss);
}
+ hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
if (hckflags & HCK_IPV4_HDRCKSUM) {
xge_hal_fifo_dtr_cksum_set_bits(dtr,
XGE_HAL_TXD_TX_CKO_IPV4_EN);
@@ -1132,63 +1176,376 @@ _begin:
XGE_HAL_TXD_TX_CKO_UDP_EN);
}
- xge_hal_fifo_dtr_post(fifo_channel, dtr);
+ xge_hal_fifo_dtr_post(ring->channelh, dtr);
- return (B_TRUE);
+ return (NULL);
_exit_cleanup:
-
+ /*
+ * The transmit failed, but the message has already been modified,
+ * so just free it and return NULL.
+ */
for (i = 0; i < handle_cnt; i++) {
(void) ddi_dma_unbind_handle(txd_priv->dma_handles[i]);
ddi_dma_free_handle(&txd_priv->dma_handles[i]);
txd_priv->dma_handles[i] = 0;
}
- xge_hal_fifo_dtr_free(fifo_channel, dtr);
+ xge_hal_fifo_dtr_free(ring->channelh, dtr);
+
+ freemsg(mp);
+ return (NULL);
_exit:
- if (retry) {
- if (lldev->resched_avail != lldev->resched_send &&
- xge_queue_produce_context(xge_hal_device_queue(lldev->devh),
- XGELL_EVENT_RESCHED_NEEDED, fifo) == XGE_QUEUE_OK) {
- lldev->resched_send = lldev->resched_avail;
- return (B_FALSE);
- } else {
- lldev->resched_retry = 1;
+ ring->need_resched = B_TRUE;
+ return (mp);
+}
+
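The return convention of xgell_ring_tx() replaces the old boolean
xgell_send() protocol: returning NULL means the frame was consumed (posted to
the fifo, or freed on an unrecoverable error), while returning the original
mblk signals back-pressure. In the latter case need_resched is set, so
xgell_xmit_compl() calls mac_tx_ring_update() once descriptors are freed and
the stack can retry. A caller-side sketch of that contract (requeue() is an
invented helper):

	mblk_t *rest = xgell_ring_tx(tx_ring, mp);

	if (rest != NULL) {
		/*
		 * The ring is out of descriptors (or the device is not
		 * ready): hold the packet and retry after the tx update.
		 */
		requeue(rest);
	}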
+/*
+ * xgell_ring_macaddr_init
+ */
+static void
+xgell_rx_ring_maddr_init(xgell_rx_ring_t *ring)
+{
+ int i;
+ xgelldev_t *lldev = ring->lldev;
+ xge_hal_device_t *hldev = lldev->devh;
+ int slot_start;
+
+ xge_debug_ll(XGE_TRACE, "%s", "xgell_rx_ring_maddr_init");
+
+ ring->mmac.naddr = XGE_RX_MULTI_MAC_ADDRESSES_MAX;
+ ring->mmac.naddrfree = ring->mmac.naddr;
+
+ /*
+ * For the default rx ring, the first MAC address is the factory one.
+ * This will be set by the framework, so we need to clear it for now.
+ */
+ (void) xge_hal_device_macaddr_clear(hldev, 0);
+
+ /*
+ * Read the MAC Address Configuration Memory from the HAL. The first
+ * slot holds the factory MAC address; the contents of all other
+ * slots are FF:FF:FF:FF:FF:FF.
+ */
+ slot_start = ring->index * 32;
+ for (i = 0; i < ring->mmac.naddr; i++) {
+ (void) xge_hal_device_macaddr_get(hldev, slot_start + i,
+ ring->mmac.mac_addr + i);
+ ring->mmac.mac_addr_set[i] = B_FALSE;
+ }
+}
+
+static int xgell_maddr_set(xgelldev_t *, int, uint8_t *);
+
+static int
+xgell_addmac(void *arg, const uint8_t *mac_addr)
+{
+ xgell_rx_ring_t *ring = arg;
+ xgelldev_t *lldev = ring->lldev;
+ xge_hal_device_t *hldev = lldev->devh;
+ int slot;
+ int slot_start;
+
+ xge_debug_ll(XGE_TRACE, "%s", "xgell_addmac");
+
+ mutex_enter(&lldev->genlock);
+
+ if (ring->mmac.naddrfree == 0) {
+ mutex_exit(&lldev->genlock);
+ return (ENOSPC);
+ }
+
+ /* First slot is for factory MAC address */
+ for (slot = 0; slot < ring->mmac.naddr; slot++) {
+ if (ring->mmac.mac_addr_set[slot] == B_FALSE) {
+ break;
}
}
- if (mp)
- freemsg(mp);
- return (B_TRUE);
+ ASSERT(slot < ring->mmac.naddr);
+
+ slot_start = ring->index * 32;
+
+ if (xgell_maddr_set(lldev, slot_start + slot, (uint8_t *)mac_addr) !=
+ 0) {
+ mutex_exit(&lldev->genlock);
+ return (EIO);
+ }
+
+ /* Simply enable RTS for the whole section. */
+ (void) xge_hal_device_rts_section_enable(hldev, slot_start + slot);
+
+ /*
+ * Read back the MAC address from HAL to keep the array up to date.
+ */
+ if (xge_hal_device_macaddr_get(hldev, slot_start + slot,
+ ring->mmac.mac_addr + slot) != XGE_HAL_OK) {
+ (void) xge_hal_device_macaddr_clear(hldev, slot_start + slot);
+ return (EIO);
+ }
+
+ ring->mmac.mac_addr_set[slot] = B_TRUE;
+ ring->mmac.naddrfree--;
+
+ mutex_exit(&lldev->genlock);
+
+ return (0);
+}
+
+static int
+xgell_remmac(void *arg, const uint8_t *mac_addr)
+{
+ xgell_rx_ring_t *ring = arg;
+ xgelldev_t *lldev = ring->lldev;
+ xge_hal_device_t *hldev = lldev->devh;
+ xge_hal_status_e status;
+ int slot;
+ int slot_start;
+
+ xge_debug_ll(XGE_TRACE, "%s", "xgell_remmac");
+
+ slot = xge_hal_device_macaddr_find(hldev, (uint8_t *)mac_addr);
+ if (slot == -1)
+ return (EINVAL);
+
+ slot_start = ring->index * 32;
+
+ /*
+ * Adjust slot to the offset in the MAC array of this ring (group).
+ */
+ slot -= slot_start;
+
+ /*
+ * Can only remove a pre-set MAC address for this ring (group).
+ */
+ if (slot < 0 || slot >= ring->mmac.naddr)
+ return (EINVAL);
+
+
+ xge_assert(ring->mmac.mac_addr_set[slot]);
+
+ mutex_enter(&lldev->genlock);
+ if (!ring->mmac.mac_addr_set[slot]) {
+ mutex_exit(&lldev->genlock);
+ /*
+ * WARNING! Reaching here is unexpected and indicates an inconsistency.
+ */
+ xge_debug_ll(XGE_ERR,
+ "%s%d: caller is trying to remove an unset MAC address",
+ XGELL_IFNAME, lldev->instance);
+ return (ENXIO);
+ }
+
+ status = xge_hal_device_macaddr_clear(hldev, slot_start + slot);
+ if (status != XGE_HAL_OK) {
+ mutex_exit(&lldev->genlock);
+ return (EIO);
+ }
+
+ ring->mmac.mac_addr_set[slot] = B_FALSE;
+ ring->mmac.naddrfree++;
+
+ /*
+ * TODO: Disable MAC RTS if all addresses have been cleared.
+ */
+
+ /*
+ * Read back the MAC address from HAL to keep the array up to date.
+ */
+ (void) xge_hal_device_macaddr_get(hldev, slot_start + slot,
+ ring->mmac.mac_addr + slot);
+ mutex_exit(&lldev->genlock);
+
+ return (0);
}
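Both xgell_addmac() and xgell_remmac() assume the same slot layout: ring
(group) i owns a window of 32 consecutive slots in the MAC Address
Configuration Memory, of which only the first XGE_RX_MULTI_MAC_ADDRESSES_MAX
(8) are used. A sketch of the mapping:

	/*
	 * Slot window per group (sketch):
	 *   group 0: slots  0 ..  7 used, window  0 .. 31
	 *   group 1: slots 32 .. 39 used, window 32 .. 63
	 *   ...
	 */
	int slot_start = ring->index * 32;	/* first slot of the window */
	int hal_slot = slot_start + slot;	/* HAL slot for group slot */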
/*
- * xge_m_tx
- * @arg: pointer to the xgelldev_t structure
- * @resid: resource id
- * @mp: pointer to the message buffer
+ * Temporarily call the HAL functions directly.
*
- * Called by MAC Layer to send a chain of packets
+ * With the MSI-X implementation, no lock is needed, so interrupt
+ * handling can be faster.
*/
-static mblk_t *
-xgell_m_tx(void *arg, mblk_t *mp)
+int
+xgell_rx_ring_intr_enable(mac_intr_handle_t ih)
{
- xgelldev_t *lldev = arg;
- mblk_t *next;
+ xgell_rx_ring_t *ring = (xgell_rx_ring_t *)ih;
- while (mp != NULL) {
- next = mp->b_next;
- mp->b_next = NULL;
+ mutex_enter(&ring->ring_lock);
+ xge_hal_device_rx_channel_disable_polling(ring->channelh);
+ mutex_exit(&ring->ring_lock);
- if (!xgell_send(lldev, mp)) {
- mp->b_next = next;
- break;
- }
- mp = next;
+ return (0);
+}
+
+int
+xgell_rx_ring_intr_disable(mac_intr_handle_t ih)
+{
+ xgell_rx_ring_t *ring = (xgell_rx_ring_t *)ih;
+
+ mutex_enter(&ring->ring_lock);
+ xge_hal_device_rx_channel_enable_polling(ring->channelh);
+ mutex_exit(&ring->ring_lock);
+
+ return (0);
+}
+
+static int
+xgell_rx_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+{
+ xgell_rx_ring_t *rx_ring = (xgell_rx_ring_t *)rh;
+
+ rx_ring->ring_gen_num = mr_gen_num;
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+xgell_rx_ring_stop(mac_ring_driver_t rh)
+{
+}
+
+/*ARGSUSED*/
+static int
+xgell_tx_ring_start(mac_ring_driver_t rh, uint64_t useless)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+xgell_tx_ring_stop(mac_ring_driver_t rh)
+{
+}
+
+/*
+ * Callback function for the MAC layer to register all rings.
+ *
+ * Xframe hardware doesn't support grouping explicitly, so the driver
+ * needs to pretend to have resource groups. We can optionally group
+ * all 8 rx rings into a single group for increased scalability on CMT
+ * architectures, or place one rx ring in each group for maximum
+ * virtualization.
+ *
+ * TX grouping is actually done by the framework, so just register all
+ * TX resources without grouping them.
+ */
+void
+xgell_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
+ const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+ xgelldev_t *lldev = (xgelldev_t *)arg;
+ mac_intr_t *mintr;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX: {
+ xgell_rx_ring_t *rx_ring;
+
+ xge_assert(index < lldev->init_rx_rings);
+ xge_assert(rg_index < lldev->init_rx_groups);
+
+ /*
+ * Performance vs. Virtualization
+ */
+ if (lldev->init_rx_rings == lldev->init_rx_groups)
+ rx_ring = lldev->rx_ring + rg_index;
+ else
+ rx_ring = lldev->rx_ring + index;
+
+ rx_ring->ring_handle = rh;
+
+ infop->mri_driver = (mac_ring_driver_t)rx_ring;
+ infop->mri_start = xgell_rx_ring_start;
+ infop->mri_stop = xgell_rx_ring_stop;
+ infop->mri_poll = xgell_rx_poll;
+
+ mintr = &infop->mri_intr;
+ mintr->mi_handle = (mac_intr_handle_t)rx_ring;
+ mintr->mi_enable = xgell_rx_ring_intr_enable;
+ mintr->mi_disable = xgell_rx_ring_intr_disable;
+
+ break;
}
+ case MAC_RING_TYPE_TX: {
+ xgell_tx_ring_t *tx_ring;
- return (mp);
+ xge_assert(rg_index == -1);
+
+ xge_assert((index >= 0) && (index < lldev->init_tx_rings));
+
+ tx_ring = lldev->tx_ring + index;
+ tx_ring->ring_handle = rh;
+
+ infop->mri_driver = (mac_ring_driver_t)tx_ring;
+ infop->mri_start = xgell_tx_ring_start;
+ infop->mri_stop = xgell_tx_ring_stop;
+ infop->mri_tx = xgell_ring_tx;
+
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+void
+xgell_fill_group(void *arg, mac_ring_type_t rtype, const int index,
+ mac_group_info_t *infop, mac_group_handle_t gh)
+{
+ xgelldev_t *lldev = (xgelldev_t *)arg;
+
+ switch (rtype) {
+ case MAC_RING_TYPE_RX: {
+ xgell_rx_ring_t *rx_ring;
+
+ xge_assert(index < lldev->init_rx_groups);
+
+ rx_ring = lldev->rx_ring + index;
+
+ rx_ring->group_handle = gh;
+
+ infop->mgi_driver = (mac_group_driver_t)rx_ring;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = xgell_addmac;
+ infop->mgi_remmac = xgell_remmac;
+ infop->mgi_count = lldev->init_rx_rings / lldev->init_rx_groups;
+
+ break;
+ }
+ case MAC_RING_TYPE_TX:
+ xge_assert(0);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * xgell_macaddr_set
+ */
+static int
+xgell_maddr_set(xgelldev_t *lldev, int index, uint8_t *macaddr)
+{
+ xge_hal_device_t *hldev = lldev->devh;
+ xge_hal_status_e status;
+
+ xge_debug_ll(XGE_TRACE, "%s", "xgell_maddr_set");
+
+ xge_debug_ll(XGE_TRACE,
+ "setting macaddr: 0x%02x-%02x-%02x-%02x-%02x-%02x",
+ macaddr[0], macaddr[1], macaddr[2],
+ macaddr[3], macaddr[4], macaddr[5]);
+
+ status = xge_hal_device_macaddr_set(hldev, index, (uchar_t *)macaddr);
+
+ if (status != XGE_HAL_OK) {
+ xge_debug_ll(XGE_ERR, "%s%d: can not set mac address",
+ XGELL_IFNAME, lldev->instance);
+ return (EIO);
+ }
+
+ return (0);
}
/*
@@ -1201,12 +1558,13 @@ static void
xgell_rx_dtr_term(xge_hal_channel_h channelh, xge_hal_dtr_h dtrh,
xge_hal_dtr_state_e state, void *userdata, xge_hal_channel_reopen_e reopen)
{
- xgell_ring_t *ring = (xgell_ring_t *)userdata;
xgell_rxd_priv_t *rxd_priv =
((xgell_rxd_priv_t *)xge_hal_ring_dtr_private(channelh, dtrh));
xgell_rx_buffer_t *rx_buffer = rxd_priv->rx_buffer;
if (state == XGE_HAL_DTR_STATE_POSTED) {
+ xgell_rx_ring_t *ring = rx_buffer->ring;
+
mutex_enter(&ring->bf_pool.pool_lock);
xge_hal_ring_dtr_free(channelh, dtrh);
xgell_rx_buffer_release(rx_buffer);
@@ -1215,6 +1573,137 @@ xgell_rx_dtr_term(xge_hal_channel_h channelh, xge_hal_dtr_h dtrh,
}
/*
+ * To open a rx ring.
+ */
+static boolean_t
+xgell_rx_ring_open(xgell_rx_ring_t *rx_ring)
+{
+ xge_hal_status_e status;
+ xge_hal_channel_attr_t attr;
+ xgelldev_t *lldev = rx_ring->lldev;
+ xge_hal_device_t *hldev = lldev->devh;
+
+ if (rx_ring->live)
+ return (B_TRUE);
+
+ /* Create the buffer pool first */
+ if (!xgell_rx_create_buffer_pool(rx_ring)) {
+ xge_debug_ll(XGE_ERR, "can not create buffer pool for ring: %d",
+ rx_ring->index);
+ return (B_FALSE);
+ }
+
+ /* Default ring initialization */
+ attr.post_qid = rx_ring->index;
+ attr.compl_qid = 0;
+ attr.callback = xgell_rx_1b_callback;
+ attr.per_dtr_space = sizeof (xgell_rxd_priv_t);
+ attr.flags = 0;
+ attr.type = XGE_HAL_CHANNEL_TYPE_RING;
+ attr.dtr_init = xgell_rx_dtr_replenish;
+ attr.dtr_term = xgell_rx_dtr_term;
+ attr.userdata = rx_ring;
+
+ status = xge_hal_channel_open(lldev->devh, &attr, &rx_ring->channelh,
+ XGE_HAL_CHANNEL_OC_NORMAL);
+ if (status != XGE_HAL_OK) {
+ xge_debug_ll(XGE_ERR, "%s%d: cannot open Rx channel got status "
+ " code %d", XGELL_IFNAME, lldev->instance, status);
+ (void) xgell_rx_destroy_buffer_pool(rx_ring);
+ return (B_FALSE);
+ }
+
+ xgell_rx_ring_maddr_init(rx_ring);
+
+ mutex_init(&rx_ring->ring_lock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(hldev->irqh));
+
+ rx_ring->received_bytes = 0;
+ rx_ring->poll_bytes = -1;
+ rx_ring->polled_bytes = 0;
+ rx_ring->poll_mp = NULL;
+ rx_ring->live = B_TRUE;
+
+ xge_debug_ll(XGE_TRACE, "RX ring [%d] is opened successfully",
+ rx_ring->index);
+
+ return (B_TRUE);
+}
+
+static void
+xgell_rx_ring_close(xgell_rx_ring_t *rx_ring)
+{
+ if (!rx_ring->live)
+ return;
+ xge_hal_channel_close(rx_ring->channelh, XGE_HAL_CHANNEL_OC_NORMAL);
+ rx_ring->channelh = NULL;
+ /* This may not clean up all used buffers; the driver will handle it */
+ if (xgell_rx_destroy_buffer_pool(rx_ring))
+ rx_ring->live = B_FALSE;
+
+ mutex_destroy(&rx_ring->ring_lock);
+}
+
+/*
+ * xgell_rx_open
+ * @lldev: the link layer object
+ *
+ * Initialize and open all RX channels.
+ */
+static boolean_t
+xgell_rx_open(xgelldev_t *lldev)
+{
+ xgell_rx_ring_t *rx_ring;
+ int i;
+
+ if (lldev->live_rx_rings != 0)
+ return (B_TRUE);
+
+ lldev->live_rx_rings = 0;
+
+ /*
+ * Initialize all rings
+ */
+ for (i = 0; i < lldev->init_rx_rings; i++) {
+ rx_ring = &lldev->rx_ring[i];
+ rx_ring->index = i;
+ rx_ring->lldev = lldev;
+ rx_ring->live = B_FALSE;
+
+ if (!xgell_rx_ring_open(rx_ring))
+ return (B_FALSE);
+
+ lldev->live_rx_rings++;
+ }
+
+ return (B_TRUE);
+}
+
+static void
+xgell_rx_close(xgelldev_t *lldev)
+{
+ xgell_rx_ring_t *rx_ring;
+ int i;
+
+ if (lldev->live_rx_rings == 0)
+ return;
+
+ /*
+ * Close all rx rings
+ */
+ for (i = 0; i < lldev->init_rx_rings; i++) {
+ rx_ring = &lldev->rx_ring[i];
+
+ if (rx_ring->live) {
+ xgell_rx_ring_close(rx_ring);
+ lldev->live_rx_rings--;
+ }
+ }
+
+ xge_assert(lldev->live_rx_rings == 0);
+}
+
+/*
* xgell_tx_term
*
* Function will be called by HAL to terminate all DTRs for
@@ -1252,215 +1741,105 @@ xgell_tx_term(xge_hal_channel_h channelh, xge_hal_dtr_h dtrh,
}
}
-/*
- * xgell_tx_close
- * @lldev: the link layer object
- *
- * Close all Tx channels
- */
-static void
-xgell_tx_close(xgelldev_t *lldev)
-{
- xge_list_t *item, *list;
- xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh;
-
- list = &hldev->fifo_channels;
- while (!xge_list_is_empty(list)) {
- item = xge_list_first_get(list);
- xge_hal_channel_t *channel = xge_container_of(item,
- xge_hal_channel_t, item);
-
- xge_hal_channel_close(channel, XGE_HAL_CHANNEL_OC_NORMAL);
- }
-}
-
-/*
- * xgell_tx_open
- * @lldev: the link layer object
- *
- * Initialize and open all Tx channels;
- */
static boolean_t
-xgell_tx_open(xgelldev_t *lldev)
+xgell_tx_ring_open(xgell_tx_ring_t *tx_ring)
{
xge_hal_status_e status;
- u64 adapter_status;
xge_hal_channel_attr_t attr;
- xge_list_t *item;
- xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh;
+ xgelldev_t *lldev = tx_ring->lldev;
+
+ if (tx_ring->live)
+ return (B_TRUE);
- attr.post_qid = 0;
+ attr.post_qid = tx_ring->index;
attr.compl_qid = 0;
attr.callback = xgell_xmit_compl;
attr.per_dtr_space = sizeof (xgell_txd_priv_t);
attr.flags = 0;
attr.type = XGE_HAL_CHANNEL_TYPE_FIFO;
- attr.userdata = lldev;
attr.dtr_init = NULL;
attr.dtr_term = xgell_tx_term;
+ attr.userdata = tx_ring;
- if (xge_hal_device_status(lldev->devh, &adapter_status)) {
- xge_debug_ll(XGE_ERR, "%s%d: device is not ready "
- "adaper status reads 0x%"PRIx64, XGELL_IFNAME,
- lldev->instance, (uint64_t)adapter_status);
+ status = xge_hal_channel_open(lldev->devh, &attr, &tx_ring->channelh,
+ XGE_HAL_CHANNEL_OC_NORMAL);
+ if (status != XGE_HAL_OK) {
+ xge_debug_ll(XGE_ERR, "%s%d: cannot open Tx channel got status "
+ "code %d", XGELL_IFNAME, lldev->instance, status);
return (B_FALSE);
}
- /*
- * Open only configured channels. HAL structures are static,
- * so, no worries here..
- */
-_next_channel:
- xge_list_for_each(item, &hldev->free_channels) {
- xge_hal_channel_t *channel = xge_container_of(item,
- xge_hal_channel_t, item);
- xgell_fifo_t *fifo;
-
- /* filter on FIFO channels */
- if (channel->type != XGE_HAL_CHANNEL_TYPE_FIFO)
- continue;
-
- fifo = &lldev->fifos[attr.post_qid];
- fifo->lldev = lldev;
- attr.userdata = fifo;
-
- status = xge_hal_channel_open(lldev->devh, &attr,
- &fifo->channelh, XGE_HAL_CHANNEL_OC_NORMAL);
- if (status != XGE_HAL_OK) {
- xge_debug_ll(XGE_ERR, "%s%d: cannot open Tx channel "
- "got status code %d", XGELL_IFNAME,
- lldev->instance, status);
- /* unwind */
- xgell_tx_close(lldev);
- return (B_FALSE);
- }
-
- attr.post_qid++;
-
- /*
- * because channel_open() moves xge_list entry
- * to the fifos_channels
- */
- goto _next_channel;
- }
+ tx_ring->sent_bytes = 0;
+ tx_ring->live = B_TRUE;
return (B_TRUE);
}
-/*
- * xgell_rx_close
- * @lldev: the link layer object
- *
- * Close all Rx channels
- */
static void
-xgell_rx_close(xgelldev_t *lldev)
+xgell_tx_ring_close(xgell_tx_ring_t *tx_ring)
{
- xge_list_t *item, *list;
- xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh;
-
- list = &hldev->ring_channels;
- while (!xge_list_is_empty(list)) {
- item = xge_list_first_get(list);
- xge_hal_channel_t *channel = xge_container_of(item,
- xge_hal_channel_t, item);
- xgell_ring_t *ring = xge_hal_channel_userdata(channel);
-
- xge_hal_channel_close(channel, XGE_HAL_CHANNEL_OC_NORMAL);
-
- /*
- * destroy Ring's buffer pool
- */
- if (xgell_rx_destroy_buffer_pool(ring) != DDI_SUCCESS) {
- xge_debug_ll(XGE_ERR, "unable to destroy Ring%d "
- "buffer pool", channel->post_qid);
- }
- list = &hldev->ring_channels;
- }
+ if (!tx_ring->live)
+ return;
+ xge_hal_channel_close(tx_ring->channelh, XGE_HAL_CHANNEL_OC_NORMAL);
+ tx_ring->live = B_FALSE;
}
/*
- * xgell_rx_open
+ * xgell_tx_open
* @lldev: the link layer object
*
- * Initialize and open all Rx channels;
+ * Initialize and open all TX channels.
*/
static boolean_t
-xgell_rx_open(xgelldev_t *lldev)
+xgell_tx_open(xgelldev_t *lldev)
{
- xge_hal_status_e status;
- u64 adapter_status;
- xge_hal_channel_attr_t attr;
- xge_list_t *item;
- xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh;
+ xgell_tx_ring_t *tx_ring;
+ int i;
- attr.post_qid = 0;
- attr.compl_qid = 0;
- attr.callback = xgell_rx_1b_compl;
- attr.per_dtr_space = sizeof (xgell_rxd_priv_t);
- attr.flags = 0;
- attr.type = XGE_HAL_CHANNEL_TYPE_RING;
- attr.dtr_init = xgell_rx_dtr_replenish;
- attr.dtr_term = xgell_rx_dtr_term;
+ if (lldev->live_tx_rings != 0)
+ return (B_TRUE);
- if (xge_hal_device_status(lldev->devh, &adapter_status)) {
- xge_debug_ll(XGE_ERR,
- "%s%d: device is not ready adaper status reads 0x%"PRIx64,
- XGELL_IFNAME, lldev->instance,
- (uint64_t)adapter_status);
- return (B_FALSE);
- }
+ lldev->live_tx_rings = 0;
/*
- * Open only configured channels. HAL structures are static,
- * so, no worries here..
+ * Open the rings in reservation order to match the h/w sequence.
*/
-_next_channel:
- xge_list_for_each(item, &hldev->free_channels) {
- xge_hal_channel_t *channel = xge_container_of(item,
- xge_hal_channel_t, item);
- xgell_ring_t *ring;
-
- /* filter on RING channels */
- if (channel->type != XGE_HAL_CHANNEL_TYPE_RING)
- continue;
-
- ring = &lldev->rings[attr.post_qid];
- ring->lldev = lldev;
- attr.userdata = ring;
-
- if (xgell_rx_create_buffer_pool(ring) != DDI_SUCCESS) {
- xge_debug_ll(XGE_ERR, "unable to create Ring%d "
- "buffer pool", attr.post_qid);
- /* unwind */
- xgell_rx_close(lldev);
- return (B_FALSE);
- }
+ for (i = 0; i < lldev->init_tx_rings; i++) {
+ tx_ring = &lldev->tx_ring[i];
+ tx_ring->index = i;
+ tx_ring->lldev = lldev;
+ tx_ring->live = B_FALSE;
- status = xge_hal_channel_open(lldev->devh, &attr,
- &ring->channelh, XGE_HAL_CHANNEL_OC_NORMAL);
- if (status != XGE_HAL_OK) {
- xge_debug_ll(XGE_ERR, "%s%d: cannot open Rx channel "
- "got status got status code %d", XGELL_IFNAME,
- lldev->instance, status);
- /* unwind */
- (void) xgell_rx_destroy_buffer_pool(ring);
- xgell_rx_close(lldev);
+ if (!xgell_tx_ring_open(tx_ring))
return (B_FALSE);
- }
- attr.post_qid++;
-
- /*
- * because chhannel_open() moves xge_list entry
- * to the rings channels
- */
- goto _next_channel;
+ lldev->live_tx_rings++;
}
return (B_TRUE);
}
+static void
+xgell_tx_close(xgelldev_t *lldev)
+{
+ xgell_tx_ring_t *tx_ring;
+ int i;
+
+ if (lldev->live_tx_rings == 0)
+ return;
+
+ /*
+ * Close all opened tx rings.
+ */
+ for (i = 0; i < lldev->init_tx_rings; i++) {
+ tx_ring = &lldev->tx_ring[i];
+ if (tx_ring->live) {
+ xgell_tx_ring_close(tx_ring);
+ lldev->live_tx_rings--;
+ }
+ }
+}
+
static int
xgell_initiate_start(xgelldev_t *lldev)
{
@@ -1485,13 +1864,13 @@ xgell_initiate_start(xgelldev_t *lldev)
}
/* tune jumbo/normal frame UFC counters */
- hldev->config.ring.queue[XGELL_RING_MAIN_QID].rti.ufc_b = \
- maxpkt > XGE_HAL_DEFAULT_MTU ?
+ hldev->config.ring.queue[XGELL_RX_RING_MAIN].rti.ufc_b =
+ (maxpkt > XGE_HAL_DEFAULT_MTU) ?
XGE_HAL_DEFAULT_RX_UFC_B_J :
XGE_HAL_DEFAULT_RX_UFC_B_N;
- hldev->config.ring.queue[XGELL_RING_MAIN_QID].rti.ufc_c = \
- maxpkt > XGE_HAL_DEFAULT_MTU ?
+ hldev->config.ring.queue[XGELL_RX_RING_MAIN].rti.ufc_c =
+ (maxpkt > XGE_HAL_DEFAULT_MTU) ?
XGE_HAL_DEFAULT_RX_UFC_C_J :
XGE_HAL_DEFAULT_RX_UFC_C_N;
@@ -1515,6 +1894,7 @@ xgell_initiate_start(xgelldev_t *lldev)
XGELL_IFNAME, lldev->instance,
(uint64_t)adapter_status, status);
}
+ xgell_rx_close(lldev);
xge_os_mdelay(1500);
return (ENOMEM);
}
@@ -1531,9 +1911,9 @@ xgell_initiate_start(xgelldev_t *lldev)
XGELL_IFNAME, lldev->instance,
(uint64_t)adapter_status, status);
}
- xge_os_mdelay(1500);
+ xgell_tx_close(lldev);
xgell_rx_close(lldev);
-
+ xge_os_mdelay(1500);
return (ENOMEM);
}
@@ -1686,46 +2066,6 @@ xgell_onerr_reset(xgelldev_t *lldev)
return (rc);
}
-
-/*
- * xgell_m_unicst
- * @arg: pointer to device private strucutre(hldev)
- * @mac_addr:
- *
- * This function is called by MAC Layer to set the physical address
- * of the XFRAME firmware.
- */
-static int
-xgell_m_unicst(void *arg, const uint8_t *macaddr)
-{
- xge_hal_status_e status;
- xgelldev_t *lldev = (xgelldev_t *)arg;
- xge_hal_device_t *hldev = lldev->devh;
- xge_debug_ll(XGE_TRACE, "%s", "MAC_UNICST");
-
- xge_debug_ll(XGE_TRACE, "%s", "M_UNICAST");
-
- mutex_enter(&lldev->genlock);
-
- xge_debug_ll(XGE_TRACE,
- "setting macaddr: 0x%02x-%02x-%02x-%02x-%02x-%02x",
- macaddr[0], macaddr[1], macaddr[2],
- macaddr[3], macaddr[4], macaddr[5]);
-
- status = xge_hal_device_macaddr_set(hldev, 0, (uchar_t *)macaddr);
- if (status != XGE_HAL_OK) {
- xge_debug_ll(XGE_ERR, "%s%d: can not set mac address",
- XGELL_IFNAME, lldev->instance);
- mutex_exit(&lldev->genlock);
- return (EIO);
- }
-
- mutex_exit(&lldev->genlock);
-
- return (0);
-}
-
-
/*
* xgell_m_multicst
* @arg: pointer to device private strucutre(hldev)
@@ -2039,12 +2379,14 @@ xgell_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
}
}
-/* ARGSUSED */
+
static boolean_t
xgell_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
xgelldev_t *lldev = arg;
+ xge_debug_ll(XGE_TRACE, "xgell_m_getcapab: %x", cap);
+
switch (cap) {
case MAC_CAPAB_HCKSUM: {
uint32_t *hcksum_txflags = cap_data;
@@ -2063,6 +2405,29 @@ xgell_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
return (B_FALSE);
}
}
+ case MAC_CAPAB_RINGS: {
+ mac_capab_rings_t *cap_rings = cap_data;
+
+ switch (cap_rings->mr_type) {
+ case MAC_RING_TYPE_RX:
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_rnum = lldev->init_rx_rings;
+ cap_rings->mr_gnum = lldev->init_rx_groups;
+ cap_rings->mr_rget = xgell_fill_ring;
+ cap_rings->mr_gget = xgell_fill_group;
+ break;
+ case MAC_RING_TYPE_TX:
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_rnum = lldev->init_tx_rings;
+ cap_rings->mr_gnum = 0;
+ cap_rings->mr_rget = xgell_fill_ring;
+ cap_rings->mr_gget = NULL;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
default:
return (B_FALSE);
}
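With MAC_CAPAB_RINGS answered this way, the MAC layer enumerates the
advertised groups and rings by calling back through mr_gget and mr_rget,
which land in xgell_fill_group() and xgell_fill_ring() above. A sketch of the
framework-side sequence as the driver assumes it (the loop variables, info
arrays, and group_of() are illustrative):

	for (g = 0; g < cap_rings->mr_gnum; g++)
		cap_rings->mr_gget(lldev, MAC_RING_TYPE_RX, g,
		    &group_info[g], group_handle[g]);

	for (r = 0; r < cap_rings->mr_rnum; r++)
		cap_rings->mr_rget(lldev, MAC_RING_TYPE_RX, group_of(r), r,
		    &ring_info[r], ring_handle[r]);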
@@ -2320,8 +2685,7 @@ xgell_devconfig_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
return (ENOSPC);
}
status = xge_hal_aux_device_config_read(lldev->devh,
- XGELL_DEVCONF_BUFSIZE,
- buf, &retsize);
+ XGELL_DEVCONF_BUFSIZE, buf, &retsize);
if (status != XGE_HAL_OK) {
kmem_free(buf, XGELL_DEVCONF_BUFSIZE);
xge_debug_ll(XGE_ERR, "device_config_read(): status %d",
@@ -2349,6 +2713,9 @@ xgell_device_register(xgelldev_t *lldev, xgell_config_t *config)
mac_register_t *macp = NULL;
xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh;
+ /*
+ * Initialize the NDD interfaces used for internal debugging.
+ */
if (nd_load(&lldev->ndp, "pciconf", xgell_pciconf_get, NULL,
(caddr_t)lldev) == B_FALSE)
goto xgell_ndd_fail;
@@ -2393,11 +2760,11 @@ xgell_device_register(xgelldev_t *lldev, xgell_config_t *config)
macp->m_min_sdu = 0;
macp->m_max_sdu = hldev->config.mtu;
macp->m_margin = VLAN_TAGSZ;
+ macp->m_v12n = MAC_VIRT_LEVEL1;
+
/*
- * Finally, we're ready to register ourselves with the Nemo
- * interface; if this succeeds, we're all ready to start()
+ * MAC Registration.
*/
-
if (mac_register(macp, &lldev->mh) != 0)
goto xgell_register_fail;
diff --git a/usr/src/uts/common/io/xge/drv/xgell.h b/usr/src/uts/common/io/xge/drv/xgell.h
index aa8bcc43ff..93845bb655 100644
--- a/usr/src/uts/common/io/xge/drv/xgell.h
+++ b/usr/src/uts/common/io/xge/drv/xgell.h
@@ -60,7 +60,7 @@
#include <sys/pattr.h>
#include <sys/strsun.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#ifdef __cplusplus
@@ -69,11 +69,6 @@ extern "C" {
#define XGELL_DESC "Xframe I/II 10Gb Ethernet"
#define XGELL_IFNAME "xge"
-#define XGELL_TX_LEVEL_LOW 8
-#define XGELL_TX_LEVEL_HIGH 32
-#define XGELL_TX_LEVEL_CHECK 3
-#define XGELL_MAX_RING_DEFAULT 8
-#define XGELL_MAX_FIFO_DEFAULT 1
#include <xgehal.h>
@@ -93,25 +88,64 @@ extern "C" {
#define XGELL_RX_BUFFER_TOTAL XGE_HAL_RING_RXDS_PER_BLOCK(1) * 6
#define XGELL_RX_BUFFER_POST_HIWAT XGE_HAL_RING_RXDS_PER_BLOCK(1) * 5
-/* Control driver to copy or DMA received packets */
-#define XGELL_RX_DMA_LOWAT 256
+/*
+ * Multiple rings configuration
+ */
+#define XGELL_RX_RING_MAIN 0
+#define XGELL_TX_RING_MAIN 0
+
+#define XGELL_RX_RING_NUM_MIN 1
+#define XGELL_TX_RING_NUM_MIN 1
+#define XGELL_RX_RING_NUM_MAX 8
+#define XGELL_TX_RING_NUM_MAX 1 /* TODO */
+#define XGELL_RX_RING_NUM_DEFAULT XGELL_RX_RING_NUM_MAX
+#define XGELL_TX_RING_NUM_DEFAULT XGELL_TX_RING_NUM_MAX
+
+#define XGELL_MINTR_NUM_MIN 1
+#define XGELL_MINTR_NUM_MAX \
+ (XGELL_RX_RING_NUM_MAX + XGELL_TX_RING_NUM_MAX + 1)
+#define XGELL_MINTR_NUM_DEFAULT XGELL_MINTR_NUM_MAX
+
+#define XGELL_CONF_GROUP_POLICY_BASIC 0
+#define XGELL_CONF_GROUP_POLICY_VIRT 1
+#define XGELL_CONF_GROUP_POLICY_PERF 2
+#if 0
+#if defined(__sparc)
+#define XGELL_CONF_GROUP_POLICY_DEFAULT XGELL_CONF_GROUP_POLICY_PERF
+#else
+#define XGELL_CONF_GROUP_POLICY_DEFAULT XGELL_CONF_GROUP_POLICY_VIRT
+#endif
+#else
+/*
+ * The _PERF configuration enables one fat group of all rx rings, which
+ * achieves better fanout performance for the primary interface.
+ */
+#define XGELL_CONF_GROUP_POLICY_DEFAULT XGELL_CONF_GROUP_POLICY_PERF
+#endif
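The chosen policy drives the rings/groups split that xgell_fill_ring() keys
on (it special-cases init_rx_rings == init_rx_groups). A sketch of the
assumed mapping (the attach-time code that applies it is outside this hunk,
and the BASIC case is an assumption):

	switch (policy) {
	case XGELL_CONF_GROUP_POLICY_VIRT:
		/* one rx ring per group: rings == groups */
		init_rx_groups = init_rx_rings;
		break;
	case XGELL_CONF_GROUP_POLICY_PERF:
		/* one fat group holding all rx rings */
		init_rx_groups = 1;
		break;
	case XGELL_CONF_GROUP_POLICY_BASIC:
	default:
		/* assumed: no hardware grouping advertised */
		init_rx_groups = 1;
		break;
	}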
-#define XGELL_RING_MAIN_QID 0
+#define XGELL_TX_LEVEL_LOW 8
+#define XGELL_TX_LEVEL_HIGH 32
+#define XGELL_TX_LEVEL_CHECK 3
+#define XGELL_MAX_RING_DEFAULT 8
+#define XGELL_MAX_FIFO_DEFAULT 1
-#if defined(__x86)
-#define XGELL_TX_DMA_LOWAT 128
+/* Control driver to copy or DMA inbound/outbound packets */
+#if defined(__sparc)
+#define XGELL_RX_DMA_LOWAT 256
+#define XGELL_TX_DMA_LOWAT 512
#else
-#define XGELL_TX_DMA_LOWAT 512
+#define XGELL_RX_DMA_LOWAT 256
+#define XGELL_TX_DMA_LOWAT 128
#endif
/*
* Try to collapse up to XGELL_RX_PKT_BURST packets into single mblk
* sequence before mac_rx() is called.
*/
-#define XGELL_RX_PKT_BURST 32
+#define XGELL_RX_PKT_BURST 32
/* About 1s */
-#define XGE_DEV_POLL_TICKS drv_usectohz(1000000)
+#define XGE_DEV_POLL_TICKS drv_usectohz(1000000)
#define XGELL_LSO_MAXLEN 65535
#define XGELL_CONF_ENABLE_BY_DEFAULT 1
@@ -157,6 +191,7 @@ extern "C" {
#define XGE_HAL_DEFAULT_RX_TIMER_AC_EN 1
#define XGE_HAL_DEFAULT_RX_TIMER_VAL 384
+#define XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_A 1024
#define XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_J 2048
#define XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_N 4096
#define XGE_HAL_DEFAULT_FIFO_QUEUE_INTR 0
@@ -171,15 +206,14 @@ extern "C" {
*/
#define XGE_HAL_DEFAULT_FIFO_ALIGNMENT_SIZE 4096
#define XGE_HAL_DEFAULT_FIFO_MAX_ALIGNED_FRAGS 1
-#if defined(__x86)
-#define XGE_HAL_DEFAULT_FIFO_FRAGS 128
-#else
+#if defined(__sparc)
#define XGE_HAL_DEFAULT_FIFO_FRAGS 64
+#else
+#define XGE_HAL_DEFAULT_FIFO_FRAGS 128
#endif
#define XGE_HAL_DEFAULT_FIFO_FRAGS_THRESHOLD 18
-#define XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_J 2
-#define XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_N 2
+#define XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS 2
#define XGE_HAL_RING_QUEUE_BUFFER_MODE_DEFAULT 1
#define XGE_HAL_DEFAULT_BACKOFF_INTERVAL_US 64
#define XGE_HAL_DEFAULT_RING_PRIORITY 0
@@ -202,18 +236,15 @@ extern "C" {
#define XGE_HAL_DEFAULT_STATS_REFRESH_TIME 1
#if defined(__sparc)
-#define XGE_HAL_DEFAULT_MMRB_COUNT \
- XGE_HAL_MAX_MMRB_COUNT
-#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION \
- XGE_HAL_EIGHT_SPLIT_TRANSACTION
+#define XGE_HAL_DEFAULT_MMRB_COUNT XGE_HAL_MAX_MMRB_COUNT
+#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION XGE_HAL_EIGHT_SPLIT_TRANSACTION
#else
#define XGE_HAL_DEFAULT_MMRB_COUNT 1 /* 1k */
-#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION \
- XGE_HAL_TWO_SPLIT_TRANSACTION
+#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION XGE_HAL_TWO_SPLIT_TRANSACTION
#endif
/*
- * default the size of buffers allocated for ndd interface functions
+ * Default size of the buffers allocated for ndd interface functions
*/
#define XGELL_STATS_BUFSIZE 8192
#define XGELL_PCICONF_BUFSIZE 2048
@@ -222,17 +253,12 @@ extern "C" {
#define XGELL_DEVCONF_BUFSIZE 8192
/*
- * xgell_event_e
+ * Multiple mac address definitions
*
- * This enumeration derived from xgehal_event_e. It extends it
- * for the reason to get serialized context.
+ * We'll use the whole MAC Address Configuration Memory for unicast
+ * addresses, since the current multicast implementation in the HAL
+ * simply enables promiscuous mode.
*/
-/* Renamb the macro from HAL */
-#define XGELL_EVENT_BASE XGE_LL_EVENT_BASE
-typedef enum xgell_event_e {
- /* LL events */
- XGELL_EVENT_RESCHED_NEEDED = XGELL_EVENT_BASE + 1,
-} xgell_event_e;
+#define XGE_RX_MULTI_MAC_ADDRESSES_MAX 8 /* per ring group */
typedef struct {
int rx_pkt_burst;
@@ -240,24 +266,27 @@ typedef struct {
int rx_buffer_post_hiwat;
int rx_dma_lowat;
int tx_dma_lowat;
- int msix_enable;
int lso_enable;
+ int msix_enable;
+ int grouping;
} xgell_config_t;
-typedef struct xgell_ring xgell_ring_t;
-typedef struct xgell_fifo xgell_fifo_t;
+typedef struct xgell_multi_mac xgell_multi_mac_t;
+typedef struct xgell_rx_ring xgell_rx_ring_t;
+typedef struct xgell_tx_ring xgell_tx_ring_t;
+typedef struct xgelldev xgelldev_t;
typedef struct xgell_rx_buffer_t {
- struct xgell_rx_buffer_t *next;
- void *vaddr;
- dma_addr_t dma_addr;
- ddi_dma_handle_t dma_handle;
- ddi_acc_handle_t dma_acch;
- xgell_ring_t *ring;
- frtn_t frtn;
+ struct xgell_rx_buffer_t *next;
+ void *vaddr;
+ dma_addr_t dma_addr;
+ ddi_dma_handle_t dma_handle;
+ ddi_acc_handle_t dma_acch;
+ xgell_rx_ring_t *ring;
+ frtn_t frtn;
} xgell_rx_buffer_t;
-/* Buffer pool for all rings */
+/* Buffer pool for one rx ring */
typedef struct xgell_rx_buffer_pool_t {
uint_t total; /* total buffers */
uint_t size; /* buffer size */
@@ -266,50 +295,92 @@ typedef struct xgell_rx_buffer_pool_t {
uint_t post; /* posted buffers */
uint_t post_hiwat; /* hiwat to stop post */
spinlock_t pool_lock; /* buffer pool lock */
+ boolean_t live; /* pool status */
xgell_rx_buffer_t *recycle_head; /* recycle list's head */
xgell_rx_buffer_t *recycle_tail; /* recycle list's tail */
uint_t recycle; /* # of rx buffers recycled */
spinlock_t recycle_lock; /* buffer recycle lock */
} xgell_rx_buffer_pool_t;
-typedef struct xgelldev xgelldev_t;
+struct xgell_multi_mac {
+ int naddr; /* total supported addresses */
+ int naddrfree; /* free addresses slots */
+ ether_addr_t mac_addr[XGE_RX_MULTI_MAC_ADDRESSES_MAX];
+ boolean_t mac_addr_set[XGE_RX_MULTI_MAC_ADDRESSES_MAX];
+};
-struct xgell_ring {
- xge_hal_channel_h channelh;
- xgelldev_t *lldev;
- mac_resource_handle_t handle; /* per ring cookie */
- xgell_rx_buffer_pool_t bf_pool;
+typedef uint_t (*intr_func_t)(caddr_t, caddr_t);
+
+typedef struct xgell_intr {
+ uint_t index;
+ ddi_intr_handle_t *handle; /* DDI interrupt handle */
+ intr_func_t *function; /* interrupt function */
+ caddr_t arg; /* interrupt source */
+} xgell_intr_t;
+
+struct xgell_rx_ring {
+ int index;
+ boolean_t live; /* ring active status */
+ xge_hal_channel_h channelh; /* hardware channel */
+ xgelldev_t *lldev; /* driver device */
+ mac_ring_handle_t ring_handle; /* call back ring handle */
+ mac_group_handle_t group_handle; /* call back group handle */
+ uint64_t ring_gen_num;
+
+ xgell_multi_mac_t mmac; /* per group multiple addrs */
+ xgell_rx_buffer_pool_t bf_pool; /* per ring buffer pool */
+ int received_bytes; /* total received bytes */
+ int intr_bytes; /* interrupt received bytes */
+ int poll_bytes; /* bytes to be polled up */
+ int polled_bytes; /* total polled bytes */
+ mblk_t *poll_mp; /* polled messages */
+
+ spinlock_t ring_lock; /* per ring lock */
};
-struct xgell_fifo {
- xge_hal_channel_h channelh;
- xgelldev_t *lldev;
- int level_low;
+struct xgell_tx_ring {
+ int index;
+ boolean_t live; /* ring active status */
+ xge_hal_channel_h channelh; /* hardware channel */
+ xgelldev_t *lldev; /* driver device */
+ mac_ring_handle_t ring_handle; /* call back ring handle */
+ int sent_bytes; /* bytes sent through the ring */
+
+ boolean_t need_resched;
};
struct xgelldev {
- caddr_t ndp;
+ volatile int is_initialized;
+ volatile int in_reset;
+ kmutex_t genlock;
mac_handle_t mh;
int instance;
dev_info_t *dev_info;
xge_hal_device_h devh;
- xgell_ring_t rings[XGE_HAL_MAX_RING_NUM];
- xgell_fifo_t fifos[XGE_HAL_MAX_FIFO_NUM];
- int resched_avail;
- int resched_send;
- int resched_retry;
- int tx_copied_max;
- volatile int is_initialized;
- xgell_config_t config;
- volatile int in_reset;
+ caddr_t ndp;
timeout_id_t timeout_id;
- kmutex_t genlock;
+
+ int init_rx_rings;
+ int init_tx_rings;
+ int init_rx_groups;
+
+ int live_rx_rings;
+ int live_tx_rings;
+ xgell_rx_ring_t rx_ring[XGELL_RX_RING_NUM_DEFAULT];
+ xgell_tx_ring_t tx_ring[XGELL_TX_RING_NUM_DEFAULT];
+
+ int tx_copied_max;
+
+ xgell_intr_t intrs[XGELL_MINTR_NUM_DEFAULT];
+
ddi_intr_handle_t *intr_table;
uint_t intr_table_size;
int intr_type;
int intr_cnt;
uint_t intr_pri;
int intr_cap;
+
+ xgell_config_t config;
};
typedef struct {
diff --git a/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h b/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h
index 5852bb9e9a..5275da409a 100644
--- a/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h
+++ b/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h
@@ -21,6 +21,11 @@
* Copyright (c) 2002-2006 Neterion, Inc.
*/
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
#ifndef XGE_HAL_CHANNEL_H
#define XGE_HAL_CHANNEL_H
@@ -69,7 +74,8 @@ typedef enum xge_hal_channel_type_e {
typedef enum xge_hal_channel_flag_e {
XGE_HAL_CHANNEL_FLAG_NONE = 0x0,
XGE_HAL_CHANNEL_FLAG_USE_TX_LOCK = 0x1,
- XGE_HAL_CHANNEL_FLAG_FREE_RXD = 0x2
+ XGE_HAL_CHANNEL_FLAG_FREE_RXD = 0x2,
+ XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING = 0x4
} xge_hal_channel_flag_e;
/**
diff --git a/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h b/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h
index e79774e329..f0b0a3520d 100644
--- a/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h
+++ b/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h
@@ -21,6 +21,11 @@
* Copyright (c) 2002-2006 Neterion, Inc.
*/
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
#ifndef XGE_HAL_REGS_H
#define XGE_HAL_REGS_H
@@ -814,8 +819,8 @@ typedef struct {
u64 rmac_cfg_key;
#define XGE_HAL_RMAC_CFG_KEY(val) vBIT(val,0,16)
-#define XGE_HAL_MAX_MAC_ADDRESSES 64
-#define XGE_HAL_MAC_MC_ALL_MC_ADDR_OFFSET 63
+#define XGE_HAL_MAX_MAC_ADDRESSES 256
+#define XGE_HAL_MAC_MC_ALL_MC_ADDR_OFFSET 255
#define XGE_HAL_MAX_MAC_ADDRESSES_HERC 256
#define XGE_HAL_MAC_MC_ALL_MC_ADDR_OFFSET_HERC 255
diff --git a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c
index 5b70ea1378..d08c1d58bf 100644
--- a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c
+++ b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c
@@ -21,6 +21,11 @@
* Copyright (c) 2002-2006 Neterion, Inc.
*/
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
#ifdef XGE_DEBUG_FP
#include "xgehal-device.h"
#endif
@@ -444,7 +449,9 @@ xge_hal_device_poll_rx_channels(xge_hal_device_t *hldev, int *got_rx)
if (hldev->terminating)
return XGE_HAL_OK;
channel = xge_container_of(item, xge_hal_channel_t, item);
- (void) xge_hal_device_poll_rx_channel(channel, got_rx);
+ if (!(channel->flags & XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING)) {
+ (void) xge_hal_device_poll_rx_channel(channel, got_rx);
+ }
}
return XGE_HAL_OK;
@@ -483,6 +490,21 @@ xge_hal_device_poll_tx_channels(xge_hal_device_t *hldev, int *got_tx)
}
/**
+ * xge_hal_device_rx_channel_enable_polling - Put Rx channel in poll mode.
+ * @channel: HAL channel handle.
+ *
+ * Marks the channel so the common Rx poll loop skips it; the LL driver
+ * then polls it directly via xge_hal_device_poll_rx_channel().
+ */
+__HAL_STATIC_DEVICE __HAL_INLINE_DEVICE void
+xge_hal_device_rx_channel_enable_polling(xge_hal_channel_t *channel)
+{
+ channel->flags |= XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING;
+}
+
+__HAL_STATIC_DEVICE __HAL_INLINE_DEVICE void
+xge_hal_device_rx_channel_disable_polling(xge_hal_channel_t *channel)
+{
+ channel->flags &= ~XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING;
+}
+
+/**
* xge_hal_device_mask_tx - Mask Tx interrupts.
* @hldev: HAL device handle.
*
diff --git a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c
index 346f10b8bc..4cf18c2621 100644
--- a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c
+++ b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c
@@ -5044,7 +5044,7 @@ xge_hal_device_macaddr_find(xge_hal_device_t *hldev, macaddr_t wanted)
return XGE_HAL_ERR_INVALID_DEVICE;
}
- for (i=1; i<XGE_HAL_MAX_MAC_ADDRESSES; i++) {
+ for (i=0; i<XGE_HAL_MAX_MAC_ADDRESSES; i++) {
(void) xge_hal_device_macaddr_get(hldev, i, &macaddr);
if (!xge_os_memcmp(macaddr, wanted, sizeof(macaddr_t))) {
return i;
diff --git a/usr/src/uts/common/os/exacct.c b/usr/src/uts/common/os/exacct.c
index cb8ced5239..43a7298c7b 100644
--- a/usr/src/uts/common/os/exacct.c
+++ b/usr/src/uts/common/os/exacct.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/exacct.h>
#include <sys/exacct_catalog.h>
#include <sys/disp.h>
@@ -43,6 +41,7 @@
#include <sys/sysmacros.h>
#include <sys/bitmap.h>
#include <sys/msacct.h>
+#include <sys/mac.h>
/*
* exacct usage and recording routines
@@ -1163,6 +1162,271 @@ exacct_commit_proc(proc_t *p, int wstat)
}
static int
+exacct_attach_netstat_item(net_stat_t *ns, ea_object_t *record, int res)
+{
+ int attached = 1;
+
+ switch (res) {
+ case AC_NET_NAME:
+ (void) ea_attach_item(record, ns->ns_name,
+ strlen(ns->ns_name) + 1, EXT_STRING | EXD_NET_STATS_NAME);
+ break;
+ case AC_NET_CURTIME:
+ {
+ uint64_t now;
+ timestruc_t ts;
+
+ gethrestime(&ts);
+ now = (uint64_t)(ulong_t)ts.tv_sec;
+ (void) ea_attach_item(record, &now, sizeof (uint64_t),
+ EXT_UINT64 | EXD_NET_STATS_CURTIME);
+ }
+ break;
+ case AC_NET_IBYTES:
+ (void) ea_attach_item(record, &ns->ns_ibytes,
+ sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IBYTES);
+ break;
+ case AC_NET_OBYTES:
+ (void) ea_attach_item(record, &ns->ns_obytes,
+ sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OBYTES);
+ break;
+ case AC_NET_IPKTS:
+ (void) ea_attach_item(record, &ns->ns_ipackets,
+ sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IPKTS);
+ break;
+ case AC_NET_OPKTS:
+ (void) ea_attach_item(record, &ns->ns_opackets,
+ sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OPKTS);
+ break;
+ case AC_NET_IERRPKTS:
+ (void) ea_attach_item(record, &ns->ns_ierrors,
+ sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IERRPKTS);
+ break;
+ case AC_NET_OERRPKTS:
+ (void) ea_attach_item(record, &ns->ns_oerrors,
+ sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OERRPKTS);
+ break;
+ default:
+ attached = 0;
+ }
+ return (attached);
+}
+
+static int
+exacct_attach_netdesc_item(net_desc_t *nd, ea_object_t *record, int res)
+{
+ int attached = 1;
+
+ switch (res) {
+ case AC_NET_NAME:
+ (void) ea_attach_item(record, nd->nd_name,
+ strlen(nd->nd_name) + 1, EXT_STRING | EXD_NET_DESC_NAME);
+ break;
+ case AC_NET_DEVNAME:
+ (void) ea_attach_item(record, nd->nd_devname,
+ strlen(nd->nd_devname) + 1, EXT_STRING |
+ EXD_NET_DESC_DEVNAME);
+ break;
+ case AC_NET_EHOST:
+ (void) ea_attach_item(record, &nd->nd_ehost,
+ sizeof (nd->nd_ehost), EXT_RAW | EXD_NET_DESC_EHOST);
+ break;
+ case AC_NET_EDEST:
+ (void) ea_attach_item(record, &nd->nd_edest,
+ sizeof (nd->nd_edest), EXT_RAW | EXD_NET_DESC_EDEST);
+ break;
+ case AC_NET_VLAN_TPID:
+ (void) ea_attach_item(record, &nd->nd_vlan_tpid,
+ sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TPID);
+ break;
+ case AC_NET_VLAN_TCI:
+ (void) ea_attach_item(record, &nd->nd_vlan_tci,
+ sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TCI);
+ break;
+ case AC_NET_SAP:
+ (void) ea_attach_item(record, &nd->nd_sap,
+ sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_SAP);
+ break;
+ case AC_NET_PRIORITY:
+ (void) ea_attach_item(record, &nd->nd_priority,
+ sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_PRIORITY);
+ break;
+ case AC_NET_BWLIMIT:
+ (void) ea_attach_item(record, &nd->nd_bw_limit,
+ sizeof (uint64_t), EXT_UINT64 | EXD_NET_DESC_BWLIMIT);
+ break;
+ case AC_NET_SADDR:
+ if (nd->nd_isv4) {
+ (void) ea_attach_item(record, &nd->nd_saddr[3],
+ sizeof (uint32_t), EXT_UINT32 |
+ EXD_NET_DESC_V4SADDR);
+ } else {
+ (void) ea_attach_item(record, &nd->nd_saddr,
+ sizeof (nd->nd_saddr), EXT_RAW |
+ EXD_NET_DESC_V6SADDR);
+ }
+ break;
+ case AC_NET_DADDR:
+ if (nd->nd_isv4) {
+ (void) ea_attach_item(record, &nd->nd_daddr[3],
+ sizeof (uint32_t), EXT_UINT32 |
+ EXD_NET_DESC_V4DADDR);
+ } else {
+ (void) ea_attach_item(record, &nd->nd_daddr,
+ sizeof (nd->nd_daddr), EXT_RAW |
+ EXD_NET_DESC_V6DADDR);
+ }
+ break;
+ case AC_NET_SPORT:
+ (void) ea_attach_item(record, &nd->nd_sport,
+ sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_SPORT);
+ break;
+ case AC_NET_DPORT:
+ (void) ea_attach_item(record, &nd->nd_dport,
+ sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_DPORT);
+ break;
+ case AC_NET_PROTOCOL:
+ (void) ea_attach_item(record, &nd->nd_protocol,
+ sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_PROTOCOL);
+ break;
+ case AC_NET_DSFIELD:
+ (void) ea_attach_item(record, &nd->nd_dsfield,
+ sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_DSFIELD);
+ break;
+ default:
+ attached = 0;
+ }
+ return (attached);
+}
+
+static ea_object_t *
+exacct_assemble_net_record(void *ninfo, ulong_t *mask, ea_catalog_t record_type,
+ int what)
+{
+ int res;
+ int count;
+ ea_object_t *record;
+
+ /*
+ * Assemble usage values into group.
+ */
+ record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
+ for (res = 1, count = 0; res <= AC_NET_MAX_RES; res++)
+ if (BT_TEST(mask, res)) {
+ if (what == EX_NET_LNDESC_REC ||
+ what == EX_NET_FLDESC_REC) {
+ count += exacct_attach_netdesc_item(
+ (net_desc_t *)ninfo, record, res);
+ } else {
+ count += exacct_attach_netstat_item(
+ (net_stat_t *)ninfo, record, res);
+ }
+ }
+ if (count == 0) {
+ ea_free_object(record, EUP_ALLOC);
+ record = NULL;
+ }
+ return (record);
+}
+
+int
+exacct_assemble_net_usage(ac_info_t *ac_net, void *ninfo,
+ int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
+ void *ubuf, size_t ubufsize, size_t *actual, int what)
+{
+ ulong_t mask[AC_MASK_SZ];
+ ea_object_t *net_desc;
+ ea_catalog_t record_type;
+ void *buf;
+ size_t bufsize;
+ int ret;
+
+ mutex_enter(&ac_net->ac_lock);
+ if (ac_net->ac_state == AC_OFF) {
+ mutex_exit(&ac_net->ac_lock);
+ return (ENOTACTIVE);
+ }
+ bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
+ mutex_exit(&ac_net->ac_lock);
+
+ switch (what) {
+ case EX_NET_LNDESC_REC:
+ record_type = EXD_GROUP_NET_LINK_DESC;
+ break;
+ case EX_NET_LNSTAT_REC:
+ record_type = EXD_GROUP_NET_LINK_STATS;
+ break;
+ case EX_NET_FLDESC_REC:
+ record_type = EXD_GROUP_NET_FLOW_DESC;
+ break;
+ case EX_NET_FLSTAT_REC:
+ record_type = EXD_GROUP_NET_FLOW_STATS;
+ break;
+ }
+
+ net_desc = exacct_assemble_net_record(ninfo, mask, record_type, what);
+ if (net_desc == NULL)
+ return (0);
+
+ /*
+ * Pack object into buffer and pass to callback.
+ */
+ bufsize = ea_pack_object(net_desc, NULL, 0);
+ buf = kmem_alloc(bufsize, KM_NOSLEEP);
+ if (buf == NULL)
+ return (ENOMEM);
+
+ (void) ea_pack_object(net_desc, buf, bufsize);
+
+ ret = callback(ac_net, ubuf, ubufsize, buf, bufsize, actual);
+
+ /*
+ * Free all previous allocations.
+ */
+ kmem_free(buf, bufsize);
+ ea_free_object(net_desc, EUP_ALLOC);
+ return (ret);
+}
+
+int
+exacct_commit_netinfo(void *arg, int what)
+{
+ size_t size;
+ ulong_t mask[AC_MASK_SZ];
+ struct exacct_globals *acg;
+ ac_info_t *ac_net;
+
+ if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
+ /*
+ * acctctl module not loaded. Nothing to do.
+ */
+ return (ENOTACTIVE);
+ }
+
+ /*
+ * Even though each zone nominally has its own flow accounting settings
+ * (ac_flow), these are only maintained by and for the global zone.
+ *
+ * If this were to change in the future, this function should grow a
+ * second zoneid (or zone) argument, and use the corresponding zone's
+ * settings rather than always using those of the global zone.
+ */
+ acg = zone_getspecific(exacct_zone_key, global_zone);
+ ac_net = &acg->ac_net;
+
+ mutex_enter(&ac_net->ac_lock);
+ if (ac_net->ac_state == AC_OFF) {
+ mutex_exit(&ac_net->ac_lock);
+ return (ENOTACTIVE);
+ }
+ bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
+ mutex_exit(&ac_net->ac_lock);
+
+ return (exacct_assemble_net_usage(ac_net, arg, exacct_commit_callback,
+ NULL, 0, &size, what));
+}
+
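Callers hand exacct_commit_netinfo() either a net_desc_t or a net_stat_t,
with `what' selecting the record type. A hedged sketch of committing link
statistics (the net_stat_t field names follow the item-attach code above;
the counters and link name are illustrative):

	net_stat_t ns;

	bzero(&ns, sizeof (ns));
	(void) strlcpy(ns.ns_name, "net0", sizeof (ns.ns_name));
	ns.ns_ibytes = rbytes;
	ns.ns_obytes = obytes;
	(void) exacct_commit_netinfo(&ns, EX_NET_LNSTAT_REC);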
+static int
exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
{
int attached = 1;
diff --git a/usr/src/uts/common/inet/ip/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c
index 3b5b3435d9..722c793b79 100644
--- a/usr/src/uts/common/inet/ip/ip_cksum.c
+++ b/usr/src/uts/common/os/ip_cksum.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,13 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/systm.h>
@@ -36,6 +33,7 @@
#include <sys/ddi.h>
#include <sys/vtrace.h>
#include <inet/sctp_crc32.h>
+#include <inet/ip.h>
#include <sys/multidata.h>
#include <sys/multidata_impl.h>
@@ -154,14 +152,14 @@ ip_cksum(mblk_t *mp, int offset, uint_t sum)
*/
if (mlen)
mlen += dp->db_cksumend
- - dp->db_cksumstuff;
+ - dp->db_cksumstuff;
else {
w = (ushort_t *)(mp->b_rptr +
dp->db_cksumstuff);
if (is_odd(w))
goto slow;
mlen = dp->db_cksumend
- - dp->db_cksumstuff;
+ - dp->db_cksumstuff;
}
} else if (mlen == 0)
return (psum);
@@ -239,7 +237,7 @@ slow1:
int odd;
douio:
odd = is_odd(dp->db_cksumstuff -
- dp->db_cksumstart);
+ dp->db_cksumstart);
if (pmlen == -1) {
/*
* Previous mlen was odd, so swap
@@ -262,7 +260,7 @@ slow1:
*/
if (mlen)
mlen += dp->db_cksumend
- - dp->db_cksumstuff;
+ - dp->db_cksumstuff;
else {
w = (ushort_t *)(mp->b_rptr +
dp->db_cksumstuff);
@@ -385,7 +383,7 @@ done:
sum = (sum & 0xFFFF) + (sum >> 16);
sum = (sum & 0xFFFF) + (sum >> 16);
TRACE_3(TR_FAC_IP, TR_IP_CKSUM_END,
- "ip_cksum_end:(%S) type %d (%X)", "ip_cksum", 1, sum);
+ "ip_cksum_end:(%S) type %d (%X)", "ip_cksum", 1, sum);
return (sum);
}
@@ -537,3 +535,30 @@ ip_md_cksum(pdesc_t *pd, int offset, uint_t sum)
return (sum);
}
+
+/* Return the IP checksum for the IP header at "ipha". */
+uint16_t
+ip_csum_hdr(ipha_t *ipha)
+{
+ uint16_t *uph;
+ uint32_t sum;
+ int opt_len;
+
+ opt_len = (ipha->ipha_version_and_hdr_length & 0xF) -
+ IP_SIMPLE_HDR_LENGTH_IN_WORDS;
+ uph = (uint16_t *)ipha;
+ sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
+ uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
+ if (opt_len > 0) {
+ do {
+ sum += uph[10];
+ sum += uph[11];
+ uph += 2;
+ } while (--opt_len);
+ }
+ sum = (sum & 0xFFFF) + (sum >> 16);
+ sum = ~(sum + (sum >> 16)) & 0xFFFF;
+ if (sum == 0xffff)
+ sum = 0;
+ return ((uint16_t)sum);
+}
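
ip_csum_hdr() above is the textbook RFC 1071 ones'-complement checksum: sum the ten
16-bit words of the base header, add two words per option word, fold the carries,
and complement. A stand-alone sketch of the same arithmetic over an arbitrary word
count (hypothetical user-space helper, for illustration only):

#include <stdint.h>
#include <stddef.h>

/* Same folding and complement steps as ip_csum_hdr(), generalized. */
static uint16_t
csum_hdr_sketch(const uint16_t *w, size_t nwords)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < nwords; i++)
		sum += w[i];
	sum = (sum & 0xFFFF) + (sum >> 16);	/* fold accumulated carries */
	sum = ~(sum + (sum >> 16)) & 0xFFFF;	/* fold once more, complement */
	return (sum == 0xffff ? 0 : (uint16_t)sum);
}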
diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c
index 3c63231253..4d52a9eb66 100644
--- a/usr/src/uts/common/os/modhash.c
+++ b/usr/src/uts/common/os/modhash.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* mod_hash: flexible hash table implementation.
*
@@ -816,6 +814,22 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val,
return (res);
}
+int
+mod_hash_find_cb_rval(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val,
+ int (*find_cb)(mod_hash_key_t, mod_hash_val_t), int *cb_rval)
+{
+ int res;
+
+ rw_enter(&hash->mh_contents, RW_READER);
+ res = i_mod_hash_find_nosync(hash, key, val);
+ if (res == 0) {
+ *cb_rval = find_cb(key, *val);
+ }
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
void
i_mod_hash_walk_nosync(mod_hash_t *hash,
uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg)
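
mod_hash_find_cb_rval() extends mod_hash_find_cb() by capturing the callback's
return value, so a lookup and a validity check on the value happen atomically under
mh_contents. A hedged usage sketch; my_ent_t, me_closing, and the helper names are
stand-ins:

/*
 * Hypothetical usage sketch: look up an entry and evaluate it while the
 * hash lock is held, capturing the callback's verdict with the lookup.
 */
typedef struct my_ent {
	int me_closing;
} my_ent_t;

static int
check_entry(mod_hash_key_t key, mod_hash_val_t val)
{
	/* e.g. refuse entries that are being torn down */
	return (((my_ent_t *)val)->me_closing ? EBUSY : 0);
}

static int
lookup_usable(mod_hash_t *hash, mod_hash_key_t key, my_ent_t **entp)
{
	mod_hash_val_t val;
	int cb_rv;

	if (mod_hash_find_cb_rval(hash, key, &val, check_entry, &cb_rv) != 0)
		return (ENOENT);	/* key not present */
	if (cb_rv != 0)
		return (cb_rv);		/* present, but the check failed */
	*entp = (my_ent_t *)val;
	return (0);
}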
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index 23c5e91475..b71b956f8a 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -54,6 +54,7 @@
#include <sys/vfs.h>
#include <sys/mntent.h>
#include <sys/contract_impl.h>
+#include <sys/dld_ioc.h>
/*
* There are two possible layers of privilege routines and two possible
@@ -2267,3 +2268,23 @@ secpolicy_xvm_control(const cred_t *cr)
return (EPERM);
return (0);
}
+
+/*
+ * secpolicy_dld_ioctl
+ *
+ * Determine if the subject has permission to use certain dld ioctls.
+ * Each ioctl should require a limited number of privileges. A large
+ * number indicates a poor design.
+ */
+int
+secpolicy_dld_ioctl(const cred_t *cr, const char *dld_priv, const char *msg)
+{
+ int rv;
+
+ if ((rv = priv_getbyname(dld_priv, 0)) >= 0) {
+ return (PRIV_POLICY(cr, rv, B_FALSE, EPERM, msg));
+ }
+	/* priv_getbyname() returns a negative errno on failure */
+	return (-rv);
+}
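
Since priv_getbyname() hands back the privilege number on success and a negative
errno on failure, the helper can feed the number straight into PRIV_POLICY() and
otherwise surface the errno. A hedged caller sketch; PRIV_SYS_DL_CONFIG is the
privilege-name string from <sys/priv_names.h>, and the function name is a stand-in:

static int
my_ioc_check(const cred_t *cr)
{
	/* fails with EPERM unless cr holds sys_dl_config */
	return (secpolicy_dld_ioctl(cr, PRIV_SYS_DL_CONFIG,
	    "hypothetical dld ioctl"));
}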
diff --git a/usr/src/uts/common/inet/sctp_crc32.c b/usr/src/uts/common/os/sctp_crc32.c
index 21dcaf18fd..38e049e440 100644
--- a/usr/src/uts/common/inet/sctp_crc32.c
+++ b/usr/src/uts/common/os/sctp_crc32.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
/*
@@ -68,7 +65,7 @@ static uint32_t
flip32(uint32_t w)
{
return (((w >> 24) | ((w >> 8) & 0xff00) | ((w << 8) & 0xff0000) |
- (w << 24)));
+ (w << 24)));
}
#endif
diff --git a/usr/src/uts/common/os/space.c b/usr/src/uts/common/os/space.c
index 6edebecdfe..6ed5e749f1 100644
--- a/usr/src/uts/common/os/space.c
+++ b/usr/src/uts/common/os/space.c
@@ -359,23 +359,14 @@ space_free(char *key)
const uint32_t crc32_table[256] = { CRC32_TABLE };
-
/*
- * We need to fanout load from NIC which can overwhelm a single
- * CPU. A 10Gb NIC interrupting a single CPU is a good example.
- * Instead of fanning out to random CPUs, it a big performance
- * win if you can fanout to the threads on the same core (niagara)
- * that is taking interrupts.
- *
- * We need a better mechanism to figure out the other threads on
- * the same core or cores on the same chip which share caches etc.
- * but for time being, this will suffice.
+ * We need to fan out load from a NIC that can overwhelm a single CPU.
+ * This becomes especially important on systems with slow CPUs
+ * (the sun4v architecture). mac_soft_ring_enable is false on all
+ * systems except sun4v, where it is enabled by default (see
+ * sun4v/os/mach_startup.c).
*/
-#define NUMBER_OF_THREADS_PER_CPU 4
-uint_t ip_threads_per_cpu = NUMBER_OF_THREADS_PER_CPU;
-
-/* Global flag to enable/disable soft ring facility */
-boolean_t ip_squeue_soft_ring = B_FALSE;
+boolean_t mac_soft_ring_enable = B_FALSE;
/*
* Global iscsi boot prop
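
Per the comment above, the flag defaults to B_FALSE here and only platform startup
code flips it. A minimal sketch of what the sun4v side is described as doing (the
hook name is hypothetical; the real code is in sun4v/os/mach_startup.c):

extern boolean_t mac_soft_ring_enable;

void
platform_startup_sketch(void)	/* hypothetical hook */
{
	/* sun4v: many slow strands per core, so enable soft-ring fanout */
	mac_soft_ring_enable = B_TRUE;
}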
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index cd8a0a2a62..442ced2b51 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -27,8 +27,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
@@ -8450,18 +8448,25 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd,
ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA);
if (mp->b_datap->db_type == M_DATA) {
if (flags != NULL) {
- *flags = DB_CKSUMFLAGS(mp);
- if (*flags & HCK_PARTIALCKSUM) {
- if (start != NULL)
- *start = (uint32_t)DB_CKSUMSTART(mp);
- if (stuff != NULL)
- *stuff = (uint32_t)DB_CKSUMSTUFF(mp);
- if (end != NULL)
- *end = (uint32_t)DB_CKSUMEND(mp);
+ *flags = DB_CKSUMFLAGS(mp) & (HCK_IPV4_HDRCKSUM |
+ HCK_PARTIALCKSUM | HCK_FULLCKSUM |
+ HCK_FULLCKSUM_OK);
+ if ((*flags & (HCK_PARTIALCKSUM |
+ HCK_FULLCKSUM)) != 0) {
if (value != NULL)
*value = (uint32_t)DB_CKSUM16(mp);
- } else if ((*flags & HW_LSO) && (value != NULL))
- *value = (uint32_t)DB_LSOMSS(mp);
+ if ((*flags & HCK_PARTIALCKSUM) != 0) {
+ if (start != NULL)
+ *start =
+ (uint32_t)DB_CKSUMSTART(mp);
+ if (stuff != NULL)
+ *stuff =
+ (uint32_t)DB_CKSUMSTUFF(mp);
+ if (end != NULL)
+ *end =
+ (uint32_t)DB_CKSUMEND(mp);
+ }
+ }
}
} else {
pattrinfo_t hck_attr = {PATTR_HCKSUM};
@@ -8488,6 +8493,28 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd,
}
}
+void
+lso_info_set(mblk_t *mp, uint32_t mss, uint32_t flags)
+{
+ ASSERT(DB_TYPE(mp) == M_DATA);
+
+ /* Set the flags */
+ DB_LSOFLAGS(mp) |= flags;
+ DB_LSOMSS(mp) = mss;
+}
+
+void
+lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
+{
+ ASSERT(DB_TYPE(mp) == M_DATA);
+
+ if (flags != NULL) {
+ *flags = DB_CKSUMFLAGS(mp) & HW_LSO;
+ if ((*flags != 0) && (mss != NULL))
+ *mss = (uint32_t)DB_LSOMSS(mp);
+ }
+}
+
/*
* Checksum buffer *bp for len bytes with psum partial checksum,
* or 0 if none, and return the 16 bit partial checksum.
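
lso_info_set() and lso_info_get() give the stack and drivers a symmetric accessor
pair for LSO metadata on an mblk, mirroring how hcksum_retrieve() above now masks
the checksum bits. A hedged driver-side sketch:

/*
 * Hypothetical tx-path sketch: pick up LSO metadata that an upper
 * layer attached with lso_info_set(mp, mss, HW_LSO).
 */
static void
my_tx_prepare(mblk_t *mp)
{
	uint32_t flags, mss;

	lso_info_get(mp, &mss, &flags);
	if ((flags & HW_LSO) != 0) {
		/* program the hardware to segment the payload at mss bytes */
	}
}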
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 5fe7393f56..cecccf50ab 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -191,6 +191,7 @@ CHKHDRS= \
dld_impl.h \
dld_ioc.h \
dls.h \
+ dls_mgmt.h \
dls_impl.h \
dma_i8237A.h \
dnlc.h \
@@ -353,7 +354,13 @@ CHKHDRS= \
lwp_upimutex_impl.h \
lpif.h \
mac.h \
+ mac_client.h \
+ mac_client_impl.h \
+ mac_flow.h \
+ mac_flow_impl.h \
mac_impl.h \
+ mac_provider.h \
+ mac_soft_ring.h \
machelf.h \
map.h \
md4.h \
@@ -418,6 +425,7 @@ CHKHDRS= \
pci.h \
pcie.h \
pci_impl.h \
+ pci_tools.h \
pcmcia.h \
pctypes.h \
pem.h \
diff --git a/usr/src/uts/common/sys/acctctl.h b/usr/src/uts/common/sys/acctctl.h
index 5019d36c4c..1dfa8e8577 100644
--- a/usr/src/uts/common/sys/acctctl.h
+++ b/usr/src/uts/common/sys/acctctl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ACCTCTL_H
#define _SYS_ACCTCTL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
@@ -44,10 +41,11 @@ extern "C" {
/*
* modes
*/
-#define AC_PROC (0x1 << 28) /* change process accounting settings */
-#define AC_TASK (0x2 << 28) /* change task accounting settings */
-#define AC_FLOW (0x4 << 28) /* change flow accounting settings */
-#define AC_MODE(x) ((x) & 0xf0000000)
+#define AC_PROC (0x1 << 24) /* change process accounting settings */
+#define AC_TASK (0x2 << 24) /* change task accounting settings */
+#define AC_FLOW (0x4 << 24) /* change flow accounting settings */
+#define AC_NET (0x8 << 24) /* change network accounting settings */
+#define AC_MODE(x) ((x) & 0xff000000)
/*
* options
@@ -58,7 +56,7 @@ extern "C" {
#define AC_RES_GET (0x08) /* get a list of enabled resources */
#define AC_STATE_SET (0x10) /* set accounting mode state (on/off) */
#define AC_STATE_GET (0x20) /* get accounting mode state */
-#define AC_OPTION(x) ((x) & 0x0fffffff)
+#define AC_OPTION(x) ((x) & 0x00ffffff)
/*
* Process accounting resource IDs
@@ -113,8 +111,36 @@ extern "C" {
#define AC_FLOW_ANAME 13 /* action instance name */
#define AC_FLOW_MAX_RES 13 /* must be equal to the number above */
-#define AC_MAX_RES_TMP MAX(AC_PROC_MAX_RES, AC_TASK_MAX_RES)
-#define AC_MAX_RES MAX(AC_MAX_RES_TMP, AC_FLOW_MAX_RES)
+/*
+ * Network accounting resource IDs
+ */
+#define AC_NET_NAME 1 /* flow name */
+#define AC_NET_EHOST 2 /* ethernet source address */
+#define AC_NET_EDEST 3 /* ethernet destination address */
+#define AC_NET_VLAN_TPID 4 /* VLAN protocol ID */
+#define AC_NET_VLAN_TCI 5 /* VLAN tag control info. */
+#define AC_NET_SAP 6 /* SAP */
+#define AC_NET_PRIORITY 7 /* Priority */
+#define AC_NET_BWLIMIT 8 /* Bandwidth limit */
+#define AC_NET_DEVNAME 9 /* Device name */
+#define AC_NET_SADDR 10 /* Source IP address */
+#define AC_NET_DADDR 11 /* Dest IP address */
+#define AC_NET_SPORT 12 /* Source Port */
+#define AC_NET_DPORT 13 /* Dest Port */
+#define AC_NET_PROTOCOL 14 /* Protocol */
+#define AC_NET_DSFIELD 15 /* DiffServ field */
+#define AC_NET_CURTIME 16 /* Current Time */
+#define AC_NET_IBYTES 17 /* Inbound Bytes */
+#define AC_NET_OBYTES 18 /* Outbound Bytes */
+#define AC_NET_IPKTS 19 /* Inbound Packets */
+#define AC_NET_OPKTS 20 /* Outbound Packets */
+#define AC_NET_IERRPKTS 21 /* Inbound Error Packets */
+#define AC_NET_OERRPKTS 22 /* Outbound Error Packets */
+#define AC_NET_MAX_RES 22 /* must be equal to the number above */
+
+#define AC_MAX_RES \
+ MAX(MAX(MAX(AC_PROC_MAX_RES, AC_TASK_MAX_RES), AC_FLOW_MAX_RES), \
+ AC_NET_MAX_RES)
#define AC_MASK_SZ BT_BITOUL(AC_MAX_RES + 1)
/*
@@ -150,7 +176,7 @@ extern zone_key_t exacct_zone_key;
/*
* Per-zone exacct settings. Each zone may have its own settings for
- * process, task, and flow accounting.
+ * process, task, flow, and network accounting.
*
* Per-zone flow accounting has not yet been implemented, so this
* provides zones with the view that flow accounting in the zone hasn't
@@ -164,6 +190,7 @@ struct exacct_globals {
ac_info_t ac_task;
ac_info_t ac_proc;
ac_info_t ac_flow;
+ ac_info_t ac_net;
list_node_t ac_link;
};
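
The mode field widens from the top nibble to the top byte (<< 28 becomes << 24) so
the fourth mode, AC_NET, fits, and AC_MAX_RES now folds in the network resource
count. A small worked check of the new layout:

#include <sys/acctctl.h>
#include <assert.h>

int
main(void)
{
	int cmd = AC_NET | AC_STATE_SET;

	assert(AC_MODE(cmd) == AC_NET);		/* 0x8 << 24 = 0x08000000 */
	assert(AC_OPTION(cmd) == AC_STATE_SET);	/* 0x10 */
	return (0);
}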
diff --git a/usr/src/uts/common/sys/aggr.h b/usr/src/uts/common/sys/aggr.h
index 740ac7f6f9..c63cc9e99f 100644
--- a/usr/src/uts/common/sys/aggr.h
+++ b/usr/src/uts/common/sys/aggr.h
@@ -28,9 +28,8 @@
#include <sys/types.h>
#include <sys/ethernet.h>
-#include <sys/mac.h>
-#include <sys/dls.h>
#include <sys/param.h>
+#include <sys/mac.h>
#include <sys/dld_ioc.h>
#ifdef __cplusplus
@@ -38,7 +37,7 @@ extern "C" {
#endif
/*
- * Note that the datastructures defined here define an ioctl interface
+ * Note that the data structures defined here describe an ioctl interface
 * that is shared between user and kernel space. The aggr driver thus
 * assumes that the structures have identical layout and size when
 * compiled in either ILP32 or LP64.
diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h
index 62fe0de59b..a1f7e82849 100644
--- a/usr/src/uts/common/sys/aggr_impl.h
+++ b/usr/src/uts/common/sys/aggr_impl.h
@@ -27,8 +27,10 @@
#define _SYS_AGGR_IMPL_H
#include <sys/types.h>
-#include <sys/mac.h>
#include <sys/mac_ether.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
#include <sys/aggr_lacp.h>
#ifdef __cplusplus
@@ -46,6 +48,33 @@ extern "C" {
#define AGGR_MODIFY_LACP_TIMER 0x08
/*
+ * Possible value of aggr_pseudo_rx_ring_t.arr_flags. Set when the ring entry
+ * in the pseudo RX group is used.
+ */
+#define MAC_PSEUDO_RING_INUSE 0x01
+
+typedef struct aggr_unicst_addr_s {
+ uint8_t aua_addr[ETHERADDRL];
+ struct aggr_unicst_addr_s *aua_next;
+} aggr_unicst_addr_t;
+
+typedef struct aggr_pseudo_rx_ring_s {
+ mac_ring_handle_t arr_rh; /* filled in by aggr_fill_ring() */
+ struct aggr_port_s *arr_port;
+ mac_ring_handle_t arr_hw_rh;
+ uint_t arr_flags;
+ uint64_t arr_gen;
+} aggr_pseudo_rx_ring_t;
+
+typedef struct aggr_pseudo_rx_group_s {
+ struct aggr_grp_s *arg_grp; /* filled in by aggr_fill_group() */
+ mac_group_handle_t arg_gh; /* filled in by aggr_fill_group() */
+ aggr_unicst_addr_t *arg_macaddr;
+ aggr_pseudo_rx_ring_t arg_rings[MAX_RINGS_PER_GROUP];
+ uint_t arg_ring_cnt;
+} aggr_pseudo_rx_group_t;
+
+/*
* A link aggregation MAC port.
* Note that lp_next is protected by the lg_lock of the group the
* port is part of.
@@ -63,13 +92,13 @@ typedef struct aggr_port_s {
lp_collector_enabled : 1,
lp_promisc_on : 1,
lp_no_link_update : 1,
- lp_pad_bits : 27;
- uint32_t lp_closing;
+ lp_grp_added : 1,
+ lp_closing : 1,
+ lp_pad_bits : 25;
mac_handle_t lp_mh;
+ mac_client_handle_t lp_mch;
const mac_info_t *lp_mip;
mac_notify_handle_t lp_mnh;
- mac_rx_handle_t lp_mrh;
- krwlock_t lp_lock;
uint_t lp_tx_idx; /* idx in group's tx array */
uint64_t lp_ifspeed;
link_state_t lp_link_state;
@@ -78,15 +107,15 @@ typedef struct aggr_port_s {
uint64_t lp_ether_stat[ETHER_NSTAT];
aggr_lacp_port_t lp_lacp; /* LACP state */
lacp_stats_t lp_lacp_stats;
- const mac_txinfo_t *lp_txinfo;
uint32_t lp_margin;
-} aggr_port_t;
+ mac_promisc_handle_t lp_mphp;
+ mac_unicast_handle_t lp_mah;
-typedef struct lg_mcst_addr_s lg_mcst_addr_t;
-struct lg_mcst_addr_s {
- lg_mcst_addr_t *lg_mcst_nextp;
- uint8_t lg_mcst_addr[MAXMACADDRLEN];
-};
+	/* List of non-primary addresses that require promiscuous mode to be set */
+ aggr_unicst_addr_t *lp_prom_addr;
+ /* handle of the underlying HW RX group */
+ mac_group_handle_t lp_hwgh;
+} aggr_port_t;
/*
* A link aggregation group.
@@ -105,7 +134,6 @@ struct lg_mcst_addr_s {
*
*/
typedef struct aggr_grp_s {
- krwlock_t lg_lock;
datalink_id_t lg_linkid;
uint16_t lg_key; /* key (group port number) */
uint32_t lg_refs; /* refcount */
@@ -116,16 +144,15 @@ typedef struct aggr_grp_s {
lg_addr_fixed : 1, /* fixed MAC address? */
lg_started : 1, /* group started? */
lg_promisc : 1, /* in promiscuous mode? */
- lg_gldv3_polling : 1,
lg_zcopy : 1,
lg_vlan : 1,
lg_force : 1,
- lg_pad_bits : 8;
+ lg_pad_bits : 9;
aggr_port_t *lg_ports; /* list of configured ports */
aggr_port_t *lg_mac_addr_port;
mac_handle_t lg_mh;
- uint_t lg_rx_resources;
uint_t lg_nattached_ports;
+ krwlock_t lg_tx_lock;
uint_t lg_ntx_ports;
aggr_port_t **lg_tx_ports; /* array of tx ports */
uint_t lg_tx_ports_size; /* size of lg_tx_ports */
@@ -140,14 +167,32 @@ typedef struct aggr_grp_s {
uint32_t lg_hcksum_txflags;
uint_t lg_max_sdu;
uint32_t lg_margin;
- lg_mcst_addr_t *lg_mcst_list; /* A list of multicast addresses */
-} aggr_grp_t;
-#define AGGR_LACP_LOCK_WRITER(grp) rw_enter(&(grp)->aggr.gl_lock, RW_WRITER);
-#define AGGR_LACP_UNLOCK(grp) rw_exit(&(grp)->aggr.gl_lock);
-#define AGGR_LACP_LOCK_HELD_WRITER(grp) RW_WRITE_HELD(&(grp)->aggr.gl_lock)
-#define AGGR_LACP_LOCK_READER(grp) rw_enter(&(grp)->aggr.gl_lock, RW_READER);
-#define AGGR_LACP_LOCK_HELD_READER(grp) RW_READ_HELD(&(grp)->aggr.gl_lock)
+ /*
+	 * The following fields are used for LACP packet processing.
+	 * Specifically, since LACP packet processing is not performance
+	 * critical, all LACP packets are handled by a dedicated thread
+	 * instead of in the mac_rx() call. This avoids a deadlock with
+	 * mac_unicast_remove(), which holds the mac perimeter of the
+	 * aggr and waits for the mr_refcnt of the RX ring to drop to zero.
+ */
+ kmutex_t lg_lacp_lock;
+ kcondvar_t lg_lacp_cv;
+ mblk_t *lg_lacp_head;
+ mblk_t *lg_lacp_tail;
+ kthread_t *lg_lacp_rx_thread;
+ boolean_t lg_lacp_done;
+ aggr_pseudo_rx_group_t lg_rx_group;
+
+ /*
+ * The following fields are used by aggr to wait for all the
+ * aggr_port_notify_cb() and aggr_port_timer_thread() to finish
+ * before it calls mac_unregister() when the aggr is deleted.
+ */
+ kmutex_t lg_port_lock;
+ kcondvar_t lg_port_cv;
+ int lg_port_ref;
+} aggr_grp_t;
#define AGGR_GRP_REFHOLD(grp) { \
atomic_add_32(&(grp)->lg_refs, 1); \
@@ -195,33 +240,34 @@ extern int aggr_grp_info(datalink_id_t, void *, aggr_grp_info_new_grp_fn_t,
aggr_grp_info_new_port_fn_t);
extern void aggr_grp_notify(aggr_grp_t *, uint32_t);
extern boolean_t aggr_grp_attach_port(aggr_grp_t *, aggr_port_t *);
-extern boolean_t aggr_grp_detach_port(aggr_grp_t *, aggr_port_t *, boolean_t);
+extern boolean_t aggr_grp_detach_port(aggr_grp_t *, aggr_port_t *);
extern void aggr_grp_port_mac_changed(aggr_grp_t *, aggr_port_t *,
boolean_t *, boolean_t *);
extern int aggr_grp_add_ports(datalink_id_t, uint_t, boolean_t,
laioc_port_t *);
extern int aggr_grp_rem_ports(datalink_id_t, uint_t, laioc_port_t *);
extern boolean_t aggr_grp_update_ports_mac(aggr_grp_t *);
-extern int aggr_grp_modify(datalink_id_t, aggr_grp_t *, uint8_t, uint32_t,
- boolean_t, const uchar_t *, aggr_lacp_mode_t, aggr_lacp_timer_t);
+extern int aggr_grp_modify(datalink_id_t, uint8_t, uint32_t, boolean_t,
+ const uchar_t *, aggr_lacp_mode_t, aggr_lacp_timer_t);
extern void aggr_grp_multicst_port(aggr_port_t *, boolean_t);
extern uint_t aggr_grp_count(void);
extern void aggr_port_init(void);
extern void aggr_port_fini(void);
-extern int aggr_port_create(const datalink_id_t, boolean_t, aggr_port_t **);
+extern int aggr_port_create(aggr_grp_t *, const datalink_id_t, boolean_t,
+ aggr_port_t **);
extern void aggr_port_delete(aggr_port_t *);
extern void aggr_port_free(aggr_port_t *);
extern int aggr_port_start(aggr_port_t *);
extern void aggr_port_stop(aggr_port_t *);
extern int aggr_port_promisc(aggr_port_t *, boolean_t);
-extern int aggr_port_unicst(aggr_port_t *, uint8_t *);
+extern int aggr_port_unicst(aggr_port_t *);
extern int aggr_port_multicst(void *, boolean_t, const uint8_t *);
extern uint64_t aggr_port_stat(aggr_port_t *, uint_t);
-extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *, boolean_t);
+extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *);
extern void aggr_port_init_callbacks(aggr_port_t *);
-extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *);
+extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t);
extern mblk_t *aggr_m_tx(void *, mblk_t *);
extern void aggr_send_port_enable(aggr_port_t *);
@@ -236,10 +282,20 @@ extern void aggr_lacp_set_mode(aggr_grp_t *, aggr_lacp_mode_t,
aggr_lacp_timer_t);
extern void aggr_lacp_update_mode(aggr_grp_t *, aggr_lacp_mode_t);
extern void aggr_lacp_update_timer(aggr_grp_t *, aggr_lacp_timer_t);
-extern void aggr_lacp_rx(aggr_port_t *, mblk_t *);
+extern void aggr_lacp_rx_enqueue(aggr_port_t *, mblk_t *);
extern void aggr_lacp_port_attached(aggr_port_t *);
extern void aggr_lacp_port_detached(aggr_port_t *);
-extern void aggr_lacp_policy_changed(aggr_grp_t *);
+extern void aggr_port_lacp_set_mode(aggr_grp_t *, aggr_port_t *);
+
+extern void aggr_lacp_rx_thread(void *);
+extern void aggr_recv_lacp(aggr_port_t *, mac_resource_handle_t, mblk_t *);
+
+extern void aggr_grp_port_hold(aggr_port_t *);
+extern void aggr_grp_port_rele(aggr_port_t *);
+extern void aggr_grp_port_wait(aggr_grp_t *);
+
+extern int aggr_port_addmac(aggr_port_t *, const uint8_t *);
+extern void aggr_port_remmac(aggr_port_t *, const uint8_t *);
#endif /* _KERNEL */
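
The pseudo RX group lets the aggregation expose its member ports' hardware rings as
one logical group; MAC_PSEUDO_RING_INUSE marks occupied slots in arg_rings. A hedged
sketch of slot allocation (the real logic lives in the aggr driver):

/*
 * Hypothetical sketch: claim a free pseudo-ring slot in the group's
 * RX pseudo group.
 */
static aggr_pseudo_rx_ring_t *
pseudo_ring_alloc(aggr_pseudo_rx_group_t *rxg)
{
	uint_t i;

	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
		aggr_pseudo_rx_ring_t *ring = &rxg->arg_rings[i];

		if ((ring->arr_flags & MAC_PSEUDO_RING_INUSE) == 0) {
			ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
			rxg->arg_ring_cnt++;
			return (ring);
		}
	}
	return (NULL);
}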
diff --git a/usr/src/uts/common/sys/aggr_lacp.h b/usr/src/uts/common/sys/aggr_lacp.h
index ebcc07cb12..ef8c7408ac 100644
--- a/usr/src/uts/common/sys/aggr_lacp.h
+++ b/usr/src/uts/common/sys/aggr_lacp.h
@@ -157,8 +157,6 @@ typedef struct Agg {
aggr_lacp_timer_t PeriodicTimer; /* AGGR_LACP_{LONG,SHORT} */
uint64_t TimeOfLastOperChange; /* Time in state */
boolean_t ready; /* Ready_N for all ports TRUE */
-
- krwlock_t gl_lock;
} Agg_t;
/*
@@ -192,6 +190,19 @@ typedef struct state_machine {
} state_machine_t;
/*
+ * The following three flags are set when the corresponding timer expires;
+ * they are consumed by the LACP timer handler thread.
+ */
+#define LACP_PERIODIC_TIMEOUT 0x01
+#define LACP_WAIT_WHILE_TIMEOUT 0x02
+#define LACP_CURRENT_WHILE_TIMEOUT 0x04
+/*
+ * Set when the port is being deleted; used to inform the LACP timer handler
+ * thread to exit.
+ */
+#define LACP_THREAD_EXIT 0x08
+
+/*
* 802.3ad Variables associated with each port (section 43.4.7)
*/
typedef struct aggr_lacp_port {
@@ -228,6 +239,10 @@ typedef struct aggr_lacp_port {
lacp_timer_t current_while_timer;
lacp_timer_t periodic_timer;
lacp_timer_t wait_while_timer;
+ uint32_t lacp_timer_bits;
+ kthread_t *lacp_timer_thread;
+ kmutex_t lacp_timer_lock;
+ kcondvar_t lacp_timer_cv;
hrtime_t time;
} aggr_lacp_port_t;
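
The new lacp_timer_* fields replace the old gl_lock scheme: timeout callbacks just
set a bit and signal, and a per-port thread does the actual state-machine work. A
hedged sketch of the wait loop those fields imply (the real thread lives in the
LACP code):

/*
 * Hypothetical sketch of the timer-thread wait loop implied by the
 * fields above.
 */
static void
lacp_timer_loop(aggr_lacp_port_t *pl)
{
	mutex_enter(&pl->lacp_timer_lock);
	for (;;) {
		while (pl->lacp_timer_bits == 0)
			cv_wait(&pl->lacp_timer_cv, &pl->lacp_timer_lock);

		if (pl->lacp_timer_bits & LACP_THREAD_EXIT)
			break;
		/* handle LACP_PERIODIC_TIMEOUT etc., then clear the bits */
		pl->lacp_timer_bits = 0;
	}
	mutex_exit(&pl->lacp_timer_lock);
}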
diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h
index d3663f464f..1510b46123 100644
--- a/usr/src/uts/common/sys/dld.h
+++ b/usr/src/uts/common/sys/dld.h
@@ -38,6 +38,7 @@
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dld_ioc.h>
+#include <sys/mac_flow.h>
#include <sys/conf.h>
#include <sys/sad.h>
#include <net/if.h>
@@ -84,14 +85,18 @@ extern "C" {
*/
#define DLD_DRIVER_NAME "dld"
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
/*
* IOCTL codes and data structures.
*/
#define DLDIOC_ATTR DLDIOC(0x03)
typedef struct dld_ioc_attr {
- datalink_id_t dia_linkid;
- uint_t dia_max_sdu;
+ datalink_id_t dia_linkid;
+ uint_t dia_max_sdu;
} dld_ioc_attr_t;
#define DLDIOC_VLAN_ATTR DLDIOC(0x04)
@@ -100,7 +105,6 @@ typedef struct dld_ioc_vlan_attr {
uint16_t div_vid;
datalink_id_t div_linkid;
boolean_t div_force;
- boolean_t div_implicit;
} dld_ioc_vlan_attr_t;
#define DLDIOC_PHYS_ATTR DLDIOC(0x05)
@@ -203,15 +207,8 @@ typedef struct dld_ioc_rename {
typedef struct dld_ioc_zid {
zoneid_t diz_zid;
char diz_link[MAXLINKNAMELEN];
- boolean_t diz_is_ppa_hack;
} dld_ioc_zid_t;
-#define DLDIOC_GETZID DLDIOC(0x13)
-typedef struct dld_ioc_getzid {
- datalink_id_t dig_linkid;
- zoneid_t dig_zid;
-} dld_ioc_getzid_t;
-
/*
* data-link autopush configuration.
*/
@@ -221,8 +218,72 @@ struct dlautopush {
char dap_aplist[MAXAPUSH][FMNAMESZ+1];
};
-#define DLDIOC_SETMACPROP DLDIOC(0x14)
-#define DLDIOC_GETMACPROP DLDIOC(0x15)
+#define DLDIOC_MACADDRGET DLDIOC(0x15)
+typedef struct dld_ioc_macaddrget {
+ datalink_id_t dig_linkid;
+ uint_t dig_count;
+ uint_t dig_size;
+} dld_ioc_macaddrget_t;
+
+/* possible flags for dmi_flags below */
+#define DLDIOCMACADDR_USED 0x1 /* address slot used */
+
+typedef struct dld_macaddrinfo {
+ uint_t dmi_slot;
+ uint_t dmi_flags;
+ uint_t dmi_addrlen;
+ uchar_t dmi_addr[MAXMACADDRLEN];
+ char dmi_client_name[MAXNAMELEN];
+ datalink_id_t dma_client_linkid;
+} dld_macaddrinfo_t;
+
+/*
+ * IOCTL codes and data structures for flowadm.
+ */
+#define DLDIOC_ADDFLOW DLDIOC(0x16)
+typedef struct dld_ioc_addflow {
+ datalink_id_t af_linkid;
+ flow_desc_t af_flow_desc;
+ mac_resource_props_t af_resource_props;
+ char af_name[MAXNAMELEN];
+} dld_ioc_addflow_t;
+
+#define DLDIOC_REMOVEFLOW DLDIOC(0x17)
+typedef struct dld_ioc_removeflow {
+ char rf_name[MAXNAMELEN];
+} dld_ioc_removeflow_t;
+
+#define DLDIOC_MODIFYFLOW DLDIOC(0x18)
+typedef struct dld_ioc_modifyflow {
+ char mf_name[MAXNAMELEN];
+ mac_resource_props_t mf_resource_props;
+} dld_ioc_modifyflow_t;
+
+#define DLDIOC_WALKFLOW DLDIOC(0x19)
+typedef struct dld_ioc_walkflow {
+ datalink_id_t wf_linkid;
+ char wf_name[MAXNAMELEN];
+ uint32_t wf_nflows;
+ uint_t wf_len;
+} dld_ioc_walkflow_t;
+
+typedef struct dld_flowinfo {
+ datalink_id_t fi_linkid;
+ flow_desc_t fi_flow_desc;
+ mac_resource_props_t fi_resource_props;
+ char fi_flowname[MAXNAMELEN];
+ uint32_t fi_pad;
+} dld_flowinfo_t;
+
+#define DLDIOC_USAGELOG DLDIOC(0x1a)
+typedef struct dld_ioc_usagelog {
+ mac_logtype_t ul_type;
+ boolean_t ul_onoff;
+ uint_t ul_interval;
+} dld_ioc_usagelog_t;
+
+#define DLDIOC_SETMACPROP DLDIOC(0x1b)
+#define DLDIOC_GETMACPROP DLDIOC(0x1c)
#define MAC_PROP_VERSION 1
typedef struct dld_ioc_macprop_s {
@@ -236,7 +297,111 @@ typedef struct dld_ioc_macprop_s {
char pr_val[1];
} dld_ioc_macprop_t;
+#define DLDIOC_GETHWGRP DLDIOC(0x1d)
+
+typedef struct dld_ioc_hwgrpget {
+ datalink_id_t dih_linkid;
+ uint_t dih_n_groups; /* number of groups included in ioc */
+ uint_t dih_size;
+} dld_ioc_hwgrpget_t;
+
+#define MAXCLIENTNAMELEN 1024
+typedef struct dld_hwgrpinfo {
+ char dhi_link_name[MAXLINKNAMELEN];
+ uint_t dhi_grp_num;
+ uint_t dhi_grp_type;
+ uint_t dhi_n_rings;
+ uint_t dhi_n_clnts;
+ /* XXXX later we should use dhi_n_clnts * MAXNAMELEN for dhi_clnts */
+ char dhi_clnts[MAXCLIENTNAMELEN];
+} dld_hwgrpinfo_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
#ifdef _KERNEL
+
+#define DLD_CAPAB_DIRECT 0x00000001
+#define DLD_CAPAB_POLL 0x00000002
+#define DLD_CAPAB_PERIM 0x00000003
+#define DLD_CAPAB_LSO 0x00000004
+
+#define DLD_ENABLE 0x00000001
+#define DLD_DISABLE 0x00000002
+#define DLD_QUERY 0x00000003
+
+/*
+ * GLDv3 entry point for negotiating capabilities.
+ * This is exposed to IP after negotiation of DL_CAPAB_DLD.
+ *
+ * This function takes the following arguments:
+ * handle: used for identifying the interface to operate on (provided by dld).
+ * type: capability type.
+ * arg: points to a capability-specific structure.
+ * flags: used for indicating whether to enable, disable, or query a capability.
+ *
+ * With this function, capability negotiation is reduced from a multi-step
+ * process to just one single function call.
+ * e.g. the following code would pass 'x' from IP to dld and obtain
+ * arg.output_arg from dld:
+ *
+ * arg.input_arg = x;
+ * rc = (*dld_capab)(handle, DLD_CAPAB_XXX, &arg, DLD_ENABLE);
+ * ill->info1 = arg.output_arg;
+ */
+typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t);
+
+/*
+ * Direct Tx/Rx capability.
+ */
+typedef struct dld_capab_direct_s {
+ /*
+ * Rx entry point and handle, owned by IP.
+ */
+ uintptr_t di_rx_cf;
+ void *di_rx_ch;
+
+ /*
+ * Tx entry points and handle, owned by DLD.
+ */
+ /* Entry point for transmitting packets */
+ uintptr_t di_tx_df;
+ void *di_tx_dh;
+
+ /* flow control notification callback */
+ uintptr_t di_tx_cb_df; /* callback registration/de-registration */
+ void *di_tx_cb_dh;
+} dld_capab_direct_t;
+
+/*
+ * Polling/softring capability.
+ */
+#define POLL_SOFTRING 0x00000001
+typedef struct dld_capab_poll_s {
+ uintptr_t poll_ring_add_cf;
+ uintptr_t poll_ring_remove_cf;
+ uintptr_t poll_ring_quiesce_cf;
+ uintptr_t poll_ring_restart_cf;
+ uintptr_t poll_ring_bind_cf;
+ void *poll_ring_ch;
+ uintptr_t poll_mac_accept_df;
+ void *poll_mac_dh;
+} dld_capab_poll_t;
+
+/*
+ * LSO capability
+ */
+/*
+ * Currently supported flags for LSO.
+ */
+#define DLD_LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */
+
+typedef struct dld_capab_lso_s {
+ uint_t lso_flags; /* capability flags */
+ uint_t lso_max; /* maximum payload */
+} dld_capab_lso_t;
+
int dld_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
int dld_open(queue_t *, dev_t *, int, int, cred_t *);
int dld_close(queue_t *);
@@ -245,6 +410,13 @@ void dld_wsrv(queue_t *);
void dld_init_ops(struct dev_ops *, const char *);
void dld_fini_ops(struct dev_ops *);
int dld_autopush(dev_t *, struct dlautopush *);
+
+int dld_add_flow(datalink_id_t, char *, flow_desc_t *,
+ mac_resource_props_t *);
+int dld_remove_flow(char *);
+int dld_modify_flow(char *, mac_resource_props_t *);
+int dld_walk_flow(dld_ioc_walkflow_t *, intptr_t);
+
#endif
#ifdef __cplusplus
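
The comment above compresses capability negotiation into one indirect call; below is
a hedged sketch of enabling the direct tx/rx path that way. my_rx, the handles, and
what gets saved afterwards are stand-ins, not the real IP implementation:

/* Stand-in receive routine; the real signature is IP's, not shown here. */
static void my_rx(void *, mac_resource_handle_t, mblk_t *, boolean_t);

static int
enable_direct(dld_capab_func_t capab, void *capab_handle, void *rx_handle)
{
	dld_capab_direct_t direct;
	int err;

	bzero(&direct, sizeof (direct));
	direct.di_rx_cf = (uintptr_t)my_rx;	/* caller-owned rx entry point */
	direct.di_rx_ch = rx_handle;

	err = capab(capab_handle, DLD_CAPAB_DIRECT, &direct, DLD_ENABLE);
	if (err == 0) {
		/* dld filled in di_tx_df/di_tx_dh and the callback pair */
	}
	return (err);
}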
diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h
index 8d2138cc52..906fd6fe15 100644
--- a/usr/src/uts/common/sys/dld_impl.h
+++ b/usr/src/uts/common/sys/dld_impl.h
@@ -27,13 +27,12 @@
#define _SYS_DLD_IMPL_H
#include <sys/types.h>
-#include <sys/conf.h>
+#include <sys/list.h>
#include <sys/ethernet.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
-#include <sys/dls.h>
#include <sys/dld.h>
+#include <sys/dls_impl.h>
#ifdef __cplusplus
extern "C" {
@@ -57,39 +56,50 @@ typedef enum {
DLD_ACTIVE
} dld_passivestate_t;
-typedef struct dld_str dld_str_t;
-typedef void (*dld_tx_t)(struct dld_str *, mblk_t *);
-
/*
- * dld_str_t object definition.
+ * The dld_str_t object definition and the protection scheme for each
+ * member are described below. The framework locking mechanism details
+ * are described in mac_impl.h and mac.c.
+ *
+ * Write Once Only (WO): Typically these are initialized when the end point
+ * is created or initialized and don't change subsequently.
+ *
+ * Serializer (SL): Protected by the serializer. All modify operations on an
+ * end point go through the serializer. Readers either don't need to read
+ * these fields atomically, or they also use the serializer to read the
+ * values atomically.
+ *
+ * Lock: kmutex_t or krwlock_t lock. Modify operations still go through the
+ * serializer; the lock helps synchronize readers with writers.
*/
-struct dld_str {
+
+struct dld_str_s { /* Protected by */
/*
* Major number of the device
*/
- major_t ds_major;
+ major_t ds_major; /* WO */
/*
* Ephemeral minor number for the object.
*/
- minor_t ds_minor;
+ minor_t ds_minor; /* WO */
/*
- * Read/write queues for the stream which the object represents.
+ * PPA number this stream is attached to.
*/
- queue_t *ds_rq;
- queue_t *ds_wq;
+ t_uscalar_t ds_ppa; /* SL */
/*
- * Lock to protect this structure.
+ * Read/write queues for the stream which the object represents.
*/
- krwlock_t ds_lock;
+ queue_t *ds_rq; /* WO */
+ queue_t *ds_wq; /* WO */
/*
* Stream is open to DLD_CONTROL (control node) or
* DLD_DLPI (DLS provider) node.
*/
- uint_t ds_type;
+ uint_t ds_type; /* WO */
/*
* The following fields are only used for DLD_DLPI type objects.
@@ -98,158 +108,123 @@ struct dld_str {
/*
* Current DLPI state.
*/
- t_uscalar_t ds_dlstate;
+ t_uscalar_t ds_dlstate; /* ds_lock */
/*
* DLPI style
*/
- t_uscalar_t ds_style;
+ t_uscalar_t ds_style; /* WO */
/*
* Currently bound DLSAP.
*/
- uint16_t ds_sap;
-
- /*
- * Handle of the data-link channel that is used by this object.
- */
- dls_channel_t ds_dc;
+ uint16_t ds_sap; /* SL */
/*
* Handle of the MAC that is used by the data-link interface.
*/
- mac_handle_t ds_mh;
-
- /*
- * VLAN identifier of the data-link interface.
- */
- uint16_t ds_vid;
+ mac_handle_t ds_mh; /* SL */
+ mac_client_handle_t ds_mch; /* SL */
/*
* Promiscuity level information.
*/
- uint32_t ds_promisc;
+ uint32_t ds_promisc; /* SL */
+ mac_promisc_handle_t ds_mph;
+ mac_promisc_handle_t ds_vlan_mph;
/*
* Immutable information of the MAC which the channel is using.
*/
- const mac_info_t *ds_mip;
+ const mac_info_t *ds_mip; /* SL */
/*
* Current packet priority.
*/
- uint_t ds_pri;
+ uint_t ds_pri; /* SL */
/*
* Handle of our MAC notification callback.
*/
- mac_notify_handle_t ds_mnh;
+ mac_notify_handle_t ds_mnh; /* SL */
/*
* Set of enabled DL_NOTE... notifications. (See dlpi.h).
*/
- uint32_t ds_notifications;
-
- /*
- * Cached MAC unicast addresses.
- */
- uint8_t ds_fact_addr[MAXMACADDRLEN];
- uint8_t ds_curr_addr[MAXMACADDRLEN];
+ uint32_t ds_notifications; /* SL */
/*
* Mode: unitdata, fast-path or raw.
*/
- dld_str_mode_t ds_mode;
+ dld_str_mode_t ds_mode; /* SL */
/*
* Native mode state.
*/
- boolean_t ds_native;
+ boolean_t ds_native; /* SL */
/*
* IP polling is operational if this flag is set.
*/
- boolean_t ds_polling;
- boolean_t ds_soft_ring;
+ boolean_t ds_polling; /* SL */
+ boolean_t ds_direct; /* SL */
/*
* LSO is enabled if ds_lso is set.
*/
- boolean_t ds_lso;
- uint64_t ds_lso_max;
+ boolean_t ds_lso; /* SL */
+ uint64_t ds_lso_max; /* SL */
/*
* State of DLPI user: may be active (regular network layer),
* passive (snoop-like monitoring), or unknown (not yet
* determined).
*/
- dld_passivestate_t ds_passivestate;
+ dld_passivestate_t ds_passivestate; /* SL */
/*
* Dummy mblk used for flow-control.
*/
- mblk_t *ds_tx_flow_mp;
-
- /*
- * Internal transmit queue and its parameters.
- */
- kmutex_t ds_tx_list_lock;
- mblk_t *ds_tx_list_head;
- mblk_t *ds_tx_list_tail;
- uint_t ds_tx_cnt;
- uint_t ds_tx_msgcnt;
- timeout_id_t ds_tx_qdepth_tid;
- boolean_t ds_tx_qbusy;
-
- dld_tx_t ds_tx;
- dld_tx_t ds_unitdata_tx;
- kmutex_t ds_tx_lock;
- kcondvar_t ds_tx_cv;
- uint32_t ds_intx_cnt;
- boolean_t ds_detaching;
-
- /*
- * Pending control messages to be processed.
- */
- mblk_t *ds_pending_head;
- mblk_t *ds_pending_tail;
-
- taskqid_t ds_tid;
- kmutex_t ds_disp_lock;
- kcondvar_t ds_disp_cv;
- boolean_t ds_closing;
+ mblk_t *ds_tx_flow_mp; /* ds_lock */
/*
- * Used to process ioctl message for control node. See comments
- * above dld_ioctl().
+ * List of queued DLPI requests. These will be processed
+	 * by a taskq thread. This block is protected by ds_lock.
*/
- void (*ds_ioctl)(queue_t *, mblk_t *);
+ kmutex_t ds_lock;
+ krwlock_t ds_rw_lock;
+ kcondvar_t ds_datathr_cv; /* ds_lock */
+ uint_t ds_datathr_cnt; /* ds_lock */
+ mblk_t *ds_pending_head; /* ds_lock */
+ mblk_t *ds_pending_tail; /* ds_lock */
+ kcondvar_t ds_dlpi_pending_cv; /* ds_lock */
+ uint32_t
+ ds_dlpi_pending : 1, /* ds_lock */
+ ds_local : 1,
+ ds_pad : 30; /* ds_lock */
+
+ dls_link_t *ds_dlp; /* SL */
+ dls_multicst_addr_t *ds_dmap; /* ds_rw_lock */
+ dls_rx_t ds_rx; /* ds_lock */
+ void *ds_rx_arg; /* ds_lock */
+ boolean_t ds_active; /* SL */
+ dld_str_t *ds_next; /* SL */
+ dls_head_t *ds_head;
+ dls_dl_handle_t ds_ddh;
+ list_node_t ds_tqlist;
};
-#define DLD_TX_ENTER(dsp) { \
- mutex_enter(&(dsp)->ds_tx_lock); \
- (dsp)->ds_intx_cnt++; \
- mutex_exit(&(dsp)->ds_tx_lock); \
-}
-
-#define DLD_TX_EXIT(dsp) { \
- mutex_enter(&(dsp)->ds_tx_lock); \
- if ((--(dsp)->ds_intx_cnt == 0) && (dsp)->ds_detaching) \
- cv_signal(&(dsp)->ds_tx_cv); \
- mutex_exit(&(dsp)->ds_tx_lock); \
+#define DLD_DATATHR_INC(dsp) { \
+ ASSERT(MUTEX_HELD(&(dsp)->ds_lock)); \
+ dsp->ds_datathr_cnt++; \
}
-/*
- * Quiesce the traffic.
- */
-#define DLD_TX_QUIESCE(dsp) { \
- mutex_enter(&(dsp)->ds_tx_lock); \
- (dsp)->ds_tx = (dsp)->ds_unitdata_tx = NULL; \
- (dsp)->ds_detaching = B_TRUE; \
- while ((dsp)->ds_intx_cnt != 0) \
- cv_wait(&(dsp)->ds_tx_cv, &(dsp)->ds_tx_lock); \
- (dsp)->ds_detaching = B_FALSE; \
- mutex_exit(&(dsp)->ds_tx_lock); \
+#define DLD_DATATHR_DCR(dsp) { \
+ mutex_enter(&(dsp)->ds_lock); \
+ (dsp)->ds_datathr_cnt--; \
+ if ((dsp)->ds_datathr_cnt == 0) \
+ cv_broadcast(&(dsp)->ds_datathr_cv); \
+ mutex_exit(&(dsp)->ds_lock); \
}
/*
@@ -269,26 +244,34 @@ extern void dld_str_rx_fastpath(void *, mac_resource_handle_t,
mblk_t *, mac_header_info_t *);
extern void dld_str_rx_unitdata(void *, mac_resource_handle_t,
mblk_t *, mac_header_info_t *);
-
-extern void dld_tx_flush(dld_str_t *);
extern void dld_str_notify_ind(dld_str_t *);
-extern void dld_tx_single(dld_str_t *, mblk_t *);
-extern void str_mdata_fastpath_put(dld_str_t *, mblk_t *);
-extern void str_mdata_raw_put(dld_str_t *, mblk_t *);
-
-extern void dld_ioctl(queue_t *, mblk_t *);
-extern void dld_finish_pending_task(dld_str_t *);
+extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *,
+ uintptr_t, uint16_t);
+extern int dld_flow_ctl_callb(dld_str_t *, uint64_t,
+ int (*func)(), void *);
/*
* dld_proto.c
*/
-extern void dld_wput_proto_nondata(dld_str_t *, mblk_t *);
-extern void dld_wput_proto_data(dld_str_t *, mblk_t *);
+extern void dld_proto(dld_str_t *, mblk_t *);
+extern void dld_proto_unitdata_req(dld_str_t *, mblk_t *);
extern void dld_capabilities_disable(dld_str_t *);
+extern void proto_unitdata_req(dld_str_t *, mblk_t *);
+
+/*
+ * dld_flow.c
+ */
+extern void flow_rx_pkt_chain(void *, void *, mblk_t *);
+
+/*
+ * dld_drv.c
+ */
+extern mac_handle_t dld_mac_open(char *dev_name, int *err);
+#define dld_mac_close(mh) mac_close(mh)
/*
* Options: there should be a separate bit defined here for each
- * DLD_PROP... defined in dld.h.
+ * DLD_PROP... defined in dld.h.
*/
#define DLD_OPT_NO_FASTPATH 0x00000001
#define DLD_OPT_NO_POLL 0x00000002
@@ -316,6 +299,33 @@ typedef struct dld_ap {
#define IMPLY(p, c) (!(p) || (c))
+#define DLD_SETQFULL(dsp) { \
+ queue_t *q = (dsp)->ds_wq; \
+ \
+ mutex_enter(&(dsp)->ds_lock); \
+ if ((dsp)->ds_tx_flow_mp != NULL) { \
+ (void) putq(q, (dsp)->ds_tx_flow_mp); \
+ (dsp)->ds_tx_flow_mp = NULL; \
+ qenable((dsp)->ds_wq); \
+ } \
+ mutex_exit(&(dsp)->ds_lock); \
+}
+
+#define DLD_CLRQFULL(dsp) { \
+ queue_t *q = (dsp)->ds_wq; \
+ \
+ mutex_enter(&(dsp)->ds_lock); \
+ if (!mac_tx_is_flow_blocked((dsp)->ds_mch, NULL)) { \
+ if ((dsp)->ds_tx_flow_mp == NULL) \
+ (dsp)->ds_tx_flow_mp = getq(q); \
+ ASSERT((dsp)->ds_tx_flow_mp != NULL); \
+ } \
+ mutex_exit(&(dsp)->ds_lock); \
+}
+
+#define DLD_TX(dsp, mp, f_hint, flag) \
+ mac_tx(dsp->ds_mch, mp, f_hint, flag, NULL)
+
#ifdef DEBUG
#define DLD_DBG cmn_err
#else
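
DLD_SETQFULL/DLD_CLRQFULL tie MAC-layer flow control back into STREAMS: parking
ds_tx_flow_mp on the write queue makes the stream back up until
mac_tx_is_flow_blocked() clears and DLD_CLRQFULL pulls it off again. A hedged sketch
of the tx-side pattern:

static void
my_tx_data(dld_str_t *dsp, mblk_t *mp)
{
	/* a non-zero cookie from mac_tx() means the path is blocked */
	if (DLD_TX(dsp, mp, 0, 0) != 0)
		DLD_SETQFULL(dsp);
}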
diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h
index cb8f5bf225..86406cab4f 100644
--- a/usr/src/uts/common/sys/dld_ioc.h
+++ b/usr/src/uts/common/sys/dld_ioc.h
@@ -77,18 +77,22 @@ extern "C" {
* DLDCOPYIN or DLDCOPYOUT flags are set so that every di_func()
* callback function does not need to copyin/out its own data.
*/
-typedef int (dld_ioc_func_t)(void *, intptr_t, int, cred_t *);
+
+/* Maximum number of Privileges */
+#define DLD_MAX_PRIV 16
+
+typedef int (dld_ioc_func_t)(void *, intptr_t, int, cred_t *, int *);
typedef struct dld_ioc_info {
uint_t di_cmd;
uint_t di_flags;
size_t di_argsize;
dld_ioc_func_t *di_func;
+ const char *di_priv[DLD_MAX_PRIV];
} dld_ioc_info_t;
/* Values for di_flags */
#define DLDCOPYIN 0x00000001 /* copyin di_argsize amount of data */
#define DLDCOPYOUT 0x00000002 /* copyout di_argsize amount of data */
-#define DLDDLCONFIG 0x00000004 /* ioctl requires PRIV_SYS_DL_CONFIG */
#define DLDCOPYINOUT (DLDCOPYIN | DLDCOPYOUT)
#define DLDIOCCNT(l) (sizeof (l) / sizeof (dld_ioc_info_t))
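
A hedged sketch of an ioctl table entry using the extended dld_ioc_info_t: the new
di_priv array names the privileges an ioctl requires, replacing the old blanket
DLDDLCONFIG flag. my_addflow_ioc is a stand-in handler; PRIV_SYS_DL_CONFIG is the
privilege-name string from <sys/priv_names.h>:

static int my_addflow_ioc(void *, intptr_t, int, cred_t *, int *);

static dld_ioc_info_t my_ioc_list[] = {
	{DLDIOC_ADDFLOW, DLDCOPYIN, sizeof (dld_ioc_addflow_t),
	    my_addflow_ioc, {PRIV_SYS_DL_CONFIG}},
};
/* DLDIOCCNT(my_ioc_list) gives the entry count when registering the table */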
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index 3af7b7bca7..aa01ddeed6 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -586,12 +586,8 @@ union DL_qos_types {
/* dl_data is dl_capab_mdt_t */
#define DL_CAPAB_ZEROCOPY 0x05 /* Zero-copy capability */
/* dl_data is dl_capab_zerocopy_t */
-#define DL_CAPAB_POLL 0x06 /* Polling capability */
- /* dl_data is dl_capab_dls_t */
-#define DL_CAPAB_SOFT_RING 0x07 /* Soft ring capable */
- /* dl_data is dl_capab_dls_t */
-#define DL_CAPAB_LSO 0x08 /* Large Send Offload capability */
- /* dl_data is dl_capab_lso_t */
+#define DL_CAPAB_DLD 0x06 /* dld capability */
+ /* dl_data is dl_capab_dld_t */
typedef struct {
t_uscalar_t dl_cap; /* capability type */
@@ -710,55 +706,22 @@ typedef struct {
#ifdef _KERNEL
/*
- * This structure is used by DL_CAPAB_POLL and DL_CAPAB_SOFT_RING
- * capabilities. It provides a mechanism for IP to exchange function
- * pointers with a gldv3-based driver to enable it to bypass streams-
- * data-paths. DL_CAPAB_POLL mechanism provides a way to blank
- * interrupts. Note: True polling support will be added in the future.
- * DL_CAPAB_SOFT_RING provides a mechanism to create soft ring at the
- * dls layer.
+ * The DL_CAPAB_DLD capability enables the capabilities of gldv3-based drivers
+ * to be negotiated using a function call (dld_capab) instead of using streams.
*/
-typedef struct dl_capab_dls_s {
- t_uscalar_t dls_version;
- t_uscalar_t dls_flags;
+typedef struct dl_capab_dld_s {
+ t_uscalar_t dld_version;
+ t_uscalar_t dld_flags;
/* DLD provided information */
- uintptr_t dls_tx_handle;
- uintptr_t dls_tx;
- uintptr_t dls_ring_change_status;
- uintptr_t dls_ring_bind;
- uintptr_t dls_ring_unbind;
+ uintptr_t dld_capab;
+ uintptr_t dld_capab_handle;
+ dl_mid_t dld_mid; /* module ID */
+} dl_capab_dld_t;
- /* IP provided information */
- uintptr_t dls_rx_handle;
- uintptr_t dls_ring_assign;
- uintptr_t dls_rx;
- uintptr_t dls_ring_add;
- t_uscalar_t dls_ring_cnt;
-
- dl_mid_t dls_mid; /* module ID */
-} dl_capab_dls_t;
-
-#define POLL_CURRENT_VERSION 0x01
-#define POLL_VERSION_1 0x01
-
-#define SOFT_RING_VERSION_1 0x01
-
-/* Values for poll_flags */
-#define POLL_ENABLE 0x01 /* Set to enable polling */
- /* capability */
-#define POLL_CAPABLE 0x02 /* Polling ability exists */
-#define POLL_DISABLE 0x03 /* Disable Polling */
-
-/* Values for soft_ring_flags */
-#define SOFT_RING_ENABLE 0x04 /* Set to enable soft_ring */
- /* capability */
-#define SOFT_RING_CAPABLE 0x05 /* Soft_Ring ability exists */
-#define SOFT_RING_DISABLE 0x06 /* Disable Soft_Ring */
-
-/* Soft_Ring fanout types (used by soft_ring_change_status) */
-#define SOFT_RING_NONE 0x00
-#define SOFT_RING_FANOUT 0x01
+#define DL_CAPAB_DLD_ENABLE 0x00000001
+#define DLD_VERSION_1 1
+#define DLD_CURRENT_VERSION DLD_VERSION_1
#endif /* _KERNEL */
@@ -786,29 +749,6 @@ typedef struct {
/* transmit */
/*
- * Large Send Offload sub-capability (follows dl_capability_sub_t)
- */
-typedef struct {
- t_uscalar_t lso_version; /* interface version */
- t_uscalar_t lso_flags; /* capability flags */
- t_uscalar_t lso_max; /* maximum payload */
- t_uscalar_t reserved[1]; /* reserved fields */
- dl_mid_t lso_mid; /* module ID */
-} dl_capab_lso_t;
-
-/*
- * Large Send Offload revision definition history
- */
-#define LSO_CURRENT_VERSION 0x01
-#define LSO_VERSION_1 0x01
-
-/*
- * Currently supported values of lso_flags
- */
-#define LSO_TX_ENABLE 0x01 /* to enable LSO */
-#define LSO_TX_BASIC_TCP_IPV4 0x02 /* TCP LSO capability */
-
-/*
* DLPI interface primitive definitions.
*
* Each primitive is sent as a stream message. It is possible that
diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h
index 3bfe25ecf0..c96c6f1b85 100644
--- a/usr/src/uts/common/sys/dls.h
+++ b/usr/src/uts/common/sys/dls.h
@@ -28,8 +28,8 @@
#include <sys/types.h>
#include <sys/stream.h>
-#include <net/if.h>
-#include <sys/mac.h>
+#include <sys/mac_client.h>
+#include <sys/dls_mgmt.h>
/*
* Data-Link Services Module
@@ -53,233 +53,56 @@ extern "C" {
* Macros for converting ppas to instance #s, Vlan ID, or minor.
*/
#define DLS_PPA2INST(ppa) ((int)((ppa) % 1000))
-#define DLS_PPA2VID(ppa) ((ppa) / 1000)
+#define DLS_PPA2VID(ppa) ((uint16_t)((ppa) / 1000))
+#define DLS_PPA2MINOR(ppa) ((minor_t)((DLS_PPA2INST(ppa)) + 1))
/*
- * Converts a minor to an instance#; makes sense only when minor <= 1000.
- */
-#define DLS_MINOR2INST(minor) ((int)((minor) - 1))
-
-typedef enum {
- DATALINK_CLASS_PHYS = 0x01,
- DATALINK_CLASS_VLAN = 0x02,
- DATALINK_CLASS_AGGR = 0x04,
- DATALINK_CLASS_VNIC = 0x08
-} datalink_class_t;
-
-#define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \
- DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC)
-
-/*
- * A combination of flags and media.
- * flags is the higher 32 bits, and if it is 0x01, it indicates all media
- * types can be accepted; otherwise, only the given media type (specified
- * in the lower 32 bits) is accepted.
+ * Maps a (VID, INST) pair to ppa
*/
-typedef uint64_t datalink_media_t;
-
-#define DATALINK_ANY_MEDIATYPE \
- ((datalink_media_t)(((datalink_media_t)0x01) << 32))
-
-#define DATALINK_MEDIA_ACCEPTED(dmedia, media) \
- (((uint32_t)(((dmedia) >> 32) & 0xfffffffful) & 0x01) ? \
- B_TRUE : ((uint32_t)((dmedia) & 0xfffffffful) == (media)))
-
-#define MAXLINKATTRLEN 32
-#define MAXLINKATTRVALLEN 1024
+#define DLS_VIDINST2PPA(vid, inst) ((minor_t)((vid) * 1000 + (inst)))
/*
- * Link attributes used by the kernel.
- */
-/*
- * The major number and instance number of the underlying physical device
- * are kept as FPHYMAJ and FPHYINST (major, instance + 1).
- *
- * Set for physical links only.
- */
-#define FPHYMAJ "phymaj" /* uint64_t */
-#define FPHYINST "phyinst" /* uint64_t */
-
-/*
- * The devname of the physical link. For example, bge0, ce1. Set for physical
- * links only.
- */
-#define FDEVNAME "devname" /* string */
-
-/*
- * The door file for the dlmgmtd (data-link management) daemon.
- */
-#define DLMGMT_DOOR "/etc/svc/volatile/dladm/dlmgmt_door"
-
-/*
- * Door upcall commands.
- */
-#define DLMGMT_CMD_DLS_CREATE 1
-#define DLMGMT_CMD_DLS_GETATTR 2
-#define DLMGMT_CMD_DLS_DESTROY 3
-#define DLMGMT_CMD_GETNAME 4
-#define DLMGMT_CMD_GETLINKID 5
-#define DLMGMT_CMD_GETNEXT 6
-#define DLMGMT_CMD_DLS_UPDATE 7
-#define DLMGMT_CMD_LINKPROP_INIT 8
-#define DLMGMT_CMD_BASE 128
-
-/*
- * Indicate the link mapping is active or persistent
- */
-#define DLMGMT_ACTIVE 0x01
-#define DLMGMT_PERSIST 0x02
-
-/* upcall argument */
-typedef struct dlmgmt_door_arg {
- uint_t ld_cmd;
-} dlmgmt_door_arg_t;
-
-typedef struct dlmgmt_upcall_arg_create {
- int ld_cmd;
- datalink_class_t ld_class;
- uint32_t ld_media;
- boolean_t ld_persist;
- uint64_t ld_phymaj;
- uint64_t ld_phyinst;
- char ld_devname[MAXNAMELEN];
-} dlmgmt_upcall_arg_create_t;
-
-/*
- * Note: ld_padding is necessary to keep the size of the structure the
- * same on amd64 and i386. The same note applies to other ld_padding
- * and lr_paddding fields in structures throughout this file.
+ * Converts a minor to an instance#; makes sense only when minor <= 1000.
*/
-typedef struct dlmgmt_upcall_arg_destroy {
- int ld_cmd;
- datalink_id_t ld_linkid;
- boolean_t ld_persist;
- int ld_padding;
-} dlmgmt_upcall_arg_destroy_t;
-
-typedef struct dlmgmt_upcall_arg_update {
- int ld_cmd;
- boolean_t ld_novanity;
- uint32_t ld_media;
- uint32_t ld_padding;
- char ld_devname[MAXNAMELEN];
-} dlmgmt_upcall_arg_update_t;
-
-typedef struct dlmgmt_upcall_arg_getattr {
- int ld_cmd;
- datalink_id_t ld_linkid;
- char ld_attr[MAXLINKATTRLEN];
-} dlmgmt_upcall_arg_getattr_t;
-
-typedef struct dlmgmt_door_getname {
- int ld_cmd;
- datalink_id_t ld_linkid;
-} dlmgmt_door_getname_t;
-
-typedef struct dlmgmt_door_getlinkid {
- int ld_cmd;
- char ld_link[MAXLINKNAMELEN];
-} dlmgmt_door_getlinkid_t;
-
-typedef struct dlmgmt_door_getnext_s {
- int ld_cmd;
- datalink_id_t ld_linkid;
- datalink_class_t ld_class;
- uint32_t ld_flags;
- datalink_media_t ld_dmedia;
-} dlmgmt_door_getnext_t;
-
-typedef struct dlmgmt_door_linkprop_init {
- int ld_cmd;
- datalink_id_t ld_linkid;
-} dlmgmt_door_linkprop_init_t;
-
-/* upcall return value */
-typedef struct dlmgmt_retval_s {
- uint_t lr_err; /* return error code */
-} dlmgmt_retval_t;
-
-typedef dlmgmt_retval_t dlmgmt_destroy_retval_t,
- dlmgmt_linkprop_init_retval_t;
-
-struct dlmgmt_linkid_retval_s {
- uint_t lr_err;
- datalink_id_t lr_linkid;
- uint32_t lr_flags;
- datalink_class_t lr_class;
- uint32_t lr_media;
- uint32_t lr_padding;
-};
-
-typedef struct dlmgmt_linkid_retval_s dlmgmt_create_retval_t,
- dlmgmt_update_retval_t,
- dlmgmt_getlinkid_retval_t,
- dlmgmt_getnext_retval_t;
-
-typedef struct dlmgmt_getname_retval_s {
- uint_t lr_err;
- char lr_link[MAXLINKNAMELEN];
- datalink_class_t lr_class;
- uint32_t lr_media;
- uint32_t lr_flags;
-} dlmgmt_getname_retval_t;
-
-typedef struct dlmgmt_getattr_retval_s {
- uint_t lr_err;
- uint_t lr_type;
- uint_t lr_attrsz;
- uint_t lr_padding;
- char lr_attrval[MAXLINKATTRVALLEN];
-} dlmgmt_getattr_retval_t;
+#define DLS_MINOR2INST(minor) ((int)((minor) - 1))
#ifdef _KERNEL
#define DLS_MAX_PPA 999
#define DLS_MAX_MINOR (DLS_MAX_PPA + 1)
-typedef struct dls_t *dls_channel_t;
+typedef void (*dls_rx_t)(void *, mac_resource_handle_t, mblk_t *,
+ mac_header_info_t *);
-extern int dls_open_style2_vlan(major_t, uint_t, dls_channel_t *);
-extern int dls_open_by_dev(dev_t, dls_channel_t *);
-extern void dls_close(dls_channel_t);
-
-extern mac_handle_t dls_mac(dls_channel_t);
-extern uint16_t dls_vid(dls_channel_t);
+typedef struct dld_str_s dld_str_t;
+typedef struct dls_devnet_s *dls_dl_handle_t;
+typedef struct dls_dev_t *dls_dev_handle_t;
+typedef struct dls_link_s dls_link_t;
#define DLS_SAP_LLC 0
#define DLS_SAP_PROMISC (1 << 16)
-extern int dls_bind(dls_channel_t, uint32_t);
-extern void dls_unbind(dls_channel_t);
-
#define DLS_PROMISC_SAP 0x00000001
#define DLS_PROMISC_MULTI 0x00000002
#define DLS_PROMISC_PHYS 0x00000004
-extern int dls_promisc(dls_channel_t, uint32_t);
-
-extern int dls_multicst_add(dls_channel_t, const uint8_t *);
-extern int dls_multicst_remove(dls_channel_t, const uint8_t *);
-
-extern mblk_t *dls_header(dls_channel_t, const uint8_t *,
- uint16_t, uint_t, mblk_t **);
-extern int dls_header_info(dls_channel_t, mblk_t *,
- mac_header_info_t *);
+extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *);
+extern void dls_close(dld_str_t *);
+extern int dls_bind(dld_str_t *, uint32_t);
+extern int dls_unbind(dld_str_t *);
-typedef void (*dls_rx_t)(void *, mac_resource_handle_t, mblk_t *,
- mac_header_info_t *);
+extern int dls_promisc(dld_str_t *, uint32_t);
-extern void dls_rx_set(dls_channel_t, dls_rx_t, void *);
+extern int dls_multicst_add(dld_str_t *, const uint8_t *);
+extern int dls_multicst_remove(dld_str_t *, const uint8_t *);
-extern mblk_t *dls_tx(dls_channel_t, mblk_t *);
+extern mblk_t *dls_header(dld_str_t *, const uint8_t *,
+ uint16_t, uint_t, mblk_t **);
-extern boolean_t dls_active_set(dls_channel_t);
-extern void dls_active_clear(dls_channel_t);
+extern void dls_rx_set(dld_str_t *, dls_rx_t, void *);
+extern dld_str_t *dls_rx_get(char *, flow_desc_t *, size_t *);
-extern dev_info_t *dls_finddevinfo(dev_t);
-
-typedef struct dls_devnet_s *dls_dl_handle_t;
-typedef struct dls_dev_t *dls_dev_handle_t;
+extern void str_notify(void *, mac_notify_type_t);
extern int dls_devnet_open(const char *,
dls_dl_handle_t *, dev_t *);
@@ -289,19 +112,18 @@ extern boolean_t dls_devnet_rebuild();
extern int dls_devnet_rename(datalink_id_t, datalink_id_t,
const char *);
extern int dls_devnet_create(mac_handle_t, datalink_id_t);
-extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *);
+extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *,
+ boolean_t);
extern int dls_devnet_recreate(mac_handle_t, datalink_id_t);
-extern int dls_devnet_create_vlan(datalink_id_t,
- datalink_id_t, uint16_t, boolean_t);
-extern int dls_devnet_destroy_vlan(datalink_id_t);
extern int dls_devnet_hold_tmp(datalink_id_t, dls_dl_handle_t *);
extern void dls_devnet_rele_tmp(dls_dl_handle_t);
+extern int dls_devnet_hold_by_dev(dev_t, dls_dl_handle_t *);
+extern void dls_devnet_rele(dls_dl_handle_t);
extern void dls_devnet_prop_task_wait(dls_dl_handle_t);
extern const char *dls_devnet_mac(dls_dl_handle_t);
extern uint16_t dls_devnet_vid(dls_dl_handle_t);
extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t);
-extern boolean_t dls_devnet_is_explicit(dls_dl_handle_t);
extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *);
extern int dls_devnet_phydev(datalink_id_t, dev_t *);
extern int dls_devnet_setzid(const char *, zoneid_t);
@@ -318,6 +140,8 @@ extern int dls_mgmt_get_linkinfo(datalink_id_t, char *,
extern int dls_mgmt_get_linkid(const char *, datalink_id_t *);
extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t,
datalink_media_t, uint32_t);
+extern int dls_devnet_macname2linkid(const char *,
+ datalink_id_t *);
#endif /* _KERNEL */
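
The PPA macros above encode a VLAN ID and device instance into one style-2 PPA
(vid * 1000 + instance). A worked example:

#include <sys/dls.h>

void
ppa_example(void)
{
	uint_t ppa = DLS_VIDINST2PPA(5, 3);	/* VLAN 5 on instance 3 */

	ASSERT(ppa == 5003);
	ASSERT(DLS_PPA2INST(ppa) == 3);
	ASSERT(DLS_PPA2VID(ppa) == 5);
	ASSERT(DLS_PPA2MINOR(ppa) == 4);	/* instance + 1 */
}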
diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h
index 83bccd20bb..71f79a611a 100644
--- a/usr/src/uts/common/sys/dls_impl.h
+++ b/usr/src/uts/common/sys/dls_impl.h
@@ -26,174 +26,97 @@
#ifndef _SYS_DLS_IMPL_H
#define _SYS_DLS_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/stream.h>
#include <sys/dls.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
#include <sys/modhash.h>
#include <sys/kstat.h>
#include <net/if.h>
#include <sys/dlpi.h>
-#include <sys/dls_soft_ring.h>
#ifdef __cplusplus
extern "C" {
#endif
-typedef struct dls_multicst_addr_s dls_multicst_addr_t;
-
-struct dls_multicst_addr_s {
- dls_multicst_addr_t *dma_nextp;
- uint8_t dma_addr[MAXMACADDRLEN];
-};
-
-typedef struct dls_link_s dls_link_t;
-
-struct dls_link_s {
- char dl_name[MAXNAMELEN];
- mac_handle_t dl_mh;
- const mac_info_t *dl_mip;
- mac_rx_handle_t dl_mrh;
- mac_txloop_handle_t dl_mth;
- uint_t dl_ref;
- uint_t dl_macref;
- mod_hash_t *dl_impl_hash;
- krwlock_t dl_impl_lock;
- uint_t dl_impl_count;
- kmutex_t dl_promisc_lock;
- uint_t dl_npromisc;
- uint_t dl_nactive;
- uint32_t dl_unknowns;
- kmutex_t dl_lock;
+typedef struct dls_multicst_addr_s {
+ struct dls_multicst_addr_s *dma_nextp; /* ds_rw_lock */
+ uint8_t dma_addr[MAXMACADDRLEN];
+} dls_multicst_addr_t;
+
+struct dls_link_s { /* Protected by */
+ char dl_name[MAXNAMELEN]; /* SL */
+ uint_t dl_ddi_instance; /* SL */
+ mac_handle_t dl_mh; /* SL */
+ mac_client_handle_t dl_mch; /* SL */
+ mac_unicast_handle_t dl_mah; /* SL */
+ const mac_info_t *dl_mip; /* SL */
+ uint_t dl_ref; /* SL */
+ mod_hash_t *dl_str_hash; /* SL, modhash lock */
+ uint_t dl_impl_count; /* SL */
+ uint_t dl_nactive; /* SL */
+ uint32_t dl_unknowns; /* atomic */
+ zoneid_t dl_zid;
+ uint_t dl_zone_ref;
};
-typedef struct dls_impl_s dls_impl_t;
-typedef struct dls_head_s dls_head_t;
-
-/*
- * The maximum length of an SPA (subnetwork point of attachment). It is of
- * the form <macname/vid>.
- */
-#define MAXSPALEN (MAXNAMELEN + 5)
-
-typedef struct dls_vlan_s {
- /*
- * The following fields will not change after dls_vlan_t creation.
- */
- dls_link_t *dv_dlp;
- uint16_t dv_id;
-
- /*
- * Unique SPA (of the form <macname/vid>) identifying a data-link;
- * is needed to avoid name collisions between an explicitly and
- * implicitly created VLANs.
- */
- char dv_spa[MAXSPALEN];
-
- /*
- * The ppa value of the associated device. Used to derive this link's
- * devfs node name.
- */
- uint_t dv_ppa;
-
- /*
- * The dev_t used to access this dls_vlan_t.
- */
- dev_t dv_dev;
-
- dev_info_t *dv_dip;
- kstat_t *dv_ksp;
- uint32_t dv_force : 1;
-
- /*
- * The following fields are protected by dv_lock.
- */
- kmutex_t dv_lock;
-
- /*
- * Reference count of dls_impl_t plus explicit creation of the link
- */
- uint_t dv_ref;
-
- /*
- * The reference count of this vlan is opened in its own zone.
- */
- uint_t dv_zone_ref;
- zoneid_t dv_zid;
-} dls_vlan_t;
-
-struct dls_impl_s {
- dls_impl_t *di_nextp;
- dls_head_t *di_headp;
- dls_vlan_t *di_dvp;
- mac_handle_t di_mh;
- mac_notify_handle_t di_mnh;
- const mac_info_t *di_mip;
- krwlock_t di_lock;
- uint16_t di_sap;
- uint_t di_promisc;
- dls_multicst_addr_t *di_dmap;
- dls_rx_t di_rx;
- void *di_rx_arg;
- mac_resource_add_t di_ring_add;
- const mac_txinfo_t *di_txinfo;
- uint_t di_bound : 1,
- di_removing : 1,
- di_active : 1,
- di_local : 1;
-
- uint8_t di_unicst_addr[MAXMACADDRLEN];
- soft_ring_t **di_soft_ring_list;
- uint_t di_soft_ring_size;
- dls_dl_handle_t di_ddh;
-};
-
-struct dls_head_s {
- dls_impl_t *dh_list;
- uint_t dh_ref;
- mod_hash_key_t dh_key;
-};
+typedef struct dls_head_s {
+ kmutex_t dh_lock;
+ struct dld_str_s *dh_list; /* dh_ref */
+ uint_t dh_ref; /* dh_lock */
+ mod_hash_key_t dh_key; /* SL */
+ kcondvar_t dh_cv; /* dh_lock */
+ uint_t dh_removing; /* dh_lock */
+} dls_head_t;
extern void dls_link_init(void);
extern int dls_link_fini(void);
extern int dls_link_hold(const char *, dls_link_t **);
+extern int dls_link_hold_create(const char *, dls_link_t **);
+extern int dls_link_hold_by_dev(dev_t, dls_link_t **);
extern void dls_link_rele(dls_link_t *);
-extern void dls_link_add(dls_link_t *, uint32_t, dls_impl_t *);
-extern void dls_link_remove(dls_link_t *, dls_impl_t *);
+extern int dls_link_rele_by_name(const char *);
+extern void dls_link_add(dls_link_t *, uint32_t, dld_str_t *);
+extern void dls_link_remove(dls_link_t *, dld_str_t *);
extern int dls_link_header_info(dls_link_t *, mblk_t *,
mac_header_info_t *);
-extern int dls_mac_hold(dls_link_t *);
-extern void dls_mac_rele(dls_link_t *);
-extern boolean_t dls_mac_active_set(dls_link_t *);
-extern void dls_mac_active_clear(dls_link_t *);
+extern int dls_link_setzid(const char *, zoneid_t);
+extern dev_info_t *dls_link_devinfo(dev_t);
+extern dev_t dls_link_dev(dls_link_t *);
-extern void dls_mac_stat_create(dls_vlan_t *);
-extern void dls_mac_stat_destroy(dls_vlan_t *);
+extern void i_dls_head_rele(dls_head_t *);
+extern int		dls_mac_active_set(dls_link_t *);
+extern void dls_mac_active_clear(dls_link_t *);
-extern void dls_vlan_init(void);
-extern int dls_vlan_fini(void);
-extern int dls_vlan_hold(const char *, uint16_t, dls_vlan_t **,
- boolean_t, boolean_t);
-extern int dls_vlan_hold_by_dev(dev_t, dls_vlan_t **);
-extern void dls_vlan_rele(dls_vlan_t *);
-extern int dls_vlan_destroy(const char *, uint16_t);
-extern int dls_vlan_create(const char *, uint16_t, boolean_t);
-extern int dls_vlan_setzid(const char *, uint16_t, zoneid_t);
-extern int dls_stat_update(kstat_t *, dls_vlan_t *, int);
+extern void dls_create_str_kstats(dld_str_t *);
+extern int dls_stat_update(kstat_t *, dls_link_t *, int);
extern int dls_stat_create(const char *, int, const char *,
int (*)(struct kstat *, int), void *, kstat_t **);
-extern int dls_devnet_open_by_dev(dev_t, dls_vlan_t **,
+extern int dls_devnet_open_by_dev(dev_t, dls_link_t **,
dls_dl_handle_t *);
+extern int dls_devnet_hold_link(datalink_id_t, dls_dl_handle_t *,
+ dls_link_t **);
+extern void dls_devnet_rele_link(dls_dl_handle_t, dls_link_t *);
extern void dls_init(void);
extern int dls_fini(void);
extern void dls_link_txloop(void *, mblk_t *);
-extern boolean_t dls_accept(dls_impl_t *, mac_header_info_t *,
+extern boolean_t dls_accept(dld_str_t *, mac_header_info_t *,
dls_rx_t *, void **);
-extern boolean_t dls_accept_loopback(dls_impl_t *, mac_header_info_t *,
+extern boolean_t dls_accept_loopback(dld_str_t *, mac_header_info_t *,
dls_rx_t *, void **);
+extern boolean_t dls_accept_promisc(dld_str_t *, mac_header_info_t *,
+ dls_rx_t *, void **, boolean_t);
+extern void i_dls_link_rx(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+extern void dls_rx_promisc(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+extern void dls_rx_vlan_promisc(void *, mac_resource_handle_t,
+ mblk_t *, boolean_t);
+extern int dls_active_set(dld_str_t *);
+extern void dls_active_clear(dld_str_t *);
extern void dls_mgmt_init(void);
extern void dls_mgmt_fini(void);
diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h
new file mode 100644
index 0000000000..5177de09b9
--- /dev/null
+++ b/usr/src/uts/common/sys/dls_mgmt.h
@@ -0,0 +1,218 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DLS_MGMT_H
+#define _DLS_MGMT_H
+
+#include <sys/types.h>
+#include <sys/dld.h>
+
+/*
+ * Data-Link Services Module
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ DATALINK_CLASS_PHYS = 0x01,
+ DATALINK_CLASS_VLAN = 0x02,
+ DATALINK_CLASS_AGGR = 0x04,
+ DATALINK_CLASS_VNIC = 0x08,
+ DATALINK_CLASS_ETHERSTUB = 0x10
+} datalink_class_t;
+
+#define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \
+ DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \
+ DATALINK_CLASS_ETHERSTUB)
+
+/*
+ * A combination of flags and media.
+ * The flags occupy the upper 32 bits; if they are 0x01, all media
+ * types are accepted. Otherwise only the media type given in the
+ * lower 32 bits is accepted.
+ */
+typedef uint64_t datalink_media_t;
+
+#define DATALINK_ANY_MEDIATYPE \
+ ((datalink_media_t)(((datalink_media_t)0x01) << 32))
+
+#define DATALINK_MEDIA_ACCEPTED(dmedia, media) \
+ (((uint32_t)(((dmedia) >> 32) & 0xfffffffful) & 0x01) ? \
+ B_TRUE : ((uint32_t)((dmedia) & 0xfffffffful) == (media)))
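
The encoding can be exercised in isolation. Below is a minimal userland sketch (not part of the patch) that copies the two macros above and checks a media value against both a specific-media and an any-media descriptor; B_TRUE is replaced with 1 so it compiles standalone, and the value 0x4 for DL_ETHER is taken from <sys/dlpi.h>:

	#include <stdio.h>
	#include <stdint.h>

	typedef uint64_t datalink_media_t;

	#define	DATALINK_ANY_MEDIATYPE \
		((datalink_media_t)(((datalink_media_t)0x01) << 32))

	#define	DATALINK_MEDIA_ACCEPTED(dmedia, media) \
		(((uint32_t)(((dmedia) >> 32) & 0xfffffffful) & 0x01) ? \
		1 : ((uint32_t)((dmedia) & 0xfffffffful) == (media)))

	int
	main(void)
	{
		datalink_media_t ether_only = (datalink_media_t)0x4; /* DL_ETHER */

		/* 1: media matches the lower 32 bits */
		(void) printf("%d\n", DATALINK_MEDIA_ACCEPTED(ether_only, 0x4));
		/* 0: a different media type is rejected */
		(void) printf("%d\n", DATALINK_MEDIA_ACCEPTED(ether_only, 0x12));
		/* 1: the any-media flag accepts everything */
		(void) printf("%d\n",
		    DATALINK_MEDIA_ACCEPTED(DATALINK_ANY_MEDIATYPE, 0x12));
		return (0);
	}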
+
+#define MAXLINKATTRLEN 32
+#define MAXLINKATTRVALLEN 1024
+
+/*
+ * Link attributes used by the kernel.
+ */
+/*
+ * The major number and instance number of the underlying physical device
+ * are kept as FPHYMAJ and FPHYINST (major, instance + 1).
+ *
+ * Set for physical links only.
+ */
+#define FPHYMAJ "phymaj" /* uint64_t */
+#define FPHYINST "phyinst" /* uint64_t */
+
+/*
+ * The devname of the physical link. For example, bge0, ce1. Set for physical
+ * links only.
+ */
+#define FDEVNAME "devname" /* string */
+
+/*
+ * The door file for the dlmgmtd (data-link management) daemon.
+ */
+#define DLMGMT_DOOR "/etc/svc/volatile/dladm/dlmgmt_door"
+
+/*
+ * Door upcall commands.
+ */
+#define DLMGMT_CMD_DLS_CREATE 1
+#define DLMGMT_CMD_DLS_GETATTR 2
+#define DLMGMT_CMD_DLS_DESTROY 3
+#define DLMGMT_CMD_GETNAME 4
+#define DLMGMT_CMD_GETLINKID 5
+#define DLMGMT_CMD_GETNEXT 6
+#define DLMGMT_CMD_DLS_UPDATE 7
+#define DLMGMT_CMD_LINKPROP_INIT 8
+#define DLMGMT_CMD_BASE 128
+
+/*
+ * Indicate the link mapping is active or persistent
+ */
+#define DLMGMT_ACTIVE 0x01
+#define DLMGMT_PERSIST 0x02
+
+/* upcall argument */
+typedef struct dlmgmt_door_arg {
+ uint_t ld_cmd;
+} dlmgmt_door_arg_t;
+
+typedef struct dlmgmt_upcall_arg_create {
+ int ld_cmd;
+ datalink_class_t ld_class;
+ uint32_t ld_media;
+ boolean_t ld_persist;
+ uint64_t ld_phymaj;
+ uint64_t ld_phyinst;
+ char ld_devname[MAXNAMELEN];
+} dlmgmt_upcall_arg_create_t;
+
+/*
+ * Note: ld_padding is necessary to keep the size of the structure the
+ * same on amd64 and i386. The same note applies to other ld_padding
+ * and lr_padding fields in structures throughout this file.
+ */
+typedef struct dlmgmt_upcall_arg_destroy {
+ int ld_cmd;
+ datalink_id_t ld_linkid;
+ boolean_t ld_persist;
+ int ld_padding;
+} dlmgmt_upcall_arg_destroy_t;
+
+typedef struct dlmgmt_upcall_arg_update {
+ int ld_cmd;
+ boolean_t ld_novanity;
+ uint32_t ld_media;
+ uint32_t ld_padding;
+ char ld_devname[MAXNAMELEN];
+} dlmgmt_upcall_arg_update_t;
+
+typedef struct dlmgmt_upcall_arg_getattr {
+ int ld_cmd;
+ datalink_id_t ld_linkid;
+ char ld_attr[MAXLINKATTRLEN];
+} dlmgmt_upcall_arg_getattr_t;
+
+typedef struct dlmgmt_door_getname {
+ int ld_cmd;
+ datalink_id_t ld_linkid;
+} dlmgmt_door_getname_t;
+
+typedef struct dlmgmt_door_getlinkid {
+ int ld_cmd;
+ char ld_link[MAXLINKNAMELEN];
+} dlmgmt_door_getlinkid_t;
+
+typedef struct dlmgmt_door_getnext_s {
+ int ld_cmd;
+ datalink_id_t ld_linkid;
+ datalink_class_t ld_class;
+ uint32_t ld_flags;
+ datalink_media_t ld_dmedia;
+} dlmgmt_door_getnext_t;
+
+typedef struct dlmgmt_door_linkprop_init {
+ int ld_cmd;
+ datalink_id_t ld_linkid;
+} dlmgmt_door_linkprop_init_t;
+
+/* upcall return value */
+typedef struct dlmgmt_retval_s {
+ uint_t lr_err; /* return error code */
+} dlmgmt_retval_t;
+
+typedef dlmgmt_retval_t dlmgmt_destroy_retval_t,
+ dlmgmt_linkprop_init_retval_t;
+
+struct dlmgmt_linkid_retval_s {
+ uint_t lr_err;
+ datalink_id_t lr_linkid;
+ uint32_t lr_flags;
+ datalink_class_t lr_class;
+ uint32_t lr_media;
+ uint32_t lr_padding;
+};
+
+typedef struct dlmgmt_linkid_retval_s dlmgmt_create_retval_t,
+ dlmgmt_update_retval_t,
+ dlmgmt_getlinkid_retval_t,
+ dlmgmt_getnext_retval_t;
+
+typedef struct dlmgmt_getname_retval_s {
+ uint_t lr_err;
+ char lr_link[MAXLINKNAMELEN];
+ datalink_class_t lr_class;
+ uint32_t lr_media;
+ uint32_t lr_flags;
+} dlmgmt_getname_retval_t;
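
To make the request/reply pairing concrete, here is a hedged userland sketch of a DLMGMT_CMD_GETNAME upcall through the dlmgmtd door. It assumes the daemon is running, that this header is reachable as <sys/dls_mgmt.h>, and that linkid 1 exists; error handling is elided:

	#include <sys/dls_mgmt.h>
	#include <door.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		dlmgmt_door_getname_t req;
		dlmgmt_getname_retval_t ret;
		door_arg_t darg;
		int fd;

		if ((fd = open(DLMGMT_DOOR, O_RDONLY)) < 0)
			return (1);

		(void) memset(&req, 0, sizeof (req));
		req.ld_cmd = DLMGMT_CMD_GETNAME;
		req.ld_linkid = 1;		/* illustrative linkid */

		(void) memset(&darg, 0, sizeof (darg));
		darg.data_ptr = (char *)&req;	/* request buffer */
		darg.data_size = sizeof (req);
		darg.rbuf = (char *)&ret;	/* reply buffer */
		darg.rsize = sizeof (ret);

		if (door_call(fd, &darg) == 0 && ret.lr_err == 0)
			(void) printf("linkid 1 is %s\n", ret.lr_link);
		return (0);
	}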
+
+typedef struct dlmgmt_getattr_retval_s {
+ uint_t lr_err;
+ uint_t lr_type;
+ uint_t lr_attrsz;
+ uint_t lr_padding;
+ char lr_attrval[MAXLINKATTRVALLEN];
+} dlmgmt_getattr_retval_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DLS_MGMT_H */
diff --git a/usr/src/uts/common/sys/dls_soft_ring.h b/usr/src/uts/common/sys/dls_soft_ring.h
deleted file mode 100644
index 403623853a..0000000000
--- a/usr/src/uts/common/sys/dls_soft_ring.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DLS_SOFT_RING_H
-#define _SYS_DLS_SOFT_RING_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/types.h>
-#include <sys/processor.h>
-#include <sys/stream.h>
-#include <sys/squeue.h>
-#include <sys/mac.h>
-
-#define S_RING_NAMELEN 64
-
-typedef void (*s_ring_proc_t)(void *, void *, mblk_t *, mac_header_info_t *);
-
-typedef struct soft_ring_s {
- /* Keep the most used members 64bytes cache aligned */
- kmutex_t s_ring_lock; /* lock before using any member */
- uint16_t s_ring_type; /* processing model of the sq */
- uint16_t s_ring_state; /* state flags and message count */
- int s_ring_count; /* # of mblocks in soft_ring */
- mblk_t *s_ring_first; /* first mblk chain or NULL */
- mblk_t *s_ring_last; /* last mblk chain or NULL */
- s_ring_proc_t s_ring_upcall; /* Upcall func pointer */
- void *s_ring_upcall_arg1; /* upcall argument 1 */
- void *s_ring_upcall_arg2; /* upcall argument 2 */
- clock_t s_ring_awaken; /* time async thread was awakened */
-
- kthread_t *s_ring_run; /* Current thread processing sq */
- processorid_t s_ring_bind; /* processor to bind to */
- kcondvar_t s_ring_async; /* async thread blocks on */
- clock_t s_ring_wait; /* lbolts to wait after a fill() */
- timeout_id_t s_ring_tid; /* timer id of pending timeout() */
- kthread_t *s_ring_worker; /* kernel thread id */
- char s_ring_name[S_RING_NAMELEN + 1];
- uint32_t s_ring_total_inpkt;
-} soft_ring_t;
-
-
-/*
- * type flags - combination allowed to process and drain the queue
- */
-#define S_RING_WORKER_ONLY 0x0001 /* Worker thread only */
-#define S_RING_ANY 0x0002 /* Any thread can process the queue */
-
-/*
- * State flags.
- */
-#define S_RING_PROC 0x0001 /* being processed */
-#define S_RING_WORKER 0x0002 /* worker thread */
-#define S_RING_BOUND 0x0004 /* Worker thread is bound */
-#define S_RING_DESTROY 0x0008 /* Ring is being destroyed */
-#define S_RING_DEAD 0x0010 /* Worker thread is no more */
-
-/*
- * arguments for processors to bind to
- */
-#define S_RING_BIND_NONE -1
-
-/*
- * Structure for dls statistics
- */
-struct dls_kstats {
- kstat_named_t dlss_soft_ring_pkt_drop;
-};
-
-extern struct dls_kstats dls_kstat;
-
-#define DLS_BUMP_STAT(x, y) (dls_kstat.x.value.ui32 += y)
-
-extern void soft_ring_init(void);
-extern soft_ring_t *soft_ring_create(char *, processorid_t, clock_t,
- uint_t, pri_t);
-extern soft_ring_t **soft_ring_set_create(char *, processorid_t, clock_t,
- uint_t, pri_t, int);
-extern void soft_ring_set_destroy(soft_ring_t **, int);
-extern void soft_ring_bind(void *, processorid_t);
-extern void soft_ring_unbind(void *);
-extern void dls_soft_ring_fanout(void *, void *, mblk_t *, mac_header_info_t *);
-extern boolean_t dls_soft_ring_enable(dls_channel_t, dl_capab_dls_t *);
-extern void dls_soft_ring_disable(dls_channel_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DLS_SOFT_RING_H */
diff --git a/usr/src/uts/common/sys/exacct.h b/usr/src/uts/common/sys/exacct.h
index b30362bb05..a9c394bb4f 100644
--- a/usr/src/uts/common/sys/exacct.h
+++ b/usr/src/uts/common/sys/exacct.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_EXACCT_H
#define _SYS_EXACCT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/task.h>
#include <sys/proc.h>
@@ -175,6 +173,7 @@ extern int exacct_tag_task(ac_info_t *, task_t *, void *, size_t, int);
extern int exacct_tag_proc(ac_info_t *, pid_t, taskid_t, void *, size_t, int,
const char *);
extern void exacct_commit_flow(void *);
+extern int exacct_commit_netinfo(void *, int);
extern void exacct_init(void);
extern void *exacct_create_header(size_t *);
extern int exacct_write_header(ac_info_t *, void *, size_t);
@@ -192,6 +191,9 @@ extern int exacct_assemble_flow_usage(ac_info_t *, flow_usage_t *,
int (*)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
void *, size_t, size_t *);
extern void exacct_move_mstate(proc_t *, task_t *, task_t *);
+extern int exacct_assemble_net_usage(ac_info_t *, void *,
+ int (*)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
+ void *, size_t, size_t *, int);
extern taskq_t *exacct_queue;
extern kmem_cache_t *exacct_object_cache;
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/exacct_catalog.h b/usr/src/uts/common/sys/exacct_catalog.h
index 0911344382..f6d9c09e7a 100644
--- a/usr/src/uts/common/sys/exacct_catalog.h
+++ b/usr/src/uts/common/sys/exacct_catalog.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_EXACCT_CATALOG_H
#define _SYS_EXACCT_CATALOG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -104,6 +101,10 @@ extern "C" {
#define EXD_GROUP_FLOW 0x000109
#define EXD_GROUP_RFMA 0x00010a
#define EXD_GROUP_FMA 0x00010b
+#define EXD_GROUP_NET_LINK_DESC 0x00010c
+#define EXD_GROUP_NET_FLOW_DESC 0x00010d
+#define EXD_GROUP_NET_LINK_STATS 0x00010e
+#define EXD_GROUP_NET_FLOW_STATS 0x00010f
#define EXD_PROC_PID 0x001000
#define EXD_PROC_UID 0x001001
@@ -204,6 +205,36 @@ extern "C" {
#define EXD_FMA_OFFSET 0x00400B
#define EXD_FMA_UUID 0x00400C
+/* For EXD_GROUP_NET_FLOW_DESC and EXD_GROUP_NET_LINK_DESC */
+#define EXD_NET_DESC_NAME 0x005001
+#define EXD_NET_DESC_EHOST 0x005002
+#define EXD_NET_DESC_EDEST 0x005003
+#define EXD_NET_DESC_VLAN_TPID 0x005004
+#define EXD_NET_DESC_VLAN_TCI 0x005005
+#define EXD_NET_DESC_SAP 0x005006
+#define EXD_NET_DESC_PRIORITY 0x005007
+#define EXD_NET_DESC_BWLIMIT 0x005008
+/* For EXD_GROUP_NET_FLOW_DESC only */
+#define EXD_NET_DESC_DEVNAME 0x005009
+#define EXD_NET_DESC_V4SADDR 0x00500a
+#define EXD_NET_DESC_V4DADDR 0x00500b
+#define EXD_NET_DESC_V6SADDR 0x00500c
+#define EXD_NET_DESC_V6DADDR 0x00500d
+#define EXD_NET_DESC_SPORT 0x00500e
+#define EXD_NET_DESC_DPORT 0x00500f
+#define EXD_NET_DESC_PROTOCOL 0x005010
+#define EXD_NET_DESC_DSFIELD 0x005011
+
+/* For EXD_GROUP_NET_LINK_STATS and EXD_GROUP_NET_FLOW_STATS */
+#define EXD_NET_STATS_NAME 0x006000
+#define EXD_NET_STATS_CURTIME 0x006001
+#define EXD_NET_STATS_IBYTES 0x006002
+#define EXD_NET_STATS_OBYTES 0x006003
+#define EXD_NET_STATS_IPKTS 0x006004
+#define EXD_NET_STATS_OPKTS 0x006005
+#define EXD_NET_STATS_IERRPKTS 0x006006
+#define EXD_NET_STATS_OERRPKTS 0x006007
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/exacct_impl.h b/usr/src/uts/common/sys/exacct_impl.h
index 14cee43d5f..6f25f02e7e 100644
--- a/usr/src/uts/common/sys/exacct_impl.h
+++ b/usr/src/uts/common/sys/exacct_impl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_EXACCT_IMPL_H
#define _SYS_EXACCT_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -129,6 +126,42 @@ typedef struct flow_usage {
char *fu_aname; /* action instance name */
} flow_usage_t;
+#define EX_NET_LNDESC_REC 1
+#define EX_NET_FLDESC_REC 2
+#define EX_NET_LNSTAT_REC 3
+#define EX_NET_FLSTAT_REC 4
+
+typedef struct net_stat_s {
+ char *ns_name;
+ uint64_t ns_ibytes;
+ uint64_t ns_obytes;
+ uint64_t ns_ipackets;
+ uint64_t ns_opackets;
+ uint64_t ns_ierrors;
+ uint64_t ns_oerrors;
+ boolean_t ns_isref;
+} net_stat_t;
+
+typedef struct net_desc_s {
+ char *nd_name;
+ char *nd_devname;
+ uchar_t nd_ehost[6];
+ uchar_t nd_edest[6];
+ ushort_t nd_vlan_tpid;
+ ushort_t nd_vlan_tci;
+ ushort_t nd_sap;
+ ushort_t nd_priority;
+ uint64_t nd_bw_limit;
+ uint32_t nd_saddr[4];
+ uint32_t nd_daddr[4];
+ boolean_t nd_isv4;
+ uint16_t nd_sport;
+ uint16_t nd_dport;
+ uint8_t nd_protocol;
+ uint8_t nd_dsfield;
+ int nd_type;
+} net_desc_t;
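
These records pair with the new exacct_commit_netinfo() entry point declared in <sys/exacct.h> above; the record-type constant selects which exacct group gets written. A hedged kernel-side fragment (field values are illustrative):

	net_stat_t ns;

	bzero(&ns, sizeof (ns));
	ns.ns_name = "vnic1";		/* link being accounted */
	ns.ns_ibytes = 1024;
	ns.ns_obytes = 2048;
	/* commit as a link-stats record (EX_NET_LNSTAT_REC above) */
	(void) exacct_commit_netinfo((void *)&ns, EX_NET_LNSTAT_REC);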
+
extern void exacct_order16(uint16_t *);
extern void exacct_order32(uint32_t *);
extern void exacct_order64(uint64_t *);
diff --git a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h
index 8cdf2cf96a..73419866a9 100644
--- a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h
+++ b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h
@@ -26,8 +26,6 @@
#ifndef _SYS_IB_CLIENTS_IBD_H
#define _SYS_IB_CLIENTS_IBD_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -96,7 +94,7 @@ typedef struct ipoib_pgrh {
#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/ib_pkt_hdrs.h>
#include <sys/list.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ib.h>
#include <sys/modhash.h>
diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h
index 9011423727..d4608f3729 100644
--- a/usr/src/uts/common/sys/mac.h
+++ b/usr/src/uts/common/sys/mac.h
@@ -18,6 +18,7 @@
*
* CDDL HEADER END
*/
+
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
@@ -30,6 +31,7 @@
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/stream.h>
+#include <sys/mac_flow.h>
/*
* MAC Services Module
@@ -42,13 +44,7 @@ extern "C" {
/*
* MAC Information (text emitted by modinfo(1m))
*/
-#define MAC_INFO "MAC Services"
-
-/*
- * MAC version identifier. This is used by mac_alloc() mac_register() to
- * verify that incompatible drivers don't register.
- */
-#define MAC_VERSION 0x1
+#define MAC_INFO "MAC Services v1.20"
/*
* MAC-Type version identifier. This is used by mactype_alloc() and
@@ -58,17 +54,23 @@ extern "C" {
#define MACTYPE_VERSION 0x1
/*
- * Statistics
+ * Opaque handle types
*/
+typedef struct __mac_handle *mac_handle_t;
+typedef struct __mac_resource_handle *mac_resource_handle_t;
+typedef struct __mac_notify_handle *mac_notify_handle_t;
+typedef struct __mac_tx_notify_handle *mac_tx_notify_handle_t;
+typedef struct __mac_intr_handle *mac_intr_handle_t;
+typedef struct __mac_ring_handle *mac_ring_handle_t;
+typedef struct __mac_group_handle *mac_group_handle_t;
-#define XCVR_UNDEFINED 0
-#define XCVR_NONE 1
-#define XCVR_10 2
-#define XCVR_100T4 3
-#define XCVR_100X 4
-#define XCVR_100T2 5
-#define XCVR_1000X 6
-#define XCVR_1000T 7
+#define DATALINK_INVALID_LINKID 0
+#define DATALINK_ALL_LINKID 0
+#define DATALINK_MAX_LINKID 0xffffffff
+
+#define MAC_MAX_MINOR 1000
+
+typedef uint32_t datalink_id_t;
typedef enum {
LINK_STATE_UNKNOWN = -1,
@@ -82,10 +84,6 @@ typedef enum {
LINK_DUPLEX_FULL
} link_duplex_t;
-#define DATALINK_INVALID_LINKID 0
-#define DATALINK_ALL_LINKID 0
-#define DATALINK_MAX_LINKID 0xffffffff
-
typedef enum {
LINK_FLOWCTRL_NONE = 0,
LINK_FLOWCTRL_RX,
@@ -93,7 +91,15 @@ typedef enum {
LINK_FLOWCTRL_BI
} link_flowctrl_t;
-typedef uint32_t datalink_id_t;
+/*
+ * Maximum MAC address length
+ */
+#define MAXMACADDRLEN 20
+
+typedef enum {
+ MAC_LOGTYPE_LINK = 1,
+ MAC_LOGTYPE_FLOW
+} mac_logtype_t;
/*
* Encodings for public properties.
@@ -153,15 +159,13 @@ typedef enum {
MAC_PROP_WL_DELKEY,
MAC_PROP_WL_KEY,
MAC_PROP_WL_MLME,
+ MAC_PROP_MAXBW,
+ MAC_PROP_PRIO,
+ MAC_PROP_BIND_CPU,
MAC_PROP_PRIVATE = -1
} mac_prop_id_t;
/*
- * Maximum MAC address length
- */
-#define MAXMACADDRLEN 20
-
-/*
* Flags to figure out r/w status of legacy ndd props.
*/
#define MAC_PROP_PERM_READ 0x0001
@@ -172,13 +176,6 @@ typedef enum {
#ifdef _KERNEL
-typedef struct mac_stat_info_s {
- uint_t msi_stat;
- char *msi_name;
- uint_t msi_type; /* as defined in kstat_named_init(9F) */
- uint64_t msi_default;
-} mac_stat_info_t;
-
/*
* There are three ranges of statistics values. 0 to 1 - MAC_STAT_MIN are
* interface statistics maintained by the mac module. MAC_STAT_MIN to 1 -
@@ -259,27 +256,6 @@ typedef struct mac_info_s {
} mac_info_t;
/*
- * LSO capability
- */
-typedef struct lso_basic_tcp_ipv4_s {
- t_uscalar_t lso_max; /* maximum payload */
-} lso_basic_tcp_ipv4_t;
-
-/*
- * Future LSO capabilities can be added at the end of the mac_capab_lso_t.
- * When such capability is added to the GLDv3 framework, the size of the
- * mac_capab_lso_t it allocates and passes to the drivers increases. Older
- * drivers wil access only the (upper) sections of that structure, that is the
- * sections carrying the capabilities they understand. This ensures the
- * interface can be safely extended in a binary compatible way.
- */
-typedef struct mac_capab_lso_s {
- t_uscalar_t lso_flags;
- lso_basic_tcp_ipv4_t lso_basic_tcp_ipv4;
- /* Add future lso capabilities here */
-} mac_capab_lso_t;
-
-/*
* Information for legacy devices.
*/
typedef struct mac_capab_legacy_s {
@@ -294,307 +270,32 @@ typedef struct mac_capab_legacy_s {
} mac_capab_legacy_t;
/*
- * MAC layer capabilities. These capabilities are handled by the drivers'
- * mc_capab_get() callbacks. Some capabilities require the driver to fill
- * in a given data structure, and others are simply boolean capabilities.
- * Note that capability values must be powers of 2 so that consumers and
- * providers of this interface can keep track of which capabilities they
- * care about by keeping a bitfield of these things around somewhere.
- */
-typedef enum {
- MAC_CAPAB_HCKSUM = 0x01, /* data is a uint32_t for the txflags */
- MAC_CAPAB_POLL = 0x02, /* boolean only, no data */
- MAC_CAPAB_MULTIADDRESS = 0x04, /* data is multiaddress_capab_t */
- MAC_CAPAB_LSO = 0x08, /* data is mac_capab_lso_t */
- MAC_CAPAB_NO_NATIVEVLAN = 0x10, /* boolean only, no data */
- MAC_CAPAB_NO_ZCOPY = 0x20, /* boolean only, no data */
- /* add new capabilities here */
- MAC_CAPAB_RINGS = 0x100, /* data is mac_capab_rings_t */
- MAC_CAPAB_SHARES = 0x200, /* data is mac_capab_share_t */
-
- /* The following capabilities are specific to softmac. */
- MAC_CAPAB_LEGACY = 0x8000 /* data is mac_capab_legacy_t */
-} mac_capab_t;
-
-typedef int mac_addr_slot_t;
-
-/* mma_flags values */
-#define MMAC_SLOT_USED 0x1 /* address slot used */
-#define MMAC_SLOT_UNUSED 0x2 /* free address slot */
-#define MMAC_VENDOR_ADDR 0x4 /* address returned is vendor supplied */
-
-typedef struct mac_multi_address_s {
- mac_addr_slot_t mma_slot; /* slot for add/remove/get/set */
- uint_t mma_addrlen;
- uint8_t mma_addr[MAXMACADDRLEN];
- uint_t mma_flags;
-} mac_multi_addr_t;
-
-typedef int (*maddr_reserve_t)(void *, mac_multi_addr_t *);
-typedef int (*maddr_add_t)(void *, mac_multi_addr_t *);
-typedef int (*maddr_remove_t)(void *, mac_addr_slot_t);
-typedef int (*maddr_modify_t)(void *, mac_multi_addr_t *);
-typedef int (*maddr_get_t)(void *, mac_multi_addr_t *);
-
-/* maddr_flag values */
-#define MADDR_VENDOR_ADDR 0x01 /* addr returned is vendor supplied */
-
-/* multiple mac address: add/remove/set/get mac address */
-typedef struct multiaddress_capab_s {
- int maddr_naddr; /* total addresses */
- int maddr_naddrfree; /* free address slots */
- uint_t maddr_flag; /* MADDR_VENDOR_ADDR bit can be set */
- /* driver entry points */
- void *maddr_handle; /* cookie to be used for the calls */
- maddr_reserve_t maddr_reserve; /* reserve a factory address */
- maddr_add_t maddr_add; /* add a new unicst address */
- maddr_remove_t maddr_remove; /* remove an added address */
- maddr_modify_t maddr_modify; /* modify an added address */
- maddr_get_t maddr_get; /* get address from specified slot */
-} multiaddress_capab_t;
-
-/*
- * MAC driver entry point types.
- */
-typedef int (*mac_getstat_t)(void *, uint_t, uint64_t *);
-typedef int (*mac_start_t)(void *);
-typedef void (*mac_stop_t)(void *);
-typedef int (*mac_setpromisc_t)(void *, boolean_t);
-typedef int (*mac_multicst_t)(void *, boolean_t, const uint8_t *);
-typedef int (*mac_unicst_t)(void *, const uint8_t *);
-typedef void (*mac_ioctl_t)(void *, queue_t *, mblk_t *);
-typedef void (*mac_resources_t)(void *);
-typedef mblk_t *(*mac_tx_t)(void *, mblk_t *);
-typedef boolean_t (*mac_getcapab_t)(void *, mac_capab_t, void *);
-typedef int (*mac_open_t)(void *);
-typedef void (*mac_close_t)(void *);
-typedef int (*mac_set_prop_t)(void *, const char *, mac_prop_id_t,
- uint_t, const void *);
-typedef int (*mac_get_prop_t)(void *, const char *, mac_prop_id_t,
- uint_t, uint_t, void *, uint_t *);
-
-/*
- * Drivers must set all of these callbacks except for mc_resources,
- * mc_ioctl, and mc_getcapab, which are optional. If any of these optional
- * callbacks are set, their appropriate flags must be set in mc_callbacks.
- * Any future additions to this list must also be accompanied by an
- * associated mc_callbacks flag so that the framework can grow without
- * affecting the binary compatibility of the interface.
- */
-typedef struct mac_callbacks_s {
- uint_t mc_callbacks; /* Denotes which callbacks are set */
- mac_getstat_t mc_getstat; /* Get the value of a statistic */
- mac_start_t mc_start; /* Start the device */
- mac_stop_t mc_stop; /* Stop the device */
- mac_setpromisc_t mc_setpromisc; /* Enable or disable promiscuous mode */
- mac_multicst_t mc_multicst; /* Enable or disable a multicast addr */
- mac_unicst_t mc_unicst; /* Set the unicast MAC address */
- mac_tx_t mc_tx; /* Transmit a packet */
- mac_resources_t mc_resources; /* Get the device resources */
- mac_ioctl_t mc_ioctl; /* Process an unknown ioctl */
- mac_getcapab_t mc_getcapab; /* Get capability information */
- mac_open_t mc_open; /* Open the device */
- mac_close_t mc_close; /* Close the device */
- mac_set_prop_t mc_setprop;
- mac_get_prop_t mc_getprop;
-} mac_callbacks_t;
-
-typedef struct mac_priv_prop_s {
- char mpp_name[MAXLINKPROPNAME];
- uint_t mpp_flags;
-} mac_priv_prop_t;
-
-/*
- * Multiple Rings capability
- */
-typedef enum {
- MAC_RING_TYPE_RX = 1, /* Receive ring */
- MAC_RING_TYPE_TX = 2 /* Transmit ring */
-} mac_ring_type_t;
-
-/*
- * Grouping type of a ring group
+ * When VNICs are created on top of a NIC, there are two levels of
+ * MAC layer: a lower MAC, which is the MAC layer at the level of the
+ * physical NIC, and an upper MAC, which is the MAC layer at the level
+ * of the VNIC. Each VNIC maps to a MAC client at the lower MAC, and
+ * SRS and classification are done at the lower MAC level. The upper
+ * MAC is for the most part pass-through, so special processing needs
+ * to be done at the upper MAC layer when dealing with a VNIC.
*
- * MAC_GROUP_TYPE_STATIC: The ring group can not be re-grouped.
- * MAC_GROUP_TYPE_DYNAMIC: The ring group support dynamic re-grouping
- */
-typedef enum {
- MAC_GROUP_TYPE_STATIC = 1, /* Static ring group */
- MAC_GROUP_TYPE_DYNAMIC = 2 /* Dynamic ring group */
-} mac_group_type_t;
-
-typedef struct __mac_ring_driver *mac_ring_driver_t;
-typedef struct __mac_ring_handle *mac_ring_handle_t;
-typedef struct __mac_group_driver *mac_group_driver_t;
-typedef struct __mac_group_handle *mac_group_handle_t;
-typedef struct __mac_intr_handle *mac_intr_handle_t;
-
-typedef struct mac_ring_info_s mac_ring_info_t;
-typedef struct mac_group_info_s mac_group_info_t;
-
-typedef int (*mac_intr_enable_t)(mac_intr_handle_t);
-typedef int (*mac_intr_disable_t)(mac_intr_handle_t);
-
-typedef struct mac_intr_s {
- mac_intr_handle_t mi_handle;
- mac_intr_enable_t mi_enable;
- mac_intr_disable_t mi_disable;
-} mac_intr_t;
-
-typedef void (*mac_get_ring_t)(void *, mac_ring_type_t, const int, const int,
- mac_ring_info_t *, mac_ring_handle_t);
-typedef void (*mac_get_group_t)(void *, mac_ring_type_t, const int,
- mac_group_info_t *, mac_group_handle_t);
-
-typedef void (*mac_group_add_ring_t)(mac_group_driver_t,
- mac_ring_driver_t, mac_ring_type_t);
-typedef void (*mac_group_rem_ring_t)(mac_group_driver_t,
- mac_ring_driver_t, mac_ring_type_t);
-
-/*
- * Multiple Rings Capability
- */
-typedef struct mac_capab_rings_s {
- mac_ring_type_t mr_type; /* Ring type */
- mac_group_type_t mr_group_type; /* Grouping type */
- void *mr_handle; /* Group Driver Handle. */
- uint_t mr_rnum; /* Number of rings */
- uint_t mr_gnum; /* Number of ring groups */
- mac_get_ring_t mr_rget; /* Get ring from driver */
- mac_get_group_t mr_gget; /* Get ring group from driver */
- mac_group_add_ring_t mr_gadd_ring; /* Add ring into a group */
- mac_group_rem_ring_t mr_grem_ring; /* Remove ring from a group */
-} mac_capab_rings_t;
-
-/*
- * Common ring functions and driver interfaces
+ * This capability allows the MAC layer to detect when a VNIC is being
+ * accessed, and to implement the required shortcuts.
*/
-typedef int (*mac_ring_start_t)(mac_ring_driver_t);
-typedef void (*mac_ring_stop_t)(mac_ring_driver_t);
-typedef mblk_t *(*mac_ring_send_t)(void *, mblk_t *);
-typedef mblk_t *(*mac_ring_poll_t)(void *, int);
+typedef void *(*mac_client_handle_fn_t)(void *);
-typedef struct mac_ring_info_s {
- mac_ring_driver_t mr_driver;
- mac_ring_start_t mr_start;
- mac_ring_stop_t mr_stop;
- mac_intr_t mr_intr;
- union {
- mac_ring_send_t send;
- mac_ring_poll_t poll;
- } mrfunion;
-} mac_ring_info_s;
-
-#define mr_send mrfunion.send
-#define mr_poll mrfunion.poll
-
-typedef int (*mac_group_start_t)(mac_group_driver_t);
-typedef void (*mac_group_stop_t)(mac_group_driver_t);
-typedef int (*mac_add_mac_addr_t)(void *, const uint8_t *);
-typedef int (*mac_rem_mac_addr_t)(void *, const uint8_t *);
-
-struct mac_group_info_s {
- mac_group_driver_t mrg_driver; /* Driver reference */
- mac_group_start_t mrg_start; /* Start the group */
- mac_group_stop_t mrg_stop; /* Stop the group */
- uint_t mrg_count; /* Count of rings */
- mac_intr_t mrg_intr; /* Optional per-group intr */
-
- /* Only used for rx groups */
- mac_add_mac_addr_t mrg_addmac; /* Add a MAC address */
- mac_rem_mac_addr_t mrg_remmac; /* Remove a MAC address */
-};
-
-/*
- * Share management functions.
- */
-typedef uint64_t mac_share_handle_t;
+typedef struct mac_capab_vnic_s {
+ void *mcv_arg;
+ mac_client_handle_fn_t mcv_mac_client_handle;
+} mac_capab_vnic_t;
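
A hedged sketch of how the framework side might use this capability to reach a VNIC's lower MAC client handle. The MAC_CAPAB_VNIC id lives elsewhere in this patch, and i_mac_capab_get() is the query helper declared in mac_client_impl.h below; the function name here is illustrative:

	static void *
	mac_vnic_lower_client(mac_handle_t mh)
	{
		mac_capab_vnic_t capab;

		/* ask the (upper) MAC whether it is a VNIC */
		if (!i_mac_capab_get(mh, MAC_CAPAB_VNIC, &capab))
			return (NULL);
		/* call back into the VNIC for its lower MAC client handle */
		return (capab.mcv_mac_client_handle(capab.mcv_arg));
	}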
-/*
- * Returns a Share handle to the client calling from above.
- */
-typedef int (*mac_alloc_share_t)(void *, uint64_t cookie,
- uint64_t *rcookie, mac_share_handle_t *);
-
-/*
- * Destroys the share previously allocated and unallocates
- * all share resources (e.g. DMA's assigned to the share).
- */
-typedef void (*mac_free_share_t)(mac_share_handle_t);
-
-typedef void (*mac_share_query_t)(mac_share_handle_t shdl,
- mac_ring_type_t type, uint32_t *rmin, uint32_t *rmax,
- uint64_t *rmap, uint64_t *gnum);
-
-/*
- * Basic idea, bind previously created ring groups to shares
- * for them to be exported (or shared) by another domain.
- * These interfaces bind/unbind the ring group to a share. The
- * of doing such causes the resources to be shared with the guest.
- */
-typedef int (*mac_share_add_group_t)(mac_share_handle_t,
- mac_group_handle_t);
-typedef int (*mac_share_rem_group_t)(mac_share_handle_t,
- mac_group_handle_t);
-
-typedef struct mac_capab_share_s {
- uint_t ms_snum; /* Number of shares (vr's) */
- void *ms_handle; /* Handle to driver. */
- mac_alloc_share_t ms_salloc; /* Get a share from driver. */
- mac_free_share_t ms_sfree; /* Return a share to driver. */
- mac_share_add_group_t ms_sadd; /* Add a group to the share. */
- mac_share_rem_group_t ms_sremove; /* Remove group from share. */
- mac_share_query_t ms_squery; /* Query share constraints */
-} mac_capab_share_t;
+typedef void (*mac_rename_fn_t)(const char *, void *);
+typedef struct mac_capab_aggr_s {
+ mac_rename_fn_t mca_rename_fn;
+ int (*mca_unicst)(void *, const uint8_t *);
+} mac_capab_aggr_t;
-/*
- * Flags for mc_callbacks. Requiring drivers to set the flags associated
- * with optional callbacks initialized in the structure allows the mac
- * module to add optional callbacks in the future without requiring drivers
- * to recompile.
- */
-#define MC_RESOURCES 0x001
-#define MC_IOCTL 0x002
-#define MC_GETCAPAB 0x004
-#define MC_OPEN 0x008
-#define MC_CLOSE 0x010
-#define MC_SETPROP 0x020
-#define MC_GETPROP 0x040
-
-#define MAC_MAX_MINOR 1000
-
-typedef struct mac_register_s {
- uint_t m_version; /* set by mac_alloc() */
- const char *m_type_ident;
- void *m_driver; /* Driver private data */
- dev_info_t *m_dip;
- uint_t m_instance;
- uint8_t *m_src_addr;
- uint8_t *m_dst_addr;
- mac_callbacks_t *m_callbacks;
- uint_t m_min_sdu;
- uint_t m_max_sdu;
- void *m_pdata;
- size_t m_pdata_size;
- uint32_t m_margin;
- mac_priv_prop_t *m_priv_props;
- size_t m_priv_prop_count;
-} mac_register_t;
-
-
-/*
- * Opaque handle types.
- */
-typedef struct mac_t *mac_handle_t;
-typedef struct __mac_notify_handle *mac_notify_handle_t;
-typedef struct __mac_rx_handle *mac_rx_handle_t;
-typedef struct __mac_txloop_handle *mac_txloop_handle_t;
-typedef struct __mac_resource_handle *mac_resource_handle_t;
-
-/*
- * MAC interface callback types.
- */
typedef enum {
MAC_NOTE_LINK,
MAC_NOTE_PROMISC,
@@ -604,15 +305,15 @@ typedef enum {
MAC_NOTE_DEVPROMISC,
MAC_NOTE_FASTPATH_FLUSH,
MAC_NOTE_SDU_SIZE,
- MAC_NOTE_VNIC,
MAC_NOTE_MARGIN,
+ MAC_NOTE_CAPAB_CHG,
MAC_NNOTE /* must be the last entry */
} mac_notify_type_t;
typedef void (*mac_notify_t)(void *, mac_notify_type_t);
-typedef void (*mac_rx_t)(void *, mac_resource_handle_t, mblk_t *);
-typedef void (*mac_txloop_t)(void *, mblk_t *);
-typedef void (*mac_blank_t)(void *, time_t, uint_t);
+typedef void (*mac_rx_t)(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+typedef mblk_t *(*mac_receive_t)(void *, int);
/*
* MAC promiscuous types
@@ -629,26 +330,38 @@ typedef enum {
MAC_RX_FIFO = 1
} mac_resource_type_t;
+typedef int (*mac_intr_enable_t)(mac_intr_handle_t);
+typedef int (*mac_intr_disable_t)(mac_intr_handle_t);
+
+typedef struct mac_intr_s {
+ mac_intr_handle_t mi_handle;
+ mac_intr_enable_t mi_enable;
+ mac_intr_disable_t mi_disable;
+} mac_intr_t;
+
typedef struct mac_rx_fifo_s {
mac_resource_type_t mrf_type; /* MAC_RX_FIFO */
- mac_blank_t mrf_blank;
- void *mrf_arg;
- time_t mrf_normal_blank_time;
- uint_t mrf_normal_pkt_count;
+ mac_intr_t mrf_intr;
+ mac_receive_t mrf_receive;
+ void *mrf_rx_arg;
+ uint32_t mrf_flow_priority;
+ /*
+	 * The CPU this flow is to be processed on. With intrd and future
+	 * work, we should know which CPU the flow needs to be processed
+	 * on and get a squeue assigned on that CPU.
+ */
+ uint_t mrf_cpu_id;
} mac_rx_fifo_t;
-typedef struct mac_txinfo_s {
- mac_tx_t mt_fn;
- void *mt_arg;
-} mac_txinfo_t;
+#define mrf_intr_handle mrf_intr.mi_handle
+#define mrf_intr_enable mrf_intr.mi_enable
+#define mrf_intr_disable mrf_intr.mi_disable
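
Compared with the old mrf_blank interface, a driver now hands the framework an interrupt enable/disable pair plus a poll entry point, which is what lets the framework switch a ring between interrupt and poll mode. A hedged sketch of a driver filling in the descriptor; the mydrv_* hooks are hypothetical functions with the mac_intr_enable_t, mac_intr_disable_t and mac_receive_t signatures:

	static void
	mydrv_fill_rx_fifo(void *rx_ring, mac_rx_fifo_t *mrf)
	{
		mrf->mrf_type = MAC_RX_FIFO;
		mrf->mrf_intr.mi_handle = (mac_intr_handle_t)rx_ring;
		mrf->mrf_intr.mi_enable = mydrv_rx_intr_enable;	  /* leave poll mode */
		mrf->mrf_intr.mi_disable = mydrv_rx_intr_disable; /* enter poll mode */
		mrf->mrf_receive = mydrv_rx_poll;   /* mblk_t *(*)(void *, int) */
		mrf->mrf_rx_arg = rx_ring;
	}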
typedef union mac_resource_u {
mac_resource_type_t mr_type;
mac_rx_fifo_t mr_fifo;
} mac_resource_t;
-typedef mac_resource_handle_t (*mac_resource_add_t)(void *, mac_resource_t *);
-
typedef enum {
MAC_ADDRTYPE_UNICAST,
MAC_ADDRTYPE_MULTICAST,
@@ -664,11 +377,29 @@ typedef struct mac_header_info_s {
uint32_t mhi_bindsap;
mac_addrtype_t mhi_dsttype;
uint16_t mhi_tci;
- uint_t mhi_istagged:1,
- mhi_prom_looped:1;
+ boolean_t mhi_istagged;
} mac_header_info_t;
/*
+ * Function pointer matching the dls client signature. It should be the
+ * same as dls_rx_t, to allow a soft ring to bypass the DLS layer and
+ * call a DLS client directly.
+ */
+typedef void (*mac_direct_rx_t)(void *, mac_resource_handle_t,
+ mblk_t *, mac_header_info_t *);
+
+typedef mac_resource_handle_t (*mac_resource_add_t)(void *, mac_resource_t *);
+typedef int (*mac_resource_bind_t)(void *,
+ mac_resource_handle_t, processorid_t);
+typedef void (*mac_resource_remove_t)(void *, void *);
+typedef void (*mac_resource_quiesce_t)(void *, void *);
+typedef void (*mac_resource_restart_t)(void *, void *);
+typedef int (*mac_resource_modify_t)(void *, void *,
+ mac_resource_t *);
+typedef void (*mac_change_upcall_t)(void *, mac_direct_rx_t,
+ void *);
+
+/*
* MAC-Type plugin interfaces
*/
@@ -782,6 +513,13 @@ typedef struct mac_ndd_mapping_s {
#define mp_prop_id u_mp_id.u_id
#define mp_kstat u_mp_id.u_kstat
+typedef struct mac_stat_info_s {
+ uint_t msi_stat;
+ char *msi_name;
+ uint_t msi_type; /* as defined in kstat_named_init(9F) */
+ uint64_t msi_default;
+} mac_stat_info_t;
+
typedef struct mactype_register_s {
uint_t mtr_version; /* set by mactype_alloc() */
const char *mtr_ident;
@@ -803,107 +541,25 @@ typedef struct mac_prop_s {
} mac_prop_t;
/*
- * Client interface functions.
+ * Driver interface functions.
*/
-extern int mac_open(const char *, mac_handle_t *);
extern int mac_open_by_linkid(datalink_id_t,
mac_handle_t *);
extern int mac_open_by_linkname(const char *,
mac_handle_t *);
-extern void mac_close(mac_handle_t);
-extern const mac_info_t *mac_info(mac_handle_t);
-extern boolean_t mac_info_get(const char *, mac_info_t *);
-extern uint64_t mac_stat_get(mac_handle_t, uint_t);
-extern int mac_start(mac_handle_t);
-extern void mac_stop(mac_handle_t);
-extern int mac_promisc_set(mac_handle_t, boolean_t,
- mac_promisc_type_t);
-extern boolean_t mac_promisc_get(mac_handle_t,
- mac_promisc_type_t);
-extern int mac_multicst_add(mac_handle_t, const uint8_t *);
-extern int mac_multicst_remove(mac_handle_t,
- const uint8_t *);
-extern boolean_t mac_unicst_verify(mac_handle_t,
- const uint8_t *, uint_t);
-extern int mac_unicst_set(mac_handle_t, const uint8_t *);
-extern void mac_unicst_get(mac_handle_t, uint8_t *);
-extern void mac_dest_get(mac_handle_t, uint8_t *);
-extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *);
-extern void mac_resources(mac_handle_t);
-extern void mac_ioctl(mac_handle_t, queue_t *, mblk_t *);
-extern const mac_txinfo_t *mac_tx_get(mac_handle_t);
-extern const mac_txinfo_t *mac_vnic_tx_get(mac_handle_t);
-extern link_state_t mac_link_get(mac_handle_t);
-extern mac_notify_handle_t mac_notify_add(mac_handle_t, mac_notify_t,
- void *);
-extern void mac_notify_remove(mac_handle_t,
- mac_notify_handle_t);
-extern void mac_notify(mac_handle_t);
-extern mac_rx_handle_t mac_rx_add(mac_handle_t, mac_rx_t, void *);
-extern mac_rx_handle_t mac_active_rx_add(mac_handle_t, mac_rx_t,
- void *);
-extern void mac_rx_remove(mac_handle_t, mac_rx_handle_t,
- boolean_t);
-extern void mac_rx_remove_wait(mac_handle_t);
-extern mblk_t *mac_txloop(void *, mblk_t *);
-extern mac_txloop_handle_t mac_txloop_add(mac_handle_t, mac_txloop_t,
- void *);
-extern void mac_txloop_remove(mac_handle_t,
- mac_txloop_handle_t);
-extern boolean_t mac_active_set(mac_handle_t);
-extern boolean_t mac_active_shareable_set(mac_handle_t);
-extern void mac_active_clear(mac_handle_t);
-extern void mac_active_rx(void *, mac_resource_handle_t,
- mblk_t *);
-extern boolean_t mac_vnic_set(mac_handle_t, mac_txinfo_t *,
- mac_getcapab_t, void *);
-extern void mac_vnic_clear(mac_handle_t);
-extern void mac_resource_set(mac_handle_t,
- mac_resource_add_t, void *);
-extern dev_info_t *mac_devinfo_get(mac_handle_t);
extern const char *mac_name(mac_handle_t);
extern minor_t mac_minor(mac_handle_t);
-extern boolean_t mac_capab_get(mac_handle_t, mac_capab_t,
- void *);
-extern boolean_t mac_vnic_capab_get(mac_handle_t, mac_capab_t,
- void *);
-extern boolean_t mac_sap_verify(mac_handle_t, uint32_t,
- uint32_t *);
-extern mblk_t *mac_header(mac_handle_t, const uint8_t *,
- uint32_t, mblk_t *, size_t);
-extern int mac_header_info(mac_handle_t, mblk_t *,
- mac_header_info_t *);
-extern mblk_t *mac_header_cook(mac_handle_t, mblk_t *);
-extern mblk_t *mac_header_uncook(mac_handle_t, mblk_t *);
extern minor_t mac_minor_hold(boolean_t);
extern void mac_minor_rele(minor_t);
+extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *);
+extern int mac_maxsdu_update(mac_handle_t, uint_t);
-/*
- * Driver interface functions.
- */
-extern mac_register_t *mac_alloc(uint_t);
-extern void mac_free(mac_register_t *);
-extern int mac_register(mac_register_t *, mac_handle_t *);
-extern int mac_disable(mac_handle_t);
-extern int mac_unregister(mac_handle_t);
-extern void mac_rx(mac_handle_t, mac_resource_handle_t,
- mblk_t *);
-extern void mac_link_update(mac_handle_t, link_state_t);
extern void mac_unicst_update(mac_handle_t,
const uint8_t *);
-extern void mac_tx_update(mac_handle_t);
extern void mac_resource_update(mac_handle_t);
-extern mac_resource_handle_t mac_resource_add(mac_handle_t,
- mac_resource_t *);
-extern int mac_maxsdu_update(mac_handle_t, uint_t);
+extern void mac_capab_update(mac_handle_t);
extern int mac_pdata_update(mac_handle_t, void *,
size_t);
-extern void mac_multicst_refresh(mac_handle_t,
- mac_multicst_t, void *, boolean_t);
-extern void mac_unicst_refresh(mac_handle_t, mac_unicst_t,
- void *);
-extern void mac_promisc_refresh(mac_handle_t,
- mac_setpromisc_t, void *);
extern boolean_t mac_margin_update(mac_handle_t, uint32_t);
extern void mac_margin_get(mac_handle_t, uint32_t *);
extern int mac_margin_remove(mac_handle_t, uint32_t);
@@ -912,18 +568,17 @@ extern int mac_margin_add(mac_handle_t, uint32_t *,
extern void mac_init_ops(struct dev_ops *, const char *);
extern void mac_fini_ops(struct dev_ops *);
extern uint32_t mac_no_notification(mac_handle_t);
-extern boolean_t mac_is_legacy(mac_handle_t);
-extern int mac_hold_exclusive(mac_handle_t);
-extern void mac_rele_exclusive(mac_handle_t);
extern mactype_register_t *mactype_alloc(uint_t);
extern void mactype_free(mactype_register_t *);
extern int mactype_register(mactype_register_t *);
extern int mactype_unregister(const char *);
-extern int mac_set_prop(mac_handle_t, mac_prop_t *,
- void *, uint_t);
-extern int mac_get_prop(mac_handle_t, mac_prop_t *,
- void *, uint_t, uint_t *);
+extern void mac_set_ring(void *, void *);
+
+extern void mac_start_logusage(mac_logtype_t, uint_t);
+extern void mac_stop_logusage(mac_logtype_t);
+
+extern mac_handle_t mac_get_lower_mac_handle(mac_handle_t);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h
new file mode 100644
index 0000000000..f1743577ef
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_client.h
@@ -0,0 +1,184 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file captures the MAC client API definitions. It can be
+ * included from any MAC clients.
+ */
+
+#ifndef _SYS_MAC_CLIENT_H
+#define _SYS_MAC_CLIENT_H
+
+#include <sys/mac.h>
+#include <sys/mac_flow.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+/*
+ * MAC client interface.
+ */
+
+typedef struct __mac_client_handle *mac_client_handle_t;
+typedef struct __mac_unicast_handle *mac_unicast_handle_t;
+typedef struct __mac_promisc_handle *mac_promisc_handle_t;
+typedef struct __mac_perim_handle *mac_perim_handle_t;
+typedef uintptr_t mac_tx_cookie_t;
+
+typedef void (*mac_tx_notify_t)(void *, mac_tx_cookie_t);
+
+typedef enum {
+ MAC_DIAG_NONE,
+ MAC_DIAG_MACADDR_NIC,
+ MAC_DIAG_MACADDR_INUSE,
+ MAC_DIAG_MACADDR_INVALID,
+ MAC_DIAG_MACADDRLEN_INVALID,
+ MAC_DIAG_MACFACTORYSLOTINVALID,
+ MAC_DIAG_MACFACTORYSLOTUSED,
+ MAC_DIAG_MACFACTORYSLOTALLUSED,
+ MAC_DIAG_MACFACTORYNOTSUP,
+ MAC_DIAG_MACPREFIX_INVALID,
+ MAC_DIAG_MACPREFIXLEN_INVALID,
+ MAC_DIAG_MACNO_HWRINGS
+} mac_diag_t;
+
+typedef enum {
+ MAC_CLIENT_PROMISC_ALL,
+ MAC_CLIENT_PROMISC_FILTERED,
+ MAC_CLIENT_PROMISC_MULTI
+} mac_client_promisc_type_t;
+
+/* flags passed to mac_unicast_add() */
+#define MAC_UNICAST_NODUPCHECK 0x0001
+#define MAC_UNICAST_PRIMARY 0x0002
+#define MAC_UNICAST_HW 0x0004
+#define MAC_UNICAST_VNIC_PRIMARY 0x0008
+
+/* flags passed to mac_client_open */
+#define MAC_OPEN_FLAGS_IS_VNIC 0x0001
+#define MAC_OPEN_FLAGS_EXCLUSIVE 0x0002
+#define MAC_OPEN_FLAGS_TAG_DISABLE 0x0004
+#define MAC_OPEN_FLAGS_IS_AGGR_PORT 0x0008
+#define MAC_OPEN_FLAGS_STRIP_DISABLE 0x0010
+#define MAC_OPEN_FLAGS_NO_HWRINGS 0x0020
+#define MAC_OPEN_FLAGS_SHARES_DESIRED 0x0040
+#define MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK 0x0080
+#define MAC_OPEN_FLAGS_USE_DATALINK_NAME 0x0100
+#define MAC_OPEN_FLAGS_REQ_HWRINGS 0x0200
+
+/* flags passed to mac_client_close */
+#define MAC_CLOSE_FLAGS_IS_VNIC 0x0001
+#define MAC_CLOSE_FLAGS_EXCLUSIVE 0x0002
+#define MAC_CLOSE_FLAGS_IS_AGGR_PORT 0x0004
+
+/* flags passed to mac_promisc_add() */
+#define MAC_PROMISC_FLAGS_NO_TX_LOOP 0x0001
+#define MAC_PROMISC_FLAGS_NO_PHYS 0x0002
+
+/* flags passed to mac_tx() */
+#define MAC_DROP_ON_NO_DESC 0x01 /* freemsg() if no tx descs */
+#define MAC_TX_NO_ENQUEUE 0x02 /* don't enqueue mblks if not xmit'ed */
+#define MAC_TX_NO_HOLD 0x04 /* don't bump the active Tx count */
+
+extern int mac_client_open(mac_handle_t, mac_client_handle_t *, char *,
+ uint16_t);
+extern void mac_client_close(mac_client_handle_t, uint16_t);
+
+extern int mac_unicast_add(mac_client_handle_t, uint8_t *, uint16_t,
+ mac_unicast_handle_t *, uint16_t, mac_diag_t *);
+extern int mac_unicast_primary_add(mac_client_handle_t, mac_unicast_handle_t *,
+ mac_diag_t *);
+extern int mac_unicast_remove(mac_client_handle_t, mac_unicast_handle_t);
+
+extern int mac_multicast_add(mac_client_handle_t, const uint8_t *);
+extern void mac_multicast_remove(mac_client_handle_t, const uint8_t *);
+
+extern void mac_rx_set(mac_client_handle_t, mac_rx_t, void *);
+extern void mac_rx_clear(mac_client_handle_t);
+extern mac_tx_cookie_t mac_tx(mac_client_handle_t, mblk_t *,
+ uintptr_t, uint16_t, mblk_t **);
+extern boolean_t mac_tx_is_flow_blocked(mac_client_handle_t, mac_tx_cookie_t);
+extern uint64_t mac_client_stat_get(mac_client_handle_t, uint_t);
+
+extern int mac_promisc_add(mac_client_handle_t, mac_client_promisc_type_t,
+ mac_rx_t, void *, mac_promisc_handle_t *, uint16_t);
+extern int mac_promisc_remove(mac_promisc_handle_t);
+
+extern mac_notify_handle_t mac_notify_add(mac_handle_t, mac_notify_t, void *);
+extern int mac_notify_remove(mac_notify_handle_t, boolean_t);
+extern void mac_notify_remove_wait(mac_handle_t);
+extern int mac_rename_primary(mac_handle_t, const char *);
+extern char *mac_client_name(mac_client_handle_t);
+
+extern int mac_open(const char *, mac_handle_t *);
+extern void mac_close(mac_handle_t);
+extern uint64_t mac_stat_get(mac_handle_t, uint_t);
+
+extern int mac_unicast_primary_set(mac_handle_t, const uint8_t *);
+extern void mac_unicast_primary_get(mac_handle_t, uint8_t *);
+extern void mac_unicast_primary_info(mac_handle_t, char *, boolean_t *);
+
+extern int mac_addr_random(mac_client_handle_t, uint_t, uint8_t *,
+ mac_diag_t *);
+
+extern int mac_addr_factory_reserve(mac_client_handle_t, int *);
+extern void mac_addr_factory_release(mac_client_handle_t, uint_t);
+extern void mac_addr_factory_value(mac_handle_t, int, uchar_t *, uint_t *,
+ char *, boolean_t *);
+extern uint_t mac_addr_factory_num(mac_handle_t);
+
+extern uint_t mac_addr_len(mac_handle_t);
+
+extern mac_tx_notify_handle_t mac_client_tx_notify(mac_client_handle_t,
+ mac_tx_notify_t, void *);
+
+extern int mac_set_resources(mac_handle_t, mac_resource_props_t *);
+extern void mac_get_resources(mac_handle_t, mac_resource_props_t *);
+extern int mac_client_set_resources(mac_client_handle_t,
+ mac_resource_props_t *);
+extern void mac_client_get_resources(mac_client_handle_t,
+ mac_resource_props_t *);
+
+extern int mac_share_capable(mac_handle_t);
+extern int mac_share_bind(mac_client_handle_t, uint64_t, uint64_t *);
+extern void mac_share_unbind(mac_client_handle_t);
+
+extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *);
+
+extern uint_t mac_hwgrp_num(mac_handle_t);
+extern void mac_get_hwgrp_info(mac_handle_t, int, uint_t *, uint_t *,
+ uint_t *, uint_t *, char *);
+
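As an illustration of how these calls fit together, here is a hedged sketch of a minimal kernel MAC client (the link name, client name and rx callback are made up, and error cleanup is elided):

	static void
	my_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
	    boolean_t loopback)
	{
		freemsgchain(mp);	/* consume the chain */
	}

	static int
	my_client_start(void)
	{
		mac_handle_t mh;
		mac_client_handle_t mch;
		mac_unicast_handle_t muh;
		mac_diag_t diag;
		char cname[] = "myclient";

		if (mac_open_by_linkname("mydev0", &mh) != 0)
			return (-1);
		if (mac_client_open(mh, &mch, cname, 0) != 0) {
			mac_close(mh);
			return (-1);
		}
		/* claim the primary unicast address, then start receiving */
		(void) mac_unicast_primary_add(mch, &muh, &diag);
		mac_rx_set(mch, my_rx, NULL);
		return (0);
	}
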
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MAC_CLIENT_H */
diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h
new file mode 100644
index 0000000000..29d2a40ff1
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_client_impl.h
@@ -0,0 +1,318 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MAC_CLIENT_IMPL_H
+#define _SYS_MAC_CLIENT_IMPL_H
+
+#include <sys/modhash.h>
+#include <sys/mac_client.h>
+#include <sys/mac_provider.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <net/if.h>
+#include <sys/mac_flow_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern kmem_cache_t *mac_client_impl_cache;
+extern kmem_cache_t *mac_unicast_impl_cache;
+extern kmem_cache_t *mac_promisc_impl_cache;
+
+/*
+ * Need a list to chain all VIDs assigned to a client. Normally, one
+ * MAC client only has one VID. But vsw might need multiple VIDs.
+ */
+typedef struct mac_unicast_impl_s { /* Protected by */
+ struct mac_unicast_impl_s *mui_next; /* SL */
+ mac_address_t *mui_map; /* SL */
+ uint16_t mui_vid; /* SL */
+} mac_unicast_impl_t;
+
+#define MAC_CLIENT_FLAGS_PRIMARY		0x0001
+#define MAC_CLIENT_FLAGS_VNIC_PRIMARY 0x0002
+
+/*
+ * One of these is instantiated per MAC client promiscuous callback.
+ *
+ * Each element of this structure belongs to two linked list. One
+ * for the mac_client_impl_t (mci_promisc_list) which created allocated
+ * the callback, the other for the mac_impl_t (mi_promisc_list) corresponding
+ * to the MAC client.
+ * The former allows us to do bookkeeping, the latter allows us
+ * to more efficiently dispatch packets to the promiscuous callbacks.
+ */
+typedef struct mac_promisc_impl_s { /* Protected by */
+ mac_cb_t mpi_mci_link; /* mi_promisc_lock */
+ mac_cb_t mpi_mi_link; /* mi_promisc_lock */
+ mac_client_promisc_type_t mpi_type; /* WO */
+ mac_rx_t mpi_fn; /* WO */
+ void *mpi_arg; /* WO */
+ struct mac_client_impl_s *mpi_mcip; /* WO */
+ boolean_t mpi_no_tx_loop; /* WO */
+ boolean_t mpi_no_phys; /* WO */
+} mac_promisc_impl_t;
+
+typedef union mac_tx_percpu_s {
+ struct {
+ kmutex_t _pcpu_tx_lock;
+ uint_t _pcpu_tx_refcnt;
+ } pcpu_lr;
+ uchar_t pcpu_pad[64];
+} mac_tx_percpu_t;
+
+#define pcpu_tx_lock pcpu_lr._pcpu_tx_lock
+#define pcpu_tx_refcnt pcpu_lr._pcpu_tx_refcnt
+
+/*
+ * One of these is instantiated for each MAC client.
+ */
+struct mac_client_impl_s { /* Protected by */
+ struct mac_client_impl_s *mci_client_next; /* mi_rw_lock */
+ char mci_name[MAXNAMELEN]; /* mi_rw_lock */
+ /*
+ * This flow entry will contain all the internal constructs
+ * such as SRS etc. for this MAC client. The MAC client may
+ * have more than one flow corresponding to each upper client
+ * sharing this mac_client_impl_t.
+ */
+ flow_entry_t *mci_flent; /* mi_rw_lock */
+ struct mac_impl_s *mci_mip; /* WO */
+ /*
+ * If this is a client that has a pass thru MAC (e.g. a VNIC),
+ * then we also keep the handle for the client's upper MAC.
+ */
+ struct mac_impl_s *mci_upper_mip; /* WO */
+
+ uint32_t mci_state_flags; /* WO */
+ mac_rx_t mci_rx_fn; /* Rx Quiescence */
+ void *mci_rx_arg; /* Rx Quiescence */
+ mac_direct_rx_t mci_direct_rx_fn; /* SL */
+ void *mci_direct_rx_arg; /* SL */
+
+ mac_cb_t *mci_promisc_list; /* mi_promisc_lock */
+
+ mac_address_t *mci_unicast;
+ uint32_t mci_flags; /* SL */
+ krwlock_t mci_rw_lock;
+ mac_unicast_impl_t *mci_unicast_list; /* mci_rw_lock */
+ /*
+	 * The mac_client_impl_t may be shared by multiple clients, i.e.
+	 * multiple VLANs sharing the same MAC client. In this case the
+	 * address/vid tuples differ and are each associated with their
+	 * own flow entry, but the rest of the underlying components (SRS,
+	 * etc.) are common.
+ */
+ flow_entry_t *mci_flent_list; /* mci_rw_lock */
+ uint_t mci_nflents; /* mci_rw_lock */
+ uint_t mci_nvids; /* mci_rw_lock */
+
+ /* Resource Management Functions */
+ mac_resource_add_t mci_resource_add; /* SL */
+ mac_resource_remove_t mci_resource_remove; /* SL */
+ mac_resource_quiesce_t mci_resource_quiesce; /* SL */
+ mac_resource_restart_t mci_resource_restart; /* SL */
+ mac_resource_bind_t mci_resource_bind; /* SL */
+ void *mci_resource_arg; /* SL */
+
+
+ /* Tx notify callback */
+ kmutex_t mci_tx_cb_lock;
+ mac_cb_info_t mci_tx_notify_cb_info; /* cb list info */
+ mac_cb_t *mci_tx_notify_cb_list; /* The cb list */
+ uintptr_t mci_tx_notify_id;
+
+ /* per MAC client stats */ /* None */
+ uint64_t mci_stat_multircv;
+ uint64_t mci_stat_brdcstrcv;
+ uint64_t mci_stat_multixmt;
+ uint64_t mci_stat_brdcstxmt;
+ uint64_t mci_stat_obytes;
+ uint64_t mci_stat_opackets;
+ uint64_t mci_stat_oerrors;
+ uint64_t mci_stat_ibytes;
+ uint64_t mci_stat_ipackets;
+ uint64_t mci_stat_ierrors;
+
+ flow_tab_t *mci_subflow_tab; /* Rx quiescence */
+
+ /*
+	 * Priority range for this MAC client. This is the range
+ * corresponding to the priority configured (nr_flow_priority).
+ */
+ pri_t mci_min_pri;
+ pri_t mci_max_pri;
+
+ /*
+ * Hybrid I/O related definitions.
+ */
+ mac_share_handle_t mci_share;
+ boolean_t mci_share_bound;
+ boolean_t mci_no_hwrings;
+
+ /* The client requests a hardware group */
+ boolean_t mci_req_hwrings;
+
+ /* for multicast support */
+ struct mac_mcast_addrs_s *mci_mcast_addrs; /* mi_rw_lock */
+
+ /*
+ * Protected by mci_tx_pcpu[0].pcpu_tx_lock
+ */
+ uint_t mci_tx_flag;
+ kcondvar_t mci_tx_cv;
+
+ /* Must be last in the structure for dynamic sizing */
+ mac_tx_percpu_t mci_tx_pcpu[1]; /* SL */
+};
+
+#define MAC_CLIENT_IMPL_SIZE \
+ (sizeof (mac_client_impl_t) + \
+ (mac_tx_percpu_cnt * sizeof (mac_tx_percpu_t)))
+
+extern int mac_tx_percpu_cnt;
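
The trailing one-element mci_tx_pcpu[] array plus MAC_CLIENT_IMPL_SIZE is the usual C flexible-array idiom: one mac_tx_percpu_t lives inside the structure and mac_tx_percpu_cnt more are allocated behind it, each padded to 64 bytes so that the per-CPU tx locks land on separate cache lines. A hedged sketch of the allocation and of slot 0, which (per the comment above) also protects mci_tx_flag:

	mac_client_impl_t *mcip;

	/* room for the struct plus mac_tx_percpu_cnt extra per-CPU slots */
	mcip = kmem_zalloc(MAC_CLIENT_IMPL_SIZE, KM_SLEEP);

	/* slot 0 doubles as the lock protecting mci_tx_flag */
	mutex_enter(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);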
+
+#define MCIP_TX_SRS(mcip) \
+ ((mcip)->mci_flent == NULL ? NULL : (mcip)->mci_flent->fe_tx_srs)
+
+/* Defensive coding; a non-NULL mci_flent check could instead be an assert */
+
+#define MCIP_DATAPATH_SETUP(mcip) \
+ ((mcip)->mci_flent == NULL ? B_FALSE : \
+ !((mcip)->mci_flent->fe_flags & FE_MC_NO_DATAPATH))
+
+#define MCIP_RESOURCE_PROPS(mcip) \
+ ((mcip)->mci_flent == NULL ? NULL : \
+ &(mcip)->mci_flent->fe_resource_props)
+
+#define MCIP_EFFECTIVE_PROPS(mcip) \
+ (mcip->mci_flent == NULL ? NULL : \
+ &(mcip)->mci_flent->fe_effective_props)
+
+#define MCIP_RESOURCE_PROPS_MASK(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_mask)
+
+#define MCIP_RESOURCE_PROPS_MAXBW(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_maxbw)
+
+#define MCIP_RESOURCE_PROPS_PRIORITY(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_priority)
+
+#define MCIP_RESOURCE_PROPS_CPUS(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ &(mcip)->mci_flent->fe_resource_props.mrp_cpus)
+
+#define MCIP_RESOURCE_PROPS_NCPUS(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_ncpus)
+
+#define MCIP_RESOURCE_PROPS_CPU(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+	(mcip)->mci_flent->fe_resource_props.mrp_cpu)
+
+/*
+ * We validate the VLAN id of the packet w.r.t the client's vid,
+ * if required (i.e. !MCIS_DISABLE_TX_VID_CHECK). DLS clients
+ * will have MCIS_DISABLE_TX_VID_CHECK set.
+ * (In the case of aggr when we get back packets, due to
+ * the underlying driver being flow controlled, we won't
+ * drop the packet even if it is VLAN tagged as we
+ * don't set MCIS_DISABLE_TX_VID_CHECK for an aggr.)
+ */
+#define MAC_VID_CHECK_NEEDED(mcip) \
+ (((mcip)->mci_state_flags & MCIS_DISABLE_TX_VID_CHECK) == 0 && \
+ (mcip)->mci_mip->mi_info.mi_nativemedia == DL_ETHER)
+
+#define MAC_VID_CHECK(mcip, mp, err) { \
+ if (ntohs(((struct ether_header *)(mp)->b_rptr)->ether_type) == \
+ ETHERTYPE_VLAN) { \
+ /* \
+		 * err is set to EINVAL (so the caller can take the \
+		 * appropriate action, e.g. freemsg()) for two cases: \
+		 * - the client is not responsible for filling in the vid. \
+		 * - the client is responsible for filling in the vid, but \
+		 *   the vid doesn't match the vid of the MAC client. \
+ */ \
+ (err) = EINVAL; \
+ if (((mcip)->mci_state_flags & MCIS_TAG_DISABLE) != 0) {\
+ struct ether_vlan_header *evhp; \
+ uint16_t vlanid; \
+ \
+ evhp = (struct ether_vlan_header *)(mp)->b_rptr;\
+ vlanid = VLAN_ID(ntohs(evhp->ether_tci)); \
+ if (mac_client_check_flow_vid((mcip), vlanid)) \
+ (err) = 0; \
+ } \
+ } \
+}
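+
+/*
+ * Illustrative use on the transmit path (a sketch, not part of this
+ * change): callers are expected to drop the packet when the check
+ * fails.
+ *
+ *	int err = 0;
+ *
+ *	if (MAC_VID_CHECK_NEEDED(mcip)) {
+ *		MAC_VID_CHECK(mcip, mp, err);
+ *		if (err != 0) {
+ *			freemsg(mp);
+ *			return;
+ *		}
+ *	}
+ */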
+
+#define MAC_TAG_NEEDED(mcip) \
+ (((mcip)->mci_state_flags & MCIS_TAG_DISABLE) == 0 && \
+	(mcip)->mci_nvids == 1)
+
+/* MCI state flags */
+#define MCIS_IS_VNIC 0x0001
+#define MCIS_EXCLUSIVE 0x0002
+#define MCIS_TAG_DISABLE 0x0004
+#define MCIS_STRIP_DISABLE 0x0008
+#define MCIS_IS_AGGR_PORT 0x0010
+#define MCIS_CLIENT_POLL_CAPABLE 0x0020
+#define MCIS_DESC_LOGGED 0x0040
+#define MCIS_SHARE_BOUND 0x0080
+#define MCIS_NO_HWRINGS 0x0100
+#define MCIS_DISABLE_TX_VID_CHECK 0x0200
+#define MCIS_USE_DATALINK_NAME 0x0400
+
+/* in mac_client.c */
+extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *);
+extern void mac_client_init(void);
+extern void mac_client_fini(void);
+extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *,
+ mac_client_impl_t *);
+
+extern int mac_validate_props(mac_resource_props_t *);
+
+extern mac_client_impl_t *mac_vnic_lower(mac_impl_t *);
+extern mac_client_impl_t *mac_primary_client_handle(mac_impl_t *);
+extern uint16_t i_mac_flow_vid(flow_entry_t *);
+extern boolean_t i_mac_capab_get(mac_handle_t, mac_capab_t, void *);
+
+extern void mac_unicast_update_clients(mac_impl_t *, mac_address_t *);
+extern void mac_update_resources(mac_resource_props_t *,
+ mac_resource_props_t *, boolean_t);
+
+boolean_t mac_client_check_flow_vid(mac_client_impl_t *, uint16_t);
+
+extern boolean_t mac_is_primary_client(mac_client_impl_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MAC_CLIENT_IMPL_H */
diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h
new file mode 100644
index 0000000000..7e22552aeb
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_client_priv.h
@@ -0,0 +1,149 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file contains *private* MAC API definitions. This header file
+ * should only be included by kernel components which are part of the
+ * GLDv3 stack (dld, dls, aggr, softmac).
+ */
+
+#ifndef _SYS_MAC_CLIENT_PRIV_H
+#define _SYS_MAC_CLIENT_PRIV_H
+
+#include <sys/mac.h>
+#include <sys/mac_flow.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#ifdef DEBUG
+#define MAC_PERIM_HELD(mph) mac_perim_held(mph)
+#else
+#define MAC_PERIM_HELD(mph)
+#endif
+
+extern boolean_t mac_rx_bypass_set(mac_client_handle_t, mac_direct_rx_t,
+ void *);
+
+extern const mac_info_t *mac_info(mac_handle_t);
+extern boolean_t mac_info_get(const char *, mac_info_t *);
+extern int mac_promisc_set(mac_handle_t, boolean_t, mac_promisc_type_t);
+extern boolean_t mac_promisc_get(mac_handle_t, mac_promisc_type_t);
+
+extern void mac_ioctl(mac_handle_t, queue_t *, mblk_t *);
+extern link_state_t mac_link_get(mac_handle_t);
+extern void mac_resource_set(mac_client_handle_t, mac_resource_add_t, void *);
+extern dev_info_t *mac_devinfo_get(mac_handle_t);
+extern boolean_t mac_capab_get(mac_handle_t, mac_capab_t, void *);
+extern boolean_t mac_sap_verify(mac_handle_t, uint32_t, uint32_t *);
+extern mblk_t *mac_header(mac_handle_t, const uint8_t *, uint32_t, mblk_t *,
+ size_t);
+extern int mac_header_info(mac_handle_t, mblk_t *, mac_header_info_t *);
+extern mblk_t *mac_header_cook(mac_handle_t, mblk_t *);
+extern mblk_t *mac_header_uncook(mac_handle_t, mblk_t *);
+
+extern void mac_resource_set_common(mac_client_handle_t,
+ mac_resource_add_t, mac_resource_remove_t, mac_resource_quiesce_t,
+ mac_resource_restart_t, mac_resource_bind_t, void *);
+
+extern void mac_perim_enter_by_mh(mac_handle_t, mac_perim_handle_t *);
+extern int mac_perim_enter_by_macname(const char *, mac_perim_handle_t *);
+extern int mac_perim_enter_by_linkid(datalink_id_t, mac_perim_handle_t *);
+extern void mac_perim_exit(mac_perim_handle_t);
+extern boolean_t mac_perim_held(mac_handle_t);
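+
+/*
+ * Typical perimeter usage (illustrative only):
+ *
+ *	mac_perim_handle_t mph;
+ *
+ *	mac_perim_enter_by_mh(mh, &mph);
+ *	... serialized control operations on the mac end point ...
+ *	mac_perim_exit(mph);
+ */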
+
+extern uint16_t mac_client_vid(mac_client_handle_t);
+extern int mac_vnic_unicast_set(mac_client_handle_t, const uint8_t *);
+
+extern void mac_client_poll_enable(mac_client_handle_t);
+extern void mac_client_poll_disable(mac_client_handle_t);
+
+extern int mac_resource_ctl_set(mac_client_handle_t, mac_resource_props_t *);
+extern void mac_resource_ctl_get(mac_client_handle_t, mac_resource_props_t *);
+
+/*
+ * Flow-related APIs for MAC clients.
+ */
+
+extern void mac_link_init_flows(mac_client_handle_t);
+extern void mac_link_release_flows(mac_client_handle_t);
+extern int mac_link_flow_add(datalink_id_t, char *, flow_desc_t *,
+ mac_resource_props_t *);
+extern int mac_link_flow_remove(char *);
+extern int mac_link_flow_modify(char *, mac_resource_props_t *);
+extern boolean_t mac_link_has_flows(mac_client_handle_t);
+
+typedef struct {
+ char fi_flow_name[MAXNAMELEN];
+ datalink_id_t fi_link_id;
+ flow_desc_t fi_flow_desc;
+ mac_resource_props_t fi_resource_props;
+} mac_flowinfo_t;
+
+extern int mac_link_flow_walk(datalink_id_t,
+ int (*)(mac_flowinfo_t *, void *), void *);
+extern int mac_link_flow_info(char *, mac_flowinfo_t *);
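+
+/*
+ * Illustrative mac_link_flow_walk() usage; count_flow() below is a
+ * hypothetical callback (by convention a zero return continues the
+ * walk):
+ *
+ *	static int
+ *	count_flow(mac_flowinfo_t *finfop, void *arg)
+ *	{
+ *		(*(uint_t *)arg)++;
+ *		return (0);
+ *	}
+ *
+ *	uint_t nflows = 0;
+ *	(void) mac_link_flow_walk(linkid, count_flow, &nflows);
+ */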
+
+extern void *mac_tx_hold(mac_client_handle_t);
+extern void mac_tx_rele(mac_client_handle_t, void *);
+extern void mac_rx_client_quiesce(mac_client_handle_t);
+extern void mac_rx_client_restart(mac_client_handle_t);
+extern void mac_srs_perm_quiesce(mac_client_handle_t, boolean_t);
+extern int mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *,
+ mac_ring_handle_t *);
+extern void mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t);
+extern void mac_hwring_teardown(mac_ring_handle_t);
+extern int mac_hwring_disable_intr(mac_ring_handle_t);
+extern int mac_hwring_enable_intr(mac_ring_handle_t);
+extern int mac_hwring_start(mac_ring_handle_t);
+extern void mac_hwring_stop(mac_ring_handle_t);
+extern mblk_t *mac_hwring_poll(mac_ring_handle_t, int);
+#define MAC_HWRING_POLL(ring, bytes) \
+ (((ring)->mr_info.mri_poll) \
+ ((ring)->mr_info.mri_driver, (bytes)))
+
+extern int mac_hwgroup_addmac(mac_group_handle_t, const uint8_t *);
+extern int mac_hwgroup_remmac(mac_group_handle_t, const uint8_t *);
+
+extern void mac_set_upper_mac(mac_client_handle_t, mac_handle_t);
+
+extern int mac_mark_exclusive(mac_handle_t);
+extern void mac_unmark_exclusive(mac_handle_t);
+
+extern int32_t mac_client_intr_cpu(mac_client_handle_t);
+extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t);
+extern void *mac_get_devinfo(mac_handle_t);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MAC_CLIENT_PRIV_H */
diff --git a/usr/src/uts/common/sys/mac_flow.h b/usr/src/uts/common/sys/mac_flow.h
new file mode 100644
index 0000000000..05ed62a217
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_flow.h
@@ -0,0 +1,210 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _MAC_FLOW_H
+#define _MAC_FLOW_H
+
+/*
+ * Main structure describing a flow of packets, for classification use
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <netinet/in.h> /* for IPPROTO_* constants */
+#include <sys/ethernet.h>
+
+#define MAXFLOWNAME 32
+
+/* XXX: MAXMACADDRLEN from dld.h should be used instead of this definition */
+#define MAXMACADDR 20
+
+/* Bit-mask for the selectors carried in the flow descriptor */
+typedef uint64_t flow_mask_t;
+
+#define FLOW_LINK_DST 0x00000001 /* Destination MAC addr */
+#define FLOW_LINK_SRC 0x00000002 /* Source MAC address */
+#define FLOW_LINK_VID 0x00000004 /* VLAN ID */
+#define FLOW_LINK_SAP 0x00000008 /* SAP value */
+
+#define FLOW_IP_VERSION 0x00000010 /* V4 or V6 */
+#define FLOW_IP_PROTOCOL 0x00000020 /* Protocol type */
+#define FLOW_IP_LOCAL 0x00000040 /* Local address */
+#define FLOW_IP_REMOTE 0x00000080 /* Remote address */
+#define FLOW_IP_DSFIELD 0x00000100 /* DSfield value */
+
+#define FLOW_ULP_PORT_LOCAL 0x00001000 /* ULP local port */
+#define FLOW_ULP_PORT_REMOTE 0x00002000 /* ULP remote port */
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct flow_desc_s {
+ flow_mask_t fd_mask;
+ uint32_t fd_mac_len;
+ uint8_t fd_dst_mac[MAXMACADDR];
+ uint8_t fd_src_mac[MAXMACADDR];
+ uint16_t fd_vid;
+ uint32_t fd_sap;
+ uint8_t fd_ipversion;
+ uint8_t fd_protocol;
+ in6_addr_t fd_local_addr;
+ in6_addr_t fd_local_netmask;
+ in6_addr_t fd_remote_addr;
+ in6_addr_t fd_remote_netmask;
+ in_port_t fd_local_port;
+ in_port_t fd_remote_port;
+ uint8_t fd_dsfield;
+ uint8_t fd_dsfield_mask;
+} flow_desc_t;
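+
+/*
+ * Illustrative example (not part of this change): a descriptor matching
+ * TCP traffic to local port 80 would be built as
+ *
+ *	flow_desc_t fd;
+ *
+ *	bzero(&fd, sizeof (fd));
+ *	fd.fd_protocol = IPPROTO_TCP;
+ *	fd.fd_local_port = htons(80);	(assuming network byte order)
+ *	fd.fd_mask = FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL;
+ */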
+
+#define MRP_NCPUS 128
+
+/*
+ * In MCM_CPUS mode, the cpu bindings are user specified. In MCM_FANOUT
+ * mode, the user only specifies a fanout count.
+ * mc_fanout_cnt gives the number of CPUs used for fanout soft rings.
+ * The mc_fanout_cpus[] array stores the CPUs used for fanout soft rings.
+ */
+typedef enum {
+ MCM_FANOUT = 1,
+ MCM_CPUS
+} mac_cpu_mode_t;
+
+typedef struct mac_cpus_props_s {
+ uint32_t mc_ncpus; /* num of cpus */
+ uint32_t mc_cpus[MRP_NCPUS]; /* cpu list */
+ uint32_t mc_fanout_cnt; /* soft ring cpu cnt */
+ uint32_t mc_fanout_cpus[MRP_NCPUS]; /* SR cpu list */
+ uint32_t mc_pollid; /* poll thr binding */
+ uint32_t mc_workerid; /* worker thr binding */
+ /*
+	 * interrupt cpu: a value less than 0 implies a platform limitation
+	 * in retargeting the interrupt assignment.
+ */
+ int32_t mc_intr_cpu;
+ mac_cpu_mode_t mc_fanout_mode; /* fanout mode */
+} mac_cpus_t;
+
+/* Priority values */
+typedef enum {
+ MPL_LOW,
+ MPL_MEDIUM,
+ MPL_HIGH,
+ MPL_RESET
+} mac_priority_level_t;
+
+/* The default priority for links */
+#define MPL_LINK_DEFAULT MPL_HIGH
+
+/* The default priority for flows */
+#define MPL_SUBFLOW_DEFAULT MPL_MEDIUM
+
+#define MRP_MAXBW 0x00000001 /* Limit set */
+#define MRP_CPUS 0x00000002 /* CPU/fanout set */
+#define MRP_CPUS_USERSPEC 0x00000004 /* CPU/fanout from user */
+#define MRP_PRIORITY 0x00000008 /* Priority set */
+
+#define MRP_THROTTLE MRP_MAXBW
+
+/* 3 levels - low, medium, high */
+#define MRP_PRIORITY_LEVELS 3
+
+/* Special value denoting no bandwidth control */
+#define MRP_MAXBW_RESETVAL -1ULL
+
+/*
+ * Until sub-megabit limit is implemented,
+ * reject values lower than 1 MTU per tick or 1.2Mbps
+ */
+#define MRP_MAXBW_MINVAL 1200000
+
+typedef struct mac_resource_props_s {
+ /*
+	 * Bit-mask for the network resource control types
+ */
+ uint32_t mrp_mask;
+ uint64_t mrp_maxbw; /* bandwidth limit in bps */
+ mac_priority_level_t mrp_priority; /* relative flow priority */
+ mac_cpus_t mrp_cpus;
+} mac_resource_props_t;
+
+#define mrp_ncpus mrp_cpus.mc_ncpus
+#define mrp_cpu mrp_cpus.mc_cpus
+#define mrp_fanout_cnt mrp_cpus.mc_fanout_cnt
+#define mrp_fanout_cpu mrp_cpus.mc_fanout_cpus
+#define mrp_pollid mrp_cpus.mc_pollid
+#define mrp_workerid mrp_cpus.mc_workerid
+#define mrp_intr_cpu mrp_cpus.mc_intr_cpu
+#define mrp_fanout_mode mrp_cpus.mc_fanout_mode
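+
+/*
+ * Illustrative example (not part of this change): capping a flow at
+ * 100 Mbps with high priority would fill in
+ *
+ *	mac_resource_props_t mrp;
+ *
+ *	bzero(&mrp, sizeof (mrp));
+ *	mrp.mrp_maxbw = 100000000;		(bits per second)
+ *	mrp.mrp_priority = MPL_HIGH;
+ *	mrp.mrp_mask = MRP_MAXBW | MRP_PRIORITY;
+ */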
+
+#define MAC_COPY_CPUS(mrp, fmrp) { \
+ int ncpus; \
+ (fmrp)->mrp_ncpus = (mrp)->mrp_ncpus; \
+ (fmrp)->mrp_intr_cpu = (mrp)->mrp_intr_cpu; \
+ (fmrp)->mrp_fanout_mode = (mrp)->mrp_fanout_mode; \
+ if ((mrp)->mrp_ncpus == 0) { \
+ (fmrp)->mrp_mask &= ~MRP_CPUS; \
+ (fmrp)->mrp_mask &= ~MRP_CPUS_USERSPEC; \
+ } else { \
+ for (ncpus = 0; ncpus < (fmrp)->mrp_ncpus; ncpus++) \
+ (fmrp)->mrp_cpu[ncpus] = (mrp)->mrp_cpu[ncpus];\
+ (fmrp)->mrp_mask |= MRP_CPUS; \
+ if ((mrp)->mrp_mask & MRP_CPUS_USERSPEC) \
+ (fmrp)->mrp_mask |= MRP_CPUS_USERSPEC; \
+ } \
+}
+
+typedef struct flow_stats_s {
+ uint64_t fs_rbytes;
+ uint64_t fs_ipackets;
+ uint64_t fs_ierrors;
+ uint64_t fs_obytes;
+ uint64_t fs_opackets;
+ uint64_t fs_oerrors;
+} flow_stats_t;
+
+typedef enum {
+ FLOW_STAT_RBYTES,
+ FLOW_STAT_IPACKETS,
+ FLOW_STAT_IERRORS,
+ FLOW_STAT_OBYTES,
+ FLOW_STAT_OPACKETS,
+ FLOW_STAT_OERRORS
+} flow_stat_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MAC_FLOW_H */
diff --git a/usr/src/uts/common/sys/mac_flow_impl.h b/usr/src/uts/common/sys/mac_flow_impl.h
new file mode 100644
index 0000000000..6029873930
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_flow_impl.h
@@ -0,0 +1,537 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _MAC_FLOW_IMPL_H
+#define _MAC_FLOW_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+#include <sys/atomic.h>
+#include <sys/ksynch.h>
+#include <sys/mac_flow.h>
+#include <sys/stream.h>
+#include <sys/sdt.h>
+#include <net/if.h>
+
+/*
+ * Macros to increment/decrement the reference count on a flow_entry_t.
+ */
+#define FLOW_REFHOLD(flent) { \
+ DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \
+ mutex_enter(&(flent)->fe_lock); \
+ (flent)->fe_refcnt++; \
+ mutex_exit(&(flent)->fe_lock); \
+}
+
+/*
+ * Data paths must not attempt to use a flow entry if it is marked INCIPIENT
+ * or QUIESCE. In the former case the setup is not yet complete and the
+ * data path could stumble on inconsistent data structures. In the latter
+ * case a control operation is waiting for quiescence so that it can
+ * change callbacks or other structures without the use of locks.
+ */
+#define FLOW_TRY_REFHOLD(flent, err) { \
+ DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \
+ (err) = 0; \
+ mutex_enter(&(flent)->fe_lock); \
+ if ((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE | FE_CONDEMNED | \
+ FE_UF_NO_DATAPATH | FE_MC_NO_DATAPATH)) \
+ (err) = -1; \
+ else \
+ (flent)->fe_refcnt++; \
+ mutex_exit(&(flent)->fe_lock); \
+}
+
+#define FLOW_REFRELE(flent) { \
+ DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent)); \
+ mutex_enter(&(flent)->fe_lock); \
+ ASSERT((flent)->fe_refcnt != 0); \
+ (flent)->fe_refcnt--; \
+ if ((flent)->fe_flags & FE_WAITER) { \
+ ASSERT((flent)->fe_refcnt != 0); \
+ cv_signal(&(flent)->fe_cv); \
+ mutex_exit(&(flent)->fe_lock); \
+ } else if ((flent)->fe_refcnt == 0) { \
+ mac_flow_destroy(flent); \
+ } else { \
+ mutex_exit(&(flent)->fe_lock); \
+ } \
+}
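+
+/*
+ * Illustrative data-path pattern (a sketch, not part of this change):
+ *
+ *	int err;
+ *
+ *	FLOW_TRY_REFHOLD(flent, err);
+ *	if (err != 0)
+ *		return;			(flow is quiescing or incipient)
+ *	flent->fe_cb_fn(flent->fe_cb_arg1, flent->fe_cb_arg2, mp, B_FALSE);
+ *	FLOW_REFRELE(flent);
+ */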
+
+#define FLOW_USER_REFHOLD(flent) { \
+ mutex_enter(&(flent)->fe_lock); \
+ (flent)->fe_user_refcnt++; \
+ mutex_exit(&(flent)->fe_lock); \
+}
+
+#define FLOW_USER_REFRELE(flent) { \
+ mutex_enter(&(flent)->fe_lock); \
+ ASSERT((flent)->fe_user_refcnt != 0); \
+ if (--(flent)->fe_user_refcnt == 0 && \
+ ((flent)->fe_flags & FE_WAITER)) \
+ cv_signal(&(flent)->fe_cv); \
+ mutex_exit(&(flent)->fe_lock); \
+}
+
+#define FLOW_FINAL_REFRELE(flent) { \
+	ASSERT((flent)->fe_refcnt == 1 && (flent)->fe_user_refcnt == 0); \
+ FLOW_REFRELE(flent); \
+}
+
+/*
+ * Mark or unmark the flent with a bit flag
+ */
+#define FLOW_MARK(flent, flag) { \
+ mutex_enter(&(flent)->fe_lock); \
+ (flent)->fe_flags |= flag; \
+ mutex_exit(&(flent)->fe_lock); \
+}
+
+#define FLOW_UNMARK(flent, flag) { \
+ mutex_enter(&(flent)->fe_lock); \
+ (flent)->fe_flags &= ~flag; \
+ mutex_exit(&(flent)->fe_lock); \
+}
+
+#define FLENT_TO_MIP(flent) \
+	((flent)->fe_mbg != NULL ? mac_bcast_grp_mip((flent)->fe_mbg) : \
+	((mac_client_impl_t *)(flent)->fe_mcip)->mci_mip)
+
+/* Convert a bandwidth expressed in bps to a number of bytes per tick. */
+#define FLOW_BYTES_PER_TICK(bps) (((bps) >> 3) / hz)
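+
+/*
+ * For example, with hz at 100 and the minimum limit of 1.2 Mbps
+ * (MRP_MAXBW_MINVAL in mac_flow.h), FLOW_BYTES_PER_TICK(1200000) is
+ * (1200000 >> 3) / 100 = 1500 bytes, i.e. one MTU-sized frame per tick.
+ */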
+
+/*
+ * Given an underlying range and a priority level, obtain the minimum for the
+ * new range.
+ */
+#define FLOW_MIN_PRIORITY(min, max, pri) \
+ ((min) + ((((max) - (min)) / MRP_PRIORITY_LEVELS) * (pri)))
+
+/*
+ * Given an underlying range and a minimum level (base), obtain the maximum
+ * for the new range.
+ */
+#define FLOW_MAX_PRIORITY(min, max, base) \
+ ((base) + (((max) - (min)) / MRP_PRIORITY_LEVELS))
+
+/*
+ * Given an underlying range and a priority level, get the absolute
+ * priority value. For now there are just 3 values, high, low and
+ * medium so we can just return max, min or min + (max - min) / 2.
+ * If there are more than three we need to change this computation.
+ */
+#define FLOW_PRIORITY(min, max, pri) \
+	((pri) == MPL_HIGH ? (max) : \
+	(pri) == MPL_LOW ? (min) : \
+	((min) + (((max) - (min)) / 2)))
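+
+/*
+ * Worked example (illustrative): for an underlying range of [60, 99]
+ * and MRP_PRIORITY_LEVELS == 3, FLOW_MIN_PRIORITY(60, 99, 1) yields
+ * 60 + ((99 - 60) / 3) * 1 = 73 and FLOW_MAX_PRIORITY(60, 99, 73)
+ * yields 73 + (99 - 60) / 3 = 86.
+ */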
+
+#define MAC_FLOW_TAB_SIZE 500
+
+typedef struct flow_entry_s flow_entry_t;
+typedef struct flow_tab_s flow_tab_t;
+typedef struct flow_state_s flow_state_t;
+struct mac_impl_s;
+struct mac_client_impl_s;
+
+/*
+ * Classification flags used to lookup the flow.
+ */
+#define FLOW_INBOUND 0x01
+#define FLOW_OUTBOUND 0x02
+/* Don't compare VID when classifying the packets, see mac_rx_classify() */
+#define FLOW_IGNORE_VLAN 0x04
+
+/* Generic flow client function signature */
+typedef void (*flow_fn_t)(void *, void *, mblk_t *, boolean_t);
+
+/* Flow state */
+typedef enum {
+ FLOW_DRIVER_UPCALL,
+ FLOW_USER_REF
+} mac_flow_state_t;
+
+/* Matches a flow_entry_t using the extracted flow_state_t info */
+typedef boolean_t (*flow_match_fn_t)(flow_tab_t *, flow_entry_t *,
+ flow_state_t *);
+
+/* fe_flags */
+#define FE_QUIESCE 0x01 /* Quiesce the flow */
+#define FE_WAITER 0x02 /* Flow has a waiter */
+#define FE_FLOW_TAB 0x04 /* Flow is in the flow tab list */
+#define FE_G_FLOW_HASH 0x08 /* Flow is in the global flow hash */
+#define FE_INCIPIENT 0x10 /* Being setup */
+#define FE_CONDEMNED 0x20 /* Being deleted */
+#define FE_UF_NO_DATAPATH 0x40 /* No datapath setup for User flow */
+#define FE_MC_NO_DATAPATH 0x80 /* No datapath setup for mac client */
+
+/* fe_type */
+#define FLOW_PRIMARY_MAC 0x01 /* NIC primary MAC address */
+#define FLOW_VNIC_MAC 0x02 /* VNIC flow */
+#define FLOW_MCAST 0x04 /* Multicast (and broadcast) */
+#define FLOW_OTHER 0x08 /* Other flows configured */
+#define FLOW_USER 0x10 /* User defined flow */
+#define FLOW_VNIC FLOW_VNIC_MAC
+#define FLOW_NO_STATS 0x20 /* Don't create stats for the flow */
+
+/*
+ * Bandwidth control counters shared between a soft ring set and its
+ * associated soft rings. If the flow associated with a NIC/VNIC
+ * has a group of Rx rings assigned to it, we have as many
+ * soft ring sets as there are Rx rings in the group,
+ * and each individual SRS (and its soft rings) decides when to
+ * poll its Rx ring independently. But if there is a B/W limit
+ * associated with the NIC/VNIC, then the B/W control counter is
+ * shared across all the SRSes in the group and their associated
+ * soft rings.
+ *
+ * Thus there is a many-to-1 mapping between the SRSes and the
+ * mac_bw_ctl if the flow has a group of Rx rings associated with
+ * it.
+ */
+typedef struct mac_bw_ctl_s {
+ kmutex_t mac_bw_lock;
+ uint32_t mac_bw_state;
+ size_t mac_bw_sz; /* ?? Is it needed */
+ size_t mac_bw_limit; /* Max bytes to process per tick */
+ size_t mac_bw_used; /* Bytes processed in current tick */
+ size_t mac_bw_drop_threshold; /* Max queue length */
+ size_t mac_bw_drop_bytes;
+ size_t mac_bw_polled;
+ size_t mac_bw_intr;
+ clock_t mac_bw_curr_time;
+} mac_bw_ctl_t;
+
+struct flow_entry_s { /* Protected by */
+ struct flow_entry_s *fe_next; /* ft_lock */
+
+ datalink_id_t fe_link_id; /* WO */
+
+ /* Properties as specified for this flow */
+ mac_resource_props_t fe_resource_props; /* SL */
+
+ /* Properties actually effective at run time for this flow */
+ mac_resource_props_t fe_effective_props; /* SL */
+
+ kmutex_t fe_lock;
+ char fe_flow_name[MAXFLOWNAME]; /* fe_lock */
+ flow_desc_t fe_flow_desc; /* fe_lock */
+ kcondvar_t fe_cv; /* fe_lock */
+ /*
+	 * The initial flow ref is 1 on creation. A thread that looks up
+	 * the flent, typically via mac_flow_lookup(), dynamically holds a
+	 * ref. If the ref is 1, it means there aren't any upcalls from the
+	 * driver or downcalls from the stack using this flent. Structures
+	 * pointing to the flent, or the flent's presence on lists, don't
+	 * count towards this refcnt; instead they are tracked using
+	 * fe_flags. Only a control thread doing a teardown operation
+	 * deletes the flent, after waiting for upcalls to finish
+	 * synchronously. The fe_refcnt tracks the number of upcall refs.
+ */
+ uint32_t fe_refcnt; /* fe_lock */
+
+ /*
+	 * This tracks lookups done using the global hash list for user-
+	 * generated flows. This refcnt only protects the flent itself
+	 * from disappearing and helps walkers read the flent info, such
+	 * as the flow spec. However, the flent may be quiesced and the
+	 * SRS could be deleted. The fe_user_refcnt tracks the number of
+	 * global flow hash refs.
+ */
+ uint32_t fe_user_refcnt; /* fe_lock */
+ uint_t fe_flags; /* fe_lock */
+
+ /*
+	 * Function/args to invoke for delivering matching packets.
+	 * Only the function fe_cb_fn may be changed dynamically and
+	 * atomically. fe_cb_arg1 and fe_cb_arg2 are set at creation
+	 * time and may not be changed.
+ */
+ flow_fn_t fe_cb_fn; /* fe_lock */
+ void *fe_cb_arg1; /* fe_lock */
+ void *fe_cb_arg2; /* fe_lock */
+
+ void *fe_client_cookie; /* WO */
+ void *fe_rx_ring_group; /* SL */
+ void *fe_rx_srs[MAX_RINGS_PER_GROUP]; /* fe_lock */
+ int fe_rx_srs_cnt; /* fe_lock */
+ void *fe_tx_srs; /* WO */
+
+ /*
+	 * For a unicast flow, this points to the mac_client_impl_t.
+ */
+ void *fe_mcip; /* WO */
+
+ /*
+ * Used by mci_flent_list of mac_client_impl_t to track flows sharing
+ * the same mac_client_impl_t.
+ */
+ struct flow_entry_s *fe_client_next;
+
+ /*
+	 * For a broadcast or multicast flow, this points to a mac_bcast_grp_t.
+ */
+ void *fe_mbg; /* WO */
+ uint_t fe_type; /* WO */
+
+ /*
+ * BW control info.
+ */
+ mac_bw_ctl_t fe_tx_bw;
+ mac_bw_ctl_t fe_rx_bw;
+
+ /*
+ * Used by flow table lookup code
+ */
+ flow_match_fn_t fe_match;
+
+ /*
+ * Used by mac_flow_remove().
+ */
+ int fe_index;
+ flow_tab_t *fe_flow_tab;
+
+ kstat_t *fe_ksp;
+ flow_stats_t fe_flowstats;
+ boolean_t fe_desc_logged;
+ zoneid_t fe_zoneid;
+ uint64_t fe_nic_speed;
+};
+
+/*
+ * Various structures used by the flows framework for keeping track
+ * of packet state information.
+ */
+
+/* Layer 2 */
+typedef struct flow_l2info_s {
+ uchar_t *l2_start;
+ uint8_t *l2_daddr;
+ uint16_t l2_vid;
+ uint32_t l2_sap;
+ uint_t l2_hdrsize;
+} flow_l2info_t;
+
+/* Layer 3 */
+typedef struct flow_l3info_s {
+ uchar_t *l3_start;
+ uint8_t l3_protocol;
+ uint8_t l3_version;
+ boolean_t l3_dst_or_src;
+ uint_t l3_hdrsize;
+ boolean_t l3_fragmented;
+} flow_l3info_t;
+
+/* Layer 4 */
+typedef struct flow_l4info_s {
+ uchar_t *l4_start;
+ uint16_t l4_src_port;
+ uint16_t l4_dst_port;
+ uint16_t l4_hash_port;
+} flow_l4info_t;
+
+/*
+ * Combined state structure.
+ * Holds flow direction and an mblk_t pointer.
+ */
+struct flow_state_s {
+ uint_t fs_flags;
+ mblk_t *fs_mp;
+ flow_l2info_t fs_l2info;
+ flow_l3info_t fs_l3info;
+ flow_l4info_t fs_l4info;
+};
+
+/*
+ * Flow ops vector.
+ * There are two groups of functions. The ones ending with _fe are
+ * called when a flow is being added. The others (hash, accept) are
+ * called at flow lookup time.
+ */
+#define FLOW_MAX_ACCEPT 16
+typedef struct flow_ops_s {
+ /*
+ * fo_accept_fe():
+ * Validates the contents of the flow and checks whether
+	 *	it's compatible with the flow table. Sets the fe_match
+ * function of the flow.
+ */
+ int (*fo_accept_fe)(flow_tab_t *, flow_entry_t *);
+ /*
+ * fo_hash_fe():
+ * Generates a hash index to the flow table. This function
+ * must use the same algorithm as fo_hash(), which is used
+ * by the flow lookup code path.
+ */
+ uint32_t (*fo_hash_fe)(flow_tab_t *, flow_entry_t *);
+ /*
+ * fo_match_fe():
+ * This is used for finding identical flows.
+ */
+ boolean_t (*fo_match_fe)(flow_tab_t *, flow_entry_t *,
+ flow_entry_t *);
+ /*
+ * fo_insert_fe():
+ * Used for inserting a flow to a flow chain.
+ * Protocols that have special ordering requirements would
+ * need to implement this. For those that don't,
+ * flow_generic_insert_fe() may be used.
+ */
+ int (*fo_insert_fe)(flow_tab_t *, flow_entry_t **,
+ flow_entry_t *);
+
+ /*
+ * Calculates the flow hash index based on the accumulated
+ * state in flow_state_t. Must use the same algorithm as
+ * fo_hash_fe().
+ */
+ uint32_t (*fo_hash)(flow_tab_t *, flow_state_t *);
+
+ /*
+	 * Array of accept functions.
+ * Each function in the array will accumulate enough state
+ * (header length, protocol) to allow the next function to
+ * proceed. We support up to FLOW_MAX_ACCEPT functions which
+ * should be sufficient for all practical purposes.
+ */
+ int (*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *,
+ flow_state_t *);
+} flow_ops_t;
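+
+/*
+ * Sketch of the fo_hash()/fo_hash_fe() invariant (hypothetical hash
+ * functions, not the actual implementation): both must place the same
+ * flow in the same bucket, e.g. for an L2 table
+ *
+ *	fo_hash_fe: (flent)->fe_flow_desc.fd_dst_mac[5] % (ft)->ft_size
+ *	fo_hash:    (s)->fs_l2info.l2_daddr[5] % (ft)->ft_size
+ */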
+
+/*
+ * Generic flow table.
+ */
+struct flow_tab_s {
+ krwlock_t ft_lock;
+ /*
+ * Contains a list of functions (described above)
+ * specific to this table type.
+ */
+ flow_ops_t ft_ops;
+
+ /*
+ * Indicates what types of flows are supported.
+ */
+ flow_mask_t ft_mask;
+
+ /*
+ * An array of flow_entry_t * of size ft_size.
+ * Each element is the beginning of a hash chain.
+ */
+ flow_entry_t **ft_table;
+ uint_t ft_size;
+
+ /*
+ * The number of flows inserted into ft_table.
+ */
+ uint_t ft_flow_count;
+ struct mac_impl_s *ft_mip;
+ struct mac_client_impl_s *ft_mcip;
+};
+
+/*
+ * This is used for describing what type of flow table can be created.
+ * mac_flow.c contains a list of these structures.
+ */
+typedef struct flow_tab_info_s {
+ flow_ops_t *fti_ops;
+ flow_mask_t fti_mask;
+ uint_t fti_size;
+} flow_tab_info_t;
+
+#define FLOW_TAB_EMPTY(ft) ((ft) == NULL || (ft)->ft_flow_count == 0)
+
+/*
+ * This is used by mac_tx_send.
+ */
+typedef struct mac_tx_stats_s {
+ uint_t ts_opackets;
+ uint_t ts_obytes;
+ uint_t ts_oerrors;
+} mac_tx_stats_t;
+
+#define FLOW_STAT_UPDATE(f, s, c) { \
+ ((flow_entry_t *)(f))->fe_flowstats.fs_##s += ((uint64_t)(c)); \
+}
+
+#define FLOW_TX_STATS_UPDATE(f, s) { \
+ FLOW_STAT_UPDATE((f), opackets, (s)->ts_opackets); \
+ FLOW_STAT_UPDATE((f), obytes, (s)->ts_obytes); \
+ FLOW_STAT_UPDATE((f), oerrors, (s)->ts_oerrors); \
+}
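+
+/*
+ * Illustrative use (not part of this change): after a send completes,
+ * the Tx path would fold the per-call counters into the flow's stats:
+ *
+ *	mac_tx_stats_t stats;
+ *
+ *	... transmit the chain, filling in stats ...
+ *	FLOW_TX_STATS_UPDATE(flent, &stats);
+ */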
+
+extern void mac_flow_init(void);
+extern void mac_flow_fini(void);
+extern int mac_flow_create(flow_desc_t *, mac_resource_props_t *,
+ char *, void *, uint_t, flow_entry_t **);
+
+extern int mac_flow_add(flow_tab_t *, flow_entry_t *);
+extern int mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *,
+ boolean_t);
+extern int mac_flow_hash_add(flow_entry_t *);
+extern int mac_flow_lookup_byname(char *, flow_entry_t **);
+extern int mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t,
+ flow_entry_t **);
+
+extern int mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *),
+ void *);
+
+extern int mac_flow_walk_nolock(flow_tab_t *,
+ int (*)(flow_entry_t *, void *), void *);
+
+extern void mac_flow_modify(flow_tab_t *, flow_entry_t *,
+ mac_resource_props_t *);
+
+extern void *mac_flow_get_client_cookie(flow_entry_t *);
+
+extern uint32_t mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *);
+
+extern int mac_flow_update(flow_tab_t *, flow_entry_t *, flow_desc_t *);
+extern void mac_flow_get_desc(flow_entry_t *, flow_desc_t *);
+extern void mac_flow_set_desc(flow_entry_t *, flow_desc_t *);
+
+extern void mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t);
+extern void mac_flow_hash_remove(flow_entry_t *);
+extern void mac_flow_wait(flow_entry_t *, mac_flow_state_t);
+extern void mac_flow_quiesce(flow_entry_t *);
+extern void mac_flow_restart(flow_entry_t *);
+extern void mac_flow_cleanup(flow_entry_t *);
+extern void mac_flow_destroy(flow_entry_t *);
+
+extern void mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t,
+ struct mac_impl_s *, flow_tab_t **);
+extern void mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **);
+extern void mac_flow_tab_destroy(flow_tab_t *);
+extern void mac_flow_drop(void *, void *, mblk_t *);
+extern void flow_stat_destroy(flow_entry_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MAC_FLOW_IMPL_H */
diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h
index 6b36a978f0..9c8bfb7ce9 100644
--- a/usr/src/uts/common/sys/mac_impl.h
+++ b/usr/src/uts/common/sys/mac_impl.h
@@ -26,23 +26,17 @@
#ifndef _SYS_MAC_IMPL_H
#define _SYS_MAC_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/mac.h>
+#include <sys/modhash.h>
+#include <sys/mac_client.h>
+#include <sys/mac_provider.h>
#include <net/if.h>
+#include <sys/mac_flow_impl.h>
+#include <netinet/ip6.h>
#ifdef __cplusplus
extern "C" {
#endif
-typedef struct mac_multicst_addr_s mac_multicst_addr_t;
-
-struct mac_multicst_addr_s {
- mac_multicst_addr_t *mma_nextp;
- uint_t mma_ref;
- uint8_t mma_addr[MAXMACADDRLEN];
-};
-
typedef struct mac_margin_req_s mac_margin_req_t;
struct mac_margin_req_s {
@@ -51,31 +45,85 @@ struct mac_margin_req_s {
uint32_t mmr_margin;
};
-typedef struct mac_notify_fn_s mac_notify_fn_t;
+/* Generic linked chain type */
+typedef struct mac_chain_s {
+ struct mac_chain_s *next;
+ void *item;
+} mac_chain_t;
-struct mac_notify_fn_s {
- mac_notify_fn_t *mnf_nextp;
- mac_notify_t mnf_fn;
- void *mnf_arg;
-};
+/*
+ * Generic mac callback list manipulation structures and macros. The mac_cb_t
+ * represents a general callback list element embedded in a particular
+ * data structure such as a mac_notify_cb_t or a mac_promisc_impl_t.
+ * The mac_cb_info_t represents general information about list walkers.
+ * Please see the comments above mac_callback_add for more information.
+ */
+/* mcb_flags */
+#define MCB_CONDEMNED 0x1 /* Logically deleted */
+#define MCB_NOTIFY_CB_T 0x2
+#define MCB_TX_NOTIFY_CB_T 0x4
+
+typedef struct mac_cb_s {
+ struct mac_cb_s *mcb_nextp; /* Linked list of callbacks */
+ void *mcb_objp; /* Ptr to enclosing object */
+ size_t mcb_objsize; /* Sizeof the enclosing obj */
+ uint_t mcb_flags;
+} mac_cb_t;
+
+typedef struct mac_cb_info_s {
+ kmutex_t *mcbi_lockp;
+ kcondvar_t mcbi_cv;
+ uint_t mcbi_del_cnt; /* Deleted callback cnt */
+ uint_t mcbi_walker_cnt; /* List walker count */
+} mac_cb_info_t;
+
+typedef struct mac_notify_cb_s {
+ mac_cb_t mncb_link; /* Linked list of callbacks */
+ mac_notify_t mncb_fn; /* callback function */
+ void *mncb_arg; /* callback argument */
+ struct mac_impl_s *mncb_mip;
+} mac_notify_cb_t;
-typedef struct mac_rx_fn_s mac_rx_fn_t;
+/*
+ * mac_callback_add(listinfo, listhead, listelement)
+ * mac_callback_remove(listinfo, listhead, listelement)
+ */
+typedef boolean_t (*mcb_func_t)(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
-struct mac_rx_fn_s {
- mac_rx_fn_t *mrf_nextp;
- mac_rx_t mrf_fn;
- void *mrf_arg;
- boolean_t mrf_inuse;
- boolean_t mrf_active;
-};
+#define MAC_CALLBACK_WALKER_INC(mcbi) { \
+ mutex_enter((mcbi)->mcbi_lockp); \
+ (mcbi)->mcbi_walker_cnt++; \
+ mutex_exit((mcbi)->mcbi_lockp); \
+}
-typedef struct mac_txloop_fn_s mac_txloop_fn_t;
+#define MAC_CALLBACK_WALKER_INC_HELD(mcbi) (mcbi)->mcbi_walker_cnt++;
-struct mac_txloop_fn_s {
- mac_txloop_fn_t *mtf_nextp;
- mac_txloop_t mtf_fn;
- void *mtf_arg;
-};
+#define MAC_CALLBACK_WALKER_DCR(mcbi, headp) { \
+ mac_cb_t *rmlist; \
+ \
+ mutex_enter((mcbi)->mcbi_lockp); \
+ if (--(mcbi)->mcbi_walker_cnt == 0 && (mcbi)->mcbi_del_cnt != 0) { \
+ rmlist = mac_callback_walker_cleanup((mcbi), headp); \
+ mac_callback_free(rmlist); \
+ cv_broadcast(&(mcbi)->mcbi_cv); \
+ } \
+ mutex_exit((mcbi)->mcbi_lockp); \
+}
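+
+/*
+ * Illustrative walker pattern (a sketch, not part of this change):
+ * readers bracket the list traversal so that removals are deferred
+ * until the last walker drops out:
+ *
+ *	mac_cb_t *mcb;
+ *
+ *	MAC_CALLBACK_WALKER_INC(mcbi);
+ *	for (mcb = *headp; mcb != NULL; mcb = mcb->mcb_nextp) {
+ *		if (mcb->mcb_flags & MCB_CONDEMNED)
+ *			continue;
+ *		... invoke the callback embedded in mcb->mcb_objp ...
+ *	}
+ *	MAC_CALLBACK_WALKER_DCR(mcbi, headp);
+ */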
+
+#define MAC_PROMISC_WALKER_INC(mip) \
+ MAC_CALLBACK_WALKER_INC(&(mip)->mi_promisc_cb_info)
+
+#define MAC_PROMISC_WALKER_DCR(mip) { \
+ mac_cb_info_t *mcbi; \
+ \
+ mcbi = &(mip)->mi_promisc_cb_info; \
+ mutex_enter(mcbi->mcbi_lockp); \
+ if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) { \
+ i_mac_promisc_walker_cleanup(mip); \
+ cv_broadcast(&mcbi->mcbi_cv); \
+ } \
+ mutex_exit(mcbi->mcbi_lockp); \
+}
typedef struct mactype_s {
const char *mt_ident;
@@ -91,118 +139,354 @@ typedef struct mactype_s {
size_t mt_mappingcount;
} mactype_t;
+/*
+ * Multiple rings implementation.
+ */
+typedef enum {
+ MAC_GROUP_STATE_UNINIT = 0, /* initial state of data structure */
+ MAC_GROUP_STATE_REGISTERED, /* hooked with h/w group */
+ MAC_GROUP_STATE_RESERVED, /* group is reserved and opened */
+ MAC_GROUP_STATE_SHARED /* default group shared among */
+ /* multiple mac clients */
+} mac_group_state_t;
+
+typedef struct mac_ring_s mac_ring_t;
+typedef struct mac_group_s mac_group_t;
+
+/*
+ * Ring data structure for ring control and management.
+ */
+typedef enum {
+ MR_FREE, /* Available for assignment to flows */
+ MR_NEWLY_ADDED, /* Just assigned to another group */
+ MR_INUSE /* Assigned to an SRS */
+} mac_ring_state_t;
+
+/* mr_flag values */
+#define MR_INCIPIENT 0x1
+#define MR_CONDEMNED 0x2
+#define MR_QUIESCE 0x4
+
+struct mac_ring_s {
+ int mr_index; /* index in the original list */
+ mac_ring_type_t mr_type; /* ring type */
+ mac_ring_t *mr_next; /* next ring in the chain */
+ mac_group_handle_t mr_gh; /* reference to group */
+
+ mac_classify_type_t mr_classify_type; /* HW vs SW */
+ struct mac_soft_ring_set_s *mr_srs; /* associated SRS */
+ uint_t mr_refcnt; /* Ring references */
+ /* ring generation no. to guard against drivers using stale rings */
+ uint64_t mr_gen_num;
+
+ kmutex_t mr_lock;
+ kcondvar_t mr_cv; /* mr_lock */
+ mac_ring_state_t mr_state; /* mr_lock */
+ uint_t mr_flag; /* mr_lock */
+
+ mac_ring_info_t mr_info; /* driver supplied info */
+};
+#define mr_driver mr_info.mri_driver
+#define mr_start mr_info.mri_start
+#define mr_stop mr_info.mri_stop
+
+#define MAC_RING_MARK(mr, flag) \
+	(mr)->mr_flag |= (flag);
-#define MAC_VNIC_TXINFO_REFHOLD(mvt) { \
- mutex_enter(&(mvt)->mv_lock); \
- (mvt)->mv_refs++; \
- mutex_exit(&(mvt)->mv_lock); \
+#define MAC_RING_UNMARK(mr, flag) \
+	(mr)->mr_flag &= ~(flag);
+
+/*
+ * Reference hold and release on mac_ring_t 'mr'
+ */
+#define MR_REFHOLD_LOCKED(mr) { \
+ ASSERT(MUTEX_HELD(&mr->mr_lock)); \
+ (mr)->mr_refcnt++; \
}
-#define MAC_VNIC_TXINFO_REFRELE(mvt) { \
- mutex_enter(&(mvt)->mv_lock); \
- if (--(mvt)->mv_refs == 0 && (mvt)->mv_clearing) { \
- (mvt)->mv_clearing = B_FALSE; \
- cv_signal(&(mvt)->mv_cv); \
- } \
- mutex_exit(&(mvt)->mv_lock); \
+#define MR_REFRELE(mr) { \
+ mutex_enter(&(mr)->mr_lock); \
+ ASSERT((mr)->mr_refcnt != 0); \
+ (mr)->mr_refcnt--; \
+ if ((mr)->mr_refcnt == 0 && \
+ ((mr)->mr_flag & (MR_CONDEMNED | MR_QUIESCE))) \
+ cv_signal(&(mr)->mr_cv); \
+ mutex_exit(&(mr)->mr_lock); \
}
-typedef struct mac_vnic_tx_s {
- mac_txinfo_t mv_txinfo; /* provided by VNIC */
- uint32_t mv_refs;
- kmutex_t mv_lock;
- kcondvar_t mv_cv;
- boolean_t mv_clearing;
-} mac_vnic_tx_t;
+/*
+ * Per mac client flow information associated with an RX group.
+ * The entire structure is SL protected.
+ */
+typedef struct mac_grp_client {
+ struct mac_grp_client *mgc_next;
+ struct mac_client_impl_s *mgc_client;
+} mac_grp_client_t;
+
+#define MAC_RX_GROUP_NO_CLIENT(g) ((g)->mrg_clients == NULL)
+#define MAC_RX_GROUP_ONLY_CLIENT(g) \
+ ((((g)->mrg_clients != NULL) && \
+ ((g)->mrg_clients->mgc_next == NULL)) ? \
+ (g)->mrg_clients->mgc_client : NULL)
/*
- * Each registered MAC is associated with a mac_t structure.
+ * Common ring group data structure for ring control and management.
+ * The entire structure is SL protected.
*/
-typedef struct mac_impl_s {
+struct mac_group_s {
+ int mrg_index; /* index in the list */
+ mac_ring_type_t mrg_type; /* ring type */
+ mac_group_state_t mrg_state; /* state of the group */
+ mac_group_t *mrg_next; /* next ring in the chain */
+ mac_handle_t mrg_mh; /* reference to MAC */
+ mac_ring_t *mrg_rings; /* grouped rings */
+ uint_t mrg_cur_count; /* actual size of group */
+
+ mac_grp_client_t *mrg_clients; /* clients list */
+
+ struct mac_client_impl_s *mrg_tx_client; /* TX client pointer */
+ mac_group_info_t mrg_info; /* driver supplied info */
+};
+
+#define mrg_driver mrg_info.mgi_driver
+#define mrg_start mrg_info.mgi_start
+#define mrg_stop mrg_info.mgi_stop
+
+#define GROUP_INTR_HANDLE(g) (g)->mrg_info.mgi_intr.mi_handle
+#define GROUP_INTR_ENABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_enable
+#define GROUP_INTR_DISABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_disable
+
+#define MAC_DEFAULT_GROUP(mh) (((mac_impl_t *)mh)->mi_rx_groups)
+
+#define MAC_RING_TX_DEFAULT(mip, mp) \
+ ((mip->mi_default_tx_ring == NULL) ? \
+ mip->mi_tx(mip->mi_driver, mp) : \
+ mac_ring_tx(mip->mi_default_tx_ring, mp))
+
+#define MAC_TX(mip, ring, mp, mcip) { \
+ /* \
+ * If the MAC client has a bound Hybrid I/O share, \
+ * send the packet through the default tx ring, since \
+ * the tx rings of this client are now mapped in the \
+ * guest domain and not accessible from this domain. \
+ */ \
+ if (mcip->mci_share_bound || (ring == NULL)) \
+ mp = MAC_RING_TX_DEFAULT(mip, mp); \
+ else \
+ mp = mac_ring_tx(ring, mp); \
+}
+
+/* mci_tx_flag */
+#define MCI_TX_QUIESCE 0x1
+
+typedef struct mac_factory_addr_s {
+ boolean_t mfa_in_use;
+ uint8_t mfa_addr[MAXMACADDRLEN];
+ struct mac_client_impl_s *mfa_client;
+} mac_factory_addr_t;
+
+typedef struct mac_mcast_addrs_s {
+ struct mac_mcast_addrs_s *mma_next;
+ uint8_t mma_addr[MAXMACADDRLEN];
+ int mma_ref;
+} mac_mcast_addrs_t;
+
+typedef enum {
+ MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1, /* hardware steering */
+ MAC_ADDRESS_TYPE_UNICAST_PROMISC /* promiscuous mode */
+} mac_address_type_t;
+
+typedef struct mac_impl_s mac_impl_t;
+
+typedef struct mac_address_s {
+ mac_address_type_t ma_type; /* address type */
+ int ma_nusers; /* number of users */
+ /* of that address */
+ struct mac_address_s *ma_next; /* next address */
+ uint8_t ma_addr[MAXMACADDRLEN]; /* address value */
+ size_t ma_len; /* address length */
+	mac_group_t	*ma_group;	/* associated group */
+ mac_impl_t *ma_mip; /* MAC handle */
+} mac_address_t;
+
+extern krwlock_t i_mac_impl_lock;
+extern mod_hash_t *i_mac_impl_hash;
+extern kmem_cache_t *i_mac_impl_cachep;
+extern uint_t i_mac_impl_count;
+
+/*
+ * Each registered MAC is associated with a mac_impl_t structure. The
+ * structure represents the underlying hardware, in terms of definition,
+ * resources (transmit, receive rings etc.), callback functions etc. It
+ * also holds the table of MAC clients that are configured on the device.
+ * The table is used for classifying incoming packets in software.
+ *
+ * The protection scheme uses 2 elements, a coarse serialization mechanism
+ * called perimeter and a finer traditional lock based scheme. More details
+ * can be found in the big block comment in mac.c.
+ *
+ * The protection scheme for each member of the mac_impl_t is described below.
+ *
+ * Write Once Only (WO): Typically these don't change for the lifetime of the
+ * data structure. For example something in mac_impl_t that stays the same
+ * from mac_register to mac_unregister, or something in a mac_client_impl_t
+ * that stays the same from mac_client_open to mac_client_close.
+ *
+ * Serializer (SL): Protected by the Serializer. All SLOP operations on a
+ * mac endpoint go through the serializer. MTOPs don't care about reading
+ * these fields atomically.
+ *
+ * Lock: Traditional mutex/rw lock. Modify operations still go through the
+ * mac serializer, the lock helps synchronize readers with writers.
+ */
+struct mac_impl_s {
+ krwlock_t mi_rw_lock;
+ char mi_name[LIFNAMSIZ]; /* WO */
+ uint32_t mi_state_flags;
+ void *mi_driver; /* Driver private, WO */
+ mac_info_t mi_info; /* WO */
+ mactype_t *mi_type; /* WO */
+ void *mi_pdata; /* WO */
+ size_t mi_pdata_size; /* WO */
+ mac_callbacks_t *mi_callbacks; /* WO */
+ dev_info_t *mi_dip; /* WO */
+ uint32_t mi_ref; /* i_mac_impl_lock */
+ uint_t mi_active; /* SL */
+ link_state_t mi_linkstate; /* none */
+ link_state_t mi_lastlinkstate; /* none */
+ uint_t mi_promisc; /* SL */
+ uint_t mi_devpromisc; /* SL */
+ kmutex_t mi_lock;
+ uint8_t mi_addr[MAXMACADDRLEN]; /* mi_rw_lock */
+ uint8_t mi_dstaddr[MAXMACADDRLEN]; /* mi_rw_lock */
+
/*
- * The following fields are set in mac_register() and will not be
- * changed until mac_unregister(). No lock is needed to access them.
+ * The mac perimeter. All client initiated create/modify operations
+ * on a mac end point go through this.
*/
- char mi_name[LIFNAMSIZ];
- void *mi_driver; /* Driver private data */
- mac_info_t mi_info;
- mactype_t *mi_type;
- void *mi_pdata;
- size_t mi_pdata_size;
- mac_callbacks_t *mi_callbacks;
- dev_info_t *mi_dip;
- minor_t mi_minor;
- dev_t mi_phy_dev;
- kstat_t *mi_ksp;
- uint_t mi_kstat_count;
- mac_txinfo_t mi_txinfo;
- mac_txinfo_t mi_txloopinfo;
-
- krwlock_t mi_gen_lock;
- uint32_t mi_oref;
- uint32_t mi_ref;
- boolean_t mi_disabled;
- boolean_t mi_exclusive;
-
- krwlock_t mi_state_lock;
- uint_t mi_active;
-
- krwlock_t mi_data_lock;
- link_state_t mi_linkstate;
- link_state_t mi_lastlinkstate;
- uint_t mi_promisc;
- uint_t mi_devpromisc;
- uint8_t mi_addr[MAXMACADDRLEN];
- uint8_t mi_dstaddr[MAXMACADDRLEN];
- uint_t mi_sdu_min;
- uint_t mi_sdu_max;
- mac_multicst_addr_t *mi_mmap;
-
- krwlock_t mi_notify_lock;
- uint32_t mi_notify_bits;
- kmutex_t mi_notify_bits_lock;
- kthread_t *mi_notify_thread;
- mac_notify_fn_t *mi_mnfp;
- kcondvar_t mi_notify_cv;
-
- krwlock_t mi_rx_lock;
- mac_rx_fn_t *mi_mrfp;
- krwlock_t mi_tx_lock;
- mac_txloop_fn_t *mi_mtfp;
-
- krwlock_t mi_resource_lock;
- mac_resource_add_t mi_resource_add;
- void *mi_resource_add_arg;
-
- kmutex_t mi_activelink_lock;
- boolean_t mi_activelink;
-
- uint32_t mi_rx_ref; /* #threads in mac_rx() */
- uint32_t mi_rx_removed; /* #callbacks marked */
- /* for removal */
- kmutex_t mi_lock;
- kcondvar_t mi_rx_cv;
- boolean_t mi_shareable;
- boolean_t mi_vnic_present;
- mac_vnic_tx_t *mi_vnic_tx;
- mac_txinfo_t mi_vnic_txinfo;
- mac_txinfo_t mi_vnic_txloopinfo;
- mac_getcapab_t mi_vnic_getcapab_fn;
- void *mi_vnic_getcapab_arg;
-
- boolean_t mi_legacy;
- uint32_t mi_unsup_note;
- uint32_t mi_margin;
+ kmutex_t mi_perim_lock;
+ kthread_t *mi_perim_owner; /* mi_perim_lock */
+ uint_t mi_perim_ocnt; /* mi_perim_lock */
+ kcondvar_t mi_perim_cv; /* mi_perim_lock */
+
+ /* mac notification callbacks */
+ kmutex_t mi_notify_lock;
+ mac_cb_info_t mi_notify_cb_info; /* mi_notify_lock */
+ mac_cb_t *mi_notify_cb_list; /* mi_notify_lock */
+ kthread_t *mi_notify_thread; /* mi_notify_lock */
+ uint_t mi_notify_bits; /* mi_notify_lock */
+
+	uint32_t	mi_v12n_level;	/* virtualization readiness */
/*
+ * RX groups, ring capability
+ * Fields of this block are SL protected.
+ */
+ mac_group_type_t mi_rx_group_type; /* grouping type */
+ uint_t mi_rx_group_count;
+ mac_group_t *mi_rx_groups;
+
+ mac_capab_rings_t mi_rx_rings_cap;
+
+ /*
+ * TX groups and ring capability, SL Protected.
+ */
+ mac_group_type_t mi_tx_group_type; /* grouping type */
+ uint_t mi_tx_group_count;
+ uint_t mi_tx_group_free;
+ mac_group_t *mi_tx_groups;
+
+ mac_capab_rings_t mi_tx_rings_cap;
+
+ mac_ring_handle_t mi_default_tx_ring;
+
+ /*
+ * MAC address list. SL protected.
+ */
+ mac_address_t *mi_addresses;
+
+ /*
+ * This MAC's table of sub-flows
+ */
+ flow_tab_t *mi_flow_tab; /* WO */
+
+ kstat_t *mi_ksp; /* WO */
+ uint_t mi_kstat_count; /* WO */
+ uint_t mi_nactiveclients; /* SL */
+
+ /* for broadcast and multicast support */
+ struct mac_mcast_addrs_s *mi_mcast_addrs; /* mi_rw_lock */
+ struct mac_bcast_grp_s *mi_bcast_grp; /* mi_rw_lock */
+ uint_t mi_bcast_ngrps; /* mi_rw_lock */
+
+ /* list of MAC clients which opened this MAC */
+ struct mac_client_impl_s *mi_clients_list; /* mi_rw_lock */
+ uint_t mi_nclients; /* mi_rw_lock */
+
+ uint32_t mi_margin; /* mi_rw_lock */
+ uint_t mi_sdu_min; /* mi_rw_lock */
+ uint_t mi_sdu_max; /* mi_rw_lock */
+
+ /*
+ * Cache of factory MAC addresses provided by the driver. If
+ * the driver doesn't provide multiple factory MAC addresses,
+ * the mi_factory_addr is set to NULL, and mi_factory_addr_num
+ * is set to zero.
+ */
+ mac_factory_addr_t *mi_factory_addr; /* mi_rw_lock */
+ uint_t mi_factory_addr_num; /* mi_rw_lock */
+
+ /* for promiscuous mode support */
+ kmutex_t mi_promisc_lock;
+ mac_cb_t *mi_promisc_list; /* mi_promisc_lock */
+ mac_cb_info_t mi_promisc_cb_info; /* mi_promisc_lock */
+
+ /* cache of rings over this mac_impl */
+ kmutex_t mi_ring_lock;
+ mac_ring_t *mi_ring_freelist; /* mi_ring_lock */
+
+ /*
+ * These are used for caching the properties, if any, for the
+ * primary MAC client. If the MAC client is not yet in place
+ * when the properties are set then we cache them here to be
+ * applied to the MAC client when it is created.
+ */
+ mac_resource_props_t mi_resource_props; /* SL */
+
+ minor_t mi_minor; /* WO */
+ dev_t mi_phy_dev; /* WO */
+ uint32_t mi_oref; /* SL */
+ uint32_t mi_unsup_note; /* WO */
+ /*
* List of margin value requests added by mac clients. This list is
* sorted: the first one has the greatest value.
*/
mac_margin_req_t *mi_mmrp;
mac_priv_prop_t *mi_priv_prop;
uint_t mi_priv_prop_count;
-} mac_impl_t;
+
+ /*
+ * Hybrid I/O related definitions.
+ */
+ mac_capab_share_t mi_share_capab;
+
+/* This should be the last block in this structure */
+#ifdef DEBUG
+#define MAC_PERIM_STACK_DEPTH 15
+ int mi_perim_stack_depth;
+ pc_t mi_perim_stack[MAC_PERIM_STACK_DEPTH];
+#endif
+};
+
+/* for mi_state_flags */
+#define MIS_DISABLED 0x0001
+#define MIS_IS_VNIC 0x0002
+#define MIS_IS_AGGR 0x0004
+#define MIS_NOTIFY_DONE 0x0008
+#define MIS_EXCLUSIVE 0x0010
+#define MIS_EXCLUSIVE_HELD 0x0020
+#define MIS_LEGACY 0x0040
#define mi_getstat mi_callbacks->mc_getstat
#define mi_start mi_callbacks->mc_start
@@ -212,19 +496,193 @@ typedef struct mac_impl_s {
#define mi_setpromisc mi_callbacks->mc_setpromisc
#define mi_multicst mi_callbacks->mc_multicst
#define mi_unicst mi_callbacks->mc_unicst
-#define mi_resources mi_callbacks->mc_resources
#define mi_tx mi_callbacks->mc_tx
#define mi_ioctl mi_callbacks->mc_ioctl
#define mi_getcapab mi_callbacks->mc_getcapab
+typedef struct mac_notify_task_arg {
+ mac_impl_t *mnt_mip;
+ mac_notify_type_t mnt_type;
+ mac_ring_t *mnt_ring;
+} mac_notify_task_arg_t;
+
+typedef enum {
+ MAC_RX_NO_RESERVE,
+ MAC_RX_RESERVE_DEFAULT,
+ MAC_RX_RESERVE_NONDEFAULT
+} mac_rx_group_reserve_type_t;
+
+/*
+ * XXX All MAC_DBG_PRTs must be replaced with calls to dtrace probes. For
+ * now it is easier to keep these printfs around for debugging.
+ */
+#ifdef DEBUG
+extern int mac_dbg;
+#define MAC_DBG_PRT(a) if (mac_dbg > 0) {(void) printf a; }
+#else
+#define MAC_DBG_PRT(a)
+#endif
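+
+/*
+ * Note the double parentheses at call sites, since the macro hands its
+ * argument list directly to printf, e.g. (illustrative):
+ *
+ *	MAC_DBG_PRT(("mac: flow %s refcnt %d\n", name, refcnt));
+ */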
+
+/*
+ * The mac_perim_handle_t is an opaque type that encodes the 'mip' pointer
+ * and whether a mac_open was done internally when acquiring the perimeter.
+ */
+#define MAC_ENCODE_MPH(mph, mh, need_close) \
+	(mph) = (mac_perim_handle_t)((uintptr_t)(mh) | (need_close))
+
+#define MAC_DECODE_MPH(mph, mip, need_close) { \
+ mip = (mac_impl_t *)(((uintptr_t)mph) & ~0x1); \
+ (need_close) = ((uintptr_t)mph & 0x1); \
+}
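+
+/*
+ * Sketch (illustrative): since mac_impl_t pointers are at least 2-byte
+ * aligned, bit 0 of the handle is free to carry the need_close flag:
+ *
+ *	mac_perim_handle_t mph;
+ *	mac_impl_t *mip;
+ *	int need_close;
+ *
+ *	MAC_ENCODE_MPH(mph, mh, 1);
+ *	MAC_DECODE_MPH(mph, mip, need_close);	(need_close is now 1)
+ */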
+
+typedef struct mac_client_impl_s mac_client_impl_t;
+
extern void mac_init(void);
extern int mac_fini(void);
extern void mac_stat_create(mac_impl_t *);
extern void mac_stat_destroy(mac_impl_t *);
extern uint64_t mac_stat_default(mac_impl_t *, uint_t);
+extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *);
+extern void mac_create_soft_ring_kstats(mac_impl_t *, int32_t);
+extern boolean_t mac_ip_hdr_length_v6(mblk_t *, ip6_t *, uint16_t *,
+ uint8_t *);
+
+extern mblk_t *mac_copymsgchain_cksum(mblk_t *);
+extern mblk_t *mac_fix_cksum(mblk_t *);
+extern void mac_packet_print(mac_handle_t, mblk_t *);
+extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *,
+ mac_header_info_t *);
+extern void mac_tx_notify(mac_impl_t *);
+
+extern boolean_t mac_callback_find(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
+extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
+extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
+extern void mac_callback_remove_wait(mac_cb_info_t *);
+extern void mac_callback_free(mac_cb_t *);
+extern mac_cb_t *mac_callback_walker_cleanup(mac_cb_info_t *, mac_cb_t **);
+
+/* in mac_bcast.c */
+extern void mac_bcast_init(void);
+extern void mac_bcast_fini(void);
+extern mac_impl_t *mac_bcast_grp_mip(void *);
+extern int mac_bcast_add(mac_client_impl_t *, const uint8_t *, uint16_t,
+ mac_addrtype_t);
+extern void mac_bcast_delete(mac_client_impl_t *, const uint8_t *, uint16_t);
+extern void mac_bcast_send(void *, void *, mblk_t *, boolean_t);
+extern void mac_bcast_grp_free(void *);
+extern void mac_bcast_refresh(mac_impl_t *, mac_multicst_t, void *,
+ boolean_t);
+extern void mac_client_bcast_refresh(mac_client_impl_t *, mac_multicst_t,
+ void *, boolean_t);
-extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *);
+/*
+ * Grouping functions are used internally by the MAC layer.
+ */
+extern int mac_group_addmac(mac_group_t *, const uint8_t *);
+extern int mac_group_remmac(mac_group_t *, const uint8_t *);
+extern int mac_rx_group_add_flow(mac_client_impl_t *, flow_entry_t *,
+ mac_group_t *);
+extern mblk_t *mac_ring_tx(mac_ring_handle_t, mblk_t *);
+extern mac_ring_t *mac_reserve_tx_ring(mac_impl_t *, mac_ring_t *);
+extern void mac_release_tx_ring(mac_ring_handle_t);
+extern mac_group_t *mac_reserve_tx_group(mac_impl_t *, mac_share_handle_t);
+extern void mac_release_tx_group(mac_impl_t *, mac_group_t *);
+
+/*
+ * MAC address functions are used internally by MAC layer.
+ */
+extern mac_address_t *mac_find_macaddr(mac_impl_t *, uint8_t *);
+extern boolean_t mac_check_macaddr_shared(mac_address_t *);
+extern int mac_update_macaddr(mac_address_t *, uint8_t *);
+extern void mac_freshen_macaddr(mac_address_t *, uint8_t *);
+extern void mac_retrieve_macaddr(mac_address_t *, uint8_t *);
+extern void mac_init_macaddr(mac_impl_t *);
+extern void mac_fini_macaddr(mac_impl_t *);
+
+/*
+ * Flow construction/destruction routines.
+ * Not meant to be used by mac clients.
+ */
+extern int mac_link_flow_init(mac_client_handle_t, flow_entry_t *);
+extern void mac_link_flow_clean(mac_client_handle_t, flow_entry_t *);
+
+/*
+ * Called from mac_provider.c
+ */
+extern void mac_fanout_recompute(mac_impl_t *);
+
+/*
+ * The following functions are used internally by the MAC layer to
+ * add/remove/update flows associated with a mac_impl_t. They should
+ * never be used directly by MAC clients.
+ */
+extern int mac_datapath_setup(mac_client_impl_t *, flow_entry_t *, uint32_t);
+extern void mac_datapath_teardown(mac_client_impl_t *, flow_entry_t *,
+ uint32_t);
+extern void mac_srs_group_setup(mac_client_impl_t *, flow_entry_t *,
+ mac_group_t *, uint32_t);
+extern void mac_srs_group_teardown(mac_client_impl_t *, flow_entry_t *,
+ uint32_t);
+extern int mac_rx_classify_flow_quiesce(flow_entry_t *, void *);
+extern int mac_rx_classify_flow_restart(flow_entry_t *, void *);
+extern void mac_tx_client_quiesce(mac_client_impl_t *, uint_t);
+extern void mac_tx_client_restart(mac_client_impl_t *);
+extern void mac_client_quiesce(mac_client_impl_t *);
+extern void mac_client_restart(mac_client_impl_t *);
+
+extern void mac_flow_update_priority(mac_client_impl_t *, flow_entry_t *);
+
+extern void mac_flow_rem_subflow(flow_entry_t *);
+extern void mac_rename_flow(flow_entry_t *, const char *);
+extern void mac_flow_set_name(flow_entry_t *, const char *);
+
+extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t);
+extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t);
+extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *);
+extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t);
+extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *);
+
+extern void i_mac_share_alloc(mac_client_impl_t *);
+extern void i_mac_share_free(mac_client_impl_t *);
+extern void i_mac_perim_enter(mac_impl_t *);
+extern void i_mac_perim_exit(mac_impl_t *);
+extern int i_mac_perim_enter_nowait(mac_impl_t *);
+extern void i_mac_tx_srs_notify(mac_impl_t *, mac_ring_handle_t);
+extern int mac_hold(const char *, mac_impl_t **);
+extern void mac_rele(mac_impl_t *);
+extern int i_mac_disable(mac_impl_t *);
+extern void i_mac_notify(mac_impl_t *, mac_notify_type_t);
+extern void i_mac_notify_exit(mac_impl_t *);
+extern int mac_start(mac_impl_t *);
+extern void mac_stop(mac_impl_t *);
+extern void mac_rx_group_unmark(mac_group_t *, uint_t);
+extern void mac_tx_client_flush(mac_client_impl_t *);
+extern void mac_tx_client_block(mac_client_impl_t *);
+extern void mac_tx_client_unblock(mac_client_impl_t *);
+extern int i_mac_promisc_set(mac_impl_t *, boolean_t, mac_promisc_type_t);
+extern void i_mac_promisc_walker_cleanup(mac_impl_t *);
+extern mactype_t *mactype_getplugin(const char *);
+extern void mac_addr_factory_init(mac_impl_t *);
+extern void mac_addr_factory_fini(mac_impl_t *);
+extern void mac_register_priv_prop(mac_impl_t *, mac_priv_prop_t *, uint_t);
+extern void mac_unregister_priv_prop(mac_impl_t *);
+extern int mac_init_rings(mac_impl_t *, mac_ring_type_t);
+extern void mac_free_rings(mac_impl_t *, mac_ring_type_t);
+
+extern int mac_start_group(mac_group_t *);
+extern void mac_stop_group(mac_group_t *);
+extern int mac_start_ring(mac_ring_t *);
+extern void mac_stop_ring(mac_ring_t *);
+extern int mac_add_macaddr(mac_impl_t *, mac_group_t *, uint8_t *);
+extern int mac_remove_macaddr(mac_address_t *);
+
+extern void mac_set_rx_group_state(mac_group_t *, mac_group_state_t);
+extern void mac_rx_group_add_client(mac_group_t *, mac_client_impl_t *);
+extern void mac_rx_group_remove_client(mac_group_t *, mac_client_impl_t *);
+extern int i_mac_group_add_ring(mac_group_t *, mac_ring_t *, int);
+extern void i_mac_group_rem_ring(mac_group_t *, mac_ring_t *, boolean_t);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h
new file mode 100644
index 0000000000..9564efc00d
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_provider.h
@@ -0,0 +1,478 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MAC_PROVIDER_H
+#define _SYS_MAC_PROVIDER_H
+
+#include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/mac_flow.h>
+#include <sys/mac.h>
+
+/*
+ * MAC Provider Interface
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * MAC version identifier. This is used by mac_alloc() and mac_register() to
+ * verify that incompatible drivers don't register.
+ */
+#define MAC_VERSION 0x1
+
+/*
+ * Opaque handle types
+ */
+typedef struct __mac_rule_handle *mac_rule_handle_t;
+
+/*
+ * Statistics
+ */
+
+#define XCVR_UNDEFINED 0
+#define XCVR_NONE 1
+#define XCVR_10 2
+#define XCVR_100T4 3
+#define XCVR_100X 4
+#define XCVR_100T2 5
+#define XCVR_1000X 6
+#define XCVR_1000T 7
+
+#ifdef _KERNEL
+
+/*
+ * Definitions for MAC Drivers Capabilities
+ */
+/*
+ * MAC layer capabilities. These capabilities are handled by the drivers'
+ * mc_getcapab() callbacks. Some capabilities require the driver to fill
+ * in a given data structure, and others are simply boolean capabilities.
+ * Note that capability values must be powers of 2 so that consumers and
+ * providers of this interface can keep track of which capabilities they
+ * care about by keeping a bitfield of these things around somewhere.
+ */
+typedef enum {
+ /*
+ * Capabilities reserved for internal use only
+ */
+ MAC_CAPAB_VNIC = 0x0001, /* data is mac_capab_vnic_t */
+ MAC_CAPAB_ANCHOR_VNIC = 0x0002, /* boolean only, no data */
+ MAC_CAPAB_AGGR = 0x0004, /* data is mac_capab_aggr_t */
+ MAC_CAPAB_NO_NATIVEVLAN = 0x0008, /* boolean only, no data */
+ MAC_CAPAB_NO_ZCOPY = 0x0010, /* boolean only, no data */
+ MAC_CAPAB_LEGACY = 0x0020, /* data is mac_capab_legacy_t */
+
+ /*
+ * Public Capabilities
+ */
+ MAC_CAPAB_HCKSUM = 0x0100, /* data is a uint32_t */
+ MAC_CAPAB_LSO = 0x0200, /* data is mac_capab_lso_t */
+ MAC_CAPAB_RINGS = 0x0400, /* data is mac_capab_rings_t */
+	MAC_CAPAB_MULTIFACTADDR = 0x0800, /* data is mac_capab_multifactaddr_t */
+ MAC_CAPAB_SHARES = 0x1000 /* data is mac_capab_share_t */
+
+ /* add new capabilities here */
+} mac_capab_t;
+
+
+/*
+ * LSO capability
+ */
+typedef struct lso_basic_tcp_ipv4_s {
+ t_uscalar_t lso_max; /* maximum payload */
+} lso_basic_tcp_ipv4_t;
+
+/*
+ * Currently supported flags for LSO.
+ */
+#define LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */
+
+/*
+ * Future LSO capabilities can be added at the end of the mac_capab_lso_t.
+ * When such a capability is added to the GLDv3 framework, the size of the
+ * mac_capab_lso_t it allocates and passes to the drivers increases. Older
+ * drivers will access only the (upper) sections of that structure, that is,
+ * the sections carrying the capabilities they understand. This ensures the
+ * interface can be safely extended in a binary compatible way.
+ */
+typedef struct mac_capab_lso_s {
+ t_uscalar_t lso_flags;
+ lso_basic_tcp_ipv4_t lso_basic_tcp_ipv4;
+ /* Add future lso capabilities here */
+} mac_capab_lso_t;
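+
+/*
+ * Illustrative sketch (not part of this interface): a driver's
+ * mc_getcapab() entry point might answer a data-carrying capability
+ * such as MAC_CAPAB_LSO, and a boolean capability, as follows. The
+ * xx_* names are hypothetical driver code.
+ *
+ *	static boolean_t
+ *	xx_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
+ *	{
+ *		switch (cap) {
+ *		case MAC_CAPAB_LSO: {
+ *			mac_capab_lso_t *lso = cap_data;
+ *
+ *			lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
+ *			lso->lso_basic_tcp_ipv4.lso_max = 65535;
+ *			return (B_TRUE);
+ *		}
+ *		case MAC_CAPAB_NO_NATIVEVLAN:
+ *			return (B_TRUE);	/* boolean, no data */
+ *		default:
+ *			return (B_FALSE);	/* not supported */
+ *		}
+ *	}
+ */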
+
+/*
+ * Multiple Factory MAC Addresses Capability
+ */
+typedef struct mac_capab_multifactaddr_s {
+ /*
+ * Number of factory addresses
+ */
+ uint_t mcm_naddr;
+
+ /*
+ * Callbacks to query all the factory addresses.
+ */
+ void (*mcm_getaddr)(void *, uint_t, uint8_t *);
+} mac_capab_multifactaddr_t;
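+
+/*
+ * Sketch of a hypothetical mcm_getaddr callback: copy the factory
+ * address selected by "slot" into the caller-supplied buffer.
+ *
+ *	static void
+ *	xx_m_getfactaddr(void *arg, uint_t slot, uint8_t *addr)
+ *	{
+ *		xx_t *xxp = arg;
+ *
+ *		bcopy(xxp->xx_factaddr[slot], addr, ETHERADDRL);
+ *	}
+ */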
+
+/*
+ * MAC driver entry point types.
+ */
+typedef int (*mac_getstat_t)(void *, uint_t, uint64_t *);
+typedef int (*mac_start_t)(void *);
+typedef void (*mac_stop_t)(void *);
+typedef int (*mac_setpromisc_t)(void *, boolean_t);
+typedef int (*mac_multicst_t)(void *, boolean_t, const uint8_t *);
+typedef int (*mac_unicst_t)(void *, const uint8_t *);
+typedef void (*mac_ioctl_t)(void *, queue_t *, mblk_t *);
+typedef void (*mac_resources_t)(void *);
+typedef mblk_t *(*mac_tx_t)(void *, mblk_t *);
+typedef boolean_t (*mac_getcapab_t)(void *, mac_capab_t, void *);
+typedef int (*mac_open_t)(void *);
+typedef void (*mac_close_t)(void *);
+typedef int (*mac_set_prop_t)(void *, const char *, mac_prop_id_t,
+ uint_t, const void *);
+typedef int (*mac_get_prop_t)(void *, const char *, mac_prop_id_t,
+ uint_t, uint_t, void *, uint_t *);
+
+/*
+ * Drivers must set all of these callbacks except for mc_ioctl, mc_getcapab,
+ * mc_open, mc_close, mc_setprop and mc_getprop, which are optional. If any
+ * of these optional callbacks are set, their appropriate flags must be set
+ * in mc_callbacks.
+ * Any future additions to this list must also be accompanied by an
+ * associated mc_callbacks flag so that the framework can grow without
+ * affecting the binary compatibility of the interface.
+ */
+typedef struct mac_callbacks_s {
+ uint_t mc_callbacks; /* Denotes which callbacks are set */
+ mac_getstat_t mc_getstat; /* Get the value of a statistic */
+ mac_start_t mc_start; /* Start the device */
+ mac_stop_t mc_stop; /* Stop the device */
+ mac_setpromisc_t mc_setpromisc; /* Enable or disable promiscuous mode */
+ mac_multicst_t mc_multicst; /* Enable or disable a multicast addr */
+ mac_unicst_t mc_unicst; /* Set the unicast MAC address */
+ mac_tx_t mc_tx; /* Transmit a packet */
+ mac_ioctl_t mc_ioctl; /* Process an unknown ioctl */
+ mac_getcapab_t mc_getcapab; /* Get capability information */
+ mac_open_t mc_open; /* Open the device */
+ mac_close_t mc_close; /* Close the device */
+ mac_set_prop_t mc_setprop;
+ mac_get_prop_t mc_getprop;
+} mac_callbacks_t;
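+
+/*
+ * A minimal sketch of a driver callback table (hypothetical xx_*
+ * functions). Only the flags of the optional callbacks actually set
+ * belong in mc_callbacks; see the MC_* flags defined further below.
+ * Unset trailing members are zero-initialized.
+ *
+ *	static mac_callbacks_t xx_m_callbacks = {
+ *		MC_IOCTL | MC_GETCAPAB,		/* mc_callbacks */
+ *		xx_m_getstat,
+ *		xx_m_start,
+ *		xx_m_stop,
+ *		xx_m_setpromisc,
+ *		xx_m_multicst,
+ *		xx_m_unicst,
+ *		xx_m_tx,
+ *		xx_m_ioctl,
+ *		xx_m_getcapab
+ *	};
+ */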
+
+typedef struct mac_priv_prop_s {
+ char mpp_name[MAXLINKPROPNAME];
+ uint_t mpp_flags;
+} mac_priv_prop_t;
+
+/*
+ * Virtualization Capabilities
+ */
+/*
+ * The ordering of entries below is important. MAC_HW_CLASSIFIER
+ * is the cutoff below which are entries which don't depend on
+ * H/W. MAC_HW_CLASSIFIER and entries after that are cases where
+ * H/W has been updated through add/modify/delete APIs.
+ */
+typedef enum {
+ MAC_NO_CLASSIFIER = 0,
+ MAC_SW_CLASSIFIER,
+ MAC_HW_CLASSIFIER
+} mac_classify_type_t;
+
+typedef void (*mac_rx_func_t)(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+
+/*
+ * The virtualization level conveys the extent of the NIC hardware assistance
+ * for traffic steering employed for virtualization:
+ *
+ * MAC_VIRT_NONE: No assist for v12n.
+ *
+ * MAC_VIRT_LEVEL1: Multiple Rx rings with MAC address level
+ * classification between groups of rings.
+ * Requires the support of the MAC_CAPAB_RINGS
+ * capability.
+ *
+ * MAC_VIRT_HIO:	Hybrid I/O capable MAC. Requires the support
+ * of the MAC_CAPAB_SHARES capability.
+ *
+ * MAC_VIRT_SERIALIZE:	Temporary flag *ONLY* for nxge. The MAC layer
+ *			uses this to enable the mac Tx serializer on
+ * outbound traffic and to always enqueue
+ * incoming traffic on Rx soft rings in mac.
+ */
+#define MAC_VIRT_NONE 0x0
+#define MAC_VIRT_LEVEL1 0x1
+#define MAC_VIRT_HIO 0x2
+#define MAC_VIRT_SERIALIZE 0x4
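+
+/*
+ * A driver advertises its virtualization level through the m_v12n
+ * field of mac_register_t (defined below), e.g. (sketch) a Hybrid I/O
+ * capable NIC might set:
+ *
+ *	macp->m_v12n = MAC_VIRT_LEVEL1 | MAC_VIRT_HIO;
+ */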
+
+typedef enum {
+ MAC_RING_TYPE_RX = 1, /* Receive ring */
+ MAC_RING_TYPE_TX /* Transmit ring */
+} mac_ring_type_t;
+
+#define MAX_RINGS_PER_GROUP 32
+
+/*
+ * Grouping type of a ring group
+ *
+ * MAC_GROUP_TYPE_STATIC: The ring group cannot be re-grouped.
+ * MAC_GROUP_TYPE_DYNAMIC: The ring group supports dynamic re-grouping.
+ */
+typedef enum {
+ MAC_GROUP_TYPE_STATIC = 1, /* Static ring group */
+ MAC_GROUP_TYPE_DYNAMIC /* Dynamic ring group */
+} mac_group_type_t;
+
+typedef struct __mac_ring_driver *mac_ring_driver_t;
+typedef struct __mac_group_driver *mac_group_driver_t;
+
+typedef struct mac_ring_info_s mac_ring_info_t;
+typedef struct mac_group_info_s mac_group_info_t;
+
+typedef void (*mac_get_ring_t)(void *, mac_ring_type_t, const int, const int,
+ mac_ring_info_t *, mac_ring_handle_t);
+typedef void (*mac_get_group_t)(void *, mac_ring_type_t, const int,
+ mac_group_info_t *, mac_group_handle_t);
+
+typedef void (*mac_group_add_ring_t)(mac_group_driver_t,
+ mac_ring_driver_t, mac_ring_type_t);
+typedef void (*mac_group_rem_ring_t)(mac_group_driver_t,
+ mac_ring_driver_t, mac_ring_type_t);
+
+/*
+ * Multiple Rings Capability
+ */
+typedef struct mac_capab_rings_s {
+ mac_ring_type_t mr_type; /* Ring type: Rx vs Tx */
+ mac_group_type_t mr_group_type; /* Dynamic vs static grouping */
+ uint_t mr_rnum; /* Number of rings */
+ uint_t mr_gnum; /* Number of ring groups */
+ mac_get_ring_t mr_rget; /* Get ring from driver */
+ mac_get_group_t mr_gget; /* Get ring group from driver */
+ mac_group_add_ring_t mr_gaddring; /* Add ring into a group */
+ mac_group_rem_ring_t mr_gremring; /* Remove ring from a group */
+} mac_capab_rings_t;
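+
+/*
+ * Sketch: a driver with, say, 8 Rx rings in 2 static groups might
+ * answer MAC_CAPAB_RINGS from its mc_getcapab() entry point as below
+ * (hypothetical xx_* callbacks). The framework sets mr_type to the
+ * ring type it is querying before making the call.
+ *
+ *	case MAC_CAPAB_RINGS: {
+ *		mac_capab_rings_t *cap_rings = cap_data;
+ *
+ *		if (cap_rings->mr_type != MAC_RING_TYPE_RX)
+ *			return (B_FALSE);
+ *		cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ *		cap_rings->mr_rnum = 8;
+ *		cap_rings->mr_gnum = 2;
+ *		cap_rings->mr_rget = xx_fill_ring;
+ *		cap_rings->mr_gget = xx_fill_group;
+ *		return (B_TRUE);
+ *	}
+ */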
+
+/*
+ * Common ring functions and driver interfaces
+ */
+typedef int (*mac_ring_start_t)(mac_ring_driver_t, uint64_t);
+typedef void (*mac_ring_stop_t)(mac_ring_driver_t);
+
+typedef mblk_t *(*mac_ring_send_t)(void *, mblk_t *);
+typedef mblk_t *(*mac_ring_poll_t)(void *, int);
+
+typedef struct mac_ring_info_s {
+ mac_ring_driver_t mri_driver;
+ mac_ring_start_t mri_start;
+ mac_ring_stop_t mri_stop;
+ mac_intr_t mri_intr;
+ union {
+ mac_ring_send_t send;
+ mac_ring_poll_t poll;
+ } mrfunion;
+} mac_ring_info_s;
+
+#define mri_tx mrfunion.send
+#define mri_poll mrfunion.poll
+
+typedef int (*mac_group_start_t)(mac_group_driver_t);
+typedef void (*mac_group_stop_t)(mac_group_driver_t);
+typedef int (*mac_add_mac_addr_t)(void *, const uint8_t *);
+typedef int (*mac_rem_mac_addr_t)(void *, const uint8_t *);
+
+struct mac_group_info_s {
+ mac_group_driver_t mgi_driver; /* Driver reference */
+ mac_group_start_t mgi_start; /* Start the group */
+ mac_group_stop_t mgi_stop; /* Stop the group */
+ uint_t mgi_count; /* Count of rings */
+ mac_intr_t mgi_intr; /* Optional per-group intr */
+
+ /* Only used for rx groups */
+ mac_add_mac_addr_t mgi_addmac; /* Add a MAC address */
+ mac_rem_mac_addr_t mgi_remmac; /* Remove a MAC address */
+};
+
+/*
+ * Share management functions.
+ */
+typedef uint64_t mac_share_handle_t;
+
+/*
+ * Allocate and free a share. Returns ENOSPC if all shares have been
+ * previously allocated.
+ */
+typedef int (*mac_alloc_share_t)(void *, mac_share_handle_t *);
+typedef void (*mac_free_share_t)(mac_share_handle_t);
+
+/*
+ * Bind and unbind a share. Binding a share allows a domain
+ * to have direct access to the groups and rings associated with
+ * that share.
+ */
+typedef int (*mac_bind_share_t)(mac_share_handle_t, uint64_t, uint64_t *);
+typedef void (*mac_unbind_share_t)(mac_share_handle_t);
+
+/*
+ * Return information about a share.
+ */
+typedef void (*mac_share_query_t)(mac_share_handle_t, mac_ring_type_t,
+ mac_ring_handle_t *, uint_t *);
+
+/*
+ * The basic idea: bind previously created ring groups to shares so
+ * that they can be exported to (or shared with) another domain.
+ * These interfaces bind/unbind a ring group to/from a share.
+ * The groups and their rings will be shared with the guest
+ * as soon as the share is bound.
+ */
+typedef int (*mac_share_add_group_t)(mac_share_handle_t,
+ mac_group_driver_t);
+typedef int (*mac_share_rem_group_t)(mac_share_handle_t,
+ mac_group_driver_t);
+
+typedef struct mac_capab_share_s {
+ uint_t ms_snum; /* Number of shares (vr's) */
+ void *ms_handle; /* Handle to driver. */
+ mac_alloc_share_t ms_salloc; /* Get a share from driver. */
+ mac_free_share_t ms_sfree; /* Return a share to driver. */
+ mac_share_add_group_t ms_sadd; /* Add a group to the share. */
+ mac_share_rem_group_t ms_sremove; /* Remove group from share. */
+ mac_share_query_t ms_squery; /* Query share constraints */
+ mac_bind_share_t ms_sbind; /* Bind a share */
+ mac_unbind_share_t ms_sunbind; /* Unbind a share */
+} mac_capab_share_t;
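+
+/*
+ * Sketch: a Hybrid I/O capable driver might answer MAC_CAPAB_SHARES
+ * along these lines (hypothetical xx_* functions):
+ *
+ *	case MAC_CAPAB_SHARES: {
+ *		mac_capab_share_t *mshares = cap_data;
+ *
+ *		mshares->ms_snum = 8;		/* e.g. one per VR */
+ *		mshares->ms_handle = xxp;
+ *		mshares->ms_salloc = xx_share_alloc;
+ *		mshares->ms_sfree = xx_share_free;
+ *		mshares->ms_sadd = xx_share_add_group;
+ *		mshares->ms_sremove = xx_share_rem_group;
+ *		mshares->ms_squery = xx_share_query;
+ *		mshares->ms_sbind = xx_share_bind;
+ *		mshares->ms_sunbind = xx_share_unbind;
+ *		return (B_TRUE);
+ *	}
+ */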
+
+/*
+ * MAC registration interface
+ */
+typedef struct mac_register_s {
+ uint_t m_version; /* set by mac_alloc() */
+ const char *m_type_ident;
+ void *m_driver; /* Driver private data */
+ dev_info_t *m_dip;
+ uint_t m_instance;
+ uint8_t *m_src_addr;
+ uint8_t *m_dst_addr;
+ mac_callbacks_t *m_callbacks;
+ uint_t m_min_sdu;
+ uint_t m_max_sdu;
+ void *m_pdata;
+ size_t m_pdata_size;
+ uint32_t m_margin;
+ mac_priv_prop_t *m_priv_props;
+ size_t m_priv_prop_count;
+ uint32_t m_v12n; /* Virtualization level */
+} mac_register_t;
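+
+/*
+ * Sketch of the usual registration sequence in a driver's attach(9E)
+ * path (hypothetical xx_* names, error handling elided). mac_alloc()
+ * fills in m_version; the driver may free the mac_register_t with
+ * mac_free() as soon as mac_register() returns.
+ *
+ *	mac_register_t *macp;
+ *	int err;
+ *
+ *	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
+ *		return (DDI_FAILURE);
+ *	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+ *	macp->m_driver = xxp;
+ *	macp->m_dip = dip;
+ *	macp->m_src_addr = xxp->xx_curraddr;
+ *	macp->m_callbacks = &xx_m_callbacks;
+ *	macp->m_min_sdu = 0;
+ *	macp->m_max_sdu = ETHERMTU;
+ *	macp->m_margin = VLAN_TAGSZ;
+ *	err = mac_register(macp, &xxp->xx_mh);
+ *	mac_free(macp);
+ */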
+
+/*
+ * Flags for mc_callbacks. Requiring drivers to set the flags associated
+ * with optional callbacks initialized in the structure allows the mac
+ * module to add optional callbacks in the future without requiring drivers
+ * to recompile.
+ */
+#define MC_IOCTL 0x001
+#define MC_GETCAPAB 0x002
+#define MC_OPEN 0x004
+#define MC_CLOSE 0x008
+#define MC_SETPROP 0x010
+#define MC_GETPROP 0x020
+
+/*
+ * Driver interface functions.
+ */
+extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *);
+extern int mac_maxsdu_update(mac_handle_t, uint_t);
+extern int mac_set_prop(mac_handle_t, mac_prop_t *,
+ void *, uint_t);
+extern int mac_get_prop(mac_handle_t, mac_prop_t *,
+ void *, uint_t, uint_t *);
+
+extern mac_register_t *mac_alloc(uint_t);
+extern void mac_free(mac_register_t *);
+extern int mac_register(mac_register_t *, mac_handle_t *);
+extern int mac_disable_nowait(mac_handle_t);
+extern int mac_disable(mac_handle_t);
+extern int mac_unregister(mac_handle_t);
+extern void mac_rx(mac_handle_t, mac_resource_handle_t,
+ mblk_t *);
+extern void mac_rx_ring(mac_handle_t, mac_ring_handle_t,
+ mblk_t *, uint64_t);
+extern void mac_link_update(mac_handle_t, link_state_t);
+extern void mac_unicst_update(mac_handle_t,
+ const uint8_t *);
+extern void mac_tx_update(mac_handle_t);
+extern void mac_tx_ring_update(mac_handle_t,
+ mac_ring_handle_t);
+extern void mac_resource_update(mac_handle_t);
+extern void mac_capab_update(mac_handle_t);
+extern int mac_pdata_update(mac_handle_t, void *,
+ size_t);
+extern void mac_multicast_refresh(mac_handle_t,
+ mac_multicst_t, void *, boolean_t);
+extern void mac_unicst_refresh(mac_handle_t, mac_unicst_t,
+ void *);
+extern void mac_promisc_refresh(mac_handle_t,
+ mac_setpromisc_t, void *);
+extern boolean_t mac_margin_update(mac_handle_t, uint32_t);
+extern void mac_margin_get(mac_handle_t, uint32_t *);
+extern int mac_margin_remove(mac_handle_t, uint32_t);
+extern int mac_margin_add(mac_handle_t, uint32_t *,
+ boolean_t);
+extern void mac_init_ops(struct dev_ops *, const char *);
+extern void mac_fini_ops(struct dev_ops *);
+extern uint32_t mac_no_notification(mac_handle_t);
+
+extern mactype_register_t *mactype_alloc(uint_t);
+extern void mactype_free(mactype_register_t *);
+extern int mactype_register(mactype_register_t *);
+extern int mactype_unregister(const char *);
+extern void mac_set_ring(void *, void *);
+
+extern boolean_t mac_unicst_verify(mac_handle_t,
+ const uint8_t *, uint_t);
+
+extern boolean_t mac_is_vnic(mac_handle_t);
+
+extern int mac_group_add_ring(mac_group_handle_t, int);
+extern void mac_group_rem_ring(mac_group_handle_t,
+ mac_ring_handle_t);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MAC_PROVIDER_H */
diff --git a/usr/src/uts/common/sys/mac_soft_ring.h b/usr/src/uts/common/sys/mac_soft_ring.h
new file mode 100644
index 0000000000..45fcdf65bf
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_soft_ring.h
@@ -0,0 +1,724 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MAC_SOFT_RING_H
+#define _SYS_MAC_SOFT_RING_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/cpuvar.h>
+#include <sys/processor.h>
+#include <sys/stream.h>
+#include <sys/squeue.h>
+#include <sys/dlpi.h>
+#include <sys/mac_impl.h>
+
+#define S_RING_NAMELEN 64
+
+#define MAX_SR_FANOUT 32
+
+extern boolean_t mac_soft_ring_enable;
+extern boolean_t mac_latency_optimize;
+
+typedef struct mac_soft_ring_s mac_soft_ring_t;
+typedef struct mac_soft_ring_set_s mac_soft_ring_set_t;
+
+typedef void (*mac_soft_ring_drain_func_t)(mac_soft_ring_t *);
+typedef mac_tx_cookie_t (*mac_tx_func_t)(mac_soft_ring_set_t *, mblk_t *,
+ uintptr_t, uint16_t, mblk_t **);
+
+
+/* Tx notify callback */
+typedef struct mac_tx_notify_cb_s {
+ mac_cb_t mtnf_link; /* Linked list of callbacks */
+ mac_tx_notify_t mtnf_fn; /* The callback function */
+ void *mtnf_arg; /* Callback function argument */
+} mac_tx_notify_cb_t;
+
+struct mac_soft_ring_s {
+	/* Keep the most used members 64 bytes cache aligned */
+ kmutex_t s_ring_lock; /* lock before using any member */
+ uint16_t s_ring_type; /* processing model of the sq */
+ uint16_t s_ring_state; /* state flags and message count */
+ int s_ring_count; /* # of mblocks in mac_soft_ring */
+ size_t s_ring_size; /* Size of data queued */
+ mblk_t *s_ring_first; /* first mblk chain or NULL */
+ mblk_t *s_ring_last; /* last mblk chain or NULL */
+
+ mac_direct_rx_t s_ring_rx_func;
+ void *s_ring_rx_arg1;
+ mac_resource_handle_t s_ring_rx_arg2;
+
+ /*
+ * Threshold after which packets get dropped.
+	 * It is always greater than s_ring_tx_hiwat.
+ */
+ int s_ring_tx_max_q_cnt;
+ /* # of mblocks after which to apply flow control */
+ int s_ring_tx_hiwat;
+ /* # of mblocks after which to relieve flow control */
+ int s_ring_tx_lowat;
+ boolean_t s_ring_tx_woken_up;
+ uint32_t s_ring_blocked_cnt; /* times blocked for Tx descs */
+ uint32_t s_ring_unblocked_cnt; /* unblock calls from driver */
+ uint32_t s_ring_hiwat_cnt; /* times blocked for Tx descs */
+
+ void *s_ring_tx_arg1;
+ void *s_ring_tx_arg2;
+
+ /* Tx notify callback */
+ mac_cb_info_t s_ring_notify_cb_info; /* cb list info */
+ mac_cb_t *s_ring_notify_cb_list; /* The cb list */
+
+ clock_t s_ring_awaken; /* time async thread was awakened */
+
+ kthread_t *s_ring_run; /* Current thread processing sq */
+ processorid_t s_ring_cpuid; /* processor to bind to */
+ processorid_t s_ring_cpuid_save; /* saved cpuid during offline */
+ kcondvar_t s_ring_async; /* async thread blocks on */
+ clock_t s_ring_wait; /* lbolts to wait after a fill() */
+ timeout_id_t s_ring_tid; /* timer id of pending timeout() */
+ kthread_t *s_ring_worker; /* kernel thread id */
+ char s_ring_name[S_RING_NAMELEN + 1];
+ uint32_t s_ring_total_inpkt;
+ uint32_t s_ring_drops;
+ struct mac_client_impl_s *s_ring_mcip;
+ void *s_ring_flent;
+ kstat_t *s_ring_ksp;
+
+ /* Teardown, poll disable control ops */
+ kcondvar_t s_ring_client_cv; /* Client wait for control op */
+
+ mac_soft_ring_set_t *s_ring_set; /* The SRS this ring belongs to */
+ mac_soft_ring_t *s_ring_next;
+ mac_soft_ring_t *s_ring_prev;
+ mac_soft_ring_drain_func_t s_ring_drain_func;
+};
+
+typedef void (*mac_srs_drain_proc_t)(mac_soft_ring_set_t *, uint_t);
+
+/* Transmit side Soft Ring Set */
+typedef struct mac_srs_tx_s {
+ /* Members for Tx size processing */
+ uint32_t st_mode;
+ mac_tx_func_t st_func;
+ void *st_arg1;
+ void *st_arg2;
+ mac_group_t *st_group; /* TX group for share */
+ boolean_t st_woken_up;
+
+ /*
+ * st_max_q_cnt is the queue depth threshold to limit
+ * outstanding packets on the Tx SRS. Once the limit
+ * is reached, Tx SRS will drop packets until the
+ * limit goes below the threshold.
+ */
+ uint32_t st_max_q_cnt; /* max. outstanding packets */
+ /*
+	 * st_hiwat is used in Tx serializer and bandwidth mode.
+	 * This is the queue depth threshold up to which
+ * packets will get buffered with no flow-control
+ * back pressure applied to the caller. Once this
+ * threshold is reached, back pressure will be
+ * applied to the caller of mac_tx() (mac_tx() starts
+ * returning a cookie to indicate a blocked SRS).
+	 * st_hiwat should always be less than or equal to
+ * st_max_q_cnt.
+ */
+ uint32_t st_hiwat; /* mblk cnt to apply flow control */
+ uint32_t st_lowat; /* mblk cnt to relieve flow control */
+ uint32_t st_drop_count;
+ /*
+	 * The number of times the SRS gets blocked due to a lack of Tx
+	 * descriptors is noted down, as is the corresponding wakeup from
+	 * the driver to unblock it. The two counts should match in a
+	 * correctly working setup. If there are fewer unblocks than
+	 * blocks, the Tx side waits forever for a wakeup from below.
+	 * The following are protected by srs_lock.
+ */
+ uint32_t st_blocked_cnt; /* times blocked for Tx descs */
+ uint32_t st_unblocked_cnt; /* unblock calls from driver */
+ uint32_t st_hiwat_cnt; /* times blocked for Tx descs */
+} mac_srs_tx_t;
+
+/* Receive side Soft Ring Set */
+typedef struct mac_srs_rx_s {
+ /*
+ * Upcall Function for fanout, Rx processing etc. Perhaps
+ * the same 3 members below can be used for Tx
+ * processing, but looking around, mac_rx_func_t has
+ * proliferated too much into various files at different
+ * places. I am leaving the consolidation battle for
+ * another day.
+ */
+ mac_direct_rx_t sr_func; /* srs_lock */
+ void *sr_arg1; /* srs_lock */
+ mac_resource_handle_t sr_arg2; /* srs_lock */
+ mac_rx_func_t sr_lower_proc; /* Atomically changed */
+ boolean_t sr_enqueue_always; /* enqueue at soft ring */
+ uint32_t sr_poll_pkt_cnt;
+ uint32_t sr_poll_thres;
+
+ /* mblk cnt to apply flow control */
+ uint32_t sr_hiwat;
+ /* mblk cnt to relieve flow control */
+ uint32_t sr_lowat;
+ uint32_t sr_poll_count;
+ uint32_t sr_intr_count;
+ uint32_t sr_drop_count;
+
+ /* Times polling was enabled */
+ uint32_t sr_poll_on;
+ /* Times polling was enabled by worker thread */
+ uint32_t sr_worker_poll_on;
+ /* Times polling was disabled */
+ uint32_t sr_poll_off;
+ /* Poll thread signalled count */
+ uint32_t sr_poll_thr_sig;
+ /* Poll thread busy */
+ uint32_t sr_poll_thr_busy;
+ /* SRS drains, stays in poll mode but doesn't poll */
+ uint32_t sr_poll_drain_no_poll;
+ /*
+ * SRS has nothing to do and no packets in H/W but
+ * there is a backlog in softrings. SRS stays in
+ * poll mode but doesn't do polling.
+ */
+ uint32_t sr_poll_no_poll;
+ /* Active polling restarted */
+ uint32_t sr_below_hiwat;
+ /* Found packets in last poll so try and poll again */
+ uint32_t sr_poll_again;
+ /*
+ * Packets in queue but poll thread not allowed to process so
+ * signal the worker thread.
+ */
+ uint32_t sr_poll_sig_worker;
+ /*
+ * Poll thread has nothing to do and H/W has nothing so
+ * reenable the interrupts.
+ */
+ uint32_t sr_poll_intr_enable;
+ /*
+ * Poll thread has nothing to do and worker thread was already
+ * running so it can decide to reenable interrupt or poll again.
+ */
+ uint32_t sr_poll_goto_sleep;
+ /* Worker thread goes back to draining the queue */
+ uint32_t sr_drain_again;
+ /* More Packets in queue so signal the worker thread to drain */
+ uint32_t sr_drain_worker_sig;
+ /* Poll thread is already running so worker has nothing to do */
+ uint32_t sr_drain_poll_running;
+ /* We have packets already queued so keep polling */
+ uint32_t sr_drain_keep_polling;
+ /* Drain is done and interrupts are reenabled */
+ uint32_t sr_drain_finish_intr;
+ /* Polling thread needs to schedule worker wakeup */
+ uint32_t sr_poll_worker_wakeup;
+
+ /* Chains less than 10 pkts */
+ uint32_t sr_chain_cnt_undr10;
+ /* Chains between 10 & 50 pkts */
+ uint32_t sr_chain_cnt_10to50;
+ /* Chains over 50 pkts */
+ uint32_t sr_chain_cnt_over50;
+} mac_srs_rx_t;
+
+/*
+ * mac_soft_ring_set_s:
+ * This is used both for Tx and Rx side. The srs_type identifies Rx or
+ * Tx type.
+ *
+ * Note that the structure is carefully crafted, with Rx elements coming
+ * first followed by Tx specific members. Future additions to this
+ * structure should follow the same guidelines.
+ *
+ * Rx-side notes:
+ * mac_rx_classify_flow_add() always creates a mac_soft_ring_set_t and fn_flow
+ * points to info from it (func = srs_lower_proc, arg = soft_ring_set). On
+ * the interrupt path, srs_lower_proc does B/W adjustment, switches to polling
+ * mode (if poll capable) and feeds the packets to the soft_ring_list via the
+ * chosen fanout type (specified by srs_type). In poll mode, the poll thread
+ * picks up the packets and feeds them to the various soft_ring_lists.
+ *
+ * The srs_type can either be protocol based or fanout based, where the fanout
+ * itself can be of various types.
+ *
+ * The polling works by turning off interrupts as soon as packets
+ * are queued on the soft ring set. Once the backlog is clear and the poll
+ * thread returns empty-handed, i.e. the Rx ring doesn't have anything, the
+ * interrupt is turned back on. For this purpose we keep a separate
+ * srs_poll_pkt_cnt counter which tracks the packets queued between the SRS
+ * and the soft rings as well. The counter is incremented when packets
+ * are queued and decremented when the SRS processes them (in case it has
+ * no soft rings) or when the soft rings process them. It is important that
+ * in case the SRS has soft rings, the decrement doesn't happen till the
+ * packet is processed by the soft rings, since it takes very little time
+ * for the SRS to queue a packet to the soft rings, and the SRS would
+ * otherwise keep bringing packets into the system faster than the soft
+ * rings can process them.
+ *
+ * Tx side notes:
+ * The srs structure acts as a serializer with a worker thread. The
+ * default behavior of srs though is to act as a pass-thru. The queues
+ * (srs_first, srs_last, srs_count) get used when Tx ring runs out of Tx
+ * descriptors or to enforce bandwidth limits.
+ *
+ * When multiple Tx rings are present, the SRS state will be set to
+ * SRS_FANOUT_OTH. Outgoing packets coming into mac_tx_srs_process()
+ * function will be fanned out to one of the Tx side soft rings based on
+ * a hint passed in mac_tx_srs_process(). Each soft ring, in turn, will
+ * be associated with a distinct h/w Tx ring.
+ */
+
+struct mac_soft_ring_set_s {
+ /*
+ * Common elements, common to both Rx and Tx SRS type.
+ * The following block of fields are protected by srs_lock
+ */
+ kmutex_t srs_lock;
+ uint32_t srs_type;
+ uint32_t srs_state; /* state flags */
+ uint32_t srs_count;
+ mblk_t *srs_first; /* first mblk chain or NULL */
+ mblk_t *srs_last; /* last mblk chain or NULL */
+ kcondvar_t srs_async; /* cv for worker thread */
+ kcondvar_t srs_cv; /* cv for poll thread */
+ kcondvar_t srs_quiesce_done_cv; /* cv for removal */
+ timeout_id_t srs_tid; /* timeout id for pending timeout */
+
+ /*
+ * List of soft rings & processing function.
+	 * The following block is protected by Rx quiescence, i.e.
+	 * these fields can be changed only after quiescing the SRS,
+	 * and by srs_lock.
+ */
+ mac_soft_ring_t *srs_soft_ring_head;
+ mac_soft_ring_t *srs_soft_ring_tail;
+ int srs_soft_ring_count;
+ int srs_soft_ring_quiesced_count;
+ int srs_soft_ring_condemned_count;
+ mac_soft_ring_t **srs_tcp_soft_rings;
+ int srs_tcp_ring_count;
+ mac_soft_ring_t **srs_udp_soft_rings;
+ int srs_udp_ring_count;
+ /*
+	 * srs_oth_soft_rings is also used by the Tx SRS
+	 * when operating in multi Tx ring mode.
+ */
+ mac_soft_ring_t **srs_oth_soft_rings;
+ int srs_oth_ring_count;
+
+ /*
+ * Bandwidth control related members.
+ * They are common to both Rx- and Tx-side.
+ * Following protected by srs_lock
+ */
+ mac_bw_ctl_t *srs_bw;
+ size_t srs_size; /* Size of packets queued in bytes */
+ pri_t srs_pri;
+
+ mac_soft_ring_set_t *srs_next; /* mac_srs_g_lock */
+ mac_soft_ring_set_t *srs_prev; /* mac_srs_g_lock */
+
+ /* Attribute specific drain func (BW ctl vs non-BW ctl) */
+ mac_srs_drain_proc_t srs_drain_func; /* Write once (WO) */
+
+ /*
+ * If the associated ring is exclusively used by a mac client, e.g.,
+	 * an aggregation, this field is used to keep a reference to the
+ * MAC client's pseudo ring.
+ */
+ mac_resource_handle_t srs_mrh;
+ /*
+ * The following blocks are write once (WO) and valid for the life
+ * of the SRS
+ */
+ struct mac_client_impl_s *srs_mcip; /* back ptr to mac client */
+ void *srs_flent; /* back ptr to flent */
+ mac_ring_t *srs_ring; /* Ring Descriptor */
+
+ /* Teardown, disable control ops */
+ kcondvar_t srs_client_cv; /* Client wait for the control op */
+
+ kthread_t *srs_worker; /* WO, worker thread */
+ kthread_t *srs_poll_thr; /* WO, poll thread */
+
+	uint_t			srs_ind; /* Round Robin index for picking up SR */
+ processorid_t srs_worker_cpuid; /* processor to bind to */
+ processorid_t srs_worker_cpuid_save; /* saved cpuid during offline */
+ processorid_t srs_poll_cpuid; /* processor to bind to */
+ processorid_t srs_poll_cpuid_save; /* saved cpuid during offline */
+ uint_t srs_fanout_state;
+ mac_cpus_t srs_cpu;
+
+ mac_srs_rx_t srs_rx;
+ mac_srs_tx_t srs_tx;
+};
+
+/*
+ * type flags - combination allowed to process and drain the queue
+ */
+#define ST_RING_WORKER_ONLY 0x0001 /* Worker thread only */
+#define ST_RING_ANY 0x0002 /* Any thread can process the queue */
+#define ST_RING_TCP 0x0004
+#define ST_RING_UDP 0x0008
+#define ST_RING_OTH 0x0010
+
+#define ST_RING_BW_CTL 0x0020
+#define ST_RING_TX 0x0040
+
+/*
+ * State flags.
+ */
+#define S_RING_PROC 0x0001 /* being processed */
+#define S_RING_BOUND 0x0002 /* Worker thread is bound to a cpu */
+#define S_RING_BLOCK 0x0004 /* No Tx descs */
+#define S_RING_TX_HIWAT 0x0008 /* Tx high watermark reached */
+
+#define S_RING_WAKEUP_CLIENT 0x0010 /* flow ctrl, client wakeup needed */
+#define S_RING_BLANK 0x0020 /* Has been put into polling mode */
+#define S_RING_CLIENT_WAIT 0x0040 /* Client waiting for control op */
+
+#define S_RING_CONDEMNED 0x0100 /* Being torn down */
+#define S_RING_CONDEMNED_DONE 0x0200 /* Being torn down */
+#define S_RING_QUIESCE 0x0400 /* No traffic flow, transient flag */
+#define S_RING_QUIESCE_DONE 0x0800 /* No traffic flow, transient flag */
+
+#define S_RING_RESTART 0x1000 /* Go back to normal traffic flow */
+#define S_RING_ENQUEUED 0x2000 /* Pkts enqueued in Tx soft ring */
+
+/*
+ * arguments for processors to bind to
+ */
+#define S_RING_BIND_NONE -1
+
+/*
+ * defines for srs_type - identifies a link or a sub-flow
+ * and other static characteristics of a SRS like a tx
+ * srs, tcp only srs, etc.
+ */
+#define SRST_LINK 0x00000001
+#define SRST_FLOW 0x00000002
+#define SRST_NO_SOFT_RINGS 0x00000004
+#define SRST_TCP_ONLY 0x00000008
+
+#define SRST_FANOUT_PROTO 0x00000010
+#define SRST_FANOUT_SRC_IP 0x00000020
+#define SRST_FANOUT_OTH 0x00000040
+#define SRST_DEFAULT_GRP 0x00000080
+
+#define SRST_TX 0x00000100
+#define SRST_BW_CONTROL 0x00000200
+#define SRST_DIRECT_POLL 0x00000400
+
+#define SRST_DLS_BYPASS 0x00001000
+#define SRST_CLIENT_POLL_ENABLED 0x00002000
+
+/*
+ * soft ring set flags. These bits are dynamic in nature and get
+ * applied to srs_state. They reflect the state of the SRS at any
+ * point in time.
+ */
+#define SRS_BLANK 0x00000001
+#define SRS_WORKER_BOUND 0x00000002
+#define SRS_POLL_BOUND 0x00000004
+#define SRS_POLLING_CAPAB 0x00000008
+
+#define SRS_PROC 0x00000010
+#define SRS_GET_PKTS 0x00000020
+#define SRS_POLLING 0x00000040
+#define SRS_BW_ENFORCED 0x00000080
+
+#define SRS_WORKER 0x00000100
+#define SRS_ENQUEUED 0x00000200
+#define SRS_ANY_PROCESS 0x00000400
+#define SRS_PROC_FAST 0x00000800
+
+#define SRS_POLL_PROC 0x00001000
+#define SRS_TX_BLOCKED 0x00002000 /* out of Tx descs */
+#define SRS_TX_HIWAT 0x00004000 /* Tx count exceeds hiwat */
+#define SRS_TX_WAKEUP_CLIENT 0x00008000 /* Flow-ctl: wakeup client */
+
+#define SRS_CLIENT_PROC 0x00010000
+#define SRS_CLIENT_WAIT 0x00020000
+#define SRS_QUIESCE 0x00040000
+#define SRS_QUIESCE_DONE 0x00080000
+
+#define SRS_CONDEMNED 0x00100000
+#define SRS_CONDEMNED_DONE 0x00200000
+#define SRS_POLL_THR_QUIESCED 0x00400000
+#define SRS_RESTART 0x00800000
+
+#define SRS_RESTART_DONE 0x01000000
+#define SRS_POLL_THR_RESTART 0x02000000
+#define SRS_IN_GLIST 0x04000000
+#define SRS_POLL_THR_EXITED 0x08000000
+
+#define SRS_QUIESCE_PERM 0x10000000
+#define SRS_LATENCY_OPT 0x20000000
+
+#define SRS_QUIESCED(srs) (srs->srs_state & SRS_QUIESCE_DONE)
+
+/*
+ * If the SRS_QUIESCE_PERM flag is set, the SRS worker thread cannot
+ * be restarted.
+ */
+#define SRS_QUIESCED_PERMANENT(srs) (srs->srs_state & SRS_QUIESCE_PERM)
+
+/*
+ * soft ring set (SRS) Tx modes
+ */
+typedef enum {
+ SRS_TX_DEFAULT = 0,
+ SRS_TX_SERIALIZE,
+ SRS_TX_FANOUT,
+ SRS_TX_BW,
+ SRS_TX_BW_FANOUT
+} mac_tx_srs_mode_t;
+
+/*
+ * SRS fanout states
+ */
+typedef enum {
+ SRS_FANOUT_UNINIT = 0,
+ SRS_FANOUT_INIT,
+ SRS_FANOUT_REINIT
+} mac_srs_fanout_state_t;
+
+/*
+ * Structure for dls statistics
+ */
+struct dls_kstats {
+ kstat_named_t dlss_soft_ring_pkt_drop;
+};
+
+extern struct dls_kstats dls_kstat;
+
+#define DLS_BUMP_STAT(x, y) (dls_kstat.x.value.ui32 += y)
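+
+/*
+ * e.g. (sketch) DLS_BUMP_STAT(dlss_soft_ring_pkt_drop, cnt) adds cnt
+ * to the soft ring packet-drop kstat.
+ */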
+
+/* Turn dynamic polling off */
+#define MAC_SRS_POLLING_OFF(mac_srs) { \
+ ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
+ if (((mac_srs)->srs_state & (SRS_POLLING_CAPAB|SRS_POLLING)) == \
+ (SRS_POLLING_CAPAB|SRS_POLLING)) { \
+ (mac_srs)->srs_state &= ~SRS_POLLING; \
+ (void) mac_hwring_enable_intr((mac_ring_handle_t) \
+ (mac_srs)->srs_ring); \
+ (mac_srs)->srs_rx.sr_poll_off++; \
+ } \
+}
+
+#define MAC_COUNT_CHAIN(mac_srs, head, tail, cnt, sz) { \
+ mblk_t *tmp; \
+ boolean_t bw_ctl = B_FALSE; \
+ \
+ ASSERT((head) != NULL); \
+ cnt = 0; \
+ sz = 0; \
+ if ((mac_srs)->srs_type & SRST_BW_CONTROL) \
+ bw_ctl = B_TRUE; \
+ tmp = tail = (head); \
+ if ((head)->b_next == NULL) { \
+ cnt = 1; \
+ if (bw_ctl) \
+ sz += msgdsize(head); \
+ } else { \
+ while (tmp != NULL) { \
+ tail = tmp; \
+ cnt++; \
+ if (bw_ctl) \
+ sz += msgdsize(tmp); \
+ tmp = tmp->b_next; \
+ } \
+ } \
+}
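+
+/*
+ * Typical use (sketch): size up an incoming mblk chain before
+ * appending it to the SRS queue. Note that msgdsize() is only paid
+ * for when the SRS is bandwidth controlled.
+ *
+ *	mblk_t *tail;
+ *	int cnt;
+ *	size_t sz;
+ *
+ *	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ *	(then append mp_chain..tail and add cnt/sz to the SRS counts)
+ */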
+
+/*
+ * Decrement the cumulative packet count in the SRS and its
+ * soft rings. If the srs_poll_pkt_cnt goes below lowat, then check
+ * whether the interface was left in a polling mode and no one
+ * is really processing the queue (to get the interface out
+ * of poll mode). If no one is processing the queue, then
+ * acquire the PROC and signal the poll thread to check the
+ * interface for packets and get the interface back to interrupt
+ * mode if nothing is found.
+ */
+#define MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt) { \
+ mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
+ ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
+ \
+ srs_rx->sr_poll_pkt_cnt -= cnt; \
+ if ((srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_poll_thres) && \
+ (((mac_srs)->srs_state & \
+ (SRS_POLLING|SRS_PROC|SRS_GET_PKTS)) == SRS_POLLING)) \
+ { \
+ (mac_srs)->srs_state |= (SRS_PROC|SRS_GET_PKTS); \
+ cv_signal(&(mac_srs)->srs_cv); \
+ srs_rx->sr_below_hiwat++; \
+ } \
+}
+
+/*
+ * The following two macros are used to update the inbound packet and byte
+ * count. The packet and byte count reflect the packets and bytes that are
+ * taken out of the SRS's queue, i.e. indicating they are being delivered.
+ * The srs_count and srs_size are updated in different locations as the
+ * srs_size is also used to take into account any bandwidth limits. The
+ * srs_size is updated only when a soft ring, if any, sends a packet up,
+ * as opposed to updating it when the SRS sends a packet to the SR, i.e.
+ * the srs_size reflects the packets in the SRS and SRs. These
+ * macros decrement the srs_size and srs_count and also increment the
+ * ipackets and ibytes stats, respectively.
+ *
+ * xxx-venu These are done under srs_lock, for now we still update
+ * mci_stat_ibytes/mci_stat_ipackets atomically, need to check if
+ * just updating them would be accurate enough.
+ *
+ * If we are updating these for a sub-flow SRS, then we also need to
+ * update its MAC client bandwidth info, if the MAC client is also
+ * bandwidth regulated.
+ */
+#define MAC_UPDATE_SRS_SIZE_LOCKED(srs, sz) { \
+ if ((srs)->srs_type & SRST_BW_CONTROL) { \
+ mutex_enter(&(srs)->srs_bw->mac_bw_lock); \
+ (srs)->srs_bw->mac_bw_sz -= (sz); \
+ (srs)->srs_bw->mac_bw_used += (sz); \
+ mutex_exit(&(srs)->srs_bw->mac_bw_lock); \
+ } \
+}
+
+#define MAC_TX_UPDATE_BW_INFO(srs, sz) { \
+ (srs)->srs_bw->mac_bw_sz -= (sz); \
+ (srs)->srs_bw->mac_bw_used += (sz); \
+}
+
+#define TX_MULTI_RING_MODE(mac_srs) \
+ ((mac_srs)->srs_tx.st_mode == SRS_TX_FANOUT || \
+ (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)
+
+/* Soft ring flags for teardown */
+#define SRS_POLL_THR_OWNER (SRS_PROC | SRS_POLLING | SRS_GET_PKTS)
+#define SRS_PAUSE (SRS_CONDEMNED | SRS_QUIESCE)
+#define S_RING_PAUSE (S_RING_CONDEMNED | S_RING_QUIESCE)
+
+/* Soft rings */
+extern void mac_soft_ring_init(void);
+extern void mac_soft_ring_finish(void);
+extern void mac_fanout_setup(mac_client_impl_t *, flow_entry_t *,
+ mac_resource_props_t *, mac_direct_rx_t, void *, mac_resource_handle_t);
+
+extern void mac_soft_ring_worker_wakeup(mac_soft_ring_t *);
+extern void mac_soft_ring_blank(void *, time_t, uint_t, int);
+extern mblk_t *mac_soft_ring_poll(mac_soft_ring_t *, int);
+extern void mac_soft_ring_destroy(mac_soft_ring_t *);
+extern void mac_soft_ring_dls_bypass(void *, mac_direct_rx_t, void *);
+
+/* Rx SRS */
+extern mac_soft_ring_set_t *mac_srs_create(struct mac_client_impl_s *,
+ flow_entry_t *, uint32_t, mac_direct_rx_t, void *, mac_resource_handle_t,
+ mac_ring_t *);
+extern void mac_srs_free(mac_soft_ring_set_t *);
+extern void mac_srs_signal(mac_soft_ring_set_t *, uint_t);
+extern cpu_t *mac_srs_bind(mac_soft_ring_set_t *, processorid_t);
+
+extern void mac_srs_change_upcall(void *, mac_direct_rx_t, void *);
+extern void mac_srs_quiesce_initiate(mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_enable(struct mac_client_impl_s *,
+ mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_disable(struct mac_client_impl_s *,
+ mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_quiesce(struct mac_client_impl_s *,
+ mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_restart(struct mac_client_impl_s *,
+ mac_soft_ring_set_t *);
+extern void mac_rx_srs_quiesce(mac_soft_ring_set_t *, uint_t);
+extern void mac_rx_srs_restart(mac_soft_ring_set_t *);
+extern void mac_rx_srs_subflow_process(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+extern void mac_tx_srs_quiesce(mac_soft_ring_set_t *, uint_t);
+
+/* Tx SRS, Tx softring */
+extern void mac_tx_srs_wakeup(mac_soft_ring_set_t *, mac_ring_handle_t);
+extern void mac_tx_srs_setup(struct mac_client_impl_s *,
+ flow_entry_t *, uint32_t);
+extern mac_tx_func_t mac_tx_get_func(uint32_t);
+extern mblk_t *mac_tx_send(mac_client_handle_t, mac_ring_handle_t, mblk_t *,
+ mac_tx_stats_t *);
+extern boolean_t mac_tx_srs_ring_present(mac_soft_ring_set_t *, mac_ring_t *);
+extern void mac_tx_srs_add_ring(mac_soft_ring_set_t *, mac_ring_t *);
+extern void mac_tx_srs_del_ring(mac_soft_ring_set_t *, mac_ring_t *);
+extern mac_tx_cookie_t mac_tx_srs_no_desc(mac_soft_ring_set_t *, mblk_t *,
+ uint16_t, mblk_t **);
+
+/* Subflow specific stuff */
+extern int mac_srs_flow_create(struct mac_client_impl_s *, flow_entry_t *,
+ mac_resource_props_t *, int, int, mac_direct_rx_t);
+extern void mac_srs_update_bwlimit(flow_entry_t *, mac_resource_props_t *);
+extern void mac_srs_adjust_subflow_bwlimit(struct mac_client_impl_s *);
+extern void mac_srs_update_drv(struct mac_client_impl_s *);
+extern void mac_update_srs_priority(mac_soft_ring_set_t *, pri_t);
+extern void mac_client_update_classifier(mac_client_impl_t *, boolean_t);
+
+extern void mac_soft_ring_intr_enable(void *);
+extern void mac_soft_ring_intr_disable(void *);
+extern mac_soft_ring_t *mac_soft_ring_create(int, clock_t, void *, uint16_t,
+ pri_t, mac_client_impl_t *, mac_soft_ring_set_t *,
+ processorid_t, mac_direct_rx_t, void *, mac_resource_handle_t);
+extern cpu_t *mac_soft_ring_bind(mac_soft_ring_t *, processorid_t);
+extern void mac_soft_ring_unbind(mac_soft_ring_t *);
+extern void mac_soft_ring_free(mac_soft_ring_t *, boolean_t);
+extern void mac_soft_ring_signal(mac_soft_ring_t *, uint_t);
+extern void mac_rx_soft_ring_process(mac_client_impl_t *, mac_soft_ring_t *,
+ mblk_t *, mblk_t *, int, size_t);
+extern mac_tx_cookie_t mac_tx_soft_ring_process(mac_soft_ring_t *,
+ mblk_t *, uint16_t, mblk_t **);
+extern void mac_srs_worker_quiesce(mac_soft_ring_set_t *);
+extern void mac_srs_worker_restart(mac_soft_ring_set_t *);
+extern void mac_rx_attach_flow_srs(mac_impl_t *, flow_entry_t *,
+ mac_soft_ring_set_t *, mac_ring_t *, mac_classify_type_t);
+
+extern void mac_rx_srs_drain_bw(mac_soft_ring_set_t *, uint_t);
+extern void mac_rx_srs_drain(mac_soft_ring_set_t *, uint_t);
+extern void mac_rx_srs_process(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+extern void mac_srs_worker(mac_soft_ring_set_t *);
+extern void mac_rx_srs_poll_ring(mac_soft_ring_set_t *);
+extern void mac_tx_srs_drain(mac_soft_ring_set_t *, uint_t);
+
+extern void mac_tx_srs_restart(mac_soft_ring_set_t *);
+extern void mac_rx_srs_remove(mac_soft_ring_set_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MAC_SOFT_RING_H */
diff --git a/usr/src/uts/common/sys/modhash.h b/usr/src/uts/common/sys/modhash.h
index 5860ad165a..68d1c4dedd 100644
--- a/usr/src/uts/common/sys/modhash.h
+++ b/usr/src/uts/common/sys/modhash.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_MODHASH_H
#define _SYS_MODHASH_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Generic hash implementation for the kernel.
*/
@@ -129,6 +126,8 @@ int mod_hash_destroy(mod_hash_t *, mod_hash_key_t);
int mod_hash_find(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
int mod_hash_find_cb(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *,
void (*)(mod_hash_key_t, mod_hash_val_t));
+int mod_hash_find_cb_rval(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *,
+ int (*)(mod_hash_key_t, mod_hash_val_t), int *);
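+/*
+ * Sketch (assumed semantics, mirroring mod_hash_find_cb()): on a hit
+ * the callback runs on the key/value pair and its return value is
+ * passed back through the final argument, e.g.:
+ *
+ *	int cb_rval;
+ *
+ *	if (mod_hash_find_cb_rval(hash, key, &val, my_cb, &cb_rval) == 0)
+ *		... found; cb_rval holds my_cb's return value ...
+ */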
void mod_hash_walk(mod_hash_t *,
uint_t (*)(mod_hash_key_t, mod_hash_val_t *, void *), void *);
diff --git a/usr/src/uts/common/sys/nxge/nxge.h b/usr/src/uts/common/sys/nxge/nxge.h
index 37cd6db405..624e433572 100644
--- a/usr/src/uts/common/sys/nxge/nxge.h
+++ b/usr/src/uts/common/sys/nxge/nxge.h
@@ -319,6 +319,7 @@ typedef struct _filter_t {
uint32_t all_sap_cnt;
} filter_t, *p_filter_t;
+
typedef struct _nxge_port_stats_t {
/*
* Overall structure size
@@ -470,6 +471,8 @@ typedef struct _nxge_stats_t {
} nxge_stats_t, *p_nxge_stats_t;
+
+
typedef struct _nxge_intr_t {
boolean_t intr_registered; /* interrupts are registered */
boolean_t intr_enabled; /* interrupts are enabled */
@@ -497,7 +500,7 @@ typedef struct _nxge_ldgv_t {
p_nxge_ldg_t ldgp;
p_nxge_ldv_t ldvp;
p_nxge_ldv_t ldvp_syserr;
- int ldvp_syserr_allocated;
+ boolean_t ldvp_syserr_alloced;
} nxge_ldgv_t, *p_nxge_ldgv_t;
typedef enum {
@@ -542,7 +545,8 @@ typedef struct {
#define NXGE_DC_SET(map, channel) map |= (1 << channel)
#define NXGE_DC_RESET(map, channel) map &= (~(1 << channel))
-#define NXGE_LOGICAL_GROUP_MAX NXGE_MAX_TDCS
+/* For now, we only support up to 8 RDC/TDC groups */
+#define NXGE_LOGICAL_GROUP_MAX NXGE_MAX_RDC_GROUPS
typedef struct {
int sequence; /* To order groups in time. */
@@ -558,6 +562,12 @@ typedef struct {
} nxge_grp_set_t;
/*
+ * Transmit Ring Group
+ * TX groups will be used exclusively for the purpose of Hybrid I/O. From
+ * the point of view of the nxge driver, the groups will be software
+ * constructs which will be used to establish the relationship between TX
+ * rings and shares.
+ *
* Receive Ring Group
* One of the advanced virtualization features is the ability to bundle
* multiple Receive Rings in a single group. One or more MAC addresses may
@@ -567,12 +577,16 @@ typedef struct {
* RX ring groups can come with a predefined set of member rings, or they
* are programmable by adding and removing rings to/from them.
*/
-typedef struct _nxge_rx_ring_group_t {
+typedef struct _nxge_ring_group_t {
mac_group_handle_t ghandle;
p_nxge_t nxgep;
+ boolean_t started;
+ mac_ring_type_t type;
int gindex;
int sindex;
-} nxge_rx_ring_group_t;
+ int rdctbl;
+ int n_mac_addrs;
+} nxge_ring_group_t;
/*
* Ring Handle
@@ -581,7 +595,7 @@ typedef struct _nxge_ring_handle_t {
p_nxge_t nxgep;
int index; /* port-wise */
mac_ring_handle_t ring_handle;
-} nxge_ring_handle_t;
+} nxge_ring_handle_t, *p_nxge_ring_handle_t;
/*
* Share Handle
@@ -613,9 +627,6 @@ struct _nxge_t {
uint64_t nxge_debug_level; /* driver state bit flags */
kmutex_t genlock[1];
enum nxge_mac_state nxge_mac_state;
- ddi_softintr_t resched_id; /* reschedule callback */
- boolean_t resched_needed;
- boolean_t resched_running;
p_dev_regs_t dev_regs;
npi_handle_t npi_handle;
@@ -695,17 +706,12 @@ struct _nxge_t {
p_rx_rcr_rings_t rx_rcr_rings;
p_rx_mbox_areas_t rx_mbox_areas_p;
- uint32_t start_rdc;
- uint32_t max_rdcs;
uint32_t rdc_mask;
/* Transmit descriptors rings */
p_tx_rings_t tx_rings;
p_tx_mbox_areas_t tx_mbox_areas_p;
- uint32_t start_tdc;
- uint32_t max_tdcs;
-
ddi_dma_handle_t dmasparehandle;
ulong_t sys_page_sz;
@@ -777,7 +783,15 @@ struct _nxge_t {
nxge_grp_set_t tx_set;
boolean_t tdc_is_shared[NXGE_MAX_TDCS];
- nxge_rx_ring_group_t rx_hio_groups[NXGE_MAX_RDC_GROUPS];
+ boolean_t rx_channel_started[NXGE_MAX_RDCS];
+
+ /* Ring Handles */
+ nxge_ring_handle_t tx_ring_handles[NXGE_MAX_TDCS];
+ nxge_ring_handle_t rx_ring_handles[NXGE_MAX_RDCS];
+
+ nxge_ring_group_t tx_hio_groups[NXGE_MAX_TDC_GROUPS];
+ nxge_ring_group_t rx_hio_groups[NXGE_MAX_RDC_GROUPS];
+
nxge_share_handle_t shares[NXGE_MAX_VRS];
};
diff --git a/usr/src/uts/common/sys/nxge/nxge_common.h b/usr/src/uts/common/sys/nxge/nxge_common.h
index f2bbc8e064..7956b5f653 100644
--- a/usr/src/uts/common/sys/nxge/nxge_common.h
+++ b/usr/src/uts/common/sys/nxge/nxge_common.h
@@ -277,15 +277,24 @@ typedef struct nxge_tdc_cfg {
#define RDC_TABLE_ENTRY_METHOD_SEQ 0
#define RDC_TABLE_ENTRY_METHOD_REP 1
+/* per transmit DMA channel table group data structure */
+typedef struct nxge_tdc_grp {
+ uint32_t start_tdc; /* assume assigned in sequence */
+ uint8_t max_tdcs;
+ dc_map_t map;
+ uint8_t grp_index; /* nxge_t.tx_set.group[grp_index] */
+} nxge_tdc_grp_t, *p_nxge_tdc_grp_t;
+
/* per receive DMA channel table group data structure */
typedef struct nxge_rdc_grp {
- uint32_t flag; /* 0: not configured 1: configured */
+ boolean_t flag; /* 0: not configured 1: configured */
uint8_t port;
- uint8_t start_rdc; /* assume assigned in sequence */
+ uint32_t start_rdc; /* assume assigned in sequence */
uint8_t max_rdcs;
uint8_t def_rdc;
dc_map_t map;
uint16_t config_method;
+ uint8_t grp_index; /* nxge_t.rx_set.group[grp_index] */
} nxge_rdc_grp_t, *p_nxge_rdc_grp_t;
#define RDC_MAP_IN(map, rdc) \
@@ -383,7 +392,6 @@ typedef struct nxge_hw_pt_cfg {
uint32_t ser_ldvid;
uint32_t def_rdc; /* default RDC */
uint32_t drr_wt; /* port DRR weight */
- uint32_t start_grpid; /* starting group ID */
uint32_t max_grpids; /* max group ID */
uint32_t grpids[NXGE_MAX_RDCS]; /* RDC group IDs */
uint32_t max_rdc_grpids; /* max RDC group ID */
@@ -393,6 +401,7 @@ typedef struct nxge_hw_pt_cfg {
uint32_t start_mac_entry; /* where to put the first mac */
uint32_t max_macs; /* the max mac entry allowed */
uint32_t mac_pref; /* preference over VLAN */
+ uint32_t def_mac_txdma_grpid; /* default TDC group ID */
uint32_t def_mac_rxdma_grpid; /* default RDC group ID */
uint32_t vlan_pref; /* preference over MAC */
@@ -417,6 +426,9 @@ typedef struct nxge_dma_pt_cfg {
*/
uint32_t tx_dma_map; /* Transmit DMA channel bit map */
+ /* Transmit DMA channel: device wise */
+ nxge_tdc_grp_t tdc_grps[NXGE_MAX_TDC_GROUPS];
+
/* Receive DMA channel */
nxge_rdc_grp_t rdc_grps[NXGE_MAX_RDC_GROUPS];
diff --git a/usr/src/uts/common/sys/nxge/nxge_defs.h b/usr/src/uts/common/sys/nxge/nxge_defs.h
index db061381da..8f8e226b32 100644
--- a/usr/src/uts/common/sys/nxge/nxge_defs.h
+++ b/usr/src/uts/common/sys/nxge/nxge_defs.h
@@ -278,6 +278,12 @@ extern "C" {
*/
#define NXGE_MAX_VRS 8
+/*
+ * TDC groups are used exclusively for the purpose of Hybrid I/O.
+ * TX needs one group for each VR.
+ */
+#define NXGE_MAX_TDC_GROUPS (NXGE_MAX_VRS)
+
/* Max. RDC table groups */
#define NXGE_MAX_RDC_GROUPS 8
#define NXGE_MAX_RDCS 16
diff --git a/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h b/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h
index fc99701ca3..d7270a6fb1 100644
--- a/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h
+++ b/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h
@@ -18,7 +18,6 @@
*
* CDDL HEADER END
*/
-
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
@@ -33,6 +32,7 @@ extern "C" {
#include <nxge_defs.h>
+
/* FZC_FFLP Offsets */
#define FFLP_ENET_VLAN_TBL_REG (FZC_FFLP + 0x00000)
@@ -1284,6 +1284,7 @@ typedef struct tcam_entry {
* before this header file.
* Need to move these includes to impl files ...
*/
+
#include <netinet/in.h>
typedef union flow_template {
diff --git a/usr/src/uts/common/sys/nxge/nxge_flow.h b/usr/src/uts/common/sys/nxge/nxge_flow.h
index 352834d796..c76f2731a1 100644
--- a/usr/src/uts/common/sys/nxge/nxge_flow.h
+++ b/usr/src/uts/common/sys/nxge/nxge_flow.h
@@ -18,7 +18,6 @@
*
* CDDL HEADER END
*/
-
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
diff --git a/usr/src/uts/common/sys/nxge/nxge_hio.h b/usr/src/uts/common/sys/nxge/nxge_hio.h
index 2a25341111..10487202b6 100644
--- a/usr/src/uts/common/sys/nxge/nxge_hio.h
+++ b/usr/src/uts/common/sys/nxge/nxge_hio.h
@@ -34,7 +34,7 @@ extern "C" {
#include <nxge_mac.h>
#include <nxge_ipp.h>
#include <nxge_fflp.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#if defined(sun4v)
#include <sys/vnet_res.h>
#endif
@@ -249,9 +249,10 @@ typedef struct nxge_hio_vr {
size_t size;
vr_region_t region; /* 1 of 8 regions. */
- uint8_t rdc_tbl; /* 1 of 8 RDC tables. */
+ int rdc_tbl; /* 1 of 8 RDC tables. */
+ int tdc_tbl; /* 1 of 8 TDC tables. */
ether_addr_t altmac; /* The alternate MAC address. */
- mac_addr_slot_t slot; /* According to nxge_m_mmac_add(). */
+ int slot; /* According to nxge_m_mmac_add(). */
#if defined(sun4v)
vio_net_handle_t vhp; /* The handle given to us by the vnet. */
@@ -369,12 +370,18 @@ extern const char *nxge_ddi_perror(int);
*/
extern void nxge_hio_group_get(void *arg, mac_ring_type_t type, int group,
mac_group_info_t *infop, mac_group_handle_t ghdl);
-extern int nxge_hio_share_alloc(void *arg, uint64_t cookie, uint64_t *rcookie,
- mac_share_handle_t *shandle);
+extern int nxge_hio_share_alloc(void *arg, mac_share_handle_t *shandle);
extern void nxge_hio_share_free(mac_share_handle_t shandle);
extern void nxge_hio_share_query(mac_share_handle_t shandle,
- mac_ring_type_t type, uint32_t *rmin, uint32_t *rmax, uint64_t *rmap,
- uint64_t *gnum);
+ mac_ring_type_t type, mac_ring_handle_t *rings, uint_t *n_rings);
+extern int nxge_hio_share_add_group(mac_share_handle_t,
+ mac_group_driver_t);
+extern int nxge_hio_share_rem_group(mac_share_handle_t,
+ mac_group_driver_t);
+extern int nxge_hio_share_bind(mac_share_handle_t, uint64_t cookie,
+ uint64_t *rcookie);
+extern void nxge_hio_share_unbind(mac_share_handle_t);
+
/* nxge_hio_guest.c */
extern void nxge_hio_unregister(nxge_t *);
@@ -416,12 +423,6 @@ extern int nxge_hio_hostinfo_get_rdc_table(p_nxge_t);
extern int nxge_hio_hostinfo_init(nxge_t *, nxge_hio_vr_t *, ether_addr_t *);
extern void nxge_hio_hostinfo_uninit(nxge_t *, nxge_hio_vr_t *);
- /* nxge_rxdma.c */
-extern nxge_status_t nxge_rx_poll(nxge_t *, int);
-
- /* nxge_txdma.c */
-extern uint_t nxge_tx_poll(nxge_t *, int);
-
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/nxge/nxge_impl.h b/usr/src/uts/common/sys/nxge/nxge_impl.h
index 5420ac00bb..63779b4e88 100644
--- a/usr/src/uts/common/sys/nxge/nxge_impl.h
+++ b/usr/src/uts/common/sys/nxge/nxge_impl.h
@@ -36,6 +36,8 @@ extern "C" {
#define NIU_MAJOR_VER 1
#define NIU_MINOR_VER 1
+#if defined(sun4v)
+
/*
* NIU HV API v1.0 definitions
*/
@@ -44,6 +46,8 @@ extern "C" {
#define N2NIU_TX_LP_CONF 0x144
#define N2NIU_TX_LP_INFO 0x145
+#endif /* defined(sun4v) */
+
#ifndef _ASM
#include <sys/types.h>
@@ -81,8 +85,7 @@ extern "C" {
#include <sys/netlb.h>
#include <sys/ddi_intr.h>
-#include <sys/mac.h>
-#include <sys/mac_impl.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#if defined(sun4v)
@@ -611,7 +614,6 @@ struct _nxge_ldg_t {
uint8_t ldg; /* logical group number */
uint8_t vldg_index;
boolean_t arm;
- boolean_t interrupted;
uint16_t ldg_timer; /* counter */
uint8_t func;
uint8_t vector;
@@ -749,6 +751,13 @@ typedef struct _nxge_mmac_stats_t {
struct ether_addr mmac_avail_pool[16];
} nxge_mmac_stats_t, *p_nxge_mmac_stats_t;
+/*
+ * Copied from mac.h. Should be cleaned up by driver.
+ */
+#define MMAC_SLOT_USED 0x1 /* address slot used */
+#define MMAC_VENDOR_ADDR 0x2 /* address returned is vendor supplied */
+
+
#define NXGE_MAX_MMAC_ADDRS 32
#define NXGE_NUM_MMAC_ADDRS 8
#define NXGE_NUM_OF_PORTS_QUAD 4
@@ -885,6 +894,8 @@ void nxge_hw_set_mac_modes(p_nxge_t);
/* nxge_send.c. */
uint_t nxge_reschedule(caddr_t);
+mblk_t *nxge_tx_ring_send(void *, mblk_t *);
+int nxge_start(p_nxge_t, p_tx_ring_t, p_mblk_t);
/* nxge_rxdma.c */
nxge_status_t nxge_rxdma_cfg_rdcgrp_default_rdc(p_nxge_t,
@@ -1050,6 +1061,8 @@ int nxge_get_nports(p_nxge_t);
void nxge_free_buf(buf_alloc_type_t, uint64_t, uint32_t);
+#if defined(sun4v)
+
uint64_t hv_niu_rx_logical_page_conf(uint64_t, uint64_t,
uint64_t, uint64_t);
#pragma weak hv_niu_rx_logical_page_conf
@@ -1131,6 +1144,8 @@ uint64_t hv_niu_vrtx_to_logical_dev(uint32_t cookie, uint64_t v_chidx,
uint64_t *ldn);
#pragma weak hv_niu_vrtx_to_logical_dev
+#endif /* defined(sun4v) */
+
#ifdef NXGE_DEBUG
char *nxge_dump_packet(char *, int);
#endif
diff --git a/usr/src/uts/common/sys/nxge/nxge_rxdma.h b/usr/src/uts/common/sys/nxge/nxge_rxdma.h
index 43a7185148..a336dbb9cb 100644
--- a/usr/src/uts/common/sys/nxge/nxge_rxdma.h
+++ b/usr/src/uts/common/sys/nxge/nxge_rxdma.h
@@ -155,6 +155,13 @@ typedef struct _nxge_rdc_sys_stats {
uint32_t zcp_eop_err;
} nxge_rdc_sys_stats_t, *p_nxge_rdc_sys_stats_t;
+/*
+ * Software reserved buffer offset
+ */
+typedef struct _nxge_rxbuf_off_hdr_t {
+ uint32_t index;
+} nxge_rxbuf_off_hdr_t, *p_nxge_rxbuf_off_hdr_t;
+
typedef struct _rx_msg_t {
nxge_os_dma_common_t buf_dma;
@@ -231,8 +238,11 @@ typedef struct _rx_rcr_ring_t {
uint32_t intr_timeout;
uint32_t intr_threshold;
uint64_t max_receive_pkts;
- mac_resource_handle_t rcr_mac_handle;
+ mac_ring_handle_t rcr_mac_handle;
+ uint64_t rcr_gen_num;
uint32_t rcvd_pkt_bytes; /* Received bytes of a packet */
+ p_nxge_ldv_t ldvp;
+ p_nxge_ldg_t ldgp;
} rx_rcr_ring_t, *p_rx_rcr_ring_t;
@@ -359,11 +369,13 @@ typedef struct _rx_mbox_t {
typedef struct _rx_rbr_rings_t {
p_rx_rbr_ring_t *rbr_rings;
uint32_t ndmas;
+ boolean_t rxbuf_allocated;
} rx_rbr_rings_t, *p_rx_rbr_rings_t;
typedef struct _rx_rcr_rings_t {
p_rx_rcr_ring_t *rcr_rings;
uint32_t ndmas;
+ boolean_t cntl_buf_allocated;
} rx_rcr_rings_t, *p_rx_rcr_rings_t;
typedef struct _rx_mbox_areas_t {
@@ -414,6 +426,10 @@ void nxge_rxdma_fix_channel(p_nxge_t, uint16_t);
void nxge_rxdma_fixup_channel(p_nxge_t, uint16_t, int);
int nxge_rxdma_get_ring_index(p_nxge_t, uint16_t);
+mblk_t *nxge_rx_poll(void *, int);
+int nxge_enable_poll(void *);
+int nxge_disable_poll(void *);
+
void nxge_rxdma_regs_dump_channels(p_nxge_t);
nxge_status_t nxge_rxdma_handle_sys_errors(p_nxge_t);
void nxge_rxdma_inject_err(p_nxge_t, uint32_t, uint8_t);
@@ -422,6 +438,8 @@ extern nxge_status_t nxge_alloc_rx_mem_pool(p_nxge_t);
extern nxge_status_t nxge_alloc_rxb(p_nxge_t nxgep, int channel);
extern void nxge_free_rxb(p_nxge_t nxgep, int channel);
+int nxge_get_rxring_index(p_nxge_t, int, int);
+
#ifdef __cplusplus
}
#endif
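
The new nxge_rx_poll()/nxge_enable_poll()/nxge_disable_poll() prototypes are the driver half of Crossbow's per-ring polling: the MAC layer pulls packet chains through the poll entry while the ring's interrupt is masked. A hedged sketch of how these might be wired into a ring's mac_ring_info_t (the real wiring lives in nxge_main.c, outside this hunk; the inverted enable/disable pairing below is an assumption of the sketch, reflecting that leaving poll mode re-enables interrupts):

    /* fragment; infop is a mac_ring_info_t * filled in for an Rx ring */
    infop->mri_poll = nxge_rx_poll;             /* mblk_t *(*)(void *, int) */
    infop->mri_intr.mi_enable = (mac_intr_enable_t)nxge_disable_poll;
    infop->mri_intr.mi_disable = (mac_intr_disable_t)nxge_enable_poll;
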
diff --git a/usr/src/uts/common/sys/nxge/nxge_serialize.h b/usr/src/uts/common/sys/nxge/nxge_serialize.h
deleted file mode 100644
index f235de7b2e..0000000000
--- a/usr/src/uts/common/sys/nxge/nxge_serialize.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_NXGE_NXGE_SERIALIZE_H
-#define _SYS_NXGE_NXGE_SERIALIZE_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define NXGE_TX_AVG_CNT 200000000
-#define NXGE_TX_AVG_RES 2000 /* sleep at least a tick */
-#define MAXHRS 3 /* # of packets to process */
-#define ONESEC 1000000000 /* one second */
-
-#include <sys/stream.h>
-#include <sys/mutex.h>
-#include <sys/condvar.h>
-#include <sys/kmem.h>
-#include <sys/ddi.h>
-#include <sys/callb.h>
-
-/*
- * Thread state flags
- */
-#define NXGE_TX_STHREAD_RUNNING 0x0001 /* thread started */
-#define NXGE_TX_STHREAD_DESTROY 0x0002 /* thread is being destroyed */
-#define NXGE_TX_STHREAD_EXIT 0x0003 /* thread exits */
-
-typedef int (onetrack_t)(mblk_t *, void *);
-
-typedef struct {
- kmutex_t lock;
- int count;
- mblk_t *head;
- mblk_t *tail;
- void *cookie;
- onetrack_t *serialop;
- int owned;
- /* Counter tracks the total time spent in serializer function */
- hrtime_t totaltime;
- /*
- * Counter tracks the total number of time the serializer
- * function was called.
- */
- long totalcount;
- /*
- * Counter maintains the average time spent in the serializer function
- * and is derived as (totaltime/totalcount).
- */
- int avg;
- /*
- * The length of the queue to which the serializer
- * will append data.
- */
- int length;
- kcondvar_t serial_cv;
- kcondvar_t timecv;
- kmutex_t serial;
- uint32_t s_state;
- boolean_t s_need_signal;
- callb_cpr_t s_cprinfo;
- kthread_t *tx_sthread;
- kmutex_t timelock;
-} nxge_serialize_t;
-
-/*
- * Prototypes definitions
- */
-nxge_serialize_t *nxge_serialize_create(int, onetrack_t *, void *);
-void nxge_serialize_destroy(nxge_serialize_t *);
-void nxge_serialize_enter(nxge_serialize_t *, mblk_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_NXGE_NXGE_SERIALIZE_H */
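
The deleted serializer tracked a running average of the time spent in the serializer function, derived as totaltime/totalcount per the struct comments above. A minimal sketch of that bookkeeping, for reference (the helper name is invented; only the fields come from nxge_serialize_t):

    static void
    nxge_serialize_account(nxge_serialize_t *p, hrtime_t start, hrtime_t end)
    {
            p->totaltime += end - start;    /* total time inside serialop */
            p->totalcount++;                /* number of invocations */
            p->avg = (int)(p->totaltime / p->totalcount);
    }

Its transmit-side users move to a per-ring mac_ring_handle_t plus a taskq, as the nxge_txdma.h change below shows.
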
diff --git a/usr/src/uts/common/sys/nxge/nxge_txdma.h b/usr/src/uts/common/sys/nxge/nxge_txdma.h
index 859f6a124e..829d67ebce 100644
--- a/usr/src/uts/common/sys/nxge/nxge_txdma.h
+++ b/usr/src/uts/common/sys/nxge/nxge_txdma.h
@@ -18,7 +18,6 @@
*
* CDDL HEADER END
*/
-
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
@@ -31,9 +30,9 @@
extern "C" {
#endif
+#include <sys/taskq.h>
#include <sys/nxge/nxge_txdma_hw.h>
#include <npi_txdma.h>
-#include <sys/nxge/nxge_serialize.h>
#define TXDMA_PORT_BITMAP(nxgep) (nxgep->pt_config.tx_dma_map)
@@ -152,14 +151,13 @@ typedef struct _tx_ring_t {
uint32_t tx_ring_offline;
boolean_t tx_ring_busy;
- p_tx_msg_t tx_free_list_p;
- nxge_os_mutex_t freelock;
-
nxge_os_mutex_t lock;
+ mac_ring_handle_t tx_ring_handle;
+ ddi_taskq_t *taskq;
uint16_t index;
uint16_t tdc;
struct nxge_tdc_cfg *tdc_p;
- uint_t tx_ring_size;
+ int tx_ring_size;
uint32_t num_chunks;
uint_t tx_wrap_mask;
@@ -170,11 +168,10 @@ typedef struct _tx_ring_t {
tx_ring_kick_t ring_kick_tail;
txdma_mailbox_t tx_mbox;
- uint_t descs_pending;
+ int descs_pending;
boolean_t queueing;
nxge_os_mutex_t sq_lock;
- nxge_serialize_t *serial;
p_mblk_t head;
p_mblk_t tail;
diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h
index 2591642dc0..8d93c7780e 100644
--- a/usr/src/uts/common/sys/policy.h
+++ b/usr/src/uts/common/sys/policy.h
@@ -161,6 +161,7 @@ void secpolicy_fs_mount_clearopts(cred_t *, struct vfs *);
int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *,
const vattr_t *, cred_t *);
int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t);
+int secpolicy_dld_ioctl(const cred_t *, const char *, const char *);
int secpolicy_xvm_control(const cred_t *);
int secpolicy_basic_exec(const cred_t *, vnode_t *);
diff --git a/usr/src/uts/common/sys/softmac_impl.h b/usr/src/uts/common/sys/softmac_impl.h
index 3fcfc97415..5f9d1401a7 100644
--- a/usr/src/uts/common/sys/softmac_impl.h
+++ b/usr/src/uts/common/sys/softmac_impl.h
@@ -26,8 +26,6 @@
#ifndef _SYS_SOFTMAC_IMPL_H
#define _SYS_SOFTMAC_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/ethernet.h>
#include <sys/taskq.h>
@@ -37,6 +35,9 @@
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/mac.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#ifdef __cplusplus
@@ -68,14 +69,20 @@ typedef struct softmac_lower_s {
boolean_t sl_pending_ioctl;
mblk_t *sl_ack_mp;
- mac_resource_handle_t sl_handle;
ldi_handle_t sl_lh;
} softmac_lower_t;
-enum softmac_state {
+typedef enum {
SOFTMAC_INITIALIZED,
SOFTMAC_READY
-};
+} softmac_lower_state_t;
+
+typedef enum {
+ SOFTMAC_UNINIT,
+ SOFTMAC_ATTACH_INPROG,
+ SOFTMAC_ATTACH_DONE,
+ SOFTMAC_DETACH_INPROG,
+} softmac_state_t;
typedef struct softmac_dev_s {
dev_t sd_dev;
@@ -86,8 +93,12 @@ typedef struct softmac_dev_s {
*/
#define SOFTMAC_GLDV3 0x01
#define SOFTMAC_NOSUPP 0x02
-#define SOFTMAC_ATTACH_DONE 0x04
-#define SOFTMAC_NEED_RECREATE 0x08
+#define SOFTMAC_NEED_RECREATE 0x04
+#define SOFTMAC_NOTIFY_QUIT 0x08
+
+#define SMAC_NONZERO_NODECNT(softmac) \
+ ((softmac->smac_softmac[0] != NULL) + \
+ (softmac->smac_softmac[1] != NULL))
/*
* The softmac structure allows all minor nodes (at most two, style-1 and
@@ -111,18 +122,14 @@ typedef struct softmac {
uint32_t smac_cnt; /* # of minor nodes for this device */
/*
- * The following fields are protected by softmac_hash_lock.
- */
- /*
+ * The following fields are protected by smac_mutex.
+ *
* The smac_hold_cnt field increases when softmac_hold_device() is
* called to force the dls_vlan_t of the device to be created. The
* device pre-detach fails if this counter is not 0.
*/
+ softmac_state_t smac_state;
uint32_t smac_hold_cnt;
-
- /*
- * The following fields are protected by smac_lock.
- */
kmutex_t smac_mutex;
kcondvar_t smac_cv;
uint32_t smac_flags;
@@ -145,6 +152,16 @@ typedef struct softmac {
uint32_t smac_attached_left;
/*
+ * Thread that handles the DL_NOTIFY_IND messages from the lower stream.
+ */
+ kthread_t *smac_notify_thread;
+ /*
+ * Head and tail of the DL_NOTIFY_IND messages.
+ */
+ mblk_t *smac_notify_head;
+ mblk_t *smac_notify_tail;
+
+ /*
* The remaining fields are used to register the MAC for a legacy
* device. They are set in softmac_mac_register() and do not change.
* One can access them when mac_register() is done without locks.
@@ -177,11 +194,8 @@ typedef struct softmac {
dl_capab_mdt_t smac_mdt_capab;
boolean_t smac_mdt;
- /*
- * The following fields are protected by smac_lock
- */
- krwlock_t smac_lock;
- enum softmac_state smac_state;
+ /* Following fields protected by the mac perimeter */
+ softmac_lower_state_t smac_lower_state;
/* Lower stream structure */
softmac_lower_t *smac_lower;
} softmac_t;
@@ -193,9 +207,6 @@ typedef struct smac_ioc_start_s {
#define SMAC_IOC ('S' << 24 | 'M' << 16 | 'C' << 8)
#define SMAC_IOC_START (SMAC_IOC | 0x01)
-#define SOFTMAC_BLANK_TICKS 128
-#define SOFTMAC_BLANK_PKT_COUNT 8
-
extern dev_info_t *softmac_dip;
#define SOFTMAC_DEV_NAME "softmac"
@@ -217,9 +228,9 @@ extern int softmac_m_unicst(void *, const uint8_t *);
extern void softmac_m_ioctl(void *, queue_t *, mblk_t *);
extern int softmac_m_stat(void *, uint_t, uint64_t *);
extern mblk_t *softmac_m_tx(void *, mblk_t *);
-extern void softmac_m_resources(void *);
extern int softmac_proto_tx(softmac_lower_t *, mblk_t *, mblk_t **);
extern void softmac_ioctl_tx(softmac_lower_t *, mblk_t *, mblk_t **);
+extern void softmac_notify_thread(void *);
#ifdef __cplusplus
}
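
The SMAC_NONZERO_NODECNT macro added above counts the device's registered minor nodes by summing relational expressions, which evaluate to 0 or 1 in C. An equivalent open-coded form, shown only for clarity:

    static int
    softmac_nodecnt(softmac_t *softmac)
    {
            int nodecnt = 0;

            if (softmac->smac_softmac[0] != NULL)
                    nodecnt++;              /* style-1 minor node */
            if (softmac->smac_softmac[1] != NULL)
                    nodecnt++;              /* style-2 minor node */
            return (nodecnt);               /* == SMAC_NONZERO_NODECNT */
    }
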
diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h
index 64e52ba808..ec09b3a88b 100644
--- a/usr/src/uts/common/sys/squeue.h
+++ b/usr/src/uts/common/sys/squeue.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_SQUEUE_H
#define _SYS_SQUEUE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -47,7 +44,30 @@ typedef struct squeue_s squeue_t;
(mp)->b_prev = (mblk_t *)(arg); \
}
-#define GET_SQUEUE(mp) ((conn_t *)((mp)->b_prev))->conn_sqp
+#define GET_SQUEUE(mp) ((conn_t *)((mp)->b_prev))->conn_sqp
+
+#define SQ_FILL 0x0001
+#define SQ_NODRAIN 0x0002
+#define SQ_PROCESS 0x0004
+
+#define SQUEUE_ENTER(sqp, head, tail, cnt, flag, tag) { \
+ sqp->sq_enter(sqp, head, tail, cnt, flag, tag); \
+}
+
+#define SQUEUE_ENTER_ONE(sqp, mp, proc, arg, flag, tag) { \
+ ASSERT(mp->b_next == NULL); \
+ ASSERT(mp->b_prev == NULL); \
+ SET_SQUEUE(mp, proc, arg); \
+ SQUEUE_ENTER(sqp, mp, mp, 1, flag, tag); \
+}
+
+/*
+ * May be called only by a thread executing in the squeue. The thread must
+ * not continue to execute any code needing squeue protection after calling
+ * this macro. Please see the comments in squeue.c for more details.
+ */
+#define SQUEUE_SWITCH(connp, new_sqp) \
+ (connp)->conn_sqp = new_sqp;
/*
* Facility-special private data in squeues.
@@ -57,26 +77,13 @@ typedef enum {
SQPRIVATE_MAX
} sqprivate_t;
-typedef void (*sqproc_t)(void *, mblk_t *, void *);
-
extern void squeue_init(void);
-extern squeue_t *squeue_create(char *, processorid_t, clock_t, pri_t);
+extern squeue_t *squeue_create(clock_t, pri_t);
extern void squeue_bind(squeue_t *, processorid_t);
extern void squeue_unbind(squeue_t *);
-extern void squeue_enter_chain(squeue_t *, mblk_t *, mblk_t *,
- uint32_t, uint8_t);
-extern void squeue_enter(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t);
-extern void squeue_enter_nodrain(squeue_t *, mblk_t *, sqproc_t, void *,
- uint8_t);
-extern void squeue_fill(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t);
+extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *,
+ uint32_t, int, uint8_t);
extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t);
-extern processorid_t squeue_binding(squeue_t *);
-
-extern void squeue_profile_reset(squeue_t *);
-extern void squeue_profile_enable(squeue_t *);
-extern void squeue_profile_disable(squeue_t *);
-extern void squeue_profile_stop(void);
-extern void squeue_profile_start(void);
#ifdef __cplusplus
}
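
The old squeue_enter()/squeue_enter_nodrain()/squeue_fill() trio collapses into a single squeue_enter() whose behavior is selected by SQ_FILL, SQ_NODRAIN and SQ_PROCESS; SQUEUE_ENTER_ONE() is the single-mblk convenience wrapper. A hedged caller sketch (the proc function and tag are placeholders, not names from this changeset):

    extern void my_conn_input(void *, mblk_t *, void *);    /* placeholder */
    #define MY_SQTAG        42                              /* placeholder */

    static void
    queue_one(conn_t *connp, mblk_t *mp)
    {
            /* SQ_PROCESS asks for inline drain; SQ_FILL would only enqueue */
            SQUEUE_ENTER_ONE(connp->conn_sqp, mp, my_conn_input, connp,
                SQ_PROCESS, MY_SQTAG);
    }
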
diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h
index 54870c067c..501377e53f 100644
--- a/usr/src/uts/common/sys/squeue_impl.h
+++ b/usr/src/uts/common/sys/squeue_impl.h
@@ -19,20 +19,21 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_SQUEUE_IMPL_H
#define _SYS_SQUEUE_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
+#include <sys/disp.h>
+#include <sys/types.h>
#include <sys/squeue.h>
+#include <inet/ip.h>
#define SQ_NAMELEN 31
@@ -55,6 +56,8 @@ extern "C" {
#define SQUEUE_PROFILE 0
#endif
+#define SQUEUE_DEFAULT_PRIORITY MAXCLSYSPRI
+
typedef struct sqstat_s {
uint_t sq_max_qlen;
uint_t sq_npackets_worker;
@@ -70,60 +73,102 @@ typedef struct sqstat_s {
hrtime_t sq_time_other;
} sqstat_t;
+typedef struct squeue_set_s {
+ squeue_t *sqs_head;
+ squeue_t *sqs_default;
+ processorid_t sqs_cpuid;
+} squeue_set_t;
+
+typedef void (*sqproc_t)(void *, mblk_t *, void *);
+typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t,
+ int, uint8_t);
+typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t);
+
+extern void squeue_worker_wakeup(squeue_t *);
+extern int ip_squeue_flag;
+
struct squeue_s {
- /* Keep the most used members 64bytes cache aligned */
+ sq_enter_proc_t sq_enter; /* sq_process function */
+ sq_drain_proc_t sq_drain; /* sq_drain function */
kmutex_t sq_lock; /* lock before using any member */
uint32_t sq_state; /* state flags and message count */
int sq_count; /* # of mblocks in squeue */
mblk_t *sq_first; /* first mblk chain or NULL */
mblk_t *sq_last; /* last mblk chain or NULL */
- clock_t sq_awaken; /* time async thread was awakened */
kthread_t *sq_run; /* Current thread processing sq */
- void *sq_rx_ring;
- clock_t sq_avg_drain_time; /* Avg time to drain a pkt */
+ ill_rx_ring_t *sq_rx_ring; /* The Rx ring tied to this sq */
+ ill_t *sq_ill; /* The ill this squeue is tied to */
- processorid_t sq_bind; /* processor to bind to */
- kcondvar_t sq_async; /* async thread blocks on */
+ clock_t sq_curr_time; /* Current tick (lbolt) */
+ kcondvar_t sq_worker_cv; /* cond var. worker thread blocks on */
+ kcondvar_t sq_poll_cv; /* cond variable poll_thr waits on */
+ kcondvar_t sq_ctrlop_done_cv; /* cond variable for ctrl ops */
clock_t sq_wait; /* lbolts to wait after a fill() */
- uintptr_t sq_private[SQPRIVATE_MAX];
timeout_id_t sq_tid; /* timer id of pending timeout() */
+ clock_t sq_awaken; /* time async thread was awakened */
+
+ processorid_t sq_bind; /* processor to bind to */
kthread_t *sq_worker; /* kernel thread id */
- char sq_name[SQ_NAMELEN + 1];
+ kthread_t *sq_poll_thr; /* polling thread */
+ uintptr_t sq_private[SQPRIVATE_MAX];
+
+ squeue_t *sq_next; /* managed by squeue creator */
+ squeue_set_t *sq_set; /* managed by squeue creator */
-#if SQUEUE_DEBUG
- /* Debug-only fields */
+ pri_t sq_priority; /* squeue thread priority */
+
+ /* Keep the debug-only fields at the end of the structure */
+#ifdef DEBUG
int sq_isintr; /* serviced by interrupt */
mblk_t *sq_curmp;
void (*sq_curproc)();
conn_t *sq_connp;
uchar_t sq_tag;
#endif
-
-#if SQUEUE_PROFILE
- /* Profiling fields */
- kstat_t *sq_kstat; /* exported statistics */
- sqstat_t sq_stats;
-#endif
};
/*
* State flags.
* Note: The MDB IP module depends on the values of these flags.
*/
-#define SQS_PROC 0x0001 /* being processed */
-#define SQS_WORKER 0x0002 /* worker thread */
-#define SQS_ENTER 0x0004 /* enter thread */
-#define SQS_FAST 0x0008 /* enter-fast thread */
-#define SQS_USER 0x0010 /* A non interrupt user */
-#define SQS_BOUND 0x0020 /* Worker thread is bound */
-#define SQS_PROFILE 0x0040 /* Enable profiling */
-#define SQS_REENTER 0x0080 /* Re entered thread */
-#define SQS_TMO_PROG 0x0100 /* Timeout is being set */
-#define SQS_POLL_CAPAB 0x0200 /* Squeue can control interrupts */
-#define SQS_NO_INTR 0x0400 /* Interrupts currently disabled */
-#define SQS_ILL_BOUND 0x0800 /* Squeue bound to an ill */
-#define SQS_GET_PKTS 0x1000 /* Moving pkts from NIC in progress */
-#define SQS_DEFAULT 0x2000 /* The default squeue for the CPU */
+#define SQS_PROC 0x00000001 /* being processed */
+#define SQS_WORKER 0x00000002 /* worker thread */
+#define SQS_ENTER 0x00000004 /* enter thread */
+#define SQS_FAST 0x00000008 /* enter-fast thread */
+
+#define SQS_USER 0x00000010 /* A non interrupt user */
+#define SQS_BOUND 0x00000020 /* Worker thread is bound */
+#define SQS_REENTER 0x00000040 /* Re entered thread */
+#define SQS_TMO_PROG 0x00000080 /* Timeout is being set */
+
+#define SQS_POLL_CAPAB 0x00000100 /* Squeue can control interrupts */
+#define SQS_ILL_BOUND 0x00000200 /* Squeue bound to an ill */
+#define SQS_GET_PKTS 0x00000400 /* Moving pkts from NIC in progress */
+#define SQS_DEFAULT 0x00000800 /* The default squeue for the CPU */
+
+#define SQS_POLLING 0x00001000 /* Squeue in polling mode */
+#define SQS_INTR_BLANK 0x00002000 /* Interrupt blanking capability */
+#define SQS_PROC_HELD 0x00004000 /* SQS_PROC is held by the caller */
+#define SQS_FORCE_TIMER 0x00008000 /* Schedule worker due to B/W control */
+
+#define SQS_POLL_CLEANUP 0x00010000
+#define SQS_POLL_CLEANUP_DONE 0x00020000
+#define SQS_POLL_QUIESCE 0x00040000
+#define SQS_POLL_QUIESCE_DONE 0x00080000
+
+#define SQS_POLL_RESTART 0x00100000
+#define SQS_POLL_THR_QUIESCED 0x00200000
+#define SQS_POLL_THR_RESTART 0x00400000
+#define SQS_POLL_PROC 0x00800000 /* Poll thread processing the sq */
+
+#define SQS_POLL_RESTART_DONE 0x01000000
+#define SQS_POLL_THR_QUIESCE 0x02000000
+
+#define SQS_WORKER_THR_CONTROL \
+ (SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP)
+
+#define SQS_POLL_THR_CONTROL \
+ (SQS_POLL_THR_QUIESCE | SQS_POLL_THR_RESTART)
#ifdef __cplusplus
}
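
sq_state widens to carry the poll-thread/worker handshake, and the two *_THR_CONTROL masks group the control requests so a thread can test for any pending control operation in one branch. Sketch only (handle_control() is a placeholder):

    extern void handle_control(squeue_t *);         /* placeholder */

    static void
    worker_check(squeue_t *sqp)
    {
            mutex_enter(&sqp->sq_lock);
            if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
                    handle_control(sqp);    /* quiesce, restart or cleanup */
            mutex_exit(&sqp->sq_lock);
    }
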
diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h
index 6436c5a0cc..41097cab7f 100644
--- a/usr/src/uts/common/sys/stream.h
+++ b/usr/src/uts/common/sys/stream.h
@@ -30,8 +30,6 @@
#ifndef _SYS_STREAM_H
#define _SYS_STREAM_H
-#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 11.44 */
-
/*
* For source compatibility
*/
@@ -414,6 +412,7 @@ typedef struct bcache {
#define STRUIO_ZCNOTIFY 0x10 /* notify stream head when mblk acked */
#define STRUIO_EAGER 0x20 /* new eager; db_cksumstart has squeue to use */
#define STRUIO_POLICY 0x40 /* new eager when IPsec is enabled */
+#define STRUIO_CONNECT 0x80 /* conn did a connect */
/*
* Message flags. These are interpreted by the stream head.
diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h
index 401e69dc5e..04c778feaa 100644
--- a/usr/src/uts/common/sys/strsubr.h
+++ b/usr/src/uts/common/sys/strsubr.h
@@ -30,8 +30,6 @@
#ifndef _SYS_STRSUBR_H
#define _SYS_STRSUBR_H
-#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.17 */
-
/*
* WARNING:
* Everything in this file is private, belonging to the
@@ -1238,6 +1236,8 @@ extern int hcksum_assoc(mblk_t *, struct multidata_s *, struct pdesc_s *,
uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, int);
extern void hcksum_retrieve(mblk_t *, struct multidata_s *, struct pdesc_s *,
uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *);
+extern void lso_info_set(mblk_t *, uint32_t, uint32_t);
+extern void lso_info_get(mblk_t *, uint32_t *, uint32_t *);
extern unsigned int bcksum(uchar_t *, int, unsigned int);
extern boolean_t is_vmloaned_mblk(mblk_t *, struct multidata_s *,
struct pdesc_s *);
diff --git a/usr/src/uts/common/sys/vlan.h b/usr/src/uts/common/sys/vlan.h
index 2a4e4c8ef0..11c7d41e83 100644
--- a/usr/src/uts/common/sys/vlan.h
+++ b/usr/src/uts/common/sys/vlan.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -30,14 +30,14 @@
#ifndef _SYS_VLAN_H
#define _SYS_VLAN_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
#define VLAN_TAGSZ 4
+#define VLAN_TPID 0x8100u
+
#define VLAN_ID_MASK 0x0fffu
#define VLAN_ID_SIZE 12
#define VLAN_ID_SHIFT 0
diff --git a/usr/src/uts/common/sys/vnic.h b/usr/src/uts/common/sys/vnic.h
index d17da6bf44..37f962e2ff 100644
--- a/usr/src/uts/common/sys/vnic.h
+++ b/usr/src/uts/common/sys/vnic.h
@@ -30,35 +30,101 @@
#include <sys/ethernet.h>
#include <sys/param.h>
#include <sys/mac.h>
+#include <sys/mac_flow.h>
#include <sys/dld_ioc.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
- * Note that the datastructures defined here define an ioctl interface
- * that is shared between user and kernel space. The vnic driver thus
- * assumes that the structures have identical layout and size when
- * compiled in either ILP32 or LP64.
+ * Extended diagnostic codes that can be returned by the various VNIC ioctls.
*/
+typedef enum {
+ VNIC_IOC_DIAG_NONE,
+ VNIC_IOC_DIAG_MACADDR_NIC,
+ VNIC_IOC_DIAG_MACADDR_INUSE,
+ VNIC_IOC_DIAG_MACADDR_INVALID,
+ VNIC_IOC_DIAG_MACADDRLEN_INVALID,
+ VNIC_IOC_DIAG_MACFACTORYSLOTINVALID,
+ VNIC_IOC_DIAG_MACFACTORYSLOTUSED,
+ VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED,
+ VNIC_IOC_DIAG_MACFACTORYNOTSUP,
+ VNIC_IOC_DIAG_MACPREFIX_INVALID,
+ VNIC_IOC_DIAG_MACPREFIXLEN_INVALID,
+ VNIC_IOC_DIAG_MACMARGIN_INVALID,
+ VNIC_IOC_DIAG_NO_HWRINGS
+} vnic_ioc_diag_t;
/*
- * For now, we support only MAC addresses specified by value.
+ * Allowed VNIC MAC address types.
+ *
+ * - VNIC_MAC_ADDR_TYPE_FIXED, VNIC_MAC_ADDR_TYPE_RANDOM:
+ * The MAC address is specified by value by the caller, which
+ * itself can obtain it from the user directly,
+ * or pick it in a random fashion. Which method is used by the
+ * caller is irrelevant to the VNIC driver. However, two different
+ * types are provided so that the information can be made available
+ * back to user-space when listing the kernel defined VNICs.
+ *
+ * When a VNIC is created, the address is passed through the
+ * vc_mac_addr and vc_mac_len fields of the vnic_ioc_create_t
+ * structure.
+ *
+ * - VNIC_MAC_ADDR_TYPE_FACTORY: the MAC address is obtained from
+ * one of the factory MAC addresses of the underlying NIC.
+ *
+ * - VNIC_MAC_ADDR_TYPE_AUTO: the VNIC driver attempts to
+ * obtain the address from one of the factory MAC addresses of
+ * the underlying NIC. If none is available, the specified
+ * MAC address value is used.
+ *
+ * - VNIC_MAC_ADDR_TYPE_PRIMARY: this is a VNIC based VLAN. The
+ * address for this is the address of the primary MAC client.
+ *
*/
typedef enum {
- VNIC_MAC_ADDR_TYPE_FIXED
+ VNIC_MAC_ADDR_TYPE_FIXED,
+ VNIC_MAC_ADDR_TYPE_RANDOM,
+ VNIC_MAC_ADDR_TYPE_FACTORY,
+ VNIC_MAC_ADDR_TYPE_AUTO,
+ VNIC_MAC_ADDR_TYPE_PRIMARY
} vnic_mac_addr_type_t;
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
#define VNIC_IOC_CREATE VNICIOC(1)
+#define VNIC_IOC_CREATE_NODUPCHECK 0x00000001
+#define VNIC_IOC_CREATE_ANCHOR 0x00000002
+
+/*
+ * Force creation of VLAN based VNIC without checking if the
+ * underlying MAC supports the margin size.
+ */
+#define VNIC_IOC_CREATE_FORCE 0x00000004
+
+/* Allocate a hardware ring to the vnic */
+#define VNIC_IOC_CREATE_REQ_HWRINGS 0x00000008
+
typedef struct vnic_ioc_create {
datalink_id_t vc_vnic_id;
datalink_id_t vc_link_id;
- uint_t vc_mac_len;
vnic_mac_addr_type_t vc_mac_addr_type;
+ uint_t vc_mac_len;
uchar_t vc_mac_addr[MAXMACADDRLEN];
+ uint_t vc_mac_prefix_len;
+ int vc_mac_slot;
+ uint16_t vc_vid;
+ uint_t vc_status;
+ uint_t vc_flags;
+ vnic_ioc_diag_t vc_diag;
+ mac_resource_props_t vc_resource_props;
} vnic_ioc_create_t;
#define VNIC_IOC_DELETE VNICIOC(2)
@@ -69,33 +135,43 @@ typedef struct vnic_ioc_delete {
#define VNIC_IOC_INFO VNICIOC(3)
-typedef struct vnic_ioc_info_vnic {
+typedef struct vnic_info {
datalink_id_t vn_vnic_id;
datalink_id_t vn_link_id;
- uint32_t vn_mac_len;
- uchar_t vn_mac_addr[MAXMACADDRLEN];
vnic_mac_addr_type_t vn_mac_addr_type;
-} vnic_ioc_info_vnic_t;
+ uint_t vn_mac_len;
+ uchar_t vn_mac_addr[MAXMACADDRLEN];
+ uint_t vn_mac_slot;
+ uint32_t vn_mac_prefix_len;
+ uint16_t vn_vid;
+ boolean_t vn_force;
+ mac_resource_props_t vn_resource_props;
+} vnic_info_t;
typedef struct vnic_ioc_info {
- uint_t vi_nvnics;
- uint_t vi_size;
- datalink_id_t vi_vnic_id; /* DATALINK_ALL_LINKID returns all */
- datalink_id_t vi_linkid;
+ vnic_info_t vi_info;
} vnic_ioc_info_t;
#define VNIC_IOC_MODIFY VNICIOC(4)
#define VNIC_IOC_MODIFY_ADDR 0x01
+#define VNIC_IOC_MODIFY_RESOURCE_CTL 0x02
typedef struct vnic_ioc_modify {
datalink_id_t vm_vnic_id;
uint_t vm_modify_mask;
+ uint_t vm_mac_len;
+ int vm_mac_slot;
uchar_t vm_mac_addr[MAXMACADDRLEN];
vnic_mac_addr_type_t vm_mac_addr_type;
- uint_t vm_mac_len;
+ mac_resource_props_t vm_resource_props;
+ vnic_ioc_diag_t vm_diag;
} vnic_ioc_modify_t;
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
#ifdef __cplusplus
}
#endif
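
These structures cross the user/kernel boundary, which is why the #pragma pack(4) bracket forces an identical layout for ILP32 and LP64 callers. A hedged user-space sketch of issuing VNIC_IOC_CREATE (error handling trimmed; opening /dev/dld as the control node is an assumption here, the usual consumer being dladm(1M) through libdladm):

    #include <sys/types.h>
    #include <sys/vnic.h>
    #include <fcntl.h>
    #include <strings.h>
    #include <unistd.h>
    #include <stropts.h>

    static int
    create_vnic_sketch(datalink_id_t vnic, datalink_id_t link,
        const uchar_t *mac, uint_t maclen)
    {
            vnic_ioc_create_t ioc;
            int fd = open("/dev/dld", O_RDWR);      /* assumed control node */

            if (fd < 0)
                    return (-1);
            bzero(&ioc, sizeof (ioc));
            ioc.vc_vnic_id = vnic;
            ioc.vc_link_id = link;
            ioc.vc_mac_addr_type = VNIC_MAC_ADDR_TYPE_FIXED;
            ioc.vc_mac_len = maclen;
            bcopy(mac, ioc.vc_mac_addr, maclen);
            return (ioctl(fd, VNIC_IOC_CREATE, &ioc));
    }
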
diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h
index 6cb64523a8..b5dd59eea3 100644
--- a/usr/src/uts/common/sys/vnic_impl.h
+++ b/usr/src/uts/common/sys/vnic_impl.h
@@ -26,96 +26,40 @@
#ifndef _SYS_VNIC_IMPL_H
#define _SYS_VNIC_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
#include <sys/vnic.h>
+#include <sys/mac_flow.h>
#include <sys/ksynch.h>
#ifdef __cplusplus
extern "C" {
#endif
-typedef void (*vnic_rx_fn_t)(void *, void *, mblk_t *);
-
-typedef struct vnic_flow_fn_info_s {
- vnic_rx_fn_t ff_fn;
- void *ff_arg1;
- void *ff_arg2;
-} vnic_flow_fn_info_t;
-
-typedef struct vnic_flow_s {
- uchar_t vf_addr[MAXMACADDRLEN];
- uint_t vf_addr_len;
- vnic_flow_fn_info_t vf_fn_info;
- void *vf_cookie;
- struct vnic_flow_s *vf_next;
- kmutex_t vf_lock;
- kcondvar_t vf_cv;
- uint32_t vf_refs;
- boolean_t vf_clearing;
- boolean_t vf_is_active;
-} vnic_flow_t;
-
-typedef struct vnic_flow_tab_s {
- vnic_flow_t *vt_flow_list;
- krwlock_t vt_lock;
- uint_t vt_addr_len;
-} vnic_flow_tab_t;
-
-typedef struct vnic_mac_s {
- mac_handle_t va_mh;
- uint_t va_refs;
- datalink_id_t va_linkid;
- const mac_txinfo_t *va_txinfo;
- struct vnic_bcast_grp_s *va_bcast_grp;
- krwlock_t va_bcast_grp_lock;
- size_t va_addr_len;
- mac_notify_handle_t va_notify_hdl;
- mac_rx_handle_t va_rx_hdl;
- vnic_flow_t *va_active_flow;
- vnic_flow_tab_t *va_flow_tab;
- boolean_t va_mac_set;
- struct vnic_s *va_promisc;
- krwlock_t va_promisc_lock;
- uint64_t va_promisc_gen;
-} vnic_mac_t;
-
typedef struct vnic_s {
- datalink_id_t vn_id;
+ datalink_id_t vn_id;
uint32_t
- vn_started : 1,
- vn_promisc : 1,
- vn_bcast_grp : 1,
- vn_multi_mac : 1,
- vn_promisc_mac : 1,
- vn_pad_to_bit_31 : 27;
-
- int vn_slot_id;
- multiaddress_capab_t vn_mma_capab;
- uint8_t vn_addr[ETHERADDRL];
- vnic_mac_addr_type_t vn_addr_type;
-
- mac_handle_t vn_mh;
- uint32_t vn_margin;
- vnic_mac_t *vn_vnic_mac;
- vnic_flow_t *vn_flow_ent;
- uint32_t vn_hcksum_txflags;
- struct vnic_s *vn_promisc_next;
-
- uint64_t vn_stat_multircv;
- uint64_t vn_stat_brdcstrcv;
- uint64_t vn_stat_multixmt;
- uint64_t vn_stat_brdcstxmt;
- uint64_t vn_stat_ierrors;
- uint64_t vn_stat_oerrors;
- uint64_t vn_stat_rbytes;
- uint64_t vn_stat_ipackets;
- uint64_t vn_stat_obytes;
- uint64_t vn_stat_opackets;
+ vn_started : 1,
+ vn_pad_to_bit_31 : 31;
+
+ mac_handle_t vn_mh;
+ mac_handle_t vn_lower_mh;
+ mac_client_handle_t vn_mch;
+ mac_unicast_handle_t vn_muh;
+ uint32_t vn_margin;
+ int vn_slot_id;
+ vnic_mac_addr_type_t vn_addr_type;
+ uint8_t vn_addr[MAXMACADDRLEN];
+ size_t vn_addr_len;
+ uint16_t vn_vid;
+ boolean_t vn_force;
+ datalink_id_t vn_link_id;
+ mac_notify_handle_t vn_mnh;
+
+ uint32_t vn_hcksum_txflags;
} vnic_t;
-#define vn_txinfo vn_vnic_mac->va_txinfo
-
#define vn_madd_naddr vn_mma_capab.maddr_naddr
#define vn_maddr_naddrfree vn_mma_capab.maddr_naddrfree
#define vn_maddr_flag vn_mma_capab.maddr_flag
@@ -126,68 +70,19 @@ typedef struct vnic_s {
#define vn_maddr_modify vn_mma_capab.maddr_modify
#define vn_maddr_get vn_mma_capab.maddr_get
-#define VNIC_FLOW_REFHOLD(flow) { \
- mutex_enter(&(flow)->vf_lock); \
- (flow)->vf_refs++; \
- mutex_exit(&(flow)->vf_lock); \
-}
-
-#define VNIC_FLOW_REFRELE(flow) { \
- mutex_enter(&(flow)->vf_lock); \
- if (--(flow)->vf_refs == 0 && (flow)->vf_clearing) { \
- (flow)->vf_clearing = B_FALSE; \
- cv_signal(&(flow)->vf_cv); \
- } \
- mutex_exit(&(flow)->vf_lock); \
-}
-
-extern int vnic_dev_create(datalink_id_t, datalink_id_t, int, uchar_t *);
+extern int vnic_dev_create(datalink_id_t, datalink_id_t, vnic_mac_addr_type_t *,
+ int *, uchar_t *, int *, uint_t, uint16_t, mac_resource_props_t *,
+ uint32_t, vnic_ioc_diag_t *);
extern int vnic_dev_modify(datalink_id_t, uint_t, vnic_mac_addr_type_t,
- uint_t, uchar_t *);
-extern int vnic_dev_delete(datalink_id_t);
-
-typedef int (*vnic_info_new_vnic_fn_t)(void *, datalink_id_t,
- vnic_mac_addr_type_t, uint_t, uint8_t *, datalink_id_t);
+ uint_t, uchar_t *, uint_t, mac_resource_props_t *);
+extern int vnic_dev_delete(datalink_id_t, uint32_t);
extern void vnic_dev_init(void);
extern void vnic_dev_fini(void);
extern uint_t vnic_dev_count(void);
extern dev_info_t *vnic_get_dip(void);
-extern int vnic_info(uint_t *, datalink_id_t, datalink_id_t, void *,
- vnic_info_new_vnic_fn_t);
-
-extern void vnic_rx(void *, void *, mblk_t *);
-extern mblk_t *vnic_fix_cksum(mblk_t *);
-extern mblk_t *vnic_copymsgchain_cksum(mblk_t *);
-extern mblk_t *vnic_copymsg_cksum(mblk_t *);
-
-extern void vnic_promisc_rx(vnic_mac_t *, vnic_t *, mblk_t *);
-
-extern void vnic_bcast_init(void);
-extern void vnic_bcast_fini(void);
-extern int vnic_bcast_add(vnic_t *, const uint8_t *, mac_addrtype_t);
-extern void vnic_bcast_delete(vnic_t *, const uint8_t *);
-extern void vnic_bcast_send(void *, void *, mblk_t *);
-
-extern void vnic_classifier_init(void);
-extern void vnic_classifier_fini(void);
-extern vnic_flow_t *vnic_classifier_flow_create(uint_t, uchar_t *, void *,
- boolean_t, int);
-extern void vnic_classifier_flow_destroy(vnic_flow_t *);
-extern void vnic_classifier_flow_add(vnic_mac_t *, vnic_flow_t *, vnic_rx_fn_t,
- void *, void *);
-extern void vnic_classifier_flow_remove(vnic_mac_t *, vnic_flow_t *);
-extern void vnic_classifier_flow_update_addr(vnic_flow_t *, uchar_t *);
-extern void vnic_classifier_flow_update_fn(vnic_flow_t *, vnic_rx_fn_t,
- void *, void *);
-extern int vnic_classifier_flow_tab_init(vnic_mac_t *, uint_t, int);
-extern void vnic_classifier_flow_tab_fini(vnic_mac_t *);
-extern vnic_flow_t *vnic_classifier_get_flow(vnic_mac_t *, mblk_t *);
-extern void *vnic_classifier_get_client_cookie(vnic_flow_t *);
-extern vnic_flow_fn_info_t *vnic_classifier_get_fn_info(vnic_flow_t *);
-extern boolean_t vnic_classifier_is_active(vnic_flow_t *);
-
+extern int vnic_info(vnic_info_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/syscall/acctctl.c b/usr/src/uts/common/syscall/acctctl.c
index 4fb322a211..ce325109be 100644
--- a/usr/src/uts/common/syscall/acctctl.c
+++ b/usr/src/uts/common/syscall/acctctl.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/param.h>
@@ -115,6 +113,7 @@ ac_file_in_use(vnode_t *vp)
mutex_enter(&acg->ac_proc.ac_lock);
mutex_enter(&acg->ac_task.ac_lock);
mutex_enter(&acg->ac_flow.ac_lock);
+ mutex_enter(&acg->ac_net.ac_lock);
}
for (acg = list_head(&exacct_globals_list); !in_use && acg != NULL;
@@ -125,7 +124,8 @@ ac_file_in_use(vnode_t *vp)
*/
if (vn_compare(acg->ac_proc.ac_vnode, vp) ||
vn_compare(acg->ac_task.ac_vnode, vp) ||
- vn_compare(acg->ac_flow.ac_vnode, vp))
+ vn_compare(acg->ac_flow.ac_vnode, vp) ||
+ vn_compare(acg->ac_net.ac_vnode, vp))
in_use = B_TRUE;
}
@@ -137,6 +137,7 @@ ac_file_in_use(vnode_t *vp)
mutex_exit(&acg->ac_proc.ac_lock);
mutex_exit(&acg->ac_task.ac_lock);
mutex_exit(&acg->ac_flow.ac_lock);
+ mutex_exit(&acg->ac_net.ac_lock);
}
mutex_exit(&exacct_globals_list_lock);
return (in_use);
@@ -449,17 +450,21 @@ acctctl(int cmd, void *buf, size_t bufsz)
info = &acg->ac_proc;
maxres = AC_PROC_MAX_RES;
break;
+ /*
+ * Flow/net accounting isn't configurable in non-global
+ * zones, but we have this field on a per-zone basis for future
+ * expansion as well as the ability to return default "unset"
+ * values for the various AC_*_GET queries. AC_*_SET commands
+ * fail with EPERM for AC_FLOW and AC_NET in non-global zones.
+ */
case AC_FLOW:
- /*
- * Flow accounting isn't currently configurable in non-global
- * zones, but we have this field on a per-zone basis for future
- * expansion as well as the ability to return default "unset"
- * values for the various AC_*_GET queries. AC_*_SET commands
- * fail with EPERM for AC_FLOW in non-global zones.
- */
info = &acg->ac_flow;
maxres = AC_FLOW_MAX_RES;
break;
+ case AC_NET:
+ info = &acg->ac_net;
+ maxres = AC_NET_MAX_RES;
+ break;
default:
return (set_errno(EINVAL));
}
@@ -468,7 +473,8 @@ acctctl(int cmd, void *buf, size_t bufsz)
case AC_STATE_SET:
if ((error = secpolicy_acct(CRED())) != 0)
break;
- if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) {
+ if ((mode == AC_FLOW || mode == AC_NET) &&
+ getzoneid() != GLOBAL_ZONEID) {
error = EPERM;
break;
}
@@ -480,7 +486,8 @@ acctctl(int cmd, void *buf, size_t bufsz)
case AC_FILE_SET:
if ((error = secpolicy_acct(CRED())) != 0)
break;
- if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) {
+ if ((mode == AC_FLOW || mode == AC_NET) &&
+ getzoneid() != GLOBAL_ZONEID) {
error = EPERM;
break;
}
@@ -492,7 +499,8 @@ acctctl(int cmd, void *buf, size_t bufsz)
case AC_RES_SET:
if ((error = secpolicy_acct(CRED())) != 0)
break;
- if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) {
+ if ((mode == AC_FLOW || mode == AC_NET) &&
+ getzoneid() != GLOBAL_ZONEID) {
error = EPERM;
break;
}
@@ -580,6 +588,7 @@ exacct_zone_shutdown(zoneid_t zoneid, void *data)
exacct_free_info(&acg->ac_proc);
exacct_free_info(&acg->ac_task);
exacct_free_info(&acg->ac_flow);
+ exacct_free_info(&acg->ac_net);
}
/* ARGSUSED */
@@ -595,6 +604,7 @@ exacct_zone_fini(zoneid_t zoneid, void *data)
mutex_destroy(&acg->ac_proc.ac_lock);
mutex_destroy(&acg->ac_task.ac_lock);
mutex_destroy(&acg->ac_flow.ac_lock);
+ mutex_destroy(&acg->ac_net.ac_lock);
kmem_free(acg, sizeof (*acg));
}
diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c
index 6ac3e6e6ab..308f3c60ff 100644
--- a/usr/src/uts/common/xen/io/xnb.c
+++ b/usr/src/uts/common/xen/io/xnb.c
@@ -35,6 +35,7 @@
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/mac.h>
+#include <sys/mac_impl.h> /* XXXXBOW - remove, included for mac_fix_cksum() */
#include <sys/dlpi.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
@@ -247,7 +248,7 @@ xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
HCK_FULLCKSUM, KM_NOSLEEP);
- return (vnic_fix_cksum(mp));
+ return (mac_fix_cksum(mp));
}
mblk_t *
diff --git a/usr/src/uts/common/xen/io/xnbo.c b/usr/src/uts/common/xen/io/xnbo.c
index 790e850289..79831ee7f1 100644
--- a/usr/src/uts/common/xen/io/xnbo.c
+++ b/usr/src/uts/common/xen/io/xnbo.c
@@ -34,8 +34,12 @@
#include "xnb.h"
#include <sys/sunddi.h>
+#include <sys/ddi.h>
#include <sys/modctl.h>
#include <sys/strsubr.h>
+#include <sys/mac_client.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client_priv.h>
#include <sys/mac.h>
#include <net/if.h>
#include <sys/dlpi.h>
@@ -45,9 +49,9 @@
typedef struct xnbo {
mac_handle_t o_mh;
- mac_rx_handle_t o_mrh;
- const mac_txinfo_t *o_mtx;
- mac_notify_handle_t o_mnh;
+ mac_client_handle_t o_mch;
+ mac_unicast_handle_t o_mah;
+ mac_promisc_handle_t o_mphp;
boolean_t o_running;
boolean_t o_promiscuous;
uint32_t o_hcksum_capab;
@@ -70,11 +74,9 @@ xnbo_to_mac(xnb_t *xnbp, mblk_t *mp)
goto fail;
}
- mp = xnbop->o_mtx->mt_fn(xnbop->o_mtx->mt_arg, mp);
-
- if (mp != NULL) {
+ if (mac_tx(xnbop->o_mch, mp, 0,
+ MAC_DROP_ON_NO_DESC, NULL) != NULL) {
xnbp->xnb_stat_mac_full++;
- goto fail;
}
return;
@@ -156,7 +158,8 @@ xnbo_cksum_to_peer(xnb_t *xnbp, mblk_t *mp)
*/
/*ARGSUSED*/
static void
-xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
+xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
{
xnb_t *xnbp = arg;
@@ -173,7 +176,8 @@ xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
*/
/*ARGSUSED*/
static void
-xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
+xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
{
xnb_t *xnbp = arg;
xnbo_t *xnbop = xnbp->xnb_flavour_data;
@@ -216,25 +220,12 @@ xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
#undef ADD
if (keep_head != NULL)
- xnbo_from_mac(xnbp, mrh, keep_head);
+ xnbo_from_mac(xnbp, mrh, keep_head, B_FALSE);
if (free_head != NULL)
freemsgchain(free_head);
}
-static void
-xnbo_notify(void *arg, mac_notify_type_t type)
-{
- xnb_t *xnbp = arg;
- xnbo_t *xnbop = xnbp->xnb_flavour_data;
-
- switch (type) {
- case MAC_NOTE_PROMISC:
- xnbop->o_mtx = mac_tx_get(xnbop->o_mh);
- break;
- }
-}
-
static boolean_t
xnbo_open_mac(xnb_t *xnbp, char *mac)
{
@@ -242,8 +233,10 @@ xnbo_open_mac(xnb_t *xnbp, char *mac)
int err, need_rx_filter, need_setphysaddr, need_promiscuous;
const mac_info_t *mi;
char *xsname;
- void (*rx_fn)(void *, mac_resource_handle_t, mblk_t *);
+ void (*rx_fn)(void *, mac_resource_handle_t, mblk_t *, boolean_t);
+ struct ether_addr ea;
uint_t max_sdu;
+ mac_diag_t diag;
xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
@@ -279,8 +272,22 @@ xnbo_open_mac(xnb_t *xnbp, char *mac)
return (B_FALSE);
}
- xnbop->o_mnh = mac_notify_add(xnbop->o_mh, xnbo_notify, xnbp);
- ASSERT(xnbop->o_mnh != NULL);
+ if ((err = mac_client_open(xnbop->o_mh, &xnbop->o_mch, NULL,
+ MAC_OPEN_FLAGS_USE_DATALINK_NAME)) != 0) {
+ cmn_err(CE_WARN, "xnbo_open_mac: "
+ "error (%d) opening mac client", err);
+ xnbo_close_mac(xnbop);
+ return (B_FALSE);
+ }
+
+ err = mac_unicast_primary_add(xnbop->o_mch, &xnbop->o_mah, &diag);
+ if (err != 0) {
+ cmn_err(CE_WARN, "xnbo_open_mac: "
+ "failed to get the primary MAC address of "
+ "%s: %d", mac, err);
+ xnbo_close_mac(xnbop);
+ return (B_FALSE);
+ }
/*
* Should the receive path filter packets from the downstream
@@ -294,11 +301,27 @@ xnbo_open_mac(xnb_t *xnbp, char *mac)
else
rx_fn = xnbo_from_mac;
- xnbop->o_mrh = mac_rx_add(xnbop->o_mh, rx_fn, xnbp);
- ASSERT(xnbop->o_mrh != NULL);
-
- xnbop->o_mtx = mac_tx_get(xnbop->o_mh);
- ASSERT(xnbop->o_mtx != NULL);
+ /*
+ * Should we set the underlying NIC into promiscuous mode? The
+ * default is "no".
+ */
+ if (xenbus_scanf(XBT_NULL, xsname,
+ "SUNW-need-promiscuous", "%d", &need_promiscuous) != 0)
+ need_promiscuous = 0;
+ if (need_promiscuous == 0) {
+ mac_rx_set(xnbop->o_mch, rx_fn, xnbp);
+ } else {
+ err = mac_promisc_add(xnbop->o_mch, MAC_CLIENT_PROMISC_ALL,
+ rx_fn, xnbp, &xnbop->o_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP);
+ if (err != 0) {
+ cmn_err(CE_WARN, "xnbo_open_mac: "
+ "cannot enable promiscuous mode of %s: %d",
+ mac, err);
+ xnbo_close_mac(xnbop);
+ return (B_FALSE);
+ }
+ xnbop->o_promiscuous = B_TRUE;
+ }
if (!mac_capab_get(xnbop->o_mh, MAC_CAPAB_HCKSUM,
&xnbop->o_hcksum_capab))
@@ -312,45 +335,17 @@ xnbo_open_mac(xnb_t *xnbp, char *mac)
"SUNW-need-set-physaddr", "%d", &need_setphysaddr) != 0)
need_setphysaddr = 0;
if (need_setphysaddr > 0) {
- struct ether_addr ea;
-
- err = mac_unicst_set(xnbop->o_mh, xnbp->xnb_mac_addr);
+ err = mac_unicast_primary_set(xnbop->o_mh, xnbp->xnb_mac_addr);
/* Warn, but continue on. */
if (err != 0) {
bcopy(xnbp->xnb_mac_addr, ea.ether_addr_octet,
ETHERADDRL);
cmn_err(CE_WARN, "xnbo_open_mac: "
"cannot set MAC address of %s to "
- "%s: %d", mac, ether_sprintf(&ea),
- err);
- }
- }
-
- /*
- * Should we set the underlying NIC into promiscuous mode? The
- * default is "no".
- */
- if (xenbus_scanf(XBT_NULL, xsname,
- "SUNW-need-promiscuous", "%d", &need_promiscuous) != 0)
- need_promiscuous = 0;
- if (need_promiscuous > 0) {
- err = mac_promisc_set(xnbop->o_mh, B_TRUE, MAC_DEVPROMISC);
- if (err != 0) {
- cmn_err(CE_WARN, "xnbo_open_mac: "
- "cannot enable promiscuous mode of %s: %d",
- mac, err);
- xnbo_close_mac(xnbop);
- return (B_FALSE);
+ "%s: %d", mac, ether_sprintf(&ea), err);
}
- xnbop->o_promiscuous = B_TRUE;
}
- if ((err = mac_start(xnbop->o_mh)) != 0) {
- cmn_err(CE_WARN, "xnbo_open_mac: "
- "cannot start mac device (%d)", err);
- xnbo_close_mac(xnbop);
- return (B_FALSE);
- }
xnbop->o_running = B_TRUE;
return (B_TRUE);
@@ -385,26 +380,24 @@ xnbo_close_mac(xnbo_t *xnbop)
return;
if (xnbop->o_running) {
- mac_stop(xnbop->o_mh);
xnbop->o_running = B_FALSE;
}
if (xnbop->o_promiscuous) {
- (void) mac_promisc_set(xnbop->o_mh, B_FALSE,
- MAC_DEVPROMISC);
+ (void) mac_promisc_remove(xnbop->o_mphp);
xnbop->o_promiscuous = B_FALSE;
+ } else {
+ mac_rx_clear(xnbop->o_mch);
}
- xnbop->o_mtx = NULL;
-
- if (xnbop->o_mrh != NULL) {
- mac_rx_remove(xnbop->o_mh, xnbop->o_mrh, B_TRUE);
- xnbop->o_mrh = NULL;
+ if (xnbop->o_mah != NULL) {
+ (void) mac_unicast_remove(xnbop->o_mch, xnbop->o_mah);
+ xnbop->o_mah = NULL;
}
- if (xnbop->o_mnh != NULL) {
- mac_notify_remove(xnbop->o_mh, xnbop->o_mnh);
- xnbop->o_mnh = NULL;
+ if (xnbop->o_mch != NULL) {
+ mac_client_close(xnbop->o_mch, 0);
+ xnbop->o_mch = NULL;
}
mac_close(xnbop->o_mh);
@@ -453,8 +446,9 @@ xnbo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
xnbop = kmem_zalloc(sizeof (*xnbop), KM_SLEEP);
xnbop->o_mh = NULL;
- xnbop->o_mrh = NULL;
- xnbop->o_mtx = NULL;
+ xnbop->o_mch = NULL;
+ xnbop->o_mah = NULL;
+ xnbop->o_mphp = NULL;
xnbop->o_running = B_FALSE;
xnbop->o_hcksum_capab = 0;
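
xnbo now consumes the NIC through a MAC client rather than the removed mac_rx_add()/mac_tx_get()/mac_notify_add() interfaces; note that the explicit mac_start()/mac_stop() calls also disappear from this path. The bring-up sequence, distilled from the xnbo_open_mac() changes above with the xnb-specific pieces omitted:

    static boolean_t
    open_mac_client_sketch(mac_handle_t mh, mac_rx_t rx_fn, void *arg,
        mac_client_handle_t *mchp, mac_unicast_handle_t *muhp)
    {
            mac_diag_t diag;

            if (mac_client_open(mh, mchp, NULL,
                MAC_OPEN_FLAGS_USE_DATALINK_NAME) != 0)
                    return (B_FALSE);
            if (mac_unicast_primary_add(*mchp, muhp, &diag) != 0) {
                    mac_client_close(*mchp, 0);
                    return (B_FALSE);
            }
            mac_rx_set(*mchp, rx_fn, arg);  /* or mac_promisc_add() to snoop */
            return (B_TRUE);
    }

Teardown mirrors it: mac_rx_clear() (or mac_promisc_remove()), then mac_unicast_remove() and mac_client_close().
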
diff --git a/usr/src/uts/common/xen/io/xnbu.c b/usr/src/uts/common/xen/io/xnbu.c
index f5c0ba9809..80e2378608 100644
--- a/usr/src/uts/common/xen/io/xnbu.c
+++ b/usr/src/uts/common/xen/io/xnbu.c
@@ -40,7 +40,7 @@
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/pattr.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <xen/sys/xendev.h>
@@ -51,19 +51,16 @@ static int xnbu_m_set_mac_addr(void *, const uint8_t *);
static int xnbu_m_set_multicast(void *, boolean_t, const uint8_t *);
static int xnbu_m_set_promiscuous(void *, boolean_t);
static int xnbu_m_stat(void *, uint_t, uint64_t *);
-static void xnbu_m_blank(void *, time_t, uint_t);
-static void xnbu_m_resources(void *);
static boolean_t xnbu_m_getcapab(void *, mac_capab_t, void *);
static mblk_t *xnbu_m_send(void *, mblk_t *);
typedef struct xnbu {
mac_handle_t u_mh;
- mac_resource_handle_t u_mrh;
boolean_t u_need_sched;
} xnbu_t;
static mac_callbacks_t xnb_callbacks = {
- MC_RESOURCES | MC_GETCAPAB,
+ MC_GETCAPAB,
xnbu_m_stat,
xnbu_m_start,
xnbu_m_stop,
@@ -71,7 +68,6 @@ static mac_callbacks_t xnb_callbacks = {
xnbu_m_set_multicast,
xnbu_m_set_mac_addr,
xnbu_m_send,
- xnbu_m_resources,
NULL,
xnbu_m_getcapab
};
@@ -84,7 +80,7 @@ xnbu_to_host(xnb_t *xnbp, mblk_t *mp)
ASSERT(mp != NULL);
- mac_rx(xnbup->u_mh, xnbup->u_mrh, mp);
+ mac_rx(xnbup->u_mh, NULL, mp);
mutex_enter(&xnbp->xnb_rx_lock);
@@ -328,32 +324,6 @@ xnbu_m_stat(void *arg, uint_t stat, uint64_t *val)
return (0);
}
-/*ARGSUSED*/
-static void
-xnbu_m_blank(void *arg, time_t ticks, uint_t count)
-{
- /*
- * XXPV dme: blanking is not currently implemented.
- */
-}
-
-static void
-xnbu_m_resources(void *arg)
-{
- xnb_t *xnbp = arg;
- xnbu_t *xnbup = xnbp->xnb_flavour_data;
- mac_rx_fifo_t mrf;
-
- mrf.mrf_type = MAC_RX_FIFO;
- mrf.mrf_blank = xnbu_m_blank;
- mrf.mrf_arg = (void *)xnbp;
- mrf.mrf_normal_blank_time = 128; /* XXPV dme: see xnbu_m_blank() */
- mrf.mrf_normal_pkt_count = 8; /* XXPV dme: see xnbu_m_blank() */
-
- xnbup->u_mrh = mac_resource_add(xnbup->u_mh,
- (mac_resource_t *)&mrf);
-}
-
static boolean_t
xnbu_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
@@ -369,11 +339,6 @@ xnbu_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
*capab = 0;
break;
}
-
- case MAC_CAPAB_POLL:
- /* Just return B_TRUE. */
- break;
-
default:
return (B_FALSE);
}
diff --git a/usr/src/uts/common/xen/io/xnf.c b/usr/src/uts/common/xen/io/xnf.c
index c14c651c61..0813d6cbe1 100644
--- a/usr/src/uts/common/xen/io/xnf.c
+++ b/usr/src/uts/common/xen/io/xnf.c
@@ -80,7 +80,7 @@
#include <inet/ip_impl.h>
#include <sys/gld.h>
#include <sys/modctl.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/bootinfo.h>
#include <sys/mach_mmu.h>
@@ -148,8 +148,6 @@ static int xnf_set_promiscuous(void *, boolean_t);
static mblk_t *xnf_send(void *, mblk_t *);
static uint_t xnf_intr(caddr_t);
static int xnf_stat(void *, uint_t, uint64_t *);
-static void xnf_blank(void *, time_t, uint_t);
-static void xnf_resources(void *);
static void xnf_ioctl(void *, queue_t *, mblk_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
@@ -178,7 +176,7 @@ static boolean_t xnf_kstat_init(xnf_t *xnfp);
* XXPV dme: remove MC_IOCTL?
*/
static mac_callbacks_t xnf_callbacks = {
- MC_RESOURCES | MC_IOCTL | MC_GETCAPAB,
+ MC_IOCTL | MC_GETCAPAB,
xnf_stat,
xnf_start,
xnf_stop,
@@ -186,7 +184,6 @@ static mac_callbacks_t xnf_callbacks = {
xnf_set_multicast,
xnf_set_mac_addr,
xnf_send,
- xnf_resources,
xnf_ioctl,
xnf_getcapab
};
@@ -1436,7 +1433,7 @@ xnf_intr(caddr_t arg)
mp = xnf_process_recv(xnfp);
if (mp != NULL)
- mac_rx(xnfp->xnf_mh, xnfp->xnf_rx_handle, mp);
+ mac_rx(xnfp->xnf_mh, NULL, mp);
}
xnfp->xnf_stat_interrupts++;
@@ -2518,39 +2515,6 @@ xnf_stat(void *arg, uint_t stat, uint64_t *val)
/*ARGSUSED*/
static void
-xnf_blank(void *arg, time_t ticks, uint_t count)
-{
- /*
- * XXPV dme: blanking is not currently implemented.
- *
- * It's not obvious how to use the 'ticks' argument here.
- *
- * 'Count' might be used as an indicator of how to set
- * rsp_event when posting receive buffers to the rx_ring. It
- * would replace the code at the tail of xnf_process_recv()
- * that simply indicates that the next completed packet should
- * cause an interrupt.
- */
-}
-
-static void
-xnf_resources(void *arg)
-{
- xnf_t *xnfp = arg;
- mac_rx_fifo_t mrf;
-
- mrf.mrf_type = MAC_RX_FIFO;
- mrf.mrf_blank = xnf_blank;
- mrf.mrf_arg = (void *)xnfp;
- mrf.mrf_normal_blank_time = 128; /* XXPV dme: see xnf_blank() */
- mrf.mrf_normal_pkt_count = 8; /* XXPV dme: see xnf_blank() */
-
- xnfp->xnf_rx_handle = mac_resource_add(xnfp->xnf_mh,
- (mac_resource_t *)&mrf);
-}
-
-/*ARGSUSED*/
-static void
xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
miocnak(q, mp, 0, EINVAL);
@@ -2588,11 +2552,6 @@ xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
*capab = 0;
break;
}
-
- case MAC_CAPAB_POLL:
- /* Just return B_TRUE. */
- break;
-
default:
return (B_FALSE);
}
diff --git a/usr/src/uts/common/xen/io/xnf.h b/usr/src/uts/common/xen/io/xnf.h
index d8edf89f86..9b0cc4c357 100644
--- a/usr/src/uts/common/xen/io/xnf.h
+++ b/usr/src/uts/common/xen/io/xnf.h
@@ -135,7 +135,6 @@ typedef struct xnf {
struct tx_pktinfo xnf_tx_pkt_info[NET_TX_RING_SIZE];
struct xnf_buffer_desc *xnf_rxpkt_bufptr[XNF_MAX_RXDESCS];
- mac_resource_handle_t xnf_rx_handle;
ddi_iblock_cookie_t xnf_icookie;
kmutex_t xnf_tx_buf_mutex;
kmutex_t xnf_rx_buf_mutex;
diff --git a/usr/src/uts/i86xpv/xnb/Makefile b/usr/src/uts/i86xpv/xnb/Makefile
index 4fa08e3f70..dc7503a46e 100644
--- a/usr/src/uts/i86xpv/xnb/Makefile
+++ b/usr/src/uts/i86xpv/xnb/Makefile
@@ -20,10 +20,9 @@
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
#
# This makefile drives the production of the xnb
# network driver support module.
@@ -59,7 +58,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
# Module depends on VNIC.
#
-LDFLAGS += -dy -N drv/vnic
+LDFLAGS += -dy -N drv/vnic -N misc/mac
#
# use Solaris specific code in xen public header files
diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s
index ee03e0967f..e29afc6c29 100644
--- a/usr/src/uts/intel/ia32/ml/modstubs.s
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s
@@ -1245,6 +1245,8 @@ fcnname/**/_info: \
STUB(dld, dld_init_ops, nomod_void);
STUB(dld, dld_fini_ops, nomod_void);
STUB(dld, dld_autopush, nomod_minus_one);
+ STUB(dld, dld_ioc_register, nomod_einval);
+ STUB(dld, dld_ioc_unregister, nomod_void);
END_MODULE(dld);
#endif
@@ -1255,12 +1257,15 @@ fcnname/**/_info: \
*/
#ifndef DLS_MODULE
MODULE(dls,misc);
- STUB(dls, dls_devnet_vid, nomod_zero);
STUB(dls, dls_devnet_mac, nomod_zero);
STUB(dls, dls_devnet_hold_tmp, nomod_einval);
STUB(dls, dls_devnet_rele_tmp, nomod_void);
+ STUB(dls, dls_devnet_hold_link, nomod_einval);
+ STUB(dls, dls_devnet_rele_link, nomod_void);
STUB(dls, dls_devnet_prop_task_wait, nomod_void);
STUB(dls, dls_mgmt_get_linkid, nomod_einval);
+ STUB(dls, dls_devnet_macname2linkid, nomod_einval);
+ STUB(dls, dls_mgmt_get_linkinfo, nomod_einval);
END_MODULE(dls);
#endif
diff --git a/usr/src/uts/intel/io/amd8111s/amd8111s_main.c b/usr/src/uts/intel/io/amd8111s/amd8111s_main.c
index 6587531959..1664ee7543 100644
--- a/usr/src/uts/intel/io/amd8111s/amd8111s_main.c
+++ b/usr/src/uts/intel/io/amd8111s/amd8111s_main.c
@@ -76,7 +76,6 @@ static int amd8111s_detach(dev_info_t *, ddi_detach_cmd_t);
static int amd8111s_m_unicst(void *, const uint8_t *);
static int amd8111s_m_promisc(void *, boolean_t);
static int amd8111s_m_stat(void *, uint_t, uint64_t *);
-static void amd8111s_m_resources(void *arg);
static void amd8111s_m_ioctl(void *, queue_t *, mblk_t *);
static int amd8111s_m_multicst(void *, boolean_t, const uint8_t *addr);
static int amd8111s_m_start(void *);
@@ -186,11 +185,9 @@ static ddi_device_acc_attr_t pcn_acc_attr = {
DDI_STRICTORDER_ACC
};
-#define AMD8111S_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL)
-
static mac_callbacks_t amd8111s_m_callbacks = {
- AMD8111S_M_CALLBACK_FLAGS,
+ MC_IOCTL,
amd8111s_m_stat,
amd8111s_m_start,
amd8111s_m_stop,
@@ -198,7 +195,6 @@ static mac_callbacks_t amd8111s_m_callbacks = {
amd8111s_m_multicst,
amd8111s_m_unicst,
amd8111s_m_tx,
- amd8111s_m_resources,
amd8111s_m_ioctl
};
@@ -248,29 +244,6 @@ _fini()
return (status);
}
-/* Adjust Interrupt Coalescing Register to coalesce interrupts */
-static void
-amd8111s_m_blank(void *arg, time_t ticks, uint32_t count)
-{
- _NOTE(ARGUNUSED(arg, ticks, count));
-}
-
-static void
-amd8111s_m_resources(void *arg)
-{
- struct LayerPointers *adapter = arg;
- mac_rx_fifo_t mrf;
-
- mrf.mrf_type = MAC_RX_FIFO;
- mrf.mrf_blank = amd8111s_m_blank;
- mrf.mrf_arg = (void *)adapter;
- mrf.mrf_normal_blank_time = 128;
- mrf.mrf_normal_pkt_count = 8;
-
- adapter->pOdl->mrh = mac_resource_add(adapter->pOdl->mh,
- (mac_resource_t *)&mrf);
-}
-
/*
* Loopback Support
*/
@@ -665,7 +638,7 @@ amd8111s_receive(struct LayerPointers *pLayerPointers)
}
if (ret_mp) {
- mac_rx(pOdl->mh, pOdl->mrh, ret_mp);
+ mac_rx(pOdl->mh, NULL, ret_mp);
}
(void) ddi_dma_sync(pOdl->rx_desc_dma_handle, 0, 0,
diff --git a/usr/src/uts/intel/io/amd8111s/amd8111s_main.h b/usr/src/uts/intel/io/amd8111s/amd8111s_main.h
index 922f5150c1..00f430273f 100755..100644
--- a/usr/src/uts/intel/io/amd8111s/amd8111s_main.h
+++ b/usr/src/uts/intel/io/amd8111s/amd8111s_main.h
@@ -1,13 +1,11 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef AMD8111S_MAIN_H
#define AMD8111S_MAIN_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Copyright (c) 2001-2006 Advanced Micro Devices, Inc. All rights reserved.
*
@@ -55,10 +53,6 @@
* nationals of countries subject to national security controls.
*/
-
-#pragma ident "@(#)$RCSfile: odl.h,v $ $Revision: 1.1 $ " \
-"$Date: 2004/04/22 15:22:52 $ AMD"
-
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/kmem.h>
@@ -79,7 +73,7 @@
#include <sys/ethernet.h>
#include <sys/dlpi.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/netlb.h>
#include "amd8111s_hw.h"
@@ -278,7 +272,6 @@ struct odl {
dev_info_t *devinfo;
mac_handle_t mh; /* mac module handle */
- mac_resource_handle_t mrh;
struct amd8111s_statistics statistics;
diff --git a/usr/src/uts/intel/ip/Makefile b/usr/src/uts/intel/ip/Makefile
index c2e44f9934..6cd3d4ac5a 100644
--- a/usr/src/uts/intel/ip/Makefile
+++ b/usr/src/uts/intel/ip/Makefile
@@ -19,10 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
#
# This makefile drives the production of the ip driver
# kernel module.
diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64
index 5854497325..f4bcb8ab0c 100644
--- a/usr/src/uts/intel/ip/ip.global-objs.debug64
+++ b/usr/src/uts/intel/ip/ip.global-objs.debug64
@@ -44,7 +44,6 @@ cl_sctp_disconnect
cl_sctp_listen
cl_sctp_unlisten
conn_drain_nthreads
-crctab
default_ip6_asp_table
do_tcp_direct_sockfs
do_tcp_fusion
@@ -105,7 +104,6 @@ ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
-ip_input_proc
ip_ioctl_ftbl
ip_ire_cleanup_cnt
ip_ire_cpu_ratio
@@ -133,15 +131,12 @@ ip_poll_normal_ms
ip_poll_normal_ticks
ip_rput_pullups
ip_six_byte_all_ones
-ip_soft_rings_cnt
-ip_squeue_bind
ip_squeue_create_callback
ip_squeue_enter
ip_squeue_enter_unbound
ip_squeue_fanout
-ip_squeue_profile
+ip_squeue_flag
ip_squeue_worker_wait
-ip_squeues_per_cpu
ip_thread_data
ip_thread_list
ip_thread_rwlock
@@ -221,10 +216,6 @@ req_arr
rn_mkfreelist
rn_ones
rn_zeros
-rr_max_blank_ratio
-rr_max_pkt_cnt_ratio
-rr_min_blank_ratio
-rr_min_pkt_cnt_ratio
rt_entry_cache
rts_conn_cache
rts_g_t_info_ack
@@ -262,19 +253,12 @@ sin_null
skip_sctp_cksum
sqset_global_list
sqset_global_size
+sqset_lock
squeue_cache
-squeue_intrdrain_ms
-squeue_intrdrain_ns
-squeue_kstat
-squeue_kstat_lock
-squeue_profile
-squeue_worker_poll_min
-squeue_workerdrain_ms
-squeue_workerdrain_ns
+squeue_drain_ms
+squeue_drain_ns
squeue_workerwait_ms
squeue_workerwait_tick
-squeue_writerdrain_ms
-squeue_writerdrain_ns
tcp_acceptor_rinit
tcp_acceptor_winit
tcp_conn_cache
@@ -307,10 +291,8 @@ tcp_rinitv4
tcp_rinitv6
tcp_sack_info_cache
tcp_sock_winit
-tcp_squeue_close
-tcp_squeue_close_proc
+tcp_squeue_flag
tcp_squeue_wput
-tcp_squeue_wput_proc
tcp_static_maxpsz
tcp_taskq
tcp_timercache
@@ -318,6 +300,7 @@ tcp_tx_pull_len
tcp_valid_levels_arr
tcp_winfo
tcp_winit
+tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
tsol_strict_error
diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64
index 065904b585..3866432363 100644
--- a/usr/src/uts/intel/ip/ip.global-objs.obj64
+++ b/usr/src/uts/intel/ip/ip.global-objs.obj64
@@ -44,7 +44,6 @@ cl_sctp_disconnect
cl_sctp_listen
cl_sctp_unlisten
conn_drain_nthreads
-crctab
default_ip6_asp_table
do_tcp_direct_sockfs
do_tcp_fusion
@@ -105,7 +104,6 @@ ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
-ip_input_proc
ip_ioctl_ftbl
ip_ire_cleanup_cnt
ip_ire_cpu_ratio
@@ -133,15 +131,12 @@ ip_poll_normal_ms
ip_poll_normal_ticks
ip_rput_pullups
ip_six_byte_all_ones
-ip_soft_rings_cnt
-ip_squeue_bind
ip_squeue_create_callback
ip_squeue_enter
ip_squeue_enter_unbound
ip_squeue_fanout
-ip_squeue_profile
+ip_squeue_flag
ip_squeue_worker_wait
-ip_squeues_per_cpu
ip_thread_data
ip_thread_list
ip_thread_rwlock
@@ -217,10 +212,6 @@ req_arr
rn_mkfreelist
rn_ones
rn_zeros
-rr_max_blank_ratio
-rr_max_pkt_cnt_ratio
-rr_min_blank_ratio
-rr_min_pkt_cnt_ratio
rt_entry_cache
rts_conn_cache
rts_g_t_info_ack
@@ -254,16 +245,12 @@ sin6_null
sin_null
sqset_global_list
sqset_global_size
+sqset_lock
squeue_cache
-squeue_intrdrain_ms
-squeue_intrdrain_ns
-squeue_worker_poll_min
-squeue_workerdrain_ms
-squeue_workerdrain_ns
+squeue_drain_ms
+squeue_drain_ns
squeue_workerwait_ms
squeue_workerwait_tick
-squeue_writerdrain_ms
-squeue_writerdrain_ns
tcp_acceptor_rinit
tcp_acceptor_winit
tcp_conn_cache
@@ -296,10 +283,8 @@ tcp_rinitv4
tcp_rinitv6
tcp_sack_info_cache
tcp_sock_winit
-tcp_squeue_close
-tcp_squeue_close_proc
+tcp_squeue_flag
tcp_squeue_wput
-tcp_squeue_wput_proc
tcp_static_maxpsz
tcp_taskq
tcp_timercache
@@ -307,6 +292,7 @@ tcp_tx_pull_len
tcp_valid_levels_arr
tcp_winfo
tcp_winit
+tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
tsol_strict_error
diff --git a/usr/src/uts/intel/mac/Makefile b/usr/src/uts/intel/mac/Makefile
index 12bd648ee0..870b260f75 100644
--- a/usr/src/uts/intel/mac/Makefile
+++ b/usr/src/uts/intel/mac/Makefile
@@ -22,13 +22,10 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
-#
#
# This makefile drives the production of the mac driver
# kernel module.
#
-
#
# Path to the base of the uts directory tree (usually /usr/src/uts).
#
@@ -53,7 +50,6 @@ include $(UTSBASE)/intel/Makefile.intel
ALL_TARGET = $(BINARY)
LINT_TARGET = $(MODULE).lint
INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
-LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
#
# Overrides.
@@ -61,6 +57,9 @@ LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
CFLAGS += $(CCVERBOSE)
LDFLAGS += -dy
+LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW
+LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
+
#
# Default build targets.
#
diff --git a/usr/src/uts/intel/vnic/Makefile b/usr/src/uts/intel/vnic/Makefile
index 748d61a8b0..83a4c749c2 100644
--- a/usr/src/uts/intel/vnic/Makefile
+++ b/usr/src/uts/intel/vnic/Makefile
@@ -22,9 +22,6 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
-#
-
#
# Path to the base of the uts directory tree (usually /usr/src/uts).
#
@@ -55,7 +52,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
# Overrides
#
CFLAGS += $(CCVERBOSE)
-LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Ndrv/ip -Nmisc/dls
+LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls
#
# Default build targets.
diff --git a/usr/src/uts/intel/xge/Makefile b/usr/src/uts/intel/xge/Makefile
index 6689f7a758..8541c1b052 100644
--- a/usr/src/uts/intel/xge/Makefile
+++ b/usr/src/uts/intel/xge/Makefile
@@ -20,11 +20,9 @@
#
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the Neterion Xframe
# 10G Ethernet (XGE) driver module in x86 systems
#
diff --git a/usr/src/uts/sparc/ip/Makefile b/usr/src/uts/sparc/ip/Makefile
index c330f273f9..515f079865 100644
--- a/usr/src/uts/sparc/ip/Makefile
+++ b/usr/src/uts/sparc/ip/Makefile
@@ -19,17 +19,15 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
#
# This makefile drives the production of the ip driver
# kernel module.
#
# sparc architecture dependent
#
-
#
# Path to the base of the uts directory tree (usually /usr/src/uts).
#
diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64
index 5854497325..f4bcb8ab0c 100644
--- a/usr/src/uts/sparc/ip/ip.global-objs.debug64
+++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64
@@ -44,7 +44,6 @@ cl_sctp_disconnect
cl_sctp_listen
cl_sctp_unlisten
conn_drain_nthreads
-crctab
default_ip6_asp_table
do_tcp_direct_sockfs
do_tcp_fusion
@@ -105,7 +104,6 @@ ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
-ip_input_proc
ip_ioctl_ftbl
ip_ire_cleanup_cnt
ip_ire_cpu_ratio
@@ -133,15 +131,12 @@ ip_poll_normal_ms
ip_poll_normal_ticks
ip_rput_pullups
ip_six_byte_all_ones
-ip_soft_rings_cnt
-ip_squeue_bind
ip_squeue_create_callback
ip_squeue_enter
ip_squeue_enter_unbound
ip_squeue_fanout
-ip_squeue_profile
+ip_squeue_flag
ip_squeue_worker_wait
-ip_squeues_per_cpu
ip_thread_data
ip_thread_list
ip_thread_rwlock
@@ -221,10 +216,6 @@ req_arr
rn_mkfreelist
rn_ones
rn_zeros
-rr_max_blank_ratio
-rr_max_pkt_cnt_ratio
-rr_min_blank_ratio
-rr_min_pkt_cnt_ratio
rt_entry_cache
rts_conn_cache
rts_g_t_info_ack
@@ -262,19 +253,12 @@ sin_null
skip_sctp_cksum
sqset_global_list
sqset_global_size
+sqset_lock
squeue_cache
-squeue_intrdrain_ms
-squeue_intrdrain_ns
-squeue_kstat
-squeue_kstat_lock
-squeue_profile
-squeue_worker_poll_min
-squeue_workerdrain_ms
-squeue_workerdrain_ns
+squeue_drain_ms
+squeue_drain_ns
squeue_workerwait_ms
squeue_workerwait_tick
-squeue_writerdrain_ms
-squeue_writerdrain_ns
tcp_acceptor_rinit
tcp_acceptor_winit
tcp_conn_cache
@@ -307,10 +291,8 @@ tcp_rinitv4
tcp_rinitv6
tcp_sack_info_cache
tcp_sock_winit
-tcp_squeue_close
-tcp_squeue_close_proc
+tcp_squeue_flag
tcp_squeue_wput
-tcp_squeue_wput_proc
tcp_static_maxpsz
tcp_taskq
tcp_timercache
@@ -318,6 +300,7 @@ tcp_tx_pull_len
tcp_valid_levels_arr
tcp_winfo
tcp_winit
+tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
tsol_strict_error
diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64
index 065904b585..3866432363 100644
--- a/usr/src/uts/sparc/ip/ip.global-objs.obj64
+++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64
@@ -44,7 +44,6 @@ cl_sctp_disconnect
cl_sctp_listen
cl_sctp_unlisten
conn_drain_nthreads
-crctab
default_ip6_asp_table
do_tcp_direct_sockfs
do_tcp_fusion
@@ -105,7 +104,6 @@ ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
-ip_input_proc
ip_ioctl_ftbl
ip_ire_cleanup_cnt
ip_ire_cpu_ratio
@@ -133,15 +131,12 @@ ip_poll_normal_ms
ip_poll_normal_ticks
ip_rput_pullups
ip_six_byte_all_ones
-ip_soft_rings_cnt
-ip_squeue_bind
ip_squeue_create_callback
ip_squeue_enter
ip_squeue_enter_unbound
ip_squeue_fanout
-ip_squeue_profile
+ip_squeue_flag
ip_squeue_worker_wait
-ip_squeues_per_cpu
ip_thread_data
ip_thread_list
ip_thread_rwlock
@@ -217,10 +212,6 @@ req_arr
rn_mkfreelist
rn_ones
rn_zeros
-rr_max_blank_ratio
-rr_max_pkt_cnt_ratio
-rr_min_blank_ratio
-rr_min_pkt_cnt_ratio
rt_entry_cache
rts_conn_cache
rts_g_t_info_ack
@@ -254,16 +245,12 @@ sin6_null
sin_null
sqset_global_list
sqset_global_size
+sqset_lock
squeue_cache
-squeue_intrdrain_ms
-squeue_intrdrain_ns
-squeue_worker_poll_min
-squeue_workerdrain_ms
-squeue_workerdrain_ns
+squeue_drain_ms
+squeue_drain_ns
squeue_workerwait_ms
squeue_workerwait_tick
-squeue_writerdrain_ms
-squeue_writerdrain_ns
tcp_acceptor_rinit
tcp_acceptor_winit
tcp_conn_cache
@@ -296,10 +283,8 @@ tcp_rinitv4
tcp_rinitv6
tcp_sack_info_cache
tcp_sock_winit
-tcp_squeue_close
-tcp_squeue_close_proc
+tcp_squeue_flag
tcp_squeue_wput
-tcp_squeue_wput_proc
tcp_static_maxpsz
tcp_taskq
tcp_timercache
@@ -307,6 +292,7 @@ tcp_tx_pull_len
tcp_valid_levels_arr
tcp_winfo
tcp_winit
+tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
tsol_strict_error
diff --git a/usr/src/uts/sparc/mac/Makefile b/usr/src/uts/sparc/mac/Makefile
index d343e0bc74..5ef314a2ef 100644
--- a/usr/src/uts/sparc/mac/Makefile
+++ b/usr/src/uts/sparc/mac/Makefile
@@ -22,14 +22,12 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
#
# This makefile drives the production of the mac driver
# kernel module.
#
# sparc architecture dependent
#
-
#
# Path to the base of the uts directory tree (usually /usr/src/uts).
#
@@ -54,7 +52,6 @@ include $(UTSBASE)/sparc/Makefile.sparc
ALL_TARGET = $(BINARY)
LINT_TARGET = $(MODULE).lint
INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
-LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
#
# Overrides.
@@ -64,6 +61,9 @@ $(RELEASE_BUILD)CFLAGS += -xinline=auto -xcrossfile
$(RELEASE_BUILD)COPTIMIZE = -xO5
LDFLAGS += -dy
+LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW
+LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
+
#
# Default build targets.
#
diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s
index e45cd91325..e315c9857c 100644
--- a/usr/src/uts/sparc/ml/modstubs.s
+++ b/usr/src/uts/sparc/ml/modstubs.s
@@ -1199,6 +1199,8 @@ stubs_base:
MODULE(dld,drv);
STUB(dld, dld_init_ops, nomod_void);
STUB(dld, dld_fini_ops, nomod_void);
+ STUB(dld, dld_ioc_register, nomod_einval);
+ STUB(dld, dld_ioc_unregister, nomod_void);
STUB(dld, dld_autopush, nomod_minus_one);
END_MODULE(dld);
#endif
@@ -1210,12 +1212,15 @@ stubs_base:
*/
#ifndef DLS_MODULE
MODULE(dls,misc);
- STUB(dls, dls_devnet_vid, nomod_zero);
STUB(dls, dls_devnet_mac, nomod_zero);
STUB(dls, dls_devnet_hold_tmp, nomod_einval);
STUB(dls, dls_devnet_rele_tmp, nomod_void);
+ STUB(dls, dls_devnet_hold_link, nomod_einval);
+ STUB(dls, dls_devnet_rele_link, nomod_void);
STUB(dls, dls_devnet_prop_task_wait, nomod_void);
STUB(dls, dls_mgmt_get_linkid, nomod_einval);
+ STUB(dls, dls_devnet_macname2linkid, nomod_einval);
+ STUB(dls, dls_mgmt_get_linkinfo, nomod_einval);
END_MODULE(dls);
#endif
diff --git a/usr/src/uts/sparc/vnic/Makefile b/usr/src/uts/sparc/vnic/Makefile
index f3389cb97a..41052c901d 100644
--- a/usr/src/uts/sparc/vnic/Makefile
+++ b/usr/src/uts/sparc/vnic/Makefile
@@ -22,9 +22,6 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
-#
-
#
# Path to the base of the uts directory tree (usually /usr/src/uts).
#
@@ -55,7 +52,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
# Overrides
#
CFLAGS += $(CCVERBOSE)
-LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Ndrv/ip -Nmisc/dls
+LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls
#
# Default build targets.
diff --git a/usr/src/uts/sparc/xge/Makefile b/usr/src/uts/sparc/xge/Makefile
index 2d66030c07..f30c4612e3 100644
--- a/usr/src/uts/sparc/xge/Makefile
+++ b/usr/src/uts/sparc/xge/Makefile
@@ -20,11 +20,9 @@
#
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the Neterion Xframe
# 10G Ethernet (XGE) driver module in SPARC systems
#
diff --git a/usr/src/uts/sun/io/eri/eri.c b/usr/src/uts/sun/io/eri/eri.c
index 0fac98abf1..7635d9553e 100644
--- a/usr/src/uts/sun/io/eri/eri.c
+++ b/usr/src/uts/sun/io/eri/eri.c
@@ -47,7 +47,7 @@
#include <sys/ethernet.h>
#include <sys/vlan.h>
#include <sys/policy.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/dlpi.h>
@@ -200,7 +200,6 @@ static mac_callbacks_t eri_m_callbacks = {
eri_m_multicst,
eri_m_unicst,
eri_m_tx,
- NULL,
eri_m_ioctl,
eri_m_getcapab
};
@@ -1293,7 +1292,6 @@ eri_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
*hcksum_txflags = HCKSUM_INET_PARTIAL;
return (B_TRUE);
}
- case MAC_CAPAB_POLL:
default:
return (B_FALSE);
}
diff --git a/usr/src/uts/sun/io/hme.c b/usr/src/uts/sun/io/hme.c
index 399d995b10..0423d1d736 100644
--- a/usr/src/uts/sun/io/hme.c
+++ b/usr/src/uts/sun/io/hme.c
@@ -44,7 +44,7 @@
#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <sys/strsubr.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/ethernet.h>
#include <sys/vlan.h>
@@ -487,7 +487,6 @@ static mac_callbacks_t hme_m_callbacks = {
hme_m_multicst,
hme_m_unicst,
hme_m_tx,
- NULL,
hme_m_ioctl,
hme_m_getcapab,
};
diff --git a/usr/src/uts/sun/io/qfe.c b/usr/src/uts/sun/io/qfe.c
index 4a98701b87..ad9bfe8fee 100644
--- a/usr/src/uts/sun/io/qfe.c
+++ b/usr/src/uts/sun/io/qfe.c
@@ -36,7 +36,7 @@
#include <sys/kmem.h>
#include <sys/modctl.h>
#include <sys/conf.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
diff --git a/usr/src/uts/sun4u/io/rmclomv.c b/usr/src/uts/sun4u/io/rmclomv.c
index 2afee7d1dd..93e236b121 100644
--- a/usr/src/uts/sun4u/io/rmclomv.c
+++ b/usr/src/uts/sun4u/io/rmclomv.c
@@ -61,7 +61,6 @@
#define CPU_SIGNATURE_DELAY_TIME 5000000 /* 5 secs, in microsecs */
extern void pmugpio_watchdog_pat();
-static clock_t timesync_interval;
extern int watchdog_activated;
static int last_watchdog_msg = 1;
@@ -118,6 +117,10 @@ static uint_t rmc_clear_watchdog_timer(void);
static void send_watchdog_msg(int msg);
static void plat_timesync(void *arg);
+static kmutex_t timesync_lock;
+static clock_t timesync_interval = 0;
+static timeout_id_t timesync_tid = 0;
+
/*
* Driver entry points
*/
@@ -310,6 +313,7 @@ _init(void)
mutex_init(&rmclomv_refresh_lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&rmclomv_cache_lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&rmclomv_state_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&timesync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&rmclomv_checkrmc_sig_cv, NULL, CV_DRIVER, NULL);
cv_init(&rmclomv_refresh_sig_cv, NULL, CV_DRIVER, NULL);
@@ -344,6 +348,7 @@ _fini(void)
return (error);
cv_destroy(&rmclomv_refresh_sig_cv);
cv_destroy(&rmclomv_checkrmc_sig_cv);
+ mutex_destroy(&timesync_lock);
mutex_destroy(&rmclomv_state_lock);
mutex_destroy(&rmclomv_cache_lock);
mutex_destroy(&rmclomv_refresh_lock);
@@ -479,8 +484,9 @@ rmclomv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
static int
rmclomv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
- int instance;
- int err;
+ timeout_id_t tid;
+ int instance;
+ int err;
switch (cmd) {
case DDI_DETACH:
@@ -502,6 +508,13 @@ rmclomv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
rmclomv_reset_cache(NULL, NULL, NULL);
ddi_remove_minor_node(dip, NULL);
+ mutex_enter(&timesync_lock);
+ tid = timesync_tid;
+ timesync_tid = 0;
+ timesync_interval = 0;
+ mutex_exit(&timesync_lock);
+ (void) untimeout(tid);
+
/* Forget the dev info */
rmclomv_dip = NULL;
rmc_comm_unregister();
@@ -3419,7 +3432,10 @@ plat_timesync(void *arg)
(void) rmc_comm_request_nowait(&request, 0);
- (void) timeout(plat_timesync, NULL, timesync_interval);
+ mutex_enter(&timesync_lock);
+ if (timesync_interval != 0)
+ timesync_tid = timeout(plat_timesync, NULL, timesync_interval);
+ mutex_exit(&timesync_lock);
}
/*
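The rmclomv change above closes a race between the self-rearming timesync timeout and driver detach: the pending timeout id is now tracked under a lock, detach cancels it, and the handler only re-arms while the interval is non-zero. A minimal standalone sketch of that pattern follows; the ts_*_example names are hypothetical stand-ins for the driver's timesync_lock, timesync_interval, and timesync_tid.

	#include <sys/ksynch.h>
	#include <sys/conf.h>
	#include <sys/ddi.h>
	#include <sys/sunddi.h>

	static kmutex_t ts_lock_example;
	static clock_t ts_interval_example;
	static timeout_id_t ts_tid_example;

	/* Periodic handler: re-arms itself only while the interval is non-zero. */
	static void
	ts_handler_example(void *arg)
	{
		/* ... periodic work goes here ... */
		mutex_enter(&ts_lock_example);
		if (ts_interval_example != 0)
			ts_tid_example = timeout(ts_handler_example, NULL,
			    ts_interval_example);
		mutex_exit(&ts_lock_example);
	}

	/*
	 * Teardown: zero the interval and capture the pending id under the
	 * lock, then call untimeout() after dropping it. Clearing the
	 * interval first guarantees a concurrently running handler cannot
	 * re-arm after the cancellation, and calling untimeout() without
	 * holding the lock avoids deadlocking against a handler that is
	 * blocked on that same lock.
	 */
	static void
	ts_teardown_example(void)
	{
		timeout_id_t tid;

		mutex_enter(&ts_lock_example);
		tid = ts_tid_example;
		ts_tid_example = 0;
		ts_interval_example = 0;
		mutex_exit(&ts_lock_example);
		if (tid != 0)
			(void) untimeout(tid);
	}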
diff --git a/usr/src/uts/sun4v/io/vnet.c b/usr/src/uts/sun4v/io/vnet.c
index 191cfba92b..64f3c278f5 100644
--- a/usr/src/uts/sun4v/io/vnet.c
+++ b/usr/src/uts/sun4v/io/vnet.c
@@ -39,7 +39,7 @@
#include <sys/ethernet.h>
#include <sys/dlpi.h>
#include <net/if.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
diff --git a/usr/src/uts/sun4v/io/vnet_gen.c b/usr/src/uts/sun4v/io/vnet_gen.c
index 2a273019b8..b6671a36ad 100644
--- a/usr/src/uts/sun4v/io/vnet_gen.c
+++ b/usr/src/uts/sun4v/io/vnet_gen.c
@@ -42,7 +42,7 @@
#include <sys/sunddi.h>
#include <sys/strsun.h>
#include <sys/note.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/ldc.h>
#include <sys/mach_descrip.h>
diff --git a/usr/src/uts/sun4v/io/vsw.c b/usr/src/uts/sun4v/io/vsw.c
index 27ad33ff66..fc3fdceeeb 100644
--- a/usr/src/uts/sun4v/io/vsw.c
+++ b/usr/src/uts/sun4v/io/vsw.c
@@ -53,12 +53,12 @@
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
@@ -78,7 +78,7 @@
static int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
static int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
static int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
-static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *);
+static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *);
/* MDEG routines */
static int vsw_mdeg_register(vsw_t *vswp);
@@ -88,7 +88,7 @@ static int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
static int vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
static int vsw_read_mdprops(vsw_t *vswp);
static void vsw_vlan_read_ids(void *arg, int type, md_t *mdp,
- mde_cookie_t node, uint16_t *pvidp, uint16_t **vidspp,
+ mde_cookie_t node, uint16_t *pvidp, vsw_vlanid_t **vidspp,
uint16_t *nvidsp, uint16_t *default_idp);
static int vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp,
md_t *mdp, mde_cookie_t *node);
@@ -99,6 +99,8 @@ static void vsw_mtu_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
static int vsw_mtu_update(vsw_t *vswp, uint32_t mtu);
static void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);
static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
+static boolean_t vsw_cmp_vids(vsw_vlanid_t *vids1,
+ vsw_vlanid_t *vids2, int nvids);
/* Mac driver related routines */
static int vsw_mac_register(vsw_t *);
@@ -132,13 +134,9 @@ static int vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex,
md_t *prev_mdp, mde_cookie_t prev_mdex);
extern int vsw_port_attach(vsw_port_t *port);
extern vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
-extern int vsw_mac_attach(vsw_t *vswp);
-extern void vsw_mac_detach(vsw_t *vswp);
extern int vsw_mac_open(vsw_t *vswp);
extern void vsw_mac_close(vsw_t *vswp);
-extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
-extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
-extern void vsw_reconfig_hw(vsw_t *);
+extern void vsw_mac_cleanup_ports(vsw_t *vswp);
extern void vsw_unset_addrs(vsw_t *vswp);
extern void vsw_setup_layer2_post_process(vsw_t *vswp);
extern void vsw_create_vlans(void *arg, int type);
@@ -150,6 +148,16 @@ extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
mblk_t **npt);
extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
extern void vsw_hio_cleanup(vsw_t *vswp);
+extern void vsw_hio_start_ports(vsw_t *vswp);
+extern void vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled);
+extern int vsw_mac_multicast_add(vsw_t *, vsw_port_t *, mcst_addr_t *, int);
+extern void vsw_mac_multicast_remove(vsw_t *, vsw_port_t *, mcst_addr_t *, int);
+extern void vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid,
+ vsw_vlanid_t *new_vids, int new_nvids);
+extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
+extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
+extern void vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans,
+ uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids);
extern void vsw_reset_ports(vsw_t *vswp);
extern void vsw_port_reset(vsw_port_t *portp);
void vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled);
@@ -223,16 +231,6 @@ boolean_t vsw_hio_enabled = B_TRUE; /* Enable/disable HybridIO */
int vsw_hio_max_cleanup_retries = 10; /* Max retries for HybridIO cleanup */
int vsw_hio_cleanup_delay = 10000; /* 10ms */
-/*
- * External tunables.
- */
-/*
- * Enable/disable thread per ring. This is a mode selection
- * that is done a vsw driver attach time.
- */
-boolean_t vsw_multi_ring_enable = B_FALSE;
-int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;
-
/* Number of transmit descriptors - must be power of 2 */
uint32_t vsw_ntxds = VSW_RING_NUM_EL;
@@ -543,11 +541,11 @@ vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
vswp->instance = instance;
ddi_set_driver_private(dip, (caddr_t)vswp);
- mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&vswp->swtmout_lock, NULL, MUTEX_DRIVER, NULL);
+ rw_init(&vswp->maccl_rwlock, NULL, RW_DRIVER, NULL);
rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
- rw_init(&vswp->mac_rwlock, NULL, RW_DRIVER, NULL);
rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
@@ -669,10 +667,9 @@ vsw_attach_fail:
if (progress & PROG_swmode) {
vsw_stop_switching_timeout(vswp);
vsw_hio_cleanup(vswp);
- WRITE_ENTER(&vswp->mac_rwlock);
- vsw_mac_detach(vswp);
+ mutex_enter(&vswp->mac_lock);
vsw_mac_close(vswp);
- RW_EXIT(&vswp->mac_rwlock);
+ mutex_exit(&vswp->mac_lock);
}
if (progress & PROG_taskq)
@@ -697,11 +694,11 @@ vsw_attach_fail:
if (progress & PROG_locks) {
rw_destroy(&vswp->plist.lockrw);
rw_destroy(&vswp->mfdbrw);
- rw_destroy(&vswp->mac_rwlock);
rw_destroy(&vswp->if_lockrw);
+ rw_destroy(&vswp->maccl_rwlock);
mutex_destroy(&vswp->swtmout_lock);
mutex_destroy(&vswp->mca_lock);
- mutex_destroy(&vswp->hw_lock);
+ mutex_destroy(&vswp->mac_lock);
}
ddi_soft_state_free(vsw_state, instance);
@@ -736,6 +733,9 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
/* Stop any pending timeout to setup switching mode. */
vsw_stop_switching_timeout(vswp);
+ /* Cleanup the interface's mac client */
+ vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV);
+
if (vswp->if_state & VSW_IF_REG) {
if (vsw_mac_unregister(vswp) != 0) {
cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
@@ -746,13 +746,8 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
vsw_mdeg_unregister(vswp);
- /* remove mac layer callback */
- WRITE_ENTER(&vswp->mac_rwlock);
- if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
- mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
- vswp->mrh = NULL;
- }
- RW_EXIT(&vswp->mac_rwlock);
+ /* cleanup HybridIO */
+ vsw_hio_cleanup(vswp);
if (vsw_detach_ports(vswp) != 0) {
cmn_err(CE_WARN, "!vsw%d: Unable to unconfigure ports",
@@ -762,24 +757,19 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
rw_destroy(&vswp->if_lockrw);
- /* cleanup HybridIO */
- vsw_hio_cleanup(vswp);
-
- mutex_destroy(&vswp->hw_lock);
+ vsw_mac_cleanup_ports(vswp);
/*
* Now that the ports have been deleted, stop and close
* the physical device.
*/
- WRITE_ENTER(&vswp->mac_rwlock);
-
- vsw_mac_detach(vswp);
+ mutex_enter(&vswp->mac_lock);
vsw_mac_close(vswp);
+ mutex_exit(&vswp->mac_lock);
- RW_EXIT(&vswp->mac_rwlock);
-
- rw_destroy(&vswp->mac_rwlock);
+ mutex_destroy(&vswp->mac_lock);
mutex_destroy(&vswp->swtmout_lock);
+ rw_destroy(&vswp->maccl_rwlock);
/*
* Destroy any free pools that may still exist.
@@ -936,15 +926,12 @@ vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
/*
* Read the 'vsw-switch-mode' property from the specified MD node.
*
- * Returns 0 on success and the number of modes found in 'found',
- * otherwise returns 1.
+ * Returns 0 on success, otherwise returns 1.
*/
static int
-vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
- uint8_t *modes, int *found)
+vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, uint8_t *mode)
{
int len = 0;
- int smode_num = 0;
char *smode = NULL;
char *curr_mode = NULL;
@@ -956,7 +943,6 @@ vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
* first item in list.
*/
len = 0;
- smode_num = 0;
if (md_get_prop_data(mdp, node, smode_propname,
(uint8_t **)(&smode), &len) != 0) {
/*
@@ -965,7 +951,6 @@ vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
*/
cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
" from the MD", vswp->instance);
- *found = 0;
return (1);
}
@@ -979,25 +964,24 @@ vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
* 'routed' - layer 3 (i.e. IP) routing, underlying HW
* in non-promiscuous mode.
*/
- while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) {
+ while (curr_mode < (smode + len)) {
D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
if (strcmp(curr_mode, "switched") == 0) {
- modes[smode_num++] = VSW_LAYER2;
+ *mode = VSW_LAYER2;
} else if (strcmp(curr_mode, "promiscuous") == 0) {
- modes[smode_num++] = VSW_LAYER2_PROMISC;
+ *mode = VSW_LAYER2 | VSW_LAYER2_PROMISC;
} else if (strcmp(curr_mode, "routed") == 0) {
- modes[smode_num++] = VSW_LAYER3;
+ *mode = VSW_LAYER3;
} else {
- DWARN(vswp, "%s: Unknown switch mode %s, "
- "setting to default 'switched' mode",
- __func__, curr_mode);
- modes[smode_num++] = VSW_LAYER2;
+ cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
+ "setting to default switched mode",
+ vswp->instance, curr_mode);
+ *mode = VSW_LAYER2;
}
curr_mode += strlen(curr_mode) + 1;
}
- *found = smode_num;
- D2(vswp, "%s: %d modes found", __func__, smode_num);
+ D2(vswp, "%s: %d mode", __func__, *mode);
D1(vswp, "%s: exit", __func__);
@@ -1082,16 +1066,16 @@ vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
D1(vswp, "%s: enter", __func__);
- WRITE_ENTER(&vswp->mac_rwlock);
+ mutex_enter(&vswp->mac_lock);
if (vswp->mh == NULL) {
- RW_EXIT(&vswp->mac_rwlock);
+ mutex_exit(&vswp->mac_lock);
return (EINVAL);
}
/* return stats from underlying device */
*val = mac_stat_get(vswp->mh, stat);
- RW_EXIT(&vswp->mac_rwlock);
+ mutex_exit(&vswp->mac_lock);
return (0);
}
@@ -1107,14 +1091,8 @@ vsw_m_stop(void *arg)
vswp->if_state &= ~VSW_IF_UP;
RW_EXIT(&vswp->if_lockrw);
- mutex_enter(&vswp->hw_lock);
-
- (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
-
- if (vswp->recfg_reqd)
- vsw_reconfig_hw(vswp);
-
- mutex_exit(&vswp->hw_lock);
+ /* Cleanup and close the mac client */
+ vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV);
D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
}
@@ -1122,6 +1100,7 @@ vsw_m_stop(void *arg)
static int
vsw_m_start(void *arg)
{
+ int rv;
vsw_t *vswp = (vsw_t *)arg;
D1(vswp, "%s: enter", __func__);
@@ -1143,9 +1122,13 @@ vsw_m_start(void *arg)
/* if in layer2 mode, program unicast address. */
if (vswp->mh != NULL) {
- mutex_enter(&vswp->hw_lock);
- (void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
- mutex_exit(&vswp->hw_lock);
+ /* Init a mac client and program addresses */
+ rv = vsw_mac_client_init(vswp, NULL, VSW_LOCALDEV);
+ if (rv != 0) {
+ cmn_err(CE_NOTE,
+ "!vsw%d: failed to program interface "
+ "unicast address\n", vswp->instance);
+ }
}
RW_EXIT(&vswp->if_lockrw);
@@ -1211,29 +1194,21 @@ vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
* Call into the underlying driver to program the
* address into HW.
*/
- WRITE_ENTER(&vswp->mac_rwlock);
- if (vswp->mh != NULL) {
- ret = mac_multicst_add(vswp->mh, mca);
- if (ret != 0) {
- cmn_err(CE_NOTE, "!vsw%d: unable to "
- "add multicast address",
- vswp->instance);
- RW_EXIT(&vswp->mac_rwlock);
- (void) vsw_del_mcst(vswp,
- VSW_LOCALDEV, addr, NULL);
- kmem_free(mcst_p, sizeof (*mcst_p));
- return (ret);
- }
- mcst_p->mac_added = B_TRUE;
+ ret = vsw_mac_multicast_add(vswp, NULL, mcst_p,
+ VSW_LOCALDEV);
+ if (ret != 0) {
+ (void) vsw_del_mcst(vswp,
+ VSW_LOCALDEV, addr, NULL);
+ kmem_free(mcst_p, sizeof (*mcst_p));
+ return (ret);
}
- RW_EXIT(&vswp->mac_rwlock);
mutex_enter(&vswp->mca_lock);
mcst_p->nextp = vswp->mcap;
vswp->mcap = mcst_p;
mutex_exit(&vswp->mca_lock);
} else {
- cmn_err(CE_NOTE, "!vsw%d: unable to add multicast "
+ cmn_err(CE_WARN, "!vsw%d: unable to add multicast "
"address", vswp->instance);
}
return (ret);
@@ -1252,12 +1227,7 @@ vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
mcst_p = vsw_del_addr(VSW_LOCALDEV, vswp, addr);
ASSERT(mcst_p != NULL);
- WRITE_ENTER(&vswp->mac_rwlock);
- if (vswp->mh != NULL && mcst_p->mac_added) {
- (void) mac_multicst_remove(vswp->mh, mca);
- mcst_p->mac_added = B_FALSE;
- }
- RW_EXIT(&vswp->mac_rwlock);
+ vsw_mac_multicast_remove(vswp, NULL, mcst_p, VSW_LOCALDEV);
kmem_free(mcst_p, sizeof (*mcst_p));
}
@@ -1685,8 +1655,7 @@ vsw_readmd_exit:
static int
vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
{
- int i;
- uint64_t macaddr = 0;
+ uint64_t macaddr = 0;
D1(vswp, "%s: enter", __func__);
@@ -1703,17 +1672,12 @@ vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
vsw_save_lmacaddr(vswp, macaddr);
- if (vsw_get_md_smodes(vswp, mdp, node, vswp->smode, &vswp->smode_num)) {
+ if (vsw_get_md_smodes(vswp, mdp, node, &vswp->smode)) {
DWARN(vswp, "%s: Unable to read %s property from MD, "
"defaulting to 'switched' mode",
__func__, smode_propname);
- for (i = 0; i < NUM_SMODES; i++)
- vswp->smode[i] = VSW_LAYER2;
-
- vswp->smode_num = NUM_SMODES;
- } else {
- ASSERT(vswp->smode_num != 0);
+ vswp->smode = VSW_LAYER2;
}
/* read mtu */
@@ -1751,7 +1715,7 @@ vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
*/
static void
vsw_vlan_read_ids(void *arg, int type, md_t *mdp, mde_cookie_t node,
- uint16_t *pvidp, uint16_t **vidspp, uint16_t *nvidsp,
+ uint16_t *pvidp, vsw_vlanid_t **vidspp, uint16_t *nvidsp,
uint16_t *default_idp)
{
vsw_t *vswp;
@@ -1823,11 +1787,12 @@ vsw_vlan_read_ids(void *arg, int type, md_t *mdp, mde_cookie_t node,
if (nvids != 0) {
D2(vswp, "%s: %s(%d): ", __func__, vid_propname, inst);
- vids_size = sizeof (uint16_t) * nvids;
+ vids_size = sizeof (vsw_vlanid_t) * nvids;
*vidspp = kmem_zalloc(vids_size, KM_SLEEP);
for (i = 0; i < nvids; i++) {
- (*vidspp)[i] = data[i] & 0xFFFF;
- D2(vswp, " %d ", (*vidspp)[i]);
+ (*vidspp)[i].vl_vid = data[i] & 0xFFFF;
+ (*vidspp)[i].vl_set = B_FALSE;
+ D2(vswp, " %d ", (*vidspp)[i].vl_vid);
}
D2(vswp, "\n");
}
@@ -1959,35 +1924,6 @@ vsw_mtu_update(vsw_t *vswp, uint32_t mtu)
RW_EXIT(&vswp->if_lockrw);
- WRITE_ENTER(&vswp->mac_rwlock);
-
- if (vswp->mh == 0) {
- /*
- * Physical device is not available yet; mtu will be
- * updated after we open it successfully, as we have
- * saved the new mtu.
- */
- D2(vswp, "%s: Physical device:%s is not "
- "available yet; can't update its mtu\n",
- __func__, vswp->physname);
-
- } else {
-
- /*
- * Stop and restart to enable the
- * new mtu in the physical device.
- */
- vsw_mac_detach(vswp);
- rv = vsw_mac_attach(vswp);
- if (rv != 0) {
- RW_EXIT(&vswp->mac_rwlock);
- return (EIO);
- }
-
- }
-
- RW_EXIT(&vswp->mac_rwlock);
-
/* Reset ports to renegotiate with the new mtu */
vsw_reset_ports(vswp);
@@ -2014,8 +1950,8 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
char physname[LIFNAMSIZ];
char drv[LIFNAMSIZ];
uint_t ddi_instance;
- uint8_t new_smode[NUM_SMODES];
- int i, smode_num = 0;
+ uint8_t new_smode;
+ int i;
uint64_t macaddr = 0;
enum {MD_init = 0x1,
MD_physname = 0x2,
@@ -2025,7 +1961,7 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
MD_mtu = 0x20} updated;
int rv;
uint16_t pvid;
- uint16_t *vids;
+ vsw_vlanid_t *vids;
uint16_t nvids;
uint32_t mtu;
@@ -2099,25 +2035,16 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
/*
* Check if switching modes have changed.
*/
- if (vsw_get_md_smodes(vswp, mdp, node,
- new_smode, &smode_num)) {
+ if (vsw_get_md_smodes(vswp, mdp, node, &new_smode)) {
cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD",
vswp->instance, smode_propname);
goto fail_reconf;
} else {
- ASSERT(smode_num != 0);
- if (smode_num != vswp->smode_num) {
- D2(vswp, "%s: number of modes changed from %d to %d",
- __func__, vswp->smode_num, smode_num);
- }
+ if (new_smode != vswp->smode) {
+ D2(vswp, "%s: switching mode changed from %d to %d",
+ __func__, vswp->smode, new_smode);
- for (i = 0; i < smode_num; i++) {
- if (new_smode[i] != vswp->smode[i]) {
- D2(vswp, "%s: mode changed from %d to %d",
- __func__, vswp->smode[i], new_smode[i]);
- updated |= MD_smode;
- break;
- }
+ updated |= MD_smode;
}
}
@@ -2129,7 +2056,7 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
if ((pvid != vswp->pvid) || /* pvid changed? */
(nvids != vswp->nvids) || /* # of vids changed? */
((nvids != 0) && (vswp->nvids != 0) && /* vids changed? */
- bcmp(vids, vswp->vids, sizeof (uint16_t) * nvids))) {
+ !vsw_cmp_vids(vids, vswp->vids, nvids))) {
updated |= MD_vlans;
}
@@ -2149,7 +2076,7 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
* Now make any changes which are needed...
*/
- if (updated & (MD_physname | MD_smode)) {
+ if (updated & (MD_physname | MD_smode | MD_mtu)) {
/*
* Stop any pending timeout to setup switching mode.
@@ -2161,19 +2088,17 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
/*
* Remove unicst, mcst addrs of vsw interface
- * and ports from the physdev.
+ * and ports from the physdev. This also closes
+ * the corresponding mac clients.
*/
vsw_unset_addrs(vswp);
/*
* Stop, detach and close the old device..
*/
- WRITE_ENTER(&vswp->mac_rwlock);
-
- vsw_mac_detach(vswp);
+ mutex_enter(&vswp->mac_lock);
vsw_mac_close(vswp);
-
- RW_EXIT(&vswp->mac_rwlock);
+ mutex_exit(&vswp->mac_lock);
/*
* Update phys name.
@@ -2189,11 +2114,15 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
* Update array with the new switch mode values.
*/
if (updated & MD_smode) {
- for (i = 0; i < smode_num; i++)
- vswp->smode[i] = new_smode[i];
+ vswp->smode = new_smode;
+ }
- vswp->smode_num = smode_num;
- vswp->smode_idx = 0;
+ /* Update mtu */
+ if (updated & MD_mtu) {
+ rv = vsw_mtu_update(vswp, mtu);
+ if (rv != 0) {
+ goto fail_update;
+ }
}
/*
@@ -2237,24 +2166,9 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
READ_ENTER(&vswp->if_lockrw);
if (vswp->if_state & VSW_IF_UP) {
+ /* reconfigure with new address */
+ vsw_if_mac_reconfig(vswp, B_FALSE, 0, NULL, 0);
- mutex_enter(&vswp->hw_lock);
- /*
- * Remove old mac address of vsw interface
- * from the physdev
- */
- (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
- /*
- * Program new mac address of vsw interface
- * in the physdev
- */
- rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
- mutex_exit(&vswp->hw_lock);
- if (rv != 0) {
- cmn_err(CE_NOTE,
- "!vsw%d: failed to program interface "
- "unicast address\n", vswp->instance);
- }
/*
* Notify the MAC layer of the changed address.
*/
@@ -2270,32 +2184,24 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
/* Remove existing vlan ids from the hash table. */
vsw_vlan_remove_ids(vswp, VSW_LOCALDEV);
- /* save the new vlan ids */
- vswp->pvid = pvid;
- if (vswp->nvids != 0) {
- kmem_free(vswp->vids, sizeof (uint16_t) * vswp->nvids);
- vswp->nvids = 0;
- }
- if (nvids != 0) {
- vswp->nvids = nvids;
+ if (vswp->if_state & VSW_IF_UP) {
+ vsw_if_mac_reconfig(vswp, B_TRUE, pvid, vids, nvids);
+ } else {
+ if (vswp->nvids != 0) {
+ kmem_free(vswp->vids,
+ sizeof (vsw_vlanid_t) * vswp->nvids);
+ }
vswp->vids = vids;
+ vswp->nvids = nvids;
+ vswp->pvid = pvid;
}
/* add these new vlan ids into hash table */
vsw_vlan_add_ids(vswp, VSW_LOCALDEV);
} else {
if (nvids != 0) {
- kmem_free(vids, sizeof (uint16_t) * nvids);
- }
- }
-
- if (updated & MD_mtu) {
-
- rv = vsw_mtu_update(vswp, mtu);
- if (rv != 0) {
- goto fail_update;
+ kmem_free(vids, sizeof (vsw_vlanid_t) * nvids);
}
-
}
return;
@@ -2397,7 +2303,7 @@ vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp,
/* now update all properties into the port */
portp->p_vswp = vswp;
portp->p_instance = inst;
- portp->addr_set = VSW_ADDR_UNSET;
+ portp->addr_set = B_FALSE;
ether_copy(&ea, &portp->p_macaddr);
if (nchan > VSW_PORT_MAX_LDCS) {
D2(vswp, "%s: using first of %d ldc ids",
@@ -2466,7 +2372,7 @@ vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex,
vsw_port_t *portp;
boolean_t updated_vlans = B_FALSE;
uint16_t pvid;
- uint16_t *vids;
+ vsw_vlanid_t *vids;
uint16_t nvids;
uint64_t val;
boolean_t hio_enabled = B_FALSE;
@@ -2503,7 +2409,7 @@ vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex,
if ((pvid != portp->pvid) || /* pvid changed? */
(nvids != portp->nvids) || /* # of vids changed? */
((nvids != 0) && (portp->nvids != 0) && /* vids changed? */
- bcmp(vids, portp->vids, sizeof (uint16_t) * nvids))) {
+ !vsw_cmp_vids(vids, portp->vids, nvids))) {
updated_vlans = B_TRUE;
}
@@ -2512,20 +2418,8 @@ vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex,
/* Remove existing vlan ids from the hash table. */
vsw_vlan_remove_ids(portp, VSW_VNETPORT);
- /* save the new vlan ids */
- portp->pvid = pvid;
- if (portp->nvids != 0) {
- kmem_free(portp->vids,
- sizeof (uint16_t) * portp->nvids);
- portp->nvids = 0;
- }
- if (nvids != 0) {
- portp->vids = kmem_zalloc(sizeof (uint16_t) *
- nvids, KM_SLEEP);
- bcopy(vids, portp->vids, sizeof (uint16_t) * nvids);
- portp->nvids = nvids;
- kmem_free(vids, sizeof (uint16_t) * nvids);
- }
+ /* Reconfigure vlans with network device */
+ vsw_mac_port_reconfig_vlans(portp, pvid, vids, nvids);
/* add these new vlan ids into hash table */
vsw_vlan_add_ids(portp, VSW_VNETPORT);
@@ -2628,3 +2522,23 @@ vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr)
}
RW_EXIT(&vswp->if_lockrw);
}
+
+/* Compare two VLAN ID arrays as sets; both are expected to be the same size. */
+static boolean_t
+vsw_cmp_vids(vsw_vlanid_t *vids1, vsw_vlanid_t *vids2, int nvids)
+{
+ int i, j;
+ uint16_t vid;
+
+ for (i = 0; i < nvids; i++) {
+ vid = vids1[i].vl_vid;
+ for (j = 0; j < nvids; j++) {
+ if (vid == vids2[j].vl_vid)
+ break;
+ }
+ if (j == nvids) {
+ return (B_FALSE);
+ }
+ }
+ return (B_TRUE);
+}
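Because the two VLAN ID arrays are compared as unordered sets, a small user-space sketch of the fixed helper's semantics may be useful. cmp_vids_example() and vlanid_example_t are hypothetical stand-ins for vsw_cmp_vids() and vsw_vlanid_t, reduced to the one field the comparison uses.

	#include <stdio.h>

	typedef struct { unsigned short vl_vid; } vlanid_example_t;

	/* Returns 1 if every id in a[] appears somewhere in b[], else 0. */
	static int
	cmp_vids_example(vlanid_example_t *a, vlanid_example_t *b, int n)
	{
		int i, j;

		for (i = 0; i < n; i++) {
			for (j = 0; j < n; j++) {
				if (a[i].vl_vid == b[j].vl_vid)
					break;
			}
			if (j == n)
				return (0);	/* a[i] missing from b[] */
		}
		return (1);
	}

	int
	main(void)
	{
		vlanid_example_t a[] = { {10}, {20}, {30} };
		vlanid_example_t b[] = { {30}, {10}, {20} };

		/* Prints 1: the same set of VLAN ids, in a different order. */
		(void) printf("%d\n", cmp_vids_example(a, b, 3));
		return (0);
	}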
diff --git a/usr/src/uts/sun4v/io/vsw_hio.c b/usr/src/uts/sun4v/io/vsw_hio.c
index 278896d977..084c338548 100644
--- a/usr/src/uts/sun4v/io/vsw_hio.c
+++ b/usr/src/uts/sun4v/io/vsw_hio.c
@@ -53,7 +53,7 @@
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
-#include <sys/mac.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
@@ -80,9 +80,9 @@ extern int vsw_hio_cleanup_delay;
/* Functions imported from other files */
extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
-extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
-extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
extern void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
+extern void vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans,
+ uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids);
/* Functions exported to other files */
void vsw_hio_init(vsw_t *vswp);
@@ -104,11 +104,24 @@ static int vsw_send_dds_msg(vsw_ldc_t *ldcp, uint8_t dds_subclass,
uint64_t cookie, uint64_t macaddr, uint32_t req_id);
static int vsw_send_dds_resp_msg(vsw_ldc_t *ldcp, vio_dds_msg_t *dmsg, int ack);
static int vsw_hio_send_delshare_msg(vsw_share_t *vsharep);
-static int vsw_hio_bind_macaddr(vsw_share_t *vsharep);
-static void vsw_hio_unbind_macaddr(vsw_share_t *vsharep);
static boolean_t vsw_hio_reboot_callb(void *arg, int code);
static boolean_t vsw_hio_panic_callb(void *arg, int code);
+/*
+ * The locking strategy for HybridIO is as follows:
+ *
+ * - As the Shares are associated with a network device, the
+ *   global lock ('vswp->mac_lock') is used for all Share-related
+ *   operations.
+ * - The 'port->maccl_rwlock' is used to synchronize only the
+ *   operations that operate on that port's mac client, that is,
+ *   the share_bind and unbind operations only.
+ *
+ * - The locking hierarchy is that the global mac_lock is acquired
+ *   first and then the port's mac client lock (maccl_rwlock).
+ */
+
+
static kstat_t *vsw_hio_setup_kstats(char *ks_mod, char *ks_name, vsw_t *vswp);
static void vsw_hio_destroy_kstats(vsw_t *vswp);
static int vsw_hio_kstats_update(kstat_t *ksp, int rw);
@@ -122,32 +135,23 @@ void
vsw_hio_init(vsw_t *vswp)
{
vsw_hio_t *hiop = &vswp->vhio;
+ int num_shares;
int i;
- int rv;
+ ASSERT(MUTEX_HELD(&vswp->mac_lock));
D1(vswp, "%s:enter\n", __func__);
- mutex_enter(&vswp->hw_lock);
if (vsw_hio_enabled == B_FALSE) {
- mutex_exit(&vswp->hw_lock);
return;
}
vswp->hio_capable = B_FALSE;
- rv = mac_capab_get(vswp->mh, MAC_CAPAB_SHARES, &hiop->vh_scapab);
- if (rv == B_FALSE) {
+ num_shares = mac_share_capable(vswp->mh);
+ if (num_shares == 0) {
D2(vswp, "%s: %s is not HybridIO capable\n", __func__,
vswp->physname);
- mutex_exit(&vswp->hw_lock);
return;
}
- rv = mac_capab_get(vswp->mh, MAC_CAPAB_RINGS, &hiop->vh_rcapab);
- if (rv == B_FALSE) {
- DWARN(vswp, "%s: %s has no RINGS capability\n", __func__,
- vswp->physname);
- mutex_exit(&vswp->hw_lock);
- return;
- }
- hiop->vh_num_shares = hiop->vh_scapab.ms_snum;
+ hiop->vh_num_shares = num_shares;
hiop->vh_shares = kmem_zalloc((sizeof (vsw_share_t) *
hiop->vh_num_shares), KM_SLEEP);
for (i = 0; i < hiop->vh_num_shares; i++) {
@@ -176,7 +180,6 @@ vsw_hio_init(vsw_t *vswp)
D2(vswp, "%s: %s is HybridIO capable num_shares=%d\n", __func__,
vswp->physname, hiop->vh_num_shares);
D1(vswp, "%s:exit\n", __func__);
- mutex_exit(&vswp->hw_lock);
}
/*
@@ -187,13 +190,9 @@ vsw_hio_init(vsw_t *vswp)
static vsw_share_t *
vsw_hio_alloc_share(vsw_t *vswp, vsw_ldc_t *ldcp)
{
- vsw_hio_t *hiop = &vswp->vhio;
- mac_capab_share_t *hcapab = &hiop->vh_scapab;
vsw_share_t *vsharep;
vsw_port_t *portp = ldcp->ldc_port;
uint64_t ldc_id = ldcp->ldc_id;
- uint32_t rmin, rmax;
- uint64_t rmap;
int rv;
D1(vswp, "%s:enter\n", __func__);
@@ -202,39 +201,19 @@ vsw_hio_alloc_share(vsw_t *vswp, vsw_ldc_t *ldcp)
/* No free shares available */
return (NULL);
}
- /*
- * Allocate a Share - it will come with rings/groups
- * already assigned to it.
- */
- rv = hcapab->ms_salloc(hcapab->ms_handle, ldc_id,
- &vsharep->vs_cookie, &vsharep->vs_shdl);
+
+ WRITE_ENTER(&portp->maccl_rwlock);
+ rv = mac_share_bind(portp->p_mch, ldc_id, &vsharep->vs_cookie);
+ RW_EXIT(&portp->maccl_rwlock);
if (rv != 0) {
- D2(vswp, "Alloc a share failed for ldc=0x%lx rv=%d",
- ldc_id, rv);
return (NULL);
}
- /*
- * Query the RX group number to bind the port's
- * MAC address to it.
- */
- hcapab->ms_squery(vsharep->vs_shdl, MAC_RING_TYPE_RX,
- &rmin, &rmax, &rmap, &vsharep->vs_gnum);
-
/* Cache some useful info */
vsharep->vs_ldcid = ldcp->ldc_id;
vsharep->vs_macaddr = vnet_macaddr_strtoul(
portp->p_macaddr.ether_addr_octet);
vsharep->vs_portp = ldcp->ldc_port;
-
- /* Bind the Guest's MAC address */
- rv = vsw_hio_bind_macaddr(vsharep);
- if (rv != 0) {
- /* something went wrong, cleanup */
- hcapab->ms_sfree(vsharep->vs_shdl);
- return (NULL);
- }
-
vsharep->vs_state |= VSW_SHARE_ASSIGNED;
D1(vswp, "%s:exit\n", __func__);
@@ -242,61 +221,6 @@ vsw_hio_alloc_share(vsw_t *vswp, vsw_ldc_t *ldcp)
}
/*
- * vsw_hio_bind_macaddr -- Remove the port's MAC address from the
- * physdev and bind it to the Share's RX group.
- */
-static int
-vsw_hio_bind_macaddr(vsw_share_t *vsharep)
-{
- vsw_t *vswp = vsharep->vs_vswp;
- vsw_port_t *portp = vsharep->vs_portp;
- mac_capab_rings_t *rcapab = &vswp->vhio.vh_rcapab;
- mac_group_info_t *ginfop = &vsharep->vs_rxginfo;
- int rv;
-
- /* Get the RX groupinfo */
- rcapab->mr_gget(rcapab->mr_handle, MAC_RING_TYPE_RX,
- vsharep->vs_gnum, &vsharep->vs_rxginfo, NULL);
-
- /* Unset the MAC address first */
- if (portp->addr_set != VSW_ADDR_UNSET) {
- (void) vsw_unset_hw(vswp, portp, VSW_VNETPORT);
- }
-
- /* Bind the MAC address to the RX group */
- rv = ginfop->mrg_addmac(ginfop->mrg_driver,
- (uint8_t *)&portp->p_macaddr.ether_addr_octet);
- if (rv != 0) {
- /* Restore the address back as it was */
- (void) vsw_set_hw(vswp, portp, VSW_VNETPORT);
- return (rv);
- }
- return (0);
-}
-
-/*
- * vsw_hio_unbind_macaddr -- Unbind the port's MAC address and restore
- * it back as it was before.
- */
-static void
-vsw_hio_unbind_macaddr(vsw_share_t *vsharep)
-{
- vsw_t *vswp = vsharep->vs_vswp;
- vsw_port_t *portp = vsharep->vs_portp;
- mac_group_info_t *ginfop = &vsharep->vs_rxginfo;
-
- if (portp == NULL) {
- return;
- }
- /* Unbind the MAC address from the RX group */
- (void) ginfop->mrg_remmac(ginfop->mrg_driver,
- (uint8_t *)&portp->p_macaddr.ether_addr_octet);
-
- /* Program the MAC address back */
- (void) vsw_set_hw(vswp, portp, VSW_VNETPORT);
-}
-
-/*
* vsw_hio_find_free_share -- Find a free Share.
*/
static vsw_share_t *
@@ -380,16 +304,13 @@ static void
vsw_hio_free_share(vsw_share_t *vsharep)
{
vsw_t *vswp = vsharep->vs_vswp;
- vsw_hio_t *hiop = &vswp->vhio;
- mac_capab_share_t *hcapab = &hiop->vh_scapab;
+ vsw_port_t *portp = vsharep->vs_portp;
D1(vswp, "%s:enter\n", __func__);
- /* First unbind the MAC address and restore it back */
- vsw_hio_unbind_macaddr(vsharep);
-
- /* free share */
- hcapab->ms_sfree(vsharep->vs_shdl);
+ WRITE_ENTER(&portp->maccl_rwlock);
+ mac_share_unbind(portp->p_mch);
+ RW_EXIT(&portp->maccl_rwlock);
vsharep->vs_state = VSW_SHARE_FREE;
vsharep->vs_macaddr = 0;
@@ -455,7 +376,7 @@ vsw_hio_free_all_shares(vsw_t *vswp, boolean_t reboot)
* HybridIO.
*/
READ_ENTER(&plist->lockrw);
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
/*
* first clear the hio_capable flag so that no more
* HybridIO operations are initiated.
@@ -515,9 +436,9 @@ vsw_hio_free_all_shares(vsw_t *vswp, boolean_t reboot)
* This delay is also needed for the port reset to
* release the Hybrid resource.
*/
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
drv_usecwait(vsw_hio_cleanup_delay);
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
max_retries--;
} while ((free_shares < hiop->vh_num_shares) && (max_retries > 0));
@@ -532,7 +453,7 @@ vsw_hio_free_all_shares(vsw_t *vswp, boolean_t reboot)
kmem_free(hiop->vh_shares, sizeof (vsw_share_t) * hiop->vh_num_shares);
hiop->vh_shares = NULL;
hiop->vh_num_shares = 0;
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
RW_EXIT(&plist->lockrw);
D1(vswp, "%s:exit\n", __func__);
}
@@ -560,12 +481,12 @@ vsw_hio_start_ports(vsw_t *vswp)
}
reset = B_FALSE;
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
vsharep = vsw_hio_find_vshare_port(vswp, portp);
if (vsharep == NULL) {
reset = B_TRUE;
}
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
if (reset == B_TRUE) {
/* Cause a reset to trigger HybridIO setup */
@@ -586,9 +507,9 @@ vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp)
int rv;
D1(vswp, "%s:enter ldc=0x%lx", __func__, ldcp->ldc_id);
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
if (vswp->hio_capable == B_FALSE) {
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
D2(vswp, "%s:not HIO capable", __func__);
return;
}
@@ -596,14 +517,14 @@ vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp)
/* Verify if a share was already allocated */
vsharep = vsw_hio_find_vshare_ldcid(vswp, ldcp->ldc_id);
if (vsharep != NULL) {
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
D2(vswp, "%s:Share already allocated to ldc=0x%lx",
__func__, ldcp->ldc_id);
return;
}
vsharep = vsw_hio_alloc_share(vswp, ldcp);
if (vsharep == NULL) {
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
D2(vswp, "%s: no Share available for ldc=0x%lx",
__func__, ldcp->ldc_id);
return;
@@ -616,12 +537,12 @@ vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp)
* Failed to send a DDS message, so cleanup now.
*/
vsw_hio_free_share(vsharep);
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
return;
}
vsharep->vs_state &= ~VSW_SHARE_DDS_ACKD;
vsharep->vs_state |= VSW_SHARE_DDS_SENT;
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
/* DERR only to print by default */
DERR(vswp, "Share allocated for ldc_id=0x%lx Cookie=0x%lX",
@@ -640,16 +561,16 @@ vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp)
D1(vswp, "%s:enter ldc=0x%lx", __func__, ldcp->ldc_id);
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
vsharep = vsw_hio_find_vshare_ldcid(vswp, ldcp->ldc_id);
if (vsharep == NULL) {
D1(vswp, "%s:no share found for ldc=0x%lx",
__func__, ldcp->ldc_id);
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
return;
}
vsw_hio_free_share(vsharep);
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
D1(vswp, "%s:exit ldc=0x%lx", __func__, ldcp->ldc_id);
}
@@ -669,12 +590,12 @@ vsw_hio_send_delshare_msg(vsw_share_t *vsharep)
uint64_t macaddr = vsharep->vs_macaddr;
int rv;
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
- mutex_exit(&vswp->hw_lock);
+ ASSERT(MUTEX_HELD(&vswp->mac_lock));
+ mutex_exit(&vswp->mac_lock);
portp = vsharep->vs_portp;
if (portp == NULL) {
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
return (0);
}
@@ -683,7 +604,7 @@ vsw_hio_send_delshare_msg(vsw_share_t *vsharep)
ldcp = ldcl->head;
if ((ldcp == NULL) || (ldcp->ldc_id != vsharep->vs_ldcid)) {
RW_EXIT(&ldcl->lockrw);
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
return (0);
}
req_id = VSW_DDS_NEXT_REQID(vsharep);
@@ -691,7 +612,7 @@ vsw_hio_send_delshare_msg(vsw_share_t *vsharep)
cookie, macaddr, req_id);
RW_EXIT(&ldcl->lockrw);
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
if (rv == 0) {
vsharep->vs_state &= ~VSW_SHARE_DDS_ACKD;
vsharep->vs_state |= VSW_SHARE_DDS_SENT;
@@ -740,14 +661,14 @@ vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg)
/* discard */
return;
}
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
/*
* We expect to receive DDS messages only from guests that
* have HybridIO started.
*/
vsharep = vsw_hio_find_vshare_ldcid(vswp, ldcp->ldc_id);
if (vsharep == NULL) {
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
return;
}
@@ -816,7 +737,7 @@ vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg)
__func__, dmsg->dds_subclass);
break;
}
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
D1(vswp, "%s:exit ldc=0x%lx\n", __func__, ldcp->ldc_id);
}
@@ -857,8 +778,12 @@ vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled)
/* Hybrid Mode is disabled, so stop HybridIO */
vsw_hio_stop_port(portp);
portp->p_hio_enabled = B_FALSE;
+
+ vsw_port_mac_reconfig(portp, B_FALSE, 0, NULL, 0);
} else {
portp->p_hio_enabled = B_TRUE;
+ vsw_port_mac_reconfig(portp, B_FALSE, 0, NULL, 0);
+
/* reset the port to initiate HybridIO setup */
vsw_hio_port_reset(portp, B_FALSE);
}
@@ -877,16 +802,16 @@ vsw_hio_stop_port(vsw_port_t *portp)
int max_retries = vsw_hio_max_cleanup_retries;
D1(vswp, "%s:enter\n", __func__);
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
if (vswp->hio_capable == B_FALSE) {
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
return;
}
vsharep = vsw_hio_find_vshare_port(vswp, portp);
if (vsharep == NULL) {
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
return;
}
@@ -925,9 +850,9 @@ vsw_hio_stop_port(vsw_port_t *portp)
* messages come and get processed, that is, shares
* get freed.
*/
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
drv_usecwait(vsw_hio_cleanup_delay);
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
/* Check if the share still assigned to this port */
if ((vsharep->vs_portp != portp) ||
@@ -937,7 +862,7 @@ vsw_hio_stop_port(vsw_port_t *portp)
max_retries--;
} while ((vsharep->vs_state != VSW_SHARE_FREE) && (max_retries > 0));
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
D1(vswp, "%s:exit\n", __func__);
}
@@ -1111,7 +1036,7 @@ vsw_hio_kstats_update(kstat_t *ksp, int rw)
return (0);
}
- mutex_enter(&vswp->hw_lock);
+ mutex_enter(&vswp->mac_lock);
hiokp->hio_num_shares.value.ul = (uint32_t)hiop->vh_num_shares;
for (i = 0; i < hiop->vh_num_shares; i++) {
hiokp->share[i].assigned.value.ul =
@@ -1119,7 +1044,7 @@ vsw_hio_kstats_update(kstat_t *ksp, int rw)
hiokp->share[i].state.value.ul =
hiop->vh_shares[i].vs_state;
}
- mutex_exit(&vswp->hw_lock);
+ mutex_exit(&vswp->mac_lock);
} else {
return (EACCES);
}
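A minimal sketch of the lock ordering the HybridIO locking-strategy comment above prescribes, following what vsw_hio_alloc_share() does when binding a share (there, mac_lock is already held by the caller). hio_bind_share_example() is a hypothetical illustration, not a function in the patch; mac_share_bind() is used with the same arguments the patch uses.

	/* Global vswp->mac_lock first, then the port's mac client lock. */
	static int
	hio_bind_share_example(vsw_t *vswp, vsw_port_t *portp, uint64_t ldc_id,
	    uint64_t *cookiep)
	{
		int rv;

		mutex_enter(&vswp->mac_lock);		/* all Share-related state */
		WRITE_ENTER(&portp->maccl_rwlock);	/* this port's mac client */
		rv = mac_share_bind(portp->p_mch, ldc_id, cookiep);
		RW_EXIT(&portp->maccl_rwlock);
		mutex_exit(&vswp->mac_lock);
		return (rv);
	}

Taking the locks in this fixed order everywhere (never maccl_rwlock before mac_lock) is what makes the per-port lock safe to hold across the share bind/unbind calls without risking an ABBA deadlock.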
diff --git a/usr/src/uts/sun4v/io/vsw_ldc.c b/usr/src/uts/sun4v/io/vsw_ldc.c
index e2273596a1..bfd6dde2fb 100644
--- a/usr/src/uts/sun4v/io/vsw_ldc.c
+++ b/usr/src/uts/sun4v/io/vsw_ldc.c
@@ -58,7 +58,6 @@
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
-#include <sys/mac.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
@@ -88,7 +87,7 @@ int vsw_detach_ports(vsw_t *vswp);
int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
int vsw_port_detach(vsw_t *vswp, int p_instance);
-int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count);
+int vsw_portsend(vsw_port_t *port, mblk_t *mp);
int vsw_port_attach(vsw_port_t *portp);
vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
@@ -165,7 +164,6 @@ static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
static void vsw_ldc_rx_worker(void *arg);
/* Misc support routines */
-static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static void vsw_free_ring(dring_info_t *);
static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
@@ -183,8 +181,7 @@ static void display_ring(dring_info_t *);
* Functions imported from other files.
*/
extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
-extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
-extern void vsw_reconfig_hw(vsw_t *);
+extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
extern void vsw_del_mcst_port(vsw_port_t *port);
extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
@@ -205,7 +202,10 @@ extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
extern void vsw_hio_stop_port(vsw_port_t *portp);
-extern void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr);
+extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
+extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
+extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
+
#define VSW_NUM_VMPOOLS 3 /* number of vio mblk pools */
@@ -309,6 +309,7 @@ vsw_port_attach(vsw_port_t *port)
int i;
int nids = port->num_ldcs;
uint64_t *ldcids;
+ int rv;
D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
@@ -328,6 +329,7 @@ vsw_port_attach(vsw_port_t *port)
mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
+ rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
@@ -339,29 +341,20 @@ vsw_port_attach(vsw_port_t *port)
D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
DERR(vswp, "%s: ldc_attach failed", __func__);
-
- rw_destroy(&port->p_ldclist.lockrw);
-
- cv_destroy(&port->state_cv);
- mutex_destroy(&port->state_lock);
-
- mutex_destroy(&port->tx_lock);
- mutex_destroy(&port->mca_lock);
- kmem_free(port, sizeof (vsw_port_t));
- return (1);
+ goto exit_error;
}
}
if (vswp->switching_setup_done == B_TRUE) {
/*
- * If the underlying physical device has been setup,
- * program the mac address of this port in it.
- * Otherwise, port macaddr will be set after the physical
- * device is successfully setup by the timeout handler.
+ * If the underlying network device has been set up,
+ * then open a mac client and program the mac address
+ * for this port.
*/
- mutex_enter(&vswp->hw_lock);
- (void) vsw_set_hw(vswp, port, VSW_VNETPORT);
- mutex_exit(&vswp->hw_lock);
+ rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
+ if (rv != 0) {
+ goto exit_error;
+ }
}
/* create the fdb entry for this port/mac address */
@@ -386,11 +379,23 @@ vsw_port_attach(vsw_port_t *port)
/* announce macaddr of vnet to the physical switch */
if (vsw_publish_macaddr_count != 0) { /* enabled */
- vsw_publish_macaddr(vswp, (uint8_t *)&(port->p_macaddr));
+ vsw_publish_macaddr(vswp, port);
}
D1(vswp, "%s: exit", __func__);
return (0);
+
+exit_error:
+ rw_destroy(&port->p_ldclist.lockrw);
+
+ cv_destroy(&port->state_cv);
+ mutex_destroy(&port->state_lock);
+
+ rw_destroy(&port->maccl_rwlock);
+ mutex_destroy(&port->tx_lock);
+ mutex_destroy(&port->mca_lock);
+ kmem_free(port, sizeof (vsw_port_t));
+ return (1);
}
/*
@@ -427,6 +432,9 @@ vsw_port_detach(vsw_t *vswp, int p_instance)
*/
RW_EXIT(&plist->lockrw);
+ /* Cleanup and close the mac client */
+ vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
+
/* Remove the fdb entry for this port/mac address */
vsw_fdbe_del(vswp, &(port->p_macaddr));
vsw_destroy_vlans(port, VSW_VNETPORT);
@@ -434,23 +442,6 @@ vsw_port_detach(vsw_t *vswp, int p_instance)
/* Remove any multicast addresses.. */
vsw_del_mcst_port(port);
- /* Remove address if was programmed into HW. */
- mutex_enter(&vswp->hw_lock);
-
- /*
- * Port's address may not have been set in hardware. This could
- * happen if the underlying physical device is not yet available and
- * vsw_setup_switching_timeout() may be in progress.
- * We remove its addr from hardware only if it has been set before.
- */
- if (port->addr_set != VSW_ADDR_UNSET)
- (void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
-
- if (vswp->recfg_reqd)
- vsw_reconfig_hw(vswp);
-
- mutex_exit(&vswp->hw_lock);
-
if (vsw_port_delete(port)) {
return (1);
}
@@ -482,10 +473,8 @@ vsw_detach_ports(vsw_t *vswp)
return (1);
}
- /* Remove address if was programmed into HW. */
- mutex_enter(&vswp->hw_lock);
- (void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
- mutex_exit(&vswp->hw_lock);
+ /* Cleanup and close the mac client */
+ vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
/* Remove the fdb entry for this port/mac address */
vsw_fdbe_del(vswp, &(port->p_macaddr));
@@ -560,6 +549,7 @@ vsw_port_delete(vsw_port_t *port)
rw_destroy(&port->p_ldclist.lockrw);
+ rw_destroy(&port->maccl_rwlock);
mutex_destroy(&port->mca_lock);
mutex_destroy(&port->tx_lock);
@@ -570,6 +560,11 @@ vsw_port_delete(vsw_port_t *port)
kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
port->num_ldcs = 0;
}
+
+ if (port->nvids != 0) {
+ kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
+ }
+
kmem_free(port, sizeof (vsw_port_t));
D1(vswp, "%s: exit", __func__);
@@ -4205,12 +4200,13 @@ vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
/* transmit the packet over the given port */
int
-vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
+vsw_portsend(vsw_port_t *port, mblk_t *mp)
{
vsw_ldc_list_t *ldcl = &port->p_ldclist;
vsw_ldc_t *ldcp;
+ mblk_t *mpt;
+ int count;
int status = 0;
- uint32_t n;
READ_ENTER(&ldcl->lockrw);
/*
@@ -4224,18 +4220,13 @@ vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
return (1);
}
- n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
+ count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
- count -= n;
- if (count == 0) {
- goto vsw_portsend_exit;
+ if (count != 0) {
+ status = ldcp->tx(ldcp, mp, mpt, count);
}
- status = ldcp->tx(ldcp, mp, mpt, count);
-
-vsw_portsend_exit:
RW_EXIT(&ldcl->lockrw);
-
return (status);
}
@@ -5735,14 +5726,6 @@ vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
}
-static caddr_t
-vsw_print_ethaddr(uint8_t *a, char *ebuf)
-{
- (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
- a[0], a[1], a[2], a[3], a[4], a[5]);
- return (ebuf);
-}
-
/*
* Reset and free all the resources associated with
* the channel.
diff --git a/usr/src/uts/sun4v/io/vsw_phys.c b/usr/src/uts/sun4v/io/vsw_phys.c
index 962ccc1cb9..127e1635c1 100644
--- a/usr/src/uts/sun4v/io/vsw_phys.c
+++ b/usr/src/uts/sun4v/io/vsw_phys.c
@@ -55,7 +55,8 @@
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
-#include <sys/mac.h>
+#include <sys/mac_client.h>
+#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
@@ -63,134 +64,133 @@
#include <sys/mac.h>
#include <sys/mdeg.h>
#include <sys/vsw.h>
+#include <sys/vlan.h>
/* MAC Ring table functions. */
-static void vsw_mac_ring_tbl_init(vsw_t *vswp);
-static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
-static void vsw_queue_worker(vsw_mac_ring_t *rrp);
-static void vsw_queue_stop(vsw_queue_t *vqp);
-static vsw_queue_t *vsw_queue_create();
-static void vsw_queue_destroy(vsw_queue_t *vqp);
-static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
-static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
+static void vsw_port_rx_cb(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+static void vsw_if_rx_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t);
/* MAC layer routines */
-static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
- mac_resource_t *mrp);
-static int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
-static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
-static int vsw_unset_hw_addr(vsw_t *, int);
-static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
-static int vsw_prog_if(vsw_t *);
+static int vsw_set_port_hw_addr(vsw_port_t *port);
+static int vsw_set_if_hw_addr(vsw_t *vswp);
+static void vsw_unset_hw_addr(vsw_t *, vsw_port_t *, int);
+static int vsw_maccl_open(vsw_t *vswp, vsw_port_t *port, int type);
+static void vsw_maccl_close(vsw_t *vswp, vsw_port_t *port, int type);
+static void vsw_mac_multicast_add_all(vsw_t *vswp, vsw_port_t *portp, int type);
+static void vsw_mac_multicast_remove_all(vsw_t *vswp,
+ vsw_port_t *portp, int type);
+static void vsw_mac_add_vlans(vsw_t *vswp, mac_client_handle_t mch,
+ uint8_t *macaddr, uint16_t flags, vsw_vlanid_t *vids, int nvids);
+static void vsw_mac_remove_vlans(mac_client_handle_t mch, vsw_vlanid_t *vids,
+ int nvids);
static void vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu);
/* Support functions */
-static int vsw_prog_ports(vsw_t *);
int vsw_set_hw(vsw_t *, vsw_port_t *, int);
-int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
+void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
void vsw_reconfig_hw(vsw_t *);
-int vsw_mac_attach(vsw_t *vswp);
-void vsw_mac_detach(vsw_t *vswp);
int vsw_mac_open(vsw_t *vswp);
void vsw_mac_close(vsw_t *vswp);
+int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p,
+ int type);
+void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
+ mcst_addr_t *mcst_p, int type);
+int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
+void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
+void vsw_mac_cleanup_ports(vsw_t *vswp);
void vsw_unset_addrs(vsw_t *vswp);
void vsw_set_addrs(vsw_t *vswp);
-int vsw_get_hw_maddr(vsw_t *);
-mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
-void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr);
+mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
+void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
+void vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans,
+ uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids);
+void vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid,
+ vsw_vlanid_t *new_vids, int new_nvids);
+void vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans,
+ uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids);
+/*
+ * Functions imported from other files.
+ */
+extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
+extern void vsw_hio_stop_port(vsw_port_t *portp);
+extern void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
+extern uint32_t vsw_publish_macaddr_count;
+extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
+ mblk_t **npt);
static char mac_mtu_propname[] = "mtu";
/*
* Tunables used in this file.
*/
extern int vsw_mac_open_retries;
-extern boolean_t vsw_multi_ring_enable;
-extern int vsw_mac_rx_rings;
-extern uint32_t vsw_publish_macaddr_count;
-/*
- * Check to see if the card supports the setting of multiple unicst
- * addresses.
- *
- * Returns 0 if card supports the programming of multiple unicast addresses,
- * otherwise returns 1.
- */
-int
-vsw_get_hw_maddr(vsw_t *vswp)
-{
- D1(vswp, "%s: enter", __func__);
- ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
+#define WRITE_MACCL_ENTER(vswp, port, type) \
+ (type == VSW_LOCALDEV) ? rw_enter(&vswp->maccl_rwlock, RW_WRITER) :\
+ rw_enter(&port->maccl_rwlock, RW_WRITER)
- if (vswp->mh == NULL)
- return (1);
+#define READ_MACCL_ENTER(vswp, port, type) \
+ (type == VSW_LOCALDEV) ? rw_enter(&vswp->maccl_rwlock, RW_READER) :\
+ rw_enter(&port->maccl_rwlock, RW_READER)
- if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
- cmn_err(CE_NOTE, "!vsw%d: device (%s) does not support "
- "programming multiple addresses", vswp->instance,
- vswp->physname);
- return (1);
- }
+#define RW_MACCL_EXIT(vswp, port, type) \
+ (type == VSW_LOCALDEV) ? rw_exit(&vswp->maccl_rwlock) : \
+ rw_exit(&port->maccl_rwlock)
- D2(vswp, "%s: %d addrs : %d free", __func__,
- vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
- D1(vswp, "%s: exit", __func__);
+/*
+ * Locking strategy in this file is explained as follows:
+ * - A global lock (vswp->mac_lock) is used to protect the
+ * MAC calls that deal with the entire device. That is, the
+ * operations that deal with mac_handle which include
+ * mac_open()/close() and mac_client_open().
+ *
+ * - A per port/interface RW lock (maccl_rwlock) is used to protect
+ * the operations that deal with the MAC client.
+ *
+ * When both mac_lock and maccl_rwlock need to be held, the
+ * mac_lock must be acquired first, followed by maccl_rwlock. That is,
+ * mac_lock---->maccl_rwlock
+ *
+ * The 'mca_lock' that protects the mcast list is also acquired
+ * within the context of maccl_rwlock. The hierarchy for this
+ * one is as below:
+ * maccl_rwlock---->mca_lock
+ */
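A minimal sketch (not part of this patch) of a port-side caller honoring the hierarchy above, using the WRITE_MACCL_ENTER/RW_MACCL_EXIT macros defined earlier:

	mutex_enter(&vswp->mac_lock);			/* device-wide MAC handle */
	WRITE_MACCL_ENTER(vswp, port, VSW_VNETPORT);	/* per-port MAC client */
	mutex_enter(&port->mca_lock);			/* multicast list */
	/* ... mac_client_open() / address programming would go here ... */
	mutex_exit(&port->mca_lock);
	RW_MACCL_EXIT(vswp, port, VSW_VNETPORT);
	mutex_exit(&vswp->mac_lock);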
- return (0);
-}
/*
* Program unicast and multicast addresses of vsw interface and the ports
- * into the physical device.
+ * into the network device.
*/
void
vsw_set_addrs(vsw_t *vswp)
{
vsw_port_list_t *plist = &vswp->plist;
vsw_port_t *port;
- mcst_addr_t *mcap;
int rv;
READ_ENTER(&vswp->if_lockrw);
if (vswp->if_state & VSW_IF_UP) {
- /* program unicst addr of vsw interface in the physdev */
- if (vswp->addr_set == VSW_ADDR_UNSET) {
- mutex_enter(&vswp->hw_lock);
- rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
- mutex_exit(&vswp->hw_lock);
- if (rv != 0) {
- cmn_err(CE_NOTE,
- "!vsw%d: failed to program interface "
- "unicast address\n", vswp->instance);
- }
- /*
- * Notify the MAC layer of the changed address.
- */
- mac_unicst_update(vswp->if_mh,
- (uint8_t *)&vswp->if_addr);
+ /* Open a mac client and program addresses */
+ rv = vsw_mac_client_init(vswp, NULL, VSW_LOCALDEV);
+ if (rv != 0) {
+ cmn_err(CE_NOTE,
+ "!vsw%d: failed to program interface "
+ "unicast address\n", vswp->instance);
}
- /* program mcast addrs of vsw interface in the physdev */
- mutex_enter(&vswp->mca_lock);
- WRITE_ENTER(&vswp->mac_rwlock);
- for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
- if (mcap->mac_added)
- continue;
- rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
- if (rv == 0) {
- mcap->mac_added = B_TRUE;
- } else {
- cmn_err(CE_NOTE, "!vsw%d: unable to add "
- "multicast address: %s\n", vswp->instance,
- ether_sprintf((void *)&mcap->mca));
- }
+ /*
+ * Notify the MAC layer of the changed address.
+ */
+ if (rv == 0) {
+ mac_unicst_update(vswp->if_mh,
+ (uint8_t *)&vswp->if_addr);
}
- RW_EXIT(&vswp->mac_rwlock);
- mutex_exit(&vswp->mca_lock);
}
@@ -198,43 +198,24 @@ vsw_set_addrs(vsw_t *vswp)
WRITE_ENTER(&plist->lockrw);
- /* program unicast address of ports in the physical device */
- mutex_enter(&vswp->hw_lock);
+ /* program unicast address of ports in the network device */
for (port = plist->head; port != NULL; port = port->p_next) {
- if (port->addr_set != VSW_ADDR_UNSET) /* addr already set */
+ if (port->addr_set) /* addr already set */
continue;
- if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
- cmn_err(CE_NOTE,
- "!vsw%d: port:%d failed to set unicast address\n",
- vswp->instance, port->p_instance);
- }
- }
- mutex_exit(&vswp->hw_lock);
- /* program multicast addresses of ports in the physdev */
- for (port = plist->head; port != NULL; port = port->p_next) {
- mutex_enter(&port->mca_lock);
- WRITE_ENTER(&vswp->mac_rwlock);
- for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
- if (mcap->mac_added)
- continue;
- rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
- if (rv == 0) {
- mcap->mac_added = B_TRUE;
- } else {
- cmn_err(CE_NOTE, "!vsw%d: unable to add "
- "multicast address: %s\n", vswp->instance,
- ether_sprintf((void *)&mcap->mca));
- }
+ /* Open a mac client and program addresses */
+ rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
+ if (rv != 0) {
+ cmn_err(CE_NOTE,
+ "!vsw%d: failed to program port(%d) "
+ "unicast address\n", vswp->instance,
+ port->p_instance);
}
- RW_EXIT(&vswp->mac_rwlock);
- mutex_exit(&port->mca_lock);
}
-
/* announce macaddr of vnets to the physical switch */
if (vsw_publish_macaddr_count != 0) { /* enabled */
for (port = plist->head; port != NULL; port = port->p_next) {
- vsw_publish_macaddr(vswp, (uint8_t *)&port->p_macaddr);
+ vsw_publish_macaddr(vswp, port);
}
}
@@ -242,93 +223,37 @@ vsw_set_addrs(vsw_t *vswp)
}
/*
- * Remove unicast and multicast addresses of vsw interface and the ports
- * from the physical device.
+ * Remove unicast and multicast addresses and close the mac clients
+ * for the vsw interface and all ports.
*/
void
vsw_unset_addrs(vsw_t *vswp)
{
- vsw_port_list_t *plist = &vswp->plist;
- vsw_port_t *port;
- mcst_addr_t *mcap;
-
READ_ENTER(&vswp->if_lockrw);
-
if (vswp->if_state & VSW_IF_UP) {
- /*
- * Remove unicast addr of vsw interfce
- * from current physdev
- */
- mutex_enter(&vswp->hw_lock);
- (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
- mutex_exit(&vswp->hw_lock);
-
- /*
- * Remove mcast addrs of vsw interface
- * from current physdev
- */
- mutex_enter(&vswp->mca_lock);
- WRITE_ENTER(&vswp->mac_rwlock);
- for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
- if (!mcap->mac_added)
- continue;
- (void) mac_multicst_remove(vswp->mh,
- (uchar_t *)&mcap->mca);
- mcap->mac_added = B_FALSE;
- }
- RW_EXIT(&vswp->mac_rwlock);
- mutex_exit(&vswp->mca_lock);
-
+ /* Cleanup and close the mac client for the interface */
+ vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV);
}
-
RW_EXIT(&vswp->if_lockrw);
- WRITE_ENTER(&plist->lockrw);
-
- /*
- * Remove unicast address of ports from the current physical device
- */
- mutex_enter(&vswp->hw_lock);
- for (port = plist->head; port != NULL; port = port->p_next) {
- /* Remove address if was programmed into HW. */
- if (port->addr_set == VSW_ADDR_UNSET)
- continue;
- (void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
- }
- mutex_exit(&vswp->hw_lock);
-
- /* Remove multicast addresses of ports from the current physdev */
- for (port = plist->head; port != NULL; port = port->p_next) {
- mutex_enter(&port->mca_lock);
- WRITE_ENTER(&vswp->mac_rwlock);
- for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
- if (!mcap->mac_added)
- continue;
- (void) mac_multicst_remove(vswp->mh,
- (uchar_t *)&mcap->mca);
- mcap->mac_added = B_FALSE;
- }
- RW_EXIT(&vswp->mac_rwlock);
- mutex_exit(&port->mca_lock);
- }
-
- RW_EXIT(&plist->lockrw);
+ /* Cleanup and close the mac clients for all ports */
+ vsw_mac_cleanup_ports(vswp);
}
/*
- * Open the underlying physical device for access in layer2 mode.
+ * Open the underlying network device for access in layer2 mode.
* Returns:
- * 0 on success
- * EAGAIN if mac_open() fails due to the device being not available yet.
- * EIO on any other failures.
+ * 0 on success
+ * EAGAIN if mac_open() fails due to the device being not available yet.
+ * EIO on any other failures.
*/
int
vsw_mac_open(vsw_t *vswp)
{
- int rv;
+ int rv;
- ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
+ ASSERT(MUTEX_HELD(&vswp->mac_lock));
if (vswp->mh != NULL) {
/* already open */
@@ -352,14 +277,15 @@ vsw_mac_open(vsw_t *vswp)
if (rv == ENOENT || rv == EBADF) {
return (EAGAIN);
} else {
- cmn_err(CE_WARN, "vsw%d: device (%s) open failed rv:%x",
+ cmn_err(CE_WARN, "vsw%d: mac_open %s failed rv:%x",
vswp->instance, vswp->physname, rv);
return (EIO);
}
}
-
vswp->mac_open_retries = 0;
+ vsw_mac_set_mtu(vswp, vswp->mtu);
+
return (0);
}
@@ -369,1005 +295,852 @@ vsw_mac_open(vsw_t *vswp)
void
vsw_mac_close(vsw_t *vswp)
{
- ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
+ ASSERT(MUTEX_HELD(&vswp->mac_lock));
if (vswp->mh != NULL) {
+ if (vswp->mtu != vswp->mtu_physdev_orig) {
+ vsw_mac_set_mtu(vswp, vswp->mtu_physdev_orig);
+ }
mac_close(vswp->mh);
vswp->mh = NULL;
}
}
/*
- * Link into the MAC layer to gain access to the services provided by
- * the underlying physical device driver (which should also have
- * registered with the MAC layer).
- *
- * Only when in layer 2 mode.
+ * Add multicast addr.
*/
int
-vsw_mac_attach(vsw_t *vswp)
+vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p,
+ int type)
{
- D1(vswp, "%s: enter", __func__);
-
- ASSERT(vswp->mrh == NULL);
- ASSERT(vswp->mstarted == B_FALSE);
- ASSERT(vswp->mresources == B_FALSE);
-
- ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
-
- ASSERT(vswp->mh != NULL);
-
- D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
-
- vsw_mac_set_mtu(vswp, vswp->mtu);
-
- if (vsw_multi_ring_enable) {
- /*
- * Initialize the ring table.
- */
- vsw_mac_ring_tbl_init(vswp);
-
- /*
- * Register our rx callback function.
- */
- vswp->mrh = mac_rx_add(vswp->mh,
- vsw_rx_queue_cb, (void *)vswp);
- ASSERT(vswp->mrh != NULL);
-
- /*
- * Register our mac resource callback.
- */
- mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
- vswp->mresources = B_TRUE;
-
- /*
- * Get the ring resources available to us from
- * the mac below us.
- */
- mac_resources(vswp->mh);
- } else {
- /*
- * Just register our rx callback function
- */
- vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
- ASSERT(vswp->mrh != NULL);
- }
-
- /* Get the MAC tx fn */
- vswp->txinfo = mac_tx_get(vswp->mh);
-
- /* start the interface */
- if (mac_start(vswp->mh) != 0) {
- cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
- vswp->instance);
- goto mac_fail_exit;
+ int ret = 0;
+ mac_client_handle_t mch;
+
+ WRITE_MACCL_ENTER(vswp, port, type);
+
+ mch = (type == VSW_LOCALDEV) ? vswp->mch : port->p_mch;
+
+ if (mch != NULL) {
+ ret = mac_multicast_add(mch, mcst_p->mca.ether_addr_octet);
+ if (ret != 0) {
+ cmn_err(CE_WARN, "!vsw%d: unable to "
+ "program multicast address(%s) err=%d",
+ vswp->instance,
+ ether_sprintf((void *)&mcst_p->mca), ret);
+ RW_MACCL_EXIT(vswp, port, type);
+ return (ret);
+ }
+ mcst_p->mac_added = B_TRUE;
}
- vswp->mstarted = B_TRUE;
-
- D1(vswp, "%s: exit", __func__);
- return (0);
-
-mac_fail_exit:
- vsw_mac_detach(vswp);
-
- D1(vswp, "%s: exit", __func__);
- return (1);
+ RW_MACCL_EXIT(vswp, port, type);
+ return (ret);
}
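These add/remove routines are exported to the switching code (see the externs for them later in this diff). A hypothetical caller would pair them like so, with mcst_p being an entry already linked on the port's mcap list:

	if (vsw_mac_multicast_add(vswp, port, mcst_p, VSW_VNETPORT) == 0) {
		/* ... later, on teardown ... */
		vsw_mac_multicast_remove(vswp, port, mcst_p, VSW_VNETPORT);
	}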
+/*
+ * Remove multicast addr.
+ */
void
-vsw_mac_detach(vsw_t *vswp)
+vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p,
+ int type)
{
- D1(vswp, "vsw_mac_detach: enter");
-
- ASSERT(vswp != NULL);
- ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
+ mac_client_handle_t mch;
- if (vsw_multi_ring_enable) {
- vsw_mac_ring_tbl_destroy(vswp);
- }
+ WRITE_MACCL_ENTER(vswp, port, type);
+ mch = (type == VSW_LOCALDEV) ? vswp->mch : port->p_mch;
- if (vswp->mh != NULL) {
- if (vswp->mstarted)
- mac_stop(vswp->mh);
- if (vswp->mrh != NULL)
- mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
- if (vswp->mresources)
- mac_resource_set(vswp->mh, NULL, NULL);
- if (vswp->mtu != vswp->mtu_physdev_orig) {
- vsw_mac_set_mtu(vswp, vswp->mtu_physdev_orig);
- }
+ if (mch != NULL && mcst_p->mac_added) {
+ mac_multicast_remove(mch, mcst_p->mca.ether_addr_octet);
+ mcst_p->mac_added = B_FALSE;
}
-
- vswp->mrh = NULL;
- vswp->txinfo = NULL;
- vswp->mstarted = B_FALSE;
-
- D1(vswp, "vsw_mac_detach: exit");
+ RW_MACCL_EXIT(vswp, port, type);
}
+
/*
- * Depending on the mode specified, the capabilites and capacity
- * of the underlying device setup the physical device.
- *
- * If in layer 3 mode, then do nothing.
- *
- * If in layer 2 programmed mode attempt to program the unicast address
- * associated with the port into the physical device. If this is not
- * possible due to resource exhaustion or simply because the device does
- * not support multiple unicast addresses then if required fallback onto
- * putting the card into promisc mode.
- *
- * If in promisc mode then simply set the card into promisc mode.
- *
- * Returns 0 success, 1 on failure.
+ * Add all multicast addresses of the port.
*/
-int
-vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
+static void
+vsw_mac_multicast_add_all(vsw_t *vswp, vsw_port_t *portp, int type)
{
- mac_multi_addr_t mac_addr;
- int err;
+ mcst_addr_t *mcap;
+ mac_client_handle_t mch;
+ kmutex_t *mca_lockp;
+ int rv;
- D1(vswp, "%s: enter", __func__);
-
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
-
- if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
- return (0);
-
- if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
- return (vsw_set_hw_promisc(vswp, port, type));
- }
-
- /*
- * Attempt to program the unicast address into the HW.
- */
- mac_addr.mma_addrlen = ETHERADDRL;
- if (type == VSW_VNETPORT) {
- ASSERT(port != NULL);
- ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
+ if (type == VSW_LOCALDEV) {
+ ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock));
+ mch = vswp->mch;
+ mcap = vswp->mcap;
+ mca_lockp = &vswp->mca_lock;
} else {
- ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
+ ASSERT(RW_WRITE_HELD(&portp->maccl_rwlock));
+ mch = portp->p_mch;
+ mcap = portp->mcap;
+ mca_lockp = &portp->mca_lock;
}
- err = vsw_set_hw_addr(vswp, &mac_addr);
- if (err == ENOSPC) {
- /*
- * Mark that attempt should be made to re-config sometime
- * in future if a port is deleted.
- */
- vswp->recfg_reqd = B_TRUE;
-
- /*
- * Only 1 mode specified, nothing more to do.
- */
- if (vswp->smode_num == 1)
- return (err);
+ if (mch == NULL)
+ return;
- /*
- * If promiscuous was next mode specified try to
- * set the card into that mode.
- */
- if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
- (vswp->smode[vswp->smode_idx + 1] ==
- VSW_LAYER2_PROMISC)) {
- vswp->smode_idx += 1;
- return (vsw_set_hw_promisc(vswp, port, type));
+ mutex_enter(mca_lockp);
+ for (; mcap != NULL; mcap = mcap->nextp) {
+ if (mcap->mac_added)
+ continue;
+ rv = mac_multicast_add(mch, (uchar_t *)&mcap->mca);
+ if (rv == 0) {
+ mcap->mac_added = B_TRUE;
+ } else {
+ cmn_err(CE_WARN, "!vsw%d: unable to program "
+ "multicast address(%s) err=%d", vswp->instance,
+ ether_sprintf((void *)&mcap->mca), rv);
}
- return (err);
}
+ mutex_exit(mca_lockp);
+}
- if (err != 0)
- return (err);
+/*
+ * Remove all multicast addresses of the port.
+ */
+static void
+vsw_mac_multicast_remove_all(vsw_t *vswp, vsw_port_t *portp, int type)
+{
+ mac_client_handle_t mch;
+ mcst_addr_t *mcap;
+ kmutex_t *mca_lockp;
- if (type == VSW_VNETPORT) {
- port->addr_slot = mac_addr.mma_slot;
- port->addr_set = VSW_ADDR_HW;
+ ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
+ if (type == VSW_LOCALDEV) {
+ ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock));
+ mch = vswp->mch;
+ mcap = vswp->mcap;
+ mca_lockp = &vswp->mca_lock;
} else {
- vswp->addr_slot = mac_addr.mma_slot;
- vswp->addr_set = VSW_ADDR_HW;
+ ASSERT(RW_WRITE_HELD(&portp->maccl_rwlock));
+ mch = portp->p_mch;
+ mcap = portp->mcap;
+ mca_lockp = &portp->mca_lock;
}
- D2(vswp, "programmed addr %s into slot %d "
- "of device %s", ether_sprintf((void *)mac_addr.mma_addr),
- mac_addr.mma_slot, vswp->physname);
-
- D1(vswp, "%s: exit", __func__);
+ if (mch == NULL)
+ return;
- return (0);
+ mutex_enter(mca_lockp);
+ for (; mcap != NULL; mcap = mcap->nextp) {
+ if (!mcap->mac_added)
+ continue;
+ (void) mac_multicast_remove(mch, (uchar_t *)&mcap->mca);
+ mcap->mac_added = B_FALSE;
+ }
+ mutex_exit(mca_lockp);
}
/*
- * If in layer 3 mode do nothing.
- *
- * If in layer 2 switched mode remove the address from the physical
- * device.
- *
- * If in layer 2 promiscuous mode disable promisc mode.
- *
- * Returns 0 on success.
+ * Open a mac client and program unicast and multicast addresses
+ * for a port or the interface.
+ * Returns:
+ * 0 on success
+ * non-zero for failure.
*/
int
-vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
+vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type)
{
- mac_addr_slot_t slot;
- int rv;
-
- D1(vswp, "%s: enter", __func__);
-
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
-
- if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
- return (0);
+ int rv;
- switch (type) {
- case VSW_VNETPORT:
- ASSERT(port != NULL);
-
- if (port->addr_set == VSW_ADDR_PROMISC) {
- return (vsw_unset_hw_promisc(vswp, port, type));
-
- } else if (port->addr_set == VSW_ADDR_HW) {
- slot = port->addr_slot;
- if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
- port->addr_set = VSW_ADDR_UNSET;
- }
+ mutex_enter(&vswp->mac_lock);
+ WRITE_MACCL_ENTER(vswp, port, type);
+ rv = vsw_maccl_open(vswp, port, type);
- break;
+ /* Release mac_lock now */
+ mutex_exit(&vswp->mac_lock);
- case VSW_LOCALDEV:
- if (vswp->addr_set == VSW_ADDR_PROMISC) {
- return (vsw_unset_hw_promisc(vswp, NULL, type));
-
- } else if (vswp->addr_set == VSW_ADDR_HW) {
- slot = vswp->addr_slot;
- if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
- vswp->addr_set = VSW_ADDR_UNSET;
- }
-
- break;
-
- default:
- /* should never happen */
- DERR(vswp, "%s: unknown type %d", __func__, type);
- ASSERT(0);
- return (1);
+ if (rv == 0) {
+ (void) vsw_set_hw(vswp, port, type);
+ vsw_mac_multicast_add_all(vswp, port, type);
}
-
- D1(vswp, "%s: exit", __func__);
+ RW_MACCL_EXIT(vswp, port, type);
return (rv);
}
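In outline, and matching the call sites in vsw_port_attach()/vsw_port_detach() earlier in this diff, a port's MAC client lifecycle reduces to:

	/* attach: open the client, program unicast addr, vlans, mcast list */
	if (vsw_mac_client_init(vswp, port, VSW_VNETPORT) != 0)
		return (1);

	/* detach: undo all of the above and close the client */
	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);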
/*
- * Attempt to program a unicast address into HW.
+ * Open a MAC client for a port or an interface.
+ * The flags and their purpose as below:
*
- * Returns 0 on sucess, 1 on failure.
+ * MAC_OPEN_FLAGS_NO_HWRINGS -- This flag is used by default
+ * for all ports/interface so that they are associated with
+ * default group & resources. It is not used for ports that
+ * have HybridIO enabled, so that the h/w resources can be
+ * assigned to them.
+ *
+ * MAC_OPEN_FLAGS_SHARES_DESIRED -- This flag is used to indicate
+ * that a port desires a Share. This will be the case with
+ * the ports that have hybrid mode enabled. This will only cause
+ * the MAC layer to allocate a Share and corresponding resources
+ * ahead of time.
+ *
+ * MAC_OPEN_FLAGS_TAG_DISABLE -- This flag is used for VLAN
+ * support. It will cause MAC to not add any tags, but expect
+ * vsw to tag the packets.
+ *
+ * MAC_OPEN_FLAGS_STRIP_DISABLE -- This flag is used for VLAN
+ * support. It will cause the MAC layer to not strip the tags.
+ * Vsw may have to strip the tag for pvid case.
*/
static int
-vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
+vsw_maccl_open(vsw_t *vswp, vsw_port_t *port, int type)
{
- void *mah;
- int rv = EINVAL;
-
- D1(vswp, "%s: enter", __func__);
-
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
-
- if (vswp->maddr.maddr_handle == NULL)
- return (rv);
-
- mah = vswp->maddr.maddr_handle;
-
- rv = vswp->maddr.maddr_add(mah, mac);
+ int rv = 0;
+ int instance;
+ char mac_cl_name[MAXNAMELEN];
+ const char *dev_name;
+ mac_client_handle_t *mchp;
+ uint64_t flags = (MAC_OPEN_FLAGS_NO_HWRINGS |
+ MAC_OPEN_FLAGS_TAG_DISABLE |
+ MAC_OPEN_FLAGS_STRIP_DISABLE);
+
+ ASSERT(MUTEX_HELD(&vswp->mac_lock));
+ if (vswp->mh == NULL) {
+ /*
+ * In case net-dev is changed (either set to nothing or to an
+ * aggregation device), return success here, as the
+ * timeout mechanism will handle it.
+ */
+ return (0);
+ }
- if (rv == 0)
- return (rv);
+ mchp = (type == VSW_LOCALDEV) ? &vswp->mch : &port->p_mch;
+ if (*mchp != NULL) {
+ /* already open */
+ return (0);
+ }
+ dev_name = ddi_driver_name(vswp->dip);
+ instance = ddi_get_instance(vswp->dip);
+ if (type == VSW_VNETPORT) {
+ if (port->p_hio_enabled == B_TRUE) {
+ flags &= ~MAC_OPEN_FLAGS_NO_HWRINGS;
+ flags |= MAC_OPEN_FLAGS_SHARES_DESIRED;
+ }
+ (void) snprintf(mac_cl_name, MAXNAMELEN, "%s%d%s%d", dev_name,
+ instance, "_port", port->p_instance);
+ } else {
+ (void) snprintf(mac_cl_name, MAXNAMELEN, "%s%s%d",
+ dev_name, "_if", instance);
+ }
- /*
- * Its okay for the add to fail because we have exhausted
- * all the resouces in the hardware device. Any other error
- * we want to flag.
- */
- if (rv != ENOSPC) {
- cmn_err(CE_NOTE, "!vsw%d: error programming "
- "address %s into HW err (%d)",
- vswp->instance, ether_sprintf((void *)mac->mma_addr), rv);
+ rv = mac_client_open(vswp->mh, mchp, mac_cl_name, flags);
+ if (rv != 0) {
+ cmn_err(CE_NOTE, "!vsw%d:%s mac_client_open() failed\n",
+ vswp->instance, mac_cl_name);
}
- D1(vswp, "%s: exit", __func__);
return (rv);
}
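For concreteness, the two snprintf() calls above produce MAC client names of the following shapes (instance numbers illustrative):

	/* type == VSW_VNETPORT: "%s%d%s%d" -> e.g. "vsw0_port3" */
	/* type == VSW_LOCALDEV: "%s%s%d"  -> e.g. "vsw_if0" */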
/*
- * Remove a unicast mac address which has previously been programmed
- * into HW.
- *
- * Returns 0 on sucess, 1 on failure.
+ * Clean up by removing unicast and multicast addresses and
+ * closing the MAC client for a port or the interface.
*/
-static int
-vsw_unset_hw_addr(vsw_t *vswp, int slot)
+void
+vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type)
{
- void *mah;
- int rv;
-
- D1(vswp, "%s: enter", __func__);
-
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
- ASSERT(slot >= 0);
+ WRITE_MACCL_ENTER(vswp, port, type);
+ vsw_unset_hw(vswp, port, type);
+ vsw_maccl_close(vswp, port, type);
+ vsw_mac_multicast_remove_all(vswp, port, type);
+ RW_MACCL_EXIT(vswp, port, type);
+}
- if (vswp->maddr.maddr_handle == NULL)
- return (1);
+/*
+ * Close a MAC client for a port or an interface.
+ */
+static void
+vsw_maccl_close(vsw_t *vswp, vsw_port_t *port, int type)
+{
+ mac_client_handle_t *mchp;
- mah = vswp->maddr.maddr_handle;
+ ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
- rv = vswp->maddr.maddr_remove(mah, slot);
- if (rv != 0) {
- DWARN(vswp, "%s: unable to remove address "
- "from slot %d in device %s (err %d)",
- __func__, slot, vswp->physname, rv);
- return (1);
+ mchp = (type == VSW_LOCALDEV) ? &vswp->mch : &port->p_mch;
+ if (*mchp != NULL) {
+ mac_client_close(*mchp, 0);
+ *mchp = NULL;
}
+}
- D2(vswp, "removed addr from slot %d in device %s",
- slot, vswp->physname);
+/*
+ * Clean up MAC client state for all ports.
+ */
+void
+vsw_mac_cleanup_ports(vsw_t *vswp)
+{
+ vsw_port_list_t *plist = &vswp->plist;
+ vsw_port_t *port;
- D1(vswp, "%s: exit", __func__);
- return (0);
+ READ_ENTER(&plist->lockrw);
+ for (port = plist->head; port != NULL; port = port->p_next) {
+ vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
+ }
+ RW_EXIT(&plist->lockrw);
}
/*
- * Set network card into promisc mode.
+ * Depending on the mode specified and the capabilities and capacity
+ * of the underlying device, set up the physical device.
+ *
+ * If in layer 3 mode, then do nothing.
*
- * Returns 0 on success, 1 on failure.
+ * If in layer 2 mode, open a mac client and program the mac-address
+ * and vlan-ids. The MAC layer will take care of programming
+ * the address into h/w or set the h/w into promiscuous mode.
+ *
+ * Returns 0 on success, 1 on failure.
*/
-static int
-vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
+int
+vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
+ int err = 1;
+
D1(vswp, "%s: enter", __func__);
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
- WRITE_ENTER(&vswp->mac_rwlock);
- if (vswp->mh == NULL) {
- RW_EXIT(&vswp->mac_rwlock);
- return (1);
- }
-
- if (vswp->promisc_cnt++ == 0) {
- if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
- vswp->promisc_cnt--;
- RW_EXIT(&vswp->mac_rwlock);
- return (1);
- }
- cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
- "promiscuous mode", vswp->instance, vswp->physname);
- }
- RW_EXIT(&vswp->mac_rwlock);
+ if (vswp->smode == VSW_LAYER3)
+ return (0);
if (type == VSW_VNETPORT) {
ASSERT(port != NULL);
- port->addr_set = VSW_ADDR_PROMISC;
+ err = vsw_set_port_hw_addr(port);
} else {
- vswp->addr_set = VSW_ADDR_PROMISC;
+ err = vsw_set_if_hw_addr(vswp);
}
D1(vswp, "%s: exit", __func__);
-
- return (0);
+ return (err);
}
/*
- * Turn off promiscuous mode on network card.
+ * If in layer 3 mode do nothing.
*
- * Returns 0 on success, 1 on failure.
+ * If in layer 2 switched mode remove the address from the physical
+ * device.
+ *
+ * If in layer 2 promiscuous mode disable promisc mode.
+ *
*/
-static int
-vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
+void
+vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
- vsw_port_list_t *plist = &vswp->plist;
-
- D2(vswp, "%s: enter", __func__);
+ D1(vswp, "%s: enter", __func__);
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
- WRITE_ENTER(&vswp->mac_rwlock);
- if (vswp->mh == NULL) {
- RW_EXIT(&vswp->mac_rwlock);
- return (1);
- }
-
- if (--vswp->promisc_cnt == 0) {
- if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
- vswp->promisc_cnt++;
- RW_EXIT(&vswp->mac_rwlock);
- return (1);
- }
-
- /*
- * We are exiting promisc mode either because we were
- * only in promisc mode because we had failed over from
- * switched mode due to HW resource issues, or the user
- * wanted the card in promisc mode for all the ports and
- * the last port is now being deleted. Tweak the message
- * accordingly.
- */
- if (plist->num_ports != 0) {
- cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
- "programmed mode", vswp->instance, vswp->physname);
- } else {
- cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
- "promiscuous mode", vswp->instance, vswp->physname);
- }
- }
- RW_EXIT(&vswp->mac_rwlock);
+ if (vswp->smode == VSW_LAYER3)
+ return;
if (type == VSW_VNETPORT) {
ASSERT(port != NULL);
- ASSERT(port->addr_set == VSW_ADDR_PROMISC);
- port->addr_set = VSW_ADDR_UNSET;
+ vsw_unset_hw_addr(vswp, port, type);
} else {
- ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
- vswp->addr_set = VSW_ADDR_UNSET;
+ vsw_unset_hw_addr(vswp, NULL, type);
}
D1(vswp, "%s: exit", __func__);
- return (0);
}
/*
- * Determine whether or not we are operating in our prefered
- * mode and if not whether the physical resources now allow us
- * to operate in it.
+ * Program the macaddress and vlans of a port.
*
- * If a port is being removed should only be invoked after port has been
- * removed from the port list.
+ * Returns 0 on success, 1 on failure.
*/
-void
-vsw_reconfig_hw(vsw_t *vswp)
+static int
+vsw_set_port_hw_addr(vsw_port_t *port)
{
- int s_idx;
+ vsw_t *vswp = port->p_vswp;
+ uint16_t mac_flags = 0;
+ mac_diag_t diag;
+ uint8_t *macaddr;
+ uint16_t vid = VLAN_ID_NONE;
+ int rv = 0;
D1(vswp, "%s: enter", __func__);
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
-
- if (vswp->maddr.maddr_handle == NULL) {
- return;
- }
+ ASSERT(RW_WRITE_HELD(&port->maccl_rwlock));
+ if (port->p_mch == NULL)
+ return (0);
/*
- * If we are in layer 2 (i.e. switched) or would like to be
- * in layer 2 then check if any ports or the vswitch itself
- * need to be programmed into the HW.
- *
- * This can happen in two cases - switched was specified as
- * the prefered mode of operation but we exhausted the HW
- * resources and so failed over to the next specifed mode,
- * or switched was the only mode specified so after HW
- * resources were exhausted there was nothing more we
- * could do.
+ * If the port has a specific 'pvid', then
+ * register with that vlan-id, otherwise register
+ * with VLAN_ID_NONE.
*/
- if (vswp->smode_idx > 0)
- s_idx = vswp->smode_idx - 1;
- else
- s_idx = vswp->smode_idx;
-
- if (vswp->smode[s_idx] != VSW_LAYER2) {
- return;
+ if (port->pvid != vswp->default_vlan_id) {
+ vid = port->pvid;
}
+ macaddr = (uint8_t *)port->p_macaddr.ether_addr_octet;
- D2(vswp, "%s: attempting reconfig..", __func__);
-
- /*
- * First, attempt to set the vswitch mac address into HW,
- * if required.
- */
- if (vsw_prog_if(vswp)) {
- return;
+ if (!(vswp->smode & VSW_LAYER2_PROMISC)) {
+ mac_flags |= MAC_UNICAST_HW;
}
- /*
- * Next, attempt to set any ports which have not yet been
- * programmed into HW.
- */
- if (vsw_prog_ports(vswp)) {
- return;
+ if (port->addr_set == B_FALSE) {
+ port->p_muh = NULL;
+ rv = mac_unicast_add(port->p_mch, macaddr, mac_flags,
+ &port->p_muh, vid, &diag);
+
+ if (rv != 0) {
+ cmn_err(CE_WARN, "vsw%d: Failed to program"
+ "macaddr,vid(%s, %d) err=%d",
+ vswp->instance, ether_sprintf((void *)macaddr),
+ vid, rv);
+ return (rv);
+ }
+ port->addr_set = B_TRUE;
+
+ D2(vswp, "%s:programmed macaddr(%s) vid(%d) into device %s",
+ __func__, ether_sprintf((void *)macaddr), vid,
+ vswp->physname);
}
- /*
- * By now we know that have programmed all desired ports etc
- * into HW, so safe to mark reconfiguration as complete.
- */
- vswp->recfg_reqd = B_FALSE;
+ /* Add vlans to the MAC layer */
+ vsw_mac_add_vlans(vswp, port->p_mch, macaddr,
+ mac_flags, port->vids, port->nvids);
- vswp->smode_idx = s_idx;
+ mac_rx_set(port->p_mch, vsw_port_rx_cb, (void *)port);
D1(vswp, "%s: exit", __func__);
+ return (rv);
}
/*
- * Check to see if vsw itself is plumbed, and if so whether or not
- * its mac address should be written into HW.
+ * Program the macaddress and vlans of the vsw interface.
*
- * Returns 0 if could set address, or didn't have to set it.
- * Returns 1 if failed to set address.
+ * Returns 0 on success, 1 on failure.
*/
static int
-vsw_prog_if(vsw_t *vswp)
+vsw_set_if_hw_addr(vsw_t *vswp)
{
- mac_multi_addr_t addr;
+ uint16_t mac_flags = 0;
+ mac_diag_t diag;
+ uint8_t *macaddr;
+ uint8_t primary_addr[ETHERADDRL];
+ uint16_t vid = VLAN_ID_NONE;
+ int rv = 0;
D1(vswp, "%s: enter", __func__);
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
+ ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock));
+ if (vswp->mch == NULL)
+ return (0);
- READ_ENTER(&vswp->if_lockrw);
- if ((vswp->if_state & VSW_IF_UP) &&
- (vswp->addr_set != VSW_ADDR_HW)) {
+ macaddr = (uint8_t *)vswp->if_addr.ether_addr_octet;
+
+ /* check if it is the primary macaddr of the card. */
+ mac_unicast_primary_get(vswp->mh, primary_addr);
+ if (ether_cmp((void *)primary_addr, (void*)macaddr) == 0) {
+ mac_flags |= MAC_UNICAST_PRIMARY;
+ }
+
+ /*
+ * If the interface has a specific 'pvid', then
+ * register with that vlan-id, otherwise register
+ * with VLAN_ID_NONE.
+ */
+ if (vswp->pvid != vswp->default_vlan_id) {
+ vid = vswp->pvid;
+ }
- addr.mma_addrlen = ETHERADDRL;
- ether_copy(&vswp->if_addr, &addr.mma_addr);
+ if (!(vswp->smode & VSW_LAYER2_PROMISC)) {
+ mac_flags |= MAC_UNICAST_HW;
+ }
- if (vsw_set_hw_addr(vswp, &addr) != 0) {
- RW_EXIT(&vswp->if_lockrw);
- return (1);
+ if (vswp->addr_set == B_FALSE) {
+ vswp->muh = NULL;
+ rv = mac_unicast_add(vswp->mch, macaddr, mac_flags,
+ &vswp->muh, vid, &diag);
+
+ if (rv != 0) {
+ cmn_err(CE_WARN, "vsw%d: Failed to program"
+ "macaddr,vid(%s, %d) err=%d",
+ vswp->instance, ether_sprintf((void *)macaddr),
+ vid, rv);
+ return (rv);
}
+ vswp->addr_set = B_TRUE;
- vswp->addr_slot = addr.mma_slot;
+ D2(vswp, "%s:programmed macaddr(%s) vid(%d) into device %s",
+ __func__, ether_sprintf((void *)macaddr), vid,
+ vswp->physname);
+ }
- /*
- * If previously when plumbed had had to place
- * interface into promisc mode, now reverse that.
- *
- * Note that interface will only actually be set into
- * non-promisc mode when last port/interface has been
- * programmed into HW.
- */
- if (vswp->addr_set == VSW_ADDR_PROMISC)
- (void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);
+ vsw_mac_add_vlans(vswp, vswp->mch, macaddr, mac_flags,
+ vswp->vids, vswp->nvids);
- vswp->addr_set = VSW_ADDR_HW;
- }
- RW_EXIT(&vswp->if_lockrw);
+ mac_rx_set(vswp->mch, vsw_if_rx_cb, (void *)vswp);
D1(vswp, "%s: exit", __func__);
- return (0);
+ return (rv);
}
/*
- * Scan the port list for any ports which have not yet been set
- * into HW. For those found attempt to program their mac addresses
- * into the physical device.
+ * Remove a unicast mac address which has previously been programmed
+ * into HW.
*
- * Returns 0 if able to program all required ports (can be 0) into HW.
- * Returns 1 if failed to set at least one mac address.
*/
-static int
-vsw_prog_ports(vsw_t *vswp)
+static void
+vsw_unset_hw_addr(vsw_t *vswp, vsw_port_t *port, int type)
{
- mac_multi_addr_t addr;
- vsw_port_list_t *plist = &vswp->plist;
- vsw_port_t *tp;
- int rv = 0;
+ vsw_vlanid_t *vids;
+ int nvids;
+ mac_client_handle_t mch = NULL;
D1(vswp, "%s: enter", __func__);
- ASSERT(MUTEX_HELD(&vswp->hw_lock));
+ ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
- READ_ENTER(&plist->lockrw);
- for (tp = plist->head; tp != NULL; tp = tp->p_next) {
- if (tp->addr_set != VSW_ADDR_HW) {
- addr.mma_addrlen = ETHERADDRL;
- ether_copy(&tp->p_macaddr, &addr.mma_addr);
-
- if (vsw_set_hw_addr(vswp, &addr) != 0) {
- rv = 1;
- break;
- }
-
- tp->addr_slot = addr.mma_slot;
-
- /*
- * If when this port had first attached we had
- * had to place the interface into promisc mode,
- * then now reverse that.
- *
- * Note that the interface will not actually
- * change to non-promisc mode until all ports
- * have been programmed.
- */
- if (tp->addr_set == VSW_ADDR_PROMISC)
- (void) vsw_unset_hw_promisc(vswp,
- tp, VSW_VNETPORT);
-
- tp->addr_set = VSW_ADDR_HW;
- }
+ if (type == VSW_VNETPORT) {
+ ASSERT(port != NULL);
+ ASSERT(RW_WRITE_HELD(&port->maccl_rwlock));
+ vids = port->vids;
+ nvids = port->nvids;
+ } else {
+ ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock));
+ vids = vswp->vids;
+ nvids = vswp->nvids;
}
- RW_EXIT(&plist->lockrw);
- D1(vswp, "%s: exit", __func__);
- return (rv);
-}
+ /* First clear the callback */
+ if (type == VSW_LOCALDEV) {
+ mch = vswp->mch;
+ } else if (type == VSW_VNETPORT) {
+ mch = port->p_mch;
+ }
-static void
-vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
-{
- ringp->ring_state = VSW_MAC_RING_FREE;
- ringp->ring_arg = NULL;
- ringp->ring_blank = NULL;
- ringp->ring_vqp = NULL;
- ringp->ring_vswp = vswp;
-}
-static void
-vsw_mac_ring_tbl_init(vsw_t *vswp)
-{
- int i;
+ if (mch == NULL) {
+ return;
+ }
- mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
+ mac_rx_clear(mch);
- vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
- vswp->mac_ring_tbl =
- kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP);
+ /* Remove vlans */
+ vsw_mac_remove_vlans(mch, vids, nvids);
- for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
- vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
-}
+ if ((type == VSW_LOCALDEV) && (vswp->addr_set == B_TRUE)) {
+ (void) mac_unicast_remove(vswp->mch, vswp->muh);
+ vswp->muh = NULL;
+ D2(vswp, "removed vsw interface mac-addr from "
+ "the device %s", vswp->physname);
+ vswp->addr_set = B_FALSE;
-static void
-vsw_mac_ring_tbl_destroy(vsw_t *vswp)
-{
- int i;
- vsw_mac_ring_t *ringp;
-
- mutex_enter(&vswp->mac_ring_lock);
- for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
- ringp = &vswp->mac_ring_tbl[i];
-
- if (ringp->ring_state != VSW_MAC_RING_FREE) {
- /*
- * Destroy the queue.
- */
- vsw_queue_stop(ringp->ring_vqp);
- vsw_queue_destroy(ringp->ring_vqp);
-
- /*
- * Re-initialize the structure.
- */
- vsw_mac_ring_tbl_entry_init(vswp, ringp);
- }
+ } else if ((type == VSW_VNETPORT) && (port->addr_set == B_TRUE)) {
+ (void) mac_unicast_remove(port->p_mch, port->p_muh);
+ port->p_muh = NULL;
+ D2(vswp, "removed port(0x%p) mac-addr from "
+ "the device %s", port, vswp->physname);
+ port->addr_set = B_FALSE;
}
- mutex_exit(&vswp->mac_ring_lock);
- mutex_destroy(&vswp->mac_ring_lock);
- kmem_free(vswp->mac_ring_tbl,
- vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
- vswp->mac_ring_tbl_sz = 0;
+ D1(vswp, "%s: exit", __func__);
}
/*
- * Handle resource add callbacks from the driver below.
+ * Receive callback routine for the vsw interface. Invoked by the MAC layer
+ * when packets are passed up from the physical device for this interface.
*/
-static mac_resource_handle_t
-vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
+/* ARGSUSED */
+static void
+vsw_if_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
{
+ _NOTE(ARGUNUSED(mrh))
+
vsw_t *vswp = (vsw_t *)arg;
- mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp;
- vsw_mac_ring_t *ringp;
- vsw_queue_t *vqp;
- int i;
+ mblk_t *mpt;
+ int count;
ASSERT(vswp != NULL);
- ASSERT(mrp != NULL);
- ASSERT(vswp->mac_ring_tbl != NULL);
D1(vswp, "%s: enter", __func__);
- /*
- * Check to make sure we have the correct resource type.
- */
- if (mrp->mr_type != MAC_RX_FIFO)
- return (NULL);
-
- /*
- * Find a open entry in the ring table.
- */
- mutex_enter(&vswp->mac_ring_lock);
- for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
- ringp = &vswp->mac_ring_tbl[i];
-
- /*
- * Check for an empty slot, if found, then setup queue
- * and thread.
- */
- if (ringp->ring_state == VSW_MAC_RING_FREE) {
- /*
- * Create the queue for this ring.
- */
- vqp = vsw_queue_create();
-
- /*
- * Initialize the ring data structure.
- */
- ringp->ring_vqp = vqp;
- ringp->ring_arg = mrfp->mrf_arg;
- ringp->ring_blank = mrfp->mrf_blank;
- ringp->ring_state = VSW_MAC_RING_INUSE;
-
- /*
- * Create the worker thread.
- */
- vqp->vq_worker = thread_create(NULL, 0,
- vsw_queue_worker, ringp, 0, &p0,
- TS_RUN, minclsyspri);
- if (vqp->vq_worker == NULL) {
- vsw_queue_destroy(vqp);
- vsw_mac_ring_tbl_entry_init(vswp, ringp);
- ringp = NULL;
- }
-
- if (ringp != NULL) {
- /*
- * Make sure thread get's running state for
- * this ring.
- */
- mutex_enter(&vqp->vq_lock);
- while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
- (vqp->vq_state != VSW_QUEUE_DRAINED)) {
- cv_wait(&vqp->vq_cv, &vqp->vq_lock);
- }
-
- /*
- * If the thread is not running, cleanup.
- */
- if (vqp->vq_state == VSW_QUEUE_DRAINED) {
- vsw_queue_destroy(vqp);
- vsw_mac_ring_tbl_entry_init(vswp,
- ringp);
- ringp = NULL;
- }
- mutex_exit(&vqp->vq_lock);
- }
-
- mutex_exit(&vswp->mac_ring_lock);
- D1(vswp, "%s: exit", __func__);
- return ((mac_resource_handle_t)ringp);
+ READ_ENTER(&vswp->if_lockrw);
+ if (vswp->if_state & VSW_IF_UP) {
+ RW_EXIT(&vswp->if_lockrw);
+ count = vsw_vlan_frame_untag(vswp, VSW_LOCALDEV, &mp, &mpt);
+ if (count != 0) {
+ mac_rx(vswp->if_mh, NULL, mp);
}
+ } else {
+ RW_EXIT(&vswp->if_lockrw);
+ freemsgchain(mp);
}
- mutex_exit(&vswp->mac_ring_lock);
- /*
- * No slots in the ring table available.
- */
D1(vswp, "%s: exit", __func__);
- return (NULL);
}
+/*
+ * Receive callback routine for a port. Invoked by the MAC layer when
+ * packets are passed up from the physical device for this port.
+ */
+/* ARGSUSED */
static void
-vsw_queue_stop(vsw_queue_t *vqp)
+vsw_port_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
{
- mutex_enter(&vqp->vq_lock);
+ _NOTE(ARGUNUSED(mrh))
- if (vqp->vq_state == VSW_QUEUE_RUNNING) {
- vqp->vq_state = VSW_QUEUE_STOP;
- cv_signal(&vqp->vq_cv);
+ vsw_t *vswp;
+ vsw_port_t *port = arg;
- while (vqp->vq_state != VSW_QUEUE_DRAINED)
- cv_wait(&vqp->vq_cv, &vqp->vq_lock);
- }
+ ASSERT(port != NULL);
+
+ vswp = port->p_vswp;
- vqp->vq_state = VSW_QUEUE_STOPPED;
+ D1(vswp, "vsw_port_rx_cb: enter");
- mutex_exit(&vqp->vq_lock);
+ /*
+ * Send the packets to the peer directly.
+ */
+ (void) vsw_portsend(port, mp);
+
+ D1(vswp, "vsw_port_rx_cb: exit");
}
-static vsw_queue_t *
-vsw_queue_create()
+/*
+ * Send a message out over the physical device
+ * via the MAC layer.
+ *
+ * Returns NULL: with MAC_DROP_ON_NO_DESC, the MAC layer either
+ * transmits or drops the packets.
+ */
+mblk_t *
+vsw_tx_msg(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port)
{
- vsw_queue_t *vqp;
+ mac_client_handle_t mch;
+ mac_unicast_handle_t muh;
- vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
+ READ_MACCL_ENTER(vswp, port, caller);
- mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
- vqp->vq_first = NULL;
- vqp->vq_last = NULL;
- vqp->vq_state = VSW_QUEUE_STOPPED;
+ mch = (caller == VSW_LOCALDEV) ? vswp->mch : port->p_mch;
+ muh = (caller == VSW_LOCALDEV) ? vswp->muh : port->p_muh;
- return (vqp);
-}
+ if ((mch != NULL) && (muh != NULL)) {
+ /* packets are sent or dropped */
+ (void) mac_tx(mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL);
+ }
-static void
-vsw_queue_destroy(vsw_queue_t *vqp)
-{
- cv_destroy(&vqp->vq_cv);
- mutex_destroy(&vqp->vq_lock);
- kmem_free(vqp, sizeof (vsw_queue_t));
+ RW_MACCL_EXIT(vswp, port, caller);
+ return (NULL);
}
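Note the contract: because mac_tx() is called with MAC_DROP_ON_NO_DESC, the MAC layer either transmits or drops the chain, so this function now always returns NULL. A hypothetical caller sketch:

	mblk_t *rest = vsw_tx_msg(vswp, mp, VSW_LOCALDEV, NULL);
	if (rest != NULL)
		freemsg(rest);	/* defensive; NULL is the expected result */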
-static void
-vsw_queue_worker(vsw_mac_ring_t *rrp)
+/*
+ * vsw_port_mac_reconfig -- Cleanup and close the MAC client
+ * and reopen and re-configure the MAC client with new flags etc.
+ * This function is useful for two different purposes:
+ * 1) To update the MAC client with new vlan-ids. This is done
+ * by freeing the existing vlan-ids and reopening with the new
+ * vlan-ids.
+ *
+ * 2) If the Hybrid mode status of a port changes, then the
+ * MAC client needs to be closed and re-opened; otherwise,
+ * Share related resources may not be freed (hybrid mode disabled)
+ * or assigned (hybrid mode enabled). To accomplish this,
+ * this function simply closes and reopens the MAC client.
+ * The reopen will result in using the flags based on the
+ * new hybrid mode of the port.
+ */
+void
+vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans,
+ uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids)
{
- mblk_t *mp;
- vsw_queue_t *vqp = rrp->ring_vqp;
- vsw_t *vswp = rrp->ring_vswp;
-
- mutex_enter(&vqp->vq_lock);
-
- ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);
+ vsw_t *vswp = portp->p_vswp;
+ int rv;
+ D1(vswp, "%s: enter", __func__);
/*
- * Set the state to running, since the thread is now active.
+ * Remove the multi-cast addresses, unicast address
+ * and close the mac-client.
*/
- vqp->vq_state = VSW_QUEUE_RUNNING;
- cv_signal(&vqp->vq_cv);
-
- while (vqp->vq_state == VSW_QUEUE_RUNNING) {
- /*
- * Wait for work to do or the state has changed
- * to not running.
- */
- while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
- (vqp->vq_first == NULL)) {
- cv_wait(&vqp->vq_cv, &vqp->vq_lock);
- }
-
- /*
- * Process packets that we received from the interface.
- */
- if (vqp->vq_first != NULL) {
- mp = vqp->vq_first;
-
- vqp->vq_first = NULL;
- vqp->vq_last = NULL;
-
- mutex_exit(&vqp->vq_lock);
-
- /* switch the chain of packets received */
- vswp->vsw_switch_frame(vswp, mp,
- VSW_PHYSDEV, NULL, NULL);
-
- mutex_enter(&vqp->vq_lock);
+ mutex_enter(&vswp->mac_lock);
+ WRITE_ENTER(&portp->maccl_rwlock);
+ vsw_mac_multicast_remove_all(vswp, portp, VSW_VNETPORT);
+ vsw_unset_hw(vswp, portp, VSW_VNETPORT);
+ vsw_maccl_close(vswp, portp, VSW_VNETPORT);
+
+ if (update_vlans == B_TRUE) {
+ if (portp->nvids != 0) {
+ kmem_free(portp->vids,
+ sizeof (vsw_vlanid_t) * portp->nvids);
+ portp->vids = NULL;
+ portp->nvids = 0;
}
+ portp->vids = new_vids;
+ portp->nvids = new_nvids;
+ portp->pvid = new_pvid;
}
/*
- * We are drained and signal we are done.
+ * Now re-open the mac-client and
+ * configure unicast addr and multicast addrs.
*/
- vqp->vq_state = VSW_QUEUE_DRAINED;
- cv_signal(&vqp->vq_cv);
+ rv = vsw_maccl_open(vswp, portp, VSW_VNETPORT);
+ if (rv != 0) {
+ goto recret;
+ }
- /*
- * Exit lock and drain the remaining packets.
- */
- mutex_exit(&vqp->vq_lock);
+ if (vsw_set_hw(vswp, portp, VSW_VNETPORT)) {
+ cmn_err(CE_NOTE, "!vsw%d: port:%d failed to "
+ "set unicast address\n", vswp->instance, portp->p_instance);
+ goto recret;
+ }
- /*
- * Exit the thread
- */
- thread_exit();
+ vsw_mac_multicast_add_all(vswp, portp, VSW_VNETPORT);
+
+recret:
+ RW_EXIT(&portp->maccl_rwlock);
+ mutex_exit(&vswp->mac_lock);
+ D1(vswp, "%s: exit", __func__);
}
/*
- * static void
- * vsw_rx_queue_cb() - Receive callback routine when
- * vsw_multi_ring_enable is non-zero. Queue the packets
- * to a packet queue for a worker thread to process.
+ * vsw_if_mac_reconfig -- Reconfigure the vsw interface's mac-client
+ * by closing and re-opening it. This function is used to handle the
+ * following two cases:
+ *
+ * 1) Handle the MAC address change for the interface.
+ * 2) Handle vlan update.
*/
-static void
-vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
+void
+vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans,
+ uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids)
{
- vsw_mac_ring_t *ringp = (vsw_mac_ring_t *)mrh;
- vsw_t *vswp = (vsw_t *)arg;
- vsw_queue_t *vqp;
- mblk_t *bp, *last;
-
- ASSERT(mrh != NULL);
- ASSERT(vswp != NULL);
- ASSERT(mp != NULL);
+ int rv;
D1(vswp, "%s: enter", __func__);
-
/*
- * Find the last element in the mblk chain.
+ * Remove the multi-cast addresses, unicast address
+ * and close the mac-client.
*/
- bp = mp;
- do {
- last = bp;
- bp = bp->b_next;
- } while (bp != NULL);
-
- /* Get the queue for the packets */
- vqp = ringp->ring_vqp;
-
- /*
- * Grab the lock such we can queue the packets.
- */
- mutex_enter(&vqp->vq_lock);
-
- if (vqp->vq_state != VSW_QUEUE_RUNNING) {
- freemsgchain(mp);
- mutex_exit(&vqp->vq_lock);
- goto vsw_rx_queue_cb_exit;
+ mutex_enter(&vswp->mac_lock);
+ WRITE_ENTER(&vswp->maccl_rwlock);
+ vsw_mac_multicast_remove_all(vswp, NULL, VSW_LOCALDEV);
+ vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
+ vsw_maccl_close(vswp, NULL, VSW_LOCALDEV);
+
+ if (update_vlans == B_TRUE) {
+ if (vswp->nvids != 0) {
+ kmem_free(vswp->vids,
+ sizeof (vsw_vlanid_t) * vswp->nvids);
+ vswp->vids = NULL;
+ vswp->nvids = 0;
+ }
+ vswp->vids = new_vids;
+ vswp->nvids = new_nvids;
+ vswp->pvid = new_pvid;
}
/*
- * Add the mblk chain to the queue. If there
- * is some mblks in the queue, then add the new
- * chain to the end.
+ * Now re-open the mac-client and
+ * configure unicast addr and multicast addrs.
*/
- if (vqp->vq_first == NULL)
- vqp->vq_first = mp;
- else
- vqp->vq_last->b_next = mp;
-
- vqp->vq_last = last;
+ rv = vsw_maccl_open(vswp, NULL, VSW_LOCALDEV);
+ if (rv != 0) {
+ goto ifrecret;
+ }
- /*
- * Signal the worker thread that there is work to
- * do.
- */
- cv_signal(&vqp->vq_cv);
+ if (vsw_set_hw(vswp, NULL, VSW_LOCALDEV)) {
+ cmn_err(CE_NOTE, "!vsw%d:failed to set unicast address\n",
+ vswp->instance);
+ goto ifrecret;
+ }
- /*
- * Let go of the lock and exit.
- */
- mutex_exit(&vqp->vq_lock);
+ vsw_mac_multicast_add_all(vswp, NULL, VSW_LOCALDEV);
-vsw_rx_queue_cb_exit:
+ifrecret:
+ RW_EXIT(&vswp->maccl_rwlock);
+ mutex_exit(&vswp->mac_lock);
D1(vswp, "%s: exit", __func__);
}
/*
- * receive callback routine. Invoked by MAC layer when there
- * are pkts being passed up from physical device.
+ * vsw_mac_port_reconfig_vlans -- Reconfigure a port to handle
+ * vlan configuration update. As the removal of the last unicast-address,vid
+ * from the MAC client results in releasing all resources, it expects
+ * no Shares to be associated with such a MAC client.
*
- * PERF: It may be more efficient when the card is in promisc
- * mode to check the dest address of the pkts here (against
- * the FDB) rather than checking later. Needs to be investigated.
+ * To handle a vlan configuration update for a port that already has
+ * a Share bound, we need to free that Share prior to reconfiguration,
+ * and initiate the HybridIO setup again after reconfiguration completes.
*/
-static void
-vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
+void
+vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid,
+ vsw_vlanid_t *new_vids, int new_nvids)
{
- _NOTE(ARGUNUSED(mrh))
-
- vsw_t *vswp = (vsw_t *)arg;
+ /*
+ * As the reconfiguration involves the close of
+ * mac client, cleanup HybridIO and later restart
+ * HybridIO setup again.
+ */
+ if (portp->p_hio_enabled == B_TRUE) {
+ vsw_hio_stop_port(portp);
+ }
+ vsw_port_mac_reconfig(portp, B_TRUE, new_pvid, new_vids, new_nvids);
+ if (portp->p_hio_enabled == B_TRUE) {
+ /* reset to setup the HybridIO again. */
+ vsw_hio_port_reset(portp, B_FALSE);
+ }
+}
- ASSERT(vswp != NULL);
+/* Add vlans to MAC client */
+static void
+vsw_mac_add_vlans(vsw_t *vswp, mac_client_handle_t mch, uint8_t *macaddr,
+ uint16_t flags, vsw_vlanid_t *vids, int nvids)
+{
+ vsw_vlanid_t *vidp;
+ mac_diag_t diag;
+ int rv;
+ int i;
- D1(vswp, "vsw_rx_cb: enter");
+ /* Add vlans to the MAC layer */
+ for (i = 0; i < nvids; i++) {
+ vidp = &vids[i];
- /* switch the chain of packets received */
- vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
+ if (vidp->vl_set == B_TRUE) {
+ continue;
+ }
- D1(vswp, "vsw_rx_cb: exit");
+ rv = mac_unicast_add(mch, macaddr, flags,
+ &vidp->vl_muh, vidp->vl_vid, &diag);
+ if (rv != 0) {
+ cmn_err(CE_WARN, "vsw%d: Failed to program"
+ "macaddr,vid(%s, %d) err=%d",
+ vswp->instance, ether_sprintf((void *)macaddr),
+ vidp->vl_vid, rv);
+ } else {
+ vidp->vl_set = B_TRUE;
+ D2(vswp, "%s:programmed macaddr(%s) vid(%d) "
+ "into device %s", __func__,
+ ether_sprintf((void *)macaddr),
+ vidp->vl_vid, vswp->physname);
+ }
+ }
}
-/*
- * Send a message out over the physical device via the MAC layer.
- *
- * Returns any mblks that it was unable to transmit.
- */
-mblk_t *
-vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
+/* Remove vlans from the MAC client */
+static void
+vsw_mac_remove_vlans(mac_client_handle_t mch, vsw_vlanid_t *vids, int nvids)
{
- const mac_txinfo_t *mtp;
+ int i;
+ vsw_vlanid_t *vidp;
- READ_ENTER(&vswp->mac_rwlock);
- if ((vswp->mh == NULL) || (vswp->mstarted == B_FALSE)) {
-
- DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
- RW_EXIT(&vswp->mac_rwlock);
- return (mp);
- } else {
- mtp = vswp->txinfo;
- mp = mtp->mt_fn(mtp->mt_arg, mp);
+ for (i = 0; i < nvids; i++) {
+ vidp = &vids[i];
+ if (vidp->vl_set == B_FALSE) {
+ continue;
+ }
+ mac_unicast_remove(mch, vidp->vl_muh);
+ vidp->vl_set = B_FALSE;
}
- RW_EXIT(&vswp->mac_rwlock);
-
- return (mp);
}
#define ARH_FIXED_LEN 8 /* Length of fixed part of ARP header(see arp.h) */
@@ -1386,7 +1159,7 @@ vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
* vsw_publish_macaddr_count to zero in /etc/system.
*/
void
-vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr)
+vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp)
{
mblk_t *mp;
mblk_t *bp;
@@ -1404,7 +1177,7 @@ vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr)
/* Initialize eth header */
ehp = (struct ether_header *)mp->b_rptr;
bcopy(&etherbroadcastaddr, &ehp->ether_dhost, ETHERADDRL);
- bcopy(addr, &ehp->ether_shost, ETHERADDRL);
+ bcopy(&portp->p_macaddr, &ehp->ether_shost, ETHERADDRL);
ehp->ether_type = htons(ETHERTYPE_REVARP);
/* Initialize arp packet */
@@ -1420,13 +1193,13 @@ vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr)
cp += ARH_FIXED_LEN;
/* Sender's hardware address and protocol address */
- bcopy(addr, cp, ETHERADDRL);
+ bcopy(&portp->p_macaddr, cp, ETHERADDRL);
cp += ETHERADDRL;
bzero(cp, plen); /* INADDR_ANY */
cp += plen;
/* Target hardware address and protocol address */
- bcopy(addr, cp, ETHERADDRL);
+ bcopy(&portp->p_macaddr, cp, ETHERADDRL);
cp += ETHERADDRL;
bzero(cp, plen); /* INADDR_ANY */
cp += plen;
@@ -1441,7 +1214,7 @@ vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr)
}
/* transmit the packet */
- bp = vsw_tx_msg(vswp, bp);
+ bp = vsw_tx_msg(vswp, bp, VSW_VNETPORT, portp);
if (bp != NULL) {
freemsg(bp);
}
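Per the comment at the top of this hunk, the announcement can be disabled by zeroing vsw_publish_macaddr_count in /etc/system; assuming the tunable lives in the vsw module (a guess from the surrounding code), that is:

	set vsw:vsw_publish_macaddr_count = 0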
@@ -1453,50 +1226,18 @@ vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr)
static void
vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu)
{
- mac_prop_t mp;
- uint32_t val;
- int rv;
- uint_t perm_flags = MAC_PROP_PERM_RW;
- mp.mp_id = MAC_PROP_MTU;
- mp.mp_name = mac_mtu_propname;
- mp.mp_flags = 0;
-
- /* Get the mtu of the physical device */
- rv = mac_get_prop(vswp->mh, &mp, (void *)&val, sizeof (uint32_t),
- &perm_flags);
- if (rv != 0) {
- cmn_err(CE_NOTE,
- "!vsw%d: Unable to get the mtu of the physical device:%s\n",
- vswp->instance, vswp->physname);
- return;
- }
-
- /* Return if the mtu is read-only */
- if (perm_flags != MAC_PROP_PERM_RW) {
- cmn_err(CE_NOTE,
- "!vsw%d: Read-only mtu of the physical device:%s\n",
- vswp->instance, vswp->physname);
- return;
- }
-
- /* save the original mtu of physdev to reset it back later if needed */
- vswp->mtu_physdev_orig = val;
-
- if (val == mtu) {
- /* no need to set, as the device already has the right mtu */
- return;
- }
-
- mp.mp_id = MAC_PROP_MTU;
- mp.mp_name = mac_mtu_propname;
- mp.mp_flags = 0;
+ uint_t mtu_orig;
+ int rv;
- /* Set the mtu in the physical device */
- rv = mac_set_prop(vswp->mh, &mp, &mtu, sizeof (uint32_t));
+ rv = mac_set_mtu(vswp->mh, mtu, &mtu_orig);
if (rv != 0) {
cmn_err(CE_NOTE,
"!vsw%d: Unable to set the mtu:%d, in the "
"physical device:%s\n",
vswp->instance, mtu, vswp->physname);
+ return;
}
+
+ /* save the original mtu of physdev to reset it back later if needed */
+ vswp->mtu_physdev_orig = mtu_orig;
}
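
The saved mtu_physdev_orig is presumably consumed on teardown to put the
device back the way it was found. A minimal sketch of such a restore
path, reusing the same mac_set_mtu() entry point (the helper name is
hypothetical):

	/* Hypothetical sketch of the matching restore path */
	static void
	example_mac_restore_mtu(vsw_t *vswp)
	{
		uint_t unused;

		if (vswp->mtu_physdev_orig == 0)
			return;		/* mtu was never changed */

		if (mac_set_mtu(vswp->mh, vswp->mtu_physdev_orig,
		    &unused) != 0) {
			cmn_err(CE_NOTE, "!vsw%d: Unable to restore the "
			    "mtu of the physical device:%s\n",
			    vswp->instance, vswp->physname);
		}
		vswp->mtu_physdev_orig = 0;
	}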
diff --git a/usr/src/uts/sun4v/io/vsw_switching.c b/usr/src/uts/sun4v/io/vsw_switching.c
index 8c4ad6d4d0..5033f0665c 100644
--- a/usr/src/uts/sun4v/io/vsw_switching.c
+++ b/usr/src/uts/sun4v/io/vsw_switching.c
@@ -58,7 +58,6 @@
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
-#include <sys/mac.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
@@ -82,6 +81,8 @@ static int vsw_setup_layer2(vsw_t *);
static int vsw_setup_layer3(vsw_t *);
/* Switching/data transmit routines */
+static void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
+ vsw_port_t *port, mac_resource_handle_t);
static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
vsw_port_t *port, mac_resource_handle_t);
static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
@@ -117,26 +118,26 @@ void vsw_del_mcst_vsw(vsw_t *);
/* Support functions */
static mblk_t *vsw_dupmsgchain(mblk_t *mp);
-static uint32_t vsw_get_same_dest_list(struct ether_header *ehp,
- mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
+static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp);
/*
* Functions imported from other files.
*/
-extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
+extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
extern int vsw_mac_open(vsw_t *vswp);
extern void vsw_mac_close(vsw_t *vswp);
extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
mblk_t *mp, vsw_macrx_flags_t flags);
extern void vsw_set_addrs(vsw_t *vswp);
-extern int vsw_get_hw_maddr(vsw_t *);
-extern int vsw_mac_attach(vsw_t *vswp);
-extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt,
- uint32_t count);
+extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
extern void vsw_hio_init(vsw_t *vswp);
extern void vsw_hio_start_ports(vsw_t *vswp);
+extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port,
+ mcst_addr_t *mcst_p, int type);
+extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
+ mcst_addr_t *mcst_p, int type);
/*
* Tunables used in this file.
@@ -226,9 +227,9 @@ vsw_stop_switching_timeout(vsw_t *vswp)
(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
- WRITE_ENTER(&vswp->mac_rwlock);
+ mutex_enter(&vswp->mac_lock);
vswp->mac_open_retries = 0;
- RW_EXIT(&vswp->mac_rwlock);
+ mutex_exit(&vswp->mac_lock);
}
/*
@@ -246,39 +247,24 @@ vsw_stop_switching_timeout(vsw_t *vswp)
int
vsw_setup_switching(vsw_t *vswp)
{
- int i, rv = 1;
+ int rv = 1;
D1(vswp, "%s: enter", __func__);
/*
* Select best switching mode.
- * Note that we start from the saved smode_idx. This is done as
- * this routine can be called from the timeout handler to retry
- * setting up a specific mode. Currently only the function which
- * sets up layer2/promisc mode returns EAGAIN if the underlying
- * physical device is not available yet, causing retries.
+ * This routine can be called from the timeout handler to retry
+ * setting up a specific mode. Currently only the function which
+ * sets up layer2/promisc mode returns EAGAIN if the underlying
+ * network device is not available yet, causing retries.
*/
- for (i = vswp->smode_idx; i < vswp->smode_num; i++) {
- vswp->smode_idx = i;
- switch (vswp->smode[i]) {
- case VSW_LAYER2:
- case VSW_LAYER2_PROMISC:
- rv = vsw_setup_layer2(vswp);
- break;
-
- case VSW_LAYER3:
- rv = vsw_setup_layer3(vswp);
- break;
-
- default:
- DERR(vswp, "unknown switch mode");
- break;
- }
-
- if ((rv == 0) || (rv == EAGAIN))
- break;
-
- /* all other errors(rv != 0): continue & select the next mode */
+ if (vswp->smode & VSW_LAYER2) {
+ rv = vsw_setup_layer2(vswp);
+ } else if (vswp->smode & VSW_LAYER3) {
+ rv = vsw_setup_layer3(vswp);
+ } else {
+ DERR(vswp, "unknown switch mode");
rv = 1;
}
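
The EAGAIN path feeds a timeout-based retry. A simplified sketch of how
such a handler could re-arm itself (the name and interval below are
illustrative; the real handler also honors swtmout_lock and
swtmout_enabled):

	/* Hypothetical sketch of an EAGAIN-driven retry handler */
	static void
	example_setup_switching_timeout(void *arg)
	{
		vsw_t *vswp = arg;

		if (vsw_setup_switching(vswp) == EAGAIN) {
			/* device not ready yet; try again in a second */
			vswp->swtmout_id =
			    timeout(example_setup_switching_timeout,
			    vswp, drv_usectohz(1000000));
		}
	}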
@@ -290,7 +276,7 @@ vsw_setup_switching(vsw_t *vswp)
}
D2(vswp, "%s: Operating in mode %d", __func__,
- vswp->smode[vswp->smode_idx]);
+ vswp->smode);
D1(vswp, "%s: exit", __func__);
@@ -312,7 +298,12 @@ vsw_setup_layer2(vsw_t *vswp)
D1(vswp, "%s: enter", __func__);
+ /*
+ * Until the network device is successfully opened,
+ * set the switching to use vsw_switch_l2_frame.
+ */
vswp->vsw_switch_frame = vsw_switch_l2_frame;
+ vswp->mac_cl_switching = B_FALSE;
rv = strlen(vswp->physname);
if (rv == 0) {
@@ -320,61 +311,42 @@ vsw_setup_layer2(vsw_t *vswp)
* Physical device name is NULL, which is
* required for layer 2.
*/
- cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
+ cmn_err(CE_WARN, "!vsw%d: no network device name specified",
vswp->instance);
return (EIO);
}
- WRITE_ENTER(&vswp->mac_rwlock);
+ mutex_enter(&vswp->mac_lock);
rv = vsw_mac_open(vswp);
if (rv != 0) {
if (rv != EAGAIN) {
- cmn_err(CE_WARN, "!vsw%d: Unable to open physical "
+ cmn_err(CE_WARN, "!vsw%d: Unable to open network "
"device: %s\n", vswp->instance, vswp->physname);
}
- RW_EXIT(&vswp->mac_rwlock);
+ mutex_exit(&vswp->mac_lock);
return (rv);
}
- if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
- /*
- * Verify that underlying device can support multiple
- * unicast mac addresses.
- */
- rv = vsw_get_hw_maddr(vswp);
- if (rv != 0) {
- goto exit_error;
- }
- }
-
/*
- * Attempt to link into the MAC layer so we can get
- * and send packets out over the physical adapter.
+ * Now that the mac client is available, set the switching
+ * function to vsw_switch_l2_frame_mac_client(), which simply
+ * hands the packets to the MAC layer for switching.
*/
- rv = vsw_mac_attach(vswp);
- if (rv != 0) {
- /*
- * Registration with the MAC layer has failed,
- * so return error so that can fall back to next
- * prefered switching method.
- */
- cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: "
- "%s\n", vswp->instance, vswp->physname);
- goto exit_error;
- }
+ vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client;
+ vswp->mac_cl_switching = B_TRUE;
D1(vswp, "%s: exit", __func__);
- RW_EXIT(&vswp->mac_rwlock);
-
/* Initialize HybridIO related stuff */
vsw_hio_init(vswp);
+
+ mutex_exit(&vswp->mac_lock);
return (0);
exit_error:
vsw_mac_close(vswp);
- RW_EXIT(&vswp->mac_rwlock);
+ mutex_exit(&vswp->mac_lock);
return (EIO);
}
@@ -400,6 +372,31 @@ vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
}
/*
+ * Use the mac client for layer 2 switching.
+ */
+static void
+vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
+ vsw_port_t *port, mac_resource_handle_t mrh)
+{
+ _NOTE(ARGUNUSED(mrh))
+
+ mblk_t *ret_m;
+
+ /*
+ * This switching function is expected to be called only
+ * by the ports or the interface. Packets arriving from
+ * the physical interface have already been switched.
+ */
+ ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV));
+
+ if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) {
+ DERR(vswp, "%s: drop mblks to "
+ "phys dev", __func__);
+ freemsgchain(ret_m);
+ }
+}
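
Callers reach this path through the vsw_switch_frame pointer set up in
vsw_setup_layer2(). For example, a port transmit path would invoke it
along these lines (an illustrative call, not a quote from the source):

	/* a port handing a chain of frames to the switch (sketch) */
	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, portp, NULL);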
+
+/*
* Switch the given ethernet frame when operating in layer 2 mode.
*
* vswp: pointer to the vsw instance
@@ -419,8 +416,6 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
{
struct ether_header *ehp;
mblk_t *bp, *ret_m;
- mblk_t *mpt = NULL;
- uint32_t count;
vsw_fdbe_t *fp;
D1(vswp, "%s: enter (caller %d)", __func__, caller);
@@ -435,8 +430,8 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
bp = mp;
while (bp) {
ehp = (struct ether_header *)bp->b_rptr;
- count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
- ASSERT(count != 0);
+ mp = vsw_get_same_dest_list(ehp, &bp);
+ ASSERT(mp != NULL);
D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
__func__, MBLKSIZE(mp), MBLKL(mp));
@@ -476,7 +471,7 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
* vsw_port (connected to a vnet device -
* VSW_VNETPORT)
*/
- (void) vsw_portsend(fp->portp, mp, mpt, count);
+ (void) vsw_portsend(fp->portp, mp);
/* Release the reference on the fdb entry */
VSW_FDBE_REFRELE(fp);
@@ -517,8 +512,8 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
VSW_MACRX_PROMISC |
VSW_MACRX_COPYMSG);
- if ((ret_m = vsw_tx_msg(vswp, mp))
- != NULL) {
+ if ((ret_m = vsw_tx_msg(vswp, mp,
+ caller, arg)) != NULL) {
DERR(vswp, "%s: drop mblks to "
"phys dev", __func__);
freemsgchain(ret_m);
@@ -539,8 +534,8 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
* Pkt came down the stack, send out
* over physical device.
*/
- if ((ret_m = vsw_tx_msg(vswp, mp))
- != NULL) {
+ if ((ret_m = vsw_tx_msg(vswp, mp,
+ caller, NULL)) != NULL) {
DERR(vswp, "%s: drop mblks to "
"phys dev", __func__);
freemsgchain(ret_m);
@@ -566,8 +561,6 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
{
struct ether_header *ehp;
mblk_t *bp = NULL;
- mblk_t *mpt;
- uint32_t count;
vsw_fdbe_t *fp;
D1(vswp, "%s: enter (caller %d)", __func__, caller);
@@ -587,8 +580,8 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
bp = mp;
while (bp) {
ehp = (struct ether_header *)bp->b_rptr;
- count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
- ASSERT(count != 0);
+ mp = vsw_get_same_dest_list(ehp, &bp);
+ ASSERT(mp != NULL);
D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
__func__, MBLKSIZE(mp), MBLKL(mp));
@@ -601,7 +594,7 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
if (fp != NULL) {
D2(vswp, "%s: sending to target port", __func__);
- (void) vsw_portsend(fp->portp, mp, mpt, count);
+ (void) vsw_portsend(fp->portp, mp);
/* Release the reference on the fdb entry */
VSW_FDBE_REFRELE(fp);
@@ -644,8 +637,7 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
void
vsw_setup_layer2_post_process(vsw_t *vswp)
{
- if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
- (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) {
+ if (vswp->smode & VSW_LAYER2) {
/*
* Program unicst, mcst addrs of vsw
* interface and ports in the physdev.
@@ -676,13 +668,13 @@ vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
* Broadcast message from inside ldoms so send to outside
* world if in either of layer 2 modes.
*/
- if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
- (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
+ if ((vswp->smode & VSW_LAYER2) &&
((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
nmp = vsw_dupmsgchain(mp);
if (nmp) {
- if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
+ if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
+ != NULL) {
DERR(vswp, "%s: dropping pkt(s) "
"consisting of %ld bytes of data for"
" physical device", __func__, MBLKL(ret_m));
@@ -716,20 +708,12 @@ vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
} else {
nmp = vsw_dupmsgchain(mp);
if (nmp) {
- mblk_t *mpt = nmp;
- uint32_t count = 1;
-
- /* Find tail */
- while (mpt->b_next != NULL) {
- mpt = mpt->b_next;
- count++;
- }
/*
* The plist->lockrw is protecting the
* portp from getting destroyed here.
* So, no ref_cnt is incremented here.
*/
- (void) vsw_portsend(portp, nmp, mpt, count);
+ (void) vsw_portsend(portp, nmp);
} else {
DERR(vswp, "vsw_forward_all: nmp NULL");
}
@@ -772,12 +756,12 @@ vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
* over the physical adapter, and then check to see if any other
* vnets are interested in it.
*/
- if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
- (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
+ if ((vswp->smode & VSW_LAYER2) &&
((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
nmp = vsw_dupmsgchain(mp);
if (nmp) {
- if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
+ if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
+ != NULL) {
DERR(vswp, "%s: dropping pkt(s) consisting of "
"%ld bytes of data for physical device",
__func__, MBLKL(ret_m));
@@ -819,21 +803,12 @@ vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
nmp = vsw_dupmsgchain(mp);
if (nmp) {
- mblk_t *mpt = nmp;
- uint32_t count = 1;
-
- /* Find tail */
- while (mpt->b_next != NULL) {
- mpt = mpt->b_next;
- count++;
- }
/*
* The vswp->mfdbrw is protecting the
* portp from getting destroyed here.
* So, no ref_cnt is incremented here.
*/
- (void) vsw_portsend(port, nmp, mpt,
- count);
+ (void) vsw_portsend(port, nmp);
}
} else {
vsw_mac_rx(vswp, NULL,
@@ -970,32 +945,46 @@ vsw_vlan_add_ids(void *arg, int type)
rv = mod_hash_insert(vswp->vlan_hashp,
(mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
(mod_hash_val_t)B_TRUE);
- ASSERT(rv == 0);
+ if (rv != 0) {
+ cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
+ "the interface", vswp->instance, vswp->pvid);
+ }
for (i = 0; i < vswp->nvids; i++) {
rv = mod_hash_insert(vswp->vlan_hashp,
- (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
+ (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid),
(mod_hash_val_t)B_TRUE);
- ASSERT(rv == 0);
+ if (rv != 0) {
+ cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
+ " for the interface", vswp->instance,
+ vswp->vids[i].vl_vid);
+ }
}
} else if (type == VSW_VNETPORT) {
vsw_port_t *portp = (vsw_port_t *)arg;
+ vsw_t *vswp = portp->p_vswp;
rv = mod_hash_insert(portp->vlan_hashp,
(mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
(mod_hash_val_t)B_TRUE);
- ASSERT(rv == 0);
+ if (rv != 0) {
+ cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
+ "the port(%d)", vswp->instance, portp->pvid,
+ portp->p_instance);
+ }
for (i = 0; i < portp->nvids; i++) {
rv = mod_hash_insert(portp->vlan_hashp,
- (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
+ (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid),
(mod_hash_val_t)B_TRUE);
- ASSERT(rv == 0);
+ if (rv != 0) {
+ cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
+ " for the port(%d)", vswp->instance,
+ portp->vids[i].vl_vid, portp->p_instance);
+ }
}
- } else {
- return;
}
}
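
The warnings above rely on mod_hash_insert() failing when the key is
already present; MH_ERR_DUPLICATE is the usual nonzero return for a
duplicate key. A minimal sketch of that contract:

	/* Sketch: mod_hash_insert() returns nonzero if the key exists */
	if (mod_hash_insert(vlan_hashp,
	    (mod_hash_key_t)VLAN_ID_KEY(vid),
	    (mod_hash_val_t)B_TRUE) != 0)
		cmn_err(CE_WARN, "vlan-id %d already present", vid);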
@@ -1021,10 +1010,12 @@ vsw_vlan_remove_ids(void *arg, int type)
}
for (i = 0; i < vswp->nvids; i++) {
- rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->vids[i]);
+ rv = vsw_vlan_lookup(vswp->vlan_hashp,
+ vswp->vids[i].vl_vid);
if (rv == B_TRUE) {
rv = mod_hash_remove(vswp->vlan_hashp,
- (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
+ (mod_hash_key_t)VLAN_ID_KEY(
+ vswp->vids[i].vl_vid),
(mod_hash_val_t *)&vp);
ASSERT(rv == 0);
}
@@ -1043,10 +1034,12 @@ vsw_vlan_remove_ids(void *arg, int type)
}
for (i = 0; i < portp->nvids; i++) {
- rv = vsw_vlan_lookup(portp->vlan_hashp, portp->vids[i]);
+ rv = vsw_vlan_lookup(portp->vlan_hashp,
+ portp->vids[i].vl_vid);
if (rv == B_TRUE) {
rv = mod_hash_remove(portp->vlan_hashp,
- (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
+ (mod_hash_key_t)VLAN_ID_KEY(
+ portp->vids[i].vl_vid),
(mod_hash_val_t *)&vp);
ASSERT(rv == 0);
}
@@ -1097,7 +1090,11 @@ vsw_fdbe_add(vsw_t *vswp, void *port)
*/
rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
(mod_hash_val_t)fp);
- ASSERT(rv == 0);
+ if (rv != 0) {
+ cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for "
+ "the port(%d)", vswp->instance,
+ ether_sprintf(&portp->p_macaddr), portp->p_instance);
+ }
}
/*
@@ -1264,7 +1261,7 @@ vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
* Returns:
* np: head of updated chain of packets
* npt: tail of updated chain of packets
- * rv: count of any packets dropped
+ * rv: count of the packets in the returned list
*/
uint32_t
vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
@@ -1285,6 +1282,7 @@ vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
+
if (type == VSW_LOCALDEV) {
vswp = (vsw_t *)arg;
pvid = vswp->pvid;
@@ -1298,6 +1296,27 @@ vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
pvid = portp->pvid;
}
+ /*
+ * If MAC layer switching is in place, then untagging
+ * is required only if the pvid is not the same as the
+ * default_vlan_id. This is because the MAC layer will
+ * send packets only for the registered vlans.
+ */
+ if ((vswp->mac_cl_switching == B_TRUE) &&
+ (pvid == vswp->default_vlan_id)) {
+ /* simply count and set the tail */
+ count = 1;
+ bp = *np;
+ ASSERT(bp != NULL);
+ while (bp->b_next != NULL) {
+ bp = bp->b_next;
+ count++;
+ }
+ *npt = bp;
+ return (count);
+ }
+
bpn = bph = bpt = NULL;
count = 0;
@@ -1313,45 +1332,67 @@ vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
/*
- * Check if the destination is in the same vlan.
+ * If MAC layer switching is in place, then we need
+ * to untag only if the tagged packet's vlan-id is
+ * the same as the pvid.
*/
- rv = vsw_vlan_lookup(vlan_hashp, vlan_id);
- if (rv == B_FALSE) {
- /* drop the packet */
- freemsg(bp);
- count++;
- continue;
- }
+ if (vswp->mac_cl_switching == B_TRUE) {
- /*
- * Check the frame header if tag/untag is needed.
- */
- if (is_tagged == B_FALSE) {
- /*
- * Untagged frame. We shouldn't have an untagged
- * packet at this point, unless the destination's
- * vlan id is default-vlan-id; if it is not the
- * default-vlan-id, we drop the packet.
- */
- if (vlan_id != vswp->default_vlan_id) {
- /* drop the packet */
- freemsg(bp);
- count++;
- continue;
- }
- } else {
- /*
- * Tagged frame, untag if it's the destination's pvid.
- */
+ /* only tagged packets expected here */
+ ASSERT(is_tagged == B_TRUE);
if (vlan_id == pvid) {
-
bp = vnet_vlan_remove_tag(bp);
if (bp == NULL) {
/* packet dropped */
- count++;
continue;
}
}
+ } else { /* No MAC layer switching */
+
+ /*
+ * Check the frame header if tag/untag is needed.
+ */
+ if (is_tagged == B_FALSE) {
+ /*
+ * Untagged frame. We shouldn't have an
+ * untagged packet at this point, unless
+ * the destination's vlan id is
+ * default-vlan-id; if it is not the
+ * default-vlan-id, we drop the packet.
+ */
+ if (vlan_id != vswp->default_vlan_id) {
+ /* drop the packet */
+ freemsg(bp);
+ continue;
+ }
+ } else { /* Tagged */
+ /*
+ * Tagged frame, untag if it's the
+ * destination's pvid.
+ */
+ if (vlan_id == pvid) {
+
+ bp = vnet_vlan_remove_tag(bp);
+ if (bp == NULL) {
+ /* packet dropped */
+ continue;
+ }
+ } else {
+
+ /*
+ * Check if the destination is in the
+ * same vlan.
+ */
+ rv = vsw_vlan_lookup(vlan_hashp,
+ vlan_id);
+ if (rv == B_FALSE) {
+ /* drop the packet */
+ freemsg(bp);
+ continue;
+ }
+ }
+
+ }
}
/* build a chain of processed packets */
@@ -1361,12 +1402,11 @@ vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
bpt->b_next = bp;
bpt = bp;
}
-
+ count++;
}
*np = bph;
*npt = bpt;
-
return (count);
}
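
vnet_vlan_remove_tag() strips the 4-byte 802.1Q tag in place. The
mechanics look roughly like the following sketch, assuming the frame
header is contiguous in the first mblk; the helper name here is
illustrative:

	/* Sketch of in-place 802.1Q untagging */
	static mblk_t *
	example_remove_vlan_tag(mblk_t *mp)
	{
		struct ether_vlan_header *evhp;

		evhp = (struct ether_vlan_header *)mp->b_rptr;
		ASSERT(ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN);

		/* slide dst/src addresses forward over the 4-byte tag */
		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ,
		    2 * ETHERADDRL);
		mp->b_rptr += VLAN_TAGSZ;
		return (mp);
	}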
@@ -1476,26 +1516,13 @@ vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
* just increments a ref counter (which is
* used when the address is being deleted)
*/
- WRITE_ENTER(&vswp->mac_rwlock);
- if (vswp->mh != NULL) {
- if (mac_multicst_add(vswp->mh,
- (uchar_t *)&mcst_pkt->mca[i])) {
- RW_EXIT(&vswp->mac_rwlock);
- cmn_err(CE_WARN, "!vsw%d: "
- "unable to add multicast "
- "address: %s\n",
- vswp->instance,
- ether_sprintf((void *)
- &mcst_p->mca));
- (void) vsw_del_mcst(vswp,
- VSW_VNETPORT, addr, port);
- kmem_free(mcst_p,
- sizeof (*mcst_p));
- return (1);
- }
- mcst_p->mac_added = B_TRUE;
+ if (vsw_mac_multicast_add(vswp, port, mcst_p,
+ VSW_VNETPORT)) {
+ (void) vsw_del_mcst(vswp,
+ VSW_VNETPORT, addr, port);
+ kmem_free(mcst_p, sizeof (*mcst_p));
+ return (1);
}
- RW_EXIT(&vswp->mac_rwlock);
mutex_enter(&port->mca_lock);
mcst_p->nextp = port->mcap;
@@ -1530,24 +1557,8 @@ vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
* if other ports are interested in this
* address.
*/
- WRITE_ENTER(&vswp->mac_rwlock);
- if (vswp->mh != NULL && mcst_p->mac_added) {
- if (mac_multicst_remove(vswp->mh,
- (uchar_t *)&mcst_pkt->mca[i])) {
- RW_EXIT(&vswp->mac_rwlock);
- cmn_err(CE_WARN, "!vsw%d: "
- "unable to remove mcast "
- "address: %s\n",
- vswp->instance,
- ether_sprintf((void *)
- &mcst_p->mca));
- kmem_free(mcst_p,
- sizeof (*mcst_p));
- return (1);
- }
- mcst_p->mac_added = B_FALSE;
- }
- RW_EXIT(&vswp->mac_rwlock);
+ vsw_mac_multicast_remove(vswp, port, mcst_p,
+ VSW_VNETPORT);
kmem_free(mcst_p, sizeof (*mcst_p));
} else {
@@ -1780,13 +1791,7 @@ vsw_del_mcst_port(vsw_port_t *port)
* if other ports are interested in this
* address.
*/
- WRITE_ENTER(&vswp->mac_rwlock);
- if (vswp->mh != NULL && mcap->mac_added) {
- (void) mac_multicst_remove(vswp->mh,
- (uchar_t *)&mcap->mca);
- }
- RW_EXIT(&vswp->mac_rwlock);
-
+ vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT);
kmem_free(mcap, sizeof (*mcap));
mutex_enter(&port->mca_lock);
@@ -1829,11 +1834,9 @@ vsw_del_mcst_vsw(vsw_t *vswp)
D1(vswp, "%s: exit", __func__);
}
-static uint32_t
-vsw_get_same_dest_list(struct ether_header *ehp,
- mblk_t **rhead, mblk_t **rtail, mblk_t **mpp)
+static mblk_t *
+vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp)
{
- uint32_t count = 0;
mblk_t *bp;
mblk_t *nbp;
mblk_t *head = NULL;
@@ -1860,16 +1863,12 @@ vsw_get_same_dest_list(struct ether_header *ehp,
tail->b_next = bp;
tail = bp;
}
- count++;
} else {
prev = bp;
}
bp = nbp;
}
- *rhead = head;
- *rtail = tail;
- DTRACE_PROBE1(vsw_same_dest, int, count);
- return (count);
+ return (head);
}
static mblk_t *
diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c
index 694930fe28..df698ebe69 100644
--- a/usr/src/uts/sun4v/os/mach_startup.c
+++ b/usr/src/uts/sun4v/os/mach_startup.c
@@ -308,18 +308,18 @@ mach_hw_copy_limit(void)
}
/*
- * We need to enable soft ring functionality on Niagara platform since
- * one strand can't handle interrupts for a 1Gb NIC. Set the tunable
- * ip_squeue_soft_ring by default on this platform. We can also set
- * ip_threads_per_cpu to track number of threads per core. The variables
- * themselves are defined in space.c and used by IP module
+ * We need to enable soft ring functionality on Niagara platforms since
+ * one strand can't handle interrupts for a 1Gb NIC. So set the tunable
+ * mac_soft_ring_enable by default on these platforms.
+ * The mac_soft_ring_enable variable is defined in space.c and used
+ * by the MAC module. This tunable, in concert with mac_soft_ring_count
+ * (declared in mac.h), configures the number of fanout soft rings for
+ * a link.
*/
-extern uint_t ip_threads_per_cpu;
-extern boolean_t ip_squeue_soft_ring;
+extern boolean_t mac_soft_ring_enable;
void
startup_platform(void)
{
- ip_squeue_soft_ring = B_TRUE;
+ mac_soft_ring_enable = B_TRUE;
if (clock_tick_threshold == 0)
clock_tick_threshold = SUN4V_CLOCK_TICK_THRESHOLD;
if (clock_tick_ncpus == 0)
diff --git a/usr/src/uts/sun4v/sys/vnet_res.h b/usr/src/uts/sun4v/sys/vnet_res.h
index 035ad1328c..b5cd4472fb 100644
--- a/usr/src/uts/sun4v/sys/vnet_res.h
+++ b/usr/src/uts/sun4v/sys/vnet_res.h
@@ -27,12 +27,12 @@
#ifndef _VNET_RES_H
#define _VNET_RES_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
+#include <sys/mac_provider.h>
+
/*
* Vio network resource types.
* VIO_NET_RES_LDC_SERVICE:
diff --git a/usr/src/uts/sun4v/sys/vsw.h b/usr/src/uts/sun4v/sys/vsw.h
index 069e26d60a..456480f909 100644
--- a/usr/src/uts/sun4v/sys/vsw.h
+++ b/usr/src/uts/sun4v/sys/vsw.h
@@ -40,6 +40,7 @@ extern "C" {
#include <sys/vio_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/ethernet.h>
+#include <sys/mac_client.h>
#include <sys/vio_util.h>
#include <sys/vgen_stats.h>
#include <sys/vsw_ldc.h>
@@ -59,57 +60,6 @@ extern "C" {
#define VSW_LOCALDEV 4 /* vsw configured as an eth interface */
/*
- * Vsw queue -- largely modeled after squeue
- *
- * VSW_QUEUE_RUNNING, vqueue thread for queue is running.
- * VSW_QUEUE_DRAINED, vqueue thread has drained current work and is exiting.
- * VSW_QUEUE_STOP, request for the vqueue thread to stop.
- * VSW_QUEUE_STOPPED, vqueue thread is not running.
- */
-#define VSW_QUEUE_RUNNING 0x01
-#define VSW_QUEUE_DRAINED 0x02
-#define VSW_QUEUE_STOP 0x04
-#define VSW_QUEUE_STOPPED 0x08
-
-typedef struct vsw_queue_s {
- kmutex_t vq_lock; /* Lock, before using any member. */
- kcondvar_t vq_cv; /* Async threads block on. */
- uint32_t vq_state; /* State flags. */
-
- mblk_t *vq_first; /* First mblk chain or NULL. */
- mblk_t *vq_last; /* Last mblk chain. */
-
- processorid_t vq_bind; /* Process to bind to */
- kthread_t *vq_worker; /* Queue's thread */
-} vsw_queue_t;
-
-/*
- * VSW MAC Ring Resources.
- * MAC Ring resource is composed of this state structure and
- * a kernel thread to perform the processing of the ring.
- */
-typedef struct vsw_mac_ring_s {
- uint32_t ring_state;
-
- mac_blank_t ring_blank;
- void *ring_arg;
-
- vsw_queue_t *ring_vqp;
- struct vsw *ring_vswp;
-} vsw_mac_ring_t;
-
-/*
- * Maximum Ring Resources.
- */
-#define VSW_MAC_RX_RINGS 0x40
-
-/*
- * States for entry in ring table.
- */
-#define VSW_MAC_RING_FREE 1
-#define VSW_MAC_RING_INUSE 2
-
-/*
* Number of hash chains in the multicast forwarding database.
*/
#define VSW_NCHAINS 8
@@ -139,6 +89,15 @@ typedef struct vsw_mac_ring_s {
#define VSW_PRI_ETH_DEFINED(vswp) ((vswp)->pri_num_types != 0)
/*
+ * vlan-id information.
+ */
+typedef struct vsw_vlanid {
+ uint16_t vl_vid; /* vlan-id */
+ mac_unicast_handle_t vl_muh; /* mac unicast handle */
+ boolean_t vl_set; /* set? */
+} vsw_vlanid_t;
+
+/*
* vsw instance state information.
*/
typedef struct vsw {
@@ -147,9 +106,7 @@ typedef struct vsw {
uint64_t regprop; /* "reg" property */
struct vsw *next; /* next in list */
char physname[LIFNAMSIZ]; /* phys-dev */
- uint8_t smode[NUM_SMODES]; /* switching mode */
- int smode_idx; /* curr pos in smode array */
- int smode_num; /* # of modes specified */
+ uint8_t smode; /* switching mode */
kmutex_t swtmout_lock; /* setup switching tmout lock */
boolean_t swtmout_enabled; /* setup switching tmout on */
timeout_id_t swtmout_id; /* setup switching tmout id */
@@ -174,24 +131,16 @@ typedef struct vsw {
vsw_port_t *, mac_resource_handle_t);
/* mac layer */
- krwlock_t mac_rwlock; /* protect fields below */
+ kmutex_t mac_lock; /* protect mh */
mac_handle_t mh;
- mac_rx_handle_t mrh;
- multiaddress_capab_t maddr; /* Multiple uni addr capable */
- const mac_txinfo_t *txinfo; /* MAC tx routine */
- boolean_t mstarted; /* Mac Started? */
- boolean_t mresources; /* Mac Resources cb? */
-
- /*
- * MAC Ring Resources.
- */
- kmutex_t mac_ring_lock; /* Lock for the table. */
- uint32_t mac_ring_tbl_sz;
- vsw_mac_ring_t *mac_ring_tbl; /* Mac ring table. */
-
- kmutex_t hw_lock; /* sync access to HW */
+ krwlock_t maccl_rwlock; /* protect fields below */
+ mac_client_handle_t mch; /* mac client handle */
+ mac_unicast_handle_t muh; /* mac unicast handle */
+
boolean_t recfg_reqd; /* Reconfig of addrs needed */
- int promisc_cnt;
+
+ /* mac layer switching flag */
+ boolean_t mac_cl_switching;
/* Machine Description updates */
mdeg_node_spec_t *inst_spec;
@@ -204,8 +153,7 @@ typedef struct vsw {
krwlock_t if_lockrw;
uint8_t if_state; /* interface state */
- mac_addr_slot_t addr_slot; /* Unicast address slot */
- int addr_set; /* Addr set where */
+ boolean_t addr_set; /* is addr set to HW */
/* multicast addresses when configured as eth interface */
kmutex_t mca_lock; /* multicast lock */
@@ -216,7 +164,7 @@ typedef struct vsw {
vio_mblk_pool_t *pri_tx_vmp; /* tx priority mblk pool */
uint16_t default_vlan_id; /* default vlan id */
uint16_t pvid; /* port vlan id (untagged) */
- uint16_t *vids; /* vlan ids (tagged) */
+ vsw_vlanid_t *vids; /* vlan ids (tagged) */
uint16_t nvids; /* # of vids */
uint32_t vids_size; /* size alloc'd for vids list */
diff --git a/usr/src/uts/sun4v/sys/vsw_hio.h b/usr/src/uts/sun4v/sys/vsw_hio.h
index 70b79ea04e..1521d6cff9 100644
--- a/usr/src/uts/sun4v/sys/vsw_hio.h
+++ b/usr/src/uts/sun4v/sys/vsw_hio.h
@@ -55,10 +55,6 @@ typedef struct vsw_share {
uint64_t vs_macaddr; /* Associated MAC addr */
uint64_t vs_cookie; /* Share Cookie from alloc_share */
- /* physdev's share related info */
- mac_share_handle_t vs_shdl; /* HIO share handle */
- mac_group_info_t vs_rxginfo; /* RX group info */
- uint64_t vs_gnum; /* RX group number */
} vsw_share_t;
#define VSW_SHARE_FREE 0x0
@@ -68,11 +64,8 @@ typedef struct vsw_share {
/* Hybrid related info */
typedef struct vsw_hio {
- mac_capab_rings_t vh_rcapab; /* Rings capability data */
- mac_capab_share_t vh_scapab; /* Share capability data */
- vsw_share_t *vh_shares; /* Array of Shares */
uint32_t vh_num_shares; /* Number of shares available */
-
+ vsw_share_t *vh_shares; /* Array of Shares */
uint32_t vh_kstat_size; /* size for the whole kstats */
vsw_hio_kstats_t *vh_kstatsp; /* stats for vsw hio */
kstat_t *vh_ksp; /* kstats */
diff --git a/usr/src/uts/sun4v/sys/vsw_ldc.h b/usr/src/uts/sun4v/sys/vsw_ldc.h
index 31344465f5..46d04fac10 100644
--- a/usr/src/uts/sun4v/sys/vsw_ldc.h
+++ b/usr/src/uts/sun4v/sys/vsw_ldc.h
@@ -362,10 +362,6 @@ typedef struct mcst_addr {
#define VSW_PORT_DETACHING 0x2 /* In process of being detached */
#define VSW_PORT_DETACHABLE 0x4 /* Safe to detach */
-#define VSW_ADDR_UNSET 0x0 /* Addr not set */
-#define VSW_ADDR_HW 0x1 /* Addr programmed in HW */
-#define VSW_ADDR_PROMISC 0x2 /* Card in promisc to see addr */
-
/* port information associated with a vsw */
typedef struct vsw_port {
int p_instance; /* port instance */
@@ -382,20 +378,22 @@ typedef struct vsw_port {
kmutex_t state_lock;
kcondvar_t state_cv;
+ krwlock_t maccl_rwlock; /* protect fields below */
+ mac_client_handle_t p_mch; /* mac client handle */
+ mac_unicast_handle_t p_muh; /* mac unicast handle */
+
kmutex_t mca_lock; /* multicast lock */
mcst_addr_t *mcap; /* list of multicast addrs */
- mac_addr_slot_t addr_slot; /* Unicast address slot */
- int addr_set; /* Addr set where */
+ boolean_t addr_set; /* is addr set in HW */
/*
* mac address of the port & connected device
*/
struct ether_addr p_macaddr;
uint16_t pvid; /* port vlan id (untagged) */
- uint16_t *vids; /* vlan ids (tagged) */
+ struct vsw_vlanid *vids; /* vlan ids (tagged) */
uint16_t nvids; /* # of vids */
- uint32_t vids_size; /* size alloc'd for vids list */
mod_hash_t *vlan_hashp; /* vlan hash table */
uint32_t vlan_nchains; /* # of vlan hash chains */
@@ -444,7 +442,7 @@ static struct ether_addr etherbroadcastaddr = {
};
#define IS_BROADCAST(ehp) \
- (ether_cmp(&ehp->ether_dhost, &etherbroadcastaddr) == 0)
+ (bcmp(&ehp->ether_dhost, &etherbroadcastaddr, ETHERADDRL) == 0)
#define IS_MULTICAST(ehp) \
((ehp->ether_dhost.ether_addr_octet[0] & 01) == 1)