Diffstat (limited to 'usr/src/uts/common')
-rw-r--r--  usr/src/uts/common/Makefile.files | 76
-rw-r--r--  usr/src/uts/common/Makefile.rules | 85
-rw-r--r--  usr/src/uts/common/brand/lx/autofs/lx_autofs.c | 3174
-rw-r--r--  usr/src/uts/common/brand/lx/autofs/lxautofs.conf | 14
-rw-r--r--  usr/src/uts/common/brand/lx/cgroups/cgrps.h | 222
-rw-r--r--  usr/src/uts/common/brand/lx/cgroups/cgrps_node.c | 1014
-rw-r--r--  usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c | 1071
-rw-r--r--  usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c | 1552
-rw-r--r--  usr/src/uts/common/brand/lx/devfs/lxd.h | 244
-rw-r--r--  usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c | 368
-rw-r--r--  usr/src/uts/common/brand/lx/devfs/lxd_node.c | 1003
-rw-r--r--  usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c | 860
-rw-r--r--  usr/src/uts/common/brand/lx/devfs/lxd_vnops.c | 1520
-rw-r--r--  usr/src/uts/common/brand/lx/dtrace/lx_systrace.c | 499
-rw-r--r--  usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf | 27
-rw-r--r--  usr/src/uts/common/brand/lx/io/lx_netlink.c | 2232
-rw-r--r--  usr/src/uts/common/brand/lx/io/lx_ptm.c | 1188
-rw-r--r--  usr/src/uts/common/brand/lx/io/lx_ptm.conf | 27
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_acct.c | 198
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_acl.c | 213
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_audit.c | 1604
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_brand.c | 2701
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_lockd.c | 338
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_misc.c | 1196
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_pid.c | 499
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_ptrace.c | 2710
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_signal.c | 50
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_syscall.c | 1229
-rw-r--r--  usr/src/uts/common/brand/lx/procfs/lx_proc.h | 378
-rw-r--r--  usr/src/uts/common/brand/lx/procfs/lx_prsubr.c | 917
-rw-r--r--  usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c | 377
-rw-r--r--  usr/src/uts/common/brand/lx/procfs/lx_prvnops.c | 8377
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_acl.h | 45
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_audit.h | 38
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_autofs.h | 511
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h | 162
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_brand.h | 778
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_fcntl.h | 161
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_futex.h | 143
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_impl.h | 52
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_ldt.h | 91
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_misc.h | 136
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_ptm.h | 44
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_siginfo.h | 190
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_signal.h | 32
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_socket.h | 444
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_syscalls.h | 341
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_types.h | 144
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_userhz.h | 64
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_access.c | 223
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_aio.c | 1345
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_brk.c | 55
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_chmod.c | 107
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_chown.c | 180
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_clone.c | 513
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_close.c | 30
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_cpu.c | 36
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_dup.c | 53
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_epoll.c | 303
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_eventfd.c | 126
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_fadvise.c | 103
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_fallocate.c | 251
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_fcntl.c | 701
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_futex.c | 1665
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_getcwd.c | 52
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_getdents.c | 416
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_getpid.c | 75
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_getrandom.c | 33
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_id.c | 509
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_ioctl.c | 1865
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_ioprio.c | 66
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_kill.c | 408
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_link.c | 194
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_lseek.c | 82
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_mem.c | 1118
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_miscsys.c | 495
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_mkdir.c | 38
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c | 121
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_mount.c | 675
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_open.c | 288
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_personality.c | 112
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_pgrp.c | 189
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_pipe.c | 309
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_poll.c | 786
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_prctl.c | 288
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_priority.c | 192
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_rename.c | 39
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_rlimit.c | 587
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_rw.c | 956
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_sched.c | 1161
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_socket.c | 4537
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_splice.c | 491
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_stat.c | 486
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_sync.c | 86
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c | 207
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_thread_area.c | 194
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_time.c | 72
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_timer.c | 637
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_umask.c | 52
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_uname.c | 82
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_wait.c | 377
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_xattr.c | 519
-rw-r--r--  usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h | 198
-rw-r--r--  usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c | 443
-rw-r--r--  usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c | 365
-rw-r--r--  usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c | 2165
-rw-r--r--  usr/src/uts/common/brand/sn1/sn1_brand.c | 106
-rw-r--r--  usr/src/uts/common/brand/sn1/sn1_brand.h | 7
-rw-r--r--  usr/src/uts/common/brand/solaris10/s10_brand.c | 107
-rw-r--r--  usr/src/uts/common/brand/solaris10/s10_brand.h | 6
-rw-r--r--  usr/src/uts/common/conf/param.c | 29
-rw-r--r--  usr/src/uts/common/contract/process.c | 24
-rw-r--r--  usr/src/uts/common/crypto/api/kcf_random.c | 4
-rw-r--r--  usr/src/uts/common/crypto/core/kcf_sched.c | 6
-rw-r--r--  usr/src/uts/common/crypto/io/dprov.c | 14
-rw-r--r--  usr/src/uts/common/crypto/io/sha2_mod.c | 74
-rw-r--r--  usr/src/uts/common/disp/cmt.c | 8
-rw-r--r--  usr/src/uts/common/disp/cpucaps.c | 285
-rw-r--r--  usr/src/uts/common/disp/cpupart.c | 14
-rw-r--r--  usr/src/uts/common/disp/disp.c | 228
-rw-r--r--  usr/src/uts/common/disp/fx.c | 12
-rw-r--r--  usr/src/uts/common/disp/priocntl.c | 4
-rw-r--r--  usr/src/uts/common/disp/rt.c | 9
-rw-r--r--  usr/src/uts/common/disp/rt_dptbl.c | 4
-rw-r--r--  usr/src/uts/common/disp/thread.c | 372
-rw-r--r--  usr/src/uts/common/disp/thread_intr.c | 37
-rw-r--r--  usr/src/uts/common/dtrace/dtrace.c | 23
-rw-r--r--  usr/src/uts/common/dtrace/sdt_subr.c | 33
-rw-r--r--  usr/src/uts/common/exec/aout/aout.c | 5
-rw-r--r--  usr/src/uts/common/exec/elf/elf.c | 1433
-rw-r--r--  usr/src/uts/common/exec/elf/elf_impl.h | 17
-rw-r--r--  usr/src/uts/common/exec/elf/elf_notes.c | 2
-rw-r--r--  usr/src/uts/common/exec/intp/intp.c | 37
-rw-r--r--  usr/src/uts/common/exec/java/java.c | 5
-rw-r--r--  usr/src/uts/common/exec/shbin/shbin.c | 9
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_netops.c | 259
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_plugin.c | 948
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_subr.c | 209
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_vfsops.c | 23
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_vnops.c | 46
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_zvolops.c | 6
-rw-r--r--  usr/src/uts/common/fs/fem.c | 688
-rw-r--r--  usr/src/uts/common/fs/fifofs/fifosubr.c | 19
-rw-r--r--  usr/src/uts/common/fs/fifofs/fifovnops.c | 40
-rw-r--r--  usr/src/uts/common/fs/fs_subr.c | 40
-rw-r--r--  usr/src/uts/common/fs/fs_subr.h | 4
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c | 640
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c | 127
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c | 614
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c | 1450
-rw-r--r--  usr/src/uts/common/fs/lookup.c | 2
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_subr.c | 526
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_vfsops.c | 367
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_vnops.c | 3103
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxproc.h | 278
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs3_vfsops.c | 1
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs3_vnops.c | 14
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_vfsops.c | 1
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_vnops.c | 48
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_auth.c | 26
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_server.c | 3
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_sys.c | 3
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_vfsops.c | 1
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_vnops.c | 24
-rw-r--r--  usr/src/uts/common/fs/pcfs/pc_dir.c | 5
-rw-r--r--  usr/src/uts/common/fs/pcfs/pc_vnops.c | 5
-rw-r--r--  usr/src/uts/common/fs/portfs/port.c | 36
-rw-r--r--  usr/src/uts/common/fs/proc/prargv.c | 441
-rw-r--r--  usr/src/uts/common/fs/proc/prcontrol.c | 16
-rw-r--r--  usr/src/uts/common/fs/proc/prdata.h | 4
-rw-r--r--  usr/src/uts/common/fs/proc/prioctl.c | 32
-rw-r--r--  usr/src/uts/common/fs/proc/prsubr.c | 65
-rw-r--r--  usr/src/uts/common/fs/proc/prvnops.c | 159
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_kshare.c | 1
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_server.c | 16
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon.c | 8
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon_sops.c | 50
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon_subr.c | 74
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockfilter.c | 24
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockfilter_impl.h | 2
-rw-r--r--  usr/src/uts/common/fs/sockfs/socksubr.c | 7
-rw-r--r--  usr/src/uts/common/fs/sockfs/socksyscalls.c | 120
-rw-r--r--  usr/src/uts/common/fs/sockfs/socktpi_impl.h | 3
-rw-r--r--  usr/src/uts/common/fs/swapfs/swap_subr.c | 6
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_dir.c | 61
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_subr.c | 136
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_tnode.c | 70
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_vfsops.c | 278
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_vnops.c | 99
-rw-r--r--  usr/src/uts/common/fs/udfs/udf_dir.c | 6
-rw-r--r--  usr/src/uts/common/fs/udfs/udf_vnops.c | 14
-rw-r--r--  usr/src/uts/common/fs/ufs/ufs_vnops.c | 29
-rw-r--r--  usr/src/uts/common/fs/vfs.c | 4
-rw-r--r--  usr/src/uts/common/fs/vnode.c | 125
-rw-r--r--  usr/src/uts/common/fs/zfs/abd.c | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_recv.c | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_send.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dir.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c | 119
-rw-r--r--  usr/src/uts/common/fs/zfs/sa.c | 12
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c | 36
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c | 36
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/metaslab.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/metaslab_impl.h | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa_impl.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_zone.h | 63
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h | 14
-rw-r--r--  usr/src/uts/common/fs/zfs/txg.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_initialize.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c | 12
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_dir.c | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c | 165
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c | 12
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c | 47
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c | 1419
-rw-r--r--  usr/src/uts/common/fs/zfs/zil.c | 20
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c | 30
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c | 126
-rw-r--r--  usr/src/uts/common/inet/bpf.h | 49
-rw-r--r--  usr/src/uts/common/inet/bpf_filter.c (renamed from usr/src/uts/common/io/bpf/bpf_filter.c) | 46
-rw-r--r--  usr/src/uts/common/inet/inet_hash.h | 37
-rw-r--r--  usr/src/uts/common/inet/ip.h | 17
-rw-r--r--  usr/src/uts/common/inet/ip/conn_opt.c | 22
-rw-r--r--  usr/src/uts/common/inet/ip/icmp.c | 117
-rw-r--r--  usr/src/uts/common/inet/ip/icmp_opt_data.c | 6
-rw-r--r--  usr/src/uts/common/inet/ip/ip.c | 103
-rw-r--r--  usr/src/uts/common/inet/ip/ip6_input.c | 22
-rw-r--r--  usr/src/uts/common/inet/ip/ip6_output.c | 13
-rw-r--r--  usr/src/uts/common/inet/ip/ip_if.c | 184
-rw-r--r--  usr/src/uts/common/inet/ip/ip_input.c | 32
-rw-r--r--  usr/src/uts/common/inet/ip/ip_output.c | 8
-rw-r--r--  usr/src/uts/common/inet/ip/ip_squeue.c | 14
-rw-r--r--  usr/src/uts/common/inet/ip/ipclassifier.c | 165
-rw-r--r--  usr/src/uts/common/inet/ip/sadb.c | 7
-rw-r--r--  usr/src/uts/common/inet/ip_impl.h | 23
-rw-r--r--  usr/src/uts/common/inet/ipclassifier.h | 4
-rw-r--r--  usr/src/uts/common/inet/ipd/ipd.c | 6
-rw-r--r--  usr/src/uts/common/inet/ipf/ip_fil_solaris.c | 479
-rw-r--r--  usr/src/uts/common/inet/ipf/ipf.conf | 5
-rw-r--r--  usr/src/uts/common/inet/ipf/netinet/ipf_stack.h | 27
-rw-r--r--  usr/src/uts/common/inet/ipf/solaris.c | 1
-rw-r--r--  usr/src/uts/common/inet/mib2.h | 59
-rw-r--r--  usr/src/uts/common/inet/rawip_impl.h | 6
-rw-r--r--  usr/src/uts/common/inet/sockmods/datafilt.c | 116
-rw-r--r--  usr/src/uts/common/inet/sockmods/sockmod_pfp.c | 11
-rw-r--r--  usr/src/uts/common/inet/squeue.c | 456
-rw-r--r--  usr/src/uts/common/inet/tcp.h | 29
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c | 87
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_bind.c | 227
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_fusion.c | 9
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_input.c | 198
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_opt_data.c | 128
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_output.c | 68
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_socket.c | 33
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_stats.c | 112
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_time_wait.c | 4
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_timers.c | 47
-rw-r--r--  usr/src/uts/common/inet/tcp_impl.h | 96
-rw-r--r--  usr/src/uts/common/inet/tcp_stats.h | 21
-rw-r--r--  usr/src/uts/common/inet/udp/udp.c | 161
-rw-r--r--  usr/src/uts/common/inet/udp/udp_opt_data.c | 4
-rw-r--r--  usr/src/uts/common/inet/udp_impl.h | 7
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_grp.c | 684
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_port.c | 154
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_recv.c | 17
-rw-r--r--  usr/src/uts/common/io/bpf/bpf_wrap.c | 35
-rw-r--r--  usr/src/uts/common/io/bridge.c | 192
-rw-r--r--  usr/src/uts/common/io/chxge/ch.c | 8
-rw-r--r--  usr/src/uts/common/io/cons.c | 16
-rw-r--r--  usr/src/uts/common/io/cpqary3/cpqary3.c | 2
-rw-r--r--  usr/src/uts/common/io/devpoll.c | 366
-rw-r--r--  usr/src/uts/common/io/dld/dld_drv.c | 64
-rw-r--r--  usr/src/uts/common/io/dld/dld_proto.c | 154
-rw-r--r--  usr/src/uts/common/io/dld/dld_str.c | 104
-rw-r--r--  usr/src/uts/common/io/dls/dls.c | 140
-rw-r--r--  usr/src/uts/common/io/dls/dls_link.c | 129
-rw-r--r--  usr/src/uts/common/io/dls/dls_mgmt.c | 461
-rw-r--r--  usr/src/uts/common/io/dls/dls_stat.c | 172
-rw-r--r--  usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE | 32
-rw-r--r--  usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip | 1
-rw-r--r--  usr/src/uts/common/io/dr_sas/dr_sas.c | 5510
-rw-r--r--  usr/src/uts/common/io/dr_sas/dr_sas.conf | 15
-rw-r--r--  usr/src/uts/common/io/dr_sas/dr_sas.h | 1766
-rw-r--r--  usr/src/uts/common/io/dr_sas/dr_sas_list.h | 212
-rw-r--r--  usr/src/uts/common/io/elxl/elxl.c | 6
-rw-r--r--  usr/src/uts/common/io/eventfd.c | 88
-rw-r--r--  usr/src/uts/common/io/fibre-channel/impl/fctl.c | 6
-rw-r--r--  usr/src/uts/common/io/gld.c | 7
-rw-r--r--  usr/src/uts/common/io/gsqueue/gsqueue.c | 608
-rw-r--r--  usr/src/uts/common/io/hook.c | 2
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_gld.c | 103
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_intr.c | 190
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_main.c | 615
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_stats.c | 77
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_sw.h | 115
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_transceiver.c | 1160
-rw-r--r--  usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c | 7
-rw-r--r--  usr/src/uts/common/io/inotify.c | 1555
-rw-r--r--  usr/src/uts/common/io/inotify.conf | 16
-rw-r--r--  usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c | 5
-rw-r--r--  usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h | 3
-rw-r--r--  usr/src/uts/common/io/ixgbe/core/ixgbe_api.c | 25
-rw-r--r--  usr/src/uts/common/io/ixgbe/core/ixgbe_api.h | 5
-rw-r--r--  usr/src/uts/common/io/ixgbe/core/ixgbe_common.c | 254
-rw-r--r--  usr/src/uts/common/io/ixgbe/core/ixgbe_common.h | 7
-rw-r--r--  usr/src/uts/common/io/ixgbe/core/ixgbe_type.h | 5
-rw-r--r--  usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c | 29
-rw-r--r--  usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h | 3
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_main.c | 363
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_sw.h | 20
-rw-r--r--  usr/src/uts/common/io/ksocket/ksocket.c | 14
-rw-r--r--  usr/src/uts/common/io/ksocket/ksocket_impl.h | 6
-rw-r--r--  usr/src/uts/common/io/ksyms.c | 9
-rw-r--r--  usr/src/uts/common/io/mac/mac.c | 1166
-rw-r--r--  usr/src/uts/common/io/mac/mac_bcast.c | 10
-rw-r--r--  usr/src/uts/common/io/mac/mac_client.c | 258
-rw-r--r--  usr/src/uts/common/io/mac/mac_datapath_setup.c | 368
-rw-r--r--  usr/src/uts/common/io/mac/mac_flow.c | 3
-rw-r--r--  usr/src/uts/common/io/mac/mac_protect.c | 118
-rw-r--r--  usr/src/uts/common/io/mac/mac_provider.c | 76
-rw-r--r--  usr/src/uts/common/io/mac/mac_sched.c | 253
-rw-r--r--  usr/src/uts/common/io/mac/mac_soft_ring.c | 42
-rw-r--r--  usr/src/uts/common/io/mac/mac_stat.c | 14
-rw-r--r--  usr/src/uts/common/io/mac/mac_util.c | 1361
-rw-r--r--  usr/src/uts/common/io/mem.c | 11
-rw-r--r--  usr/src/uts/common/io/mr_sas/mr_sas.conf | 8
-rw-r--r--  usr/src/uts/common/io/nfp/THIRDPARTYLICENSE | 19
-rw-r--r--  usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip | 1
-rw-r--r--  usr/src/uts/common/io/nfp/autoversion.h | 21
-rw-r--r--  usr/src/uts/common/io/nfp/drvlist.c | 19
-rw-r--r--  usr/src/uts/common/io/nfp/hostif.c | 1192
-rw-r--r--  usr/src/uts/common/io/nfp/i21285.c | 310
-rw-r--r--  usr/src/uts/common/io/nfp/i21285.h | 43
-rw-r--r--  usr/src/uts/common/io/nfp/i21555.c | 423
-rw-r--r--  usr/src/uts/common/io/nfp/i21555.h | 51
-rw-r--r--  usr/src/uts/common/io/nfp/i21555d.c | 28
-rw-r--r--  usr/src/uts/common/io/nfp/nfdev-common.h | 141
-rw-r--r--  usr/src/uts/common/io/nfp/nfdev-solaris.h | 37
-rw-r--r--  usr/src/uts/common/io/nfp/nfp.h | 113
-rw-r--r--  usr/src/uts/common/io/nfp/nfp_cmd.h | 68
-rw-r--r--  usr/src/uts/common/io/nfp/nfp_common.h | 68
-rw-r--r--  usr/src/uts/common/io/nfp/nfp_error.h | 48
-rw-r--r--  usr/src/uts/common/io/nfp/nfp_hostif.h | 54
-rw-r--r--  usr/src/uts/common/io/nfp/nfp_ifvers.c | 51
-rw-r--r--  usr/src/uts/common/io/nfp/nfp_osif.h | 105
-rw-r--r--  usr/src/uts/common/io/nfp/nfpci.h | 171
-rw-r--r--  usr/src/uts/common/io/nfp/osif.c | 184
-rw-r--r--  usr/src/uts/common/io/overlay/overlay.c | 2184
-rw-r--r--  usr/src/uts/common/io/overlay/overlay.conf | 16
-rw-r--r--  usr/src/uts/common/io/overlay/overlay.mapfile | 46
-rw-r--r--  usr/src/uts/common/io/overlay/overlay_fm.c | 82
-rw-r--r--  usr/src/uts/common/io/overlay/overlay_mux.c | 368
-rw-r--r--  usr/src/uts/common/io/overlay/overlay_plugin.c | 281
-rw-r--r--  usr/src/uts/common/io/overlay/overlay_prop.c | 122
-rw-r--r--  usr/src/uts/common/io/overlay/overlay_target.c | 1651
-rw-r--r--  usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c | 394
-rw-r--r--  usr/src/uts/common/io/pciex/pcie.c | 193
-rw-r--r--  usr/src/uts/common/io/pciex/pcie_fault.c | 53
-rw-r--r--  usr/src/uts/common/io/pciex/pciev.c | 6
-rw-r--r--  usr/src/uts/common/io/physmem.c | 8
-rw-r--r--  usr/src/uts/common/io/pseudo.conf | 9
-rw-r--r--  usr/src/uts/common/io/pseudonex.c | 26
-rw-r--r--  usr/src/uts/common/io/ptm.c | 47
-rw-r--r--  usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg | bin 0 -> 86314 bytes
-rw-r--r--  usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg | bin 0 -> 37055 bytes
-rw-r--r--  usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png | bin 0 -> 9054 bytes
-rw-r--r--  usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png | bin 0 -> 9907 bytes
-rw-r--r--  usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg | bin 0 -> 46722 bytes
-rw-r--r--  usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin | bin 0 -> 1177408 bytes
-rw-r--r--  usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin | bin 0 -> 528720 bytes
-rw-r--r--  usr/src/uts/common/io/qede/qede_list.h | 1
-rw-r--r--  usr/src/uts/common/io/qede/qede_version.h | 1
-rw-r--r--  usr/src/uts/common/io/random.c | 3
-rw-r--r--  usr/src/uts/common/io/rsm/rsm.c | 2
-rw-r--r--  usr/src/uts/common/io/sata/adapters/ahci/ahci.c | 17
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c | 2
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt.c | 565
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf | 16
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c | 2023
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c | 282
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c | 362
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c | 238
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c | 1457
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c | 286
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c | 367
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c | 613
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c | 160
-rw-r--r--  usr/src/uts/common/io/scsi/targets/sd.c | 107
-rw-r--r--  usr/src/uts/common/io/signalfd.c | 4
-rw-r--r--  usr/src/uts/common/io/simnet/simnet.c | 8
-rw-r--r--  usr/src/uts/common/io/stream.c | 10
-rw-r--r--  usr/src/uts/common/io/tl.c | 17
-rw-r--r--  usr/src/uts/common/io/usb/clients/hid/hid.c | 212
-rw-r--r--  usr/src/uts/common/io/usb/usba/genconsole.c | 33
-rw-r--r--  usr/src/uts/common/io/usb/usba/hubdi.c | 26
-rw-r--r--  usr/src/uts/common/io/usb/usba/parser.c | 145
-rw-r--r--  usr/src/uts/common/io/usb/usba/usba.c | 17
-rw-r--r--  usr/src/uts/common/io/usb/usba/usba10_calls.c | 3
-rw-r--r--  usr/src/uts/common/io/usb/usba/usba_bos.c | 420
-rw-r--r--  usr/src/uts/common/io/usb/usba/usba_devdb.c | 5
-rw-r--r--  usr/src/uts/common/io/usb/usba/usba_ugen.c | 12
-rw-r--r--  usr/src/uts/common/io/usb/usba/usbai.c | 19
-rw-r--r--  usr/src/uts/common/io/usb/usba/usbai_pipe_mgmt.c | 4
-rw-r--r--  usr/src/uts/common/io/usb/usba/usbai_register.c | 7
-rw-r--r--  usr/src/uts/common/io/usb/usba/usbai_req.c | 7
-rw-r--r--  usr/src/uts/common/io/usb/usba/usbai_util.c | 7
-rw-r--r--  usr/src/uts/common/io/usb/usba10/usba10.c | 3
-rw-r--r--  usr/src/uts/common/io/vioif/vioif.c | 204
-rw-r--r--  usr/src/uts/common/io/vnd/frameio.c | 465
-rw-r--r--  usr/src/uts/common/io/vnd/vnd.c | 5857
-rw-r--r--  usr/src/uts/common/io/vnd/vnd.conf | 16
-rw-r--r--  usr/src/uts/common/io/vnic/vnic_dev.c | 95
-rw-r--r--  usr/src/uts/common/io/zfd.c | 1154
-rw-r--r--  usr/src/uts/common/klm/klmmod.c | 5
-rw-r--r--  usr/src/uts/common/klm/mapfile-mod | 6
-rw-r--r--  usr/src/uts/common/klm/nlm_dispatch.c | 11
-rw-r--r--  usr/src/uts/common/klm/nlm_impl.c | 37
-rw-r--r--  usr/src/uts/common/klm/nlm_impl.h | 2
-rw-r--r--  usr/src/uts/common/krtld/kobj.c | 20
-rw-r--r--  usr/src/uts/common/mapfiles/README | 68
-rw-r--r--  usr/src/uts/common/mapfiles/ddi.mapfile | 192
-rw-r--r--  usr/src/uts/common/mapfiles/dtrace.mapfile.awk | 34
-rw-r--r--  usr/src/uts/common/mapfiles/kernel.mapfile | 41
-rw-r--r--  usr/src/uts/common/mapfiles/mac.mapfile | 57
-rw-r--r--  usr/src/uts/common/mapfiles/random.mapfile | 37
-rw-r--r--  usr/src/uts/common/netinet/in.h | 7
-rw-r--r--  usr/src/uts/common/netinet/udp.h | 14
-rw-r--r--  usr/src/uts/common/nfs/nfssys.h | 12
-rw-r--r--  usr/src/uts/common/os/acct.c | 19
-rw-r--r--  usr/src/uts/common/os/brand.c | 187
-rw-r--r--  usr/src/uts/common/os/clock_highres.c | 59
-rw-r--r--  usr/src/uts/common/os/contract.c | 6
-rw-r--r--  usr/src/uts/common/os/core.c | 22
-rw-r--r--  usr/src/uts/common/os/cpu.c | 238
-rw-r--r--  usr/src/uts/common/os/cred.c | 8
-rw-r--r--  usr/src/uts/common/os/cyclic.c | 58
-rw-r--r--  usr/src/uts/common/os/ddi_intr_irm.c | 2
-rw-r--r--  usr/src/uts/common/os/exec.c | 129
-rw-r--r--  usr/src/uts/common/os/exit.c | 319
-rw-r--r--  usr/src/uts/common/os/fio.c | 35
-rw-r--r--  usr/src/uts/common/os/fork.c | 52
-rw-r--r--  usr/src/uts/common/os/grow.c | 34
-rw-r--r--  usr/src/uts/common/os/id_space.c | 159
-rw-r--r--  usr/src/uts/common/os/ipc.c | 26
-rw-r--r--  usr/src/uts/common/os/kmem.c | 40
-rw-r--r--  usr/src/uts/common/os/kstat_fr.c | 11
-rw-r--r--  usr/src/uts/common/os/lgrp.c | 4
-rw-r--r--  usr/src/uts/common/os/logsubr.c | 6
-rw-r--r--  usr/src/uts/common/os/lwp.c | 129
-rw-r--r--  usr/src/uts/common/os/main.c | 12
-rw-r--r--  usr/src/uts/common/os/mem_config.c | 3
-rw-r--r--  usr/src/uts/common/os/mmapobj.c | 13
-rw-r--r--  usr/src/uts/common/os/modctl.c | 6
-rw-r--r--  usr/src/uts/common/os/modsysfile.c | 27
-rw-r--r--  usr/src/uts/common/os/pid.c | 27
-rw-r--r--  usr/src/uts/common/os/policy.c | 32
-rw-r--r--  usr/src/uts/common/os/priv_defs | 8
-rw-r--r--  usr/src/uts/common/os/rctl.c | 32
-rw-r--r--  usr/src/uts/common/os/rctl_proc.c | 28
-rw-r--r--  usr/src/uts/common/os/sched.c | 15
-rw-r--r--  usr/src/uts/common/os/schedctl.c | 18
-rw-r--r--  usr/src/uts/common/os/shm.c | 41
-rw-r--r--  usr/src/uts/common/os/sig.c | 100
-rw-r--r--  usr/src/uts/common/os/smb_subr.c | 8
-rw-r--r--  usr/src/uts/common/os/streamio.c | 46
-rw-r--r--  usr/src/uts/common/os/strsubr.c | 7
-rw-r--r--  usr/src/uts/common/os/subr.c | 50
-rw-r--r--  usr/src/uts/common/os/sunddi.c | 14
-rw-r--r--  usr/src/uts/common/os/sysent.c | 37
-rw-r--r--  usr/src/uts/common/os/timer.c | 446
-rw-r--r--  usr/src/uts/common/os/timers.c | 49
-rw-r--r--  usr/src/uts/common/os/vm_pageout.c | 875
-rw-r--r--  usr/src/uts/common/os/vmem.c | 2
-rw-r--r--  usr/src/uts/common/os/zone.c | 1175
-rw-r--r--  usr/src/uts/common/refhash/refhash.c (renamed from usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c) | 6
-rw-r--r--  usr/src/uts/common/sys/Makefile | 19
-rw-r--r--  usr/src/uts/common/sys/acct.h | 3
-rw-r--r--  usr/src/uts/common/sys/aggr_impl.h | 75
-rw-r--r--  usr/src/uts/common/sys/auxv.h | 30
-rw-r--r--  usr/src/uts/common/sys/brand.h | 106
-rw-r--r--  usr/src/uts/common/sys/buf.h | 8
-rw-r--r--  usr/src/uts/common/sys/contract/process.h | 6
-rw-r--r--  usr/src/uts/common/sys/cpucaps.h | 5
-rw-r--r--  usr/src/uts/common/sys/cpucaps_impl.h | 6
-rw-r--r--  usr/src/uts/common/sys/cpuvar.h | 182
-rw-r--r--  usr/src/uts/common/sys/cred.h | 1
-rw-r--r--  usr/src/uts/common/sys/cyclic.h | 2
-rw-r--r--  usr/src/uts/common/sys/disp.h | 13
-rw-r--r--  usr/src/uts/common/sys/dktp/dadk.h | 2
-rw-r--r--  usr/src/uts/common/sys/dld.h | 12
-rw-r--r--  usr/src/uts/common/sys/dld_impl.h | 5
-rw-r--r--  usr/src/uts/common/sys/dld_ioc.h | 3
-rw-r--r--  usr/src/uts/common/sys/dlpi.h | 19
-rw-r--r--  usr/src/uts/common/sys/dls.h | 12
-rw-r--r--  usr/src/uts/common/sys/dls_impl.h | 11
-rw-r--r--  usr/src/uts/common/sys/dls_mgmt.h | 15
-rw-r--r--  usr/src/uts/common/sys/elf.h | 44
-rw-r--r--  usr/src/uts/common/sys/eventfd.h | 10
-rw-r--r--  usr/src/uts/common/sys/exec.h | 39
-rw-r--r--  usr/src/uts/common/sys/file.h | 11
-rw-r--r--  usr/src/uts/common/sys/frameio.h | 107
-rw-r--r--  usr/src/uts/common/sys/fs/fifonode.h | 16
-rw-r--r--  usr/src/uts/common/sys/fs/hyprlofs.h | 91
-rw-r--r--  usr/src/uts/common/sys/fs/hyprlofs_info.h | 174
-rw-r--r--  usr/src/uts/common/sys/fs/sdev_impl.h | 61
-rw-r--r--  usr/src/uts/common/sys/fs/sdev_plugin.h | 106
-rw-r--r--  usr/src/uts/common/sys/fs/tmp.h | 54
-rw-r--r--  usr/src/uts/common/sys/fx.h | 10
-rw-r--r--  usr/src/uts/common/sys/gsqueue.h | 59
-rw-r--r--  usr/src/uts/common/sys/hook_impl.h | 4
-rw-r--r--  usr/src/uts/common/sys/id_space.h | 5
-rw-r--r--  usr/src/uts/common/sys/inotify.h | 153
-rw-r--r--  usr/src/uts/common/sys/ipc_impl.h | 2
-rw-r--r--  usr/src/uts/common/sys/ipd.h | 4
-rw-r--r--  usr/src/uts/common/sys/iso/signal_iso.h | 3
-rw-r--r--  usr/src/uts/common/sys/klwp.h | 11
-rw-r--r--  usr/src/uts/common/sys/kobj.h | 12
-rw-r--r--  usr/src/uts/common/sys/ksocket.h | 6
-rw-r--r--  usr/src/uts/common/sys/limits.h | 32
-rw-r--r--  usr/src/uts/common/sys/mac.h | 41
-rw-r--r--  usr/src/uts/common/sys/mac_client.h | 7
-rw-r--r--  usr/src/uts/common/sys/mac_client_impl.h | 85
-rw-r--r--  usr/src/uts/common/sys/mac_client_priv.h | 23
-rw-r--r--  usr/src/uts/common/sys/mac_flow.h | 22
-rw-r--r--  usr/src/uts/common/sys/mac_impl.h | 133
-rw-r--r--  usr/src/uts/common/sys/mac_provider.h | 90
-rw-r--r--  usr/src/uts/common/sys/mman.h | 1
-rw-r--r--  usr/src/uts/common/sys/mntent.h | 2
-rw-r--r--  usr/src/uts/common/sys/netconfig.h | 1
-rw-r--r--  usr/src/uts/common/sys/neti.h | 7
-rw-r--r--  usr/src/uts/common/sys/netstack.h | 3
-rw-r--r--  usr/src/uts/common/sys/overlay.h | 96
-rw-r--r--  usr/src/uts/common/sys/overlay_common.h | 65
-rw-r--r--  usr/src/uts/common/sys/overlay_impl.h | 205
-rw-r--r--  usr/src/uts/common/sys/overlay_plugin.h | 324
-rw-r--r--  usr/src/uts/common/sys/overlay_target.h | 293
-rw-r--r--  usr/src/uts/common/sys/param.h | 2
-rw-r--r--  usr/src/uts/common/sys/pattr.h | 20
-rw-r--r--  usr/src/uts/common/sys/pci.h | 5
-rw-r--r--  usr/src/uts/common/sys/pci_cap.h | 24
-rw-r--r--  usr/src/uts/common/sys/pcie.h | 152
-rw-r--r--  usr/src/uts/common/sys/pcie_impl.h | 42
-rw-r--r--  usr/src/uts/common/sys/policy.h | 2
-rw-r--r--  usr/src/uts/common/sys/poll_impl.h | 5
-rw-r--r--  usr/src/uts/common/sys/proc.h | 11
-rw-r--r--  usr/src/uts/common/sys/procfs.h | 1
-rw-r--r--  usr/src/uts/common/sys/prsystm.h | 4
-rw-r--r--  usr/src/uts/common/sys/ptms.h | 19
-rw-r--r--  usr/src/uts/common/sys/refhash.h (renamed from usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h) | 8
-rw-r--r--  usr/src/uts/common/sys/resource.h | 1
-rw-r--r--  usr/src/uts/common/sys/rt.h | 11
-rw-r--r--  usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h | 147
-rw-r--r--  usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h | 2
-rw-r--r--  usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h | 750
-rw-r--r--  usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h | 345
-rw-r--r--  usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h | 371
-rw-r--r--  usr/src/uts/common/sys/scsi/generic/inquiry.h | 7
-rw-r--r--  usr/src/uts/common/sys/scsi/targets/sddef.h | 10
-rw-r--r--  usr/src/uts/common/sys/sensors.h | 81
-rw-r--r--  usr/src/uts/common/sys/shm.h | 5
-rw-r--r--  usr/src/uts/common/sys/shm_impl.h | 17
-rw-r--r--  usr/src/uts/common/sys/signal.h | 5
-rw-r--r--  usr/src/uts/common/sys/smbios.h | 61
-rw-r--r--  usr/src/uts/common/sys/smbios_impl.h | 45
-rw-r--r--  usr/src/uts/common/sys/socket.h | 10
-rw-r--r--  usr/src/uts/common/sys/socketvar.h | 42
-rw-r--r--  usr/src/uts/common/sys/sockfilter.h | 10
-rw-r--r--  usr/src/uts/common/sys/squeue.h | 17
-rw-r--r--  usr/src/uts/common/sys/squeue_impl.h | 10
-rw-r--r--  usr/src/uts/common/sys/stream.h | 6
-rw-r--r--  usr/src/uts/common/sys/strsubr.h | 8
-rw-r--r--  usr/src/uts/common/sys/sunddi.h | 20
-rw-r--r--  usr/src/uts/common/sys/sysconfig.h | 3
-rw-r--r--  usr/src/uts/common/sys/sysevent.h | 3
-rw-r--r--  usr/src/uts/common/sys/sysevent/datalink.h | 54
-rw-r--r--  usr/src/uts/common/sys/sysevent/eventdefs.h | 4
-rw-r--r--  usr/src/uts/common/sys/systrace.h | 13
-rw-r--r--  usr/src/uts/common/sys/termios.h | 18
-rw-r--r--  usr/src/uts/common/sys/thread.h | 19
-rw-r--r--  usr/src/uts/common/sys/time.h | 15
-rw-r--r--  usr/src/uts/common/sys/timer.h | 37
-rw-r--r--  usr/src/uts/common/sys/uadmin.h | 3
-rw-r--r--  usr/src/uts/common/sys/uio.h | 12
-rw-r--r--  usr/src/uts/common/sys/usb/clients/hid/hidminor.h | 19
-rw-r--r--  usr/src/uts/common/sys/usb/clients/hid/hidvar.h | 5
-rw-r--r--  usr/src/uts/common/sys/usb/usba/bos.h | 242
-rw-r--r--  usr/src/uts/common/sys/usb/usba/usba10.h | 3
-rw-r--r--  usr/src/uts/common/sys/usb/usba/usba_impl.h | 9
-rw-r--r--  usr/src/uts/common/sys/usb/usba/usba_private.h | 32
-rw-r--r--  usr/src/uts/common/sys/usb/usba/usba_types.h | 12
-rw-r--r--  usr/src/uts/common/sys/usb/usbai.h | 4
-rw-r--r--  usr/src/uts/common/sys/user.h | 25
-rw-r--r--  usr/src/uts/common/sys/vm.h | 3
-rw-r--r--  usr/src/uts/common/sys/vm_usage.h | 7
-rw-r--r--  usr/src/uts/common/sys/vmsystm.h | 8
-rw-r--r--  usr/src/uts/common/sys/vnd.h | 141
-rw-r--r--  usr/src/uts/common/sys/vnd_errno.h | 72
-rw-r--r--  usr/src/uts/common/sys/vnic_impl.h | 4
-rw-r--r--  usr/src/uts/common/sys/vnode.h | 13
-rw-r--r--  usr/src/uts/common/sys/vxlan.h | 47
-rw-r--r--  usr/src/uts/common/sys/zfd.h | 78
-rw-r--r--  usr/src/uts/common/sys/zone.h | 203
-rw-r--r--  usr/src/uts/common/syscall/brandsys.c | 8
-rw-r--r--  usr/src/uts/common/syscall/chdir.c | 29
-rw-r--r--  usr/src/uts/common/syscall/fcntl.c | 3
-rw-r--r--  usr/src/uts/common/syscall/memcntl.c | 8
-rw-r--r--  usr/src/uts/common/syscall/open.c | 8
-rw-r--r--  usr/src/uts/common/syscall/poll.c | 358
-rw-r--r--  usr/src/uts/common/syscall/rusagesys.c | 1
-rw-r--r--  usr/src/uts/common/syscall/rw.c | 375
-rw-r--r--  usr/src/uts/common/syscall/sendfile.c | 19
-rw-r--r--  usr/src/uts/common/syscall/stat.c | 2
-rw-r--r--  usr/src/uts/common/syscall/sysconfig.c | 51
-rw-r--r--  usr/src/uts/common/syscall/uadmin.c | 10
-rw-r--r--  usr/src/uts/common/syscall/umount.c | 11
-rw-r--r--  usr/src/uts/common/vm/hat.h | 10
-rw-r--r--  usr/src/uts/common/vm/page.h | 7
-rw-r--r--  usr/src/uts/common/vm/page_lock.c | 10
-rw-r--r--  usr/src/uts/common/vm/page_retire.c | 7
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.c | 83
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.h | 18
-rw-r--r--  usr/src/uts/common/vm/seg_vn.c | 11
-rw-r--r--  usr/src/uts/common/vm/vm_as.c | 19
-rw-r--r--  usr/src/uts/common/vm/vm_page.c | 29
-rw-r--r--  usr/src/uts/common/vm/vm_pvn.c | 28
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c | 252
-rw-r--r--  usr/src/uts/common/xen/io/xnb.c | 5
635 files changed, 135572 insertions, 7180 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index e344b15a00..0cef482d82 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -288,6 +288,7 @@ GENUNIX_OBJS += \
rctl.o \
rctlsys.o \
readlink.o \
+ refhash.o \
refstr.o \
rename.o \
resolvepath.o \
@@ -437,6 +438,8 @@ PROFILE_OBJS += profile.o
SYSTRACE_OBJS += systrace.o
+LX_SYSTRACE_OBJS += lx_systrace.o
+
LOCKSTAT_OBJS += lockstat.o
FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o
@@ -499,6 +502,10 @@ PTSL_OBJS += tty_pts.o
PTM_OBJS += ptm.o
+LX_PTM_OBJS += lx_ptm.o
+
+LX_NETLINK_OBJS += lx_netlink.o
+
MII_OBJS += mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \
mii_marvell.o mii_realtek.o mii_other.o
@@ -556,6 +563,7 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \
sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o \
sctp_misc.o
IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o
+IP_COMM_OBJS = inet_hash.o
IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \
ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
@@ -566,12 +574,14 @@ IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \
ip_helper_stream.o ip_tunables.o \
ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \
conn_opt.o ip_attr.o ip_dce.o \
+ bpf_filter.o \
$(IP_ICMP_OBJS) \
$(IP_RTS_OBJS) \
$(IP_TCP_OBJS) \
$(IP_UDP_OBJS) \
$(IP_SCTP_OBJS) \
- $(IP_ILB_OBJS)
+ $(IP_ILB_OBJS) \
+ $(IP_COMM_OBJS)
IP6_OBJS += ip6ddi.o
@@ -589,6 +599,8 @@ IPSECESP_OBJS += ipsecespddi.o ipsecesp.o
IPSECAH_OBJS += ipsecahddi.o ipsecah.o sadb.o
+DATAFILT_OBJS += datafilt.o
+
SPPP_OBJS += sppp.o sppp_dlpi.o sppp_mod.o s_common.o
SPPPTUN_OBJS += sppptun.o sppptun_mod.o
@@ -642,7 +654,7 @@ TL_OBJS += tl.o
DUMP_OBJS += dump.o
-BPF_OBJS += bpf.o bpf_filter.o bpf_mod.o bpf_dlt.o bpf_mac.o
+BPF_OBJS += bpf.o bpf_wrap.o bpf_mod.o bpf_dlt.o bpf_mac.o
CLONE_OBJS += clone.o
@@ -686,6 +698,15 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \
VNIC_OBJS += vnic_ctl.o vnic_dev.o
+OVERLAY_OBJS += overlay.o overlay_fm.o overlay_mux.o overlay_plugin.o \
+ overlay_prop.o overlay_target.o
+
+OVERLAY_VXLAN_OBJS += overlay_vxlan.o
+
+VND_OBJS += vnd.o frameio.o
+
+GSQUEUE_OBJS += gsqueue.o
+
SIMNET_OBJS += simnet.o
IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o
@@ -832,7 +853,7 @@ SATA_OBJS += sata.o
USBA_OBJS += hcdi.o usba.o usbai.o hubdi.o parser.o genconsole.o \
usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \
- usba_devdb.o usba10_calls.o usba_ugen.o
+ usba_devdb.o usba10_calls.o usba_ugen.o usba_bos.o
USBA10_OBJS += usba10.o
@@ -938,6 +959,8 @@ SIGNALFD_OBJS += signalfd.o
I8042_OBJS += i8042.o
+INOTIFY_OBJS += inotify.o
+
KB8042_OBJS += \
at_keyprocess.o \
kb8042.o \
@@ -1012,6 +1035,8 @@ QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_fm.o qlge_gld.o qlge_mpi.o
ZCONS_OBJS += zcons.o
+ZFD_OBJS += zfd.o
+
NV_SATA_OBJS += nv_sata.o
SI3124_OBJS += si3124.o
@@ -1065,8 +1090,7 @@ DEVFS_OBJS += devfs_subr.o devfs_vfsops.o devfs_vnops.o
DEV_OBJS += sdev_subr.o sdev_vfsops.o sdev_vnops.o \
sdev_ptsops.o sdev_zvolops.o sdev_comm.o \
sdev_profile.o sdev_ncache.o sdev_netops.o \
- sdev_ipnetops.o \
- sdev_vtops.o
+ sdev_ipnetops.o sdev_vtops.o sdev_plugin.o
CTFS_OBJS += ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \
ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o
@@ -1083,8 +1107,13 @@ PIPE_OBJS += pipe.o
HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \
hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o
+HYPRLOFS_OBJS += hyprlofs_dir.o hyprlofs_subr.o \
+ hyprlofs_vnops.o hyprlofs_vfsops.o
+
LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o
+LXPROC_OBJS += lxpr_subr.o lxpr_vfsops.o lxpr_vnops.o
+
NAMEFS_OBJS += namevfs.o namevno.o
NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \
@@ -1236,8 +1265,8 @@ SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \
PCFS_OBJS += pc_alloc.o pc_dir.o pc_node.o pc_subr.o \
pc_vfsops.o pc_vnops.o
-PROC_OBJS += prcontrol.o prioctl.o prsubr.o prusrio.o \
- prvfsops.o prvnops.o
+PROC_OBJS += prargv.o prcontrol.o prioctl.o prsubr.o \
+ prusrio.o prvfsops.o prvnops.o
MNTFS_OBJS += mntvfsops.o mntvnops.o
@@ -1402,6 +1431,7 @@ ZFS_COMMON_OBJS += \
zfs_fuid.o \
zfs_sa.o \
zfs_znode.o \
+ zfs_zone.o \
zil.o \
zio.o \
zio_checksum.o \
@@ -1867,7 +1897,7 @@ ZYD_OBJS += zyd.o zyd_usb.o zyd_hw.o zyd_fw.o
MXFE_OBJS += mxfe.o
-MPTSAS_OBJS += mptsas.o mptsas_hash.o mptsas_impl.o mptsas_init.o \
+MPTSAS_OBJS += mptsas.o mptsas_impl.o mptsas_init.o \
mptsas_raid.o mptsas_smhba.o
SFE_OBJS += sfe.o sfe_util.o
@@ -1902,9 +1932,9 @@ LINT_DEFS += -Dunix
# It is a bug in the current compilation system that the assembler
# can't process the -Y I, flag.
#
-NATIVE_INC_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common
-AS_INC_PATH += $(INC_PATH) -I$(UTSBASE)/common
-INCLUDE_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common
+NATIVE_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common
+AS_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) -I$(UTSBASE)/common
+INCLUDE_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common
PCIEB_OBJS += pcieb.o
@@ -2102,6 +2132,11 @@ MEGA_SAS_OBJS = megaraid_sas.o
MR_SAS_OBJS = ld_pd_map.o mr_sas.o mr_sas_tbolt.o mr_sas_list.o
#
+# DR_SAS module
+#
+DR_SAS_OBJS = dr_sas.o
+
+#
# CPQARY3 module
#
CPQARY3_OBJS = cpqary3.o cpqary3_noe.o cpqary3_talk2ctlr.o \
@@ -2110,6 +2145,20 @@ CPQARY3_OBJS = cpqary3.o cpqary3_noe.o cpqary3_talk2ctlr.o \
cpqary3_bd.o
#
+# HP Smart Array driver module (smrt)
+#
+SMRT_OBJS = smrt.o \
+ smrt_device.o \
+ smrt_interrupts.o \
+ smrt_commands.o \
+ smrt_logvol.o \
+ smrt_hba.o \
+ smrt_ciss_simple.o \
+ smrt_ciss.o \
+ smrt_physical.o \
+ smrt_sata.o
+
+#
# ISCSI_INITIATOR module
#
ISCSI_INITIATOR_OBJS = chap.o iscsi_io.o iscsi_thread.o \
@@ -2149,6 +2198,11 @@ URF_OBJS = urf_usbgem.o
UPF_OBJS = upf_usbgem.o
#
+# NFP objects
+#
+NFP_OBJS = hostif.o osif.o drvlist.o i21555.o i21285.o i21555d.o
+
+#
# BNXE objects
#
BNXE_OBJS += bnxe_cfg.o \
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index c369cd3b63..e739dae95f 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -102,6 +102,10 @@ $(OBJS_DIR)/%.o: $(COMMONBASE)/avl/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(COMMONBASE)/inet/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(COMMONBASE)/ucode/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -265,10 +269,18 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hsfs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hyprlofs/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lofs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lxproc/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/mntfs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -759,6 +771,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/drm/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/dr_sas/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/efe/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -959,6 +975,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/net80211/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nfp/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nge/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -978,6 +998,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/npi/%.c
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/%.s
$(COMPILE.s) -o $@ $<
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/plugins/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/pci-ide/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1082,6 +1110,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/scsi/adapters/scsi_vhci/fops/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/scsi/adapters/smrt/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/fibre-channel/ulp/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1122,6 +1154,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sdcard/targets/sdcard/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/gsqueue/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sfe/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1134,6 +1170,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/softmac/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vnd/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/uath/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1478,9 +1518,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioblk/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(COMMONBASE)/idspace/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioif/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+
#
# krtld must refer to its own bzero/bcopy until the kernel is fully linked
#
@@ -1545,6 +1590,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/pcmcia/pcs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/refhash/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/rpc/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1672,6 +1721,9 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/acl/%.c
$(LINTS_DIR)/%.ln: $(COMMONBASE)/avl/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(COMMONBASE)/inet/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(COMMONBASE)/ucode/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -1786,9 +1838,15 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/fifofs/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hsfs/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hyprlofs/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lofs/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lxproc/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/mntfs/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -2137,6 +2195,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dmfe/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/drm/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dr_sas/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/efe/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -2284,6 +2345,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/mwl/mwl_fw/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/net80211/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nfp/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nge/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -2299,6 +2363,12 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/%.s
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/npi/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/overlay/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/overlay/plugins/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/pci-ide/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -2368,6 +2438,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/scsi/adapters/scsi_vhci/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/scsi/adapters/scsi_vhci/fops/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/scsi/adapters/smrt/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/fibre-channel/ulp/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -2407,6 +2480,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/sdcard/impl/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/sdcard/targets/sdcard/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/gsqueue/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/sfe/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -2416,6 +2492,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/simnet/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/softmac/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/vnd/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/uath/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -2677,6 +2756,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/pcmcia/nexus/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/pcmcia/pcs/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/refhash/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/rpc/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -2770,3 +2852,6 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/skd/%.c
$(LINTS_DIR)/%.ln: $(COMMONBASE)/fsreparse/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln: $(COMMONBASE)/idspace/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/common/brand/lx/autofs/lx_autofs.c b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c
new file mode 100644
index 0000000000..730deae80e
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c
@@ -0,0 +1,3174 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * See the big theory statement in ../sys/lx_autofs.h
+ */
+
+#include <fs/fs_subr.h>
+#include <sys/stat.h>
+#include <sys/atomic.h>
+#include <sys/cmn_err.h>
+#include <sys/dirent.h>
+#include <sys/fs/fifonode.h>
+#include <sys/modctl.h>
+#include <sys/mount.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/conf.h>
+#include <sys/sdt.h>
+
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+
+#include <sys/dnlc.h>
+#include <nfs/rnode.h>
+#include <nfs/rnode4.h>
+#include <sys/lx_autofs_impl.h>
+#include <sys/lx_types.h>
+
+/*
+ * External functions
+ */
+extern uintptr_t space_fetch(char *key);
+extern int space_store(char *key, uintptr_t ptr);
+extern int umount2_engine(vfs_t *, int, cred_t *, int);
+
+/*
+ * Globals
+ */
+static vfsops_t *lx_autofs_vfsops;
+static vnodeops_t *lx_autofs_vn_ops = NULL;
+static int lx_autofs_fstype;
+static major_t lx_autofs_major;
+static minor_t lx_autofs_minor = 0;
+static dev_info_t *lx_autofs_dip = NULL;
+
+#define LX_AUTOFS_DEV_VERSION_MAJOR 1
+#define LX_AUTOFS_DEV_VERSION_MINOR 0
+
+/* The Linux autofs superblock magic number */
+#define LX_AUTOFS_SB_MAGIC 0x0187
+
+/* Linux autofs mount types */
+#define LX_AUTOFS_TYPE_INDIRECT 1
+#define LX_AUTOFS_TYPE_DIRECT 2
+#define LX_AUTOFS_TYPE_OFFSET 4
+
+/* Structure passed for autofs dev ioctls */
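+/*
+ * lad_path is a zero-length array: callers allocate the structure with
+ * enough trailing space to hold the path inline, and lad_size carries the
+ * total size (fixed fields plus path), mirroring the Linux
+ * autofs_dev_ioctl convention.
+ */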
+typedef struct lx_autofs_dv_ioctl {
+ uint32_t lad_ver_major;
+ uint32_t lad_ver_minor;
+ uint32_t lad_size;
+ uint32_t lad_ioctlfd;
+ uint32_t lad_arg1;
+ uint32_t lad_arg2;
+ char lad_path[0];
+} lx_autofs_dv_ioctl_t;
+
+/*
+ * Support functions
+ */
+static void
+lx_autofs_strfree(char *str)
+{
+ kmem_free(str, strlen(str) + 1);
+}
+
+static char *
+lx_autofs_strdup(char *str)
+{
+ int n = strlen(str);
+ char *ptr = kmem_alloc(n + 1, KM_SLEEP);
+ bcopy(str, ptr, n + 1);
+ return (ptr);
+}
+
+static int
+lx_autofs_str_to_int(char *str, int *val)
+{
+ long res;
+
+ if (str == NULL)
+ return (-1);
+
+ if ((ddi_strtol(str, NULL, 10, &res) != 0) ||
+ (res < INT_MIN) || (res > INT_MAX))
+ return (-1);
+
+ *val = res;
+ return (0);
+}
+
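+/*
+ * The stack helpers below implement a simple LIFO on top of a list_t:
+ * push inserts at the head and pop removes from the head, with three
+ * opaque pointers carried per element.
+ */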
+static void
+ls_autofs_stack_init(list_t *lp)
+{
+ list_create(lp,
+ sizeof (stack_elem_t), offsetof(stack_elem_t, se_list));
+}
+
+static void
+lx_autofs_stack_fini(list_t *lp)
+{
+ ASSERT(list_head(lp) == NULL);
+ list_destroy(lp);
+}
+
+static void
+lx_autofs_stack_push(list_t *lp, caddr_t ptr1, caddr_t ptr2, caddr_t ptr3)
+{
+ stack_elem_t *se;
+
+ se = kmem_alloc(sizeof (*se), KM_SLEEP);
+ se->se_ptr1 = ptr1;
+ se->se_ptr2 = ptr2;
+ se->se_ptr3 = ptr3;
+ list_insert_head(lp, se);
+}
+
+static int
+lx_autofs_stack_pop(list_t *lp, caddr_t *ptr1, caddr_t *ptr2, caddr_t *ptr3)
+{
+ stack_elem_t *se;
+
+ if ((se = list_head(lp)) == NULL)
+ return (-1);
+ list_remove(lp, se);
+ if (ptr1 != NULL)
+ *ptr1 = se->se_ptr1;
+ if (ptr2 != NULL)
+ *ptr2 = se->se_ptr2;
+ if (ptr3 != NULL)
+ *ptr3 = se->se_ptr3;
+ kmem_free(se, sizeof (*se));
+ return (0);
+}
+
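+/*
+ * Given one end of a fifo, return the vnode of its peer: fifofs links the
+ * two ends of a pipe through the fifonode's fn_dest pointer.
+ */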
+static vnode_t *
+lx_autofs_fifo_peer_vp(vnode_t *vp)
+{
+ fifonode_t *fnp = VTOF(vp);
+ fifonode_t *fn_dest = fnp->fn_dest;
+ return (FTOV(fn_dest));
+}
+
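+/*
+ * Find or create the autofs vnode that shadows the underlying vnode uvp.
+ * A fresh vnode is always allocated up front so that the hash lookup and
+ * the insert can happen atomically under lav_lock without sleeping.
+ */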
+static vnode_t *
+lx_autofs_vn_alloc(vfs_t *vfsp, vnode_t *uvp)
+{
+ lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ vnode_t *vp, *vp_old;
+
+ /* Allocate a new vnode structure in case we need it. */
+ vp = vn_alloc(KM_SLEEP);
+ vn_setops(vp, lx_autofs_vn_ops);
+ VN_SET_VFS_TYPE_DEV(vp, vfsp, uvp->v_type, uvp->v_rdev);
+ vp->v_data = uvp;
+ ASSERT(vp->v_count == 1);
+
+ /*
+ * Take a hold on the vfs structure. This is how unmount will
+ * determine if there are any active vnodes in the file system.
+ */
+ VFS_HOLD(vfsp);
+
+ /*
+ * Check if we already have a vnode allocated for this underlying
+ * vnode_t.
+ */
+ mutex_enter(&data->lav_lock);
+ if (mod_hash_find(data->lav_vn_hash,
+ (mod_hash_key_t)uvp, (mod_hash_val_t *)&vp_old) != 0) {
+
+ /*
+ * Didn't find an existing node.
+ * Add this node to the hash and return.
+ */
+ VERIFY(mod_hash_insert(data->lav_vn_hash,
+ (mod_hash_key_t)uvp,
+ (mod_hash_val_t)vp) == 0);
+ mutex_exit(&data->lav_lock);
+ return (vp);
+ }
+
+ /* Get a hold on the existing vnode and free up the one we allocated. */
+ VN_HOLD(vp_old);
+ mutex_exit(&data->lav_lock);
+
+ /* Free up the new vnode we allocated. */
+ VN_RELE(uvp);
+ VFS_RELE(vfsp);
+ vn_invalid(vp);
+ vn_free(vp);
+
+ return (vp_old);
+}
+
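+/*
+ * Tear down a shadow vnode on last release: remove it from the hash and
+ * drop the holds taken in lx_autofs_vn_alloc.  Entered with both lav_lock
+ * and the vnode's v_lock held; both are dropped on the way out.
+ */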
+static void
+lx_autofs_vn_free(vnode_t *vp)
+{
+ vfs_t *vfsp = vp->v_vfsp;
+ lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ vnode_t *uvp = vp->v_data;
+ vnode_t *vp_tmp;
+
+ ASSERT(MUTEX_HELD((&data->lav_lock)));
+ ASSERT(MUTEX_HELD((&vp->v_lock)));
+
+ ASSERT(vp->v_count == 0);
+
+ /* We're about to free this vnode so take it out of the hash. */
+ (void) mod_hash_remove(data->lav_vn_hash,
+ (mod_hash_key_t)uvp, (mod_hash_val_t)&vp_tmp);
+
+ /*
+	 * No one else can look up this vnode any more so there's no need
+ * to hold locks.
+ */
+ mutex_exit(&data->lav_lock);
+ mutex_exit(&vp->v_lock);
+
+ /* Release the underlying vnode. */
+ VN_RELE(uvp);
+ VFS_RELE(vfsp);
+ vn_invalid(vp);
+ vn_free(vp);
+}
+
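+/*
+ * Allocate an automount request for name 'nm', formatted as a v2 or v5
+ * packet depending on the negotiated protocol.  If a request for the
+ * same path is already outstanding it is reused instead, and 'is_dup'
+ * tells the caller which case occurred.
+ */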
+static lx_autofs_automnt_req_t *
+lx_autofs_la_alloc(lx_autofs_vfs_t *data, boolean_t *is_dup, boolean_t expire,
+ char *nm)
+{
+ lx_autofs_automnt_req_t *laar, *laar_dup;
+
+ /* Pre-allocate a new automounter request before grabbing locks. */
+ laar = kmem_zalloc(sizeof (*laar), KM_SLEEP);
+ mutex_init(&laar->laar_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&laar->laar_cv, NULL, CV_DEFAULT, NULL);
+ laar->laar_ref = 1;
+
+ if (data->lav_min_proto == 5) {
+ laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS5;
+
+ if (data->lav_mnttype == LXAMT_INDIR) {
+ if (expire) {
+ laar->laar_pkt.lap_type =
+ LX_AUTOFS_PTYPE_EXPIRE_INDIR;
+ } else {
+ laar->laar_pkt.lap_type =
+ LX_AUTOFS_PTYPE_MISSING_INDIR;
+ }
+ } else {
+ if (expire) {
+ laar->laar_pkt.lap_type =
+ LX_AUTOFS_PTYPE_EXPIRE_DIRECT;
+ } else {
+ laar->laar_pkt.lap_type =
+ LX_AUTOFS_PTYPE_MISSING_DIRECT;
+ }
+ }
+ laar->laar_pkt_size = sizeof (lx_autofs_v5_pkt_t);
+
+ laar->laar_pkt.lap_v5.lap_dev = data->lav_dev;
+ laar->laar_pkt.lap_v5.lap_ino = data->lav_ino;
+ /*
+ * Note that we're currently not filling in the other v5 pkt
+ * fields (pid, uid, etc.) since they don't appear to be used
+ * by the automounter. We can fill those in later if it proves
+ * necessary.
+ */
+
+ /*
+ * For indirect mounts the token expected by the automounter is
+		 * the name of the directory entry to look up (not the entire
+		 * path that is being accessed). For direct mounts the Linux
+ * kernel passes a dummy name, so this is just as good.
+ */
+ laar->laar_pkt.lap_v5.lap_name_len = strlen(nm);
+ if (laar->laar_pkt.lap_v5.lap_name_len >
+ (sizeof (laar->laar_pkt.lap_v5.lap_name) - 1)) {
+ zcmn_err(getzoneid(), CE_NOTE,
+ "invalid autofs automnt req: \"%s\"", nm);
+ kmem_free(laar, sizeof (*laar));
+ return (NULL);
+ }
+ (void) strlcpy(laar->laar_pkt.lap_v5.lap_name, nm,
+ sizeof (laar->laar_pkt.lap_v5.lap_name));
+
+ } else if (expire) {
+ zcmn_err(getzoneid(), CE_WARN,
+ "unsupported expire protocol request: \"%s\"", nm);
+ kmem_free(laar, sizeof (*laar));
+ return (NULL);
+
+ } else {
+ ASSERT(expire == B_FALSE);
+
+ /* Older protocol pkt (really v2) */
+ laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS2;
+ laar->laar_pkt.lap_type = LX_AUTOFS_PTYPE_MISSING;
+ laar->laar_pkt_size = sizeof (lx_autofs_v2_pkt_t);
+
+ /*
+		 * The token expected by the Linux automounter is the name
+		 * of the directory entry to look up (not the entire path
+		 * that is being accessed).
+ */
+ laar->laar_pkt.lap_v2.lap_name_len = strlen(nm);
+ if (laar->laar_pkt.lap_v2.lap_name_len >
+ (sizeof (laar->laar_pkt.lap_v2.lap_name) - 1)) {
+ zcmn_err(getzoneid(), CE_NOTE,
+ "invalid autofs lookup: \"%s\"", nm);
+ kmem_free(laar, sizeof (*laar));
+ return (NULL);
+ }
+ (void) strlcpy(laar->laar_pkt.lap_v2.lap_name, nm,
+ sizeof (laar->laar_pkt.lap_v2.lap_name));
+ }
+
+ /* Assign a unique id for this request. */
+ laar->laar_pkt.lap_id = id_alloc(data->lav_ids);
+
+ /* Check for an outstanding request for this path. */
+ mutex_enter(&data->lav_lock);
+ if (mod_hash_find(data->lav_path_hash,
+ (mod_hash_key_t)nm, (mod_hash_val_t *)&laar_dup) == 0) {
+ /*
+ * There's already an outstanding request for this
+ * path so we don't need a new one.
+ */
+ id_free(data->lav_ids, laar->laar_pkt.lap_id);
+ kmem_free(laar, sizeof (*laar));
+ laar = laar_dup;
+
+ /* Bump the ref count on the old request. */
+ atomic_add_int(&laar->laar_ref, 1);
+
+		*is_dup = B_TRUE;
+ } else {
+ /* Add it to the hashes. */
+ VERIFY(mod_hash_insert(data->lav_id_hash,
+ (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id,
+ (mod_hash_val_t)laar) == 0);
+ VERIFY(mod_hash_insert(data->lav_path_hash,
+ (mod_hash_key_t)lx_autofs_strdup(nm),
+ (mod_hash_val_t)laar) == 0);
+
+		*is_dup = B_FALSE;
+ }
+ mutex_exit(&data->lav_lock);
+
+ return (laar);
+}
+
+static lx_autofs_automnt_req_t *
+lx_autofs_la_find(lx_autofs_vfs_t *data, int id)
+{
+ lx_autofs_automnt_req_t *laar;
+
+ /* Check for an outstanding request for this id. */
+ mutex_enter(&data->lav_lock);
+ if (mod_hash_find(data->lav_id_hash, (mod_hash_key_t)(uintptr_t)id,
+ (mod_hash_val_t *)&laar) != 0) {
+ mutex_exit(&data->lav_lock);
+ return (NULL);
+ }
+ atomic_add_int(&laar->laar_ref, 1);
+ mutex_exit(&data->lav_lock);
+ return (laar);
+}
+
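+/*
+ * Complete a request: pull it out of both lookup hashes so no new waiters
+ * can find it, then mark it done and wake everyone blocked on it.
+ */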
+static void
+lx_autofs_la_complete(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar)
+{
+ lx_autofs_automnt_req_t *laar_tmp;
+
+ /* Remove this request from the hashes so no one can look it up. */
+ mutex_enter(&data->lav_lock);
+ (void) mod_hash_remove(data->lav_id_hash,
+ (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id,
+ (mod_hash_val_t)&laar_tmp);
+ if (data->lav_min_proto == 5) {
+ (void) mod_hash_remove(data->lav_path_hash,
+ (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name,
+ (mod_hash_val_t)&laar_tmp);
+ } else {
+ (void) mod_hash_remove(data->lav_path_hash,
+ (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name,
+ (mod_hash_val_t)&laar_tmp);
+ }
+ mutex_exit(&data->lav_lock);
+
+	/* Mark this request as complete and wake up anyone waiting on it. */
+ mutex_enter(&laar->laar_lock);
+ laar->laar_complete = 1;
+ cv_broadcast(&laar->laar_cv);
+ mutex_exit(&laar->laar_lock);
+}
+
+static void
+lx_autofs_la_release(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar)
+{
+	ASSERT(MUTEX_NOT_HELD(&laar->laar_lock));
+ if (atomic_add_int_nv(&laar->laar_ref, -1) > 0)
+ return;
+ ASSERT(laar->laar_ref == 0);
+ id_free(data->lav_ids, laar->laar_pkt.lap_id);
+ kmem_free(laar, sizeof (*laar));
+}
+
+static void
+lx_autofs_la_abort(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar)
+{
+ lx_autofs_automnt_req_t *laar_tmp;
+
+ /*
+ * This is a little tricky. We're aborting the wait for this
+ * request. So if anyone else is waiting for this request we
+ * can't free it, but if no one else is waiting for the request
+ * we should free it.
+ */
+ mutex_enter(&data->lav_lock);
+ if (atomic_add_int_nv(&laar->laar_ref, -1) > 0) {
+ mutex_exit(&data->lav_lock);
+ return;
+ }
+ ASSERT(laar->laar_ref == 0);
+
+ /* Remove this request from the hashes so no one can look it up. */
+ (void) mod_hash_remove(data->lav_id_hash,
+ (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id,
+ (mod_hash_val_t)&laar_tmp);
+ if (data->lav_min_proto == 5) {
+ (void) mod_hash_remove(data->lav_path_hash,
+ (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name,
+ (mod_hash_val_t)&laar_tmp);
+ } else {
+ (void) mod_hash_remove(data->lav_path_hash,
+ (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name,
+ (mod_hash_val_t)&laar_tmp);
+ }
+ mutex_exit(&data->lav_lock);
+
+ /* It's ok to free this now because the ref count was zero. */
+ id_free(data->lav_ids, laar->laar_pkt.lap_id);
+ kmem_free(laar, sizeof (*laar));
+}
+
+static int
+lx_autofs_fifo_lookup(pid_t pgrp, int fd, file_t **fpp_wr, file_t **fpp_rd)
+{
+ proc_t *prp;
+ uf_info_t *fip;
+ uf_entry_t *ufp_wr, *ufp_rd = NULL;
+ file_t *fp_wr, *fp_rd = NULL;
+ vnode_t *vp_wr, *vp_rd;
+ int i;
+
+ /*
+ * sprlock() is zone aware, so assuming this mount call was
+ * initiated by a process in a zone, if it tries to specify
+	 * a pgrp outside of its zone this call will fail.
+	 *
+	 * Also, we want to grab hold of the main automounter process,
+	 * which is going to be the group leader for pgrp, so its
+ * pid will be equal to pgrp.
+ */
+ prp = sprlock(pgrp);
+ if (prp == NULL)
+ return (-1);
+ mutex_exit(&prp->p_lock);
+
+	/* Now we want to access the process's open file descriptors. */
+ fip = P_FINFO(prp);
+ mutex_enter(&fip->fi_lock);
+
+ /* Sanity check fifo write fd. */
+ if (fd >= fip->fi_nfiles) {
+ mutex_exit(&fip->fi_lock);
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+ return (-1);
+ }
+
+ /* Get a pointer to the write fifo. */
+ UF_ENTER(ufp_wr, fip, fd);
+ if (((fp_wr = ufp_wr->uf_file) == NULL) ||
+ ((vp_wr = fp_wr->f_vnode) == NULL) || (vp_wr->v_type != VFIFO)) {
+ /* Invalid fifo fd. */
+ UF_EXIT(ufp_wr);
+ mutex_exit(&fip->fi_lock);
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+ return (-1);
+ }
+
+ /*
+ * Now we need to find the read end of the fifo (for reasons
+	 * explained below). We assume that the read end of the fifo
+ * is in the same process as the write end.
+ */
+ vp_rd = lx_autofs_fifo_peer_vp(fp_wr->f_vnode);
+ for (i = 0; i < fip->fi_nfiles; i++) {
+ if (i == fd)
+ continue;
+ UF_ENTER(ufp_rd, fip, i);
+ if (((fp_rd = ufp_rd->uf_file) != NULL) &&
+ (fp_rd->f_vnode == vp_rd))
+ break;
+ UF_EXIT(ufp_rd);
+ }
+ if (i == fip->fi_nfiles) {
+ /* Didn't find it. */
+ UF_EXIT(ufp_wr);
+ mutex_exit(&fip->fi_lock);
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+ return (-1);
+ }
+
+ /*
+	 * We need to drop fi_lock before we can try to acquire f_tlock;
+	 * the good news is that the file pointers are protected because
+ * we're still holding uf_lock.
+ */
+ mutex_exit(&fip->fi_lock);
+
+ /*
+	 * Here we bump the open counts on the fifos. We do this
+	 * because when we go to write to the fifo we want to ensure
+	 * that both ends are actually open (and
+ * not in the process of being closed) without having to
+ * stop the automounter. (If the write end of the fifo
+ * were closed and we tried to write to it we would panic.
+ * If the read end of the fifo was closed and we tried to
+ * write to the other end, the process that invoked the
+ * lookup operation would get an unexpected SIGPIPE.)
+ */
+ mutex_enter(&fp_wr->f_tlock);
+ fp_wr->f_count++;
+ ASSERT(fp_wr->f_count >= 2);
+ mutex_exit(&fp_wr->f_tlock);
+
+ mutex_enter(&fp_rd->f_tlock);
+ fp_rd->f_count++;
+ ASSERT(fp_rd->f_count >= 2);
+ mutex_exit(&fp_rd->f_tlock);
+
+ /* Release all our locks. */
+ UF_EXIT(ufp_wr);
+ UF_EXIT(ufp_rd);
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+
+ /* Return the file pointers. */
+ *fpp_rd = fp_rd;
+ *fpp_wr = fp_wr;
+ return (0);
+}
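+
+/*
+ * For reference, the userland half of this handshake (as performed by the
+ * Linux automount daemon) looks roughly like the sketch below; the names
+ * and option values are illustrative rather than a verbatim trace:
+ *
+ *	int pfd[2];
+ *
+ *	(void) pipe(pfd);
+ *	(void) snprintf(opts, sizeof (opts),
+ *	    "fd=%d,pgrp=%d,minproto=2,maxproto=5,indirect",
+ *	    pfd[1], (int)getpgrp());
+ *	(void) mount("automount", "/mnt/net", "autofs", 0, opts);
+ *
+ * The daemon keeps both pipe ends open, which is why searching its file
+ * table for the read end (as done above) is a reasonable assumption.
+ */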
+
+static uint_t
+/*ARGSUSED*/
+lx_autofs_fifo_close_cb(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+{
+ int *id = (int *)arg;
+ /* Return the key and terminate the walk. */
+ *id = (uintptr_t)key;
+ return (MH_WALK_TERMINATE);
+}
+
+static void
+lx_autofs_fifo_close(lx_autofs_vfs_t *data)
+{
+ /*
+ * Close the fifo to prevent any future requests from
+ * getting sent to the automounter.
+ */
+ mutex_enter(&data->lav_lock);
+ if (data->lav_fifo_wr != NULL) {
+ (void) closef(data->lav_fifo_wr);
+ data->lav_fifo_wr = NULL;
+ }
+ if (data->lav_fifo_rd != NULL) {
+ (void) closef(data->lav_fifo_rd);
+ data->lav_fifo_rd = NULL;
+ }
+ mutex_exit(&data->lav_lock);
+
+ /*
+	 * Wake up any threads currently waiting for the automounter.
+	 * Note that it's possible for multiple threads to have entered
+ * this function and to be doing the work below simultaneously.
+ */
+ for (;;) {
+ lx_autofs_automnt_req_t *laar;
+ int id;
+
+ /* Lookup the first entry in the hash. */
+ id = -1;
+ mod_hash_walk(data->lav_id_hash,
+ lx_autofs_fifo_close_cb, &id);
+ if (id == -1) {
+ /* No more id's in the hash. */
+ break;
+ }
+ if ((laar = lx_autofs_la_find(data, id)) == NULL) {
+ /* Someone else beat us to it. */
+ continue;
+ }
+
+ /* Mark the request as complete and release it. */
+ lx_autofs_la_complete(data, laar);
+ lx_autofs_la_release(data, laar);
+ }
+}
+
+static int
+lx_autofs_fifo_verify_rd(lx_autofs_vfs_t *data)
+{
+ proc_t *prp;
+ uf_info_t *fip;
+ uf_entry_t *ufp_rd = NULL;
+ file_t *fp_rd = NULL;
+ vnode_t *vp_rd;
+ int i;
+
+	ASSERT(MUTEX_HELD(&data->lav_lock));
+
+ /* Check if we've already been shut down. */
+ if (data->lav_fifo_wr == NULL) {
+ ASSERT(data->lav_fifo_rd == NULL);
+ return (-1);
+ }
+ vp_rd = lx_autofs_fifo_peer_vp(data->lav_fifo_wr->f_vnode);
+
+ /*
+ * sprlock() is zone aware, so assuming this mount call was
+ * initiated by a process in a zone, if it tries to specify
+	 * a pgrp outside of its zone this call will fail.
+	 *
+	 * Also, we want to grab hold of the main automounter process,
+	 * which is going to be the group leader for pgrp, so its
+ * pid will be equal to pgrp.
+ */
+ prp = sprlock(data->lav_pgrp);
+ if (prp == NULL)
+ return (-1);
+ mutex_exit(&prp->p_lock);
+
+	/* Now we want to access the process's open file descriptors. */
+ fip = P_FINFO(prp);
+ mutex_enter(&fip->fi_lock);
+
+ /*
+	 * Now we need to find the read end of the fifo (for the reasons
+	 * explained in lx_autofs_fifo_lookup()). We assume the read end
+ * is in the same process as the write end.
+ */
+ for (i = 0; i < fip->fi_nfiles; i++) {
+ UF_ENTER(ufp_rd, fip, i);
+ if (((fp_rd = ufp_rd->uf_file) != NULL) &&
+ (fp_rd->f_vnode == vp_rd))
+ break;
+ UF_EXIT(ufp_rd);
+ }
+ if (i == fip->fi_nfiles) {
+ /* Didn't find it. */
+ mutex_exit(&fip->fi_lock);
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+ return (-1);
+ }
+
+ /*
+	 * It seems the automounter still has the read end of the fifo
+	 * open, so we're done here. Release all our locks and exit.
+ */
+ mutex_exit(&fip->fi_lock);
+ UF_EXIT(ufp_rd);
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+
+ return (0);
+}
+
+static int
+lx_autofs_fifo_write(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laarp)
+{
+ struct uio uio;
+ struct iovec iov;
+ file_t *fp_wr, *fp_rd;
+ int error;
+
+ /*
+	 * The catch here is we need to make sure _we_ don't close
+	 * the fifo while writing to it. (Another thread could come
+	 * along, realize the automounter process is gone, and close
+	 * the fifo.) To do this we bump the open count before we
+	 * write to the fifo.
+ */
+ mutex_enter(&data->lav_lock);
+ if (data->lav_fifo_wr == NULL) {
+ ASSERT(data->lav_fifo_rd == NULL);
+ mutex_exit(&data->lav_lock);
+ return (ENOENT);
+ }
+ fp_wr = data->lav_fifo_wr;
+ fp_rd = data->lav_fifo_rd;
+
+ /* Bump the open count on the write fifo. */
+ mutex_enter(&fp_wr->f_tlock);
+ fp_wr->f_count++;
+ mutex_exit(&fp_wr->f_tlock);
+
+ /* Bump the open count on the read fifo. */
+ mutex_enter(&fp_rd->f_tlock);
+ fp_rd->f_count++;
+ mutex_exit(&fp_rd->f_tlock);
+
+ mutex_exit(&data->lav_lock);
+
+ iov.iov_base = (caddr_t)&laarp->laar_pkt;
+ iov.iov_len = laarp->laar_pkt_size;
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_loffset = 0;
+ uio.uio_segflg = (short)UIO_SYSSPACE;
+ uio.uio_resid = laarp->laar_pkt_size;
+ uio.uio_llimit = 0;
+ uio.uio_fmode = FWRITE | FNDELAY | FNONBLOCK;
+
+ error = VOP_WRITE(fp_wr->f_vnode, &uio, 0, kcred, NULL);
+ (void) closef(fp_wr);
+ (void) closef(fp_rd);
+
+ /*
+ * After every write we verify that the automounter still has
+ * these files open.
+ */
+ mutex_enter(&data->lav_lock);
+ if (lx_autofs_fifo_verify_rd(data) != 0) {
+ /*
+ * Something happened to the automounter.
+ * Close down the communication pipe we setup.
+ */
+ mutex_exit(&data->lav_lock);
+ lx_autofs_fifo_close(data);
+ if (error != 0)
+ return (error);
+ return (ENOENT);
+ }
+ mutex_exit(&data->lav_lock);
+
+ return (error);
+}
+
+static int
+lx_autofs_bs_readdir(vnode_t *dvp, list_t *dir_stack, list_t *file_stack)
+{
+ struct iovec iov;
+ struct uio uio;
+ dirent64_t *dp, *dbuf;
+ vnode_t *vp;
+ size_t dlen, dbuflen;
+ int eof, error, ndirents = 64;
+ char *nm;
+
+ dlen = ndirents * (sizeof (*dbuf));
+ dbuf = kmem_alloc(dlen, KM_SLEEP);
+
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_fmode = 0;
+ uio.uio_extflg = UIO_COPY_CACHED;
+ uio.uio_loffset = 0;
+ uio.uio_llimit = MAXOFFSET_T;
+
+ eof = 0;
+ error = 0;
+ while (!error && !eof) {
+ uio.uio_resid = dlen;
+ iov.iov_base = (char *)dbuf;
+ iov.iov_len = dlen;
+
+ (void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
+ if (VOP_READDIR(dvp, &uio, kcred, &eof, NULL, 0) != 0) {
+ VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
+ kmem_free(dbuf, dlen);
+ return (-1);
+ }
+ VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
+
+ if ((dbuflen = dlen - uio.uio_resid) == 0) {
+ /* We're done. */
+ break;
+ }
+
+ for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
+ dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
+
+ nm = dp->d_name;
+
+ if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
+ continue;
+
+ if (VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, kcred,
+ NULL, NULL, NULL) != 0) {
+ kmem_free(dbuf, dlen);
+ return (-1);
+ }
+ if (vp->v_type == VDIR) {
+ if (dir_stack != NULL) {
+ lx_autofs_stack_push(dir_stack,
+ (caddr_t)dvp,
+ (caddr_t)vp, lx_autofs_strdup(nm));
+ } else {
+ VN_RELE(vp);
+ }
+ } else {
+ if (file_stack != NULL) {
+ lx_autofs_stack_push(file_stack,
+ (caddr_t)dvp,
+ (caddr_t)vp, lx_autofs_strdup(nm));
+ } else {
+ VN_RELE(vp);
+ }
+ }
+ }
+ }
+ kmem_free(dbuf, dlen);
+ return (0);
+}
+
+static void
+lx_autofs_bs_destroy(vnode_t *dvp, char *path)
+{
+ list_t search_stack;
+ list_t dir_stack;
+ list_t file_stack;
+ vnode_t *pdvp, *vp;
+ char *dpath, *fpath;
+ int ret;
+
+ if (VOP_LOOKUP(dvp, path, &vp, NULL, 0, NULL, kcred,
+ NULL, NULL, NULL) != 0) {
+ /* A directory entry with this name doesn't actually exist. */
+ return;
+ }
+
+	if (vp->v_type != VDIR) {
+ /* Easy, the directory entry is a file so delete it. */
+ VN_RELE(vp);
+ (void) VOP_REMOVE(dvp, path, kcred, NULL, 0);
+ return;
+ }
+
+ /*
+	 * The directory entry is a subdirectory, so now we have a bit more
+	 * work to do. (We'll have to recurse into the subdirectory.)
+	 * It would have been much easier to do this recursively, but kernel
+ * stacks are notoriously small.
+ */
+ ls_autofs_stack_init(&search_stack);
+ ls_autofs_stack_init(&dir_stack);
+ ls_autofs_stack_init(&file_stack);
+
+ /* Save our newfound subdirectory into a list. */
+ lx_autofs_stack_push(&search_stack, (caddr_t)dvp, (caddr_t)vp,
+ lx_autofs_strdup(path));
+
+ /* Do a recursive depth first search into the subdirectories. */
+ while (lx_autofs_stack_pop(&search_stack,
+ (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) {
+
+ /* Get a list of the subdirectories in this directory. */
+ if (lx_autofs_bs_readdir(dvp, &search_stack, NULL) != 0)
+ goto exit;
+
+		/* Save the current directory on a separate stack. */
+ lx_autofs_stack_push(&dir_stack, (caddr_t)pdvp, (caddr_t)dvp,
+ dpath);
+ }
+
+ /*
+ * Now dir_stack contains a list of directories, the deepest paths
+ * are at the top of the list. So let's go through and process them.
+ */
+ while (lx_autofs_stack_pop(&dir_stack,
+ (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) {
+
+ /* Get a list of the files in this directory. */
+ if (lx_autofs_bs_readdir(dvp, NULL, &file_stack) != 0) {
+ VN_RELE(dvp);
+ lx_autofs_strfree(dpath);
+ goto exit;
+ }
+
+ /* Delete all the files in this directory. */
+ while (lx_autofs_stack_pop(&file_stack,
+ NULL, (caddr_t *)&vp, &fpath) == 0) {
+			VN_RELE(vp);
+ ret = VOP_REMOVE(dvp, fpath, kcred, NULL, 0);
+ lx_autofs_strfree(fpath);
+ if (ret != 0) {
+ lx_autofs_strfree(dpath);
+ goto exit;
+ }
+ }
+
+ /* Delete this directory. */
+ VN_RELE(dvp);
+ ret = VOP_RMDIR(pdvp, dpath, pdvp, kcred, NULL, 0);
+ lx_autofs_strfree(dpath);
+ if (ret != 0)
+ goto exit;
+ }
+
+exit:
+ while (
+ (lx_autofs_stack_pop(&search_stack, NULL, (caddr_t *)&vp,
+ &path) == 0) ||
+ (lx_autofs_stack_pop(&dir_stack, NULL, (caddr_t *)&vp,
+ &path) == 0) ||
+ (lx_autofs_stack_pop(&file_stack, NULL, (caddr_t *)&vp,
+ &path) == 0)) {
+ VN_RELE(vp);
+ lx_autofs_strfree(path);
+ }
+ lx_autofs_stack_fini(&search_stack);
+ lx_autofs_stack_fini(&dir_stack);
+ lx_autofs_stack_fini(&file_stack);
+}
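+
+/*
+ * To illustrate the two passes above with a hypothetical tree "a/b/f"
+ * (where f is a regular file): the search pass pushes a and then b onto
+ * dir_stack, leaving the deepest directory on top. The second pass pops
+ * b, removes f, rmdirs b, then pops a and rmdirs a. Files are thus
+ * always removed before their parent directories.
+ */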
+
+static vnode_t *
+lx_autofs_bs_create(vnode_t *dvp, char *bs_name)
+{
+ vnode_t *vp;
+ vattr_t vattr;
+
+ /*
+ * After looking at the mkdir syscall path it seems we don't need
+ * to initialize all of the vattr_t structure.
+ */
+ bzero(&vattr, sizeof (vattr));
+ vattr.va_type = VDIR;
+ vattr.va_mode = 0755; /* u+rwx,og=rx */
+ vattr.va_mask = AT_TYPE|AT_MODE;
+
+ if (VOP_MKDIR(dvp, bs_name, &vattr, &vp, kcred, NULL, 0, NULL) != 0)
+ return (NULL);
+ return (vp);
+}
+
+static int
+lx_autofs_automounter_call(vnode_t *dvp, char *nm)
+{
+ lx_autofs_automnt_req_t *laar;
+ lx_autofs_vfs_t *data;
+ int error;
+ boolean_t is_dup;
+
+ /* Get a pointer to the vfs mount data. */
+ data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data;
+
+ /* The automounter only supports queries in the root directory. */
+ if (dvp != data->lav_root)
+ return (ENOENT);
+
+ /*
+	 * Check if the current process is in the automounter's process
+	 * group. (If it is, the current process is either the automounter
+	 * itself or one of its forked child processes.) If so, don't
+ * redirect this call back into the automounter because we'll
+ * hang.
+ */
+ mutex_enter(&pidlock);
+ if (data->lav_pgrp == curproc->p_pgrp) {
+ mutex_exit(&pidlock);
+ return (ENOENT);
+ }
+ mutex_exit(&pidlock);
+
+ /* Verify that the automount process pipe still exists. */
+ mutex_enter(&data->lav_lock);
+ if (data->lav_fifo_wr == NULL) {
+ ASSERT(data->lav_fifo_rd == NULL);
+ mutex_exit(&data->lav_lock);
+ return (ENOENT);
+ }
+ mutex_exit(&data->lav_lock);
+
+ /* Allocate an automounter request structure. */
+ if ((laar = lx_autofs_la_alloc(data, &is_dup, B_FALSE,
+ nm)) == NULL)
+ return (ENOENT);
+
+ /*
+ * If we were the first one to allocate this request then we
+ * need to send it to the automounter.
+ */
+ if ((!is_dup) &&
+ ((error = lx_autofs_fifo_write(data, laar)) != 0)) {
+ /*
+ * Unable to send the request to the automounter.
+ * Unblock any other threads waiting on the request
+ * and release the request.
+ */
+ lx_autofs_la_complete(data, laar);
+ lx_autofs_la_release(data, laar);
+ return (error);
+ }
+
+ /* Wait for someone to signal us that this request has completed. */
+ mutex_enter(&laar->laar_lock);
+ while (!laar->laar_complete) {
+ if (cv_wait_sig(&laar->laar_cv, &laar->laar_lock) == 0) {
+ /* We got a signal, abort this call. */
+ mutex_exit(&laar->laar_lock);
+ lx_autofs_la_abort(data, laar);
+ return (EINTR);
+ }
+ }
+ mutex_exit(&laar->laar_lock);
+
+ if (laar->laar_result == LXACR_READY) {
+ /*
+		 * Mount succeeded; keep track of it for future expire calls.
+		 *
+		 * (Note: lav_vn_hash in the vfs data, used by
+		 * lx_autofs_vn_alloc(), might offer another way to iterate
+		 * the mounts under this autofs.)
+ */
+ lx_autofs_mntent_t *mp;
+
+ mp = kmem_zalloc(sizeof (lx_autofs_mntent_t), KM_SLEEP);
+ mp->lxafme_len = strlen(nm) + 1;
+ mp->lxafme_path = kmem_zalloc(mp->lxafme_len, KM_SLEEP);
+ mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64());
+ (void) strlcpy(mp->lxafme_path, nm, mp->lxafme_len);
+
+ mutex_enter(&data->lav_lock);
+ list_insert_tail(&data->lav_mnt_list, mp);
+ mutex_exit(&data->lav_lock);
+ }
+
+ lx_autofs_la_release(data, laar);
+
+ return (0);
+}
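+
+/*
+ * To summarize the request lifecycle above: a lookup allocates a request
+ * (lx_autofs_la_alloc), writes a missing/expire packet up the fifo
+ * (lx_autofs_fifo_write) and blocks on laar_cv. The automounter performs
+ * the mount and acks via the READY or FAIL ioctl, which arrives at
+ * lx_autofs_ack() -> lx_autofs_la_complete() and wakes the waiter. On
+ * success the waiter records the mount on lav_mnt_list for later expiry.
+ */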
+
+/*
+ * Same preliminary checks as in lx_autofs_unmount.
+ */
+static boolean_t
+lx_autofs_may_unmount(vfs_t *vfsp, struct cred *cr)
+{
+ lx_autofs_vfs_t *data;
+
+ if (secpolicy_fs_unmount(cr, vfsp) != 0)
+ return (B_FALSE);
+
+ /*
+ * We should never have a reference count of less than 2: one for the
+ * caller, one for the root vnode.
+ */
+ ASSERT(vfsp->vfs_count >= 2);
+
+ /* If there are any outstanding vnodes, we can't unmount. */
+ if (vfsp->vfs_count > 2)
+ return (B_FALSE);
+
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ ASSERT(data->lav_root->v_vfsp == vfsp);
+
+ /* Check for any remaining holds on the root vnode. */
+ if (data->lav_root->v_count > 1)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static vfs_t *
+lx_autofs_get_mountvfs(char *fs_mntpt, int *cnt)
+{
+ struct vfs *vfsp;
+ struct vfs *vfslist;
+ vfs_t *fnd_vfs = NULL;
+ int fsmplen;
+ int acnt = 0;
+
+ fsmplen = strlen(fs_mntpt);
+
+ vfs_list_read_lock();
+
+ vfsp = vfslist = curzone->zone_vfslist;
+ if (vfslist == NULL) {
+ vfs_list_unlock();
+ *cnt = 0;
+ return (NULL);
+ }
+
+ do {
+ /* Skip mounts we shouldn't show. */
+ if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) {
+ char *mntpt;
+
+ mntpt = (char *)refstr_value(vfsp->vfs_mntpt);
+ if (strncmp(fs_mntpt, mntpt, fsmplen) == 0 &&
+ (mntpt[fsmplen] == '\0' || mntpt[fsmplen] == '/')) {
+ /*
+ * We'll return the first one we find but don't
+ * return a mount that is actually autofs (i.e.
+ * autofs direct or offset mount).
+ */
+ if (vfsp->vfs_op == lx_autofs_vfsops) {
+ acnt++;
+ } else if (fnd_vfs == NULL) {
+ fnd_vfs = vfsp;
+					VFS_HOLD(fnd_vfs);
+ }
+ }
+ }
+ vfsp = vfsp->vfs_zone_next;
+ } while (vfsp != vfslist);
+
+ vfs_list_unlock();
+
+ *cnt = acnt;
+ return (fnd_vfs);
+}
+
+/*
+ * Unmount all autofs offset mounts below the given path.
+ */
+static boolean_t
+lx_autofs_umount_offset(char *fs_mntpt, struct cred *cr)
+{
+ struct vfs *vfsp;
+ struct vfs *vfslist;
+ boolean_t busy = B_FALSE;
+ int fsmplen = strlen(fs_mntpt);
+
+restart:
+ vfs_list_read_lock();
+
+ vfsp = vfslist = curzone->zone_vfslist;
+ if (vfslist == NULL) {
+ vfs_list_unlock();
+ return (B_FALSE);
+ }
+
+ do {
+ char *mntpt;
+ lx_autofs_vfs_t *data;
+
+ /* Skip mounts we should ignore. */
+ if ((vfsp->vfs_flag & VFS_NOMNTTAB)) {
+ vfsp = vfsp->vfs_zone_next;
+ continue;
+ }
+
+ mntpt = (char *)refstr_value(vfsp->vfs_mntpt);
+ if (strncmp(fs_mntpt, mntpt, fsmplen) != 0 ||
+ (mntpt[fsmplen] != '\0' && mntpt[fsmplen] != '/')) {
+ vfsp = vfsp->vfs_zone_next;
+ continue;
+ }
+
+ if (vfsp->vfs_op != lx_autofs_vfsops) {
+ /*
+ * Something got mounted over the autofs mountpoint
+			 * after we checked that this indirect hierarchy was
+ * not busy.
+ */
+ busy = B_TRUE;
+ break;
+ }
+
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ if (data->lav_mnttype != LXAMT_OFFSET) {
+ /*
+ * Something mounted a non-offset autofs fs under this
+ * indirect mnt!
+ */
+ busy = B_TRUE;
+ break;
+ }
+
+ /*
+ * Attempt to umount - set busy if fails.
+ *
+ * umount2_engine will call VFS_RELE, so we need to take an
+ * extra hold to match the behavior during the normal umount
+ * path.
+ *
+ * We also need to drop the list lock to prevent deadlock
+ * during umount.
+ */
+ VFS_HOLD(vfsp);
+ vfs_list_unlock();
+ if (umount2_engine(vfsp, 0, cr, 0) != 0) {
+ busy = B_TRUE;
+ goto errexit;
+ }
+
+ /* Retake list lock and look for more. */
+ goto restart;
+ } while (vfsp != vfslist);
+
+ vfs_list_unlock();
+
+errexit:
+ return (busy);
+}
+
+
+/*
+ * Note that lx_autofs_automounter_call() only supports queries in the root
+ * directory, so all mntent names are relative to that.
+ */
+static int
+lx_autofs_expire(vfs_t *vfsp, struct cred *cr)
+{
+ lx_autofs_vfs_t *data;
+ lx_autofs_mntent_t *mp;
+ lx_autofs_automnt_req_t *laar;
+ boolean_t is_dup;
+ vfs_t *fnd_vfs;
+ int autofs_cnt;
+ boolean_t busy = B_FALSE;
+ char exp_path[MAXPATHLEN];
+
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+
+ /*
+ * We process only the first element (i.e. do not do multi). This
+ * works fine for the automounter.
+ */
+ mutex_enter(&data->lav_lock);
+ mp = (lx_autofs_mntent_t *)list_remove_head(&data->lav_mnt_list);
+ mutex_exit(&data->lav_lock);
+ if (mp == NULL) {
+ if (data->lav_mnttype == LXAMT_OFFSET) {
+ /*
+ * During restart the automounter will openmount each
+ * offset mount for management. It won't closemount the
+ * offset mount until we expire it, even though nothing
+ * is mounted over that offset. We handle this as a
+ * special expiration case.
+ */
+ int cnt;
+
+ mutex_enter(&data->lav_lock);
+ cnt = data->lav_openmnt_cnt;
+ mutex_exit(&data->lav_lock);
+
+ if (cnt == 1 && vn_ismntpt(data->lav_root) == 0) {
+ char *mntpt = (char *)
+ refstr_value(vfsp->vfs_mntpt);
+ char *nm = ZONE_PATH_TRANSLATE(mntpt, curzone);
+
+ mp = kmem_zalloc(sizeof (lx_autofs_mntent_t),
+ KM_SLEEP);
+ mp->lxafme_len = strlen(nm) + 1;
+ mp->lxafme_path = kmem_zalloc(mp->lxafme_len,
+ KM_SLEEP);
+ mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64());
+ (void) strlcpy(mp->lxafme_path, nm,
+ mp->lxafme_len);
+
+ goto exp_offset;
+ }
+ }
+
+ return (EAGAIN);
+ }
+
+ /*
+ * We only return an expired mount if it is inactive for the full
+ * timeout. This reduces overly aggressive umount/mount activity.
+ */
+ if (data->lav_timeout > 0) {
+ uint64_t now = TICK_TO_SEC(ddi_get_lbolt64());
+
+ if ((now - mp->lxafme_ts) < data->lav_timeout) {
+ /* put it back at the end of the line */
+ mutex_enter(&data->lav_lock);
+ list_insert_tail(&data->lav_mnt_list, mp);
+ mutex_exit(&data->lav_lock);
+ return (EAGAIN);
+ }
+ }
+
+ if (data->lav_mnttype == LXAMT_INDIR) {
+ (void) snprintf(exp_path, sizeof (exp_path), "%s/%s",
+ (char *)refstr_value(vfsp->vfs_mntpt), mp->lxafme_path);
+ } else {
+ (void) strlcpy(exp_path, (char *)refstr_value(vfsp->vfs_mntpt),
+ sizeof (exp_path));
+ }
+
+ fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt);
+ if (fnd_vfs != NULL) {
+ boolean_t skip = B_FALSE;
+ vfssw_t *vfssw;
+
+ /*
+ * If it's an NFS file system (typical) then we check in
+ * advance to see if it can be unmounted, otherwise, proceed.
+ * The fs-specific umount attempted by the automounter will
+ * either succeed or fail. Both are valid outcomes but checking
+ * now for nfs will save a bunch of work by the automounter
+ * if the fs is busy.
+ *
+ * Unfortunately, for NFS the vfs_fstype is the same for all
+ * versions of NFS, so we need to check the vfs_op member to
+ * determine which version of NFS we're dealing with.
+ */
+ if (!skip && (vfssw = vfs_getvfssw("nfs4")) != NULL) {
+ if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) {
+ (void) dnlc_purge_vfsp(fnd_vfs, 0);
+ if (check_rtable4(fnd_vfs))
+ busy = B_TRUE;
+ skip = B_TRUE;
+ }
+ vfs_unrefvfssw(vfssw);
+ }
+
+ if (!skip && (vfssw = vfs_getvfssw("nfs3")) != NULL) {
+ if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) {
+ (void) dnlc_purge_vfsp(fnd_vfs, 0);
+ if (check_rtable(fnd_vfs))
+ busy = B_TRUE;
+ }
+ vfs_unrefvfssw(vfssw);
+ }
+
+ VFS_RELE(fnd_vfs);
+
+ } else if (autofs_cnt > 0) {
+ /*
+ * The automounter is asking us to expire and we pulled this
+ * name from our vfs mountpoint list, but if
+ * lx_autofs_get_mountvfs returns null then that means we
+ * didn't find a non-autofs mount under this name. Thus, the
+		 * name could be a subdirectory under an autofs top-level
+ * indirect mount with one or more offset mounts below.
+ * autofs_cnt will indicate how many autofs mounts exist below
+ * this subdirectory name.
+ *
+ * The automounter will take care of unmounting any fs mounted
+ * over one of these offset mounts (i.e. offset is like a
+ * direct mount which the automounter will manage) but the
+ * automounter will not unmount the actual autofs offset mount
+ * itself, so we have to do that before we can expire the
+		 * top-level subdirectory name.
+ */
+ busy = lx_autofs_umount_offset(exp_path, cr);
+ }
+
+ if (busy) {
+ /*
+ * Can't unmount this one right now, put it at the end of the
+ * list and return. The caller will return EAGAIN for the
+ * expire ioctl and the automounter will check again later.
+ */
+ mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64());
+ mutex_enter(&data->lav_lock);
+ list_insert_tail(&data->lav_mnt_list, mp);
+ mutex_exit(&data->lav_lock);
+ return (EAGAIN);
+ }
+
+ /*
+ * See lx_autofs_automounter_call. We want to send a msg up the pipe
+ * to the automounter in a similar way.
+ */
+
+exp_offset:
+ /* Verify that the automount process pipe still exists. */
+ mutex_enter(&data->lav_lock);
+ if (data->lav_fifo_wr == NULL) {
+ ASSERT(data->lav_fifo_rd == NULL);
+ mutex_exit(&data->lav_lock);
+ goto err_free;
+ }
+ mutex_exit(&data->lav_lock);
+
+ /* Allocate an automounter expire structure. */
+ if ((laar = lx_autofs_la_alloc(data, &is_dup, B_TRUE,
+ mp->lxafme_path)) == NULL)
+ goto err_free;
+
+ /*
+ * If we were the first one to allocate this request then we
+ * need to send it to the automounter.
+ */
+ if (!is_dup && lx_autofs_fifo_write(data, laar) != 0) {
+ /*
+ * Unable to send the request to the automounter.
+ * Unblock any other threads waiting on the request
+ * and release the request.
+ */
+ lx_autofs_la_complete(data, laar);
+ lx_autofs_la_release(data, laar);
+ goto err_free;
+ }
+
+ /* Wait for someone to signal us that this request has completed. */
+ mutex_enter(&laar->laar_lock);
+ while (!laar->laar_complete) {
+ if (cv_wait_sig(&laar->laar_cv, &laar->laar_lock) == 0) {
+ /* We got a signal, abort this request. */
+ mutex_exit(&laar->laar_lock);
+ lx_autofs_la_abort(data, laar);
+ goto err_free;
+ }
+ }
+ mutex_exit(&laar->laar_lock);
+
+ /*
+ * If it failed or if the file system is still mounted after we get the
+ * response from our expire msg, then that means the automounter tried
+ * to unmount it but failed because the file system is busy, so we put
+ * this entry back on our list to try to expire it again later.
+ */
+ fnd_vfs = NULL;
+ if (laar->laar_result == LXACR_FAIL ||
+ (fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt)) != NULL ||
+ autofs_cnt > 0) {
+ if (fnd_vfs != NULL)
+ VFS_RELE(fnd_vfs);
+ mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64());
+ mutex_enter(&data->lav_lock);
+ list_insert_tail(&data->lav_mnt_list, mp);
+ mutex_exit(&data->lav_lock);
+ } else {
+ kmem_free(mp->lxafme_path, mp->lxafme_len);
+ kmem_free(mp, sizeof (lx_autofs_mntent_t));
+ }
+
+ lx_autofs_la_release(data, laar);
+ return (0);
+
+err_free:
+ kmem_free(mp->lxafme_path, mp->lxafme_len);
+ kmem_free(mp, sizeof (lx_autofs_mntent_t));
+ return (EAGAIN);
+}
+
+static int
+lx_autofs_ack(int reqid, vfs_t *vfsp, enum lx_autofs_callres result)
+{
+ lx_autofs_vfs_t *data;
+ lx_autofs_automnt_req_t *laar;
+
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ if ((laar = lx_autofs_la_find(data, reqid)) == NULL)
+ return (ENXIO);
+
+ /* Mark the request as complete and release it. */
+ laar->laar_result = result;
+ lx_autofs_la_complete(data, laar);
+ lx_autofs_la_release(data, laar);
+ return (0);
+}
+
+static int
+lx_autofs_automounter_ioctl(vnode_t *vp, int cmd, intptr_t arg, cred_t *cr)
+{
+ lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data;
+ int id = arg;
+ int v;
+ int err;
+
+ /*
+ * Be strict.
+ * We only accept ioctls from the automounter process group.
+ */
+ mutex_enter(&pidlock);
+ if (data->lav_pgrp != curproc->p_pgrp) {
+ mutex_exit(&pidlock);
+ return (ENOENT);
+ }
+ mutex_exit(&pidlock);
+
+ switch (cmd) {
+ case LX_AUTOFS_IOC_READY:
+ if ((err = lx_autofs_ack(id, vp->v_vfsp, LXACR_READY)) != 0)
+ return (err);
+ return (0);
+
+ case LX_AUTOFS_IOC_FAIL:
+ if ((err = lx_autofs_ack(id, vp->v_vfsp, LXACR_FAIL)) != 0)
+ return (err);
+ return (0);
+
+ case LX_AUTOFS_IOC_CATATONIC:
+ /* The automounter is shutting down. */
+ lx_autofs_fifo_close(data);
+ return (0);
+
+ case LX_AUTOFS_IOC_PROTOVER:
+ v = LX_AUTOFS_PROTO_VERS5;
+ if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0)
+ return (EFAULT);
+ return (0);
+
+ case LX_AUTOFS_IOC_PROTOSUBVER:
+ v = LX_AUTOFS_PROTO_SUBVERSION;
+ if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0)
+ return (EFAULT);
+ return (0);
+
+ case LX_AUTOFS_IOC_ASKUMOUNT:
+ /*
+ * This is asking if autofs can be unmounted, not asking to
+ * actually unmount it. We return 1 if it is busy or 0 if it
+ * can be unmounted.
+ */
+ v = 1;
+ if (lx_autofs_may_unmount(vp->v_vfsp, cr))
+ v = 0;
+
+ if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0)
+ return (EFAULT);
+ return (0);
+
+ case LX_AUTOFS_IOC_SETTIMEOUT:
+ if (copyin((caddr_t)arg, &data->lav_timeout, sizeof (ulong_t))
+ != 0)
+ return (EFAULT);
+ return (0);
+
+ case LX_AUTOFS_IOC_EXPIRE:
+ return (ENOTSUP);
+
+ case LX_AUTOFS_IOC_EXPIRE_MULTI:
+ lx_autofs_expire(vp->v_vfsp, cr);
+ return (EAGAIN);
+
+ default:
+ ASSERT(0);
+ return (ENOTSUP);
+ }
+}
+
+static int
+lx_autofs_parse_mntopt(vfs_t *vfsp, lx_autofs_vfs_t *data)
+{
+ char *fd_str, *pgrp_str, *minproto_str, *maxproto_str;
+ int fd, pgrp, minproto, maxproto;
+ file_t *fp_wr, *fp_rd;
+
+ /* Require these options to be present. */
+ if ((vfs_optionisset(vfsp, LX_MNTOPT_FD, &fd_str) != 1) ||
+ (vfs_optionisset(vfsp, LX_MNTOPT_PGRP, &pgrp_str) != 1) ||
+ (vfs_optionisset(vfsp, LX_MNTOPT_MINPROTO, &minproto_str) != 1) ||
+ (vfs_optionisset(vfsp, LX_MNTOPT_MAXPROTO, &maxproto_str) != 1))
+ return (EINVAL);
+
+ /* Get the values for each parameter. */
+ if ((lx_autofs_str_to_int(fd_str, &fd) != 0) ||
+ (lx_autofs_str_to_int(pgrp_str, &pgrp) != 0) ||
+ (lx_autofs_str_to_int(minproto_str, &minproto) != 0) ||
+ (lx_autofs_str_to_int(maxproto_str, &maxproto) != 0))
+ return (EINVAL);
+
+ /*
+	 * We primarily support v2 & v5 of the Linux kernel automounter
+ * protocol. The userland daemon typically needs v5. We'll reject
+ * unsupported ioctls later if we get one.
+ */
+ if ((minproto > 5) || (maxproto < 2))
+ return (EINVAL);
+
+ /*
+	 * Now we need to look up the fifos we'll be using
+ * to talk to the userland automounter process.
+ */
+ if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) {
+ /*
+ * The automounter doesn't always have the same id as the pgrp.
+ * This happens when it is started via one of the various
+ * service managers. In this case the fifo lookup will fail
+ * so we retry with our own pid.
+ */
+ int pid = (int)curproc->p_pid;
+
+ if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0)
+ return (EINVAL);
+ }
+
+ if (vfs_optionisset(vfsp, LX_MNTOPT_INDIRECT, NULL)) {
+ data->lav_mnttype = LXAMT_INDIR;
+ }
+ if (vfs_optionisset(vfsp, LX_MNTOPT_DIRECT, NULL)) {
+ if (data->lav_mnttype != LXAMT_NONE)
+ return (EINVAL);
+ data->lav_mnttype = LXAMT_DIRECT;
+ }
+ if (vfs_optionisset(vfsp, LX_MNTOPT_OFFSET, NULL)) {
+ if (data->lav_mnttype != LXAMT_NONE)
+ return (EINVAL);
+ data->lav_mnttype = LXAMT_OFFSET;
+ }
+	/* The automounter does test mounts with none of these options set. */
+ if (data->lav_mnttype == LXAMT_NONE)
+ data->lav_mnttype = LXAMT_DIRECT;
+
+ /* Save the mount options and fifo pointers. */
+ data->lav_fd = fd;
+ data->lav_min_proto = minproto;
+ data->lav_pgrp = pgrp;
+ data->lav_fifo_rd = fp_rd;
+ data->lav_fifo_wr = fp_wr;
+ return (0);
+}
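+
+/*
+ * A sketch of the option combinations accepted above (values are
+ * illustrative): "fd=5,pgrp=123,minproto=5,maxproto=5" alone is treated
+ * as a direct test mount; adding exactly one of "indirect", "direct" or
+ * "offset" selects the mount type, and supplying more than one of those
+ * is rejected with EINVAL.
+ */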
+
+static uint64_t
+s2l_dev(dev_t dev)
+{
+ major_t maj = getmajor(dev);
+ minor_t min = getminor(dev);
+
+ return (LX_MAKEDEVICE(maj, min));
+}
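+
+/*
+ * LX_MAKEDEVICE is assumed to pack the major/minor pair into the modern
+ * Linux 32-bit device encoding, i.e. roughly:
+ *
+ *	(min & 0xff) | (maj << 8) | ((min & ~0xff) << 12)
+ *
+ * with the minor split across bits 0-7 and 20+, and the major in bits
+ * 8-19.
+ */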
+
+/*
+ * VFS entry points
+ */
+static int
+lx_autofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+ lx_autofs_vfs_t *data;
+ dev_t dev;
+ char name[40];
+ int error;
+ vattr_t va;
+
+ if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+ return (EPERM);
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count > 1 || (mvp->v_flag & VROOT)))
+ return (EBUSY);
+
+ /* We don't support mounts in the global zone. */
+ if (getzoneid() == GLOBAL_ZONEID)
+ return (EPERM);
+
+ /*
+	 * Offset mounts will occur below the top-level mountpoint, so we
+	 * need to allow for autofs mounts even though mvp is itself an
+	 * autofs vnode.
+ */
+
+ /* Allocate a vfs struct. */
+ data = kmem_zalloc(sizeof (lx_autofs_vfs_t), KM_SLEEP);
+
+ /* Parse mount options. */
+ if ((error = lx_autofs_parse_mntopt(vfsp, data)) != 0) {
+ kmem_free(data, sizeof (lx_autofs_vfs_t));
+ return (error);
+ }
+
+ /* Initialize the backing store. */
+ lx_autofs_bs_destroy(mvp, LX_AUTOFS_BS_DIR);
+ data->lav_bs_vp = lx_autofs_bs_create(mvp, LX_AUTOFS_BS_DIR);
+ if (data->lav_bs_vp == NULL) {
+ kmem_free(data, sizeof (lx_autofs_vfs_t));
+ return (EBUSY);
+ }
+ data->lav_bs_name = LX_AUTOFS_BS_DIR;
+
+ /* Get the backing store inode for use in v5 protocol msgs */
+ va.va_mask = AT_STAT;
+ if ((error = VOP_GETATTR(data->lav_bs_vp, &va, 0, cr, NULL)) != 0) {
+ kmem_free(data, sizeof (lx_autofs_vfs_t));
+ return (error);
+ }
+ data->lav_ino = va.va_nodeid;
+
+ /* We have to hold the underlying vnode we're mounted on. */
+ data->lav_mvp = mvp;
+ VN_HOLD(mvp);
+
+ /* Initialize vfs fields */
+ vfsp->vfs_bsize = DEV_BSIZE;
+ vfsp->vfs_fstype = lx_autofs_fstype;
+ vfsp->vfs_data = data;
+
+ /* Invent a dev_t (sigh) */
+ do {
+ dev = makedevice(lx_autofs_major,
+ atomic_add_32_nv(&lx_autofs_minor, 1) & L_MAXMIN32);
+ } while (vfs_devismounted(dev));
+ vfsp->vfs_dev = dev;
+ vfs_make_fsid(&vfsp->vfs_fsid, dev, lx_autofs_fstype);
+
+ data->lav_dev = s2l_dev(vfsp->vfs_dev);
+
+ /* Create an id space arena for automounter requests. */
+ (void) snprintf(name, sizeof (name), "lx_autofs_id_%d",
+ getminor(vfsp->vfs_dev));
+ data->lav_ids = id_space_create(name, 1, INT_MAX);
+
+ /* Create hashes to keep track of automounter requests. */
+ mutex_init(&data->lav_lock, NULL, MUTEX_DEFAULT, NULL);
+ (void) snprintf(name, sizeof (name), "lx_autofs_path_hash_%d",
+ getminor(vfsp->vfs_dev));
+ data->lav_path_hash = mod_hash_create_strhash(name,
+ LX_AUTOFS_VFS_PATH_HASH_SIZE, mod_hash_null_valdtor);
+ (void) snprintf(name, sizeof (name), "lx_autofs_id_hash_%d",
+ getminor(vfsp->vfs_dev));
+ data->lav_id_hash = mod_hash_create_idhash(name,
+ LX_AUTOFS_VFS_ID_HASH_SIZE, mod_hash_null_valdtor);
+
+ /* Create a hash to keep track of vnodes. */
+ (void) snprintf(name, sizeof (name), "lx_autofs_vn_hash_%d",
+ getminor(vfsp->vfs_dev));
+ data->lav_vn_hash = mod_hash_create_ptrhash(name,
+ LX_AUTOFS_VFS_VN_HASH_SIZE, mod_hash_null_valdtor,
+ sizeof (vnode_t));
+
+ list_create(&data->lav_mnt_list, sizeof (lx_autofs_mntent_t),
+ offsetof(lx_autofs_mntent_t, lxafme_lst));
+
+ /* Create root vnode */
+ data->lav_root = lx_autofs_vn_alloc(vfsp, data->lav_bs_vp);
+
+ data->lav_root->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP;
+
+ /*
+ * For a direct mountpoint we need to allow a filesystem to be
+ * mounted overtop of this autofs mount. Otherwise, disallow that.
+ */
+ if (data->lav_mnttype == LXAMT_INDIR)
+ data->lav_root->v_flag |= VNOMOUNT;
+
+ return (0);
+}
+
+static int
+lx_autofs_unmount(vfs_t *vfsp, int flag, struct cred *cr)
+{
+ lx_autofs_vfs_t *data;
+
+ if (secpolicy_fs_unmount(cr, vfsp) != 0)
+ return (EPERM);
+
+ /* We do not currently support forced unmounts. */
+ if (flag & MS_FORCE)
+ return (ENOTSUP);
+
+ /*
+ * We should never have a reference count of less than 2: one for the
+ * caller, one for the root vnode.
+ */
+ ASSERT(vfsp->vfs_count >= 2);
+
+ /* If there are any outstanding vnodes, we can't unmount. */
+ if (vfsp->vfs_count > 2)
+ return (EBUSY);
+
+ /* Check for any remaining holds on the root vnode. */
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ ASSERT(data->lav_root->v_vfsp == vfsp);
+ if (data->lav_root->v_count > 1)
+ return (EBUSY);
+
+ /* Close the fifo to the automount process. */
+ if (data->lav_fifo_wr != NULL)
+ (void) closef(data->lav_fifo_wr);
+ if (data->lav_fifo_rd != NULL)
+ (void) closef(data->lav_fifo_rd);
+
+ /*
+ * We have to release our hold on our root vnode before we can
+ * delete the backing store. (Since the root vnode is linked
+ * to the backing store.)
+ */
+ VN_RELE(data->lav_root);
+
+ /* Cleanup the backing store. */
+ lx_autofs_bs_destroy(data->lav_mvp, data->lav_bs_name);
+ VN_RELE(data->lav_mvp);
+
+ /*
+ * Delete all listed mounts.
+ */
+ for (;;) {
+ lx_autofs_mntent_t *mp;
+
+ mp = list_remove_head(&data->lav_mnt_list);
+ if (mp == NULL)
+ break;
+ kmem_free(mp->lxafme_path, mp->lxafme_len);
+ kmem_free(mp, sizeof (lx_autofs_mntent_t));
+ }
+
+	/* Clean up our remaining data structures. */
+ mod_hash_destroy_strhash(data->lav_path_hash);
+ mod_hash_destroy_idhash(data->lav_id_hash);
+ mod_hash_destroy_ptrhash(data->lav_vn_hash);
+ id_space_destroy(data->lav_ids);
+ list_destroy(&data->lav_mnt_list);
+ kmem_free(data, sizeof (lx_autofs_vfs_t));
+
+ return (0);
+}
+
+static int
+lx_autofs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+
+ *vpp = data->lav_root;
+ VN_HOLD(*vpp);
+
+ return (0);
+}
+
+static int
+lx_autofs_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+ lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ vnode_t *urvp = data->lav_root->v_data;
+ dev32_t d32;
+ int error;
+
+ if ((error = VFS_STATVFS(urvp->v_vfsp, sp)) != 0)
+ return (error);
+
+ /* Update some of values before returning. */
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sp->f_fsid = d32;
+ (void) strlcpy(sp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name,
+ sizeof (sp->f_basetype));
+ sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ bzero(sp->f_fstr, sizeof (sp->f_fstr));
+ return (0);
+}
+
+static const fs_operation_def_t lx_autofs_vfstops[] = {
+ { VFSNAME_MOUNT, { .vfs_mount = lx_autofs_mount } },
+ { VFSNAME_UNMOUNT, { .vfs_unmount = lx_autofs_unmount } },
+ { VFSNAME_ROOT, { .vfs_root = lx_autofs_root } },
+ { VFSNAME_STATVFS, { .vfs_statvfs = lx_autofs_statvfs } },
+ { NULL, NULL }
+};
+
+/*
+ * VOP entry points - simple passthrough
+ *
+ * For most VOP entry points we can simply pass the request on to
+ * the underlying filesystem we're mounted on.
+ */
+static int
+lx_autofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ctp)
+{
+ vnode_t *uvp = vp->v_data;
+ return (VOP_CLOSE(uvp, flag, count, offset, cr, ctp));
+}
+
+static int
+lx_autofs_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ctp, int flags)
+{
+ vnode_t *uvp = vp->v_data;
+ return (VOP_READDIR(uvp, uiop, cr, eofp, ctp, flags));
+}
+
+static int
+lx_autofs_access(vnode_t *vp, int mode, int flags, cred_t *cr,
+ caller_context_t *ctp)
+{
+ vnode_t *uvp = vp->v_data;
+ return (VOP_ACCESS(uvp, mode, flags, cr, ctp));
+}
+
+static int
+lx_autofs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+ vnode_t *uvp = vp->v_data;
+ return (VOP_RWLOCK(uvp, write_lock, ctp));
+}
+
+static void
+lx_autofs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+ vnode_t *uvp = vp->v_data;
+ VOP_RWUNLOCK(uvp, write_lock, ctp);
+}
+
+/*
+ * Check if attempting to access a 'direct' mount and if so, call the
+ * automounter to perform the mount. Once the mount occurs, the new filesystem
+ * will be mounted overtop of this autofs mountpoint and we will no longer
+ * come through this path.
+ */
+static vnode_t *
+lx_autofs_do_direct(vnode_t *vp)
+{
+ vfs_t *vfsp = vp->v_vfsp;
+ lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ vnode_t *nvp;
+ boolean_t skip_am_call = B_FALSE;
+
+ if (data->lav_mnttype == LXAMT_INDIR)
+ return (NULL);
+
+ /*
+ * Check if the current process is in the automounter's process group.
+ * If it is, the current process is either the automounter itself or
+	 * one of its children. If so, don't call back into the automounter.
+ */
+ mutex_enter(&pidlock);
+ if (data->lav_pgrp == curproc->p_pgrp) {
+ skip_am_call = B_TRUE;
+ }
+ mutex_exit(&pidlock);
+
+ /*
+ * It is possible there is already a new fs mounted on top of our vnode.
+ * This can happen if the caller first did a lookup of a file name
+ * using our vnode as the directory vp. The lookup would trigger the
+ * autofs mount on top of ourself, but if the caller then uses our
+ * vnode to do a getattr on the directory, it will use the autofs
+ * vnode and not the newly mounted vnode. We need to skip re-calling
+ * the automounter for this case.
+ */
+ if (!skip_am_call && vn_mountedvfs(vp) == NULL) {
+ char tbuf[MAXPATHLEN];
+ char *nm;
+
+ (void) strlcpy(tbuf, (char *)refstr_value(vfsp->vfs_mntpt),
+ sizeof (tbuf));
+ nm = tbuf + strlen(tbuf);
+ while (*nm != '/' && nm != tbuf)
+ nm--;
+ if (*nm == '/')
+ nm++;
+ (void) lx_autofs_automounter_call(vp, nm);
+ }
+
+ /*
+ * We need to take an extra hold on our vp (which is the autofs
+ * root vp) to account for the rele done in traverse. traverse will
+ * take a hold on the new vp so the caller is responsible for calling
+ * VN_RELE on the returned vp.
+ */
+ VN_HOLD(vp);
+ nvp = vp;
+ if (traverse(&nvp) != 0) {
+ VN_RELE(nvp);
+ return (NULL);
+ }
+
+ /* Confirm that we have a non-autofs fs mounted now */
+ if (nvp->v_op == lx_autofs_vn_ops) {
+ VN_RELE(nvp);
+ return (NULL);
+ }
+
+ return (nvp);
+}
+
+/*ARGSUSED*/
+static int
+lx_autofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
+ caller_context_t *ctp, int flags)
+{
+ vnode_t *udvp = dvp->v_data;
+ vnode_t *nvp;
+
+ /* handle direct mount here */
+ if ((nvp = lx_autofs_do_direct(dvp)) != NULL) {
+ int error;
+
+ error = VOP_RMDIR(nvp, nm, cdir, cr, ctp, flags);
+ VN_RELE(nvp);
+ return (error);
+ }
+
+ /*
+	 * cdir is the calling process's current directory.
+	 * If cdir is an lx_autofs vnode, then get its real underlying
+	 * vnode pointer. (It seems like the only thing cdir is
+ * ever used for is to make sure the user doesn't delete
+ * their current directory.)
+ */
+ if (vn_matchops(cdir, lx_autofs_vn_ops)) {
+ vnode_t *ucdir = cdir->v_data;
+ return (VOP_RMDIR(udvp, nm, ucdir, cr, ctp, flags));
+ }
+
+ return (VOP_RMDIR(udvp, nm, cdir, cr, ctp, flags));
+}
+
+/*
+ * VOP entry points - special passthrough
+ *
+ * For some VOP entry points we will first pass the request on to
+ * the underlying filesystem we're mounted on. If there's an error
+ * then we immediately return the error, but if the request succeeds
+ * we have to do some extra work before returning.
+ */
+static int
+lx_autofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ctp)
+{
+ vnode_t *ovp = *vpp;
+ vnode_t *uvp = ovp->v_data;
+ int error;
+
+ /* direct mounts were handled by the lookup to get *vpp */
+
+ if ((error = VOP_OPEN(&uvp, flag, cr, ctp)) != 0)
+ return (error);
+
+ /* Check for clone opens. */
+ if (uvp == ovp->v_data)
+ return (0);
+
+ /* Deal with clone opens by returning a new vnode. */
+ *vpp = lx_autofs_vn_alloc(ovp->v_vfsp, uvp);
+ VN_RELE(ovp);
+ return (0);
+}
+
+/*
+ * Internally, we have already converted our autofs vfs device number into a
+ * Linux-format device during lx_autofs_mount and stored that device number
+ * in data->lav_dev. However, our lx emulation for the various stat() syscalls
+ * also wants to convert the fsid the same way. That obviously will be
+ * incorrect if we pass along an fsid that is already converted, so we always
+ * pass along the original vfs fsid here. Both lav_dev and lav_ino are passed
+ * in messages to the automounter, and these must match the values obtained by
+ * stat().
+ */
+static int
+lx_autofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ctp)
+{
+ vnode_t *uvp = vp->v_data;
+ vnode_t *dvp;
+ int error;
+ lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data;
+ dev_t autofs_fsid = vp->v_vfsp->vfs_dev;
+
+ if ((dvp = lx_autofs_do_direct(vp)) != NULL) {
+ uvp = dvp;
+ }
+
+ error = VOP_GETATTR(uvp, vap, flags, cr, ctp);
+
+ if (dvp != NULL) {
+ /* we operated on the direct mounted fs */
+ VN_RELE(dvp);
+ if (error == 0) {
+ /*
+ * During automounter restart recovery, the automounter
+ * will fstat the fd provided in the setpipe ioctl. It
+ * uses the resulting inode & dev to correlate future
+ * autofs fifo requests to the correct entry. Thus, we
+ * have to update the attributes with the proper IDs.
+ */
+ vap->va_fsid = autofs_fsid;
+ vap->va_nodeid = data->lav_ino;
+ }
+ } else if (error == 0) {
+ /* Update the attributes with our filesystem id. */
+ vap->va_fsid = autofs_fsid;
+ }
+
+ return (error);
+}
+
+static int
+lx_autofs_mkdir(vnode_t *dvp, char *nm, struct vattr *vap, vnode_t **vpp,
+ cred_t *cr, caller_context_t *ctp, int flags, vsecattr_t *vsecp)
+{
+ vnode_t *udvp = dvp->v_data;
+ vnode_t *nvp;
+ int error;
+
+ if ((nvp = lx_autofs_do_direct(dvp)) != NULL) {
+ udvp = nvp;
+ }
+
+ error = VOP_MKDIR(udvp, nm, vap, vpp, cr, ctp, flags, vsecp);
+
+ if (nvp != NULL) {
+ /* we operated on the direct mounted fs */
+ VN_RELE(nvp);
+ } else if (error == 0) {
+ vnode_t *uvp = NULL;
+
+ /* Update the attributes with our filesystem id. */
+ vap->va_fsid = dvp->v_vfsp->vfs_dev;
+
+ /* Allocate our new vnode. */
+ uvp = *vpp;
+ *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp);
+ }
+
+ return (error);
+}
+
+/*
+ * VOP entry points - custom
+ */
+/*ARGSUSED*/
+static void
+lx_autofs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ctp)
+{
+ lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data;
+
+ /*
+ * We need to hold the vfs lock because if we're going to free
+ * this vnode we have to prevent anyone from looking it up
+ * in the vnode hash.
+ */
+ mutex_enter(&data->lav_lock);
+ mutex_enter(&vp->v_lock);
+
+ if (vp->v_count < 1) {
+ panic("lx_autofs_inactive: bad v_count");
+ /*NOTREACHED*/
+ }
+
+ /* Drop the temporary hold by vn_rele now. */
+ if (--vp->v_count > 0) {
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&data->lav_lock);
+ return;
+ }
+
+ /*
+ * No one should have been blocked on this lock because we're
+ * about to free this vnode.
+ */
+ lx_autofs_vn_free(vp);
+}
+
+static int
+lx_autofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ctp,
+ int *direntflags, pathname_t *realpnp)
+{
+ vnode_t *udvp = dvp->v_data;
+ vnode_t *uvp = NULL;
+ lx_autofs_vfs_t *data;
+ int error = ENOENT;
+
+ data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data;
+
+ /*
+	 * For an indirect mount, first check whether this path component
+ * already exists.
+ */
+ if (data->lav_mnttype == LXAMT_INDIR) {
+ if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr,
+ ctp, direntflags, realpnp)) == 0) {
+ *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp);
+ return (0);
+ }
+ }
+
+ /* Only query the automounter if the path does not exist. */
+ if (error != ENOENT)
+ return (error);
+
+ if (data->lav_catatonic)
+ return (ENOENT);
+
+ /* Save the uid/gid for the requestor ioctl. */
+ data->lav_uid = crgetuid(cr);
+ data->lav_gid = crgetgid(cr);
+
+ /* Refer the lookup to the automounter. */
+ if ((error = lx_autofs_automounter_call(dvp, nm)) != 0)
+ return (error);
+
+ if (data->lav_mnttype == LXAMT_INDIR) {
+ /*
+ * Indirect mount. The automounter call should have mounted
+ * something on nm. Retry the lookup operation.
+ */
+ if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr,
+ ctp, direntflags, realpnp)) == 0) {
+ *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp);
+ return (0);
+ }
+ } else {
+ /*
+ * Direct or offset mount. The automounter call should have
+ * covered our 'dvp' with a new filesystem. Traverse into the
+ * new mount and retry the lookup.
+ *
+ * We need to take an extra hold on our vp (which is the autofs
+		 * root vp) to account for the rele done in traverse. Our caller
+ * will also do a rele on the original dvp and that would leave
+ * us one ref short on our autofs root vnode.
+ */
+ vnode_t *orig_dvp = dvp;
+
+ VN_HOLD(dvp);
+ if ((error = traverse(&dvp)) != 0) {
+ VN_RELE(dvp);
+ return (error);
+ }
+
+ if (dvp == orig_dvp) {
+ /*
+ * For some reason the automountd did not actually
+ * mount the new filesystem. Return an error.
+ */
+ VN_RELE(dvp);
+ return (ENOENT);
+ }
+
+ error = VOP_LOOKUP(dvp, nm, vpp, pnp, flags, rdir, cr, ctp,
+ direntflags, realpnp);
+
+ /* release the traverse hold */
+ VN_RELE(dvp);
+ }
+ return (error);
+}
+
+static int
+lx_autofs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr,
+ int *rvalp, caller_context_t *ctp)
+{
+ vnode_t *uvp = vp->v_data;
+
+ /* Intercept our ioctls. */
+ switch ((uint_t)cmd) {
+ case LX_AUTOFS_IOC_READY:
+ case LX_AUTOFS_IOC_FAIL:
+ case LX_AUTOFS_IOC_CATATONIC:
+ case LX_AUTOFS_IOC_PROTOVER:
+ case LX_AUTOFS_IOC_SETTIMEOUT:
+ case LX_AUTOFS_IOC_EXPIRE:
+ case LX_AUTOFS_IOC_EXPIRE_MULTI:
+ case LX_AUTOFS_IOC_PROTOSUBVER:
+ case LX_AUTOFS_IOC_ASKUMOUNT:
+ return (lx_autofs_automounter_ioctl(vp, cmd, arg, cr));
+ }
+
+ /* Pass any remaining ioctl on. */
+ return (VOP_IOCTL(uvp, cmd, arg, mode, cr, rvalp, ctp));
+}
+
+/*
+ * VOP entry points definitions
+ */
+static const fs_operation_def_t lx_autofs_tops_root[] = {
+ { VOPNAME_OPEN, { .vop_open = lx_autofs_open } },
+ { VOPNAME_CLOSE, { .vop_close = lx_autofs_close } },
+ { VOPNAME_IOCTL, { .vop_ioctl = lx_autofs_ioctl } },
+ { VOPNAME_RWLOCK, { .vop_rwlock = lx_autofs_rwlock } },
+ { VOPNAME_RWUNLOCK, { .vop_rwunlock = lx_autofs_rwunlock } },
+ { VOPNAME_GETATTR, { .vop_getattr = lx_autofs_getattr } },
+ { VOPNAME_ACCESS, { .vop_access = lx_autofs_access } },
+ { VOPNAME_READDIR, { .vop_readdir = lx_autofs_readdir } },
+ { VOPNAME_LOOKUP, { .vop_lookup = lx_autofs_lookup } },
+ { VOPNAME_INACTIVE, { .vop_inactive = lx_autofs_inactive } },
+ { VOPNAME_MKDIR, { .vop_mkdir = lx_autofs_mkdir } },
+ { VOPNAME_RMDIR, { .vop_rmdir = lx_autofs_rmdir } },
+ { NULL }
+};
+
+/*
+ * DEV-specific entry points
+ */
+
+/*ARGSUSED*/
+static int
+lx_autofs_dev_open(dev_t *devp, int flags, int otyp, cred_t *credp)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+lx_autofs_dev_close(dev_t dev, int flags, int otyp, cred_t *credp)
+{
+ return (0);
+}
+
+static int
+lx_autofs_dev_validate_cmd(intptr_t arg, lx_autofs_dv_ioctl_t *dcmd)
+{
+ if (copyin((caddr_t)arg, dcmd, sizeof (lx_autofs_dv_ioctl_t)) != 0)
+ return (EFAULT);
+
+ if (dcmd->lad_ver_major != LX_AUTOFS_DEV_VERSION_MAJOR ||
+ dcmd->lad_ver_minor > LX_AUTOFS_DEV_VERSION_MINOR)
+ return (EINVAL);
+
+ DTRACE_PROBE1(lx__dev__cmd, void *, dcmd);
+
+ /* Fill in the version for return */
+ dcmd->lad_ver_major = LX_AUTOFS_DEV_VERSION_MAJOR;
+ dcmd->lad_ver_minor = LX_AUTOFS_DEV_VERSION_MINOR;
+ return (0);
+}
+
+static vfs_t *
+lx_autofs_dev_getvfs_bypath(char *fs_mntpt)
+{
+ struct vfs *vfsp;
+ struct vfs *vfslist;
+ vfs_t *fnd_vfs = NULL;
+ zone_t *zone = curzone;
+
+ vfs_list_read_lock();
+
+ vfsp = vfslist = curzone->zone_vfslist;
+ if (vfslist == NULL) {
+ vfs_list_unlock();
+ return (NULL);
+ }
+
+ do {
+ if (vfsp->vfs_op == lx_autofs_vfsops) {
+ char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt);
+
+ if (strcmp(fs_mntpt, ZONE_PATH_TRANSLATE(mntpt, zone))
+ == 0) {
+ fnd_vfs = vfsp;
+				VFS_HOLD(fnd_vfs);
+ break;
+ }
+ }
+ vfsp = vfsp->vfs_zone_next;
+ } while (vfsp != vfslist);
+
+ vfs_list_unlock();
+
+ return (fnd_vfs);
+}
+
+static int
+lx_autofs_dev_fd_preamble(intptr_t arg, lx_autofs_dv_ioctl_t *dc, vfs_t **vfspp)
+{
+ int err;
+ lx_autofs_vfs_t *data;
+ file_t *fp;
+ vfs_t *vfsp;
+
+ if ((err = lx_autofs_dev_validate_cmd(arg, dc)) != 0)
+ return (err);
+
+ if ((fp = getf(dc->lad_ioctlfd)) == NULL)
+ return (EBADF);
+
+ vfsp = fp->f_vnode->v_vfsp;
+ if (vfsp->vfs_op != lx_autofs_vfsops) {
+ releasef(dc->lad_ioctlfd);
+ return (EBADF);
+ }
+
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ if (data->lav_root->v_count <= 1) {
+ releasef(dc->lad_ioctlfd);
+ return (EBADF);
+ }
+
+ VFS_HOLD(vfsp);
+ *vfspp = vfsp;
+
+ releasef(dc->lad_ioctlfd);
+ return (0);
+}
+
+static int
+lx_autofs_dev_vers(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+
+ if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0)
+ return (err);
+
+ if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+lx_autofs_dev_protver(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+
+ if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0)
+ return (err);
+
+ dcmd.lad_arg1 = LX_AUTOFS_PROTO_VERS5;
+
+ if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+lx_autofs_dev_protosubver(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+
+ if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0)
+ return (err);
+
+ dcmd.lad_arg1 = LX_AUTOFS_PROTO_SUBVERSION;
+
+ if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+lx_autofs_dev_get_path_cmd(intptr_t arg, lx_autofs_dv_ioctl_t **dcp)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd, *dc;
+
+ if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0)
+ return (err);
+
+ if (dcmd.lad_size <= sizeof (dcmd) ||
+ dcmd.lad_size > (sizeof (dcmd) + MAXPATHLEN))
+ return (EINVAL);
+
+ dc = kmem_alloc(dcmd.lad_size, KM_SLEEP);
+
+ /* re-copyin the full struct with the path */
+ if (copyin((caddr_t)arg, dc, dcmd.lad_size) != 0) {
+ kmem_free(dc, dcmd.lad_size);
+ return (EFAULT);
+ }
+ dc->lad_size = dcmd.lad_size;
+
+ if (dc->lad_path[0] != '/' ||
+ dc->lad_path[dcmd.lad_size - sizeof (dcmd) - 1] != '\0') {
+ kmem_free(dc, dcmd.lad_size);
+ return (EINVAL);
+ }
+
+ *dcp = dc;
+ return (0);
+}
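+
+/*
+ * For example (the path is hypothetical), a caller passing "/mnt/net"
+ * would set lad_size to sizeof (lx_autofs_dv_ioctl_t) + 9: the fixed
+ * header plus strlen("/mnt/net") + 1 for the terminating NUL, which is
+ * exactly what the bounds and NUL checks above enforce.
+ */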
+
+static int
+lx_autofs_dev_openmount(intptr_t arg)
+{
+ int err;
+ int fd;
+ lx_autofs_dv_ioctl_t *dc;
+ vfs_t *vfsp;
+ lx_autofs_vfs_t *data;
+
+ if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0)
+ return (err);
+
+ if ((vfsp = lx_autofs_dev_getvfs_bypath(dc->lad_path)) == NULL) {
+ kmem_free(dc, dc->lad_size);
+ return (EINVAL);
+ }
+
+	/* lad_arg1 is the dev number of the mount, but we don't check that. */
+
+ /*
+ * Do an "open" on the root vnode. To fully simulate "open" we also add
+ * a hold on the root vnode itself since lx_autofs_open will only open
+ * (and hold) the underlying vnode.
+ */
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ VN_HOLD(data->lav_root);
+ if ((err = fassign(&data->lav_root, FWRITE|FREAD, &fd)) != 0) {
+ VN_RELE(data->lav_root);
+ VFS_RELE(vfsp);
+ kmem_free(dc, dc->lad_size);
+ return (err);
+ }
+
+ mutex_enter(&data->lav_lock);
+ data->lav_openmnt_cnt++;
+ mutex_exit(&data->lav_lock);
+
+ dc->lad_ioctlfd = fd;
+
+ if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) {
+ mutex_enter(&data->lav_lock);
+ data->lav_openmnt_cnt--;
+ mutex_exit(&data->lav_lock);
+ (void) closeandsetf(fd, NULL);
+ VFS_RELE(vfsp);
+ kmem_free(dc, dc->lad_size);
+ return (EFAULT);
+ }
+ VFS_RELE(vfsp);
+
+ kmem_free(dc, dc->lad_size);
+ return (0);
+}
+
+static int
+lx_autofs_dev_closemount(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+ vfs_t *vfsp;
+ lx_autofs_vfs_t *data;
+
+ if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0)
+ return (err);
+
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+
+ /* "close" the vnode */
+ if ((err = closeandsetf(dcmd.lad_ioctlfd, NULL)) != 0) {
+ VFS_RELE(vfsp);
+ return (err);
+ }
+
+ mutex_enter(&data->lav_lock);
+ ASSERT(data->lav_openmnt_cnt > 0);
+ data->lav_openmnt_cnt--;
+ mutex_exit(&data->lav_lock);
+
+ VFS_RELE(vfsp);
+ return (0);
+}
+
+static int
+lx_autofs_dev_ready(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+ vfs_t *vfsp;
+
+ if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0)
+ return (err);
+
+ if ((err = lx_autofs_ack(dcmd.lad_arg1, vfsp, LXACR_READY)) != 0) {
+ VFS_RELE(vfsp);
+ return (err);
+ }
+
+ VFS_RELE(vfsp);
+ return (0);
+}
+
+static int
+lx_autofs_dev_fail(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+ vfs_t *vfsp;
+
+ if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0)
+ return (err);
+
+ if ((err = lx_autofs_ack(dcmd.lad_arg1, vfsp, LXACR_FAIL)) != 0) {
+ VFS_RELE(vfsp);
+ return (err);
+ }
+
+ VFS_RELE(vfsp);
+ return (0);
+}
+
+/*
+ * Update the fifo pipe information we use to talk to the automounter. The
+ * ioctl is used when the automounter restarts. This logic is similar to the
+ * handling done in lx_autofs_parse_mntopt() when the filesystem is first
+ * mounted.
+ */
+static int
+lx_autofs_dev_setpipefd(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+ vfs_t *vfsp;
+ lx_autofs_vfs_t *data;
+ int fd, pgrp;
+ file_t *fp_wr, *fp_rd;
+
+ if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0)
+ return (err);
+
+ mutex_enter(&pidlock);
+ pgrp = curproc->p_pgrp;
+ mutex_exit(&pidlock);
+ fd = dcmd.lad_arg1;
+
+ /* Lookup the new fifos. See comment in lx_autofs_parse_mntopt. */
+ if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) {
+ int pid = (int)curproc->p_pid;
+
+ if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0) {
+ VFS_RELE(vfsp);
+ return (EINVAL);
+ }
+ }
+
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+
+ /* Close the old fifos. */
+ if (data->lav_fifo_wr != NULL)
+ (void) closef(data->lav_fifo_wr);
+ if (data->lav_fifo_rd != NULL)
+ (void) closef(data->lav_fifo_rd);
+
+ data->lav_fd = fd;
+ data->lav_pgrp = pgrp;
+ data->lav_fifo_rd = fp_rd;
+ data->lav_fifo_wr = fp_wr;
+ /*
+	 * Not explicitly in the ioctl spec, but necessary for correct
+	 * recovery.
+ */
+ data->lav_catatonic = B_FALSE;
+
+ VFS_RELE(vfsp);
+
+ return (0);
+}
+
+static int
+lx_autofs_dev_catatonic(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+ vfs_t *vfsp;
+ lx_autofs_vfs_t *data;
+
+ if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0)
+ return (err);
+
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ data->lav_catatonic = B_TRUE;
+ VFS_RELE(vfsp);
+
+ return (0);
+}
+
+static int
+lx_autofs_dev_expire(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+ vfs_t *vfsp;
+
+ if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0)
+ return (err);
+
+ /* If it succeeds in expiring then we don't want to return EAGAIN */
+ if ((err = lx_autofs_expire(vfsp, kcred)) == 0) {
+ VFS_RELE(vfsp);
+ return (0);
+ }
+
+ VFS_RELE(vfsp);
+ return (EAGAIN);
+}
+
+static int
+lx_autofs_dev_timeout(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t dcmd;
+ vfs_t *vfsp;
+ lx_autofs_vfs_t *data;
+
+ if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0)
+ return (err);
+
+ data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+ data->lav_timeout = dcmd.lad_arg1;
+ VFS_RELE(vfsp);
+
+ return (0);
+}
+
+static int
+lx_autofs_dev_requestor(intptr_t arg)
+{
+ int err;
+ lx_autofs_dv_ioctl_t *dc;
+ vfs_t *vfsp;
+ vfs_t *fnd_vfs = NULL;
+ struct vfs *vfslist;
+ zone_t *zone = curzone;
+ lx_autofs_vfs_t *data;
+ uid_t uid;
+ gid_t gid;
+
+ if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0)
+ return (err);
+
+ vfs_list_read_lock();
+ vfsp = vfslist = curzone->zone_vfslist;
+ if (vfslist == NULL) {
+ vfs_list_unlock();
+ kmem_free(dc, dc->lad_size);
+ return (EINVAL);
+ }
+
+ do {
+ /* Skip mounts we shouldn't show. */
+ if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) {
+ char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt);
+
+ if (strcmp(dc->lad_path,
+ ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) {
+
+ if (vfsp->vfs_op != lx_autofs_vfsops) {
+ /*
+ * Found an indirect mount (probably
+ * NFS) so we need to get the vfs it's
+ * mounted onto.
+ */
+ vnode_t *vn = vfsp->vfs_vnodecovered;
+ vfsp = vn->v_vfsp;
+
+ if (vfsp->vfs_op != lx_autofs_vfsops) {
+ /*
+ * autofs doesn't manage this
+ * path.
+ */
+ break;
+ }
+ }
+
+ fnd_vfs = vfsp;
+ VFS_HOLD(fnd_vfs)
+ break;
+ }
+ }
+ vfsp = vfsp->vfs_zone_next;
+ } while (vfsp != vfslist);
+ vfs_list_unlock();
+
+ if (fnd_vfs == NULL) {
+ kmem_free(dc, dc->lad_size);
+ return (EINVAL);
+ }
+
+ data = (lx_autofs_vfs_t *)fnd_vfs->vfs_data;
+ uid = data->lav_uid;
+ gid = data->lav_gid;
+ VFS_RELE(fnd_vfs);
+
+ dc->lad_arg1 = uid;
+ dc->lad_arg2 = gid;
+
+ if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) {
+ kmem_free(dc, dc->lad_size);
+ return (EFAULT);
+ }
+
+ kmem_free(dc, dc->lad_size);
+ return (0);
+}
+
+static int
+lx_autofs_dev_ismntpt(intptr_t arg)
+{
+ int err = 0;
+ lx_autofs_dv_ioctl_t *dc;
+ struct vfs *vfslist;
+ vfs_t *vfsp;
+ vfs_t *fnd_vfs = NULL;
+ zone_t *zone = curzone;
+
+ if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0)
+ return (err);
+
+ /*
+ * The automounter will always pass a path. It can also either pass an
+ * ioctlfd or, if it's -1, arg1 can be an LX_AUTOFS_TYPE_* value. We
+ * currently don't need those for our algorithm.
+ */
+
+ vfs_list_read_lock();
+ vfsp = vfslist = curzone->zone_vfslist;
+ if (vfslist == NULL) {
+ vfs_list_unlock();
+ kmem_free(dc, dc->lad_size);
+ return (0); /* return 0 if not a mount point */
+ }
+
+ do {
+ if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) {
+ char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt);
+
+ if (strcmp(dc->lad_path,
+ ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) {
+
+ /*
+ * To handle direct mounts (on top of an autofs
+				 * mount), we must prefer a non-autofs vfs for
+ * this request.
+ */
+ if (fnd_vfs != NULL)
+ VFS_RELE(fnd_vfs);
+
+ fnd_vfs = vfsp;
+ VFS_HOLD(fnd_vfs)
+
+ if (fnd_vfs->vfs_op != lx_autofs_vfsops)
+ break;
+ }
+ }
+ vfsp = vfsp->vfs_zone_next;
+ } while (vfsp != vfslist);
+ vfs_list_unlock();
+
+ if (fnd_vfs == NULL) {
+ kmem_free(dc, dc->lad_size);
+ return (0); /* return 0 if not a mount point */
+ }
+
+ /*
+	 * arg1 is the device number, arg2 is the superblock magic number.
+	 * The superblock value only matters as an autofs/non-autofs
+	 * indicator.
+ */
+ dc->lad_arg1 = fnd_vfs->vfs_dev;
+ if (fnd_vfs->vfs_op == lx_autofs_vfsops) {
+ dc->lad_arg2 = LX_AUTOFS_SB_MAGIC;
+ } else {
+ dc->lad_arg2 = ~LX_AUTOFS_SB_MAGIC;
+ }
+
+ VFS_RELE(fnd_vfs);
+
+ if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) {
+ kmem_free(dc, dc->lad_size);
+ return (EFAULT);
+ }
+
+ kmem_free(dc, dc->lad_size);
+
+ /*
+ * We have to return 1 if it is a mount point. The lx ioctl autofs
+ * translator will convert a negative value back to a positive,
+ * non-error return value.
+ */
+ return (-1);
+}
+
+static int
+lx_autofs_dev_askumount(intptr_t arg)
+{
+ int err;
+ int v;
+ lx_autofs_dv_ioctl_t dcmd;
+ vfs_t *vfsp;
+
+ if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0)
+ return (err);
+
+ if (lx_autofs_may_unmount(vfsp, kcred)) {
+ v = 0;
+ } else {
+ v = 1;
+ }
+ VFS_RELE(vfsp);
+
+ dcmd.lad_arg1 = v;
+ if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+lx_autofs_dev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rvalp)
+{
+ switch (cmd) {
+ case LX_AUTOFS_DEV_IOC_VERSION_CMD:
+ return (lx_autofs_dev_vers(arg));
+
+ case LX_AUTOFS_DEV_IOC_PROTOVER_CMD:
+ return (lx_autofs_dev_protver(arg));
+
+ case LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD:
+ return (lx_autofs_dev_protosubver(arg));
+
+ case LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD:
+ return (lx_autofs_dev_openmount(arg));
+
+ case LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD:
+ return (lx_autofs_dev_closemount(arg));
+
+ case LX_AUTOFS_DEV_IOC_READY_CMD:
+ return (lx_autofs_dev_ready(arg));
+
+ case LX_AUTOFS_DEV_IOC_FAIL_CMD:
+ return (lx_autofs_dev_fail(arg));
+
+ case LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD:
+ return (lx_autofs_dev_setpipefd(arg));
+
+ case LX_AUTOFS_DEV_IOC_CATATONIC_CMD:
+ return (lx_autofs_dev_catatonic(arg));
+
+ case LX_AUTOFS_DEV_IOC_TIMEOUT_CMD:
+ return (lx_autofs_dev_timeout(arg));
+
+ case LX_AUTOFS_DEV_IOC_REQUESTER_CMD:
+ return (lx_autofs_dev_requestor(arg));
+
+ case LX_AUTOFS_DEV_IOC_EXPIRE_CMD:
+ return (lx_autofs_dev_expire(arg));
+
+ case LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD:
+ return (lx_autofs_dev_askumount(arg));
+
+ case LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD:
+ return (lx_autofs_dev_ismntpt(arg));
+ }
+
+ return (EINVAL);
+}
+
+/*
+ * lx_autofs_init() gets invoked via the mod_install() call in
+ * this module's _init() routine. Therefore, the code that cleans
+ * up the structures we allocate below is actually found in
+ * our _fini() routine.
+ */
+/* ARGSUSED */
+static int
+lx_autofs_init(int fstype, char *name)
+{
+ int error;
+
+ lx_autofs_major = ddi_name_to_major(LX_AUTOFS_NAME);
+
+ lx_autofs_fstype = fstype;
+ if ((error = vfs_setfsops(fstype, lx_autofs_vfstops,
+ &lx_autofs_vfsops)) != 0) {
+ cmn_err(CE_WARN, "lx_autofs_init: bad vfs ops template");
+ return (error);
+ }
+
+ if ((error = vn_make_ops(name, lx_autofs_tops_root,
+ &lx_autofs_vn_ops)) != 0) {
+ VERIFY(vfs_freevfsops_by_type(fstype) == 0);
+ lx_autofs_vn_ops = NULL;
+ return (error);
+ }
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+lx_autofs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int instance = ddi_get_instance(dip);
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ ASSERT(instance == 0);
+ if (instance != 0)
+ return (DDI_FAILURE);
+
+ /* create our minor node */
+ if (ddi_create_minor_node(dip, LX_AUTOFS_MINORNAME, S_IFCHR, 0,
+ DDI_PSEUDO, 0) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+
+ lx_autofs_dip = dip;
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+lx_autofs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ lx_autofs_dip = NULL;
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+lx_autofs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
+ void **resultp)
+{
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *resultp = lx_autofs_dip;
+ return (DDI_SUCCESS);
+
+ case DDI_INFO_DEVT2INSTANCE:
+ *resultp = (void *)0;
+ return (DDI_SUCCESS);
+ }
+ return (DDI_FAILURE);
+}
+
+/*
+ * Driver flags
+ */
+static struct cb_ops lx_autofs_cb_ops = {
+ lx_autofs_dev_open, /* open */
+ lx_autofs_dev_close, /* close */
+ nodev, /* strategy */
+ nodev, /* print */
+ nodev, /* dump */
+ nodev, /* read */
+ nodev, /* write */
+ lx_autofs_dev_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+	ddi_prop_op,		/* cb_prop_op */
+ NULL, /* streamtab */
+ D_NEW | D_MP /* Driver compatibility flag */
+};
+
+/*
+ * Module linkage
+ */
+static mntopt_t lx_autofs_mntopt[] = {
+ { LX_MNTOPT_FD, NULL, 0, MO_HASVALUE },
+ { LX_MNTOPT_PGRP, NULL, 0, MO_HASVALUE },
+ { LX_MNTOPT_MINPROTO, NULL, 0, MO_HASVALUE },
+ { LX_MNTOPT_MAXPROTO, NULL, 0, MO_HASVALUE },
+ { LX_MNTOPT_INDIRECT, NULL, 0, 0 },
+ { LX_MNTOPT_DIRECT, NULL, 0, 0 },
+ { LX_MNTOPT_OFFSET, NULL, 0, 0 }
+};
+
+static mntopts_t lx_autofs_mntopts = {
+ sizeof (lx_autofs_mntopt) / sizeof (mntopt_t),
+ lx_autofs_mntopt
+};
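+
+/*
+ * For reference, the Linux automounter typically mounts an autofs file
+ * system with an option string along the lines of
+ * "fd=7,pgrp=123,minproto=5,maxproto=5,indirect", which is why each of
+ * those options is registered above.
+ */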
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ LX_AUTOFS_NAME,
+ lx_autofs_init,
+ VSW_HASPROTO | VSW_VOLATILEDEV | VSW_ZMOUNT,
+ &lx_autofs_mntopts
+};
+
+static struct dev_ops lx_autofs_dev_ops = {
+ DEVO_REV, /* version */
+ 0, /* refcnt */
+ lx_autofs_info, /* info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ lx_autofs_attach, /* attach */
+ lx_autofs_detach, /* detach */
+ nodev, /* reset */
+ &lx_autofs_cb_ops, /* driver operations */
+ NULL, /* no bus operations */
+ NULL, /* power */
+ ddi_quiesce_not_needed /* quiesce */
+};
+
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+ &mod_fsops, "lx autofs filesystem", &vfw
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops, "lx autofs driver", &lx_autofs_dev_ops
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ (void *)&modlfs,
+ (void *)&modldrv,
+ NULL
+};
+
+int
+_init(void)
+{
+ int error;
+
+ if ((error = mod_install(&modlinkage)) != 0) {
+ return (error);
+ }
+
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int error;
+
+ if ((error = mod_remove(&modlinkage)) != 0)
+ return (error);
+
+ if (lx_autofs_vn_ops != NULL) {
+ vn_freevnodeops(lx_autofs_vn_ops);
+ lx_autofs_vn_ops = NULL;
+ }
+
+ /*
+ * In our init routine, if we get an error after calling
+ * vfs_setfsops() we cleanup by calling vfs_freevfsops_by_type().
+ * But we don't need to call vfs_freevfsops_by_type() here
+ * because the fs framework did this for us as part of the
+ * mod_remove() call above.
+ */
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/autofs/lxautofs.conf b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf
new file mode 100644
index 0000000000..36e0119e33
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf
@@ -0,0 +1,14 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+name="lxautofs" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps.h b/usr/src/uts/common/brand/lx/cgroups/cgrps.h
new file mode 100644
index 0000000000..46e2cdd886
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps.h
@@ -0,0 +1,222 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _LXCGRPS_H
+#define _LXCGRPS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * cgrps.h: declarations, data structures and macros for lx_cgroup
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/atomic.h>
+#include <vm/anon.h>
+
+/*
+ * cgrpmgr ioctl interface.
+ */
+#define CGRPFS_IOC ('C' << 16 | 'G' << 8)
+#define CGRPFS_GETEVNT (CGRPFS_IOC | 1)
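+
+/*
+ * For reference, CGRPFS_IOC evaluates to 0x434700 ('C' is 0x43 and 'G'
+ * is 0x47), so CGRPFS_GETEVNT is 0x434701.
+ */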
+
+typedef struct cgrpmgr_info {
+ pid_t cgmi_pid;
+ char *cgmi_rel_agent_path;
+ char *cgmi_cgroup_path;
+} cgrpmgr_info_t;
+
+#if defined(_KERNEL)
+
+#include <sys/lx_brand.h>
+
+typedef struct cgrpmgr_info32 {
+ pid_t cgmi_pid;
+ caddr32_t cgmi_rel_agent_path;
+ caddr32_t cgmi_cgroup_path;
+} cgrpmgr_info32_t;
+
+#define CG_PSNSIZE 256 /* max size of pseudo file name entries */
+#define CG_PSDSIZE 16 /* pretend that a dir entry takes 16 bytes */
+
+/*
+ * The order of these entries must be in sync with the cg_ssde_dir array.
+ */
+typedef enum cgrp_ssid {
+ CG_SSID_GENERIC = 1,
+ CG_SSID_NUM /* last ssid for range checking */
+} cgrp_ssid_t;
+
+typedef enum cgrp_nodetype {
+ CG_CGROUP_DIR = 1, /* cgroup directory entry */
+ CG_NOTIFY, /* notify_on_release file */
+ CG_PROCS, /* cgroup.procs file */
+ CG_REL_AGENT, /* release_agent file */
+ CG_TASKS, /* tasks file */
+} cgrp_nodetype_t;
+
+typedef struct cgrp_subsys_dirent {
+ cgrp_nodetype_t cgrp_ssd_type;
+ char *cgrp_ssd_name;
+} cgrp_subsys_dirent_t;
+
+#define N_DIRENTS(m) (cgrp_num_pseudo_ents((m)->cg_ssid) + 2)
+
+/*
+ * A modern systemd-based Linux system typically has 50-60 cgroups so
+ * we size the hash for 2x that number.
+ */
+#define CGRP_HASH_SZ 128
+#define CGRP_AGENT_LEN (MAXPATHLEN + 1)
+
+/*
+ * cgroups per-mount data structure.
+ *
+ * All but the event related fields are protected by cg_contents.
+ * The evnt_list and counter is protected by cg_events.
+ */
+typedef struct cgrp_mnt {
+ struct vfs *cg_vfsp; /* filesystem's vfs struct */
+ struct cgrp_node *cg_rootnode; /* root cgrp_node */
+ char *cg_mntpath; /* name of cgroup mount point */
+ cgrp_ssid_t cg_ssid; /* subsystem type */
+ dev_t cg_dev; /* unique dev # of mounted `device' */
+ uint_t cg_gen; /* node ID source for files */
+ uint_t cg_grp_gen; /* ID source for cgroups */
+ kmutex_t cg_contents; /* global lock for most fs activity */
+ char cg_agent[CGRP_AGENT_LEN]; /* release_agent path */
+ /* ptr to zone data for containing zone */
+ lx_zone_data_t *cg_lxzdata;
+ struct cgrp_node **cg_grp_hash; /* hash list of cgroups in the fs */
+} cgrp_mnt_t;
+
+/*
+ * cgrp_node is the file system dependent node for cgroups.
+ *
+ * The node is used to represent both directories (a cgroup) and pseudo files
+ * within the directory.
+ *
+ * Members are tagged in the comment to note which type of node they apply to:
+ * A - all
+ * D - dir (i.e. a cgroup)
+ * F - pseudo file
+ */
+
+typedef struct cgrp_node {
+ struct cgrp_node *cgn_back; /* A lnked lst of cgrp_nodes */
+ struct cgrp_node *cgn_forw; /* A lnked lst of cgrp_nodes */
+ struct cgrp_dirent *cgn_dir; /* D dirent list */
+ struct cgrp_node *cgn_parent; /* A dir containing this node */
+ struct cgrp_node *cgn_next; /* D link in per-mount cgroup */
+ /* hash table */
+ uint_t cgn_dirents; /* D number of dirents */
+ cgrp_nodetype_t cgn_type; /* A type for this node */
+ uint_t cgn_notify; /* D notify_on_release value */
+ uint_t cgn_task_cnt; /* D number of threads in grp */
+ struct vnode *cgn_vnode; /* A vnode for this cgrp_node */
+ uint_t cgn_id; /* D ID number for the cgroup */
+ struct vattr cgn_attr; /* A attributes */
+} cgrp_node_t;
+
+/*
+ * File system independent to cgroups conversion macros
+ */
+#define VFSTOCGM(vfsp) ((cgrp_mnt_t *)(vfsp)->vfs_data)
+#define VTOCGM(vp) ((cgrp_mnt_t *)(vp)->v_vfsp->vfs_data)
+#define VTOCGN(vp) ((struct cgrp_node *)(vp)->v_data)
+#define CGNTOV(cn) ((cn)->cgn_vnode)
+#define cgnode_hold(cn) VN_HOLD(CGNTOV(cn))
+#define cgnode_rele(cn) VN_RELE(CGNTOV(cn))
+
+/*
+ * Attributes
+ */
+#define cgn_mask cgn_attr.va_mask
+#define cgn_mode cgn_attr.va_mode
+#define cgn_uid cgn_attr.va_uid
+#define cgn_gid cgn_attr.va_gid
+#define cgn_fsid cgn_attr.va_fsid
+#define cgn_nodeid cgn_attr.va_nodeid
+#define cgn_nlink cgn_attr.va_nlink
+#define cgn_size cgn_attr.va_size
+#define cgn_atime cgn_attr.va_atime
+#define cgn_mtime cgn_attr.va_mtime
+#define cgn_ctime cgn_attr.va_ctime
+#define cgn_rdev cgn_attr.va_rdev
+#define cgn_blksize cgn_attr.va_blksize
+#define cgn_nblocks cgn_attr.va_nblocks
+#define cgn_seq cgn_attr.va_seq
+
+/*
+ * cgroup directories are made up of a linked list of cgrp_dirent structures
+ * hanging off directory cgrp_nodes. File names are not fixed length,
+ * but are null terminated.
+ */
+typedef struct cgrp_dirent {
+ struct cgrp_node *cgd_cgrp_node; /* cg node for this file */
+ struct cgrp_dirent *cgd_next; /* next directory entry */
+ struct cgrp_dirent *cgd_prev; /* prev directory entry */
+ uint_t cgd_offset; /* "offset" of dir entry */
+ uint_t cgd_hash; /* a hash of cgd_name */
+ struct cgrp_dirent *cgd_link; /* linked via hash table */
+ struct cgrp_node *cgd_parent; /* parent, dir we are in */
+ char *cgd_name; /* null terminated */
+} cgrp_dirent_t;
+
+enum de_op { DE_CREATE, DE_MKDIR, DE_RENAME }; /* direnter ops */
+enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */
+
+extern struct vnodeops *cgrp_vnodeops;
+
+int cgrp_dirdelete(cgrp_node_t *, cgrp_node_t *, char *, enum dr_op, cred_t *);
+int cgrp_direnter(cgrp_mnt_t *, cgrp_node_t *, char *, enum de_op,
+ cgrp_node_t *, struct vattr *, cgrp_node_t **, cred_t *);
+void cgrp_dirinit(cgrp_node_t *, cgrp_node_t *, cred_t *);
+int cgrp_dirlookup(cgrp_node_t *, char *, cgrp_node_t **, cred_t *);
+void cgrp_dirtrunc(cgrp_node_t *);
+void cgrp_node_init(cgrp_mnt_t *, cgrp_node_t *, vattr_t *, cred_t *);
+int cgrp_taccess(void *, int, cred_t *);
+ino_t cgrp_inode(cgrp_nodetype_t, unsigned int);
+int cgrp_num_pseudo_ents(cgrp_ssid_t);
+cgrp_node_t *cgrp_cg_hash_lookup(cgrp_mnt_t *, uint_t);
+void cgrp_rel_agent_event(cgrp_mnt_t *, cgrp_node_t *, boolean_t);
+
+#endif	/* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LXCGRPS_H */
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
new file mode 100644
index 0000000000..66b6f60376
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
@@ -0,0 +1,1014 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+
+#include "cgrps.h"
+
+static int cgrp_dirmakecgnode(cgrp_node_t *, cgrp_mnt_t *, struct vattr *,
+ enum de_op, cgrp_node_t **, struct cred *);
+static int cgrp_diraddentry(cgrp_node_t *, cgrp_node_t *, char *);
+
+static cgrp_subsys_dirent_t cgrp_generic_dir[] = {
+ { CG_PROCS, "cgroup.procs" },
+ { CG_NOTIFY, "notify_on_release" },
+ { CG_TASKS, "tasks" }
+};
+
+typedef struct cgrp_ssde {
+ cgrp_subsys_dirent_t *cg_ssde_files;
+ int cg_ssde_nfiles;
+} cgrp_ssde_t;
+
+#define CGDIRLISTSZ(l) (sizeof (l) / sizeof ((l)[0]))
+
+/*
+ * Note, these entries must be in the same order as the cgrp_ssid_t entries.
+ */
+static cgrp_ssde_t cg_ssde_dir[] = {
+ /* subsystems start at 1 */
+ {NULL, 0},
+
+ /* CG_SSID_GENERIC */
+ {cgrp_generic_dir, CGDIRLISTSZ(cgrp_generic_dir)},
+};
+
+
+#define CG_HASH_SIZE 8192 /* must be power of 2 */
+#define CG_MUTEX_SIZE 64
+
+static cgrp_dirent_t *cg_hashtable[CG_HASH_SIZE];
+static kmutex_t cg_hashmutex[CG_MUTEX_SIZE];
+
+#define CG_HASH_INDEX(a) ((a) & (CG_HASH_SIZE-1))
+#define CG_MUTEX_INDEX(a) ((a) & (CG_MUTEX_SIZE-1))
+
+#define CG_HASH(cp, name, hash) \
+ { \
+ char Xc, *Xcp; \
+ hash = (uint_t)(uintptr_t)(cp) >> 8; \
+ for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \
+ hash = (hash << 4) + hash + (uint_t)Xc; \
+ }
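+
+/*
+ * CG_HASH seeds the hash with the parent node's address and folds in
+ * each byte of the name with a shift-and-add step (hash = hash * 17 + c).
+ * A typical use, as in cgrp_hash_in() below, looks like:
+ *
+ *	uint_t hash;
+ *
+ *	CG_HASH(parent, name, hash);
+ *	mutex_enter(&cg_hashmutex[CG_MUTEX_INDEX(hash)]);
+ */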
+
+#define MODESHIFT 3
+
+typedef enum cgrp_nodehold {
+ NOHOLD,
+ HOLD
+} cgrp_nodehold_t;
+
+void
+cgrp_hash_init(void)
+{
+ int i;
+
+ for (i = 0; i < CG_MUTEX_SIZE; i++)
+ mutex_init(&cg_hashmutex[i], NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+cgrp_hash_in(cgrp_dirent_t *c)
+{
+ uint_t hash;
+ cgrp_dirent_t **prevpp;
+ kmutex_t *cg_hmtx;
+
+ CG_HASH(c->cgd_parent, c->cgd_name, hash);
+ c->cgd_hash = hash;
+ prevpp = &cg_hashtable[CG_HASH_INDEX(hash)];
+ cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)];
+ mutex_enter(cg_hmtx);
+ c->cgd_link = *prevpp;
+ *prevpp = c;
+ mutex_exit(cg_hmtx);
+}
+
+static void
+cgrp_hash_out(cgrp_dirent_t *c)
+{
+ uint_t hash;
+ cgrp_dirent_t **prevpp;
+ kmutex_t *cg_hmtx;
+
+ hash = c->cgd_hash;
+ prevpp = &cg_hashtable[CG_HASH_INDEX(hash)];
+ cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)];
+ mutex_enter(cg_hmtx);
+ while (*prevpp != c)
+ prevpp = &(*prevpp)->cgd_link;
+ *prevpp = c->cgd_link;
+ mutex_exit(cg_hmtx);
+}
+
+static cgrp_dirent_t *
+cgrp_hash_lookup(char *name, cgrp_node_t *parent, cgrp_nodehold_t hold,
+ cgrp_node_t **found)
+{
+ cgrp_dirent_t *l;
+ uint_t hash;
+ kmutex_t *cg_hmtx;
+ cgrp_node_t *cnp;
+
+ CG_HASH(parent, name, hash);
+ cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)];
+ mutex_enter(cg_hmtx);
+ l = cg_hashtable[CG_HASH_INDEX(hash)];
+ while (l) {
+ if ((l->cgd_hash == hash) &&
+ (l->cgd_parent == parent) &&
+ (strcmp(l->cgd_name, name) == 0)) {
+ /*
+ * We need to make sure that the cgrp_node that
+ * we put a hold on is the same one that we pass back.
+ * Hence, temporary variable cnp is necessary.
+ */
+ cnp = l->cgd_cgrp_node;
+ if (hold == HOLD) {
+ ASSERT(cnp);
+ cgnode_hold(cnp);
+ }
+ if (found)
+ *found = cnp;
+ mutex_exit(cg_hmtx);
+ return (l);
+ } else {
+ l = l->cgd_link;
+ }
+ }
+ mutex_exit(cg_hmtx);
+ return (NULL);
+}
+
+/*
+ * The following functions maintain the per-mount cgroup hash table.
+ */
+static void
+cgrp_cg_hash_insert(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ uint_t cgid;
+ int hsh;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ cgid = cn->cgn_id;
+ hsh = cgid % CGRP_HASH_SZ;
+
+ cn->cgn_next = cgm->cg_grp_hash[hsh];
+ cgm->cg_grp_hash[hsh] = cn;
+}
+
+static void
+cgrp_cg_hash_remove(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ uint_t cgid;
+ int hsh;
+ cgrp_node_t *np = NULL, *curp, *prevp = NULL;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ cgid = cn->cgn_id;
+ hsh = cgid % CGRP_HASH_SZ;
+
+ for (curp = cgm->cg_grp_hash[hsh]; curp != NULL;
+ curp = curp->cgn_next) {
+ if (curp->cgn_id == cgid) {
+ if (prevp == NULL) {
+ cgm->cg_grp_hash[hsh] = curp->cgn_next;
+ } else {
+ prevp->cgn_next = curp->cgn_next;
+ }
+ np = curp;
+ np->cgn_next = NULL;
+ break;
+ }
+
+ prevp = curp;
+ }
+
+ ASSERT(np != NULL);
+ ASSERT(np->cgn_task_cnt == 0);
+}
+
+/*
+ * Count up the number of threads already running in the zone and
+ * initialize the first cgroup's task counter.
+ *
+ * We have to look at all of the processes to find applicable ones.
+ */
+static void
+cgrp_cg_hash_init(cgrp_node_t *cn)
+{
+ int i;
+ int cnt = 0;
+ zoneid_t zoneid = curproc->p_zone->zone_id;
+ pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
+
+ /* Scan all of the process entries */
+ mutex_enter(&pidlock);
+ for (i = 1; i < v.v_proc; i++) {
+ proc_t *p;
+
+ /*
+ * Skip indices for which there is no pid_entry, PIDs for
+ * which there is no corresponding process, system processes,
+ * a PID of 0, the pid for our zsched process, anything the
+		 * security policy doesn't allow us to look at, processes that
+		 * are not lx-branded, and processes that are not in the zone.
+ */
+ if ((p = pid_entry(i)) == NULL ||
+ p->p_stat == SIDL ||
+ (p->p_flag & SSYS) != 0 ||
+ p->p_pid == 0 ||
+ p->p_pid == schedpid ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+ p->p_zone->zone_id != zoneid) {
+ continue;
+ }
+
+ mutex_enter(&p->p_lock);
+ if (p->p_brand != &lx_brand) {
+ mutex_exit(&p->p_lock);
+ continue;
+ }
+ cnt += p->p_lwpcnt;
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+	 * There should be at least the init process with 1 thread in the zone.
+ */
+ ASSERT(cnt > 0);
+ cn->cgn_task_cnt = cnt;
+
+ DTRACE_PROBE2(cgrp__grp__init, void *, cn, int, cnt);
+
+ mutex_exit(&pidlock);
+}
+
+cgrp_node_t *
+cgrp_cg_hash_lookup(cgrp_mnt_t *cgm, uint_t cgid)
+{
+ int hsh = cgid % CGRP_HASH_SZ;
+ cgrp_node_t *curp;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ for (curp = cgm->cg_grp_hash[hsh]; curp != NULL;
+ curp = curp->cgn_next) {
+ if (curp->cgn_id == cgid) {
+ return (curp);
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Calculate an inode number
+ *
+ * This takes various bits of info and munges them to give the inode number for
+ * a cgrp pseudo file node.
+ */
+ino_t
+cgrp_inode(cgrp_nodetype_t type, unsigned int cgrpid)
+{
+ /*
+ * cgroup inode format:
+ * 00000000AABBBBBB
+ *
+ * AA - node type (from subsystem list)
+ * BBBBBB - id of the cgroup
+ */
+
+ return ((ino_t)(type << 24) | (cgrpid & 0xffffff));
+}
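+
+/*
+ * For example, the "tasks" pseudo file (CG_TASKS, the fifth node type)
+ * in the cgroup with ID 7 gets inode (5 << 24) | 7, i.e. 0x05000007.
+ */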
+
+/*
+ * Return the number of pseudo file entries in a cgroup directory for the
+ * given subsystem.
+ */
+int
+cgrp_num_pseudo_ents(cgrp_ssid_t ssid)
+{
+ cgrp_ssde_t *ssdp = &cg_ssde_dir[ssid];
+
+ return (ssdp->cg_ssde_nfiles);
+}
+
+int
+cgrp_taccess(void *vcp, int mode, cred_t *cred)
+{
+ cgrp_node_t *cn = vcp;
+ int shift = 0;
+ /*
+ * Check access based on owner, group and public perms in cgrp_node.
+ */
+ if (crgetuid(cred) != cn->cgn_uid) {
+ shift += MODESHIFT;
+ if (groupmember(cn->cgn_gid, cred) == 0)
+ shift += MODESHIFT;
+ }
+
+ return (secpolicy_vnode_access2(cred, CGNTOV(cn), cn->cgn_uid,
+ cn->cgn_mode << shift, mode));
+}
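+
+/*
+ * To illustrate the shift above: for a node with mode 0644, a credential
+ * that is neither the owner nor a group member ends up with shift == 6,
+ * so the "other" bits (04) are moved into the owner position and
+ * secpolicy_vnode_access2() effectively checks read-only access.
+ */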
+
+/*
+ * Search directory 'parent' for entry 'name'.
+ *
+ * 0 is returned on success and *foundcp points
+ * to the found cgrp_node with its vnode held.
+ */
+int
+cgrp_dirlookup(cgrp_node_t *parent, char *name, cgrp_node_t **foundcp,
+ cred_t *cred)
+{
+ int error;
+
+ ASSERT(MUTEX_HELD(&VTOCGM(parent->cgn_vnode)->cg_contents));
+ *foundcp = NULL;
+ if (parent->cgn_type != CG_CGROUP_DIR)
+ return (ENOTDIR);
+
+ if ((error = cgrp_taccess(parent, VEXEC, cred)))
+ return (error);
+
+ if (*name == '\0') {
+ cgnode_hold(parent);
+ *foundcp = parent;
+ return (0);
+ }
+
+ /*
+ * Search the directory for the matching name
+ * We need the lock protecting the cgn_dir list
+ * so that it doesn't change out from underneath us.
+ * cgrp_hash_lookup() will pass back the cgrp_node
+ * with a hold on it.
+ */
+
+ if (cgrp_hash_lookup(name, parent, HOLD, foundcp) != NULL) {
+ ASSERT(*foundcp);
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * Enter a directory entry for 'name' and 'cp' into directory 'dir'
+ *
+ * Returns 0 on success.
+ */
+int
+cgrp_direnter(
+ cgrp_mnt_t *cgm,
+ cgrp_node_t *dir, /* target directory to make entry in */
+ char *name, /* name of entry */
+ enum de_op op, /* entry operation */
+ cgrp_node_t *cn, /* existing cgrp_node, if rename */
+ struct vattr *va,
+ cgrp_node_t **cnp, /* return cgrp_node, if create/mkdir */
+ cred_t *cred)
+{
+ cgrp_dirent_t *cdp;
+ cgrp_node_t *found = NULL;
+ int error = 0;
+ char *s;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+ ASSERT(dir->cgn_type == CG_CGROUP_DIR);
+
+ /*
+	 * Don't allow '/' characters in pathname component.
+ */
+ for (s = name; *s; s++)
+ if (*s == '/')
+ return (EACCES);
+
+ if (name[0] == '\0')
+ panic("cgrp_direnter: NULL name");
+
+ /*
+ * For rename lock the source entry and check the link count
+ * to see if it has been removed while it was unlocked.
+ * Remember that we can only rename within the same directory.
+ */
+ if (op == DE_RENAME) {
+ if (cn->cgn_nlink == 0) {
+ return (ENOENT);
+ }
+
+ if (cn->cgn_nlink == MAXLINK) {
+ return (EMLINK);
+ }
+ cn->cgn_nlink++;
+ gethrestime(&cn->cgn_ctime);
+ }
+
+ /*
+ * This might be a "dangling detached directory".
+ * it could have been removed, but a reference
+ * to it kept in u_cwd. don't bother searching
+ * it, and with any luck the user will get tired
+ * of dealing with us and cd to some absolute
+ * pathway. *sigh*, thus in ufs, too.
+ */
+ if (dir->cgn_nlink == 0) {
+ error = ENOENT;
+ goto out;
+ }
+
+ /*
+ * Search for the entry. In all cases it is an error if it exists.
+ */
+ cdp = cgrp_hash_lookup(name, dir, HOLD, &found);
+
+ if (cdp) {
+ ASSERT(found != NULL);
+ error = EEXIST;
+ mutex_exit(&cgm->cg_contents);
+ cgnode_rele(found);
+ mutex_enter(&cgm->cg_contents);
+ } else {
+
+ /*
+ * The entry does not exist. Check write permission in
+ * directory to see if entry can be created.
+ */
+ if ((error = cgrp_taccess(dir, VWRITE, cred)) != 0)
+ goto out;
+ if (op == DE_CREATE || op == DE_MKDIR) {
+ /*
+ * Make new cgrp_node and directory entry as required.
+ */
+ error = cgrp_dirmakecgnode(dir, cgm, va, op, &cn, cred);
+ if (error)
+ goto out;
+
+ if (op == DE_MKDIR) {
+ /*
+ * inherit notify_on_release value from parent
+ */
+ cn->cgn_notify = dir->cgn_notify;
+ }
+ }
+
+ error = cgrp_diraddentry(dir, cn, name);
+ if (error != 0) {
+ if (op == DE_CREATE || op == DE_MKDIR) {
+ /*
+ * Unmake the inode we just made.
+ */
+ if ((cn->cgn_type) == CG_CGROUP_DIR) {
+ ASSERT(cdp == NULL);
+ /*
+ * cleanup allocs made by cgrp_dirinit
+ */
+ cgrp_dirtrunc(cn);
+ }
+ cn->cgn_nlink = 0;
+ gethrestime(&cn->cgn_ctime);
+ mutex_exit(&cgm->cg_contents);
+ cgnode_rele(cn);
+ mutex_enter(&cgm->cg_contents);
+ cn = NULL;
+ }
+ } else if (cnp) {
+ *cnp = cn;
+ } else if (op == DE_CREATE || op == DE_MKDIR) {
+ mutex_exit(&cgm->cg_contents);
+ cgnode_rele(cn);
+ mutex_enter(&cgm->cg_contents);
+ }
+ }
+
+out:
+ if (error && op == DE_RENAME) {
+ /* Undo bumped link count. */
+ cn->cgn_nlink--;
+ gethrestime(&cn->cgn_ctime);
+ }
+ return (error);
+}
+
+/*
+ * Delete entry cn of name "nm" from parent dir. This is used to both remove
+ * a cgroup directory and to remove the pseudo file nodes within the cgroup
+ * directory (by recursively calling itself). It frees the dir entry space
+ * and decrements link count on cgrp_node(s).
+ *
+ * Return 0 on success.
+ */
+int
+cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op,
+ cred_t *cred)
+{
+ cgrp_mnt_t *cgm = VTOCGM(cn->cgn_vnode);
+ cgrp_dirent_t *cndp;
+ int error;
+ size_t namelen;
+ cgrp_node_t *cnnp;
+ timestruc_t now;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ if (nm[0] == '\0')
+ panic("cgrp_dirdelete: empty name for 0x%p", (void *)cn);
+
+ /*
+ * return error when removing . and ..
+ */
+ if (nm[0] == '.') {
+ if (nm[1] == '\0')
+ return (EINVAL);
+ if (nm[1] == '.' && nm[2] == '\0')
+ return (EEXIST); /* thus in ufs */
+ }
+
+ if ((error = cgrp_taccess(dir, VEXEC|VWRITE, cred)) != 0)
+ return (error);
+
+ if (dir->cgn_dir == NULL)
+ return (ENOENT);
+
+ if (op == DR_RMDIR) {
+ /*
+ * This is the top-level removal of a cgroup dir. Start by
+ * removing the fixed pseudo file entries from the dir. We do
+ * this by recursively calling back into this function with
+ * a different op code. The caller of this function has
+ * already verified that it is safe to remove this directory.
+ */
+ cgrp_dirent_t *cdp;
+
+ ASSERT(cn->cgn_type == CG_CGROUP_DIR);
+
+ cdp = cn->cgn_dir;
+ while (cdp) {
+ cgrp_node_t *pseudo_node;
+ cgrp_dirent_t *nextp;
+
+ if (strcmp(cdp->cgd_name, ".") == 0 ||
+ strcmp(cdp->cgd_name, "..") == 0) {
+ cdp = cdp->cgd_next;
+ continue;
+ }
+
+ pseudo_node = cdp->cgd_cgrp_node;
+ nextp = cdp->cgd_next;
+
+ cgnode_hold(pseudo_node);
+ error = cgrp_dirdelete(cn, pseudo_node,
+ cdp->cgd_name, DR_REMOVE, cred);
+ mutex_exit(&cgm->cg_contents);
+ cgnode_rele(pseudo_node);
+ mutex_enter(&cgm->cg_contents);
+
+ cdp = nextp;
+ }
+
+ cgrp_cg_hash_remove(cgm, cn);
+ }
+
+ cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp);
+ VERIFY(cndp != NULL);
+ VERIFY(cn == cnnp);
+
+ cgrp_hash_out(cndp);
+
+ /* Take cndp out of the directory list. */
+ ASSERT(cndp->cgd_next != cndp);
+ ASSERT(cndp->cgd_prev != cndp);
+ if (cndp->cgd_prev) {
+ cndp->cgd_prev->cgd_next = cndp->cgd_next;
+ }
+ if (cndp->cgd_next) {
+ cndp->cgd_next->cgd_prev = cndp->cgd_prev;
+ }
+
+ /*
+ * If the roving slot pointer happens to match cndp,
+ * point it at the previous dirent.
+ */
+ if (dir->cgn_dir->cgd_prev == cndp) {
+ dir->cgn_dir->cgd_prev = cndp->cgd_prev;
+ }
+ ASSERT(cndp->cgd_next != cndp);
+ ASSERT(cndp->cgd_prev != cndp);
+
+ /* cndp points to the correct directory entry */
+ namelen = strlen(cndp->cgd_name) + 1;
+
+ kmem_free(cndp, sizeof (cgrp_dirent_t) + namelen);
+ dir->cgn_size -= (sizeof (cgrp_dirent_t) + namelen);
+ dir->cgn_dirents--;
+
+ gethrestime(&now);
+ dir->cgn_mtime = now;
+ dir->cgn_ctime = now;
+ cn->cgn_ctime = now;
+
+ ASSERT(cn->cgn_nlink > 0);
+ cn->cgn_nlink--;
+ if (op == DR_RMDIR && cn->cgn_type == CG_CGROUP_DIR) {
+ cgrp_dirtrunc(cn);
+ ASSERT(cn->cgn_nlink == 0);
+ }
+ return (0);
+}
+
+/*
+ * Initialize a cgrp_node and add it to file list under mount point.
+ */
+void
+cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred)
+{
+ struct vnode *vp;
+ timestruc_t now;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+ ASSERT(vap != NULL);
+
+ cn->cgn_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ cn->cgn_mask = 0;
+ cn->cgn_attr.va_type = vap->va_type;
+ cn->cgn_nlink = 1;
+ cn->cgn_size = 0;
+
+ if (cred == NULL) {
+ cn->cgn_uid = vap->va_uid;
+ cn->cgn_gid = vap->va_gid;
+ } else {
+ cn->cgn_uid = crgetuid(cred);
+ cn->cgn_gid = crgetgid(cred);
+ }
+
+ cn->cgn_fsid = cgm->cg_dev;
+ cn->cgn_rdev = vap->va_rdev;
+ cn->cgn_blksize = PAGESIZE;
+ cn->cgn_nblocks = 0;
+ gethrestime(&now);
+ cn->cgn_atime = now;
+ cn->cgn_mtime = now;
+ cn->cgn_ctime = now;
+ cn->cgn_seq = 0;
+ cn->cgn_dir = NULL;
+
+ cn->cgn_vnode = vn_alloc(KM_SLEEP);
+ vp = CGNTOV(cn);
+ vn_setops(vp, cgrp_vnodeops);
+ vp->v_vfsp = cgm->cg_vfsp;
+ vp->v_type = vap->va_type;
+ vp->v_rdev = vap->va_rdev;
+ vp->v_data = (caddr_t)cn;
+
+ cn->cgn_nodeid = cgm->cg_gen++;
+
+ /*
+ * Add new cgrp_node to end of linked list of cgrp_nodes for this
+ * cgroup fs. Root directory is handled specially in cgrp_mount.
+ */
+ if (cgm->cg_rootnode != (cgrp_node_t *)NULL) {
+ cn->cgn_forw = NULL;
+ cn->cgn_back = cgm->cg_rootnode->cgn_back;
+ cn->cgn_back->cgn_forw = cgm->cg_rootnode->cgn_back = cn;
+ }
+ vn_exists(vp);
+}
+
+void
+cgrp_addnode(cgrp_mnt_t *cgm, cgrp_node_t *dir, char *name,
+ cgrp_nodetype_t type, struct vattr *nattr, cred_t *cr)
+{
+ cgrp_node_t *ncn;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ VERIFY0(cgrp_direnter(cgm, dir, name, DE_CREATE, (cgrp_node_t *)NULL,
+ nattr, &ncn, cr));
+
+ /*
+ * Fix the inode and assign the pseudo file type to be correct.
+ */
+ ncn->cgn_nodeid = cgrp_inode(type, dir->cgn_nodeid);
+ ncn->cgn_type = type;
+
+ /*
+ * Since we're creating these entries here and not via the
+ * normal VOP_CREATE code path, we need to do the rele to drop
+ * our hold. This will leave the vnode v_count at 0 when we
+ * come out of cgrp_inactive but we won't reclaim the vnode
+ * there since the cgn_nlink value will still be 1.
+ */
+ mutex_exit(&cgm->cg_contents);
+ cgnode_rele(ncn);
+ mutex_enter(&cgm->cg_contents);
+}
+
+/*
+ * cgrp_dirinit is used internally to initialize a directory (dir)
+ * with '.' and '..' entries, without checking permissions or locking.
+ * It also creates the entries for the pseudo file nodes that reside in the
+ * directory.
+ */
+void
+cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr)
+{
+ cgrp_dirent_t *dot, *dotdot;
+ timestruc_t now;
+ cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode);
+ cgrp_ssde_t *ssdp;
+ cgrp_subsys_dirent_t *pseudo_files;
+ struct vattr nattr;
+ int i;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+ ASSERT(dir->cgn_type == CG_CGROUP_DIR);
+
+ ASSERT(cgm->cg_ssid > 0 && cgm->cg_ssid < CG_SSID_NUM);
+ ssdp = &cg_ssde_dir[cgm->cg_ssid];
+
+ /*
+ * If this is the top-level cgroup created by the mount then we need to
+ * count up the number of procs and tasks already running in the zone.
+ */
+
+ /*
+ * Set the cgroup ID for this cgrp_node by using a counter on each
+ * mount.
+ */
+ dir->cgn_id = cgm->cg_grp_gen++;
+ cgrp_cg_hash_insert(cgm, dir);
+	/* Initialize the first cgroup if this is the top-level group */
+ if (parent == dir)
+ cgrp_cg_hash_init(dir);
+
+ /*
+ * Initialize the entries
+ */
+ dot = kmem_zalloc(sizeof (cgrp_dirent_t) + 2, KM_SLEEP);
+ dot->cgd_cgrp_node = dir;
+ dot->cgd_offset = 0;
+ dot->cgd_name = (char *)dot + sizeof (cgrp_dirent_t);
+ dot->cgd_name[0] = '.';
+ dot->cgd_parent = dir;
+ cgrp_hash_in(dot);
+
+ dotdot = kmem_zalloc(sizeof (cgrp_dirent_t) + 3, KM_SLEEP);
+ dotdot->cgd_cgrp_node = parent;
+ dotdot->cgd_offset = 1;
+ dotdot->cgd_name = (char *)dotdot + sizeof (cgrp_dirent_t);
+ dotdot->cgd_name[0] = '.';
+ dotdot->cgd_name[1] = '.';
+ dotdot->cgd_parent = dir;
+ cgrp_hash_in(dotdot);
+
+ /*
+ * Initialize directory entry list.
+ */
+ dot->cgd_next = dotdot;
+ dot->cgd_prev = dotdot; /* dot's cgd_prev holds roving slot pointer */
+ dotdot->cgd_next = NULL;
+ dotdot->cgd_prev = dot;
+
+ gethrestime(&now);
+ dir->cgn_mtime = now;
+ dir->cgn_ctime = now;
+
+ parent->cgn_nlink++;
+ parent->cgn_ctime = now;
+
+ dir->cgn_dir = dot;
+ dir->cgn_size = 2 * sizeof (cgrp_dirent_t) + 5; /* dot and dotdot */
+ dir->cgn_dirents = 2;
+ dir->cgn_nlink = 2;
+
+ bzero(&nattr, sizeof (struct vattr));
+ nattr.va_mode = (mode_t)(0644);
+ nattr.va_type = VREG;
+ nattr.va_rdev = 0;
+
+ /*
+ * If this is the top-level dir in the file system then it always
+ * has a release_agent pseudo file. Only the top-level dir has this
+ * file.
+ */
+ if (parent == dir) {
+ cgrp_addnode(cgm, dir, "release_agent", CG_REL_AGENT, &nattr,
+ cr);
+ }
+
+ pseudo_files = ssdp->cg_ssde_files;
+ for (i = 0; i < ssdp->cg_ssde_nfiles; i++) {
+ cgrp_addnode(cgm, dir, pseudo_files[i].cgrp_ssd_name,
+ pseudo_files[i].cgrp_ssd_type, &nattr, cr);
+ }
+}
+
+/*
+ * cgrp_dirtrunc is called to remove all directory entries under this
+ * directory.
+ */
+void
+cgrp_dirtrunc(cgrp_node_t *dir)
+{
+ cgrp_dirent_t *cgdp;
+ timestruc_t now;
+
+ ASSERT(MUTEX_HELD(&VTOCGM(dir->cgn_vnode)->cg_contents));
+ ASSERT(dir->cgn_type == CG_CGROUP_DIR);
+
+ for (cgdp = dir->cgn_dir; cgdp; cgdp = dir->cgn_dir) {
+ size_t namelen;
+ cgrp_node_t *cn;
+
+ ASSERT(cgdp->cgd_next != cgdp);
+ ASSERT(cgdp->cgd_prev != cgdp);
+ ASSERT(cgdp->cgd_cgrp_node);
+
+ dir->cgn_dir = cgdp->cgd_next;
+ namelen = strlen(cgdp->cgd_name) + 1;
+
+ /*
+ * Adjust the link counts to account for this directory entry
+ * removal. We do hold/rele operations to free up these nodes.
+ */
+ cn = cgdp->cgd_cgrp_node;
+ ASSERT(cn->cgn_nlink > 0);
+ cn->cgn_nlink--;
+
+ cgrp_hash_out(cgdp);
+ kmem_free(cgdp, sizeof (cgrp_dirent_t) + namelen);
+ dir->cgn_size -= (sizeof (cgrp_dirent_t) + namelen);
+ dir->cgn_dirents--;
+ }
+
+ gethrestime(&now);
+ dir->cgn_mtime = now;
+ dir->cgn_ctime = now;
+
+ ASSERT(dir->cgn_dir == NULL);
+ ASSERT(dir->cgn_size == 0);
+ ASSERT(dir->cgn_dirents == 0);
+}
+
+static int
+cgrp_diraddentry(cgrp_node_t *dir, cgrp_node_t *cn, char *name)
+{
+ cgrp_dirent_t *cdp, *cpdp;
+ size_t namelen, alloc_size;
+ timestruc_t now;
+
+ /*
+ * Make sure the parent directory wasn't removed from
+ * underneath the caller.
+ */
+ if (dir->cgn_dir == NULL)
+ return (ENOENT);
+
+ /* Check that everything is on the same filesystem. */
+ if (cn->cgn_vnode->v_vfsp != dir->cgn_vnode->v_vfsp)
+ return (EXDEV);
+
+ /* Allocate and initialize directory entry */
+ namelen = strlen(name) + 1;
+ alloc_size = namelen + sizeof (cgrp_dirent_t);
+ cdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI);
+ if (cdp == NULL)
+ return (ENOSPC);
+
+ cn->cgn_parent = dir;
+
+ dir->cgn_size += alloc_size;
+ dir->cgn_dirents++;
+ cdp->cgd_cgrp_node = cn;
+ cdp->cgd_parent = dir;
+
+ /* The directory entry and its name were allocated sequentially. */
+ cdp->cgd_name = (char *)cdp + sizeof (cgrp_dirent_t);
+ (void) strcpy(cdp->cgd_name, name);
+
+ cgrp_hash_in(cdp);
+
+ /*
+ * Some utilities expect the size of a directory to remain
+	 * somewhat static. For example, consider a routine that removes
+	 * subdirectories between calls to readdir(); the size of the
+	 * directory changes from underneath it, so the real
+	 * directory offset in bytes is invalid. To circumvent
+	 * this problem, we initialize a directory entry with a
+	 * phony offset, and use this offset to determine end of
+ * file in cgrp_readdir.
+ */
+ cpdp = dir->cgn_dir->cgd_prev;
+ /*
+ * Install at first empty "slot" in directory list.
+ */
+ while (cpdp->cgd_next != NULL && (cpdp->cgd_next->cgd_offset -
+ cpdp->cgd_offset) <= 1) {
+ ASSERT(cpdp->cgd_next != cpdp);
+ ASSERT(cpdp->cgd_prev != cpdp);
+ ASSERT(cpdp->cgd_next->cgd_offset > cpdp->cgd_offset);
+ cpdp = cpdp->cgd_next;
+ }
+ cdp->cgd_offset = cpdp->cgd_offset + 1;
+
+ /*
+ * If we're at the end of the dirent list and the offset (which
+ * is necessarily the largest offset in this directory) is more
+ * than twice the number of dirents, that means the directory is
+ * 50% holes. At this point we reset the slot pointer back to
+ * the beginning of the directory so we start using the holes.
+ * The idea is that if there are N dirents, there must also be
+ * N holes, so we can satisfy the next N creates by walking at
+ * most 2N entries; thus the average cost of a create is constant.
+ * Note that we use the first dirent's cgd_prev as the roving
+ * slot pointer; it's ugly, but it saves a word in every dirent.
+ */
+ if (cpdp->cgd_next == NULL && cpdp->cgd_offset > 2 * dir->cgn_dirents)
+ dir->cgn_dir->cgd_prev = dir->cgn_dir->cgd_next;
+ else
+ dir->cgn_dir->cgd_prev = cdp;
+
+ ASSERT(cpdp->cgd_next != cpdp);
+ ASSERT(cpdp->cgd_prev != cpdp);
+
+ cdp->cgd_next = cpdp->cgd_next;
+ if (cdp->cgd_next) {
+ cdp->cgd_next->cgd_prev = cdp;
+ }
+ cdp->cgd_prev = cpdp;
+ cpdp->cgd_next = cdp;
+
+ ASSERT(cdp->cgd_next != cdp);
+ ASSERT(cdp->cgd_prev != cdp);
+ ASSERT(cpdp->cgd_next != cpdp);
+ ASSERT(cpdp->cgd_prev != cpdp);
+
+ gethrestime(&now);
+ dir->cgn_mtime = now;
+ dir->cgn_ctime = now;
+
+ return (0);
+}
+
+static int
+cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va,
+ enum de_op op, cgrp_node_t **newnode, struct cred *cred)
+{
+ cgrp_node_t *cn;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+ ASSERT(va != NULL);
+
+ if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
+ ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
+ return (EOVERFLOW);
+
+ cn = kmem_zalloc(sizeof (cgrp_node_t), KM_SLEEP);
+ cgrp_node_init(cgm, cn, va, cred);
+
+ cn->cgn_vnode->v_rdev = cn->cgn_rdev = NODEV;
+ cn->cgn_vnode->v_type = va->va_type;
+ cn->cgn_uid = crgetuid(cred);
+ cn->cgn_gid = crgetgid(cred);
+
+ if (va->va_mask & AT_ATIME)
+ cn->cgn_atime = va->va_atime;
+ if (va->va_mask & AT_MTIME)
+ cn->cgn_mtime = va->va_mtime;
+
+ if (op == DE_MKDIR) {
+ cn->cgn_type = CG_CGROUP_DIR;
+ cgrp_dirinit(dir, cn, cred);
+ }
+
+ *newnode = cn;
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
new file mode 100644
index 0000000000..7805c3f2bd
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
@@ -0,0 +1,1071 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * The cgroup file system implements a subset of the Linux cgroup functionality
+ * for use by lx-branded zones. On Linux, cgroups are a generic process grouping
+ * mechanism which is used to apply various behaviors to the processes within
+ * the group, although its primary purpose is resource management.
+ *
+ * In Linux, the cgroup file system provides two pieces of functionality:
+ * 1) A per-mount set of cgroups arranged in a tree, such that every task in
+ * the system is in one, and only one, of the cgroups in the tree.
+ * 2) A set of subsystems; each subsystem has subsystem-specific state and
+ * behavior and is associated with a cgroup mount. This provides a way to
+ * apply arbitrary functionality (but generally resource management related)
+ * to the processes associated with the nodes in the tree at that mount
+ * point.
+ *
+ * For example, it is common to see cgroup trees (each is its own mount with a
+ * different subsystem controller) for blkio, cpuset, memory, systemd (has no
+ * controller), etc. Within each tree there is a top-level directory with at
+ * least a cgroup.procs, notify_on_release, release_agent, and tasks file.
+ * The cgroup.procs file lists the processes within that group and the tasks
+ * file lists the threads in the group. There could be subdirectories, which
+ * define new cgroups, that then contain a subset of the processes. Each
+ * subdirectory also has, at a minimum, a cgroup.procs, notify_on_release, and
+ * tasks file.
+ *
+ * Since we're using lx to run user-level code within zones, the majority (all?)
+ * of the cgroup resource management functionality simply doesn't apply to us.
+ * The primary need for cgroups is to support the init program 'systemd' as the
+ * consumer. systemd only requires the process grouping hierarchy of cgroups,
+ * although it can also use the resource management features if they are
+ * available. Given this, our cgroup file system only implements the process
+ * hierarchy and does not report that any resource management controllers are
+ * available for separate mounts.
+ *
+ * In addition to the hierarchy, the other important component of cgroups that
+ * is used by systemd is the 'release_agent'. This provides a mechanism to
+ * run a command when a cgroup becomes empty (the last task in the group
+ * leaves, either by exit or move, and there are no more sub-cgroups). The
+ * 'release_agent' file only exists in the top-level cgroup of the mounted
+ * file system and holds the path to a command to run. The 'notify_on_release'
+ * file exists in each cgroup dir. If that file contains a '1' then the agent
+ * is run when that group becomes empty. The agent is passed a path string of
+ * the cgroup, relative to the file system mount point (e.g. a mount on
+ * /sys/fs/cgroups/systemd with a sub-cgroup of /sys/fs/cgroups/systemd/foo/bar
+ * gets the arg /foo/bar).
+ *
+ * Cgroup membership is implemented via hooks into the lx brand code. When
+ * the cgroup file system loads it installs callbacks for:
+ * lx_cgrp_initlwp
+ * lx_cgrp_freelwp
+ * and when it unloads it clears those hooks. The lx brand code calls those
+ * hooks when an lwp starts and when it exits. Internally we use a
+ * simple reference counter (cgn_task_cnt) on the cgroup node to track how many
+ * threads are in the group, so we can tell when a group becomes empty.
+ * To make this quick, a hash table (cg_grp_hash) is maintained on the
+ * cgrp_mnt_t struct to allow quick lookups by cgroup ID. The hash table is
+ * sized so that there should typically only be 0 or 1 cgroups per bucket.
+ * We also keep a reference to the file system in the zone-specific brand data
+ * (lxzd_cgroup) so that the lx brand code can pass in the correct vfs_t
+ * when it runs the hook.
+ *
+ * Once a cgroup is about to become empty, the final process exiting the cgroup
+ * will launch a new user-level process which execs the release agent. The new
+ * process is created as a child of zsched (indicated by the -1 pid argument
+ * to newproc) and is not associated with the exiting process in any way.
+ *
+ * This file system is similar to tmpfs in that directories only exist in
+ * memory. Each subdirectory represents a different cgroup. Within the cgroup
+ * there are pseudo files (see cg_ssde_dir) with well-defined names which
+ * control the configuration and behavior of the cgroup (see cgrp_nodetype_t).
+ * The primary files within every cgroup are named 'cgroup.procs',
+ * 'notify_on_release', and 'tasks' (as well as 'release_agent' in the
+ * top-level cgroup). The cgroup.procs and tasks files are used to control and
+ * list which processes/threads belong to the cgroup. In the general case there
+ * could be additional files in the cgroup, which defined additional behavior
+ * (i.e. subsystem specific pseudo files), although none exist at this time.
+ *
+ * Each cgroup node has a unique ID (cgn_nodeid) within the mount. This ID is
+ * used to correlate with the threads to determine cgroup membership. When
+ * assigning a PID to a cgroup (via write) the code updates the br_cgroupid
+ * member in the brand-specific lx_lwp_data structure to control which cgroup
+ * the thread belongs to. Note that because the br_cgroupid lives in
+ * lx_lwp_data, native processes will not appear in the cgroup hierarchy.
+ *
+ * An overview of the behavior for the various vnode operations is:
+ * - no hardlinks or symlinks
+ * - no file create (the subsystem-specific files are a fixed list of
+ * pseudo-files accessible within the directory)
+ * - no file remove
+ * - no file rename, but a directory (i.e. a cgroup) can be renamed within the
+ * containing directory, but not into a different directory
+ * - can mkdir and rmdir to create/destroy cgroups
+ * - cannot rmdir while it contains tasks or a subdir (i.e. a sub-cgroup)
+ * - open, read/write, and close on the subsystem-specific pseudo files are
+ *   allowed, as this is the interface to configure and report on the cgroup.
+ * The pseudo file's mode controls write access and cannot be changed.
+ *
+ * The locking in this file system is simple since the file system is not
+ * subjected to heavy I/O activity and all data is in-memory. There is a single
+ * global mutex for each mount (cg_contents). This mutex is held for the life
+ * of most vnode operations. The most active path is probably the LWP start and
+ * exit hooks which increment/decrement the reference counter on the cgroup
+ * node. The lock is important for this case since we don't want concurrent
+ * activity (such as moving the process into another cgroup) while we're trying
+ * to lookup the cgroup from the mount's hash table. We must be careful to
+ * avoid a deadlock while reading or writing since that code can take pidlock
+ * and p_lock, but the cgrp_lwp_fork_helper can also be called while one of
+ * those is held. To prevent deadlock we always take cg_contents after pidlock
+ * and p_lock.
+ *
+ * EXTENDING THE FILE SYSTEM
+ *
+ * When adding support for a new subsystem, be sure to also update the
+ * lxpr_read_cgroups function in lx_procfs so that the subsystem is reported
+ * by proc.
+ *
+ * Although we don't currently support any subsystem controllers, the design
+ * allows for the file system to be extended to add controller emulation
+ * if needed. New controller IDs (i.e. different subsystems) for a mount can
+ * be defined in the cgrp_ssid_t enum (e.g. CG_SSID_CPUSET or CG_SSID_MEMORY)
+ * and new node types for additional pseudo files in the tree can be defined in
+ * the cgrp_nodetype_t enum (e.g. CG_CPUSET_CPUS or CG_MEMORY_USAGE_IN_BYTES).
+ * The cg_ssde_dir array would need a new entry for the new subsystem to
+ * control which nodes are visible in a directory for the new subsystem.
+ *
+ * New emulation would then need to be written to manage the behavior on the
+ * new pseudo file(s) associated with new cgrp_nodetype_t types.
+ *
+ * Within lx procfs the lxpr_read_pid_cgroup() function would need to be
+ * updated so that it reported the various subsystems used by the different
+ * mounts.
+ *
+ * In addition, in order to support more than one cgroup mount we would need a
+ * list of cgroup IDs associated with every thread, instead of just one ID
+ * (br_cgroupid). The thread data would need to become a struct which held
+ * both an ID and an indication as to which mounted cgroup file system instance
+ * the ID was associated with. We would also need a list of cgroup mounts per
+ * zone, instead of the current single zone reference.
+ */
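+
+/*
+ * As a concrete illustration of the mechanism described above, a
+ * systemd-style consumer does roughly the following from user level
+ * (the paths and agent name here are examples only):
+ *
+ *	mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd
+ *	echo /usr/lib/systemd/systemd-cgroups-agent \
+ *	    > /sys/fs/cgroup/systemd/release_agent
+ *	mkdir /sys/fs/cgroup/systemd/foo
+ *	echo 1 > /sys/fs/cgroup/systemd/foo/notify_on_release
+ *	echo $$ > /sys/fs/cgroup/systemd/foo/cgroup.procs
+ *
+ * When the last task leaves "foo" (and "foo" has no sub-cgroups), the
+ * agent is run with the argument "/foo".
+ */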
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/vmparam.h>
+#include <sys/corectl.h>
+#include <sys/contract_impl.h>
+#include <sys/pool.h>
+#include <sys/stack.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+#include "cgrps.h"
+
+/* Module level parameters */
+static int cgrp_fstype;
+static dev_t cgrp_dev;
+
+#define MAX_AGENT_EVENTS 32 /* max num queued events */
+
+#define	UMNT_DELAY_TIME	drv_usectohz(50000)	/* 1/20th of a second */
+#define	UMNT_RETRY_MAX	100			/* 100 times - 5 secs */
+
+/*
+ * cgrp_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. The filesystem module must not be
+ * allowed to go away before the last VFS_FREEVFS() call has been made. Since
+ * this is just an atomic counter, there's no need for locking.
+ */
+static uint32_t cgrp_mountcount;
+
+/*
+ * cgrp_minfree is the minimum amount of swap space that cgroups leaves for
+ * the rest of the zone. In other words, if the amount of free swap space
+ * in the zone drops below cgrp_minfree, cgroup anon allocations will fail.
+ * This number is only likely to become a factor when DRAM and swap have both
+ * been capped low to allow for maximum tenancy.
+ */
+size_t cgrp_minfree = 0;
+
+/*
+ * CGMINFREE -- the value from which cgrp_minfree is derived -- should be
+ * configured to a value that is roughly the smallest practical value for
+ * memory + swap minus the largest reasonable size for cgroups in such
+ * a configuration. As of this writing, the smallest practical memory + swap
+ * configuration is 128MB, and it seems reasonable to allow cgroups to consume
+ * no more than half of this, yielding a CGMINFREE of 64MB.
+ */
+#define	CGMINFREE	(64 * 1024 * 1024)	/* 64 Megabytes */
+
+extern pgcnt_t swapfs_minfree;
+
+/*
+ * cgroup vfs operations.
+ */
+static int cgrp_init(int, char *);
+static int cgrp_mount(struct vfs *, struct vnode *,
+ struct mounta *, struct cred *);
+static int cgrp_unmount(struct vfs *, int, struct cred *);
+static int cgrp_root(struct vfs *, struct vnode **);
+static int cgrp_statvfs(struct vfs *, struct statvfs64 *);
+static void cgrp_freevfs(vfs_t *vfsp);
+
+/* Forward declarations for hooks */
+static void cgrp_lwp_fork_helper(vfs_t *, uint_t, id_t, pid_t);
+static void cgrp_lwp_exit_helper(vfs_t *, uint_t, id_t, pid_t);
+
+/*
+ * Loadable module wrapper
+ */
+#include <sys/modctl.h>
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "lx_cgroup",
+ cgrp_init,
+ VSW_ZMOUNT,
+ NULL
+};
+
+/*
+ * Module linkage information
+ */
+static struct modlfs modlfs = {
+ &mod_fsops, "lx brand cgroups", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modlfs, NULL
+};
+
+int
+_init()
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini()
+{
+ int error;
+
+ if (cgrp_mountcount)
+ return (EBUSY);
+
+ if ((error = mod_remove(&modlinkage)) != 0)
+ return (error);
+
+ /* Disable hooks used by the lx brand module. */
+ lx_cgrp_initlwp = NULL;
+ lx_cgrp_freelwp = NULL;
+
+ /*
+ * Tear down the operations vectors
+ */
+ (void) vfs_freevfsops_by_type(cgrp_fstype);
+ vn_freevnodeops(cgrp_vnodeops);
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Initialize global locks, etc. Called when loading cgroup module.
+ */
+static int
+cgrp_init(int fstype, char *name)
+{
+ static const fs_operation_def_t cgrp_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = cgrp_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = cgrp_unmount },
+ VFSNAME_ROOT, { .vfs_root = cgrp_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = cgrp_statvfs },
+ VFSNAME_FREEVFS, { .vfs_freevfs = cgrp_freevfs },
+ NULL, NULL
+ };
+ extern const struct fs_operation_def cgrp_vnodeops_template[];
+ int error;
+ extern void cgrp_hash_init();
+ major_t dev;
+
+ cgrp_hash_init();
+ cgrp_fstype = fstype;
+ ASSERT(cgrp_fstype != 0);
+
+ error = vfs_setfsops(fstype, cgrp_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "cgrp_init: bad vfs ops template");
+ return (error);
+ }
+
+ error = vn_make_ops(name, cgrp_vnodeops_template, &cgrp_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "cgrp_init: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+ * cgrp_minfree doesn't need to be some function of configured
+ * swap space since it really is an absolute limit of swap space
+ * which still allows other processes to execute.
+ */
+ if (cgrp_minfree == 0) {
+ /* Set if not patched */
+ cgrp_minfree = btopr(CGMINFREE);
+ }
+
+ if ((dev = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN, "cgrp_init: Can't get unique device number.");
+ dev = 0;
+ }
+
+ /*
+ * Make the pseudo device
+ */
+ cgrp_dev = makedevice(dev, 0);
+
+ /* Install the hooks used by the lx brand module. */
+ lx_cgrp_initlwp = cgrp_lwp_fork_helper;
+ lx_cgrp_freelwp = cgrp_lwp_exit_helper;
+
+ return (0);
+}
+
+static int
+cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+ cgrp_mnt_t *cgm = NULL;
+ struct cgrp_node *cp;
+ struct pathname dpn;
+ int error;
+ struct vattr rattr;
+ cgrp_ssid_t ssid = CG_SSID_GENERIC;
+ lx_zone_data_t *lxzdata;
+
+ if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+ return (error);
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ /*
+ * Since we depend on per-thread lx brand data, only allow mounting
+ * within lx zones.
+ */
+ if (curproc->p_zone->zone_brand != &lx_brand)
+ return (EINVAL);
+
+ /*
+ * Ensure we don't allow overlaying mounts
+ */
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * Having the resource be anything but "swap" doesn't make sense.
+ */
+ vfs_setresource(vfsp, "swap", 0);
+
+ /* cgroups don't support read-only mounts */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Here is where we could support subsystem-specific controller
+ * mounting. For example, if mounting a cgroup fs with the 'cpuset'
+ * option to specify that particular controller.
+ *
+ * char *argstr;
+ * if (vfs_optionisset(vfsp, "cpuset", &argstr)) {
+ * if (ssid != CG_SSID_GENERIC) {
+ * error = EINVAL;
+ * goto out;
+ * }
+ * ssid = CG_SSID_CPUSET;
+ * }
+ */
+
+ error = pn_get(uap->dir,
+ (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn);
+ if (error != 0)
+ goto out;
+
+ /*
+ * We currently only support one mount per zone.
+ */
+ lxzdata = ztolxzd(curproc->p_zone);
+ mutex_enter(&lxzdata->lxzd_lock);
+ if (lxzdata->lxzd_cgroup != NULL) {
+ mutex_exit(&lxzdata->lxzd_lock);
+ pn_free(&dpn);
+ return (EINVAL);
+ }
+
+ cgm = kmem_zalloc(sizeof (*cgm), KM_SLEEP);
+
+ /* Set but don't bother entering the mutex (not on mount list yet) */
+ mutex_init(&cgm->cg_contents, NULL, MUTEX_DEFAULT, NULL);
+
+ cgm->cg_vfsp = lxzdata->lxzd_cgroup = vfsp;
+ mutex_exit(&lxzdata->lxzd_lock);
+
+ cgm->cg_lxzdata = lxzdata;
+ cgm->cg_ssid = ssid;
+
+ vfsp->vfs_data = (caddr_t)cgm;
+ vfsp->vfs_fstype = cgrp_fstype;
+ vfsp->vfs_dev = cgrp_dev;
+ vfsp->vfs_bsize = PAGESIZE;
+ vfsp->vfs_flag |= VFS_NOTRUNC;
+ vfs_make_fsid(&vfsp->vfs_fsid, cgrp_dev, cgrp_fstype);
+ cgm->cg_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
+ (void) strcpy(cgm->cg_mntpath, dpn.pn_path);
+
+ cgm->cg_grp_hash = kmem_zalloc(sizeof (cgrp_node_t *) * CGRP_HASH_SZ,
+ KM_SLEEP);
+
+ /* allocate and initialize root cgrp_node structure */
+ bzero(&rattr, sizeof (struct vattr));
+ rattr.va_mode = (mode_t)(S_IFDIR | 0755);
+ rattr.va_type = VDIR;
+ rattr.va_rdev = 0;
+ cp = kmem_zalloc(sizeof (struct cgrp_node), KM_SLEEP);
+
+ mutex_enter(&cgm->cg_contents);
+ cgrp_node_init(cgm, cp, &rattr, cr);
+
+ CGNTOV(cp)->v_flag |= VROOT;
+
+ /*
+ * initialize linked list of cgrp_nodes so that the back pointer of
+ * the root cgrp_node always points to the last one on the list
+ * and the forward pointer of the last node is null
+ */
+ cp->cgn_back = cp;
+ cp->cgn_forw = NULL;
+ cp->cgn_nlink = 0;
+ cgm->cg_rootnode = cp;
+
+ cp->cgn_type = CG_CGROUP_DIR;
+ cp->cgn_nodeid = cgrp_inode(CG_CGROUP_DIR, cgm->cg_gen);
+
+ /*
+ * This initial cgrp_node will have an ID of 0. All existing processes
+ * inside the zone will have been started with, or inherited, a
+ * br_cgroupid of 0. The cgrp_cg_hash_init function will initialize the
+ * cgn_task_cnt for cgroup 0 to reflect the number of tasks already in
+ * the group.
+ *
+ * Because we must hold cg_contents in cgrp_lwp_fork_helper and
+ * cgrp_lwp_exit_helper, no process can be creating or exiting another
+ * thread (although that is unlikely anyway since the cgroup filesystem
+ * is normally mounted at the start of zone bootup, before anything
+ * else is started).
+ */
+ cgrp_dirinit(cp, cp, cr);
+
+ mutex_exit(&cgm->cg_contents);
+
+ pn_free(&dpn);
+ error = 0;
+ atomic_inc_32(&cgrp_mountcount);
+
+out:
+ if (error == 0)
+ vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
+
+ return (error);
+}
+
+static int
+cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ cgrp_node_t *cgnp, *cancel;
+ struct vnode *vp;
+ int error;
+ uint_t cnt;
+ int retry_cnt = 0;
+
+ if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
+ return (error);
+
+retry:
+ mutex_enter(&cgm->cg_contents);
+
+ /*
+ * In the normal unmount case, if there were no open files, only the
+ * root node would have a reference count. However, the user-level
+ * agent manager should have the root vnode open and be waiting in
+ * ioctl. We need to wake the manager and it may take some retries
+ * before it closes its file descriptor.
+ *
+ * With cg_contents held, nothing can be added or removed.
+ * There may be some dirty pages. To prevent fsflush from
+ * disrupting the unmount, put a hold on each node while scanning.
+ * If we find a previously referenced node, undo the holds we have
+ * placed and fail EBUSY.
+ */
+ cgnp = cgm->cg_rootnode;
+
+ ASSERT(cgm->cg_lxzdata->lxzd_cgroup != NULL);
+
+ vp = CGNTOV(cgnp);
+ mutex_enter(&vp->v_lock);
+
+ if (flag & MS_FORCE) {
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&cgm->cg_contents);
+ return (EINVAL);
+ }
+
+ cnt = vp->v_count;
+ if (cnt > 1) {
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&cgm->cg_contents);
+ /* Likely because the user-level manager hasn't exited yet */
+ if (retry_cnt++ < UMNT_RETRY_MAX) {
+ delay(UMNT_DELAY_TIME);
+ goto retry;
+ }
+ return (EBUSY);
+ }
+
+ mutex_exit(&vp->v_lock);
+
+ /*
+ * Check for open files. An open file causes everything to unwind.
+ */
+ for (cgnp = cgnp->cgn_forw; cgnp; cgnp = cgnp->cgn_forw) {
+ vp = CGNTOV(cgnp);
+ mutex_enter(&vp->v_lock);
+ cnt = vp->v_count;
+ if (cnt > 0) {
+ /* An open file; unwind the holds we've been adding. */
+ mutex_exit(&vp->v_lock);
+ cancel = cgm->cg_rootnode->cgn_forw;
+ while (cancel != cgnp) {
+ vp = CGNTOV(cancel);
+ ASSERT(vp->v_count > 0);
+ VN_RELE(vp);
+ cancel = cancel->cgn_forw;
+ }
+ mutex_exit(&cgm->cg_contents);
+ return (EBUSY);
+ } else {
+ /* directly add a VN_HOLD since we have the lock */
+ vp->v_count++;
+ mutex_exit(&vp->v_lock);
+ }
+ }
+
+ mutex_enter(&cgm->cg_lxzdata->lxzd_lock);
+ cgm->cg_lxzdata->lxzd_cgroup = NULL;
+ mutex_exit(&cgm->cg_lxzdata->lxzd_lock);
+ kmem_free(cgm->cg_grp_hash, sizeof (cgrp_node_t *) * CGRP_HASH_SZ);
+
+ /*
+ * We can drop the mutex now because
+ * no one can find this mount anymore
+ */
+ vfsp->vfs_flag |= VFS_UNMOUNTED;
+ mutex_exit(&cgm->cg_contents);
+
+ return (0);
+}
+
+/*
+ * Implementation of VFS_FREEVFS(). This is called by the vfs framework after
+ * umount and the last VFS_RELE, to trigger the release of any resources still
+ * associated with the given vfs_t. This is normally called immediately after
+ * cgrp_unmount.
+ */
+void
+cgrp_freevfs(vfs_t *vfsp)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ cgrp_node_t *cn;
+ struct vnode *vp;
+
+ /*
+ * Free all kmem_alloc'd memory associated with
+ * this filesystem. To do this, we go through the file list twice,
+ * once to remove all the directory entries, and then to remove
+ * all the pseudo files.
+ */
+
+ /*
+ * Now that we are tearing ourselves down we need to remove the
+ * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove
+ * files from the system, causing v_count to go negative. Doing this
+ * seems a bit better than trying to set a flag on the mount that says
+ * we're tearing down.
+ */
+ vfsp->vfs_flag &= ~VFS_UNMOUNTED;
+
+ /*
+ * Remove all directory entries
+ */
+ for (cn = cgm->cg_rootnode; cn; cn = cn->cgn_forw) {
+ mutex_enter(&cgm->cg_contents);
+ if (cn->cgn_type == CG_CGROUP_DIR)
+ cgrp_dirtrunc(cn);
+ mutex_exit(&cgm->cg_contents);
+ }
+
+ ASSERT(cgm->cg_rootnode);
+
+ /*
+ * All links are gone, v_count is keeping nodes in place.
+ * VN_RELE should make the node disappear, unless somebody
+ * is holding pages against it. Nap and retry until it disappears.
+ *
+ * We re-acquire the lock to prevent others who have a HOLD on
+ * a cgrp_node via its pages or anon slots from blowing it away
+ * (in cgrp_inactive) while we're trying to get to it here. Once
+ * we have a HOLD on it we know it'll stick around.
+ */
+ mutex_enter(&cgm->cg_contents);
+
+ /* Remove all the files (except the rootnode) backwards. */
+ while ((cn = cgm->cg_rootnode->cgn_back) != cgm->cg_rootnode) {
+ mutex_exit(&cgm->cg_contents);
+ /*
+ * All nodes will be released here. Note we handled the link
+ * count above.
+ */
+ vp = CGNTOV(cn);
+ VN_RELE(vp);
+ mutex_enter(&cgm->cg_contents);
+ /*
+ * It's still there after the RELE. Someone else like pageout
+ * has a hold on it so wait a bit and then try again - we know
+ * they'll give it up soon.
+ */
+ if (cn == cgm->cg_rootnode->cgn_back) {
+ VN_HOLD(vp);
+ mutex_exit(&cgm->cg_contents);
+ delay(hz / 4);
+ mutex_enter(&cgm->cg_contents);
+ }
+ }
+ mutex_exit(&cgm->cg_contents);
+
+ VN_RELE(CGNTOV(cgm->cg_rootnode));
+
+ ASSERT(cgm->cg_mntpath);
+
+ kmem_free(cgm->cg_mntpath, strlen(cgm->cg_mntpath) + 1);
+
+ mutex_destroy(&cgm->cg_contents);
+ kmem_free(cgm, sizeof (cgrp_mnt_t));
+
+ /* Allow _fini() to succeed now */
+ atomic_dec_32(&cgrp_mountcount);
+}
+
+/*
+ * Return the root vnode for the given vfs.
+ */
+static int
+cgrp_root(struct vfs *vfsp, struct vnode **vpp)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ cgrp_node_t *cp = cgm->cg_rootnode;
+ struct vnode *vp;
+
+ ASSERT(cp);
+
+ vp = CGNTOV(cp);
+ VN_HOLD(vp);
+ *vpp = vp;
+ return (0);
+}
+
+static int
+cgrp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ ulong_t blocks;
+ dev32_t d32;
+ zoneid_t eff_zid;
+ struct zone *zp;
+
+ zp = cgm->cg_vfsp->vfs_zone;
+
+ if (zp == NULL)
+ eff_zid = GLOBAL_ZONEUNIQID;
+ else
+ eff_zid = zp->zone_id;
+
+ sbp->f_bsize = PAGESIZE;
+ sbp->f_frsize = PAGESIZE;
+
+ /*
+ * Find the amount of available swap
+ */
+ mutex_enter(&anoninfo_lock);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+ blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
+ mutex_exit(&anoninfo_lock);
+
+ if (blocks > cgrp_minfree)
+ sbp->f_bfree = blocks - cgrp_minfree;
+ else
+ sbp->f_bfree = 0;
+
+ sbp->f_bavail = sbp->f_bfree;
+
+ /*
+ * Total number of blocks is just what's available
+ */
+ sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
+
+ if (eff_zid != GLOBAL_ZONEUNIQID &&
+ zp->zone_max_swap_ctl != UINT64_MAX) {
+ /*
+ * If the fs is used by a zone with a swap cap,
+ * then report the capped size.
+ */
+ rctl_qty_t cap, used;
+ pgcnt_t pgcap, pgused;
+
+ mutex_enter(&zp->zone_mem_lock);
+ cap = zp->zone_max_swap_ctl;
+ used = zp->zone_max_swap;
+ mutex_exit(&zp->zone_mem_lock);
+
+ pgcap = btop(cap);
+ pgused = btop(used);
+
+ sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
+ sbp->f_bavail = sbp->f_bfree;
+ sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
+ }
+
+ /*
+ * The maximum number of files available is approximately the number
+ * of cgrp_nodes we can allocate from the remaining kernel memory
+ * available to cgroups. This is fairly inaccurate since it doesn't
+ * take into account the names stored in the directory entries.
+ */
+ sbp->f_ffree = sbp->f_files = ptob(availrmem) /
+ (sizeof (cgrp_node_t) + sizeof (cgrp_dirent_t));
+ sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sbp->f_fsid = d32;
+ (void) strcpy(sbp->f_basetype, vfssw[cgrp_fstype].vsw_name);
+ (void) strncpy(sbp->f_fstr, cgm->cg_mntpath, sizeof (sbp->f_fstr));
+ /* ensure null termination */
+ sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
+ sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sbp->f_namemax = MAXNAMELEN - 1;
+ return (0);
+}
+
+static int
+cgrp_get_dirname(cgrp_node_t *cn, char *buf, int blen)
+{
+ cgrp_node_t *parent;
+ cgrp_dirent_t *dp;
+
+ buf[0] = '\0';
+
+ parent = cn->cgn_parent;
+ if (parent == NULL || parent == cn) {
+ (void) strlcpy(buf, ".", blen);
+ return (0);
+ }
+
+ /*
+ * Search the parent dir list to find this cn's name.
+ */
+ for (dp = parent->cgn_dir; dp != NULL; dp = dp->cgd_next) {
+ if (dp->cgd_cgrp_node->cgn_id == cn->cgn_id) {
+ (void) strlcpy(buf, dp->cgd_name, blen);
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+typedef struct cgrp_rra_arg {
+ char *crraa_agent_path;
+ char *crraa_event_path;
+} cgrp_rra_arg_t;
+
+static void
+cgrp_run_rel_agent(void *a)
+{
+ cgrp_rra_arg_t *rarg = a;
+ proc_t *p = ttoproc(curthread);
+ zone_t *z = p->p_zone;
+ struct core_globals *cg;
+ int res;
+
+ ASSERT(!INGLOBALZONE(curproc));
+
+ /* The following block is derived from start_init_common */
+ ASSERT_STACK_ALIGNED();
+
+ p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
+ p->p_usrstack = (caddr_t)USRSTACK32;
+ p->p_model = DATAMODEL_ILP32;
+ p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
+ p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
+ p->p_stk_ctl = INT32_MAX;
+
+ p->p_as = as_alloc();
+ p->p_as->a_proc = p;
+ p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
+ (void) hat_setup(p->p_as->a_hat, HAT_INIT);
+
+ VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL);
+
+ corectl_path_hold(cg->core_default_path);
+ corectl_content_hold(cg->core_default_content);
+
+ curproc->p_corefile = cg->core_default_path;
+ curproc->p_content = cg->core_default_content;
+
+ init_mstate(curthread, LMS_SYSTEM);
+ res = exec_init(rarg->crraa_agent_path, rarg->crraa_event_path);
+
+ /* End of code derived from start_init_common */
+
+ kmem_free(rarg->crraa_event_path, MAXPATHLEN);
+ kmem_free(rarg->crraa_agent_path, CGRP_AGENT_LEN);
+ kmem_free(rarg, sizeof (cgrp_rra_arg_t));
+
+ /* The following is derived from zone_start_init - see comments there */
+ if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) {
+ if (proc_exit(CLD_EXITED, res) != 0) {
+ mutex_enter(&p->p_lock);
+ ASSERT(p->p_flag & SEXITLWPS);
+ lwp_exit();
+ }
+ } else {
+ id_t cid = curthread->t_cid;
+
+ mutex_enter(&class_lock);
+ ASSERT(cid < loaded_classes);
+ if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+ z->zone_fixed_hipri) {
+ pcparms_t pcparms;
+
+ pcparms.pc_cid = cid;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+ FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+ FX_DOUPRILIM | FX_DOUPRI;
+
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+ (void) parmsset(&pcparms, curthread);
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+ curthread->t_pri = RTGPPRIO0;
+ }
+ mutex_exit(&class_lock);
+
+ /* cause the process to return to userland. */
+ lwp_rtt();
+ }
+}
+
+/*
+ * Launch the user-level release_agent manager. The event data is the
+ * pathname (relative to the mount point of the file system) of the newly empty
+ * cgroup.
+ *
+ * The cg_contents mutex is held on entry and dropped before returning.
+ */
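+/*
+ * For example (paths hypothetical): with release_agent set to
+ * /usr/bin/cleanup on a cgroup fs mounted at /cgroup, emptying the child
+ * group grp_a/grp_b causes the agent to be spawned roughly as if by:
+ *
+ *	/usr/bin/cleanup /grp_a/grp_b
+ *
+ * where the argument is the cgroup path relative to the mount point, as
+ * constructed by the loop in this function.
+ */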
+void
+cgrp_rel_agent_event(cgrp_mnt_t *cgm, cgrp_node_t *cn, boolean_t is_exit)
+{
+ cgrp_node_t *parent;
+ char nm[MAXNAMELEN];
+ char *argstr, *oldstr, *tmp;
+ id_t cid;
+ proc_t *p = ttoproc(curthread);
+ zone_t *z = p->p_zone;
+ lx_lwp_data_t *plwpd = ttolxlwp(curthread);
+ cgrp_rra_arg_t *rarg;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ /* Nothing to do if the agent is not set */
+ if (cgm->cg_agent[0] == '\0') {
+ mutex_exit(&cgm->cg_contents);
+ return;
+ }
+
+ parent = cn->cgn_parent;
+ /* Cannot remove the top-level cgroup (only via unmount) */
+ if (parent == cn) {
+ mutex_exit(&cgm->cg_contents);
+ return;
+ }
+
+ argstr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ oldstr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ *argstr = '\0';
+
+ /*
+ * Iterate up the directory tree to construct the agent argument string.
+ */
+ do {
+ VERIFY0(cgrp_get_dirname(cn, nm, sizeof (nm)));
+ DTRACE_PROBE1(cgrp__dir__name, char *, nm);
+ if (*argstr == '\0') {
+ (void) snprintf(argstr, MAXPATHLEN, "/%s", nm);
+ } else {
+ tmp = oldstr;
+ oldstr = argstr;
+ argstr = tmp;
+ (void) snprintf(argstr, MAXPATHLEN, "/%s%s", nm,
+ oldstr);
+ }
+
+ if (cn->cgn_parent == NULL)
+ break;
+ cn = cn->cgn_parent;
+ parent = cn->cgn_parent;
+
+ /*
+ * The arg path is relative to the mountpoint so we stop when
+ * we get to the top level.
+ */
+ if (parent == NULL || parent == cn)
+ break;
+ } while (parent != cn);
+
+ kmem_free(oldstr, MAXPATHLEN);
+
+ rarg = kmem_alloc(sizeof (cgrp_rra_arg_t), KM_SLEEP);
+ rarg->crraa_agent_path = kmem_alloc(sizeof (cgm->cg_agent), KM_SLEEP);
+ (void) strlcpy(rarg->crraa_agent_path, cgm->cg_agent,
+ sizeof (cgm->cg_agent));
+ rarg->crraa_event_path = argstr;
+
+ DTRACE_PROBE2(cgrp__agent__event, cgrp_rra_arg_t *, rarg,
+ int, plwpd->br_cgroupid);
+
+ /*
+ * When we're exiting, the release agent process cannot belong to our
+ * cgroup. When the release agent is called for a move or rmdir, then
+ * we do not change our cgroupid.
+ */
+ if (is_exit) {
+ plwpd->br_cgroupid = 0;
+ }
+
+ /*
+ * The cg_contents mutex cannot be held while taking the pool lock
+ * or calling newproc.
+ */
+ mutex_exit(&cgm->cg_contents);
+
+ if (z->zone_defaultcid > 0) {
+ cid = z->zone_defaultcid;
+ } else {
+ pool_lock();
+ cid = pool_get_class(z->zone_pool);
+ pool_unlock();
+ }
+ if (cid == -1)
+ cid = defaultcid;
+
+ if (newproc(cgrp_run_rel_agent, (void *)rarg, cid, minclsyspri - 1,
+ NULL, -1) != 0) {
+ /* There's nothing we can do if creating the proc fails. */
+ kmem_free(rarg->crraa_event_path, MAXPATHLEN);
+ kmem_free(rarg->crraa_agent_path, sizeof (cgm->cg_agent));
+ kmem_free(rarg, sizeof (cgrp_rra_arg_t));
+ }
+}
+
+/*ARGSUSED*/
+static void
+cgrp_lwp_fork_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ cgrp_node_t *cn;
+
+ mutex_enter(&cgm->cg_contents);
+ cn = cgrp_cg_hash_lookup(cgm, cg_id);
+ ASSERT(cn != NULL);
+ cn->cgn_task_cnt++;
+ mutex_exit(&cgm->cg_contents);
+
+ DTRACE_PROBE1(cgrp__lwp__fork, void *, cn);
+}
+
+/*ARGSUSED*/
+static void
+cgrp_lwp_exit_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ cgrp_node_t *cn;
+
+ mutex_enter(&cgm->cg_contents);
+ cn = cgrp_cg_hash_lookup(cgm, cg_id);
+ ASSERT(cn != NULL);
+ if (cn->cgn_task_cnt == 0) {
+ /* top-level cgroup cnt can be 0 during reboot */
+ mutex_exit(&cgm->cg_contents);
+ return;
+ }
+ cn->cgn_task_cnt--;
+ DTRACE_PROBE1(cgrp__lwp__exit, void *, cn);
+
+ if (cn->cgn_task_cnt == 0 && cn->cgn_dirents == N_DIRENTS(cgm) &&
+ cn->cgn_notify == 1) {
+ cgrp_rel_agent_event(cgm, cn, B_TRUE);
+ ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents));
+ } else {
+ mutex_exit(&cgm->cg_contents);
+ }
+}
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
new file mode 100644
index 0000000000..0078ad7876
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
@@ -0,0 +1,1552 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/flock.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/cred.h>
+#include <sys/dirent.h>
+#include <sys/pathname.h>
+#include <vm/seg_vn.h>
+#include <sys/cmn_err.h>
+#include <sys/buf.h>
+#include <sys/vm.h>
+#include <sys/prsystm.h>
+#include <sys/policy.h>
+#include <fs/fs_subr.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+#include "cgrps.h"
+
+typedef enum cgrp_wr_type {
+ CG_WR_PROCS = 1,
+ CG_WR_TASKS
+} cgrp_wr_type_t;
+
+/* ARGSUSED1 */
+static int
+cgrp_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
+{
+ /*
+ * swapon to a cgrp file is not supported so access is denied on open
+ * if VISSWAP is set.
+ */
+ if ((*vpp)->v_flag & VISSWAP)
+ return (EINVAL);
+
+ return (0);
+}
+
+/* ARGSUSED1 */
+static int
+cgrp_close(struct vnode *vp, int flag, int count, offset_t offset,
+ struct cred *cred, caller_context_t *ct)
+{
+ cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
+ cleanshares(vp, ttoproc(curthread)->p_pid);
+ return (0);
+}
+
+/*
+ * Look up a proc or task based on pid and typ.
+ */
+static proc_t *
+cgrp_p_for_wr(pid_t pid, cgrp_wr_type_t typ)
+{
+ int i;
+ zoneid_t zoneid = curproc->p_zone->zone_id;
+ pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ /* getting a proc from a pid is easy */
+ if (typ == CG_WR_PROCS)
+ return (prfind(pid));
+
+ ASSERT(typ == CG_WR_TASKS);
+
+ /*
+ * We have to scan all of the process entries to find the proc
+ * containing this task.
+ */
+ mutex_exit(&pidlock);
+ for (i = 1; i < v.v_proc; i++) {
+ proc_t *p;
+ kthread_t *t;
+
+ mutex_enter(&pidlock);
+ /*
+ * Skip indices for which there is no pid_entry, PIDs for
+ * which there is no corresponding process, system processes,
+ * a PID of 0, the pid for our zsched process, anything the
+ * security policy doesn't allow us to look at, processes that
+ * are not lx-branded, and processes that are not in the zone.
+ */
+ if ((p = pid_entry(i)) == NULL ||
+ p->p_stat == SIDL ||
+ (p->p_flag & SSYS) != 0 ||
+ p->p_pid == 0 ||
+ p->p_pid == schedpid ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+ p->p_brand != &lx_brand ||
+ p->p_zone->zone_id != zoneid) {
+ mutex_exit(&pidlock);
+ continue;
+ }
+
+ mutex_enter(&p->p_lock);
+ if ((t = p->p_tlist) == NULL) {
+ /* no threads, skip it */
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+ continue;
+ }
+
+ /*
+ * Check all threads in this proc.
+ */
+ do {
+ lx_lwp_data_t *plwpd = ttolxlwp(t);
+ if (plwpd != NULL && plwpd->br_pid == pid) {
+ mutex_exit(&p->p_lock);
+ return (p);
+ }
+
+ t = t->t_forw;
+ } while (t != p->p_tlist);
+
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+ }
+
+ mutex_enter(&pidlock);
+ return (NULL);
+}
+
+/*
+ * Move a thread from one cgroup to another. If the old cgroup is empty
+ * we queue up an agent event. We return true in that case since we've
+ * dropped the locks and the caller needs to reacquire them.
+ */
+static boolean_t
+cgrp_thr_move(cgrp_mnt_t *cgm, lx_lwp_data_t *plwpd, cgrp_node_t *ncn,
+ uint_t cg_id, proc_t *p)
+{
+ cgrp_node_t *ocn;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ ocn = cgrp_cg_hash_lookup(cgm, plwpd->br_cgroupid);
+ VERIFY(ocn != NULL);
+
+ ASSERT(ocn->cgn_task_cnt > 0);
+ atomic_dec_32(&ocn->cgn_task_cnt);
+ atomic_inc_32(&ncn->cgn_task_cnt);
+ plwpd->br_cgroupid = cg_id;
+
+ if (ocn->cgn_task_cnt == 0 && ocn->cgn_dirents == N_DIRENTS(cgm) &&
+ ocn->cgn_notify == 1) {
+ /*
+ * We want to drop p_lock before queuing the event since
+ * that might sleep. Dropping p_lock might cause the caller to
+ * have to restart the move process from the beginning.
+ */
+ mutex_exit(&p->p_lock);
+ cgrp_rel_agent_event(cgm, ocn, B_FALSE);
+ ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents));
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Assign either all of the threads, or a single thread, for the specified pid
+ * to the new cgroup. Controlled by the typ argument.
+ */
+static int
+cgrp_proc_set_id(cgrp_mnt_t *cgm, uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
+{
+ proc_t *p;
+ kthread_t *t;
+ int error;
+ cgrp_node_t *ncn;
+
+ if (pid == 1)
+ pid = curproc->p_zone->zone_proc_initpid;
+
+ /*
+ * Move one or all threads to this cgroup.
+ */
+ if (typ == CG_WR_TASKS) {
+ error = ESRCH;
+ } else {
+ error = 0;
+ }
+
+restart:
+ mutex_enter(&pidlock);
+
+ p = cgrp_p_for_wr(pid, typ);
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ return (ESRCH);
+ }
+
+ /*
+ * Fail writes for pids for which there is no corresponding process,
+ * system processes, a pid of 0, the pid for our zsched process,
+ * anything the security policy doesn't allow us to look at, and
+ * processes that are not in the zone.
+ */
+ if (p->p_stat == SIDL ||
+ (p->p_flag & SSYS) != 0 ||
+ p->p_pid == 0 ||
+ p->p_pid == curproc->p_zone->zone_zsched->p_pid ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+ p->p_zone->zone_id != curproc->p_zone->zone_id) {
+ mutex_exit(&pidlock);
+ return (ESRCH);
+ }
+
+ /*
+ * Ignore writes for a PID that is not an lx-branded process, has
+ * no threads, or is already exiting.
+ */
+
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+ if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL ||
+ p->p_flag & SEXITING) {
+ mutex_exit(&p->p_lock);
+ return (0);
+ }
+
+ mutex_enter(&cgm->cg_contents);
+
+ ncn = cgrp_cg_hash_lookup(cgm, cg_id);
+ VERIFY(ncn != NULL);
+
+ do {
+ lx_lwp_data_t *plwpd = ttolxlwp(t);
+ if (plwpd != NULL && plwpd->br_cgroupid != cg_id) {
+ if (typ == CG_WR_PROCS) {
+ if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+ /*
+ * We dropped all of the locks so we
+ * need to start over.
+ */
+ goto restart;
+ }
+
+ } else if (plwpd->br_pid == pid) {
+ /* type is CG_WR_TASKS and we found the task */
+ error = 0;
+ if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+ goto done;
+ } else {
+ break;
+ }
+ }
+ }
+ t = t->t_forw;
+ } while (t != p->p_tlist);
+
+ mutex_exit(&cgm->cg_contents);
+ mutex_exit(&p->p_lock);
+done:
+
+ return (error);
+}
+
+/*
+ * User-level is writing a pid string. We need to get that string and convert
+ * it to a pid. The user-level code has to completely write an entire pid
+ * string at once. The user-level code could write multiple strings (delimited
+ * by newline) although that is frowned upon. However, we must handle this
+ * case too. Thus we consume the input one byte at a time until we get a whole
+ * pid string. We can't consume more than a byte at a time since otherwise we
+ * might be left with a partial pid string.
+ */
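+/*
+ * For instance (values illustrative), a Linux process moving itself with
+ * "echo $$ > /cgroup/grp_a/tasks" shows up here as the bytes "1234\n",
+ * consumed one byte at a time up to and including the newline.
+ */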
+static int
+cgrp_get_pid_str(struct uio *uio, pid_t *pid)
+{
+ char buf[16]; /* big enough for a pid string */
+ int i;
+ int error;
+ char *p = &buf[0];
+ char *ep;
+ long pidnum;
+
+ bzero(buf, sizeof (buf));
+ for (i = 0; uio->uio_resid > 0 && i < sizeof (buf); i++, p++) {
+ error = uiomove(p, 1, UIO_WRITE, uio);
+ if (error != 0)
+ return (error);
+ if (buf[i] == '\n') {
+ buf[i] = '\0';
+ break;
+ }
+ }
+
+ if (buf[0] == '\0' || i >= sizeof (buf)) /* no input or too long */
+ return (EINVAL);
+
+ error = ddi_strtol(buf, &ep, 10, &pidnum);
+ if (error != 0 || *ep != '\0' || pidnum > maxpid || pidnum < 0)
+ return (EINVAL);
+
+ *pid = (pid_t)pidnum;
+ return (0);
+}
+
+static int
+cgrp_wr_notify(cgrp_node_t *cn, struct uio *uio)
+{
+ int error;
+ uint_t value;
+
+ /*
+ * This is cheesy but since we only take a 0 or 1 value we can
+ * let the pid_str function do the uio string conversion.
+ */
+ error = cgrp_get_pid_str(uio, (pid_t *)&value);
+ if (error != 0)
+ return (error);
+
+ if (value != 0 && value != 1)
+ return (EINVAL);
+
+ /*
+ * The flag is on the containing dir. We don't bother taking the
+ * cg_contents lock since this is a simple assignment.
+ */
+ cn->cgn_parent->cgn_notify = value;
+ return (0);
+}
+
+static int
+cgrp_wr_rel_agent(cgrp_mnt_t *cgm, struct uio *uio)
+{
+ int error;
+ int len;
+ char *wrp;
+
+ len = uio->uio_offset + uio->uio_resid;
+ if (len > MAXPATHLEN)
+ return (EFBIG);
+
+ mutex_enter(&cgm->cg_contents);
+
+ wrp = &cgm->cg_agent[uio->uio_offset];
+ error = uiomove(wrp, uio->uio_resid, UIO_WRITE, uio);
+ cgm->cg_agent[len] = '\0';
+ if (len > 1 && cgm->cg_agent[len - 1] == '\n')
+ cgm->cg_agent[len - 1] = '\0';
+
+ mutex_exit(&cgm->cg_contents);
+ return (error);
+}
+
+static int
+cgrp_wr_proc_or_task(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio,
+ cgrp_wr_type_t typ)
+{
+ /* the cgroup ID is on the containing dir */
+ uint_t cg_id = cn->cgn_parent->cgn_id;
+ int error;
+ pid_t pidnum;
+
+ while (uio->uio_resid > 0) {
+ error = cgrp_get_pid_str(uio, &pidnum);
+ if (error != 0)
+ return (error);
+
+ error = cgrp_proc_set_id(cgm, cg_id, pidnum, typ);
+ if (error != 0)
+ return (error);
+ }
+
+ return (0);
+}
+
+static int
+cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+ int error = 0;
+ rlim64_t limit = uio->uio_llimit;
+
+ ASSERT(CGNTOV(cn)->v_type == VREG);
+
+ if (uio->uio_loffset < 0)
+ return (EINVAL);
+
+ if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+ limit = MAXOFFSET_T;
+
+ if (uio->uio_loffset >= MAXOFF_T)
+ return (EFBIG);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ if (limit > MAXOFF_T)
+ limit = MAXOFF_T;
+
+ switch (cn->cgn_type) {
+ case CG_NOTIFY:
+ error = cgrp_wr_notify(cn, uio);
+ break;
+ case CG_PROCS:
+ error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_PROCS);
+ break;
+ case CG_REL_AGENT:
+ error = cgrp_wr_rel_agent(cgm, uio);
+ break;
+ case CG_TASKS:
+ error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_TASKS);
+ break;
+ default:
+ VERIFY(0);
+ }
+
+ return (error);
+}
+
+/*
+ * Read value from the notify_on_release pseudo file on the parent node
+ * (which is the actual cgroup node). We don't bother taking the cg_contents
+ * lock since the read is a single load; a racing update means we simply
+ * see either the old or the new value.
+ */
+/* ARGSUSED */
+static int
+cgrp_rd_notify(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+ int len;
+ int error = 0;
+ char buf[16];
+ char *rdp;
+ /* the flag is on the containing dir */
+ uint_t value = cn->cgn_parent->cgn_notify;
+
+ len = snprintf(buf, sizeof (buf), "%u\n", value);
+ if (uio->uio_offset > len)
+ return (0);
+
+ len -= uio->uio_offset;
+ rdp = &buf[uio->uio_offset];
+ len = (uio->uio_resid < len) ? uio->uio_resid : len;
+
+ error = uiomove(rdp, len, UIO_READ, uio);
+ return (error);
+}
+
+/*
+ * Read value from the release_agent pseudo file.
+ */
+static int
+cgrp_rd_rel_agent(cgrp_mnt_t *cgm, struct uio *uio)
+{
+ int len;
+ int error = 0;
+ char *rdp;
+
+ mutex_enter(&cgm->cg_contents);
+
+ if (cgm->cg_agent[0] == '\0') {
+ mutex_exit(&cgm->cg_contents);
+ return (0);
+ }
+
+ len = strlen(cgm->cg_agent);
+ if (uio->uio_offset > len) {
+ mutex_exit(&cgm->cg_contents);
+ return (0);
+ }
+
+ len -= uio->uio_offset;
+ rdp = &cgm->cg_agent[uio->uio_offset];
+ len = (uio->uio_resid < len) ? uio->uio_resid : len;
+
+ error = uiomove(rdp, len, UIO_READ, uio);
+
+ mutex_exit(&cgm->cg_contents);
+
+ return (error);
+}
+
+/*
+ * Read pids from the cgroup.procs pseudo file. We have to look at all of the
+ * processes to find applicable ones, then report pids for any process which
+ * has all of its threads in the same cgroup.
+ */
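+/*
+ * The result format follows Linux: one decimal pid per line. A read of
+ * cgroup.procs might return, e.g. (pids illustrative):
+ *
+ *	1
+ *	1234
+ *	1240
+ */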
+static int
+cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+ int i;
+ ssize_t offset = 0;
+ ssize_t uresid;
+ zoneid_t zoneid = curproc->p_zone->zone_id;
+ int error = 0;
+ pid_t initpid = curproc->p_zone->zone_proc_initpid;
+ pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
+ /* the cgroup ID is on the containing dir */
+ uint_t cg_id = cn->cgn_parent->cgn_id;
+
+ /* Scan all of the process entries */
+ for (i = 1; i < v.v_proc && (uresid = uio->uio_resid) > 0; i++) {
+ proc_t *p;
+ ssize_t len;
+ pid_t pid;
+ char buf[16];
+ char *rdp;
+ kthread_t *t;
+ boolean_t in_cg;
+
+ mutex_enter(&pidlock);
+ /*
+ * Skip indices for which there is no pid_entry, PIDs for
+ * which there is no corresponding process, system processes,
+ * a PID of 0, the pid for our zsched process, anything the
+ * security policy doesn't allow us to look at, processes that
+ * are not lx-branded, and processes that are not in the zone.
+ */
+ if ((p = pid_entry(i)) == NULL ||
+ p->p_stat == SIDL ||
+ (p->p_flag & SSYS) != 0 ||
+ p->p_pid == 0 ||
+ p->p_pid == schedpid ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+ p->p_brand != &lx_brand ||
+ p->p_zone->zone_id != zoneid) {
+ mutex_exit(&pidlock);
+ continue;
+ }
+
+ mutex_enter(&p->p_lock);
+ if ((t = p->p_tlist) == NULL) {
+ /* no threads, skip it */
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+ continue;
+ }
+
+ /*
+ * Check if all threads are in this cgroup.
+ */
+ in_cg = B_TRUE;
+ mutex_enter(&cgm->cg_contents);
+ do {
+ lx_lwp_data_t *plwpd = ttolxlwp(t);
+ if (plwpd == NULL || plwpd->br_cgroupid != cg_id) {
+ in_cg = B_FALSE;
+ break;
+ }
+
+ t = t->t_forw;
+ } while (t != p->p_tlist);
+ mutex_exit(&cgm->cg_contents);
+
+ mutex_exit(&p->p_lock);
+ if (!in_cg) {
+ /*
+ * This proc, or at least one of its threads, is not
+ * in this cgroup.
+ */
+ mutex_exit(&pidlock);
+ continue;
+ }
+
+ /*
+ * Convert pid to the Linux default of 1 if we're the zone's
+ * init process, otherwise use the value from the proc struct
+ */
+ if (p->p_pid == initpid) {
+ pid = 1;
+ } else {
+ pid = p->p_pid;
+ }
+
+ mutex_exit(&pidlock);
+
+ /*
+ * Generate pid line and write all or part of it if we're
+ * in the right spot within the pseudo file.
+ */
+ len = snprintf(buf, sizeof (buf), "%u\n", pid);
+ if ((offset + len) > uio->uio_offset) {
+ int diff = (int)(uio->uio_offset - offset);
+
+ ASSERT(diff < len);
+ offset += diff;
+ rdp = &buf[diff];
+ len -= diff;
+ if (len > uresid)
+ len = uresid;
+
+ error = uiomove(rdp, len, UIO_READ, uio);
+ if (error != 0)
+ return (error);
+ }
+ offset += len;
+ }
+
+ return (0);
+}
+
+/*
+ * We are given a locked process that we know is valid; report on any of its
+ * threads that are in the cgroup.
+ */
+static int
+cgrp_rd_proc_tasks(uint_t cg_id, proc_t *p, pid_t initpid, ssize_t *offset,
+ struct uio *uio)
+{
+ int error = 0;
+ uint_t tid;
+ char buf[16];
+ char *rdp;
+ kthread_t *t;
+
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+
+ /*
+ * Report all threads in this cgroup.
+ */
+ t = p->p_tlist;
+ do {
+ lx_lwp_data_t *plwpd = ttolxlwp(t);
+ if (plwpd == NULL) {
+ t = t->t_forw;
+ continue;
+ }
+
+ if (plwpd->br_cgroupid == cg_id) {
+ int len;
+
+ /*
+ * Convert taskid to the Linux default of 1 if
+ * we're the zone's init process.
+ */
+ tid = plwpd->br_pid;
+ if (tid == initpid)
+ tid = 1;
+
+ len = snprintf(buf, sizeof (buf), "%u\n", tid);
+ if ((*offset + len) > uio->uio_offset) {
+ int diff;
+
+ diff = (int)(uio->uio_offset - *offset);
+ ASSERT(diff < len);
+ *offset = *offset + diff;
+ rdp = &buf[diff];
+ len -= diff;
+ if (len > uio->uio_resid)
+ len = uio->uio_resid;
+
+ error = uiomove(rdp, len, UIO_READ, uio);
+ if (error != 0)
+ return (error);
+ }
+ *offset = *offset + len;
+ }
+
+ t = t->t_forw;
+ } while (t != p->p_tlist && uio->uio_resid > 0);
+
+ return (0);
+}
+
+/*
+ * Read PIDs from the tasks pseudo file. In order to do this, the process
+ * table is walked, searching for entries which are in the correct state and
+ * match this zone. The LX emulated PIDs will be reported from branded entries
+ * which fulfill the criteria. Since records are being emulated for every task
+ * in the process, PR_LOCK is acquired to prevent changes during output.
+ *
+ * Note: If the buffer is filled and the accessing process is forced into a
+ * subsequent read, the reported threads may change while locks are dropped
+ * in the meantime.
+ */
+static int
+cgrp_rd_tasks(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+ int i;
+ ssize_t offset = 0;
+ zoneid_t zoneid = curproc->p_zone->zone_id;
+ cred_t *cred = CRED();
+ int error = 0;
+ pid_t initpid = curproc->p_zone->zone_proc_initpid;
+ /* the cgroup ID is on the containing dir */
+ uint_t cg_id = cn->cgn_parent->cgn_id;
+
+ /* Scan all of the process entries */
+ for (i = 1; i < v.v_proc && uio->uio_resid > 0; i++) {
+ proc_t *p;
+
+ mutex_enter(&pidlock);
+ for (;;) {
+ if ((p = pid_entry(i)) == NULL) {
+ /* Quickly move onto the next slot */
+ if (++i < v.v_proc) {
+ continue;
+ } else {
+ mutex_exit(&pidlock);
+ break;
+ }
+ }
+
+ /*
+ * Check if this process would even be of interest to
+ * cgroupfs before attempting to acquire its PR_LOCK.
+ */
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+ if (p->p_brand != &lx_brand ||
+ p->p_zone->zone_id != zoneid) {
+ mutex_exit(&p->p_lock);
+ p = NULL;
+ break;
+ }
+
+ /* Attempt to grab P_PR_LOCK. */
+ error = sprtrylock_proc(p);
+ if (error == 0) {
+ /* Success */
+ break;
+ } else if (error < 0) {
+ /*
+ * This process is not in a state where
+ * P_PR_LOCK can be acquired. It either
+ * belongs to the system or is a zombie.
+ * Regardless, give up and move on.
+ */
+ mutex_exit(&p->p_lock);
+ p = NULL;
+ break;
+ } else {
+ /*
+ * Wait until P_PR_LOCK is no longer contended
+ * and attempt to acquire it again. Since the
+ * process may have changed state, the entry
+ * lookup must be repeated.
+ */
+ sprwaitlock_proc(p);
+ mutex_enter(&pidlock);
+ }
+ }
+
+ if (p == NULL) {
+ continue;
+ } else if (secpolicy_basic_procinfo(cred, p, curproc) != 0) {
+ sprunlock(p);
+ continue;
+ }
+
+ /* Shuffle locks and output the entry. */
+ mutex_exit(&p->p_lock);
+ mutex_enter(&cgm->cg_contents);
+ error = cgrp_rd_proc_tasks(cg_id, p, initpid, &offset, uio);
+ mutex_exit(&cgm->cg_contents);
+ mutex_enter(&p->p_lock);
+
+ sprunlock(p);
+ if (error != 0) {
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static int
+cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+ int error = 0;
+
+ if (uio->uio_loffset >= MAXOFF_T)
+ return (0);
+ if (uio->uio_loffset < 0)
+ return (EINVAL);
+ if (uio->uio_resid == 0)
+ return (0);
+
+ switch (cn->cgn_type) {
+ case CG_NOTIFY:
+ error = cgrp_rd_notify(cgm, cn, uio);
+ break;
+ case CG_PROCS:
+ error = cgrp_rd_procs(cgm, cn, uio);
+ break;
+ case CG_REL_AGENT:
+ error = cgrp_rd_rel_agent(cgm, uio);
+ break;
+ case CG_TASKS:
+ error = cgrp_rd_tasks(cgm, cn, uio);
+ break;
+ default:
+ VERIFY(0);
+ }
+
+ return (error);
+}
+
+/* ARGSUSED2 */
+static int
+cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
+ struct caller_context *ct)
+{
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm = VTOCGM(vp);
+ int error;
+
+ /*
+ * We don't support reading non-regular files
+ */
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+ if (vp->v_type != VREG)
+ return (EINVAL);
+ error = cgrp_rd(cgm, cn, uiop);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
+ struct caller_context *ct)
+{
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm = VTOCGM(vp);
+ int error;
+
+ /*
+ * We don't support writing to non-regular files
+ */
+ if (vp->v_type != VREG)
+ return (EINVAL);
+
+ if (ioflag & FAPPEND) {
+ /* In append mode start at end of file. */
+ uiop->uio_loffset = cn->cgn_size;
+ }
+
+ error = cgrp_wr(cgm, cn, uiop);
+
+ return (error);
+}
+
+/* ARGSUSED2 */
+static int
+cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
+ caller_context_t *ct)
+{
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
+
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+ vap->va_type = vp->v_type;
+ vap->va_mode = cn->cgn_mode & MODEMASK;
+ vap->va_uid = cn->cgn_uid;
+ vap->va_gid = cn->cgn_gid;
+ vap->va_fsid = cn->cgn_fsid;
+ vap->va_nodeid = (ino64_t)cn->cgn_nodeid;
+ vap->va_nlink = cn->cgn_nlink;
+ vap->va_size = (u_offset_t)cn->cgn_size;
+ vap->va_atime = cn->cgn_atime;
+ vap->va_mtime = cn->cgn_mtime;
+ vap->va_ctime = cn->cgn_ctime;
+ vap->va_blksize = PAGESIZE;
+ vap->va_rdev = cn->cgn_rdev;
+ vap->va_seq = cn->cgn_seq;
+
+ vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
+ mutex_exit(&cgm->cg_contents);
+ return (0);
+}
+
+/*ARGSUSED4*/
+static int
+cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
+ caller_context_t *ct)
+{
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
+ int error = 0;
+ struct vattr *get;
+ long mask;
+
+ /*
+ * Cannot set these attributes
+ */
+ if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR) ||
+ (vap->va_mode & (S_ISUID | S_ISGID)) || (vap->va_mask & AT_SIZE))
+ return (EINVAL);
+
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+
+ get = &cn->cgn_attr;
+ /*
+ * Change file access modes. Must be owner or have sufficient
+ * privileges.
+ */
+ error = secpolicy_vnode_setattr(cred, vp, vap, get, flags, cgrp_taccess,
+ cn);
+
+ if (error)
+ goto out;
+
+ mask = vap->va_mask;
+
+ if (mask & AT_MODE) {
+ get->va_mode &= S_IFMT;
+ get->va_mode |= vap->va_mode & ~S_IFMT;
+ }
+
+ if (mask & AT_UID)
+ get->va_uid = vap->va_uid;
+ if (mask & AT_GID)
+ get->va_gid = vap->va_gid;
+ if (mask & AT_ATIME)
+ get->va_atime = vap->va_atime;
+ if (mask & AT_MTIME)
+ get->va_mtime = vap->va_mtime;
+
+ if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
+ gethrestime(&cn->cgn_ctime);
+
+out:
+ mutex_exit(&cgm->cg_contents);
+ return (error);
+}
+
+/* ARGSUSED2 */
+static int
+cgrp_access(struct vnode *vp, int mode, int flags, struct cred *cred,
+ caller_context_t *ct)
+{
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
+ int error;
+
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+ error = cgrp_taccess(cn, mode, cred);
+ mutex_exit(&cgm->cg_contents);
+ return (error);
+}
+
+/* ARGSUSED3 */
+static int
+cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
+ struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
+ caller_context_t *ct, int *direntflags, pathname_t *realpnp)
+{
+ cgrp_node_t *cn = VTOCGN(dvp);
+ cgrp_mnt_t *cgm;
+ cgrp_node_t *ncn = NULL;
+ int error;
+
+ /* disallow extended attrs */
+ if (flags & LOOKUP_XATTR)
+ return (EINVAL);
+
+ /*
+ * Null component name is a synonym for directory being searched.
+ */
+ if (*nm == '\0') {
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ return (0);
+ }
+ ASSERT(cn);
+
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+ error = cgrp_dirlookup(cn, nm, &ncn, cred);
+ mutex_exit(&cgm->cg_contents);
+
+ if (error == 0) {
+ ASSERT(ncn);
+ *vpp = CGNTOV(ncn);
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap,
+ enum vcexcl exclusive, int mode, struct vnode **vpp, struct cred *cred,
+ int flag, caller_context_t *ct, vsecattr_t *vsecp)
+{
+ cgrp_node_t *parent = VTOCGN(dvp);
+ cgrp_node_t *cn = NULL;
+ cgrp_mnt_t *cgm;
+ int error;
+
+ if (*nm == '\0')
+ return (EPERM);
+
+ cgm = VTOCGM(parent->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+ error = cgrp_dirlookup(parent, nm, &cn, cred);
+ if (error == 0) { /* name found */
+ ASSERT(cn);
+
+ mutex_exit(&cgm->cg_contents);
+ /*
+ * Creating an existing file, allow it except for the following
+ * errors.
+ */
+ if (exclusive == EXCL) {
+ error = EEXIST;
+ } else if ((CGNTOV(cn)->v_type == VDIR) && (mode & VWRITE)) {
+ error = EISDIR;
+ } else {
+ error = cgrp_taccess(cn, mode, cred);
+ }
+ if (error != 0) {
+ cgnode_rele(cn);
+ return (error);
+ }
+ *vpp = CGNTOV(cn);
+ return (0);
+ }
+ mutex_exit(&cgm->cg_contents);
+
+ /*
+ * cgroups doesn't allow creation of additional, non-subsystem specific
+ * files in a dir
+ */
+ return (EPERM);
+}
+
+/* ARGSUSED3 */
+static int
+cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred,
+ caller_context_t *ct, int flags)
+{
+ cgrp_node_t *parent = VTOCGN(dvp);
+ int error;
+ cgrp_node_t *cn = NULL;
+ cgrp_mnt_t *cgm;
+
+ /*
+ * Removal of subsystem-specific files is not allowed but we need
+ * to return the correct error if they try to remove a non-existent
+ * file.
+ */
+
+ cgm = VTOCGM(parent->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+ error = cgrp_dirlookup(parent, nm, &cn, cred);
+ mutex_exit(&cgm->cg_contents);
+ if (error)
+ return (error);
+
+ ASSERT(cn);
+ cgnode_rele(cn);
+ return (EPERM);
+}
+
+/* ARGSUSED */
+static int
+cgrp_link(struct vnode *dvp, struct vnode *srcvp, char *cnm, struct cred *cred,
+ caller_context_t *ct, int flags)
+{
+ /* cgroups doesn't support hard links */
+ return (EPERM);
+}
+
+/*
+ * Rename of subsystem-specific files is not allowed but we can rename
+ * directories (i.e. sub-groups). We cannot mv subdirs from one group to
+ * another so the src and dest vnode must be the same.
+ */
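+/*
+ * For example (paths illustrative): with the fs mounted at /cgroup,
+ * "mv /cgroup/grp_a /cgroup/grp_b" is allowed, while
+ * "mv /cgroup/grp_a /cgroup/grp_c/grp_a" fails with EIO since the source
+ * and destination parents differ.
+ */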
+/* ARGSUSED5 */
+static int
+cgrp_rename(
+ struct vnode *odvp, /* source parent vnode */
+ char *onm, /* source name */
+ struct vnode *ndvp, /* destination parent vnode */
+ char *nnm, /* destination name */
+ struct cred *cred,
+ caller_context_t *ct,
+ int flags)
+{
+ cgrp_node_t *fromparent;
+ cgrp_node_t *toparent;
+ cgrp_node_t *fromcn = NULL; /* source cgrp_node */
+ cgrp_mnt_t *cgm = VTOCGM(odvp);
+ int error, err;
+
+ fromparent = VTOCGN(odvp);
+ toparent = VTOCGN(ndvp);
+
+ if (fromparent != toparent)
+ return (EIO);
+
+ /* discourage additional use of toparent */
+ toparent = NULL;
+
+ mutex_enter(&cgm->cg_contents);
+
+ /*
+ * Look up cgrp_node of file we're supposed to rename.
+ */
+ error = cgrp_dirlookup(fromparent, onm, &fromcn, cred);
+ if (error) {
+ mutex_exit(&cgm->cg_contents);
+ return (error);
+ }
+
+ if (fromcn->cgn_type != CG_CGROUP_DIR) {
+ error = EPERM;
+ goto done;
+ }
+
+ /*
+ * Make sure we can delete the old (source) entry. This
+ * requires write permission on the containing directory.
+ */
+ if (((error = cgrp_taccess(fromparent, VWRITE, cred)) != 0))
+ goto done;
+
+ /*
+ * Check for renaming to or from '.' or '..' or that
+ * fromcn == fromparent
+ */
+ if ((onm[0] == '.' &&
+ (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
+ (nnm[0] == '.' &&
+ (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) ||
+ (fromparent == fromcn)) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * Link source to new target
+ */
+ error = cgrp_direnter(cgm, fromparent, nnm, DE_RENAME,
+ fromcn, (struct vattr *)NULL,
+ (cgrp_node_t **)NULL, cred);
+
+ if (error)
+ goto done;
+
+ /*
+ * Unlink from source.
+ */
+ error = err = cgrp_dirdelete(fromparent, fromcn, onm, DR_RENAME, cred);
+
+ /*
+ * The following handles the case where our source cgrp_node was
+ * removed before we got to it.
+ */
+ if (error == ENOENT)
+ error = 0;
+
+ if (err == 0) {
+ vnevent_rename_src(CGNTOV(fromcn), odvp, onm, ct);
+ vnevent_rename_dest_dir(ndvp, CGNTOV(fromcn), nnm, ct);
+ }
+
+done:
+ mutex_exit(&cgm->cg_contents);
+ cgnode_rele(fromcn);
+
+ return (error);
+}
+
+/* ARGSUSED5 */
+static int
+cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
+ struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp)
+{
+ cgrp_node_t *parent = VTOCGN(dvp);
+ cgrp_node_t *self = NULL;
+ cgrp_mnt_t *cgm = VTOCGM(dvp);
+ int error;
+
+ /*
+ * Might be dangling directory. Catch it here, because a ENOENT
+ * return from cgrp_dirlookup() is an "ok return".
+ */
+ if (parent->cgn_nlink == 0)
+ return (ENOENT);
+
+ mutex_enter(&cgm->cg_contents);
+ error = cgrp_dirlookup(parent, nm, &self, cred);
+ if (error == 0) {
+ ASSERT(self != NULL);
+ mutex_exit(&cgm->cg_contents);
+ cgnode_rele(self);
+ return (EEXIST);
+ }
+ if (error != ENOENT) {
+ mutex_exit(&cgm->cg_contents);
+ return (error);
+ }
+
+ error = cgrp_direnter(cgm, parent, nm, DE_MKDIR, (cgrp_node_t *)NULL,
+ va, &self, cred);
+ if (error) {
+ mutex_exit(&cgm->cg_contents);
+ if (self != NULL)
+ cgnode_rele(self);
+ return (error);
+ }
+ mutex_exit(&cgm->cg_contents);
+ *vpp = CGNTOV(self);
+ return (0);
+}
+
+/* ARGSUSED4 */
+static int
+cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
+ caller_context_t *ct, int flags)
+{
+ cgrp_node_t *parent = VTOCGN(dvp);
+ cgrp_mnt_t *cgm;
+ cgrp_node_t *self = NULL;
+ struct vnode *vp;
+ int error = 0;
+
+ /*
+ * Return error when removing . and ..
+ */
+ if (strcmp(nm, ".") == 0)
+ return (EINVAL);
+ if (strcmp(nm, "..") == 0)
+ return (EEXIST); /* Should be ENOTEMPTY */
+
+ cgm = VTOCGM(parent->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+
+ error = cgrp_dirlookup(parent, nm, &self, cred);
+ if (error) {
+ mutex_exit(&cgm->cg_contents);
+ return (error);
+ }
+
+ vp = CGNTOV(self);
+ if (vp == dvp || vp == cdir) {
+ error = EINVAL;
+ goto done;
+ }
+ if (self->cgn_type != CG_CGROUP_DIR) {
+ error = ENOTDIR;
+ goto done;
+ }
+
+ cgm = (cgrp_mnt_t *)VFSTOCGM(self->cgn_vnode->v_vfsp);
+
+ /*
+ * Check for the existence of any sub-cgroup directories or tasks in
+ * the cgroup.
+ */
+ if (self->cgn_task_cnt > 0 || self->cgn_dirents > N_DIRENTS(cgm)) {
+ error = EEXIST;
+ /*
+ * Update atime because checking cgn_dirents is logically
+ * equivalent to reading the directory
+ */
+ gethrestime(&self->cgn_atime);
+ goto done;
+ }
+
+ if (vn_vfswlock(vp)) {
+ error = EBUSY;
+ goto done;
+ }
+ if (vn_mountedvfs(vp) != NULL) {
+ error = EBUSY;
+ } else {
+ error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred);
+ }
+
+ vn_vfsunlock(vp);
+
+ if (parent->cgn_task_cnt == 0 &&
+ parent->cgn_dirents == N_DIRENTS(cgm) && parent->cgn_notify == 1) {
+ cgrp_rel_agent_event(cgm, parent, B_FALSE);
+ ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents));
+ goto dropped;
+ }
+
+done:
+ mutex_exit(&cgm->cg_contents);
+dropped:
+ vnevent_rmdir(CGNTOV(self), dvp, nm, ct);
+ cgnode_rele(self);
+
+ return (error);
+}
+
+/* ARGSUSED2 */
+static int
+cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
+ cgrp_dirent_t *cdp;
+ int error = 0;
+ size_t namelen;
+ struct dirent64 *dp;
+ ulong_t offset;
+ ulong_t total_bytes_wanted;
+ long outcount = 0;
+ long bufsize;
+ int reclen;
+ caddr_t outbuf;
+
+ if (uiop->uio_loffset >= MAXOFF_T) {
+ if (eofp)
+ *eofp = 1;
+ return (0);
+ }
+
+ if (uiop->uio_iovcnt != 1)
+ return (EINVAL);
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+
+ if (cn->cgn_dir == NULL) {
+ VERIFY(cn->cgn_nlink == 0);
+ mutex_exit(&cgm->cg_contents);
+ return (0);
+ }
+
+ /*
+ * Get space for multiple directory entries
+ */
+ total_bytes_wanted = uiop->uio_iov->iov_len;
+ bufsize = total_bytes_wanted + sizeof (struct dirent64);
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+
+ /* LINTED: alignment */
+ dp = (struct dirent64 *)outbuf;
+
+ offset = 0;
+ cdp = cn->cgn_dir;
+ while (cdp) {
+ namelen = strlen(cdp->cgd_name); /* no +1 needed */
+ offset = cdp->cgd_offset;
+ if (offset >= uiop->uio_offset) {
+ reclen = (int)DIRENT64_RECLEN(namelen);
+ if (outcount + reclen > total_bytes_wanted) {
+ if (!outcount) {
+ /* Buffer too small for any entries. */
+ error = EINVAL;
+ }
+ break;
+ }
+ ASSERT(cdp->cgd_cgrp_node != NULL);
+
+ /* use strncpy(9f) to zero out uninitialized bytes */
+
+ (void) strncpy(dp->d_name, cdp->cgd_name,
+ DIRENT64_NAMELEN(reclen));
+ dp->d_reclen = (ushort_t)reclen;
+ dp->d_ino = (ino64_t)cdp->cgd_cgrp_node->cgn_nodeid;
+ dp->d_off = (offset_t)cdp->cgd_offset + 1;
+ dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen);
+ outcount += reclen;
+ ASSERT(outcount <= bufsize);
+ }
+ cdp = cdp->cgd_next;
+ }
+
+ if (!error)
+ error = uiomove(outbuf, outcount, UIO_READ, uiop);
+
+ if (!error) {
+ /*
+ * If we reached the end of the list our offset should now be
+ * just past the end.
+ */
+ if (!cdp) {
+ offset += 1;
+ if (eofp)
+ *eofp = 1;
+ } else if (eofp)
+ *eofp = 0;
+ uiop->uio_offset = offset;
+ }
+ gethrestime(&cn->cgn_atime);
+
+ mutex_exit(&cgm->cg_contents);
+
+ kmem_free(outbuf, bufsize);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+cgrp_symlink(struct vnode *dvp, char *lnm, struct vattr *cva, char *cnm,
+ struct cred *cred, caller_context_t *ct, int flags)
+{
+ /* cgroups doesn't support symlinks */
+ return (EPERM);
+}
+
+/* ARGSUSED */
+static void
+cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
+{
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm = VFSTOCGM(vp->v_vfsp);
+
+ mutex_enter(&cgm->cg_contents);
+ mutex_enter(&vp->v_lock);
+ ASSERT(vp->v_count >= 1);
+
+ /*
+ * If we don't have the last hold or the link count is non-zero,
+ * there's little to do -- just drop our hold.
+ */
+ if (vp->v_count > 1 || cn->cgn_nlink != 0) {
+ vp->v_count--;
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&cgm->cg_contents);
+ return;
+ }
+
+ if (cn->cgn_forw == NULL)
+ cgm->cg_rootnode->cgn_back = cn->cgn_back;
+ else
+ cn->cgn_forw->cgn_back = cn->cgn_back;
+ cn->cgn_back->cgn_forw = cn->cgn_forw;
+
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&cgm->cg_contents);
+
+ /* Here's our chance to send invalid event */
+ vn_invalid(CGNTOV(cn));
+
+ vn_free(CGNTOV(cn));
+ kmem_free(cn, sizeof (cgrp_node_t));
+}
+
+/* ARGSUSED */
+static int
+cgrp_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
+ caller_context_t *ct)
+{
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+
+/* ARGSUSED */
+static int
+cgrp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+ return (write_lock);
+}
+
+/* ARGSUSED */
+static void
+cgrp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+}
+
+static int
+cgrp_pathconf(struct vnode *vp, int cmd, ulong_t *valp, cred_t *cr,
+ caller_context_t *ct)
+{
+ int error;
+
+ switch (cmd) {
+ case _PC_XATTR_EXISTS:
+ if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
+ *valp = 0; /* assume no attributes */
+ error = 0; /* okay to ask */
+ } else {
+ error = EINVAL;
+ }
+ break;
+ case _PC_SATTR_ENABLED:
+ case _PC_SATTR_EXISTS:
+ *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+ (vp->v_type == VREG || vp->v_type == VDIR);
+ error = 0;
+ break;
+ case _PC_TIMESTAMP_RESOLUTION:
+ /* nanosecond timestamp resolution */
+ *valp = 1L;
+ error = 0;
+ break;
+ default:
+ error = fs_pathconf(vp, cmd, valp, cr, ct);
+ }
+ return (error);
+}
+
+
+struct vnodeops *cgrp_vnodeops;
+
+const fs_operation_def_t cgrp_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = cgrp_open },
+ VOPNAME_CLOSE, { .vop_close = cgrp_close },
+ VOPNAME_READ, { .vop_read = cgrp_read },
+ VOPNAME_WRITE, { .vop_write = cgrp_write },
+ VOPNAME_GETATTR, { .vop_getattr = cgrp_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = cgrp_setattr },
+ VOPNAME_ACCESS, { .vop_access = cgrp_access },
+ VOPNAME_LOOKUP, { .vop_lookup = cgrp_lookup },
+ VOPNAME_CREATE, { .vop_create = cgrp_create },
+ VOPNAME_REMOVE, { .vop_remove = cgrp_remove },
+ VOPNAME_LINK, { .vop_link = cgrp_link },
+ VOPNAME_RENAME, { .vop_rename = cgrp_rename },
+ VOPNAME_MKDIR, { .vop_mkdir = cgrp_mkdir },
+ VOPNAME_RMDIR, { .vop_rmdir = cgrp_rmdir },
+ VOPNAME_READDIR, { .vop_readdir = cgrp_readdir },
+ VOPNAME_SYMLINK, { .vop_symlink = cgrp_symlink },
+ VOPNAME_INACTIVE, { .vop_inactive = cgrp_inactive },
+ VOPNAME_RWLOCK, { .vop_rwlock = cgrp_rwlock },
+ VOPNAME_RWUNLOCK, { .vop_rwunlock = cgrp_rwunlock },
+ VOPNAME_SEEK, { .vop_seek = cgrp_seek },
+ VOPNAME_PATHCONF, { .vop_pathconf = cgrp_pathconf },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd.h b/usr/src/uts/common/brand/lx/devfs/lxd.h
new file mode 100644
index 0000000000..437b0b6162
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd.h
@@ -0,0 +1,244 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _LXD_H
+#define _LXD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * lxd.h: declarations, data structures and macros for lxd (lx devfs).
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/atomic.h>
+#include <vm/anon.h>
+#include <sys/lx_types.h>
+
+#if defined(_KERNEL)
+
+#include <sys/lx_brand.h>
+
+/*
+ * It's unlikely that we need to create more than 50-60 subdirs/symlinks
+ * as front files, so we size the file system hash table at 2x that number.
+ * The back devfs typically has ~80 nodes so this is also a comfortable size
+ * for the back hash table.
+ */
+#define LXD_HASH_SZ 128
+
+#define LXD_BACK_HASH(v) ((((intptr_t)(v)) >> 10) & ((LXD_HASH_SZ) - 1))
+
+#define LXD_NM_HASH(ldn, name, hash) \
+ { \
+ char Xc, *Xcp; \
+ hash = (uint_t)(uintptr_t)(ldn) >> 8; \
+ for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \
+ hash = (hash << 4) + hash + (uint_t)Xc; \
+ hash &= (LXD_HASH_SZ - 1); \
+ }
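+
+/*
+ * A hypothetical usage sketch (not part of the build): directory-entry
+ * insertion and lookup both derive a bucket this way, e.g.
+ *
+ *	uint_t hash;
+ *	LXD_NM_HASH(parent_node, "tty0", hash);
+ *	mutex_enter(&lxdm->lxdm_hash_mutex[hash]);
+ *	de = lxdm->lxdm_dent_htable[hash];
+ *
+ * where "parent_node" and "tty0" are illustrative names; folding the parent
+ * pointer into the hash keeps same-named entries in different directories
+ * in (usually) different buckets.
+ */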
+
+
+enum lxd_node_type { LXDNT_NONE, LXDNT_BACK, LXDNT_FRONT };
+
+typedef struct lxd_dev_attr {
+ list_node_t lxda_link;
+ char lxda_name[MAXPATHLEN];
+ uid_t lxda_uid;
+ gid_t lxda_gid;
+ mode_t lxda_mode;
+} lxd_dev_attr_t;
+
+/*
+ * lxd per-mount data structure.
+ *
+ * All fields are protected by lxdm_contents.
+ * File renames on a specific file system are protected by lxdm_renamelck.
+ */
+typedef struct lxd_mnt {
+ struct vfs *lxdm_vfsp; /* filesystem's vfs struct */
+ struct lxd_node *lxdm_rootnode; /* root lxd_node */
+ char *lxdm_mntpath; /* name of lxd mount point */
+ dev_t lxdm_dev; /* unique dev # of mounted `device' */
+ kmutex_t lxdm_contents; /* per-mount lock */
+ kmutex_t lxdm_renamelck; /* rename lock for this mount */
+ kmutex_t lxdm_attrlck; /* per-mount attr. file lock */
+ list_t lxdm_devattrs; /* list of device attr. settings */
+ uint_t lxdm_gen; /* node ID source for files */
+
+ /* protects buckets in both "dir ent" and "back" hash tables */
+ kmutex_t lxdm_hash_mutex[LXD_HASH_SZ];
+
+ /* per-mount data for "back" vnodes in the fs */
+ uint_t lxdm_back_refcnt; /* # outstanding "back" vnodes */
+ struct lxd_node *lxdm_back_htable[LXD_HASH_SZ];
+
+ /*
+ * Per-mount directory data for "front" nodes in the fs.
+ * Each front node has a directory entry but directory entries can live
+ * on either front or back nodes.
+ */
+ uint_t lxdm_dent_refcnt; /* # outstanding dir ents */
+ struct lxd_dirent *lxdm_dent_htable[LXD_HASH_SZ];
+} lxd_mnt_t;
+
+/*
+ * lxd_node is the file system dependent node for lxd.
+ *
+ * The node is used to represent both front and back files. For front files
+ * the node can represent either a directory or symlink.
+ */
+typedef struct lxd_node {
+ enum lxd_node_type lxdn_type;
+
+ /* Data for "front" nodes */
+	struct lxd_node	*lxdn_prev;	/* linked list of lxd nodes */
+	struct lxd_node	*lxdn_next;	/* linked list of lxd nodes */
+ struct lxd_node *lxdn_parent; /* dir containing this node */
+ krwlock_t lxdn_rwlock; /* serialize mods/dir updates */
+ kmutex_t lxdn_tlock; /* time, flag, and nlink lock */
+
+	/* these could be in a union (a la tmpfs) but it's not really necessary */
+ uint_t lxdn_dirents; /* number of dirents */
+ struct lxd_dirent *lxdn_dir; /* dirent list */
+ char *lxdn_symlink; /* pointer to symlink */
+ struct vattr lxdn_attr; /* attributes */
+
+ /* Hash table link */
+ struct lxd_node *lxdn_hnxt; /* link in per-mount entry */
+ /* hash table */
+ vnode_t *lxdn_vnode; /* vnode for this lxd_node */
+
+ vnode_t *lxdn_real_vp; /* back file - real vnode */
+} lxd_node_t;
+
+/*
+ * Attributes
+ */
+#define lxdn_mask lxdn_attr.va_mask
+#define lxdn_mode lxdn_attr.va_mode
+#define lxdn_uid lxdn_attr.va_uid
+#define lxdn_gid lxdn_attr.va_gid
+#define lxdn_fsid lxdn_attr.va_fsid
+#define lxdn_nodeid lxdn_attr.va_nodeid
+#define lxdn_nlink lxdn_attr.va_nlink
+#define lxdn_size lxdn_attr.va_size
+#define lxdn_atime lxdn_attr.va_atime
+#define lxdn_mtime lxdn_attr.va_mtime
+#define lxdn_ctime lxdn_attr.va_ctime
+#define lxdn_rdev lxdn_attr.va_rdev
+#define lxdn_blksize lxdn_attr.va_blksize
+#define lxdn_nblocks lxdn_attr.va_nblocks
+#define lxdn_seq lxdn_attr.va_seq
+
+/*
+ * lx devfs conversion macros
+ */
+#define VFSTOLXDM(vfsp) ((lxd_mnt_t *)(vfsp)->vfs_data)
+#define VTOLXDM(vp) ((lxd_mnt_t *)(vp)->v_vfsp->vfs_data)
+#define VTOLDN(vp) ((lxd_node_t *)(vp)->v_data)
+#define LDNTOV(ln) ((ln)->lxdn_vnode)
+#define ldnode_hold(ln) VN_HOLD(LDNTOV(ln))
+#define ldnode_rele(ln) VN_RELE(LDNTOV(ln))
+
+#define REALVP(vp) (VTOLDN(vp)->lxdn_real_vp)
+
+/*
+ * Front directories are made up of a linked list of lxd_dirent structures
+ * hanging off directory lxd_nodes. File names are not fixed length, but are
+ * null terminated.
+ */
+typedef struct lxd_dirent {
+ lxd_node_t *lddir_node; /* lxd node for this file */
+ struct lxd_dirent *lddir_next; /* next directory entry */
+ struct lxd_dirent *lddir_prev; /* prev directory entry */
+ uint_t lddir_offset; /* "offset" of dir entry */
+ uint_t lddir_hash; /* a hash of lddir_name */
+ struct lxd_dirent *lddir_link; /* linked via hash table */
+ lxd_node_t *lddir_parent; /* parent, dir we are in */
+ char *lddir_name; /* null terminated */
+} lxd_dirent_t;
+
+enum de_op { DE_CREATE, DE_MKDIR, DE_RENAME }; /* direnter ops */
+enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */
+
+typedef struct lxd_minor_translator {
+ char *lxd_mt_path; /* illumos minor node path */
+ minor_t lxd_mt_minor; /* illumos minor node number */
+ int lxd_mt_lx_major; /* linux major node number */
+ int lxd_mt_lx_minor; /* linux minor node number */
+} lxd_minor_translator_t;
+
+enum lxd_xl_tp { DTT_INVALID, DTT_LIST, DTT_CUSTOM };
+
+#define xl_list lxd_xl_minor.lxd_xl_list
+#define xl_custom lxd_xl_minor.lxd_xl_custom
+
+typedef struct lxd_devt_translator {
+ char *lxd_xl_driver; /* driver name */
+ major_t lxd_xl_major; /* driver number */
+
+	enum lxd_xl_tp	lxd_xl_type;	/* dictates how we interpret xl_minor */
+ union {
+ uintptr_t lxd_xl_foo; /* required to compile */
+ lxd_minor_translator_t *lxd_xl_list;
+ void (*lxd_xl_custom)(dev_t, dev_t *);
+ } lxd_xl_minor;
+} lxd_devt_translator_t;
+
+extern struct vnodeops *lxd_vnodeops;
+extern lxd_devt_translator_t lxd_devt_translators[];
+
+vnode_t *lxd_make_back_node(vnode_t *, lxd_mnt_t *);
+void lxd_free_back_node(lxd_node_t *);
+int lxd_dirdelete(lxd_node_t *, lxd_node_t *, char *, enum dr_op, cred_t *);
+int lxd_direnter(lxd_mnt_t *, lxd_node_t *, char *, enum de_op, lxd_node_t *,
+ lxd_node_t *, struct vattr *, lxd_node_t **, cred_t *);
+void lxd_dirinit(lxd_node_t *, lxd_node_t *);
+int lxd_dirlookup(lxd_node_t *, char *, lxd_node_t **, cred_t *);
+void lxd_dirtrunc(lxd_node_t *);
+void lxd_node_init(lxd_mnt_t *, lxd_node_t *, vnode_t *, vattr_t *, cred_t *);
+int lxd_naccess(void *, int, cred_t *);
+
+void lxd_save_attrs(lxd_mnt_t *, vnode_t *);
+void lxd_apply_db(lxd_mnt_t *);
+
+#endif	/* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LXD_H */
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c b/usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c
new file mode 100644
index 0000000000..02d396a36d
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c
@@ -0,0 +1,368 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/cred.h>
+#include <sys/pathname.h>
+#include <sys/debug.h>
+#include <sys/sdt.h>
+#include <fs/fs_subr.h>
+
+#include "lxd.h"
+
+#define LX_ATTR_FILE "/etc/.lxd_dev_attr"
+
+#define RD_BUFSIZE MAXPATHLEN
+#define ENTRY_BUFSIZE (MAXPATHLEN + 32)
+
+static int
+lxd_db_open(int fmode, vnode_t **vpp)
+{
+ return (vn_open(LX_ATTR_FILE, UIO_SYSSPACE, fmode,
+ (int)(0644 & MODEMASK), vpp, CRCREAT, PTOU(curproc)->u_cmask));
+}
+
+static int
+lxd_wr_entry(vnode_t *wvn, off_t offset, char *entry)
+{
+ int len, err;
+ struct uio auio;
+ struct iovec aiov;
+
+ len = strlen(entry);
+ aiov.iov_base = entry;
+ aiov.iov_len = len;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = offset;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = len;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = FWRITE;
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ (void) VOP_RWLOCK(wvn, V_WRITELOCK_TRUE, NULL);
+ err = VOP_WRITE(wvn, &auio, FAPPEND, CRED(), NULL);
+ VOP_RWUNLOCK(wvn, V_WRITELOCK_TRUE, NULL);
+
+ if (err != 0)
+ return (0);
+ return (len);
+}
+
+/*
+ * Given an entry, apply a uid, gid and mode change to the given device. There
+ * is no strtok in the kernel but it's easy to tokenize the entry ourselves.
+ *
+ * entries have the form (newline removed by caller):
+ * path uid gid mode\0
+ */
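+/*
+ * For example (hypothetical values), the entry
+ *	/dev/sda 1000 1000 660
+ * parses into dpath "/dev/sda", uid 1000, gid 1000 and (octal) mode 0660,
+ * which is then applied to the device node via VOP_SETATTR below.
+ */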
+static int
+lxd_apply_entry(char *entry, char **dpath, uid_t *uidp, gid_t *gidp,
+ mode_t *modep)
+{
+ char *dp, *up, *gp, *mp, *ep;
+ long uid, gid, mode;
+ int error, res = 0;
+ vnode_t *vp;
+ vattr_t va;
+
+ dp = entry;
+
+ /* find and delimit the first field (device name) */
+ for (up = dp; *up != ' ' && *up != '\0'; up++)
+ ;
+ if (*up != ' ')
+ return (-1);
+ *up++ = '\0';
+
+ /* find and delimit the second field (uid) */
+ for (gp = up; *gp != ' ' && *gp != '\0'; gp++)
+ ;
+ if (*gp != ' ')
+ return (-1);
+ *gp++ = '\0';
+
+ /* find and delimit the third field (gid) */
+ for (mp = gp; *mp != ' ' && *mp != '\0'; mp++)
+ ;
+ if (*mp != ' ')
+ return (-1);
+ *mp++ = '\0';
+
+ /* validate the fourth field (mode) */
+ ep = mp + strlen(mp);
+ if (*ep != '\0')
+ return (-1);
+
+ if (*dp != '/')
+ return (-1);
+
+ error = ddi_strtol(up, &ep, 10, &uid);
+ if (error != 0 || *ep != '\0' || uid > MAXUID || uid < 0)
+ return (-1);
+
+ error = ddi_strtol(gp, &ep, 10, &gid);
+ if (error != 0 || *ep != '\0' || gid > MAXUID || gid < 0)
+ return (-1);
+
+ /* note that the mode is octal */
+ error = ddi_strtol(mp, &ep, 8, &mode);
+ if (error != 0 || *ep != '\0' || mode > 0777 || mode < 0)
+ return (-1);
+
+ if (lookupname(dp, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp) != 0) {
+ /*
+ * It's likely the device is no longer visible to the zone.
+ * No matter the reason, we indicate failure.
+ */
+ return (-1);
+ }
+
+ va.va_mask = AT_UID | AT_GID | AT_MODE;
+ va.va_uid = (uid_t)uid;
+ va.va_gid = (gid_t)gid;
+ va.va_mode = (mode_t)mode;
+
+ if (VOP_SETATTR(vp, &va, 0, CRED(), NULL) != 0)
+ res = -1;
+
+ VN_RELE(vp);
+
+ *dpath = dp;
+ *uidp = (uid_t)uid;
+ *gidp = (gid_t)gid;
+ *modep = (mode_t)mode;
+ return (res);
+}
+
+/*
+ * Return true if this is a pre-existing record.
+ */
+static boolean_t
+lxd_save_devattr(lxd_mnt_t *lxdm, char *dpath, uid_t uid, gid_t gid,
+ mode_t mode)
+{
+ lxd_dev_attr_t *da;
+
+ da = list_head(&lxdm->lxdm_devattrs);
+ while (da != NULL) {
+ if (strcmp(dpath, da->lxda_name) == 0) {
+ da->lxda_uid = uid;
+ da->lxda_gid = gid;
+ da->lxda_mode = mode;
+ return (B_TRUE);
+ }
+ da = list_next(&lxdm->lxdm_devattrs, da);
+ }
+
+ da = kmem_zalloc(sizeof (lxd_dev_attr_t), KM_SLEEP);
+ (void) strlcpy(da->lxda_name, dpath, sizeof (da->lxda_name));
+ da->lxda_uid = uid;
+ da->lxda_gid = gid;
+ da->lxda_mode = mode;
+
+ list_insert_tail(&lxdm->lxdm_devattrs, da);
+ return (B_FALSE);
+}
+
+static void
+lxd_save_db(lxd_mnt_t *lxdm)
+{
+ lxd_dev_attr_t *da;
+ char *entry;
+ vnode_t *wvn;
+ off_t woff = 0;
+
+ if (list_is_empty(&lxdm->lxdm_devattrs)) {
+ /* The attribute file is no longer needed. */
+ (void) vn_remove(LX_ATTR_FILE, UIO_SYSSPACE, RMFILE);
+ return;
+ }
+
+ if (lxd_db_open(FWRITE | FCREAT | FTRUNC, &wvn) != 0)
+ return;
+
+ entry = kmem_alloc(ENTRY_BUFSIZE, KM_SLEEP);
+
+ woff = lxd_wr_entry(wvn, woff, "# DO NOT EDIT: this file is "
+ "automatically maintained for lx container devices\n");
+
+ da = list_head(&lxdm->lxdm_devattrs);
+ while (da != NULL) {
+ (void) snprintf(entry, ENTRY_BUFSIZE, "%s %d %d %o\n",
+ da->lxda_name, da->lxda_uid, da->lxda_gid,
+ da->lxda_mode & 0777);
+ woff += lxd_wr_entry(wvn, woff, entry);
+ da = list_next(&lxdm->lxdm_devattrs, da);
+ }
+
+ (void) VOP_CLOSE(wvn, FWRITE, 1, woff, CRED(), NULL);
+
+ kmem_free(entry, ENTRY_BUFSIZE);
+}
+
+/*
+ * This function records the uid, gid and mode information for an lx devfs
+ * block device node after a chown/chmod setattr operation so that these
+ * changes can persist across reboots. Since the actual setattr has
+ * already succeeded, the tracking of these changes is done on a "best effort"
+ * basis. That is, if we fail to record the change for some reason, the setattr
+ * will still return success. The vp passed in is the "real vp" for the back
+ * device node.
+ */
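+/*
+ * For example (hypothetical device name): after "chmod 660 /dev/sdb"
+ * succeeds in the zone against the real vnode, this records an entry such
+ * as "/dev/sdb 0 0 660" in /etc/.lxd_dev_attr, which lxd_apply_db() will
+ * re-apply on the next mount.
+ */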
+void
+lxd_save_attrs(lxd_mnt_t *lxdm, vnode_t *vp)
+{
+ vattr_t va;
+ char devpath[MAXPATHLEN];
+
+ /* the path returned is relative to the zone's root */
+ if (vnodetopath(curproc->p_zone->zone_rootvp, vp, devpath,
+ sizeof (devpath), CRED()) != 0)
+ return;
+
+ va.va_mask = AT_MODE | AT_UID | AT_GID;
+
+ /*
+ * We just set attrs, so the getattr shouldn't fail. If the device
+ * is not a block device we don't persist the change.
+ */
+ if (VOP_GETATTR(vp, &va, 0, CRED(), NULL) != 0 ||
+ ((va.va_mode & S_IFBLK) != S_IFBLK))
+ return;
+
+ /*
+ * We serialize all updates to the attribute DB file. In practice this
+ * should not be a problem since concurrent device file mode changes
+ * are rare.
+ */
+ mutex_enter(&lxdm->lxdm_attrlck);
+
+ (void) lxd_save_devattr(lxdm, devpath, va.va_uid, va.va_gid,
+ va.va_mode & 0777);
+ lxd_save_db(lxdm);
+
+ mutex_exit(&lxdm->lxdm_attrlck);
+}
+
+/*
+ * Re-apply the persistent attribute settings to the devices when this lx
+ * devfs is mounted. As with lxd_save_attrs, this is done on a best-effort
+ * basis; we won't prevent the mount if there is a problem. No locking is needed
+ * while reading the DB file since this action is performed during the
+ * mount of the devfs.
+ */
+void
+lxd_apply_db(lxd_mnt_t *lxdm)
+{
+ vnode_t *rvn;
+ char *buf, *entry, *bp, *ep;
+ struct uio auio;
+ struct iovec aiov;
+ size_t cnt, len, ecnt, roff;
+ char *devpath;
+ uid_t uid;
+ gid_t gid;
+ mode_t mode;
+ boolean_t needs_update = B_FALSE;
+
+ if (lxd_db_open(FREAD, &rvn) != 0)
+ return;
+
+ buf = kmem_alloc(RD_BUFSIZE, KM_SLEEP);
+ entry = kmem_alloc(ENTRY_BUFSIZE, KM_SLEEP);
+
+ roff = 0;
+ ep = entry;
+ ecnt = 0;
+ (void) VOP_RWLOCK(rvn, V_WRITELOCK_FALSE, NULL);
+loop:
+ aiov.iov_base = buf;
+ aiov.iov_len = RD_BUFSIZE;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = roff;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = RD_BUFSIZE;
+ auio.uio_fmode = 0;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ (void) VOP_READ(rvn, &auio, 0, CRED(), NULL);
+
+ len = RD_BUFSIZE - auio.uio_resid;
+ roff += len;
+
+ if (len > 0) {
+ for (bp = buf, cnt = 0; cnt < len; bp++, cnt++) {
+
+ /*
+ * We have an improperly formed entry in the file (too
+ * long). In an attempt to recover we reset the entry
+ * pointer so we can read the rest of the line and try
+ * to absorb the bad line. The code in lxd_apply_entry
+ * will handle any malformed or inapplicable entries.
+ */
+ if (ecnt >= (ENTRY_BUFSIZE - 1)) {
+ ep = entry;
+ ecnt = 0;
+ needs_update = B_TRUE;
+ }
+
+ if (*bp == '\n') {
+ *ep = '\0';
+
+ /* skip comments */
+ if (entry[0] != '#') {
+ if (lxd_apply_entry(entry, &devpath,
+ &uid, &gid, &mode) != 0 ||
+ lxd_save_devattr(lxdm, devpath,
+ uid, gid, mode)) {
+ /*
+ * An invalid entry, a
+ * non-existent device node or
+ * a duplicate entry.
+ */
+ needs_update = B_TRUE;
+ }
+ }
+ ep = entry;
+ ecnt = 0;
+ } else {
+ *ep++ = *bp;
+ ecnt++;
+ }
+ }
+ goto loop;
+ }
+ VOP_RWUNLOCK(rvn, V_WRITELOCK_FALSE, NULL);
+
+ kmem_free(buf, RD_BUFSIZE);
+ kmem_free(entry, ENTRY_BUFSIZE);
+
+ (void) VOP_CLOSE(rvn, FREAD, 1, 0, CRED(), NULL);
+
+ if (needs_update)
+ lxd_save_db(lxdm);
+}
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_node.c b/usr/src/uts/common/brand/lx/devfs/lxd_node.c
new file mode 100644
index 0000000000..0d056ab167
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd_node.c
@@ -0,0 +1,1003 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+
+#include "lxd.h"
+
+#define LXD_HASH_SIZE 8192 /* must be power of 2 */
+#define LXD_MUTEX_SIZE 64
+
+#define MODESHIFT 3
+
+typedef enum lxd_nodehold {
+ NOHOLD,
+ HOLD
+} lxd_nodehold_t;
+
+/*
+ * The following functions maintain the per-mount "front" files.
+ */
+static void
+lxd_save_dirent(lxd_dirent_t *de)
+{
+ lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent));
+ uint_t hash;
+ kmutex_t *hmtx;
+
+ LXD_NM_HASH(de->lddir_parent, de->lddir_name, hash);
+ de->lddir_hash = hash;
+
+ hmtx = &lxdm->lxdm_hash_mutex[hash];
+
+ mutex_enter(hmtx);
+ ASSERT(de->lddir_link == NULL);
+ de->lddir_link = lxdm->lxdm_dent_htable[hash];
+ lxdm->lxdm_dent_htable[hash] = de;
+ mutex_exit(hmtx);
+
+ atomic_inc_32(&lxdm->lxdm_dent_refcnt);
+}
+
+static void
+lxd_rm_dirent(lxd_dirent_t *de)
+{
+ lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent));
+ uint_t hash;
+ lxd_dirent_t **prevpp;
+ kmutex_t *hmtx;
+
+ hash = de->lddir_hash;
+ hmtx = &lxdm->lxdm_hash_mutex[hash];
+
+ mutex_enter(hmtx);
+ prevpp = &lxdm->lxdm_dent_htable[hash];
+ while (*prevpp != de)
+ prevpp = &(*prevpp)->lddir_link;
+ *prevpp = de->lddir_link;
+ de->lddir_link = NULL;
+ mutex_exit(hmtx);
+
+ ASSERT(lxdm->lxdm_dent_refcnt > 0);
+ atomic_dec_32(&lxdm->lxdm_dent_refcnt);
+}
+
+static lxd_dirent_t *
+lxd_find_dirent(char *name, lxd_node_t *parent, lxd_nodehold_t do_hold,
+ lxd_node_t **found)
+{
+ lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(parent));
+ lxd_dirent_t *de;
+ uint_t hash;
+ kmutex_t *hmtx;
+
+ LXD_NM_HASH(parent, name, hash);
+ hmtx = &lxdm->lxdm_hash_mutex[hash];
+
+ mutex_enter(hmtx);
+ de = lxdm->lxdm_dent_htable[hash];
+ while (de) {
+ if (de->lddir_hash == hash && de->lddir_parent == parent &&
+ strcmp(de->lddir_name, name) == 0) {
+ lxd_node_t *ldn = de->lddir_node;
+
+ if (do_hold == HOLD) {
+ ASSERT(ldn != NULL);
+ ldnode_hold(ldn);
+ }
+ if (found != NULL)
+ *found = ldn;
+ mutex_exit(hmtx);
+ return (de);
+ }
+
+ de = de->lddir_link;
+ }
+ mutex_exit(hmtx);
+ return (NULL);
+}
+
+int
+lxd_naccess(void *vcp, int mode, cred_t *cr)
+{
+ lxd_node_t *ldn = vcp;
+ int shift = 0;
+ /*
+ * Check access based on owner, group and public perms in lxd_node.
+ */
+ if (crgetuid(cr) != ldn->lxdn_uid) {
+ shift += MODESHIFT;
+ if (groupmember(ldn->lxdn_gid, cr) == 0)
+ shift += MODESHIFT;
+ }
+
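+	/*
+	 * Worked example (hypothetical mode): for lxdn_mode 0754, a cred
+	 * that is neither the owner nor a group member gets shift == 6,
+	 * so the "other" triad (r--) is what lands in the position checked
+	 * by (ldn->lxdn_mode << shift) below.
+	 */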
+ if (ldn->lxdn_type == LXDNT_FRONT)
+ return (secpolicy_vnode_access2(cr, LDNTOV(ldn),
+ ldn->lxdn_uid, ldn->lxdn_mode << shift, mode));
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ return (VOP_ACCESS(ldn->lxdn_real_vp, mode, 0, cr, NULL));
+}
+
+static lxd_node_t *
+lxd_find_back(struct vnode *vp, uint_t hash, lxd_mnt_t *lxdm)
+{
+ lxd_node_t *l;
+
+ ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash]));
+
+ for (l = lxdm->lxdm_back_htable[hash]; l != NULL; l = l->lxdn_hnxt) {
+ if (l->lxdn_real_vp == vp) {
+ ASSERT(l->lxdn_type == LXDNT_BACK);
+
+ VN_HOLD(LDNTOV(l));
+ return (l);
+ }
+ }
+ return (NULL);
+}
+
+static void
+lxd_save_back(lxd_node_t *l, uint_t hash, lxd_mnt_t *lxdm)
+{
+ ASSERT(l->lxdn_type == LXDNT_BACK);
+ ASSERT(l->lxdn_real_vp != NULL);
+ ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash]));
+
+ atomic_inc_32(&lxdm->lxdm_back_refcnt);
+
+ l->lxdn_hnxt = lxdm->lxdm_back_htable[hash];
+ lxdm->lxdm_back_htable[hash] = l;
+}
+
+
+struct vnode *
+lxd_make_back_node(struct vnode *vp, lxd_mnt_t *lxdm)
+{
+ uint_t hash;
+ kmutex_t *hmtx;
+ lxd_node_t *l;
+
+ hash = LXD_BACK_HASH(vp); /* Note: hashing with realvp */
+ hmtx = &lxdm->lxdm_hash_mutex[hash];
+ mutex_enter(hmtx);
+
+ l = lxd_find_back(vp, hash, lxdm);
+ if (l == NULL) {
+ vnode_t *nvp;
+
+ l = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP);
+ nvp = vn_alloc(KM_SLEEP);
+
+ rw_init(&l->lxdn_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&l->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL);
+
+ l->lxdn_vnode = nvp;
+ l->lxdn_type = LXDNT_BACK;
+ l->lxdn_real_vp = vp;
+
+ VN_SET_VFS_TYPE_DEV(nvp, lxdm->lxdm_vfsp, vp->v_type,
+ vp->v_rdev);
+ nvp->v_flag |= (vp->v_flag & (VNOMOUNT|VNOMAP|VDIROPEN));
+ vn_setops(nvp, lxd_vnodeops);
+ nvp->v_data = (caddr_t)l;
+
+ lxd_save_back(l, hash, lxdm);
+ vn_exists(vp);
+ } else {
+ VN_RELE(vp);
+ }
+
+ mutex_exit(hmtx);
+ return (LDNTOV(l));
+}
+
+void
+lxd_free_back_node(lxd_node_t *lp)
+{
+ uint_t hash;
+ kmutex_t *hmtx;
+ lxd_node_t *l;
+ lxd_node_t *lprev = NULL;
+ vnode_t *vp = LDNTOV(lp);
+ vnode_t *realvp = REALVP(vp);
+ lxd_mnt_t *lxdm = VTOLXDM(vp);
+
+ /* in lxd_make_back_node we call lxd_find_back with the realvp */
+ hash = LXD_BACK_HASH(realvp);
+ hmtx = &lxdm->lxdm_hash_mutex[hash];
+ mutex_enter(hmtx);
+
+ mutex_enter(&vp->v_lock);
+ if (vp->v_count > 1) {
+ vp->v_count--; /* release our hold from vn_rele */
+ mutex_exit(&vp->v_lock);
+ mutex_exit(hmtx);
+ return;
+ }
+ mutex_exit(&vp->v_lock);
+
+ for (l = lxdm->lxdm_back_htable[hash]; l != NULL;
+ lprev = l, l = l->lxdn_hnxt) {
+
+ if (l != lp)
+ continue;
+
+ ASSERT(l->lxdn_type == LXDNT_BACK);
+ ASSERT(lxdm->lxdm_back_refcnt > 0);
+
+ atomic_dec_32(&lxdm->lxdm_back_refcnt);
+ vn_invalid(vp);
+
+ if (lprev == NULL) {
+ lxdm->lxdm_back_htable[hash] = l->lxdn_hnxt;
+ } else {
+ lprev->lxdn_hnxt = l->lxdn_hnxt;
+ }
+
+ mutex_exit(hmtx);
+ rw_destroy(&l->lxdn_rwlock);
+ mutex_destroy(&l->lxdn_tlock);
+ kmem_free(l, sizeof (lxd_node_t));
+ vn_free(vp);
+ VN_RELE(realvp);
+ return;
+ }
+
+ panic("lxd_free_back_node");
+ /*NOTREACHED*/
+}
+
+/*
+ * Search directory 'parent' for entry 'name'.
+ *
+ * 0 is returned on success and *foundnp points
+ * to the found lxd_node with its vnode held.
+ */
+int
+lxd_dirlookup(lxd_node_t *parent, char *name, lxd_node_t **foundnp, cred_t *cr)
+{
+ int error;
+
+ *foundnp = NULL;
+ if (parent->lxdn_vnode->v_type != VDIR)
+ return (ENOTDIR);
+
+ if ((error = lxd_naccess(parent, VEXEC, cr)))
+ return (error);
+
+ if (*name == '\0') {
+ ldnode_hold(parent);
+ *foundnp = parent;
+ return (0);
+ }
+
+ /*
+	 * Search the directory for the matching name.
+ * We need the lock protecting the lxdn_dir list
+ * so that it doesn't change out from underneath us.
+ * lxd_find_dirent() will pass back the lxd_node
+ * with a hold on it.
+ */
+
+ if (lxd_find_dirent(name, parent, HOLD, foundnp) != NULL) {
+ ASSERT(*foundnp);
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * Check if the source directory is in the path of the target directory.
+ * The target directory is locked by the caller.
+ */
+static int
+lxd_dircheckpath(lxd_node_t *fromnode, lxd_node_t *toparent)
+{
+ int error = 0;
+ lxd_node_t *dir, *dotdot;
+
+ ASSERT(RW_WRITE_HELD(&toparent->lxdn_rwlock));
+ ASSERT(toparent->lxdn_vnode->v_type == VDIR);
+
+ dotdot = toparent->lxdn_parent;
+ if (dotdot == NULL)
+ return (ENOENT);
+ ldnode_hold(dotdot);
+
+ if (dotdot == toparent) {
+ /* root of fs. search trivially satisfied. */
+ ldnode_rele(dotdot);
+ return (0);
+ }
+
+ for (;;) {
+ /*
+ * Return error for cases like "mv c c/d",
+ * "mv c c/d/e" and so on.
+ */
+ if (dotdot == fromnode) {
+ ldnode_rele(dotdot);
+ error = EINVAL;
+ break;
+ }
+
+ dir = dotdot;
+ dotdot = dir->lxdn_parent;
+ if (dotdot == NULL) {
+ ldnode_rele(dir);
+ error = ENOENT;
+ break;
+ }
+ ldnode_hold(dotdot);
+
+ /*
+ * We're okay if we traverse the directory tree up to
+ * the root directory and don't run into the
+ * parent directory.
+ */
+ if (dir == dotdot) {
+ ldnode_rele(dir);
+ ldnode_rele(dotdot);
+ break;
+ }
+ ldnode_rele(dir);
+ }
+
+ return (error);
+}
+
+static int
+lxd_dir_make_node(lxd_node_t *dir, lxd_mnt_t *lxdm, struct vattr *va,
+ enum de_op op, lxd_node_t **newnode, struct cred *cred)
+{
+ lxd_node_t *ldn;
+
+ ASSERT(va != NULL);
+
+ if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
+ ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
+ return (EOVERFLOW);
+
+ ldn = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP);
+
+ ldn->lxdn_type = LXDNT_FRONT;
+ lxd_node_init(lxdm, ldn, NULL, va, cred);
+
+ ldn->lxdn_vnode->v_rdev = ldn->lxdn_rdev = NODEV;
+ ldn->lxdn_vnode->v_type = va->va_type;
+ ldn->lxdn_uid = crgetuid(cred);
+ ldn->lxdn_gid = crgetgid(cred);
+ ldn->lxdn_nodeid = lxdm->lxdm_gen++;
+
+ if (va->va_mask & AT_ATIME)
+ ldn->lxdn_atime = va->va_atime;
+ if (va->va_mask & AT_MTIME)
+ ldn->lxdn_mtime = va->va_mtime;
+
+ if (op == DE_MKDIR) {
+ lxd_dirinit(dir, ldn);
+ }
+
+ *newnode = ldn;
+ return (0);
+}
+
+static int
+lxd_diraddentry(lxd_node_t *dir, lxd_node_t *ldn, char *name)
+{
+ lxd_dirent_t *dp, *pdp;
+ size_t namelen, alloc_size;
+ timestruc_t now;
+
+ /*
+ * Make sure the parent directory wasn't removed from
+ * underneath the caller.
+ */
+ if (dir->lxdn_dir == NULL)
+ return (ENOENT);
+
+ /* Check that everything is on the same filesystem. */
+ if (ldn->lxdn_vnode->v_vfsp != dir->lxdn_vnode->v_vfsp)
+ return (EXDEV);
+
+ /* Allocate and initialize directory entry */
+ namelen = strlen(name) + 1;
+ alloc_size = namelen + sizeof (lxd_dirent_t);
+ dp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI);
+ if (dp == NULL)
+ return (ENOSPC);
+
+ ldn->lxdn_parent = dir;
+
+ dir->lxdn_size += alloc_size;
+ dir->lxdn_dirents++;
+ dp->lddir_node = ldn;
+ dp->lddir_parent = dir;
+
+ /* The directory entry and its name were allocated sequentially. */
+ dp->lddir_name = (char *)dp + sizeof (lxd_dirent_t);
+ (void) strcpy(dp->lddir_name, name);
+
+ lxd_save_dirent(dp);
+
+ /*
+ * Some utilities expect the size of a directory to remain
+	 * somewhat static. For example, a routine which removes
+	 * subdirectories between calls to readdir() would otherwise see
+	 * the directory size change from underneath it, making the real
+	 * directory offset in bytes invalid. To circumvent
+	 * this problem, we initialize each directory entry with a
+	 * phony offset, and use this offset to determine end of
+ * file in lxd_readdir.
+ */
+ pdp = dir->lxdn_dir->lddir_prev;
+ /*
+ * Install at first empty "slot" in directory list.
+ */
+ while (pdp->lddir_next != NULL &&
+ (pdp->lddir_next->lddir_offset - pdp->lddir_offset) <= 1) {
+ ASSERT(pdp->lddir_next != pdp);
+ ASSERT(pdp->lddir_prev != pdp);
+ ASSERT(pdp->lddir_next->lddir_offset > pdp->lddir_offset);
+ pdp = pdp->lddir_next;
+ }
+ dp->lddir_offset = pdp->lddir_offset + 1;
+
+ /*
+ * If we're at the end of the dirent list and the offset (which
+ * is necessarily the largest offset in this directory) is more
+ * than twice the number of dirents, that means the directory is
+ * 50% holes. At this point we reset the slot pointer back to
+ * the beginning of the directory so we start using the holes.
+ * The idea is that if there are N dirents, there must also be
+ * N holes, so we can satisfy the next N creates by walking at
+ * most 2N entries; thus the average cost of a create is constant.
+ * Note that we use the first dirent's lddir_prev as the roving
+ * slot pointer; it's ugly, but it saves a word in every dirent.
+ */
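+	/*
+	 * To illustrate with hypothetical numbers: with lxdn_dirents == 10
+	 * but a largest offset of 25, more than half of the offsets in
+	 * 0..25 are holes, so the roving pointer is reset to the list head
+	 * and subsequent inserts start re-filling them.
+	 */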
+ if (pdp->lddir_next == NULL &&
+ pdp->lddir_offset > 2 * dir->lxdn_dirents)
+ dir->lxdn_dir->lddir_prev = dir->lxdn_dir->lddir_next;
+ else
+ dir->lxdn_dir->lddir_prev = dp;
+
+ ASSERT(pdp->lddir_next != pdp);
+ ASSERT(pdp->lddir_prev != pdp);
+
+ dp->lddir_next = pdp->lddir_next;
+ if (dp->lddir_next) {
+ dp->lddir_next->lddir_prev = dp;
+ }
+ dp->lddir_prev = pdp;
+ pdp->lddir_next = dp;
+
+ ASSERT(dp->lddir_next != dp);
+ ASSERT(dp->lddir_prev != dp);
+ ASSERT(pdp->lddir_next != pdp);
+ ASSERT(pdp->lddir_prev != pdp);
+
+ gethrestime(&now);
+ dir->lxdn_mtime = now;
+ dir->lxdn_ctime = now;
+
+ return (0);
+}
+
+/*
+ * Enter a directory entry for 'name' into directory 'dir'
+ *
+ * Returns 0 on success.
+ */
+int
+lxd_direnter(
+ lxd_mnt_t *lxdm,
+ lxd_node_t *dir, /* target directory to make entry in */
+ char *name, /* name of entry */
+ enum de_op op, /* entry operation */
+ lxd_node_t *fromparent, /* original directory if rename */
+ lxd_node_t *ldn, /* existing lxd_node, if rename */
+ struct vattr *va,
+ lxd_node_t **rnp, /* return lxd_node, if create/mkdir */
+ cred_t *cr)
+{
+ lxd_dirent_t *dirp;
+ lxd_node_t *found = NULL;
+ int error = 0;
+ char *s;
+
+ /* lxdn_rwlock is held to serialize direnter and dirdeletes */
+ ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock));
+ ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+ /*
+	 * Don't allow '/' characters in the pathname component.
+ */
+ for (s = name; *s; s++)
+ if (*s == '/')
+ return (EACCES);
+
+ if (name[0] == '\0')
+ panic("lxd_direnter: NULL name");
+
+ /*
+ * For rename lock the source entry and check the link count
+ * to see if it has been removed while it was unlocked.
+ */
+ if (op == DE_RENAME) {
+ mutex_enter(&ldn->lxdn_tlock);
+ if (ldn->lxdn_nlink == 0) {
+ mutex_exit(&ldn->lxdn_tlock);
+ return (ENOENT);
+ }
+
+ if (ldn->lxdn_nlink == MAXLINK) {
+ mutex_exit(&ldn->lxdn_tlock);
+ return (EMLINK);
+ }
+ ldn->lxdn_nlink++;
+ gethrestime(&ldn->lxdn_ctime);
+ mutex_exit(&ldn->lxdn_tlock);
+ }
+
+ /*
+ * This might be a "dangling detached directory" (it could have been
+ * removed, but a reference to it kept in u_cwd). Don't bother
+ * searching it, and with any luck the user will get tired of dealing
+ * with us and cd to some absolute pathway (thus in ufs, too).
+ */
+ if (dir->lxdn_nlink == 0) {
+ error = ENOENT;
+ goto out;
+ }
+
+ /*
+ * If this is a rename of a directory and the parent is different
+ * (".." must be changed), then the source directory must not be in the
+ * directory hierarchy above the target, as this would orphan
+ * everything below the source directory.
+ */
+ if (op == DE_RENAME) {
+ if (ldn == dir) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((ldn->lxdn_vnode->v_type) == VDIR) {
+ if ((fromparent != dir) &&
+ (error = lxd_dircheckpath(ldn, dir)) != 0) {
+ goto out;
+ }
+ }
+ }
+
+ /* Search for an existing entry. */
+ dirp = lxd_find_dirent(name, dir, HOLD, &found);
+ if (dirp != NULL) {
+ ASSERT(found != NULL);
+ switch (op) {
+ case DE_CREATE:
+ case DE_MKDIR:
+ if (rnp != NULL) {
+ *rnp = found;
+ error = EEXIST;
+ } else {
+ ldnode_rele(found);
+ }
+ break;
+
+ case DE_RENAME:
+ /*
+ * Note that we only hit this path when we're renaming
+ * a symlink from one directory to another and there is
+ * a pre-existing symlink as the target. lxd_rename
+ * will unlink the src from the original directory but
+ * here we need to unlink the dest that we collided
+ * with, then create the new directory entry as we do
+ * below when there is no pre-existing symlink.
+ */
+ if ((error = lxd_naccess(dir, VWRITE, cr)) != 0)
+ goto out;
+
+ ASSERT(found->lxdn_vnode->v_type == VLNK);
+ /* dir rw lock is already held and asserted above */
+ rw_enter(&found->lxdn_rwlock, RW_WRITER);
+ error = lxd_dirdelete(dir, found, name, DR_RENAME, cr);
+ rw_exit(&found->lxdn_rwlock);
+ ldnode_rele(found);
+ if (error != 0)
+ goto out;
+
+ error = lxd_diraddentry(dir, ldn, name);
+ if (error == 0 && rnp != NULL)
+ *rnp = ldn;
+ break;
+ }
+ } else {
+
+ /*
+ * The directory entry does not exist, but the node might if
+ * this is a rename. Check write permission in directory to
+ * see if entry can be created.
+ */
+ if ((error = lxd_naccess(dir, VWRITE, cr)) != 0)
+ goto out;
+ if (op == DE_CREATE || op == DE_MKDIR) {
+ /*
+ * Make new lxd_node and directory entry as required.
+ */
+ error = lxd_dir_make_node(dir, lxdm, va, op, &ldn, cr);
+ if (error)
+ goto out;
+ }
+
+ error = lxd_diraddentry(dir, ldn, name);
+ if (error != 0) {
+ if (op == DE_CREATE || op == DE_MKDIR) {
+ /*
+ * Unmake the inode we just made.
+ */
+ rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+ if ((ldn->lxdn_vnode->v_type) == VDIR) {
+ ASSERT(dirp == NULL);
+ /*
+ * cleanup allocs made by lxd_dirinit
+ */
+ lxd_dirtrunc(ldn);
+ }
+ mutex_enter(&ldn->lxdn_tlock);
+ ldn->lxdn_nlink = 0;
+ gethrestime(&ldn->lxdn_ctime);
+ mutex_exit(&ldn->lxdn_tlock);
+ rw_exit(&ldn->lxdn_rwlock);
+ ldnode_rele(ldn);
+ ldn = NULL;
+ }
+ } else if (rnp != NULL) {
+ *rnp = ldn;
+ } else if (op == DE_CREATE || op == DE_MKDIR) {
+ ldnode_rele(ldn);
+ }
+ }
+
+out:
+ if (error && op == DE_RENAME) {
+ /* Undo bumped link count. */
+ mutex_enter(&ldn->lxdn_tlock);
+ ldn->lxdn_nlink--;
+ gethrestime(&ldn->lxdn_ctime);
+ mutex_exit(&ldn->lxdn_tlock);
+ }
+ return (error);
+}
+
+/*
+ * Delete entry ldn of name "nm" from parent dir. This is used to both remove
+ * a directory and to remove file nodes within the directory (by recursively
+ * calling itself). It frees the dir entry space and decrements link count on
+ * lxd_node(s).
+ *
+ * Return 0 on success.
+ */
+int
+lxd_dirdelete(lxd_node_t *dir, lxd_node_t *ldn, char *nm, enum dr_op op,
+ cred_t *cred)
+{
+ lxd_dirent_t *dirp;
+ int error;
+ size_t namelen;
+ lxd_node_t *fndnp;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock));
+ ASSERT(RW_WRITE_HELD(&ldn->lxdn_rwlock));
+ ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+ if (nm[0] == '\0')
+ panic("lxd_dirdelete: empty name for 0x%p", (void *)ldn);
+
+ /*
+ * return error when removing . and ..
+ */
+ if (nm[0] == '.') {
+ if (nm[1] == '\0')
+ return (EINVAL);
+ if (nm[1] == '.' && nm[2] == '\0')
+ return (EEXIST); /* thus in ufs */
+ }
+
+ if ((error = lxd_naccess(dir, VEXEC|VWRITE, cred)) != 0)
+ return (error);
+
+ if (dir->lxdn_dir == NULL)
+ return (ENOENT);
+
+ if (op == DR_RMDIR) {
+ /*
+ * This is the top-level removal of a directory. Start by
+ * removing any file entries from the dir. We do this by
+ * recursively calling back into this function with a different
+ * op code. The caller of this function has already verified
+ * that it is safe to remove this directory.
+ */
+ lxd_dirent_t *dirp;
+
+ ASSERT(ldn->lxdn_vnode->v_type == VDIR);
+
+ dirp = ldn->lxdn_dir;
+ while (dirp) {
+ lxd_node_t *dn;
+ lxd_dirent_t *nextp;
+
+ if (strcmp(dirp->lddir_name, ".") == 0 ||
+ strcmp(dirp->lddir_name, "..") == 0) {
+ dirp = dirp->lddir_next;
+ continue;
+ }
+
+ dn = dirp->lddir_node;
+ nextp = dirp->lddir_next;
+
+ ldnode_hold(dn);
+ error = lxd_dirdelete(ldn, dn, dirp->lddir_name,
+ DR_REMOVE, cred);
+ ldnode_rele(dn);
+
+ dirp = nextp;
+ }
+ }
+
+ dirp = lxd_find_dirent(nm, dir, NOHOLD, &fndnp);
+ VERIFY(dirp != NULL);
+ VERIFY(ldn == fndnp);
+
+ lxd_rm_dirent(dirp);
+
+ /* Take dirp out of the directory list. */
+ ASSERT(dirp->lddir_next != dirp);
+ ASSERT(dirp->lddir_prev != dirp);
+ if (dirp->lddir_prev) {
+ dirp->lddir_prev->lddir_next = dirp->lddir_next;
+ }
+ if (dirp->lddir_next) {
+ dirp->lddir_next->lddir_prev = dirp->lddir_prev;
+ }
+
+ /*
+ * If the roving slot pointer happens to match dirp,
+ * point it at the previous dirent.
+ */
+ if (dir->lxdn_dir->lddir_prev == dirp) {
+ dir->lxdn_dir->lddir_prev = dirp->lddir_prev;
+ }
+ ASSERT(dirp->lddir_next != dirp);
+ ASSERT(dirp->lddir_prev != dirp);
+
+ /* dirp points to the correct directory entry */
+ namelen = strlen(dirp->lddir_name) + 1;
+
+ kmem_free(dirp, sizeof (lxd_dirent_t) + namelen);
+ dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen);
+ dir->lxdn_dirents--;
+
+ gethrestime(&now);
+ dir->lxdn_mtime = now;
+ dir->lxdn_ctime = now;
+ ldn->lxdn_ctime = now;
+
+ ASSERT(ldn->lxdn_nlink > 0);
+ mutex_enter(&ldn->lxdn_tlock);
+ ldn->lxdn_nlink--;
+ mutex_exit(&ldn->lxdn_tlock);
+ if (op == DR_RMDIR && ldn->lxdn_vnode->v_type == VDIR) {
+ lxd_dirtrunc(ldn);
+ ASSERT(ldn->lxdn_nlink == 0);
+ }
+ return (0);
+}
+
+/*
+ * Initialize a lxd_node and add it to the file list under the mount point.
+ */
+void
+lxd_node_init(lxd_mnt_t *lxdm, lxd_node_t *ldn, vnode_t *realvp, vattr_t *vap,
+ cred_t *cred)
+{
+ struct vnode *vp;
+ timestruc_t now;
+
+ ASSERT(vap != NULL);
+
+ rw_init(&ldn->lxdn_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&ldn->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL);
+ ldn->lxdn_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ ldn->lxdn_mask = 0;
+ ldn->lxdn_attr.va_type = vap->va_type;
+ ldn->lxdn_nlink = 1;
+ ldn->lxdn_size = 0;
+
+ if (cred == NULL) {
+ ldn->lxdn_uid = vap->va_uid;
+ ldn->lxdn_gid = vap->va_gid;
+ } else {
+ ldn->lxdn_uid = crgetuid(cred);
+ ldn->lxdn_gid = crgetgid(cred);
+ }
+
+ ldn->lxdn_fsid = lxdm->lxdm_dev;
+ ldn->lxdn_rdev = vap->va_rdev;
+ ldn->lxdn_blksize = PAGESIZE;
+ ldn->lxdn_nblocks = 0;
+ gethrestime(&now);
+ ldn->lxdn_atime = now;
+ ldn->lxdn_mtime = now;
+ ldn->lxdn_ctime = now;
+ ldn->lxdn_seq = 0;
+ ldn->lxdn_dir = NULL;
+
+ ldn->lxdn_real_vp = realvp;
+
+ ldn->lxdn_vnode = vn_alloc(KM_SLEEP);
+ vp = LDNTOV(ldn);
+ vn_setops(vp, lxd_vnodeops);
+ vp->v_vfsp = lxdm->lxdm_vfsp;
+ vp->v_type = vap->va_type;
+ vp->v_rdev = vap->va_rdev;
+ vp->v_data = (caddr_t)ldn;
+
+ mutex_enter(&lxdm->lxdm_contents);
+ ldn->lxdn_nodeid = lxdm->lxdm_gen++;
+
+ /*
+ * Add new lxd_node to end of linked list of lxd_nodes for this
+ * lxdevfs. Root directory is handled specially in lxd_mount.
+ */
+ if (lxdm->lxdm_rootnode != (lxd_node_t *)NULL) {
+ ldn->lxdn_next = NULL;
+ ldn->lxdn_prev = lxdm->lxdm_rootnode->lxdn_prev;
+ ldn->lxdn_prev->lxdn_next = lxdm->lxdm_rootnode->lxdn_prev =
+ ldn;
+ }
+ mutex_exit(&lxdm->lxdm_contents);
+ vn_exists(vp);
+}
+
+/*
+ * lxd_dirinit is used internally to initialize a directory (dir)
+ * with '.' and '..' entries, without checking permissions or locking.
+ */
+void
+lxd_dirinit(lxd_node_t *parent, lxd_node_t *dir)
+{
+ lxd_dirent_t *dot, *dotdot;
+ timestruc_t now;
+ lxd_mnt_t *lxdm = VTOLXDM(dir->lxdn_vnode);
+
+ ASSERT(RW_WRITE_HELD(&parent->lxdn_rwlock));
+ ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+ dir->lxdn_nodeid = lxdm->lxdm_gen++;
+
+ /*
+ * Initialize the entries
+ */
+ dot = kmem_zalloc(sizeof (lxd_dirent_t) + 2, KM_SLEEP);
+ dot->lddir_node = dir;
+ dot->lddir_offset = 0;
+ dot->lddir_name = (char *)dot + sizeof (lxd_dirent_t);
+ dot->lddir_name[0] = '.';
+ dot->lddir_parent = dir;
+ lxd_save_dirent(dot);
+
+ dotdot = kmem_zalloc(sizeof (lxd_dirent_t) + 3, KM_SLEEP);
+ dotdot->lddir_node = parent;
+ dotdot->lddir_offset = 1;
+ dotdot->lddir_name = (char *)dotdot + sizeof (lxd_dirent_t);
+ dotdot->lddir_name[0] = '.';
+ dotdot->lddir_name[1] = '.';
+ dotdot->lddir_parent = dir;
+ lxd_save_dirent(dotdot);
+
+ /*
+ * Initialize directory entry list.
+ */
+ dot->lddir_next = dotdot;
+ dot->lddir_prev = dotdot; /* dot's lddir_prev holds roving slot ptr */
+ dotdot->lddir_next = NULL;
+ dotdot->lddir_prev = dot;
+
+ gethrestime(&now);
+ dir->lxdn_mtime = now;
+ dir->lxdn_ctime = now;
+
+ parent->lxdn_nlink++;
+ parent->lxdn_ctime = now;
+
+ dir->lxdn_dir = dot;
+	dir->lxdn_size = 2 * sizeof (lxd_dirent_t) + 5; /* ".\0" + "..\0" */
+ dir->lxdn_dirents = 2;
+ dir->lxdn_nlink = 2;
+ dir->lxdn_parent = parent;
+}
+
+/*
+ * lxd_dirtrunc is called to remove all directory entries under this directory.
+ */
+void
+lxd_dirtrunc(lxd_node_t *dir)
+{
+ lxd_dirent_t *ldp;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock));
+ ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+ for (ldp = dir->lxdn_dir; ldp; ldp = dir->lxdn_dir) {
+ size_t namelen;
+ lxd_node_t *ldn;
+
+ ASSERT(ldp->lddir_next != ldp);
+ ASSERT(ldp->lddir_prev != ldp);
+ ASSERT(ldp->lddir_node);
+
+ dir->lxdn_dir = ldp->lddir_next;
+ namelen = strlen(ldp->lddir_name) + 1;
+
+ /*
+ * Adjust the link counts to account for this directory entry
+ * removal. We do hold/rele operations to free up these nodes.
+ */
+ ldn = ldp->lddir_node;
+
+ ASSERT(ldn->lxdn_nlink > 0);
+ mutex_enter(&ldn->lxdn_tlock);
+ ldn->lxdn_nlink--;
+ mutex_exit(&ldn->lxdn_tlock);
+
+ lxd_rm_dirent(ldp);
+ kmem_free(ldp, sizeof (lxd_dirent_t) + namelen);
+ dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen);
+ dir->lxdn_dirents--;
+ }
+
+ gethrestime(&now);
+ dir->lxdn_mtime = now;
+ dir->lxdn_ctime = now;
+
+ ASSERT(dir->lxdn_dir == NULL);
+ ASSERT(dir->lxdn_size == 0);
+ ASSERT(dir->lxdn_dirents == 0);
+}
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c
new file mode 100644
index 0000000000..b2e2b9b9e3
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c
@@ -0,0 +1,860 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * The lx devfs (lxd) file system is used within lx branded zones to provide
+ * the Linux view of /dev.
+ *
+ * In the past, the Linux /dev was simply a lofs mount pointing at /native/dev.
+ * lxd now provides the Linux /dev.
+ *
+ * The lxd file system is a hybrid of lofs and tmpfs. It supports a "back" file
+ * system which is the special device and corresponds to the special device in
+ * a lofs mount. As with lofs, all files in the special device are accessible
+ * through the lxd mount. Because the zone's devfs is not directly modifiable
+ * within the zone (and mknod(2) is not generally allowed within a zone), it is
+ * impossible to create files in devfs. For lx, in some cases it's useful to be
+ * able to make new symlinks or new directories under /dev. lxd implements
+ * these operations by creating "files" in memory in the same way as tmpfs
+ * does. Within lxd these are referred to as "front" files. For operations such
+ * as lookup or readdir, lxd provides a merged view of both the front and back
+ * files. lxd does not support regular front files or simple I/O (read/write)
+ * to front files, since there is no need for that. For back files, all
+ * operations are simply passed through to the real vnode, as is done with
+ * lofs. Front files are not allowed to mask back files.
+ *
+ * The Linux /dev is now a lxd mount with the special file (i.e. the back
+ * file system) as /native/dev.
+ *
+ * In addition, lx has a need for some illumos/Linux translation for the
+ * various *stat(2) system calls when used on a device. This translation can
+ * be centralized within lxd's getattr vnode entry point.
+ *
+ * Because the front file system only exists in memory and the back file
+ * system is the zone's devfs, which is not persistent across reboots, we
+ * track any device uid/gid/mode changes in a per-zone /etc/.lxd_dev_attr
+ * file and re-apply those changes when the lx devfs file system is mounted.
+ * Currently only changes to block device nodes are persistent.
+ */
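+
+/*
+ * An illustrative sketch (assuming the standard lx zone layout): the zone's
+ * Linux /dev is an lx_devfs mount whose special device is the real devfs,
+ * conceptually
+ *
+ *	mount -F lx_devfs /native/dev /dev
+ *
+ * so /dev/null resolves through a "back" vnode onto /native/dev/null, while
+ * a symlink created under /dev from within the zone exists only as an
+ * in-memory "front" node.
+ */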
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_ptm.h>
+#include <sys/lx_impl.h>
+
+#include "lxd.h"
+
+/* Module level parameters */
+static int lxd_fstype;
+static dev_t lxd_dev;
+
+/*
+ * lxd_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. The filesystem module must not be
+ * allowed to go away before the last VFS_FREEVFS() call has been made. Since
+ * this is just an atomic counter, there's no need for locking.
+ */
+static uint32_t lxd_mountcount;
+
+/*
+ * lxd_minfree is the minimum amount of swap space that lx devfs leaves for
+ * the rest of the zone.
+ */
+size_t lxd_minfree = 0;
+
+/*
+ * LXDMINFREE -- the value from which lxd_minfree is derived -- should be
+ * configured to a value that is roughly the smallest practical value for
+ * memory + swap minus the largest reasonable size for lxd in such
+ * a configuration. As of this writing, the smallest practical memory + swap
+ * configuration is 128MB, and it seems reasonable to allow lxd to consume
+ * no more than ~10% of this, yielding a LXDMINFREE of 12MB.
+ */
+#define	LXDMINFREE	(12 * 1024 * 1024)	/* 12 Megabytes */
+
+extern pgcnt_t swapfs_minfree;
+
+extern int lxd_symlink(vnode_t *, char *, struct vattr *, char *, cred_t *,
+ caller_context_t *, int);
+extern int stat64(char *, struct stat64 *);
+
+/*
+ * lxd vfs operations.
+ */
+static int lxd_init(int, char *);
+static int lxd_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
+static int lxd_unmount(vfs_t *, int, cred_t *);
+static int lxd_root(vfs_t *, vnode_t **);
+static int lxd_statvfs(vfs_t *, statvfs64_t *);
+static void lxd_freevfs(vfs_t *vfsp);
+
+/*
+ * Loadable module wrapper
+ */
+#include <sys/modctl.h>
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "lx_devfs",
+ lxd_init,
+ VSW_ZMOUNT,
+ NULL
+};
+
+/*
+ * Module linkage information
+ */
+static struct modlfs modlfs = {
+ &mod_fsops, "lx brand devfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modlfs, NULL
+};
+
+/*
+ * Definitions and translators for devt's.
+ */
+static void lxd_pts_devt_translator(dev_t, dev_t *);
+static void lxd_ptm_devt_translator(dev_t, dev_t *);
+
+static kmutex_t lxd_xlate_lock;
+static boolean_t lxd_xlate_initialized = B_FALSE;
+
+static lxd_minor_translator_t lxd_mtranslator_mm[] = {
+ { "/dev/null", 0, 1, 3 },
+ { "/dev/zero", 0, 1, 5 },
+ { NULL, 0, 0, 0 }
+};
+static lxd_minor_translator_t lxd_mtranslator_random[] = {
+ { "/dev/random", 0, 1, 8 },
+ { "/dev/urandom", 0, 1, 9 },
+ { NULL, 0, 0, 0 }
+};
+static lxd_minor_translator_t lxd_mtranslator_sy[] = {
+ { "/dev/tty", 0, LX_TTY_MAJOR, 0 },
+ { NULL, 0, 0, 0 }
+};
+static lxd_minor_translator_t lxd_mtranslator_zcons[] = {
+ { "/dev/console", 0, LX_TTY_MAJOR, 1 },
+ { NULL, 0, 0, 0 }
+};
+lxd_devt_translator_t lxd_devt_translators[] = {
+ { "mm", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_mm },
+ { "random", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_random },
+ { "sy", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_sy },
+ { "zcons", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_zcons },
+ { LX_PTM_DRV, 0, DTT_CUSTOM, (uintptr_t)lxd_ptm_devt_translator },
+ { "pts", 0, DTT_CUSTOM, (uintptr_t)lxd_pts_devt_translator },
+ { NULL, 0, DTT_INVALID, NULL }
+};
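+
+/*
+ * To illustrate using the tables above: a stat(2) of /dev/null inside the
+ * zone finds the illumos "mm" driver's minor node for /dev/null and reports
+ * the Linux device numbers major 1, minor 3 that Linux expects; drivers
+ * such as pts, whose minors cannot be enumerated in a fixed list, use a
+ * DTT_CUSTOM translation function instead.
+ */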
+
+int
+_init()
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini()
+{
+ int error;
+
+ if (lxd_mountcount > 0)
+ return (EBUSY);
+
+ if ((error = mod_remove(&modlinkage)) != 0)
+ return (error);
+
+ /*
+ * Tear down the operations vectors
+ */
+ (void) vfs_freevfsops_by_type(lxd_fstype);
+ vn_freevnodeops(lxd_vnodeops);
+ mutex_destroy(&lxd_xlate_lock);
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Initialize global locks, etc. Called when loading lxd module.
+ */
+static int
+lxd_init(int fstype, char *name)
+{
+ static const fs_operation_def_t lxd_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = lxd_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = lxd_unmount },
+ VFSNAME_ROOT, { .vfs_root = lxd_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = lxd_statvfs },
+ VFSNAME_FREEVFS, { .vfs_freevfs = lxd_freevfs },
+ NULL, NULL
+ };
+ extern const struct fs_operation_def lxd_vnodeops_template[];
+ int error;
+ major_t dev;
+
+ lxd_fstype = fstype;
+ ASSERT(lxd_fstype != 0);
+
+ error = vfs_setfsops(fstype, lxd_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "lxd_init: bad vfs ops template");
+ return (error);
+ }
+
+ error = vn_make_ops(name, lxd_vnodeops_template, &lxd_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "lxd_init: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+	 * lxd_minfree doesn't need to be a function of the configured
+	 * swap space; it is really an absolute floor of swap space kept
+	 * free so that other processes can still execute.
+ */
+ if (lxd_minfree == 0) {
+ /* Set if not patched */
+ lxd_minfree = btopr(LXDMINFREE);
+ }
+
+ if ((dev = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN, "lxd_init: Can't get unique device number.");
+ dev = 0;
+ }
+
+ /*
+ * Make the pseudo device
+ */
+ lxd_dev = makedevice(dev, 0);
+
+ mutex_init(&lxd_xlate_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+}
+
+/*
+ * Initialize device translator mapping table.
+ *
+ * Note that we cannot do this in lxd_init since that can lead to a recursive
+ * rw_enter while we're doing lookupnameat (via sdev_lookup/prof_make_maps/
+ * devi_attach_node/modload). Thus we do it in the mount path and keep track
+ * so that we only initialize the table once.
+ */
+static void
+lxd_xlate_init()
+{
+ int i;
+
+ mutex_enter(&lxd_xlate_lock);
+ if (lxd_xlate_initialized) {
+ mutex_exit(&lxd_xlate_lock);
+ return;
+ }
+
+ for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) {
+ lxd_minor_translator_t *mt;
+ int j;
+
+ lxd_devt_translators[i].lxd_xl_major =
+ mod_name_to_major(lxd_devt_translators[i].lxd_xl_driver);
+
+ /* if this translator doesn't use a list mapping we're done. */
+ if (lxd_devt_translators[i].lxd_xl_type != DTT_LIST)
+ continue;
+
+ /* for each device listed, lookup the minor node number */
+ mt = lxd_devt_translators[i].xl_list;
+ for (j = 0; mt[j].lxd_mt_path != NULL; j++) {
+ vnode_t *vp;
+ struct vattr va;
+ char *tpath;
+ char tnm[MAXPATHLEN];
+
+ /*
+ * The attach might be triggered in either the global
+ * zone or in a non-global zone, so we may need to
+ * adjust the path if we're in a NGZ.
+ */
+ if (curproc->p_zone->zone_id == GLOBAL_ZONEUNIQID) {
+ tpath = mt[j].lxd_mt_path;
+ } else {
+ (void) snprintf(tnm, sizeof (tnm), "/native%s",
+ mt[j].lxd_mt_path);
+ tpath = tnm;
+ }
+
+ if (lookupnameat(tpath, UIO_SYSSPACE, FOLLOW, NULL,
+ &vp, NULL) != 0) {
+ mt[j].lxd_mt_minor = UINT_MAX;
+ continue;
+ }
+
+ va.va_mask = AT_RDEV;
+ if (VOP_GETATTR(vp, &va, 0, kcred, NULL) != 0) {
+ va.va_rdev = NODEV;
+ } else {
+ ASSERT(getmajor(va.va_rdev) ==
+ lxd_devt_translators[i].lxd_xl_major);
+ ASSERT(mt[j].lxd_mt_lx_minor < LX_MAXMIN);
+ }
+
+ mt[j].lxd_mt_minor = getminor(va.va_rdev);
+
+ VN_RELE(vp);
+ }
+ }
+
+ lxd_xlate_initialized = B_TRUE;
+ mutex_exit(&lxd_xlate_lock);
+}
+
+static int
+lxd_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+ lxd_mnt_t *lxdm = NULL;
+ struct lxd_node *ldn;
+ struct pathname dpn;
+ int error;
+ int i;
+ int nodev;
+ struct vattr rattr;
+ vnode_t *realrootvp;
+ vnode_t *tvp;
+ lx_zone_data_t *lxzdata;
+ lx_virt_disk_t *vd;
+ vattr_t vattr;
+
+ nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);
+
+ if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+ return (error);
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ lxd_xlate_init();
+
+ /*
+ * This is the same behavior as with lofs.
+ * Loopback devices which get "nodevices" added can be done without
+ * "nodevices" set because we cannot import devices into a zone
+ * with loopback. Note that we have all zone privileges when
+ * this happens; if not, we'd have gotten "nosuid".
+ */
+ if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+ vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);
+
+ /*
+ * Only allow mounting within lx zones.
+ */
+ if (curproc->p_zone->zone_brand != &lx_brand)
+ return (EINVAL);
+
+ /*
+ * Ensure we don't allow overlaying mounts
+ */
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /* lxd doesn't support read-only mounts */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = pn_get(uap->dir,
+ (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn);
+ if (error != 0)
+ goto out;
+
+ /*
+ * Find real root
+ */
+ if ((error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
+ UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))) {
+ pn_free(&dpn);
+ return (error);
+ }
+
+ if ((error = VOP_ACCESS(realrootvp, 0, 0, cr, NULL)) != 0) {
+ pn_free(&dpn);
+ VN_RELE(realrootvp);
+ return (error);
+ }
+
+ /* If realroot is not a devfs, error out */
+ if (strcmp(realrootvp->v_op->vnop_name, "dev") != 0) {
+ pn_free(&dpn);
+ VN_RELE(realrootvp);
+ return (EINVAL);
+ }
+
+ lxdm = kmem_zalloc(sizeof (*lxdm), KM_SLEEP);
+
+ /* init but don't bother entering the mutex (not on mount list yet) */
+ mutex_init(&lxdm->lxdm_contents, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&lxdm->lxdm_renamelck, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&lxdm->lxdm_attrlck, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&lxdm->lxdm_devattrs, sizeof (lxd_dev_attr_t),
+ offsetof(lxd_dev_attr_t, lxda_link));
+
+ /* Initialize the hash table mutexes */
+ for (i = 0; i < LXD_HASH_SZ; i++) {
+ mutex_init(&lxdm->lxdm_hash_mutex[i], NULL, MUTEX_DEFAULT,
+ NULL);
+ }
+
+ lxdm->lxdm_vfsp = vfsp;
+ lxdm->lxdm_gen = 1; /* start inode counter at 1 */
+
+ vfsp->vfs_data = (caddr_t)lxdm;
+ vfsp->vfs_fstype = lxd_fstype;
+ vfsp->vfs_dev = lxd_dev;
+ vfsp->vfs_bsize = PAGESIZE;
+ vfsp->vfs_flag |= VFS_NOTRUNC;
+ vfs_make_fsid(&vfsp->vfs_fsid, lxd_dev, lxd_fstype);
+ lxdm->lxdm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
+ (void) strcpy(lxdm->lxdm_mntpath, dpn.pn_path);
+
+ /* allocate and initialize root lxd_node structure */
+ bzero(&rattr, sizeof (struct vattr));
+ rattr.va_mode = (mode_t)(S_IFDIR | 0755);
+ rattr.va_type = VDIR;
+ rattr.va_rdev = 0;
+
+ tvp = lxd_make_back_node(realrootvp, lxdm);
+ ldn = VTOLDN(tvp);
+
+ rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+ LDNTOV(ldn)->v_flag |= VROOT;
+
+ /*
+ * initialize linked list of lxd_nodes so that the back pointer of
+ * the root lxd_node always points to the last one on the list
+ * and the forward pointer of the last node is null
+ */
+ ldn->lxdn_prev = ldn;
+ ldn->lxdn_next = NULL;
+ ldn->lxdn_nlink = 0;
+ lxdm->lxdm_rootnode = ldn;
+
+ ldn->lxdn_nodeid = lxdm->lxdm_gen++;
+ lxd_dirinit(ldn, ldn);
+
+ rw_exit(&ldn->lxdn_rwlock);
+
+ pn_free(&dpn);
+ error = 0;
+ atomic_inc_32(&lxd_mountcount);
+
+ lxzdata = ztolxzd(curproc->p_zone);
+ ASSERT(lxzdata->lxzd_vdisks != NULL);
+
+ vattr.va_mask = AT_TYPE | AT_MODE;
+ vattr.va_type = VLNK;
+ vattr.va_mode = 0777;
+
+ vd = list_head(lxzdata->lxzd_vdisks);
+ while (vd != NULL) {
+ if (vd->lxvd_type == LXVD_ZVOL) {
+ char lnknm[MAXPATHLEN];
+
+ /* Create a symlink for the actual zvol. */
+ (void) snprintf(lnknm, sizeof (lnknm),
+ "./zvol/dsk/%s", vd->lxvd_real_name);
+ (void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr,
+ lnknm, cr, NULL, 0);
+ } else if (vd->lxvd_type == LXVD_ZFS_DS) {
+ /*
+ * Create a symlink for the root "disk" using /dev/zfs
+ * as the target device.
+ */
+ (void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr,
+ "./zfs", cr, NULL, 0);
+ }
+
+ vd = list_next(lxzdata->lxzd_vdisks, vd);
+ }
+
+ /* Apply any persistent attribute changes. */
+ lxd_apply_db(lxdm);
+
+out:
+ if (error == 0)
+ vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
+
+ return (error);
+}
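
For reference, the node-list invariant set up in lxd_mount above (the root's
lxdn_prev always points at the tail; the tail's lxdn_next is NULL) makes
appends O(1). A minimal sketch of such an append, assuming only the field
names visible in this patch (the real insertion logic lives elsewhere in
this change):

    /*
     * Illustrative only: append "node" to the list headed by "root",
     * preserving the invariant established in lxd_mount.
     */
    static void
    node_list_append(lxd_node_t *root, lxd_node_t *node)
    {
            lxd_node_t *tail = root->lxdn_prev;     /* root's prev = tail */

            tail->lxdn_next = node;
            node->lxdn_prev = tail;
            node->lxdn_next = NULL;         /* new node becomes the tail */
            root->lxdn_prev = node;         /* root's back pointer tracks it */
    }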
+
+static int
+lxd_unmount(struct vfs *vfsp, int flag, struct cred *cr)
+{
+ lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+ lxd_node_t *ldn, *cancel;
+ struct vnode *vp;
+ int error;
+ uint_t cnt;
+
+ if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
+ return (error);
+
+ mutex_enter(&lxdm->lxdm_contents);
+
+ /*
+ * In the normal unmount case only the root node should still hold a
+ * reference.
+ *
+ * With lxdm_contents held, nothing can be added or removed.
+ * If we find a node that is still referenced, undo the holds we have
+ * placed and fail with EBUSY.
+ */
+ ldn = lxdm->lxdm_rootnode;
+
+ vp = LDNTOV(ldn);
+ mutex_enter(&vp->v_lock);
+
+ if (flag & MS_FORCE) {
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&lxdm->lxdm_contents);
+ return (EINVAL);
+ }
+
+ cnt = vp->v_count;
+ if (cnt > 1) {
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&lxdm->lxdm_contents);
+ return (EBUSY);
+ }
+
+ mutex_exit(&vp->v_lock);
+
+ /*
+ * Check for open files. An open file causes everything to unwind.
+ */
+ for (ldn = ldn->lxdn_next; ldn; ldn = ldn->lxdn_next) {
+ vp = LDNTOV(ldn);
+ mutex_enter(&vp->v_lock);
+ cnt = vp->v_count;
+ if (cnt > 0) {
+ /* An open file; unwind the holds we've been adding. */
+ mutex_exit(&vp->v_lock);
+ cancel = lxdm->lxdm_rootnode->lxdn_next;
+ while (cancel != ldn) {
+ vp = LDNTOV(cancel);
+ ASSERT(vp->v_count > 0);
+ VN_RELE(vp);
+ cancel = cancel->lxdn_next;
+ }
+ mutex_exit(&lxdm->lxdm_contents);
+ return (EBUSY);
+ } else {
+ /*
+ * It may seem incorrect for us to have a vnode with
+ * a count of 0, but this is modeled on tmpfs and works
+ * the same way. See lxd_front_inactive. There we allow
+ * the v_count to go to 0 but rely on the link count to
+ * keep the vnode alive. Since we now want to clean up
+ * these vnodes we manually add a VN_HOLD so that the
+ * VN_RELEs that occur in the lxd_freevfs() cleanup
+ * will take us down the lxd_inactive code path. We
+ * can bump v_count directly since we hold v_lock.
+ */
+ vp->v_count++;
+ mutex_exit(&vp->v_lock);
+ }
+ }
+
+ /*
+ * We can drop the mutex now because
+ * no one can find this mount anymore
+ */
+ vfsp->vfs_flag |= VFS_UNMOUNTED;
+ mutex_exit(&lxdm->lxdm_contents);
+
+ return (0);
+}
+
+/*
+ * Implementation of VFS_FREEVFS(). This is called by the vfs framework after
+ * umount and the last VFS_RELE, to trigger the release of any resources still
+ * associated with the given vfs_t. This is normally called immediately after
+ * lxd_unmount.
+ */
+void
+lxd_freevfs(vfs_t *vfsp)
+{
+ lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+ lxd_node_t *ldn;
+ struct vnode *vp;
+ lxd_dev_attr_t *da;
+
+ /*
+ * Free all kmem-allocated memory associated with this filesystem.
+ * To do this, we go through the node list twice: once to remove all
+ * the directory entries, and then again to remove all the pseudo
+ * files.
+ */
+
+ /*
+ * Now that we are tearing ourselves down we need to remove the
+ * UNMOUNTED flag. If we don't, the VN_RELEs issued while removing
+ * files below would drive hold counts negative. Doing this seems a
+ * bit better than adding a teardown flag to the lxd_mnt_t.
+ */
+ vfsp->vfs_flag &= ~VFS_UNMOUNTED;
+
+ /*
+ * Remove all directory entries (this doesn't remove top-level dirs).
+ */
+ for (ldn = lxdm->lxdm_rootnode; ldn; ldn = ldn->lxdn_next) {
+ rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+ if (ldn->lxdn_vnode->v_type == VDIR)
+ lxd_dirtrunc(ldn);
+ rw_exit(&ldn->lxdn_rwlock);
+ }
+
+ ASSERT(lxdm->lxdm_rootnode != NULL);
+
+ /*
+ * All links are gone, v_count is keeping nodes in place.
+ * VN_RELE should make the node disappear, unless somebody
+ * is holding pages against it. Nap and retry until it disappears.
+ *
+ * We re-acquire the lock to prevent others who have a HOLD on a
+ * lxd_node from blowing it away (in lxd_inactive) while we're trying
+ * to get to it here. Once we have a HOLD on it we know it'll stick
+ * around.
+ */
+ mutex_enter(&lxdm->lxdm_contents);
+
+ /*
+ * Remove all the files (except the rootnode) backwards.
+ */
+ while ((ldn = lxdm->lxdm_rootnode->lxdn_prev) != lxdm->lxdm_rootnode) {
+ mutex_exit(&lxdm->lxdm_contents);
+ /*
+ * All nodes will be released here. Note we handled the link
+ * count above.
+ */
+ vp = LDNTOV(ldn);
+ ASSERT(vp->v_type == VLNK || vp->v_type == VDIR ||
+ vp->v_type == VSOCK);
+ VN_RELE(vp);
+ mutex_enter(&lxdm->lxdm_contents);
+ /*
+ * It's still there after the RELE. Someone else like pageout
+ * has a hold on it so wait a bit and then try again - we know
+ * they'll give it up soon.
+ */
+ if (ldn == lxdm->lxdm_rootnode->lxdn_prev) {
+ VN_HOLD(vp);
+ mutex_exit(&lxdm->lxdm_contents);
+ delay(hz / 4);
+ mutex_enter(&lxdm->lxdm_contents);
+ }
+ }
+ mutex_exit(&lxdm->lxdm_contents);
+
+ ASSERT(lxdm->lxdm_back_refcnt == 1);
+ ASSERT(lxdm->lxdm_dent_refcnt == 0);
+
+ VN_RELE(LDNTOV(lxdm->lxdm_rootnode));
+
+ ASSERT(lxdm->lxdm_mntpath != NULL);
+ kmem_free(lxdm->lxdm_mntpath, strlen(lxdm->lxdm_mntpath) + 1);
+
+ da = list_remove_head(&lxdm->lxdm_devattrs);
+ while (da != NULL) {
+ kmem_free(da, sizeof (lxd_dev_attr_t));
+ da = list_remove_head(&lxdm->lxdm_devattrs);
+ }
+ list_destroy(&lxdm->lxdm_devattrs);
+
+ mutex_destroy(&lxdm->lxdm_contents);
+ mutex_destroy(&lxdm->lxdm_renamelck);
+ mutex_destroy(&lxdm->lxdm_attrlck);
+ kmem_free(lxdm, sizeof (lxd_mnt_t));
+
+ /* Allow _fini() to succeed now */
+ atomic_dec_32(&lxd_mountcount);
+}
+
+/*
+ * Return the root vnode for the given vfs.
+ */
+static int
+lxd_root(struct vfs *vfsp, struct vnode **vpp)
+{
+ lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+ lxd_node_t *ldn = lxdm->lxdm_rootnode;
+ struct vnode *vp;
+
+ ASSERT(ldn != NULL);
+
+ vp = LDNTOV(ldn);
+ VN_HOLD(vp);
+ *vpp = vp;
+ return (0);
+}
+
+static int
+lxd_statvfs(struct vfs *vfsp, statvfs64_t *sbp)
+{
+ lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+ ulong_t blocks;
+ dev32_t d32;
+ zoneid_t eff_zid;
+ struct zone *zp;
+
+ zp = lxdm->lxdm_vfsp->vfs_zone;
+
+ if (zp == NULL)
+ eff_zid = GLOBAL_ZONEUNIQID;
+ else
+ eff_zid = zp->zone_id;
+
+ sbp->f_bsize = PAGESIZE;
+ sbp->f_frsize = PAGESIZE;
+
+ /*
+ * Find the amount of available physical swap and memory swap.
+ */
+ mutex_enter(&anoninfo_lock);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+ blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
+ mutex_exit(&anoninfo_lock);
+
+ if (blocks > lxd_minfree)
+ sbp->f_bfree = blocks - lxd_minfree;
+ else
+ sbp->f_bfree = 0;
+
+ sbp->f_bavail = sbp->f_bfree;
+
+ /*
+ * Total number of blocks is just what's available
+ */
+ sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
+
+ if (eff_zid != GLOBAL_ZONEUNIQID &&
+ zp->zone_max_swap_ctl != UINT64_MAX) {
+ /*
+ * If the fs is used by a zone with a swap cap,
+ * then report the capped size.
+ */
+ rctl_qty_t cap, used;
+ pgcnt_t pgcap, pgused;
+
+ mutex_enter(&zp->zone_mem_lock);
+ cap = zp->zone_max_swap_ctl;
+ used = zp->zone_max_swap;
+ mutex_exit(&zp->zone_mem_lock);
+
+ pgcap = btop(cap);
+ pgused = btop(used);
+
+ sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
+ sbp->f_bavail = sbp->f_bfree;
+ sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
+ }
+
+ /*
+ * The maximum number of files available is approximately the number
+ * of lxd_nodes we can allocate from the remaining kernel memory
+ * available to lxdevfs in this zone. This is fairly inaccurate since
+ * it doesn't take into account the names stored in the directory
+ * entries.
+ */
+ sbp->f_ffree = sbp->f_files = ptob(availrmem) /
+ (sizeof (lxd_node_t) + sizeof (lxd_dirent_t));
+ sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sbp->f_fsid = d32;
+ (void) strcpy(sbp->f_basetype, vfssw[lxd_fstype].vsw_name);
+ (void) strncpy(sbp->f_fstr, lxdm->lxdm_mntpath, sizeof (sbp->f_fstr));
+ /* ensure null termination */
+ sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
+ sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sbp->f_namemax = MAXNAMELEN - 1;
+ return (0);
+}
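
To make the swap-cap clamping in lxd_statvfs concrete, a worked example with
assumed figures (4 KiB pages; purely illustrative):

    /*
     * zone_max_swap_ctl = 2 GiB   -> pgcap  = btop(2 GiB)   = 524288 pages
     * zone_max_swap     = 1.5 GiB -> pgused = btop(1.5 GiB) = 393216 pages
     *
     * f_bfree  = MIN(pgcap - pgused, f_bfree) = MIN(131072, f_bfree)
     * f_blocks = MIN(pgcap, f_blocks)         = MIN(524288, f_blocks)
     */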
+
+static void
+lxd_pts_devt_translator(dev_t dev, dev_t *jdev)
+{
+ minor_t min = getminor(dev);
+ int lx_maj, lx_min;
+
+ /*
+ * Linux uses a range of major numbers for pts devices to address the
+ * relatively small minor number space (20 bits).
+ */
+
+ lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MAXMIN);
+ lx_min = min % LX_MAXMIN;
+ if (lx_maj > LX_PTS_MAJOR_MAX) {
+ /*
+ * The major is outside the acceptable range but there's little
+ * we can presently do about it short of overhauling the
+ * translation logic.
+ */
+ lx_unsupported("pts major out of translation range");
+ }
+
+ *jdev = LX_MAKEDEVICE(lx_maj, lx_min);
+}
+
+/* ARGSUSED */
+static void
+lxd_ptm_devt_translator(dev_t dev, dev_t *jdev)
+{
+ *jdev = LX_MAKEDEVICE(LX_PTM_MAJOR, LX_PTM_MINOR);
+}
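
The pts translator above folds illumos pts minors into successive Linux
majors in LX_MAXMIN-sized chunks; the implied mapping, stated symbolically
(no constants assumed beyond those used in the code):

    /*
     *   minor 7                 -> (LX_PTS_MAJOR_MIN,     7)
     *   minor LX_MAXMIN + 5     -> (LX_PTS_MAJOR_MIN + 1, 5)
     *   minor 2 * LX_MAXMIN + 9 -> (LX_PTS_MAJOR_MIN + 2, 9)
     */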
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c
new file mode 100644
index 0000000000..8088ba6174
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c
@@ -0,0 +1,1520 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/cred.h>
+#include <sys/pathname.h>
+#include <sys/debug.h>
+#include <sys/sdt.h>
+#include <fs/fs_subr.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <sys/lx_brand.h>
+#include <sys/brand.h>
+
+#include "lxd.h"
+
+static int
+lxd_open(vnode_t **vpp, int flag, struct cred *cr, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(*vpp);
+ vnode_t *vp = *vpp;
+ vnode_t *rvp;
+ vnode_t *oldvp;
+ int error;
+
+ if (ldn->lxdn_type == LXDNT_FRONT)
+ return (0);
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ oldvp = vp;
+ vp = rvp = REALVP(vp);
+ /*
+ * Need to hold new reference to vp since VOP_OPEN() may
+ * decide to release it.
+ */
+ VN_HOLD(vp);
+ error = VOP_OPEN(&rvp, flag, cr, ct);
+
+ if (!error && rvp != vp) {
+ /*
+ * The FS we called should have released the new
+ * reference on vp.
+ */
+ *vpp = lxd_make_back_node(rvp, VFSTOLXDM(oldvp->v_vfsp));
+
+ if (IS_DEVVP(*vpp)) {
+ vnode_t *svp;
+
+ svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+ VN_RELE(*vpp);
+ if (svp == NULL)
+ error = ENOSYS;
+ else
+ *vpp = svp;
+ }
+ VN_RELE(oldvp);
+ } else {
+ ASSERT(rvp->v_count > 1);
+ VN_RELE(rvp);
+ }
+
+ return (error);
+}
+
+static int
+lxd_close(vnode_t *vp, int flag, int count, offset_t offset, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT)
+ return (0);
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_CLOSE(vp, flag, count, offset, cr, ct));
+}
+
+static int
+lxd_read(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT)
+ return (ENOTSUP);
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_READ(vp, uiop, ioflag, cr, ct));
+}
+
+static int
+lxd_write(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT)
+ return (ENOTSUP);
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_WRITE(vp, uiop, ioflag, cr, ct));
+}
+
+static int
+lxd_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, struct cred *cr,
+ int *rvalp, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT)
+ return (ENOTSUP);
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_IOCTL(vp, cmd, arg, flag, cr, rvalp, ct));
+}
+
+static int
+lxd_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT)
+ return (ENOTSUP);
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_SETFL(vp, oflags, nflags, cr, ct));
+}
+
+/*
+ * Translate SunOS devt to Linux devt.
+ */
+static void
+lxd_s2l_devt(dev_t dev, dev_t *rdev)
+{
+ lxd_minor_translator_t *mt;
+ int i, j;
+ major_t maj = getmajor(dev);
+ minor_t min = getminor(dev);
+
+ /* look for a devt translator for this major number */
+ for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) {
+ if (lxd_devt_translators[i].lxd_xl_major == maj)
+ break;
+ }
+
+ if (lxd_devt_translators[i].lxd_xl_driver != NULL) {
+ /* try to translate the illumos devt to a linux devt */
+ switch (lxd_devt_translators[i].lxd_xl_type) {
+ case DTT_INVALID:
+ ASSERT(0);
+ break;
+
+ case DTT_LIST:
+ mt = lxd_devt_translators[i].xl_list;
+ for (j = 0; mt[j].lxd_mt_path != NULL; j++) {
+ if (mt[j].lxd_mt_minor == min) {
+ ASSERT(mt[j].lxd_mt_minor < LX_MAXMIN);
+
+ /* found a translation */
+ *rdev = LX_MAKEDEVICE(
+ mt[j].lxd_mt_lx_major,
+ mt[j].lxd_mt_lx_minor);
+ return;
+ }
+ }
+ break;
+
+ case DTT_CUSTOM:
+ lxd_devt_translators[i].xl_custom(dev, rdev);
+ return;
+ }
+ }
+
+ /* we don't have a translator for this device */
+ *rdev = LX_MAKEDEVICE(maj, min);
+}
+
+static int
+lxd_getattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+ int error;
+ vnode_t *rvp;
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ mutex_enter(&ldn->lxdn_tlock);
+
+ vap->va_type = vp->v_type;
+ vap->va_mode = ldn->lxdn_mode & MODEMASK;
+ vap->va_uid = ldn->lxdn_uid;
+ vap->va_gid = ldn->lxdn_gid;
+ vap->va_fsid = ldn->lxdn_fsid;
+ vap->va_nodeid = (ino64_t)ldn->lxdn_nodeid;
+ vap->va_nlink = ldn->lxdn_nlink;
+ vap->va_size = (u_offset_t)ldn->lxdn_size;
+ vap->va_atime = ldn->lxdn_atime;
+ vap->va_mtime = ldn->lxdn_mtime;
+ vap->va_ctime = ldn->lxdn_ctime;
+ vap->va_blksize = PAGESIZE;
+ vap->va_rdev = 0; /* no devs in front */
+ vap->va_seq = ldn->lxdn_seq;
+
+ vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(
+ vap->va_size)));
+ mutex_exit(&ldn->lxdn_tlock);
+ return (0);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ rvp = REALVP(vp);
+ if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)))
+ return (error);
+
+ /* Skip devt translation for native programs */
+ if (curproc->p_brand != &lx_brand) {
+ return (0);
+ } else {
+ /*
+ * We also skip translation when called from the user-land
+ * emulation code.
+ */
+ lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+
+ if (lwpd == NULL || lwpd->br_stack_mode != LX_STACK_MODE_BRAND)
+ return (0);
+ }
+
+ if (rvp->v_type == VCHR) {
+ dev_t ldev;
+
+ lxd_s2l_devt(vap->va_rdev, &ldev);
+ DTRACE_PROBE3(lxd__devxl, void *, rvp, void *, vap, int, ldev);
+ vap->va_rdev = ldev;
+ }
+
+ return (0);
+}
+
+static int
+lxd_setattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+ lxd_mnt_t *lxdm = VTOLXDM(vp);
+ int res;
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ int error = 0;
+ struct vattr *set;
+ long mask = vap->va_mask;
+
+ /* Cannot set these attributes */
+ if ((mask & AT_NOSET) || (mask & AT_XVATTR) ||
+ (mask & AT_MODE && vap->va_mode & (S_ISUID | S_ISGID)) ||
+ (mask & AT_SIZE))
+ return (EINVAL);
+
+ mutex_enter(&ldn->lxdn_tlock);
+
+ set = &ldn->lxdn_attr;
+ /*
+ * Change file access modes. Must be owner or have sufficient
+ * privileges.
+ */
+ error = secpolicy_vnode_setattr(cr, vp, vap, set, flags,
+ lxd_naccess, ldn);
+ if (error) {
+ mutex_exit(&ldn->lxdn_tlock);
+ return (error);
+ }
+
+ if (mask & AT_MODE) {
+ set->va_mode &= S_IFMT;
+ set->va_mode |= vap->va_mode & ~S_IFMT;
+ }
+
+ if (mask & AT_UID)
+ set->va_uid = vap->va_uid;
+ if (mask & AT_GID)
+ set->va_gid = vap->va_gid;
+ if (mask & AT_ATIME)
+ set->va_atime = vap->va_atime;
+ if (mask & AT_MTIME)
+ set->va_mtime = vap->va_mtime;
+
+ if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
+ gethrestime(&ldn->lxdn_ctime);
+
+ mutex_exit(&ldn->lxdn_tlock);
+ return (error);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ res = VOP_SETATTR(vp, vap, flags, cr, ct);
+ if (res == 0 && (vap->va_mask & (AT_MODE | AT_UID | AT_GID))) {
+ lxd_save_attrs(lxdm, vp);
+ }
+ return (res);
+}
+
+static int
+lxd_access(vnode_t *vp, int mode, int flags, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ int error;
+
+ mutex_enter(&ldn->lxdn_tlock);
+ error = lxd_naccess(ldn, mode, cr);
+ mutex_exit(&ldn->lxdn_tlock);
+ return (error);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ if (mode & VWRITE) {
+ if (vp->v_type == VREG && vn_is_readonly(vp))
+ return (EROFS);
+ }
+ vp = REALVP(vp);
+ return (VOP_ACCESS(vp, mode, flags, cr, ct));
+}
+
+static int
+lxd_fsync(vnode_t *vp, int syncflag, struct cred *cr, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT)
+ return (0);
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_FSYNC(vp, syncflag, cr, ct));
+}
+
+/* ARGSUSED */
+static void
+lxd_front_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+ lxd_mnt_t *lxdm = VTOLXDM(vp);
+
+ ASSERT(ldn->lxdn_type == LXDNT_FRONT);
+ rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+
+ mutex_enter(&ldn->lxdn_tlock);
+ mutex_enter(&vp->v_lock);
+ ASSERT(vp->v_count >= 1);
+
+ /*
+ * If we don't have the last hold or the link count is non-zero,
+ * there's little to do -- just drop our hold.
+ */
+ if (vp->v_count > 1 || ldn->lxdn_nlink != 0) {
+ vp->v_count--;
+
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&ldn->lxdn_tlock);
+ rw_exit(&ldn->lxdn_rwlock);
+ return;
+ }
+
+ /*
+ * We have the last hold *and* the link count is zero, so this node is
+ * dead from the filesystem's viewpoint.
+ */
+ if (ldn->lxdn_size != 0) {
+ if (ldn->lxdn_vnode->v_type == VLNK)
+ kmem_free(ldn->lxdn_symlink, ldn->lxdn_size + 1);
+ }
+
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&ldn->lxdn_tlock);
+
+ vn_invalid(LDNTOV(ldn));
+
+ mutex_enter(&lxdm->lxdm_contents);
+ if (ldn->lxdn_next == NULL)
+ lxdm->lxdm_rootnode->lxdn_prev = ldn->lxdn_prev;
+ else
+ ldn->lxdn_next->lxdn_prev = ldn->lxdn_prev;
+ ldn->lxdn_prev->lxdn_next = ldn->lxdn_next;
+
+ mutex_exit(&lxdm->lxdm_contents);
+ rw_exit(&ldn->lxdn_rwlock);
+ rw_destroy(&ldn->lxdn_rwlock);
+ mutex_destroy(&ldn->lxdn_tlock);
+
+ vn_free(LDNTOV(ldn));
+ kmem_free(ldn, sizeof (lxd_node_t));
+}
+
+/*ARGSUSED*/
+static void
+lxd_inactive(vnode_t *vp, struct cred *cr, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ lxd_front_inactive(vp, cr, ct);
+ return;
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ lxd_free_back_node(ldn);
+}
+
+/* ARGSUSED */
+static int
+lxd_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT)
+ return (ENOTSUP);
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_FID(vp, fidp, ct));
+}
+
+/*
+ * For a front node lookup in the dirent hash table and return a shadow vnode
+ * (lxd_node_t type) of type LXDNT_FRONT.
+ *
+ * For a back node, lookup nm name and return a shadow vnode (lxd_node_t type)
+ * of the real vnode found.
+ */
+static int
+lxd_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+ int flags, vnode_t *rdir, struct cred *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ vnode_t *vp = NULL;
+ int error;
+ vnode_t *realdvp;
+ lxd_mnt_t *lxdm = VTOLXDM(dvp);
+ int doingdotdot = 0;
+ lxd_node_t *ldn = VTOLDN(dvp);
+ lxd_node_t *nldn = NULL;
+
+ /*
+ * First check for a front file, which can exist under either a
+ * front or a back node (e.g. the top-level mount point directory
+ * node is a back node which can have front files created in it).
+ */
+
+ /* disallow extended attrs */
+ if (flags & LOOKUP_XATTR)
+ return (EINVAL);
+
+ /* Null component name is a synonym for dir being searched. */
+ if (*nm == '\0') {
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ return (0);
+ }
+
+ rw_enter(&ldn->lxdn_rwlock, RW_READER);
+ error = lxd_dirlookup(ldn, nm, &nldn, cr);
+ rw_exit(&ldn->lxdn_rwlock);
+
+ if (error == 0) {
+ /* found */
+ ASSERT(nldn != NULL);
+ *vpp = LDNTOV(nldn);
+ return (0);
+ }
+
+ /* Not found; a front dir has no back fs to search, so fail. */
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (ENOENT);
+ }
+
+ realdvp = REALVP(dvp);
+
+ if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
+ doingdotdot++;
+ /*
+ * Handle ".." out of mounted filesystem
+ */
+ while ((realdvp->v_flag & VROOT) && realdvp != rootdir) {
+ realdvp = realdvp->v_vfsp->vfs_vnodecovered;
+ ASSERT(realdvp != NULL);
+ }
+ }
+
+ *vpp = NULL; /* default(error) case */
+
+ /*
+ * Do the normal lookup
+ */
+ if ((error = VOP_LOOKUP(realdvp, nm, &vp, pnp, flags, rdir, cr,
+ ct, direntflags, realpnp)) != 0) {
+ vp = NULL;
+ goto out;
+ }
+
+ /*
+ * We do this check here to avoid returning a stale file handle to the
+ * caller.
+ */
+ if (nm[0] == '.' && nm[1] == '\0') {
+ ASSERT(vp == realdvp);
+ VN_HOLD(dvp);
+ VN_RELE(vp);
+ *vpp = dvp;
+ return (0);
+ }
+
+ if (doingdotdot) {
+ *vpp = lxd_make_back_node(vp, lxdm);
+ return (0);
+ }
+
+ /*
+ * If this vnode is mounted on, then we
+ * traverse to the vnode which is the root of
+ * the mounted file system.
+ */
+ if ((error = traverse(&vp)) != 0)
+ goto out;
+
+ /*
+ * Make a lxd node for the real vnode.
+ */
+ *vpp = lxd_make_back_node(vp, lxdm);
+ if (vp->v_type != VDIR) {
+ if (IS_DEVVP(*vpp)) {
+ vnode_t *svp;
+
+ svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+ VN_RELE(*vpp);
+ if (svp == NULL) {
+ VN_RELE(vp);
+ error = ENOSYS;
+ } else {
+ *vpp = svp;
+ }
+ }
+ return (error);
+ }
+
+out:
+ if (error != 0 && vp != NULL)
+ VN_RELE(vp);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+lxd_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
+ int mode, vnode_t **vpp, struct cred *cr, int flag, caller_context_t *ct,
+ vsecattr_t *vsecp)
+{
+ int error;
+ lxd_node_t *parent = VTOLDN(dvp);
+ lxd_node_t *lnp = NULL;
+
+ rw_enter(&parent->lxdn_rwlock, RW_READER);
+ error = lxd_dirlookup(parent, nm, &lnp, cr);
+ rw_exit(&parent->lxdn_rwlock);
+
+ /*
+ * If a back node already exists then there is no need to pass
+ * the create to native devfs -- just set the vpp to the back
+ * vnode. If the front node already exists then fail because
+ * it can't represent a regular file. In both cases, enforce
+ * open(2)'s EEXIST and EISDIR semantics.
+ */
+ if (error == 0) {
+ if (exclusive == EXCL) {
+ error = EEXIST;
+ } else if (LDNTOV(lnp)->v_type == VDIR &&
+ (mode & S_IWRITE)) {
+ error = EISDIR;
+ } else if (lnp->lxdn_type == LXDNT_FRONT) {
+ error = ENOTSUP;
+ }
+
+ if (error != 0) {
+ ldnode_rele(lnp);
+ return (error);
+ }
+
+ VERIFY3S(lnp->lxdn_type, ==, LXDNT_BACK);
+ *vpp = lnp->lxdn_vnode;
+
+ return (error);
+ }
+
+ /*
+ * We cannot create files in the back devfs but we want to allow for
+ * O_CREAT on existing files. Pass this through and let the back file
+ * system allow or deny it.
+ */
+ if (parent->lxdn_type == LXDNT_BACK) {
+ vnode_t *vp = NULL;
+
+ if (*nm == '\0') {
+ ASSERT(vpp && dvp == *vpp);
+ vp = REALVP(*vpp);
+ }
+ if ((error = VOP_CREATE(REALVP(dvp), nm, va, exclusive, mode,
+ &vp, cr, flag, ct, vsecp)) == 0) {
+ *vpp = lxd_make_back_node(vp, VFSTOLXDM(dvp->v_vfsp));
+ if (IS_DEVVP(*vpp)) {
+ vnode_t *svp;
+
+ svp = specvp(*vpp, (*vpp)->v_rdev,
+ (*vpp)->v_type, cr);
+ VN_RELE(*vpp);
+ if (svp == NULL) {
+ return (ENOSYS);
+ }
+ *vpp = svp;
+ }
+ return (0);
+ }
+ /*
+ * If we were unable to perform the VOP_CREATE for any reason
+ * other than sdev being read-only, we should bail.
+ */
+ if (error != ENOTSUP && error != EROFS) {
+ return (error);
+ }
+ }
+
+ /*
+ * While we don't allow creating data-containing files under
+ * lx devfs, we must allow VSOCK front nodes to be created so
+ * that paths such as /dev/log can be used as AF_UNIX sockets.
+ */
+ if (va->va_type == VSOCK) {
+ lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode);
+
+ lnp = NULL;
+ rw_enter(&parent->lxdn_rwlock, RW_WRITER);
+ error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL,
+ va, &lnp, cr);
+ rw_exit(&parent->lxdn_rwlock);
+
+ if (error == 0) {
+ *vpp = LDNTOV(lnp);
+ } else if (lnp != NULL) {
+ /*
+ * It's possible that a racing process created an entry
+ * at this name since we last performed the lookup.
+ */
+ ldnode_rele(lnp);
+ }
+ } else {
+ error = ENOTSUP;
+ }
+
+ return (error);
+}
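
The VSOCK carve-out in lxd_create is what lets Linux programs bind UNIX-domain
sockets under /dev. A minimal userland sketch of the consumer side, assuming a
standard lx-zone environment (illustrative, not part of the patch):

    #include <sys/socket.h>
    #include <sys/un.h>
    #include <string.h>
    #include <unistd.h>

    /*
     * Bind a datagram socket at /dev/log; under lx devfs this bind is what
     * drives lxd_create() with va_type == VSOCK.
     */
    int
    bind_dev_log(void)
    {
            struct sockaddr_un sun;
            int fd;

            if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0)
                    return (-1);
            (void) memset(&sun, 0, sizeof (sun));
            sun.sun_family = AF_UNIX;
            (void) strncpy(sun.sun_path, "/dev/log",
                sizeof (sun.sun_path) - 1);
            if (bind(fd, (struct sockaddr *)&sun, sizeof (sun)) != 0) {
                    (void) close(fd);
                    return (-1);
            }
            return (fd);
    }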
+
+/* ARGSUSED */
+static int
+lxd_remove(vnode_t *dvp, char *nm, struct cred *cr, caller_context_t *ct,
+ int flags)
+{
+ lxd_node_t *parent = VTOLDN(dvp);
+ lxd_node_t *ldn = NULL;
+ int error;
+
+ /* can only remove existing front nodes */
+ error = lxd_dirlookup(parent, nm, &ldn, cr);
+ if (error) {
+ return (error);
+ }
+
+ ASSERT(ldn != NULL);
+ ASSERT(ldn->lxdn_type == LXDNT_FRONT);
+ rw_enter(&parent->lxdn_rwlock, RW_WRITER);
+ rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+
+ error = lxd_dirdelete(parent, ldn, nm, DR_REMOVE, cr);
+
+ rw_exit(&ldn->lxdn_rwlock);
+ rw_exit(&parent->lxdn_rwlock);
+
+ ldnode_rele(ldn);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+lxd_link(vnode_t *tdvp, vnode_t *vp, char *tnm, struct cred *cr,
+ caller_context_t *ct, int flags)
+{
+ return (ENOTSUP);
+}
+
+/* ARGSUSED */
+static int
+lxd_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, struct cred *cr,
+ caller_context_t *ct, int flags)
+{
+ lxd_node_t *oldparent = VTOLDN(odvp);
+ lxd_node_t *newparent;
+ lxd_mnt_t *lxdm = VTOLXDM(oldparent->lxdn_vnode);
+ lxd_node_t *fromnode = NULL;
+ int error;
+ int samedir = 0;
+
+ if (!vn_matchops(ndvp, lxd_vnodeops)) {
+ /* cannot rename out of this file system */
+ return (EACCES);
+ }
+
+ mutex_enter(&lxdm->lxdm_renamelck);
+
+ newparent = VTOLDN(ndvp);
+
+ /*
+ * We can only rename front nodes.
+ */
+ error = lxd_dirlookup(oldparent, onm, &fromnode, cr);
+ if (error != 0) {
+ /* not found in front */
+ mutex_exit(&lxdm->lxdm_renamelck);
+ return (error);
+ }
+
+ /*
+ * Make sure we can delete the old (source) entry. This
+ * requires write permission on the containing directory. If
+ * that directory is "sticky" it requires further checks.
+ */
+ if ((error = lxd_naccess(oldparent, VWRITE, cr)) != 0)
+ goto done;
+
+ /*
+ * Check for renaming to or from '.' or '..' or that
+ * fromnode == oldparent
+ */
+ if ((onm[0] == '.' &&
+ (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
+ (nnm[0] == '.' &&
+ (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) ||
+ (oldparent == fromnode)) {
+ error = EINVAL;
+ goto done;
+ }
+
+ samedir = (oldparent == newparent);
+
+ /*
+ * Make sure we can search and rename into the destination directory.
+ */
+ if (!samedir) {
+ if ((error = lxd_naccess(newparent, VEXEC|VWRITE, cr)) != 0)
+ goto done;
+ }
+
+ /*
+ * Link source to new target
+ */
+ rw_enter(&newparent->lxdn_rwlock, RW_WRITER);
+ error = lxd_direnter(lxdm, newparent, nnm, DE_RENAME,
+ oldparent, fromnode, (struct vattr *)NULL, (lxd_node_t **)NULL,
+ cr);
+ rw_exit(&newparent->lxdn_rwlock);
+
+ if (error)
+ goto done;
+
+ /*
+ * Unlink from source.
+ */
+ rw_enter(&oldparent->lxdn_rwlock, RW_WRITER);
+ rw_enter(&fromnode->lxdn_rwlock, RW_WRITER);
+
+ error = lxd_dirdelete(oldparent, fromnode, onm, DR_RENAME, cr);
+
+ /*
+ * The following handles the case where our source node was
+ * removed before we got to it.
+ */
+ if (error == ENOENT)
+ error = 0;
+
+ rw_exit(&fromnode->lxdn_rwlock);
+ rw_exit(&oldparent->lxdn_rwlock);
+
+done:
+ ldnode_rele(fromnode);
+ mutex_exit(&lxdm->lxdm_renamelck);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+lxd_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
+ struct cred *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
+{
+ int error;
+ vnode_t *tvp;
+ lxd_node_t *ndir = NULL;
+ lxd_node_t *parent = VTOLDN(dvp);
+ lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode);
+
+ /* check for existence in both front and back */
+ if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) {
+ /* The entry already exists */
+ VN_RELE(tvp);
+ return (EEXIST);
+ }
+
+ /* make front directory */
+ rw_enter(&parent->lxdn_rwlock, RW_WRITER);
+ error = lxd_direnter(lxdm, parent, nm, DE_MKDIR, NULL, NULL,
+ va, &ndir, cr);
+ rw_exit(&parent->lxdn_rwlock);
+
+ if (error != 0) {
+ if (ndir != NULL)
+ ldnode_rele(ndir);
+ } else {
+ *vpp = LDNTOV(ndir);
+ }
+
+ return (error);
+}
+
+static int
+lxd_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ *vpp = vp;
+ return (0);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ while (vn_matchops(vp, lxd_vnodeops))
+ vp = REALVP(vp);
+
+ if (VOP_REALVP(vp, vpp, ct) != 0)
+ *vpp = vp;
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+lxd_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, struct cred *cr,
+ caller_context_t *ct, int flags)
+{
+ int error;
+ lxd_node_t *ldn;
+ struct vnode *vp;
+ lxd_node_t *parent = VTOLDN(dvp);
+
+ /*
+ * Return error if trying to remove . or ..
+ */
+ if (strcmp(nm, ".") == 0)
+ return (EINVAL);
+ if (strcmp(nm, "..") == 0)
+ return (EEXIST);
+
+ error = lxd_dirlookup(VTOLDN(dvp), nm, &ldn, cr);
+ if (error != 0) {
+ /* not found in front */
+ return (error);
+ }
+
+ rw_enter(&parent->lxdn_rwlock, RW_WRITER);
+ rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+
+ vp = LDNTOV(ldn);
+ if (vp == dvp || vp == cdir) {
+ error = EINVAL;
+ goto err;
+ }
+
+ if (ldn->lxdn_vnode->v_type != VDIR) {
+ error = ENOTDIR;
+ goto err;
+ }
+
+ mutex_enter(&ldn->lxdn_tlock);
+ if (ldn->lxdn_nlink > 2) {
+ mutex_exit(&ldn->lxdn_tlock);
+ error = EEXIST;
+ goto err;
+ }
+ mutex_exit(&ldn->lxdn_tlock);
+
+ /* Check for an empty directory */
+ if (ldn->lxdn_dirents > 2) {
+ error = EEXIST;
+ gethrestime(&ldn->lxdn_atime);
+ goto err;
+ }
+
+ if (vn_vfswlock(vp)) {
+ error = EBUSY;
+ goto err;
+ }
+ if (vn_mountedvfs(vp) != NULL) {
+ error = EBUSY;
+ vn_vfsunlock(vp);
+ goto err;
+ }
+
+ error = lxd_dirdelete(parent, ldn, nm, DR_RMDIR, cr);
+ vn_vfsunlock(vp);
+
+err:
+ rw_exit(&ldn->lxdn_rwlock);
+ rw_exit(&parent->lxdn_rwlock);
+ ldnode_rele(ldn);
+
+ return (error);
+}
+
+/* Not static so it can be used during mount. */
+/* ARGSUSED */
+int
+lxd_symlink(vnode_t *dvp, char *nm, struct vattr *tva, char *tnm,
+ struct cred *cr, caller_context_t *ct, int flags)
+{
+ lxd_node_t *parent = VTOLDN(dvp);
+ lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode);
+ lxd_node_t *self = NULL;
+ vnode_t *tvp;
+ char *cp = NULL;
+ int error;
+ size_t len;
+
+ /* this will check for existence in both front and back */
+ if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) {
+ /* The entry already exists */
+ VN_RELE(tvp);
+ return (EEXIST);
+ }
+
+ /* make symlink in the front */
+ rw_enter(&parent->lxdn_rwlock, RW_WRITER);
+ error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL,
+ tva, &self, cr);
+ rw_exit(&parent->lxdn_rwlock);
+
+ if (error) {
+ if (self != NULL)
+ ldnode_rele(self);
+ return (error);
+ }
+
+ len = strlen(tnm) + 1;
+ cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI);
+ if (cp == NULL) {
+ ldnode_rele(self);
+ return (ENOSPC);
+ }
+ (void) strcpy(cp, tnm);
+
+ self->lxdn_symlink = cp;
+ self->lxdn_size = len - 1;
+ ldnode_rele(self);
+
+ return (error);
+}
+
+static int
+lxd_readlink(vnode_t *vp, struct uio *uiop, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ int error;
+
+ if (vp->v_type != VLNK)
+ return (EINVAL);
+
+ rw_enter(&ldn->lxdn_rwlock, RW_READER);
+ error = uiomove(ldn->lxdn_symlink, ldn->lxdn_size, UIO_READ,
+ uiop);
+ gethrestime(&ldn->lxdn_atime);
+ rw_exit(&ldn->lxdn_rwlock);
+ return (error);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_READLINK(vp, uiop, cr, ct));
+}
+
+static int
+lx_merge_front(vnode_t *vp, struct uio *uiop, off_t req_off, int *eofp)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+ struct dirent *sd;
+ lxd_dirent_t *ldp;
+ enum lxd_node_type type = ldn->lxdn_type;
+ ssize_t uresid;
+ off_t front_off;
+ int error = 0;
+ int sdlen;
+
+ /* skip the front entries if the back read was incomplete */
+ if (*eofp == 0)
+ return (0);
+
+ /*
+ * If this was a back node then reading that node has completed and we
+ * may have a partially full uio struct. *eofp should be set to true.
+ * Leave it set since we're likely to hit eof for the front nodes (if
+ * any).
+ */
+
+ front_off = uiop->uio_offset + 1;
+ sdlen = sizeof (struct dirent) + MAXPATHLEN;
+ /* zalloc to ensure we don't have anything in the d_name buffer */
+ sd = (struct dirent *)kmem_zalloc(sdlen, KM_SLEEP);
+ ldp = ldn->lxdn_dir;
+ while (ldp != NULL && (uresid = uiop->uio_resid) > 0) {
+ int namelen;
+ int reclen;
+
+ /*
+ * Skip dot and dotdot for back nodes since we have them
+ * already.
+ */
+ if (type == LXDNT_BACK &&
+ (strcmp(ldp->lddir_name, ".") == 0 ||
+ strcmp(ldp->lddir_name, "..") == 0)) {
+ ldp = ldp->lddir_next;
+ continue;
+ }
+
+ /*
+ * Might have previously had a partial readdir of the front
+ * nodes, and now we're back for more, or we may just be
+ * doing a follow-up readdir after we've previously
+ * returned all front and back nodes.
+ */
+ if (front_off > req_off) {
+ namelen = strlen(ldp->lddir_name); /* no +1 needed */
+ reclen = (int)DIRENT64_RECLEN(namelen);
+
+ /*
+ * If the size of the data to transfer is greater
+ * than that requested, then we can't do it this
+ * transfer.
+ */
+ if (reclen > uresid) {
+ *eofp = 0;
+ /* Buffer too small for any entries. */
+ if (front_off == 0)
+ error = EINVAL;
+ break;
+ }
+
+ (void) strncpy(sd->d_name, ldp->lddir_name,
+ DIRENT64_NAMELEN(reclen));
+ sd->d_reclen = (ushort_t)reclen;
+ sd->d_ino = (ino_t)ldp->lddir_node->lxdn_nodeid;
+ sd->d_off = front_off;
+
+ /* uiomove will adjust iov_base properly */
+ if ((error = uiomove((caddr_t)sd, reclen, UIO_READ,
+ uiop)) != 0) {
+ *eofp = 0;
+ break;
+ }
+ }
+
+ /*
+ * uiomove() above updates both uio_resid and uio_offset by the
+ * same amount but we want uio_offset to change in increments
+ * of 1, which is different from the number of bytes being
+ * returned to the caller, so we set uio_offset explicitly,
+ * ignoring what uiomove() did.
+ */
+ uiop->uio_offset = front_off;
+ front_off++;
+
+ ldp = ldp->lddir_next;
+ }
+
+ kmem_free(sd, sdlen);
+ return (error);
+}
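
The offset arithmetic in lx_merge_front is easiest to see with numbers: front
entries advance uio_offset by exactly one per entry, regardless of the byte
count transferred. A worked example under assumed figures:

    /*
     * Illustrative: the back readdir left uio_offset at 512 and the
     * directory has two front entries. They are emitted with d_off 513
     * and 514. A later readdir with req_off == 513 skips the first
     * (front_off > req_off is false at 513) and re-emits only 514.
     */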
+
+static int
+lxd_readdir(vnode_t *vp, struct uio *uiop, struct cred *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+ vnode_t *rvp;
+ int res;
+ off_t req_off;
+
+ if (uiop->uio_iovcnt != 1)
+ return (EINVAL);
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ req_off = uiop->uio_offset;
+
+ /* First read the back node (if it is one) */
+ if (ldn->lxdn_type == LXDNT_BACK) {
+ rvp = REALVP(vp);
+ res = VOP_READDIR(rvp, uiop, cr, eofp, ct, flags);
+ if (res != 0)
+ return (res);
+ } else {
+ /* setup for merge_front */
+ ASSERT(ldn->lxdn_type == LXDNT_FRONT);
+ /* caller should have already called lxd_rwlock */
+ ASSERT(RW_READ_HELD(&ldn->lxdn_rwlock));
+
+ *eofp = 1;
+ /*
+ * The merge code starts the offset calculation from uio_offset,
+ * which is normally already set to the high value by the back
+ * code, but in this case we need to count up from 0.
+ */
+ uiop->uio_offset = 0;
+ }
+
+ /*
+ * Our back nodes can also have front entries hanging on them so we
+ * need to merge those in. Or, we may simply have a front node (i.e. a
+ * front subdir).
+ */
+ res = lx_merge_front(vp, uiop, req_off, eofp);
+ return (res);
+}
+
+static int
+lxd_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ if (write_lock) {
+ rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+ } else {
+ rw_enter(&ldn->lxdn_rwlock, RW_READER);
+ }
+ return (write_lock);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_RWLOCK(vp, write_lock, ct));
+}
+
+static void
+lxd_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ rw_exit(&ldn->lxdn_rwlock);
+ return;
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ VOP_RWUNLOCK(vp, write_lock, ct);
+}
+
+static int
+lxd_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_SEEK(vp, ooff, noffp, ct));
+}
+
+static int
+lxd_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+ while (vn_matchops(vp1, lxd_vnodeops) &&
+ VTOLDN(vp1)->lxdn_type == LXDNT_BACK) {
+ vp1 = REALVP(vp1);
+ }
+ while (vn_matchops(vp2, lxd_vnodeops) &&
+ VTOLDN(vp2)->lxdn_type == LXDNT_BACK) {
+ vp2 = REALVP(vp2);
+ }
+
+ if (vn_matchops(vp1, lxd_vnodeops) || vn_matchops(vp2, lxd_vnodeops))
+ return (vp1 == vp2);
+
+ return (VOP_CMP(vp1, vp2, ct));
+}
+
+static int
+lxd_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
+ struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_FRLOCK(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
+}
+
+static int
+lxd_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
+ struct cred *cr, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_SPACE(vp, cmd, bfp, flag, offset, cr, ct));
+}
+
+static int
+lxd_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *prot,
+ struct page *parr[], size_t psz, struct seg *seg, caddr_t addr,
+ enum seg_rw rw, struct cred *cr, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_GETPAGE(vp, off, len, prot, parr, psz, seg, addr, rw, cr,
+ ct));
+}
+
+static int
+lxd_putpage(vnode_t *vp, offset_t off, size_t len, int flags, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_PUTPAGE(vp, off, len, flags, cr, ct));
+}
+
+static int
+lxd_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len,
+ uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_MAP(vp, off, as, addrp, len, prot, maxprot, flags, cr, ct));
+}
+
+static int
+lxd_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len,
+ uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_ADDMAP(vp, off, as, addr, len, prot, maxprot, flags, cr,
+ ct));
+}
+
+static int
+lxd_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len,
+ uint_t prot, uint_t maxprot, uint_t flags, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_DELMAP(vp, off, as, addr, len, prot, maxprot, flags, cr,
+ ct));
+}
+
+static int
+lxd_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_POLL(vp, events, anyyet, reventsp, phpp, ct));
+}
+
+static int
+lxd_dump(vnode_t *vp, caddr_t addr, offset_t bn, offset_t count,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_DUMP(vp, addr, bn, count, ct));
+}
+
+static int
+lxd_pathconf(vnode_t *vp, int cmd, ulong_t *valp, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_PATHCONF(vp, cmd, valp, cr, ct));
+}
+
+static int
+lxd_pageio(vnode_t *vp, struct page *pp, u_offset_t io_off, size_t io_len,
+ int flags, cred_t *cr, caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_PAGEIO(vp, pp, io_off, io_len, flags, cr, ct));
+}
+
+static void
+lxd_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return;
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ if (vp != NULL && !VN_ISKAS(vp))
+ VOP_DISPOSE(vp, pp, fl, dn, cr, ct);
+}
+
+static int
+lxd_setsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (ENOSYS);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ if (vn_is_readonly(vp))
+ return (EROFS);
+
+ vp = REALVP(vp);
+ return (VOP_SETSECATTR(vp, secattr, flags, cr, ct));
+}
+
+static int
+lxd_getsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (ENOSYS);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_GETSECATTR(vp, secattr, flags, cr, ct));
+}
+
+static int
+lxd_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxd_node_t *ldn = VTOLDN(vp);
+
+ if (ldn->lxdn_type == LXDNT_FRONT) {
+ return (EINVAL);
+ }
+
+ ASSERT(ldn->lxdn_type == LXDNT_BACK);
+ vp = REALVP(vp);
+ return (VOP_SHRLOCK(vp, cmd, shr, flag, cr, ct));
+}
+
+/*
+ * Loopback vnode operations vector.
+ */
+
+struct vnodeops *lxd_vnodeops;
+
+const fs_operation_def_t lxd_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = lxd_open },
+ VOPNAME_CLOSE, { .vop_close = lxd_close },
+ VOPNAME_READ, { .vop_read = lxd_read },
+ VOPNAME_WRITE, { .vop_write = lxd_write },
+ VOPNAME_IOCTL, { .vop_ioctl = lxd_ioctl },
+ VOPNAME_SETFL, { .vop_setfl = lxd_setfl },
+ VOPNAME_GETATTR, { .vop_getattr = lxd_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = lxd_setattr },
+ VOPNAME_ACCESS, { .vop_access = lxd_access },
+ VOPNAME_LOOKUP, { .vop_lookup = lxd_lookup },
+ VOPNAME_CREATE, { .vop_create = lxd_create },
+ VOPNAME_REMOVE, { .vop_remove = lxd_remove },
+ VOPNAME_LINK, { .vop_link = lxd_link },
+ VOPNAME_RENAME, { .vop_rename = lxd_rename },
+ VOPNAME_MKDIR, { .vop_mkdir = lxd_mkdir },
+ VOPNAME_RMDIR, { .vop_rmdir = lxd_rmdir },
+ VOPNAME_READDIR, { .vop_readdir = lxd_readdir },
+ VOPNAME_SYMLINK, { .vop_symlink = lxd_symlink },
+ VOPNAME_READLINK, { .vop_readlink = lxd_readlink },
+ VOPNAME_FSYNC, { .vop_fsync = lxd_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = lxd_inactive },
+ VOPNAME_FID, { .vop_fid = lxd_fid },
+ VOPNAME_RWLOCK, { .vop_rwlock = lxd_rwlock },
+ VOPNAME_RWUNLOCK, { .vop_rwunlock = lxd_rwunlock },
+ VOPNAME_SEEK, { .vop_seek = lxd_seek },
+ VOPNAME_CMP, { .vop_cmp = lxd_cmp },
+ VOPNAME_FRLOCK, { .vop_frlock = lxd_frlock },
+ VOPNAME_SPACE, { .vop_space = lxd_space },
+ VOPNAME_REALVP, { .vop_realvp = lxd_realvp },
+ VOPNAME_GETPAGE, { .vop_getpage = lxd_getpage },
+ VOPNAME_PUTPAGE, { .vop_putpage = lxd_putpage },
+ VOPNAME_MAP, { .vop_map = lxd_map },
+ VOPNAME_ADDMAP, { .vop_addmap = lxd_addmap },
+ VOPNAME_DELMAP, { .vop_delmap = lxd_delmap },
+ VOPNAME_POLL, { .vop_poll = lxd_poll },
+ VOPNAME_DUMP, { .vop_dump = lxd_dump },
+ VOPNAME_DUMPCTL, { .error = fs_error },
+ VOPNAME_PATHCONF, { .vop_pathconf = lxd_pathconf },
+ VOPNAME_PAGEIO, { .vop_pageio = lxd_pageio },
+ VOPNAME_DISPOSE, { .vop_dispose = lxd_dispose },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = lxd_setsecattr },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = lxd_getsecattr },
+ VOPNAME_SHRLOCK, { .vop_shrlock = lxd_shrlock },
+ NULL, NULL
+};
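
A template like this is normally instantiated with vn_make_ops() when the
filesystem initializes. A minimal sketch; the function name and fsname string
below are hypothetical, as the actual call site lives elsewhere in this patch:

    /* Illustrative init-time instantiation of the template above. */
    int
    lxd_vnops_init(void)
    {
            int error;

            error = vn_make_ops("lx_devfs", lxd_vnodeops_template,
                &lxd_vnodeops);
            if (error != 0)
                    cmn_err(CE_WARN, "lxd: bad vnode ops template");
            return (error);
    }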
diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c
new file mode 100644
index 0000000000..de5a16c414
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c
@@ -0,0 +1,499 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+
+#include <sys/modctl.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stat.h>
+#include <sys/conf.h>
+#include <sys/frame.h>
+#include <sys/dtrace.h>
+#include <sys/dtrace_impl.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+
+/*
+ * We store the syscall number in the low 16 bits (which limits us to 64k
+ * syscalls). Bit 16 distinguishes entry probes from return probes and
+ * bit 20 distinguishes 64-bit syscalls from 32-bit ones.
+ */
+#define SCALL_MASK 0xffff
+#define ENTRY_FLAG 0x10000
+#define SYSC_64_BIT 0x100000
+
+#define LX_SYSTRACE_IS64BIT(x) ((int)(x) & SYSC_64_BIT)
+#define LX_SYSTRACE_ISENTRY(x) ((int)(x) & ENTRY_FLAG)
+#define LX_SYSTRACE_SYSNUM(x) ((int)(x) & SCALL_MASK)
+
+#define LX_SYSTRACE32_ENTRY(id) (ENTRY_FLAG | (id))
+#define LX_SYSTRACE32_RETURN(id) (id)
+
+#define LX_SYSTRACE64_ENTRY(id) (SYSC_64_BIT | ENTRY_FLAG | (id))
+#define LX_SYSTRACE64_RETURN(id) (SYSC_64_BIT | (id))
+
+#define LX_SYSTRACE_ENTRY_AFRAMES 2
+#define LX_SYSTRACE_RETURN_AFRAMES 4
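
A worked encoding example for the flag layout above:

    /*
     * For 64-bit syscall number 59 (execve on Linux x86_64):
     *   LX_SYSTRACE64_ENTRY(59)  == 0x100000 | 0x10000 | 59 == 0x11003b
     *   LX_SYSTRACE64_RETURN(59) == 0x100000 | 59           == 0x10003b
     *   LX_SYSTRACE_SYSNUM(0x11003b) == 59, and both
     *   LX_SYSTRACE_ISENTRY(0x11003b) and LX_SYSTRACE_IS64BIT(0x11003b)
     *   are non-zero.
     */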
+
+typedef struct lx_systrace_sysent {
+ const char *lss_name;
+ dtrace_id_t lss_entry;
+ dtrace_id_t lss_return;
+} lx_systrace_sysent_t;
+
+static dev_info_t *lx_systrace_devi;
+static dtrace_provider_id_t lx_systrace_id;
+static kmutex_t lx_systrace_lock;
+static uint_t lx_systrace_nenabled;
+
+static int lx_systrace_nsysent32;
+static lx_systrace_sysent_t *lx_systrace_sysent32;
+
+#if defined(_LP64)
+static int lx_systrace_nsysent64;
+static lx_systrace_sysent_t *lx_systrace_sysent64;
+#endif
+
+/*ARGSUSED*/
+static void
+lx_systrace_entry(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2,
+ ulong_t arg3, ulong_t arg4, ulong_t arg5)
+{
+ dtrace_id_t id;
+
+#if defined(_LP64)
+ if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) {
+ if (sysnum >= lx_systrace_nsysent64)
+ return;
+ id = lx_systrace_sysent64[sysnum].lss_entry;
+ } else
+#endif
+ {
+ if (sysnum >= lx_systrace_nsysent32)
+ return;
+ id = lx_systrace_sysent32[sysnum].lss_entry;
+ }
+
+ if (id == DTRACE_IDNONE)
+ return;
+ dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
+}
+
+/*ARGSUSED*/
+static void
+lx_systrace_return(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2,
+ ulong_t arg3, ulong_t arg4, ulong_t arg5)
+{
+ dtrace_id_t id;
+
+#if defined(_LP64)
+ if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) {
+ if (sysnum >= lx_systrace_nsysent64)
+ return;
+ id = lx_systrace_sysent64[sysnum].lss_return;
+ } else
+#endif
+ {
+ if (sysnum >= lx_systrace_nsysent32)
+ return;
+ id = lx_systrace_sysent32[sysnum].lss_return;
+ }
+
+ if (id == DTRACE_IDNONE)
+ return;
+ dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
+}
+
+/*ARGSUSED*/
+static void
+lx_systrace_provide(void *arg, const dtrace_probedesc_t *desc)
+{
+ int i;
+
+ if (desc != NULL)
+ return;
+
+ for (i = 0; i < lx_systrace_nsysent32; i++) {
+ if (dtrace_probe_lookup(lx_systrace_id, "sys32",
+ lx_systrace_sysent32[i].lss_name, "entry") != 0)
+ continue;
+
+ (void) dtrace_probe_create(lx_systrace_id, "sys32",
+ lx_systrace_sysent32[i].lss_name, "entry",
+ LX_SYSTRACE_ENTRY_AFRAMES,
+ (void *)((uintptr_t)LX_SYSTRACE32_ENTRY(i)));
+
+ (void) dtrace_probe_create(lx_systrace_id, "sys32",
+ lx_systrace_sysent32[i].lss_name, "return",
+ LX_SYSTRACE_RETURN_AFRAMES,
+ (void *)((uintptr_t)LX_SYSTRACE32_RETURN(i)));
+
+ lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE;
+ lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE;
+ }
+
+#if defined(_LP64)
+ for (i = 0; i < lx_systrace_nsysent64; i++) {
+ if (dtrace_probe_lookup(lx_systrace_id, "sys64",
+ lx_systrace_sysent64[i].lss_name, "entry") != 0)
+ continue;
+
+ (void) dtrace_probe_create(lx_systrace_id, "sys64",
+ lx_systrace_sysent64[i].lss_name, "entry",
+ LX_SYSTRACE_ENTRY_AFRAMES,
+ (void *)((uintptr_t)LX_SYSTRACE64_ENTRY(i)));
+
+ (void) dtrace_probe_create(lx_systrace_id, "sys64",
+ lx_systrace_sysent64[i].lss_name, "return",
+ LX_SYSTRACE_RETURN_AFRAMES,
+ (void *)((uintptr_t)LX_SYSTRACE64_RETURN(i)));
+
+ lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE;
+ lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE;
+ }
+#endif
+}
+
+/*ARGSUSED*/
+static int
+lx_systrace_enable(void *arg, dtrace_id_t id, void *parg)
+{
+ int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg);
+
+ mutex_enter(&lx_systrace_lock);
+ if (lx_systrace_nenabled++ == 0)
+ lx_brand_systrace_enable();
+ mutex_exit(&lx_systrace_lock);
+
+#if defined(_LP64)
+ if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) {
+ ASSERT(sysnum < lx_systrace_nsysent64);
+
+ if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) {
+ lx_systrace_sysent64[sysnum].lss_entry = id;
+ } else {
+ lx_systrace_sysent64[sysnum].lss_return = id;
+ }
+ } else
+#endif
+ {
+ ASSERT(sysnum < lx_systrace_nsysent32);
+
+ if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) {
+ lx_systrace_sysent32[sysnum].lss_entry = id;
+ } else {
+ lx_systrace_sysent32[sysnum].lss_return = id;
+ }
+ }
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+lx_systrace_disable(void *arg, dtrace_id_t id, void *parg)
+{
+ int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg);
+
+#if defined(_LP64)
+ if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) {
+ ASSERT(sysnum < lx_systrace_nsysent64);
+
+ if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) {
+ lx_systrace_sysent64[sysnum].lss_entry = DTRACE_IDNONE;
+ } else {
+ lx_systrace_sysent64[sysnum].lss_return = DTRACE_IDNONE;
+ }
+ } else
+#endif
+ {
+ ASSERT(sysnum < lx_systrace_nsysent32);
+
+ if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) {
+ lx_systrace_sysent32[sysnum].lss_entry = DTRACE_IDNONE;
+ } else {
+ lx_systrace_sysent32[sysnum].lss_return = DTRACE_IDNONE;
+ }
+ }
+
+ mutex_enter(&lx_systrace_lock);
+ if (--lx_systrace_nenabled == 0)
+ lx_brand_systrace_disable();
+ mutex_exit(&lx_systrace_lock);
+}
+
+/*ARGSUSED*/
+static void
+lx_systrace_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+}
+
+/*ARGSUSED*/
+static uint64_t
+lx_systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+ int aframes)
+{
+ struct frame *fp = (struct frame *)dtrace_getfp();
+ uintptr_t *stack;
+ uint64_t val = 0;
+ int i;
+
+ if (argno >= 6)
+ return (0);
+
+ /*
+ * Walk the four frames down the stack to the entry or return callback.
+ * Our callback calls dtrace_probe() which calls dtrace_dif_variable()
+ * which invokes this function to get the extended arguments. We got
+ * the starting frame pointer via the dtrace_getfp() call above, which
+ * makes for four frames.
+ */
+ for (i = 0; i < 4; i++) {
+ fp = (struct frame *)fp->fr_savfp;
+ }
+
+ stack = (uintptr_t *)&fp[1];
+
+ /*
+ * Skip the first argument to the callback -- the system call number.
+ */
+ argno++;
+
+#ifdef __amd64
+ /*
+ * On amd64, the first 6 arguments are passed in registers while
+ * subsequent arguments are on the stack.
+ */
+ argno -= 6;
+#endif
+
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ val = stack[argno];
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+
+ return (val);
+}
+
+
+static const dtrace_pattr_t lx_systrace_attr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+};
+
+static dtrace_pops_t lx_systrace_pops = {
+ lx_systrace_provide,
+ NULL,
+ lx_systrace_enable,
+ lx_systrace_disable,
+ NULL,
+ NULL,
+ NULL,
+ lx_systrace_getarg,
+ NULL,
+ lx_systrace_destroy
+};
+
+static int
+lx_systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+{
+ int i;
+
+ switch (cmd) {
+ case DDI_ATTACH:
+ break;
+ case DDI_RESUME:
+ return (DDI_SUCCESS);
+ default:
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_create_minor_node(devi, "lx_systrace", S_IFCHR,
+ 0, DDI_PSEUDO, NULL) == DDI_FAILURE ||
+ dtrace_register("lx-syscall", &lx_systrace_attr,
+ DTRACE_PRIV_USER, 0, &lx_systrace_pops, NULL,
+ &lx_systrace_id) != 0) {
+ ddi_remove_minor_node(devi, NULL);
+ return (DDI_FAILURE);
+ }
+
+ ddi_report_dev(devi);
+ lx_systrace_devi = devi;
+
+ /*
+ * Initialize the 32-bit table.
+ */
+ VERIFY(lx_nsysent32 > 0);
+ lx_systrace_nsysent32 = lx_nsysent32;
+ lx_systrace_sysent32 = kmem_zalloc(lx_systrace_nsysent32 *
+ sizeof (lx_systrace_sysent_t), KM_SLEEP);
+
+ for (i = 0; i < lx_systrace_nsysent32; i++) {
+ lx_systrace_sysent32[i].lss_name = lx_sysent32[i].sy_name;
+ lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE;
+ lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE;
+ }
+
+#if defined(_LP64)
+ /*
+ * Initialize the 64-bit table.
+ */
+ VERIFY(lx_nsysent64 > 0);
+ lx_systrace_nsysent64 = lx_nsysent64;
+ lx_systrace_sysent64 = kmem_zalloc(lx_systrace_nsysent64 *
+ sizeof (lx_systrace_sysent_t), KM_SLEEP);
+
+ for (i = 0; i < lx_systrace_nsysent64; i++) {
+ lx_systrace_sysent64[i].lss_name = lx_sysent64[i].sy_name;
+ lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE;
+ lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE;
+ }
+#endif
+
+ /*
+ * Install probe triggers.
+ */
+ lx_systrace_entry_ptr = lx_systrace_entry;
+ lx_systrace_return_ptr = lx_systrace_return;
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+lx_systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
+{
+ switch (cmd) {
+ case DDI_DETACH:
+ break;
+ case DDI_SUSPEND:
+ return (DDI_SUCCESS);
+ default:
+ return (DDI_FAILURE);
+ }
+
+ if (dtrace_unregister(lx_systrace_id) != 0)
+ return (DDI_FAILURE);
+
+ /*
+ * Free tables.
+ */
+ kmem_free(lx_systrace_sysent32, lx_systrace_nsysent32 *
+ sizeof (lx_systrace_sysent_t));
+ lx_systrace_sysent32 = NULL;
+ lx_systrace_nsysent32 = 0;
+
+#if defined(_LP64)
+ kmem_free(lx_systrace_sysent64, lx_systrace_nsysent64 *
+ sizeof (lx_systrace_sysent_t));
+ lx_systrace_sysent64 = NULL;
+ lx_systrace_nsysent64 = 0;
+#endif
+
+ /*
+ * Reset probe triggers.
+ */
+ lx_systrace_entry_ptr = NULL;
+ lx_systrace_return_ptr = NULL;
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+lx_systrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
+{
+ return (0);
+}
+
+static struct cb_ops lx_systrace_cb_ops = {
+ lx_systrace_open, /* open */
+ nodev, /* close */
+ nulldev, /* strategy */
+ nulldev, /* print */
+ nodev, /* dump */
+ nodev, /* read */
+ nodev, /* write */
+ nodev, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ ddi_prop_op, /* cb_prop_op */
+ 0, /* streamtab */
+ D_NEW | D_MP /* Driver compatibility flag */
+};
+
+static struct dev_ops lx_systrace_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* refcnt */
+ ddi_getinfo_1to1, /* get_dev_info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ lx_systrace_attach, /* attach */
+ lx_systrace_detach, /* detach */
+ nodev, /* reset */
+ &lx_systrace_cb_ops, /* driver operations */
+ NULL, /* bus operations */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed, /* quiesce */
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+static struct modldrv modldrv = {
+ &mod_driverops, /* module type (this is a pseudo driver) */
+ "Linux Brand System Call Tracing", /* name of module */
+ &lx_systrace_ops /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ (void *)&modldrv,
+ NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf
new file mode 100644
index 0000000000..e4499c8a5b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf
@@ -0,0 +1,27 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+name="lx_systrace" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/brand/lx/io/lx_netlink.c b/usr/src/uts/common/brand/lx/io/lx_netlink.c
new file mode 100644
index 0000000000..76d68f5921
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/io/lx_netlink.c
@@ -0,0 +1,2232 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Compatibility for the Linux netlink(7) kernel/user transport, as well as
+ * for in-kernel netlink(7) providers like rtnetlink(7). See RFC 3549 for
+ * details of the protocol, and the Linux man pages for details of the Linux
+ * implementation that we're mimicking.
+ */
+
+#include <sys/strsubr.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/strsun.h>
+#include <sys/tihdr.h>
+#include <sys/sockio.h>
+#include <sys/brand.h>
+#include <sys/debug.h>
+#include <sys/ucred.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/ip_impl.h>
+#include <inet/ip_ire.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_socket.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_audit.h>
+#include <sys/ethernet.h>
+#include <sys/dlpi.h>
+#include <sys/policy.h>
+#include <sys/ddi.h>
+
+/*
+ * Flags in the netlink header; see Linux include/uapi/linux/netlink.h.
+ * The final three are the additional flags used on "GET" requests.
+ */
+#define LX_NETLINK_NLM_F_REQUEST 1
+#define LX_NETLINK_NLM_F_MULTI 2
+#define LX_NETLINK_NLM_F_ACK 4
+#define LX_NETLINK_NLM_F_ECHO 8
+#define LX_NETLINK_NLM_F_DUMP_INTR 16
+#define LX_NETLINK_NLM_F_ROOT 0x100
+#define LX_NETLINK_NLM_F_MATCH 0x200
+#define LX_NETLINK_NLM_F_ATOMIC 0x400
+
+/*
+ * Generic message type constants
+ */
+#define LX_NETLINK_NLMSG_NONE 0
+#define LX_NETLINK_NLMSG_NOOP 1
+#define LX_NETLINK_NLMSG_ERROR 2
+#define LX_NETLINK_NLMSG_DONE 3
+#define LX_NETLINK_NLMSG_OVERRUN 4
+
+/*
+ * Protocol constants.
+ */
+#define LX_NETLINK_ROUTE 0
+#define LX_NETLINK_UNUSED 1
+#define LX_NETLINK_USERSOCK 2
+#define LX_NETLINK_FIREWALL 3
+#define LX_NETLINK_SOCK_DIAG 4
+#define LX_NETLINK_NFLOG 5
+#define LX_NETLINK_XFRM 6
+#define LX_NETLINK_SELINUX 7
+#define LX_NETLINK_ISCSI 8
+#define LX_NETLINK_AUDIT 9
+#define LX_NETLINK_FIB_LOOKUP 10
+#define LX_NETLINK_CONNECTOR 11
+#define LX_NETLINK_NETFILTER 12
+#define LX_NETLINK_IP6_FW 13
+#define LX_NETLINK_DNRTMSG 14
+#define LX_NETLINK_KOBJECT_UEVENT 15
+#define LX_NETLINK_GENERIC 16
+#define LX_NETLINK_SCSITRANSPORT 18
+#define LX_NETLINK_ECRYPTFS 19
+#define LX_NETLINK_RDMA 20
+#define LX_NETLINK_CRYPTO 21
+
+/*
+ * rtnetlink(7) attribute-related constants
+ */
+#define LX_NETLINK_NLA_ALIGNTO 4
+
+#define LX_NETLINK_RTM_NEWLINK 16
+#define LX_NETLINK_RTM_DELLINK 17
+#define LX_NETLINK_RTM_GETLINK 18
+#define LX_NETLINK_RTM_SETLINK 19
+#define LX_NETLINK_RTM_NEWADDR 20
+#define LX_NETLINK_RTM_DELADDR 21
+#define LX_NETLINK_RTM_GETADDR 22
+#define LX_NETLINK_RTM_NEWROUTE 24
+#define LX_NETLINK_RTM_DELROUTE 25
+#define LX_NETLINK_RTM_GETROUTE 26
+#define LX_NETLINK_RTM_NEWNEIGH 28
+#define LX_NETLINK_RTM_DELNEIGH 29
+#define LX_NETLINK_RTM_GETNEIGH 30
+#define LX_NETLINK_RTM_NEWRULE 32
+#define LX_NETLINK_RTM_DELRULE 33
+#define LX_NETLINK_RTM_GETRULE 34
+#define LX_NETLINK_RTM_NEWQDISC 36
+#define LX_NETLINK_RTM_DELQDISC 37
+#define LX_NETLINK_RTM_GETQDISC 38
+#define LX_NETLINK_RTM_NEWTCLASS 40
+#define LX_NETLINK_RTM_DELTCLASS 41
+#define LX_NETLINK_RTM_GETTCLASS 42
+#define LX_NETLINK_RTM_NEWTFILTER 44
+#define LX_NETLINK_RTM_DELTFILTER 45
+#define LX_NETLINK_RTM_GETTFILTER 46
+#define LX_NETLINK_RTM_NEWACTION 48
+#define LX_NETLINK_RTM_DELACTION 49
+#define LX_NETLINK_RTM_GETACTION 50
+#define LX_NETLINK_RTM_NEWPREFIX 52
+#define LX_NETLINK_RTM_GETMULTICAST 58
+#define LX_NETLINK_RTM_GETANYCAST 62
+#define LX_NETLINK_RTM_NEWNEIGHTBL 64
+#define LX_NETLINK_RTM_GETNEIGHTBL 66
+#define LX_NETLINK_RTM_SETNEIGHTBL 67
+#define LX_NETLINK_RTM_NEWNDUSEROPT 68
+#define LX_NETLINK_RTM_NEWADDRLABEL 72
+#define LX_NETLINK_RTM_DELADDRLABEL 73
+#define LX_NETLINK_RTM_GETADDRLABEL 74
+#define LX_NETLINK_RTM_GETDCB 78
+#define LX_NETLINK_RTM_SETDCB 79
+#define LX_NETLINK_RTM_NEWNETCONF 80
+#define LX_NETLINK_RTM_GETNETCONF 82
+#define LX_NETLINK_RTM_NEWMDB 84
+#define LX_NETLINK_RTM_DELMDB 85
+#define LX_NETLINK_RTM_GETMDB 86
+#define LX_NETLINK_RTM_MAX 87
+
+/*
+ * rtnetlink(7) attribute constants
+ */
+#define LX_NETLINK_RTA_UNSPEC 0
+#define LX_NETLINK_RTA_DST 1
+#define LX_NETLINK_RTA_SRC 2
+#define LX_NETLINK_RTA_IIF 3
+#define LX_NETLINK_RTA_OIF 4
+#define LX_NETLINK_RTA_GATEWAY 5
+#define LX_NETLINK_RTA_PRIORITY 6
+#define LX_NETLINK_RTA_PREFSRC 7
+#define LX_NETLINK_RTA_METRICS 8
+#define LX_NETLINK_RTA_MULTIPATH 9
+#define LX_NETLINK_RTA_PROTOINFO 10
+#define LX_NETLINK_RTA_FLOW 11
+#define LX_NETLINK_RTA_CACHEINFO 12
+#define LX_NETLINK_RTA_SESSION 13
+#define LX_NETLINK_RTA_MP_ALGO 14
+#define LX_NETLINK_RTA_TABLE 15
+#define LX_NETLINK_RTA_MARK 16
+#define LX_NETLINK_RTA_MFC_STATS 17
+#define LX_NETLINK_MAX_RTA LX_NETLINK_RTA_MFC_STATS
+
+/*
+ * rtnetlink(7) NEWLINK/DELLINK/GETLINK constants
+ */
+#define LX_NETLINK_IFLA_UNSPEC 0
+#define LX_NETLINK_IFLA_ADDRESS 1
+#define LX_NETLINK_IFLA_BROADCAST 2
+#define LX_NETLINK_IFLA_IFNAME 3
+#define LX_NETLINK_IFLA_MTU 4
+#define LX_NETLINK_IFLA_LINK 5
+#define LX_NETLINK_IFLA_QDISC 6
+#define LX_NETLINK_IFLA_STATS 7
+#define LX_NETLINK_IFLA_COST 8
+#define LX_NETLINK_IFLA_PRIORITY 9
+#define LX_NETLINK_IFLA_MASTER 10
+#define LX_NETLINK_IFLA_WIRELESS 11
+#define LX_NETLINK_IFLA_PROTINFO 12
+#define LX_NETLINK_IFLA_TXQLEN 13
+#define LX_NETLINK_IFLA_MAP 14
+#define LX_NETLINK_IFLA_WEIGHT 15
+#define LX_NETLINK_IFLA_OPERSTATE 16
+#define LX_NETLINK_IFLA_LINKMODE 17
+#define LX_NETLINK_IFLA_LINKINFO 18
+#define LX_NETLINK_IFLA_NET_NS_PID 19
+#define LX_NETLINK_IFLA_IFALIAS 20
+#define LX_NETLINK_IFLA_NUM_VF 21
+#define LX_NETLINK_IFLA_VFINFO_LIST 22
+#define LX_NETLINK_IFLA_STATS64 23
+#define LX_NETLINK_IFLA_VF_PORTS 24
+#define LX_NETLINK_IFLA_PORT_SELF 25
+#define LX_NETLINK_IFLA_AF_SPEC 26
+#define LX_NETLINK_IFLA_GROUP 27
+#define LX_NETLINK_IFLA_NET_NS_FD 28
+#define LX_NETLINK_IFLA_EXT_MASK 29
+#define LX_NETLINK_IFLA_PROMISCUITY 30
+#define LX_NETLINK_IFLA_NUM_TX_QUEUES 31
+#define LX_NETLINK_IFLA_NUM_RX_QUEUES 32
+#define LX_NETLINK_IFLA_CARRIER 33
+#define LX_NETLINK_IFLA_PHYS_PORT_ID 34
+#define LX_NETLINK_IFLA_CARRIER_CHANGES 35
+#define LX_NETLINK_IFLA_MAX 36
+
+/*
+ * rtnetlink(7) NEWADDR/DELADDR/GETADDR constants
+ */
+#define LX_NETLINK_IFA_UNSPEC 0
+#define LX_NETLINK_IFA_ADDRESS 1
+#define LX_NETLINK_IFA_LOCAL 2
+#define LX_NETLINK_IFA_LABEL 3
+#define LX_NETLINK_IFA_BROADCAST 4
+#define LX_NETLINK_IFA_ANYCAST 5
+#define LX_NETLINK_IFA_CACHEINFO 6
+#define LX_NETLINK_IFA_MULTICAST 7
+#define LX_NETLINK_IFA_FLAGS 8
+#define LX_NETLINK_IFA_MAX 9
+
+#define LX_NETLINK_IFA_F_SECONDARY 0x01
+#define LX_NETLINK_IFA_F_TEMPORARY LX_NETLINK_IFA_F_SECONDARY
+#define LX_NETLINK_IFA_F_NODAD 0x02
+#define LX_NETLINK_IFA_F_OPTIMISTIC 0x04
+#define LX_NETLINK_IFA_F_DADFAILED 0x08
+#define LX_NETLINK_IFA_F_HOMEADDRESS 0x10
+#define LX_NETLINK_IFA_F_DEPRECATED 0x20
+#define LX_NETLINK_IFA_F_TENTATIVE 0x40
+#define LX_NETLINK_IFA_F_PERMANENT 0x80
+#define LX_NETLINK_IFA_F_MANAGETEMPADDR 0x100
+#define LX_NETLINK_IFA_F_NOPREFIXROUTE 0x200
+
+/*
+ * Linux interface flags.
+ */
+#define LX_IFF_UP (1<<0)
+#define LX_IFF_BROADCAST (1<<1)
+#define LX_IFF_DEBUG (1<<2)
+#define LX_IFF_LOOPBACK (1<<3)
+#define LX_IFF_POINTOPOINT (1<<4)
+#define LX_IFF_NOTRAILERS (1<<5)
+#define LX_IFF_RUNNING (1<<6)
+#define LX_IFF_NOARP (1<<7)
+#define LX_IFF_PROMISC (1<<8)
+#define LX_IFF_ALLMULTI (1<<9)
+#define LX_IFF_MASTER (1<<10)
+#define LX_IFF_SLAVE (1<<11)
+#define LX_IFF_MULTICAST (1<<12)
+#define LX_IFF_PORTSEL (1<<13)
+#define LX_IFF_AUTOMEDIA (1<<14)
+#define LX_IFF_DYNAMIC (1<<15)
+#define LX_IFF_LOWER_UP (1<<16)
+#define LX_IFF_DORMANT (1<<17)
+#define LX_IFF_ECHO (1<<18)
+
+/* rtm_table */
+#define LX_ROUTE_TABLE_MAIN 254
+
+/* rtm_type */
+#define LX_RTN_UNSPEC 0
+#define LX_RTN_UNICAST 1
+#define LX_RTN_LOCAL 2
+#define LX_RTN_BROADCAST 3
+#define LX_RTN_ANYCAST 4
+#define LX_RTN_MULTICAST 5
+#define LX_RTN_BLACKHOLE 6
+#define LX_RTN_UNREACHABLE 7
+#define LX_RTN_PROHIBIT 8
+#define LX_RTN_THROW 9
+#define LX_RTN_NAT 10
+#define LX_RTN_XRESOLVE 11
+
+/* rtm_protocol */
+#define LX_RTPROT_UNSPEC 0
+#define LX_RTPROT_REDIRECT 1 /* From ICMP redir */
+#define LX_RTPROT_KERNEL 2 /* From kernel */
+#define LX_RTPROT_BOOT 3 /* From boot */
+#define LX_RTPROT_STATIC 4 /* From administrator */
+#define LX_RTPROT_NULL 0xff /* Uninitialized */
+
+/* rtm_scope */
+#define LX_RTSCOPE_UNIVERSE 0
+#define LX_RTSCOPE_SITE 200
+#define LX_RTSCOPE_LINK 253
+#define LX_RTSCOPE_HOST 254
+#define LX_RTSCOPE_NOWHERE 255
+
+/*
+ * Audit message types (lxnh_type in the lx_netlink_hdr_t msg header)
+ * See Linux include/uapi/linux/audit.h and user-level auditd source
+ * lib/libaudit.h.
+ *
+ * The types fall into range blocks:
+ * 1000-1099 is for audit system control commands
+ * 1100-2999 various messages, as detailed in include/uapi/linux/audit.h
+ */
+#define LX_AUDIT_GET 1000 /* get audit system status */
+#define LX_AUDIT_SET 1001 /* set audit system status */
+#define LX_AUDIT_WATCH_INS 1007 /* insert file watch */
+#define LX_AUDIT_WATCH_REM 1008 /* remove file watch */
+#define	LX_AUDIT_WATCH_LIST	1009	/* list file watches */
+#define LX_AUDIT_ADD_RULE 1011 /* add syscall rule */
+#define LX_AUDIT_DEL_RULE 1012 /* del syscall rule */
+#define LX_AUDIT_LIST_RULES 1013 /* list syscall rules */
+#define LX_AUDIT_SET_FEATURE 1018
+#define LX_AUDIT_GET_FEATURE 1019
+#define LX_AUDIT_USER_MSG_START 1100
+
+/*
+ * Netlink sockopts
+ */
+#define SOL_LX_NETLINK 270
+
+/* See Linux include/uapi/linux/netlink.h */
+#define LX_NETLINK_SO_ADD_MEMBERSHIP 1
+#define LX_NETLINK_SO_DROP_MEMBERSHIP 2
+#define LX_NETLINK_SO_PKTINFO 3
+#define LX_NETLINK_SO_BROADCAST_ERROR 4
+#define LX_NETLINK_SO_NO_ENOBUFS 5
+#define LX_NETLINK_SO_RX_RING 6
+#define LX_NETLINK_SO_TX_RING 7
+#define LX_NETLINK_SO_LISTEN_ALL_NSID 8
+#define LX_NETLINK_SO_LIST_MEMBERSHIPS 9
+#define LX_NETLINK_SO_CAP_ACK 10
+
+/* Internal socket flags */
+#define LXNLF_RECVUCRED 0x1
+#define LXNLF_AUDITD 0x2
+
+/* nlmsg structure macros */
+#define LXNLMSG_ALIGNTO 4
+#define LXNLMSG_ALIGN(len) \
+ (((len) + LXNLMSG_ALIGNTO - 1) & ~(LXNLMSG_ALIGNTO - 1))
+#define LXNLMSG_HDRLEN \
+ ((int)LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)))
+#define	LXNLMSG_LENGTH(len)	((len) + LXNLMSG_HDRLEN)
+#define	LXNLMSG_SPACE(len)	LXNLMSG_ALIGN(LXNLMSG_LENGTH(len))
+#define	LXNLMSG_DATA(nlh)	((void *)(((char *)nlh) + LXNLMSG_LENGTH(0)))
+#define	LXNLMSG_PAYLOAD(nlh, len) \
+	((nlh)->lxnh_len - LXNLMSG_SPACE((len)))
+
+#define	LXATTR_PAYLOAD(lxa) \
+	((void *)((caddr_t)(lxa) + sizeof (lx_netlink_attr_t)))
+#define LXATTR_HDRLEN LXNLMSG_ALIGN(sizeof (lx_netlink_attr_t))
+#define LXATTR_LEN(len) (LXATTR_HDRLEN + LXNLMSG_ALIGN(len))
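+
+/*
+ * For example, LXNLMSG_ALIGN(5) == 8 and LXNLMSG_ALIGN(8) == 8, so an
+ * attribute carrying a 4-byte payload occupies LXATTR_LEN(4) == 8 bytes:
+ * a 4-byte attribute header followed by the aligned payload.
+ */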
+
+typedef struct lx_netlink_hdr {
+ uint32_t lxnh_len; /* length of message */
+ uint16_t lxnh_type; /* type of message */
+ uint16_t lxnh_flags; /* flags */
+ uint32_t lxnh_seq; /* sequence number */
+ uint32_t lxnh_pid; /* sending pid */
+} lx_netlink_hdr_t;
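+
+/*
+ * The layout above mirrors the Linux struct nlmsghdr field for field:
+ * nlmsg_len, nlmsg_type, nlmsg_flags, nlmsg_seq and nlmsg_pid; see
+ * netlink(7).
+ */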
+
+typedef struct lx_netlink_err {
+ lx_netlink_hdr_t lxne_hdr; /* header */
+ int32_t lxne_errno; /* errno */
+	lx_netlink_hdr_t lxne_failed;	/* header of failed message */
+} lx_netlink_err_t;
+
+typedef struct lx_netlink_attr {
+ uint16_t lxna_len; /* length of attribute */
+ uint16_t lxna_type; /* type of attribute */
+} lx_netlink_attr_t;
+
+typedef struct lx_netlink_ifinfomsg {
+ uint8_t lxnl_ifi_family; /* family: AF_UNSPEC */
+ uint8_t lxnl_ifi__pad;
+ uint16_t lxnl_ifi_type; /* device type */
+ uint32_t lxnl_ifi_index; /* interface index */
+ uint32_t lxnl_ifi_flags; /* device flags */
+ uint32_t lxnl_ifi_change; /* unused; must be -1 */
+} lx_netlink_ifinfomsg_t;
+
+typedef struct lx_netlink_ifaddrmsg {
+ uint8_t lxnl_ifa_family; /* address type */
+ uint8_t lxnl_ifa_prefixlen; /* prefix length of address */
+ uint8_t lxnl_ifa_flags; /* address flags */
+ uint8_t lxnl_ifa_scope; /* address scope */
+	uint32_t lxnl_ifa_index;	/* interface index */
+} lx_netlink_ifaddrmsg_t;
+
+typedef struct lx_netlink_rtmsg {
+ uint8_t rtm_family; /* route AF */
+ uint8_t rtm_dst_len; /* destination addr length */
+ uint8_t rtm_src_len; /* source addr length */
+ uint8_t rtm_tos; /* TOS filter */
+ uint8_t rtm_table; /* routing table ID */
+ uint8_t rtm_protocol; /* routing protocol */
+ uint8_t rtm_scope;
+ uint8_t rtm_type;
+ uint32_t rtm_flags;
+} lx_netlink_rtmsg_t;
+
+typedef struct lx_netlink_sockaddr {
+ sa_family_t lxnl_family; /* AF_LX_NETLINK */
+ uint16_t lxnl_pad; /* padding */
+ uint32_t lxnl_port; /* port id */
+ uint32_t lxnl_groups; /* multicast groups mask */
+} lx_netlink_sockaddr_t;
+
+typedef struct lx_netlink_sock {
+ struct lx_netlink_sock *lxns_next; /* list of lx_netlink sockets */
+ sock_upcalls_t *lxns_upcalls; /* pointer to socket upcalls */
+ sock_upper_handle_t lxns_uphandle; /* socket upcall handle */
+ ldi_handle_t lxns_iphandle; /* handle to /dev/ip */
+ ldi_handle_t lxns_ip6handle; /* handle to /dev/ip6 */
+ ldi_handle_t lxns_current; /* current ip handle */
+ int lxns_proto; /* protocol */
+ uint32_t lxns_port; /* port identifier */
+ uint32_t lxns_groups; /* group subscriptions */
+ uint32_t lxns_bufsize; /* buffer size */
+ uint32_t lxns_flags; /* socket flags */
+ kmutex_t lxns_flowctl_mtx; /* protects lxns_flowctrled */
+ boolean_t lxns_flowctrled; /* sock is flow-controlled */
+} lx_netlink_sock_t;
+
+typedef struct lx_netlink_reply {
+	lx_netlink_hdr_t lxnr_hdr;	/* header that we're replying to */
+ lx_netlink_sock_t *lxnr_sock; /* socket */
+ uint32_t lxnr_seq; /* sequence number */
+ uint16_t lxnr_type; /* type of reply */
+ mblk_t *lxnr_mp; /* current mblk */
+ mblk_t *lxnr_err; /* error mblk */
+ mblk_t *lxnr_mp1; /* T_UNITDATA_IND mblk */
+ int lxnr_errno; /* errno, if any */
+} lx_netlink_reply_t;
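+
+/*
+ * Sketch of the typical reply lifecycle, as used by the RTM_GET* handlers
+ * below ("type", "attrtype" and "val" are placeholders; error handling
+ * elided):
+ *
+ *	reply = lx_netlink_reply(lxsock, hdr, type);
+ *	lx_netlink_reply_msg(reply, &msg, sizeof (msg));
+ *	lx_netlink_reply_attr_int32(reply, attrtype, val);
+ *	lx_netlink_reply_send(reply);
+ *	lx_netlink_reply_done(reply);
+ */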
+
+static lx_netlink_sock_t *lx_netlink_head; /* head of lx_netlink sockets */
+static uint_t lx_netlink_audit_cnt; /* prevent unload for audit */
+static kmutex_t lx_netlink_lock; /* lock to protect state */
+static ldi_ident_t lx_netlink_ldi; /* LDI handle */
+static int lx_netlink_bufsize = 4096; /* default buffer size */
+static int lx_netlink_flowctrld; /* # of times flow controlled */
+
+typedef enum {
+ LXNL_BIND,
+ LXNL_SENDMSG
+} lx_netlink_action_t;
+
+#define LX_UNSUP_BUFSZ 64
+
+/*
+ * On Linux, CAP_NET_ADMIN is required to take certain netlink actions. This
+ * restriction is loosened for certain protocol types, provided the activity is
+ * limited to communicating directly with the kernel (rather than transmitting
+ * to the various multicast groups).
+ */
+static int
+lx_netlink_access(lx_netlink_sock_t *lns, cred_t *cr, lx_netlink_action_t act)
+{
+ /* Simple actions are allowed on these netlink protocols. */
+ if (act != LXNL_SENDMSG) {
+ switch (lns->lxns_proto) {
+ case LX_NETLINK_ROUTE:
+ case LX_NETLINK_AUDIT:
+ case LX_NETLINK_KOBJECT_UEVENT:
+ return (0);
+ default:
+ break;
+ }
+ }
+
+ /* CAP_NET_ADMIN roughly maps to PRIV_SYS_IP_CONFIG. */
+ if (secpolicy_ip_config(cr, B_FALSE) != 0) {
+ return (EACCES);
+ }
+
+ return (0);
+}
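+
+/*
+ * For example (illustrative): an unprivileged bind on an LX_NETLINK_ROUTE
+ * socket succeeds even with group subscriptions, while a sendmsg addressed
+ * beyond the kernel (non-zero port or groups) fails with EACCES on any
+ * protocol unless the caller has PRIV_SYS_IP_CONFIG.
+ */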
+
+/*ARGSUSED*/
+static void
+lx_netlink_activate(sock_lower_handle_t handle,
+ sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
+ int flags, cred_t *cr)
+{
+ lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+ struct sock_proto_props sopp;
+
+ sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
+ SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ |
+ SOCKOPT_MAXBLK | SOCKOPT_MINPSZ;
+ sopp.sopp_wroff = 0;
+ sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
+ sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
+ sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl);
+ sopp.sopp_maxpsz = INFPSZ;
+ sopp.sopp_maxblk = INFPSZ;
+ sopp.sopp_minpsz = 0;
+
+ lxsock->lxns_upcalls = sock_upcalls;
+ lxsock->lxns_uphandle = sock_handle;
+
+ sock_upcalls->su_set_proto_props(sock_handle, &sopp);
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_setsockopt(sock_lower_handle_t handle, int level,
+ int option_name, const void *optval, socklen_t optlen, struct cred *cr)
+{
+ lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+
+ if (level == SOL_SOCKET && option_name == SO_RECVUCRED) {
+ int *ival;
+ if (optlen != sizeof (int)) {
+ return (EINVAL);
+ }
+ ival = (int *)optval;
+ if (*ival == 0) {
+ lxsock->lxns_flags &= ~LXNLF_RECVUCRED;
+ } else {
+ lxsock->lxns_flags |= LXNLF_RECVUCRED;
+ }
+ return (0);
+ } else if (level == SOL_SOCKET) {
+ /* Punt on the other SOL_SOCKET options */
+ return (0);
+ } else if (level != SOL_LX_NETLINK) {
+ return (EOPNOTSUPP);
+ }
+
+ switch (option_name) {
+ case LX_NETLINK_SO_ADD_MEMBERSHIP:
+ case LX_NETLINK_SO_DROP_MEMBERSHIP:
+ case LX_NETLINK_SO_PKTINFO:
+ case LX_NETLINK_SO_BROADCAST_ERROR:
+ case LX_NETLINK_SO_NO_ENOBUFS:
+ case LX_NETLINK_SO_RX_RING:
+ case LX_NETLINK_SO_TX_RING:
+ /* Blatant lie */
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_getsockopt(sock_lower_handle_t handle, int level,
+ int option_name, void *optval, socklen_t *optlen, cred_t *cr)
+{
+ if (level != SOL_LX_NETLINK) {
+ return (EOPNOTSUPP);
+ }
+
+ switch (option_name) {
+ case LX_NETLINK_SO_LIST_MEMBERSHIPS:
+ /* Report that we have 0 members to allow systemd to proceed. */
+ *optlen = 0;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_bind(sock_lower_handle_t handle, struct sockaddr *name,
+ socklen_t namelen, struct cred *cr)
+{
+ lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+ lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)name;
+
+ if (namelen != sizeof (lx_netlink_sockaddr_t) ||
+ lxsa->lxnl_family != AF_LX_NETLINK) {
+ return (EINVAL);
+ }
+
+ /*
+ * Perform access checks if attempting to bind on any multicast groups.
+ */
+ if (lxsa->lxnl_groups != 0) {
+ int err;
+
+ if ((err = lx_netlink_access(lxsock, cr, LXNL_BIND)) != 0) {
+ return (err);
+ }
+
+ /* Lie about group subscription for now */
+ lxsock->lxns_groups = lxsa->lxnl_groups;
+ }
+
+ /*
+ * Linux netlink uses nl_port to identify distinct netlink sockets.
+ * Binding to an address of nl_port=0 triggers the kernel to
+ * automatically assign a free nl_port identifier. Originally,
+ * consumers of lx_netlink were required to bind with that automatic
+ * address. We now support non-zero values for nl_port although strict
+ * checking to identify conflicts is not performed. Use of the
+ * id_space facility could be a convenient solution, if a need arose.
+ */
+ if (lxsa->lxnl_port == 0) {
+ /*
+ * Because we are not doing conflict detection, there is no
+ * need to expend effort selecting a unique port for automatic
+ * addressing during bind.
+ */
+ lxsock->lxns_port = curproc->p_pid;
+ } else {
+ lxsock->lxns_port = lxsa->lxnl_port;
+ }
+
+ return (0);
+}
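+
+/*
+ * A minimal user-side sketch of the automatic-assignment path above, in
+ * terms of the Linux API (illustrative only; not part of this module):
+ *
+ *	struct sockaddr_nl snl = { .nl_family = AF_NETLINK };
+ *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ *	(void) bind(fd, (struct sockaddr *)&snl, sizeof (snl));
+ *
+ * With nl_pid left at zero, the code above assigns the caller's pid as
+ * the port, which a subsequent getsockname() reports back in nl_pid.
+ */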
+
+/*ARGSUSED*/
+static int
+lx_netlink_getsockname(sock_lower_handle_t handle, struct sockaddr *sa,
+ socklen_t *len, struct cred *cr)
+{
+ lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+ lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)sa;
+
+ if (*len < sizeof (lx_netlink_sockaddr_t))
+ return (EINVAL);
+
+ lxsa->lxnl_family = AF_LX_NETLINK;
+ lxsa->lxnl_pad = 0;
+ lxsa->lxnl_port = lxsock->lxns_port;
+ lxsa->lxnl_groups = lxsock->lxns_groups;
+
+ *len = sizeof (lx_netlink_sockaddr_t);
+
+ return (0);
+}
+
+static mblk_t *
+lx_netlink_alloc_mp1(lx_netlink_sock_t *lxsock)
+{
+ mblk_t *mp;
+ size_t size;
+ struct T_unitdata_ind *tunit;
+ lx_netlink_sockaddr_t *lxsa;
+ boolean_t send_ucred;
+
+ /*
+ * Certain netlink clients (such as systemd) will set SO_RECVUCRED
+ * (via the Linux SCM_CREDENTIALS) on the expectation that all replies
+ * will contain credentials passed via cmsg. They require this to
+ * authenticate those messages as having originated in the kernel by
+ * checking uc_pid == 0.
+ */
+ VERIFY(lxsock != NULL);
+ send_ucred = ((lxsock->lxns_flags & LXNLF_RECVUCRED) != 0);
+
+ /*
+ * Message structure:
+ * +----------------------------+
+ * | struct T_unit_data_ind |
+ * +----------------------------+
+ * | lx_netlink_sockaddr_t |
+ * +----------------------------+ -+
+ * | struct cmsghdr (SCM_UCRED) | |
+ * +----------------------------+ +-(optional)
+ * | struct ucred_s (cmsg data) | |
+ * +----------------------------+ -+
+ */
+ size = sizeof (*tunit) + sizeof (*lxsa);
+ if (send_ucred) {
+ size += sizeof (struct cmsghdr) +
+ ROUNDUP_cmsglen(sizeof (struct ucred_s));
+ }
+ mp = allocb(size, 0);
+ if (mp == NULL) {
+ return (NULL);
+ }
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ tunit = (struct T_unitdata_ind *)mp->b_rptr;
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ lxsa = (lx_netlink_sockaddr_t *)((caddr_t)tunit + sizeof (*tunit));
+ mp->b_wptr += size;
+
+ mp->b_datap->db_type = M_PROTO;
+ tunit->PRIM_type = T_UNITDATA_IND;
+ tunit->SRC_length = sizeof (*lxsa);
+ tunit->SRC_offset = sizeof (*tunit);
+
+ lxsa->lxnl_family = AF_LX_NETLINK;
+ lxsa->lxnl_port = 0;
+ lxsa->lxnl_groups = 0;
+ lxsa->lxnl_pad = 0;
+
+ if (send_ucred) {
+ struct cmsghdr *cmsg;
+ struct ucred_s *ucred;
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ cmsg = (struct cmsghdr *)((caddr_t)lxsa + sizeof (*lxsa));
+ ucred = (struct ucred_s *)CMSG_CONTENT(cmsg);
+ cmsg->cmsg_len = sizeof (*cmsg) + sizeof (*ucred);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_UCRED;
+ bzero(ucred, sizeof (*ucred));
+ ucred->uc_size = sizeof (*ucred);
+ ucred->uc_zoneid = getzoneid();
+
+ tunit->OPT_length = sizeof (*cmsg) +
+ ROUNDUP_cmsglen(sizeof (*ucred));
+ tunit->OPT_offset = tunit->SRC_offset + tunit->SRC_length;
+ } else {
+ tunit->OPT_length = 0;
+ tunit->OPT_offset = 0;
+ }
+
+ return (mp);
+}
+
+static lx_netlink_reply_t *
+lx_netlink_reply(lx_netlink_sock_t *lxsock,
+ lx_netlink_hdr_t *hdr, uint16_t type)
+{
+ lx_netlink_reply_t *reply;
+ mblk_t *err, *mp1;
+
+ /*
+ * We always allocate an error block to assure that even if subsequent
+ * allocations fail, we can return an error.
+ */
+ if ((err = allocb(sizeof (lx_netlink_err_t), 0)) == NULL)
+ return (NULL);
+
+ if ((mp1 = lx_netlink_alloc_mp1(lxsock)) == NULL) {
+ freeb(err);
+ return (NULL);
+ }
+
+ reply = kmem_zalloc(sizeof (lx_netlink_reply_t), KM_SLEEP);
+ reply->lxnr_err = err;
+ reply->lxnr_sock = lxsock;
+ reply->lxnr_hdr = *hdr;
+ reply->lxnr_type = type;
+ reply->lxnr_mp1 = mp1;
+
+ return (reply);
+}
+
+static void
+lx_netlink_reply_add(lx_netlink_reply_t *reply, void *payload, uint32_t size)
+{
+ lx_netlink_hdr_t *hdr;
+ lx_netlink_sock_t *lxsock = reply->lxnr_sock;
+ uint32_t aligned;
+ mblk_t *mp = reply->lxnr_mp;
+
+ if (reply->lxnr_errno)
+ return;
+
+ aligned = LXNLMSG_ALIGN(size);
+ hdr = (lx_netlink_hdr_t *)mp->b_rptr;
+
+ if (hdr->lxnh_len + aligned > lxsock->lxns_bufsize) {
+ reply->lxnr_errno = E2BIG;
+ return;
+ }
+
+ bcopy(payload, mp->b_wptr, size);
+ hdr->lxnh_len += aligned;
+ mp->b_wptr += aligned;
+}
+
+static void
+lx_netlink_reply_msg(lx_netlink_reply_t *reply, void *payload, uint32_t size)
+{
+ lx_netlink_hdr_t *hdr;
+ lx_netlink_sock_t *lxsock = reply->lxnr_sock;
+ mblk_t *mp;
+
+ if (reply->lxnr_errno)
+ return;
+
+ VERIFY(reply->lxnr_mp == NULL);
+
+ if ((reply->lxnr_mp = mp = allocb(lxsock->lxns_bufsize, 0)) == NULL) {
+ reply->lxnr_errno = ENOMEM;
+ return;
+ }
+
+ bzero(mp->b_rptr, lxsock->lxns_bufsize);
+ hdr = (lx_netlink_hdr_t *)mp->b_rptr;
+ hdr->lxnh_flags = LX_NETLINK_NLM_F_MULTI;
+ hdr->lxnh_len = LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t));
+ hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq;
+ hdr->lxnh_pid = lxsock->lxns_port;
+
+ mp->b_wptr += LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t));
+
+ if (payload == NULL) {
+ /*
+ * A NULL payload denotes a "done" message.
+ */
+ hdr->lxnh_type = LX_NETLINK_NLMSG_DONE;
+ } else {
+ hdr->lxnh_type = reply->lxnr_type;
+ lx_netlink_reply_add(reply, payload, size);
+ }
+}
+
+static void
+lx_netlink_reply_attr(lx_netlink_reply_t *reply, uint16_t type,
+ void *payload, uint32_t size)
+{
+ lx_netlink_attr_t attr;
+
+ attr.lxna_len = size + sizeof (lx_netlink_attr_t);
+ attr.lxna_type = type;
+
+ lx_netlink_reply_add(reply, &attr, sizeof (attr));
+ lx_netlink_reply_add(reply, payload, size);
+}
+
+static void
+lx_netlink_reply_attr_string(lx_netlink_reply_t *reply,
+ uint16_t type, const char *str)
+{
+ lx_netlink_reply_attr(reply, type, (void *)str, strlen(str) + 1);
+}
+
+static void
+lx_netlink_reply_attr_int32(lx_netlink_reply_t *reply,
+ uint16_t type, int32_t val)
+{
+ int32_t v = val;
+
+ lx_netlink_reply_attr(reply, type, &v, sizeof (int32_t));
+}
+
+static int
+lx_netlink_reply_ioctl(lx_netlink_reply_t *reply, int cmd, void *arg)
+{
+ int rval;
+
+ if (reply->lxnr_errno != 0)
+ return (reply->lxnr_errno);
+
+ if ((rval = ldi_ioctl(reply->lxnr_sock->lxns_current,
+ cmd, (intptr_t)arg, FKIOCTL, kcred, NULL)) != 0) {
+ reply->lxnr_errno = rval;
+ }
+
+ return (rval);
+}
+
+static void
+lx_netlink_reply_sendup(lx_netlink_reply_t *reply, mblk_t *mp, mblk_t *mp1)
+{
+ lx_netlink_sock_t *lxsock = reply->lxnr_sock;
+ int error;
+
+ /*
+ * To prevent the stream head from coalescing messages and to indicate
+ * their origin, we send them as T_UNITDATA_IND messages, not as raw
+ * M_DATA.
+ */
+ mp1->b_cont = mp;
+
+ lxsock->lxns_upcalls->su_recv(lxsock->lxns_uphandle, mp1,
+ msgdsize(mp1), 0, &error, NULL);
+
+ if (error != 0)
+ lx_netlink_flowctrld++;
+}
+
+static void
+lx_netlink_reply_send(lx_netlink_reply_t *reply)
+{
+ mblk_t *mp1;
+
+ if (reply->lxnr_errno)
+ return;
+
+ if ((mp1 = lx_netlink_alloc_mp1(reply->lxnr_sock)) == NULL) {
+ reply->lxnr_errno = ENOMEM;
+ return;
+ }
+
+ lx_netlink_reply_sendup(reply, reply->lxnr_mp, mp1);
+ reply->lxnr_mp = NULL;
+}
+
+static void
+lx_netlink_reply_done(lx_netlink_reply_t *reply)
+{
+ lx_netlink_sock_t *lxsock = reply->lxnr_sock;
+ mblk_t *mp;
+
+ /*
+ * Denote that we're done via a message with a NULL payload.
+ */
+ lx_netlink_reply_msg(reply, NULL, 0);
+
+ if (reply->lxnr_errno) {
+ /*
+ * If anything failed, we'll send up an error message.
+ */
+ lx_netlink_hdr_t *hdr;
+ lx_netlink_err_t *err;
+
+ if (reply->lxnr_mp != NULL) {
+ freeb(reply->lxnr_mp);
+ reply->lxnr_mp = NULL;
+ }
+
+ mp = reply->lxnr_err;
+ VERIFY(mp != NULL);
+ reply->lxnr_err = NULL;
+ err = (lx_netlink_err_t *)mp->b_rptr;
+ hdr = &err->lxne_hdr;
+ mp->b_wptr += sizeof (lx_netlink_err_t);
+
+ err->lxne_failed = reply->lxnr_hdr;
+ err->lxne_errno = reply->lxnr_errno;
+ hdr->lxnh_type = LX_NETLINK_NLMSG_ERROR;
+		hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq;
+		hdr->lxnh_len = sizeof (lx_netlink_err_t);
+ hdr->lxnh_pid = lxsock->lxns_port;
+ } else {
+ uint32_t status = 0;
+
+ /*
+ * More recent versions of the iproute2 utils expect a status
+ * value after the header, even in the absence of errors.
+ */
+ lx_netlink_reply_add(reply, &status, sizeof (status));
+
+ /*
+ * "done" is also the most minimal response possible. If
+ * lx_netlink_reply_msg() does not set lxnr_errno, we should
+ * be guaranteed enough room to hold this (i.e. our
+ * lx_netlink_reply_add() call should never end up setting
+ * lxnr_errno).
+ */
+ VERIFY0(reply->lxnr_errno);
+
+ mp = reply->lxnr_mp;
+ VERIFY(mp != NULL);
+ reply->lxnr_mp = NULL;
+ }
+
+ lx_netlink_reply_sendup(reply, mp, reply->lxnr_mp1);
+
+ if (reply->lxnr_mp != NULL)
+ freeb(reply->lxnr_mp);
+
+ if (reply->lxnr_err != NULL)
+ freeb(reply->lxnr_err);
+
+ kmem_free(reply, sizeof (lx_netlink_reply_t));
+}
+
+static int
+lx_netlink_reply_error(lx_netlink_sock_t *lxsock,
+ lx_netlink_hdr_t *hdr, int errno)
+{
+ /*
+ * The type of the message doesn't matter, as we're going to explicitly
+ * set lxnr_errno and therefore send only an error message.
+ */
+ lx_netlink_reply_t *reply = lx_netlink_reply(lxsock, hdr, 0);
+
+ if (reply == NULL)
+ return (ENOMEM);
+
+ reply->lxnr_errno = errno;
+ lx_netlink_reply_done(reply);
+
+ return (0);
+}
+
+/*
+ * Send an ack message with an explicit errno of 0.
+ * TODO: this needs more work
+ */
+/*
+ * static void
+ * lx_netlink_reply_ack(lx_netlink_reply_t *reply)
+ * {
+ * lx_netlink_sock_t *lxsock = reply->lxnr_sock;
+ * mblk_t *mp;
+ * lx_netlink_hdr_t *hdr;
+ * lx_netlink_err_t *err;
+ *
+ * lx_netlink_reply_msg(reply, NULL, 0);
+ *
+ * mp = reply->lxnr_err;
+ * VERIFY(mp != NULL);
+ * reply->lxnr_err = NULL;
+ * err = (lx_netlink_err_t *)mp->b_rptr;
+ * hdr = &err->lxne_hdr;
+ *
+ * err->lxne_failed = reply->lxnr_hdr;
+ * err->lxne_errno = 0;
+ * hdr->lxnh_type = LX_NETLINK_NLMSG_ERROR;
+ *	hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq;
+ *	hdr->lxnh_len = sizeof (lx_netlink_err_t);
+ * hdr->lxnh_pid = lxsock->lxns_port;
+ *
+ * lx_netlink_reply_sendup(reply, mp, reply->lxnr_mp1);
+ *
+ * kmem_free(reply, sizeof (lx_netlink_reply_t));
+ * }
+ */
+
+static int
+lx_netlink_parse_msg_attrs(mblk_t *mp, void **msgp, unsigned int msg_size,
+ lx_netlink_attr_t **attrp, unsigned int *attr_max)
+{
+ lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr;
+ lx_netlink_attr_t *lxa;
+ unsigned char *buf = mp->b_rptr + LXNLMSG_HDRLEN;
+ unsigned int i;
+ uint32_t buf_left = MBLKL(mp) - LXNLMSG_HDRLEN;
+ uint32_t msg_left = hdr->lxnh_len;
+
+ msg_size = LXNLMSG_ALIGN(msg_size);
+ if (msg_size > buf_left || msg_size > msg_left) {
+ return (-1);
+ }
+
+ *msgp = (void *)buf;
+ buf += msg_size;
+ buf_left -= msg_size;
+ msg_left -= msg_size;
+
+ /* Do not bother with attr parsing if not requested */
+ if (attrp == NULL || *attr_max == 0) {
+ return (0);
+ }
+
+ for (i = 0; i < *attr_max; i++) {
+ if (buf_left < LXATTR_HDRLEN || msg_left < LXATTR_HDRLEN) {
+ break;
+ }
+
+ lxa = (lx_netlink_attr_t *)buf;
+ if (lxa->lxna_len > buf_left || lxa->lxna_len > msg_left) {
+ return (-1);
+ }
+
+ attrp[i] = lxa;
+ buf += lxa->lxna_len;
+ buf_left -= lxa->lxna_len;
+ msg_left -= lxa->lxna_len;
+ }
+ *attr_max = i;
+
+ return (0);
+}
+
+/*
+ * Takes an IPv4 address (in network byte order) and returns the address scope.
+ */
+static uint8_t
+lx_ipv4_rtscope(in_addr_t nbo_addr)
+{
+ in_addr_t addr = ntohl(nbo_addr);
+ if ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
+ return (LX_RTSCOPE_HOST);
+ } else if ((addr & IN_AUTOCONF_MASK) == IN_AUTOCONF_NET) {
+ return (LX_RTSCOPE_LINK);
+ } else if ((addr & IN_PRIVATE8_MASK) == IN_PRIVATE8_NET ||
+ (addr & IN_PRIVATE12_MASK) == IN_PRIVATE12_NET ||
+ (addr & IN_PRIVATE16_MASK) == IN_PRIVATE16_NET) {
+ return (LX_RTSCOPE_SITE);
+ } else {
+ return (LX_RTSCOPE_UNIVERSE);
+ }
+}
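+
+/*
+ * For example: 127.0.0.1 yields LX_RTSCOPE_HOST; 169.254.1.1 (autoconf)
+ * LX_RTSCOPE_LINK; 10.0.0.1 and 192.168.1.1 (RFC 1918) LX_RTSCOPE_SITE;
+ * anything else (e.g. 8.8.8.8) LX_RTSCOPE_UNIVERSE.
+ */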
+
+/*
+ * Takes an IPv6 address and returns the address scope.
+ */
+static uint8_t
+lx_ipv6_rtscope(const in6_addr_t *addr)
+{
+ if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) {
+ return (LX_RTSCOPE_HOST);
+ } else if (IN6_IS_ADDR_LINKLOCAL(addr)) {
+ return (LX_RTSCOPE_LINK);
+ } else if (IN6_IS_ADDR_SITELOCAL(addr)) {
+ return (LX_RTSCOPE_SITE);
+ } else {
+ return (LX_RTSCOPE_UNIVERSE);
+ }
+}
+
+static void
+lx_netlink_getlink_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr)
+{
+ lx_netlink_ifinfomsg_t ifi;
+ int i;
+ char if_name[IFNAMSIZ];
+ struct sockaddr_dl *sdl;
+ struct sockaddr hwaddr;
+ int hwaddr_size;
+ boolean_t is_loopback;
+
+ struct {
+ int native;
+ int lx;
+ } flags[] = {
+ { IFF_UP, LX_IFF_UP },
+ { IFF_BROADCAST, LX_IFF_BROADCAST },
+ { IFF_DEBUG, LX_IFF_DEBUG },
+ { IFF_LOOPBACK, LX_IFF_LOOPBACK },
+ { IFF_POINTOPOINT, LX_IFF_POINTOPOINT },
+ { IFF_NOTRAILERS, LX_IFF_NOTRAILERS },
+ { IFF_RUNNING, LX_IFF_RUNNING },
+ { IFF_NOARP, LX_IFF_NOARP },
+ { IFF_PROMISC, LX_IFF_PROMISC },
+ { IFF_ALLMULTI, LX_IFF_ALLMULTI },
+ { IFF_MULTICAST, LX_IFF_MULTICAST },
+ { 0 }
+ };
+
+ /*
+ * illumos interfaces that contain a ':' are non-zero logical
+ * interfaces. We should only emit the name of the zeroth logical
+ * interface, since RTM_GETLINK only expects to see the name of
+ * devices. The addresses of all logical devices will be
+ * returned via an RTM_GETADDR.
+ */
+ if (strchr(lifr->lifr_name, ':') != NULL)
+ return;
+
+ /*
+ * Most of the lx_netlink module is architected to emit information in
+ * an illumos-native manner. Socket syscalls such as getsockname will
+ * not translate fields to values Linux programs would expect since
+ * that conversion is performed by the generic socket emulation.
+ *
+ * This is _not_ true of the actual protocol output from lx_netlink.
+ * Since translating it at the socket layer would be onerous, all
+ * output (including constants and names) is pre-translated to values
+ * valid for Linux.
+ */
+
+ bzero(&ifi, sizeof (ifi));
+ ifi.lxnl_ifi_family = AF_UNSPEC;
+ ifi.lxnl_ifi_change = (uint32_t)-1;
+
+ /* Convert the name to be Linux-friendly */
+ (void) strlcpy(if_name, lifr->lifr_name, IFNAMSIZ);
+ lx_ifname_convert(if_name, LX_IF_FROMNATIVE);
+ is_loopback = (strncmp(if_name, "lo", 2) == 0);
+
+ if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0)
+ return;
+
+ ifi.lxnl_ifi_index = lifr->lifr_index;
+
+ if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0)
+ return;
+
+ for (i = 0; flags[i].native; i++) {
+ if (lifr->lifr_flags & flags[i].native)
+ ifi.lxnl_ifi_flags |= flags[i].lx;
+ }
+
+ /*
+ * Query the datalink address.
+ * The interface type will be included in the outgoing infomsg while
+ * the address itself will be output separately.
+ */
+ sdl = (struct sockaddr_dl *)&lifr->lifr_addr;
+ bzero(sdl, sizeof (*sdl));
+ if (!is_loopback) {
+ (void) lx_netlink_reply_ioctl(reply, SIOCGLIFHWADDR, lifr);
+ } else {
+ /* Simulate an empty hwaddr for loopback */
+ sdl->sdl_type = DL_LOOP;
+ sdl->sdl_alen = ETHERADDRL;
+ }
+ lx_stol_hwaddr(sdl, &hwaddr, &hwaddr_size);
+
+ ifi.lxnl_ifi_type = hwaddr.sa_family;
+ lx_netlink_reply_msg(reply, &ifi, sizeof (lx_netlink_ifinfomsg_t));
+
+ lx_netlink_reply_attr_string(reply, LX_NETLINK_IFLA_IFNAME, if_name);
+
+ if (lx_netlink_reply_ioctl(reply, SIOCGLIFMTU, lifr) != 0)
+ return;
+
+ lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_MTU, lifr->lifr_mtu);
+
+ if (hwaddr_size != 0) {
+ lx_netlink_reply_attr(reply, LX_NETLINK_IFLA_ADDRESS,
+ hwaddr.sa_data, hwaddr_size);
+ }
+
+ /* Emulate a txqlen of 1. (0 for loopbacks) */
+ lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_TXQLEN,
+ (is_loopback) ? 0 : 1);
+
+ lx_netlink_reply_send(reply);
+}
+
+static void
+lx_netlink_reply_eachfamily(lx_netlink_reply_t *reply,
+ void (*func)(lx_netlink_reply_t *, struct lifreq *), boolean_t distinct)
+{
+ lx_netlink_sock_t *sock = reply->lxnr_sock;
+ int nlifr, i;
+
+ struct {
+ int family;
+ ldi_handle_t handle;
+ struct lifconf lifc;
+ struct lifnum lifn;
+ } families[] = {
+ { AF_INET, sock->lxns_iphandle },
+ { AF_INET6, sock->lxns_ip6handle },
+ { AF_UNSPEC }
+ }, *family, *check;
+
+ for (family = families; family->family != AF_UNSPEC; family++) {
+ struct lifconf *lifc = &family->lifc;
+ struct lifnum *lifn = &family->lifn;
+
+ lifn->lifn_family = family->family;
+ sock->lxns_current = family->handle;
+
+ if (lx_netlink_reply_ioctl(reply, SIOCGLIFNUM, lifn) != 0)
+ break;
+
+ lifc->lifc_family = lifn->lifn_family;
+ lifc->lifc_flags = 0;
+ lifc->lifc_len = lifn->lifn_count * sizeof (struct lifreq);
+ if (lifn->lifn_count == 0) {
+ lifc->lifc_buf = NULL;
+ continue;
+ }
+ lifc->lifc_buf = kmem_alloc(lifc->lifc_len, KM_SLEEP);
+
+ if (lx_netlink_reply_ioctl(reply, SIOCGLIFCONF, lifc) != 0)
+ break;
+
+ nlifr = lifc->lifc_len / sizeof (lifc->lifc_req[0]);
+
+ for (i = 0; i < nlifr; i++) {
+ if (!distinct) {
+ func(reply, &lifc->lifc_req[i]);
+ continue;
+ }
+
+ /*
+ * If we have been asked to provide each interface
+ * exactly once, we need to (annoyingly) check this
+ * name against others that we've already processed for
+ * other families. Yes, this is quadratic time -- but
+ * the number of interfaces per family is expected to
+ * be very small.
+ */
+ for (check = families; check != family; check++) {
+ struct lifconf *clifc = &check->lifc;
+ int cnlifr = clifc->lifc_len /
+ sizeof (clifc->lifc_req[0]), j;
+ char *nm = lifc->lifc_req[i].lifr_name, *cnm;
+
+ for (j = 0; j < cnlifr; j++) {
+ cnm = clifc->lifc_req[j].lifr_name;
+
+ if (strcmp(nm, cnm) == 0)
+ break;
+ }
+
+ if (j != cnlifr)
+ break;
+ }
+
+ if (check != family)
+ continue;
+
+ func(reply, &lifc->lifc_req[i]);
+ }
+ }
+
+ for (family = families; family->family != AF_UNSPEC; family++) {
+ struct lifconf *lifc = &family->lifc;
+
+ if (lifc->lifc_buf != NULL)
+ kmem_free(lifc->lifc_buf, lifc->lifc_len);
+ }
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_getlink(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+ lx_netlink_reply_t *reply;
+
+ reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWLINK);
+
+ if (reply == NULL)
+ return (ENOMEM);
+
+ lx_netlink_reply_eachfamily(reply, lx_netlink_getlink_lifreq, B_TRUE);
+ lx_netlink_reply_done(reply);
+
+ return (0);
+}
+
+static void
+lx_netlink_getaddr_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr)
+{
+ lx_netlink_ifaddrmsg_t ifa;
+
+ bzero(&ifa, sizeof (ifa));
+
+ if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0)
+ return;
+
+ ifa.lxnl_ifa_index = lifr->lifr_index;
+
+ if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0)
+ return;
+
+ /*
+ * Don't report on-link subnets
+ */
+ if ((lifr->lifr_flags & IFF_NOLOCAL) != 0)
+ return;
+
+ if (lx_netlink_reply_ioctl(reply, SIOCGLIFSUBNET, lifr) != 0)
+ return;
+
+ ifa.lxnl_ifa_prefixlen = lifr->lifr_addrlen;
+
+ if (lx_netlink_reply_ioctl(reply, SIOCGLIFADDR, lifr) != 0)
+ return;
+
+ if (lifr->lifr_addr.ss_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ ifa.lxnl_ifa_family = LX_AF_INET;
+
+ sin = (struct sockaddr_in *)&lifr->lifr_addr;
+ ifa.lxnl_ifa_scope = lx_ipv4_rtscope(
+ sin->sin_addr.s_addr);
+
+ lx_netlink_reply_msg(reply, &ifa,
+ sizeof (lx_netlink_ifaddrmsg_t));
+
+ lx_netlink_reply_attr_int32(reply,
+ LX_NETLINK_IFA_ADDRESS, sin->sin_addr.s_addr);
+ } else {
+ struct sockaddr_in6 *sin;
+
+ ifa.lxnl_ifa_family = LX_AF_INET6;
+
+ sin = (struct sockaddr_in6 *)&lifr->lifr_addr;
+ ifa.lxnl_ifa_scope = lx_ipv6_rtscope(&sin->sin6_addr);
+
+ lx_netlink_reply_msg(reply, &ifa,
+ sizeof (lx_netlink_ifaddrmsg_t));
+
+ lx_netlink_reply_attr(reply, LX_NETLINK_IFA_ADDRESS,
+ &sin->sin6_addr, sizeof (sin->sin6_addr));
+ }
+
+ lx_netlink_reply_send(reply);
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_getaddr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+ lx_netlink_reply_t *reply;
+
+ reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWADDR);
+
+ if (reply == NULL)
+ return (ENOMEM);
+
+ lx_netlink_reply_eachfamily(reply, lx_netlink_getaddr_lifreq, B_FALSE);
+ lx_netlink_reply_done(reply);
+
+ return (0);
+}
+
+struct lx_getroute_ctx {
+ lx_netlink_reply_t *lgrtctx_reply;
+ lx_netlink_rtmsg_t *lgrtctx_rtmsg;
+ lx_netlink_attr_t *lgrtctx_attrs[LX_NETLINK_MAX_RTA];
+ unsigned int lgrtctx_max_attr;
+ lx_netlink_attr_t *lgrtctx_rtadst;
+};
+
+static void
+lx_netlink_getroute_ipv4(ire_t *ire, struct lx_getroute_ctx *ctx)
+{
+ lx_netlink_reply_t *reply = ctx->lgrtctx_reply;
+ lx_netlink_rtmsg_t *rtmsg = ctx->lgrtctx_rtmsg;
+ lx_netlink_attr_t *rtadst = ctx->lgrtctx_rtadst;
+ lx_netlink_rtmsg_t res;
+ ill_t *ill = NULL;
+
+ /* Certain IREs are too specific for netlink */
+ if ((ire->ire_type & (IRE_BROADCAST | IRE_MULTICAST | IRE_NOROUTE |
+ IRE_LOOPBACK | IRE_LOCAL)) != 0 || ire->ire_testhidden != 0) {
+ return;
+ }
+ /*
+	 * When listing routes, IRE_IF_CLONE entries are undesired, though
+	 * they are still required to answer 'ip route get' on a local
+	 * address.
+ */
+ if (rtmsg->rtm_dst_len == 0 && (ire->ire_type & IRE_IF_CLONE) != 0) {
+ return;
+ }
+
+ bzero(&res, sizeof (res));
+ res.rtm_family = LX_AF_INET;
+ res.rtm_table = LX_ROUTE_TABLE_MAIN;
+ res.rtm_type = LX_RTN_UNICAST;
+ res.rtm_dst_len = ire->ire_masklen;
+
+ if (ire->ire_type & (IRE_IF_NORESOLVER|IRE_IF_RESOLVER)) {
+ /* Interface-local networks considered kernel-created */
+ res.rtm_protocol = LX_RTPROT_KERNEL;
+ res.rtm_scope = LX_RTSCOPE_LINK;
+ } else if (ire->ire_flags & RTF_STATIC) {
+ res.rtm_protocol = LX_RTPROT_STATIC;
+ }
+
+	if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) {
+		/*
+		 * Specify a single-destination route; the RTA_DST details
+		 * will be added later.
+		 */
+		res.rtm_dst_len = rtmsg->rtm_dst_len;
+	}
+
+ lx_netlink_reply_msg(reply, &res, sizeof (res));
+
+ if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) {
+ /* Add RTA_DST details for single-destination route. */
+ lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST,
+ LXATTR_PAYLOAD(rtadst), sizeof (ipaddr_t));
+ } else if (ire->ire_masklen != 0) {
+ lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST,
+ &ire->ire_addr, sizeof (ire->ire_addr));
+ }
+
+ if (ire->ire_ill != NULL) {
+ ill = ire->ire_ill;
+ } else if (ire->ire_dep_parent != NULL) {
+ ill = ire->ire_dep_parent->ire_ill;
+ }
+
+ if (ill != NULL) {
+ uint32_t ifindex, addr_src;
+
+ ifindex = ill->ill_phyint->phyint_ifindex;
+ lx_netlink_reply_attr(reply, LX_NETLINK_RTA_OIF,
+ &ifindex, sizeof (ifindex));
+
+ addr_src = ill->ill_ipif->ipif_lcl_addr;
+ lx_netlink_reply_attr(reply, LX_NETLINK_RTA_PREFSRC,
+ &addr_src, sizeof (addr_src));
+ }
+
+ if (ire->ire_flags & RTF_GATEWAY) {
+ lx_netlink_reply_attr(reply, LX_NETLINK_RTA_GATEWAY,
+ &ire->ire_gateway_addr, sizeof (ire->ire_gateway_addr));
+ }
+
+ lx_netlink_reply_send(reply);
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_getroute(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr,
+ mblk_t *mp)
+{
+ struct lx_getroute_ctx ctx;
+ lx_netlink_reply_t *reply;
+ lx_netlink_rtmsg_t rtmsg, *rtmsgp;
+ int rtmsg_size = sizeof (rtmsg);
+ netstack_t *ns;
+ int i;
+
+ bzero(&ctx, sizeof (ctx));
+ ctx.lgrtctx_max_attr = LX_NETLINK_MAX_RTA;
+
+ if (lx_netlink_parse_msg_attrs(mp, (void **)&rtmsgp,
+ rtmsg_size, ctx.lgrtctx_attrs, &ctx.lgrtctx_max_attr) != 0) {
+ return (EPROTO);
+ }
+
+ /*
+	 * Older versions of libnetlink send a truncated rtmsg struct for
+ * certain RTM_GETROUTE queries. We must detect this condition and
+ * truncate our input to prevent later confusion.
+ */
+ if (curproc->p_zone->zone_brand == &lx_brand &&
+ lx_kern_release_cmp(curproc->p_zone, "2.6.32") <= 0 &&
+ rtmsgp->rtm_dst_len == 0) {
+ rtmsg_size = sizeof (rtmsg.rtm_family);
+ }
+ bzero(&rtmsg, sizeof (rtmsg));
+ bcopy(rtmsgp, &rtmsg, rtmsg_size);
+ ctx.lgrtctx_rtmsg = &rtmsg;
+
+	/* If RTA_DST was passed, it affects later decisions */
+ for (i = 0; i < ctx.lgrtctx_max_attr; i++) {
+ lx_netlink_attr_t *attr = ctx.lgrtctx_attrs[i];
+
+ if (attr->lxna_type == LX_NETLINK_RTA_DST &&
+ attr->lxna_len == LXATTR_LEN(sizeof (ipaddr_t))) {
+ ctx.lgrtctx_rtadst = attr;
+ break;
+ }
+ }
+
+ reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWROUTE);
+ if (reply == NULL) {
+ return (ENOMEM);
+ }
+ ctx.lgrtctx_reply = reply;
+
+ /* Do not report anything outside the main table */
+ if (rtmsg.rtm_table != LX_ROUTE_TABLE_MAIN &&
+ rtmsg.rtm_table != 0) {
+ lx_netlink_reply_done(reply);
+ return (0);
+ }
+
+ ns = netstack_get_current();
+ if (ns == NULL) {
+ lx_netlink_reply_done(reply);
+ return (0);
+ }
+ if (rtmsg.rtm_family == LX_AF_INET || rtmsg.rtm_family == 0) {
+ if (rtmsg.rtm_dst_len == 0x20 && ctx.lgrtctx_rtadst != NULL) {
+ /* resolve route for host */
+ ipaddr_t *dst = LXATTR_PAYLOAD(ctx.lgrtctx_rtadst);
+ ire_t *ire_dst;
+
+ ire_dst = ire_route_recursive_dstonly_v4(*dst, 0, 0,
+ ns->netstack_ip);
+ lx_netlink_getroute_ipv4(ire_dst, &ctx);
+ ire_refrele(ire_dst);
+ } else {
+ /* get route listing */
+ ire_walk_v4(&lx_netlink_getroute_ipv4, &ctx, ALL_ZONES,
+ ns->netstack_ip);
+ }
+ }
+ if (rtmsg.rtm_family == LX_AF_INET6) {
+ /* punt on ipv6 for now */
+ netstack_rele(ns);
+ lx_netlink_reply_done(reply);
+ return (EPROTO);
+ }
+ netstack_rele(ns);
+
+ lx_netlink_reply_done(reply);
+ return (0);
+}
+
+/*
+ * Auditing callback to emit response.
+ */
+static void
+lx_netlink_au_cb(void *r, void *b, uint_t blen)
+{
+ lx_netlink_reply_t *reply = (lx_netlink_reply_t *)r;
+
+ lx_netlink_reply_msg(reply, b, blen);
+}
+
+/*
+ * Audit get
+ */
+static int
+lx_netlink_au_get(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr)
+{
+ lx_netlink_reply_t *reply;
+
+ reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_GET);
+ if (reply == NULL)
+ return (ENOMEM);
+
+ lx_audit_get(reply, lx_netlink_au_cb);
+ lx_netlink_reply_send(reply);
+ lx_netlink_reply_done(reply);
+ return (0);
+}
+
+/*
+ * Set or clear flag indicating socket is being used to communicate with the
+ * user-level auditd. Also update the counter which prevents this module
+ * from unloading while auditing is using the socket to the auditd.
+ */
+static void
+lx_netlink_au_sock_cb(void *s, boolean_t set)
+{
+ lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)s;
+
+ if (set) {
+ lxsock->lxns_flags |= LXNLF_AUDITD;
+ mutex_enter(&lx_netlink_lock);
+ lx_netlink_audit_cnt++;
+ mutex_exit(&lx_netlink_lock);
+ } else {
+ lxsock->lxns_flags &= ~LXNLF_AUDITD;
+ mutex_enter(&lx_netlink_lock);
+ VERIFY(lx_netlink_audit_cnt > 0);
+ lx_netlink_audit_cnt--;
+ mutex_exit(&lx_netlink_lock);
+ }
+}
+
+static int
+lx_netlink_au_set(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+ lx_netlink_reply_t *reply;
+ void *datap;
+ size_t datalen;
+ int err;
+
+ datap = (void *)(mp->b_rptr + sizeof (lx_netlink_hdr_t));
+ datalen = MBLKL(mp) - sizeof (lx_netlink_hdr_t);
+
+ err = lx_audit_set(lxsock, datap, datalen, lx_netlink_au_sock_cb);
+ if (err != 0)
+ return (err);
+
+ reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_SET);
+ if (reply == NULL)
+ return (ENOMEM);
+
+ lx_netlink_reply_done(reply);
+ return (0);
+}
+
+/*
+ * Audit append rule
+ */
+static int
+lx_netlink_au_ar(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+ lx_netlink_reply_t *reply;
+ void *datap;
+ size_t datalen;
+ int err;
+
+ /*
+ * TODO: At this time, everything we support fits in a single mblk,
+ * but as we add additional field support, eventually we might need
+ * to handle an mblk chain for really long string data in the
+ * rulep->lxar_buf.
+ */
+ if (mp->b_cont != NULL)
+ return (EINVAL);
+
+ datap = (void *)(mp->b_rptr + sizeof (lx_netlink_hdr_t));
+ datalen = MBLKL(mp) - sizeof (lx_netlink_hdr_t);
+
+ if ((err = lx_audit_append_rule(datap, datalen)) != 0)
+ return (err);
+
+ reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_ADD_RULE);
+ if (reply == NULL)
+ return (ENOMEM);
+
+ lx_netlink_reply_done(reply);
+ return (0);
+}
+
+/*
+ * Audit delete rule
+ */
+static int
+lx_netlink_au_dr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+ lx_netlink_reply_t *reply;
+ void *datap;
+ size_t datalen;
+ int err;
+
+ /*
+ * TODO: At this time, everything we support fits in a single mblk,
+ * but as we add additional field support, eventually we might need
+ * to handle an mblk chain for really long string data in the
+ * rulep->lxar_buf.
+ */
+ if (mp->b_cont != NULL)
+ return (EINVAL);
+
+ datap = (void *)(mp->b_rptr + sizeof (lx_netlink_hdr_t));
+ datalen = MBLKL(mp) - sizeof (lx_netlink_hdr_t);
+
+ if ((err = lx_audit_delete_rule(datap, datalen)) != 0)
+ return (err);
+
+ reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_DEL_RULE);
+ if (reply == NULL)
+ return (ENOMEM);
+
+ lx_netlink_reply_done(reply);
+ return (0);
+}
+
+/*
+ * Auditing callback to emit rule list.
+ */
+static void
+lx_netlink_au_lr_cb(void *r, void *b0, uint_t b0_len, void *b1, uint_t b1_len)
+{
+ lx_netlink_reply_t *reply = (lx_netlink_reply_t *)r;
+
+ lx_netlink_reply_msg(reply, b0, b0_len);
+ lx_netlink_reply_add(reply, b1, b1_len);
+ lx_netlink_reply_send(reply);
+}
+
+/*
+ * Audit list rules
+ */
+static int
+lx_netlink_au_lr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr)
+{
+ lx_netlink_reply_t *reply;
+
+ reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_LIST_RULES);
+ if (reply == NULL)
+ return (ENOMEM);
+
+ lx_audit_list_rules(reply, lx_netlink_au_lr_cb);
+ lx_netlink_reply_done(reply);
+ return (0);
+}
+
+/*
+ * Audit get feature
+ */
+static int
+lx_netlink_au_gf(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr)
+{
+ lx_netlink_reply_t *reply;
+
+ reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_GET_FEATURE);
+ if (reply == NULL)
+ return (ENOMEM);
+
+ lx_audit_get_feature(reply, lx_netlink_au_cb);
+ lx_netlink_reply_send(reply);
+ lx_netlink_reply_done(reply);
+ return (0);
+}
+
+/*
+ * Audit user message
+ * User messages are submitted as free-form messages which need to get sent
+ * back up to the auditd. This includes informative messages such as starting
+ * or stopping auditing.
+ */
+static int
+lx_netlink_au_um(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+ lx_netlink_reply_t *reply;
+ size_t datalen;
+ void *bp;
+
+ bp = mp->b_rptr + sizeof (lx_netlink_hdr_t);
+ datalen = MBLKL(mp) - (sizeof (lx_netlink_hdr_t));
+
+ /*
+ * TODO: At this time, everything we support fits in a single mblk,
+ * but eventually we might need to handle an mblk chain for a really
+ * long user message.
+ */
+ if (mp->b_cont != NULL)
+ return (EINVAL);
+
+ lx_audit_emit_user_msg(hdr->lxnh_type, datalen, bp);
+
+ if (hdr->lxnh_flags & LX_NETLINK_NLM_F_ACK) {
+ reply = lx_netlink_reply(lxsock, hdr, hdr->lxnh_type);
+ if (reply == NULL)
+ return (ENOMEM);
+
+ lx_netlink_reply_done(reply);
+ }
+ return (0);
+}
+
+static int
+lx_netlink_au_emit_cb(void *s, uint_t type, const char *msg, uint_t size)
+{
+ lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)s;
+ lx_netlink_hdr_t *hdr;
+ mblk_t *mp, *mp1;
+ int error;
+ uint32_t len;
+
+ len = LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t));
+ if (msg != NULL) {
+ len += LXNLMSG_ALIGN(size);
+ if (len > lxsock->lxns_bufsize)
+ return (E2BIG);
+ }
+
+ if ((mp = allocb(lxsock->lxns_bufsize, 0)) == NULL) {
+ return (ENOMEM);
+ }
+
+ bzero(mp->b_rptr, lxsock->lxns_bufsize);
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ hdr = (lx_netlink_hdr_t *)mp->b_rptr;
+ hdr->lxnh_flags = LX_NETLINK_NLM_F_MULTI;
+ hdr->lxnh_len = len;
+ hdr->lxnh_type = (msg == NULL ? LX_NETLINK_NLMSG_DONE : type);
+ hdr->lxnh_seq = 0;
+ hdr->lxnh_pid = 0;
+
+ mp->b_wptr += LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t));
+ if (msg != NULL) {
+ bcopy(msg, mp->b_wptr, size);
+ mp->b_wptr += LXNLMSG_ALIGN(size);
+ }
+
+ /* As in lx_netlink_reply_sendup, send as T_UNITDATA_IND message. */
+ if ((mp1 = lx_netlink_alloc_mp1(lxsock)) == NULL) {
+ freeb(mp);
+ return (ENOMEM);
+ }
+ mp1->b_cont = mp;
+
+ /*
+ * If the socket is currently flow-controlled, do not allow further
+ * data to be sent out. Messages of the NLMSG_DONE type, triggered by
+	 * passing msg == NULL, are exempt from this restriction.
+ */
+ mutex_enter(&lxsock->lxns_flowctl_mtx);
+ if (lxsock->lxns_flowctrled && msg != NULL) {
+ mutex_exit(&lxsock->lxns_flowctl_mtx);
+ freemsg(mp1);
+ return (ENOSPC);
+ }
+
+ lxsock->lxns_upcalls->su_recv(lxsock->lxns_uphandle, mp1,
+ msgdsize(mp1), 0, &error, NULL);
+
+ /*
+	 * The socket indicated that it is now flow-controlled. That said, it
+	 * still queued the last message, so we indicate success (but track
+	 * the flow-controlled state).
+ */
+ if (error == ENOSPC) {
+ lxsock->lxns_flowctrled = B_TRUE;
+ lx_netlink_flowctrld++;
+ error = 0;
+ }
+ mutex_exit(&lxsock->lxns_flowctl_mtx);
+
+ return (error);
+}
+
+static int
+lx_netlink_audit(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+ /*
+ * This is paranoia, in case our socket somehow escaped the zone.
+ */
+ if (curproc->p_zone->zone_brand != &lx_brand)
+ return (ECONNREFUSED);
+
+ if (MBLKL(mp) < sizeof (lx_netlink_hdr_t))
+ return (EINVAL);
+
+ /*
+ * Ensure audit state is setup whenever we get an audit control msg.
+ * However, we skip initialization for user messages since some apps
+ * (e.g. systemd) blindly send audit messages, even though auditing
+ * is not installed or in use. Uninitialized state is handled in
+ * lx_audit_user_msg().
+ */
+ if (hdr->lxnh_type < LX_AUDIT_USER_MSG_START)
+ lx_audit_init(lx_netlink_au_emit_cb);
+
+ /*
+ * Within Linux, when a netlink message requests an ack, the code
+ * first sends the ack as an error response (NLMSG_ERROR) with an
+ * error code of 0.
+ *
+ * TODO: this needs more work, but is unnecessary for now.
+ * if (hdr->lxnh_flags & LX_NETLINK_NLM_F_ACK) {
+ * reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_NLMSG_ERROR);
+ * if (reply == NULL)
+ * return (ENOMEM);
+ * lx_netlink_reply_ack(reply);
+ * }
+ */
+
+ if (hdr->lxnh_type >= LX_AUDIT_USER_MSG_START) {
+ return (lx_netlink_au_um(lxsock, hdr, mp));
+ }
+
+ switch (hdr->lxnh_type) {
+ case LX_AUDIT_GET:
+ return (lx_netlink_au_get(lxsock, hdr));
+ case LX_AUDIT_SET:
+ return (lx_netlink_au_set(lxsock, hdr, mp));
+ case LX_AUDIT_ADD_RULE:
+ return (lx_netlink_au_ar(lxsock, hdr, mp));
+ case LX_AUDIT_DEL_RULE:
+ return (lx_netlink_au_dr(lxsock, hdr, mp));
+ case LX_AUDIT_LIST_RULES:
+ return (lx_netlink_au_lr(lxsock, hdr));
+ case LX_AUDIT_GET_FEATURE:
+ return (lx_netlink_au_gf(lxsock, hdr));
+ }
+
+ /*
+ * For all other auditing messages (i.e. one we don't yet support), we
+ * return ECONNREFUSED.
+ */
+ return (ECONNREFUSED);
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_kobject_uevent(lx_netlink_sock_t *lxsock,
+ lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+ /*
+ * For udev, we just silently accept all writes and never actually
+ * reply with anything -- which appears to be sufficient for things
+ * to work.
+ */
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_send(sock_lower_handle_t handle, mblk_t *mp,
+ struct nmsghdr *msg, cred_t *cr)
+{
+ lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+ lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr;
+ int i, rval;
+
+ static struct {
+ int proto;
+ uint16_t type;
+ int (*func)(lx_netlink_sock_t *, lx_netlink_hdr_t *, mblk_t *);
+ } handlers[] = {
+ { LX_NETLINK_ROUTE,
+ LX_NETLINK_RTM_GETLINK, lx_netlink_getlink },
+ { LX_NETLINK_ROUTE,
+ LX_NETLINK_RTM_GETADDR, lx_netlink_getaddr },
+ { LX_NETLINK_ROUTE,
+ LX_NETLINK_RTM_GETROUTE, lx_netlink_getroute },
+ { LX_NETLINK_AUDIT,
+ LX_NETLINK_NLMSG_NONE, lx_netlink_audit },
+ { LX_NETLINK_KOBJECT_UEVENT,
+ LX_NETLINK_NLMSG_NONE, lx_netlink_kobject_uevent },
+ { LX_NETLINK_NLMSG_NOOP, LX_NETLINK_NLMSG_NONE, NULL }
+ };
+
+ if (msg->msg_name != NULL) {
+ lx_netlink_sockaddr_t *lxsa =
+ (lx_netlink_sockaddr_t *)msg->msg_name;
+
+ if (msg->msg_namelen != sizeof (lx_netlink_sockaddr_t) ||
+ lxsa->lxnl_family != AF_LX_NETLINK) {
+ return (EINVAL);
+ }
+
+ /*
+ * If this message is targeted beyond just the OS kernel, an
+ * access check must be made.
+ */
+ if (lxsa->lxnl_port != 0 || lxsa->lxnl_groups != 0) {
+ int err;
+ char buf[LX_UNSUP_BUFSZ];
+
+ err = lx_netlink_access(lxsock, cr, LXNL_SENDMSG);
+ if (err != 0) {
+ return (err);
+ }
+
+ /*
+ * Support for netlink messages beyond rtnetlink(7) is
+ * non-existent at this time. Such messages are tolerated
+ * (rather than rejected) to avoid tossing a potentially
+ * fatal error to the application.
+ */
+ (void) snprintf(buf, LX_UNSUP_BUFSZ,
+ "netlink sendmsg addr port:%X groups:%08X",
+ lxsa->lxnl_port, lxsa->lxnl_groups);
+ lx_unsupported(buf);
+ }
+ }
+
+ if (DB_TYPE(mp) != M_DATA || MBLKL(mp) < sizeof (lx_netlink_hdr_t)) {
+ freemsg(mp);
+ return (EPROTO);
+ }
+
+ for (i = 0; handlers[i].func != NULL; i++) {
+ if (lxsock->lxns_proto != handlers[i].proto)
+ continue;
+
+ if (handlers[i].type != LX_NETLINK_NLMSG_NONE &&
+ hdr->lxnh_type != handlers[i].type)
+ continue;
+
+ rval = handlers[i].func(lxsock, hdr, mp);
+ freemsg(mp);
+
+ return (rval);
+ }
+
+ /*
+ * An unrecognized message. We will bounce up an EOPNOTSUPP reply.
+ */
+ rval = lx_netlink_reply_error(lxsock, hdr, EOPNOTSUPP);
+ freemsg(mp);
+
+ return (rval);
+}
+
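+/*
+ * Downcall (sd_clr_flowctrl) invoked by the socket framework once the
+ * receive side has drained; clear our flow-controlled state so that
+ * message delivery (see the ENOSPC handling above) can resume.
+ */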
+static void
+lx_netlink_clr_flowctrl(sock_lower_handle_t handle)
+{
+ lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+
+ mutex_enter(&lxsock->lxns_flowctl_mtx);
+ lxsock->lxns_flowctrled = B_FALSE;
+ mutex_exit(&lxsock->lxns_flowctl_mtx);
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_close(sock_lower_handle_t handle, int flags, cred_t *cr)
+{
+ lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle, *sock, **prev;
+
+ if (lxsock->lxns_flags & LXNLF_AUDITD)
+ lx_audit_stop_worker(lxsock, lx_netlink_au_sock_cb);
+
+ mutex_enter(&lx_netlink_lock);
+
+ prev = &lx_netlink_head;
+
+ for (sock = *prev; sock != lxsock; sock = sock->lxns_next)
+ prev = &sock->lxns_next;
+
+ *prev = sock->lxns_next;
+
+ mutex_exit(&lx_netlink_lock);
+
+ (void) ldi_close(lxsock->lxns_iphandle, FREAD, kcred);
+ (void) ldi_close(lxsock->lxns_ip6handle, FREAD, kcred);
+ mutex_destroy(&lxsock->lxns_flowctl_mtx);
+ kmem_free(lxsock, sizeof (lx_netlink_sock_t));
+
+ return (0);
+}
+
+static sock_downcalls_t sock_lx_netlink_downcalls = {
+ lx_netlink_activate, /* sd_activate */
+ sock_accept_notsupp, /* sd_accept */
+ lx_netlink_bind, /* sd_bind */
+ sock_listen_notsupp, /* sd_listen */
+ sock_connect_notsupp, /* sd_connect */
+ sock_getpeername_notsupp, /* sd_getpeername */
+ lx_netlink_getsockname, /* sd_getsockname */
+ lx_netlink_getsockopt, /* sd_getsockopt */
+ lx_netlink_setsockopt, /* sd_setsockopt */
+ lx_netlink_send, /* sd_send */
+ NULL, /* sd_send_uio */
+ NULL, /* sd_recv_uio */
+ NULL, /* sd_poll */
+ sock_shutdown_notsupp, /* sd_shutdown */
+ lx_netlink_clr_flowctrl, /* sd_clr_flowctrl */
+ sock_ioctl_notsupp, /* sd_ioctl */
+ lx_netlink_close /* sd_close */
+};
+
+/*ARGSUSED*/
+static sock_lower_handle_t
+lx_netlink_create(int family, int type, int proto,
+ sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp,
+ int flags, cred_t *credp)
+{
+ lx_netlink_sock_t *lxsock;
+ ldi_handle_t handle, handle6;
+ cred_t *kcred = zone_kcred();
+ int err;
+
+ if (family != AF_LX_NETLINK ||
+ (type != SOCK_DGRAM && type != SOCK_RAW)) {
+ *errorp = EPROTONOSUPPORT;
+ return (NULL);
+ }
+
+ switch (proto) {
+ case LX_NETLINK_ROUTE:
+ case LX_NETLINK_AUDIT:
+ case LX_NETLINK_KOBJECT_UEVENT:
+ break;
+
+ default:
+ *errorp = EPROTONOSUPPORT;
+ return (NULL);
+ }
+
+ if ((err = ldi_open_by_name(DEV_IP, FREAD, kcred,
+ &handle, lx_netlink_ldi)) != 0) {
+ *errorp = err;
+ return (NULL);
+ }
+
+ if ((err = ldi_open_by_name(DEV_IP6, FREAD, kcred,
+ &handle6, lx_netlink_ldi)) != 0) {
+ (void) ldi_close(handle, FREAD, kcred);
+ *errorp = err;
+ return (NULL);
+ }
+
+ *sock_downcalls = &sock_lx_netlink_downcalls;
+ *smodep = SM_ATOMIC;
+
+ lxsock = kmem_zalloc(sizeof (lx_netlink_sock_t), KM_SLEEP);
+ lxsock->lxns_iphandle = handle;
+ lxsock->lxns_ip6handle = handle6;
+ lxsock->lxns_bufsize = lx_netlink_bufsize;
+ lxsock->lxns_proto = proto;
+ mutex_init(&lxsock->lxns_flowctl_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ mutex_enter(&lx_netlink_lock);
+
+ lxsock->lxns_next = lx_netlink_head;
+ lx_netlink_head = lxsock;
+
+ mutex_exit(&lx_netlink_lock);
+
+ return ((sock_lower_handle_t)lxsock);
+}
+
+static void
+lx_netlink_init(void)
+{
+ major_t major = mod_name_to_major("ip");
+ int err;
+
+ VERIFY(major != DDI_MAJOR_T_NONE);
+
+ err = ldi_ident_from_major(major, &lx_netlink_ldi);
+ VERIFY(err == 0);
+}
+
+static void
+lx_netlink_fini(void)
+{
+ ldi_ident_release(lx_netlink_ldi);
+}
+
+static smod_reg_t sinfo = {
+ SOCKMOD_VERSION,
+ "lx_netlink",
+ SOCK_UC_VERSION,
+ SOCK_DC_VERSION,
+ lx_netlink_create,
+ NULL
+};
+
+/* modldrv structure */
+static struct modlsockmod sockmod = {
+ &mod_sockmodops, "AF_LX_NETLINK socket module", &sinfo
+};
+
+/* modlinkage structure */
+static struct modlinkage ml = {
+ MODREV_1,
+ &sockmod,
+ NULL
+};
+
+int
+_init(void)
+{
+ int err;
+
+ lx_netlink_init();
+
+ if ((err = mod_install(&ml)) != 0)
+ lx_netlink_fini();
+
+ return (err);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&ml, modinfop));
+}
+
+int
+_fini(void)
+{
+ int err = 0;
+
+ mutex_enter(&lx_netlink_lock);
+
+ if (lx_netlink_head != NULL || lx_netlink_audit_cnt != 0)
+ err = EBUSY;
+
+ mutex_exit(&lx_netlink_lock);
+
+ if (err == 0) {
+ lx_audit_cleanup();
+ if ((err = mod_remove(&ml)) == 0)
+ lx_netlink_fini();
+ }
+
+ return (err);
+}
diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.c b/usr/src/uts/common/brand/lx/io/lx_ptm.c
new file mode 100644
index 0000000000..23e0c6f459
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/io/lx_ptm.c
@@ -0,0 +1,1188 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2016 Joyent, Inc. All rights reserved.
+ */
+
+
+/*
+ * This driver attempts to emulate some of the behaviors of
+ * Linux terminal devices (/dev/ptmx and /dev/pts/[0-9][0-9]*) on Solaris.
+ *
+ * It does this by layering over the /dev/ptmx device and intercepting
+ * opens to it.
+ *
+ * This driver makes the following assumptions about the way the ptm/pts
+ * drivers on Solaris work:
+ *
+ * - all opens of the /dev/ptmx device node return a unique dev_t.
+ *
+ * - the dev_t minor node value for each open ptm instance corresponds
+ * to its associated slave terminal device number, i.e. the path to
+ * the slave terminal device associated with an open ptm instance
+ * whose dev_t minor node value is 5 is /dev/pts/5.
+ *
+ * - the ptm driver always allocates the lowest numbered slave terminal
+ * device possible.
+ */
+
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/devops.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/kstr.h>
+#include <sys/lx_ptm.h>
+#include <sys/modctl.h>
+#include <sys/pathname.h>
+#include <sys/ptms.h>
+#include <sys/ptyvar.h>
+#include <sys/stat.h>
+#include <sys/stropts.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/sdt.h>
+
+#define LP_PTM_PATH "/dev/ptmx"
+#define LP_PTS_PATH "/dev/pts/"
+#define LP_PTS_DRV_NAME "pts"
+#define LP_PTS_USEC_DELAY (5 * 1000) /* 5 ms */
+#define LP_PTS_USEC_DELAY_MAX (5 * MILLISEC) /* 5 ms */
+
+/*
+ * this driver is layered on top of the ptm driver. we'd like to
+ * make this driver's minor name space a mirror of the ptm driver's
+ * namespace, but we can't actually do this. the reason is that the
+ * ptm driver is opened via the clone driver, therefore no minor nodes
+ * of the ptm driver are actually accessible via the filesystem.
+ * since we're not a streams device we can't be opened by the clone
+ * driver, therefore we need to have at least one minor node accessible
+ * via the filesystem so that consumers can open it. we use the device
+ * node with a minor number of 0 for this purpose. what this means is
+ * that minor node 0 can't be used to map ptm minor node 0. since this
+ * minor node is now reserved we need to shift our ptm minor node
+ * mappings by one. i.e. a ptm minor node with a value of 0 will
+ * correspond to our minor node with a value of 1. these mappings are
+ * managed with the following macros; a short example follows them.
+ */
+#define DEVT_TO_INDEX(x) LX_PTM_DEV_TO_PTS(x)
+#define INDEX_TO_MINOR(x) ((x) + 1)
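+
+/*
+ * a short example of the mapping (a sketch; "pdev" stands for a dev_t
+ * returned by an open of the underlying ptm driver and "our_major" for
+ * this driver's major number, both hypothetical names):
+ *
+ *	index = getminor(pdev);
+ *	lx_dev = makedevice(our_major, INDEX_TO_MINOR(index));
+ *	ASSERT(DEVT_TO_INDEX(lx_dev) == index);
+ *
+ * i.e. ptm minor 5 is presented as lx_ptm minor 6, while its slave
+ * remains /dev/pts/5.
+ */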
+
+/*
+ * grow our layered handle array by the same size increment that the ptm
+ * driver uses to grow the pty device space - PTY_MAXDELTA
+ */
+#define LP_PTY_INC 128
+
+/*
+ * lx_ptm_ops contains state information about outstanding operations on the
+ * underlying master terminal device. Currently we only track information
+ * for read operations.
+ *
+ * Note that this data has not been rolled directly into the lx_ptm_handle
+ * structure because we can't put mutexes or condition variables into the
+ * lx_ptm_handle structure. The reason is that the array of lx_ptm_handle
+ * structures linked to from the global lx_ptm state can be resized
+ * dynamically, and when it's resized, the new array is at a different
+ * memory location and the old array memory is discarded. Mutexes and cvs
+ * are accessed by their address, so if this array were resized while
+ * there were outstanding operations on any mutexes or cvs in the array
+ * then the system would tip over. In the future the lx_ptm_handle structure
+ * array should probably be replaced with either an array of pointers to
+ * lx_ptm_handle structures or some other kind of data structure containing
+ * pointers to lx_ptm_handle structures. Then the lx_ptm_ops structure
+ * could be folded directly into the lx_ptm_handle structures. (This will
+ * also require the definition of a new locking mechanism to protect the
+ * contents of lx_ptm_handle structures.)
+ */
+typedef struct lx_ptm_ops {
+ int lpo_rops;
+ kcondvar_t lpo_rops_cv;
+ kmutex_t lpo_rops_lock;
+} lx_ptm_ops_t;
+
+/*
+ * Every open of the master terminal device in a zone results in a new
+ * lx_ptm_handle handle allocation. These handles are stored in an array
+ * hanging off the lx_ptm_state structure.
+ */
+typedef struct lx_ptm_handle {
+ /* Device handle to the underlying real /dev/ptmx master terminal. */
+ ldi_handle_t lph_handle;
+
+ /* Flag to indicate if TIOCPKT mode has been enabled. */
+ int lph_pktio;
+
+ /* Number of times the slave device has been opened/closed. */
+ int lph_eofed;
+
+ /* Callback handler in the ptm driver to check if slave is open. */
+ ptmptsopencb_t lph_ppocb;
+
+ /* Pointer to state for operations on underlying device. */
+ lx_ptm_ops_t *lph_lpo;
+} lx_ptm_handle_t;
+
+/*
+ * Global state for the lx_ptm driver.
+ */
+typedef struct lx_ptm_state {
+ /* lx_ptm device devinfo pointer */
+ dev_info_t *lps_dip;
+
+ /* LDI ident used to open underlying real /dev/ptmx master terminals. */
+ ldi_ident_t lps_li;
+
+ /* pts driver's major number */
+ major_t lps_pts_major;
+
+ /* rw lock used to manage access and growth of lps_lh_array */
+ krwlock_t lps_lh_rwlock;
+
+ /* number of elements in lps_lh_array */
+ uint_t lps_lh_count;
+
+ /* Array of handles to underlying real /dev/ptmx master terminals. */
+ lx_ptm_handle_t *lps_lh_array;
+} lx_ptm_state_t;
+
+/* Pointer to the lx_ptm global state structure. */
+static lx_ptm_state_t lps;
+
+/*
+ * List of modules to be autopushed onto slave terminal devices when they
+ * are opened in an lx branded zone.
+ */
+static char *lx_pts_mods[] = {
+ "ptem",
+ "ldterm",
+ "ttcompat",
+ NULL
+};
+
+static void
+lx_ptm_lh_grow(uint_t index)
+{
+ uint_t new_lh_count, old_lh_count;
+ lx_ptm_handle_t *new_lh_array, *old_lh_array;
+
+ /*
+ * allocate a new array. we drop the rw lock on the array so that
+ * readers can still access devices in case our memory allocation
+ * blocks.
+ */
+ new_lh_count = MAX(lps.lps_lh_count + LP_PTY_INC, index + 1);
+ new_lh_array =
+ kmem_zalloc(sizeof (lx_ptm_handle_t) * new_lh_count, KM_SLEEP);
+
+ /*
+ * double check that we still actually need to increase the size
+ * of the array
+ */
+ rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+ if (index < lps.lps_lh_count) {
+ /* someone beat us to it so there's nothing more to do */
+ rw_exit(&lps.lps_lh_rwlock);
+ kmem_free(new_lh_array,
+ sizeof (lx_ptm_handle_t) * new_lh_count);
+ return;
+ }
+
+ /* copy the existing data into the new array */
+ ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL));
+ ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL));
+ if (lps.lps_lh_count != 0) {
+ bcopy(lps.lps_lh_array, new_lh_array,
+ sizeof (lx_ptm_handle_t) * lps.lps_lh_count);
+ }
+
+ /* save info on the old array */
+ old_lh_array = lps.lps_lh_array;
+ old_lh_count = lps.lps_lh_count;
+
+ /* install the new array */
+ lps.lps_lh_array = new_lh_array;
+ lps.lps_lh_count = new_lh_count;
+
+ rw_exit(&lps.lps_lh_rwlock);
+
+ /* free the old array */
+ if (old_lh_array != NULL) {
+ kmem_free(old_lh_array,
+ sizeof (lx_ptm_handle_t) * old_lh_count);
+ }
+}
+
+static void
+lx_ptm_lh_insert(uint_t index, ldi_handle_t lh)
+{
+ lx_ptm_ops_t *lpo;
+
+ ASSERT(lh != NULL);
+
+ /* Allocate and initialize the ops structure */
+ lpo = kmem_zalloc(sizeof (lx_ptm_ops_t), KM_SLEEP);
+ mutex_init(&lpo->lpo_rops_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&lpo->lpo_rops_cv, NULL, CV_DEFAULT, NULL);
+
+ rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+ /* check if we need to grow the size of the layered handle array */
+ if (index >= lps.lps_lh_count) {
+ rw_exit(&lps.lps_lh_rwlock);
+ lx_ptm_lh_grow(index);
+ rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+ }
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_handle == NULL);
+ ASSERT(lps.lps_lh_array[index].lph_pktio == 0);
+ ASSERT(lps.lps_lh_array[index].lph_eofed == 0);
+ ASSERT(lps.lps_lh_array[index].lph_lpo == NULL);
+
+ /* insert the new handle and return */
+ lps.lps_lh_array[index].lph_handle = lh;
+ lps.lps_lh_array[index].lph_pktio = 0;
+ lps.lps_lh_array[index].lph_eofed = 0;
+ lps.lps_lh_array[index].lph_lpo = lpo;
+
+ rw_exit(&lps.lps_lh_rwlock);
+}
+
+static ldi_handle_t
+lx_ptm_lh_remove(uint_t index)
+{
+ ldi_handle_t lh;
+
+ rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+ ASSERT(lps.lps_lh_array[index].lph_lpo->lpo_rops == 0);
+ ASSERT(!MUTEX_HELD(&lps.lps_lh_array[index].lph_lpo->lpo_rops_lock));
+
+ /* free the ops structure */
+ kmem_free(lps.lps_lh_array[index].lph_lpo, sizeof (lx_ptm_ops_t));
+ lps.lps_lh_array[index].lph_lpo = NULL;
+
+ /* remove the handle and return it */
+ lh = lps.lps_lh_array[index].lph_handle;
+ lps.lps_lh_array[index].lph_handle = NULL;
+ lps.lps_lh_array[index].lph_pktio = 0;
+ lps.lps_lh_array[index].lph_eofed = 0;
+ rw_exit(&lps.lps_lh_rwlock);
+ return (lh);
+}
+
+static void
+lx_ptm_lh_get_ppocb(uint_t index, ptmptsopencb_t *ppocb)
+{
+ rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+ *ppocb = lps.lps_lh_array[index].lph_ppocb;
+ rw_exit(&lps.lps_lh_rwlock);
+}
+
+static void
+lx_ptm_lh_set_ppocb(uint_t index, ptmptsopencb_t *ppocb)
+{
+ rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+ lps.lps_lh_array[index].lph_ppocb = *ppocb;
+ rw_exit(&lps.lps_lh_rwlock);
+}
+
+static ldi_handle_t
+lx_ptm_lh_lookup(uint_t index)
+{
+ ldi_handle_t lh;
+
+ rw_enter(&lps.lps_lh_rwlock, RW_READER);
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+ /* return the handle */
+ lh = lps.lps_lh_array[index].lph_handle;
+ rw_exit(&lps.lps_lh_rwlock);
+ return (lh);
+}
+
+static lx_ptm_ops_t *
+lx_ptm_lpo_lookup(uint_t index)
+{
+ lx_ptm_ops_t *lpo;
+
+ rw_enter(&lps.lps_lh_rwlock, RW_READER);
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_lpo != NULL);
+
+ /* return the ops structure */
+ lpo = lps.lps_lh_array[index].lph_lpo;
+ rw_exit(&lps.lps_lh_rwlock);
+ return (lpo);
+}
+
+static int
+lx_ptm_lh_pktio_get(uint_t index)
+{
+ int pktio;
+
+ rw_enter(&lps.lps_lh_rwlock, RW_READER);
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+ /* return the pktio state */
+ pktio = lps.lps_lh_array[index].lph_pktio;
+ rw_exit(&lps.lps_lh_rwlock);
+ return (pktio);
+}
+
+static void
+lx_ptm_lh_pktio_set(uint_t index, int pktio)
+{
+ rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+ /* set the pktio state */
+ lps.lps_lh_array[index].lph_pktio = pktio;
+ rw_exit(&lps.lps_lh_rwlock);
+}
+
+static int
+lx_ptm_lh_eofed_get(uint_t index)
+{
+ int eofed;
+
+ rw_enter(&lps.lps_lh_rwlock, RW_READER);
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+ /* return the eofed state */
+ eofed = lps.lps_lh_array[index].lph_eofed;
+ rw_exit(&lps.lps_lh_rwlock);
+ return (eofed);
+}
+
+static void
+lx_ptm_lh_eofed_set(uint_t index)
+{
+ rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+ ASSERT(index < lps.lps_lh_count);
+ ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+ /* set the eofed state */
+ lps.lps_lh_array[index].lph_eofed++;
+ rw_exit(&lps.lps_lh_rwlock);
+}
+
+static int
+lx_ptm_read_start(dev_t dev)
+{
+ lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev));
+
+ mutex_enter(&lpo->lpo_rops_lock);
+ ASSERT(lpo->lpo_rops >= 0);
+
+ /* Wait for other read operations to finish */
+ while (lpo->lpo_rops != 0) {
+ if (cv_wait_sig(&lpo->lpo_rops_cv, &lpo->lpo_rops_lock) == 0) {
+ mutex_exit(&lpo->lpo_rops_lock);
+ return (-1);
+ }
+ }
+
+ /* Start a read operation */
+ VERIFY(++lpo->lpo_rops == 1);
+ mutex_exit(&lpo->lpo_rops_lock);
+ return (0);
+}
+
+static void
+lx_ptm_read_end(dev_t dev)
+{
+ lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev));
+
+ mutex_enter(&lpo->lpo_rops_lock);
+ ASSERT(lpo->lpo_rops >= 0);
+
+ /* End a read operation */
+ VERIFY(--lpo->lpo_rops == 0);
+ cv_signal(&lpo->lpo_rops_cv);
+
+ mutex_exit(&lpo->lpo_rops_lock);
+}
+
+static int
+lx_ptm_pts_isopen(dev_t dev)
+{
+ ptmptsopencb_t ppocb;
+
+ lx_ptm_lh_get_ppocb(DEVT_TO_INDEX(dev), &ppocb);
+ return (ppocb.ppocb_func(ppocb.ppocb_arg));
+}
+
+static void
+lx_ptm_eof_read(ldi_handle_t lh)
+{
+ struct uio uio;
+ iovec_t iov;
+ char junk[1];
+
+ /*
+ * We can remove any EOF message from the head of the stream by
+ * doing a zero byte read from the stream.
+ */
+ iov.iov_len = 0;
+ iov.iov_base = junk;
+ uio.uio_iovcnt = 1;
+ uio.uio_iov = &iov;
+ uio.uio_resid = iov.iov_len;
+ uio.uio_offset = 0;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_fmode = 0;
+ uio.uio_extflg = 0;
+ uio.uio_llimit = MAXOFFSET_T;
+ (void) ldi_read(lh, &uio, kcred);
+}
+
+static int
+lx_ptm_eof_drop_1(dev_t dev, int *rvalp)
+{
+ ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+ int err, msg_size, msg_count;
+
+ *rvalp = 0;
+
+ /*
+ * Check if there is an EOF message (represented by a zero length
+ * data message) at the head of the stream. Note that the
+ * I_NREAD ioctl is a streams framework ioctl so it will succeed
+ * even if there have been previous write errors on this stream.
+ */
+ if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size,
+ FKIOCTL, kcred, &msg_count)) != 0)
+ return (err);
+
+ if ((msg_count == 0) || (msg_size != 0)) {
+ /* No EOF message found */
+ return (0);
+ }
+
+ /* Record the fact that the slave device has been closed. */
+ lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev));
+
+ /* drop the EOF */
+ lx_ptm_eof_read(lh);
+ *rvalp = 1;
+ return (0);
+}
+
+static int
+lx_ptm_eof_drop(dev_t dev, int *rvalp)
+{
+ int rval, err;
+
+ if (rvalp != NULL)
+ *rvalp = 0;
+ for (;;) {
+ if ((err = lx_ptm_eof_drop_1(dev, &rval)) != 0)
+ return (err);
+ if (rval == 0)
+ return (0);
+ if (rvalp != NULL)
+ *rvalp = 1;
+ }
+}
+
+static int
+lx_ptm_data_check(dev_t dev, int ignore_eof, int *rvalp)
+{
+ ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+ int err;
+
+ *rvalp = 0;
+ if (ignore_eof) {
+ int size, rval;
+
+ if ((err = ldi_ioctl(lh, FIONREAD, (intptr_t)&size,
+ FKIOCTL, kcred, &rval)) != 0)
+ return (err);
+ if (size != 0)
+ *rvalp = 1;
+ } else {
+ int msg_size, msg_count;
+
+ if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size,
+ FKIOCTL, kcred, &msg_count)) != 0)
+ return (err);
+ if (msg_count != 0)
+ *rvalp = 1;
+ }
+ return (0);
+}
+
+static int
+lx_ptm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int err;
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (ddi_create_minor_node(dip, LX_PTM_MINOR_NODE, S_IFCHR,
+ ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+
+ err = ldi_ident_from_dip(dip, &lps.lps_li);
+ if (err != 0) {
+ ddi_remove_minor_node(dip, ddi_get_name(dip));
+ return (DDI_FAILURE);
+ }
+
+ lps.lps_dip = dip;
+ lps.lps_pts_major = ddi_name_to_major(LP_PTS_DRV_NAME);
+
+ rw_init(&lps.lps_lh_rwlock, NULL, RW_DRIVER, NULL);
+ lps.lps_lh_count = 0;
+ lps.lps_lh_array = NULL;
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+lx_ptm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ ldi_ident_release(lps.lps_li);
+ lps.lps_dip = NULL;
+
+ ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL));
+ ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL));
+ if (lps.lps_lh_array != NULL) {
+ kmem_free(lps.lps_lh_array,
+ sizeof (lx_ptm_handle_t) * lps.lps_lh_count);
+ lps.lps_lh_array = NULL;
+ lps.lps_lh_count = 0;
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+lx_ptm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+ struct strioctl iocb;
+ ptmptsopencb_t ppocb = { NULL, NULL };
+ ldi_handle_t lh;
+ major_t maj, our_major = getmajor(*devp);
+ minor_t min, lastmin;
+ uint_t index, anchor = 1;
+ dev_t ptm_dev;
+ int err, rval = 0;
+
+ /*
+ * Don't support the FNDELAY and FNONBLOCK flags until we either
+ * find a Linux app that opens /dev/ptmx with the O_NDELAY
+ * or O_NONBLOCK flags explicitly, or until we create test cases
+ * to determine how reads of master terminal devices opened with
+ * these flags behave in different situations on Linux. Supporting
+ * these flags will involve enhancing our read implementation
+ * and changing the way it deals with EOF notifications.
+ */
+ if (flag & (FNDELAY | FNONBLOCK))
+ return (ENOTSUP);
+
+ /*
+ * we're layered on top of the ptm driver so open that driver
+ * first. (note that we're opening /dev/ptmx in the global
+ * zone, not ourselves in the lx zone.)
+ */
+ err = ldi_open_by_name(LP_PTM_PATH, flag, credp, &lh, lps.lps_li);
+ if (err != 0)
+ return (err);
+
+ /* get the devt returned by the ptmx open */
+ err = ldi_get_dev(lh, &ptm_dev);
+ if (err != 0) {
+ (void) ldi_close(lh, flag, credp);
+ return (err);
+ }
+
+ /*
+ * we're a cloning driver so here's where we'll change the devt that we
+ * return. the ptmx is also a cloning driver so we'll just use
+ * its minor number as our minor number (it already manages its
+ * minor name space so no reason to duplicate the effort.)
+ */
+ index = getminor(ptm_dev);
+ *devp = makedevice(our_major, INDEX_TO_MINOR(index));
+
+ /* Get a callback function to query if the pts device is open. */
+ iocb.ic_cmd = PTMPTSOPENCB;
+ iocb.ic_timout = 0;
+ iocb.ic_len = sizeof (ppocb);
+ iocb.ic_dp = (char *)&ppocb;
+
+ err = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, kcred, &rval);
+ if ((err != 0) || (rval != 0)) {
+ (void) ldi_close(lh, flag, credp);
+ return (EIO); /* XXX return something else here? */
+ }
+ ASSERT(ppocb.ppocb_func != NULL);
+
+ /*
+ * now set up autopush for the terminal slave device. this is
+ * necessary so that when a Linux program opens the device we
+ * can push required strmod modules onto the stream. in Solaris
+ * this is normally done by the application that actually
+ * allocates the terminal.
+ */
+ maj = lps.lps_pts_major;
+ min = index;
+ lastmin = 0;
+ err = kstr_autopush(SET_AUTOPUSH, &maj, &min, &lastmin,
+ &anchor, lx_pts_mods);
+ if (err != 0 && err != EEXIST) {
+ (void) ldi_close(lh, flag, credp);
+ return (EIO); /* XXX return something else here? */
+ }
+
+ /* save off this layered handle for future accesses */
+ lx_ptm_lh_insert(index, lh);
+ lx_ptm_lh_set_ppocb(index, &ppocb);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+lx_ptm_close(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+ ldi_handle_t lh;
+ major_t maj;
+ minor_t min, lastmin;
+ uint_t index;
+ int err;
+ int i;
+
+ index = DEVT_TO_INDEX(dev);
+
+ /*
+ * we must clean up all the state associated with this major/minor
+ * terminal pair before actually closing the ptm master device.
+ * this is required because once the close of the ptm device is
+ * complete, the major/minor terminal pair is immediately available for
+ * re-use in any zone.
+ */
+
+ /* free up our saved reference for this layered handle */
+ lh = lx_ptm_lh_remove(index);
+
+ /* unconfigure autopush for the associated terminal slave device */
+ maj = lps.lps_pts_major;
+ min = index;
+ lastmin = 0;
+ for (i = 0; i < 5; i++) {
+ /*
+ * we loop here because we don't want to release this ptm
+ * node if autopush can't be disabled on the associated
+ * slave device, since bad things could happen if another
+ * brand were to get this terminal allocated to it. If we
+ * keep failing we eventually drive on so that things don't
+ * hang.
+ */
+ err = kstr_autopush(CLR_AUTOPUSH, &maj, &min, &lastmin,
+ 0, NULL);
+ if (err == 0)
+ break;
+
+ cmn_err(CE_WARN, "lx zoneid %d: error %d on kstr_autopush",
+ getzoneid(), err);
+
+ /* wait one second and try again */
+ delay(drv_usectohz(1000000));
+ }
+
+ err = ldi_close(lh, flag, credp);
+
+ /*
+ * note that we don't have to bother with changing the permissions
+ * on the associated slave device here. the reason is that no one
+ * can actually open the device until its associated master
+ * device is re-opened, which will result in the permissions on
+ * it being reset.
+ */
+ return (err);
+}
+
+static int
+lx_ptm_read_loop(dev_t dev, struct uio *uiop, cred_t *credp, int *loop)
+{
+ ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+ int err, rval;
+ struct uio uio = *uiop;
+
+ *loop = 0;
+
+ /*
+ * Here's another way that Linux master terminals behave differently
+ * from Solaris master terminals. If you do a read on a Linux
+ * master terminal (that was opened without NDELAY and NONBLOCK)
+ * whose corresponding slave terminal is currently closed and
+ * has been opened and closed at least once, Linux returns -1 and
+ * sets errno to EIO whereas Solaris blocks.
+ */
+ if (lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev))) {
+ /* Slave has been opened and closed at least once. */
+ if (lx_ptm_pts_isopen(dev) == 0) {
+ /*
+ * Slave is closed. Make sure that data is available
+ * before attempting a read.
+ */
+ if ((err = lx_ptm_data_check(dev, 0, &rval)) != 0)
+ return (err);
+
+ /* If there is no data available then return. */
+ if (rval == 0)
+ return (EIO);
+ }
+ }
+
+ /* Actually do the read operation. */
+ if ((err = ldi_read(lh, uiop, credp)) != 0)
+ return (err);
+
+ /* If read returned actual data then return. */
+ if (uio.uio_resid != uiop->uio_resid)
+ return (0);
+
+ /*
+ * This was a zero byte read (i.e., an EOF). This indicates
+ * that the slave terminal device has been closed. Record
+ * the fact that the slave device has been closed and retry
+ * the read operation.
+ */
+ lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev));
+ *loop = 1;
+ return (0);
+}
+
+static int
+lx_ptm_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int pktio = lx_ptm_lh_pktio_get(DEVT_TO_INDEX(dev));
+ int err, loop;
+ struct uio uio;
+ struct iovec iovp;
+
+ ASSERT(uiop->uio_iovcnt > 0);
+
+ /*
+ * If packet mode has been enabled (via TIOCPKT) we need to pad
+ * all read requests with a leading byte that indicates any
+ * relevant control status information (a user-level sketch
+ * follows this function).
+ */
+ if (pktio != 0) {
+ /*
+ * We'd like to write the control information into
+ * the current buffer but we can't yet. We don't
+ * want to modify userspace memory here only to have
+ * the read operation fail later. So instead
+ * what we'll do here is read one character from the
+ * beginning of the memory pointed to by the uio
+ * structure. This will advance the output pointer
+ * by one. Then when the read completes successfully
+ * we can update the byte that we passed over. Before
+ * we do the read make a copy of the current uiop and
+ * iovec structs so we can write to them later.
+ */
+ uio = *uiop;
+ iovp = *uiop->uio_iov;
+ uio.uio_iov = &iovp;
+
+ if (uwritec(uiop) == -1)
+ return (EFAULT);
+ }
+
+ do {
+ /*
+ * Before we actually attempt a read operation we need
+ * to make sure there's some buffer space to actually
+ * read in some data. We do this because if we're in
+ * pktio mode and the caller only requested one byte,
+ * then we've already used up that one byte and we
+ * don't want to pass this read request along. Doing a 0
+ * byte read (unless there is a problem with the stream
+ * head) always returns success. Normally when a streams
+ * read returns 0 bytes we interpret that as an EOF on
+ * the stream (ie, the slave side has been opened and
+ * closed) and we ignore it and re-try the read operation.
+ * So if we pass on a 0 byte read here lx_ptm_read_loop()
+ * will tell us to loop around and we'll end up in an
+ * infinite loop.
+ */
+ if (uiop->uio_resid == 0)
+ break;
+
+ /*
+ * Serialize all reads. We need to do this so that we can
+ * properly emulate the behavior of master terminals on Linux.
+ * In reality this serializaion should not pose any kind of
+ * performance problem since it would be very strange to have
+ * multiple threads trying to read from the same master
+ * terminal device concurrently.
+ */
+ if (lx_ptm_read_start(dev) != 0)
+ return (EINTR);
+
+ err = lx_ptm_read_loop(dev, uiop, credp, &loop);
+ lx_ptm_read_end(dev);
+ if (err != 0)
+ return (err);
+ } while (loop != 0);
+
+ if (pktio != 0) {
+ uint8_t pktio_data = TIOCPKT_DATA;
+
+ /*
+ * Note that the control status information we
+ * pass back is faked up in the sense that we
+ * don't actually report any events; we always
+ * report a status of 0.
+ */
+ if (uiomove(&pktio_data, 1, UIO_READ, &uio) != 0)
+ return (EFAULT);
+ }
+
+ return (0);
+}
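+
+/*
+ * To illustrate the pktio behavior from a Linux application's point of
+ * view (a hypothetical user-level sketch, not driver code; "masterfd"
+ * and "handle_data" are made-up names):
+ *
+ *	char buf[128];
+ *	ssize_t n = read(masterfd, buf, sizeof (buf));
+ *
+ *	if (n > 0 && buf[0] == TIOCPKT_DATA)
+ *		handle_data(buf + 1, n - 1);
+ *
+ * Since we always report a status byte of 0 (TIOCPKT_DATA), the leading
+ * byte never signals flow-control events the way it can on Linux.
+ */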
+
+static int
+lx_ptm_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+ int err;
+
+ err = ldi_write(lh, uiop, credp);
+
+ return (err);
+}
+
+static int
+lx_ptm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rvalp)
+{
+ ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+ int err;
+
+ /*
+ * here we need to make sure that we never allow the
+ * I_SETSIG and I_ESETSIG ioctls to pass through. we
+ * do this because we can't support them.
+ *
+ * the native Solaris ptm device supports these ioctls because
+ * they are streams framework ioctls and all streams devices
+ * support them by default. these ioctls cause the current
+ * process to be registered with a stream and receive signals
+ * when certain stream events occur.
+ *
+ * a problem arises with cleanup of these registrations
+ * for layered drivers.
+ *
+ * normally the streams framework is notified whenever a
+ * process closes any reference to a stream and it goes ahead
+ * and cleans up these registrations. but actual device drivers
+ * are not notified when a process performs a close operation
+ * unless the process is closing the last opened reference to
+ * the device on the entire system.
+ *
+ * so while we could pass these ioctls on and allow processes
+ * to register for signal delivery, we would never receive
+ * any notification when those processes exit (or close a
+ * stream) and we wouldn't be able to unregister them.
+ *
+ * luckily these operations are streams specific and Linux
+ * doesn't support streams devices. so it doesn't actually
+ * seem like we need to support these ioctls. if it turns
+ * out that we do need to support them for some reason in
+ * the future, the current driver model will have to be
+ * enhanced to better support streams device layering.
+ */
+ if ((cmd == I_SETSIG) || (cmd == I_ESETSIG))
+ return (EINVAL);
+
+ /*
+ * here we fake up support for TIOCPKT. Linux applications expect
+ * /dev/ptmx to support this ioctl, but on Solaris it doesn't.
+ * (it is supported on older bsd style ptys.) so we'll fake
+ * up support for it here.
+ *
+ * the reason that this ioctl is emulated here instead of in
+ * userland is that this ioctl affects the results returned
+ * from read() operations. if this ioctl was emulated in
+ * userland the brand library would need to intercept all
+ * read operations and check to see if pktio was enabled
+ * for the fd being read from. since this ioctl only needs
+ * to be supported on the ptmx device it makes more sense
+ * to support it here where we can easily update the results
+ * returned for read() operations performed on ourselves.
+ */
+ if (cmd == TIOCPKT) {
+ int pktio;
+
+ if (ddi_copyin((void *)arg, &pktio, sizeof (pktio),
+ mode) != DDI_SUCCESS)
+ return (EFAULT);
+
+ if (pktio == 0)
+ lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 0);
+ else
+ lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 1);
+
+ return (0);
+ }
+
+ err = ldi_ioctl(lh, cmd, arg, mode, credp, rvalp);
+
+ /*
+ * On recent versions of Linux some apps issue the following ioctls to
+ * the master side of the ptm before opening the slave side. Because
+ * our streams modules (specifically ptem) aren't autopushed until the
+ * slave side has been opened, these ioctls will fail. To alleviate the
+ * issue we simply pretend that these ioctls have succeeded.
+ *
+ * We could push our own "lx_ptem" module onto the master side of the
+ * stream in lx_ptm_open if we need better emulation, but that would
+ * require an "lx_ptem" module which duplicates most of ptem. ptem
+ * doesn't work properly when pushed on the master side.
+ */
+ if (err == EINVAL && (cmd == TIOCSWINSZ || cmd == TCSETS) &&
+ lx_ptm_pts_isopen(dev) == 0) {
+ /* slave side not open, assume we need to succeed */
+ DTRACE_PROBE1(lx_ptm_ioctl__override, int, cmd);
+ return (0);
+ }
+
+ return (err);
+}
+
+static int
+lx_ptm_poll_loop(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp, int *loop)
+{
+ ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+ short reventsp2;
+ int err, rval;
+
+ *loop = 0;
+
+ /*
+ * If the slave device has been opened and closed at least
+ * once and the slave device is currently closed, then poll
+ * always needs to return immediately.
+ */
+ if ((lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev)) != 0) &&
+ (lx_ptm_pts_isopen(dev) == 0)) {
+ /* In this case always return POLLHUP */
+ *reventsp = POLLHUP;
+
+ /*
+ * Check if there really is data on the stream.
+ * If so set the correct return flags.
+ */
+ if ((err = lx_ptm_data_check(dev, 1, &rval)) != 0) {
+ /* Something went wrong. */
+ return (err);
+ }
+ if (rval != 0)
+ *reventsp |= (events & (POLLIN | POLLRDNORM));
+
+ /*
+ * Is the user checking for writability? Note that for ptm
+ * devices Linux seems to ignore the POLLWRBAND write flag.
+ */
+ if ((events & POLLWRNORM) == 0)
+ return (0);
+
+ /*
+ * To check if the stream is writable we have to actually
+ * call poll, but make sure to set anyyet to 1 to prevent
+ * the streams framework from setting up callbacks.
+ */
+ if ((err = ldi_poll(lh, POLLWRNORM, 1, &reventsp2, NULL)) != 0)
+ return (err);
+
+ *reventsp |= (reventsp2 & POLLWRNORM);
+ } else {
+ int lockstate;
+
+ /* The slave device is open, do the poll */
+ if ((err = ldi_poll(lh, events, anyyet, reventsp, phpp)) != 0)
+ return (err);
+
+ /*
+ * Drop any leading EOFs on the stream.
+ *
+ * Note that we have to use pollunlock() here to avoid
+ * recursive mutex enters in the poll framework. The
+ * reason is that if there is an EOF message on the stream
+ * then the act of reading from the queue to remove the
+ * message can cause the ptm driver's event service
+ * routine to be invoked, and if there is no open
+ * slave device then the ptm driver may generate
+ * error messages and put them on the stream. This
+ * in turn will generate a poll event and the poll
+ * framework will try to invoke any poll callbacks
+ * associated with the stream. In the process of
+ * doing that the poll framework will try to acquire
+ * locks that we are already holding. So we need to
+ * drop those locks here before we do our read.
+ */
+ if (pollunlock(&lockstate) != 0) {
+ *reventsp = POLLNVAL;
+ return (0);
+ }
+ err = lx_ptm_eof_drop(dev, &rval);
+ pollrelock(lockstate);
+ if (err)
+ return (err);
+
+ /* If no EOF was dropped then return */
+ if (rval == 0)
+ return (0);
+
+ /*
+ * An EOF was removed from the stream. Retry the entire
+ * poll operation from the top because polls on the ptm
+ * device should behave differently now.
+ */
+ *loop = 1;
+ }
+ return (0);
+}
+
+static int
+lx_ptm_poll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ int loop, err;
+
+ do {
+ /* Serialize ourselves wrt read operations. */
+ if (lx_ptm_read_start(dev) != 0)
+ return (EINTR);
+
+ err = lx_ptm_poll_loop(dev,
+ events, anyyet, reventsp, phpp, &loop);
+ lx_ptm_read_end(dev);
+ if (err != 0)
+ return (err);
+ } while (loop != 0);
+ return (0);
+}
+
+static struct cb_ops lx_ptm_cb_ops = {
+ lx_ptm_open, /* open */
+ lx_ptm_close, /* close */
+ nodev, /* strategy */
+ nodev, /* print */
+ nodev, /* dump */
+ lx_ptm_read, /* read */
+ lx_ptm_write, /* write */
+ lx_ptm_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ lx_ptm_poll, /* chpoll */
+ ddi_prop_op, /* prop_op */
+ NULL, /* cb_str */
+ D_NEW | D_MP,
+ CB_REV,
+ NULL,
+ NULL
+};
+
+static struct dev_ops lx_ptm_ops = {
+ DEVO_REV,
+ 0,
+ ddi_getinfo_1to1,
+ nulldev,
+ nulldev,
+ lx_ptm_attach,
+ lx_ptm_detach,
+ nodev,
+ &lx_ptm_cb_ops,
+ NULL,
+ NULL,
+ ddi_quiesce_not_needed, /* quiesce */
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops, /* type of module */
+ "Linux master terminal driver", /* description of module */
+ &lx_ptm_ops /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modldrv,
+ NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.conf b/usr/src/uts/common/brand/lx/io/lx_ptm.conf
new file mode 100644
index 0000000000..481b4e3c74
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/io/lx_ptm.conf
@@ -0,0 +1,27 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+name="lx_ptm" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/brand/lx/os/lx_acct.c b/usr/src/uts/common/brand/lx/os/lx_acct.c
new file mode 100644
index 0000000000..7f38a240ab
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_acct.c
@@ -0,0 +1,198 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/proc.h>
+#include <sys/user.h>
+#include <sys/cred.h>
+#include <sys/file.h>
+#include <sys/vnode.h>
+#include <sys/session.h>
+#include <sys/wait.h>
+#include <sys/ddi.h>
+#include <sys/zone.h>
+#include <sys/lx_types.h>
+
+/*
+ * Based on the Linux acct(5) man page, their comp_t definition is the same
+ * as ours. lxac_etime is encoded as a float for v3 accounting records.
+ */
+
+#define LX_ACCT_VERSION 3
+
+/*
+ * Bit flags in lxac_flag. The Linux AFORK and ASU match native. The rest of
+ * the flags diverge.
+ */
+#define LX_AFORK 0x01 /* executed fork, but no exec */
+#define LX_ASU 0x02 /* used superuser privileges */
+#define LX_ACORE 0x08 /* dumped core */
+#define LX_AXSIG 0x10 /* killed by a signal */
+
+typedef struct lx_acct {
+ char lxac_flag;
+ char lxac_version;
+ uint16_t lxac_tty;
+ uint32_t lxac_exitcode;
+ uint32_t lxac_uid;
+ uint32_t lxac_gid;
+ uint32_t lxac_pid;
+ uint32_t lxac_ppid;
+ uint32_t lxac_btime; /* seconds since the epoch */
+ uint32_t lxac_etime; /* float representation of ticks */
+ comp_t lxac_utime;
+ comp_t lxac_stime;
+ comp_t lxac_mem; /* kb */
+ comp_t lxac_io; /* unused */
+ comp_t lxac_rw; /* unused */
+ comp_t lxac_minflt;
+ comp_t lxac_majflt;
+ comp_t lxac_swaps; /* unused */
+ char lxac_comm[16];
+} lx_acct_t;
+
+/*
+ * Same functionality as acct_compress(). Produce a pseudo-floating point
+ * representation with a 3-bit base-8 exponent and a 13-bit fraction (a
+ * decoding sketch follows the function).
+ */
+static comp_t
+lx_acct_compt(ulong_t t)
+{
+ int exp = 0, round = 0;
+
+ while (t >= 8192) {
+ exp++;
+ round = t & 04;
+ t >>= 3;
+ }
+ if (round) {
+ t++;
+ if (t >= 8192) {
+ t >>= 3;
+ exp++;
+ }
+ }
+#ifdef _LP64
+ if (exp > 7) {
+ /* prevent wraparound */
+ t = 8191;
+ exp = 7;
+ }
+#endif
+ return ((exp << 13) + t);
+}
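+
+/*
+ * A minimal decoding sketch (illustration only): a comp_t "ct" produced
+ * above expands back to an approximate tick count as
+ *
+ *	ticks = (ct & 0x1fff) << (3 * (ct >> 13));
+ *
+ * i.e. the low 13 bits hold the fraction and the high 3 bits count how
+ * many base-8 (shift-by-3) scalings were applied.
+ */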
+
+/*
+ * 32-bit IEEE-754 float encoding, as per Linux (a worked example follows).
+ */
+static uint32_t
+lx_acct_float(int64_t t)
+{
+ uint32_t val, exp = 190;
+
+ if (t == 0)
+ return (0);
+
+ while (t > 0) {
+ t <<= 1;
+ exp--;
+ }
+ val = (uint32_t)(t >> 40) & 0x7fffffu;
+
+ return (val | (exp << 23));
+}
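+
+/*
+ * For example, t == 100 has its highest set bit at position 6, so the
+ * loop above shifts 57 times, leaving exp == 133 (127 + 6) and a
+ * fraction of 0.5625; the result is the IEEE-754 single-precision
+ * encoding of 100.0, i.e. 1.5625 * 2^6.
+ */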
+
+/*
+ * Write a Linux-formatted record to the accounting file.
+ */
+void
+lx_acct_out(vnode_t *vp, int exit_status)
+{
+ struct proc *p;
+ user_t *ua;
+ struct cred *cr;
+ dev_t d;
+ pid_t pid, ppid;
+ struct vattr va;
+ ssize_t resid = 0;
+ int err;
+ lx_acct_t a;
+
+ p = curproc;
+ ua = PTOU(p);
+ cr = CRED();
+
+ bzero(&a, sizeof (a));
+
+ a.lxac_flag = ua->u_acflag & (LX_AFORK | LX_ASU);
+ a.lxac_version = LX_ACCT_VERSION;
+ d = cttydev(p);
+ a.lxac_tty = LX_MAKEDEVICE(getmajor(d), getminor(d));
+ if (WIFEXITED(exit_status)) {
+ a.lxac_exitcode = WEXITSTATUS(exit_status);
+ } else if (WIFSIGNALED(exit_status)) {
+ a.lxac_flag |= LX_AXSIG;
+ if (WCOREDUMP(exit_status)) {
+ a.lxac_flag |= LX_ACORE;
+ }
+ }
+ a.lxac_uid = crgetruid(cr);
+ a.lxac_gid = crgetrgid(cr);
+ pid = p->p_pid;
+ ppid = p->p_ppid;
+ /* Perform pid translation a la lxpr_fixpid(). */
+ if (pid == curzone->zone_proc_initpid) {
+ pid = 1;
+ ppid = 0;
+ } else {
+ if (ppid == curzone->zone_proc_initpid) {
+ ppid = 1;
+ } else if (ppid == curzone->zone_zsched->p_pid ||
+ (p->p_flag & SZONETOP) != 0) {
+ ppid = 1;
+ }
+ }
+ a.lxac_pid = pid;
+ a.lxac_ppid = ppid;
+ a.lxac_btime = ua->u_start.tv_sec;
+ /* For Linux v3 accounting record, this is an encoded float. */
+ a.lxac_etime = lx_acct_float(ddi_get_lbolt() - ua->u_ticks);
+ a.lxac_utime = lx_acct_compt(NSEC_TO_TICK(p->p_acct[LMS_USER]));
+ a.lxac_stime = lx_acct_compt(
+ NSEC_TO_TICK(p->p_acct[LMS_SYSTEM] + p->p_acct[LMS_TRAP]));
+ a.lxac_mem = lx_acct_compt((ulong_t)(ptob(ua->u_mem) / 1024));
+ /* a.lxac_io unused */
+ /* a.lxac_rw unused */
+ a.lxac_minflt = lx_acct_compt((ulong_t)p->p_ru.minflt);
+ a.lxac_majflt = lx_acct_compt((ulong_t)p->p_ru.majflt);
+ /* a.lxac_swaps unused */
+ bcopy(ua->u_comm, a.lxac_comm, sizeof (a.lxac_comm));
+
+ /*
+ * As with the native acct() handling, we save the size so that if the
+ * write fails, we can reset the size to avoid corrupting the accounting
+ * file.
+ */
+ va.va_mask = AT_SIZE;
+ if (VOP_GETATTR(vp, &va, 0, kcred, NULL) == 0) {
+ err = vn_rdwr(UIO_WRITE, vp, (caddr_t)&a, sizeof (a), 0LL,
+ UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFF_T, kcred, &resid);
+ if (err != 0 || resid != 0)
+ (void) VOP_SETATTR(vp, &va, 0, kcred, NULL);
+ }
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_acl.c b/usr/src/uts/common/brand/lx/os/lx_acl.c
new file mode 100644
index 0000000000..184f05b6ed
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_acl.c
@@ -0,0 +1,213 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/sunddi.h>
+#include <sys/pathname.h>
+#include <sys/acl.h>
+#include <acl/acl_common.h>
+#include <sys/lx_acl.h>
+
+
+typedef struct {
+ uint16_t lpaxe_tag;
+ uint16_t lpaxe_perm;
+ uint32_t lpaxe_id;
+} lx_posix_acl_xattr_entry_t;
+
+typedef struct {
+ uint32_t lpaxh_version;
+ lx_posix_acl_xattr_entry_t lpaxh_entries[];
+} lx_posix_acl_xattr_header_t;
+
+#define LX_POSIX_ACL_XATTR_VERSION 0x0002
+
+/* e_tag entry in struct posix_acl_entry */
+#define LX_ACL_USER_OBJ 0x01 /* USER_OBJ */
+#define LX_ACL_USER 0x02 /* USER */
+#define LX_ACL_GROUP_OBJ 0x04 /* GROUP_OBJ */
+#define LX_ACL_GROUP 0x08 /* GROUP */
+#define LX_ACL_MASK 0x10 /* CLASS_OBJ */
+#define LX_ACL_OTHER 0x20 /* OTHER_OBJ */
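+
+/*
+ * For illustration (a hedged sketch, not consumed by the code below):
+ * assuming Linux's usual r=4/w=2/x=1 permission encoding and an
+ * undefined ID of (uint32_t)-1, the xattr payload for a minimal
+ * "user::rw-,group::r--,other::r--" ACL is a header with
+ * lpaxh_version == 2 followed by three entries:
+ *
+ *	{ .lpaxe_tag = LX_ACL_USER_OBJ, .lpaxe_perm = 6, .lpaxe_id = -1 }
+ *	{ .lpaxe_tag = LX_ACL_GROUP_OBJ, .lpaxe_perm = 4, .lpaxe_id = -1 }
+ *	{ .lpaxe_tag = LX_ACL_OTHER, .lpaxe_perm = 4, .lpaxe_id = -1 }
+ */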
+
+
+static int
+lx_acl_from_xattr(enum lx_acl_type atype, void *xattr, uint_t xlen,
+ acl_t **aclpp)
+{
+ lx_posix_acl_xattr_header_t *head = xattr;
+ lx_posix_acl_xattr_entry_t *entry;
+ int err = 0;
+ uint_t count, sz = xlen;
+ const uint_t mask = (atype == LX_ACL_DEFAULT) ? ACL_DEFAULT : 0;
+ acl_t *acl;
+ aclent_t *acle;
+
+ if (xattr == NULL) {
+ /* Handle zero-length set operations */
+ acl = acl_alloc(ACLENT_T);
+ *aclpp = acl;
+ return (0);
+ }
+
+ if (xlen < sizeof (*head)) {
+ return (EINVAL);
+ } else if (head->lpaxh_version != LX_POSIX_ACL_XATTR_VERSION) {
+ return (EOPNOTSUPP);
+ }
+
+ sz -= sizeof (lx_posix_acl_xattr_header_t);
+ if (sz % sizeof (lx_posix_acl_xattr_entry_t) != 0) {
+ return (EINVAL);
+ }
+ count = sz / sizeof (lx_posix_acl_xattr_entry_t);
+
+ acl = acl_alloc(ACLENT_T);
+ if (count == 0) {
+ *aclpp = acl;
+ return (0);
+ }
+
+ acle = kmem_alloc(count * sizeof (aclent_t), KM_SLEEP);
+ acl->acl_cnt = count;
+ acl->acl_aclp = acle;
+ entry = head->lpaxh_entries;
+ for (uint_t i = 0; i < count && err == 0; i++, entry++, acle++) {
+ switch (entry->lpaxe_tag) {
+ case LX_ACL_USER_OBJ:
+ case LX_ACL_GROUP_OBJ:
+ case LX_ACL_OTHER:
+ case LX_ACL_MASK:
+ break;
+ case LX_ACL_USER:
+ case LX_ACL_GROUP:
+ if (entry->lpaxe_id > MAXUID) {
+ err = EINVAL;
+ }
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+ acle->a_id = entry->lpaxe_id;
+ acle->a_type = entry->lpaxe_tag | mask;
+ acle->a_perm = entry->lpaxe_perm;
+ }
+ if (err != 0) {
+ acl_free(acl);
+ return (err);
+ }
+
+ *aclpp = acl;
+ return (0);
+}
+
+/* ARGSUSED */
+int
+lx_acl_setxattr(vnode_t *vp, enum lx_acl_type atype, void *data, size_t len)
+{
+ const boolean_t is_dir = (vp->v_type == VDIR);
+ acl_t *acl = NULL;
+ cred_t *cr = CRED();
+ int err;
+
+ if (vp->v_type == VLNK) {
+ return (ENOTSUP);
+ } else if (atype == LX_ACL_DEFAULT && !is_dir) {
+ return (EACCES);
+ }
+
+ /*
+ * Copyin and verify the input, even though there is little to be done
+ * with the result.
+ */
+ if ((err = lx_acl_from_xattr(atype, data, len, &acl)) != 0) {
+ return (err);
+ }
+
+ /*
+ * Because systemd has decided to scope-creep its way into a position
+ * of moribund domination over all things system software, there exist
+ * work-arounds which are required to address its numerous bugs and
+ * shortcomings. One such case involves the FreeIPA installer needing
+ * to perform setfacl(3) on /run/systemd/ask-password.
+ *
+ * Between the fact that meaningful ACL translation can be challenging
+ * and that the path in question resides on tmpfs (which doesn't yet
+ * support ACLs at all on illumos), faked success is the only palatable
+ * course of action for now. Atonement will follow.
+ *
+ * See also: https://bugzilla.redhat.com/show_bug.cgi?id=1322167
+ */
+ err = ENOTSUP;
+ if (crgetuid(cr) == 0) {
+ char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ if (vnodetopath(NULL, vp, path, MAXPATHLEN, cr) == 0 &&
+ strncmp(path, "/run/systemd/", 13) == 0) {
+ /* Saccharin-sweet fake success */
+ err = 0;
+ }
+ kmem_free(path, MAXPATHLEN);
+ }
+ acl_free(acl);
+
+ return (err);
+}
+
+/* ARGSUSED */
+int
+lx_acl_getxattr(vnode_t *vp, enum lx_acl_type atype, void *data, size_t slen,
+ ssize_t *solen)
+{
+ const boolean_t is_dir = (vp->v_type == VDIR);
+ vsecattr_t vsattr;
+ int err;
+
+ if (vp->v_type == VLNK) {
+ return (ENOTSUP);
+ } else if (atype == LX_ACL_DEFAULT && !is_dir) {
+ return (ENODATA);
+ }
+
+ bzero(&vsattr, sizeof (vsattr));
+ vsattr.vsa_mask = VSA_ACECNT;
+ if ((err = VOP_GETSECATTR(vp, &vsattr, 0, CRED(), NULL)) != 0) {
+ err = (err == ENOENT) ? ENODATA : err;
+ return (err);
+ }
+
+ if (vsattr.vsa_aclentp != NULL)
+ kmem_free(vsattr.vsa_aclentp, vsattr.vsa_aclentsz);
+
+ return (ENODATA);
+}
+
+/* ARGSUSED */
+int
+lx_acl_removexattr(vnode_t *vp, enum lx_acl_type atype)
+{
+ return (ENODATA);
+}
+
+/* ARGSUSED */
+int
+lx_acl_listxattr(vnode_t *vp, uio_t *uio)
+{
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_audit.c b/usr/src/uts/common/brand/lx/os/lx_audit.c
new file mode 100644
index 0000000000..6e522e6d8d
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_audit.c
@@ -0,0 +1,1604 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * The Linux auditing system provides a fairly complex rule-based syntax
+ * for configuring what actions are to be audited. The user-level details
+ * are generally described in the Linux audit.rules(7), auditctl(8), and
+ * auditd(8) man pages. The user/kernel netlink API does not seem to be
+ * documented. The Linux kernel source and the user-level auditd source must
+ * be used to understand the interface we have to emulate. The relevant Linux
+ * source files are:
+ * include/uapi/linux/audit.h
+ * include/linux/audit.h
+ * kernel/audit.c
+ *
+ * The lx_netlink module implements the API used for getting or changing the
+ * audit configuration. For rule-oriented operations (list, append, delete),
+ * an lx_audit_rule_t structure (or sequence when listing) is passed in/out of
+ * the kernel. The netlink code calls into the lx_audit_append_rule or
+ * lx_audit_delete_rule functions here to perform the relevant operation.
+ * Within the lx_audit_rule_t structure, each member has the following
+ * meaning:
+ * lxar_flag: corresponds to user-level list (e.g. "exit" for syscall return)
+ * lxar_action: user-level action (e.g. "always")
+ * lxar_fld_cnt: number of fields specified in lxar_fields, lxar_values, and
+ * lxar_fld_flag arrays
+ * lxar_mask: syscall number bitmask the rule applies to (bit position in
+ * the array corresponds to the syscall number)
+ * lxar_fields: array of fields in the rule (i.e. each -F on user-level rule).
+ * A numeric code (e.g. LX_RF_AUDIT_ARCH) is assigned to each
+ * possible field.
+ * lxar_values: array of numeric field values (e.g. the internal b64 value on
+ * the -F AUDIT_ARCH=b64 rule)
+ * lxar_fld_flag: array of field operators (e.g. the '=' operator on the
+ * -F AUDIT_ARCH=b64 rule)
+ * lxar_buflen: length of the buffer data immediately following
+ * lxar_buf: A variable amount of additional field string data. Non-numeric
+ * field values are passed here. For example, the string associated
+ * with the '-F key=...' or -F path=...' rules. For string values,
+ * the corresponding lxar_values entry is the length of the string.
+ * The strings in lxar_buf are not C strings because they are not
+ * NUL-terminated. The character data is pulled out of lxar_buf
+ * in chunks specified by the value and the pointer into the buf
+ * is advanced accordingly (see the sketch following this comment).
+ *
+ * There are two primary kinds of actions which we are currently interested in
+ * auditing:
+ * 1) system call return
+ * this corresponds to user-level "exit" rule actions
+ * 2) file system related actions
+ * this corresponds to user-level file system watch rules (-w)
+ *
+ * Only system call return is currently implemented, and only a very limited
+ * subset of all of the possible rule selection behavior.
+ *
+ * The Linux audit rule syntax defines that all selection criteria within a
+ * rule are ANDed together before an audit record is created. However, multiple
+ * rules can be defined for a specific syscall. For example, this user-level
+ * syntax defines two different rules for the "open" syscall:
+ * -a always,exit -F arch=b64 -S open -F auid>=1000 -F key=user-open
+ * -a always,exit -F arch=b64 -S open -F auid=0 -F key=priv-open
+ * The first rule would cause an audit record to be created when an "open"
+ * syscall returns and the syscall was performed by a process with a
+ * loginuid >= 1000. The key added to that audit record would be "user-open".
+ * The second rule would create an audit record if the loginuid was 0 and the
+ * record's key would be "priv-open".
+ *
+ * When auditing is enabled for a syscall return, we have to look at multiple
+ * rules and create an audit record for each rule that matches the selection
+ * criteria.
+ *
+ * Although the current implementation is limited, the overall structure is
+ * designed to be enhanced as more auditing support is added over time.
+ *
+ * By default, auditing is not enabled for a zone and no internal audit data
+ * exists. When the first netlink audit msg is received, the zone's audit state
+ * (lx_audit_state_t) is allocated (via lx_audit_init) and attached to the
+ * zone's lx brand-specific data (lxzd_audit_state). Once allocated, the audit
+ * data will persist until the zone halts.
+ *
+ * Audit records are enqueued onto the lxast_ev_queue and a worker thread
+ * (lx_audit_worker) is responsible for dequeueing the audit records and
+ * sending them up to the user-level auditd.
+ *
+ * Audit rules are stored in the lxast_rules list. This is an internal list
+ * consisting of elements of type lx_audit_rule_ent_t. Each element contains
+ * the input rule (lxare_rule) along with some additional data parsed out of
+ * the rule when it is appended (currently only the arch and key).
+ *
+ * When auditing is enabled for a syscall, the appropriate entry in the
+ * lxast_sys64_rulep (or lxast_sys32_rulep) array will point to the first
+ * rule that is applicable to the syscall. When that syscall returns, rule
+ * matching proceeds from that rule to the end of the rule list.
+ *
+ * New rules are always appended at the end of the list and Linux expects that
+ * rules are matched in order.
+ *
+ * If the rule list ever gets large enough that a linear search, anchored off
+ * the syscall pointer, becomes a performance bottleneck, then we'll have to
+ * explore alternate implementations. However, use of auditing is not that
+ * common to begin with, and most syscalls are typically not audited, so as
+ * long as the number of rules is on the order of tens, the current
+ * implementation should be fine.
+ *
+ * When a rule is deleted, all associated syscall entries (lxast_sys64_rulep or
+ * lxast_sys32_rulep) are cleared, then the rule list is searched to see if
+ * there are any remaining rules which are applicable to the syscall(s). If so,
+ * pointers are reestablished in the relevant lxast_sys64_rulep (or 32) array.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/ddi.h>
+#include <sys/zone.h>
+#include <sys/strsubr.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sunddi.h>
+#include <sys/strsun.h>
+#include <sys/tihdr.h>
+#include <sys/sockio.h>
+#include <sys/brand.h>
+#include <sys/debug.h>
+#include <sys/ucred.h>
+#include <sys/session.h>
+#include <sys/lx_types.h>
+#include <sys/lx_audit.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_socket.h>
+#include <sys/bitmap.h>
+#include <sockcommon.h>
+
+#define LX_AUDIT_FEATURE_VERSION 1
+
+/*
+ * Audit status mask values (lxas_mask in structure defined below)
+ * See Linux include/uapi/linux/audit.h
+ */
+#define LX_AUDIT_STATUS_ENABLED 0x001
+#define LX_AUDIT_STATUS_FAILURE 0x002
+#define LX_AUDIT_STATUS_PID 0x004
+#define LX_AUDIT_STATUS_RATE_LIMIT 0x008
+#define LX_AUDIT_STATUS_BACKLOG_LIMIT 0x010
+#define LX_AUDIT_STATUS_BACKLOG_WAIT_TIME 0x020
+#define LX_AUDIT_STATUS_LOST 0x040
+
+/*
+ * Audit features
+ * See Linux include/uapi/linux/audit.h
+ */
+#define LX_AUDIT_F_BACKLOG_LIMIT 0x001
+#define LX_AUDIT_F_BACKLOG_WAIT_TIME 0x002
+#define LX_AUDIT_F_EXECUTABLE_PATH 0x004
+#define LX_AUDIT_F_EXCLUDE_EXTEND 0x008
+#define LX_AUDIT_F_SESSIONID_FILTER 0x010
+#define LX_AUDIT_F_LOST_RESET 0x020
+#define LX_AUDIT_F_FILTER_FS 0x040
+
+#define LX_AUDIT_FEATURE_ALL (LX_AUDIT_F_BACKLOG_LIMIT | \
+ LX_AUDIT_F_BACKLOG_WAIT_TIME | LX_AUDIT_F_EXECUTABLE_PATH | \
+ LX_AUDIT_F_EXCLUDE_EXTEND | LX_AUDIT_F_SESSIONID_FILTER | \
+ LX_AUDIT_F_LOST_RESET | LX_AUDIT_F_FILTER_FS)
+
+
+/* Audit events */
+#define LX_AUDIT_SYSCALL 1300 /* syscall */
+#define LX_AUDIT_PATH 1302 /* file path */
+#define LX_AUDIT_CONFIG_CHANGE 1305 /* configuration change */
+#define LX_AUDIT_CWD 1307 /* current working directory */
+#define LX_AUDIT_EXECVE 1309 /* exec args */
+#define LX_AUDIT_EOE 1320 /* end of multi-record event */
+
+#define LX_AUDIT_BITMASK_SIZE 64
+#define LX_AUDIT_MAX_KEY_LEN 256
+
+/* Audit rule filter type */
+#define LX_AUDIT_FILTER_USER 0 /* user generated msgs */
+#define LX_AUDIT_FILTER_TASK 1 /* task creation */
+#define LX_AUDIT_FILTER_ENTRY 2 /* syscall entry - obsolete */
+#define LX_AUDIT_FILTER_WATCH 3 /* fs watch */
+#define LX_AUDIT_FILTER_EXIT 4 /* syscall return */
+#define LX_AUDIT_FILTER_TYPE 5 /* audit log start */
+#define LX_AUDIT_FILTER_FS 6 /* audit inode child */
+
+/* Audit rule action type */
+#define LX_AUDIT_ACT_NEVER 0
+#define LX_AUDIT_ACT_POSSIBLE 1
+#define LX_AUDIT_ACT_ALWAYS 2 /* the common case */
+
+#define LX_AUDIT_RULE_MAX_FIELDS 64
+
+/* Linux defaults */
+#define LX_AUDIT_DEF_BACKLOG_LIMIT 64
+#define LX_AUDIT_DEF_WAIT_TIME (60 * HZ_TO_LX_USERHZ(hz))
+
+/*
+ * Audit rule field types
+ * Linux defines a lot of Rule Field values in include/uapi/linux/audit.h.
+ * We currently only handle a few.
+ */
+#define LX_RF_AUDIT_LOGINUID 9 /* e.g. auid>=1000 */
+#define LX_RF_AUDIT_ARCH 11 /* e.g. -F arch=b64 */
+#define LX_RF_AUDIT_WATCH 105 /* user-level -w rule */
+#define LX_RF_AUDIT_PERM 106 /* user-level -p option */
+#define LX_RF_AUDIT_FILTERKEY 210 /* user-level -k key option */
+
+/*
+ * Audit rule field operators
+ * Linux defines the operator values in include/uapi/linux/audit.h.
+ * These 4 bits are combined in various ways for additional operators.
+ */
+#define LX_OF_AUDIT_BM 0x08000000 /* bit mask (&) */
+#define LX_OF_AUDIT_LT 0x10000000
+#define LX_OF_AUDIT_GT 0x20000000
+#define LX_OF_AUDIT_EQ 0x40000000
+#define LX_OF_AUDIT_NE (LX_OF_AUDIT_LT | LX_OF_AUDIT_GT)
+#define LX_OF_AUDIT_BT (LX_OF_AUDIT_BM | LX_OF_AUDIT_EQ) /* bit test (&=) */
+#define LX_OF_AUDIT_LE (LX_OF_AUDIT_LT | LX_OF_AUDIT_EQ)
+#define LX_OF_AUDIT_GE (LX_OF_AUDIT_GT | LX_OF_AUDIT_EQ)
+#define LX_OF_AUDIT_ALL (LX_OF_AUDIT_EQ | LX_OF_AUDIT_NE | LX_OF_AUDIT_BM)
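+
+/*
+ * The composed operators behave as expected: e.g. LX_OF_AUDIT_NE is
+ * LX_OF_AUDIT_LT | LX_OF_AUDIT_GT ("less than or greater than"), and a
+ * user-level ">=" comparison arrives as LX_OF_AUDIT_GE
+ * (LX_OF_AUDIT_GT | LX_OF_AUDIT_EQ).
+ */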
+
+/*
+ * Audit rule arch specification
+ * See Linux EM_X86_64 and EM_386 defs.
+ * -F arch=b64 looks like: 0xc000003e
+ * -F arch=b32 looks like: 0x40000003
+ * If no arch is specified (possible with '-S syslog', '-S all', or '-w <file>')
+ * the rule applies to both architectures and LX_RF_AUDIT_ARCH is not passed.
+ */
+#define LX_AUDIT_ARCH64 0xc000003e
+#define LX_AUDIT_ARCH32 0x40000003
+
+/*
+ * See Linux include/uapi/linux/audit.h, AUDIT_MESSAGE_TEXT_MAX is 8560.
+ * The auditd src has MAX_AUDIT_MESSAGE_LENGTH as 8970.
+ * Until necessary, we'll limit ourselves to a smaller length.
+ */
+#define LX_AUDIT_MESSAGE_TEXT_MAX 1024
+
+typedef struct lx_audit_features {
+ uint32_t lxaf_version;
+ uint32_t lxaf_mask;
+ uint32_t lxaf_features;
+ uint32_t lxaf_lock;
+} lx_audit_features_t;
+
+typedef struct lx_audit_status {
+ uint32_t lxas_mask;
+ uint32_t lxas_enabled;
+ uint32_t lxas_failure;
+ uint32_t lxas_pid;
+ uint32_t lxas_rate_limit;
+ uint32_t lxas_backlog_limit;
+ uint32_t lxas_lost;
+ uint32_t lxas_backlog;
+ /* LINTED: E_ANONYMOUS_UNION_DECL */
+ union {
+ uint32_t lxas_version;
+ uint32_t lxas_feature_bitmap;
+ };
+ uint32_t lxas_backlog_wait_time;
+} lx_audit_status_t;
+
+typedef struct lx_audit_rule {
+ uint32_t lxar_flag;
+ uint32_t lxar_action;
+ uint32_t lxar_fld_cnt;
+ uint32_t lxar_mask[LX_AUDIT_BITMASK_SIZE];
+ uint32_t lxar_fields[LX_AUDIT_RULE_MAX_FIELDS];
+ uint32_t lxar_values[LX_AUDIT_RULE_MAX_FIELDS];
+ uint32_t lxar_fld_flag[LX_AUDIT_RULE_MAX_FIELDS];
+ uint32_t lxar_buflen;
+ /* LINTED: E_ZERO_OR_NEGATIVE_SUBSCRIPT */
+ char lxar_buf[0];
+} lx_audit_rule_t;
+
+/*
+ * Internal structure for an audit rule.
+ * Each rule is on the zone's top-level list of all rules (lxast_rules).
+ * This structure also holds the parsed character string fields from the
+ * original input rule (lxar_buf) so that we don't need to re-parse that
+ * data on every match.
+ */
+typedef struct lx_audit_rule_ent {
+ list_node_t lxare_link;
+ lx_audit_rule_t lxare_rule;
+ char *lxare_buf;
+ boolean_t lxare_is32bit;
+ boolean_t lxare_is64bit;
+ char *lxare_key;
+} lx_audit_rule_ent_t;
+
+typedef enum lx_audit_fail {
+ LXAE_SILENT,
+ LXAE_PRINT, /* default */
+ LXAE_PANIC /* reboot the zone */
+} lx_audit_fail_t;
+
+typedef struct lx_audit_record {
+ list_node_t lxar_link;
+ uint32_t lxar_type;
+ char *lxar_msg;
+} lx_audit_record_t;
+
+/*
+ * Per-zone audit state
+ * Lazily allocated when first needed.
+ *
+ * lxast_rate_limit
+ * Currently unused, but can be read and set. The Linux default is 0.
+ * lxast_backlog_limit
+ * The maximum number of outstanding audit events allowed (the Linux kernel
+ * default is 64). If the limit is reached, lxast_failure determines what
+ * to do.
+ * lxast_backlog_wait_time
+ * Currently unused, but can be read and set. The Linux default is 60*HZ.
+ */
+typedef struct lx_audit_state {
+ lx_audit_fail_t lxast_failure; /* failure behavior */
+ uint32_t lxast_rate_limit;
+ uint32_t lxast_backlog_limit;
+ uint32_t lxast_backlog_wait_time;
+ lx_audit_rule_ent_t *lxast_sys32_rulep[LX_NSYSCALLS];
+ lx_audit_rule_ent_t *lxast_sys64_rulep[LX_NSYSCALLS];
+ kcondvar_t lxast_worker_cv;
+ kmutex_t lxast_lock; /* protects members below */
+ pid_t lxast_pid; /* auditd pid */
+ uint64_t lxast_seq; /* event sequence num */
+ uint32_t lxast_backlog; /* num of queued events */
+ uint32_t lxast_lost; /* num of lost events */
+ void *lxast_sock; /* auditd lx_netlink_sock_t */
+ boolean_t lxast_exit; /* taskq worker should quit */
+ boolean_t lxast_panicing; /* audit forcing reboot? */
+ kthread_t *lxast_worker;
+ list_t lxast_ev_queue; /* audit record queue */
+ list_t lxast_rules; /* the list of rules */
+} lx_audit_state_t;
+
+/*
+ * Function pointer to netlink function used by audit worker threads to send
+ * audit messages up to the user-level auditd.
+ */
+static int (*lx_audit_emit_msg)(void *, uint_t, const char *, uint_t);
+static kmutex_t lx_audit_em_lock; /* protects emit_msg above */
+
+/* From uts/common/brand/lx/syscall/lx_socket.c */
+extern long lx_socket(int, int, int);
+/* From uts/common/syscall/close.c */
+extern int close(int);
+
+static int
+lx_audit_emit_syscall_event(uint_t mtype, void *lxsock, const char *msg)
+{
+ int err;
+
+ err = lx_audit_emit_msg(lxsock, mtype, msg, LX_AUDIT_MESSAGE_TEXT_MAX);
+ if (err != 0)
+ return (err);
+ err = lx_audit_emit_msg(lxsock, 0, NULL, 0);
+ return (err);
+}
+
+/*
+ * Worker thread for audit record output up to user-level auditd.
+ */
+static void
+lx_audit_worker(void *a)
+{
+ lx_audit_state_t *asp = (lx_audit_state_t *)a;
+ lx_audit_record_t *rp;
+ int err;
+
+ VERIFY(asp != NULL);
+
+ mutex_enter(&asp->lxast_lock);
+
+ while (!asp->lxast_exit) {
+
+ if (asp->lxast_backlog == 0 || asp->lxast_sock == NULL ||
+ asp->lxast_pid == 0) {
+ cv_wait(&asp->lxast_worker_cv, &asp->lxast_lock);
+ continue;
+ }
+
+ rp = list_remove_head(&asp->lxast_ev_queue);
+ asp->lxast_backlog--;
+
+ err = lx_audit_emit_syscall_event(rp->lxar_type,
+ asp->lxast_sock, rp->lxar_msg);
+ if (err != ENOMEM && err != ENOSPC) {
+ kmem_free(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX);
+ kmem_free(rp, sizeof (lx_audit_record_t));
+ } else {
+ /*
+ * Put it back on the list, drop the mutex so that
+ * any other audit-related action could occur (such as
+ * socket deletion), then wait briefly before retry.
+ */
+ list_insert_head(&asp->lxast_ev_queue, rp);
+ asp->lxast_backlog++;
+ mutex_exit(&asp->lxast_lock);
+ /* wait 1/10th second and try again */
+ delay(drv_usectohz(100000));
+ mutex_enter(&asp->lxast_lock);
+ }
+ }
+
+ /* Leave state ready for new worker when auditing restarted */
+ asp->lxast_exit = B_FALSE;
+ mutex_exit(&asp->lxast_lock);
+
+ thread_exit();
+}
+
+static void
+lx_audit_set_worker(uint32_t pid, void *lxsock,
+ void (*cb)(void *, boolean_t))
+{
+ lx_audit_state_t *asp = ztolxzd(curzone)->lxzd_audit_state;
+
+ ASSERT(asp != NULL);
+ ASSERT(MUTEX_HELD(&asp->lxast_lock));
+
+ /* First, stop any existing worker thread */
+ while (asp->lxast_sock != NULL) {
+ mutex_exit(&asp->lxast_lock);
+ lx_audit_stop_worker(NULL, cb);
+ mutex_enter(&asp->lxast_lock);
+ /* unlikely we loop, but handle racing setters */
+ }
+
+ VERIFY(asp->lxast_pid == 0);
+ VERIFY(asp->lxast_sock == NULL);
+ VERIFY(asp->lxast_exit == B_FALSE);
+ VERIFY(asp->lxast_worker == NULL);
+ if (pid != 0) {
+ /* Start a worker with the new socket */
+ asp->lxast_sock = lxsock;
+ cb(asp->lxast_sock, B_TRUE);
+ asp->lxast_pid = pid;
+ asp->lxast_worker = thread_create(NULL, 0, lx_audit_worker,
+ asp, 0, curzone->zone_zsched, TS_RUN, minclsyspri);
+ }
+}
+
+static boolean_t
+lx_audit_match_val(uint32_t op, uint32_t ruleval, uint32_t curval)
+{
+ switch (op) {
+ case LX_OF_AUDIT_LT:
+ return (curval < ruleval);
+ case LX_OF_AUDIT_GT:
+ return (curval > ruleval);
+ case LX_OF_AUDIT_EQ:
+ return (curval == ruleval);
+ case LX_OF_AUDIT_NE:
+ return (curval != ruleval);
+ case LX_OF_AUDIT_LE:
+ return (curval <= ruleval);
+ case LX_OF_AUDIT_GE:
+ return (curval >= ruleval);
+ case LX_OF_AUDIT_BM: /* bit mask - any bit is set? */
+ return ((curval & ruleval) != 0);
+ case LX_OF_AUDIT_BT: /* bit test - all bits must be set */
+ return ((curval & ruleval) == ruleval);
+ default:
+ break;
+ }
+ return (B_FALSE);
+}
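+
+/*
+ * A minimal matching sketch (values assumed for illustration): a rule field
+ * of "-F auid>=1000" stores op LX_OF_AUDIT_GE and value 1000; for a process
+ * whose auid is 1500, lx_audit_match_val(LX_OF_AUDIT_GE, 1000, 1500)
+ * evaluates 1500 >= 1000 and returns B_TRUE. Note the argument order: the
+ * rule's value comes first, the current value second.
+ */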
+
+/*
+ * Per the Linux audit.rules(7) man page, a rule with an auid of -1 means the
+ * process does not have a loginuid. We'll use the absence of a session on the
+ * process to mimic this behavior.
+ */
+static uint32_t
+lx_audit_get_auid(void)
+{
+ sess_t *s;
+ uint32_t v;
+
+ /*
+ * A process with no session has:
+ * s_dev == 0xffffffffffffffff
+ * s_vp == NULL
+ * s_cred == NULL
+ */
+ s = curproc->p_sessp;
+ if (s != NULL && s->s_vp != NULL) {
+ v = crgetsuid(CRED());
+ } else {
+ v = UINT32_MAX; /* emulate auid of -1 */
+ }
+
+ return (v);
+}
+
+/*
+ * Determine if the rule matches.
+ * Currently we only check LX_RF_AUDIT_LOGINUID (-F auid)
+ * fields, but as we add support for additional field matching, this function
+ * should be enhanced.
+ */
+static boolean_t
+lx_audit_syscall_rule_match(lx_audit_rule_ent_t *erp)
+{
+ uint32_t i, v;
+ lx_audit_rule_t *rp = &erp->lxare_rule;
+
+ for (i = 0; i < rp->lxar_fld_cnt; i++) {
+ uint32_t ftype, fval, fop;
+
+ ftype = rp->lxar_fields[i];
+ if (ftype != LX_RF_AUDIT_LOGINUID)
+ continue;
+
+ fop = rp->lxar_fld_flag[i];
+ fval = rp->lxar_values[i];
+ v = lx_audit_get_auid();
+
+ if (!lx_audit_match_val(fop, fval, v))
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+static int
+lx_audit_write(file_t *fp, const char *msg)
+{
+ int fflag;
+ ssize_t count;
+ size_t nwrite = 0;
+ struct uio auio;
+ struct iovec aiov;
+
+ count = strlen(msg);
+ fflag = fp->f_flag;
+
+ aiov.iov_base = (void *) msg;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = fp->f_offset;
+ auio.uio_resid = count;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ return (lx_write_common(fp, &auio, &nwrite, B_FALSE));
+}
+
+/*
+ * We first try to send the msg out to the zone's logging service, then
+ * fall back to the zone's console, although in practice that is unlikely to
+ * be useful to most users.
+ */
+static void
+lx_audit_log_msg(const char *msg)
+{
+ int fd;
+ struct sockaddr_un addr;
+ struct sonode *so;
+ uint_t alen;
+ uint_t sizediff = (sizeof (addr) - sizeof (addr.sun_path));
+ file_t *fp;
+ int err;
+ vnode_t *vp;
+
+ ttolwp(curthread)->lwp_errno = 0;
+ fd = lx_socket(LX_AF_UNIX, LX_SOCK_DGRAM, 0);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ goto trycons;
+
+ bzero((char *)&addr, sizeof (addr));
+ addr.sun_family = AF_UNIX;
+ (void) strncpy(addr.sun_path, "/dev/log", sizeof (addr.sun_path) - 1);
+ alen = strlen(addr.sun_path) + 1 + sizediff;
+
+ /*
+ * We can't use lx_connect here since that expects to be called from
+ * user-land, so we do the (streamlined) connect ourselves.
+ */
+ if ((so = getsonode(fd, &err, &fp)) == NULL) {
+ (void) close(fd);
+ goto trycons;
+ }
+
+ err = socket_connect(so, (struct sockaddr *)&addr, alen, fp->f_flag,
+ _SOCONNECT_XPG4_2, CRED());
+
+ if (err == 0)
+ err = lx_audit_write(fp, msg);
+
+ releasef(fd); /* release getsonode hold */
+ (void) close(fd);
+
+ if (err == 0)
+ return;
+
+trycons:
+ /* "open" the console device */
+ if (lookupnameatcred("/dev/console", UIO_SYSSPACE, FOLLOW, NULLVPP,
+ &vp, NULL, CRED()) != 0)
+ return;
+
+ if (falloc(vp, FWRITE, &fp, &fd) != 0) {
+ VN_RELE(vp);
+ return;
+ }
+ mutex_exit(&fp->f_tlock);
+ setf(fd, fp);
+
+ /* nothing left to do if console write fails */
+ (void) lx_audit_write(fp, msg);
+ (void) close(fd);
+}
+
+static void
+lx_audit_fail(lx_audit_state_t *asp, const char *msg)
+{
+ ASSERT(MUTEX_HELD(&asp->lxast_lock));
+
+ if (asp->lxast_failure == LXAE_PRINT ||
+ asp->lxast_failure == LXAE_PANIC) {
+ /*
+ * Linux can ratelimit the amount of log spam here, so we'll
+ * do something similar, especially since this could be called
+ * on many syscall returns if the audit daemon is down or
+ * not consuming audit records for some other reason.
+ */
+ if (asp->lxast_lost % 100 == 0)
+ lx_audit_log_msg(msg);
+ if (asp->lxast_failure == LXAE_PANIC &&
+ !asp->lxast_panicing) {
+ /*
+ * Reboot the zone so that no audit records are lost.
+ * We delay a second to give the zone's logger a chance
+ * to handle the log message. We have to drop the lock
+ * here in case the zone's logger itself is making
+ * syscalls which would be audited, although that
+ * wouldn't be the ideal configuration.
+ */
+ asp->lxast_panicing = B_TRUE;
+ mutex_exit(&asp->lxast_lock);
+ lx_audit_log_msg("audit: panic");
+ delay(drv_usectohz(1000000));
+ zone_kadmin(A_SHUTDOWN, AD_BOOT, NULL, kcred);
+ mutex_enter(&asp->lxast_lock);
+ }
+ }
+ asp->lxast_lost++;
+}
+
+/*
+ * This formats the input string to match the Linux convention. The input
+ * strings are small right now (<= PSARGSZ) so for simplicity we're using
+ * a temporary buffer of adequate size.
+ */
+static void
+lx_audit_fmt_str(char *dst, char *str, uint_t dlen)
+{
+ char *sp, tmp[100];
+
+ (void) strlcpy(tmp, str, sizeof (tmp));
+ if ((sp = strchr(tmp, ' ')) != NULL)
+ *sp = '\0';
+
+ if ((sp = strchr(tmp, '"')) == NULL) {
+ (void) snprintf(dst, dlen, "\"%s\"", tmp);
+ } else {
+ char *p, *dp;
+ uint_t olen = 0;
+
+ ASSERT(dlen > 2);
+ dlen -= 2; /* leave room for terminating nul */
+ dp = dst;
+ for (p = str; *p != '\0' && olen < dlen; p++) {
+ (void) sprintf(dp, "%02x", *p);
+ dp += 2;
+ olen += 2;
+ }
+ *dp = '\0';
+ }
+}
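+
+/*
+ * Illustrative examples (inputs assumed): psargs of "ps -ef" is emitted as
+ * "ps" (truncated at the first space and quoted), while an input whose first
+ * word contains a double quote is emitted as a hex encoding of the entire
+ * original string, matching how Linux logs "untrusted" strings.
+ */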
+
+/*
+ * Format and enqueue a syscall audit record.
+ */
+static void
+lx_audit_syscall_fmt_rcd(int sysnum, uint32_t arch, long ret,
+ lx_audit_state_t *asp, lx_audit_rule_ent_t *erp, uint64_t seq,
+ timestruc_t *tsp)
+{
+ klwp_t *lwp;
+ proc_t *p;
+ uint32_t items, sessid;
+ lx_lwp_data_t *lwpd;
+ lx_audit_record_t *rp;
+ cred_t *cr = CRED();
+ minor_t minor;
+ char key[LX_AUDIT_MAX_KEY_LEN + 6]; /* for key="%s" formatting */
+ char exe[PSARGSZ * 2 + 8], comm[MAXCOMLEN * 2 + 8];
+
+ ASSERT(MUTEX_HELD(&asp->lxast_lock));
+
+ if (asp->lxast_backlog >= asp->lxast_backlog_limit) {
+ lx_audit_fail(asp, "audit: backlog limit exceeded");
+ return;
+ }
+
+ if (arch == LX_AUDIT_ARCH32) {
+ items = MIN(4, lx_sysent32[sysnum].sy_narg);
+ } else {
+ ASSERT3U(arch, ==, LX_AUDIT_ARCH64);
+ items = MIN(4, lx_sysent64[sysnum].sy_narg);
+ }
+
+ lwp = ttolwp(curthread);
+ lwpd = lwptolxlwp(lwp);
+ p = curproc;
+
+ /*
+ * For the key, if no key has been set on the rule, Linux formats the
+ * string "(null)" (with no quotes - i.e. key=(null)).
+ */
+ if (erp->lxare_key != NULL) {
+ (void) snprintf(key, sizeof (key), "key=\"%s\"",
+ erp->lxare_key);
+ } else {
+ (void) snprintf(key, sizeof (key), "key=(null)");
+ }
+
+ rp = kmem_alloc(sizeof (lx_audit_record_t), KM_NOSLEEP);
+ if (rp == NULL) {
+ lx_audit_fail(asp, "audit: no kernel memory");
+ return;
+ }
+ rp->lxar_msg = kmem_zalloc(LX_AUDIT_MESSAGE_TEXT_MAX, KM_NOSLEEP);
+ if (rp->lxar_msg == NULL) {
+ kmem_free(rp, sizeof (lx_audit_record_t));
+ lx_audit_fail(asp, "audit: no kernel memory");
+ return;
+ }
+ rp->lxar_type = LX_AUDIT_SYSCALL;
+
+ mutex_enter(&p->p_splock);
+ sessid = p->p_sessp->s_sid;
+ minor = getminor(p->p_sessp->s_dev);
+ mutex_exit(&p->p_splock);
+
+ mutex_enter(&p->p_lock);
+ lx_audit_fmt_str(exe, p->p_user.u_psargs, sizeof (exe));
+ lx_audit_fmt_str(comm, p->p_user.u_comm, sizeof (comm));
+ mutex_exit(&p->p_lock);
+
+ /*
+ * See Linux audit_log_exit() for how a syscall exit record is
+ * formatted.
+ *
+ * For "arch" value, see Linux AUDIT_ARCH_IA64, AUDIT_ARCH_I386,
+ * __AUDIT_ARCH_64BIT and __AUDIT_ARCH_LE definitions.
+ *
+ * For fsuid/fsgid, see lx_setfsuid/lx_setfsgid for how we handle that.
+ */
+ (void) snprintf(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX,
+ "audit(%lu.%03lu:%lu): arch=%x syscall=%u "
+ "success=%s exit=%ld a0=%lu a1=%lu a2=%lu a3=%lu items=%u "
+ "ppid=%u pid=%u auid=%u uid=%u gid=%u euid=%u suid=%u "
+ "fsuid=%u egid=%u sgid=%u fsgid=%u tty=pts%u ses=%u "
+ "comm=%s exe=%s %s",
+ (uint64_t)tsp->tv_sec, /* zone's timestamp */
+ (uint64_t)tsp->tv_nsec / 1000000,
+ seq, /* serial number */
+ arch, /* arch */
+ sysnum, /* syscall */
+ (lwp->lwp_errno == 0 ? "yes" : "no"), /* success */
+ ret, /* exit */
+ lwpd->br_syscall_args[0], /* a0 */
+ lwpd->br_syscall_args[1], /* a1 */
+ lwpd->br_syscall_args[2], /* a2 */
+ lwpd->br_syscall_args[3], /* a3 */
+ items, /* items */
+ lx_lwp_ppid(lwp, NULL, NULL), /* ppid */
+ (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid),
+ lx_audit_get_auid(), /* auid */
+ crgetruid(cr), /* uid */
+ crgetrgid(cr), /* gid */
+ crgetuid(cr), /* euid */
+ crgetsuid(cr), /* saved uid */
+ crgetuid(cr), /* fsuid */
+ crgetgid(cr), /* egid */
+ crgetsgid(cr), /* saved gid */
+ crgetgid(cr), /* fsgid */
+ minor, /* tty */
+ sessid, /* ses */
+ comm, /* comm */
+ exe, /* exe */
+ key); /* key="VAL" */
+
+ list_insert_tail(&asp->lxast_ev_queue, rp);
+ if (asp->lxast_backlog == 0)
+ cv_signal(&asp->lxast_worker_cv);
+ asp->lxast_backlog++;
+}
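+
+/*
+ * For reference, a record formatted above looks roughly like the following
+ * (all field values invented for illustration):
+ *
+ *    audit(1234.567:89): arch=c000003e syscall=2 success=yes exit=3 a0=1
+ *    a1=2 a2=3 a3=4 items=1 ppid=100 pid=101 auid=1000 uid=1000 gid=1000
+ *    euid=1000 suid=1000 fsuid=1000 egid=1000 sgid=1000 fsgid=1000
+ *    tty=pts0 ses=1 comm="cat" exe="cat" key="user-open"
+ */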
+
+/*
+ * Get the next rule in the list that is generally applicable to the given
+ * syscall.
+ */
+static lx_audit_rule_ent_t *
+lx_audit_next_applicable_rule(int sysnum, uint32_t arch, lx_audit_state_t *asp,
+ lx_audit_rule_ent_t *erp)
+{
+ ASSERT(MUTEX_HELD(&asp->lxast_lock));
+
+ for (erp = list_next(&asp->lxast_rules, erp);
+ erp != NULL;
+ erp = list_next(&asp->lxast_rules, erp)) {
+ lx_audit_rule_t *r = &erp->lxare_rule;
+
+ /* Determine if the rule in the list has the same ARCH. */
+ if (arch == LX_AUDIT_ARCH32 && !erp->lxare_is32bit)
+ continue;
+ if (arch == LX_AUDIT_ARCH64 && !erp->lxare_is64bit)
+ continue;
+
+ /* Determine if this rule applies to the relevant syscall. */
+ if (BT_TEST32(r->lxar_mask, sysnum))
+ return (erp);
+ }
+
+ return (NULL);
+}
+
+void
+lx_audit_syscall_exit(int sysnum, long ret)
+{
+ lx_zone_data_t *lxzd = ztolxzd(curzone);
+ lx_audit_state_t *asp;
+ uint64_t seq;
+ lx_audit_rule_ent_t *erp;
+ timestruc_t ts;
+ uint32_t arch;
+
+ if (lxzd->lxzd_audit_enabled == LXAE_DISABLED)
+ return;
+
+ if (sysnum >= LX_NSYSCALLS)
+ return;
+
+ asp = lxzd->lxzd_audit_state;
+ ASSERT(asp != NULL);
+
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ arch = LX_AUDIT_ARCH32;
+ } else {
+ ASSERT(get_udatamodel() == DATAMODEL_LP64);
+ arch = LX_AUDIT_ARCH64;
+ }
+
+ /*
+ * Fast top-level check to see if we're auditing this syscall.
+ * We don't take the mutex for this since there is no need.
+ */
+ if (arch == LX_AUDIT_ARCH32) {
+ if (asp->lxast_sys32_rulep[sysnum] == NULL)
+ return;
+ } else {
+ if (asp->lxast_sys64_rulep[sysnum] == NULL)
+ return;
+ }
+
+ mutex_enter(&asp->lxast_lock);
+ if (arch == LX_AUDIT_ARCH32) {
+ erp = asp->lxast_sys32_rulep[sysnum];
+ } else {
+ erp = asp->lxast_sys64_rulep[sysnum];
+ }
+
+ if (erp == NULL) {
+ /* Hit a race and the syscall is no longer being audited */
+ mutex_exit(&asp->lxast_lock);
+ return;
+ }
+
+ /*
+ * All of the records in the set (i.e. same serial number) have
+ * the same timestamp.
+ */
+ seq = asp->lxast_seq++;
+ gethrestime(&ts);
+ ts.tv_sec -= curzone->zone_boot_time;
+
+ /*
+ * We have to determine if the first rule associated with the syscall,
+ * or any subsequent applicable rules, match.
+ *
+ * The first rule associated with the syscall may (or may not) match,
+ * but there can be additional rules which might also match. The first
+ * possible rule is always the one that enables the syscall auditing,
+ * but we also have to iterate to the end of the list to see if any
+ * other rules are applicable to this syscall.
+ */
+ for (; erp != NULL;
+ erp = lx_audit_next_applicable_rule(sysnum, arch, asp, erp)) {
+ if (!lx_audit_syscall_rule_match(erp))
+ continue;
+
+ lx_audit_syscall_fmt_rcd(sysnum, arch, ret, asp, erp, seq, &ts);
+ }
+
+ /*
+ * TODO: Currently we only output a single SYSCALL record.
+ * Real Linux emits a set of audit records for a syscall exit event
+ * (e.g. for an unlink syscall):
+ * type=SYSCALL
+ * type=CWD
+ * type=PATH - one for the parent dir
+ * type=PATH - one for the actual file unlinked
+ * type=PROCTITLE - (this one seems worthless)
+ * followed by an AUDIT_EOE message (which seems to be ignored).
+ *
+ * For syscalls that don't change files in the file system (e.g. ioctl)
+ * there are no PATH records.
+ */
+ mutex_exit(&asp->lxast_lock);
+}
+
+/*
+ * Determine which syscalls this rule applies to and set up a fast pointer for
+ * the syscall to enable its rule match.
+ *
+ * We have to look at each bit and translate the external syscall bits into the
+ * internal syscall number.
+ */
+static void
+lx_enable_syscall_rule(lx_audit_state_t *asp, lx_audit_rule_t *rulep,
+ lx_audit_rule_ent_t *rp)
+{
+ uint_t sysnum;
+
+ ASSERT(MUTEX_HELD(&asp->lxast_lock));
+
+ for (sysnum = 0; sysnum < LX_NSYSCALLS; sysnum++) {
+ if (BT_TEST32(rulep->lxar_mask, sysnum)) {
+ if (rp->lxare_is32bit) {
+ if (asp->lxast_sys32_rulep[sysnum] == NULL)
+ asp->lxast_sys32_rulep[sysnum] = rp;
+ }
+ if (rp->lxare_is64bit) {
+ if (asp->lxast_sys64_rulep[sysnum] == NULL)
+ asp->lxast_sys64_rulep[sysnum] = rp;
+ }
+ }
+ }
+}
+
+int
+lx_audit_append_rule(void *r, uint_t datalen)
+{
+ lx_audit_rule_t *rulep = (lx_audit_rule_t *)r;
+ char *datap;
+ uint_t i;
+ lx_audit_rule_ent_t *rp;
+ lx_audit_state_t *asp;
+ boolean_t is_32bit = B_TRUE, is_64bit = B_TRUE, sys_found = B_FALSE;
+ char *tdp;
+ char key[LX_AUDIT_MAX_KEY_LEN + 1];
+ uint32_t tlen;
+
+ if (ztolxzd(curproc->p_zone)->lxzd_audit_enabled == LXAE_LOCKED)
+ return (EPERM);
+
+ if (datalen < sizeof (lx_audit_rule_t))
+ return (EINVAL);
+ datalen -= sizeof (lx_audit_rule_t);
+
+ if (rulep->lxar_fld_cnt > LX_AUDIT_RULE_MAX_FIELDS)
+ return (EINVAL);
+
+ if (rulep->lxar_buflen > datalen)
+ return (EINVAL);
+
+ datap = rulep->lxar_buf;
+
+ /*
+ * First check the rule to determine if we support the flag, actions,
+ * and all of the fields specified (since currently, our rule support
+ * is incomplete).
+ *
+ * NOTE: We currently only handle syscall exit rules.
+ */
+ if (rulep->lxar_flag != LX_AUDIT_FILTER_EXIT ||
+ rulep->lxar_action != LX_AUDIT_ACT_ALWAYS)
+ return (ENOTSUP);
+ if (rulep->lxar_fld_cnt > LX_AUDIT_RULE_MAX_FIELDS)
+ return (EINVAL);
+ tdp = datap;
+ tlen = rulep->lxar_buflen;
+ key[0] = '\0';
+ for (i = 0; i < rulep->lxar_fld_cnt; i++) {
+ uint32_t ftype, fval, fop;
+
+ fop = rulep->lxar_fld_flag[i];
+ ftype = rulep->lxar_fields[i];
+ fval = rulep->lxar_values[i];
+ DTRACE_PROBE3(lx__audit__field, uint32_t, fop,
+ uint32_t, ftype, uint32_t, fval);
+
+ if (ftype == LX_RF_AUDIT_ARCH) {
+ if (fop != LX_OF_AUDIT_EQ)
+ return (ENOTSUP);
+ if (!is_32bit || !is_64bit)
+ return (EINVAL);
+ if (fval == LX_AUDIT_ARCH64) {
+ is_32bit = B_FALSE;
+ } else if (fval == LX_AUDIT_ARCH32) {
+ is_64bit = B_FALSE;
+ } else {
+ return (ENOTSUP);
+ }
+ } else if (ftype == LX_RF_AUDIT_LOGINUID) {
+ if ((fop & LX_OF_AUDIT_ALL) == 0)
+ return (ENOTSUP);
+ } else if (ftype == LX_RF_AUDIT_FILTERKEY) {
+ if (fop != LX_OF_AUDIT_EQ)
+ return (ENOTSUP);
+ if (tlen < fval || fval > LX_AUDIT_MAX_KEY_LEN)
+ return (EINVAL);
+ if (key[0] != '\0')
+ return (EINVAL);
+ /* while we're here, save the parsed key */
+ bcopy(tdp, key, fval);
+ key[fval] = '\0';
+ tdp += fval;
+ tlen -= fval;
+ } else {
+ /*
+ * TODO: expand the support for additional Linux field
+ * options.
+ */
+ return (ENOTSUP);
+ }
+ }
+ for (i = 0; i < LX_NSYSCALLS; i++) {
+ if (BT_TEST32(rulep->lxar_mask, i)) {
+ /* At least one syscall enabled in this mask entry */
+ sys_found = B_TRUE;
+ break;
+ }
+ }
+ if (!sys_found)
+ return (ENOTSUP);
+
+ asp = ztolxzd(curzone)->lxzd_audit_state;
+ ASSERT(asp != NULL);
+
+ /*
+ * We have confirmed that we can handle the rule specified.
+ * Before taking the lock, allocate and setup the internal rule struct.
+ */
+ rp = kmem_alloc(sizeof (lx_audit_rule_ent_t), KM_SLEEP);
+ bcopy(rulep, &rp->lxare_rule, sizeof (lx_audit_rule_t));
+ rp->lxare_buf = kmem_alloc(rulep->lxar_buflen, KM_SLEEP);
+ bcopy(datap, rp->lxare_buf, rulep->lxar_buflen);
+ rp->lxare_is32bit = is_32bit;
+ rp->lxare_is64bit = is_64bit;
+ if (key[0] == '\0') {
+ rp->lxare_key = NULL;
+ } else {
+ int slen = strlen(key);
+ rp->lxare_key = kmem_alloc(slen + 1, KM_SLEEP);
+ (void) strlcpy(rp->lxare_key, key, slen + 1);
+ }
+
+ mutex_enter(&asp->lxast_lock);
+ /* Save the rule on our top-level list. */
+ list_insert_tail(&asp->lxast_rules, rp);
+ /* Enable tracing on the relevant syscalls. */
+ lx_enable_syscall_rule(asp, rulep, rp);
+ mutex_exit(&asp->lxast_lock);
+
+ return (0);
+}
+
+int
+lx_audit_delete_rule(void *r, uint_t datalen)
+{
+ lx_audit_rule_t *rulep = (lx_audit_rule_t *)r;
+ char *datap;
+ uint_t sysnum;
+ lx_audit_state_t *asp;
+ lx_audit_rule_ent_t *erp;
+
+ if (ztolxzd(curproc->p_zone)->lxzd_audit_enabled == LXAE_LOCKED)
+ return (EPERM);
+
+ if (datalen < sizeof (lx_audit_rule_t))
+ return (EINVAL);
+ datalen -= sizeof (lx_audit_rule_t);
+
+ if (rulep->lxar_fld_cnt > LX_AUDIT_RULE_MAX_FIELDS)
+ return (EINVAL);
+
+ if (rulep->lxar_buflen > datalen)
+ return (EINVAL);
+
+ datap = rulep->lxar_buf;
+
+ asp = ztolxzd(curzone)->lxzd_audit_state;
+ ASSERT(asp != NULL);
+
+ mutex_enter(&asp->lxast_lock);
+
+ /* Find the matching rule from the rule list */
+ for (erp = list_head(&asp->lxast_rules);
+ erp != NULL;
+ erp = list_next(&asp->lxast_rules, erp)) {
+ lx_audit_rule_t *r;
+ uint_t i;
+ boolean_t mtch;
+
+ r = &erp->lxare_rule;
+ if (rulep->lxar_flag != r->lxar_flag)
+ continue;
+ if (rulep->lxar_action != r->lxar_action)
+ continue;
+ if (rulep->lxar_fld_cnt != r->lxar_fld_cnt)
+ continue;
+ for (i = 0, mtch = B_TRUE; i < LX_AUDIT_BITMASK_SIZE; i++) {
+ if (rulep->lxar_mask[i] != r->lxar_mask[i]) {
+ mtch = B_FALSE;
+ break;
+ }
+ }
+ if (!mtch)
+ continue;
+
+ for (i = 0, mtch = B_TRUE; i < rulep->lxar_fld_cnt; i++) {
+ if (rulep->lxar_fields[i] != r->lxar_fields[i] ||
+ rulep->lxar_values[i] != r->lxar_values[i] ||
+ rulep->lxar_fld_flag[i] != r->lxar_fld_flag[i]) {
+ mtch = B_FALSE;
+ break;
+ }
+ }
+ if (!mtch)
+ continue;
+ if (rulep->lxar_buflen != r->lxar_buflen)
+ continue;
+ if (bcmp(datap, erp->lxare_buf, r->lxar_buflen) == 0)
+ break;
+ }
+
+ /* There is no matching rule */
+ if (erp == NULL) {
+ mutex_exit(&asp->lxast_lock);
+ return (ENOENT);
+ }
+
+ /*
+ * Disable auditing for each relevant syscall.
+ */
+ for (sysnum = 0; sysnum < LX_NSYSCALLS; sysnum++) {
+ if (BT_TEST32(rulep->lxar_mask, sysnum)) {
+ /*
+ * If this was the first rule on the list for the
+ * given syscall (likely, since usually only one rule
+ * per syscall) then either disable tracing for that
+ * syscall, or point to the next applicable rule in the
+ * list.
+ */
+ if (erp->lxare_is32bit) {
+ if (asp->lxast_sys32_rulep[sysnum] == erp) {
+ asp->lxast_sys32_rulep[sysnum] =
+ lx_audit_next_applicable_rule(
+ sysnum, LX_AUDIT_ARCH32, asp, erp);
+ }
+ }
+ if (erp->lxare_is64bit) {
+ if (asp->lxast_sys64_rulep[sysnum] == erp) {
+ asp->lxast_sys64_rulep[sysnum] =
+ lx_audit_next_applicable_rule(
+ sysnum, LX_AUDIT_ARCH64, asp, erp);
+ }
+ }
+ }
+ }
+
+ /* Remove the rule from the top-level list */
+ list_remove(&asp->lxast_rules, erp);
+
+ kmem_free(erp->lxare_buf, erp->lxare_rule.lxar_buflen);
+ if (erp->lxare_key != NULL)
+ kmem_free(erp->lxare_key, strlen(erp->lxare_key) + 1);
+ kmem_free(erp, sizeof (lx_audit_rule_ent_t));
+
+ mutex_exit(&asp->lxast_lock);
+ return (0);
+}
+
+void
+lx_audit_emit_user_msg(uint_t mtype, uint_t len, char *datap)
+{
+ lx_zone_data_t *lxzd = ztolxzd(curzone);
+ lx_audit_state_t *asp;
+ lx_audit_record_t *rp;
+ timestruc_t ts;
+ uint_t sessid;
+ proc_t *p = curproc;
+ lx_lwp_data_t *lwpd = lwptolxlwp(ttolwp(curthread));
+ uint_t prelen, alen;
+ char msg[LX_AUDIT_MESSAGE_TEXT_MAX];
+
+ /*
+ * For user messages, auditing may not actually be initialized. If not,
+ * just return.
+ */
+ if (lxzd->lxzd_audit_enabled == LXAE_DISABLED ||
+ lxzd->lxzd_audit_state == NULL)
+ return;
+
+ if (len >= sizeof (msg))
+ len = sizeof (msg) - 1;
+
+ mutex_enter(&p->p_splock);
+ sessid = p->p_sessp->s_sid;
+ mutex_exit(&p->p_splock);
+
+ asp = lxzd->lxzd_audit_state;
+ ASSERT(asp != NULL);
+
+ mutex_enter(&asp->lxast_lock);
+
+ if (asp->lxast_backlog >= asp->lxast_backlog_limit) {
+ lx_audit_fail(asp, "audit: backlog limit exceeded");
+ mutex_exit(&asp->lxast_lock);
+ return;
+ }
+
+ rp = kmem_alloc(sizeof (lx_audit_record_t), KM_NOSLEEP);
+ if (rp == NULL) {
+ lx_audit_fail(asp, "audit: no kernel memory");
+ mutex_exit(&asp->lxast_lock);
+ return;
+ }
+ rp->lxar_msg = kmem_zalloc(LX_AUDIT_MESSAGE_TEXT_MAX, KM_NOSLEEP);
+ if (rp->lxar_msg == NULL) {
+ lx_audit_fail(asp, "audit: no kernel memory");
+ mutex_exit(&asp->lxast_lock);
+ kmem_free(rp, sizeof (lx_audit_record_t));
+ return;
+ }
+ rp->lxar_type = mtype;
+ bcopy(datap, msg, len);
+ msg[len] = '\0';
+
+ gethrestime(&ts);
+ ts.tv_sec -= curzone->zone_boot_time;
+
+ (void) snprintf(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX,
+ "audit(%lu.%03lu:%lu): pid=%u uid=%u auid=%u ses=%u msg=\'",
+ (uint64_t)ts.tv_sec, /* zone's timestamp */
+ (uint64_t)ts.tv_nsec / 1000000,
+ asp->lxast_seq++, /* serial number */
+ (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid),
+ crgetruid(CRED()), /* uid */
+ lx_audit_get_auid(), /* auid */
+ sessid); /* ses */
+
+ prelen = strlen(rp->lxar_msg);
+ alen = LX_AUDIT_MESSAGE_TEXT_MAX - prelen - 2;
+ (void) strlcat(rp->lxar_msg + prelen, msg, alen);
+ (void) strlcat(rp->lxar_msg, "\'", LX_AUDIT_MESSAGE_TEXT_MAX);
+
+ list_insert_tail(&asp->lxast_ev_queue, rp);
+ if (asp->lxast_backlog == 0)
+ cv_signal(&asp->lxast_worker_cv);
+ asp->lxast_backlog++;
+ mutex_exit(&asp->lxast_lock);
+}
+
+void
+lx_audit_list_rules(void *reply,
+ void (*cb)(void *, void *, uint_t, void *, uint_t))
+{
+ lx_audit_state_t *asp;
+ lx_audit_rule_ent_t *rp;
+
+ asp = ztolxzd(curzone)->lxzd_audit_state;
+ ASSERT(asp != NULL);
+
+ /*
+ * Output the rule list
+ */
+ mutex_enter(&asp->lxast_lock);
+ for (rp = list_head(&asp->lxast_rules); rp != NULL;
+ rp = list_next(&asp->lxast_rules, rp)) {
+ cb(reply, &rp->lxare_rule, sizeof (lx_audit_rule_t),
+ rp->lxare_buf, rp->lxare_rule.lxar_buflen);
+ }
+ mutex_exit(&asp->lxast_lock);
+}
+
+void
+lx_audit_get_feature(void *reply, void (*cb)(void *, void *, uint_t))
+{
+ lx_audit_features_t af;
+
+ af.lxaf_version = LX_AUDIT_FEATURE_VERSION;
+ af.lxaf_mask = 0xffffffff;
+ af.lxaf_features = 0;
+ af.lxaf_lock = 0;
+
+ cb(reply, &af, sizeof (af));
+}
+
+void
+lx_audit_get(void *reply, void (*cb)(void *, void *, uint_t))
+{
+ lx_audit_status_t status;
+ lx_zone_data_t *lxzd;
+ lx_audit_state_t *asp;
+
+ lxzd = ztolxzd(curproc->p_zone);
+ asp = lxzd->lxzd_audit_state;
+ ASSERT(asp != NULL);
+
+ bzero(&status, sizeof (status));
+
+ mutex_enter(&asp->lxast_lock);
+ status.lxas_enabled = lxzd->lxzd_audit_enabled;
+ status.lxas_failure = asp->lxast_failure;
+ status.lxas_pid = asp->lxast_pid;
+ status.lxas_rate_limit = asp->lxast_rate_limit;
+ status.lxas_backlog_limit = asp->lxast_backlog_limit;
+ status.lxas_lost = asp->lxast_lost;
+ status.lxas_backlog = asp->lxast_backlog;
+ status.lxas_backlog_wait_time = asp->lxast_backlog_wait_time;
+ status.lxas_feature_bitmap = LX_AUDIT_FEATURE_ALL;
+ mutex_exit(&asp->lxast_lock);
+
+ cb(reply, &status, sizeof (status));
+}
+
+int
+lx_audit_set(void *lxsock, void *s, uint_t datalen,
+ void (*cb)(void *, boolean_t))
+{
+ lx_audit_status_t *statusp = (lx_audit_status_t *)s;
+ lx_zone_data_t *lxzd;
+ lx_audit_state_t *asp;
+
+ /*
+ * Unfortunately, some user-level code does not send down a full
+ * lx_audit_status_t structure in the message (e.g. this occurs on
+ * CentOS7). Only the structure up to, but not including, the embedded
+ * union is being sent in. This appears to be a result of the user-level
+ * code being built for older versions of the kernel. To handle this,
+ * we have to subtract the last 8 bytes from the size in order to
+ * accommodate this code. We'll revalidate with the full size if
+ * LX_AUDIT_STATUS_BACKLOG_WAIT_TIME is set in the mask.
+ */
+ if (datalen < sizeof (lx_audit_status_t) - 8)
+ return (EINVAL);
+
+ lxzd = ztolxzd(curproc->p_zone);
+ asp = lxzd->lxzd_audit_state;
+ ASSERT(asp != NULL);
+
+ /* Once the config is locked, we only allow changing the auditd pid */
+ mutex_enter(&asp->lxast_lock);
+ if (lxzd->lxzd_audit_enabled == LXAE_LOCKED &&
+ (statusp->lxas_mask & ~LX_AUDIT_STATUS_PID)) {
+ mutex_exit(&asp->lxast_lock);
+ return (EPERM);
+ }
+
+ if (statusp->lxas_mask & LX_AUDIT_STATUS_FAILURE) {
+ switch (statusp->lxas_failure) {
+ case LXAE_SILENT:
+ case LXAE_PRINT:
+ case LXAE_PANIC:
+ asp->lxast_failure = statusp->lxas_failure;
+ break;
+ default:
+ mutex_exit(&asp->lxast_lock);
+ return (EINVAL);
+ }
+ }
+ if (statusp->lxas_mask & LX_AUDIT_STATUS_PID) {
+ /*
+ * The process that sets the pid is the daemon, so this is the
+ * socket we'll write audit records out to.
+ */
+ lx_audit_set_worker(statusp->lxas_pid, lxsock, cb);
+ }
+ if (statusp->lxas_mask & LX_AUDIT_STATUS_RATE_LIMIT) {
+ asp->lxast_rate_limit = statusp->lxas_rate_limit;
+ }
+ if (statusp->lxas_mask & LX_AUDIT_STATUS_BACKLOG_LIMIT) {
+ asp->lxast_backlog_limit = statusp->lxas_backlog_limit;
+ }
+ if (statusp->lxas_mask & LX_AUDIT_STATUS_BACKLOG_WAIT_TIME) {
+ /*
+ * See the comment above. We have to revalidate the full struct
+ * size since we previously only validated for a shorter struct.
+ */
+ if (datalen < sizeof (lx_audit_status_t)) {
+ mutex_exit(&asp->lxast_lock);
+ return (EINVAL);
+ }
+ asp->lxast_backlog_wait_time = statusp->lxas_backlog_wait_time;
+ }
+ if (statusp->lxas_mask & LX_AUDIT_STATUS_LOST) {
+ asp->lxast_lost = statusp->lxas_lost;
+ }
+
+ if (statusp->lxas_mask & LX_AUDIT_STATUS_ENABLED) {
+ switch (statusp->lxas_enabled) {
+ case 0:
+ lxzd->lxzd_audit_enabled = LXAE_DISABLED;
+ break;
+ case 1:
+ lxzd->lxzd_audit_enabled = LXAE_ENABLED;
+ break;
+ case 2:
+ lxzd->lxzd_audit_enabled = LXAE_LOCKED;
+ break;
+ default:
+ mutex_exit(&asp->lxast_lock);
+ return (EINVAL);
+ }
+ }
+ mutex_exit(&asp->lxast_lock);
+
+ return (0);
+}
+
+void
+lx_audit_stop_worker(void *s, void (*cb)(void *, boolean_t))
+{
+ lx_audit_state_t *asp = ztolxzd(curzone)->lxzd_audit_state;
+ kt_did_t tid = 0;
+
+ ASSERT(asp != NULL);
+ mutex_enter(&asp->lxast_lock);
+ if (s == NULL) {
+ s = asp->lxast_sock;
+ } else {
+ VERIFY(s == asp->lxast_sock);
+ }
+ asp->lxast_sock = NULL;
+ asp->lxast_pid = 0;
+ if (asp->lxast_worker != NULL) {
+ tid = asp->lxast_worker->t_did;
+ asp->lxast_worker = NULL;
+ asp->lxast_exit = B_TRUE;
+ cv_signal(&asp->lxast_worker_cv);
+ }
+ if (s != NULL)
+ cb(s, B_FALSE);
+ mutex_exit(&asp->lxast_lock);
+
+ if (tid != 0)
+ thread_join(tid);
+}
+
+/*
+ * Called when an audit netlink message is received, in order to perform lazy
+ * allocation of audit state for the zone. We also perform the one-time step to
+ * cache the netlink callback used by the audit worker thread to send messages
+ * up to the auditd.
+ */
+void
+lx_audit_init(int (*cb)(void *, uint_t, const char *, uint_t))
+{
+ lx_zone_data_t *lxzd = ztolxzd(curzone);
+ lx_audit_state_t *asp;
+
+ mutex_enter(&lxzd->lxzd_lock);
+
+ if (lxzd->lxzd_audit_state != NULL) {
+ mutex_exit(&lxzd->lxzd_lock);
+ return;
+ }
+
+ asp = kmem_zalloc(sizeof (lx_audit_state_t), KM_SLEEP);
+
+ mutex_init(&asp->lxast_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&asp->lxast_worker_cv, NULL, CV_DEFAULT, NULL);
+ list_create(&asp->lxast_ev_queue, sizeof (lx_audit_record_t),
+ offsetof(lx_audit_record_t, lxar_link));
+ list_create(&asp->lxast_rules, sizeof (lx_audit_rule_ent_t),
+ offsetof(lx_audit_rule_ent_t, lxare_link));
+ asp->lxast_failure = LXAE_PRINT;
+ asp->lxast_backlog_limit = LX_AUDIT_DEF_BACKLOG_LIMIT;
+ asp->lxast_backlog_wait_time = LX_AUDIT_DEF_WAIT_TIME;
+
+ lxzd->lxzd_audit_state = asp;
+
+ mutex_exit(&lxzd->lxzd_lock);
+
+ mutex_enter(&lx_audit_em_lock);
+ if (lx_audit_emit_msg == NULL)
+ lx_audit_emit_msg = cb;
+ mutex_exit(&lx_audit_em_lock);
+}
+
+/*
+ * Called when the netlink module is unloading so that we can clear the cached
+ * netlink callback used by the audit worker thread to send messages up to the
+ * auditd.
+ */
+void
+lx_audit_cleanup(void)
+{
+ mutex_enter(&lx_audit_em_lock);
+ lx_audit_emit_msg = NULL;
+ mutex_exit(&lx_audit_em_lock);
+}
+
+/*
+ * Called when the zone is being destroyed, not when auditing is being disabled.
+ * Note that zsched has already exited and any lxast_worker thread has exited.
+ */
+void
+lx_audit_fini(zone_t *zone)
+{
+ lx_zone_data_t *lxzd = ztolxzd(zone);
+ lx_audit_state_t *asp;
+ lx_audit_record_t *rp;
+ lx_audit_rule_ent_t *erp;
+
+ ASSERT(MUTEX_HELD(&lxzd->lxzd_lock));
+
+ if ((asp = lxzd->lxzd_audit_state) == NULL)
+ return;
+
+ mutex_enter(&asp->lxast_lock);
+
+ VERIFY(asp->lxast_worker == NULL);
+
+ rp = list_remove_head(&asp->lxast_ev_queue);
+ while (rp != NULL) {
+ kmem_free(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX);
+ kmem_free(rp, sizeof (lx_audit_record_t));
+ rp = list_remove_head(&asp->lxast_ev_queue);
+ }
+
+ list_destroy(&asp->lxast_ev_queue);
+ asp->lxast_backlog = 0;
+ asp->lxast_pid = 0;
+
+ erp = list_remove_head(&asp->lxast_rules);
+ while (erp != NULL) {
+ kmem_free(erp->lxare_buf, erp->lxare_rule.lxar_buflen);
+ if (erp->lxare_key != NULL)
+ kmem_free(erp->lxare_key, strlen(erp->lxare_key) + 1);
+ kmem_free(erp, sizeof (lx_audit_rule_ent_t));
+ erp = list_remove_head(&asp->lxast_rules);
+ }
+ list_destroy(&asp->lxast_rules);
+
+ mutex_exit(&asp->lxast_lock);
+
+ cv_destroy(&asp->lxast_worker_cv);
+ mutex_destroy(&asp->lxast_lock);
+ lxzd->lxzd_audit_state = NULL;
+ kmem_free(asp, sizeof (lx_audit_state_t));
+}
+
+/*
+ * Audit initialization/cleanup when lx brand module is loaded and
+ * unloaded.
+ */
+void
+lx_audit_ld(void)
+{
+ mutex_init(&lx_audit_em_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+lx_audit_unld(void)
+{
+ mutex_destroy(&lx_audit_em_lock);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c
new file mode 100644
index 0000000000..0f78bca605
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_brand.c
@@ -0,0 +1,2701 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * The LX Brand: emulation of a Linux operating environment within a zone.
+ *
+ * OVERVIEW
+ *
+ * The LX brand enables a full Linux userland -- including a C library,
+ * init(1) framework, and some set of applications -- to run unmodified
+ * within an illumos zone. Unlike illumos, where applications are expected
+ * to link against and consume functions exported from libraries, the
+ * supported Linux binary compatibility boundary is the system call
+ * interface. By accurately emulating the behaviour of Linux system calls,
+ * Linux software can be executed in this environment as if it were running
+ * on a native Linux system.
+ *
+ * EMULATING LINUX SYSTEM CALLS
+ *
+ * Linux system calls are made in 32-bit processes via the "int 0x80"
+ * instruction; in 64-bit processes the "syscall" instruction is used, as it
+ * is with native illumos processes. In both cases, arguments to system
+ * calls are generally passed in registers and the usermode stack is not
+ * interpreted or modified by the Linux kernel.
+ *
+ * When the emulated Linux process makes a system call, it traps into the
+ * illumos kernel. The in-kernel brand module contains various emulation
+ * routines, and can fully service some emulated system calls; e.g. read(2)
+ * and write(2). Other system calls require assistance from the illumos
+ * libc, bouncing back out to the brand library ("lx_brand.so.1") for
+ * emulation.
+ *
+ * The brand mechanism allows for the provision of an alternative trap
+ * handler for the various system call mechanisms. Traditionally this was
+ * used to immediately revector execution to the usermode emulation library,
+ * which was responsible for handling all system calls. In the interests of
+ * more accurate emulation and increased performance, much of the regular
+ * illumos system call path is now invoked. Only the argument processing and
+ * handler dispatch are replaced by the brand, via the per-LWP
+ * "lwp_brand_syscall" interposition function pointer.
+ *
+ * THE NATIVE AND BRAND STACKS
+ *
+ * Some runtime environments (e.g. the Go language) allocate very small
+ * thread stacks, preferring to grow or split the stack as necessary. The
+ * Linux kernel generally does not use the usermode stack when servicing
+ * system calls, so this is not a problem. In order for our emulation to
+ * have the same zero stack impact, we must execute usermode emulation
+ * routines on an _alternate_ stack. This is similar, in principle, to the
+ * use of sigaltstack(3C) to run signal handlers off the main thread stack.
+ *
+ * To this end, the brand library allocates and installs an alternate stack
+ * (called the "native" stack) for each LWP. The in-kernel brand code uses
+ * this stack for usermode emulation calls and interposed signal delivery,
+ * while the emulated Linux process sees only the data on the main thread
+ * stack, known as the "brand" stack. The stack mode is tracked in the
+ * per-LWP brand-private data, using the LX_STACK_MODE_* enum.
+ *
+ * The stack mode doubles as a system call "mode bit". When in the
+ * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux
+ * system calls. In other modes, system calls are assumed to be native
+ * illumos system calls as made during brand library initialisation and
+ * usermode emulation.
+ *
+ * USERMODE EMULATION
+ *
+ * When a Linux system call cannot be emulated within the kernel, we preserve
+ * the register state of the Linux process and revector the LWP to the brand
+ * library usermode emulation handler: the "lx_emulate()" function in
+ * "lx_brand.so.1". This revectoring is modelled on the delivery of signals,
+ * and is performed in "lx_emulate_user()".
+ *
+ * First, the emulated process state is written out to the usermode stack of
+ * the process as a "ucontext_t" object. Arguments to the emulation routine
+ * are passed on the stack or in registers, depending on the ABI. When the
+ * usermode emulation is complete, the result is passed back to the kernel
+ * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context
+ * for restoration.
+ *
+ * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT
+ *
+ * When servicing emulated system calls in the usermode brand library, or
+ * during signal delivery, various state is preserved by the kernel so that
+ * the running LWP may be revectored to a handling routine. The context
+ * allows the kernel to restart the program at the point of interruption,
+ * either at the return of the signal handler, via setcontext(3C); or after
+ * the usermode emulation request has been serviced, via B_EMULATION_DONE.
+ *
+ * In illumos native processes, the saved context (a "ucontext_t" object)
+ * includes the state of registers and the current signal mask at the point
+ * of interruption. The context also includes a link to the most recently
+ * saved context, forming a chain to be unwound as requests complete. The LX
+ * brand requires additional book-keeping to describe the machine state: in
+ * particular, the current stack mode and the occupied extent of the native
+ * stack.
+ *
+ * The brand code is able to interpose on the context save and restore
+ * operations in the kernel -- see "lx_savecontext()" and
+ * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to
+ * function correctly in the face of a dual stack LWP. The brand also
+ * interposes on the signal delivery mechanism -- see "lx_sendsig()" and
+ * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand
+ * library interposer on the native stack, regardless of the interrupted
+ * execution mode. Linux sigaltstack(2) emulation is performed entirely by
+ * the usermode brand library during signal handler interposition.
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/thread.h>
+#include <sys/systm.h>
+#include <sys/syscall.h>
+#include <sys/proc.h>
+#include <sys/modctl.h>
+#include <sys/cmn_err.h>
+#include <sys/model.h>
+#include <sys/exec.h>
+#include <sys/lx_impl.h>
+#include <sys/machbrand.h>
+#include <sys/lx_syscalls.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_futex.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/lx_userhz.h>
+#include <sys/param.h>
+#include <sys/termios.h>
+#include <sys/sunddi.h>
+#include <sys/ddi.h>
+#include <sys/vnode.h>
+#include <sys/pathname.h>
+#include <sys/auxv.h>
+#include <sys/priv.h>
+#include <sys/regset.h>
+#include <sys/privregs.h>
+#include <sys/archsystm.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/sdt.h>
+#include <sys/x86_archext.h>
+#include <sys/controlregs.h>
+#include <sys/core.h>
+#include <sys/stack.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <lx_signum.h>
+#include <util/sscanf.h>
+#include <sys/lx_brand.h>
+#include <sys/zfs_ioctl.h>
+#include <inet/tcp_impl.h>
+#include <inet/udp_impl.h>
+
+int lx_debug = 0;
+uint_t lx_hz_scale = 0;
+
+void lx_init_brand_data(zone_t *, kmutex_t *);
+void lx_free_brand_data(zone_t *);
+void lx_setbrand(proc_t *);
+int lx_getattr(zone_t *, int, void *, size_t *);
+int lx_setattr(zone_t *, int, void *, size_t);
+int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+void lx_set_kern_version(zone_t *, char *);
+void lx_copy_procdata(proc_t *, proc_t *);
+
+extern int getsetcontext(int, void *);
+extern int waitsys(idtype_t, id_t, siginfo_t *, int);
+#if defined(_SYSCALL32_IMPL)
+extern int getsetcontext32(int, void *);
+extern int waitsys32(idtype_t, id_t, siginfo_t *, int);
+#endif
+
+extern int zvol_name2minor(const char *, minor_t *);
+extern int zvol_create_minor(const char *);
+
+extern void lx_proc_exit(proc_t *);
+extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
+
+extern void lx_io_clear(lx_proc_data_t *);
+extern void lx_io_cleanup(proc_t *);
+
+extern void lx_ioctl_init();
+extern void lx_ioctl_fini();
+extern void lx_socket_init();
+extern void lx_socket_fini();
+
+extern int lx_start_nfs_lockd();
+extern void lx_upcall_statd();
+
+lx_systrace_f *lx_systrace_entry_ptr;
+lx_systrace_f *lx_systrace_return_ptr;
+
+static int lx_systrace_enabled;
+
+/*
+ * cgroup file system maintenance functions which are set when cgroups loads.
+ */
+void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t);
+void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t);
+
+/*
+ * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly
+ * want an MMU dependency here (and should there be a microprocessor without
+ * a hole, we don't want to start allocating from the top of the VA range).
+ */
+#define LX_MAXSTACK64 0x7ffffff00000
+
+uint64_t lx_maxstack64 = LX_MAXSTACK64;
+
+static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
+ struct intpdata *idata, int level, size_t *execsz, int setid,
+ caddr_t exec_file, struct cred *cred, int *brand_action);
+
+static boolean_t lx_native_exec(uint8_t, const char **);
+static uint32_t lx_map32limit(proc_t *);
+
+static void lx_savecontext(ucontext_t *);
+static void lx_restorecontext(ucontext_t *);
+static caddr_t lx_sendsig_stack(int);
+static void lx_sendsig(int);
+#if defined(_SYSCALL32_IMPL)
+static void lx_savecontext32(ucontext32_t *);
+#endif
+static int lx_setid_clear(vattr_t *, cred_t *);
+#if defined(_LP64)
+static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type,
+ enum seg_rw);
+#endif
+static void lx_clearbrand(proc_t *, boolean_t);
+
+typedef struct lx_zfs_ds {
+ list_node_t ds_link;
+ char ds_name[MAXPATHLEN];
+ uint64_t ds_cookie;
+} lx_zfs_ds_t;
+
+/* lx brand */
+struct brand_ops lx_brops = {
+ lx_init_brand_data, /* b_init_brand_data */
+ lx_free_brand_data, /* b_free_brand_data */
+ lx_brandsys, /* b_brandsys */
+ lx_setbrand, /* b_setbrand */
+ lx_getattr, /* b_getattr */
+ lx_setattr, /* b_setattr */
+ lx_copy_procdata, /* b_copy_procdata */
+ lx_proc_exit, /* b_proc_exit */
+ lx_exec, /* b_exec */
+ lx_setrval, /* b_lwp_setrval */
+ lx_lwpdata_alloc, /* b_lwpdata_alloc */
+ lx_lwpdata_free, /* b_lwpdata_free */
+ lx_initlwp, /* b_initlwp */
+ lx_initlwp_post, /* b_initlwp_post */
+ lx_forklwp, /* b_forklwp */
+ lx_freelwp, /* b_freelwp */
+ lx_exitlwp, /* b_lwpexit */
+ lx_elfexec, /* b_elfexec */
+ NULL, /* b_sigset_native_to_brand */
+ NULL, /* b_sigset_brand_to_native */
+ lx_sigfd_translate, /* b_sigfd_translate */
+ NSIG, /* b_nsig */
+ lx_exit_with_sig, /* b_exit_with_sig */
+ lx_wait_filter, /* b_wait_filter */
+ lx_native_exec, /* b_native_exec */
+ lx_map32limit, /* b_map32limit */
+ lx_stop_notify, /* b_stop_notify */
+ lx_waitid_helper, /* b_waitid_helper */
+ lx_sigcld_repost, /* b_sigcld_repost */
+ lx_ptrace_issig_stop, /* b_issig_stop */
+ lx_ptrace_sig_ignorable, /* b_sig_ignorable */
+ lx_savecontext, /* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+ lx_savecontext32, /* b_savecontext32 */
+#endif
+ lx_restorecontext, /* b_restorecontext */
+ lx_sendsig_stack, /* b_sendsig_stack */
+ lx_sendsig, /* b_sendsig */
+ lx_setid_clear, /* b_setid_clear */
+#if defined(_LP64)
+ lx_pagefault, /* b_pagefault */
+#else
+ NULL,
+#endif
+ B_FALSE, /* b_intp_parse_arg */
+ lx_clearbrand, /* b_clearbrand */
+ lx_upcall_statd, /* b_rpc_statd */
+ lx_acct_out /* b_acct_out */
+};
+
+struct brand_mach_ops lx_mops = {
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ lx_fixsegreg,
+ lx_fsbase
+};
+
+struct brand lx_brand = {
+ BRAND_VER_1,
+ "lx",
+ &lx_brops,
+ &lx_mops,
+ sizeof (struct lx_proc_data)
+};
+
+static struct modlbrand modlbrand = {
+ &mod_brandops, "lx brand", &lx_brand
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, (void *)&modlbrand, NULL
+};
+
+void
+lx_proc_exit(proc_t *p)
+{
+ lx_proc_data_t *lxpd;
+ proc_t *cp;
+
+ lx_clone_grp_exit(p, B_FALSE);
+ /* Cleanup any outstanding aio contexts */
+ lx_io_cleanup(p);
+
+ mutex_enter(&p->p_lock);
+ VERIFY((lxpd = ptolxproc(p)) != NULL);
+ VERIFY(lxpd->l_ptrace == 0);
+ if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) {
+ mutex_exit(&p->p_lock);
+ return;
+ }
+ mutex_exit(&p->p_lock);
+
+ /* Check for children which desire notification of parental death. */
+ mutex_enter(&pidlock);
+ for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) {
+ mutex_enter(&cp->p_lock);
+ if ((lxpd = ptolxproc(cp)) == NULL) {
+ mutex_exit(&cp->p_lock);
+ continue;
+ }
+ if (lxpd->l_parent_deathsig != 0) {
+ sigtoproc(cp, NULL, lxpd->l_parent_deathsig);
+ }
+ mutex_exit(&cp->p_lock);
+ }
+ mutex_exit(&pidlock);
+}
+
+void
+lx_setbrand(proc_t *p)
+{
+ /* Send SIGCHLD to parent by default when child exits */
+ ptolxproc(p)->l_signal = stol_signo[SIGCHLD];
+
+ lx_read_argv_bounds(p);
+}
+
+/* ARGSUSED */
+int
+lx_setattr(zone_t *zone, int attr, void *ubuf, size_t ubufsz)
+{
+ lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
+
+ switch (attr) {
+ case LX_ATTR_KERN_RELEASE: {
+ char buf[LX_KERN_RELEASE_MAX];
+ bzero(buf, LX_KERN_RELEASE_MAX);
+ if (ubufsz >= LX_KERN_RELEASE_MAX) {
+ return (ERANGE);
+ }
+ if (copyin(ubuf, buf, ubufsz) != 0) {
+ return (EFAULT);
+ }
+ mutex_enter(&lxzd->lxzd_lock);
+ (void) strlcpy(lxzd->lxzd_kernel_release, buf,
+ LX_KERN_RELEASE_MAX);
+ mutex_exit(&lxzd->lxzd_lock);
+ return (0);
+ }
+ case LX_ATTR_KERN_VERSION: {
+ char buf[LX_KERN_VERSION_MAX];
+ bzero(buf, LX_KERN_VERSION_MAX);
+ if (ubufsz >= LX_KERN_VERSION_MAX) {
+ return (ERANGE);
+ }
+ if (copyin(ubuf, buf, ubufsz) != 0) {
+ return (EFAULT);
+ }
+ mutex_enter(&lxzd->lxzd_lock);
+ (void) strlcpy(lxzd->lxzd_kernel_version, buf,
+ LX_KERN_VERSION_MAX);
+ mutex_exit(&lxzd->lxzd_lock);
+ return (0);
+ }
+ case LX_ATTR_TTY_GID: {
+ gid_t gid;
+ if (ubufsz != sizeof (gid)) {
+ return (ERANGE);
+ }
+ if (copyin(ubuf, &gid, ubufsz) != 0) {
+ return (EFAULT);
+ }
+ mutex_enter(&lxzd->lxzd_lock);
+ lxzd->lxzd_ttygrp = gid;
+ mutex_exit(&lxzd->lxzd_lock);
+ return (0);
+ }
+ default:
+ return (EINVAL);
+ }
+}
+
+/* ARGSUSED */
+int
+lx_getattr(zone_t *zone, int attr, void *ubuf, size_t *ubufsz)
+{
+ lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
+ int len;
+
+ switch (attr) {
+ case LX_ATTR_KERN_RELEASE: {
+ char buf[LX_KERN_RELEASE_MAX];
+
+ mutex_enter(&lxzd->lxzd_lock);
+ len = strnlen(lxzd->lxzd_kernel_release, LX_KERN_RELEASE_MAX);
+ len++;
+ if (*ubufsz < len) {
+ mutex_exit(&lxzd->lxzd_lock);
+ return (ERANGE);
+ }
+ bzero(buf, sizeof (buf));
+ (void) strncpy(buf, lxzd->lxzd_kernel_release, sizeof (buf));
+ mutex_exit(&lxzd->lxzd_lock);
+ if (copyout(buf, ubuf, len) != 0) {
+ return (EFAULT);
+ }
+ *ubufsz = len;
+ return (0);
+ }
+ case LX_ATTR_KERN_VERSION: {
+ char buf[LX_KERN_VERSION_MAX];
+
+ mutex_enter(&lxzd->lxzd_lock);
+ len = strnlen(lxzd->lxzd_kernel_version, LX_KERN_VERSION_MAX);
+ len++;
+ if (*ubufsz < len) {
+ mutex_exit(&lxzd->lxzd_lock);
+ return (ERANGE);
+ }
+ bzero(buf, sizeof (buf));
+ (void) strncpy(buf, lxzd->lxzd_kernel_version, sizeof (buf));
+ mutex_exit(&lxzd->lxzd_lock);
+ if (copyout(buf, ubuf, len) != 0) {
+ return (EFAULT);
+ }
+ *ubufsz = len;
+ return (0);
+ }
+ default:
+ return (EINVAL);
+ }
+}
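+
+/*
+ * For illustration only (not part of this module): the global zone would
+ * typically push these attributes down with zone_setattr(), e.g.
+ *
+ *	char rel[] = "4.3.0";
+ *	(void) zone_setattr(zoneid, LX_ATTR_KERN_RELEASE, rel, sizeof (rel));
+ *
+ * which arrives in lx_setattr() above with ubufsz == sizeof (rel).
+ */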
+
+uint32_t
+lx_map32limit(proc_t *p)
+{
+ /*
+ * To be bug-for-bug compatible with Linux, we have MAP_32BIT only
+ * allow mappings in the first 31 bits. This was a nuance in the
+ * original Linux implementation circa 2002, and applications have
+ * come to depend on its behavior.
+ *
+ * This is only relevant for 64-bit processes.
+ */
+ if (p->p_model == DATAMODEL_LP64)
+ return ((uint32_t)1 << 31);
+
+ return ((uint32_t)USERLIMIT32);
+}
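+
+/*
+ * For example, under the clamp above a 64-bit process calling
+ * mmap(NULL, len, prot, MAP_32BIT | flags, fd, off) receives addresses below
+ * 0x80000000 (2GB), not the full 4GB range that the flag's name implies.
+ */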
+
+void
+lx_brand_systrace_enable(void)
+{
+ VERIFY(!lx_systrace_enabled);
+
+ lx_systrace_enabled = 1;
+}
+
+void
+lx_brand_systrace_disable(void)
+{
+ VERIFY(lx_systrace_enabled);
+
+ lx_systrace_enabled = 0;
+}
+
+void
+lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp)
+{
+ VERIFY(lwpd->br_ntv_stack != 0);
+
+ /*
+ * The "brand-lx-set-ntv-stack-current" probe has arguments:
+ * arg0: stack pointer before change
+ * arg1: stack pointer after change
+ * arg2: current stack base
+ */
+ DTRACE_PROBE3(brand__lx__set__ntv__stack__current,
+ uintptr_t, lwpd->br_ntv_stack_current,
+ uintptr_t, new_sp,
+ uintptr_t, lwpd->br_ntv_stack);
+
+ lwpd->br_ntv_stack_current = new_sp;
+}
+
+#if defined(_LP64)
+static int
+lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type,
+ enum seg_rw rw)
+{
+ int syscall_num;
+
+ /*
+ * We only want to handle a very specific set of circumstances.
+ * Namely: this is a 64-bit LX-branded process attempting to execute an
+ * address in a page for which it does not have a valid mapping. If
+ * this is not the case, we bail out as fast as possible.
+ */
+ VERIFY(PROC_IS_BRANDED(p));
+ if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) !=
+ DATAMODEL_NATIVE) {
+ return (-1);
+ }
+
+ if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) {
+ return (-1);
+ }
+
+ /*
+ * This is a valid vsyscall address. We service the system call and
+ * return 0 to signal that the pagefault has been handled completely.
+ */
+ lx_vsyscall_enter(p, lwp, syscall_num);
+ return (0);
+}
+#endif
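+
+/*
+ * For context: the Linux vsyscall page sits at the fixed address
+ * 0xffffffffff600000, with gettimeofday, time and getcpu entry points at
+ * fixed offsets within it.  Rather than mapping such a page, we take the
+ * execute fault above and emulate the call directly.
+ */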
+
+static void
+lx_clearbrand(proc_t *p, boolean_t lwps_ok)
+{
+ lx_clone_grp_exit(p, lwps_ok);
+}
+
+/*
+ * This hook runs prior to sendsig() processing and allows us to nominate
+ * an alternative stack pointer for delivery of the signal handling frame.
+ * Critically, this routine should _not_ modify any LWP state as the
+ * savecontext() does not run until after this hook.
+ */
+/* ARGSUSED */
+static caddr_t
+lx_sendsig_stack(int sig)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+ /*
+ * We want to take signal delivery on the native stack, but only if
+ * one has been allocated and installed for this LWP.
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ /*
+ * The program is not running on the native stack. Return
+ * the native stack pointer from our brand-private data so
+ * that we may switch to it for signal handling.
+ */
+ return ((caddr_t)lwpd->br_ntv_stack_current);
+ } else {
+ struct regs *rp = lwptoregs(lwp);
+
+ /*
+ * Either the program is already running on the native stack,
+ * or one has not yet been allocated for this LWP. Use the
+ * current stack pointer value.
+ */
+ return ((caddr_t)rp->r_sp);
+ }
+}
+
+/*
+ * This hook runs after sendsig() processing and allows us to update the
+ * per-LWP mode flags for system calls and stacks. The pre-signal
+ * context has already been saved and delivered to the user at this point.
+ */
+/* ARGSUSED */
+static void
+lx_sendsig(int sig)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ struct regs *rp = lwptoregs(lwp);
+
+ switch (lwpd->br_stack_mode) {
+ case LX_STACK_MODE_BRAND:
+ case LX_STACK_MODE_NATIVE:
+ /*
+ * In lx_sendsig_stack(), we nominated a stack pointer from the
+ * native stack. Update the stack mode, and the current in-use
+ * extent of the native stack, accordingly:
+ */
+ lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+ lx_lwp_set_native_stack_current(lwpd, rp->r_sp);
+
+ /*
+ * Fix up segment registers, etc.
+ */
+ lx_switch_to_native(lwp);
+ break;
+
+ default:
+ /*
+ * Otherwise, the brand library has not yet installed the
+ * alternate stack for this LWP. Signals will be handled on
+		 * the regular stack for this thread.
+ */
+ return;
+ }
+}
+
+/*
+ * This hook runs prior to the context restoration, allowing us to take action
+ * or modify the context before it is loaded.
+ */
+static void
+lx_restorecontext(ucontext_t *ucp)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];
+ caddr_t sp = ucp->uc_brand_data[1];
+
+ if (lwpd->br_stack_mode == LX_STACK_MODE_PREINIT) {
+ /*
+ * Since we're here with stack_mode as LX_STACK_MODE_PREINIT,
+ * that can only mean we took a signal really early in this
+		 * thread's lifetime, before we had a chance to set up a native
+ * stack and start running the thread's code. Since we're still
+ * handling everything on the single stack, we can't do any of
+ * the usual work below. Note: this means we cannot look at
+ * "flags" since the uc_brand_data may not have been properly
+ * set, depending on where we were when we took the signal.
+ */
+ return;
+ }
+
+ /*
+ * We have a saved native stack pointer value that we must restore
+ * into the per-LWP data.
+ */
+ if (flags & LX_UC_RESTORE_NATIVE_SP) {
+ lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp);
+ }
+
+ /*
+ * We do not wish to restore the value of uc_link in this context,
+ * so replace it with the value currently in the LWP.
+ */
+ if (flags & LX_UC_IGNORE_LINK) {
+ ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext;
+ }
+
+ /*
+ * Set or restore the stack mode. Usually this restores the mode, but
+ * the lx_runexe code flow also uses this to set the mode from
+	 * LX_STACK_MODE_INIT to LX_STACK_MODE_BRAND.
+ */
+ if (flags & LX_UC_STACK_NATIVE) {
+ lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+ } else if (flags & LX_UC_STACK_BRAND) {
+ lwpd->br_stack_mode = LX_STACK_MODE_BRAND;
+ }
+
+#if defined(__amd64)
+ /*
+ * Override the fs/gsbase in the context with the value provided
+ * through the Linux arch_prctl(2) system call.
+ */
+ if (flags & LX_UC_STACK_BRAND) {
+ if (lwpd->br_lx_fsbase != 0) {
+ ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
+ }
+ if (lwpd->br_lx_gsbase != 0) {
+ ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase;
+ }
+ }
+#endif
+}
+
+static void
+lx_savecontext(ucontext_t *ucp)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ uintptr_t flags = 0;
+
+ /*
+ * The ucontext_t affords us three private pointer-sized members in
+ * "uc_brand_data". We pack a variety of flags into the first element,
+ * and an optional stack pointer in the second element. The flags
+ * determine which stack pointer (native or brand), if any, is stored
+ * in the second element. The third element may contain the system
+ * call number; this is analogous to the "orig_[er]ax" member of a
+ * Linux "user_regs_struct".
+ */
+
+ if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
+ lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+ /*
+ * Record the value of the native stack pointer to restore
+ * when returning to this branded context:
+ */
+ flags |= LX_UC_RESTORE_NATIVE_SP;
+ ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current;
+ }
+
+ /*
+ * Save the stack mode:
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
+ flags |= LX_UC_STACK_NATIVE;
+ } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ flags |= LX_UC_STACK_BRAND;
+ }
+
+ /*
+ * If we might need to restart this system call, save that information
+ * in the context:
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ ucp->uc_brand_data[2] =
+ (void *)(uintptr_t)lwpd->br_syscall_num;
+ if (lwpd->br_syscall_restart) {
+ flags |= LX_UC_RESTART_SYSCALL;
+ }
+ } else {
+ ucp->uc_brand_data[2] = NULL;
+ }
+
+ ucp->uc_brand_data[0] = (void *)flags;
+}
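+
+/*
+ * A summary sketch of the uc_brand_data packing implemented above
+ * (illustrative only; the code is authoritative):
+ *
+ *	uc_brand_data[0]	LX_UC_* flag bits
+ *	uc_brand_data[1]	native stack pointer, valid when
+ *				LX_UC_RESTORE_NATIVE_SP is set
+ *	uc_brand_data[2]	system call number, when saved on the
+ *				brand stack (else NULL)
+ */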
+
+#if defined(_SYSCALL32_IMPL)
+static void
+lx_savecontext32(ucontext32_t *ucp)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ unsigned int flags = 0;
+
+ /*
+ * The ucontext_t affords us three private pointer-sized members in
+ * "uc_brand_data". We pack a variety of flags into the first element,
+ * and an optional stack pointer in the second element. The flags
+ * determine which stack pointer (native or brand), if any, is stored
+ * in the second element. The third element may contain the system
+ * call number; this is analogous to the "orig_[er]ax" member of a
+ * Linux "user_regs_struct".
+ */
+
+ if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
+ lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+ /*
+ * Record the value of the native stack pointer to restore
+ * when returning to this branded context:
+ */
+ flags |= LX_UC_RESTORE_NATIVE_SP;
+ ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current;
+ }
+
+ /*
+ * Save the stack mode:
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
+ flags |= LX_UC_STACK_NATIVE;
+ } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ flags |= LX_UC_STACK_BRAND;
+ }
+
+ /*
+ * If we might need to restart this system call, save that information
+ * in the context:
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num;
+ if (lwpd->br_syscall_restart) {
+ flags |= LX_UC_RESTART_SYSCALL;
+ }
+ } else {
+ ucp->uc_brand_data[2] = NULL;
+ }
+
+ ucp->uc_brand_data[0] = flags;
+}
+#endif
+
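+/*
+ * Wrapper for issuing ioctls against /dev/zfs via LDI.  When dst_alloc_size
+ * is non-NULL, an nvlist destination buffer is allocated for the reply and
+ * grown on ENOMEM (ZFS reports the size it needs); the caller must then free
+ * zc_nvlist_dst using the size returned in *dst_alloc_size.  A sketch of the
+ * caller side (see lx_zvol_props() below for the real thing):
+ *
+ *	size_t size;
+ *	if (lx_zfs_ioctl(lh, ZFS_IOC_OBJSET_STATS, zc, &size) == 0) {
+ *		... unpack zc->zc_nvlist_dst ...
+ *		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+ *	}
+ */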
+static int
+lx_zfs_ioctl(ldi_handle_t lh, int cmd, zfs_cmd_t *zc, size_t *dst_alloc_size)
+{
+ uint64_t cookie;
+ size_t dstsize;
+ int rc, unused;
+
+ cookie = zc->zc_cookie;
+
+ dstsize = (dst_alloc_size == NULL ? 0 : 8192);
+
+again:
+ if (dst_alloc_size != NULL) {
+ zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(dstsize,
+ KM_SLEEP);
+ zc->zc_nvlist_dst_size = dstsize;
+ }
+
+ rc = ldi_ioctl(lh, cmd, (intptr_t)zc, FKIOCTL, kcred, &unused);
+ if (rc == ENOMEM && dst_alloc_size != NULL) {
+ /*
+ * Our nvlist_dst buffer was too small, retry with a bigger
+ * buffer. ZFS will tell us the exact needed size.
+ */
+ size_t newsize = zc->zc_nvlist_dst_size;
+ ASSERT(newsize > dstsize);
+
+ kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, dstsize);
+ dstsize = newsize;
+ zc->zc_cookie = cookie;
+
+ goto again;
+ }
+
+ if (dst_alloc_size != NULL) {
+ *dst_alloc_size = dstsize;
+ }
+
+ return (rc);
+}
+
+static int
+lx_zone_zfs_open(ldi_handle_t *lh, dev_t *zfs_dev)
+{
+ ldi_ident_t li;
+
+ if (ldi_ident_from_mod(&modlinkage, &li) != 0) {
+ return (-1);
+ }
+ if (ldi_open_by_name("/dev/zfs", FREAD|FWRITE, kcred, lh, li) != 0) {
+ ldi_ident_release(li);
+ return (-1);
+ }
+ ldi_ident_release(li);
+ if (ldi_get_dev(*lh, zfs_dev) != 0) {
+ (void) ldi_close(*lh, FREAD|FWRITE, kcred);
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * We only get the relevant properties for zvols. This is because we're
+ * essentially iterating all of the ZFS datasets/zvols on the entire system
+ * when we boot the zone and there is a significant performance penalty if we
+ * have to retrieve all of the properties for everything, especially since we
+ * don't care about any of them except the zvols actually in our delegated
+ * datasets.
+ *
+ * Note that the two properties we care about, volsize & volblocksize, are
+ * mandatory for zvols and should always be present. Also, note that the
+ * blocksize property value cannot change after the zvol has been created.
+ */
+static void
+lx_zvol_props(ldi_handle_t lh, zfs_cmd_t *zc, uint64_t *vsz, uint64_t *bsz)
+{
+ int rc;
+ size_t size;
+ nvlist_t *nv = NULL, *nv2;
+
+ rc = lx_zfs_ioctl(lh, ZFS_IOC_OBJSET_STATS, zc, &size);
+ if (rc != 0)
+ return;
+
+ rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size, &nv, 0);
+ ASSERT(rc == 0);
+
+ kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+ zc->zc_nvlist_dst = NULL;
+ zc->zc_nvlist_dst_size = 0;
+
+ if ((rc = nvlist_lookup_nvlist(nv, "volsize", &nv2)) == 0) {
+ uint64_t val;
+
+ rc = nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val);
+ if (rc == 0) {
+ *vsz = val;
+ }
+ }
+
+ if ((rc = nvlist_lookup_nvlist(nv, "volblocksize", &nv2)) == 0) {
+ uint64_t val;
+
+ rc = nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val);
+ if (rc == 0) {
+ *bsz = val;
+ }
+ }
+
+ nvlist_free(nv);
+}
+
+/*
+ * Unlike ZFS proper, which handles zvols dynamically, we currently generate
+ * the zone's "disk" list only once, at zone boot time, and use it consistently
+ * in all of the various subsystems (devfs, sysfs, procfs). This avoids
+ * re-iterating the datasets every time one of those subsystems accesses a
+ * "disk" and keeps the view consistent across all subsystems, but
+ * it does mean a reboot is required to see new "disks". This is somewhat
+ * mitigated by its similarity to actual disk drives on a real system.
+ */
+static void
+lx_zone_get_zvols(zone_t *zone, ldi_handle_t lh, minor_t *emul_minor)
+{
+ lx_zone_data_t *lxzd;
+ list_t *zvol_lst, ds_lst;
+ int rc;
+ unsigned int devnum = 0;
+ size_t size;
+ zfs_cmd_t *zc;
+ nvpair_t *elem = NULL;
+ nvlist_t *pnv = NULL;
+
+ lxzd = ztolxzd(zone);
+ ASSERT(lxzd != NULL);
+ zvol_lst = lxzd->lxzd_vdisks;
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ if (lx_zfs_ioctl(lh, ZFS_IOC_POOL_CONFIGS, zc, &size) != 0) {
+ goto out;
+ }
+ ASSERT(zc->zc_cookie > 0);
+
+ rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size, &pnv, 0);
+ kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+ if (rc != 0)
+ goto out;
+
+ /*
+ * We use a dataset list to process all of the datasets in the pool
+ * without doing recursion so that we don't risk blowing the kernel
+ * stack.
+ */
+ list_create(&ds_lst, sizeof (lx_zfs_ds_t),
+ offsetof(lx_zfs_ds_t, ds_link));
+
+ while ((elem = nvlist_next_nvpair(pnv, elem)) != NULL) {
+ lx_zfs_ds_t *ds;
+
+ ds = kmem_zalloc(sizeof (lx_zfs_ds_t), KM_SLEEP);
+ (void) strcpy(ds->ds_name, nvpair_name(elem));
+ list_insert_head(&ds_lst, ds);
+
+ while (ds != NULL) {
+ int w; /* dummy variable */
+
+ bzero(zc, sizeof (zfs_cmd_t));
+ zc->zc_cookie = ds->ds_cookie;
+ (void) strcpy(zc->zc_name, ds->ds_name);
+
+ rc = lx_zfs_ioctl(lh, ZFS_IOC_DATASET_LIST_NEXT,
+ zc, NULL);
+ /* Update the cookie before doing anything else. */
+ ds->ds_cookie = zc->zc_cookie;
+
+ if (rc != 0) {
+ list_remove(&ds_lst, ds);
+ kmem_free(ds, sizeof (lx_zfs_ds_t));
+ ds = list_tail(&ds_lst);
+ continue;
+ }
+
+ /* Reserved internal names, skip over these. */
+ if (strchr(zc->zc_name, '$') != NULL ||
+ strchr(zc->zc_name, '%') != NULL)
+ continue;
+
+ if (!zone_dataset_visible_inzone(zone, zc->zc_name, &w))
+ continue;
+
+ if (zc->zc_objset_stats.dds_type == DMU_OST_ZVOL) {
+ lx_virt_disk_t *vd;
+ minor_t m = 0;
+ char *znm = zc->zc_name;
+
+ /* Create a virtual disk entry for the zvol */
+ vd = kmem_zalloc(sizeof (lx_virt_disk_t),
+ KM_SLEEP);
+ vd->lxvd_type = LXVD_ZVOL;
+ (void) snprintf(vd->lxvd_name,
+ sizeof (vd->lxvd_name),
+ "zvol%u", devnum++);
+ (void) strlcpy(vd->lxvd_real_name,
+ zc->zc_name,
+ sizeof (vd->lxvd_real_name));
+
+ /* Record emulated and real dev_t values */
+ vd->lxvd_emul_dev = makedevice(LX_MAJOR_DISK,
+ (*emul_minor)++);
+ if (zvol_name2minor(znm, &m) != 0) {
+ (void) zvol_create_minor(znm);
+ VERIFY(zvol_name2minor(znm, &m) == 0);
+ }
+ if (m != 0) {
+ vd->lxvd_real_dev = makedevice(
+ getmajor(lxzd->lxzd_zfs_dev), m);
+ }
+
+ /* Query volume size properties */
+ lx_zvol_props(lh, zc, &vd->lxvd_volsize,
+ &vd->lxvd_blksize);
+
+ list_insert_tail(zvol_lst, vd);
+ } else {
+ lx_zfs_ds_t *nds;
+
+ /* Create a new ds_t for the child. */
+ nds = kmem_zalloc(sizeof (lx_zfs_ds_t),
+ KM_SLEEP);
+ (void) strcpy(nds->ds_name, zc->zc_name);
+ list_insert_after(&ds_lst, ds, nds);
+
+ /* Depth-first, so do the one just created. */
+ ds = nds;
+ }
+ }
+
+ ASSERT(list_is_empty(&ds_lst));
+ }
+
+ list_destroy(&ds_lst);
+
+out:
+ nvlist_free(pnv);
+ kmem_free(zc, sizeof (zfs_cmd_t));
+}
+
+static void
+lx_zone_get_zfsds(zone_t *zone, minor_t *emul_minor)
+{
+ lx_zone_data_t *lxzd = ztolxzd(zone);
+ vfs_t *vfsp = zone->zone_rootvp->v_vfsp;
+
+ /*
+ * Only the root will be mounted at zone init time.
+	 * A means of discovering other datasets mounted in the zone would be
+	 * a good future enhancement.
+ */
+ if (getmajor(vfsp->vfs_dev) == getmajor(lxzd->lxzd_zfs_dev)) {
+ lx_virt_disk_t *vd;
+
+ vd = kmem_zalloc(sizeof (lx_virt_disk_t), KM_SLEEP);
+ vd->lxvd_type = LXVD_ZFS_DS;
+ vd->lxvd_real_dev = vfsp->vfs_dev;
+ vd->lxvd_emul_dev = makedevice(LX_MAJOR_DISK, (*emul_minor)++);
+ (void) snprintf(vd->lxvd_name, sizeof (vd->lxvd_name),
+ "zfsds%u", 0);
+ (void) strlcpy(vd->lxvd_real_name,
+ refstr_value(vfsp->vfs_resource),
+ sizeof (vd->lxvd_real_name));
+
+ list_insert_tail(lxzd->lxzd_vdisks, vd);
+ }
+}
+
+/* Cleanup virtual disk list */
+static void
+lx_zone_cleanup_vdisks(lx_zone_data_t *lxzd)
+{
+ lx_virt_disk_t *vd;
+
+ ASSERT(lxzd->lxzd_vdisks != NULL);
+ vd = (list_remove_head(lxzd->lxzd_vdisks));
+ while (vd != NULL) {
+ kmem_free(vd, sizeof (lx_virt_disk_t));
+ vd = list_remove_head(lxzd->lxzd_vdisks);
+ }
+
+ list_destroy(lxzd->lxzd_vdisks);
+ kmem_free(lxzd->lxzd_vdisks, sizeof (list_t));
+ lxzd->lxzd_vdisks = NULL;
+}
+
+/*
+ * By default illumos restricts access to ULP_DEF_EPRIV_PORT1 and
+ * ULP_DEF_EPRIV_PORT2 for TCP and UDP, even though these ports are outside of
+ * the privileged port range. Linux does not do this, so we need to remove
+ * these defaults.
+ *
+ * See also: mod_set_extra_privports
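+ *
+ * (On illumos, ULP_DEF_EPRIV_PORT1 and ULP_DEF_EPRIV_PORT2 default to the
+ * NFS and lockd ports, 2049 and 4045.)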
+ */
+static void
+lx_fix_ns_eports(netstack_t *ns)
+{
+ tcp_stack_t *tcps;
+ udp_stack_t *udps;
+ in_port_t *ports;
+ uint_t i, nports;
+ kmutex_t *lock;
+
+ tcps = ns->netstack_tcp;
+ ports = tcps->tcps_g_epriv_ports;
+ nports = tcps->tcps_g_num_epriv_ports;
+ lock = &tcps->tcps_epriv_port_lock;
+ mutex_enter(lock);
+ for (i = 0; i < nports; i++)
+ ports[i] = 0;
+ mutex_exit(lock);
+
+ udps = ns->netstack_udp;
+ ports = udps->us_epriv_ports;
+ nports = udps->us_num_epriv_ports;
+ lock = &udps->us_epriv_port_lock;
+ mutex_enter(lock);
+ for (i = 0; i < nports; i++)
+ ports[i] = 0;
+ mutex_exit(lock);
+}
+
+/*
+ * The default limit for TCP buffer sizing on illumos is smaller than its
+ * counterparts on Linux. Adjust it to meet minimum expectations.
+ */
+static void
+lx_fix_ns_buffers(netstack_t *ns)
+{
+ mod_prop_info_t *pinfo;
+ ulong_t target, parsed;
+ char buf[16];
+
+ /*
+ * Prior to kernel 3.4, Linux defaulted to a max of 4MB for both the
+ * tcp_rmem and tcp_wmem tunables. Kernels since then increase the
+ * tcp_rmem default max to 6MB. Since illumos lacks separate tunables
+ * to cap sizing for read and write buffers, the higher value is
+ * selected for compatibility.
+ */
+ if (lx_kern_release_cmp(curzone, "3.4.0") < 0) {
+ target = 4*1024*1024;
+ } else {
+ target = 6*1024*1024;
+ }
+
+ pinfo = mod_prop_lookup(ns->netstack_tcp->tcps_propinfo_tbl,
+ "max_buf", MOD_PROTO_TCP);
+ if (pinfo == NULL ||
+ pinfo->mpi_getf(ns, pinfo, NULL, buf, sizeof (buf), 0) != 0 ||
+ ddi_strtoul(buf, NULL, 10, &parsed) != 0 ||
+ parsed >= target) {
+ return;
+ }
+
+ (void) snprintf(buf, sizeof (buf), "%lu", target);
+ (void) pinfo->mpi_setf(ns, CRED(), pinfo, NULL, buf, 0);
+}
+
+static void
+lx_bootup_hooks()
+{
+ netstack_t *ns;
+
+ ns = netstack_get_current();
+ if (ns == NULL)
+ return;
+
+ lx_fix_ns_eports(ns);
+ lx_fix_ns_buffers(ns);
+
+ netstack_rele(ns);
+}
+
+void
+lx_init_brand_data(zone_t *zone, kmutex_t *zsl)
+{
+ lx_zone_data_t *data;
+ ldi_handle_t lh;
+
+ ASSERT(MUTEX_HELD(zsl));
+ ASSERT(zone->zone_brand == &lx_brand);
+ ASSERT(zone->zone_brand_data == NULL);
+
+ data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP);
+ mutex_init(&data->lxzd_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /* No need to hold mutex now since zone_brand_data is not set yet. */
+
+ /*
+	 * Set the default kernel release (2.4.21) and version strings.
+	 * These can be changed by a call to setattr() during zone boot.
+ */
+ (void) strlcpy(data->lxzd_kernel_release, "2.4.21",
+ LX_KERN_RELEASE_MAX);
+ (void) strlcpy(data->lxzd_kernel_version, "BrandZ virtual linux",
+ LX_KERN_VERSION_MAX);
+ data->lxzd_pipe_max_sz = lx_pipe_max_default;
+
+ zone->zone_brand_data = data;
+
+ /*
+ * In Linux, if the init(1) process terminates the system panics.
+ * The zone must reboot to simulate this behaviour.
+ */
+ zone->zone_reboot_on_init_exit = B_TRUE;
+
+ /*
+	 * We cannot hold the zone_status_lock while performing zfs operations,
+	 * so we drop the lock, get the zfs devs as the last step in this
+	 * function, then reacquire the lock. Don't add any code after this
+ * which requires that the zone_status_lock was continuously held.
+ */
+ mutex_exit(zsl);
+
+ data->lxzd_vdisks = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(data->lxzd_vdisks, sizeof (lx_virt_disk_t),
+ offsetof(lx_virt_disk_t, lxvd_link));
+
+ if (lx_zone_zfs_open(&lh, &data->lxzd_zfs_dev) == 0) {
+ minor_t emul_minor = 1;
+
+ lx_zone_get_zfsds(zone, &emul_minor);
+ lx_zone_get_zvols(zone, lh, &emul_minor);
+ (void) ldi_close(lh, FREAD|FWRITE, kcred);
+ } else {
+ /* Avoid matching any devices */
+ data->lxzd_zfs_dev = makedevice(-1, 0);
+ }
+ mutex_enter(zsl);
+}
+
+void
+lx_free_brand_data(zone_t *zone)
+{
+ lx_zone_data_t *data = ztolxzd(zone);
+ ASSERT(data != NULL);
+ mutex_enter(&data->lxzd_lock);
+ lx_audit_fini(zone);
+ if (data->lxzd_ioctl_sock != NULL) {
+ /*
+ * Since zone_kcred has been cleaned up already, close the
+ * socket using the global kcred.
+ */
+ (void) ksocket_close(data->lxzd_ioctl_sock, kcred);
+ data->lxzd_ioctl_sock = NULL;
+ }
+ ASSERT(data->lxzd_cgroup == NULL);
+
+ lx_zone_cleanup_vdisks(data);
+
+ mutex_exit(&data->lxzd_lock);
+ zone->zone_brand_data = NULL;
+ mutex_destroy(&data->lxzd_lock);
+ kmem_free(data, sizeof (*data));
+}
+
+void
+lx_unsupported(char *dmsg)
+{
+ lx_proc_data_t *pd = ttolxproc(curthread);
+
+ DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg);
+
+ if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) {
+ /*
+ * If this process was run with strict mode enabled
+ * (via LX_STRICT in the environment), we mark this
+ * LWP as having triggered an unsupported behaviour.
+ * This flag will be checked at an appropriate point
+ * by lx_check_strict_failure().
+ */
+ lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+
+ lwpd->br_strict_failure = B_TRUE;
+ }
+}
+
+void
+lx_check_strict_failure(lx_lwp_data_t *lwpd)
+{
+ proc_t *p;
+
+ if (!lwpd->br_strict_failure) {
+ return;
+ }
+
+ lwpd->br_strict_failure = B_FALSE;
+
+ /*
+ * If this process is operating in strict mode (via LX_STRICT in
+ * the environment), and has triggered a call to
+ * lx_unsupported(), we drop SIGSYS on it as we return.
+ */
+ p = curproc;
+ mutex_enter(&p->p_lock);
+ sigtoproc(p, curthread, SIGSYS);
+ mutex_exit(&p->p_lock);
+}
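+
+/*
+ * For example, a program run inside the zone with LX_STRICT set in its
+ * environment will have SIGSYS delivered whenever an emulated code path
+ * reaches lx_unsupported(), rather than continuing silently.
+ */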
+
+void
+lx_trace_sysenter(int syscall_num, uintptr_t *args)
+{
+ if (lx_systrace_enabled) {
+ VERIFY(lx_systrace_entry_ptr != NULL);
+
+ (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1],
+ args[2], args[3], args[4], args[5]);
+ }
+}
+
+void
+lx_trace_sysreturn(int syscall_num, long ret)
+{
+ if (lx_systrace_enabled) {
+ VERIFY(lx_systrace_return_ptr != NULL);
+
+ (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0);
+ }
+}
+
+/*
+ * Get the address of the user-space system call handler and attach it to
+ * the proc structure. Returning 0 indicates success; the value returned
+ * by the system call is the value stored in rval. Returning a non-zero
+ * value indicates a failure; the value returned is used to set errno, -1
+ * is returned from the syscall, and the contents of rval are ignored. To
+ * set errno and have the syscall return a value other than -1, we can
+ * manually set errno and rval and return 0.
+ */
+int
+lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
+ uintptr_t arg3, uintptr_t arg4)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ lx_proc_data_t *pd;
+ struct termios *termios;
+ uint_t termios_len;
+ int error;
+ int code;
+ int sig;
+ lx_brand_registration_t reg;
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+ /*
+	 * There is one operation that is supported for a non-branded
+	 * process: B_EXEC_BRAND. This is the equivalent of an
+	 * exec call, but the new process that is created will be
+	 * a branded process.
+ */
+ if (cmd == B_EXEC_BRAND) {
+ VERIFY(p->p_zone != NULL);
+ VERIFY(p->p_zone->zone_brand == &lx_brand);
+ return (exec_common(
+ (char *)arg1, (const char **)arg2, (const char **)arg3,
+ EBA_BRAND));
+ }
+
+ /* For all other operations this must be a branded process. */
+ if (p->p_brand == NULL)
+ return (ENOSYS);
+
+ VERIFY(p->p_brand == &lx_brand);
+ VERIFY(p->p_brand_data != NULL);
+
+ switch (cmd) {
+ case B_REGISTER:
+ if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+ lx_print("stack mode was not PREINIT during "
+ "REGISTER\n");
+ return (EINVAL);
+ }
+
+ if (p->p_model == DATAMODEL_NATIVE) {
+ if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) {
+ lx_print("Failed to copyin brand registration "
+ "at 0x%p\n", (void *)arg1);
+ return (EFAULT);
+ }
+ }
+#ifdef _LP64
+ else {
+ /* 32-bit userland on 64-bit kernel */
+ lx_brand_registration32_t reg32;
+
+ if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0) {
+ lx_print("Failed to copyin brand registration "
+ "at 0x%p\n", (void *)arg1);
+ return (EFAULT);
+ }
+
+ reg.lxbr_version = (uint_t)reg32.lxbr_version;
+ reg.lxbr_handler =
+ (void *)(uintptr_t)reg32.lxbr_handler;
+ reg.lxbr_flags = reg32.lxbr_flags;
+ }
+#endif
+
+ if (reg.lxbr_version != LX_VERSION_1) {
+ lx_print("Invalid brand library version (%u)\n",
+ reg.lxbr_version);
+ return (EINVAL);
+ }
+
+ if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) {
+ lx_print("Invalid brand flags (%u)\n",
+ reg.lxbr_flags);
+ return (EINVAL);
+ }
+
+ lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
+ (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
+ pd = p->p_brand_data;
+ pd->l_handler = (uintptr_t)reg.lxbr_handler;
+ pd->l_flags = reg.lxbr_flags & LX_PROC_ALL;
+
+ /*
+ * There are certain setup tasks which cannot be performed
+ * during the lx_init_brand_data hook due to the calling
+ * context from zoneadmd (in the GZ). This work is instead
+ * delayed until the init process starts inside the zone.
+ */
+ if (p->p_pid == p->p_zone->zone_proc_initpid) {
+ lx_bootup_hooks();
+ }
+
+ return (0);
+
+ case B_TTYMODES:
+ /* This is necessary for emulating TCGETS ioctls. */
+ if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
+ DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
+ &termios_len) != DDI_SUCCESS)
+ return (EIO);
+
+ ASSERT(termios_len == sizeof (*termios));
+
+		if (copyout(termios, (void *)arg1, sizeof (*termios)) != 0) {
+ ddi_prop_free(termios);
+ return (EFAULT);
+ }
+
+ ddi_prop_free(termios);
+ return (0);
+
+ case B_ELFDATA: {
+ mutex_enter(&p->p_lock);
+ pd = curproc->p_brand_data;
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ lx_elf_data_t led;
+
+ bcopy(&pd->l_elf_data, &led, sizeof (led));
+ mutex_exit(&p->p_lock);
+
+ if (copyout(&led, (void *)arg1,
+ sizeof (lx_elf_data_t)) != 0) {
+ return (EFAULT);
+ }
+ }
+#if defined(_LP64)
+ else {
+ /* 32-bit userland on 64-bit kernel */
+ lx_elf_data32_t led32;
+
+ led32.ed_phdr = (int)pd->l_elf_data.ed_phdr;
+ led32.ed_phent = (int)pd->l_elf_data.ed_phent;
+ led32.ed_phnum = (int)pd->l_elf_data.ed_phnum;
+ led32.ed_entry = (int)pd->l_elf_data.ed_entry;
+ led32.ed_base = (int)pd->l_elf_data.ed_base;
+ led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry;
+ mutex_exit(&p->p_lock);
+
+ if (copyout(&led32, (void *)arg1,
+ sizeof (led32)) != 0) {
+ return (EFAULT);
+ }
+ }
+#endif
+ return (0);
+ }
+
+ case B_EXEC_NATIVE:
+ return (exec_common((char *)arg1, (const char **)arg2,
+ (const char **)arg3, EBA_NATIVE));
+
+ /*
+ * The B_TRUSS_POINT subcommand is used so that we can make a no-op
+ * syscall for debugging purposes (dtracing) from within the user-level
+ * emulation.
+ */
+ case B_TRUSS_POINT:
+ return (0);
+
+ case B_LPID_TO_SPAIR: {
+ /*
+ * Given a Linux pid as arg1, return the Solaris pid in arg2 and
+ * the Solaris LWP in arg3. We also translate pid 1 (which is
+ * hardcoded in many applications) to the zone's init process.
+ */
+ pid_t s_pid;
+ id_t s_tid;
+
+ if ((pid_t)arg1 == 1) {
+ s_pid = p->p_zone->zone_proc_initpid;
+ /* handle the dead/missing init(1M) case */
+ if (s_pid == -1)
+ s_pid = 1;
+ s_tid = 1;
+ } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) {
+ return (ESRCH);
+ }
+
+ if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 ||
+ copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) {
+ return (EFAULT);
+ }
+
+ return (0);
+ }
+
+ case B_PTRACE_STOP_FOR_OPT:
+ return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ?
+ B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
+
+ case B_PTRACE_CLONE_BEGIN:
+ /*
+ * Leverage ptrace brand call to create a clone group for this
+ * proc if necessary.
+ */
+ lx_clone_grp_create((uint_t)arg3);
+
+ return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ?
+ B_FALSE : B_TRUE));
+
+ case B_PTRACE_SIG_RETURN: {
+ /*
+ * Our ptrace emulation must emit PR_SYSEXIT for rt_sigreturn.
+ * Since that syscall does not pass through the normal
+ * emulation, which would call lx_syscall_return, the event is
+ * emitted manually. A successful result of the syscall is
+ * assumed since there is little to be done in the face of
+ * failure.
+ */
+ struct regs *rp = lwptoregs(lwp);
+
+ rp->r_r0 = 0;
+ (void) lx_ptrace_stop(LX_PR_SYSEXIT);
+ return (0);
+ }
+
+ case B_UNSUPPORTED: {
+ char dmsg[256];
+
+ if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) {
+ lx_print("Failed to copyin unsupported msg "
+ "at 0x%p\n", (void *)arg1);
+ return (EFAULT);
+ }
+ dmsg[255] = '\0';
+ lx_unsupported(dmsg);
+
+ lx_check_strict_failure(lwpd);
+
+ return (0);
+ }
+
+ case B_STORE_ARGS: {
+ /*
+ * B_STORE_ARGS subcommand
+ * arg1 = address of struct to be copied in
+ * arg2 = size of the struct being copied in
+		 * arg3 and arg4 ignored
+ * rval = the amount of data copied.
+ */
+ void *buf;
+
+ /* only have upper limit because arg2 is unsigned */
+ if (arg2 > LX_BR_ARGS_SIZE_MAX) {
+ return (EINVAL);
+ }
+
+ buf = kmem_alloc(arg2, KM_SLEEP);
+ if (copyin((void *)arg1, buf, arg2) != 0) {
+ lx_print("Failed to copyin scall arg at 0x%p\n",
+ (void *) arg1);
+ kmem_free(buf, arg2);
+ /*
+ * Purposely not setting br_scall_args to NULL
+ * to preserve data for debugging.
+ */
+ return (EFAULT);
+ }
+
+ if (lwpd->br_scall_args != NULL) {
+ ASSERT(lwpd->br_args_size > 0);
+ kmem_free(lwpd->br_scall_args,
+ lwpd->br_args_size);
+ }
+
+ lwpd->br_scall_args = buf;
+ lwpd->br_args_size = arg2;
+ *rval = arg2;
+ return (0);
+ }
+
+ case B_HELPER_CLONE:
+ return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3,
+ (void *)arg4));
+
+ case B_HELPER_SETGROUPS:
+ return (lx_helper_setgroups(arg1, (gid_t *)arg2));
+
+ case B_HELPER_SIGQUEUE:
+ return (lx_helper_rt_sigqueueinfo(arg1, arg2,
+ (siginfo_t *)arg3));
+
+ case B_HELPER_TGSIGQUEUE:
+ return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3,
+ (siginfo_t *)arg4));
+
+ case B_GETPID:
+ /*
+ * The usermode clone(2) code needs to be able to call
+ * lx_getpid() from native code:
+ */
+ *rval = lx_getpid();
+ return (0);
+
+ case B_SET_NATIVE_STACK:
+ /*
+ * B_SET_NATIVE_STACK subcommand
+ * arg1 = the base of the stack to use for emulation
+ */
+ if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+ lx_print("B_SET_NATIVE_STACK when stack was already "
+ "set to %p\n", (void *)arg1);
+ return (EEXIST);
+ }
+
+ /*
+ * We move from the PREINIT state, where we have no brand
+ * emulation stack, to the INIT state. Here, we are still
+ * running on what will become the BRAND stack, but are running
+ * emulation (i.e. native) code. Once the initialisation
+ * process for this thread has finished, we will jump to
+ * brand-specific code, while moving to the BRAND mode.
+ *
+ * When a new LWP is created, lx_initlwp() will clear the
+ * stack data. If that LWP is actually being duplicated
+ * into a child process by fork(2), lx_forklwp() will copy
+ * it so that the cloned thread will keep using the same
+ * alternate stack.
+ */
+ lwpd->br_ntv_stack = arg1;
+ lwpd->br_stack_mode = LX_STACK_MODE_INIT;
+ lx_lwp_set_native_stack_current(lwpd, arg1);
+
+ return (0);
+
+ case B_GET_CURRENT_CONTEXT:
+ /*
+ * B_GET_CURRENT_CONTEXT subcommand:
+ * arg1 = address for pointer to current ucontext_t
+ */
+
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext;
+
+ error = copyout(&addr, (void *)arg1, sizeof (addr));
+ } else
+#endif
+ {
+ error = copyout(&lwp->lwp_oldcontext, (void *)arg1,
+ sizeof (lwp->lwp_oldcontext));
+ }
+
+ return (error != 0 ? EFAULT : 0);
+
+ case B_JUMP_TO_LINUX:
+ /*
+ * B_JUMP_TO_LINUX subcommand:
+ * arg1 = ucontext_t pointer for jump state
+ */
+
+ if (arg1 == NULL)
+ return (EINVAL);
+
+ switch (lwpd->br_stack_mode) {
+ case LX_STACK_MODE_NATIVE: {
+ struct regs *rp = lwptoregs(lwp);
+
+ /*
+ * We are on the NATIVE stack, so we must preserve
+ * the extent of that stack. The pointer will be
+ * reset by a future setcontext().
+ */
+ lx_lwp_set_native_stack_current(lwpd,
+ (uintptr_t)rp->r_sp);
+ break;
+ }
+
+ case LX_STACK_MODE_INIT:
+ /*
+ * The LWP is transitioning to Linux code for the first
+ * time.
+ */
+ break;
+
+ case LX_STACK_MODE_PREINIT:
+ /*
+ * This LWP has not installed an alternate stack for
+ * usermode emulation handling.
+ */
+ return (ENOENT);
+
+ case LX_STACK_MODE_BRAND:
+ /*
+ * The LWP should not be on the BRAND stack.
+ */
+ exit(CLD_KILLED, SIGSYS);
+ return (0);
+ }
+
+ /*
+ * Transfer control to Linux:
+ */
+ return (lx_runexe(lwp, (void *)arg1));
+
+ case B_EMULATION_DONE:
+ /*
+ * B_EMULATION_DONE subcommand:
+ * arg1 = ucontext_t * to restore
+ * arg2 = system call number
+ * arg3 = return code
+ * arg4 = if operation failed, the errno value
+ */
+
+ /*
+ * The first part of this operation is a setcontext() to
+ * restore the register state to the copy we preserved
+ * before vectoring to the usermode emulation routine.
+ * If that fails, we return (hopefully) to the emulation
+ * routine and it will handle the error.
+ */
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ error = getsetcontext32(SETCONTEXT, (void *)arg1);
+ } else
+#endif
+ {
+ error = getsetcontext(SETCONTEXT, (void *)arg1);
+ }
+
+ if (error != 0) {
+ return (error);
+ }
+
+ /*
+ * The saved Linux context has been restored. We handle the
+ * return value or errno with code common to the in-kernel
+ * system call emulation.
+ */
+ if ((error = (int)arg4) != 0) {
+ /*
+ * lx_syscall_return() looks at the errno in the LWP,
+ * so set it here:
+ */
+ (void) set_errno(error);
+ }
+ lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);
+
+ return (0);
+
+ case B_EXIT_AS_SIG:
+ code = CLD_KILLED;
+ sig = (int)arg1;
+ proc_is_exiting(p);
+ if (exitlwps(1) != 0) {
+ mutex_enter(&p->p_lock);
+ lwp_exit();
+ }
+ ttolwp(curthread)->lwp_cursig = sig;
+ if (sig == SIGSEGV) {
+ if (core(sig, 0) == 0)
+ code = CLD_DUMPED;
+ }
+ exit(code, sig);
+ /* NOTREACHED */
+ break;
+
+ case B_OVERRIDE_KERN_VER: {
+ void *urel = (void *)arg1;
+ void *uver = (void *)arg2;
+ size_t len;
+
+ pd = ptolxproc(p);
+ if (urel != NULL) {
+ if (copyinstr(urel, pd->l_uname_release,
+ LX_KERN_RELEASE_MAX, &len) != 0) {
+ return (EFAULT);
+ }
+ pd->l_uname_release[LX_KERN_RELEASE_MAX - 1] = '\0';
+ }
+ if (uver != NULL) {
+ if (copyinstr(uver, pd->l_uname_version,
+ LX_KERN_VERSION_MAX, &len) != 0) {
+ return (EFAULT);
+ }
+ pd->l_uname_version[LX_KERN_VERSION_MAX - 1] = '\0';
+ }
+
+ return (0);
+ }
+
+ case B_GET_PERSONALITY: {
+ unsigned int result;
+
+ mutex_enter(&p->p_lock);
+ pd = ptolxproc(p);
+ result = pd->l_personality;
+ mutex_exit(&p->p_lock);
+ return (result);
+ }
+
+ case B_START_NFS_LOCKD:
+ (void) lx_start_nfs_lockd();
+ return (0);
+
+ case B_BLOCK_ALL_SIGS:
+ mutex_enter(&p->p_lock);
+ pd = ptolxproc(p);
+ pd->l_block_all_signals++;
+ mutex_exit(&p->p_lock);
+ return (0);
+
+ case B_UNBLOCK_ALL_SIGS: {
+ uint_t result;
+
+ mutex_enter(&p->p_lock);
+ pd = ptolxproc(p);
+ if (pd->l_block_all_signals == 0) {
+ result = set_errno(EINVAL);
+ } else {
+ pd->l_block_all_signals--;
+ result = 0;
+ }
+ mutex_exit(&p->p_lock);
+ return (result);
+ }
+
+ case B_ALL_SIGS_BLOCKED: {
+ uint_t result;
+
+ mutex_enter(&p->p_lock);
+ pd = ptolxproc(p);
+ result = pd->l_block_all_signals;
+ mutex_exit(&p->p_lock);
+ return (result);
+ }
+ }
+
+ return (EINVAL);
+}
+
+/*
+ * Compare the given Linux kernel version to the one set for the zone.
+ * Returns greater than 0 if the zone version is higher, less than 0 if the
+ * zone version is lower, and 0 if the versions are equal.
+ */
+int
+lx_kern_release_cmp(zone_t *zone, const char *vers)
+{
+ int zvers[3] = {0, 0, 0};
+ int cvers[3] = {0, 0, 0};
+ int i;
+ lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
+
+ VERIFY(zone->zone_brand == &lx_brand);
+
+ mutex_enter(&lxzd->lxzd_lock);
+ (void) sscanf(lxzd->lxzd_kernel_release, "%d.%d.%d", &zvers[0],
+ &zvers[1], &zvers[2]);
+ mutex_exit(&lxzd->lxzd_lock);
+ (void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]);
+
+ for (i = 0; i < 3; i++) {
+ if (zvers[i] > cvers[i]) {
+ return (1);
+ } else if (zvers[i] < cvers[i]) {
+ return (-1);
+ }
+ }
+ return (0);
+}
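+
+/*
+ * Illustrative results, assuming the zone's release is set to "3.10.0":
+ *
+ *	lx_kern_release_cmp(zone, "2.6.32") ==  1	(zone is newer)
+ *	lx_kern_release_cmp(zone, "3.10.0") ==  0	(equal)
+ *	lx_kern_release_cmp(zone, "4.4.0")  == -1	(zone is older)
+ */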
+
+/*
+ * Linux unconditionally removes the setuid and setgid bits when changing
+ * file ownership. This brand hook overrides the illumos native behaviour,
+ * which is based on the PRIV_FILE_SETID privilege.
+ */
+/* ARGSUSED */
+static int
+lx_setid_clear(vattr_t *vap, cred_t *cr)
+{
+ if (S_ISDIR(vap->va_mode)) {
+ return (0);
+ }
+
+ if (vap->va_mode & S_ISUID) {
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~S_ISUID;
+ }
+ if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~S_ISGID;
+ }
+
+ return (0);
+}
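+
+/*
+ * For example, with this hook in place a chown(2) on a file with mode 04755
+ * leaves it with mode 0755: the setuid bit is cleared regardless of the
+ * caller's privileges, matching the Linux behaviour described above.
+ */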
+
+/*
+ * Copy the per-process brand data from a parent proc to a child.
+ */
+void
+lx_copy_procdata(proc_t *cp, proc_t *pp)
+{
+ lx_proc_data_t *cpd, *ppd;
+
+ /*
+ * Since b_copy_procdata is called during getproc(), while the child
+ * process is still being initialized, acquiring cp->p_lock should not
+ * be required.
+ */
+ VERIFY(cp->p_brand == &lx_brand);
+ VERIFY((cpd = cp->p_brand_data) != NULL);
+
+ mutex_enter(&pp->p_lock);
+ VERIFY(pp->p_brand == &lx_brand);
+ VERIFY((ppd = pp->p_brand_data) != NULL);
+
+ bcopy(ppd, cpd, sizeof (lx_proc_data_t));
+ mutex_exit(&pp->p_lock);
+
+ /* Clear any aio contexts from child */
+ lx_io_clear(cpd);
+
+ /*
+ * The l_ptrace count is normally manipulated only while under holding
+ * p_lock. Since this is a freshly created process, it's safe to zero
+ * out. If it is to be inherited, the attach will occur later.
+ */
+ cpd->l_ptrace = 0;
+
+ cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY;
+ cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY;
+
+ cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20;
+ cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20;
+
+ cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY;
+ cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY;
+
+ cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY;
+ cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY;
+
+ bzero(cpd->l_clone_grps, sizeof (cpd->l_clone_grps));
+}
+
+#if defined(_LP64)
+static void
+Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
+{
+ bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
+ dst->e_type = src->e_type;
+ dst->e_machine = src->e_machine;
+ dst->e_version = src->e_version;
+ dst->e_entry = src->e_entry;
+ dst->e_phoff = src->e_phoff;
+ dst->e_shoff = src->e_shoff;
+ dst->e_flags = src->e_flags;
+ dst->e_ehsize = src->e_ehsize;
+ dst->e_phentsize = src->e_phentsize;
+ dst->e_phnum = src->e_phnum;
+ dst->e_shentsize = src->e_shentsize;
+ dst->e_shnum = src->e_shnum;
+ dst->e_shstrndx = src->e_shstrndx;
+}
+#endif /* _LP64 */
+
+static void
+restoreexecenv(struct execenv *ep, stack_t *sp)
+{
+ klwp_t *lwp = ttolwp(curthread);
+
+ setexecenv(ep);
+ lwp->lwp_sigaltstack.ss_sp = sp->ss_sp;
+ lwp->lwp_sigaltstack.ss_size = sp->ss_size;
+ lwp->lwp_sigaltstack.ss_flags = sp->ss_flags;
+}
+
+static uintptr_t
+lx_map_vdso(struct uarg *args, struct cred *cred)
+{
+ int err;
+ char *fpath = LX_VDSO_PATH;
+ vnode_t *vp;
+ vattr_t attr;
+ caddr_t addr;
+
+#if defined(_LP64)
+ if (args->to_model != DATAMODEL_NATIVE) {
+ fpath = LX_VDSO_PATH32;
+ }
+#endif
+
+ /*
+ * The comm page should have been mapped in already.
+ */
+ if (args->commpage == NULL) {
+ return (NULL);
+ }
+
+ /*
+ * Ensure the VDSO library is present and appropriately sized.
+ * This lookup is started at the zone root to avoid complications for
+ * processes which have chrooted. For the specified lookup root to be
+ * used, the leading slash must be dropped from the path.
+ */
+ ASSERT(fpath[0] == '/');
+ fpath++;
+ if (lookupnameat(fpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp,
+ curzone->zone_rootvp) != 0) {
+ return (NULL);
+ }
+
+ /*
+ * The VDSO requires data exposed via the comm page in order to
+ * function properly. The VDSO is always mapped in at a fixed known
+ * offset from the comm page, providing an easy means to locate it.
+ */
+ addr = (caddr_t)(args->commpage - LX_VDSO_SIZE);
+ attr.va_mask = AT_SIZE;
+ if (VOP_GETATTR(vp, &attr, 0, cred, NULL) != 0 ||
+ attr.va_size > LX_VDSO_SIZE) {
+ VN_RELE(vp);
+ return (NULL);
+ }
+
+ err = execmap(vp, addr, attr.va_size, 0, 0,
+ PROT_USER|PROT_READ|PROT_EXEC, 1, 0);
+ VN_RELE(vp);
+ if (err != 0) {
+ return (NULL);
+ }
+ return ((uintptr_t)addr);
+}
+
+/*
+ * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux
+ * binaries.
+ */
+/* ARGSUSED4 */
+static int
+lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
+ struct intpdata *idata, int level, size_t *execsz, int setid,
+ caddr_t exec_file, struct cred *cred, int *brand_action)
+{
+ int error;
+ vnode_t *nvp;
+ Ehdr ehdr;
+ Addr uphdr_vaddr;
+ intptr_t voffset;
+ char *interp = NULL;
+ uintptr_t ldaddr = NULL;
+ proc_t *p = ttoproc(curthread);
+ klwp_t *lwp = ttolwp(curthread);
+ lx_proc_data_t *lxpd = ptolxproc(p);
+ struct execenv env, origenv;
+ stack_t orig_sigaltstack;
+ struct user *up = PTOU(ttoproc(curthread));
+ lx_elf_data_t edp;
+ char *lib_path = LX_LIB_PATH;
+ boolean_t execstk = B_TRUE;
+ unsigned int personality;
+
+ ASSERT(p->p_brand == &lx_brand);
+ ASSERT(lxpd != NULL);
+
+ /*
+ * Start with a separate struct for ELF data instead of inheriting
+ * values from the currently running binary. This ensures that fields
+ * such as ed_base are cleared if the new binary does not utilize an
+ * interpreter.
+ */
+ bzero(&edp, sizeof (edp));
+
+#if defined(_LP64)
+ if (args->to_model != DATAMODEL_NATIVE) {
+ lib_path = LX_LIB_PATH32;
+ }
+#endif
+
+ /*
+ * Set the brandname and library name for the new process so that
+ * elfexec() puts them onto the stack.
+ */
+ args->brandname = LX_BRANDNAME;
+ args->emulator = lib_path;
+
+#if defined(_LP64)
+ /*
+ * To conform with the way Linux lays out the address space, we clamp
+ * the stack to be the top of the lower region of the x86-64 canonical
+ * form address space -- which has the side-effect of laying out the
+ * entire address space in that lower region. Note that this only
+ * matters on 64-bit processes (this value will always be greater than
+ * the size of a 32-bit address space) and doesn't actually affect
+	 * USERLIMIT: if a Linux-branded process wishes to map something
+ * into the top half of the address space, it can do so -- but with
+ * the user stack starting at the top of the bottom region, those high
+ * virtual addresses won't be used unless explicitly directed.
+ */
+ args->maxstack = lx_maxstack64;
+#endif
+
+ /*
+ * Search the binary for a PT_GNU_STACK header. The PF_X bit contained
+ * within is used to dictate protection defaults for the stack, among
+ * other things.
+ */
+ if (args->to_model == DATAMODEL_NATIVE) {
+ Ehdr ehdr;
+ Phdr *phdrp;
+ caddr_t phdrbase = NULL;
+ size_t phdrsize = 0;
+ uint_t nphdrs, hsize;
+
+ if ((error = elfreadhdr(vp, cred, &ehdr, &nphdrs, &phdrbase,
+ &phdrsize)) != 0) {
+ return (error);
+ }
+
+ hsize = ehdr.e_phentsize;
+ /* LINTED: alignment */
+ phdrp = (Phdr *)phdrbase;
+ for (uint_t i = nphdrs; i > 0; i--) {
+ switch (phdrp->p_type) {
+ case PT_GNU_STACK:
+ if ((phdrp->p_flags & PF_X) == 0) {
+ execstk = B_FALSE;
+ }
+ break;
+ }
+ /* LINTED: alignment */
+ phdrp = (Phdr *)((caddr_t)phdrp + hsize);
+ }
+ kmem_free(phdrbase, phdrsize);
+ }
+#if defined(_LP64)
+ else {
+ Elf32_Ehdr ehdr;
+ Elf32_Phdr *phdrp;
+ caddr_t phdrbase = NULL;
+ size_t phdrsize = 0;
+ uint_t nphdrs, hsize;
+
+ if ((error = elf32readhdr(vp, cred, &ehdr, &nphdrs, &phdrbase,
+ &phdrsize)) != 0) {
+ return (error);
+ }
+
+ hsize = ehdr.e_phentsize;
+ /* LINTED: alignment */
+ phdrp = (Elf32_Phdr *)phdrbase;
+ for (uint_t i = nphdrs; i > 0; i--) {
+ switch (phdrp->p_type) {
+ case PT_GNU_STACK:
+ if ((phdrp->p_flags & PF_X) == 0) {
+ execstk = B_FALSE;
+ }
+ break;
+ }
+ /* LINTED: alignment */
+ phdrp = (Elf32_Phdr *)((caddr_t)phdrp + hsize);
+ }
+ kmem_free(phdrbase, phdrsize);
+ }
+#endif
+
+ /*
+ * Revert the base personality while maintaining any existing flags.
+ */
+ personality = LX_PER_LINUX | (lxpd->l_personality & ~LX_PER_MASK);
+
+ /*
+	 * Linux defaults to an executable stack unless the aforementioned
+ * PT_GNU_STACK entry in the elf header dictates otherwise. Enabling
+ * the READ_IMPLIES_EXEC personality flag is also implied in this case.
+ */
+ if (execstk) {
+ args->stk_prot |= PROT_EXEC;
+ args->stk_prot_override = B_TRUE;
+ personality |= LX_PER_READ_IMPLIES_EXEC;
+ }
+
+ /*
+ * We will first exec the brand library, then map in the linux
+ * executable and the linux linker.
+ */
+ if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP,
+ &nvp))) {
+ uprintf("%s: not found.", lib_path);
+ return (error);
+ }
+
+ /*
+ * We will eventually set the p_exec member to be the vnode for the new
+ * executable when we call setexecenv(). However, if we get an error
+ * before that call we need to restore the execenv to its original
+ * values so that when we return to the caller fop_close() works
+ * properly while cleaning up from the failed exec(). Restoring the
+ * original value will also properly decrement the 2nd VN_RELE that we
+ * took on the brand library.
+ */
+ origenv.ex_bssbase = p->p_bssbase;
+ origenv.ex_brkbase = p->p_brkbase;
+ origenv.ex_brksize = p->p_brksize;
+ origenv.ex_vp = p->p_exec;
+ orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp;
+ orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size;
+ orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags;
+
+ if (args->to_model == DATAMODEL_NATIVE) {
+ error = elfexec(nvp, uap, args, idata, INTP_MAXDEPTH + 1,
+ execsz, setid, exec_file, cred, brand_action);
+ }
+#if defined(_LP64)
+ else {
+ error = elf32exec(nvp, uap, args, idata, INTP_MAXDEPTH + 1,
+ execsz, setid, exec_file, cred, brand_action);
+ }
+#endif
+ VN_RELE(nvp);
+ if (error != 0) {
+ restoreexecenv(&origenv, &orig_sigaltstack);
+ return (error);
+ }
+
+ /*
+	 * We exec'd the brand library above. The u_auxv vectors are now
+	 * set up by elfexec to point to the brand emulation library and
+	 * its linker.
+ */
+
+ /*
+ * After execing the brand library (which should have implicitly mapped
+	 * in the comm page), map the VDSO into the appropriate place in the AS.
+ */
+ lxpd->l_vdso = lx_map_vdso(args, cred);
+
+ bzero(&env, sizeof (env));
+
+ /*
+	 * map in the Linux executable
+ */
+ if (args->to_model == DATAMODEL_NATIVE) {
+ error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
+ &voffset, exec_file, &interp, &env.ex_bssbase,
+ &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
+ }
+#if defined(_LP64)
+ else {
+ Elf32_Ehdr ehdr32;
+ Elf32_Addr uphdr_vaddr32;
+
+ error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
+ &voffset, exec_file, &interp, &env.ex_bssbase,
+ &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
+
+ Ehdr32to64(&ehdr32, &ehdr);
+
+ if (uphdr_vaddr32 == (Elf32_Addr)-1)
+ uphdr_vaddr = (Addr)-1;
+ else
+ uphdr_vaddr = uphdr_vaddr32;
+ }
+#endif
+ if (error != 0) {
+ restoreexecenv(&origenv, &orig_sigaltstack);
+
+ if (interp != NULL)
+ kmem_free(interp, MAXPATHLEN);
+
+ return (error);
+ }
+
+ /*
+ * Save off the important properties of the lx executable. The brand
+ * library will ask us for this data later, when it is ready to set
+ * things up for the lx executable.
+ */
+ edp.ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
+ voffset + uphdr_vaddr;
+ edp.ed_entry = voffset + ehdr.e_entry;
+ edp.ed_phent = ehdr.e_phentsize;
+ edp.ed_phnum = ehdr.e_phnum;
+
+ if (interp != NULL) {
+ if (ehdr.e_type == ET_DYN) {
+ /*
+ * This is a shared object executable, so we need to
+ * pick a reasonable place to put the heap. Just don't
+ * use the first page.
+ */
+ env.ex_brkbase = (caddr_t)PAGESIZE;
+ env.ex_bssbase = (caddr_t)PAGESIZE;
+ }
+
+ /*
+ * If the program needs an interpreter (most do), map it in and
+ * store relevant information about it in the aux vector, where
+ * the brand library can find it.
+ */
+ if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
+ NULLVPP, &nvp))) {
+ uprintf("%s: not found.", interp);
+ restoreexecenv(&origenv, &orig_sigaltstack);
+ kmem_free(interp, MAXPATHLEN);
+ return (error);
+ }
+
+ kmem_free(interp, MAXPATHLEN);
+ interp = NULL;
+
+ /*
+ * map in the Linux linker
+ */
+ if (args->to_model == DATAMODEL_NATIVE) {
+ error = mapexec_brand(nvp, args, &ehdr,
+ &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
+ NULL, NULL, NULL, &ldaddr);
+ }
+#if defined(_LP64)
+ else {
+ Elf32_Ehdr ehdr32;
+ Elf32_Addr uphdr_vaddr32;
+
+ error = mapexec32_brand(nvp, args, &ehdr32,
+ &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
+ NULL, NULL, NULL, &ldaddr);
+
+ Ehdr32to64(&ehdr32, &ehdr);
+
+ if (uphdr_vaddr32 == (Elf32_Addr)-1)
+ uphdr_vaddr = (Addr)-1;
+ else
+ uphdr_vaddr = uphdr_vaddr32;
+ }
+#endif
+
+ VN_RELE(nvp);
+ if (error != 0) {
+ restoreexecenv(&origenv, &orig_sigaltstack);
+ return (error);
+ }
+
+ /*
+ * Now that we know the base address of the brand's linker,
+ * we also save this for later use by the brand library.
+ */
+ edp.ed_base = voffset;
+ edp.ed_ldentry = voffset + ehdr.e_entry;
+ } else {
+ /*
+ * This program has no interpreter. The lx brand library will
+ * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
+ * so in this case, put the entry point of the main executable
+ * there.
+ */
+ if (ehdr.e_type == ET_EXEC) {
+ /*
+ * An executable with no interpreter, this must be a
+ * statically linked executable, which means we loaded
+ * it at the address specified in the elf header, in
+ * which case the e_entry field of the elf header is an
+ * absolute address.
+ */
+ edp.ed_ldentry = ehdr.e_entry;
+ edp.ed_entry = ehdr.e_entry;
+ } else {
+ /*
+			 * For a shared object with no interpreter, we use the
+			 * address calculated above.
+ */
+ edp.ed_ldentry = edp.ed_entry;
+
+ /*
+ * In all situations except an ET_DYN elf object with no
+ * interpreter, we want to leave the brk and base
+ * values set by mapexec_brand alone. Normally when
+ * running ET_DYN objects on Solaris (most likely
+ * /lib/ld.so.1) the kernel sets brk and base to 0 since
+ * it doesn't know where to put the heap, and later the
+ * linker will call brk() to initialize the heap in:
+ * usr/src/cmd/sgs/rtld/common/setup.c:setup()
+ * after it has determined where to put it. (This
+ * decision is made after the linker loads and inspects
+ * elf properties of the target executable being run.)
+ *
+ * So for ET_DYN Linux executables, we also don't know
+ * where the heap should go, so we'll set the brk and
+ * base to 0. But in this case the Solaris linker will
+ * not initialize the heap, so when the Linux linker
+ * starts running there is no heap allocated. This
+ * seems to be ok on Linux 2.4 based systems because the
+ * Linux linker/libc fall back to using mmap() to
+ * allocate memory. But on 2.6 systems, running
+ * applications by specifying them as command line
+ * arguments to the linker results in segfaults for an
+			 * as yet undetermined reason (which seems to indicate
+			 * that a more permanent fix for heap initialization in
+ * these cases may be necessary).
+ */
+ if (ehdr.e_type == ET_DYN) {
+ env.ex_bssbase = (caddr_t)0;
+ env.ex_brkbase = (caddr_t)0;
+ env.ex_brksize = 0;
+ }
+ }
+ }
+
+ env.ex_vp = vp;
+ setexecenv(&env);
+
+ /*
+ * We try to keep /proc's view of the aux vector consistent with
+ * what's on the process stack. See the comment on the lx_times
+ * syscall for an explanation of the hardcoded LX_USERHZ.
+ */
+ if (args->to_model == DATAMODEL_NATIVE) {
+ auxv_t phdr_auxv[4] = {
+ { AT_SUN_BRAND_LX_PHDR, 0 },
+ { AT_SUN_BRAND_LX_INTERP, 0 },
+ { AT_SUN_BRAND_LX_CLKTCK, 0 },
+ { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }
+ };
+ phdr_auxv[0].a_un.a_val = edp.ed_phdr;
+ phdr_auxv[1].a_un.a_val = ldaddr;
+ phdr_auxv[2].a_un.a_val = LX_USERHZ;
+ phdr_auxv[3].a_un.a_val = lxpd->l_vdso;
+
+ if (copyout(&phdr_auxv, args->auxp_brand,
+ sizeof (phdr_auxv)) == -1)
+ return (EFAULT);
+ }
+#if defined(_LP64)
+ else {
+ auxv32_t phdr_auxv32[4] = {
+ { AT_SUN_BRAND_LX_PHDR, 0 },
+ { AT_SUN_BRAND_LX_INTERP, 0 },
+ { AT_SUN_BRAND_LX_CLKTCK, 0 },
+ { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }
+ };
+ phdr_auxv32[0].a_un.a_val = edp.ed_phdr;
+ phdr_auxv32[1].a_un.a_val = ldaddr;
+ phdr_auxv32[2].a_un.a_val = hz;
+ phdr_auxv32[3].a_un.a_val = lxpd->l_vdso;
+
+ if (copyout(&phdr_auxv32, args->auxp_brand,
+ sizeof (phdr_auxv32)) == -1)
+ return (EFAULT);
+ }
+#endif
+
+ /*
+ * /proc uses the AT_ENTRY aux vector entry to deduce
+ * the location of the executable in the address space. The user
+ * structure contains a copy of the aux vector that needs to have those
+ * entries patched with the values of the real lx executable (they
+ * currently contain the values from the lx brand library that was
+ * elfexec'd, above).
+ *
+ * For live processes, AT_BASE is used to locate the linker segment,
+ * which /proc and friends will later use to find Solaris symbols
+ * (such as rtld_db_preinit). However, for core files, /proc uses
+ * AT_ENTRY to find the right segment to label as the executable.
+ * So we set AT_ENTRY to be the entry point of the linux executable,
+ * but leave AT_BASE to be the address of the Solaris linker.
+ */
+ for (uint_t i = 0; i < __KERN_NAUXV_IMPL; i++) {
+ switch (up->u_auxv[i].a_type) {
+ case AT_ENTRY:
+ up->u_auxv[i].a_un.a_val = edp.ed_entry;
+ break;
+
+ case AT_SUN_BRAND_LX_PHDR:
+ up->u_auxv[i].a_un.a_val = edp.ed_phdr;
+ break;
+
+ case AT_SUN_BRAND_LX_INTERP:
+ up->u_auxv[i].a_un.a_val = ldaddr;
+ break;
+
+ case AT_SUN_BRAND_LX_CLKTCK:
+ up->u_auxv[i].a_un.a_val = hz;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ /*
+ * Record the brand ELF data and new personality now that the exec has
+ * proceeded successfully.
+ */
+ bcopy(&edp, &lxpd->l_elf_data, sizeof (edp));
+ lxpd->l_personality = personality;
+
+ return (0);
+}
+
+boolean_t
+lx_native_exec(uint8_t osabi, const char **interp)
+{
+ if (osabi != ELFOSABI_SOLARIS)
+ return (B_FALSE);
+
+ /*
+ * If the process root matches the zone root, prepend /native to the
+ * interpreter path for native executables. Absolute precision from
+ * VN_CMP is not necessary since any change of process root is likely
+ * to make native binaries inaccessible via /native.
+ *
+ * Processes which chroot directly into /native will be able to
+ * function as expected with no need for the prefix.
+ */
+ mutex_enter(&curproc->p_lock);
+ if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) {
+ *interp = "/native";
+ }
+ mutex_exit(&curproc->p_lock);
+
+ return (B_TRUE);
+}
+
+static void
+lx_syscall_init(void)
+{
+ int i;
+
+ /*
+ * Count up the 32-bit Linux system calls. Note that lx_sysent32
+ * has (LX_NSYSCALLS + 1) entries.
+ */
+ for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++)
+ continue;
+ lx_nsysent32 = i;
+
+#if defined(_LP64)
+ /*
+ * Count up the 64-bit Linux system calls. Note that lx_sysent64
+ * has (LX_NSYSCALLS + 1) entries.
+ */
+ for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++)
+ continue;
+ lx_nsysent64 = i;
+#endif
+}
+
+int
+_init(void)
+{
+ int err = 0;
+
+ /* Initialize USER_HZ scaling factor */
+ ASSERT(hz >= LX_USERHZ);
+ lx_hz_scale = hz / LX_USERHZ;
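+	/*
+	 * For example (illustrative values): with hz = 1000 and
+	 * LX_USERHZ = 100, lx_hz_scale is 10, so 1000 native clock ticks
+	 * are reported to Linux processes as 100 USER_HZ ticks.
+	 */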
+
+ lx_syscall_init();
+ lx_pid_init();
+ lx_ioctl_init();
+ lx_futex_init();
+ lx_ptrace_init();
+ lx_socket_init();
+ lx_audit_ld();
+
+ err = mod_install(&modlinkage);
+ if (err != 0) {
+ cmn_err(CE_WARN, "Couldn't install lx brand module");
+
+ /*
+ * This looks drastic, but it should never happen. These
+ * two data structures should be completely free-able until
+ * they are used by Linux processes. Since the brand
+ * wasn't loaded there should be no Linux processes, and
+ * thus no way for these data structures to be modified.
+ */
+ lx_pid_fini();
+ lx_ioctl_fini();
+ if (lx_futex_fini())
+ panic("lx brand module cannot be loaded or unloaded.");
+ }
+ return (err);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int err;
+ int futex_done = 0;
+
+ /*
+ * If there are any zones using this brand, we can't allow it to be
+ * unloaded.
+ */
+ if (brand_zone_count(&lx_brand))
+ return (EBUSY);
+
+ lx_ptrace_fini();
+ lx_pid_fini();
+ lx_ioctl_fini();
+ lx_socket_fini();
+ lx_audit_unld();
+
+ if ((err = lx_futex_fini()) != 0) {
+ goto done;
+ }
+ futex_done = 1;
+
+ err = mod_remove(&modlinkage);
+
+done:
+ if (err) {
+ /*
+ * If we can't unload the module, then we have to get it
+ * back into a sane state.
+ */
+ lx_ptrace_init();
+ lx_pid_init();
+ lx_ioctl_init();
+ lx_socket_init();
+
+ if (futex_done) {
+ lx_futex_init();
+ }
+ }
+
+ return (err);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_lockd.c b/usr/src/uts/common/brand/lx/os/lx_lockd.c
new file mode 100644
index 0000000000..d6d965398a
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_lockd.c
@@ -0,0 +1,338 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * lx_start_nfs_lockd() starts an NFS lockd (lx_lockd) process inside the zone.
+ * This uses the same technique used in our lx cgroupfs to launch a release
+ * agent process. This is called implicitly when an NFS mount syscall occurs
+ * within the zone. See the user-level lx_lockd source for the "big theory"
+ * comment behind this.
+ *
+ * lx_upcall_statd() is a brand hook that interposes on the rpc.statd RPC
+ * handling so that we can interface to a Linux rpc.statd that must run
+ * when NFSv3 locking is in use. The rpc.statd handles server or client reboots
+ * and interacts with the lockd to reclaim locks after the server reboots. The
+ * rpc.statd also informs the server when we reboot, so the server can release
+ * the locks we held.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/policy.h>
+#include <sys/vmparam.h>
+#include <sys/contract_impl.h>
+#include <sys/pool.h>
+#include <sys/stack.h>
+#include <sys/var.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/pathname.h>
+#include <rpcsvc/nlm_prot.h>
+#include <rpcsvc/sm_inter.h>
+#include <klm/nlm_impl.h>
+
+#define LX_LOCKD_PATH "/native/usr/lib/brand/lx/lx_lockd"
+
+/* Linux lockd RPC called by statd when it detects an NFS server reboot */
+#define LX_NLMPROC_NSM_NOTIFY 16
+
+/* From uts/common/klm/nlm_impl.c */
+extern void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *);
+extern void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *);
+
+/*
+ * Check if the current lockd is still running.
+ */
+static boolean_t
+lx_lockd_alive(pid_t lockd_pid)
+{
+ boolean_t ret = B_FALSE;
+ proc_t *p;
+ vnode_t *vp;
+ char path[MAXPATHLEN];
+
+ mutex_enter(&pidlock);
+ p = prfind(lockd_pid);
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ return (B_FALSE);
+ }
+
+ mutex_enter(&p->p_lock);
+ if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+ return (B_FALSE);
+ }
+ vp = p->p_exec;
+ VN_HOLD(vp);
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ if (vnodetopath(NULL, vp, path, sizeof (path), CRED()) == 0 &&
+ strcmp(path, LX_LOCKD_PATH) == 0) {
+ ret = B_TRUE;
+ }
+
+ VN_RELE(vp);
+ return (ret);
+}
+
+static void
+lx_run_lockd(void *a)
+{
+ proc_t *p = curproc;
+ zone_t *z = curzone;
+ struct core_globals *cg;
+ lx_zone_data_t *lxzd = ztolxzd(z);
+ int res;
+
+ ASSERT(!INGLOBALZONE(p));
+ VERIFY(lxzd != NULL);
+
+ /* The following block is derived from start_init_common */
+ ASSERT_STACK_ALIGNED();
+
+ p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
+ p->p_usrstack = (caddr_t)USRSTACK32;
+ p->p_model = DATAMODEL_ILP32;
+ p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
+ p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
+ p->p_stk_ctl = INT32_MAX;
+
+ p->p_as = as_alloc();
+ p->p_as->a_proc = p;
+ p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
+ (void) hat_setup(p->p_as->a_hat, HAT_INIT);
+
+ VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL);
+
+ corectl_path_hold(cg->core_default_path);
+ corectl_content_hold(cg->core_default_content);
+
+ p->p_corefile = cg->core_default_path;
+ p->p_content = cg->core_default_content;
+
+ init_mstate(curthread, LMS_SYSTEM);
+ res = exec_init(LX_LOCKD_PATH, NULL);
+
+ /* End of code derived from start_init_common */
+
+ /* The following is derived from zone_start_init - see comments there */
+ if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) {
+ if (proc_exit(CLD_EXITED, res) != 0) {
+ mutex_enter(&p->p_lock);
+ ASSERT(p->p_flag & SEXITLWPS);
+ lwp_exit();
+ }
+ } else {
+ id_t cid = curthread->t_cid;
+
+ mutex_enter(&class_lock);
+ ASSERT(cid < loaded_classes);
+ if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+ z->zone_fixed_hipri) {
+ pcparms_t pcparms;
+
+ pcparms.pc_cid = cid;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+ FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+ FX_DOUPRILIM | FX_DOUPRI;
+
+ mutex_enter(&pidlock);
+ mutex_enter(&p->p_lock);
+ (void) parmsset(&pcparms, curthread);
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+ } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+ curthread->t_pri = RTGPPRIO0;
+ }
+ mutex_exit(&class_lock);
+
+ /*
+ * Set our pid as the lockd pid in the zone data, or exit
+ * if another process raced and already did so.
+ */
+ mutex_enter(&lxzd->lxzd_lock);
+ if (lxzd->lxzd_lockd_pid != 0) {
+ /* another mount raced and created a new lockd */
+ mutex_exit(&lxzd->lxzd_lock);
+ if (proc_exit(CLD_EXITED, 0) != 0) {
+ mutex_enter(&p->p_lock);
+ ASSERT(p->p_flag & SEXITLWPS);
+ lwp_exit();
+ }
+ return;
+ }
+ lxzd->lxzd_lockd_pid = p->p_pid;
+ mutex_exit(&lxzd->lxzd_lock);
+
+ /* cause the process to return to userland. */
+ lwp_rtt();
+ }
+}
+
+/*
+ * Launch the user-level, native, lx_lockd process.
+ */
+int
+lx_start_nfs_lockd()
+{
+ id_t cid;
+ proc_t *p = ttoproc(curthread);
+ zone_t *z = p->p_zone;
+ lx_zone_data_t *lxzd = ztolxzd(z);
+
+ ASSERT(!INGLOBALZONE(p));
+ ASSERT(lxzd != NULL);
+
+ /*
+ * This should only be called by the mount emulation, which must have
+ * 'root' privileges in order to have performed a mount, but
+ * double-check.
+ */
+ if (crgetuid(CRED()) != 0)
+ return (EPERM);
+
+ mutex_enter(&lxzd->lxzd_lock);
+ if (lxzd->lxzd_lockd_pid != 0) {
+ /* verify lockd is still alive */
+ pid_t lockd_pid;
+
+ lockd_pid = lxzd->lxzd_lockd_pid;
+ mutex_exit(&lxzd->lxzd_lock);
+
+ if (lx_lockd_alive(lockd_pid))
+ return (EEXIST);
+
+ mutex_enter(&lxzd->lxzd_lock);
+ if (lxzd->lxzd_lockd_pid != lockd_pid) {
+ /* another mount raced and created a new lockd */
+ mutex_exit(&lxzd->lxzd_lock);
+ return (EEXIST);
+ }
+
+ /* old lockd is dead, launch a new one */
+ lxzd->lxzd_lockd_pid = 0;
+ }
+ mutex_exit(&lxzd->lxzd_lock);
+
+ if (z->zone_defaultcid > 0) {
+ cid = z->zone_defaultcid;
+ } else {
+ pool_lock();
+ cid = pool_get_class(z->zone_pool);
+ pool_unlock();
+ }
+ if (cid == -1)
+ cid = defaultcid;
+
+ /*
+ * There's nothing to do here if creating the proc fails, but we
+ * return the result to make it obvious while DTracing.
+ */
+ return (newproc(lx_run_lockd, NULL, cid, minclsyspri - 1, NULL, -1));
+}
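+
+/*
+ * A minimal usage sketch, assuming an NFS mount-emulation call site (the
+ * error handling shown is illustrative, not actual code). An EEXIST return
+ * simply means a healthy lockd is already running in the zone:
+ *
+ *	int err = lx_start_nfs_lockd();
+ *
+ *	if (err != 0 && err != EEXIST)
+ *		cmn_err(CE_NOTE, "lx_lockd launch failed: %d", err);
+ */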
+
+void
+lx_upcall_statd(int op, struct nlm_globals *g, struct nlm_host *host)
+{
+ struct nlm_nsm *nsm;
+ struct mon args;
+ struct mon_id *mip = &args.mon_id;
+ int family;
+ netobj obj;
+ enum clnt_stat stat;
+
+ /*
+ * For Linux rpc.statd monitor registration, the Linux NSMPROC_MON and
+ * NSMPROC_UNMON RPC upcalls correspond almost directly to the native
+ * SM_MON and SM_UNMON RPC upcalls. The key differences with the native
+ * registration is that in our nlm_host_monitor function we make two
+ * RPC calls:
+ * - the first RPC (nsmaddrproc1_reg_1) uses our private 'nsm_addr'
+ * RPC protocol to register the lockd RPC information that statd
+ * should call when it detects that the remote server rebooted
+ * - the second RPC (sm_mon_1) tells statd the information about the
+ * remote server to be monitored
+ * For Linux, there is only a single RPC from the kernel to the local
+ * statd. This RPC is equivalent to our sm_mon_1 code, but it uses the
+ * Linux-private NLMPROC_NSM_NOTIFY lockd procedure in the 'my_proc'
+ * RPC parameter. This corresponds to our private 'nsm_addr' code, and
+ * tells statd which lockd RPC to call when it detects a server reboot.
+ *
+ * Because our sm_mon_1 RPC is so similar to the Linux RPC, we can use
+ * that directly and simply set the expected value in the 'my_proc'
+ * argument.
+ *
+ * Within the kernel lockd RPC handling, the nlm_prog_3_dtable dispatch
+ * table has an entry for each lockd RPC function. Thus, this table also
+ * contains an entry for the Linux NLMPROC_NSM_NOTIFY procedure. That
+ * procedure number is unused by the native lockd code, so there is no
+ * conflict with dispatching that procedure. The implementation of the
+ * procedure corresponds to the native, private NLM_SM_NOTIFY1
+ * procedure which is called by the native rpc.statd.
+ *
+ * The Linux RPC call to "unmonitor" a host expects the same arguments
+ * as we pass to monitor, so that is also handled here by this same
+ * brand hook.
+ */
+ nlm_netbuf_to_netobj(&host->nh_addr, &family, &obj);
+ nsm = &g->nlm_nsm;
+
+ bzero(&args, sizeof (args));
+
+ mip->mon_name = host->nh_name;
+ mip->my_id.my_name = uts_nodename();
+ mip->my_id.my_prog = NLM_PROG;
+ mip->my_id.my_vers = NLM_SM;
+ mip->my_id.my_proc = LX_NLMPROC_NSM_NOTIFY;
+ if (op == SM_MON) {
+ bcopy(&host->nh_sysid, args.priv, sizeof (uint16_t));
+ }
+
+ sema_p(&nsm->ns_sem);
+ nlm_nsm_clnt_init(nsm->ns_handle, nsm);
+ if (op == SM_MON) {
+ struct sm_stat_res mres;
+
+ bzero(&mres, sizeof (mres));
+ stat = sm_mon_1(&args, &mres, nsm->ns_handle);
+ } else {
+ struct sm_stat ures;
+
+ ASSERT(op == SM_UNMON);
+ bzero(&ures, sizeof (ures));
+ stat = sm_unmon_1(mip, &ures, nsm->ns_handle);
+ }
+ sema_v(&nsm->ns_sem);
+
+ if (stat != RPC_SUCCESS) {
+ NLM_WARN("Failed to contact local statd, stat=%d", stat);
+ if (op == SM_MON) {
+ mutex_enter(&g->lock);
+ host->nh_flags &= ~NLM_NH_MONITORED;
+ mutex_exit(&g->lock);
+ }
+ }
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c
new file mode 100644
index 0000000000..35e42edaa3
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_misc.c
@@ -0,0 +1,1196 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/archsystm.h>
+#include <sys/privregs.h>
+#include <sys/exec.h>
+#include <sys/lwp.h>
+#include <sys/sem.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_siginfo.h>
+#include <sys/lx_futex.h>
+#include <lx_errno.h>
+#include <sys/lx_userhz.h>
+#include <sys/cmn_err.h>
+#include <sys/siginfo.h>
+#include <sys/contract/process_impl.h>
+#include <sys/x86_archext.h>
+#include <sys/sdt.h>
+#include <lx_signum.h>
+#include <lx_syscall.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <net/if.h>
+#include <inet/ip6.h>
+#include <sys/sunddi.h>
+#include <sys/dlpi.h>
+#include <sys/sysmacros.h>
+
+/* Linux specific functions and definitions */
+static void lx_save(klwp_t *);
+static void lx_restore(klwp_t *);
+
+/*
+ * Set the return code for the forked child, always zero
+ */
+/*ARGSUSED*/
+void
+lx_setrval(klwp_t *lwp, int v1, int v2)
+{
+ lwptoregs(lwp)->r_r0 = 0;
+}
+
+/*
+ * Reset process state on exec(2)
+ */
+void
+lx_exec()
+{
+ klwp_t *lwp = ttolwp(curthread);
+ struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+ proc_t *p = ttoproc(curthread);
+ lx_proc_data_t *pd = ptolxproc(p);
+ struct regs *rp = lwptoregs(lwp);
+
+ /* b_exec is called without p_lock held */
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+ /*
+ * Any l_handler handlers set as a result of B_REGISTER are now
+ * invalid; clear them.
+ */
+ pd->l_handler = NULL;
+
+ /*
+ * If this was a multi-threaded Linux process and this lwp wasn't the
+ * main lwp, then we need to make its Illumos and Linux PIDs match.
+ */
+ if (curthread->t_tid != 1) {
+ lx_pid_reassign(curthread);
+ }
+
+ /*
+ * Inform ptrace(2) that we are processing an execve(2) call so that if
+ * we are traced we can post either the PTRACE_EVENT_EXEC event or the
+ * legacy SIGTRAP.
+ */
+ (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0);
+
+ /* clear the fs/gsbase values until the app. can reinitialize them */
+ lwpd->br_lx_fsbase = NULL;
+ lwpd->br_ntv_fsbase = NULL;
+ lwpd->br_lx_gsbase = NULL;
+ lwpd->br_ntv_gsbase = NULL;
+
+ /*
+	 * Clear the native stack flags. This will be reinitialized by
+ * lx_init() in the new process image.
+ */
+ lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
+ lwpd->br_ntv_stack = 0;
+ lwpd->br_ntv_stack_current = 0;
+
+ installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save,
+ NULL);
+
+ /*
+ * clear out the tls array
+ */
+ bzero(lwpd->br_tls, sizeof (lwpd->br_tls));
+
+ /*
+ * reset the tls entries in the gdt
+ */
+ kpreempt_disable();
+ lx_restore(lwp);
+ kpreempt_enable();
+
+ /* Grab the updated argv bounds */
+ mutex_enter(&p->p_lock);
+ lx_read_argv_bounds(p);
+ mutex_exit(&p->p_lock);
+
+ /*
+ * The exec syscall doesn't return (so we don't call lx_syscall_return)
+ * but for our ptrace emulation we need to do this so that a tracer
+ * does not get out of sync. We know that by the time this lx_exec
+ * function is called that the exec has succeeded.
+ */
+ rp->r_r0 = 0;
+ (void) lx_ptrace_stop(LX_PR_SYSEXIT);
+}
+
+static void
+lx_cleanlwp(klwp_t *lwp, proc_t *p)
+{
+ struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+ void *rb_list = NULL;
+
+ VERIFY(lwpd != NULL);
+
+ mutex_enter(&p->p_lock);
+ if ((lwpd->br_ptrace_flags & LX_PTF_EXITING) == 0) {
+ lx_ptrace_exit(p, lwp);
+ }
+
+ /*
+ * While we have p_lock, clear the TP_KTHREAD flag. This is needed
+ * to prevent races within lx procfs. It's fine for prchoose() to pick
+ * this thread now since it is exiting and no longer blocked in the
+ * kernel.
+ */
+ lwptot(lwp)->t_proc_flag &= ~TP_KTHREAD;
+
+ /*
+ * While we have p_lock, safely grab any robust_list references and
+ * clear the lwp field.
+ */
+ sprlock_proc(p);
+ rb_list = lwpd->br_robust_list;
+ lwpd->br_robust_list = NULL;
+ sprunlock(p);
+
+ if (rb_list != NULL) {
+ lx_futex_robust_exit((uintptr_t)rb_list, lwpd->br_pid);
+ }
+
+ /*
+ * We need to run our context exit operation (lx_save) here to ensure
+ * we don't leave any garbage around. This is necessary to handle the
+ * following calling sequence:
+ * exit -> proc_exit -> lx_freelwp -> removectx
+ * That is, when our branded process exits, proc_exit will call our
+ * lx_freelwp brand hook which does call this function (lx_cleanlwp),
+ * but lx_freelwp also removes our context exit operation. The context
+ * exit functions are run by exitctx, which is called by either
+ * lwp_exit or thread_exit. The thread_exit function is called at the
+ * end of proc_exit when we'll swtch() to another thread, but by then
+ * our context exit function has been removed.
+ *
+ * It's ok if this function happens to be called more than once (for
+ * example, if we exec a native binary).
+ */
+ kpreempt_disable();
+ lx_save(lwp);
+ kpreempt_enable();
+}
+
+void
+lx_exitlwp(klwp_t *lwp)
+{
+ struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+ proc_t *p = lwptoproc(lwp);
+ kthread_t *t;
+ sigqueue_t *sqp = NULL;
+ pid_t ppid;
+ id_t ptid;
+
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+ if (lwpd == NULL) {
+		/* second time through */
+ return;
+ }
+
+ lx_cleanlwp(lwp, p);
+
+ if (lwpd->br_clear_ctidp != NULL) {
+ (void) suword32(lwpd->br_clear_ctidp, 0);
+ (void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1,
+ NULL, NULL, 0);
+ lwpd->br_clear_ctidp = NULL;
+ }
+
+ if (lwpd->br_signal != 0) {
+ /*
+ * The first thread in a process doesn't cause a signal to
+ * be sent when it exits. It was created by a fork(), not
+ * a clone(), so the parent should get signalled when the
+ * process exits.
+ */
+ if (lwpd->br_ptid == -1)
+ goto free;
+
+ sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
+ /*
+ * If br_ppid is 0, it means this is a CLONE_PARENT thread,
+ * so the signal goes to the parent process - not to a
+ * specific thread in this process.
+ */
+ p = lwptoproc(lwp);
+ if (lwpd->br_ppid == 0) {
+ mutex_enter(&p->p_lock);
+ ppid = p->p_ppid;
+ t = NULL;
+ } else {
+ /*
+ * If we have been reparented to init or if our
+ * parent thread is gone, then nobody gets
+ * signaled.
+ */
+ if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) ||
+ (ptid == -1))
+ goto free;
+
+ mutex_enter(&pidlock);
+ if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ goto free;
+ }
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ if ((t = idtot(p, ptid)) == NULL) {
+ mutex_exit(&p->p_lock);
+ goto free;
+ }
+ }
+
+ sqp->sq_info.si_signo = lwpd->br_signal;
+ sqp->sq_info.si_code = lwpd->br_exitwhy;
+ sqp->sq_info.si_status = lwpd->br_exitwhat;
+ sqp->sq_info.si_pid = lwpd->br_pid;
+ sqp->sq_info.si_uid = crgetruid(CRED());
+ sigaddqa(p, t, sqp);
+ mutex_exit(&p->p_lock);
+ sqp = NULL;
+ }
+
+free:
+ if (lwpd->br_scall_args != NULL) {
+ ASSERT(lwpd->br_args_size > 0);
+ kmem_free(lwpd->br_scall_args, lwpd->br_args_size);
+ }
+ if (sqp)
+ kmem_free(sqp, sizeof (sigqueue_t));
+}
+
+void
+lx_freelwp(klwp_t *lwp)
+{
+ struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+ proc_t *p = lwptoproc(lwp);
+ lx_zone_data_t *lxzdata;
+ vfs_t *cgrp;
+
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+ if (lwpd == NULL) {
+ /*
+ * There is one case where an LX branded process will possess
+ * LWPs which lack their own brand data. During the course of
+		 * executing a native binary, the process will be preemptively
+		 * branded to allow hooks such as b_native_exec to function.
+		 * If that process possesses multiple LWPs, they will _not_ be
+ * branded since they will exit if the exec succeeds. It's
+ * during this LWP exit that lx_freelwp would be called on an
+ * unbranded LWP. When that is the case, it is acceptable to
+ * bypass the hook.
+ */
+ return;
+ }
+
+ /* cgroup integration */
+ lxzdata = ztolxzd(p->p_zone);
+ mutex_enter(&lxzdata->lxzd_lock);
+ cgrp = lxzdata->lxzd_cgroup;
+ if (cgrp != NULL) {
+ VFS_HOLD(cgrp);
+ mutex_exit(&lxzdata->lxzd_lock);
+ ASSERT(lx_cgrp_freelwp != NULL);
+ (*lx_cgrp_freelwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
+ lwpd->br_pid);
+ VFS_RELE(cgrp);
+ } else {
+ mutex_exit(&lxzdata->lxzd_lock);
+ }
+
+ /*
+ * It is possible for the lx_freelwp hook to be called without a prior
+ * call to lx_exitlwp being made. This happens as part of lwp
+ * de-branding when a native binary is executed from a branded process.
+ *
+ * To cover all cases, lx_cleanlwp is called from lx_exitlwp as well
+ * here in lx_freelwp. When the second call is redundant, the
+ * resources will already be freed and no work will be needed.
+ */
+ lx_cleanlwp(lwp, p);
+
+ /*
+ * Remove our system call interposer.
+ */
+ lwp->lwp_brand_syscall = NULL;
+
+ (void) removectx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL,
+ lx_save, NULL);
+ if (lwpd->br_pid != 0) {
+ lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid);
+ }
+
+ /*
+ * Discard the affinity mask.
+ */
+ VERIFY(lwpd->br_affinitymask != NULL);
+ cpuset_free(lwpd->br_affinitymask);
+ lwpd->br_affinitymask = NULL;
+
+ /*
+ * Ensure that lx_ptrace_exit() has been called to detach
+ * ptrace(2) tracers and tracees.
+ */
+ VERIFY(lwpd->br_ptrace_tracer == NULL);
+ VERIFY(lwpd->br_ptrace_accord == NULL);
+
+ lwp->lwp_brand = NULL;
+ kmem_free(lwpd, sizeof (struct lx_lwp_data));
+}
+
+void *
+lx_lwpdata_alloc(proc_t *p)
+{
+ lx_lwp_data_t *lwpd;
+ struct lx_pid *lpidp;
+ cpuset_t *affmask;
+ pid_t newpid = 0;
+ struct pid *pidp = NULL;
+
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+ /*
+ * LWPs beyond the first will require a pid to be allocated to emulate
+ * Linux's goofy thread model. While this allocation may be
+ * unnecessary when a single-lwp process undergoes branding, it cannot
+ * be performed during b_initlwp due to p_lock being held.
+ */
+ if (p->p_lwpcnt > 0) {
+ if ((newpid = pid_allocate(p, 0, 0)) < 0) {
+ return (NULL);
+ }
+ pidp = pid_find(newpid);
+ }
+
+ lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP);
+ lpidp = kmem_zalloc(sizeof (struct lx_pid), KM_SLEEP);
+ affmask = cpuset_alloc(KM_SLEEP);
+
+ lpidp->lxp_lpid = newpid;
+ lpidp->lxp_pidp = pidp;
+ lwpd->br_lpid = lpidp;
+ lwpd->br_affinitymask = affmask;
+
+ return (lwpd);
+}
+
+/*
+ * Free lwp brand data if an error occurred during lwp_create.
+ * Otherwise, lx_freelwp will be used to free the resources after they're
+ * associated with the lwp via lx_initlwp.
+ */
+void
+lx_lwpdata_free(void *lwpbd)
+{
+ lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
+ VERIFY(lwpd != NULL);
+ VERIFY(lwpd->br_lpid != NULL);
+ VERIFY(lwpd->br_affinitymask != NULL);
+
+ cpuset_free(lwpd->br_affinitymask);
+ if (lwpd->br_lpid->lxp_pidp != NULL) {
+ (void) pid_rele(lwpd->br_lpid->lxp_pidp);
+ }
+ kmem_free(lwpd->br_lpid, sizeof (*lwpd->br_lpid));
+ kmem_free(lwpd, sizeof (*lwpd));
+}
+
+void
+lx_initlwp(klwp_t *lwp, void *lwpbd)
+{
+ lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
+ lx_lwp_data_t *plwpd = ttolxlwp(curthread);
+ kthread_t *tp = lwptot(lwp);
+ proc_t *p = lwptoproc(lwp);
+ lx_zone_data_t *lxzdata;
+ vfs_t *cgrp;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+ VERIFY(lwp->lwp_brand == NULL);
+
+ lwpd->br_exitwhy = CLD_EXITED;
+ lwpd->br_lwp = lwp;
+ lwpd->br_clear_ctidp = NULL;
+ lwpd->br_set_ctidp = NULL;
+ lwpd->br_signal = 0;
+ lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
+ cpuset_all(lwpd->br_affinitymask);
+
+ /*
+ * The first thread in a process has ppid set to the parent
+ * process's pid, and ptid set to -1. Subsequent threads in the
+ * process have their ppid set to the pid of the thread that
+ * created them, and their ptid to that thread's tid.
+ */
+ if (tp->t_next == tp) {
+ lwpd->br_ppid = tp->t_procp->p_ppid;
+ lwpd->br_ptid = -1;
+ } else if (plwpd != NULL) {
+ bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls));
+ lwpd->br_ppid = plwpd->br_pid;
+ lwpd->br_ptid = curthread->t_tid;
+ /* The child inherits the fs/gsbase values from the parent */
+ lwpd->br_lx_fsbase = plwpd->br_lx_fsbase;
+ lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase;
+ lwpd->br_lx_gsbase = plwpd->br_lx_gsbase;
+ lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase;
+ } else {
+ /*
+ * Oddball case: the parent thread isn't a Linux process.
+ */
+ lwpd->br_ppid = 0;
+ lwpd->br_ptid = -1;
+ }
+ lwp->lwp_brand = lwpd;
+
+ /*
+	 * During lx_lwpdata_alloc, we must decide whether or not to
+ * allocate a new pid to associate with the lwp. Since p_lock is not
+ * held at that point, the only time we can guarantee a new pid isn't
+ * needed is when p_lwpcnt == 0. This is because other lwps won't be
+ * present to race with us with regards to pid allocation.
+ *
+ * This means that in all other cases (where p_lwpcnt > 0), we expect
+ * that lx_lwpdata_alloc will allocate a pid for us to use here, even
+	 * if it is unneeded. If this process is undergoing an exec, for
+	 * example, the single existing lwp will not need a new pid when it is
+	 * rebranded. In that case, lx_pid_assign will free the unneeded pid.
+ */
+ VERIFY(lwpd->br_lpid->lxp_pidp != NULL || p->p_lwpcnt == 0);
+
+ lx_pid_assign(tp, lwpd->br_lpid);
+ lwpd->br_tgid = lwpd->br_pid;
+ /*
+	 * Having performed the lx pid assignment, the lpid reference is no
+ * longer needed. The underlying data will be freed during lx_freelwp.
+ */
+ lwpd->br_lpid = NULL;
+
+ installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL,
+ lx_save, NULL);
+
+ /*
+ * Install branded system call hooks for this LWP:
+ */
+ lwp->lwp_brand_syscall = lx_syscall_enter;
+
+ /*
+ * The new LWP inherits the parent LWP cgroup ID.
+ */
+ if (plwpd != NULL) {
+ lwpd->br_cgroupid = plwpd->br_cgroupid;
+ }
+ /*
+ * The new LWP inherits the parent LWP emulated scheduling info.
+ */
+ if (plwpd != NULL) {
+ lwpd->br_schd_class = plwpd->br_schd_class;
+ lwpd->br_schd_pri = plwpd->br_schd_pri;
+ lwpd->br_schd_flags = plwpd->br_schd_flags;
+ lwpd->br_schd_runtime = plwpd->br_schd_runtime;
+ lwpd->br_schd_deadline = plwpd->br_schd_deadline;
+ lwpd->br_schd_period = plwpd->br_schd_period;
+ }
+ lxzdata = ztolxzd(p->p_zone);
+ mutex_enter(&lxzdata->lxzd_lock);
+ cgrp = lxzdata->lxzd_cgroup;
+ if (cgrp != NULL) {
+ VFS_HOLD(cgrp);
+ mutex_exit(&lxzdata->lxzd_lock);
+ ASSERT(lx_cgrp_initlwp != NULL);
+ (*lx_cgrp_initlwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
+ lwpd->br_pid);
+ VFS_RELE(cgrp);
+ } else {
+ mutex_exit(&lxzdata->lxzd_lock);
+ }
+}
+
+void
+lx_initlwp_post(klwp_t *lwp)
+{
+ lx_lwp_data_t *plwpd = ttolxlwp(curthread);
+ /*
+ * If the parent LWP has a ptrace(2) tracer, the new LWP may
+ * need to inherit that same tracer.
+ */
+ if (plwpd != NULL) {
+ lx_ptrace_inherit_tracer(plwpd, lwptolxlwp(lwp));
+ }
+}
+
+/*
+ * There is no need to have any locking for either the source or
+ * destination struct lx_lwp_data structs. This is always run in the
+ * thread context of the source thread, and the destination thread is
+ * always newly created and not referred to from anywhere else.
+ */
+void
+lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
+{
+ struct lx_lwp_data *src = srclwp->lwp_brand;
+ struct lx_lwp_data *dst = dstlwp->lwp_brand;
+
+ dst->br_ppid = src->br_pid;
+ dst->br_ptid = lwptot(srclwp)->t_tid;
+ bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls));
+
+ switch (src->br_stack_mode) {
+ case LX_STACK_MODE_BRAND:
+ case LX_STACK_MODE_NATIVE:
+ /*
+ * The parent LWP has an alternate stack installed.
+ * The child LWP should have the same stack base and extent.
+ */
+ dst->br_stack_mode = src->br_stack_mode;
+ dst->br_ntv_stack = src->br_ntv_stack;
+ dst->br_ntv_stack_current = src->br_ntv_stack_current;
+ break;
+
+ default:
+ /*
+ * Otherwise, clear the stack data for this LWP.
+ */
+ dst->br_stack_mode = LX_STACK_MODE_PREINIT;
+ dst->br_ntv_stack = 0;
+ dst->br_ntv_stack_current = 0;
+ }
+
+ /*
+ * copy only these flags
+ */
+ dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND;
+ dst->br_scall_args = NULL;
+ lx_affinity_forklwp(srclwp, dstlwp);
+
+ /*
+ * Flag so child doesn't ptrace-stop on syscall exit.
+ */
+ dst->br_ptrace_flags |= LX_PTF_NOSTOP;
+
+ if (src->br_clone_grp_flags != 0) {
+ lx_clone_grp_enter(src->br_clone_grp_flags, lwptoproc(srclwp),
+ lwptoproc(dstlwp));
+ /* clone group no longer pending on this thread */
+ src->br_clone_grp_flags = 0;
+ }
+}
+
+/*
+ * When switching a Linux process off the CPU, clear its GDT entries.
+ */
+/* ARGSUSED */
+static void
+lx_save(klwp_t *t)
+{
+ int i;
+
+#if defined(__amd64)
+ reset_sregs();
+#endif
+ for (i = 0; i < LX_TLSNUM; i++)
+ gdt_update_usegd(GDT_TLSMIN + i, &null_udesc);
+}
+
+/*
+ * When switching a Linux process on the CPU, set its GDT entries.
+ *
+ * For 64-bit code we don't have to worry about explicitly setting the
+ * %fsbase via wrmsr(MSR_AMD_FSBASE) here. Instead, that should happen
+ * automatically in update_sregs if we are executing in user-land. If this
+ * is the case then pcb_rupdate should be set.
+ */
+static void
+lx_restore(klwp_t *t)
+{
+ struct lx_lwp_data *lwpd = lwptolxlwp(t);
+ user_desc_t *tls;
+ int i;
+
+ ASSERT(lwpd);
+
+ tls = lwpd->br_tls;
+ for (i = 0; i < LX_TLSNUM; i++)
+ gdt_update_usegd(GDT_TLSMIN + i, &tls[i]);
+}
+
+void
+lx_set_gdt(int entry, user_desc_t *descrp)
+{
+
+ gdt_update_usegd(entry, descrp);
+}
+
+void
+lx_clear_gdt(int entry)
+{
+ gdt_update_usegd(entry, &null_udesc);
+}
+
+longlong_t
+lx_nosys()
+{
+ return (set_errno(ENOSYS));
+}
+
+/*
+ * Brand-specific routine to check if given non-Solaris standard segment
+ * Brand-specific routine to check whether the given non-Solaris-standard
+ * segment register values should be modified to other values.
+/*ARGSUSED*/
+greg_t
+lx_fixsegreg(greg_t sr, model_t datamodel)
+{
+ uint16_t idx = SELTOIDX(sr);
+
+ ASSERT(sr == (sr & 0xffff));
+
+ /*
+ * If the segment selector is a valid TLS selector, just return it.
+ */
+ if (!SELISLDT(sr) && idx >= GDT_TLSMIN && idx <= GDT_TLSMAX)
+ return (sr | SEL_UPL);
+
+ /*
+ * Force the SR into the LDT in ring 3 for 32-bit processes.
+ *
+ * 64-bit processes get the null GDT selector since they are not
+ * allowed to have a private LDT.
+ */
+#if defined(__amd64)
+ return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0);
+#elif defined(__i386)
+ datamodel = datamodel; /* datamodel currently unused for 32-bit */
+ return (sr | SEL_TI_LDT | SEL_UPL);
+#endif /* __amd64 */
+}
+
+/*
+ * Brand-specific function to convert the fsbase as pulled from the register
+ * into a native fsbase suitable for locating the ulwp_t from the kernel.
+ */
+uintptr_t
+lx_fsbase(klwp_t *lwp, uintptr_t fsbase)
+{
+ lx_lwp_data_t *lwpd = lwp->lwp_brand;
+
+ if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND ||
+ lwpd->br_ntv_fsbase == NULL) {
+ return (fsbase);
+ }
+
+ return (lwpd->br_ntv_fsbase);
+}
+
+/*
+ * These two functions simulate winfo and post_sigcld for the lx brand. The
+ * difference is delivering a designated signal as opposed to always SIGCLD.
+ */
+static void
+lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat)
+{
+ ASSERT(MUTEX_HELD(&pidlock));
+ bzero(ip, sizeof (k_siginfo_t));
+ ip->si_signo = ltos_signo[dat->l_signal];
+ ip->si_code = pp->p_wcode;
+ ip->si_pid = pp->p_pid;
+ ip->si_ctid = PRCTID(pp);
+ ip->si_zoneid = pp->p_zone->zone_id;
+ ip->si_status = pp->p_wdata;
+ /*
+ * These siginfo values are converted to USER_HZ in the user-land
+ * brand signal code.
+ */
+ ip->si_stime = pp->p_stime;
+ ip->si_utime = pp->p_utime;
+}
+
+static void
+lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat)
+{
+ proc_t *pp = cp->p_parent;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+ mutex_enter(&pp->p_lock);
+ /*
+	 * Since Linux doesn't queue SIGCHLD, or any other non-RT
+ * signals, we just blindly deliver whatever signal we can.
+ */
+ ASSERT(sqp != NULL);
+ lx_winfo(cp, &sqp->sq_info, dat);
+ sigaddqa(pp, NULL, sqp);
+ sqp = NULL;
+ mutex_exit(&pp->p_lock);
+}
+
+
+/*
+ * Brand specific code for exiting and sending a signal to the parent, as
+ * opposed to sigcld().
+ */
+void
+lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp)
+{
+ proc_t *pp = cp->p_parent;
+ lx_proc_data_t *lx_brand_data = ptolxproc(cp);
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ switch (cp->p_wcode) {
+ case CLD_EXITED:
+ case CLD_DUMPED:
+ case CLD_KILLED:
+ ASSERT(cp->p_stat == SZOMB);
+ /*
+ * The broadcast on p_srwchan_cv is a kludge to
+ * wakeup a possible thread in uadmin(A_SHUTDOWN).
+ */
+ cv_broadcast(&cp->p_srwchan_cv);
+
+ /*
+ * Add to newstate list of the parent
+ */
+ add_ns(pp, cp);
+
+ cv_broadcast(&pp->p_cv);
+ if ((pp->p_flag & SNOWAIT) ||
+ PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) {
+ if (!(cp->p_pidflag & CLDWAITPID))
+ freeproc(cp);
+ } else if (!(cp->p_pidflag & CLDNOSIGCHLD) &&
+ lx_brand_data->l_signal != 0) {
+ lx_post_exit_sig(cp, sqp, lx_brand_data);
+ sqp = NULL;
+ }
+ break;
+
+ case CLD_STOPPED:
+ case CLD_CONTINUED:
+ case CLD_TRAPPED:
+ panic("Should not be called in this case");
+ }
+
+ if (sqp)
+ siginfofree(sqp);
+}
+
+/*
+ * Filters based on arguments that have been passed in by a separate syscall
+ * using the B_STORE_ARGS mechanism. If the __WALL flag is set, no filter is
+ * applied; otherwise we look at the difference between a clone and non-clone
+ * process.
+ * The definition of a clone process in Linux is a thread that does not deliver
+ * SIGCHLD to its parent. The option __WCLONE indicates to wait only on clone
+ * processes. Without that option, a process should only wait on normal
+ * children. The following table shows the cases.
+ *
+ * default __WCLONE
+ * no SIGCHLD - X
+ * SIGCHLD X -
+ *
+ * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on
+ * process exit.
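+ *
+ * For example (an illustrative case): a thread created with
+ * clone(CLONE_THREAD|...) exits without delivering SIGCHLD, so a default
+ * wait skips it while a wait with __WCLONE reaps it; a child created via
+ * fork(), which does deliver SIGCHLD, behaves the other way around.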
+ *
+ * More information on wait in lx brands can be found at
+ * usr/src/lib/brand/lx/lx_brand/common/wait.c.
+ */
+/* ARGSUSED */
+boolean_t
+lx_wait_filter(proc_t *pp, proc_t *cp)
+{
+ lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+ int flags = lwpd->br_waitid_flags;
+ boolean_t ret;
+
+ if (!lwpd->br_waitid_emulate) {
+ return (B_TRUE);
+ }
+
+ mutex_enter(&cp->p_lock);
+ if (flags & LX_WALL) {
+ ret = B_TRUE;
+ } else {
+ lx_proc_data_t *pd = ptolxproc(cp);
+ boolean_t is_sigchld = B_TRUE;
+ boolean_t match_wclone = B_FALSE;
+
+ /*
+ * When calling clone, an alternate signal can be chosen to
+ * deliver to the parent when the child exits.
+ */
+ if (pd != NULL && pd->l_signal != stol_signo[SIGCHLD]) {
+ is_sigchld = B_FALSE;
+ }
+ if ((flags & LX_WCLONE) != 0) {
+ match_wclone = B_TRUE;
+ }
+
+ ret = (match_wclone ^ is_sigchld) ? B_TRUE : B_FALSE;
+ }
+ mutex_exit(&cp->p_lock);
+
+ return (ret);
+}
+
+void
+lx_ifname_convert(char *ifname, lx_if_action_t act)
+{
+ if (act == LX_IF_TONATIVE) {
+ if (strncmp(ifname, "lo", IFNAMSIZ) == 0)
+ (void) strlcpy(ifname, "lo0", IFNAMSIZ);
+ } else {
+ if (strncmp(ifname, "lo0", IFNAMSIZ) == 0)
+ (void) strlcpy(ifname, "lo", IFNAMSIZ);
+ }
+}
+
+void
+lx_ifflags_convert(uint64_t *flags, lx_if_action_t act)
+{
+ uint64_t buf;
+
+ buf = *flags & (IFF_UP | IFF_BROADCAST | IFF_DEBUG |
+ IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS |
+ IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI);
+
+	/* Linux uses a different bit (0x1000) for the multicast flag */
+ if (act == LX_IF_TONATIVE) {
+ if (*flags & 0x1000)
+ buf |= IFF_MULTICAST;
+ } else {
+ if (*flags & IFF_MULTICAST)
+ buf |= 0x1000;
+ }
+ *flags = buf;
+}
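+
+/*
+ * For example (illustrative): a running loopback interface with native
+ * IFF_MULTICAST set converts to Linux as IFF_UP|IFF_LOOPBACK|IFF_RUNNING
+ * with the Linux multicast bit (0x1000) set in place of IFF_MULTICAST.
+ */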
+
+/*
+ * Convert an IPv6 address into the scope value used by /proc/net/if_inet6
+ */
+unsigned int
+lx_ipv6_scope_convert(const in6_addr_t *addr)
+{
+ if (IN6_IS_ADDR_V4COMPAT(addr)) {
+ return (LX_IPV6_ADDR_COMPATv4);
+ } else if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) {
+ return (LX_IPV6_ADDR_LOOPBACK);
+ } else if (IN6_IS_ADDR_LINKLOCAL(addr)) {
+ return (LX_IPV6_ADDR_LINKLOCAL);
+ } else if (IN6_IS_ADDR_SITELOCAL(addr)) {
+ return (LX_IPV6_ADDR_SITELOCAL);
+ } else {
+ return (0x0000U);
+ }
+}
+
+
+void
+lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size)
+{
+ int copy_size = MIN(src->sdl_alen, sizeof (dst->sa_data));
+
+ switch (src->sdl_type) {
+ case DL_ETHER:
+ dst->sa_family = LX_ARPHRD_ETHER;
+ break;
+ case DL_LOOP:
+ dst->sa_family = LX_ARPHRD_LOOPBACK;
+ break;
+ default:
+ dst->sa_family = LX_ARPHRD_VOID;
+ }
+
+ bcopy(LLADDR(src), dst->sa_data, copy_size);
+ *size = copy_size;
+}
+
+/*
+ * Brand hook to convert native kernel siginfo signal number, errno, code, pid
+ * and si_status to Linux values. Similar to the stol_ksiginfo function but
+ * this one converts in-place, converts the pid, and does not copyout.
+ */
+void
+lx_sigfd_translate(k_siginfo_t *infop)
+{
+ zone_t *zone = curproc->p_zone;
+
+ infop->si_signo = lx_stol_signo(infop->si_signo, LX_SIGKILL);
+ infop->si_status = lx_stol_status(infop->si_status, LX_SIGKILL);
+ infop->si_code = lx_stol_sigcode(infop->si_code);
+ infop->si_errno = lx_errno(infop->si_errno, EINVAL);
+
+ /* Map zsched and zone init to pid 1 */
+ if (infop->si_pid == zone->zone_proc_initpid ||
+ infop->si_pid == zone->zone_zsched->p_pid) {
+ infop->si_pid = 1;
+ }
+}
+
+int
+stol_ksiginfo_copyout(k_siginfo_t *sip, void *ulxsip)
+{
+ lx_siginfo_t lsi;
+
+ bzero(&lsi, sizeof (lsi));
+ lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
+ lsi.lsi_code = lx_stol_sigcode(sip->si_code);
+ lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
+
+ switch (lsi.lsi_signo) {
+ case LX_SIGPOLL:
+ lsi.lsi_band = sip->si_band;
+ lsi.lsi_fd = sip->si_fd;
+ break;
+
+ case LX_SIGCHLD:
+ lsi.lsi_pid = sip->si_pid;
+ if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
+ lsi.lsi_status = sip->si_status;
+ } else {
+ lsi.lsi_status = lx_stol_status(sip->si_status,
+ SIGKILL);
+ }
+ lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
+ lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
+ break;
+
+ case LX_SIGILL:
+ case LX_SIGBUS:
+ case LX_SIGFPE:
+ case LX_SIGSEGV:
+ lsi.lsi_addr = sip->si_addr;
+ break;
+
+ default:
+ lsi.lsi_pid = sip->si_pid;
+ lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
+ }
+
+ if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
+
+#if defined(_SYSCALL32_IMPL)
+int
+stol_ksiginfo32_copyout(k_siginfo_t *sip, void *ulxsip)
+{
+ lx_siginfo32_t lsi;
+
+ bzero(&lsi, sizeof (lsi));
+ lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
+ lsi.lsi_code = lx_stol_sigcode(sip->si_code);
+ lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
+
+ switch (lsi.lsi_signo) {
+ case LX_SIGPOLL:
+ lsi.lsi_band = sip->si_band;
+ lsi.lsi_fd = sip->si_fd;
+ break;
+
+ case LX_SIGCHLD:
+ lsi.lsi_pid = sip->si_pid;
+ if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
+ lsi.lsi_status = sip->si_status;
+ } else {
+ lsi.lsi_status = lx_stol_status(sip->si_status,
+ SIGKILL);
+ }
+ lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
+ lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
+ break;
+
+ case LX_SIGILL:
+ case LX_SIGBUS:
+ case LX_SIGFPE:
+ case LX_SIGSEGV:
+ lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr;
+ break;
+
+ default:
+ lsi.lsi_pid = sip->si_pid;
+ lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
+ }
+
+ if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * Linux uses the original bounds of the argv array when determining the
+ * contents of /proc/<pid>/cmdline. We mimic those bounds using argv[0] and
+ * envp[0] as the beginning and end, respectively.
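+ *
+ * For example (illustrative): for a process exec'd as "ls -l /tmp",
+ * l_args_start holds the address of the 'l' in "ls" and l_envs_start the
+ * address of the first environment string; the NUL-separated bytes in
+ * between are presented as the cmdline contents.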
+ */
+void
+lx_read_argv_bounds(proc_t *p)
+{
+ user_t *up = PTOU(p);
+ lx_proc_data_t *pd = ptolxproc(p);
+ uintptr_t addr_arg = up->u_argv;
+ uintptr_t addr_env = up->u_envp;
+ uintptr_t arg_start = 0, env_start = 0, env_end = 0;
+ int i = 0;
+
+ VERIFY(pd != NULL);
+ VERIFY(MUTEX_HELD(&p->p_lock));
+
+ /*
+ * Use AT_SUN_PLATFORM in the aux vector to find the end of the envp
+ * strings.
+ */
+ for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
+ if (up->u_auxv[i].a_type == AT_SUN_PLATFORM) {
+ env_end = (uintptr_t)up->u_auxv[i].a_un.a_val;
+ }
+ }
+
+ /*
+ * If we come through here for a kernel process (zsched), which happens
+ * with our cgroupfs when we fork the release agent, then u_argv and
+ * u_envp will be NULL. While this won't cause a failure, it does
+ * cause a lot of overhead when the fuword causes a fault, which leads
+ * to a large amount of stack growth and anonymous memory allocation,
+ * all of which is pointless since the first page can't be mapped.
+ */
+ if (addr_arg != NULL || addr_env != NULL) {
+ mutex_exit(&p->p_lock);
+#if defined(_LP64)
+ if (p->p_model != DATAMODEL_NATIVE) {
+ uint32_t buf32;
+ if (fuword32((void *)addr_arg, &buf32) == 0) {
+ arg_start = (uintptr_t)buf32;
+ }
+ if (fuword32((void *)addr_env, &buf32) == 0) {
+ env_start = (uintptr_t)buf32;
+ }
+ } else
+#endif /* defined(_LP64) */
+ {
+ ulong_t buf;
+ if (fulword((void *)addr_arg, &buf) == 0) {
+ arg_start = (uintptr_t)buf;
+ }
+ if (fulword((void *)addr_env, &buf) == 0) {
+ env_start = (uintptr_t)buf;
+ }
+ }
+ mutex_enter(&p->p_lock);
+ }
+
+ pd->l_args_start = arg_start;
+ pd->l_envs_start = env_start;
+ pd->l_envs_end = env_end;
+}
+
+/* Given an LX LWP, determine where user register state is stored. */
+lx_regs_location_t
+lx_regs_location(lx_lwp_data_t *lwpd, void **ucp, boolean_t for_write)
+{
+ switch (lwpd->br_stack_mode) {
+ case LX_STACK_MODE_BRAND:
+ /*
+ * The LWP was stopped with the brand stack and register state
+ * loaded, e.g. during a syscall emulated within the kernel.
+ */
+ return (LX_REG_LOC_LWP);
+
+ case LX_STACK_MODE_PREINIT:
+ if (for_write) {
+ /* setting registers not allowed in this state */
+ break;
+ }
+ if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED ||
+ lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) {
+ /* The LWP was stopped by tracing on exec. */
+ return (LX_REG_LOC_LWP);
+ }
+ break;
+
+ case LX_STACK_MODE_NATIVE:
+ if (for_write) {
+ /* setting registers not allowed in this state */
+ break;
+ }
+ if (lwpd->br_ptrace_whystop == PR_BRAND) {
+ /* Called while ptrace-event-stopped by lx_exec. */
+ if (lwpd->br_ptrace_whatstop == LX_PR_EVENT) {
+ return (LX_REG_LOC_LWP);
+ }
+
+ /* Called while ptrace-event-stopped after clone. */
+ if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED &&
+ lwpd->br_ptrace_stopsig == LX_SIGSTOP &&
+ (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
+ return (LX_REG_LOC_LWP);
+ }
+
+ /*
+ * Called to obtain syscall exit for other cases
+ * (e.g. pseudo return from rt_sigreturn).
+ */
+ if (lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT &&
+ (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
+ return (LX_REG_LOC_LWP);
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (lwpd->br_ptrace_stopucp != NULL) {
+ /*
+ * The LWP was stopped in the usermode emulation library
+ * but a ucontext_t for the preserved brand stack and
+ * register state was provided. Return the register state
+ * from that ucontext_t.
+ */
+ VERIFY(ucp != NULL);
+ *ucp = (void *)lwpd->br_ptrace_stopucp;
+ return (LX_REG_LOC_UCP);
+ }
+
+ return (LX_REG_LOC_UNAVAIL);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_pid.c b/usr/src/uts/common/brand/lx/os/lx_pid.c
new file mode 100644
index 0000000000..8439a23e58
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_pid.c
@@ -0,0 +1,499 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/var.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/brand.h>
+#include <sys/zone.h>
+#include <sys/lx_brand.h>
+
+#define LINUX_PROC_FACTOR 8 /* factor down the hash table by this */
+static int hash_len = 4; /* desired average hash chain length */
+static int hash_size; /* no of buckets in the hash table */
+
+static struct lx_pid **stol_pid_hash;
+static struct lx_pid **ltos_pid_hash;
+
+#define LTOS_HASH(pid) ((pid) & (hash_size - 1))
+#define STOL_HASH(pid, tid) (((pid) + (tid)) & (hash_size - 1))
+
+static kmutex_t hash_lock;
+
+static void
+lx_pid_insert_hash(struct lx_pid *lpidp)
+{
+ int shash = STOL_HASH(lpidp->lxp_spid, lpidp->lxp_stid);
+ int lhash = LTOS_HASH(lpidp->lxp_lpid);
+
+ ASSERT(MUTEX_HELD(&hash_lock));
+
+ lpidp->lxp_stol_next = stol_pid_hash[shash];
+ stol_pid_hash[shash] = lpidp;
+
+ lpidp->lxp_ltos_next = ltos_pid_hash[lhash];
+ ltos_pid_hash[lhash] = lpidp;
+}
+
+static struct lx_pid *
+lx_pid_remove_hash(pid_t pid, id_t tid)
+{
+ struct lx_pid **hpp;
+ struct lx_pid *lpidp = NULL;
+
+ ASSERT(MUTEX_HELD(&hash_lock));
+
+ hpp = &stol_pid_hash[STOL_HASH(pid, tid)];
+ while (*hpp) {
+ if ((*hpp)->lxp_spid == pid && (*hpp)->lxp_stid == tid) {
+ lpidp = *hpp;
+ *hpp = (*hpp)->lxp_stol_next;
+ break;
+ }
+ hpp = &(*hpp)->lxp_stol_next;
+ }
+
+ /*
+	 * When called during error recovery, the pid may already
+	 * have been released.
+ */
+ if (lpidp == NULL)
+ return (NULL);
+
+ hpp = &ltos_pid_hash[LTOS_HASH(lpidp->lxp_lpid)];
+ while (*hpp) {
+ if (*hpp == lpidp) {
+ *hpp = lpidp->lxp_ltos_next;
+ break;
+ }
+ hpp = &(*hpp)->lxp_ltos_next;
+ }
+
+ return (lpidp);
+}
+
+/*
+ * given a solaris pid/tid pair, create a linux pid
+ */
+void
+lx_pid_assign(kthread_t *t, struct lx_pid *lpidp)
+{
+ proc_t *p = ttoproc(t);
+ lx_lwp_data_t *lwpd = ttolxlwp(t);
+ pid_t spid = p->p_pid;
+ id_t stid = t->t_tid;
+
+ /*
+ * When lx_initlwp is called from lx_setbrand, p_lwpcnt will already be
+ * equal to 1. Since lx_initlwp is being called against an lwp that
+ * already exists, an additional pid allocation is not necessary.
+ *
+ * We check for this by testing br_ppid == 0.
+ */
+ if (p->p_lwpcnt > 0 && lwpd->br_ppid != 0) {
+ /*
+ * Assign allocated pid to any thread other than the first.
+ * The lpid and pidp fields should be populated.
+ */
+ VERIFY(lpidp->lxp_pidp != NULL);
+ VERIFY(lpidp->lxp_lpid != 0);
+ } else {
+ /*
+ * There are cases where a pid is speculatively allocated but
+ * is not needed. We are obligated to free it here.
+ */
+ if (lpidp->lxp_pidp != NULL) {
+ (void) pid_rele(lpidp->lxp_pidp);
+ }
+ lpidp->lxp_pidp = NULL;
+ lpidp->lxp_lpid = spid;
+ }
+
+ lpidp->lxp_spid = spid;
+ lpidp->lxp_stid = stid;
+ lpidp->lxp_start = t->t_start;
+ lpidp->lxp_procp = p;
+
+ /*
+ * Now place the pid into the Linux-SunOS and SunOS-Linux conversion
+ * hash tables.
+ */
+ mutex_enter(&hash_lock);
+ lx_pid_insert_hash(lpidp);
+ mutex_exit(&hash_lock);
+
+ lwpd->br_pid = lpidp->lxp_lpid;
+}
+
+/*
+ * If we are exec()ing the process, this thread's tid is about to be reset
+ * to 1. Make sure the Linux PID bookkeeping reflects that change.
+ */
+void
+lx_pid_reassign(kthread_t *t)
+{
+ proc_t *p = ttoproc(t);
+ struct pid *old_pidp;
+ struct lx_pid *lpidp;
+
+ ASSERT(p->p_lwpcnt == 1);
+
+ mutex_enter(&hash_lock);
+
+ /*
+ * Clean up all the traces of this thread's 'fake' Linux PID.
+ */
+ lpidp = lx_pid_remove_hash(p->p_pid, t->t_tid);
+ ASSERT(lpidp != NULL);
+ old_pidp = lpidp->lxp_pidp;
+ lpidp->lxp_pidp = NULL;
+
+ /*
+ * Now register this thread as (pid, 1).
+ */
+ lpidp->lxp_lpid = p->p_pid;
+ lpidp->lxp_spid = p->p_pid;
+ lpidp->lxp_stid = 1;
+ lx_pid_insert_hash(lpidp);
+
+ mutex_exit(&hash_lock);
+
+ if (old_pidp)
+ (void) pid_rele(old_pidp);
+}
+
+/*
+ * release a solaris pid/tid pair
+ */
+void
+lx_pid_rele(pid_t pid, id_t tid)
+{
+ struct lx_pid *lpidp;
+
+ mutex_enter(&hash_lock);
+ lpidp = lx_pid_remove_hash(pid, tid);
+ mutex_exit(&hash_lock);
+
+ if (lpidp) {
+ if (lpidp->lxp_pidp)
+ (void) pid_rele(lpidp->lxp_pidp);
+
+ kmem_free(lpidp, sizeof (*lpidp));
+ }
+}
+
+/*
+ * given a linux pid, return the solaris pid/tid pair
+ */
+int
+lx_lpid_to_spair(pid_t lpid, pid_t *spid, id_t *stid)
+{
+ struct lx_pid *hp;
+
+ if (lpid == 1) {
+ pid_t initpid;
+
+ /*
+ * We are trying to look up the Linux init process for the
+ * current zone, which we pretend has pid 1.
+ */
+ if ((initpid = curzone->zone_proc_initpid) == -1) {
+ /*
+ * We could not find the init process for this zone.
+ */
+ return (-1);
+ }
+
+ if (spid != NULL)
+ *spid = initpid;
+ if (stid != NULL)
+ *stid = 1;
+
+ return (0);
+ }
+
+ mutex_enter(&hash_lock);
+ for (hp = ltos_pid_hash[LTOS_HASH(lpid)]; hp != NULL;
+ hp = hp->lxp_ltos_next) {
+ if (hp->lxp_lpid == lpid) {
+ if (spid)
+ *spid = hp->lxp_spid;
+ if (stid)
+ *stid = hp->lxp_stid;
+ break;
+ }
+ }
+ mutex_exit(&hash_lock);
+ if (hp != NULL)
+ return (0);
+
+ /*
+ * We didn't find this pid in our translation table.
+ * But this still could be the pid of a native process
+ * running in the current zone so check for that here.
+ *
+ * Note that prfind() only searches for processes in the current zone.
+ */
+ mutex_enter(&pidlock);
+ if (prfind(lpid) != NULL) {
+ mutex_exit(&pidlock);
+ if (spid)
+ *spid = lpid;
+ if (stid)
+ *stid = 0;
+ return (0);
+ }
+ mutex_exit(&pidlock);
+
+ return (-1);
+}
+
+/*
+ * Given a Linux pid, locate the proc_t and optionally acquire P_PR_LOCK.
+ * Returns 0 on success with p_lock held for the proc_t in question.
+ */
+int
+lx_lpid_lock(pid_t lpid, zone_t *zone, lx_pid_flag_t flag, proc_t **pp,
+ kthread_t **tp)
+{
+ proc_t *p;
+ kthread_t *t;
+ id_t tid = 0;
+
+ ASSERT(MUTEX_NOT_HELD(&pidlock));
+ ASSERT(pp != NULL);
+ ASSERT(zone != NULL && zone->zone_brand == &lx_brand);
+
+retry:
+ p = NULL;
+ if (lpid == 1) {
+ pid_t initpid;
+
+ /*
+ * Look up the init process for the zone.
+ */
+ if ((initpid = zone->zone_proc_initpid) <= 0) {
+ return (-1);
+ }
+ mutex_enter(&pidlock);
+ p = prfind_zone(initpid, zone->zone_id);
+ tid = 0;
+ } else {
+ struct lx_pid *hp;
+
+ mutex_enter(&pidlock);
+ mutex_enter(&hash_lock);
+ for (hp = ltos_pid_hash[LTOS_HASH(lpid)]; hp != NULL;
+ hp = hp->lxp_ltos_next) {
+ if (hp->lxp_lpid == lpid) {
+ tid = hp->lxp_stid;
+ p = hp->lxp_procp;
+ break;
+ }
+ }
+ mutex_exit(&hash_lock);
+ /*
+ * If the pid wasn't listed in the ltos hash, it may correspond
+		 * to a native process in the zone.
+ */
+ if (p == NULL) {
+ p = prfind_zone(lpid, zone->zone_id);
+ tid = 0;
+ }
+ }
+
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ return (-1);
+ }
+
+ /*
+ * Bail on processes belonging to the system, those which are not yet
+ * complete and zombies (unless explicitly allowed via the flags).
+ */
+ if (p->p_stat == SIDL || (p->p_flag & SSYS) != 0 ||
+ (p->p_stat == SZOMB && (flag & LXP_ZOMBOK) == 0)) {
+ mutex_exit(&pidlock);
+ return (-1);
+ }
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ if (flag & LXP_PRLOCK) {
+ /*
+ * It would be convenient to call sprtrylock_proc() for this
+ * task. Unfortunately, its behavior of filtering zombies is
+ * excessive for some lx_proc use cases. Instead, when the
+ * provided flags do not indicate that zombies are allowed,
+ * exiting processes are filtered out (as would be performed by
+ * sprtrylock_proc).
+ */
+ if ((p->p_flag & (SEXITING|SEXITLWPS)) != 0 &&
+ (flag & LXP_ZOMBOK) == 0) {
+ mutex_exit(&p->p_lock);
+ return (-1);
+ }
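+		/*
+		 * If another consumer already holds P_PR_LOCK, wait for it
+		 * to be released (dropping p_lock in the process) and retry
+		 * the lookup from the top.
+		 */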
+ if (p->p_proc_flag & P_PR_LOCK) {
+ sprwaitlock_proc(p);
+ goto retry;
+ } else {
+ p->p_proc_flag |= P_PR_LOCK;
+ THREAD_KPRI_REQUEST();
+ }
+ }
+
+ if (tid == 0) {
+ t = p->p_tlist;
+ } else {
+ lwpdir_t *ld;
+
+ ld = lwp_hash_lookup(p, tid);
+ if (ld == NULL) {
+ if (flag & LXP_PRLOCK) {
+ sprunprlock(p);
+ }
+ mutex_exit(&p->p_lock);
+ return (-1);
+ }
+ t = ld->ld_entry->le_thread;
+ }
+ *pp = p;
+ if (tp != NULL) {
+ *tp = t;
+ }
+ return (0);
+}
+
+
+/*
+ * Given an lwp, return the Linux pid of its parent. If the caller
+ * wants them, we return the SunOS (pid, tid) as well.
+ */
+pid_t
+lx_lwp_ppid(klwp_t *lwp, pid_t *ppidp, id_t *ptidp)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ proc_t *p = lwptoproc(lwp);
+ const pid_t zoneinit = p->p_zone->zone_proc_initpid;
+ const pid_t ppid = p->p_ppid;
+
+ /*
+	 * Report a ppid of 1 for processes which are children of either init
+ * or a process outside the zone.
+ */
+ if (ppid == zoneinit || (p->p_flag & SZONETOP) != 0) {
+ goto ppid_is_zinit;
+ }
+
+ /*
+ * Our native concept of a 'parent pid' matches Linux in two cases:
+ *
+ * - TGID and PID are equal: This is either the first thread in the
+ * process or one created with CLONE_THREAD.
+ *
+ * - The brand lwp value for PPID is 0: This is either the child of a
+	 *   differently-branded process or was created with the CLONE_PARENT flag.
+ */
+ if (p->p_pid == lwpd->br_tgid || lwpd->br_ppid == 0) {
+ if (ppidp != NULL)
+ *ppidp = ppid;
+ if (ptidp != NULL)
+ *ptidp = -1;
+ return (ppid);
+ }
+
+ /*
+ * In all other cases, we are looking for the parent of this specific
+ * thread, which in Linux refers to the thread that clone(2)d it. We
+ * stashed that thread's PID away when this thread was created.
+ */
+ mutex_enter(&hash_lock);
+ for (struct lx_pid *hp = ltos_pid_hash[LTOS_HASH(lwpd->br_ppid)];
+ hp != NULL; hp = hp->lxp_ltos_next) {
+ if (lwpd->br_ppid == hp->lxp_lpid) {
+ /*
+ * The PID matches, but there are a couple cases when
+ * the translation is not suitable:
+ *
+ * - The cached start time is too young, indicating
+ * that the thread exited and the PID was reused by
+ * another process.
+ * - The parent is zoneinit
+ *
+ * In both cases, a result of ppid=1 is yielded.
+ */
+ if (hp->lxp_start > lwptot(lwp)->t_start ||
+ lwpd->br_ppid == zoneinit) {
+ break;
+ }
+
+ /* Good match, yield the result */
+ if (ppidp != NULL)
+ *ppidp = hp->lxp_spid;
+ if (ptidp != NULL)
+ *ptidp = hp->lxp_stid;
+ mutex_exit(&hash_lock);
+ return (lwpd->br_ppid);
+ }
+ }
+ mutex_exit(&hash_lock);
+ /*
+ * If no match is found in the Linux->SunOS translation hash, fall back
+ * to assuming the zone init process as the parent.
+ */
+
+ppid_is_zinit:
+ if (ppidp != NULL)
+ *ppidp = 1;
+ if (ptidp != NULL)
+ *ptidp = -1;
+ return (1);
+}
+
+void
+lx_pid_init(void)
+{
+ hash_size = 1 << highbit(v.v_proc / (hash_len * LINUX_PROC_FACTOR));
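+	/*
+	 * For example (illustrative): with v.v_proc = 30000, the factored
+	 * value is 30000 / (4 * 8) = 937, so hash_size = 1 << highbit(937)
+	 * = 1024 buckets.
+	 */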
+
+ stol_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size,
+ KM_SLEEP);
+ ltos_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size,
+ KM_SLEEP);
+
+ mutex_init(&hash_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+lx_pid_fini(void)
+{
+ kmem_free(stol_pid_hash, sizeof (struct lx_pid *) * hash_size);
+ kmem_free(ltos_pid_hash, sizeof (struct lx_pid *) * hash_size);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_ptrace.c b/usr/src/uts/common/brand/lx/os/lx_ptrace.c
new file mode 100644
index 0000000000..252f83fd3f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_ptrace.c
@@ -0,0 +1,2710 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+/*
+ * Emulation of the Linux ptrace(2) interface.
+ *
+ * OVERVIEW
+ *
+ * The Linux process model is somewhat different from the illumos native
+ * model. One critical difference is that each Linux thread has a unique
+ * identifier in the pid namespace. The lx brand assigns a pid to each LWP
+ * within the emulated process, giving the pid of the process itself to the
+ * first LWP.
+ *
+ * The Linux ptrace(2) interface allows for any LWP in a branded process to
+ * exert control over any other LWP within the same zone. Control is exerted
+ * by the use of the ptrace(2) system call itself, which accepts a number of
+ * request codes. Feedback on traced events is primarily received by the
+ * tracer through SIGCLD and the emulated waitpid(2) and waitid(2) system
+ * calls. Many of the possible ptrace(2) requests will only succeed if the
+ * target LWP is in a "ptrace-stop" condition.
+ *
+ * HISTORY
+ *
+ * The brand support for ptrace(2) was originally built on top of the rich
+ * support for debugging and tracing provided through the illumos /proc
+ * interfaces, mounted at /native/proc within the zone. The native legacy
+ * ptrace(3C) functionality was used as a starting point, but was generally
+ * insufficient for complete and precise emulation. The extant legacy
+ * interface, and indeed our native SIGCLD and waitid(2) facilities, are
+ * focused on _process_ level concerns -- the Linux interface has been
+ * extended to be aware of LWPs as well.
+ *
+ * In order to allow us to focus on providing more complete and accurate
+ * emulation without extensive and undesirable changes to the native
+ * facilities, this second generation ptrace(2) emulation is mostly separate
+ * from any other tracing or debugging framework in the system.
+ *
+ * ATTACHING TRACERS TO TRACEES
+ *
+ * There are several ways that a child LWP may become traced by a tracer.
+ * To determine which attach method caused a tracee to become attached, one
+ * may inspect the "br_ptrace_attach" member of the LWP-specific brand data
+ * with the debugger.
+ *
+ * The first attach methods to consider are the attaching ptrace(2) requests:
+ *
+ * PTRACE_TRACEME
+ *
+ * If an LWP makes a PTRACE_TRACEME call, it will be attached as a tracee
+ * to its parent LWP (br_ppid). Using PTRACE_TRACEME does _not_ cause the
+ * tracee to be held in a stop condition. It is common practice for
+ * consumers to raise(SIGSTOP) immediately afterward.
+ *
+ * PTRACE_ATTACH
+ *
+ * An LWP may attempt to trace any other LWP in this, or another, process.
+ * We currently allow any attach where the process containing the tracer
+ * LWP has permission to write to /proc for the process containing the
+ * intended tracee. This action also sends a SIGSTOP to the newly attached
+ * tracee.
+ *
+ * The second class of attach methods are the clone(2)/fork(2) inheritance
+ * options that may be set on a tracee with PTRACE_SETOPTIONS:
+ *
+ * PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK and PTRACE_O_TRACECLONE
+ *
+ * If these options have been set on a tracee, then a fork(2), vfork(2) or
+ * clone(2) respectively will cause the newly created LWP to be traced by
+ * the same tracer. The same set of ptrace(2) options will also be set on
+ * the new child.
+ *
+ * The third class of attach method is the CLONE_PTRACE flag to clone(2).
+ * This flag induces the same inheritance as PTRACE_O_TRACECLONE, but is
+ * passed by the tracee as an argument to clone(2).
+ *
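+ *   As an illustration, the two request-based attach methods look like
+ *   this from Linux usermode (a minimal sketch, not taken from this
+ *   code base; "pid" and "status" are assumed locals):
+ *
+ *       (void) ptrace(PTRACE_TRACEME, 0, NULL, NULL);   (tracee side)
+ *       (void) raise(SIGSTOP);
+ *
+ *       (void) ptrace(PTRACE_ATTACH, pid, NULL, NULL);  (tracer side)
+ *       (void) waitpid(pid, &status, 0);
+ *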
+ * DETACHING TRACEES
+ *
+ * Tracees can be detached by the tracer with the PTRACE_DETACH request.
+ * This request is only valid when the tracee is in a ptrace(2) stop
+ * condition, and is itself a restarting action.
+ *
+ * If the tracer exits without detaching all of its tracees, then all of the
+ * tracees are automatically detached and restarted. If a tracee was in
+ * "signal-delivery-stop" at the time the tracer exited, the signal will be
+ * released to the child unless it is a SIGSTOP. We drop this instance of
+ * SIGSTOP in order to prevent the child from becoming stopped by job
+ * control.
+ *
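+ *   For example, a tracer holding a tracee in "ptrace-stop" detaches
+ *   it, suppressing any pending signal, with (usermode sketch; "pid"
+ *   is an assumed local):
+ *
+ *       (void) ptrace(PTRACE_DETACH, pid, NULL, 0);
+ *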
+ * ACCORD ALLOCATION AND MANAGEMENT
+ *
+ * The "lx_ptrace_accord_t" object tracks the agreement between a tracer LWP
+ * and zero or more tracee LWPs. It is explicitly illegal for a tracee to
+ * trace its tracer, and we block this in PTRACE_ATTACH/PTRACE_TRACEME.
+ *
+ * An LWP starts out without an accord. If a child of that LWP calls
+ * ptrace(2) with the PTRACE_TRACEME subcommand, or if the LWP itself uses
+ * PTRACE_ATTACH, an accord will be allocated and stored on that LWP. The
+ * accord structure is not released from that LWP until it arrives in
+ * lx_exitlwp(), as called by lwp_exit(). Once an LWP arrives in
+ * lx_exitlwp() and sets the LX_PTF_EXITING flag, an accord will not be
+ * allocated for it, even if it never had one. An LWP will have at most
+ * one accord structure throughout its entire lifecycle; once it has
+ * one, it has the same one until death.
+ *
+ * The accord is reference counted (lxpa_refcnt), starting at a count of one
+ * at creation to represent the link from the tracer LWP to its accord. The
+ * accord is not freed until the reference count falls to zero.
+ *
+ * To make mutual exclusion between a detaching tracer and various notifying
+ * tracees simpler, the tracer will hold "pidlock" while it clears the
+ * accord members that point back to the tracer LWP and CV.
+ *
+ * SIGNALS AND JOB CONTROL
+ *
+ * Various actions, either directly ptrace(2) related or commonly associated
+ * with tracing, cause process- or thread-directed SIGSTOP signals to be sent
+ * to tracees (a "signal-delivery-stop"). These signals, and indeed any signal
+ * other than SIGKILL, can be suppressed by the tracer when using a restarting
+ * request (including PTRACE_DETACH) on a child. The signal may also be
+ * substituted for a different signal.
+ *
+ * If a SIGSTOP (or other stopping signal) is not suppressed by the tracer,
+ * it will induce the regular illumos native job control stop of the entire
+ * traced process. This is at least passingly similar to the Linux "group
+ * stop" ptrace(2) condition.
+ *
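+ *   For example, a tracer that has observed a signal-delivery-stop may
+ *   suppress the signal, or substitute another, through the final
+ *   argument of the restarting request (usermode sketch; "pid" is an
+ *   assumed local):
+ *
+ *       (void) ptrace(PTRACE_CONT, pid, NULL, 0);        (suppress)
+ *       (void) ptrace(PTRACE_CONT, pid, NULL, SIGTERM);  (substitute)
+ *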
+ * SYSTEM CALL TRACING
+ *
+ * The ptrace(2) interface enables the tracer to hold the tracee on entry and
+ * exit from system calls. When a stopped tracee is restarted through the
+ * PTRACE_SYSCALL request, the LX_PTF_SYSCALL flag is set until the next
+ * system call boundary. Whether this is a "syscall-entry-stop" or
+ * "syscall-exit-stop", the tracee is held and the tracer is notified via
+ * SIGCLD/waitpid(2) in the usual way. The LX_PTF_SYSCALL flag is
+ * cleared after each stop; for ongoing system call tracing the tracee must
+ * be continuously restarted with PTRACE_SYSCALL.
+ *
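+ *   A consumer performing ongoing system call tracing therefore loops
+ *   along these lines (usermode sketch; assumes PTRACE_O_TRACESYSGOOD
+ *   has been set so that syscall stops report SIGTRAP | 0x80):
+ *
+ *       for (;;) {
+ *               (void) ptrace(PTRACE_SYSCALL, pid, NULL, 0);
+ *               (void) waitpid(pid, &status, 0);
+ *               if (WIFSTOPPED(status) &&
+ *                   WSTOPSIG(status) == (SIGTRAP | 0x80)) {
+ *                       (inspect registers at entry or exit)
+ *               }
+ *       }
+ *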
+ * SPECIAL CASES FOR STOP EVENTS
+ *
+ * The strace command is one of the primary consumers of ptrace. In order for
+ * strace to properly understand what is actually happening when it receives a
+ * signal associated with a stop event, these signals must match Linux behavior
+ * exactly or the strace consumer will get out of sync and report incorrect
+ * state. There are a couple of special cases we have to handle to provide
+ * proper interaction of the syscall-entry-stop, syscall-exit-stop, and
+ * signal-delivery-stop events:
+ * 1) The child process of a clone/fork does not emit a syscall-exit-stop event.
+ * 2) A signal that arrives between syscall-enter-stop & syscall-exit-stop must
+ * not immediately emit signal-delivery-stop. This event must be emitted
+ * after the syscall is interrupted and syscall-exit-stop has been emitted.
+ *
+ * EVENT STOPS
+ *
+ * Various events (particularly FORK, VFORK, CLONE, EXEC and EXIT) are
+ * enabled by the tracer through PTRACE_SETOPTIONS. Once enabled, the tracee
+ * will be stopped at the nominated points of interest and the tracer
+ * notified. The tracer may request additional information about the event,
+ * such as the pid of new LWPs and processes, via PTRACE_GETEVENTMSG.
+ *
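+ *   For instance, a tracer interested in fork(2) might arrange for an
+ *   event stop and then fetch the new child's pid like so (usermode
+ *   sketch; "pid", "msg" and "child" are assumed locals):
+ *
+ *       (void) ptrace(PTRACE_SETOPTIONS, pid, NULL,
+ *           (void *)PTRACE_O_TRACEFORK);
+ *       ...
+ *       unsigned long msg;
+ *       (void) ptrace(PTRACE_GETEVENTMSG, pid, NULL, &msg);
+ *       child = (pid_t)msg;
+ *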
+ * LOCK ORDERING RULES
+ *
+ * It is not safe, in general, to hold p_lock for two different processes at
+ * the same time. This constraint is the primary reason for the existence
+ * (and complexity) of the ptrace(2) accord mechanism.
+ *
+ * In order to facilitate looking up accords by the "pid" of a tracer LWP,
+ * p_lock for the tracer process may be held while entering the accord mutex
+ * (lxpa_lock). This mutex protects the accord flags and reference count.
+ * The reference count is manipulated through lx_ptrace_accord_hold() and
+ * lx_ptrace_accord_rele().
+ *
+ * DO NOT interact with the accord mutex (lxpa_lock) directly. The
+ * lx_ptrace_accord_enter() and lx_ptrace_accord_exit() functions do various
+ * book-keeping and lock ordering enforcement and MUST be used.
+ *
+ * It is NOT legal to take ANY p_lock while holding the accord mutex
+ * (lxpa_lock). If the lxpa_tracees_lock is to be held concurrently with
+ * lxpa_lock, lxpa_lock MUST be taken first and dropped before taking p_lock
+ * of any processes from the tracee list.
+ *
+ * It is NOT legal to take a tracee p_lock and then attempt to enter the
+ * accord mutex (or tracee list mutex) of its tracer. When running as the
+ * tracee LWP, the tracee's hold will prevent the accord from being freed.
+ * Use of the LX_PTF_STOPPING or LX_PTF_CLONING flag in the LWP-specific brand
+ * data prevents an exiting tracer from altering the tracee until the tracee
+ * has come to an orderly stop, without requiring the tracee to hold its own
+ * p_lock the entire time it is stopping.
+ *
+ * It is not safe, in general, to enter "pidlock" while holding the p_lock of
+ * any process. It is similarly illegal to hold any accord locks
+ * (lxpa_lock or lxpa_tracees_lock) while attempting to enter "pidlock".
+ * As "pidlock" is a global mutex, it should be held for the shortest
+ * possible time.
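+ *
+ * Under these rules, the canonical pattern used throughout this file
+ * for taking and dropping an accord hold is:
+ *
+ *       lx_ptrace_accord_enter(accord);
+ *       lx_ptrace_accord_hold(accord);
+ *       lx_ptrace_accord_exit(accord);
+ *
+ *       (use the accord without holding lxpa_lock)
+ *
+ *       lx_ptrace_accord_enter(accord);
+ *       lx_ptrace_accord_rele(accord);
+ *       lx_ptrace_accord_exit(accord);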
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ksynch.h>
+#include <sys/sysmacros.h>
+#include <sys/procfs.h>
+#include <sys/cmn_err.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/wait.h>
+#include <sys/prsystm.h>
+#include <sys/note.h>
+
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_misc.h>
+#include <lx_syscall.h>
+#include <lx_signum.h>
+
+
+typedef enum lx_ptrace_cont_flags_t {
+ LX_PTC_NONE = 0x00,
+ LX_PTC_SYSCALL = 0x01,
+ LX_PTC_SINGLESTEP = 0x02
+} lx_ptrace_cont_flags_t;
+
+
+extern int lx_user_regs_copyin(lx_lwp_data_t *, void *);
+extern int lx_user_regs_copyout(lx_lwp_data_t *, void *);
+extern int lx_ptrace_peekuser(lx_lwp_data_t *, uintptr_t, void *);
+extern int lx_ptrace_pokeuser(lx_lwp_data_t *, uintptr_t, void *);
+extern int lx_user_fpregs_copyin(lx_lwp_data_t *, void *);
+extern int lx_user_fpregs_copyout(lx_lwp_data_t *, void *);
+extern int lx_user_fpxregs_copyin(lx_lwp_data_t *, void *);
+extern int lx_user_fpxregs_copyout(lx_lwp_data_t *, void *);
+
+/*
+ * Macros for checking the state of an LWP via "br_ptrace_flags":
+ */
+#define LX_PTRACE_BUSY \
+ (LX_PTF_EXITING | LX_PTF_STOPPING | LX_PTF_CLONING)
+
+#define VISIBLE(a) (((a)->br_ptrace_flags & LX_PTF_EXITING) == 0)
+#define TRACEE_BUSY(a) (((a)->br_ptrace_flags & LX_PTRACE_BUSY) != 0)
+
+#define ACCORD_HELD(a) MUTEX_HELD(&(a)->lxpa_lock)
+
+#define LX_PID_TO_INIT(x) ((x) == curproc->p_zone->zone_proc_initpid ? \
+ 1 : (x))
+#define LX_INIT_TO_PID(x) ((x) == 1 ? \
+ curproc->p_zone->zone_proc_initpid : (x))
+
+static kcondvar_t lx_ptrace_busy_cv;
+static kmem_cache_t *lx_ptrace_accord_cache;
+
+/*
+ * Enter the accord mutex.
+ */
+static void
+lx_ptrace_accord_enter(lx_ptrace_accord_t *accord)
+{
+ VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock));
+
+ mutex_enter(&accord->lxpa_lock);
+}
+
+/*
+ * Exit the accord mutex. If the reference count has dropped to zero,
+ * free the accord.
+ */
+static void
+lx_ptrace_accord_exit(lx_ptrace_accord_t *accord)
+{
+ VERIFY(ACCORD_HELD(accord));
+
+ if (accord->lxpa_refcnt > 0) {
+ mutex_exit(&accord->lxpa_lock);
+ return;
+ }
+
+ /*
+ * When the reference count drops to zero we must free the accord.
+ */
+ VERIFY(accord->lxpa_tracer == NULL);
+ VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock));
+ VERIFY(list_is_empty(&accord->lxpa_tracees));
+ VERIFY(accord->lxpa_flags & LX_ACC_TOMBSTONE);
+
+ mutex_destroy(&accord->lxpa_lock);
+ mutex_destroy(&accord->lxpa_tracees_lock);
+
+ kmem_cache_free(lx_ptrace_accord_cache, accord);
+}
+
+/*
+ * Drop our reference to this accord. If this drops the reference count
+ * to zero, the next lx_ptrace_accord_exit() will free the accord.
+ */
+static void
+lx_ptrace_accord_rele(lx_ptrace_accord_t *accord)
+{
+ VERIFY(ACCORD_HELD(accord));
+
+ VERIFY(accord->lxpa_refcnt > 0);
+ accord->lxpa_refcnt--;
+}
+
+/*
+ * Place an additional hold on an accord.
+ */
+static void
+lx_ptrace_accord_hold(lx_ptrace_accord_t *accord)
+{
+ VERIFY(ACCORD_HELD(accord));
+
+ accord->lxpa_refcnt++;
+}
+
+/*
+ * Fetch the accord for this LWP. If one has not yet been created, and the
+ * process is not exiting, allocate it now. Must be called with p_lock held
+ * for the process containing the target LWP.
+ *
+ * If successful, we return holding the accord lock (lxpa_lock).
+ */
+static int
+lx_ptrace_accord_get_locked(klwp_t *lwp, lx_ptrace_accord_t **accordp,
+ boolean_t allocate_one)
+{
+ lx_ptrace_accord_t *lxpa;
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ proc_t *p = lwptoproc(lwp);
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+
+ /*
+ * If this LWP does not have an accord, we wish to allocate
+ * and install one.
+ */
+ if ((lxpa = lwpd->br_ptrace_accord) == NULL) {
+ if (!allocate_one || !VISIBLE(lwpd)) {
+ /*
+ * Either we do not wish to allocate an accord, or this
+ * LWP has already begun exiting from a ptrace
+ * perspective.
+ */
+ *accordp = NULL;
+ return (ESRCH);
+ }
+
+ lxpa = kmem_cache_alloc(lx_ptrace_accord_cache, KM_SLEEP);
+ bzero(lxpa, sizeof (*lxpa));
+
+ /*
+ * The initial reference count is 1 because we are referencing
+ * it from the soon-to-be tracer LWP.
+ */
+ lxpa->lxpa_refcnt = 1;
+ mutex_init(&lxpa->lxpa_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&lxpa->lxpa_tracees_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&lxpa->lxpa_tracees, sizeof (lx_lwp_data_t),
+ offsetof(lx_lwp_data_t, br_ptrace_linkage));
+ lxpa->lxpa_cvp = &p->p_cv;
+
+ lxpa->lxpa_tracer = lwpd;
+ lwpd->br_ptrace_accord = lxpa;
+ }
+
+ /*
+ * Lock the accord before returning it to the caller.
+ */
+ lx_ptrace_accord_enter(lxpa);
+
+ /*
+ * There should be at least one active reference to this accord,
+ * otherwise it should have been freed.
+ */
+ VERIFY(lxpa->lxpa_refcnt > 0);
+
+ *accordp = lxpa;
+ return (0);
+}
+
+/*
+ * Accords belong to the tracer LWP. Get the accord for this tracer or return
+ * an error if it was not possible. To prevent deadlocks, the caller MUST NOT
+ * hold p_lock on its own or any other process.
+ *
+ * If successful, we return holding the accord lock (lxpa_lock).
+ */
+static int
+lx_ptrace_accord_get_by_pid(pid_t lxpid, lx_ptrace_accord_t **accordp)
+{
+ int ret = ESRCH;
+ proc_t *aproc;
+ kthread_t *athr;
+ klwp_t *alwp;
+ lx_lwp_data_t *alwpd;
+
+ VERIFY(MUTEX_NOT_HELD(&curproc->p_lock));
+
+ /*
+ * Locate the process containing the tracer LWP based on its Linux pid
+ * and lock it.
+ */
+ if (lx_lpid_lock(lxpid, curzone, LXP_PRLOCK, &aproc, &athr) != 0) {
+ return (ESRCH);
+ }
+
+ /*
+ * Locate the tracer LWP itself and ensure that it is visible to
+ * ptrace(2).
+ */
+ if ((alwp = ttolwp(athr)) == NULL ||
+ (alwpd = lwptolxlwp(alwp)) == NULL ||
+ !VISIBLE(alwpd)) {
+ sprunlock(aproc);
+ return (ESRCH);
+ }
+
+ /*
+ * We should not fetch our own accord this way.
+ */
+ if (athr == curthread) {
+ sprunlock(aproc);
+ return (EPERM);
+ }
+
+ /*
+ * Fetch (or allocate) the accord owned by this tracer LWP:
+ */
+ ret = lx_ptrace_accord_get_locked(alwp, accordp, B_TRUE);
+
+ /*
+ * Unlock the process and return.
+ */
+ sprunlock(aproc);
+ return (ret);
+}
+
+/*
+ * Get (or allocate) the ptrace(2) accord for the current LWP, acting as a
+ * tracer. The caller MUST NOT currently hold p_lock on the process containing
+ * this LWP.
+ *
+ * If successful, we return holding the accord lock (lxpa_lock).
+ */
+static int
+lx_ptrace_accord_get(lx_ptrace_accord_t **accordp, boolean_t allocate_one)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ proc_t *p = lwptoproc(lwp);
+ int ret;
+
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+ /*
+ * Lock the tracer (this LWP).
+ */
+ mutex_enter(&p->p_lock);
+
+ /*
+ * Fetch (or allocate) the accord for this LWP:
+ */
+ ret = lx_ptrace_accord_get_locked(lwp, accordp, allocate_one);
+
+ mutex_exit(&p->p_lock);
+
+ return (ret);
+}
+
+/*
+ * Restart an LWP if it is in "ptrace-stop". This function may induce sleep,
+ * so the caller MUST NOT hold any mutexes other than p_lock for the process
+ * containing the LWP.
+ */
+static void
+lx_ptrace_restart_lwp(klwp_t *lwp)
+{
+ kthread_t *rt = lwptot(lwp);
+ proc_t *rproc = lwptoproc(lwp);
+ lx_lwp_data_t *rlwpd = lwptolxlwp(lwp);
+
+ VERIFY(rt != curthread);
+ VERIFY(MUTEX_HELD(&rproc->p_lock));
+
+ /*
+ * Exclude potential meddling from procfs.
+ */
+ prbarrier(rproc);
+
+ /*
+ * Check that the LWP is still in "ptrace-stop" and, if so, restart it.
+ */
+ thread_lock(rt);
+ if (BSTOPPED(rt) && rt->t_whystop == PR_BRAND) {
+ rt->t_schedflag |= TS_BSTART;
+ setrun_locked(rt);
+
+ /*
+ * Clear stop reason.
+ */
+ rlwpd->br_ptrace_whystop = 0;
+ rlwpd->br_ptrace_whatstop = 0;
+ rlwpd->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND);
+ }
+ thread_unlock(rt);
+}
+
+static void
+lx_ptrace_winfo(lx_lwp_data_t *remote, k_siginfo_t *ip, boolean_t waitflag,
+ pid_t *event_ppid, pid_t *event_pid)
+{
+ int signo;
+
+ /*
+ * Populate our k_siginfo_t with data about this "ptrace-stop"
+ * condition:
+ */
+ bzero(ip, sizeof (*ip));
+ ip->si_signo = SIGCLD;
+ ip->si_pid = LX_PID_TO_INIT(remote->br_pid);
+ ip->si_code = CLD_TRAPPED;
+
+ switch (remote->br_ptrace_whatstop) {
+ case LX_PR_SYSENTRY:
+ case LX_PR_SYSEXIT:
+ ip->si_status = SIGTRAP;
+ if (remote->br_ptrace_options & LX_PTRACE_O_TRACESYSGOOD) {
+ ip->si_status |= 0x80;
+ }
+ break;
+
+ case LX_PR_SIGNALLED:
+ signo = remote->br_ptrace_stopsig;
+ if (signo < 1 || signo >= LX_NSIG) {
+ /*
+ * If this signal number is not valid, pretend it
+ * was a SIGTRAP.
+ */
+ ip->si_status = SIGTRAP;
+ } else {
+ ip->si_status = ltos_signo[signo];
+ }
+ break;
+
+ case LX_PR_EVENT:
+ ip->si_status = SIGTRAP | remote->br_ptrace_event;
+ /*
+ * Record the Linux pid of both this LWP and the create
+ * event we are dispatching. We will use this information
+ * to unblock any subsequent ptrace(2) events that depend
+ * on this one.
+ */
+ if (event_ppid != NULL)
+ *event_ppid = remote->br_pid;
+ if (event_pid != NULL)
+ *event_pid = (pid_t)remote->br_ptrace_eventmsg;
+ break;
+
+ default:
+ cmn_err(CE_PANIC, "unexpected stop subreason: %d",
+ remote->br_ptrace_whatstop);
+ }
+
+ /*
+ * Unless WNOWAIT was specified, mark the event as consumed so
+ * that it is not re-fetched by a subsequent call to waitid().
+ */
+ if (waitflag)
+ remote->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND);
+}
+
+/*
+ * Receive notification from stop() of a PR_BRAND stop.
+ */
+void
+lx_stop_notify(proc_t *p, klwp_t *lwp, ushort_t why, ushort_t what)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ lx_ptrace_accord_t *accord;
+ klwp_t *plwp = NULL;
+ proc_t *pp = NULL;
+ lx_lwp_data_t *parent;
+ boolean_t cldpend = B_TRUE;
+ boolean_t cldpost = B_FALSE;
+ sigqueue_t *sqp = NULL;
+
+ /*
+ * We currently only care about LX-specific stop reasons.
+ */
+ if (why != PR_BRAND)
+ return;
+
+ switch (what) {
+ case LX_PR_SYSENTRY:
+ case LX_PR_SYSEXIT:
+ case LX_PR_SIGNALLED:
+ case LX_PR_EVENT:
+ break;
+ default:
+ cmn_err(CE_PANIC, "unexpected subreason for PR_BRAND"
+ " stop: %d", (int)what);
+ }
+
+ /*
+ * We should be holding the lock on our containing process. The
+ * STOPPING flag should have been set by lx_ptrace_stop() for all
+ * PR_BRAND stops.
+ */
+ VERIFY(MUTEX_HELD(&p->p_lock));
+ VERIFY(lwpd->br_ptrace_flags & LX_PTF_STOPPING);
+ VERIFY((accord = lwpd->br_ptrace_tracer) != NULL);
+
+ /*
+ * We must drop our process lock to take "pidlock". The
+ * LX_PTF_STOPPING flag protects us from an exiting or detaching tracer.
+ */
+ mutex_exit(&p->p_lock);
+
+ /*
+ * Allocate before we enter any mutexes.
+ */
+ sqp = kmem_zalloc(sizeof (*sqp), KM_SLEEP);
+
+ /*
+ * We take pidlock now, which excludes all callers of waitid() and
+ * prevents an exiting tracer from clearing critical accord members.
+ */
+ mutex_enter(&pidlock);
+ mutex_enter(&p->p_lock);
+
+ /*
+ * Get the ptrace(2) "parent" process, to which we may send
+ * a SIGCLD signal later.
+ */
+ if ((parent = accord->lxpa_tracer) != NULL &&
+ (plwp = parent->br_lwp) != NULL) {
+ pp = lwptoproc(plwp);
+ }
+
+ /*
+ * Our tracer should not have been modified in our absence; the
+ * LX_PTF_STOPPING flag prevents it.
+ */
+ VERIFY(lwpd->br_ptrace_tracer == accord);
+
+ /*
+ * Stash data for this stop condition in the LWP data while we hold
+ * both pidlock and our p_lock.
+ */
+ lwpd->br_ptrace_whystop = why;
+ lwpd->br_ptrace_whatstop = what;
+ lwpd->br_ptrace_flags |= LX_PTF_WAITPEND;
+
+ /*
+ * If this event does not depend on an event from the parent LWP,
+ * populate the siginfo_t for the event pending on this tracee LWP.
+ */
+ if (!(lwpd->br_ptrace_flags & LX_PTF_PARENT_WAIT) && pp != NULL) {
+ cldpost = B_TRUE;
+ lx_ptrace_winfo(lwpd, &sqp->sq_info, B_FALSE, NULL, NULL);
+ }
+
+ /*
+ * Drop our p_lock so that we may lock the tracer.
+ */
+ mutex_exit(&p->p_lock);
+ if (cldpost && pp != NULL) {
+ /*
+ * Post the SIGCLD to the tracer.
+ */
+ mutex_enter(&pp->p_lock);
+ if (!sigismember(&pp->p_sig, SIGCLD)) {
+ sigaddqa(pp, plwp->lwp_thread, sqp);
+ cldpend = B_FALSE;
+ sqp = NULL;
+ }
+ mutex_exit(&pp->p_lock);
+ }
+
+ /*
+ * We re-take our process lock now. The lock will be held until
+ * the thread is actually marked stopped, so we will not race with
+ * lx_ptrace_lock_if_stopped() or lx_waitid_helper().
+ */
+ mutex_enter(&p->p_lock);
+
+ /*
+ * We clear the STOPPING flag; stop() continues to hold our p_lock
+ * until our thread stop state is visible.
+ */
+ lwpd->br_ptrace_flags &= ~LX_PTF_STOPPING;
+ lwpd->br_ptrace_flags |= LX_PTF_STOPPED;
+ if (cldpend) {
+ /*
+ * We sent the SIGCLD for this new wait condition already.
+ */
+ lwpd->br_ptrace_flags |= LX_PTF_CLDPEND;
+ }
+
+ /*
+ * If lx_ptrace_exit_tracer(), or a detach operation, is trying to
+ * detach our tracer, it will be sleeping on this CV until
+ * LX_PTF_STOPPING is clear. Wake it now.
+ */
+ cv_broadcast(&lx_ptrace_busy_cv);
+
+ /*
+ * While still holding pidlock, we attempt to wake our tracer from a
+ * potential waitid() slumber.
+ */
+ if (accord->lxpa_cvp != NULL) {
+ cv_broadcast(accord->lxpa_cvp);
+ }
+
+ /*
+ * We release pidlock and return as we were called: with our p_lock
+ * held.
+ */
+ mutex_exit(&pidlock);
+
+ if (sqp != NULL) {
+ kmem_free(sqp, sizeof (*sqp));
+ }
+}
+
+/*
+ * For any restarting action (e.g. PTRACE_CONT, PTRACE_SYSCALL or
+ * PTRACE_DETACH) to be allowed, the tracee LWP must be in "ptrace-stop". This
+ * check must ONLY be run on tracees of the current LWP. If the check is
+ * successful, we return with the tracee p_lock held.
+ *
+ * In the case of PTRACE_DETACH, we can return with the tracee locked even if
+ * it is not in "ptrace-stop". This can happen for various reasons, such as if
+ * the remote process is already job-stopped in the kernel. We must still be
+ * able to detach from this process. We return ENOENT in this case.
+ */
+static int
+lx_ptrace_lock_if_stopped(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote,
+ boolean_t detaching)
+{
+ klwp_t *rlwp = remote->br_lwp;
+ proc_t *rproc = lwptoproc(rlwp);
+ kthread_t *rt = lwptot(rlwp);
+
+ /*
+ * We must never check that we, ourselves, are stopped. We must also
+ * have the accord tracee list locked while we lock our tracees.
+ */
+ VERIFY(curthread != rt);
+ VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock));
+ VERIFY(accord->lxpa_tracer == ttolxlwp(curthread));
+
+ /*
+ * Lock the process containing the tracee LWP.
+ */
+ mutex_enter(&rproc->p_lock);
+ if (!VISIBLE(remote)) {
+ /*
+ * The tracee LWP is currently detaching itself as it exits.
+ * It is no longer visible to ptrace(2).
+ */
+ mutex_exit(&rproc->p_lock);
+ return (ESRCH);
+ }
+
+ /*
+ * We must only check whether tracees of the current LWP are stopped.
+ * We check this condition after confirming visibility as an exiting
+ * tracee may no longer be completely consistent.
+ */
+ VERIFY(remote->br_ptrace_tracer == accord);
+
+ if (!(remote->br_ptrace_flags & LX_PTF_STOPPED)) {
+ if (detaching) {
+ /*
+ * The tracee is not in "ptrace-stop", but we still
+ * return with the locked process. This is indicated
+ * by ENOENT.
+ */
+ return (ENOENT);
+ }
+
+ /*
+ * The tracee is not in "ptrace-stop", so we release the
+ * process.
+ */
+ mutex_exit(&rproc->p_lock);
+ return (ESRCH);
+ }
+
+ /*
+ * The tracee is stopped. We return holding its process lock so that
+ * the caller may manipulate it.
+ */
+ return (0);
+}
+
+static int
+lx_ptrace_setoptions(lx_lwp_data_t *remote, uintptr_t options)
+{
+ /*
+ * Check for valid options.
+ */
+ if ((options & ~LX_PTRACE_O_ALL) != 0) {
+ return (EINVAL);
+ }
+
+ /*
+ * Set ptrace options on the target LWP.
+ */
+ remote->br_ptrace_options = (lx_ptrace_options_t)options;
+
+ return (0);
+}
+
+static int
+lx_ptrace_geteventmsg(lx_lwp_data_t *remote, void *umsgp)
+{
+ int error;
+
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ uint32_t tmp = remote->br_ptrace_eventmsg;
+
+ error = copyout(&tmp, umsgp, sizeof (uint32_t));
+ } else
+#endif
+ {
+ error = copyout(&remote->br_ptrace_eventmsg, umsgp,
+ sizeof (ulong_t));
+ }
+
+ return (error);
+}
+
+static int
+lx_ptrace_getsiginfo(lx_lwp_data_t *remote, void *usiginfo)
+{
+ klwp_t *lwp = remote->br_lwp;
+ int lx_sig;
+
+ lx_sig = lx_stol_signo(lwp->lwp_cursig, 0);
+ if (lx_sig < 1 || lwp->lwp_curinfo == NULL) {
+ return (EINVAL);
+ }
+
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ if (stol_ksiginfo32_copyout(&lwp->lwp_curinfo->sq_info,
+ usiginfo) != 0) {
+ return (EFAULT);
+ }
+ } else
+#endif
+ {
+ if (stol_ksiginfo_copyout(&lwp->lwp_curinfo->sq_info,
+ usiginfo) != 0) {
+ return (EFAULT);
+ }
+ }
+
+ return (0);
+}
+
+
+/*
+ * Implements the PTRACE_CONT subcommand of the Linux ptrace(2) interface.
+ */
+static int
+lx_ptrace_cont(lx_lwp_data_t *remote, lx_ptrace_cont_flags_t flags, int signo)
+{
+ klwp_t *lwp = remote->br_lwp;
+
+ if (flags & LX_PTC_SINGLESTEP) {
+ /*
+ * We do not currently support single-stepping.
+ */
+ lx_unsupported("PTRACE_SINGLESTEP not currently implemented");
+ return (EINVAL);
+ }
+
+ /*
+ * The tracer may choose to suppress the delivery of a signal, or
+ * select an alternative signal for delivery. If this is an
+ * appropriate ptrace(2) "signal-delivery-stop", br_ptrace_stopsig
+ * will be used as the new signal number.
+ *
+ * As with so many other aspects of the Linux ptrace(2) interface, this
+ * may fail silently if the state machine is not aligned correctly.
+ */
+ remote->br_ptrace_stopsig = signo;
+ remote->br_ptrace_donesig = 0;
+
+ /*
+ * Handle the syscall-stop flag if this is a PTRACE_SYSCALL restart:
+ */
+ if (flags & LX_PTC_SYSCALL) {
+ remote->br_ptrace_flags |= LX_PTF_SYSCALL;
+ } else {
+ remote->br_ptrace_flags &= ~LX_PTF_SYSCALL;
+ }
+
+ lx_ptrace_restart_lwp(lwp);
+
+ return (0);
+}
+
+/*
+ * Implements the PTRACE_DETACH subcommand of the Linux ptrace(2) interface.
+ *
+ * The tracee LWP "remote" will, if it is a tracee of the current LWP, be
+ * detached and (optionally) set runnable.
+ */
+static void
+lx_ptrace_detach(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote, int signo,
+ boolean_t restart)
+{
+ klwp_t *rlwp = remote->br_lwp;
+
+ /*
+ * The tracee LWP may have been in "ptrace-stop" (restart is true if
+ * that was the case). We now hold the tracee's p_lock.
+ * Detach the LWP from the accord and set it running.
+ */
+ VERIFY(!TRACEE_BUSY(remote));
+ VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock));
+ remote->br_ptrace_flags &= ~(LX_PTF_SYSCALL | LX_PTF_INHERIT);
+ VERIFY(list_link_active(&remote->br_ptrace_linkage));
+ list_remove(&accord->lxpa_tracees, remote);
+
+ remote->br_ptrace_attach = LX_PTA_NONE;
+ remote->br_ptrace_tracer = NULL;
+ remote->br_ptrace_flags = 0;
+
+ /*
+ * Decrement traced-lwp count for the process.
+ */
+ ASSERT(MUTEX_HELD(&rlwp->lwp_procp->p_lock));
+ VERIFY(ptolxproc(rlwp->lwp_procp)->l_ptrace-- >= 1);
+
+ /*
+ * The tracer may, as described in lx_ptrace_cont(), choose to suppress
+ * or modify the delivered signal.
+ */
+ remote->br_ptrace_stopsig = signo;
+ remote->br_ptrace_donesig = 0;
+
+ if (restart) {
+ lx_ptrace_restart_lwp(rlwp);
+ }
+}
+
+/*
+ * This routine implements the PTRACE_ATTACH operation of the Linux ptrace(2)
+ * interface.
+ *
+ * This LWP is requesting to be attached as a tracer to another LWP -- the
+ * tracee. If a ptrace accord to track the list of tracees has not yet been
+ * allocated, one will be allocated and attached to this LWP now.
+ *
+ * The "br_ptrace_tracer" on the tracee LWP is set to this accord, and the
+ * tracee LWP is then added to the "lxpa_tracees" list in the accord. We drop
+ * locks between these two phases; the only consumer of trace events from this
+ * accord is this LWP, which obviously cannot be running waitpid(2) at the same
+ * time as this call to ptrace(2).
+ */
+static int
+lx_ptrace_attach(pid_t lx_pid)
+{
+ int error = ESRCH;
+ /*
+ * Our (Tracer) LWP:
+ */
+ lx_ptrace_accord_t *accord;
+ lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+ /*
+ * Remote (Tracee) LWP:
+ */
+ proc_t *rproc;
+ kthread_t *rthr;
+ klwp_t *rlwp;
+ lx_lwp_data_t *rlwpd;
+
+ if (lwpd->br_pid == lx_pid) {
+ /*
+ * We cannot trace ourselves.
+ */
+ return (EPERM);
+ }
+
+ /*
+ * Ensure that we have an accord and obtain a lock on it. This
+ * routine should not fail because the LWP cannot make ptrace(2) system
+ * calls after it has begun exiting.
+ */
+ VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING);
+ VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0);
+
+ /*
+ * Place speculative hold in case the attach is successful.
+ */
+ lx_ptrace_accord_hold(accord);
+ lx_ptrace_accord_exit(accord);
+
+ /*
+ * Locate the process containing the tracee LWP based on its Linux pid
+ * and lock it.
+ */
+ if (lx_lpid_lock(lx_pid, curzone, LXP_PRLOCK, &rproc, &rthr) != 0) {
+ /*
+ * We could not find the target process.
+ */
+ goto errout;
+ }
+
+ /*
+ * Locate the tracee LWP.
+ */
+ if ((rlwp = ttolwp(rthr)) == NULL ||
+ (rlwpd = lwptolxlwp(rlwp)) == NULL ||
+ !VISIBLE(rlwpd)) {
+ /*
+ * The LWP could not be found, was not branded, or is not
+ * visible to ptrace(2) at this time.
+ */
+ goto unlock_errout;
+ }
+
+ /*
+ * We now hold the lock on the tracee. Attempt to install ourselves
+ * as the tracer.
+ */
+ if (curproc != rproc && priv_proc_cred_perm(curproc->p_cred, rproc,
+ NULL, VWRITE) != 0) {
+ /*
+ * This process does not have permission to trace the remote
+ * process.
+ */
+ error = EPERM;
+ } else if (rlwpd->br_ptrace_tracer != NULL) {
+ /*
+ * This LWP is already being traced.
+ */
+ VERIFY(list_link_active(&rlwpd->br_ptrace_linkage));
+ VERIFY(rlwpd->br_ptrace_attach != LX_PTA_NONE);
+ error = EPERM;
+ } else {
+ lx_proc_data_t *rprocd = ptolxproc(rproc);
+
+ /*
+ * Bond the tracee to the accord.
+ */
+ VERIFY0(rlwpd->br_ptrace_flags & LX_PTF_EXITING);
+ VERIFY(rlwpd->br_ptrace_attach == LX_PTA_NONE);
+ rlwpd->br_ptrace_attach = LX_PTA_ATTACH;
+ rlwpd->br_ptrace_tracer = accord;
+
+ /* Don't emit a ptrace syscall-exit-stop event on kernel exit. */
+ rlwpd->br_ptrace_flags |= LX_PTF_NOSTOP;
+
+ /*
+ * We had no tracer, and are thus not in the tracees list.
+ * It is safe to take the tracee list lock while we insert
+ * ourselves.
+ */
+ mutex_enter(&accord->lxpa_tracees_lock);
+ VERIFY(!list_link_active(&rlwpd->br_ptrace_linkage));
+ list_insert_tail(&accord->lxpa_tracees, rlwpd);
+ /*
+ * Bump traced-lwp count for the remote process.
+ */
+ rprocd->l_ptrace++;
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ /*
+ * Send a thread-directed SIGSTOP.
+ */
+ sigtoproc(rproc, rthr, SIGSTOP);
+
+ error = 0;
+ }
+
+unlock_errout:
+ /*
+ * Unlock the process containing the tracee LWP and the accord.
+ */
+ sprunlock(rproc);
+
+errout:
+ if (error != 0) {
+ /*
+ * The attach was not successful. Remove our speculative
+ * hold.
+ */
+ lx_ptrace_accord_enter(accord);
+ lx_ptrace_accord_rele(accord);
+ lx_ptrace_accord_exit(accord);
+ }
+
+ return (error);
+}
+
+int
+lx_ptrace_set_clone_inherit(int option, boolean_t inherit_flag)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ proc_t *p = lwptoproc(lwp);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+ switch (option) {
+ case LX_PTRACE_O_TRACEFORK:
+ case LX_PTRACE_O_TRACEVFORK:
+ case LX_PTRACE_O_TRACECLONE:
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ mutex_enter(&p->p_lock);
+
+ lwpd->br_ptrace_clone_option = option;
+
+ if (inherit_flag) {
+ lwpd->br_ptrace_flags |= LX_PTF_INHERIT;
+ } else {
+ lwpd->br_ptrace_flags &= ~LX_PTF_INHERIT;
+ }
+
+ mutex_exit(&p->p_lock);
+ return (0);
+}
+
+/*
+ * If the parent LWP is being traced, we want to attach ourselves to the
+ * same accord.
+ */
+void
+lx_ptrace_inherit_tracer(lx_lwp_data_t *src, lx_lwp_data_t *dst)
+{
+ proc_t *srcp = lwptoproc(src->br_lwp);
+ proc_t *dstp = lwptoproc(dst->br_lwp);
+ lx_ptrace_accord_t *accord;
+ boolean_t is_fork = B_FALSE;
+
+ VERIFY(MUTEX_HELD(&dstp->p_lock));
+ if (srcp != dstp) {
+ /*
+ * In the case of being called via forklwp, some lock shuffling
+ * is required. The destination p_lock must be dropped to
+ * avoid deadlocks when locking the source and manipulating
+ * ptrace accord resources.
+ */
+ is_fork = B_TRUE;
+ sprlock_proc(dstp);
+ mutex_exit(&dstp->p_lock);
+ mutex_enter(&srcp->p_lock);
+ }
+
+ if ((accord = src->br_ptrace_tracer) == NULL) {
+ /*
+ * The source LWP does not have a tracer to inherit.
+ */
+ goto out;
+ }
+
+ /*
+ * There are two conditions to check when determining if the new
+ * child should inherit the same tracer (and tracing options) as its
+ * parent. Either condition is sufficient to trigger inheritance.
+ */
+ dst->br_ptrace_attach = LX_PTA_NONE;
+ if ((src->br_ptrace_options & src->br_ptrace_clone_option) != 0) {
+ /*
+ * Condition 1:
+ * The clone(2), fork(2) and vfork(2) emulated system calls
+ * populate "br_ptrace_clone_option" with the specific
+ * ptrace(2) SETOPTIONS option that applies to this
+ * operation. If the relevant option has been enabled by the
+ * tracer then we inherit.
+ */
+ dst->br_ptrace_attach |= LX_PTA_INHERIT_OPTIONS;
+
+ } else if ((src->br_ptrace_flags & LX_PTF_INHERIT) != 0) {
+ /*
+ * Condition 2:
+ * If the caller opted in to inheritance with the
+ * CLONE_PTRACE flag to clone(2), the LX_PTF_INHERIT flag
+ * will be set and we inherit.
+ */
+ dst->br_ptrace_attach |= LX_PTA_INHERIT_CLONE;
+ }
+
+ /*
+ * These values only apply for the duration of a single clone(2), et
+ * al, system call.
+ */
+ src->br_ptrace_flags &= ~LX_PTF_INHERIT;
+ src->br_ptrace_clone_option = 0;
+
+ if (dst->br_ptrace_attach == LX_PTA_NONE) {
+ /*
+ * No condition triggered inheritance.
+ */
+ goto out;
+ }
+
+ /*
+ * Set the LX_PTF_CLONING flag to prevent us from being detached
+ * while our p_lock is dropped.
+ */
+ src->br_ptrace_flags |= LX_PTF_CLONING;
+ mutex_exit(&srcp->p_lock);
+
+ /*
+ * Hold the accord for the new LWP.
+ */
+ lx_ptrace_accord_enter(accord);
+ lx_ptrace_accord_hold(accord);
+ lx_ptrace_accord_exit(accord);
+
+ /*
+ * Install the tracer and copy the current PTRACE_SETOPTIONS options.
+ */
+ dst->br_ptrace_tracer = accord;
+ dst->br_ptrace_options = src->br_ptrace_options;
+
+ /*
+ * This flag prevents waitid() from seeing events for the new child
+ * until the parent is able to post the relevant ptrace event to
+ * the tracer.
+ */
+ dst->br_ptrace_flags |= LX_PTF_PARENT_WAIT;
+
+ mutex_enter(&accord->lxpa_tracees_lock);
+ VERIFY(list_link_active(&src->br_ptrace_linkage));
+ VERIFY(!list_link_active(&dst->br_ptrace_linkage));
+ list_insert_tail(&accord->lxpa_tracees, dst);
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ /*
+ * Relock our process and clear our busy flag.
+ */
+ mutex_enter(&srcp->p_lock);
+ src->br_ptrace_flags &= ~LX_PTF_CLONING;
+
+ /*
+ * Bump traced-lwp count for the process.
+ */
+ ptolxproc(dstp)->l_ptrace++;
+
+ /*
+ * If lx_ptrace_exit_tracer(), or a detach operation, is trying to
+ * detach our tracer, it will be sleeping on this CV until
+ * LX_PTF_CLONING is clear. Wake it now.
+ */
+ cv_broadcast(&lx_ptrace_busy_cv);
+
+out:
+ if (is_fork) {
+ mutex_exit(&srcp->p_lock);
+ mutex_enter(&dstp->p_lock);
+ sprunprlock(dstp);
+ }
+}
+
+static int
+lx_ptrace_traceme(void)
+{
+ int error;
+ boolean_t did_attach = B_FALSE;
+ /*
+ * Our (Tracee) LWP:
+ */
+ klwp_t *lwp = ttolwp(curthread);
+ proc_t *p = lwptoproc(lwp);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ /*
+ * Remote (Tracer) LWP:
+ */
+ lx_ptrace_accord_t *accord;
+
+ /*
+ * We are intending to be the tracee. Fetch (or allocate) the accord
+ * for our parent LWP.
+ */
+ if ((error = lx_ptrace_accord_get_by_pid(lx_lwp_ppid(lwp, NULL,
+ NULL), &accord)) != 0) {
+ /*
+ * Could not determine the Linux pid of the parent LWP, or
+ * could not get the accord for that LWP.
+ */
+ return (error);
+ }
+
+ /*
+ * We now hold the accord lock.
+ */
+ if (accord->lxpa_flags & LX_ACC_TOMBSTONE) {
+ /*
+ * The accord is marked for death; give up now.
+ */
+ lx_ptrace_accord_exit(accord);
+ return (ESRCH);
+ }
+
+ /*
+ * Bump the reference count so that the accord is not freed. We need
+ * to drop the accord lock before we take our own p_lock.
+ */
+ lx_ptrace_accord_hold(accord);
+ lx_ptrace_accord_exit(accord);
+
+ /*
+ * We now lock _our_ process and determine if we can install our parent
+ * as our tracer.
+ */
+ mutex_enter(&p->p_lock);
+ if (lwpd->br_ptrace_tracer != NULL) {
+ /*
+ * This LWP is already being traced.
+ */
+ VERIFY(lwpd->br_ptrace_attach != LX_PTA_NONE);
+ error = EPERM;
+ } else {
+ /*
+ * Bond ourselves to the accord. We already bumped the accord
+ * reference count.
+ */
+ VERIFY(lwpd->br_ptrace_attach == LX_PTA_NONE);
+ lwpd->br_ptrace_attach = LX_PTA_TRACEME;
+ lwpd->br_ptrace_tracer = accord;
+ did_attach = B_TRUE;
+ error = 0;
+
+ /*
+ * Speculatively bump l_ptrace now before dropping p_lock.
+ * It will be reverted if the tracee attachment fails.
+ */
+ ptolxproc(p)->l_ptrace++;
+ }
+ mutex_exit(&p->p_lock);
+
+ /*
+ * Lock the accord tracee list and add this LWP. Once we are in the
+ * tracee list, it is the responsibility of the tracer to detach us.
+ */
+ if (error == 0) {
+ lx_ptrace_accord_enter(accord);
+ mutex_enter(&accord->lxpa_tracees_lock);
+
+ if (!(accord->lxpa_flags & LX_ACC_TOMBSTONE)) {
+ /*
+ * Put ourselves in the tracee list for this accord.
+ */
+ VERIFY(!list_link_active(&lwpd->br_ptrace_linkage));
+ list_insert_tail(&accord->lxpa_tracees, lwpd);
+ mutex_exit(&accord->lxpa_tracees_lock);
+ lx_ptrace_accord_exit(accord);
+
+ return (0);
+ }
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ /*
+ * The accord has been marked for death. We must
+ * untrace ourselves.
+ */
+ error = ESRCH;
+ lx_ptrace_accord_exit(accord);
+
+ /*
+ * Undo speculative increment of ptracer count.
+ */
+ mutex_enter(&p->p_lock);
+ ptolxproc(p)->l_ptrace--;
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * Our optimism was unjustified: We were unable to attach. We need to
+ * lock the process containing this LWP again in order to remove the
+ * tracer.
+ */
+ VERIFY(error != 0);
+ mutex_enter(&p->p_lock);
+ if (did_attach) {
+ /*
+ * Verify that things were as we left them:
+ */
+ VERIFY(!list_link_active(&lwpd->br_ptrace_linkage));
+ VERIFY(lwpd->br_ptrace_tracer == accord);
+
+ lwpd->br_ptrace_attach = LX_PTA_NONE;
+ lwpd->br_ptrace_tracer = NULL;
+ }
+ mutex_exit(&p->p_lock);
+
+ /*
+ * Remove our speculative hold on the accord, possibly causing it to be
+ * freed in the process.
+ */
+ lx_ptrace_accord_enter(accord);
+ lx_ptrace_accord_rele(accord);
+ lx_ptrace_accord_exit(accord);
+
+ return (error);
+}
+
+static boolean_t
+lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what)
+{
+ boolean_t reset_nostop = B_FALSE;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+
+ /*
+ * Mark this LWP as stopping and call stop() to enter "ptrace-stop".
+ */
+ VERIFY0(lwpd->br_ptrace_flags & LX_PTF_STOPPING);
+ lwpd->br_ptrace_flags |= LX_PTF_STOPPING;
+
+ if (lwpd->br_lwp->lwp_nostop == 1 &&
+ lwpd->br_ptrace_event == LX_PTRACE_EVENT_EXEC) {
+ /* We need to clear this to get the signal delivered. */
+ lwpd->br_lwp->lwp_nostop = 0;
+ reset_nostop = B_TRUE;
+ }
+
+ stop(PR_BRAND, what);
+
+ if (reset_nostop) {
+ VERIFY(lwpd->br_lwp->lwp_nostop == 0);
+ lwpd->br_lwp->lwp_nostop = 1;
+ }
+
+ /*
+ * We are back from "ptrace-stop" with our process lock held.
+ */
+ lwpd->br_ptrace_flags &= ~(LX_PTF_STOPPING | LX_PTF_STOPPED |
+ LX_PTF_CLDPEND);
+ lwpd->br_ptrace_stopucp = NULL;
+ cv_broadcast(&lx_ptrace_busy_cv);
+ mutex_exit(&p->p_lock);
+
+ return (B_TRUE);
+}
+
+int
+lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg,
+ uintptr_t ucp)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = lwptoproc(lwp);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+ mutex_enter(&p->p_lock);
+ if (lwpd->br_ptrace_tracer == NULL) {
+ mutex_exit(&p->p_lock);
+ return (ESRCH);
+ }
+
+ if (!child) {
+ /*
+ * Only the first event posted by a new process is to be held
+ * until the matching parent event is dispatched, and only if
+ * it is a "child" event. This is not a child event, so we
+ * clear the wait flag.
+ */
+ lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT;
+
+ } else if (option == LX_PTRACE_O_TRACEVFORK) {
+ /*
+ * For a child, we have to handle vfork as a special case. In
+ * lx_ptrace_inherit_tracer() we set LX_PTF_PARENT_WAIT to
+ * force events to be delayed until the parent posts its event.
+ * This flag is cleared in lx_waitid_helper() to enforce a
+ * "happens after" relationship. However, this obviously cannot
+ * work for the vfork case. Thus, we clear our flag now so that
+ * we can deliver the signal in lx_stop_notify(), if necessary.
+ */
+ lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT;
+ }
+
+ if (!(lwpd->br_ptrace_options & option)) {
+ if (option == LX_PTRACE_O_TRACEEXEC) {
+ /*
+ * Without PTRACE_O_TRACEEXEC, the Linux kernel will
+ * send SIGTRAP to the process.
+ */
+ sigtoproc(p, t, SIGTRAP);
+ mutex_exit(&p->p_lock);
+ return (0);
+ }
+
+ /*
+ * The flag for this trace event is not enabled, so we will not
+ * stop.
+ */
+ mutex_exit(&p->p_lock);
+ return (ESRCH);
+ }
+
+ if (child) {
+ switch (option) {
+ case LX_PTRACE_O_TRACECLONE:
+ case LX_PTRACE_O_TRACEFORK:
+ case LX_PTRACE_O_TRACEVFORK:
+ /*
+ * Send the child LWP a directed SIGSTOP.
+ */
+ sigtoproc(p, t, SIGSTOP);
+ mutex_exit(&p->p_lock);
+ return (0);
+ default:
+ goto nostop;
+ }
+ }
+
+ lwpd->br_ptrace_eventmsg = msg;
+
+ switch (option) {
+ case LX_PTRACE_O_TRACECLONE:
+ lwpd->br_ptrace_event = LX_PTRACE_EVENT_CLONE;
+ break;
+ case LX_PTRACE_O_TRACEEXEC:
+ lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXEC;
+ lwpd->br_ptrace_eventmsg = 0;
+ break;
+ case LX_PTRACE_O_TRACEEXIT:
+ lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXIT;
+ break;
+ case LX_PTRACE_O_TRACEFORK:
+ lwpd->br_ptrace_event = LX_PTRACE_EVENT_FORK;
+ break;
+ case LX_PTRACE_O_TRACEVFORK:
+ lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK;
+ break;
+ case LX_PTRACE_O_TRACEVFORKDONE:
+ lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK_DONE;
+ lwpd->br_ptrace_eventmsg = 0;
+ break;
+ default:
+ goto nostop;
+ }
+
+ /*
+ * Userland may have passed in a ucontext_t pointer for
+ * PTRACE_GETREGS/PTRACE_SETREGS usage while stopped.
+ */
+ lwpd->br_ptrace_stopucp = ucp;
+
+ /*
+ * p_lock for the process containing the tracee will be dropped by
+ * lx_ptrace_stop_common().
+ */
+ return (lx_ptrace_stop_common(p, lwpd, LX_PR_EVENT) ? 0 : ESRCH);
+
+nostop:
+ lwpd->br_ptrace_event = 0;
+ lwpd->br_ptrace_eventmsg = 0;
+ mutex_exit(&p->p_lock);
+ return (ESRCH);
+}
+
+boolean_t
+lx_ptrace_stop(ushort_t what)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ proc_t *p = lwptoproc(lwp);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+ VERIFY(what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT ||
+ what == LX_PR_SIGNALLED);
+
+ /*
+ * If we do not have an accord, bail out early.
+ */
+ if (lwpd->br_ptrace_tracer == NULL)
+ return (B_FALSE);
+
+ /*
+ * Lock this process and re-check the condition.
+ */
+ mutex_enter(&p->p_lock);
+
+ /*
+ * The child of a fork/clone does not emit a syscall-exit-stop event.
+ */
+ if (what == LX_PR_SYSEXIT && (lwpd->br_ptrace_flags & LX_PTF_NOSTOP)) {
+ lwpd->br_ptrace_flags &= ~LX_PTF_NOSTOP;
+ mutex_exit(&p->p_lock);
+ return (B_FALSE);
+ }
+
+ if (lwpd->br_ptrace_tracer == NULL) {
+ VERIFY0(lwpd->br_ptrace_flags & LX_PTF_SYSCALL);
+ mutex_exit(&p->p_lock);
+ return (B_FALSE);
+ }
+
+ if (what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT) {
+ if (what == LX_PR_SYSENTRY) {
+ lwpd->br_ptrace_flags |= LX_PTF_INSYSCALL;
+ } else {
+ lwpd->br_ptrace_flags &= ~LX_PTF_INSYSCALL;
+ }
+
+ /*
+ * This is a syscall-entry-stop or syscall-exit-stop point.
+ */
+ if (!(lwpd->br_ptrace_flags & LX_PTF_SYSCALL)) {
+ /*
+ * A system call stop has not been requested.
+ */
+ mutex_exit(&p->p_lock);
+ return (B_FALSE);
+ }
+
+ /*
+ * The PTRACE_SYSCALL restart command applies only to the next
+ * system call entry or exit. The tracer must restart us with
+ * PTRACE_SYSCALL while we are in ptrace-stop for us to fire
+ * again at the next system call boundary.
+ */
+ lwpd->br_ptrace_flags &= ~LX_PTF_SYSCALL;
+ }
+
+ /*
+ * p_lock for the process containing the tracee will be dropped by
+ * lx_ptrace_stop_common().
+ */
+ return (lx_ptrace_stop_common(p, lwpd, what));
+}
+
+/*
+ * In addition to performing the ptrace "signal-delivery-stop" handling,
+ * this function is also used to block signals from being delivered.
+ *
+ * Return 0 if issig_forreal() should continue on, -1 if it should recheck
+ * after we have made changes, or 1 if it should stop checking signals.
+ */
+int
+lx_ptrace_issig_stop(proc_t *p, klwp_t *lwp)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ int lx_sig;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+
+ if (ptolxproc(p)->l_block_all_signals != 0)
+ return (1);
+
+ /*
+ * In very rare circumstances, a process which is almost completely
+ * through proc_exit() may incur issig checks in the current thread via
+ * clean-up actions. The process will still be branded, but the thread
+ * will have already been stripped of any LX-specific data on its way
+ * to the grave. Bail early if the brand data is missing.
+ */
+ if (lwpd == NULL) {
+ return (0);
+ }
+
+ /*
+ * If we do not have an accord, bail out now. Additionally, if there
+ * is no valid signal then we have no reason to stop.
+ */
+ if (lwpd->br_ptrace_tracer == NULL || lwp->lwp_cursig == SIGKILL ||
+ (lwp->lwp_cursig == 0 || lwp->lwp_cursig > NSIG) ||
+ (lx_sig = stol_signo[lwp->lwp_cursig]) < 1) {
+ if (lwp->lwp_cursig == 0) {
+ /*
+ * If this lwp has no current signal, any signal
+ * ignorance enabled via br_ptrace_donesig has already
+ * taken effect (the signal was consumed). Clear
+ * donesig so that no further signals are ignored on
+ * its account.
+ */
+ lwpd->br_ptrace_donesig = 0;
+ }
+ return (0);
+ }
+
+ /*
+ * We can't deliver the signal-delivery-stop condition while we're
+ * between the syscall-enter-stop and syscall-exit-stop conditions.
+ * We must first let the signal interrupt the in-progress syscall, let
+ * it emit syscall-exit-stop with the interrupted result, then we'll
+ * come back here to emit signal-delivery-stop.
+ */
+ if (lwpd->br_ptrace_flags & LX_PTF_INSYSCALL) {
+ return (0);
+ }
+
+ /*
+ * We stash the signal on the LWP where our waitid_helper will find it
+ * and enter the ptrace "signal-delivery-stop" condition.
+ */
+ lwpd->br_ptrace_stopsig = lx_sig;
+ lwpd->br_ptrace_donesig = 0;
+ (void) lx_ptrace_stop_common(p, lwpd, LX_PR_SIGNALLED);
+ mutex_enter(&p->p_lock);
+
+ /*
+ * When we return, the signal may have been altered or suppressed.
+ */
+ if (lwpd->br_ptrace_stopsig != lx_sig) {
+ int native_sig;
+ lx_sig = lwpd->br_ptrace_stopsig;
+
+ if (lx_sig >= LX_NSIG) {
+ lx_sig = 0;
+ }
+
+ /*
+ * Translate signal from Linux signal number back to
+ * an illumos native signal.
+ */
+ if (lx_sig >= LX_NSIG || lx_sig < 0 || (native_sig =
+ ltos_signo[lx_sig]) < 1) {
+ /*
+ * The signal is not deliverable.
+ */
+ lwp->lwp_cursig = 0;
+ lwp->lwp_extsig = 0;
+ if (lwp->lwp_curinfo) {
+ siginfofree(lwp->lwp_curinfo);
+ lwp->lwp_curinfo = NULL;
+ }
+ } else {
+ /*
+ * Alter the currently dispatching signal.
+ */
+ if (native_sig == SIGKILL) {
+ /*
+ * We mark ourselves the victim and request
+ * a restart of signal processing.
+ */
+ p->p_flag |= SKILLED;
+ p->p_flag &= ~SEXTKILLED;
+ return (-1);
+ }
+ lwp->lwp_cursig = native_sig;
+ lwp->lwp_extsig = 0;
+ if (lwp->lwp_curinfo != NULL) {
+ lwp->lwp_curinfo->sq_info.si_signo = native_sig;
+ }
+ }
+ }
+
+ lwpd->br_ptrace_donesig = lwp->lwp_cursig;
+ lwpd->br_ptrace_stopsig = 0;
+ return (0);
+}
+
+boolean_t
+lx_ptrace_sig_ignorable(proc_t *p, klwp_t *lwp, int sig)
+{
+ lx_proc_data_t *lxpd = ptolxproc(p);
+
+ /*
+ * Ignored signals and ptrace:
+ *
+ * When a process is being ptraced by another, special care is needed
+ * while handling signals. Since the tracer is interested in all
+ * signals sent to the tracee, an effort must be made to initially
+ * bypass signal ignorance logic. This allows the signal to be placed
+ * in the tracee's sigqueue to be inspected and potentially altered by
+ * the tracer.
+ *
+ * A critical detail in this procedure is how a signal is handled after
+ * the tracer has completed processing for the event. If the signal would
+ * have been ignored, were it not for the initial ptrace override, then
+ * lx_ptrace_sig_ignorable must report B_TRUE when the tracee is
+ * restarted and resumes signal processing. This is done by recording
+ * the most recent tracee signal consumed by ptrace.
+ */
+
+ if (lxpd->l_ptrace != 0 && lx_stol_signo(sig, 0) != 0) {
+ /*
+ * This process is being ptraced. Bypass signal ignorance for
+ * anything that maps to a valid Linux signal...
+ */
+ if (lwp != NULL && lwptolxlwp(lwp)->br_ptrace_donesig == sig) {
+ /*
+ * ...Unless it is a signal which has already been
+ * processed by the tracer.
+ */
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+static void
+lx_ptrace_exit_tracer(proc_t *p, lx_lwp_data_t *lwpd,
+ lx_ptrace_accord_t *accord)
+{
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+ lx_ptrace_accord_enter(accord);
+ /*
+ * Mark this accord for death. This means no new tracees can be
+ * attached to this accord.
+ */
+ VERIFY0(accord->lxpa_flags & LX_ACC_TOMBSTONE);
+ accord->lxpa_flags |= LX_ACC_TOMBSTONE;
+ lx_ptrace_accord_exit(accord);
+
+ /*
+ * Walk the list of tracees, detaching them and setting them runnable
+ * if they are stopped.
+ */
+ for (;;) {
+ klwp_t *rlwp;
+ proc_t *rproc;
+ lx_lwp_data_t *remote;
+ kmutex_t *rmp;
+
+ mutex_enter(&accord->lxpa_tracees_lock);
+ if (list_is_empty(&accord->lxpa_tracees)) {
+ mutex_exit(&accord->lxpa_tracees_lock);
+ break;
+ }
+
+ /*
+ * Fetch the first tracee LWP in the list and lock the process
+ * which contains it.
+ */
+ remote = list_head(&accord->lxpa_tracees);
+ rlwp = remote->br_lwp;
+ rproc = lwptoproc(rlwp);
+ /*
+ * The p_lock mutex persists beyond the life of the process
+ * itself. We save the address here to avoid the need to
+ * dereference the proc_t after waking from sleep.
+ */
+ rmp = &rproc->p_lock;
+ mutex_enter(rmp);
+
+ if (TRACEE_BUSY(remote)) {
+ /*
+ * This LWP is currently detaching itself on exit, or
+ * mid-way through stop(). We must wait for this
+ * action to be completed. While we wait on the CV, we
+ * must drop the accord tracee list lock.
+ */
+ mutex_exit(&accord->lxpa_tracees_lock);
+ cv_wait(&lx_ptrace_busy_cv, rmp);
+
+ /*
+ * While we were waiting, some state may have changed.
+ * Restart the walk to be sure we don't miss anything.
+ */
+ mutex_exit(rmp);
+ continue;
+ }
+
+ /*
+ * We now hold p_lock on the process. Remove the tracee from
+ * the list.
+ */
+ VERIFY(list_link_active(&remote->br_ptrace_linkage));
+ list_remove(&accord->lxpa_tracees, remote);
+
+ /*
+ * Unlink the accord and clear our trace flags.
+ */
+ remote->br_ptrace_attach = LX_PTA_NONE;
+ remote->br_ptrace_tracer = NULL;
+ remote->br_ptrace_flags = 0;
+
+ /*
+ * Let go of the list lock before we restart the LWP. We must
+ * not hold any locks other than the process p_lock when
+ * we call lx_ptrace_restart_lwp() as it will thread_lock
+ * the tracee.
+ */
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ /*
+ * Decrement traced-lwp count for the remote process.
+ */
+ VERIFY(ptolxproc(rproc)->l_ptrace-- >= 1);
+
+ /*
+ * Ensure that the LWP is not stopped on our account.
+ */
+ lx_ptrace_restart_lwp(rlwp);
+
+ /*
+ * Unlock the former tracee.
+ */
+ mutex_exit(rmp);
+
+ /*
+ * Drop the hold this tracee had on the accord.
+ */
+ lx_ptrace_accord_enter(accord);
+ lx_ptrace_accord_rele(accord);
+ lx_ptrace_accord_exit(accord);
+ }
+
+ mutex_enter(&p->p_lock);
+ lwpd->br_ptrace_accord = NULL;
+ mutex_exit(&p->p_lock);
+
+ /*
+ * Clean up and release our hold on the accord. If we completely
+ * detached all tracee LWPs, this will free the accord. Otherwise, it
+ * will be freed when they complete their cleanup.
+ *
+ * We hold "pidlock" while clearing these members for easy exclusion of
+ * waitid(), etc.
+ */
+ mutex_enter(&pidlock);
+ lx_ptrace_accord_enter(accord);
+ accord->lxpa_cvp = NULL;
+ accord->lxpa_tracer = NULL;
+ mutex_exit(&pidlock);
+ lx_ptrace_accord_rele(accord);
+ lx_ptrace_accord_exit(accord);
+}
+
+static void
+lx_ptrace_exit_tracee(proc_t *p, lx_lwp_data_t *lwpd)
+{
+ lx_ptrace_accord_t *accord;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+
+ /*
+ * Be careful in the face of detaching and attaching tracers.
+ * lwpd->br_ptrace_tracer is modified only when p->p_lock is held. Lock
+ * ordering says that accord->lxpa_tracees_lock must be taken prior to
+ * p->p_lock, so we have to get a reference to the accord and hold it
+ * across dropping p->p_lock.
+ *
+ * In the face of a tracer going away and a new one coming in, we may
+ * take a lap.
+ */
+again:
+ if ((accord = lwpd->br_ptrace_tracer) == NULL) {
+ return;
+ }
+ lx_ptrace_accord_enter(accord);
+ lx_ptrace_accord_hold(accord);
+ lx_ptrace_accord_exit(accord);
+ mutex_exit(&p->p_lock);
+
+ /*
+ * We are the tracee LWP. Lock the accord tracee list and then our
+ * containing process.
+ */
+ mutex_enter(&accord->lxpa_tracees_lock);
+ mutex_enter(&p->p_lock);
+
+ /*
+ * Be sure that the accord currently associated with the lwp is the one
+ * for which we are holding lxpa_tracees_lock.
+ */
+ if (lwpd->br_ptrace_tracer != accord) {
+ mutex_exit(&p->p_lock);
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ lx_ptrace_accord_enter(accord);
+ lx_ptrace_accord_rele(accord);
+ lx_ptrace_accord_exit(accord);
+
+ mutex_enter(&p->p_lock);
+
+ goto again;
+ }
+
+ /*
+ * Remove our reference to the accord. We will release our hold
+ * later.
+ */
+ lwpd->br_ptrace_attach = LX_PTA_NONE;
+ lwpd->br_ptrace_tracer = NULL;
+
+ /*
+ * Remove this LWP from the accord tracee list:
+ */
+ VERIFY(list_link_active(&lwpd->br_ptrace_linkage));
+ list_remove(&accord->lxpa_tracees, lwpd);
+
+ /*
+ * Wake up any tracers waiting for us to detach from the accord.
+ */
+ cv_broadcast(&lx_ptrace_busy_cv);
+
+ /*
+ * Decrement traced-lwp count for the process.
+ */
+ VERIFY(ptolxproc(p)->l_ptrace-- >= 1);
+
+ mutex_exit(&p->p_lock);
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ /*
+ * Grab "pidlock" and wake the tracer if it is blocked in waitid().
+ */
+ mutex_enter(&pidlock);
+ if (accord->lxpa_cvp != NULL) {
+ cv_broadcast(accord->lxpa_cvp);
+ }
+ mutex_exit(&pidlock);
+
+ /*
+ * Release the holds on the accord. One is the hold taken earlier in
+ * this function and the other is lwpd's hold.
+ */
+ lx_ptrace_accord_enter(accord);
+ lx_ptrace_accord_rele(accord);
+ lx_ptrace_accord_rele(accord);
+ lx_ptrace_accord_exit(accord);
+
+ mutex_enter(&p->p_lock);
+}
+
+/*
+ * This routine is called from lx_exitlwp() when an LWP is ready to exit. If
+ * this LWP is being traced, it will be detached from the tracer's accord. The
+ * routine will also detach any LWPs being traced by this LWP.
+ */
+void
+lx_ptrace_exit(proc_t *p, klwp_t *lwp)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ lx_ptrace_accord_t *accord;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+
+ /*
+ * Mark our LWP as exiting from a ptrace perspective. This will
+ * prevent a new accord from being allocated if one does not exist
+ * already, and will make us invisible to PTRACE_ATTACH/PTRACE_TRACEME.
+ */
+ VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING);
+ lwpd->br_ptrace_flags |= LX_PTF_EXITING;
+
+ if (lwpd->br_ptrace_tracer != NULL) {
+ /*
+ * We are traced by another LWP and must detach ourselves.
+ */
+ lx_ptrace_exit_tracee(p, lwpd);
+ VERIFY(MUTEX_HELD(&p->p_lock));
+ }
+
+ if ((accord = lwpd->br_ptrace_accord) != NULL) {
+ /*
+ * We have been tracing other LWPs, and must detach from
+ * them and clean up our accord.
+ */
+ mutex_exit(&p->p_lock);
+ lx_ptrace_exit_tracer(p, lwpd, accord);
+ mutex_enter(&p->p_lock);
+ }
+}
+
+/*
+ * Called when a SIGCLD signal is dispatched so that we may enqueue another.
+ * Return 0 if we enqueued a signal, or -1 if not.
+ */
+int
+lx_sigcld_repost(proc_t *pp, sigqueue_t *sqp)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ lx_ptrace_accord_t *accord;
+ lx_lwp_data_t *remote;
+ klwp_t *rlwp;
+ proc_t *rproc;
+ boolean_t found = B_FALSE;
+
+ VERIFY(MUTEX_HELD(&pidlock));
+ VERIFY(MUTEX_NOT_HELD(&pp->p_lock));
+ VERIFY(lwptoproc(lwp) == pp);
+
+ mutex_enter(&pp->p_lock);
+ if ((accord = lwpd->br_ptrace_accord) == NULL) {
+ /*
+ * This LWP is not a tracer LWP, so there will be no
+ * SIGCLD.
+ */
+ mutex_exit(&pp->p_lock);
+ return (-1);
+ }
+ mutex_exit(&pp->p_lock);
+
+ mutex_enter(&accord->lxpa_tracees_lock);
+ for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+ remote = list_next(&accord->lxpa_tracees, remote)) {
+ rlwp = remote->br_lwp;
+ rproc = lwptoproc(rlwp);
+
+ /*
+ * Check if this LWP is in "ptrace-stop". If in the correct
+ * stop condition, lock the process containing the tracee LWP.
+ */
+ if (lx_ptrace_lock_if_stopped(accord, remote, B_FALSE) != 0) {
+ continue;
+ }
+
+ if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) {
+ /*
+ * This event depends on waitid() clearing out the
+ * event of another LWP. Skip it for now.
+ */
+ mutex_exit(&rproc->p_lock);
+ continue;
+ }
+
+ if (!(remote->br_ptrace_flags & LX_PTF_CLDPEND)) {
+ /*
+ * No SIGCLD is required for this LWP.
+ */
+ mutex_exit(&rproc->p_lock);
+ continue;
+ }
+
+ if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) ||
+ remote->br_ptrace_whystop == 0 ||
+ remote->br_ptrace_whatstop == 0) {
+ /*
+ * No (new) stop reason to post for this LWP.
+ */
+ mutex_exit(&rproc->p_lock);
+ continue;
+ }
+
+ /*
+ * We found a process of interest. Leave the process
+ * containing the tracee LWP locked and break out of the loop.
+ */
+ found = B_TRUE;
+ break;
+ }
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ if (!found) {
+ return (-1);
+ }
+
+ /*
+ * Generate siginfo for this tracee LWP.
+ */
+ lx_ptrace_winfo(remote, &sqp->sq_info, B_FALSE, NULL, NULL);
+ remote->br_ptrace_flags &= ~LX_PTF_CLDPEND;
+ mutex_exit(&rproc->p_lock);
+
+ mutex_enter(&pp->p_lock);
+ if (sigismember(&pp->p_sig, SIGCLD)) {
+ mutex_exit(&pp->p_lock);
+
+ mutex_enter(&rproc->p_lock);
+ remote->br_ptrace_flags |= LX_PTF_CLDPEND;
+ mutex_exit(&rproc->p_lock);
+
+ return (-1);
+ }
+ sigaddqa(pp, curthread, sqp);
+ mutex_exit(&pp->p_lock);
+
+ return (0);
+}
+
+/*
+ * Consume the next available ptrace(2) event queued against the accord for
+ * this LWP. The event will be emitted as if through waitid(), and converted
+ * by lx_waitpid() and friends before the return to usermode.
+ */
+int
+lx_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip, int options,
+ boolean_t *brand_wants_wait, int *rval)
+{
+ lx_ptrace_accord_t *accord;
+ klwp_t *lwp = ttolwp(curthread);
+ proc_t *p = lwptoproc(lwp);
+ lx_lwp_data_t *local = lwptolxlwp(lwp);
+ lx_lwp_data_t *remote;
+ boolean_t found = B_FALSE;
+ klwp_t *rlwp = NULL;
+ proc_t *rproc = NULL;
+ pid_t event_pid = 0, event_ppid = 0;
+ boolean_t waitflag = !(options & WNOWAIT);
+ boolean_t target_found = B_FALSE;
+
+ VERIFY(MUTEX_HELD(&pidlock));
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+ /*
+ * By default, we do not expect waitid() to block on our account.
+ */
+ *brand_wants_wait = B_FALSE;
+
+ if (!local->br_waitid_emulate) {
+ /*
+ * This waitid() call is not expecting emulated results.
+ */
+ return (-1);
+ }
+
+ switch (idtype) {
+ case P_ALL:
+ case P_PID:
+ case P_PGID:
+ break;
+ default:
+ /*
+ * This idtype has no power here.
+ */
+ return (-1);
+ }
+
+ if (lx_ptrace_accord_get(&accord, B_FALSE) != 0) {
+ /*
+ * This LWP does not have an accord; it cannot be tracing.
+ */
+ return (-1);
+ }
+
+ /*
+ * We do not need an additional hold on the accord as it belongs to
+	 * the running tracer LWP.
+ */
+ lx_ptrace_accord_exit(accord);
+
+ mutex_enter(&accord->lxpa_tracees_lock);
+ if (list_is_empty(&accord->lxpa_tracees)) {
+ /*
+ * Though it has an accord, there are currently no tracees in
+ * the list for this LWP.
+ */
+ mutex_exit(&accord->lxpa_tracees_lock);
+ return (-1);
+ }
+
+ /*
+ * Walk the list of tracees and determine if any of them have events to
+ * report.
+ */
+ for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+ remote = list_next(&accord->lxpa_tracees, remote)) {
+ rlwp = remote->br_lwp;
+ rproc = lwptoproc(rlwp);
+
+ /*
+ * We check to see if this LWP matches an id we are waiting for.
+ */
+ switch (idtype) {
+ case P_ALL:
+ break;
+ case P_PID:
+ if (remote->br_pid != id)
+ continue;
+ break;
+ case P_PGID:
+ if (rproc->p_pgrp != id)
+ continue;
+ break;
+ default:
+ cmn_err(CE_PANIC, "unexpected idtype: %d", idtype);
+ }
+
+ /* This tracee matches provided idtype and id */
+ target_found = B_TRUE;
+
+ /*
+ * Check if this LWP is in "ptrace-stop". If in the correct
+ * stop condition, lock the process containing the tracee LWP.
+ */
+ if (lx_ptrace_lock_if_stopped(accord, remote, B_FALSE) != 0) {
+ continue;
+ }
+
+ if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) {
+ /*
+ * This event depends on waitid() clearing out the
+ * event of another LWP. Skip it for now.
+ */
+ mutex_exit(&rproc->p_lock);
+ continue;
+ }
+
+ if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) ||
+ remote->br_ptrace_whystop == 0 ||
+ remote->br_ptrace_whatstop == 0) {
+ /*
+ * No (new) stop reason to post for this LWP.
+ */
+ mutex_exit(&rproc->p_lock);
+ continue;
+ }
+
+ /*
+ * We found a process of interest. Leave the process
+ * containing the tracee LWP locked and break out of the loop.
+ */
+ found = B_TRUE;
+ break;
+ }
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ if (!found) {
+ /*
+ * There were no events of interest, but we have tracees.
+		 * If any of the tracees matched the specified criteria, signal
+ * to waitid() that it should block if the provided flags allow
+ * for it.
+ */
+ if (target_found) {
+ *brand_wants_wait = B_TRUE;
+ }
+
+ return (-1);
+ }
+
+ /*
+ * Populate the signal information.
+ */
+ lx_ptrace_winfo(remote, ip, waitflag, &event_ppid, &event_pid);
+
+ /*
+ * Unlock the tracee.
+ */
+ mutex_exit(&rproc->p_lock);
+
+ if (event_pid != 0 && event_ppid != 0) {
+ /*
+ * We need to do another pass around the tracee list and
+ * unblock any events that have a "happens after" relationship
+ * with this event.
+ */
+ mutex_enter(&accord->lxpa_tracees_lock);
+ for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+ remote = list_next(&accord->lxpa_tracees, remote)) {
+ rlwp = remote->br_lwp;
+ rproc = lwptoproc(rlwp);
+
+ mutex_enter(&rproc->p_lock);
+
+ if (remote->br_pid != event_pid ||
+ remote->br_ppid != event_ppid) {
+ mutex_exit(&rproc->p_lock);
+ continue;
+ }
+
+ remote->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT;
+
+ mutex_exit(&rproc->p_lock);
+ }
+ mutex_exit(&accord->lxpa_tracees_lock);
+ }
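+	/*
+	 * (A sketch of the "happens after" case: lx_ptrace_winfo() emits a
+	 * pid/ppid pair when the consumed event implies a dependent event
+	 * on another LWP, e.g. a clone-style event naming a new child; the
+	 * pass above clears LX_PTF_PARENT_WAIT on the matching tracee so
+	 * its stop can be reported by a later waitid().)
+	 */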
+
+ /*
+ * If we are consuming this wait state, we remove the SIGCLD from
+ * the queue and post another.
+ */
+ if (waitflag) {
+ mutex_exit(&pidlock);
+ sigcld_delete(ip);
+ sigcld_repost();
+ mutex_enter(&pidlock);
+ }
+
+ *rval = 0;
+ return (0);
+}
+
+static int
+lx_ptrace_peek(lx_lwp_data_t *lwpd, uintptr_t addr, void *data)
+{
+ proc_t *p = lwptoproc(lwpd->br_lwp);
+ long buf;
+ int error = 0, size = sizeof (buf);
+
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ size = sizeof (uint32_t);
+ }
+#endif
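+	/*
+	 * The address must be word-aligned: for an 8-byte word the low
+	 * three bits of addr must be clear; for a 4-byte word, the low
+	 * two bits.
+	 */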
+ if ((addr & (size - 1)) != 0) {
+ /* unaligned access */
+ return (EINVAL);
+ }
+
+ mutex_exit(&p->p_lock);
+ error = uread(p, &buf, size, addr);
+ mutex_enter(&p->p_lock);
+
+ if (error != 0) {
+ return (EIO);
+ }
+ if (copyout(&buf, data, size) != 0) {
+ return (EFAULT);
+ }
+
+ return (0);
+}
+
+static int
+lx_ptrace_poke(lx_lwp_data_t *lwpd, uintptr_t addr, uintptr_t data)
+{
+ proc_t *p = lwptoproc(lwpd->br_lwp);
+ int error = 0, size = sizeof (data);
+
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ size = sizeof (uint32_t);
+ }
+#endif
+ if ((addr & (size - 1)) != 0) {
+ /* unaligned access */
+ return (EINVAL);
+ }
+
+ mutex_exit(&p->p_lock);
+ error = uwrite(p, &data, size, addr);
+ mutex_enter(&p->p_lock);
+
+ if (error != 0) {
+ return (EIO);
+ }
+ return (0);
+}
+
+static int
+lx_ptrace_kill(lx_lwp_data_t *lwpd)
+{
+ sigtoproc(lwptoproc(lwpd->br_lwp), NULL, SIGKILL);
+
+ return (0);
+}
+
+static int
+lx_ptrace_kernel(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data)
+{
+ lx_lwp_data_t *local = ttolxlwp(curthread);
+ lx_ptrace_accord_t *accord;
+ lx_lwp_data_t *remote;
+ klwp_t *rlwp;
+ proc_t *rproc;
+ int error;
+ boolean_t found = B_FALSE, restart = B_TRUE;
+
+ /*
+ * PTRACE_TRACEME and PTRACE_ATTACH operations induce the tracing of
+ * one LWP by another. The target LWP must not be traced already.
+ */
+ switch (ptrace_op) {
+ case LX_PTRACE_TRACEME:
+ return (lx_ptrace_traceme());
+
+ case LX_PTRACE_ATTACH:
+ return (lx_ptrace_attach(lxpid));
+ }
+
+ /*
+ * Ensure that we have an accord and obtain a lock on it. This routine
+ * should not fail because the LWP cannot make ptrace(2) system calls
+ * after it has begun exiting.
+ */
+ VERIFY0(local->br_ptrace_flags & LX_PTF_EXITING);
+ VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0);
+
+ /*
+ * The accord belongs to this (the tracer) LWP, and we have a hold on
+ * it. We drop the lock so that we can take other locks.
+ */
+ lx_ptrace_accord_exit(accord);
+
+ /*
+ * Does the tracee list contain the pid in question?
+ */
+retry:
+ mutex_enter(&accord->lxpa_tracees_lock);
+ for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+ remote = list_next(&accord->lxpa_tracees, remote)) {
+ if (remote->br_pid == lxpid) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ if (!found) {
+ /*
+ * The requested pid does not appear in the tracee list.
+ */
+ mutex_exit(&accord->lxpa_tracees_lock);
+ return (ESRCH);
+ }
+
+ if (ptrace_op == LX_PTRACE_DETACH) {
+ /*
+		 * We're detaching; make sure the in-syscall flag is off so
+		 * that a signal will stop the process directly.
+ */
+ remote->br_ptrace_flags &= ~LX_PTF_INSYSCALL;
+ }
+
+ /*
+ * Attempt to lock the target LWP.
+ */
+ if ((error = lx_ptrace_lock_if_stopped(accord, remote,
+ (ptrace_op == LX_PTRACE_DETACH))) != 0) {
+ /*
+		 * The LWP was not in "ptrace-stop". For detach, ENOENT
+		 * indicates that although the LWP was not in "ptrace-stop",
+		 * its process is still locked.
+ */
+ if (ptrace_op == LX_PTRACE_DETACH && error == ENOENT) {
+ /*
+ * We're detaching, but the process was not in
+ * ptrace_stop, so we don't want to try to restart it.
+ */
+ restart = B_FALSE;
+ } else {
+ mutex_exit(&accord->lxpa_tracees_lock);
+ return (error);
+ }
+ }
+
+ /*
+ * The target LWP is in "ptrace-stop". We have the containing process
+ * locked.
+ */
+ rlwp = remote->br_lwp;
+ rproc = lwptoproc(rlwp);
+
+ if (ptrace_op == LX_PTRACE_DETACH) {
+ if (TRACEE_BUSY(remote)) {
+ kmutex_t *rmp;
+
+ /*
+ * There is a tricky race condition we have to watch
+ * out for here (for example, if a tracee is in the
+ * kernel in the middle of a syscall). When the tracee
+ * is leaving the kernel, it will set LX_PTF_STOPPING.
+ * In lx_stop_notify() the tracee has to drop its
+ * p_lock, take pidlock, then reacquire p_lock, before
+ * it will clear LX_PTF_STOPPING and set LX_PTF_STOPPED.
+ * During that window, if this tracer is trying to
+ * detach, we have to make sure the tracee is restarted.
+ * We handle this case in the same way we handle
+ * the tracer exiting in lx_ptrace_exit_tracer().
+ */
+ rmp = &rproc->p_lock;
+ mutex_exit(&accord->lxpa_tracees_lock);
+ (void) cv_wait_sig(&lx_ptrace_busy_cv, rmp);
+
+ /*
+ * While we were waiting, state will have changed, so
+ * retry.
+ */
+ mutex_exit(rmp);
+ goto retry;
+ }
+
+ lx_ptrace_detach(accord, remote, (int)data, restart);
+ /*
+ * Drop the lock on both the tracee process and the tracee list.
+ */
+ mutex_exit(&rproc->p_lock);
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ /*
+ * Release a hold from the accord.
+ */
+ lx_ptrace_accord_enter(accord);
+ lx_ptrace_accord_rele(accord);
+ lx_ptrace_accord_exit(accord);
+
+ return (0);
+ }
+
+ /*
+ * The tracees lock is not needed for any of the other operations.
+ * Drop it so further actions can avoid deadlock.
+ */
+ mutex_exit(&accord->lxpa_tracees_lock);
+
+ /*
+ * Process the ptrace(2) request:
+ */
+ switch (ptrace_op) {
+ case LX_PTRACE_CONT:
+ error = lx_ptrace_cont(remote, LX_PTC_NONE, (int)data);
+ break;
+
+ case LX_PTRACE_SYSCALL:
+ error = lx_ptrace_cont(remote, LX_PTC_SYSCALL, (int)data);
+ break;
+
+ case LX_PTRACE_SINGLESTEP:
+ error = lx_ptrace_cont(remote, LX_PTC_SINGLESTEP, (int)data);
+ break;
+
+ case LX_PTRACE_SETOPTIONS:
+ error = lx_ptrace_setoptions(remote, data);
+ break;
+
+ case LX_PTRACE_GETEVENTMSG:
+ error = lx_ptrace_geteventmsg(remote, (void *)data);
+ break;
+
+ case LX_PTRACE_GETREGS:
+ error = lx_user_regs_copyout(remote, (void *)data);
+ break;
+
+ case LX_PTRACE_SETREGS:
+ error = lx_user_regs_copyin(remote, (void *)data);
+ break;
+
+ case LX_PTRACE_GETSIGINFO:
+ error = lx_ptrace_getsiginfo(remote, (void *)data);
+ break;
+
+ case LX_PTRACE_PEEKTEXT:
+ case LX_PTRACE_PEEKDATA:
+ error = lx_ptrace_peek(remote, addr, (void *)data);
+ break;
+
+ case LX_PTRACE_POKETEXT:
+ case LX_PTRACE_POKEDATA:
+ error = lx_ptrace_poke(remote, addr, data);
+ break;
+
+ case LX_PTRACE_PEEKUSER:
+ error = lx_ptrace_peekuser(remote, addr, (void *)data);
+ break;
+
+ case LX_PTRACE_POKEUSER:
+ error = lx_ptrace_pokeuser(remote, addr, (void *)data);
+ break;
+
+ case LX_PTRACE_GETFPREGS:
+ error = lx_user_fpregs_copyout(remote, (void *)data);
+ break;
+
+ case LX_PTRACE_SETFPREGS:
+ error = lx_user_fpregs_copyin(remote, (void *)data);
+ break;
+
+ case LX_PTRACE_GETFPXREGS:
+ error = lx_user_fpxregs_copyout(remote, (void *)data);
+ break;
+
+ case LX_PTRACE_SETFPXREGS:
+ error = lx_user_fpxregs_copyin(remote, (void *)data);
+ break;
+
+ case LX_PTRACE_KILL:
+ error = lx_ptrace_kill(remote);
+ break;
+
+ default:
+ error = EINVAL;
+ }
+
+ /*
+ * Drop the lock on both the tracee process and the tracee list.
+ */
+ mutex_exit(&rproc->p_lock);
+
+ return (error);
+}
+
+int
+lx_ptrace(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data)
+{
+ int error;
+
+ error = lx_ptrace_kernel(ptrace_op, LX_INIT_TO_PID(lxpid), addr, data);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+void
+lx_ptrace_init(void)
+{
+ cv_init(&lx_ptrace_busy_cv, NULL, CV_DEFAULT, NULL);
+
+ lx_ptrace_accord_cache = kmem_cache_create("lx_ptrace_accord",
+ sizeof (lx_ptrace_accord_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+lx_ptrace_fini(void)
+{
+ cv_destroy(&lx_ptrace_busy_cv);
+
+ kmem_cache_destroy(lx_ptrace_accord_cache);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_signal.c b/usr/src/uts/common/brand/lx/os/lx_signal.c
new file mode 100644
index 0000000000..53e0cecc14
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_signal.c
@@ -0,0 +1,50 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/signal.h>
+#include <sys/sunddi.h>
+#include <lx_signum.h>
+
+void
+lx_ltos_sigset(lx_sigset_t *lsigp, k_sigset_t *ssigp)
+{
+ int lx_sig, sig;
+
+ sigemptyset(ssigp);
+ for (lx_sig = 1; lx_sig <= LX_NSIG; lx_sig++) {
+ if (lx_sigismember(lsigp, lx_sig) &&
+ ((sig = ltos_signo[lx_sig]) > 0))
+ sigaddset(ssigp, sig);
+ }
+
+	/*
+	 * Emulate sigutok() restrictions: strip signals that can never
+	 * be blocked (i.e. SIGKILL and SIGSTOP).
+	 */
+ ssigp->__sigbits[0] &= (FILLSET0 & ~CANTMASK0);
+ ssigp->__sigbits[1] &= (FILLSET1 & ~CANTMASK1);
+ ssigp->__sigbits[2] &= (FILLSET2 & ~CANTMASK2);
+}
+
+void
+lx_stol_sigset(k_sigset_t *ssigp, lx_sigset_t *lsigp)
+{
+ int sig, lx_sig;
+
+ bzero(lsigp, sizeof (lx_sigset_t));
+ for (sig = 1; sig < NSIG; sig++) {
+ if (sigismember(ssigp, sig) &&
+ ((lx_sig = stol_signo[sig]) > 0))
+ lx_sigaddset(lsigp, lx_sig);
+ }
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c
new file mode 100644
index 0000000000..5a8f9322a0
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c
@@ -0,0 +1,1229 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
+ */
+
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/thread.h>
+#include <sys/systm.h>
+#include <sys/syscall.h>
+#include <sys/proc.h>
+#include <sys/modctl.h>
+#include <sys/cmn_err.h>
+#include <sys/model.h>
+#include <sys/privregs.h>
+#include <sys/brand.h>
+#include <sys/machbrand.h>
+#include <sys/sdt.h>
+#include <sys/lx_syscalls.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_misc.h>
+#include <lx_errno.h>
+
+
+/*
+ * Flags for sysent entries:
+ */
+#define LX_SYS_NOSYS_REASON 0x07
+#define LX_SYS_EBPARG6 0x08
+
+/*
+ * Flags that denote the specific reason we do not have a particular system
+ * call. These reasons are only valid if the function is NULL.
+ */
+#define NOSYS_USERMODE 0
+#define NOSYS_NULL 1
+#define NOSYS_NONE 2
+#define NOSYS_NO_EQUIV 3
+#define NOSYS_KERNEL 4
+#define NOSYS_UNDOC 5
+#define NOSYS_OBSOLETE 6
+#define NOSYS_MAX NOSYS_OBSOLETE
+
+#if NOSYS_MAX > LX_SYS_NOSYS_REASON
+#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON
+#endif
+
+/*
+ * Strings describing the reason we do not emulate a particular system call
+ * in the kernel.
+ */
+static char *nosys_reasons[] = {
+ NULL, /* NOSYS_USERMODE means this call is emulated in usermode */
+ "Not done yet",
+ "No such Linux system call",
+ "No equivalent illumos functionality",
+ "Reads/modifies Linux kernel state",
+ "Undocumented and/or rarely used system call",
+ "Unsupported, obsolete system call"
+};
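+
+/*
+ * For illustration: when an entry's handler is NULL, the NOSYS reason is
+ * carried in the low bits of sy_flags; e.g. the 32-bit table entry
+ *
+ *	{"break", NULL, NOSYS_OBSOLETE, 0},
+ *
+ * marks an obsolete call, and (sy_flags & LX_SYS_NOSYS_REASON) recovers
+ * the index into nosys_reasons[] above.
+ */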
+
+
+#if defined(_LP64)
+/*
+ * System call handler table and entry count for Linux x86_64 (amd64):
+ */
+lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1];
+int lx_nsysent64;
+#endif
+/*
+ * System call handler table and entry count for Linux x86 (i386):
+ */
+lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1];
+int lx_nsysent32;
+
+#if defined(_LP64)
+struct lx_vsyscall
+{
+ uintptr_t lv_addr;
+ uintptr_t lv_scnum;
+} lx_vsyscalls[] = {
+ { LX_VSYS_gettimeofday, LX_SYS_gettimeofday },
+ { LX_VSYS_time, LX_SYS_time },
+ { LX_VSYS_getcpu, LX_SYS_getcpu },
+ { NULL, NULL }
+};
+#endif
+
+#if defined(__amd64)
+static int
+lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args)
+{
+ struct regs *rp = lwptoregs(lwp);
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ /*
+ * Note: Syscall argument passing is different from function
+ * call argument passing on amd64. For function calls, the
+ * fourth arg is passed via %rcx, but for system calls the 4th
+		 * arg is passed via %r10. This is because on amd64, the
+		 * syscall instruction puts the lower 32 bits of %rflags in
+		 * %r11 and the %rip value in %rcx.
+ *
+ * Appendix A of the amd64 ABI (Linux conventions) states that
+ * syscalls are limited to 6 args and no arg is passed on the
+ * stack.
+ */
+ args[0] = rp->r_rdi;
+ args[1] = rp->r_rsi;
+ args[2] = rp->r_rdx;
+ args[3] = rp->r_r10;
+ args[4] = rp->r_r8;
+ args[5] = rp->r_r9;
+ } else {
+ /*
+ * If the system call takes 6 args, then libc has stashed them
+		 * in memory at the address contained in %ebx, except for some
+		 * syscalls which instead store the 6th argument in %ebp.
+ */
+ if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) {
+ uint32_t args32[6];
+
+ if (copyin((void *)rp->r_rbx, &args32,
+ sizeof (args32)) != 0) {
+ /*
+ * Clear the argument vector so that the
+ * trace probe does not expose kernel
+ * memory.
+ */
+ bzero(args, 6 * sizeof (uintptr_t));
+ return (set_errno(EFAULT));
+ }
+
+ args[0] = args32[0];
+ args[1] = args32[1];
+ args[2] = args32[2];
+ args[3] = args32[3];
+ args[4] = args32[4];
+ args[5] = args32[5];
+ } else {
+ args[0] = rp->r_rbx;
+ args[1] = rp->r_rcx;
+ args[2] = rp->r_rdx;
+ args[3] = rp->r_rsi;
+ args[4] = rp->r_rdi;
+ args[5] = rp->r_rbp;
+ }
+ }
+
+ return (0);
+}
+
+#else /* !__amd64 */
+
+static int
+lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args)
+{
+ struct regs *rp = lwptoregs(lwp);
+
+ /*
+ * If the system call takes 6 args, then libc has stashed them
+	 * in memory at the address contained in %ebx, except for some
+	 * syscalls which instead store the 6th argument in %ebp.
+ */
+ if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) {
+ if (copyin((void *)rp->r_ebx, args, 6 * sizeof (uintptr_t)) !=
+ 0) {
+ /*
+ * Clear the argument vector so that the trace probe
+ * does not expose kernel memory.
+ */
+ bzero(args, 6 * sizeof (uintptr_t));
+ return (set_errno(EFAULT));
+ }
+ } else {
+ args[0] = rp->r_ebx;
+ args[1] = rp->r_ecx;
+ args[2] = rp->r_edx;
+ args[3] = rp->r_esi;
+ args[4] = rp->r_edi;
+ args[5] = rp->r_ebp;
+ }
+
+ return (0);
+}
+#endif
+
+void
+lx_syscall_return(klwp_t *lwp, int syscall_num, long ret)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ struct regs *rp = lwptoregs(lwp);
+ int error = lwp->lwp_errno;
+
+ if (error != EINTR) {
+ /*
+ * If this system call was not interrupted, clear the system
+ * call restart flag before lx_setcontext() can pass it to
+ * usermode.
+ */
+ lwpd->br_syscall_restart = B_FALSE;
+ }
+
+ if (error != 0) {
+ /*
+ * Convert from illumos to Linux errno:
+ */
+ ret = -lx_errno(error, EINVAL);
+ }
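+	/*
+	 * (Linux system calls report failure by returning a negated errno
+	 * value in the return register, hence the negation above; the
+	 * EINVAL argument is the fallback used when an illumos errno has
+	 * no direct Linux translation.)
+	 */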
+
+ /*
+ * 32-bit Linux system calls return via %eax; 64-bit calls return via
+ * %rax.
+ */
+ rp->r_r0 = ret;
+
+ /*
+ * Hold for the ptrace(2) "syscall-exit-stop" condition if required by
+ * PTRACE_SYSCALL. Note that the register state may be modified by
+ * tracer.
+	 * the tracer.
+ (void) lx_ptrace_stop(LX_PR_SYSEXIT);
+
+ /*
+ * Emit audit record, if necessary.
+ */
+ lx_audit_syscall_exit(syscall_num, ret);
+
+ /*
+ * Fire the DTrace "lx-syscall:::return" probe:
+ */
+ lx_trace_sysreturn(syscall_num, ret);
+
+ /*
+ * Clear errno for next time. We do not clear "br_syscall_restart" or
+ * "br_syscall_num" as they are potentially used by "lx_savecontext()"
+ * in the signal delivery path.
+ */
+ lwp->lwp_errno = 0;
+
+ lx_check_strict_failure(lwpd);
+
+ /*
+ * We want complete control of the registers on return from this
+ * emulated Linux system call:
+ */
+ lwp->lwp_eosys = JUSTRETURN;
+}
+
+static void
+lx_syscall_unsup_msg(lx_sysent_t *s, int syscall_num, int unsup_reason)
+{
+ char buf[100];
+
+ if (s == NULL) {
+ (void) snprintf(buf, sizeof (buf), "NOSYS (%d): out of bounds",
+ syscall_num);
+ } else {
+ VERIFY(unsup_reason < (sizeof (nosys_reasons) /
+ sizeof (*nosys_reasons)));
+
+ if (s->sy_name == NULL) {
+ (void) snprintf(buf, sizeof (buf), "NOSYS (%d): %s",
+ syscall_num, nosys_reasons[unsup_reason]);
+ } else {
+ (void) snprintf(buf, sizeof (buf), "NOSYS (%s): %s",
+ s->sy_name, nosys_reasons[unsup_reason]);
+ }
+ }
+
+ lx_unsupported(buf);
+}
+
+/*
+ * This function is used to override the processing of arguments and
+ * invocation of a handler for emulated system calls, installed on each
+ * branded LWP as "lwp_brand_syscall". If this system call should use the
+ * native path, we return 1. If we handled this system call (and have made
+ * arrangements with respect to post-return usermode register state) we
+ * return 0.
+ */
+int
+lx_syscall_enter(void)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ struct regs *rp = lwptoregs(lwp);
+ int syscall_num;
+ int error;
+ long ret = 0;
+ lx_sysent_t *s;
+ uintptr_t args[6];
+ unsigned int unsup_reason;
+
+ /*
+ * If we got here, we should have an LWP-specific brand data
+ * structure.
+ */
+ VERIFY(lwpd != NULL);
+
+ if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
+ /*
+		 * The LWP is not in BRAND execution mode, so we return
+ * to the regular native system call path.
+ */
+ DTRACE_PROBE(brand__lx__syscall__hook__skip);
+ return (1);
+ }
+
+ /*
+	 * Clear the restartable system call flag. This flag will be set
+	 * in the system call handler if the call is a candidate for
+ * a restart. It will be saved by lx_setcontext() in the event
+ * that we take a signal, and used in the signal handling path
+ * to restart the system call iff SA_RESTART was set for this
+ * signal. Save the system call number so that we can store it
+ * in the saved context if required.
+ */
+ lwpd->br_syscall_restart = B_FALSE;
+ lwpd->br_syscall_num = (int)rp->r_r0;
+
+ /*
+ * Hold for the ptrace(2) "syscall-entry-stop" condition if traced by
+ * PTRACE_SYSCALL. The system call number and arguments may be
+ * modified by the tracer.
+ */
+ (void) lx_ptrace_stop(LX_PR_SYSENTRY);
+
+ /*
+ * Check that the system call number is within the bounds we expect.
+ */
+ syscall_num = lwpd->br_syscall_num;
+ if (syscall_num < 0 || syscall_num > LX_MAX_SYSCALL(lwp)) {
+ lx_syscall_unsup_msg(NULL, syscall_num, 0);
+
+ (void) set_errno(ENOTSUP);
+ lx_syscall_return(lwp, syscall_num, -1);
+ return (0);
+ }
+
+#if defined(_LP64)
+ if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+ s = &lx_sysent64[syscall_num];
+ } else
+#endif
+ {
+ s = &lx_sysent32[syscall_num];
+ }
+
+ /*
+ * Process the arguments for this system call and fire the DTrace
+ * "lx-syscall:::entry" probe:
+ */
+ error = lx_emulate_args(lwp, s, args);
+ lx_trace_sysenter(syscall_num, args);
+ lwpd->br_syscall_args[0] = args[0];
+ lwpd->br_syscall_args[1] = args[1];
+ lwpd->br_syscall_args[2] = args[2];
+ lwpd->br_syscall_args[3] = args[3];
+ if (error != 0) {
+ /*
+ * Could not read and process the arguments. Return the error
+ * to the process.
+ */
+ (void) set_errno(error);
+ lx_syscall_return(lwp, syscall_num, -1);
+ return (0);
+ }
+
+ if (s->sy_callc != NULL) {
+ /*
+ * Call the in-kernel handler for this Linux system call:
+ */
+ lwpd->br_eosys = NORMALRETURN;
+ ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4],
+ args[5]);
+ if (lwpd->br_eosys == NORMALRETURN) {
+ lx_syscall_return(lwp, syscall_num, ret);
+ }
+ return (0);
+ }
+
+ /*
+ * There is no in-kernel handler.
+ */
+ switch (unsup_reason = (s->sy_flags & LX_SYS_NOSYS_REASON)) {
+ case NOSYS_USERMODE:
+ /*
+ * Pass to the usermode emulation routine.
+ */
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ lx_emulate_user32(lwp, syscall_num, args);
+ } else
+#endif
+ {
+ lx_emulate_user(lwp, syscall_num, args);
+ }
+ return (0);
+
+ default:
+ /*
+ * We are not emulating this system call at all.
+ */
+ lx_syscall_unsup_msg(s, syscall_num, unsup_reason);
+
+ (void) set_errno(ENOTSUP);
+ lx_syscall_return(lwp, syscall_num, -1);
+ return (0);
+ }
+}
+
+#if defined(_LP64)
+/*
+ * Emulate vsyscall support.
+ *
+ * Linux magically maps a single page into the address space of each process,
+ * allowing them to make 'vsyscalls'. Originally designed to counteract the
+ * perceived overhead of regular system calls, vsyscalls were implemented as
+ * code residing in userspace which could be called directly. The userspace
+ * implementations of these vsyscalls have now been replaced by
+ * instructions which vector into the normal syscall path.
+ *
+ * Implementing vsyscalls on illumos is complicated by the fact that the
+ * required static address region resides inside the kernel address space.
+ * Rather than mapping a user-accessible page into the KAS, a different
+ * approach is taken. The vsyscall gate is emulated by interposing on
+ * pagefaults in trap(). An attempt to execute a known vsyscall address will
+ * result in emulating the appropriate system call rather than inducing a
+ * SIGSEGV.
+ */
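+/*
+ * For reference: on Linux x86_64 the vsyscall page sits at the fixed
+ * address 0xffffffffff600000, with each vsyscall at a fixed 1024-byte
+ * offset within it; the lx_vsyscalls[] table above pairs those
+ * well-known addresses with the equivalent Linux system call numbers.
+ */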
+void
+lx_vsyscall_enter(proc_t *p, klwp_t *lwp, int scnum)
+{
+ struct regs *rp = lwptoregs(lwp);
+ uintptr_t raddr;
+
+ /*
+ * Fetch the return address from the process stack.
+ */
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+ if (copyin((void *)rp->r_rsp, &raddr, sizeof (raddr)) != 0) {
+#if DEBUG
+		printf("lx_vsyscall_enter: bad brand stack at vsyscall "
+ "cmd=%s, pid=%d, sp=0x%p\n", PTOU(p)->u_comm,
+ p->p_pid, (void *)rp->r_rsp);
+#endif
+
+ /*
+ * The process jumped to the vsyscall address without a
+ * correctly configured stack. Terminate the process.
+ */
+ exit(CLD_KILLED, SIGSEGV);
+ return;
+ }
+
+ DTRACE_PROBE1(brand__lx__vsyscall, int, scnum);
+
+ /* Simulate vectoring into the syscall */
+ rp->r_rax = scnum;
+ rp->r_rip = raddr;
+ rp->r_rsp += sizeof (uintptr_t);
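+
+	/*
+	 * (The stores above leave the LWP as though it had invoked the
+	 * system call directly and consumed the return address: %rax
+	 * carries the syscall number for lx_syscall_enter(), %rip is the
+	 * caller's return address, and %rsp is adjusted as if by "ret".)
+	 */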
+
+ (void) lx_syscall_enter();
+}
+
+boolean_t
+lx_vsyscall_iscall(klwp_t *lwp, uintptr_t addr, int *scnum)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ int i;
+
+ if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
+ /*
+ * We only handle vsyscalls when running Linux code.
+ */
+ return (B_FALSE);
+ }
+
+ if (addr < LX_VSYSCALL_ADDR ||
+ addr >= (LX_VSYSCALL_ADDR + LX_VSYSCALL_SIZE)) {
+ /*
+ * Ignore faults outside the vsyscall page.
+ */
+ return (B_FALSE);
+ }
+
+ for (i = 0; lx_vsyscalls[i].lv_addr != NULL; i++) {
+ if (addr == lx_vsyscalls[i].lv_addr) {
+ /*
+ * This is a valid vsyscall address.
+ */
+ *scnum = lx_vsyscalls[i].lv_scnum;
+ return (B_TRUE);
+ }
+ }
+
+ lx_unsupported("bad vsyscall access");
+ return (B_FALSE);
+}
+#endif
+
+/*
+ * Linux defines system call numbers for 32-bit x86 in the file:
+ * arch/x86/syscalls/syscall_32.tbl
+ */
+lx_sysent_t lx_sysent32[] = {
+ {"nosys", NULL, NOSYS_NONE, 0}, /* 0 */
+ {"exit", NULL, 0, 1}, /* 1 */
+ {"fork", NULL, 0, 0}, /* 2 */
+ {"read", lx_read, 0, 3}, /* 3 */
+ {"write", lx_write, 0, 3}, /* 4 */
+ {"open", lx_open, 0, 3}, /* 5 */
+ {"close", lx_close, 0, 1}, /* 6 */
+ {"waitpid", lx_waitpid, 0, 3}, /* 7 */
+ {"creat", lx_creat, 0, 2}, /* 8 */
+ {"link", lx_link, 0, 2}, /* 9 */
+ {"unlink", lx_unlink, 0, 1}, /* 10 */
+ {"execve", NULL, 0, 3}, /* 11 */
+ {"chdir", lx_chdir, 0, 1}, /* 12 */
+ {"time", lx_time, 0, 1}, /* 13 */
+ {"mknod", NULL, 0, 3}, /* 14 */
+ {"chmod", lx_chmod, 0, 2}, /* 15 */
+ {"lchown16", lx_lchown16, 0, 3}, /* 16 */
+ {"break", NULL, NOSYS_OBSOLETE, 0}, /* 17 */
+ {"stat", NULL, NOSYS_OBSOLETE, 0}, /* 18 */
+ {"lseek", lx_lseek32, 0, 3}, /* 19 */
+ {"getpid", lx_getpid, 0, 0}, /* 20 */
+ {"mount", lx_mount, 0, 5}, /* 21 */
+ {"umount", lx_umount, 0, 1}, /* 22 */
+ {"setuid16", lx_setuid16, 0, 1}, /* 23 */
+ {"getuid16", lx_getuid16, 0, 0}, /* 24 */
+ {"stime", lx_stime, 0, 1}, /* 25 */
+ {"ptrace", lx_ptrace, 0, 4}, /* 26 */
+ {"alarm", lx_alarm, 0, 1}, /* 27 */
+ {"fstat", NULL, NOSYS_OBSOLETE, 0}, /* 28 */
+ {"pause", lx_pause, 0, 0}, /* 29 */
+ {"utime", NULL, 0, 2}, /* 30 */
+ {"stty", NULL, NOSYS_OBSOLETE, 0}, /* 31 */
+ {"gtty", NULL, NOSYS_OBSOLETE, 0}, /* 32 */
+ {"access", lx_access, 0, 2}, /* 33 */
+ {"nice", lx_nice, 0, 1}, /* 34 */
+ {"ftime", NULL, NOSYS_OBSOLETE, 0}, /* 35 */
+ {"sync", lx_sync, 0, 0}, /* 36 */
+ {"kill", lx_kill, 0, 2}, /* 37 */
+ {"rename", lx_rename, 0, 2}, /* 38 */
+ {"mkdir", lx_mkdir, 0, 2}, /* 39 */
+ {"rmdir", NULL, 0, 1}, /* 40 */
+ {"dup", lx_dup, 0, 1}, /* 41 */
+ {"pipe", lx_pipe, 0, 1}, /* 42 */
+ {"times", lx_times, 0, 1}, /* 43 */
+ {"prof", NULL, NOSYS_OBSOLETE, 0}, /* 44 */
+ {"brk", lx_brk, 0, 1}, /* 45 */
+ {"setgid16", lx_setgid16, 0, 1}, /* 46 */
+ {"getgid16", lx_getgid16, 0, 0}, /* 47 */
+ {"signal", NULL, 0, 2}, /* 48 */
+ {"geteuid16", lx_geteuid16, 0, 0}, /* 49 */
+ {"getegid16", lx_getegid16, 0, 0}, /* 50 */
+ {"acct", lx_acct, 0, 1}, /* 51 */
+ {"umount2", lx_umount2, 0, 2}, /* 52 */
+ {"lock", NULL, NOSYS_OBSOLETE, 0}, /* 53 */
+ {"ioctl", lx_ioctl, 0, 3}, /* 54 */
+ {"fcntl", lx_fcntl, 0, 3}, /* 55 */
+ {"mpx", NULL, NOSYS_OBSOLETE, 0}, /* 56 */
+ {"setpgid", lx_setpgid, 0, 2}, /* 57 */
+ {"ulimit", NULL, NOSYS_OBSOLETE, 0}, /* 58 */
+ {"olduname", NULL, NOSYS_OBSOLETE, 0}, /* 59 */
+ {"umask", lx_umask, 0, 1}, /* 60 */
+ {"chroot", lx_chroot, 0, 1}, /* 61 */
+ {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 62 */
+ {"dup2", lx_dup2, 0, 2}, /* 63 */
+ {"getppid", lx_getppid, 0, 0}, /* 64 */
+ {"getpgrp", lx_getpgrp, 0, 0}, /* 65 */
+ {"setsid", lx_setsid, 0, 0}, /* 66 */
+ {"sigaction", NULL, 0, 3}, /* 67 */
+ {"sgetmask", NULL, NOSYS_OBSOLETE, 0}, /* 68 */
+ {"ssetmask", NULL, NOSYS_OBSOLETE, 0}, /* 69 */
+ {"setreuid16", lx_setreuid16, 0, 2}, /* 70 */
+ {"setregid16", lx_setregid16, 0, 2}, /* 71 */
+ {"sigsuspend", NULL, 0, 1}, /* 72 */
+ {"sigpending", NULL, 0, 1}, /* 73 */
+ {"sethostname", lx_sethostname, 0, 2}, /* 74 */
+ {"setrlimit", lx_setrlimit, 0, 2}, /* 75 */
+ {"getrlimit", lx_oldgetrlimit, 0, 2}, /* 76 */
+ {"getrusage", lx_getrusage, 0, 2}, /* 77 */
+ {"gettimeofday", lx_gettimeofday, 0, 2}, /* 78 */
+ {"settimeofday", NULL, 0, 2}, /* 79 */
+ {"getgroups16", NULL, 0, 2}, /* 80 */
+ {"setgroups16", NULL, 0, 2}, /* 81 */
+ {"select", NULL, NOSYS_OBSOLETE, 0}, /* 82 */
+ {"symlink", lx_symlink, 0, 2}, /* 83 */
+ {"oldlstat", NULL, NOSYS_OBSOLETE, 0}, /* 84 */
+ {"readlink", lx_readlink, 0, 3}, /* 85 */
+ {"uselib", NULL, NOSYS_KERNEL, 0}, /* 86 */
+ {"swapon", lx_swapon, 0, 2}, /* 87 */
+ {"reboot", lx_reboot, 0, 4}, /* 88 */
+ {"readdir", NULL, 0, 3}, /* 89 */
+ {"mmap", lx_mmap, 0, 6}, /* 90 */
+ {"munmap", lx_munmap, 0, 2}, /* 91 */
+ {"truncate", NULL, 0, 2}, /* 92 */
+ {"ftruncate", NULL, 0, 2}, /* 93 */
+ {"fchmod", lx_fchmod, 0, 2}, /* 94 */
+ {"fchown16", lx_fchown16, 0, 3}, /* 95 */
+ {"getpriority", lx_getpriority, 0, 2}, /* 96 */
+ {"setpriority", lx_setpriority, 0, 3}, /* 97 */
+ {"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */
+ {"statfs", NULL, 0, 2}, /* 99 */
+ {"fstatfs", NULL, 0, 2}, /* 100 */
+ {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 101 */
+ {"socketcall", lx_socketcall, 0, 2}, /* 102 */
+ {"syslog", lx_syslog, 0, 3}, /* 103 */
+ {"setitimer", NULL, 0, 3}, /* 104 */
+ {"getitimer", lx_getitimer, 0, 2}, /* 105 */
+ {"stat", lx_stat32, 0, 2}, /* 106 */
+ {"lstat", lx_lstat32, 0, 2}, /* 107 */
+ {"fstat", lx_fstat32, 0, 2}, /* 108 */
+ {"uname", NULL, NOSYS_OBSOLETE, 0}, /* 109 */
+ {"oldiopl", NULL, NOSYS_NO_EQUIV, 0}, /* 110 */
+ {"vhangup", lx_vhangup, 0, 0}, /* 111 */
+ {"idle", NULL, NOSYS_NO_EQUIV, 0}, /* 112 */
+ {"vm86old", NULL, NOSYS_OBSOLETE, 0}, /* 113 */
+ {"wait4", lx_wait4, 0, 4}, /* 114 */
+ {"swapoff", lx_swapoff, 0, 1}, /* 115 */
+ {"sysinfo", lx_sysinfo32, 0, 1}, /* 116 */
+ {"ipc", NULL, 0, 5}, /* 117 */
+ {"fsync", NULL, 0, 1}, /* 118 */
+ {"sigreturn", NULL, 0, 1}, /* 119 */
+ {"clone", NULL, 0, 5}, /* 120 */
+ {"setdomainname", lx_setdomainname, 0, 2}, /* 121 */
+ {"uname", lx_uname, 0, 1}, /* 122 */
+ {"modify_ldt", lx_modify_ldt, 0, 3}, /* 123 */
+ {"adjtimex", NULL, 0, 1}, /* 124 */
+ {"mprotect", lx_mprotect, 0, 3}, /* 125 */
+ {"sigprocmask", NULL, 0, 3}, /* 126 */
+ {"create_module", NULL, NOSYS_KERNEL, 0}, /* 127 */
+ {"init_module", NULL, NOSYS_KERNEL, 0}, /* 128 */
+ {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 129 */
+ {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 130 */
+ {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 131 */
+ {"getpgid", lx_getpgid, 0, 1}, /* 132 */
+ {"fchdir", lx_fchdir, 0, 1}, /* 133 */
+ {"bdflush", NULL, NOSYS_KERNEL, 0}, /* 134 */
+ {"sysfs", NULL, 0, 3}, /* 135 */
+ {"personality", lx_personality, 0, 1}, /* 136 */
+ {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 137 */
+ {"setfsuid16", lx_setfsuid16, 0, 1}, /* 138 */
+ {"setfsgid16", lx_setfsgid16, 0, 1}, /* 139 */
+ {"llseek", lx_llseek, 0, 5}, /* 140 */
+ {"getdents", lx_getdents_32, 0, 3}, /* 141 */
+ {"select", lx_select, 0, 5}, /* 142 */
+ {"flock", lx_flock, 0, 2}, /* 143 */
+ {"msync", lx_msync, 0, 3}, /* 144 */
+ {"readv", lx_readv, 0, 3}, /* 145 */
+ {"writev", lx_writev, 0, 3}, /* 146 */
+ {"getsid", lx_getsid, 0, 1}, /* 147 */
+ {"fdatasync", NULL, 0, 1}, /* 148 */
+ {"sysctl", NULL, 0, 1}, /* 149 */
+ {"mlock", lx_mlock, 0, 2}, /* 150 */
+ {"munlock", lx_munlock, 0, 2}, /* 151 */
+ {"mlockall", lx_mlockall, 0, 1}, /* 152 */
+ {"munlockall", lx_munlockall, 0, 0}, /* 153 */
+ {"sched_setparam", lx_sched_setparam, 0, 2}, /* 154 */
+ {"sched_getparam", lx_sched_getparam, 0, 2}, /* 155 */
+ {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 156 */
+ {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 157 */
+ {"sched_yield", lx_sched_yield, 0, 0}, /* 158 */
+ {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 159 */
+ {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 160 */
+ {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 161 */
+ {"nanosleep", lx_nanosleep, 0, 2}, /* 162 */
+ {"mremap", lx_mremap, 0, 5}, /* 163 */
+ {"setresuid16", lx_setresuid16, 0, 3}, /* 164 */
+ {"getresuid16", lx_getresuid16, 0, 3}, /* 165 */
+ {"vm86", NULL, NOSYS_NO_EQUIV, 0}, /* 166 */
+ {"query_module", NULL, 0, 5}, /* 167 */
+ {"poll", lx_poll, 0, 3}, /* 168 */
+ {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 169 */
+ {"setresgid16", lx_setresgid16, 0, 3}, /* 170 */
+ {"getresgid16", lx_getresgid16, 0, 3}, /* 171 */
+ {"prctl", lx_prctl, 0, 5}, /* 172 */
+ {"rt_sigreturn", NULL, 0, 0}, /* 173 */
+ {"rt_sigaction", NULL, 0, 4}, /* 174 */
+ {"rt_sigprocmask", NULL, 0, 4}, /* 175 */
+ {"rt_sigpending", NULL, 0, 2}, /* 176 */
+ {"rt_sigtimedwait", NULL, 0, 4}, /* 177 */
+ {"rt_sigqueueinfo", NULL, 0, 3}, /* 178 */
+ {"rt_sigsuspend", NULL, 0, 2}, /* 179 */
+ {"pread64", lx_pread32, 0, 5}, /* 180 */
+ {"pwrite64", lx_pwrite32, 0, 5}, /* 181 */
+ {"chown16", lx_chown16, 0, 3}, /* 182 */
+ {"getcwd", lx_getcwd, 0, 2}, /* 183 */
+ {"capget", NULL, 0, 2}, /* 184 */
+ {"capset", NULL, 0, 2}, /* 185 */
+ {"sigaltstack", NULL, 0, 2}, /* 186 */
+ {"sendfile", NULL, 0, 4}, /* 187 */
+ {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 188 */
+ {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 189 */
+ {"vfork", NULL, 0, 0}, /* 190 */
+ {"getrlimit", lx_getrlimit, 0, 2}, /* 191 */
+ {"mmap2", lx_mmap2, LX_SYS_EBPARG6, 6}, /* 192 */
+ {"truncate64", NULL, 0, 3}, /* 193 */
+ {"ftruncate64", NULL, 0, 3}, /* 194 */
+ {"stat64", lx_stat64, 0, 2}, /* 195 */
+ {"lstat64", lx_lstat64, 0, 2}, /* 196 */
+ {"fstat64", lx_fstat64, 0, 2}, /* 197 */
+ {"lchown", lx_lchown, 0, 3}, /* 198 */
+ {"getuid", lx_getuid, 0, 0}, /* 199 */
+ {"getgid", lx_getgid, 0, 0}, /* 200 */
+ {"geteuid", lx_geteuid, 0, 0}, /* 201 */
+ {"getegid", lx_getegid, 0, 0}, /* 202 */
+ {"setreuid", lx_setreuid, 0, 0}, /* 203 */
+ {"setregid", lx_setregid, 0, 0}, /* 204 */
+ {"getgroups", NULL, 0, 2}, /* 205 */
+ {"setgroups", NULL, 0, 2}, /* 206 */
+ {"fchown", lx_fchown, 0, 3}, /* 207 */
+ {"setresuid", lx_setresuid, 0, 3}, /* 208 */
+ {"getresuid", lx_getresuid, 0, 3}, /* 209 */
+ {"setresgid", lx_setresgid, 0, 3}, /* 210 */
+ {"getresgid", lx_getresgid, 0, 3}, /* 211 */
+ {"chown", lx_chown, 0, 3}, /* 212 */
+ {"setuid", lx_setuid, 0, 1}, /* 213 */
+ {"setgid", lx_setgid, 0, 1}, /* 214 */
+ {"setfsuid", lx_setfsuid, 0, 1}, /* 215 */
+ {"setfsgid", lx_setfsgid, 0, 1}, /* 216 */
+ {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 217 */
+ {"mincore", lx_mincore, 0, 3}, /* 218 */
+ {"madvise", lx_madvise, 0, 3}, /* 219 */
+ {"getdents64", lx_getdents64, 0, 3}, /* 220 */
+ {"fcntl64", lx_fcntl64, 0, 3}, /* 221 */
+ {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 222 */
+ {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 223 */
+ {"gettid", lx_gettid, 0, 0}, /* 224 */
+ {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 225 */
+ {"setxattr", lx_setxattr, 0, 5}, /* 226 */
+ {"lsetxattr", lx_lsetxattr, 0, 5}, /* 227 */
+ {"fsetxattr", lx_fsetxattr, 0, 5}, /* 228 */
+ {"getxattr", lx_getxattr, 0, 4}, /* 229 */
+ {"lgetxattr", lx_lgetxattr, 0, 4}, /* 230 */
+ {"fgetxattr", lx_fgetxattr, 0, 4}, /* 231 */
+ {"listxattr", lx_listxattr, 0, 3}, /* 232 */
+ {"llistxattr", lx_llistxattr, 0, 3}, /* 233 */
+ {"flistxattr", lx_flistxattr, 0, 3}, /* 234 */
+ {"removexattr", lx_removexattr, 0, 2}, /* 235 */
+ {"lremovexattr", lx_lremovexattr, 0, 2}, /* 236 */
+ {"fremovexattr", lx_fremovexattr, 0, 2}, /* 237 */
+ {"tkill", lx_tkill, 0, 2}, /* 238 */
+ {"sendfile64", NULL, 0, 4}, /* 239 */
+ {"futex", lx_futex, LX_SYS_EBPARG6, 6}, /* 240 */
+ {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 241 */
+ {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 242 */
+ {"set_thread_area", lx_set_thread_area, 0, 1}, /* 243 */
+ {"get_thread_area", lx_get_thread_area, 0, 1}, /* 244 */
+ {"io_setup", lx_io_setup, 0, 2}, /* 245 */
+ {"io_destroy", lx_io_destroy, 0, 1}, /* 246 */
+ {"io_getevents", lx_io_getevents, 0, 5}, /* 247 */
+ {"io_submit", lx_io_submit, 0, 3}, /* 248 */
+ {"io_cancel", lx_io_cancel, 0, 3}, /* 249 */
+ {"fadvise64", lx_fadvise64_32, 0, 5}, /* 250 */
+ {"nosys", NULL, 0, 0}, /* 251 */
+ {"group_exit", NULL, 0, 1}, /* 252 */
+ {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 253 */
+ {"epoll_create", lx_epoll_create, 0, 1}, /* 254 */
+ {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 255 */
+ {"epoll_wait", lx_epoll_wait, 0, 4}, /* 256 */
+ {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 257 */
+ {"set_tid_address", lx_set_tid_address, 0, 1}, /* 258 */
+ {"timer_create", lx_timer_create, 0, 3}, /* 259 */
+ {"timer_settime", NULL, 0, 4}, /* 260 */
+ {"timer_gettime", NULL, 0, 2}, /* 261 */
+ {"timer_getoverrun", NULL, 0, 1}, /* 262 */
+ {"timer_delete", NULL, 0, 1}, /* 263 */
+ {"clock_settime", lx_clock_settime, 0, 2}, /* 264 */
+ {"clock_gettime", lx_clock_gettime, 0, 2}, /* 265 */
+ {"clock_getres", lx_clock_getres, 0, 2}, /* 266 */
+ {"clock_nanosleep", NULL, 0, 4}, /* 267 */
+ {"statfs64", NULL, 0, 2}, /* 268 */
+ {"fstatfs64", NULL, 0, 2}, /* 269 */
+ {"tgkill", lx_tgkill, 0, 3}, /* 270 */
+
+/*
+ * The following system calls only exist in kernel 2.6 and greater:
+ */
+ {"utimes", NULL, 0, 2}, /* 271 */
+ {"fadvise64_64", lx_fadvise64_64, LX_SYS_EBPARG6, 6}, /* 272 */
+ {"vserver", NULL, NOSYS_NULL, 0}, /* 273 */
+ {"mbind", NULL, NOSYS_NULL, 0}, /* 274 */
+ {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 275 */
+ {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 276 */
+ {"mq_open", NULL, NOSYS_NULL, 0}, /* 277 */
+ {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 278 */
+ {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 279 */
+ {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 280 */
+ {"mq_notify", NULL, NOSYS_NULL, 0}, /* 281 */
+ {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 282 */
+ {"kexec_load", NULL, NOSYS_NULL, 0}, /* 283 */
+ {"waitid", lx_waitid, 0, 4}, /* 284 */
+ {"sys_setaltroot", NULL, NOSYS_NULL, 0}, /* 285 */
+ {"add_key", NULL, NOSYS_NULL, 0}, /* 286 */
+ {"request_key", NULL, NOSYS_NULL, 0}, /* 287 */
+ {"keyctl", NULL, NOSYS_NULL, 0}, /* 288 */
+ {"ioprio_set", lx_ioprio_set, 0, 3}, /* 289 */
+ {"ioprio_get", lx_ioprio_get, 0, 2}, /* 290 */
+ {"inotify_init", NULL, 0, 0}, /* 291 */
+ {"inotify_add_watch", NULL, 0, 3}, /* 292 */
+ {"inotify_rm_watch", NULL, 0, 2}, /* 293 */
+ {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 294 */
+ {"openat", lx_openat, 0, 4}, /* 295 */
+ {"mkdirat", lx_mkdirat, 0, 3}, /* 296 */
+ {"mknodat", NULL, 0, 4}, /* 297 */
+ {"fchownat", lx_fchownat, 0, 5}, /* 298 */
+ {"futimesat", NULL, 0, 3}, /* 299 */
+ {"fstatat64", lx_fstatat64, 0, 4}, /* 300 */
+ {"unlinkat", lx_unlinkat, 0, 3}, /* 301 */
+ {"renameat", lx_renameat, 0, 4}, /* 302 */
+ {"linkat", lx_linkat, 0, 5}, /* 303 */
+ {"symlinkat", lx_symlinkat, 0, 3}, /* 304 */
+ {"readlinkat", lx_readlinkat, 0, 4}, /* 305 */
+ {"fchmodat", lx_fchmodat, 0, 3}, /* 306 */
+ {"faccessat", lx_faccessat, 0, 4}, /* 307 */
+ {"pselect6", lx_pselect, LX_SYS_EBPARG6, 6}, /* 308 */
+ {"ppoll", lx_ppoll, 0, 5}, /* 309 */
+ {"unshare", lx_unshare, 0, 1}, /* 310 */
+ {"set_robust_list", lx_set_robust_list, 0, 2}, /* 311 */
+ {"get_robust_list", lx_get_robust_list, 0, 3}, /* 312 */
+ {"splice", lx_splice, LX_SYS_EBPARG6, 6}, /* 313 */
+ {"sync_file_range", lx_sync_file_range, 0, 4}, /* 314 */
+ {"tee", NULL, NOSYS_NULL, 0}, /* 315 */
+ {"vmsplice", NULL, NOSYS_NULL, 0}, /* 316 */
+ {"move_pages", NULL, NOSYS_NULL, 0}, /* 317 */
+ {"getcpu", lx_getcpu, 0, 3}, /* 318 */
+ {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 319 */
+ {"utimensat", NULL, 0, 4}, /* 320 */
+ {"signalfd", NULL, 0, 3}, /* 321 */
+ {"timerfd_create", NULL, 0, 2}, /* 322 */
+ {"eventfd", lx_eventfd, 0, 1}, /* 323 */
+ {"fallocate", lx_fallocate32, LX_SYS_EBPARG6, 6}, /* 324 */
+ {"timerfd_settime", NULL, 0, 4}, /* 325 */
+ {"timerfd_gettime", NULL, 0, 2}, /* 326 */
+ {"signalfd4", NULL, 0, 4}, /* 327 */
+ {"eventfd2", lx_eventfd2, 0, 2}, /* 328 */
+ {"epoll_create1", lx_epoll_create1, 0, 1}, /* 329 */
+ {"dup3", lx_dup3, 0, 3}, /* 330 */
+ {"pipe2", lx_pipe2, 0, 2}, /* 331 */
+ {"inotify_init1", NULL, 0, 1}, /* 332 */
+ {"preadv", lx_preadv32, 0, 5}, /* 333 */
+ {"pwritev", lx_pwritev32, 0, 5}, /* 334 */
+ {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 335 */
+ {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 336 */
+ {"recvmmsg", lx_recvmmsg, 0, 5}, /* 337 */
+ {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 338 */
+ {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 339 */
+ {"prlimit64", lx_prlimit64, 0, 4}, /* 340 */
+ {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 341 */
+ {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 342 */
+ {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 343 */
+ {"syncfs", lx_syncfs, 0, 1}, /* 344 */
+ {"sendmmsg", lx_sendmmsg, 0, 4}, /* 345 */
+ {"setns", NULL, NOSYS_NULL, 0}, /* 346 */
+ {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 347 */
+ {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 348 */
+ {"kcmp", NULL, NOSYS_NULL, 0}, /* 349 */
+ {"finit_module", NULL, NOSYS_NULL, 0}, /* 350 */
+ {"sched_setattr", lx_sched_setattr, 0, 3}, /* 351 */
+ {"sched_getattr", lx_sched_getattr, 0, 4}, /* 352 */
+ {"renameat2", NULL, NOSYS_NULL, 0}, /* 353 */
+ {"seccomp", NULL, NOSYS_NULL, 0}, /* 354 */
+ {"getrandom", lx_getrandom, 0, 3}, /* 355 */
+ {"memfd_create", NULL, NOSYS_NULL, 0}, /* 356 */
+ {"bpf", NULL, NOSYS_NULL, 0}, /* 357 */
+ {"execveat", NULL, NOSYS_NULL, 0}, /* 358 */
+};
+
+#if defined(_LP64)
+/*
+ * Linux defines system call numbers for 64-bit x86 in the file:
+ * arch/x86/syscalls/syscall_64.tbl
+ */
+lx_sysent_t lx_sysent64[] = {
+ {"read", lx_read, 0, 3}, /* 0 */
+ {"write", lx_write, 0, 3}, /* 1 */
+ {"open", lx_open, 0, 3}, /* 2 */
+ {"close", lx_close, 0, 1}, /* 3 */
+ {"stat", lx_stat64, 0, 2}, /* 4 */
+ {"fstat", lx_fstat64, 0, 2}, /* 5 */
+ {"lstat", lx_lstat64, 0, 2}, /* 6 */
+ {"poll", lx_poll, 0, 3}, /* 7 */
+ {"lseek", lx_lseek64, 0, 3}, /* 8 */
+ {"mmap", lx_mmap, 0, 6}, /* 9 */
+ {"mprotect", lx_mprotect, 0, 3}, /* 10 */
+ {"munmap", lx_munmap, 0, 2}, /* 11 */
+ {"brk", lx_brk, 0, 1}, /* 12 */
+ {"rt_sigaction", NULL, 0, 4}, /* 13 */
+ {"rt_sigprocmask", NULL, 0, 4}, /* 14 */
+ {"rt_sigreturn", NULL, 0, 0}, /* 15 */
+ {"ioctl", lx_ioctl, 0, 3}, /* 16 */
+ {"pread64", lx_pread, 0, 4}, /* 17 */
+ {"pwrite64", lx_pwrite, 0, 4}, /* 18 */
+ {"readv", lx_readv, 0, 3}, /* 19 */
+ {"writev", lx_writev, 0, 3}, /* 20 */
+ {"access", lx_access, 0, 2}, /* 21 */
+ {"pipe", lx_pipe, 0, 1}, /* 22 */
+ {"select", lx_select, 0, 5}, /* 23 */
+ {"sched_yield", lx_sched_yield, 0, 0}, /* 24 */
+ {"mremap", lx_mremap, 0, 5}, /* 25 */
+ {"msync", lx_msync, 0, 3}, /* 26 */
+ {"mincore", lx_mincore, 0, 3}, /* 27 */
+ {"madvise", lx_madvise, 0, 3}, /* 28 */
+ {"shmget", NULL, 0, 3}, /* 29 */
+ {"shmat", NULL, 0, 4}, /* 30 */
+ {"shmctl", NULL, 0, 3}, /* 31 */
+ {"dup", lx_dup, 0, 1}, /* 32 */
+ {"dup2", lx_dup2, 0, 2}, /* 33 */
+ {"pause", lx_pause, 0, 0}, /* 34 */
+ {"nanosleep", lx_nanosleep, 0, 2}, /* 35 */
+ {"getitimer", lx_getitimer, 0, 2}, /* 36 */
+ {"alarm", lx_alarm, 0, 1}, /* 37 */
+ {"setitimer", NULL, 0, 3}, /* 38 */
+ {"getpid", lx_getpid, 0, 0}, /* 39 */
+ {"sendfile", NULL, 0, 4}, /* 40 */
+ {"socket", lx_socket, 0, 3}, /* 41 */
+ {"connect", lx_connect, 0, 3}, /* 42 */
+ {"accept", lx_accept, 0, 3}, /* 43 */
+ {"sendto", lx_sendto, 0, 6}, /* 44 */
+ {"recvfrom", lx_recvfrom, 0, 6}, /* 45 */
+ {"sendmsg", lx_sendmsg, 0, 3}, /* 46 */
+ {"recvmsg", lx_recvmsg, 0, 3}, /* 47 */
+ {"shutdown", lx_shutdown, 0, 2}, /* 48 */
+ {"bind", lx_bind, 0, 3}, /* 49 */
+ {"listen", lx_listen, 0, 2}, /* 50 */
+ {"getsockname", lx_getsockname, 0, 3}, /* 51 */
+ {"getpeername", lx_getpeername, 0, 3}, /* 52 */
+ {"socketpair", lx_socketpair, 0, 4}, /* 53 */
+ {"setsockopt", lx_setsockopt, 0, 5}, /* 54 */
+ {"getsockopt", lx_getsockopt, 0, 5}, /* 55 */
+ {"clone", NULL, 0, 5}, /* 56 */
+ {"fork", NULL, 0, 0}, /* 57 */
+ {"vfork", NULL, 0, 0}, /* 58 */
+ {"execve", NULL, 0, 3}, /* 59 */
+ {"exit", NULL, 0, 1}, /* 60 */
+ {"wait4", lx_wait4, 0, 4}, /* 61 */
+ {"kill", lx_kill, 0, 2}, /* 62 */
+ {"uname", lx_uname, 0, 1}, /* 63 */
+ {"semget", NULL, 0, 3}, /* 64 */
+ {"semop", NULL, 0, 3}, /* 65 */
+ {"semctl", NULL, 0, 4}, /* 66 */
+ {"shmdt", NULL, 0, 1}, /* 67 */
+ {"msgget", NULL, 0, 2}, /* 68 */
+ {"msgsnd", NULL, 0, 4}, /* 69 */
+ {"msgrcv", NULL, 0, 5}, /* 70 */
+ {"msgctl", NULL, 0, 3}, /* 71 */
+ {"fcntl", lx_fcntl64, 0, 3}, /* 72 */
+ {"flock", lx_flock, 0, 2}, /* 73 */
+ {"fsync", NULL, 0, 1}, /* 74 */
+ {"fdatasync", NULL, 0, 1}, /* 75 */
+ {"truncate", NULL, 0, 2}, /* 76 */
+ {"ftruncate", NULL, 0, 2}, /* 77 */
+ {"getdents", lx_getdents_64, 0, 3}, /* 78 */
+ {"getcwd", lx_getcwd, 0, 2}, /* 79 */
+ {"chdir", lx_chdir, 0, 1}, /* 80 */
+ {"fchdir", lx_fchdir, 0, 1}, /* 81 */
+ {"rename", lx_rename, 0, 2}, /* 82 */
+ {"mkdir", lx_mkdir, 0, 2}, /* 83 */
+ {"rmdir", NULL, 0, 1}, /* 84 */
+ {"creat", lx_creat, 0, 2}, /* 85 */
+ {"link", lx_link, 0, 2}, /* 86 */
+ {"unlink", lx_unlink, 0, 1}, /* 87 */
+ {"symlink", lx_symlink, 0, 2}, /* 88 */
+ {"readlink", lx_readlink, 0, 3}, /* 89 */
+ {"chmod", lx_chmod, 0, 2}, /* 90 */
+ {"fchmod", lx_fchmod, 0, 2}, /* 91 */
+ {"chown", lx_chown, 0, 3}, /* 92 */
+ {"fchown", lx_fchown, 0, 3}, /* 93 */
+ {"lchown", lx_lchown, 0, 3}, /* 94 */
+ {"umask", lx_umask, 0, 1}, /* 95 */
+ {"gettimeofday", lx_gettimeofday, 0, 2}, /* 96 */
+ {"getrlimit", lx_getrlimit, 0, 2}, /* 97 */
+ {"getrusage", lx_getrusage, 0, 2}, /* 98 */
+ {"sysinfo", lx_sysinfo64, 0, 1}, /* 99 */
+ {"times", lx_times, 0, 1}, /* 100 */
+ {"ptrace", lx_ptrace, 0, 4}, /* 101 */
+ {"getuid", lx_getuid, 0, 0}, /* 102 */
+ {"syslog", lx_syslog, 0, 3}, /* 103 */
+ {"getgid", lx_getgid, 0, 0}, /* 104 */
+ {"setuid", lx_setuid, 0, 1}, /* 105 */
+ {"setgid", lx_setgid, 0, 1}, /* 106 */
+ {"geteuid", lx_geteuid, 0, 0}, /* 107 */
+ {"getegid", lx_getegid, 0, 0}, /* 108 */
+ {"setpgid", lx_setpgid, 0, 2}, /* 109 */
+ {"getppid", lx_getppid, 0, 0}, /* 110 */
+ {"getpgrp", lx_getpgrp, 0, 0}, /* 111 */
+ {"setsid", lx_setsid, 0, 0}, /* 112 */
+ {"setreuid", lx_setreuid, 0, 0}, /* 113 */
+ {"setregid", lx_setregid, 0, 0}, /* 114 */
+ {"getgroups", NULL, 0, 2}, /* 115 */
+ {"setgroups", NULL, 0, 2}, /* 116 */
+ {"setresuid", lx_setresuid, 0, 3}, /* 117 */
+ {"getresuid", lx_getresuid, 0, 3}, /* 118 */
+ {"setresgid", lx_setresgid, 0, 3}, /* 119 */
+ {"getresgid", lx_getresgid, 0, 3}, /* 120 */
+ {"getpgid", lx_getpgid, 0, 1}, /* 121 */
+ {"setfsuid", lx_setfsuid, 0, 1}, /* 122 */
+ {"setfsgid", lx_setfsgid, 0, 1}, /* 123 */
+ {"getsid", lx_getsid, 0, 1}, /* 124 */
+ {"capget", NULL, 0, 2}, /* 125 */
+ {"capset", NULL, 0, 2}, /* 126 */
+ {"rt_sigpending", NULL, 0, 2}, /* 127 */
+ {"rt_sigtimedwait", NULL, 0, 4}, /* 128 */
+ {"rt_sigqueueinfo", NULL, 0, 3}, /* 129 */
+ {"rt_sigsuspend", NULL, 0, 2}, /* 130 */
+ {"sigaltstack", NULL, 0, 2}, /* 131 */
+ {"utime", NULL, 0, 2}, /* 132 */
+ {"mknod", NULL, 0, 3}, /* 133 */
+ {"uselib", NULL, NOSYS_KERNEL, 0}, /* 134 */
+ {"personality", lx_personality, 0, 1}, /* 135 */
+ {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 136 */
+ {"statfs", NULL, 0, 2}, /* 137 */
+ {"fstatfs", NULL, 0, 2}, /* 138 */
+ {"sysfs", NULL, 0, 3}, /* 139 */
+ {"getpriority", lx_getpriority, 0, 2}, /* 140 */
+ {"setpriority", lx_setpriority, 0, 3}, /* 141 */
+ {"sched_setparam", lx_sched_setparam, 0, 2}, /* 142 */
+ {"sched_getparam", lx_sched_getparam, 0, 2}, /* 143 */
+ {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 144 */
+ {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 145 */
+ {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 146 */
+ {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 147 */
+ {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 148 */
+ {"mlock", lx_mlock, 0, 2}, /* 149 */
+ {"munlock", lx_munlock, 0, 2}, /* 150 */
+ {"mlockall", lx_mlockall, 0, 1}, /* 151 */
+ {"munlockall", lx_munlockall, 0, 0}, /* 152 */
+ {"vhangup", lx_vhangup, 0, 0}, /* 153 */
+ {"modify_ldt", lx_modify_ldt, 0, 3}, /* 154 */
+ {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 155 */
+ {"sysctl", NULL, 0, 1}, /* 156 */
+ {"prctl", lx_prctl, 0, 5}, /* 157 */
+ {"arch_prctl", lx_arch_prctl, 0, 2}, /* 158 */
+ {"adjtimex", NULL, 0, 1}, /* 159 */
+ {"setrlimit", lx_setrlimit, 0, 2}, /* 160 */
+ {"chroot", lx_chroot, 0, 1}, /* 161 */
+ {"sync", lx_sync, 0, 0}, /* 162 */
+ {"acct", lx_acct, 0, 1}, /* 163 */
+ {"settimeofday", NULL, 0, 2}, /* 164 */
+ {"mount", lx_mount, 0, 5}, /* 165 */
+ {"umount2", lx_umount2, 0, 2}, /* 166 */
+ {"swapon", lx_swapon, 0, 2}, /* 167 */
+ {"swapoff", lx_swapoff, 0, 1}, /* 168 */
+ {"reboot", lx_reboot, 0, 4}, /* 169 */
+ {"sethostname", lx_sethostname, 0, 2}, /* 170 */
+ {"setdomainname", lx_setdomainname, 0, 2}, /* 171 */
+ {"iopl", NULL, NOSYS_NO_EQUIV, 0}, /* 172 */
+ {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 173 */
+ {"create_module", NULL, NOSYS_KERNEL, 0}, /* 174 */
+ {"init_module", NULL, NOSYS_KERNEL, 0}, /* 175 */
+ {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 176 */
+ {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 177 */
+ {"query_module", NULL, 0, 5}, /* 178 */
+ {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 179 */
+ {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 180 */
+ {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 181 */
+ {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 182 */
+ {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 183 */
+ {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 184 */
+ {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 185 */
+ {"gettid", lx_gettid, 0, 0}, /* 186 */
+ {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 187 */
+ {"setxattr", lx_setxattr, 0, 5}, /* 188 */
+ {"lsetxattr", lx_lsetxattr, 0, 5}, /* 189 */
+ {"fsetxattr", lx_fsetxattr, 0, 5}, /* 190 */
+ {"getxattr", lx_getxattr, 0, 4}, /* 191 */
+ {"lgetxattr", lx_lgetxattr, 0, 4}, /* 192 */
+ {"fgetxattr", lx_fgetxattr, 0, 4}, /* 193 */
+ {"listxattr", lx_listxattr, 0, 3}, /* 194 */
+ {"llistxattr", lx_llistxattr, 0, 3}, /* 195 */
+ {"flistxattr", lx_flistxattr, 0, 3}, /* 196 */
+ {"removexattr", lx_removexattr, 0, 2}, /* 197 */
+ {"lremovexattr", lx_lremovexattr, 0, 2}, /* 198 */
+ {"fremovexattr", lx_fremovexattr, 0, 2}, /* 199 */
+ {"tkill", lx_tkill, 0, 2}, /* 200 */
+ {"time", lx_time, 0, 1}, /* 201 */
+ {"futex", lx_futex, 0, 6}, /* 202 */
+ {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 203 */
+ {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 204 */
+ {"set_thread_area", lx_set_thread_area, 0, 1}, /* 205 */
+ {"io_setup", lx_io_setup, 0, 2}, /* 206 */
+ {"io_destroy", lx_io_destroy, 0, 1}, /* 207 */
+ {"io_getevents", lx_io_getevents, 0, 5}, /* 208 */
+ {"io_submit", lx_io_submit, 0, 3}, /* 209 */
+ {"io_cancel", lx_io_cancel, 0, 3}, /* 210 */
+ {"get_thread_area", lx_get_thread_area, 0, 1}, /* 211 */
+ {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 212 */
+ {"epoll_create", lx_epoll_create, 0, 1}, /* 213 */
+ {"epoll_ctl_old", NULL, NOSYS_NULL, 0}, /* 214 */
+ {"epoll_wait_old", NULL, NOSYS_NULL, 0}, /* 215 */
+ {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 216 */
+ {"getdents64", lx_getdents64, 0, 3}, /* 217 */
+ {"set_tid_address", lx_set_tid_address, 0, 1}, /* 218 */
+ {"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */
+ {"semtimedop", NULL, 0, 4}, /* 220 */
+ {"fadvise64", lx_fadvise64, 0, 4}, /* 221 */
+ {"timer_create", lx_timer_create, 0, 3}, /* 222 */
+ {"timer_settime", NULL, 0, 4}, /* 223 */
+ {"timer_gettime", NULL, 0, 2}, /* 224 */
+ {"timer_getoverrun", NULL, 0, 1}, /* 225 */
+ {"timer_delete", NULL, 0, 1}, /* 226 */
+ {"clock_settime", lx_clock_settime, 0, 2}, /* 227 */
+ {"clock_gettime", lx_clock_gettime, 0, 2}, /* 228 */
+ {"clock_getres", lx_clock_getres, 0, 2}, /* 229 */
+ {"clock_nanosleep", NULL, 0, 4}, /* 230 */
+ {"exit_group", NULL, 0, 1}, /* 231 */
+ {"epoll_wait", lx_epoll_wait, 0, 4}, /* 232 */
+ {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 233 */
+ {"tgkill", lx_tgkill, 0, 3}, /* 234 */
+ {"utimes", NULL, 0, 2}, /* 235 */
+ {"vserver", NULL, NOSYS_NULL, 0}, /* 236 */
+ {"mbind", NULL, NOSYS_NULL, 0}, /* 237 */
+ {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 238 */
+ {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 239 */
+ {"mq_open", NULL, NOSYS_NULL, 0}, /* 240 */
+ {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 241 */
+ {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 242 */
+ {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 243 */
+ {"mq_notify", NULL, NOSYS_NULL, 0}, /* 244 */
+ {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 245 */
+ {"kexec_load", NULL, NOSYS_NULL, 0}, /* 246 */
+ {"waitid", lx_waitid, 0, 4}, /* 247 */
+ {"add_key", NULL, NOSYS_NULL, 0}, /* 248 */
+ {"request_key", NULL, NOSYS_NULL, 0}, /* 249 */
+ {"keyctl", NULL, NOSYS_NULL, 0}, /* 250 */
+ {"ioprio_set", lx_ioprio_set, 0, 3}, /* 251 */
+ {"ioprio_get", lx_ioprio_get, 0, 2}, /* 252 */
+ {"inotify_init", NULL, 0, 0}, /* 253 */
+ {"inotify_add_watch", NULL, 0, 3}, /* 254 */
+ {"inotify_rm_watch", NULL, 0, 2}, /* 255 */
+ {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 256 */
+ {"openat", lx_openat, 0, 4}, /* 257 */
+ {"mkdirat", lx_mkdirat, 0, 3}, /* 258 */
+ {"mknodat", NULL, 0, 4}, /* 259 */
+ {"fchownat", lx_fchownat, 0, 5}, /* 260 */
+ {"futimesat", NULL, 0, 3}, /* 261 */
+ {"fstatat64", lx_fstatat64, 0, 4}, /* 262 */
+ {"unlinkat", lx_unlinkat, 0, 3}, /* 263 */
+ {"renameat", lx_renameat, 0, 4}, /* 264 */
+ {"linkat", lx_linkat, 0, 5}, /* 265 */
+ {"symlinkat", lx_symlinkat, 0, 3}, /* 266 */
+ {"readlinkat", lx_readlinkat, 0, 4}, /* 267 */
+ {"fchmodat", lx_fchmodat, 0, 3}, /* 268 */
+ {"faccessat", lx_faccessat, 0, 4}, /* 269 */
+ {"pselect6", lx_pselect, 0, 6}, /* 270 */
+ {"ppoll", lx_ppoll, 0, 5}, /* 271 */
+ {"unshare", lx_unshare, 0, 1}, /* 272 */
+ {"set_robust_list", lx_set_robust_list, 0, 2}, /* 273 */
+ {"get_robust_list", lx_get_robust_list, 0, 3}, /* 274 */
+ {"splice", lx_splice, 0, 6}, /* 275 */
+ {"tee", NULL, NOSYS_NULL, 0}, /* 276 */
+ {"sync_file_range", lx_sync_file_range, 0, 4}, /* 277 */
+ {"vmsplice", NULL, NOSYS_NULL, 0}, /* 278 */
+ {"move_pages", NULL, NOSYS_NULL, 0}, /* 279 */
+ {"utimensat", NULL, 0, 4}, /* 280 */
+ {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 281 */
+ {"signalfd", NULL, 0, 3}, /* 282 */
+ {"timerfd_create", NULL, 0, 2}, /* 283 */
+ {"eventfd", lx_eventfd, 0, 1}, /* 284 */
+ {"fallocate", lx_fallocate, 0, 4}, /* 285 */
+ {"timerfd_settime", NULL, 0, 4}, /* 286 */
+ {"timerfd_gettime", NULL, 0, 2}, /* 287 */
+ {"accept4", lx_accept4, 0, 4}, /* 288 */
+ {"signalfd4", NULL, 0, 4}, /* 289 */
+ {"eventfd2", lx_eventfd2, 0, 2}, /* 290 */
+ {"epoll_create1", lx_epoll_create1, 0, 1}, /* 291 */
+ {"dup3", lx_dup3, 0, 3}, /* 292 */
+ {"pipe2", lx_pipe2, 0, 2}, /* 293 */
+ {"inotify_init1", NULL, 0, 1}, /* 294 */
+ {"preadv", lx_preadv, 0, 4}, /* 295 */
+ {"pwritev", lx_pwritev, 0, 4}, /* 296 */
+ {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 297 */
+ {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 298 */
+ {"recvmmsg", lx_recvmmsg, 0, 5}, /* 299 */
+ {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 300 */
+ {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 301 */
+ {"prlimit64", lx_prlimit64, 0, 4}, /* 302 */
+ {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 303 */
+ {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 304 */
+ {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 305 */
+ {"syncfs", lx_syncfs, 0, 1}, /* 306 */
+ {"sendmmsg", lx_sendmmsg, 0, 4}, /* 307 */
+	{"setns",	NULL,	NOSYS_NULL,	0},		/* 308 */
+ {"getcpu", lx_getcpu, 0, 3}, /* 309 */
+ {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 310 */
+ {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 311 */
+ {"kcmp", NULL, NOSYS_NULL, 0}, /* 312 */
+ {"finit_module", NULL, NOSYS_NULL, 0}, /* 313 */
+ {"sched_setattr", lx_sched_setattr, 0, 3}, /* 314 */
+ {"sched_getattr", lx_sched_getattr, 0, 4}, /* 315 */
+ {"renameat2", NULL, NOSYS_NULL, 0}, /* 316 */
+ {"seccomp", NULL, NOSYS_NULL, 0}, /* 317 */
+ {"getrandom", lx_getrandom, 0, 3}, /* 318 */
+ {"memfd_create", NULL, NOSYS_NULL, 0}, /* 319 */
+ {"kexec_file_load", NULL, NOSYS_NULL, 0}, /* 320 */
+ {"bpf", NULL, NOSYS_NULL, 0}, /* 321 */
+ {"execveat", NULL, NOSYS_NULL, 0}, /* 322 */
+
+ /* XXX TBD gap then x32 syscalls from 512 - 544 */
+};
+#endif
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_proc.h b/usr/src/uts/common/brand/lx/procfs/lx_proc.h
new file mode 100644
index 0000000000..ad86667997
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_proc.h
@@ -0,0 +1,378 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#ifndef _LX_PROC_H
+#define _LX_PROC_H
+
+#ifdef _LXPROC_NATIVE_H
+#error Attempted to include branded lx_proc.h after native lxproc.h
+#endif
+
+#define _LXPROC_BRANDED_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * lx_proc.h: declarations, data structures and macros for lxprocfs
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/user.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/dnlc.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/nvpair.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+
+/*
+ * Convert a vnode into an lxpr_mnt_t
+ */
+#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data)
+
+/*
+ * convert a vnode into an lxpr_node
+ */
+#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data)
+
+/*
+ * convert an lxpr_node into a vnode
+ */
+#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode)
+
+/*
+ * convert an lxpr_node into the zone for the fs
+ */
+#define LXPTOZ(lxpnp) \
+ (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone)
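+
+/*
+ * For example, a handler that needs the mount and zone for a node can
+ * chain these macros together (an illustrative sketch, not a required
+ * idiom):
+ *
+ *	lxpr_node_t *lxpnp = VTOLXP(vp);
+ *	lxpr_mnt_t *mnt = VTOLXPM(LXPTOV(lxpnp));
+ *	zone_t *zone = LXPTOZ(lxpnp);
+ */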
+
+#define LXPNSIZ 256 /* max size of lx /proc file name entries */
+
+/*
+ * Pretend that a directory entry takes 16 bytes
+ */
+#define LXPR_SDSIZE 16
+
+/*
+ * Node/file types for lx /proc files
+ * (directories and files contained therein).
+ */
+typedef enum lxpr_nodetype {
+ LXPR_INVALID, /* nodes start at 1 */
+ LXPR_PROCDIR, /* /proc */
+ LXPR_PIDDIR, /* /proc/<pid> */
+ LXPR_PID_AUXV, /* /proc/<pid>/auxv */
+ LXPR_PID_CGROUP, /* /proc/<pid>/cgroup */
+ LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */
+ LXPR_PID_COMM, /* /proc/<pid>/comm */
+ LXPR_PID_CPU, /* /proc/<pid>/cpu */
+ LXPR_PID_CURDIR, /* /proc/<pid>/cwd */
+ LXPR_PID_ENV, /* /proc/<pid>/environ */
+ LXPR_PID_EXE, /* /proc/<pid>/exe */
+ LXPR_PID_GIDMAP, /* /proc/<pid>/gid_map */
+ LXPR_PID_LIMITS, /* /proc/<pid>/limits */
+ LXPR_PID_LOGINUID, /* /proc/<pid>/loginuid */
+ LXPR_PID_MAPS, /* /proc/<pid>/maps */
+ LXPR_PID_MEM, /* /proc/<pid>/mem */
+ LXPR_PID_MOUNTINFO, /* /proc/<pid>/mountinfo */
+ LXPR_PID_MOUNTS, /* /proc/<pid>/mounts */
+ LXPR_PID_OOM_SCR_ADJ, /* /proc/<pid>/oom_score_adj */
+ LXPR_PID_PERSONALITY, /* /proc/<pid>/personality */
+ LXPR_PID_ROOTDIR, /* /proc/<pid>/root */
+ LXPR_PID_STAT, /* /proc/<pid>/stat */
+ LXPR_PID_STATM, /* /proc/<pid>/statm */
+ LXPR_PID_STATUS, /* /proc/<pid>/status */
+ LXPR_PID_TASKDIR, /* /proc/<pid>/task */
+ LXPR_PID_TASK_IDDIR, /* /proc/<pid>/task/<tid> */
+ LXPR_PID_FDDIR, /* /proc/<pid>/fd */
+ LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */
+ LXPR_PID_UIDMAP, /* /proc/<pid>/uid_map */
+ LXPR_PID_TID_AUXV, /* /proc/<pid>/task/<tid>/auxv */
+ LXPR_PID_TID_CGROUP, /* /proc/<pid>/task/<tid>/cgroup */
+ LXPR_PID_TID_CMDLINE, /* /proc/<pid>/task/<tid>/cmdline */
+ LXPR_PID_TID_COMM, /* /proc/<pid>/task/<tid>/comm */
+ LXPR_PID_TID_CPU, /* /proc/<pid>/task/<tid>/cpu */
+ LXPR_PID_TID_CURDIR, /* /proc/<pid>/task/<tid>/cwd */
+ LXPR_PID_TID_ENV, /* /proc/<pid>/task/<tid>/environ */
+ LXPR_PID_TID_EXE, /* /proc/<pid>/task/<tid>/exe */
+ LXPR_PID_TID_GIDMAP, /* /proc/<pid>/task/<tid>/gid_map */
+ LXPR_PID_TID_LIMITS, /* /proc/<pid>/task/<tid>/limits */
+ LXPR_PID_TID_LOGINUID, /* /proc/<pid>/task/<tid>/loginuid */
+ LXPR_PID_TID_MAPS, /* /proc/<pid>/task/<tid>/maps */
+ LXPR_PID_TID_MEM, /* /proc/<pid>/task/<tid>/mem */
+ LXPR_PID_TID_MOUNTINFO, /* /proc/<pid>/task/<tid>/mountinfo */
+ LXPR_PID_TID_OOM_SCR_ADJ, /* /proc/<pid>/task/<tid>/oom_score_adj */
+ LXPR_PID_TID_PERSONALITY, /* /proc/<pid>/task/<tid>/personality */
+ LXPR_PID_TID_ROOTDIR, /* /proc/<pid>/task/<tid>/root */
+ LXPR_PID_TID_STAT, /* /proc/<pid>/task/<tid>/stat */
+ LXPR_PID_TID_STATM, /* /proc/<pid>/task/<tid>/statm */
+ LXPR_PID_TID_STATUS, /* /proc/<pid>/task/<tid>/status */
+ LXPR_PID_TID_FDDIR, /* /proc/<pid>/task/<tid>/fd */
+ LXPR_PID_TID_FD_FD, /* /proc/<pid>/task/<tid>/fd/nn */
+ LXPR_PID_TID_UIDMAP, /* /proc/<pid>/task/<tid>/uid_map */
+ LXPR_CGROUPS, /* /proc/cgroups */
+ LXPR_CMDLINE, /* /proc/cmdline */
+ LXPR_CPUINFO, /* /proc/cpuinfo */
+ LXPR_DEVICES, /* /proc/devices */
+ LXPR_DISKSTATS, /* /proc/diskstats */
+ LXPR_DMA, /* /proc/dma */
+ LXPR_FILESYSTEMS, /* /proc/filesystems */
+ LXPR_INTERRUPTS, /* /proc/interrupts */
+ LXPR_IOPORTS, /* /proc/ioports */
+ LXPR_KCORE, /* /proc/kcore */
+ LXPR_KMSG, /* /proc/kmsg */
+ LXPR_LOADAVG, /* /proc/loadavg */
+ LXPR_MEMINFO, /* /proc/meminfo */
+ LXPR_MODULES, /* /proc/modules */
+ LXPR_MOUNTS, /* /proc/mounts */
+ LXPR_NETDIR, /* /proc/net */
+ LXPR_NET_ARP, /* /proc/net/arp */
+ LXPR_NET_DEV, /* /proc/net/dev */
+ LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */
+ LXPR_NET_IF_INET6, /* /proc/net/if_inet6 */
+ LXPR_NET_IGMP, /* /proc/net/igmp */
+ LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */
+ LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */
+ LXPR_NET_IPV6_ROUTE, /* /proc/net/ipv6_route */
+ LXPR_NET_MCFILTER, /* /proc/net/mcfilter */
+ LXPR_NET_NETSTAT, /* /proc/net/netstat */
+ LXPR_NET_RAW, /* /proc/net/raw */
+ LXPR_NET_ROUTE, /* /proc/net/route */
+ LXPR_NET_RPC, /* /proc/net/rpc */
+ LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */
+ LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */
+ LXPR_NET_SNMP, /* /proc/net/snmp */
+ LXPR_NET_STAT, /* /proc/net/stat */
+ LXPR_NET_TCP, /* /proc/net/tcp */
+ LXPR_NET_TCP6, /* /proc/net/tcp6 */
+ LXPR_NET_UDP, /* /proc/net/udp */
+ LXPR_NET_UDP6, /* /proc/net/udp6 */
+ LXPR_NET_UNIX, /* /proc/net/unix */
+ LXPR_PARTITIONS, /* /proc/partitions */
+ LXPR_SELF, /* /proc/self */
+ LXPR_STAT, /* /proc/stat */
+ LXPR_SWAPS, /* /proc/swaps */
+ LXPR_SYSDIR, /* /proc/sys/ */
+ LXPR_SYS_FSDIR, /* /proc/sys/fs/ */
+ LXPR_SYS_FS_AIO_MAX_NR, /* /proc/sys/fs/aio-max-nr */
+ LXPR_SYS_FS_AIO_NR, /* /proc/sys/fs/aio-nr */
+ LXPR_SYS_FS_FILEMAX, /* /proc/sys/fs/file-max */
+ LXPR_SYS_FS_FILENR, /* /proc/sys/fs/file-nr */
+ LXPR_SYS_FS_INOTIFYDIR, /* /proc/sys/fs/inotify */
+ LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS, /* inotify/max_queued_events */
+ LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES, /* inotify/max_user_instances */
+ LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES, /* inotify/max_user_watches */
+ LXPR_SYS_FS_PIPE_MAX, /* /proc/sys/fs/pipe-max-size */
+ LXPR_SYS_KERNELDIR, /* /proc/sys/kernel/ */
+ LXPR_SYS_KERNEL_CAPLCAP, /* /proc/sys/kernel/cap_last_cap */
+ LXPR_SYS_KERNEL_COREPATT, /* /proc/sys/kernel/core_pattern */
+ LXPR_SYS_KERNEL_HOSTNAME, /* /proc/sys/kernel/hostname */
+ LXPR_SYS_KERNEL_MSGMAX, /* /proc/sys/kernel/msgmax */
+ LXPR_SYS_KERNEL_MSGMNB, /* /proc/sys/kernel/msgmnb */
+ LXPR_SYS_KERNEL_MSGMNI, /* /proc/sys/kernel/msgmni */
+ LXPR_SYS_KERNEL_NGROUPS_MAX, /* /proc/sys/kernel/ngroups_max */
+ LXPR_SYS_KERNEL_OSREL, /* /proc/sys/kernel/osrelease */
+ LXPR_SYS_KERNEL_PID_MAX, /* /proc/sys/kernel/pid_max */
+ LXPR_SYS_KERNEL_RANDDIR, /* /proc/sys/kernel/random */
+ LXPR_SYS_KERNEL_RAND_BOOTID, /* /proc/sys/kernel/random/boot_id */
+ LXPR_SYS_KERNEL_RAND_ENTAVL, /* /proc/sys/kernel/random/entropy_avail */
+ LXPR_SYS_KERNEL_SEM, /* /proc/sys/kernel/sem */
+ LXPR_SYS_KERNEL_SHMALL, /* /proc/sys/kernel/shmall */
+ LXPR_SYS_KERNEL_SHMMAX, /* /proc/sys/kernel/shmmax */
+ LXPR_SYS_KERNEL_SHMMNI, /* /proc/sys/kernel/shmmni */
+ LXPR_SYS_KERNEL_THREADS_MAX, /* /proc/sys/kernel/threads-max */
+ LXPR_SYS_NETDIR, /* /proc/sys/net */
+ LXPR_SYS_NET_COREDIR, /* /proc/sys/net/core */
+ LXPR_SYS_NET_CORE_SOMAXCON, /* /proc/sys/net/core/somaxconn */
+ LXPR_SYS_NET_IPV4DIR, /* /proc/sys/net/ipv4 */
+ LXPR_SYS_NET_IPV4_ICMP_EIB, /* .../icmp_echo_ignore_broadcasts */
+ LXPR_SYS_NET_IPV4_IP_FORWARD, /* .../net/ipv4/ip_forward */
+ LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, /* .../net/ipv4/ip_local_port_range */
+ LXPR_SYS_NET_IPV4_TCP_FIN_TO, /* /proc/sys/net/ipv4/tcp_fin_timeout */
+ LXPR_SYS_NET_IPV4_TCP_KA_INT, /* .../net/ipv4/tcp_keepalive_intvl */
+ LXPR_SYS_NET_IPV4_TCP_KA_TIM, /* .../net/ipv4/tcp_keepalive_time */
+ LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL, /* .../net/ipv4/tcp_max_syn_backlog */
+ LXPR_SYS_NET_IPV4_TCP_RETRY2, /* /proc/sys/net/ipv4/tcp_retries2 */
+ LXPR_SYS_NET_IPV4_TCP_RMEM, /* /proc/sys/net/ipv4/tcp_rmem */
+ LXPR_SYS_NET_IPV4_TCP_SACK, /* /proc/sys/net/ipv4/tcp_sack */
+ LXPR_SYS_NET_IPV4_TCP_WINSCALE, /* .../net/ipv4/tcp_window_scaling */
+ LXPR_SYS_NET_IPV4_TCP_WMEM, /* /proc/sys/net/ipv4/tcp_wmem */
+ LXPR_SYS_VMDIR, /* /proc/sys/vm */
+ LXPR_SYS_VM_DIRTY_BG_BYTES, /* .../vm/dirty_background_bytes */
+ LXPR_SYS_VM_DIRTY_BG_RATIO, /* .../vm/dirty_background_ratio */
+ LXPR_SYS_VM_DIRTY_BYTES, /* /proc/sys/vm/dirty_bytes */
+ LXPR_SYS_VM_DIRTY_EXP_CS, /* .../vm/dirty_expire_centisecs */
+ LXPR_SYS_VM_DIRTY_RATIO, /* /proc/sys/vm/dirty_ratio */
+ LXPR_SYS_VM_DIRTYTIME_EXP_SEC, /* .../vm/dirtytime_expire_seconds */
+ LXPR_SYS_VM_DIRTY_WB_CS, /* .../vm/dirty_writeback_centisecs */
+ LXPR_SYS_VM_MAX_MAP_CNT, /* /proc/sys/vm/max_map_count */
+ LXPR_SYS_VM_MINFR_KB, /* /proc/sys/vm/min_free_kbytes */
+ LXPR_SYS_VM_NHUGEP, /* /proc/sys/vm/nr_hugepages */
+ LXPR_SYS_VM_OVERCOMMIT_MEM, /* /proc/sys/vm/overcommit_memory */
+ LXPR_SYS_VM_SWAPPINESS, /* /proc/sys/vm/swappiness */
+ LXPR_UPTIME, /* /proc/uptime */
+ LXPR_VERSION, /* /proc/version */
+ LXPR_VMSTAT, /* /proc/vmstat */
+ LXPR_NFILES /* number of lx /proc file types */
+} lxpr_nodetype_t;
+
+
+/*
+ * Number of fds allowed for in the inode number calculation
+ * per process (if a process has more fds than this, inode numbers
+ * may be duplicated)
+ */
+#define LXPR_FD_PERPROC 2000
+
+/*
+ * Linux sector size for /proc/diskstats
+ */
+#define LXPR_SECTOR_SIZE 512
+
+/*
+ * external dirent characteristics
+ */
+typedef struct {
+ lxpr_nodetype_t d_type;
+ char *d_name;
+} lxpr_dirent_t;
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to v_data in the vnode structure
+ */
+typedef struct lxpr_node {
+ lxpr_nodetype_t lxpr_type; /* type of this node */
+ vnode_t *lxpr_vnode; /* vnode for the node */
+ vnode_t *lxpr_parent; /* parent directory */
+ vnode_t *lxpr_realvp; /* real vnode, file in dirs */
+ timestruc_t lxpr_time; /* creation etc time for file */
+ mode_t lxpr_mode; /* file mode bits */
+ uid_t lxpr_uid; /* file owner */
+ gid_t lxpr_gid; /* file group owner */
+ pid_t lxpr_pid; /* pid of proc referred to */
+ uint_t lxpr_desc; /* addl. descriptor (fd or tid) */
+ ino_t lxpr_ino; /* node id */
+} lxpr_node_t;
+
+struct zone; /* forward declaration */
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to vfs_data in the vfs structure
+ */
+typedef struct lxpr_mnt {
+ lxpr_node_t *lxprm_node; /* node at root of proc mount */
+ struct zone *lxprm_zone; /* zone for this mount */
+ ldi_ident_t lxprm_li; /* ident for ldi */
+} lxpr_mnt_t;
+
+extern vnodeops_t *lxpr_vnodeops;
+extern int nproc_highbit; /* highbit(v.v_proc) */
+
+typedef struct mounta mounta_t;
+
+extern void lxpr_initnodecache();
+extern void lxpr_fininodecache();
+extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *);
+extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int);
+extern ino_t lxpr_parentinode(lxpr_node_t *);
+extern boolean_t lxpr_is_writable(lxpr_nodetype_t);
+extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int);
+extern void lxpr_freenode(lxpr_node_t *);
+extern vnode_t *lxpr_lookup_fdnode(vnode_t *, const char *);
+extern int lxpr_readlink_fdnode(lxpr_node_t *, char *, size_t);
+
+typedef struct lxpr_uiobuf {
+ uio_t *uiop;
+ char *buffer;
+ uint32_t buffsize;
+ char *pos;
+ size_t beg;
+ int error;
+} lxpr_uiobuf_t;
+
+extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *);
+extern void lxpr_uiobuf_free(lxpr_uiobuf_t *);
+extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t);
+extern boolean_t lxpr_uiobuf_nonblock(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t);
+extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...);
+extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int);
+
+extern int lxpr_core_path_l2s(const char *, char *, size_t);
+extern int lxpr_core_path_s2l(const char *, char *, size_t);
+
+typedef enum lxpr_zombok {
+ NO_ZOMB = 0,
+ ZOMB_OK
+} zombok_t;
+
+extern proc_t *lxpr_lock(lxpr_node_t *, zombok_t);
+extern proc_t *lxpr_lock_pid(lxpr_node_t *, pid_t, zombok_t, kthread_t **);
+extern void lxpr_unlock(proc_t *);
+extern netstack_t *lxpr_netstack(lxpr_node_t *);
+extern void lxpr_fixpid(zone_t *, proc_t *, pid_t *, pid_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifndef islower
+#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z'))
+#endif
+#ifndef toupper
+#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x))
+#endif
+
+#endif /* _LX_PROC_H */
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c
new file mode 100644
index 0000000000..07dc432329
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c
@@ -0,0 +1,917 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
+ */
+
+/*
+ * lx_prsubr.c: Various functions for the lx /proc vnodeops.
+ */
+
+#include <sys/varargs.h>
+
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <sys/prsystm.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+#include "lx_proc.h"
+
+#define LXPRCACHE_NAME "lxbpr_cache"
+
+static int lxpr_node_constructor(void *, void *, int);
+static void lxpr_node_destructor(void *, void *);
+
+static kmem_cache_t *lxpr_node_cache;
+
+int lx_pr_bufsize = 4000;
+
+struct lxpr_zfs_ds {
+ list_node_t ds_link;
+ char ds_name[MAXPATHLEN];
+ uint64_t ds_cookie;
+};
+
+struct lxpr_uiobuf *
+lxpr_uiobuf_new(uio_t *uiop)
+{
+ /* Allocate memory for both lxpr_uiobuf and output buffer */
+ int bufsize = lx_pr_bufsize;
+ struct lxpr_uiobuf *uiobuf =
+ kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP);
+
+ uiobuf->uiop = uiop;
+ uiobuf->buffer = (char *)&uiobuf[1];
+ uiobuf->buffsize = bufsize;
+ uiobuf->pos = uiobuf->buffer;
+ uiobuf->beg = 0;
+ uiobuf->error = 0;
+
+ return (uiobuf);
+}
+
+void
+lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf)
+{
+ ASSERT(uiobuf != NULL);
+ ASSERT(uiobuf->pos == uiobuf->buffer);
+
+ kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize);
+}
+
+void
+lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset)
+{
+ uiobuf->uiop->uio_offset = (off_t)offset;
+}
+
+boolean_t
+lxpr_uiobuf_nonblock(struct lxpr_uiobuf *uiobuf)
+{
+ if ((uiobuf->uiop->uio_fmode & FNONBLOCK) != 0)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+void
+lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err)
+{
+ ASSERT(uiobuf->error == 0);
+
+ uiobuf->error = err;
+}
+
+int
+lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf)
+{
+ off_t off = uiobuf->uiop->uio_offset;
+ caddr_t uaddr = uiobuf->buffer;
+ size_t beg = uiobuf->beg;
+ size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+ if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ ASSERT(off >= beg);
+
+ if (beg + size > off && off >= 0)
+ uiobuf->error =
+ uiomove(uaddr + (off - beg), size - (off - beg),
+ UIO_READ, uiobuf->uiop);
+
+ uiobuf->beg += size;
+ }
+
+ uiobuf->pos = uaddr;
+
+ return (uiobuf->error);
+}
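+
+/*
+ * Illustrative flush arithmetic: with beg == 0, 4000 bytes buffered
+ * (pos - buffer == 4000) and uio_offset == 1000, the flush above copies
+ * buffer bytes 1000..3999 to the caller and advances beg to 4000, so
+ * the next pass of buffered data continues from that logical offset.
+ */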
+
+void
+lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size)
+{
+ /* While we can still carry on */
+ while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ uintptr_t remain = (uintptr_t)uiobuf->buffsize -
+ ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer);
+
+ /* Enough space in buffer? */
+ if (remain >= size) {
+ bcopy(buf, uiobuf->pos, size);
+ uiobuf->pos += size;
+ return;
+ }
+
+ /* Not enough space, so copy all we can and try again */
+ bcopy(buf, uiobuf->pos, remain);
+ uiobuf->pos += remain;
+ (void) lxpr_uiobuf_flush(uiobuf);
+ buf += remain;
+ size -= remain;
+ }
+}
+
+#define TYPBUFFSIZE 256
+
+void
+lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...)
+{
+ va_list args;
+ char buff[TYPBUFFSIZE];
+ int len;
+ char *buffer;
+
+	/* Can we still do any output? */
+ if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0)
+ return;
+
+ va_start(args, fmt);
+
+ /* Try using stack allocated buffer */
+ len = vsnprintf(buff, TYPBUFFSIZE, fmt, args);
+ if (len < TYPBUFFSIZE) {
+ va_end(args);
+ lxpr_uiobuf_write(uiobuf, buff, len);
+ return;
+ }
+
+ /* Not enough space in pre-allocated buffer */
+ buffer = kmem_alloc(len + 1, KM_SLEEP);
+
+ /*
+ * We know we allocated the correct amount of space
+ * so no check on the return value
+ */
+ (void) vsnprintf(buffer, len+1, fmt, args);
+ lxpr_uiobuf_write(uiobuf, buffer, len);
+ va_end(args);
+ kmem_free(buffer, len+1);
+}
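+
+/*
+ * Typical usage from a read handler (a sketch; lxpr_read_example and
+ * some_value are illustrative names, and the uiobuf itself is created,
+ * flushed and freed by the common read path):
+ *
+ *	static void
+ *	lxpr_read_example(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+ *	{
+ *		lxpr_uiobuf_printf(uiobuf, "%d\n", some_value);
+ *	}
+ *
+ * A handler that hits an error records it with lxpr_uiobuf_seterr(),
+ * after which subsequent writes to the uiobuf become no-ops.
+ */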
+
+/*
+ * Lookup process, potentially constrained by pid associated with lxpr_node and
+ * return with p_lock and P_PR_LOCK held.
+ */
+proc_t *
+lxpr_lock_pid(lxpr_node_t *lxpnp, pid_t pid, zombok_t zombie_ok,
+ kthread_t **tp)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ proc_t *p;
+ kthread_t *t;
+ lx_pid_flag_t flags = LXP_PRLOCK;
+
+ ASSERT(!MUTEX_HELD(&pidlock));
+
+ /* Consider zsched to be invisible to LX */
+ if (pid == zone->zone_zsched->p_pid) {
+ return (NULL);
+ }
+ if (zombie_ok == ZOMB_OK) {
+ flags |= LXP_ZOMBOK;
+ }
+
+retry:
+ if (lx_lpid_lock(pid, zone, flags, &p, &t) != 0) {
+ return (NULL);
+ }
+
+ /*
+ * Make sure that thread lookups (where non-main LX threads are
+ * assigned a pid not equal to the encompassing parent) match the pid
+ * of the encompassing directory. This must be performed carefully for
+ * the Linux pid 1 as it will not equal the native pid despite the
+ * process matching.
+ *
+ * This is necessary to constrain paths such as /proc/<pid>/task/<tid>.
+ */
+ if (lxpnp->lxpr_pid != 0 && lxpnp->lxpr_pid != pid &&
+ !(pid == 1 && lxpnp->lxpr_pid == zone->zone_proc_initpid)) {
+ klwp_t *lwp;
+ lx_lwp_data_t *lwpd;
+
+ /*
+ * Only LWPs of branded processes will be accessible this way.
+ * The threads of native processes lack pid assignments which
+ * LX uses to emulate Linux's weird thread/process model.
+ */
+ if ((lwp = ttolwp(t)) == NULL ||
+ (lwpd = lwptolxlwp(lwp)) == NULL ||
+ lwpd->br_pid != pid) {
+ sprunlock(p);
+ return (NULL);
+ }
+ }
+
+ if (zombie_ok == NO_ZOMB &&
+ ((p->p_flag & SEXITING) || p->p_stat == SZOMB)) {
+ sprunlock(p);
+ return (NULL);
+ }
+
+ /*
+ * Accessing a process which is undergoing exec(2) is somewhat risky.
+ * In particular, the p_exec field is updated outside p_lock. To avoid
+ * this mess, access is denied while P_PR_EXEC is set unless the caller
+ * happens to be the process itself. This allows actions such as
+ * re-exec()-ing /proc/<pid>/exe to make forward progress.
+ *
+ * All other callers must block until the flag is cleared.
+ */
+ if ((p->p_proc_flag & P_PR_EXEC) != 0) {
+ if (p != curproc) {
+ kmutex_t *mp;
+
+ /*
+ * Drop PR_LOCK and wait for the exec() to ping the CV
+ * once it has completed. Afterward, the pid is looked
+ * up again in case the process exited for some reason.
+ */
+ mp = &p->p_lock;
+ sprunprlock(p);
+ cv_wait(&pr_pid_cv[p->p_slot], mp);
+ mutex_exit(mp);
+ goto retry;
+ }
+ }
+
+ if (tp != NULL) {
+ *tp = t;
+ }
+ return (p);
+}
+
+netstack_t *
+lxpr_netstack(lxpr_node_t *lxpnp)
+{
+ return (netstack_hold_if_active(LXPTOZ(lxpnp)->zone_netstack));
+}
+
+/*
+ * Lookup process from pid associated with lxpr_node and return with p_lock and
+ * P_PR_LOCK held.
+ */
+proc_t *
+lxpr_lock(lxpr_node_t *lxpnp, zombok_t zombie_ok)
+{
+ return (lxpr_lock_pid(lxpnp, lxpnp->lxpr_pid, zombie_ok, NULL));
+}
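+
+/*
+ * Callers of lxpr_lock()/lxpr_lock_pid() follow a lock/use/unlock
+ * pattern (a sketch of the usage seen elsewhere in this file):
+ *
+ *	if ((p = lxpr_lock(lxpnp, NO_ZOMB)) == NULL)
+ *		return (NULL);
+ *	... inspect p with p_lock and P_PR_LOCK held ...
+ *	lxpr_unlock(p);
+ */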
+
+void
+lxpr_fixpid(zone_t *zone, proc_t *p, pid_t *pidp, pid_t *ppidp)
+{
+ pid_t pid = p->p_pid;
+ pid_t ppid = p->p_ppid;
+
+ ASSERT(p != NULL);
+ ASSERT(pidp != NULL);
+ ASSERT(zone->zone_brand == &lx_brand);
+ ASSERT(pid != zone->zone_zsched->p_pid);
+
+ if (pid == zone->zone_proc_initpid) {
+ pid = 1;
+ ppid = 0; /* parent pid for init is 0 */
+ } else {
+ if (ppid == zone->zone_proc_initpid) {
+ /*
+ * Convert ppid to the Linux default of 1 if our parent
+ * is the zone's init process
+ */
+ ppid = 1;
+ } else if (ppid == zone->zone_zsched->p_pid ||
+ (p->p_flag & SZONETOP) != 0) {
+ /*
+ * Additionally, if the process has no valid parent
+ * inside the zone (or its parent is zsched), lie and
+ * claim init as the parent.
+ */
+ ppid = 1;
+ }
+ }
+
+ *pidp = pid;
+ if (ppidp != NULL) {
+ *ppidp = ppid;
+ }
+}
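+
+/*
+ * For example, if the zone's init process has native pid 1234, it is
+ * reported as Linux pid 1 with ppid 0; a process whose parent is that
+ * init, is zsched, or has no visible parent inside the zone is
+ * reported with ppid 1.
+ */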
+
+/*
+ * lxpr_unlock()
+ *
+ * Unlock locked process
+ */
+void
+lxpr_unlock(proc_t *p)
+{
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(!MUTEX_HELD(&pidlock));
+
+ cv_signal(&pr_pid_cv[p->p_slot]);
+ p->p_proc_flag &= ~P_PR_LOCK;
+ mutex_exit(&p->p_lock);
+ THREAD_KPRI_RELEASE();
+}
+
+void
+lxpr_initnodecache()
+{
+ lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME,
+ sizeof (lxpr_node_t), 0,
+ lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+lxpr_fininodecache()
+{
+ kmem_cache_destroy(lxpr_node_cache);
+}
+
+/* ARGSUSED */
+static int
+lxpr_node_constructor(void *buf, void *un, int kmflags)
+{
+ lxpr_node_t *lxpnp = buf;
+ vnode_t *vp;
+
+ vp = lxpnp->lxpr_vnode = vn_alloc(kmflags);
+ if (vp == NULL)
+ return (-1);
+
+ (void) vn_setops(vp, lxpr_vnodeops);
+ vp->v_data = lxpnp;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+lxpr_node_destructor(void *buf, void *un)
+{
+ lxpr_node_t *lxpnp = buf;
+
+ vn_free(LXPTOV(lxpnp));
+}
+
+/*
+ * Calculate an inode number
+ *
+ * This takes various bits of info and munges them
+ * to give the inode number for an lxproc node
+ */
+ino_t
+lxpr_inode(lxpr_nodetype_t type, pid_t pid, int desc)
+{
+ switch (type) {
+ case LXPR_PIDDIR:
+ return (maxpid + pid + 1);
+ case LXPR_PID_TASK_IDDIR:
+ return (maxpid + (desc * 10));
+ case LXPR_PROCDIR:
+ return (maxpid + 2);
+ case LXPR_PID_FD_FD:
+ return (maxpid + 2 +
+ (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+ LXPR_NFILES + desc);
+ default:
+ return (maxpid + 2 +
+ (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+ type);
+ }
+}
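+
+/*
+ * Illustrative layout of the inode space (symbolic, since maxpid is a
+ * tunable): pid directories occupy maxpid + pid + 1, the root is
+ * maxpid + 2, and each pid then gets a window of LXPR_FD_PERPROC +
+ * LXPR_NFILES inodes above that; static per-pid entries land at their
+ * node-type offset within the window and fd entries at LXPR_NFILES + fd.
+ */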
+
+/*
+ * Return inode number of parent (directory)
+ */
+ino_t
+lxpr_parentinode(lxpr_node_t *lxpnp)
+{
+ /*
+ * If the input node is the root, the parent inode is the mounted-on
+ * inode, so just return our inode number
+ */
+ if (lxpnp->lxpr_type != LXPR_PROCDIR)
+ return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino);
+ else
+ return (lxpnp->lxpr_ino);
+}
+
+/*
+ * Allocate a new lxproc node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxpr_node_t *
+lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int desc)
+{
+ lxpr_node_t *lxpnp;
+ vnode_t *vp;
+ user_t *up;
+ timestruc_t now;
+
+ /*
+ * Allocate a new node. It is deallocated in vop_inactive
+ */
+ lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);
+
+ /*
+ * Set defaults (may be overridden below)
+ */
+ gethrestime(&now);
+ lxpnp->lxpr_type = type;
+ lxpnp->lxpr_realvp = NULL;
+ lxpnp->lxpr_parent = dp;
+ lxpnp->lxpr_desc = desc;
+ VN_HOLD(dp);
+ if (p != NULL) {
+ lxpr_node_t *dlxpnp = VTOLXP(dp);
+
+ lxpnp->lxpr_pid = p->p_pid;
+ /* Propagate the tid whenever possible. */
+ if (desc == 0 && dlxpnp->lxpr_desc != 0) {
+ lxpnp->lxpr_desc = dlxpnp->lxpr_desc;
+ }
+ lxpnp->lxpr_time = PTOU(p)->u_start;
+ lxpnp->lxpr_uid = crgetruid(p->p_cred);
+ lxpnp->lxpr_gid = crgetrgid(p->p_cred);
+ lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, desc);
+ } else {
+ /* Pretend files without a proc belong to sched */
+ lxpnp->lxpr_pid = 0;
+ lxpnp->lxpr_time = now;
+ lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
+ lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
+ }
+
+ /* initialize the vnode data */
+ vp = lxpnp->lxpr_vnode;
+ vn_reinit(vp);
+ vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+ vp->v_vfsp = dp->v_vfsp;
+
+ /*
+ * Do node specific stuff
+ */
+ if (lxpr_is_writable(type)) {
+ /* These two have different modes; handled later. */
+ if (type != LXPR_PID_FD_FD && type != LXPR_PID_TID_FD_FD) {
+ vp->v_type = VREG;
+ lxpnp->lxpr_mode = 0644;
+ return (lxpnp);
+ }
+ }
+
+ switch (type) {
+ case LXPR_PROCDIR:
+ vp->v_flag |= VROOT;
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by everyone */
+ break;
+
+ case LXPR_PID_CURDIR:
+ ASSERT(p != NULL);
+
+ /*
+ * Zombie check. p_stat is officially protected by pidlock,
+ * but we can't grab pidlock here because we already hold
+ * p_lock. Luckily if we look at the process exit code
+ * we see that p_stat only transitions from SRUN to SZOMB
+ * while p_lock is held. Aside from this, the only other
+ * p_stat transition that we need to be aware about is
+ * SIDL to SRUN, but that's not a problem since lxpr_lock()
+ * ignores nodes in the SIDL state so we'll never get a node
+ * that isn't already in the SRUN state.
+ */
+ if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
+ lxpnp->lxpr_realvp = NULL;
+ } else {
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ up = PTOU(p);
+ lxpnp->lxpr_realvp = up->u_cdir;
+ ASSERT(lxpnp->lxpr_realvp != NULL);
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+ break;
+
+ case LXPR_PID_ROOTDIR:
+ ASSERT(p != NULL);
+ /* Zombie check. see locking comment above */
+ if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
+ lxpnp->lxpr_realvp = NULL;
+ } else {
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ up = PTOU(p);
+ lxpnp->lxpr_realvp =
+ up->u_rdir != NULL ? up->u_rdir : rootdir;
+ ASSERT(lxpnp->lxpr_realvp != NULL);
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+ break;
+
+ case LXPR_PID_EXE:
+ ASSERT(p != NULL);
+ lxpnp->lxpr_realvp = p->p_exec;
+ if (lxpnp->lxpr_realvp != NULL) {
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777;
+ break;
+
+ case LXPR_SELF:
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+ break;
+
+ case LXPR_PID_TASKDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by everyone */
+ break;
+
+ case LXPR_PID_TASK_IDDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by everyone */
+ break;
+
+ case LXPR_PID_FD_FD:
+ case LXPR_PID_TID_FD_FD:
+ ASSERT(p != NULL);
+ /* lxpr_realvp is set after we return */
+ lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */
+ vp->v_type = VLNK;
+ break;
+
+ case LXPR_PID_FDDIR:
+ case LXPR_PID_TID_FDDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0500; /* read-search by owner only */
+ break;
+
+ case LXPR_PIDDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0511;
+ break;
+
+ case LXPR_NETDIR:
+ case LXPR_SYSDIR:
+ case LXPR_SYS_FSDIR:
+ case LXPR_SYS_FS_INOTIFYDIR:
+ case LXPR_SYS_KERNELDIR:
+ case LXPR_SYS_KERNEL_RANDDIR:
+ case LXPR_SYS_NETDIR:
+ case LXPR_SYS_NET_COREDIR:
+ case LXPR_SYS_NET_IPV4DIR:
+ case LXPR_SYS_VMDIR:
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by all */
+ break;
+
+ case LXPR_PID_AUXV:
+ case LXPR_PID_PERSONALITY:
+ case LXPR_PID_ENV:
+ case LXPR_PID_MEM:
+ ASSERT(p != NULL);
+ /*FALLTHRU*/
+ case LXPR_KCORE:
+ vp->v_type = VREG;
+ lxpnp->lxpr_mode = 0400; /* read-only by owner only */
+ break;
+
+ default:
+ vp->v_type = VREG;
+ lxpnp->lxpr_mode = 0444; /* read-only by all */
+ break;
+ }
+
+ return (lxpnp);
+}
+
+
+/*
+ * Free the storage obtained from lxpr_getnode().
+ */
+void
+lxpr_freenode(lxpr_node_t *lxpnp)
+{
+ ASSERT(lxpnp != NULL);
+ ASSERT(LXPTOV(lxpnp) != NULL);
+
+ /*
+ * delete any association with realvp
+ */
+ if (lxpnp->lxpr_realvp != NULL)
+ VN_RELE(lxpnp->lxpr_realvp);
+
+ /*
+ * delete any association with parent vp
+ */
+ if (lxpnp->lxpr_parent != NULL)
+ VN_RELE(lxpnp->lxpr_parent);
+
+ /*
+ * Release the lxprnode.
+ */
+ kmem_cache_free(lxpr_node_cache, lxpnp);
+}
+
+/*
+ * Attempt to locate vnode for /proc/<pid>/fd/<#>.
+ */
+vnode_t *
+lxpr_lookup_fdnode(vnode_t *dvp, const char *name)
+{
+ lxpr_node_t *lxdp = VTOLXP(dvp);
+ lxpr_node_t *lxfp;
+ char *endptr = NULL;
+ long num;
+ int fd;
+ proc_t *p;
+ vnode_t *vp = NULL;
+ file_t *fp;
+ uf_entry_t *ufp;
+ uf_info_t *fip;
+
+ ASSERT(lxdp->lxpr_type == LXPR_PID_FDDIR ||
+ lxdp->lxpr_type == LXPR_PID_TID_FDDIR);
+
+ if (ddi_strtol(name, &endptr, 10, &num) != 0) {
+ return (NULL);
+ } else if (name[0] < '0' || name[0] > '9' || *endptr != '\0') {
+ /*
+		 * ddi_strtol allows leading spaces and trailing garbage;
+ * We do not tolerate such foolishness.
+ */
+ return (NULL);
+ } else if ((fd = (int)num) < 0) {
+ return (NULL);
+ }
+
+ /* Lock the owner process */
+ if ((p = lxpr_lock(lxdp, NO_ZOMB)) == NULL) {
+ return (NULL);
+ }
+
+ /* Not applicable to processes which are system-owned. */
+ if (p->p_as == &kas) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ lxfp = lxpr_getnode(dvp, LXPR_PID_FD_FD, p, fd);
+
+ /*
+ * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+ * going away while we dereference into fi_list.
+ */
+ fip = P_FINFO(p);
+ mutex_exit(&p->p_lock);
+ mutex_enter(&fip->fi_lock);
+ if (fd < fip->fi_nfiles) {
+ UF_ENTER(ufp, fip, fd);
+ if ((fp = ufp->uf_file) != NULL) {
+ vp = fp->f_vnode;
+ VN_HOLD(vp);
+ }
+ UF_EXIT(ufp);
+ }
+ mutex_exit(&fip->fi_lock);
+
+ if (vp == NULL) {
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ lxpr_freenode(lxfp);
+ return (NULL);
+ } else {
+ /*
+ * Fill in the lxpr_node so future references will be able to
+ * find the underlying vnode. The vnode is held on the realvp.
+ */
+ lxfp->lxpr_realvp = vp;
+
+ /*
+ * For certain entries (sockets, pipes, etc), Linux expects a
+ * bogus-named symlink. If that's the case, report the type as
+ * VNON to bypass link-following elsewhere in the vfs system.
+ *
+ * See lxpr_readlink for more details.
+ */
+ if (lxpr_readlink_fdnode(lxfp, NULL, 0) == 0)
+ LXPTOV(lxfp)->v_type = VNON;
+ }
+
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ ASSERT(LXPTOV(lxfp) != NULL);
+ return (LXPTOV(lxfp));
+}
+
+/*
+ * Attempt to create Linux-proc-style fake symlinks contents for supported
+ * /proc/<pid>/fd/<#> entries.
+ */
+int
+lxpr_readlink_fdnode(lxpr_node_t *lxpnp, char *bp, size_t len)
+{
+ const char *format;
+ vnode_t *rvp = lxpnp->lxpr_realvp;
+ vattr_t attr;
+
+ switch (rvp->v_type) {
+ case VSOCK:
+ format = "socket:[%lu]";
+ break;
+ case VFIFO:
+ format = "pipe:[%lu]";
+ break;
+ default:
+ return (-1);
+ }
+
+ /* Fetch the inode of the underlying vnode */
+ if (VOP_GETATTR(rvp, &attr, 0, CRED(), NULL) != 0)
+ return (-1);
+
+ if (bp != NULL)
+ (void) snprintf(bp, len, format, (ino_t)attr.va_nodeid);
+ return (0);
+}
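+
+/*
+ * For example, reading the link of an fd that refers to a socket whose
+ * underlying vnode has inode 12345 yields "socket:[12345]", matching
+ * the pathless fd entries Linux presents under /proc/<pid>/fd.
+ */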
+
+/*
+ * Translate a Linux core_pattern path to a native Illumos one, by replacing
+ * the appropriate % escape sequences.
+ *
+ * Any % escape sequences that are not recognised are double-escaped so that
+ * they will be inserted literally into the path (to mimic Linux).
+ */
+int
+lxpr_core_path_l2s(const char *inp, char *outp, size_t outsz)
+{
+ int i = 0, j = 0;
+ char x;
+
+ while (j < outsz - 1) {
+ x = inp[i++];
+ if (x == '\0')
+ break;
+ if (x != '%') {
+ outp[j++] = x;
+ continue;
+ }
+
+ x = inp[i++];
+ if (x == '\0')
+ break;
+
+ /* Make sure we have enough space in the output buffer. */
+ if (j + 2 >= outsz - 1)
+ return (EINVAL);
+
+ switch (x) {
+ case 'E':
+ if (j + 4 >= outsz - 1)
+ return (EINVAL);
+ outp[j++] = '%';
+ outp[j++] = 'd';
+ outp[j++] = '%';
+ outp[j++] = 'f';
+ break;
+ case 'e':
+ outp[j++] = '%';
+ outp[j++] = 'f';
+ break;
+ case 'p':
+ case 'g':
+ case 'u':
+ case 't':
+ case '%':
+ outp[j++] = '%';
+ outp[j++] = x;
+ break;
+ case 'h':
+ outp[j++] = '%';
+ outp[j++] = 'n';
+ break;
+ default:
+ /* No translation, make it literal. */
+ if (j + 3 >= outsz - 1)
+ return (EINVAL);
+ outp[j++] = '%';
+ outp[j++] = '%';
+ outp[j++] = x;
+ break;
+ }
+ }
+
+ outp[j] = '\0';
+ return (0);
+}
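+
+/*
+ * Worked examples of the translation above: the Linux pattern
+ * "core.%e.%p" becomes "core.%f.%p", "%h" becomes "%n", "%E" expands
+ * to "%d%f", and an unrecognized sequence such as "%z" is escaped to
+ * "%%z" so it is emitted literally by the native core logic.
+ */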
+
+/*
+ * Translate an Illumos core pattern path back to Linux format.
+ */
+int
+lxpr_core_path_s2l(const char *inp, char *outp, size_t outsz)
+{
+ int i = 0, j = 0;
+ char x;
+
+ while (j < outsz - 1) {
+ x = inp[i++];
+ if (x == '\0')
+ break;
+ if (x != '%') {
+ outp[j++] = x;
+ continue;
+ }
+
+ x = inp[i++];
+ if (x == '\0')
+ break;
+
+ /* Make sure we have enough space in the output buffer. */
+ if (j + 2 >= outsz - 1)
+ return (EINVAL);
+
+ switch (x) {
+ case 'd':
+ /* No Linux equivalent unless it's %d%f. */
+ if (inp[i] == '%' && inp[i + 1] == 'f') {
+ i += 2;
+ outp[j++] = '%';
+ outp[j++] = 'E';
+ }
+ break;
+ case 'f':
+ outp[j++] = '%';
+ outp[j++] = 'e';
+ break;
+ case 'p':
+ case 'P':
+ case 'g':
+ case 'u':
+ case 't':
+ case '%':
+ outp[j++] = '%';
+ outp[j++] = (x == 'P' ? 'p' : x);
+ break;
+ case 'n':
+ outp[j++] = '%';
+ outp[j++] = 'h';
+ break;
+ default:
+ /* No translation. */
+ break;
+ }
+ }
+
+ outp[j] = '\0';
+ return (0);
+}
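+
+/*
+ * The inverse mapping: "%f" becomes "%e", "%n" becomes "%h", a native
+ * "%d%f" pair collapses back to "%E", and "%P" is folded into "%p";
+ * native sequences with no Linux equivalent are simply dropped.
+ */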
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c
new file mode 100644
index 0000000000..b4dc5091c2
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c
@@ -0,0 +1,377 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * lx_prvfsops.c: vfs operations for lx /proc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/mount.h>
+#include <sys/bitmap.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+
+#include "lx_proc.h"
+
+/* Module level parameters */
+static int lxprocfstype;
+static dev_t lxprocdev;
+static kmutex_t lxpr_mount_lock;
+
+int nproc_highbit; /* highbit(v.v_proc) */
+
+static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *);
+static int lxpr_unmount(vfs_t *, int, cred_t *);
+static int lxpr_root(vfs_t *, vnode_t **);
+static int lxpr_statvfs(vfs_t *, statvfs64_t *);
+static int lxpr_init(int, char *);
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "lx_proc",
+ lxpr_init,
+ VSW_ZMOUNT,
+ NULL
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+ &mod_fsops, "lx brand procfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, (void *)&modlfs, NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int retval;
+
+ /*
+ * attempt to unload the module
+ */
+ if ((retval = mod_remove(&modlinkage)) != 0)
+ goto done;
+
+ /*
+ * destroy lxpr_node cache
+ */
+ lxpr_fininodecache();
+
+ /*
+ * clean out the vfsops and vnodeops
+ */
+ (void) vfs_freevfsops_by_type(lxprocfstype);
+ vn_freevnodeops(lxpr_vnodeops);
+
+ mutex_destroy(&lxpr_mount_lock);
+done:
+ return (retval);
+}
+
+static int
+lxpr_init(int fstype, char *name)
+{
+ static const fs_operation_def_t lxpr_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = lxpr_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount },
+ VFSNAME_ROOT, { .vfs_root = lxpr_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs },
+ NULL, NULL
+ };
+ extern const fs_operation_def_t lxpr_vnodeops_template[];
+ int error;
+ major_t dev;
+
+ nproc_highbit = highbit(v.v_proc);
+ lxprocfstype = fstype;
+ ASSERT(lxprocfstype != 0);
+
+ mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Associate VFS ops vector with this fstype.
+ */
+ error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "lxpr_init: bad vfs ops template");
+ return (error);
+ }
+
+ /*
+ * Set up vnode ops vector too.
+ */
+ error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "lxpr_init: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+ * Assign a unique "device" number (used by stat(2)).
+ */
+ if ((dev = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN, "lxpr_init: can't get unique device number");
+ dev = 0;
+ }
+
+ /*
+ * Make the pseudo device
+ */
+ lxprocdev = makedevice(dev, 0);
+
+ /*
+ * Initialise cache for lxpr_nodes
+ */
+ lxpr_initnodecache();
+
+ return (0);
+}
+
+static int
+lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr)
+{
+ lxpr_mnt_t *lxpr_mnt;
+ zone_t *zone = curproc->p_zone;
+ ldi_ident_t li;
+ int err;
+
+ /*
+ * must be root to mount
+ */
+ if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+ return (EPERM);
+
+ /*
+ * mount point must be a directory
+ */
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ /*
+ * Mounting lx_proc is not allowed outside an LX zone.
+ */
+ if (zone->zone_brand != &lx_brand) {
+ return (ENOTSUP);
+ }
+
+ /*
+ * Having the resource be anything but "lxproc" doesn't make sense
+ */
+ vfs_setresource(vfsp, "lxproc", 0);
+
+ lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP);
+
+ if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) {
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+ return (err);
+ }
+ lxpr_mnt->lxprm_li = li;
+
+ mutex_enter(&lxpr_mount_lock);
+
+ /*
+ * Ensure we don't allow overlaying mounts
+ */
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ mutex_exit(&lxpr_mount_lock);
+		kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * Hold a zone reference for access to the lxzd structure.
+ */
+ zone_hold(lxpr_mnt->lxprm_zone = zone);
+
+ /*
+ * Allocate the first vnode and arbitrarily set the parent vnode to the
+ * mounted over directory
+ */
+ lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0);
+
+ /* Correctly set the fs for the root node */
+ lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp;
+
+ vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype);
+ vfsp->vfs_bsize = DEV_BSIZE;
+ vfsp->vfs_fstype = lxprocfstype;
+ vfsp->vfs_data = (caddr_t)lxpr_mnt;
+ vfsp->vfs_dev = lxprocdev;
+
+ mutex_exit(&lxpr_mount_lock);
+
+ return (0);
+}
+
+static int
+lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+ lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data;
+ vnode_t *vp;
+ int count;
+
+ ASSERT(lxpr_mnt != NULL);
+ vp = LXPTOV(lxpr_mnt->lxprm_node);
+
+ mutex_enter(&lxpr_mount_lock);
+
+ /*
+ * must be root to unmount
+ */
+ if (secpolicy_fs_unmount(cr, vfsp) != 0) {
+ mutex_exit(&lxpr_mount_lock);
+ return (EPERM);
+ }
+
+ /*
+ * forced unmount is not supported by this file system
+ */
+ if (flag & MS_FORCE) {
+ mutex_exit(&lxpr_mount_lock);
+ return (ENOTSUP);
+ }
+
+ /*
+ * Ensure that no vnodes are in use on this mount point.
+ */
+ mutex_enter(&vp->v_lock);
+ count = vp->v_count;
+ mutex_exit(&vp->v_lock);
+ if (count > 1) {
+ mutex_exit(&lxpr_mount_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * purge the dnlc cache for vnode entries
+ * associated with this file system
+ */
+ count = dnlc_purge_vfsp(vfsp, 0);
+
+ /*
+ * free up the lxprnode
+ */
+ lxpr_freenode(lxpr_mnt->lxprm_node);
+ zone_rele(lxpr_mnt->lxprm_zone);
+
+ ldi_ident_release(lxpr_mnt->lxprm_li);
+
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+
+ mutex_exit(&lxpr_mount_lock);
+
+ return (0);
+}
+
+static int
+lxpr_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node;
+ vnode_t *vp = LXPTOV(lxpnp);
+
+ VN_HOLD(vp);
+ *vpp = vp;
+ return (0);
+}
+
+static int
+lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+ int n;
+ dev32_t d32;
+ extern uint_t nproc;
+
+ n = v.v_proc - nproc;
+
+ bzero((caddr_t)sp, sizeof (*sp));
+ sp->f_bsize = DEV_BSIZE;
+ sp->f_frsize = DEV_BSIZE;
+ sp->f_blocks = (fsblkcnt64_t)0;
+ sp->f_bfree = (fsblkcnt64_t)0;
+ sp->f_bavail = (fsblkcnt64_t)0;
+ sp->f_files = (fsfilcnt64_t)v.v_proc + 2;
+ sp->f_ffree = (fsfilcnt64_t)n;
+ sp->f_favail = (fsfilcnt64_t)n;
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sp->f_fsid = d32;
+ /* It is guaranteed that vsw_name will fit in f_basetype */
+ (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name);
+ sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sp->f_namemax = 64; /* quite arbitrary */
+ bzero(sp->f_fstr, sizeof (sp->f_fstr));
+
+ /* We know f_fstr is 32 chars */
+ (void) strcpy(sp->f_fstr, "/proc");
+ (void) strcpy(&sp->f_fstr[6], "/proc");
+
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c
new file mode 100644
index 0000000000..e5ca432bbd
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c
@@ -0,0 +1,8377 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * lx_proc -- a Linux-compatible /proc for the LX brand
+ *
+ * We have -- confusingly -- two implementations of Linux /proc. One is to
+ * support native (but Linux-borne) programs that wish to view the native
+ * system through the Linux /proc model; the other -- this one -- is to
+ * support Linux binaries via the LX brand. These two implementations differ
+ * greatly in their aspirations (and their willingness to bend the truth
+ * of the system to accommodate those aspirations); they should not be unified.
+ */
+
+#include <sys/cpupart.h>
+#include <sys/cpuvar.h>
+#include <sys/session.h>
+#include <sys/vmparam.h>
+#include <sys/mman.h>
+#include <vm/rm.h>
+#include <vm/seg_vn.h>
+#include <sys/sdt.h>
+#include <lx_signum.h>
+#include <sys/strlog.h>
+#include <sys/stropts.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_brand.h>
+#include <lx_auxv.h>
+#include <sys/x86_archext.h>
+#include <sys/archsystm.h>
+#include <sys/fp.h>
+#include <sys/pool_pset.h>
+#include <sys/pset.h>
+#include <sys/zone.h>
+#include <sys/fcntl.h>
+#include <sys/pghw.h>
+#include <sys/vfs_opreg.h>
+#include <sys/param.h>
+#include <sys/utsname.h>
+#include <sys/rctl.h>
+#include <sys/kstat.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_types.h>
+#include <sys/lx_userhz.h>
+#include <sys/brand.h>
+#include <sys/cred_impl.h>
+#include <sys/tihdr.h>
+#include <sys/corectl.h>
+#include <sys/rctl_impl.h>
+#include <inet/ip.h>
+#include <inet/ip_ire.h>
+#include <inet/ip6.h>
+#include <inet/ip_if.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/udp_impl.h>
+#include <inet/ipclassifier.h>
+#include <sys/socketvar.h>
+#include <fs/sockfs/socktpi.h>
+#include <sys/random.h>
+
+/* Dependent on procfs */
+extern kthread_t *prchoose(proc_t *);
+extern int prreadargv(proc_t *, char *, size_t, size_t *);
+extern int prreadenvv(proc_t *, char *, size_t, size_t *);
+extern int prreadbuf(proc_t *, uintptr_t, uint8_t *, size_t, size_t *);
+
+#include "lx_proc.h"
+
+extern pgcnt_t swapfs_minfree;
+
+/*
+ * Pointer to the vnode ops vector for this fs.
+ * This is instantiated in lxprinit() in lxpr_vfsops.c
+ */
+vnodeops_t *lxpr_vnodeops;
+
+static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *);
+static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *,
+ caller_context_t *);
+static int lxpr_create(struct vnode *, char *, struct vattr *, enum vcexcl,
+ int, struct vnode **, struct cred *, int, caller_context_t *, vsecattr_t *);
+static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxpr_write(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxpr_space(vnode_t *, int, flock64_t *, int, offset_t, cred_t *,
+ caller_context_t *);
+static int lxpr_setattr(vnode_t *, vattr_t *, int, cred_t *,
+ caller_context_t *);
+static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *,
+ caller_context_t *);
+static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *);
+static int lxpr_lookup(vnode_t *, char *, vnode_t **,
+ pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *,
+ pathname_t *);
+static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *,
+ caller_context_t *, int);
+static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *);
+static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *);
+static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *);
+static int lxpr_poll(vnode_t *, short, int, short *, pollhead_t **,
+ caller_context_t *);
+static int lxpr_sync(void);
+static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *);
+
+static int lxpr_doaccess(lxpr_node_t *, boolean_t, int, int, cred_t *,
+ caller_context_t *);
+
+static vnode_t *lxpr_lookup_procdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_piddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_fddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_netdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sysdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_fsdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_fs_inotifydir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_kerneldir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_kdir_randdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_netdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_net_coredir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_net_ipv4dir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_vmdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_taskdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_task_tid_dir(vnode_t *, char *);
+
+static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sysdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_fsdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_kerneldir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_kdir_randdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_netdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_net_coredir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_vmdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_taskdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_task_tid_dir(lxpr_node_t *, uio_t *, int *);
+
+static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_cgroups(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_cmdline(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_devices(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_diskstats(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_filesystems(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t);
+static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_swaps(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_vmstat(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_pid_auxv(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_cgroup(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_env(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_id_map(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_limits(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_loginuid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_mountinfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_oom_scr_adj(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_personality(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_pid_tid_comm(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_tid_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_tid_status(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_if_inet6(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ipv6_route(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_tcp6(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_udp6(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_aiomax(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_aionr(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_filemax(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_filenr(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *,
+ lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *,
+ lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *,
+ lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_pipe_max(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_caplcap(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_corepatt(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_hostname(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_msgmax(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_msgmnb(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_msgmni(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_osrel(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_pid_max(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_rand_entavl(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_sem(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_shmall(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_shmmax(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_shmmni(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_threads_max(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_core_somaxc(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_icmp_eib(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_ip_forward(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *,
+ lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *,
+ lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_retry2(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_rwmem(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_dirty(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_minfr_kb(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_nhpages(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_swappiness(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static int lxpr_write_pid_tid_comm(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_pid_loginuid(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_sys_fs_pipe_max(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_sys_net_core_somaxc(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_sys_net_ipv4_icmp_eib(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_retry2(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_rwmem(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_kernel_corepatt(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+
+/*
+ * Simple conversion
+ */
+#define btok(x) ((x) >> 10) /* bytes to kbytes */
+#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */
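+/*
+ * For example, btok(8192) yields 8; with 4K pages (PAGESHIFT == 12),
+ * ptok(2) yields 8 kbytes.
+ */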
+
+#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t))
+
+extern rctl_hndl_t rc_process_semmsl;
+extern rctl_hndl_t rc_process_semopm;
+extern rctl_hndl_t rc_zone_semmni;
+extern rctl_hndl_t rc_process_msgmnb;
+
+extern rctl_hndl_t rc_zone_msgmni;
+extern rctl_hndl_t rc_zone_shmmax;
+extern rctl_hndl_t rc_zone_shmmni;
+
+/* From uts/common/crypto/io/swrand.c */
+extern swrand_stats_t swrand_stats;
+
+#define ONEGB 1073741824ULL
+#define FOURGB 4294967295ULL
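+/* Note that FOURGB is actually UINT32_MAX, i.e. one byte shy of 4 GiB. */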
+
+/*
+ * The maximum length of the concatenation of argument vector strings we
+ * will return to the user via the branded procfs. Likewise for the env vector.
+ */
+int lxpr_maxargvlen = 4096;
+int lxpr_maxenvvlen = 4096;
+
+/*
+ * The lx /proc vnode operations vector
+ */
+const fs_operation_def_t lxpr_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = lxpr_open },
+ VOPNAME_CLOSE, { .vop_close = lxpr_close },
+ VOPNAME_READ, { .vop_read = lxpr_read },
+ VOPNAME_WRITE, { .vop_write = lxpr_write },
+ VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr },
+ VOPNAME_ACCESS, { .vop_access = lxpr_access },
+ VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup },
+ VOPNAME_CREATE, { .vop_create = lxpr_create },
+ VOPNAME_READDIR, { .vop_readdir = lxpr_readdir },
+ VOPNAME_READLINK, { .vop_readlink = lxpr_readlink },
+ VOPNAME_SPACE, { .vop_space = lxpr_space },
+ VOPNAME_SETATTR, { .vop_setattr = lxpr_setattr },
+ VOPNAME_FSYNC, { .error = lxpr_sync },
+ VOPNAME_SEEK, { .error = lxpr_sync },
+ VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive },
+ VOPNAME_CMP, { .vop_cmp = lxpr_cmp },
+ VOPNAME_REALVP, { .vop_realvp = lxpr_realvp },
+ VOPNAME_POLL, { .vop_poll = lxpr_poll },
+ NULL, NULL
+};
+
+
+/*
+ * contents of the lx /proc root directory.
+ */
+static lxpr_dirent_t lx_procdir[] = {
+ { LXPR_CGROUPS, "cgroups" },
+ { LXPR_CMDLINE, "cmdline" },
+ { LXPR_CPUINFO, "cpuinfo" },
+ { LXPR_DEVICES, "devices" },
+ { LXPR_DISKSTATS, "diskstats" },
+ { LXPR_DMA, "dma" },
+ { LXPR_FILESYSTEMS, "filesystems" },
+ { LXPR_INTERRUPTS, "interrupts" },
+ { LXPR_IOPORTS, "ioports" },
+ { LXPR_KCORE, "kcore" },
+ { LXPR_KMSG, "kmsg" },
+ { LXPR_LOADAVG, "loadavg" },
+ { LXPR_MEMINFO, "meminfo" },
+ { LXPR_MODULES, "modules" },
+ { LXPR_MOUNTS, "mounts" },
+ { LXPR_NETDIR, "net" },
+ { LXPR_PARTITIONS, "partitions" },
+ { LXPR_SELF, "self" },
+ { LXPR_STAT, "stat" },
+ { LXPR_SWAPS, "swaps" },
+ { LXPR_SYSDIR, "sys" },
+ { LXPR_UPTIME, "uptime" },
+ { LXPR_VERSION, "version" },
+ { LXPR_VMSTAT, "vmstat" }
+};
+
+#define PROCDIRFILES (sizeof (lx_procdir) / sizeof (lx_procdir[0]))
+
+/*
+ * Contents of an lx /proc/<pid> directory.
+ */
+static lxpr_dirent_t piddir[] = {
+ { LXPR_PID_AUXV, "auxv" },
+ { LXPR_PID_CGROUP, "cgroup" },
+ { LXPR_PID_CMDLINE, "cmdline" },
+ { LXPR_PID_COMM, "comm" },
+ { LXPR_PID_CPU, "cpu" },
+ { LXPR_PID_CURDIR, "cwd" },
+ { LXPR_PID_ENV, "environ" },
+ { LXPR_PID_EXE, "exe" },
+ { LXPR_PID_GIDMAP, "gid_map" },
+ { LXPR_PID_LIMITS, "limits" },
+ { LXPR_PID_LOGINUID, "loginuid" },
+ { LXPR_PID_MAPS, "maps" },
+ { LXPR_PID_MEM, "mem" },
+ { LXPR_PID_MOUNTINFO, "mountinfo" },
+ { LXPR_PID_MOUNTS, "mounts" },
+ { LXPR_PID_OOM_SCR_ADJ, "oom_score_adj" },
+ { LXPR_PID_PERSONALITY, "personality" },
+ { LXPR_PID_ROOTDIR, "root" },
+ { LXPR_PID_STAT, "stat" },
+ { LXPR_PID_STATM, "statm" },
+ { LXPR_PID_STATUS, "status" },
+ { LXPR_PID_TASKDIR, "task" },
+ { LXPR_PID_FDDIR, "fd" },
+ { LXPR_PID_UIDMAP, "uid_map" }
+};
+
+#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]))
+
+/*
+ * Contents of an lx /proc/<pid>/task/<tid> directory.
+ */
+static lxpr_dirent_t tiddir[] = {
+ { LXPR_PID_TID_AUXV, "auxv" },
+ { LXPR_PID_CGROUP, "cgroup" },
+ { LXPR_PID_CMDLINE, "cmdline" },
+ { LXPR_PID_TID_COMM, "comm" },
+ { LXPR_PID_CPU, "cpu" },
+ { LXPR_PID_CURDIR, "cwd" },
+ { LXPR_PID_ENV, "environ" },
+ { LXPR_PID_EXE, "exe" },
+ { LXPR_PID_GIDMAP, "gid_map" },
+ { LXPR_PID_LIMITS, "limits" },
+ { LXPR_PID_LOGINUID, "loginuid" },
+ { LXPR_PID_MAPS, "maps" },
+ { LXPR_PID_MEM, "mem" },
+ { LXPR_PID_MOUNTINFO, "mountinfo" },
+ { LXPR_PID_TID_OOM_SCR_ADJ, "oom_score_adj" },
+ { LXPR_PID_PERSONALITY, "personality" },
+ { LXPR_PID_ROOTDIR, "root" },
+ { LXPR_PID_TID_STAT, "stat" },
+ { LXPR_PID_STATM, "statm" },
+ { LXPR_PID_TID_STATUS, "status" },
+ { LXPR_PID_FDDIR, "fd" },
+ { LXPR_PID_UIDMAP, "uid_map" }
+};
+
+#define TIDDIRFILES (sizeof (tiddir) / sizeof (tiddir[0]))
+
+#define LX_RLIM_INFINITY 0xFFFFFFFFFFFFFFFF
+
+#define RCTL_INFINITE(x) \
+ ((x.rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \
+ (x.rcv_flagaction & RCTL_GLOBAL_INFINITE))
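+
+/*
+ * A resource control value is treated as unlimited only when it is both
+ * locally maximal and globally infinite.
+ */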
+
+typedef struct lxpr_rlimtab {
+ char *rlim_name; /* limit name */
+ char *rlim_unit; /* limit unit */
+ char *rlim_rctl; /* rctl source */
+} lxpr_rlimtab_t;
+
+static lxpr_rlimtab_t lxpr_rlimtab[] = {
+ { "Max cpu time", "seconds", "process.max-cpu-time" },
+ { "Max file size", "bytes", "process.max-file-size" },
+ { "Max data size", "bytes", "process.max-data-size" },
+ { "Max stack size", "bytes", "process.max-stack-size" },
+ { "Max core file size", "bytes", "process.max-core-size" },
+ { "Max resident set", "bytes", "zone.max-physical-memory" },
+ { "Max processes", "processes", "zone.max-lwps" },
+ { "Max open files", "files", "process.max-file-descriptor" },
+ { "Max locked memory", "bytes", "zone.max-locked-memory" },
+ { "Max address space", "bytes", "process.max-address-space" },
+ { "Max file locks", "locks", NULL },
+ { "Max pending signals", "signals",
+ "process.max-sigqueue-size" },
+ { "Max msgqueue size", "bytes", "process.max-msg-messages" }
+};
+
+#define LX_RLIM_TAB_LEN (sizeof (lxpr_rlimtab) / sizeof (lxpr_rlimtab[0]))
+
+
+/*
+ * contents of lx /proc/net directory
+ */
+static lxpr_dirent_t netdir[] = {
+ { LXPR_NET_ARP, "arp" },
+ { LXPR_NET_DEV, "dev" },
+ { LXPR_NET_DEV_MCAST, "dev_mcast" },
+ { LXPR_NET_IF_INET6, "if_inet6" },
+ { LXPR_NET_IGMP, "igmp" },
+ { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" },
+ { LXPR_NET_IP_MR_VIF, "ip_mr_vif" },
+ { LXPR_NET_IPV6_ROUTE, "ipv6_route" },
+ { LXPR_NET_MCFILTER, "mcfilter" },
+ { LXPR_NET_NETSTAT, "netstat" },
+ { LXPR_NET_RAW, "raw" },
+ { LXPR_NET_ROUTE, "route" },
+ { LXPR_NET_RPC, "rpc" },
+ { LXPR_NET_RT_CACHE, "rt_cache" },
+ { LXPR_NET_SOCKSTAT, "sockstat" },
+ { LXPR_NET_SNMP, "snmp" },
+ { LXPR_NET_STAT, "stat" },
+ { LXPR_NET_TCP, "tcp" },
+ { LXPR_NET_TCP6, "tcp6" },
+ { LXPR_NET_UDP, "udp" },
+ { LXPR_NET_UDP6, "udp6" },
+ { LXPR_NET_UNIX, "unix" }
+};
+
+#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0]))
+
+/*
+ * contents of /proc/sys directory
+ */
+static lxpr_dirent_t sysdir[] = {
+ { LXPR_SYS_FSDIR, "fs" },
+ { LXPR_SYS_KERNELDIR, "kernel" },
+ { LXPR_SYS_NETDIR, "net" },
+ { LXPR_SYS_VMDIR, "vm" },
+};
+
+#define SYSDIRFILES (sizeof (sysdir) / sizeof (sysdir[0]))
+
+/*
+ * contents of /proc/sys/fs directory
+ */
+static lxpr_dirent_t sys_fsdir[] = {
+ { LXPR_SYS_FS_AIO_MAX_NR, "aio-max-nr" },
+ { LXPR_SYS_FS_AIO_NR, "aio-nr" },
+ { LXPR_SYS_FS_FILEMAX, "file-max" },
+ { LXPR_SYS_FS_FILENR, "file-nr" },
+ { LXPR_SYS_FS_INOTIFYDIR, "inotify" },
+ { LXPR_SYS_FS_PIPE_MAX, "pipe-max-size" },
+};
+
+#define SYS_FSDIRFILES (sizeof (sys_fsdir) / sizeof (sys_fsdir[0]))
+
+/*
+ * contents of /proc/sys/fs/inotify directory
+ */
+static lxpr_dirent_t sys_fs_inotifydir[] = {
+ { LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
+ { LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
+ { LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
+};
+
+#define SYS_FS_INOTIFYDIRFILES \
+ (sizeof (sys_fs_inotifydir) / sizeof (sys_fs_inotifydir[0]))
+
+/*
+ * contents of /proc/sys/kernel directory
+ */
+static lxpr_dirent_t sys_kerneldir[] = {
+ { LXPR_SYS_KERNEL_CAPLCAP, "cap_last_cap" },
+ { LXPR_SYS_KERNEL_COREPATT, "core_pattern" },
+ { LXPR_SYS_KERNEL_HOSTNAME, "hostname" },
+ { LXPR_SYS_KERNEL_MSGMAX, "msgmax" },
+ { LXPR_SYS_KERNEL_MSGMNB, "msgmnb" },
+ { LXPR_SYS_KERNEL_MSGMNI, "msgmni" },
+ { LXPR_SYS_KERNEL_NGROUPS_MAX, "ngroups_max" },
+ { LXPR_SYS_KERNEL_OSREL, "osrelease" },
+ { LXPR_SYS_KERNEL_PID_MAX, "pid_max" },
+ { LXPR_SYS_KERNEL_RANDDIR, "random" },
+ { LXPR_SYS_KERNEL_SEM, "sem" },
+ { LXPR_SYS_KERNEL_SHMALL, "shmall" },
+ { LXPR_SYS_KERNEL_SHMMAX, "shmmax" },
+ { LXPR_SYS_KERNEL_SHMMNI, "shmmni" },
+ { LXPR_SYS_KERNEL_THREADS_MAX, "threads-max" },
+};
+
+#define SYS_KERNELDIRFILES (sizeof (sys_kerneldir) / sizeof (sys_kerneldir[0]))
+
+/*
+ * contents of /proc/sys/kernel/random directory
+ */
+static lxpr_dirent_t sys_randdir[] = {
+ { LXPR_SYS_KERNEL_RAND_BOOTID, "boot_id" },
+ { LXPR_SYS_KERNEL_RAND_ENTAVL, "entropy_avail" },
+};
+
+#define SYS_RANDDIRFILES (sizeof (sys_randdir) / sizeof (sys_randdir[0]))
+
+/*
+ * contents of /proc/sys/net directory
+ */
+static lxpr_dirent_t sys_netdir[] = {
+ { LXPR_SYS_NET_COREDIR, "core" },
+ { LXPR_SYS_NET_IPV4DIR, "ipv4" },
+};
+
+#define SYS_NETDIRFILES (sizeof (sys_netdir) / sizeof (sys_netdir[0]))
+
+/*
+ * contents of /proc/sys/net/core directory
+ */
+static lxpr_dirent_t sys_net_coredir[] = {
+ { LXPR_SYS_NET_CORE_SOMAXCON, "somaxconn" },
+};
+
+#define SYS_NET_COREDIRFILES \
+ (sizeof (sys_net_coredir) / sizeof (sys_net_coredir[0]))
+
+/*
+ * contents of /proc/sys/net/ipv4 directory
+ * See the Linux ip(7) & tcp(7) man pages for descriptions and the illumos
+ * ip(7p) & tcp(7p) man pages for the native descriptions.
+ */
+static lxpr_dirent_t sys_net_ipv4dir[] = {
+ { LXPR_SYS_NET_IPV4_ICMP_EIB, "icmp_echo_ignore_broadcasts" },
+ { LXPR_SYS_NET_IPV4_IP_FORWARD, "ip_forward" },
+ { LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, "ip_local_port_range" },
+ { LXPR_SYS_NET_IPV4_TCP_FIN_TO, "tcp_fin_timeout" },
+ { LXPR_SYS_NET_IPV4_TCP_KA_INT, "tcp_keepalive_intvl" },
+ { LXPR_SYS_NET_IPV4_TCP_KA_TIM, "tcp_keepalive_time" },
+ { LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL, "tcp_max_syn_backlog" },
+ { LXPR_SYS_NET_IPV4_TCP_RETRY2, "tcp_retries2" },
+ { LXPR_SYS_NET_IPV4_TCP_RMEM, "tcp_rmem" },
+ { LXPR_SYS_NET_IPV4_TCP_SACK, "tcp_sack" },
+ { LXPR_SYS_NET_IPV4_TCP_WINSCALE, "tcp_window_scaling" },
+ { LXPR_SYS_NET_IPV4_TCP_WMEM, "tcp_wmem" },
+};
+
+#define SYS_NET_IPV4DIRFILES \
+ (sizeof (sys_net_ipv4dir) / sizeof (sys_net_ipv4dir[0]))
+
+/*
+ * contents of /proc/sys/vm directory
+ */
+static lxpr_dirent_t sys_vmdir[] = {
+ { LXPR_SYS_VM_DIRTY_BG_BYTES, "dirty_background_bytes" },
+ { LXPR_SYS_VM_DIRTY_BG_RATIO, "dirty_background_ratio" },
+ { LXPR_SYS_VM_DIRTY_BYTES, "dirty_bytes" },
+ { LXPR_SYS_VM_DIRTY_EXP_CS, "dirty_expire_centisecs" },
+ { LXPR_SYS_VM_DIRTY_RATIO, "dirty_ratio" },
+ { LXPR_SYS_VM_DIRTYTIME_EXP_SEC, "dirtytime_expire_seconds" },
+ { LXPR_SYS_VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
+ { LXPR_SYS_VM_MAX_MAP_CNT, "max_map_count" },
+ { LXPR_SYS_VM_MINFR_KB, "min_free_kbytes" },
+ { LXPR_SYS_VM_NHUGEP, "nr_hugepages" },
+ { LXPR_SYS_VM_OVERCOMMIT_MEM, "overcommit_memory" },
+ { LXPR_SYS_VM_SWAPPINESS, "swappiness" },
+};
+
+#define SYS_VMDIRFILES (sizeof (sys_vmdir) / sizeof (sys_vmdir[0]))
+
+/*
+ * Table for standard writable files. Non-standard writable files not in this
+ * table can be handled explicitly as special cases.
+ * This table drives lxpr_is_writable, lxpr_write, and lxpr_create.
+ * Note that the entries LXPR_PID_FD_FD and LXPR_PID_TID_FD_FD exist in the
+ * table both to verify writability and to satisfy opening with O_CREAT.
+ */
+typedef struct wftab {
+ lxpr_nodetype_t wft_type; /* file entry type */
+ int (*wft_wrf)(lxpr_node_t *, struct uio *, cred_t *,
+ caller_context_t *); /* write function */
+} wftab_t;
+
+static wftab_t wr_tab[] = {
+ {LXPR_PID_COMM, lxpr_write_pid_tid_comm},
+ {LXPR_PID_FD_FD, NULL},
+ {LXPR_PID_LOGINUID, lxpr_write_pid_loginuid},
+ {LXPR_PID_OOM_SCR_ADJ, NULL},
+ {LXPR_PID_TID_COMM, lxpr_write_pid_tid_comm},
+ {LXPR_PID_TID_FD_FD, NULL},
+ {LXPR_PID_TID_OOM_SCR_ADJ, NULL},
+ {LXPR_SYS_FS_FILEMAX, NULL},
+ {LXPR_SYS_KERNEL_COREPATT, lxpr_write_sys_kernel_corepatt},
+ {LXPR_SYS_KERNEL_SHMALL, NULL},
+ {LXPR_SYS_KERNEL_SHMMAX, NULL},
+ {LXPR_SYS_FS_PIPE_MAX, lxpr_write_sys_fs_pipe_max},
+ {LXPR_SYS_NET_CORE_SOMAXCON, lxpr_write_sys_net_core_somaxc},
+ {LXPR_SYS_NET_IPV4_ICMP_EIB, lxpr_write_sys_net_ipv4_icmp_eib},
+ {LXPR_SYS_NET_IPV4_IP_FORWARD, NULL},
+ {LXPR_SYS_NET_IPV4_IP_LPORT_RANGE,
+ lxpr_write_sys_net_ipv4_ip_lport_range},
+ {LXPR_SYS_NET_IPV4_TCP_FIN_TO, lxpr_write_sys_net_ipv4_tcp_fin_to},
+ {LXPR_SYS_NET_IPV4_TCP_KA_INT, lxpr_write_sys_net_ipv4_tcp_ka_int},
+ {LXPR_SYS_NET_IPV4_TCP_KA_TIM, lxpr_write_sys_net_ipv4_tcp_ka_tim},
+ {LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL,
+ lxpr_write_sys_net_ipv4_tcp_max_syn_bl},
+ {LXPR_SYS_NET_IPV4_TCP_RETRY2, lxpr_write_sys_net_ipv4_tcp_retry2},
+ {LXPR_SYS_NET_IPV4_TCP_RMEM, lxpr_write_sys_net_ipv4_tcp_rwmem},
+ {LXPR_SYS_NET_IPV4_TCP_SACK, lxpr_write_sys_net_ipv4_tcp_sack},
+ {LXPR_SYS_NET_IPV4_TCP_WINSCALE, lxpr_write_sys_net_ipv4_tcp_winscale},
+ {LXPR_SYS_NET_IPV4_TCP_WMEM, lxpr_write_sys_net_ipv4_tcp_rwmem},
+ {LXPR_SYS_VM_DIRTY_BG_BYTES, NULL},
+ {LXPR_SYS_VM_DIRTY_BG_RATIO, NULL},
+ {LXPR_SYS_VM_DIRTY_BYTES, NULL},
+ {LXPR_SYS_VM_DIRTY_EXP_CS, NULL},
+ {LXPR_SYS_VM_DIRTY_RATIO, NULL},
+ {LXPR_SYS_VM_DIRTYTIME_EXP_SEC, NULL},
+ {LXPR_SYS_VM_DIRTY_WB_CS, NULL},
+ {LXPR_SYS_VM_OVERCOMMIT_MEM, NULL},
+ {LXPR_SYS_VM_SWAPPINESS, NULL},
+ {LXPR_INVALID, NULL}
+};
+
+/*
+ * Centralized writability test for the standard writable proc files;
+ * non-standard writable files are handled as special cases elsewhere.
+ */
+boolean_t
+lxpr_is_writable(lxpr_nodetype_t type)
+{
+ int i;
+
+ for (i = 0; wr_tab[i].wft_type != LXPR_INVALID; i++) {
+ if (wr_tab[i].wft_type == type)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * lxpr_open(): Vnode operation for VOP_OPEN()
+ */
+static int
+lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ vnode_t *vp = *vpp;
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ vnode_t *rvp;
+ int error = 0;
+
+ /* Restrict writes to certain files */
+ if ((flag & FWRITE) && !lxpr_is_writable(type)) {
+ return (EPERM);
+ }
+
+ /*
+ * If we are opening an underlying file, only allow regular files,
+ * fifos, or sockets; reject the open for anything else. If we are
+ * opening the current or root directory, simply pass the open
+ * through to the real vnode.
+ */
+ if (lxpnp->lxpr_realvp != NULL) {
+ rvp = lxpnp->lxpr_realvp;
+
+ if (type == LXPR_PID_FD_FD && rvp->v_type != VREG &&
+ rvp->v_type != VFIFO && rvp->v_type != VSOCK) {
+ error = EACCES;
+ } else {
+ if (type == LXPR_PID_FD_FD && rvp->v_type == VFIFO) {
+ /*
+ * This flag lets the fifo open know that
+ * we're using proc/fd to open a fd which we
+ * already have open. Otherwise, the fifo might
+ * reject an open if the other end has closed.
+ */
+ flag |= FKLYR;
+ }
+ /*
+ * Need to hold rvp since VOP_OPEN() may release it.
+ */
+ VN_HOLD(rvp);
+ error = VOP_OPEN(&rvp, flag, cr, ct);
+ if (error) {
+ VN_RELE(rvp);
+ } else {
+ *vpp = rvp;
+ VN_RELE(vp);
+ }
+ }
+ }
+
+ return (error);
+}
+
+
+/*
+ * lxpr_close(): Vnode operation for VOP_CLOSE()
+ */
+/* ARGSUSED */
+static int
+lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+#ifdef DEBUG
+ lxpr_node_t *lxpr = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpr->lxpr_type;
+
+ /*
+ * We should never get here for these node types because the close
+ * is performed on the realvp instead.
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR &&
+ type != LXPR_PID_EXE);
+#endif /* DEBUG */
+
+ return (0);
+}
+
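+/*
+ * Array of read functions, indexed by lx /proc file type. The entries must
+ * stay in the same order as the lxpr_nodetype_t enumeration, as with the
+ * lookup and readdir arrays below.
+ */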
+static void (*lxpr_read_function[LXPR_NFILES])() = {
+ NULL, /* invalid */
+ lxpr_read_isdir, /* /proc */
+ lxpr_read_isdir, /* /proc/<pid> */
+ lxpr_read_pid_auxv, /* /proc/<pid>/auxv */
+ lxpr_read_pid_cgroup, /* /proc/<pid>/cgroup */
+ lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */
+ lxpr_read_pid_tid_comm, /* /proc/<pid>/comm */
+ lxpr_read_empty, /* /proc/<pid>/cpu */
+ lxpr_read_invalid, /* /proc/<pid>/cwd */
+ lxpr_read_pid_env, /* /proc/<pid>/environ */
+ lxpr_read_invalid, /* /proc/<pid>/exe */
+ lxpr_read_pid_id_map, /* /proc/<pid>/gid_map */
+ lxpr_read_pid_limits, /* /proc/<pid>/limits */
+ lxpr_read_pid_loginuid, /* /proc/<pid>/loginuid */
+ lxpr_read_pid_maps, /* /proc/<pid>/maps */
+ lxpr_read_empty, /* /proc/<pid>/mem */
+ lxpr_read_pid_mountinfo, /* /proc/<pid>/mountinfo */
+ lxpr_read_mounts, /* /proc/<pid>/mounts */
+ lxpr_read_pid_oom_scr_adj, /* /proc/<pid>/oom_score_adj */
+ lxpr_read_pid_personality, /* /proc/<pid>/personality */
+ lxpr_read_invalid, /* /proc/<pid>/root */
+ lxpr_read_pid_tid_stat, /* /proc/<pid>/stat */
+ lxpr_read_pid_statm, /* /proc/<pid>/statm */
+ lxpr_read_pid_tid_status, /* /proc/<pid>/status */
+ lxpr_read_isdir, /* /proc/<pid>/task */
+ lxpr_read_isdir, /* /proc/<pid>/task/nn */
+ lxpr_read_isdir, /* /proc/<pid>/fd */
+ lxpr_read_fd, /* /proc/<pid>/fd/nn */
+ lxpr_read_pid_id_map, /* /proc/<pid>/uid_map */
+ lxpr_read_pid_auxv, /* /proc/<pid>/task/<tid>/auxv */
+ lxpr_read_pid_cgroup, /* /proc/<pid>/task/<tid>/cgroup */
+ lxpr_read_pid_cmdline, /* /proc/<pid>/task/<tid>/cmdline */
+ lxpr_read_pid_tid_comm, /* /proc/<pid>/task/<tid>/comm */
+ lxpr_read_empty, /* /proc/<pid>/task/<tid>/cpu */
+ lxpr_read_invalid, /* /proc/<pid>/task/<tid>/cwd */
+ lxpr_read_pid_env, /* /proc/<pid>/task/<tid>/environ */
+ lxpr_read_invalid, /* /proc/<pid>/task/<tid>/exe */
+ lxpr_read_pid_id_map, /* /proc/<pid>/task/<tid>/gid_map */
+ lxpr_read_pid_limits, /* /proc/<pid>/task/<tid>/limits */
+ lxpr_read_pid_loginuid, /* /proc/<pid>/task/<tid>/loginuid */
+ lxpr_read_pid_maps, /* /proc/<pid>/task/<tid>/maps */
+ lxpr_read_empty, /* /proc/<pid>/task/<tid>/mem */
+ lxpr_read_pid_mountinfo, /* /proc/<pid>/task/<tid>/mountinfo */
+ lxpr_read_pid_oom_scr_adj, /* /proc/<pid>/task/<tid>/oom_scr_adj */
+ lxpr_read_pid_personality, /* /proc/<pid>/task/<tid>/personality */
+ lxpr_read_invalid, /* /proc/<pid>/task/<tid>/root */
+ lxpr_read_pid_tid_stat, /* /proc/<pid>/task/<tid>/stat */
+ lxpr_read_pid_statm, /* /proc/<pid>/task/<tid>/statm */
+ lxpr_read_pid_tid_status, /* /proc/<pid>/task/<tid>/status */
+ lxpr_read_isdir, /* /proc/<pid>/task/<tid>/fd */
+ lxpr_read_fd, /* /proc/<pid>/task/<tid>/fd/nn */
+ lxpr_read_pid_id_map, /* /proc/<pid>/task/<tid>/uid_map */
+ lxpr_read_cgroups, /* /proc/cgroups */
+ lxpr_read_cmdline, /* /proc/cmdline */
+ lxpr_read_cpuinfo, /* /proc/cpuinfo */
+ lxpr_read_devices, /* /proc/devices */
+ lxpr_read_diskstats, /* /proc/diskstats */
+ lxpr_read_empty, /* /proc/dma */
+ lxpr_read_filesystems, /* /proc/filesystems */
+ lxpr_read_empty, /* /proc/interrupts */
+ lxpr_read_empty, /* /proc/ioports */
+ lxpr_read_empty, /* /proc/kcore */
+ lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */
+ lxpr_read_loadavg, /* /proc/loadavg */
+ lxpr_read_meminfo, /* /proc/meminfo */
+ lxpr_read_empty, /* /proc/modules */
+ lxpr_read_mounts, /* /proc/mounts */
+ lxpr_read_isdir, /* /proc/net */
+ lxpr_read_net_arp, /* /proc/net/arp */
+ lxpr_read_net_dev, /* /proc/net/dev */
+ lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */
+ lxpr_read_net_if_inet6, /* /proc/net/if_inet6 */
+ lxpr_read_net_igmp, /* /proc/net/igmp */
+ lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */
+ lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */
+ lxpr_read_net_ipv6_route, /* /proc/net/ipv6_route */
+ lxpr_read_net_mcfilter, /* /proc/net/mcfilter */
+ lxpr_read_net_netstat, /* /proc/net/netstat */
+ lxpr_read_net_raw, /* /proc/net/raw */
+ lxpr_read_net_route, /* /proc/net/route */
+ lxpr_read_net_rpc, /* /proc/net/rpc */
+ lxpr_read_net_rt_cache, /* /proc/net/rt_cache */
+ lxpr_read_net_sockstat, /* /proc/net/sockstat */
+ lxpr_read_net_snmp, /* /proc/net/snmp */
+ lxpr_read_net_stat, /* /proc/net/stat */
+ lxpr_read_net_tcp, /* /proc/net/tcp */
+ lxpr_read_net_tcp6, /* /proc/net/tcp6 */
+ lxpr_read_net_udp, /* /proc/net/udp */
+ lxpr_read_net_udp6, /* /proc/net/udp6 */
+ lxpr_read_net_unix, /* /proc/net/unix */
+ lxpr_read_partitions, /* /proc/partitions */
+ lxpr_read_invalid, /* /proc/self */
+ lxpr_read_stat, /* /proc/stat */
+ lxpr_read_swaps, /* /proc/swaps */
+ lxpr_read_invalid, /* /proc/sys */
+ lxpr_read_invalid, /* /proc/sys/fs */
+ lxpr_read_sys_fs_aiomax, /* /proc/sys/fs/aio-max-nr */
+ lxpr_read_sys_fs_aionr, /* /proc/sys/fs/aio-nr */
+ lxpr_read_sys_fs_filemax, /* /proc/sys/fs/file-max */
+ lxpr_read_sys_fs_filenr, /* /proc/sys/fs/file-nr */
+ lxpr_read_invalid, /* /proc/sys/fs/inotify */
+ lxpr_read_sys_fs_inotify_max_queued_events, /* max_queued_events */
+ lxpr_read_sys_fs_inotify_max_user_instances, /* max_user_instances */
+ lxpr_read_sys_fs_inotify_max_user_watches, /* max_user_watches */
+ lxpr_read_sys_fs_pipe_max, /* /proc/sys/fs/pipe-max-size */
+ lxpr_read_invalid, /* /proc/sys/kernel */
+ lxpr_read_sys_kernel_caplcap, /* /proc/sys/kernel/cap_last_cap */
+ lxpr_read_sys_kernel_corepatt, /* /proc/sys/kernel/core_pattern */
+ lxpr_read_sys_kernel_hostname, /* /proc/sys/kernel/hostname */
+ lxpr_read_sys_kernel_msgmax, /* /proc/sys/kernel/msgmax */
+ lxpr_read_sys_kernel_msgmnb, /* /proc/sys/kernel/msgmnb */
+ lxpr_read_sys_kernel_msgmni, /* /proc/sys/kernel/msgmni */
+ lxpr_read_sys_kernel_ngroups_max, /* /proc/sys/kernel/ngroups_max */
+ lxpr_read_sys_kernel_osrel, /* /proc/sys/kernel/osrelease */
+ lxpr_read_sys_kernel_pid_max, /* /proc/sys/kernel/pid_max */
+ lxpr_read_invalid, /* /proc/sys/kernel/random */
+ lxpr_read_sys_kernel_rand_bootid, /* /proc/sys/kernel/random/boot_id */
+ lxpr_read_sys_kernel_rand_entavl, /* .../kernel/random/entropy_avail */
+ lxpr_read_sys_kernel_sem, /* /proc/sys/kernel/sem */
+ lxpr_read_sys_kernel_shmall, /* /proc/sys/kernel/shmall */
+ lxpr_read_sys_kernel_shmmax, /* /proc/sys/kernel/shmmax */
+ lxpr_read_sys_kernel_shmmni, /* /proc/sys/kernel/shmmni */
+ lxpr_read_sys_kernel_threads_max, /* /proc/sys/kernel/threads-max */
+ lxpr_read_invalid, /* /proc/sys/net */
+ lxpr_read_invalid, /* /proc/sys/net/core */
+ lxpr_read_sys_net_core_somaxc, /* /proc/sys/net/core/somaxconn */
+ lxpr_read_invalid, /* /proc/sys/net/ipv4 */
+ lxpr_read_sys_net_ipv4_icmp_eib, /* .../icmp_echo_ignore_broadcasts */
+ lxpr_read_sys_net_ipv4_ip_forward, /* .../ipv4/ip_forward */
+ lxpr_read_sys_net_ipv4_ip_lport_range, /* ../ipv4/ip_local_port_range */
+ lxpr_read_sys_net_ipv4_tcp_fin_to, /* .../ipv4/tcp_fin_timeout */
+ lxpr_read_sys_net_ipv4_tcp_ka_int, /* .../ipv4/tcp_keepalive_intvl */
+ lxpr_read_sys_net_ipv4_tcp_ka_tim, /* .../ipv4/tcp_keepalive_time */
+ lxpr_read_sys_net_ipv4_tcp_max_syn_bl, /* ../ipv4/tcp_max_syn_backlog */
+ lxpr_read_sys_net_ipv4_tcp_retry2, /* .../ipv4/tcp_retries2 */
+ lxpr_read_sys_net_ipv4_tcp_rwmem, /* .../ipv4/tcp_rmem */
+ lxpr_read_sys_net_ipv4_tcp_sack, /* .../ipv4/tcp_sack */
+ lxpr_read_sys_net_ipv4_tcp_winscale, /* .../ipv4/tcp_window_scaling */
+ lxpr_read_sys_net_ipv4_tcp_rwmem, /* .../ipv4/tcp_wmem */
+ lxpr_read_invalid, /* /proc/sys/vm */
+ lxpr_read_sys_vm_dirty, /* .../vm/dirty_background_bytes */
+ lxpr_read_sys_vm_dirty, /* .../vm/dirty_background_ratio */
+ lxpr_read_sys_vm_dirty, /* .../vm/dirty_bytes */
+ lxpr_read_sys_vm_dirty, /* .../vm/dirty_expire_centisecs */
+ lxpr_read_sys_vm_dirty, /* .../vm/dirty_ratio */
+ lxpr_read_sys_vm_dirty, /* .../vm/dirtytime_expire_seconds */
+ lxpr_read_sys_vm_dirty, /* .../vm/dirty_writeback_centisecs */
+ lxpr_read_sys_vm_max_map_cnt, /* /proc/sys/vm/max_map_count */
+ lxpr_read_sys_vm_minfr_kb, /* /proc/sys/vm/min_free_kbytes */
+ lxpr_read_sys_vm_nhpages, /* /proc/sys/vm/nr_hugepages */
+ lxpr_read_sys_vm_overcommit_mem, /* /proc/sys/vm/overcommit_memory */
+ lxpr_read_sys_vm_swappiness, /* /proc/sys/vm/swappiness */
+ lxpr_read_uptime, /* /proc/uptime */
+ lxpr_read_version, /* /proc/version */
+ lxpr_read_vmstat, /* /proc/vmstat */
+};
+
+/*
+ * Array of lookup functions, indexed by lx /proc file type.
+ */
+static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = {
+ NULL, /* invalid */
+ lxpr_lookup_procdir, /* /proc */
+ lxpr_lookup_piddir, /* /proc/<pid> */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/auxv */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cgroup */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/comm */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/gid_map */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/limits */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/loginuid */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/mountinfo */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/mounts */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/oom_score_adj */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/personality */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/root */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/status */
+ lxpr_lookup_taskdir, /* /proc/<pid>/task */
+ lxpr_lookup_task_tid_dir, /* /proc/<pid>/task/nn */
+ lxpr_lookup_fddir, /* /proc/<pid>/fd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/uid_map */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/auxv */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cgroup */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/comm */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/environ */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/exe */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/gid_map */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/limits */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/loginuid */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/maps */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mem */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/oom_scr_adj */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/personality */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/root */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/stat */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/statm */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/status */
+ lxpr_lookup_fddir, /* /proc/<pid>/task/<tid>/fd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/uid_map */
+ lxpr_lookup_not_a_dir, /* /proc/cgroups */
+ lxpr_lookup_not_a_dir, /* /proc/cmdline */
+ lxpr_lookup_not_a_dir, /* /proc/cpuinfo */
+ lxpr_lookup_not_a_dir, /* /proc/devices */
+ lxpr_lookup_not_a_dir, /* /proc/diskstats */
+ lxpr_lookup_not_a_dir, /* /proc/dma */
+ lxpr_lookup_not_a_dir, /* /proc/filesystems */
+ lxpr_lookup_not_a_dir, /* /proc/interrupts */
+ lxpr_lookup_not_a_dir, /* /proc/ioports */
+ lxpr_lookup_not_a_dir, /* /proc/kcore */
+ lxpr_lookup_not_a_dir, /* /proc/kmsg */
+ lxpr_lookup_not_a_dir, /* /proc/loadavg */
+ lxpr_lookup_not_a_dir, /* /proc/meminfo */
+ lxpr_lookup_not_a_dir, /* /proc/modules */
+ lxpr_lookup_not_a_dir, /* /proc/mounts */
+ lxpr_lookup_netdir, /* /proc/net */
+ lxpr_lookup_not_a_dir, /* /proc/net/arp */
+ lxpr_lookup_not_a_dir, /* /proc/net/dev */
+ lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */
+ lxpr_lookup_not_a_dir, /* /proc/net/if_inet6 */
+ lxpr_lookup_not_a_dir, /* /proc/net/igmp */
+ lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */
+ lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */
+ lxpr_lookup_not_a_dir, /* /proc/net/ipv6_route */
+ lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */
+ lxpr_lookup_not_a_dir, /* /proc/net/netstat */
+ lxpr_lookup_not_a_dir, /* /proc/net/raw */
+ lxpr_lookup_not_a_dir, /* /proc/net/route */
+ lxpr_lookup_not_a_dir, /* /proc/net/rpc */
+ lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */
+ lxpr_lookup_not_a_dir, /* /proc/net/sockstat */
+ lxpr_lookup_not_a_dir, /* /proc/net/snmp */
+ lxpr_lookup_not_a_dir, /* /proc/net/stat */
+ lxpr_lookup_not_a_dir, /* /proc/net/tcp */
+ lxpr_lookup_not_a_dir, /* /proc/net/tcp6 */
+ lxpr_lookup_not_a_dir, /* /proc/net/udp */
+ lxpr_lookup_not_a_dir, /* /proc/net/udp6 */
+ lxpr_lookup_not_a_dir, /* /proc/net/unix */
+ lxpr_lookup_not_a_dir, /* /proc/partitions */
+ lxpr_lookup_not_a_dir, /* /proc/self */
+ lxpr_lookup_not_a_dir, /* /proc/stat */
+ lxpr_lookup_not_a_dir, /* /proc/swaps */
+ lxpr_lookup_sysdir, /* /proc/sys */
+ lxpr_lookup_sys_fsdir, /* /proc/sys/fs */
+ lxpr_lookup_not_a_dir, /* /proc/sys/fs/aio-max-nr */
+ lxpr_lookup_not_a_dir, /* /proc/sys/fs/aio-nr */
+ lxpr_lookup_not_a_dir, /* /proc/sys/fs/file-max */
+ lxpr_lookup_not_a_dir, /* /proc/sys/fs/file-nr */
+ lxpr_lookup_sys_fs_inotifydir, /* /proc/sys/fs/inotify */
+ lxpr_lookup_not_a_dir, /* .../inotify/max_queued_events */
+ lxpr_lookup_not_a_dir, /* .../inotify/max_user_instances */
+ lxpr_lookup_not_a_dir, /* .../inotify/max_user_watches */
+ lxpr_lookup_not_a_dir, /* /proc/sys/fs/pipe-max-size */
+ lxpr_lookup_sys_kerneldir, /* /proc/sys/kernel */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/cap_last_cap */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/core_pattern */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/hostname */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmax */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmnb */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmni */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/ngroups_max */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/osrelease */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/pid_max */
+ lxpr_lookup_sys_kdir_randdir, /* /proc/sys/kernel/random */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/random/boot_id */
+ lxpr_lookup_not_a_dir, /* .../kernel/random/entropy_avail */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/sem */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmall */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmmax */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmmni */
+ lxpr_lookup_not_a_dir, /* /proc/sys/kernel/threads-max */
+ lxpr_lookup_sys_netdir, /* /proc/sys/net */
+ lxpr_lookup_sys_net_coredir, /* /proc/sys/net/core */
+ lxpr_lookup_not_a_dir, /* /proc/sys/net/core/somaxconn */
+ lxpr_lookup_sys_net_ipv4dir, /* /proc/sys/net/ipv4 */
+ lxpr_lookup_not_a_dir, /* .../icmp_echo_ignore_broadcasts */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/ip_forward */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/ip_local_port_range */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_fin_timeout */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_keepalive_intvl */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_keepalive_time */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_max_syn_backlog */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_retries2 */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_rmem */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_sack */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_window_scaling */
+ lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_wmem */
+ lxpr_lookup_sys_vmdir, /* /proc/sys/vm */
+ lxpr_lookup_not_a_dir, /* .../vm/dirty_background_bytes */
+ lxpr_lookup_not_a_dir, /* .../vm/dirty_background_ratio */
+ lxpr_lookup_not_a_dir, /* .../vm/dirty_bytes */
+ lxpr_lookup_not_a_dir, /* .../vm/dirty_expire_centisecs */
+ lxpr_lookup_not_a_dir, /* .../vm/dirty_ratio */
+ lxpr_lookup_not_a_dir, /* .../vm/dirtytime_expire_seconds */
+ lxpr_lookup_not_a_dir, /* .../vm/dirty_writeback_centisecs */
+ lxpr_lookup_not_a_dir, /* /proc/sys/vm/max_map_count */
+ lxpr_lookup_not_a_dir, /* /proc/sys/vm/min_free_kbytes */
+ lxpr_lookup_not_a_dir, /* /proc/sys/vm/nr_hugepages */
+ lxpr_lookup_not_a_dir, /* /proc/sys/vm/overcommit_memory */
+ lxpr_lookup_not_a_dir, /* /proc/sys/vm/swappiness */
+ lxpr_lookup_not_a_dir, /* /proc/uptime */
+ lxpr_lookup_not_a_dir, /* /proc/version */
+ lxpr_lookup_not_a_dir, /* /proc/vmstat */
+};
+
+/*
+ * Array of readdir functions, indexed by /proc file type.
+ */
+static int (*lxpr_readdir_function[LXPR_NFILES])() = {
+ NULL, /* invalid */
+ lxpr_readdir_procdir, /* /proc */
+ lxpr_readdir_piddir, /* /proc/<pid> */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/auxv */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cgroup */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/comm */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/gid_map */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/limits */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/loginuid */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/mountinfo */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/mounts */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/oom_score_adj */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/personality */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/root */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/status */
+ lxpr_readdir_taskdir, /* /proc/<pid>/task */
+ lxpr_readdir_task_tid_dir, /* /proc/<pid>/task/nn */
+ lxpr_readdir_fddir, /* /proc/<pid>/fd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/uid_map */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/auxv */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cgroup */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/comm */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/environ */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/exe */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/gid_map */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/limits */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/loginuid */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/maps */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mem */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/oom_scr_adj */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/personality */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/root */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/stat */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/statm */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/status */
+ lxpr_readdir_fddir, /* /proc/<pid>/task/<tid>/fd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/uid_map */
+ lxpr_readdir_not_a_dir, /* /proc/cgroups */
+ lxpr_readdir_not_a_dir, /* /proc/cmdline */
+ lxpr_readdir_not_a_dir, /* /proc/cpuinfo */
+ lxpr_readdir_not_a_dir, /* /proc/devices */
+ lxpr_readdir_not_a_dir, /* /proc/diskstats */
+ lxpr_readdir_not_a_dir, /* /proc/dma */
+ lxpr_readdir_not_a_dir, /* /proc/filesystems */
+ lxpr_readdir_not_a_dir, /* /proc/interrupts */
+ lxpr_readdir_not_a_dir, /* /proc/ioports */
+ lxpr_readdir_not_a_dir, /* /proc/kcore */
+ lxpr_readdir_not_a_dir, /* /proc/kmsg */
+ lxpr_readdir_not_a_dir, /* /proc/loadavg */
+ lxpr_readdir_not_a_dir, /* /proc/meminfo */
+ lxpr_readdir_not_a_dir, /* /proc/modules */
+ lxpr_readdir_not_a_dir, /* /proc/mounts */
+ lxpr_readdir_netdir, /* /proc/net */
+ lxpr_readdir_not_a_dir, /* /proc/net/arp */
+ lxpr_readdir_not_a_dir, /* /proc/net/dev */
+ lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */
+ lxpr_readdir_not_a_dir, /* /proc/net/if_inet6 */
+ lxpr_readdir_not_a_dir, /* /proc/net/igmp */
+ lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */
+ lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */
+ lxpr_readdir_not_a_dir, /* /proc/net/ipv6_route */
+ lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */
+ lxpr_readdir_not_a_dir, /* /proc/net/netstat */
+ lxpr_readdir_not_a_dir, /* /proc/net/raw */
+ lxpr_readdir_not_a_dir, /* /proc/net/route */
+ lxpr_readdir_not_a_dir, /* /proc/net/rpc */
+ lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */
+ lxpr_readdir_not_a_dir, /* /proc/net/sockstat */
+ lxpr_readdir_not_a_dir, /* /proc/net/snmp */
+ lxpr_readdir_not_a_dir, /* /proc/net/stat */
+ lxpr_readdir_not_a_dir, /* /proc/net/tcp */
+ lxpr_readdir_not_a_dir, /* /proc/net/tcp6 */
+ lxpr_readdir_not_a_dir, /* /proc/net/udp */
+ lxpr_readdir_not_a_dir, /* /proc/net/udp6 */
+ lxpr_readdir_not_a_dir, /* /proc/net/unix */
+ lxpr_readdir_not_a_dir, /* /proc/partitions */
+ lxpr_readdir_not_a_dir, /* /proc/self */
+ lxpr_readdir_not_a_dir, /* /proc/stat */
+ lxpr_readdir_not_a_dir, /* /proc/swaps */
+ lxpr_readdir_sysdir, /* /proc/sys */
+ lxpr_readdir_sys_fsdir, /* /proc/sys/fs */
+ lxpr_readdir_not_a_dir, /* /proc/sys/fs/aio-max-nr */
+ lxpr_readdir_not_a_dir, /* /proc/sys/fs/aio-nr */
+ lxpr_readdir_not_a_dir, /* /proc/sys/fs/file-max */
+ lxpr_readdir_not_a_dir, /* /proc/sys/fs/file-nr */
+ lxpr_readdir_sys_fs_inotifydir, /* /proc/sys/fs/inotify */
+ lxpr_readdir_not_a_dir, /* .../inotify/max_queued_events */
+ lxpr_readdir_not_a_dir, /* .../inotify/max_user_instances */
+ lxpr_readdir_not_a_dir, /* .../inotify/max_user_watches */
+ lxpr_readdir_not_a_dir, /* /proc/sys/fs/pipe-max-size */
+ lxpr_readdir_sys_kerneldir, /* /proc/sys/kernel */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/cap_last_cap */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/core_pattern */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/hostname */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmax */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmnb */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmni */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/ngroups_max */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/osrelease */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/pid_max */
+ lxpr_readdir_sys_kdir_randdir, /* /proc/sys/kernel/random */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/random/boot_id */
+ lxpr_readdir_not_a_dir, /* .../kernel/random/entropy_avail */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/sem */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmall */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmmax */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmmni */
+ lxpr_readdir_not_a_dir, /* /proc/sys/kernel/threads-max */
+ lxpr_readdir_sys_netdir, /* /proc/sys/net */
+ lxpr_readdir_sys_net_coredir, /* /proc/sys/net/core */
+ lxpr_readdir_not_a_dir, /* /proc/sys/net/core/somaxconn */
+ lxpr_readdir_sys_net_ipv4dir, /* /proc/sys/net/ipv4 */
+ lxpr_readdir_not_a_dir, /* .../icmp_echo_ignore_broadcasts */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/ip_forward */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/ip_local_port_range */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_fin_timeout */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_keepalive_intvl */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_keepalive_time */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_max_syn_backlog */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_retries2 */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_rmem */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_sack */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_window_scaling */
+ lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_wmem */
+ lxpr_readdir_sys_vmdir, /* /proc/sys/vm */
+ lxpr_readdir_not_a_dir, /* .../vm/dirty_background_bytes */
+ lxpr_readdir_not_a_dir, /* .../vm/dirty_background_ratio */
+ lxpr_readdir_not_a_dir, /* .../vm/dirty_bytes */
+ lxpr_readdir_not_a_dir, /* .../vm/dirty_expire_centisecs */
+ lxpr_readdir_not_a_dir, /* .../vm/dirty_ratio */
+ lxpr_readdir_not_a_dir, /* .../vm/dirtytime_expire_seconds */
+ lxpr_readdir_not_a_dir, /* .../vm/dirty_writeback_centisecs */
+ lxpr_readdir_not_a_dir, /* /proc/sys/vm/max_map_count */
+ lxpr_readdir_not_a_dir, /* /proc/sys/vm/min_free_kbytes */
+ lxpr_readdir_not_a_dir, /* /proc/sys/vm/nr_hugepages */
+ lxpr_readdir_not_a_dir, /* /proc/sys/vm/overcommit_memory */
+ lxpr_readdir_not_a_dir, /* /proc/sys/vm/swappiness */
+ lxpr_readdir_not_a_dir, /* /proc/uptime */
+ lxpr_readdir_not_a_dir, /* /proc/version */
+ lxpr_readdir_not_a_dir, /* /proc/vmstat */
+};
+
+
+/*
+ * lxpr_read(): Vnode operation for VOP_READ()
+ *
+ * Since every file readable in the lx procfs contains human-readable text
+ * rather than binary structures, there is no need for separate read
+ * variants depending on whether the reading process is a 32-bit or 64-bit
+ * model (at least in general; any difference is unlikely to be enough to
+ * justify separate 32-bit and 64-bit routines).
+ */
+/* ARGSUSED */
+static int
+lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop);
+ int error;
+
+ ASSERT(type < LXPR_NFILES);
+
+ if (type == LXPR_KMSG) {
+ ldi_ident_t li = VTOLXPM(vp)->lxprm_li;
+ ldi_handle_t ldih;
+ struct strioctl str;
+ int rv;
+
+ /*
+ * Open the zone's log device using the layered driver
+ * interface.
+ */
+ if ((error =
+ ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) {
+ lxpr_uiobuf_free(uiobuf);
+ return (error);
+ }
+
+ /*
+ * Send an ioctl to the underlying log driver, letting it
+ * know we're interested in getting console messages.
+ */
+ str.ic_cmd = I_CONSLOG;
+ str.ic_timout = 0;
+ str.ic_len = 0;
+ str.ic_dp = NULL;
+ if ((error = ldi_ioctl(ldih, I_STR,
+ (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) {
+ /* Don't leak the layered handle or the uio buffer. */
+ (void) ldi_close(ldih, FREAD, cr);
+ lxpr_uiobuf_free(uiobuf);
+ return (error);
+ }
+
+ lxpr_read_kmsg(lxpnp, uiobuf, ldih);
+
+ if ((error = ldi_close(ldih, FREAD, cr)) != 0) {
+ lxpr_uiobuf_free(uiobuf);
+ return (error);
+ }
+ } else {
+ lxpr_read_function[type](lxpnp, uiobuf);
+ }
+
+ error = lxpr_uiobuf_flush(uiobuf);
+ lxpr_uiobuf_free(uiobuf);
+
+ return (error);
+}
+
+/*
+ * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty()
+ *
+ * Various special-case reads:
+ * - trying to read a directory (EISDIR)
+ * - reading an invalid file, i.e. one that should be implemented but
+ * isn't yet (EINVAL)
+ * - reading an empty file, which will never have anything to return
+ */
+/* ARGSUSED */
+static void
+lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_seterr(uiobuf, EISDIR);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_pid_auxv(): read process aux vector
+ */
+static void
+lxpr_read_pid_auxv(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ lx_proc_data_t *pd;
+ lx_elf_data_t *edp = NULL;
+ int i, cnt;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_AUXV ||
+ lxpnp->lxpr_type == LXPR_PID_TID_AUXV);
+
+ p = lxpr_lock(lxpnp, NO_ZOMB);
+
+ if (p == NULL) {
+ return;
+ }
+ if ((pd = ptolxproc(p)) == NULL) {
+ /* Emit a single AT_NULL record for non-branded processes */
+ auxv_t buf;
+
+ bzero(&buf, sizeof (buf));
+ lxpr_unlock(p);
+ lxpr_uiobuf_write(uiobuf, (char *)&buf, sizeof (buf));
+ return;
+ } else {
+ edp = &pd->l_elf_data;
+ }
+
+ if (p->p_model == DATAMODEL_NATIVE) {
+ auxv_t buf[__KERN_NAUXV_IMPL];
+
+ /*
+ * Because a_type is only an int (not a long), each auxv_t slot
+ * contains padding; zero the buffer first so no uninitialized
+ * kernel bytes can be copied out.
+ */
+ bzero(buf, sizeof (buf));
+ for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) {
+ if (lx_auxv_stol(&p->p_user.u_auxv[i],
+ &buf[cnt], edp) == 0) {
+ cnt++;
+ }
+ if (p->p_user.u_auxv[i].a_type == AT_NULL) {
+ break;
+ }
+ }
+ lxpr_unlock(p);
+ lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0]));
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ auxv32_t buf[__KERN_NAUXV_IMPL];
+
+ for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) {
+ auxv_t temp;
+
+ if (lx_auxv_stol(&p->p_user.u_auxv[i],
+ &temp, edp) == 0) {
+ buf[cnt].a_type = (int)temp.a_type;
+ buf[cnt].a_un.a_val = (int)temp.a_un.a_val;
+ cnt++;
+ }
+ if (p->p_user.u_auxv[i].a_type == AT_NULL) {
+ break;
+ }
+ }
+ lxpr_unlock(p);
+ lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0]));
+ }
+#endif /* defined(_SYSCALL32_IMPL) */
+}
+
+/*
+ * lxpr_read_pid_cgroup(): read cgroups for process
+ */
+static void
+lxpr_read_pid_cgroup(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_CGROUP ||
+ lxpnp->lxpr_type == LXPR_PID_TID_CGROUP);
+
+ p = lxpr_lock(lxpnp, ZOMB_OK);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+ lxpr_unlock(p);
+
+ /* basic stub, 3rd field will need to be populated */
+ lxpr_uiobuf_printf(uiobuf, "1:name=systemd:/\n");
+}
+
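+/*
+ * Copy out the cmdline using the Linux-style argv/env bounds maintained by
+ * the brand: argv strings occupy [l_args_start, l_envs_start) and env
+ * strings occupy [l_envs_start, l_envs_end). If the process has overwritten
+ * the NUL that normally terminates the argv strings (e.g. to set a long
+ * process title), the copy is allowed to run on into the env area, but only
+ * for a single buffer-sized chunk.
+ */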
+static void
+lxpr_copy_cmdline(proc_t *p, lx_proc_data_t *pd, lxpr_uiobuf_t *uiobuf)
+{
+ uio_t *uiop = uiobuf->uiop;
+ char *buf = uiobuf->buffer;
+ int bsz = uiobuf->buffsize;
+ boolean_t env_overflow = B_FALSE;
+ uintptr_t pos = pd->l_args_start + uiop->uio_offset;
+ uintptr_t estart = pd->l_envs_start;
+ uintptr_t eend = pd->l_envs_end;
+ size_t chunk, copied;
+ int err = 0;
+
+ /* Do not bother with data beyond the end of the envp strings area. */
+ if (pos > eend) {
+ return;
+ }
+ mutex_exit(&p->p_lock);
+
+ /*
+ * If the starting or ending bounds are outside the argv strings area,
+ * check to see if the process has overwritten the terminating NUL.
+ * If not, no data needs to be copied from outside the argv area.
+ */
+ if (pos >= estart || (pos + uiop->uio_resid) >= estart) {
+ uint8_t term;
+ if (uread(p, &term, sizeof (term), estart - 1) != 0) {
+ err = EFAULT;
+ } else if (term != 0) {
+ env_overflow = B_TRUE;
+ }
+ }
+
+ /* Data between astart and estart-1 can be copied freely. */
+ while (pos < estart && uiop->uio_resid > 0 && err == 0) {
+ chunk = MIN(estart - pos, uiop->uio_resid);
+ chunk = MIN(chunk, bsz);
+
+ if (prreadbuf(p, pos, (uint8_t *)buf, chunk, &copied) != 0 ||
+ copied != chunk) {
+ err = EFAULT;
+ break;
+ }
+ err = uiomove(buf, copied, UIO_READ, uiop);
+ pos += copied;
+ }
+
+ /*
+ * Onward from estart, data is copied as one contiguous string. To
+ * limit snooping of env data, only a single buffer-sized copy is
+ * allowed, which also keeps the seek logic simple.
+ */
+ if (err == 0 && env_overflow && pos == estart && uiop->uio_resid > 0) {
+ chunk = MIN(eend - pos, uiop->uio_resid);
+ chunk = MIN(chunk, bsz);
+ if (prreadbuf(p, pos, (uint8_t *)buf, chunk, &copied) == 0) {
+ int len = strnlen(buf, copied);
+ if (len > 0) {
+ err = uiomove(buf, len, UIO_READ, uiop);
+ }
+ }
+ }
+
+ uiobuf->error = err;
+ /* reset any uiobuf state */
+ uiobuf->pos = uiobuf->buffer;
+ uiobuf->beg = 0;
+
+ mutex_enter(&p->p_lock);
+}
+
+/*
+ * lxpr_read_pid_cmdline(): read argument vector from process
+ */
+static void
+lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ char *buf;
+ size_t asz = lxpr_maxargvlen, sz;
+ lx_proc_data_t *pd;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE ||
+ lxpnp->lxpr_type == LXPR_PID_TID_CMDLINE);
+
+ buf = kmem_alloc(asz, KM_SLEEP);
+
+ p = lxpr_lock(lxpnp, NO_ZOMB);
+ if (p == NULL) {
+ kmem_free(buf, asz);
+ return;
+ }
+
+ if ((pd = ptolxproc(p)) != NULL && pd->l_args_start != 0 &&
+ pd->l_envs_start != 0 && pd->l_envs_end != 0) {
+ /* Use Linux-style argv bounds if possible. */
+ lxpr_copy_cmdline(p, pd, uiobuf);
+ lxpr_unlock(p);
+ } else {
+ int r;
+
+ r = prreadargv(p, buf, asz, &sz);
+ lxpr_unlock(p);
+
+ if (r != 0) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ } else {
+ lxpr_uiobuf_write(uiobuf, buf, sz);
+ }
+ }
+ kmem_free(buf, asz);
+}
+
+/*
+ * lxpr_read_pid_tid_comm(): read command name from thread
+ */
+static void
+lxpr_read_pid_tid_comm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ kthread_t *t;
+ pid_t tid;
+ char buf[LX_PR_SET_NAME_NAMELEN], *pnm;
+
+ VERIFY(lxpnp->lxpr_type == LXPR_PID_COMM ||
+ lxpnp->lxpr_type == LXPR_PID_TID_COMM);
+
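+ /* A zero descriptor means /proc/<pid>/comm; use the pid as the tid. */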
+ tid = (lxpnp->lxpr_desc == 0) ? lxpnp->lxpr_pid : lxpnp->lxpr_desc;
+ p = lxpr_lock_pid(lxpnp, tid, ZOMB_OK, &t);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+ if (t == NULL) {
+ lxpr_unlock(p);
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ /*
+ * If a thread name has not been set, use the process command name.
+ * This also covers the /proc/{pid}/comm case.
+ */
+ if (t->t_name == NULL) {
+ pnm = p->p_user.u_comm;
+ } else {
+ pnm = t->t_name;
+ }
+
+ /* Truncate with NUL if the name is longer than the Linux size. */
+ (void) strlcpy(buf, pnm, sizeof (buf));
+
+ lxpr_unlock(p);
+ lxpr_uiobuf_printf(uiobuf, "%s\n", buf);
+}
+
+/* ARGSUSED */
+static int
+lxpr_write_pid_tid_comm(lxpr_node_t *lxpnp, struct uio *uio, struct cred *cr,
+ caller_context_t *ct)
+{
+ int error;
+ size_t olen;
+ char *buf;
+ proc_t *p;
+ kthread_t *t;
+ pid_t tid;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_COMM ||
+ lxpnp->lxpr_type == LXPR_PID_TID_COMM);
+
+ /*
+ * Only a thread within the process can update one of its thread
+ * names; not even a process with root privileges may do so from
+ * outside. Linux returns EINVAL (not EPERM) in this case.
+ */
+ if (lxpnp->lxpr_pid != curproc->p_pid)
+ return (EINVAL);
+
+ if (uio->uio_loffset != 0)
+ return (EINVAL);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ olen = uio->uio_resid;
+ if (olen > LX_PR_SET_NAME_NAMELEN - 1)
+ olen = LX_PR_SET_NAME_NAMELEN - 1;
+
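+	/*
+	 * The buffer is THREAD_NAME_MAX bytes (rather than the smaller Linux
+	 * limit) so that, when the thread has no name yet, it can be handed
+	 * off directly as t->t_name below.
+	 */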
+ buf = kmem_zalloc(THREAD_NAME_MAX, KM_SLEEP);
+
+ error = uiomove(buf, olen, UIO_WRITE, uio);
+ if (error != 0) {
+ kmem_free(buf, THREAD_NAME_MAX);
+ return (error);
+ }
+ buf[LX_PR_SET_NAME_NAMELEN - 1] = '\0';
+
+ tid = (lxpnp->lxpr_desc == 0) ? lxpnp->lxpr_pid : lxpnp->lxpr_desc;
+ p = lxpr_lock_pid(lxpnp, tid, NO_ZOMB, &t);
+ if (p == NULL) {
+ kmem_free(buf, THREAD_NAME_MAX);
+ return (ENXIO);
+ }
+ if (t == NULL) {
+ lxpr_unlock(p);
+ kmem_free(buf, THREAD_NAME_MAX);
+ return (ENXIO);
+ }
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ /*
+ * See comments for thread_setname() and prctl(LX_PR_SET_NAME) handling.
+ */
+ if (t->t_name == NULL) {
+ t->t_name = buf;
+ } else {
+ (void) strlcpy(t->t_name, buf, THREAD_NAME_MAX);
+ kmem_free(buf, THREAD_NAME_MAX);
+ }
+
+ if (t->t_tid == 1) {
+ (void) strncpy(p->p_user.u_comm, t->t_name, MAXCOMLEN + 1);
+ (void) strncpy(p->p_user.u_psargs, t->t_name, PSARGSZ);
+ }
+
+ lxpr_unlock(p);
+ return (0);
+}
+
+/*
+ * lxpr_read_pid_env(): read env vector from process
+ */
+static void
+lxpr_read_pid_env(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ char *buf;
+ size_t asz = lxpr_maxenvvlen, sz;
+ int r;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_ENV);
+
+ buf = kmem_alloc(asz, KM_SLEEP);
+
+ p = lxpr_lock(lxpnp, NO_ZOMB);
+ if (p == NULL) {
+ kmem_free(buf, asz);
+ return;
+ }
+
+ r = prreadenvv(p, buf, asz, &sz);
+ lxpr_unlock(p);
+
+ if (r != 0) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ } else {
+ lxpr_uiobuf_write(uiobuf, buf, sz);
+ }
+ kmem_free(buf, asz);
+}
+
+/*
+ * lxpr_read_pid_limits(): ulimit file
+ */
+static void
+lxpr_read_pid_limits(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ rctl_qty_t cur[LX_RLIM_TAB_LEN], max[LX_RLIM_TAB_LEN];
+ int i;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_LIMITS ||
+ lxpnp->lxpr_type == LXPR_PID_TID_LIMITS);
+
+ p = lxpr_lock(lxpnp, NO_ZOMB);
+ if (p == NULL) {
+ return;
+ }
+
+ for (i = 0; i < LX_RLIM_TAB_LEN; i++) {
+ char *kname = lxpr_rlimtab[i].rlim_rctl;
+ rctl_val_t nval, *oval = NULL;
+ rctl_hndl_t hndl;
+
+ /* default to unlimited for resources without an analog */
+ cur[i] = RLIM_INFINITY;
+ max[i] = RLIM_INFINITY;
+ if (kname == NULL || (hndl = rctl_hndl_lookup(kname)) == -1) {
+ continue;
+ }
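+		/*
+		 * Walk the chain of rctl values: RCPRIV_BASIC values supply
+		 * the Linux soft limit and RCPRIV_PRIVILEGED values the hard
+		 * limit.
+		 */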
+ while (rctl_local_get(hndl, oval, &nval, p) == 0) {
+ oval = &nval;
+ switch (nval.rcv_privilege) {
+ case RCPRIV_BASIC:
+ if (!RCTL_INFINITE(nval))
+ cur[i] = nval.rcv_value;
+ break;
+ case RCPRIV_PRIVILEGED:
+ if (!RCTL_INFINITE(nval))
+ max[i] = nval.rcv_value;
+ break;
+ }
+ }
+ }
+ lxpr_unlock(p);
+
+ lxpr_uiobuf_printf(uiobuf, "%-25s %-20s %-20s %-10s\n",
+ "Limit", "Soft Limit", "Hard Limit", "Units");
+ for (i = 0; i < LX_RLIM_TAB_LEN; i++) {
+ lxpr_uiobuf_printf(uiobuf, "%-25s", lxpr_rlimtab[i].rlim_name);
+ if (cur[i] == RLIM_INFINITY || cur[i] == LX_RLIM_INFINITY) {
+ lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited");
+ } else {
+ lxpr_uiobuf_printf(uiobuf, " %-20lu", cur[i]);
+ }
+ if (max[i] == RLIM_INFINITY || max[i] == LX_RLIM_INFINITY) {
+ lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited");
+ } else {
+ lxpr_uiobuf_printf(uiobuf, " %-20lu", max[i]);
+ }
+ lxpr_uiobuf_printf(uiobuf, " %-10s\n",
+ lxpr_rlimtab[i].rlim_unit);
+ }
+}
+
+/*
+ * lxpr_read_pid_id_map(): gid_map and uid_map file
+ */
+static void
+lxpr_read_pid_id_map(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_GIDMAP ||
+ lxpnp->lxpr_type == LXPR_PID_UIDMAP);
+
+ lxpr_uiobuf_printf(uiobuf, "%10u %10u %10u\n", 0, 0, MAXUID);
+}
+
+/*
+ * lxpr_read_pid_loginuid(): loginuid file
+ */
+static void
+lxpr_read_pid_loginuid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ lx_proc_data_t *pd;
+ uid_t lu = 0;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID ||
+ lxpnp->lxpr_type == LXPR_PID_TID_LOGINUID);
+
+ p = lxpr_lock(lxpnp, NO_ZOMB);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ if ((pd = ptolxproc(p)) != NULL) {
+ lu = pd->l_loginuid;
+ }
+ lxpr_unlock(p);
+
+ lxpr_uiobuf_printf(uiobuf, "%d", lu);
+}
+
+/*
+ * lxpr_read_pid_maps(): memory map file
+ */
+static void
+lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ lx_proc_data_t *lxpd;
+ struct as *as;
+ struct seg *seg;
+ char *buf;
+ int buflen = MAXPATHLEN;
+ struct print_data {
+ uintptr_t saddr;
+ uintptr_t eaddr;
+ int type;
+ char prot[5];
+ uintptr_t offset;
+ vnode_t *vp;
+ char *name_override;
+ struct print_data *next;
+ } *print_head = NULL;
+ struct print_data **print_tail = &print_head;
+ struct print_data *pbuf;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS ||
+ lxpnp->lxpr_type == LXPR_PID_TID_MAPS);
+
+ p = lxpr_lock(lxpnp, NO_ZOMB);
+ if (p == NULL) {
+ return;
+ }
+
+ as = p->p_as;
+ lxpd = ptolxproc(p);
+
+ if (as == &kas) {
+ lxpr_unlock(p);
+ return;
+ }
+
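+	/*
+	 * Drop p_lock while walking the address space: the segment list is
+	 * protected by the AS lock, and the KM_SLEEP allocations below should
+	 * not occur with p_lock held.
+	 */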
+ mutex_exit(&p->p_lock);
+
+ /* Iterate over all segments in the address space */
+ AS_LOCK_ENTER(as, RW_READER);
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+ vnode_t *vp;
+ uint_t protbits;
+
+ if ((seg->s_flags & S_HOLE) != 0) {
+ continue;
+ }
+
+ pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP);
+
+ pbuf->saddr = (uintptr_t)seg->s_base;
+ pbuf->eaddr = pbuf->saddr + seg->s_size;
+ pbuf->type = SEGOP_GETTYPE(seg, seg->s_base);
+
+ /*
+ * Cheat and only use the protection bits of the first page
+ * in the segment
+ */
+ (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot));
+ (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits);
+
+ if (protbits & PROT_READ) pbuf->prot[0] = 'r';
+ if (protbits & PROT_WRITE) pbuf->prot[1] = 'w';
+ if (protbits & PROT_EXEC) pbuf->prot[2] = 'x';
+ if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's';
+ else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p';
+
+ if (seg->s_ops == &segvn_ops &&
+ SEGOP_GETVP(seg, seg->s_base, &vp) == 0 &&
+ vp != NULL && vp->v_type == VREG) {
+ VN_HOLD(vp);
+ pbuf->vp = vp;
+ } else {
+ pbuf->vp = NULL;
+ }
+
+ pbuf->offset = SEGOP_GETOFFSET(seg, (caddr_t)pbuf->saddr);
+
+ pbuf->name_override = NULL;
+ if (lxpd != NULL) {
+ if (pbuf->saddr == lxpd->l_vdso) {
+ pbuf->name_override = "[vdso]";
+ } else if (pbuf->saddr == p->p_user.u_commpagep) {
+ pbuf->name_override = "[vvar]";
+ }
+ }
+
+ pbuf->next = NULL;
+ *print_tail = pbuf;
+ print_tail = &pbuf->next;
+ }
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+
+ buf = kmem_alloc(buflen, KM_SLEEP);
+
+ /* print the data we've extracted */
+ pbuf = print_head;
+ while (pbuf != NULL) {
+ struct print_data *pbuf_next;
+ vattr_t vattr;
+
+ int maj = 0;
+ int min = 0;
+ ino_t inode = 0;
+
+ *buf = '\0';
+ if (pbuf->name_override != NULL) {
+ (void) strncpy(buf, pbuf->name_override, buflen);
+ } else if (pbuf->vp != NULL) {
+ vattr.va_mask = AT_FSID | AT_NODEID;
+ if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(),
+ NULL) == 0) {
+ maj = getmajor(vattr.va_fsid);
+ min = getminor(vattr.va_fsid);
+ inode = vattr.va_nodeid;
+ }
+ (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED());
+ VN_RELE(pbuf->vp);
+ }
+
+ if (p->p_model == DATAMODEL_LP64) {
+ lxpr_uiobuf_printf(uiobuf,
+ "%08llx-%08llx %s %08llx %02x:%02x %llu%s%s\n",
+ pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset,
+ maj, min, inode, *buf != '\0' ? " " : "", buf);
+ } else {
+ lxpr_uiobuf_printf(uiobuf,
+ "%08x-%08x %s %08x %02x:%02x %llu%s%s\n",
+ (uint32_t)pbuf->saddr, (uint32_t)pbuf->eaddr,
+ pbuf->prot, (uint32_t)pbuf->offset, maj, min,
+ inode, *buf != '\0' ? " " : "", buf);
+ }
+
+ pbuf_next = pbuf->next;
+ kmem_free(pbuf, sizeof (*pbuf));
+ pbuf = pbuf_next;
+ }
+
+ kmem_free(buf, buflen);
+}
+
+/*
+ * Make mount entry look more like Linux. Non-zero return to skip it.
+ */
+static int
+lxpr_clean_mntent(char **mntpt, char **fstype, char **resource)
+{
+ if (strcmp(*mntpt, "/var/ld") == 0 ||
+ strcmp(*fstype, "objfs") == 0 ||
+ strcmp(*fstype, "mntfs") == 0 ||
+ strcmp(*fstype, "ctfs") == 0 ||
+ strncmp(*mntpt, "/native/", 8) == 0) {
+ return (1);
+ }
+
+ if (strcmp(*fstype, "tmpfs") == 0) {
+ *resource = "tmpfs";
+ } else if (strcmp(*fstype, "lx_proc") == 0) {
+ *resource = *fstype = "proc";
+ } else if (strcmp(*fstype, "lx_sysfs") == 0) {
+ *resource = *fstype = "sysfs";
+ } else if (strcmp(*fstype, "lx_devfs") == 0) {
+ *resource = *fstype = "devtmpfs";
+ } else if (strcmp(*fstype, "lx_cgroup") == 0) {
+ *resource = *fstype = "cgroup";
+ } else if (strcmp(*fstype, "lxautofs") == 0) {
+ *fstype = "autofs";
+ }
+
+ return (0);
+}
+
+typedef struct lxpr_mount_entry {
+ list_node_t lme_link;
+ uint_t lme_id;
+ uint_t lme_parent_id;
+ refstr_t *lme_mntpt;
+ refstr_t *lme_resource;
+ uint_t lme_mntopts_len;
+ char *lme_mntopts;
+ uint_t lme_flag;
+ int lme_fstype;
+ dev_t lme_dev;
+ boolean_t lme_force;
+} lxpr_mount_entry_t;
+
+static int lxpr_zfs_fstype = -1;
+
+#define LXPR_ROOT_MOUNT_ID 15
+#define LXPR_MNT_OPT_CHUNK 128
+
+/* List of native, non-Linux mount options we should omit. */
+static const char *lx_invalid_mnt_opts[] = {
+ "xattr",
+ NULL
+};
+
+/* Check whether this mount option should be omitted */
+static boolean_t
+lxpr_skip_mntopt(const char *s)
+{
+ uint_t i;
+
+ for (i = 0; lx_invalid_mnt_opts[i] != NULL; i++) {
+ if (strcmp(s, lx_invalid_mnt_opts[i]) == 0)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static void
+lxpr_append_mntopt(lxpr_mount_entry_t *lme, char *s)
+{
+	size_t olen = strlen(lme->lme_mntopts);
+
+	while (strlcat(lme->lme_mntopts, s, lme->lme_mntopts_len) >=
+	    lme->lme_mntopts_len) {
+		/* undo any partial append before expanding the string */
+		uint_t tlen = lme->lme_mntopts_len + LXPR_MNT_OPT_CHUNK;
+		char *t = kmem_alloc(tlen, KM_SLEEP);
+
+		lme->lme_mntopts[olen] = '\0';
+		(void) strlcpy(t, lme->lme_mntopts, tlen);
+		kmem_free(lme->lme_mntopts, lme->lme_mntopts_len);
+		lme->lme_mntopts_len = tlen;
+		lme->lme_mntopts = t;
+	}
+}
+
+/*
+ * Perform the somewhat complicated work of getting the mount options string
+ * for the mount.
+ */
+static void
+lxpr_get_mntopts(vfs_t *vfsp, lxpr_mount_entry_t *lme)
+{
+ uint_t i;
+ mntopt_t *mop;
+ boolean_t have_nosuid = B_FALSE, have_nodev = B_FALSE;
+
+ lme->lme_mntopts_len = LXPR_MNT_OPT_CHUNK;
+ lme->lme_mntopts = kmem_alloc(lme->lme_mntopts_len, KM_SLEEP);
+ lme->lme_mntopts[0] = '\0';
+
+ /* Always show rw/ro option */
+ lxpr_append_mntopt(lme,
+ (lme->lme_flag & VFS_RDONLY) == 0 ? "rw" : "ro");
+
+ for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
+ mop = &vfsp->vfs_mntopts.mo_list[i];
+ if ((mop->mo_flags & MO_NODISPLAY) || !(mop->mo_flags & MO_SET))
+ continue;
+
+ if (strcmp(mop->mo_name, "ro") == 0 ||
+ strcmp(mop->mo_name, "rw") == 0)
+ continue;
+
+ if (strcmp(mop->mo_name, "nosuid") == 0)
+ have_nosuid = B_TRUE;
+ /* sigh, either option string is used */
+ if (strcmp(mop->mo_name, "nodev") == 0 ||
+ strcmp(mop->mo_name, "nodevices") == 0)
+ have_nodev = B_TRUE;
+
+ if (!lxpr_skip_mntopt(mop->mo_name)) {
+ lxpr_append_mntopt(lme, ",");
+ lxpr_append_mntopt(lme, mop->mo_name);
+ if (mop->mo_arg != NULL) {
+ lxpr_append_mntopt(lme, "=");
+ lxpr_append_mntopt(lme, mop->mo_arg);
+ }
+ }
+ }
+
+ /*
+ * Sometimes nosuid is an explicit string, other times it's a flag.
+ * The same is true for nodevices.
+ */
+ if (!have_nosuid && (lme->lme_flag & VFS_NOSETUID)) {
+ lxpr_append_mntopt(lme, ",nosuid");
+ }
+ if (!have_nodev && (lme->lme_flag & VFS_NODEVICES)) {
+ lxpr_append_mntopt(lme, ",nodevices");
+ }
+}
+
+static list_t *
+lxpr_enumerate_mounts(zone_t *zone)
+{
+ vfs_t *vfsp, *rvfsp, *vfslist;
+ lx_zone_data_t *lxzd = ztolxzd(zone);
+ list_t *result;
+ lxpr_mount_entry_t *lme;
+ lx_virt_disk_t *vd;
+ uint_t root_id, mount_id;
+ char tmppath[MAXPATHLEN];
+
+ result = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(result, sizeof (lxpr_mount_entry_t),
+ offsetof(lxpr_mount_entry_t, lme_link));
+	/* use the fixed, arbitrary LXPR_ROOT_MOUNT_ID for the root mount */
+	root_id = LXPR_ROOT_MOUNT_ID;
+ mount_id = root_id + 1;
+
+ ASSERT(zone != global_zone);
+ ASSERT(lxzd != NULL);
+ ASSERT(lxzd->lxzd_vdisks != NULL);
+
+ vfs_list_read_lock();
+ vfsp = vfslist = zone->zone_vfslist;
+
+ /*
+ * If the zone has a root entry, it will be the first in the list.
+ * Conjure one up if needed.
+ */
+ if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt),
+ zone->zone_rootpath) != 0) {
+ rvfsp = zone->zone_rootvp->v_vfsp;
+ } else {
+ rvfsp = vfslist;
+ vfsp = vfslist->vfs_zone_next;
+ }
+
+ lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP);
+ lme->lme_id = root_id;
+ lme->lme_parent_id = 0;
+ lme->lme_mntpt = refstr_alloc(zone->zone_rootpath);
+ lme->lme_flag = rvfsp->vfs_flag;
+ lme->lme_fstype = rvfsp->vfs_fstype;
+ lme->lme_force = B_TRUE;
+ lxpr_get_mntopts(rvfsp, lme);
+
+ lme->lme_resource = NULL;
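+	/*
+	 * If the root filesystem corresponds to one of the zone's emulated
+	 * virtual disks, report it with the emulated /dev name and device.
+	 */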
+ vd = list_head(lxzd->lxzd_vdisks);
+ while (vd != NULL) {
+ if (vd->lxvd_type == LXVD_ZFS_DS &&
+ vd->lxvd_real_dev == rvfsp->vfs_dev) {
+ (void) snprintf(tmppath, sizeof (tmppath),
+ "%sdev/%s", zone->zone_rootpath, vd->lxvd_name);
+ lme->lme_resource = refstr_alloc(tmppath);
+ lme->lme_dev = vd->lxvd_emul_dev;
+ break;
+ }
+ vd = list_next(lxzd->lxzd_vdisks, vd);
+ }
+ if (lme->lme_resource == NULL) {
+ lme->lme_resource = refstr_alloc(zone->zone_rootpath);
+ lme->lme_dev = rvfsp->vfs_dev;
+ }
+ list_insert_head(result, lme);
+
+ do {
+ if (vfsp == NULL) {
+ break;
+ }
+ /* Skip mounts we shouldn't show */
+ if ((vfsp->vfs_flag & VFS_NOMNTTAB) != 0) {
+ vfsp = vfsp->vfs_zone_next;
+ continue;
+ }
+
+ lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP);
+ lme->lme_id = mount_id++;
+ lme->lme_parent_id = root_id;
+ lme->lme_mntpt = vfsp->vfs_mntpt;
+ refstr_hold(vfsp->vfs_mntpt);
+ lme->lme_flag = vfsp->vfs_flag;
+ lme->lme_fstype = vfsp->vfs_fstype;
+ lme->lme_force = B_FALSE;
+ lxpr_get_mntopts(vfsp, lme);
+
+ lme->lme_resource = NULL;
+ vd = list_head(lxzd->lxzd_vdisks);
+ while (vd != NULL) {
+ if (vd->lxvd_type == LXVD_ZFS_DS &&
+ vd->lxvd_real_dev == vfsp->vfs_dev) {
+ char vdev[MAXPATHLEN];
+
+ (void) snprintf(vdev, sizeof (vdev),
+ "%sdev/%s",
+ zone->zone_rootpath, vd->lxvd_name);
+ lme->lme_resource = refstr_alloc(vdev);
+ lme->lme_dev = vd->lxvd_emul_dev;
+ break;
+ }
+ vd = list_next(lxzd->lxzd_vdisks, vd);
+ }
+ if (lme->lme_resource == NULL) {
+ lme->lme_resource = vfsp->vfs_resource;
+ refstr_hold(vfsp->vfs_resource);
+ lme->lme_dev = vfsp->vfs_dev;
+ }
+ list_insert_tail(result, lme);
+ vfsp = vfsp->vfs_zone_next;
+ } while (vfsp != vfslist);
+
+ vfs_list_unlock();
+
+ /* Add a single dummy entry for /native/usr */
+ lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP);
+ lme->lme_id = mount_id++;
+ lme->lme_parent_id = root_id;
+ lme->lme_flag = VFS_RDONLY;
+ lme->lme_dev = makedevice(0, 1);
+ (void) snprintf(tmppath, sizeof (tmppath),
+ "%snative/usr", zone->zone_rootpath);
+ lme->lme_mntpt = refstr_alloc(tmppath);
+ lme->lme_resource = lme->lme_mntpt;
+ lme->lme_mntopts_len = 3;
+ lme->lme_mntopts = kmem_alloc(lme->lme_mntopts_len, KM_SLEEP);
+ (void) strlcpy(lme->lme_mntopts, "ro", lme->lme_mntopts_len);
+ refstr_hold(lme->lme_mntpt);
+ if (lxpr_zfs_fstype == -1) {
+ vfssw_t *zfssw = vfs_getvfssw("zfs");
+ VERIFY(zfssw != NULL);
+ lxpr_zfs_fstype = ((uintptr_t)zfssw - (uintptr_t)vfssw) /
+ sizeof (vfssw[0]);
+ VERIFY(&vfssw[lxpr_zfs_fstype] == zfssw);
+ }
+ lme->lme_fstype = lxpr_zfs_fstype;
+ lme->lme_force = B_TRUE;
+ list_insert_tail(result, lme);
+
+ return (result);
+}
+
+/*
+ * lxpr_read_pid_mountinfo(): information about process mount points.
+ */
+static void
+lxpr_read_pid_mountinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ list_t *mounts;
+ lxpr_mount_entry_t *lme;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_MOUNTINFO ||
+ lxpnp->lxpr_type == LXPR_PID_TID_MOUNTINFO);
+
+ mounts = lxpr_enumerate_mounts(zone);
+
+ /*
+ * now we can run through what we've extracted without holding
+ * vfs_list_read_lock()
+ */
+ lme = (lxpr_mount_entry_t *)list_remove_head(mounts);
+ while (lme != NULL) {
+ char *resource, *mntpt, *fstype, *rwflag;
+ vnode_t *vp;
+ int error;
+
+ mntpt = (char *)refstr_value(lme->lme_mntpt);
+ resource = (char *)refstr_value(lme->lme_resource);
+
+ if (mntpt == NULL || mntpt[0] == '\0') {
+ goto nextp;
+ }
+ mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
+ error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ goto nextp;
+ } else if ((vp->v_flag & VROOT) == 0 && !lme->lme_force) {
+ VN_RELE(vp);
+ goto nextp;
+ }
+ VN_RELE(vp);
+
+ if (resource != NULL && resource[0] != '\0') {
+ if (resource[0] == '/') {
+ resource = ZONE_PATH_VISIBLE(resource, zone) ?
+ ZONE_PATH_TRANSLATE(resource, zone) : mntpt;
+ }
+ } else {
+ resource = "none";
+ }
+
+ /* Make things look more like Linux. */
+ fstype = vfssw[lme->lme_fstype].vsw_name;
+ if (lxpr_clean_mntent(&mntpt, &fstype, &resource) != 0 &&
+ !lme->lme_force) {
+ goto nextp;
+ }
+ rwflag = ((lme->lme_flag & VFS_RDONLY) == 0) ? "rw" : "ro";
+
+ /*
+ * XXX parent ID is not tracked correctly here. Currently we
+ * always assume the parent ID is the root ID.
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "%d %d %d:%d / %s %s - %s %s %s\n",
+ lme->lme_id, lme->lme_parent_id,
+ getmajor(lme->lme_dev), getminor(lme->lme_dev),
+ mntpt, rwflag, fstype, resource, lme->lme_mntopts);
+
+nextp:
+ refstr_rele(lme->lme_mntpt);
+ refstr_rele(lme->lme_resource);
+ kmem_free(lme->lme_mntopts, lme->lme_mntopts_len);
+ kmem_free(lme, sizeof (lxpr_mount_entry_t));
+ lme = (lxpr_mount_entry_t *)list_remove_head(mounts);
+ }
+
+ list_destroy(mounts);
+ kmem_free(mounts, sizeof (list_t));
+}
+
+/*
+ * lxpr_read_pid_oom_scr_adj(): read oom_score_adj for process
+ */
+static void
+lxpr_read_pid_oom_scr_adj(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_OOM_SCR_ADJ ||
+ lxpnp->lxpr_type == LXPR_PID_TID_OOM_SCR_ADJ);
+
+ p = lxpr_lock(lxpnp, ZOMB_OK);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+ lxpr_unlock(p);
+
+ /* always 0 */
+ lxpr_uiobuf_printf(uiobuf, "0\n");
+}
+
+/*
+ * lxpr_read_pid_personality(): read personality for process
+ */
+static void
+lxpr_read_pid_personality(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ lx_proc_data_t *lxpd;
+ unsigned int personality;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_PERSONALITY);
+
+ p = lxpr_lock(lxpnp, ZOMB_OK);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+ if ((lxpd = ptolxproc(p)) != NULL) {
+ personality = lxpd->l_personality;
+ } else {
+ /* Report native processes as having the SunOS personality */
+ personality = LX_PER_SUNOS;
+ }
+ lxpr_unlock(p);
+
+ lxpr_uiobuf_printf(uiobuf, "%08x\n", personality);
+}
+
+/*
+ * lxpr_read_pid_statm(): memory status file
+ */
+static void
+lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ struct as *as;
+ size_t vsize, rss;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM ||
+ lxpnp->lxpr_type == LXPR_PID_TID_STATM);
+
+ p = lxpr_lock(lxpnp, ZOMB_OK);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
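+	/*
+	 * The size figures come from the address space, so p_lock is dropped
+	 * while the AS reader lock is taken.
+	 */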
+ as = p->p_as;
+ mutex_exit(&p->p_lock);
+ if (as != &kas) {
+ AS_LOCK_ENTER(as, RW_READER);
+ vsize = btopr(as->a_resvsize);
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as);
+ } else {
+ vsize = 0;
+ rss = 0;
+ }
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%lu %lu %lu %lu %lu %lu %lu\n",
+ vsize, rss, 0l, rss, 0l, 0l, 0l);
+}
+
+/*
+ * Determine number of LWPs visible in the process. In particular we want to
+ * ignore aio in-kernel threads.
+ */
+static uint_t
+lxpr_count_tasks(proc_t *p)
+{
+ uint_t cnt = 0;
+ kthread_t *t;
+
+ if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) ||
+ (p->p_as == &kas)) {
+ return (0);
+ }
+
+ if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL) {
+ cnt = p->p_lwpcnt;
+ } else {
+ do {
+ lx_lwp_data_t *lwpd = ttolxlwp(t);
+ /* Don't count aio kernel worker threads */
+			if ((t->t_proc_flag & TP_KTHREAD) == 0 ||
+			    lwpd == NULL ||
+			    (lwpd->br_lwp_flags & BR_AIO_LWP) == 0) {
+				cnt++;
+			}
+
+ t = t->t_forw;
+ } while (t != p->p_tlist);
+ }
+
+ return (cnt);
+}
+
+/*
+ * pid/tid common code to read status file
+ */
+static void
+lxpr_read_status_common(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf,
+ uint_t lookup_id)
+{
+ proc_t *p;
+ kthread_t *t;
+ user_t *up;
+ cred_t *cr;
+ const gid_t *groups;
+ struct as *as;
+ char *status;
+ pid_t pid, ppid;
+ pid_t tid = (lookup_id == 0) ? lxpnp->lxpr_pid : lookup_id;
+ k_sigset_t current, ignore, handle;
+ int i, lx_sig, lwpcnt, ngroups;
+ char buf_comm[MAXCOMLEN + 1];
+ rlim64_t fdlim;
+ size_t vsize = 0, nlocked = 0, rss = 0, stksize = 0;
+ boolean_t printsz = B_FALSE;
+
+ p = lxpr_lock_pid(lxpnp, tid, ZOMB_OK, &t);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ /* Translate the pid (e.g. initpid to 1) */
+ lxpr_fixpid(LXPTOZ(lxpnp), p, &pid, &ppid);
+
+ if (t != NULL) {
+ thread_lock(t);
+ switch (t->t_state) {
+ case TS_SLEEP:
+ status = "S (sleeping)";
+ break;
+ case TS_RUN:
+ case TS_ONPROC:
+ status = "R (running)";
+ break;
+ case TS_ZOMB:
+ status = "Z (zombie)";
+ break;
+ case TS_STOPPED:
+ status = "T (stopped)";
+ break;
+ default:
+ status = "! (unknown)";
+ break;
+ }
+ thread_unlock(t);
+ } else {
+ if (lookup_id != 0) {
+ /* we can't find this specific thread */
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ lxpr_unlock(p);
+ return;
+ }
+
+		/*
+		 * There is a hole in the exit code where a proc can have
+		 * no threads but is yet to be flagged SZOMB; assume the
+		 * process is about to become a zombie.
+		 */
+ status = "Z (zombie)";
+ }
+
+ up = PTOU(p);
+ mutex_enter(&p->p_crlock);
+ crhold(cr = p->p_cred);
+ mutex_exit(&p->p_crlock);
+
+ (void) strlcpy(buf_comm, up->u_comm, sizeof (buf_comm));
+ fdlim = p->p_fno_ctl;
+ lwpcnt = lxpr_count_tasks(p);
+
+ /*
+ * Gather memory information
+ */
+ as = p->p_as;
+ if ((p->p_stat != SZOMB) && !(p->p_flag & (SSYS | SEXITING)) &&
+ (as != &kas)) {
+ mutex_exit(&p->p_lock);
+ AS_LOCK_ENTER(as, RW_READER);
+ vsize = as->a_resvsize;
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+
+ nlocked = p->p_locked_mem;
+ stksize = p->p_stksize;
+ printsz = B_TRUE;
+ }
+
+ /*
+ * Gather signal information
+ */
+ sigemptyset(&current);
+ sigemptyset(&ignore);
+ sigemptyset(&handle);
+ for (i = 1; i < NSIG; i++) {
+ lx_sig = stol_signo[i];
+
+ if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) {
+ if (sigismember(&p->p_sig, i))
+ sigaddset(&current, lx_sig);
+
+ if (up->u_signal[i - 1] == SIG_IGN)
+ sigaddset(&ignore, lx_sig);
+ else if (up->u_signal[i - 1] != SIG_DFL)
+ sigaddset(&handle, lx_sig);
+ }
+ }
+ lxpr_unlock(p);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "Name:\t%s\n"
+ "State:\t%s\n"
+ "Tgid:\t%d\n"
+ "Pid:\t%d\n"
+ "PPid:\t%d\n"
+ "TracerPid:\t%d\n"
+ "Uid:\t%u\t%u\t%u\t%u\n"
+ "Gid:\t%u\t%u\t%u\t%u\n"
+ "FDSize:\t%d\n"
+ "Groups:\t",
+ buf_comm,
+ status,
+ pid, /* thread group id - same as pid */
+ (lookup_id == 0) ? pid : lxpnp->lxpr_desc,
+ ppid,
+ 0,
+ crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr),
+ crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr),
+ fdlim);
+ ngroups = crgetngroups(cr);
+ groups = crgetgroups(cr);
+ for (i = 0; i < ngroups; i++) {
+ lxpr_uiobuf_printf(uiobuf,
+ "%u ",
+ groups[i]);
+ }
+ crfree(cr);
+ if (printsz) {
+ lxpr_uiobuf_printf(uiobuf,
+ "\n"
+ "VmSize:\t%8lu kB\n"
+ "VmLck:\t%8lu kB\n"
+ "VmRSS:\t%8lu kB\n"
+ "VmData:\t%8lu kB\n"
+ "VmStk:\t%8lu kB\n"
+ "VmExe:\t%8lu kB\n"
+ "VmLib:\t%8lu kB",
+ btok(vsize),
+ btok(nlocked),
+ ptok(rss),
+ 0l,
+ btok(stksize),
+ ptok(rss),
+ 0l);
+ }
+ lxpr_uiobuf_printf(uiobuf, "\nThreads:\t%u\n", lwpcnt);
+ lxpr_uiobuf_printf(uiobuf,
+ "SigPnd:\t%08x%08x\n"
+ "SigBlk:\t%08x%08x\n"
+ "SigIgn:\t%08x%08x\n"
+ "SigCgt:\t%08x%08x\n",
+ current.__sigbits[1], current.__sigbits[0],
+ 0, 0, /* signals blocked on per thread basis */
+ ignore.__sigbits[1], ignore.__sigbits[0],
+ handle.__sigbits[1], handle.__sigbits[0]);
+ /* Report only the full bounding set for now */
+ lxpr_uiobuf_printf(uiobuf,
+ "CapInh:\t%016x\n"
+ "CapPrm:\t%016x\n"
+ "CapEff:\t%016x\n"
+ "CapBnd:\t%016llx\n",
+ 0, 0, 0, 0x1fffffffffLL);
+}
+
+/*
+ * lxpr_read_pid_tid_status(): status file
+ */
+static void
+lxpr_read_pid_tid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS ||
+ lxpnp->lxpr_type == LXPR_PID_TID_STATUS);
+
+ lxpr_read_status_common(lxpnp, uiobuf, lxpnp->lxpr_desc);
+}
+
+/*
+ * Same logic as the lx devfs lxd_pts_devt_translator.
+ */
+static dev_t
+lxpr_xlate_pts_dev(dev_t dev)
+{
+ minor_t min = getminor(dev);
+ int lx_maj, lx_min;
+
+ lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MAXMIN);
+ lx_min = min % LX_MAXMIN;
+
+ return (LX_MAKEDEVICE(lx_maj, lx_min));
+}
+
+/*
+ * pid/tid common code to read stat file
+ */
+static void
+lxpr_read_pid_tid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ kthread_t *t;
+ struct as *as;
+ zone_t *zone;
+ char stat;
+ pid_t pid, ppid, pgpid, spid, tid;
+ gid_t psgid;
+ dev_t psdev;
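+	/*
+	 * Renaming the first thread also updates the process-wide command
+	 * name, matching the Linux behavior for /proc/{pid}/comm.
+	 */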
+ size_t rss, vsize;
+ int nice, pri, lwpcnt;
+ caddr_t wchan, stackbase;
+ processorid_t cpu;
+ clock_t utime, stime, cutime, cstime, ticks, boottime;
+ char buf_comm[MAXCOMLEN + 1];
+ rlim64_t vmem_ctl;
+ int exit_signal = -1;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT ||
+ lxpnp->lxpr_type == LXPR_PID_TID_STAT);
+
+ zone = LXPTOZ(lxpnp);
+ tid = (lxpnp->lxpr_desc == 0) ? lxpnp->lxpr_pid : lxpnp->lxpr_desc;
+ p = lxpr_lock_pid(lxpnp, tid, ZOMB_OK, &t);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ /* Set Linux defaults if we're the zone's init process */
+ pid = p->p_pid;
+ lxpr_fixpid(zone, p, &pid, &ppid);
+ if (pid == 1) {
+ /* init process */
+ pgpid = 0;
+ psgid = (gid_t)-1;
+ spid = 0;
+ psdev = 0;
+ } else {
+ pgpid = p->p_pgrp;
+ mutex_enter(&p->p_splock);
+ mutex_enter(&p->p_sessp->s_lock);
+ spid = p->p_sessp->s_sid;
+ psdev = lxpr_xlate_pts_dev(p->p_sessp->s_dev);
+ if (p->p_sessp->s_cred)
+ psgid = crgetgid(p->p_sessp->s_cred);
+ else
+ psgid = crgetgid(p->p_cred);
+
+ mutex_exit(&p->p_sessp->s_lock);
+ mutex_exit(&p->p_splock);
+ }
+
+ if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) ||
+ (p->p_as == &kas)) {
+ stackbase = 0;
+ } else {
+ /* from prgetstackbase() */
+ stackbase = p->p_usrstack - p->p_stksize;
+ }
+
+ utime = stime = 0;
+ if (t != NULL) {
+ klwp_t *lwp = ttolwp(t);
+ hrtime_t utm = 0, stm = 0;
+
+ /*
+ * For field 38 (the exit signal), some apps explicitly use
+ * this field in a check to distinguish processes from threads,
+ * and assume only processes have a valid signal in this field!
+ */
+ if (t->t_tid == 1) {
+ lx_proc_data_t *lxpd = ptolxproc(p);
+
+ if (lxpd != NULL) {
+ exit_signal = lxpd->l_signal;
+ } else {
+ exit_signal = SIGCHLD;
+ }
+ }
+
+ thread_lock(t);
+ switch (t->t_state) {
+ case TS_SLEEP:
+ stat = 'S';
+ break;
+ case TS_RUN:
+ case TS_ONPROC:
+ stat = 'R';
+ break;
+ case TS_ZOMB:
+ stat = 'Z';
+ break;
+ case TS_STOPPED:
+ stat = 'T';
+ break;
+ default:
+ stat = '!';
+ break;
+ }
+
+ if (CL_DONICE(t, NULL, 0, &nice) != 0)
+ nice = 0;
+
+ pri = t->t_pri;
+ wchan = t->t_wchan;
+ cpu = t->t_cpu->cpu_id;
+
+ if (lwp != NULL) {
+ struct mstate *ms = &lwp->lwp_mstate;
+
+ utm = ms->ms_acct[LMS_USER];
+ stm = ms->ms_acct[LMS_SYSTEM];
+
+ /* convert unscaled high-res time to nanoseconds */
+ scalehrtime(&utm);
+ scalehrtime(&stm);
+ }
+
+ thread_unlock(t);
+
+ /* Linux /proc expects these values in ticks */
+ utime = (clock_t)NSEC_TO_TICK(utm);
+ stime = (clock_t)NSEC_TO_TICK(stm);
+ } else {
+ /* Only zombies have no threads */
+ stat = 'Z';
+ nice = 0;
+ pri = 0;
+ wchan = 0;
+ cpu = 0;
+ }
+ as = p->p_as;
+ mutex_exit(&p->p_lock);
+ if (as != &kas) {
+ AS_LOCK_ENTER(as, RW_READER);
+ vsize = as->a_resvsize;
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as);
+ } else {
+ vsize = 0;
+ rss = 0;
+ }
+ mutex_enter(&p->p_lock);
+
+ if (tid == p->p_pid) {
+ /* process */
+ utime = p->p_utime;
+ stime = p->p_stime;
+ } else {
+ /* tid: utime & stime for the thread set in block above */
+ /* EMPTY */
+ }
+ cutime = p->p_cutime;
+ cstime = p->p_cstime;
+ lwpcnt = lxpr_count_tasks(p);
+ vmem_ctl = p->p_vmem_ctl;
+ (void) strlcpy(buf_comm, p->p_user.u_comm, sizeof (buf_comm));
+ ticks = p->p_user.u_ticks; /* lbolt at process start */
+ /* adjust ticks to account for zone boot time */
+ boottime = zone->zone_zsched->p_user.u_ticks;
+ ticks -= boottime;
+ lxpr_unlock(p);
+
+ /* Adjust hz for relevant fields */
+ utime = HZ_TO_LX_USERHZ(utime);
+ stime = HZ_TO_LX_USERHZ(stime);
+ cutime = HZ_TO_LX_USERHZ(cutime);
+ cstime = HZ_TO_LX_USERHZ(cstime);
+ ticks = HZ_TO_LX_USERHZ(ticks);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%d " /* 1 */
+ "(%s) %c %d %d %d %d %d " /* 2-8 */
+ "%lu %lu %lu %lu %lu " /* 9-13 */
+ "%lu %lu %ld %ld " /* 14-17 */
+ "%d %d %d " /* 18-20 */
+ "%lu " /* 21 */
+ "%lu " /* 22 */
+ "%lu %ld %llu " /* 23-25 */
+ "%lu %lu %llu " /* 26-28 */
+ "%lu %lu " /* 29-30 */
+ "%lu %lu %lu %lu " /* 31-34 */
+ "%lu " /* 35 */
+ "%lu %lu " /* 36-37 */
+ "%d " /* 38 */
+ "%d" /* 39 */
+ "\n",
+ tid, /* 1 */
+ buf_comm, stat, ppid, pgpid, spid, psdev, psgid, /* 2-8 */
+ 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */
+ utime, stime, cutime, cstime, /* 14-17 */
+ pri, nice, lwpcnt, /* 18-20 */
+ 0l, /* itrealvalue (time before next SIGALRM) 21 */
+ ticks, /* 22 */
+ vsize, rss, vmem_ctl, /* 23-25 */
+ 0l, 0l, stackbase, /* startcode, endcode, startstack 26-28 */
+ 0l, 0l, /* kstkesp, kstkeip 29-30 */
+ 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch 31-34 */
+ wchan, /* 35 */
+ 0l, 0l, /* nswap,cnswap 36-37 */
+ exit_signal, /* exit_signal 38 */
+ cpu /* 39 */);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+struct lxpr_ifstat {
+ uint64_t rx_bytes;
+ uint64_t rx_packets;
+ uint64_t rx_errors;
+ uint64_t rx_drop;
+ uint64_t tx_bytes;
+ uint64_t tx_packets;
+ uint64_t tx_errors;
+ uint64_t tx_drop;
+ uint64_t collisions;
+ uint64_t rx_multicast;
+};
+
+static void *
+lxpr_kstat_read(kstat_t *kn, boolean_t byname, size_t *size, int *num,
+ zoneid_t zoneid)
+{
+ kstat_t *kp;
+ int i, nrec = 0;
+ size_t bufsize;
+ void *buf = NULL;
+
+ if (byname == B_TRUE) {
+ kp = kstat_hold_byname(kn->ks_module, kn->ks_instance,
+ kn->ks_name, zoneid);
+ } else {
+ kp = kstat_hold_bykid(kn->ks_kid, zoneid);
+ }
+ if (kp == NULL) {
+ return (NULL);
+ }
+ if (kp->ks_flags & KSTAT_FLAG_INVALID) {
+ kstat_rele(kp);
+ return (NULL);
+ }
+
+ bufsize = kp->ks_data_size + 1;
+ kstat_rele(kp);
+
+	/*
+	 * The kstat in question is released so that kmem_alloc(KM_SLEEP) is
+	 * performed without it held. After the alloc, the kstat is reacquired
+	 * and its size is checked again. If the buffer is no longer large
+	 * enough, the alloc and check are repeated once more before giving
+	 * up.
+	 */
+ for (i = 0; i < 2; i++) {
+ buf = kmem_alloc(bufsize, KM_SLEEP);
+
+ /* Check if bufsize still appropriate */
+ if (byname == B_TRUE) {
+ kp = kstat_hold_byname(kn->ks_module, kn->ks_instance,
+ kn->ks_name, zoneid);
+ } else {
+ kp = kstat_hold_bykid(kn->ks_kid, zoneid);
+ }
+ if (kp == NULL || kp->ks_flags & KSTAT_FLAG_INVALID) {
+ if (kp != NULL) {
+ kstat_rele(kp);
+ }
+ kmem_free(buf, bufsize);
+ return (NULL);
+ }
+ KSTAT_ENTER(kp);
+ (void) KSTAT_UPDATE(kp, KSTAT_READ);
+ if (bufsize < kp->ks_data_size) {
+ kmem_free(buf, bufsize);
+ buf = NULL;
+ bufsize = kp->ks_data_size + 1;
+ KSTAT_EXIT(kp);
+ kstat_rele(kp);
+ continue;
+ } else {
+ if (KSTAT_SNAPSHOT(kp, buf, KSTAT_READ) != 0) {
+ kmem_free(buf, bufsize);
+ buf = NULL;
+ }
+ nrec = kp->ks_ndata;
+ KSTAT_EXIT(kp);
+ kstat_rele(kp);
+ break;
+ }
+ }
+
+ if (buf != NULL) {
+ *size = bufsize;
+ *num = nrec;
+ }
+ return (buf);
+}
+
+static int
+lxpr_kstat_ifstat(kstat_t *kn, struct lxpr_ifstat *ifs, zoneid_t zoneid)
+{
+ kstat_named_t *kp;
+ int i, num;
+ size_t size;
+
+ /*
+ * Search by name instead of by kid since there's a small window to
+ * race against kstats being added/removed.
+ */
+ bzero(ifs, sizeof (*ifs));
+ kp = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num, zoneid);
+ if (kp == NULL)
+ return (-1);
+ for (i = 0; i < num; i++) {
+ if (strncmp(kp[i].name, "rbytes64", KSTAT_STRLEN) == 0)
+ ifs->rx_bytes = kp[i].value.ui64;
+ else if (strncmp(kp[i].name, "ipackets64", KSTAT_STRLEN) == 0)
+ ifs->rx_packets = kp[i].value.ui64;
+ else if (strncmp(kp[i].name, "ierrors", KSTAT_STRLEN) == 0)
+ ifs->rx_errors = kp[i].value.ui32;
+ else if (strncmp(kp[i].name, "norcvbuf", KSTAT_STRLEN) == 0)
+ ifs->rx_drop = kp[i].value.ui32;
+ else if (strncmp(kp[i].name, "multircv", KSTAT_STRLEN) == 0)
+ ifs->rx_multicast = kp[i].value.ui32;
+ else if (strncmp(kp[i].name, "obytes64", KSTAT_STRLEN) == 0)
+ ifs->tx_bytes = kp[i].value.ui64;
+ else if (strncmp(kp[i].name, "opackets64", KSTAT_STRLEN) == 0)
+ ifs->tx_packets = kp[i].value.ui64;
+ else if (strncmp(kp[i].name, "oerrors", KSTAT_STRLEN) == 0)
+ ifs->tx_errors = kp[i].value.ui32;
+ else if (strncmp(kp[i].name, "noxmtbuf", KSTAT_STRLEN) == 0)
+ ifs->tx_drop = kp[i].value.ui32;
+ else if (strncmp(kp[i].name, "collisions", KSTAT_STRLEN) == 0)
+ ifs->collisions = kp[i].value.ui32;
+ }
+ kmem_free(kp, size);
+ return (0);
+}
+
+static void
+lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ kstat_t *ksr;
+ kstat_t ks0;
+ int i, nidx;
+ size_t sidx;
+ struct lxpr_ifstat ifs;
+ zoneid_t zoneid = LXPTOZ(lxpnp)->zone_id;
+
+ lxpr_uiobuf_printf(uiobuf, "Inter-| Receive "
+ " | Transmit\n");
+ lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo"
+ " frame compressed multicast|bytes packets errs drop fifo"
+ " colls carrier compressed\n");
+
+ ks0.ks_kid = 0;
+ ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx, zoneid);
+ if (ksr == NULL)
+ return;
+
+ for (i = 1; i < nidx; i++) {
+ if (strncmp(ksr[i].ks_module, "link", KSTAT_STRLEN) == 0 ||
+ strncmp(ksr[i].ks_module, "lo", KSTAT_STRLEN) == 0) {
+ if (lxpr_kstat_ifstat(&ksr[i], &ifs, zoneid) != 0)
+ continue;
+
+ /* Overwriting the name is ok in the local snapshot */
+ lx_ifname_convert(ksr[i].ks_name, LX_IF_FROMNATIVE);
+ lxpr_uiobuf_printf(uiobuf, "%6s: %7llu %7llu %4lu "
+ "%4lu %4u %5u %10u %9lu %8llu %7llu %4lu %4lu %4u "
+ "%5lu %7u %10u\n",
+ ksr[i].ks_name,
+ ifs.rx_bytes, ifs.rx_packets,
+ ifs.rx_errors, ifs.rx_drop,
+ 0, 0, 0, ifs.rx_multicast,
+ ifs.tx_bytes, ifs.tx_packets,
+ ifs.tx_errors, ifs.tx_drop,
+ 0, ifs.collisions, 0, 0);
+ }
+ }
+
+ kmem_free(ksr, sidx);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
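+/*
+ * Format an IPv6 address as 32 contiguous lower-case hex digits (no colon
+ * separators), the representation Linux uses in /proc/net/if_inet6 and the
+ * IPv6 route file; e.g. ::1 is rendered as 31 '0's followed by '1'.
+ */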
+static void
+lxpr_inet6_out(const in6_addr_t *addr, char buf[33])
+{
+ const uint8_t *ip = addr->s6_addr;
+ char digits[] = "0123456789abcdef";
+ int i;
+ for (i = 0; i < 16; i++) {
+ buf[2 * i] = digits[ip[i] >> 4];
+ buf[2 * i + 1] = digits[ip[i] & 0xf];
+ }
+ buf[32] = '\0';
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_if_inet6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ ip_stack_t *ipst;
+ ill_t *ill;
+ ipif_t *ipif;
+ ill_walk_context_t ctx;
+ char ifname[LIFNAMSIZ], ip6out[33];
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL)
+ return;
+ ipst = ns->netstack_ip;
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ ill = ILL_START_WALK_V6(&ctx, ipst);
+
+ for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ for (ipif = ill->ill_ipif; ipif != NULL;
+ ipif = ipif->ipif_next) {
+ uint_t index = ill->ill_phyint->phyint_ifindex;
+ int plen = ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
+ unsigned int scope = lx_ipv6_scope_convert(
+ &ipif->ipif_v6lcl_addr);
+ /* Always report PERMANENT flag */
+ int flag = 0x80;
+
+ (void) snprintf(ifname, LIFNAMSIZ, "%s", ill->ill_name);
+ lx_ifname_convert(ifname, LX_IF_FROMNATIVE);
+ lxpr_inet6_out(&ipif->ipif_v6lcl_addr, ip6out);
+
+ lxpr_uiobuf_printf(uiobuf, "%32s %02x %02x %02x %02x"
+ " %8s\n", ip6out, index, plen, scope, flag, ifname);
+ }
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ netstack_rele(ns);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+static void
+lxpr_format_route_ipv6(ire_t *ire, lxpr_uiobuf_t *uiobuf)
+{
+ uint32_t flags;
+ char name[IFNAMSIZ];
+ char ipv6addr[33];
+
+ lxpr_inet6_out(&ire->ire_addr_v6, ipv6addr);
+ lxpr_uiobuf_printf(uiobuf, "%s %02x ", ipv6addr,
+ ip_mask_to_plen_v6(&ire->ire_mask_v6));
+
+ /* punt on this for now */
+ lxpr_uiobuf_printf(uiobuf, "%s %02x ",
+ "00000000000000000000000000000000", 0);
+
+ lxpr_inet6_out(&ire->ire_gateway_addr_v6, ipv6addr);
+ lxpr_uiobuf_printf(uiobuf, "%s", ipv6addr);
+
+ flags = ire->ire_flags &
+ (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED);
+ /* Linux's RTF_LOCAL equivalent */
+ if (ire->ire_metrics.iulp_local)
+ flags |= 0x80000000;
+
+ if (ire->ire_ill != NULL) {
+ ill_get_name(ire->ire_ill, name, sizeof (name));
+ lx_ifname_convert(name, LX_IF_FROMNATIVE);
+ } else {
+ name[0] = '\0';
+ }
+
+ lxpr_uiobuf_printf(uiobuf, " %08x %08x %08x %08x %8s\n",
+ 0, /* metric */
+ ire->ire_refcnt,
+ 0,
+ flags,
+ name);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ipv6_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ ip_stack_t *ipst;
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL)
+ return;
+ ipst = ns->netstack_ip;
+
+	/*
+	 * LX branded zones are expected to have an exclusive IP stack, hence
+	 * ALL_ZONES is used as the zoneid filter.
+	 */
+ ire_walk_v6(&lxpr_format_route_ipv6, uiobuf, ALL_ZONES, ipst);
+
+ netstack_rele(ns);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+#define LXPR_SKIP_ROUTE(type) \
+ (((IRE_IF_CLONE | IRE_BROADCAST | IRE_MULTICAST | \
+ IRE_NOROUTE | IRE_LOOPBACK | IRE_LOCAL) & type) != 0)
+
+static void
+lxpr_format_route_ipv4(ire_t *ire, lxpr_uiobuf_t *uiobuf)
+{
+ uint32_t flags;
+ char name[IFNAMSIZ];
+ ill_t *ill;
+ ire_t *nire;
+ ipif_t *ipif;
+ ipaddr_t gateway;
+
+ if (LXPR_SKIP_ROUTE(ire->ire_type) || ire->ire_testhidden != 0)
+ return;
+
+ /* These route flags have direct Linux equivalents */
+ flags = ire->ire_flags &
+ (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED);
+
+	/*
+	 * Search for a suitable IRE for naming purposes.
+	 * On Linux, the default route is typically associated with the
+	 * interface used to access the gateway. The default IRE on illumos
+	 * typically lacks an ill reference but its parent might have one.
+	 */
+ nire = ire;
+ do {
+ ill = nire->ire_ill;
+ nire = nire->ire_dep_parent;
+ } while (ill == NULL && nire != NULL);
+ if (ill != NULL) {
+ ill_get_name(ill, name, sizeof (name));
+ lx_ifname_convert(name, LX_IF_FROMNATIVE);
+ } else {
+ name[0] = '*';
+ name[1] = '\0';
+ }
+
+ /*
+ * Linux suppresses the gateway address for directly connected
+ * interface networks. To emulate this behavior, we walk all addresses
+ * of a given route interface. If one matches the gateway, it is
+ * displayed as NULL.
+ */
+ gateway = ire->ire_gateway_addr;
+ if ((ill = ire->ire_ill) != NULL) {
+ for (ipif = ill->ill_ipif; ipif != NULL;
+ ipif = ipif->ipif_next) {
+ if (ipif->ipif_lcl_addr == gateway) {
+ gateway = 0;
+ break;
+ }
+ }
+ }
+
+ lxpr_uiobuf_printf(uiobuf, "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u\n",
+ name,
+ ire->ire_addr,
+ gateway,
+ flags, 0, 0,
+ 0, /* priority */
+ ire->ire_mask,
+ 0, 0, /* mss, window */
+ ire->ire_metrics.iulp_rtt);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ ip_stack_t *ipst;
+
+ lxpr_uiobuf_printf(uiobuf, "Iface\tDestination\tGateway \tFlags\t"
+ "RefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n");
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL)
+ return;
+ ipst = ns->netstack_ip;
+
+	/*
+	 * LX branded zones are expected to have an exclusive IP stack, hence
+	 * ALL_ZONES is used as the zoneid filter.
+	 */
+ ire_walk_v4(&lxpr_format_route_ipv4, uiobuf, ALL_ZONES, ipst);
+
+ netstack_rele(ns);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+typedef struct lxpr_snmp_table {
+ const char *lst_proto;
+ const char **lst_fields;
+} lxpr_snmp_table_t;
+
+static const char *lxpr_snmp_ip_fields[] = {
+ "forwarding", "defaultTTL", "inReceives", "inHdrErrors",
+ "inAddrErrors", "forwDatagrams", "inUnknownProtos", "inDiscards",
+ "inDelivers", "outRequests", "outDiscards", "outNoRoutes",
+ "reasmTimeout", "reasmReqds", "reasmOKs", "reasmFails", "fragOKs",
+ "fragFails", "fragCreates",
+ NULL
+};
+
+static const char *lxpr_snmp_icmp_fields[] = {
+ "inMsgs", "inErrors", "inCsumErrors", "inDestUnreachs", "inTimeExcds",
+ "inParmProbs", "inSrcQuenchs", "inRedirects", "inEchos", "inEchoReps",
+ "inTimestamps", "inTimestampReps", "inAddrMasks", "inAddrMaskReps",
+ "outMsgs", "outErrors", "outDestUnreachs", "outTimeExcds",
+ "outParmProbs", "outSrcQuenchs", "outRedirects", "outEchos",
+ "outEchoReps", "outTimestamps", "outTimestampReps", "outAddrMasks",
+ "outAddrMaskReps",
+ NULL
+};
+
+static const char *lxpr_snmp_tcp_fields[] = {
+ "rtoAlgorithm", "rtoMin", "rtoMax", "maxConn", "activeOpens",
+ "passiveOpens", "attemptFails", "estabResets", "currEstab", "inSegs",
+ "outSegs", "retransSegs", "inErrs", "outRsts", "inCsumErrors",
+ NULL
+};
+
+static const char *lxpr_snmp_udp_fields[] = {
+ "inDatagrams", "noPorts", "inErrors", "outDatagrams", "rcvbufErrors",
+ "sndbufErrors", "inCsumErrors",
+ NULL
+};
+
+static lxpr_snmp_table_t lxpr_snmp_ip = { "ip", lxpr_snmp_ip_fields };
+static lxpr_snmp_table_t lxpr_snmp_icmp = { "icmp", lxpr_snmp_icmp_fields };
+static lxpr_snmp_table_t lxpr_snmp_tcp = { "tcp", lxpr_snmp_tcp_fields };
+static lxpr_snmp_table_t lxpr_snmp_udp = { "udp", lxpr_snmp_udp_fields };
+
+static lxpr_snmp_table_t *lxpr_net_snmptab[] = {
+ &lxpr_snmp_ip,
+ &lxpr_snmp_icmp,
+ &lxpr_snmp_tcp,
+ &lxpr_snmp_udp,
+ NULL
+};
+
+static void
+lxpr_kstat_print_tab(lxpr_uiobuf_t *uiobuf, lxpr_snmp_table_t *table,
+ kstat_t *kn, zoneid_t zoneid)
+{
+ kstat_named_t *klist;
+ char upname[KSTAT_STRLEN], upfield[KSTAT_STRLEN];
+ int i, j, num;
+ size_t size;
+
+ klist = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num,
+ zoneid);
+ if (klist == NULL)
+ return;
+
+ /* Print the header line, fields capitalized */
+ (void) strncpy(upname, table->lst_proto, KSTAT_STRLEN);
+ upname[0] = toupper(upname[0]);
+ lxpr_uiobuf_printf(uiobuf, "%s:", upname);
+ for (i = 0; table->lst_fields[i] != NULL; i++) {
+ (void) strncpy(upfield, table->lst_fields[i], KSTAT_STRLEN);
+ upfield[0] = toupper(upfield[0]);
+ lxpr_uiobuf_printf(uiobuf, " %s", upfield);
+ }
+ lxpr_uiobuf_printf(uiobuf, "\n%s:", upname);
+
+ /* Then loop back through to print the value line. */
+ for (i = 0; table->lst_fields[i] != NULL; i++) {
+ kstat_named_t *kpoint = NULL;
+ for (j = 0; j < num; j++) {
+ if (strncmp(klist[j].name, table->lst_fields[i],
+ KSTAT_STRLEN) == 0) {
+ kpoint = &klist[j];
+ break;
+ }
+ }
+ if (kpoint == NULL) {
+ /* Output 0 for unknown fields */
+ lxpr_uiobuf_printf(uiobuf, " 0");
+ } else {
+ switch (kpoint->data_type) {
+ case KSTAT_DATA_INT32:
+ lxpr_uiobuf_printf(uiobuf, " %d",
+ kpoint->value.i32);
+ break;
+ case KSTAT_DATA_UINT32:
+ lxpr_uiobuf_printf(uiobuf, " %u",
+ kpoint->value.ui32);
+ break;
+ case KSTAT_DATA_INT64:
+ lxpr_uiobuf_printf(uiobuf, " %ld",
+ kpoint->value.l);
+ break;
+ case KSTAT_DATA_UINT64:
+ lxpr_uiobuf_printf(uiobuf, " %lu",
+ kpoint->value.ul);
+ break;
+ }
+ }
+ }
+ lxpr_uiobuf_printf(uiobuf, "\n");
+ kmem_free(klist, size);
+}
+
+static void
+lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ kstat_t *ksr;
+ kstat_t ks0;
+ lxpr_snmp_table_t **table = lxpr_net_snmptab;
+ int i, t, nidx;
+ size_t sidx;
+ zoneid_t zoneid = LXPTOZ(lxpnp)->zone_id;
+
+ ks0.ks_kid = 0;
+ ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx, zoneid);
+ if (ksr == NULL)
+ return;
+
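+	/*
+	 * For each protocol table, locate the matching "mib2" class kstat by
+	 * name and emit its header and value lines.
+	 */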
+ for (t = 0; table[t] != NULL; t++) {
+ for (i = 0; i < nidx; i++) {
+ if (strncmp(ksr[i].ks_class, "mib2", KSTAT_STRLEN) != 0)
+ continue;
+ if (strncmp(ksr[i].ks_name, table[t]->lst_proto,
+ KSTAT_STRLEN) == 0) {
+ lxpr_kstat_print_tab(uiobuf, table[t], &ksr[i],
+ zoneid);
+ break;
+ }
+ }
+ }
+ kmem_free(ksr, sidx);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+static int
+lxpr_convert_tcp_state(int st)
+{
+ /*
+ * Derived from the enum located in the Linux kernel sources:
+ * include/net/tcp_states.h
+ */
+ switch (st) {
+ case TCPS_ESTABLISHED:
+ return (1);
+ case TCPS_SYN_SENT:
+ return (2);
+ case TCPS_SYN_RCVD:
+ return (3);
+ case TCPS_FIN_WAIT_1:
+ return (4);
+ case TCPS_FIN_WAIT_2:
+ return (5);
+ case TCPS_TIME_WAIT:
+ return (6);
+ case TCPS_CLOSED:
+ return (7);
+ case TCPS_CLOSE_WAIT:
+ return (8);
+ case TCPS_LAST_ACK:
+ return (9);
+ case TCPS_LISTEN:
+ return (10);
+ case TCPS_CLOSING:
+ return (11);
+ default:
+ /* No translation for TCPS_IDLE, TCPS_BOUND or anything else */
+ return (0);
+ }
+}
+
+static void
+lxpr_format_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, ushort_t ipver)
+{
+ int i, sl = 0;
+ connf_t *connfp;
+ conn_t *connp;
+ netstack_t *ns;
+ ip_stack_t *ipst;
+ int sonode_shift;
+
+ ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION);
+ if (ipver == IPV4_VERSION) {
+ lxpr_uiobuf_printf(uiobuf, " sl local_address rem_address "
+ "st tx_queue rx_queue tr tm->when retrnsmt uid timeout "
+ "inode\n");
+ } else {
+ lxpr_uiobuf_printf(uiobuf, " sl "
+ "local_address "
+ "remote_address "
+ "st tx_queue rx_queue tr tm->when retrnsmt "
+ "uid timeout inode\n");
+ }
+ /*
+ * Due to differences between the Linux and illumos TCP
+ * implementations, some data will be omitted from the output here.
+ *
+ * Valid fields:
+ * - local_address
+ * - remote_address
+ * - st
+ * - tx_queue
+ * - rx_queue
+ * - uid
+ * - inode
+ *
+ * Omitted/invalid fields
+ * - tr
+ * - tm->when
+ * - retrnsmt
+ * - timeout
+ */
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL)
+ return;
+ ipst = ns->netstack_ip;
+
+ sonode_shift = highbit(sizeof (sonode_t));
+
+ for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+ connfp = &ipst->ips_ipcl_globalhash_fanout[i];
+ connp = NULL;
+ while ((connp =
+ ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) {
+ tcp_t *tcp;
+ ino_t inode;
+ sonode_t *so = (sonode_t *)connp->conn_upper_handle;
+ if (connp->conn_ipversion != ipver)
+ continue;
+ tcp = connp->conn_tcp;
+ if (ipver == IPV4_VERSION) {
+ lxpr_uiobuf_printf(uiobuf,
+ "%4d: %08X:%04X %08X:%04X ",
+ ++sl,
+ connp->conn_laddr_v4,
+ ntohs(connp->conn_lport),
+ connp->conn_faddr_v4,
+ ntohs(connp->conn_fport));
+ } else {
+ lxpr_uiobuf_printf(uiobuf, "%4d: "
+ "%08X%08X%08X%08X:%04X "
+ "%08X%08X%08X%08X:%04X ",
+ ++sl,
+ connp->conn_laddr_v6.s6_addr32[0],
+ connp->conn_laddr_v6.s6_addr32[1],
+ connp->conn_laddr_v6.s6_addr32[2],
+ connp->conn_laddr_v6.s6_addr32[3],
+ ntohs(connp->conn_lport),
+ connp->conn_faddr_v6.s6_addr32[0],
+ connp->conn_faddr_v6.s6_addr32[1],
+ connp->conn_faddr_v6.s6_addr32[2],
+ connp->conn_faddr_v6.s6_addr32[3],
+ ntohs(connp->conn_fport));
+ }
+
+ /*
+ * We cannot use VOP_GETATTR here to fetch the
+ * simulated inode for the socket via the
+ * so->so_vnode. This is because there is a (very
+ * tight) race for when the v_vfsp is set on the
+ * sonode's vnode. However, all we really want here is
+ * the inode number, which we can compute using the
+ * same algorithm as socket_vop_getattr.
+ */
+ inode = ((ino_t)so >> sonode_shift) & 0xFFFF;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%02X %08X:%08X %02X:%08X %08X "
+ "%5u %8d %lu %d %p %u %u %u %u %d\n",
+ lxpr_convert_tcp_state(tcp->tcp_state),
+ tcp->tcp_rcv_cnt, tcp->tcp_unsent, /* rx/tx queue */
+ 0, 0, /* tr, when */
+ 0, /* per-connection rexmits aren't tracked today */
+ connp->conn_cred->cr_uid,
+ 0, /* timeout */
+ /* inode + more */
+ inode, 0, NULL, 0, 0, 0, 0, 0);
+ }
+ }
+ netstack_rele(ns);
+}
+
+static void
+lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_format_tcp(lxpnp, uiobuf, IPV4_VERSION);
+}
+
+static void
+lxpr_read_net_tcp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_format_tcp(lxpnp, uiobuf, IPV6_VERSION);
+}
+
+static void
+lxpr_format_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, ushort_t ipver)
+{
+ int i, sl = 0;
+ connf_t *connfp;
+ conn_t *connp;
+ netstack_t *ns;
+ ip_stack_t *ipst;
+ int sonode_shift;
+
+ ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION);
+ if (ipver == IPV4_VERSION) {
+ lxpr_uiobuf_printf(uiobuf, " sl local_address rem_address"
+ " st tx_queue rx_queue tr tm->when retrnsmt uid"
+ " timeout inode ref pointer drops\n");
+ } else {
+ lxpr_uiobuf_printf(uiobuf, " sl "
+ "local_address "
+ "remote_address "
+ "st tx_queue rx_queue tr tm->when retrnsmt "
+ "uid timeout inode ref pointer drops\n");
+ }
+ /*
+ * Due to differences between the Linux and illumos UDP
+ * implementations, some data will be omitted from the output here.
+ *
+ * Valid fields:
+ * - local_address
+ * - remote_address
+ * - st: limited
+ * - uid
+ *
+ * Omitted/invalid fields
+ * - tx_queue
+ * - rx_queue
+ * - tr
+ * - tm->when
+ * - retrnsmt
+ * - timeout
+ * - inode
+ */
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL)
+ return;
+ ipst = ns->netstack_ip;
+
+ sonode_shift = highbit(sizeof (sonode_t));
+
+ for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+ connfp = &ipst->ips_ipcl_globalhash_fanout[i];
+ connp = NULL;
+ while ((connp =
+ ipcl_get_next_conn(connfp, connp, IPCL_UDPCONN)) != NULL) {
+ udp_t *udp;
+ ino_t inode;
+ int state = 0;
+ sonode_t *so = (sonode_t *)connp->conn_upper_handle;
+ if (connp->conn_ipversion != ipver)
+ continue;
+ udp = connp->conn_udp;
+ if (ipver == IPV4_VERSION) {
+ lxpr_uiobuf_printf(uiobuf,
+ "%4d: %08X:%04X %08X:%04X ",
+ ++sl,
+ connp->conn_laddr_v4,
+ ntohs(connp->conn_lport),
+ connp->conn_faddr_v4,
+ ntohs(connp->conn_fport));
+ } else {
+ lxpr_uiobuf_printf(uiobuf, "%4d: "
+ "%08X%08X%08X%08X:%04X "
+ "%08X%08X%08X%08X:%04X ",
+ ++sl,
+ connp->conn_laddr_v6.s6_addr32[0],
+ connp->conn_laddr_v6.s6_addr32[1],
+ connp->conn_laddr_v6.s6_addr32[2],
+ connp->conn_laddr_v6.s6_addr32[3],
+ ntohs(connp->conn_lport),
+ connp->conn_faddr_v6.s6_addr32[0],
+ connp->conn_faddr_v6.s6_addr32[1],
+ connp->conn_faddr_v6.s6_addr32[2],
+ connp->conn_faddr_v6.s6_addr32[3],
+ ntohs(connp->conn_fport));
+ }
+
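+			/*
+			 * Map the TPI endpoint state onto the Linux values
+			 * used by lxpr_convert_tcp_state(): 7 (closed) for
+			 * unbound/idle endpoints, 1 (established) for data
+			 * transfer.
+			 */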
+ switch (udp->udp_state) {
+ case TS_UNBND:
+ case TS_IDLE:
+ state = 7;
+ break;
+ case TS_DATA_XFER:
+ state = 1;
+ break;
+ }
+
+ /*
+ * We cannot use VOP_GETATTR here to fetch the
+ * simulated inode for the socket via the
+ * so->so_vnode. This is because there is a (very
+ * tight) race for when the v_vfsp is set on the
+ * sonode's vnode. However, all we really want here is
+ * the inode number, which we can compute using the
+ * same algorithm as socket_vop_getattr.
+ */
+ inode = ((ino_t)so >> sonode_shift) & 0xFFFF;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%02X %08X:%08X %02X:%08X %08X "
+ "%5u %8d %lu %d %p %d\n",
+ state,
+ 0, 0, /* rx/tx queue */
+ 0, 0, /* tr, when */
+ 0, /* retrans */
+ connp->conn_cred->cr_uid,
+ 0, /* timeout */
+ /* inode, ref, pointer, drops */
+ inode, 0, NULL, 0);
+ }
+ }
+ netstack_rele(ns);
+}
+
+static void
+lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_format_udp(lxpnp, uiobuf, IPV4_VERSION);
+}
+
+static void
+lxpr_read_net_udp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_format_udp(lxpnp, uiobuf, IPV6_VERSION);
+}
+
+static void
+lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ sonode_t *so;
+ zoneid_t zoneid = LXPTOZ(lxpnp)->zone_id;
+
+ lxpr_uiobuf_printf(uiobuf, "Num RefCount Protocol Flags Type "
+ "St Inode Path\n");
+
+ mutex_enter(&socklist.sl_lock);
+ for (so = socklist.sl_list; so != NULL;
+ so = _SOTOTPI(so)->sti_next_so) {
+ vnode_t *vp = so->so_vnode;
+ vattr_t attr;
+ sotpi_info_t *sti;
+ const char *name = NULL;
+ int status = 0;
+ int type = 0;
+ int flags = 0;
+
+ /* Only process active sonodes in this zone */
+ if (so->so_count == 0 || so->so_zoneid != zoneid)
+ continue;
+
+ /*
+ * Grab the inode, if possible.
+ * This must be done before entering so_lock.
+ */
+ if (vp == NULL ||
+ VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0)
+ attr.va_nodeid = 0;
+
+ mutex_enter(&so->so_lock);
+ sti = _SOTOTPI(so);
+
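+		/* Prefer the local address path; fall back to the peer's. */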
+ if (sti->sti_laddr_sa != NULL &&
+ sti->sti_laddr_len > 0) {
+ name = sti->sti_laddr_sa->sa_data;
+ } else if (sti->sti_faddr_sa != NULL &&
+ sti->sti_faddr_len > 0) {
+ name = sti->sti_faddr_sa->sa_data;
+ }
+
+ /*
+ * Derived from enum values in Linux kernel source:
+ * include/uapi/linux/net.h
+ */
+ if ((so->so_state & SS_ISDISCONNECTING) != 0) {
+ status = 4;
+ } else if ((so->so_state & SS_ISCONNECTING) != 0) {
+ status = 2;
+ } else if ((so->so_state & SS_ISCONNECTED) != 0) {
+ status = 3;
+ } else {
+ status = 1;
+ /* Add ACC flag for stream-type server sockets */
+ if (so->so_type != SOCK_DGRAM &&
+ sti->sti_laddr_sa != NULL)
+ flags |= 0x10000;
+ }
+
+ /* Convert to Linux type */
+ switch (so->so_type) {
+ case SOCK_DGRAM:
+ type = 2;
+ break;
+ case SOCK_SEQPACKET:
+ type = 5;
+ break;
+ default:
+ type = 1;
+ }
+
+ lxpr_uiobuf_printf(uiobuf, "%p: %08X %08X %08X %04X %02X %5llu",
+ so,
+ so->so_count,
+ 0, /* proto, always 0 */
+ flags,
+ type,
+ status,
+ (ino_t)attr.va_nodeid);
+
+ /*
+ * Due to shortcomings in the abstract socket emulation, they
+ * cannot be properly represented here (as @<path>).
+ *
+ * This will be the case until they are better implemented.
+ */
+ if (name != NULL)
+ lxpr_uiobuf_printf(uiobuf, " %s\n", name);
+ else
+ lxpr_uiobuf_printf(uiobuf, "\n");
+ mutex_exit(&so->so_lock);
+ }
+ mutex_exit(&socklist.sl_lock);
+}
+
+/*
+ * lxpr_read_kmsg(): read the contents of the kernel message queue. We
+ * translate this into the reception of console messages for this zone; each
+ * read copies out a single zone console message, or blocks until the next one
+ * is produced, unless the file was opened non-blocking, in which case we
+ * return after 1ms.
+ */
+
+#define LX_KMSG_PRI "<0>"
+
+/* ARGSUSED */
+static void
+lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh)
+{
+ mblk_t *mp;
+ timestruc_t to;
+ timestruc_t *tp = NULL;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_KMSG);
+
+ if (lxpr_uiobuf_nonblock(uiobuf)) {
+ to.tv_sec = 0;
+ to.tv_nsec = 1000000; /* 1msec */
+ tp = &to;
+ }
+
+ if (ldi_getmsg(lh, &mp, tp) == 0) {
+ /*
+ * lx procfs doesn't like successive reads to the same file
+ * descriptor unless we do an explicit rewind each time.
+ */
+ lxpr_uiobuf_seek(uiobuf, 0);
+
+ lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI,
+ mp->b_cont->b_rptr);
+
+ freemsg(mp);
+ }
+}
+
+/*
+ * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just
+ * enough for uptime and other simple lxproc readers to work
+ */
+extern int nthread;
+
+static void
+lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ulong_t avenrun1;
+ ulong_t avenrun5;
+ ulong_t avenrun15;
+ ulong_t avenrun1_cs;
+ ulong_t avenrun5_cs;
+ ulong_t avenrun15_cs;
+ int loadavg[3];
+ int *loadbuf;
+ cpupart_t *cp;
+ zone_t *zone = LXPTOZ(lxpnp);
+
+ uint_t nrunnable = 0;
+ rctl_qty_t nlwps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG);
+
+ mutex_enter(&cpu_lock);
+
+ /*
+ * Need to add up values over all CPU partitions. If pools are active,
+ * only report the values of the zone's partition, which by definition
+ * includes the current CPU.
+ */
+ if (pool_pset_enabled()) {
+ psetid_t psetid = zone_pset_get(LXPTOZ(lxpnp));
+
+ ASSERT(LXPTOZ(lxpnp) != &zone0);
+ cp = CPU->cpu_part;
+
+ nrunnable = cp->cp_nrunning + cp->cp_nrunnable;
+ (void) cpupart_get_loadavg(psetid, &loadavg[0], 3);
+ loadbuf = &loadavg[0];
+ } else {
+ cp = cp_list_head;
+ do {
+ nrunnable += cp->cp_nrunning + cp->cp_nrunnable;
+ } while ((cp = cp->cp_next) != cp_list_head);
+
+ loadbuf = zone == global_zone ?
+ &avenrun[0] : zone->zone_avenrun;
+ }
+
+	/*
+	 * If we're in the non-global zone, we'll report the total number of
+	 * LWPs in the zone for the "nproc" parameter of /proc/loadavg;
+	 * otherwise we'll just use nthread (which will include kernel
+	 * threads, but should be good enough for lxproc).
+	 */
+ nlwps = zone == global_zone ? nthread : zone->zone_nlwps;
+
+ mutex_exit(&cpu_lock);
+
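+	/*
+	 * The load averages are fixed-point with FSHIFT fractional bits;
+	 * split each into its integer part and hundredths for display.
+	 */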
+ avenrun1 = loadbuf[0] >> FSHIFT;
+ avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT;
+ avenrun5 = loadbuf[1] >> FSHIFT;
+ avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT;
+ avenrun15 = loadbuf[2] >> FSHIFT;
+ avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n",
+ avenrun1, avenrun1_cs,
+ avenrun5, avenrun5_cs,
+ avenrun15, avenrun15_cs,
+ nrunnable, nlwps, 0);
+}
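+
+/*
+ * Worked example of the fixed-point decoding above (hypothetical load;
+ * assuming FSHIFT == 8, so FSCALE == 256): a raw average of 384 yields
+ *
+ *	avenrun1    = 384 >> 8                 = 1
+ *	avenrun1_cs = ((384 & 255) * 100) >> 8 = 50
+ *
+ * and is printed as "1.50".
+ */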
+
+/*
+ * lxpr_read_meminfo(): read the contents of the "meminfo" file.
+ */
+static void
+lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ lx_zone_data_t *lxzd = ztolxzd(zone);
+ ulong_t total_mem, free_mem, total_swap;
+ boolean_t swap_disabled;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO);
+ ASSERT(zone->zone_brand == &lx_brand);
+ ASSERT(lxzd != NULL);
+ swap_disabled = lxzd->lxzd_swap_disabled;
+
+ zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem,
+ (pgcnt_t *)&free_mem);
+ total_mem = ptob(total_mem);
+ free_mem = ptob(free_mem);
+
+ if (swap_disabled) {
+ total_swap = 0;
+ } else {
+ if (zone->zone_max_swap_ctl == UINT64_MAX) {
+ total_swap = ptob(k_anoninfo.ani_max);
+ } else {
+ mutex_enter(&zone->zone_mem_lock);
+ total_swap = zone->zone_max_swap_ctl;
+ mutex_exit(&zone->zone_mem_lock);
+ }
+ }
+
+ /*
+ * SwapFree
+ * On illumos we reserve swap up front, whereas on Linux they just
+ * wing it and kill a random process if they run out of backing store
+ * for virtual memory. Our swap reservation doesn't translate to that
+ * model, so just inform the caller that no swap is being used.
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "MemTotal: %8lu kB\n"
+ "MemFree: %8lu kB\n"
+ "MemShared: %8u kB\n"
+ "Buffers: %8u kB\n"
+ "Cached: %8u kB\n"
+ "SwapCached:%8u kB\n"
+ "Active: %8u kB\n"
+ "Inactive: %8u kB\n"
+ "HighTotal: %8u kB\n"
+ "HighFree: %8u kB\n"
+ "LowTotal: %8u kB\n"
+ "LowFree: %8u kB\n"
+ "SwapTotal: %8lu kB\n"
+ "SwapFree: %8lu kB\n",
+ btok(total_mem), /* MemTotal */
+ btok(free_mem), /* MemFree */
+ 0, /* MemShared */
+ 0, /* Buffers */
+ 0, /* Cached */
+ 0, /* SwapCached */
+ 0, /* Active */
+ 0, /* Inactive */
+ 0, /* HighTotal */
+ 0, /* HighFree */
+ btok(total_mem), /* LowTotal */
+ btok(free_mem), /* LowFree */
+ btok(total_swap), /* SwapTotal */
+ btok(total_swap)); /* SwapFree */
+}
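+
+/*
+ * A sketch of the unit conversions above (hypothetical page count,
+ * assuming a 4K PAGESIZE): zone_get_physmem_data() returns page counts,
+ * ptob() converts pages to bytes, and btok() converts bytes to the kB
+ * units Linux reports:
+ *
+ *	262144 pages -> ptob() -> 1073741824 bytes -> btok() -> 1048576 kB
+ */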
+
+/*
+ * lxpr_read_mounts():
+ *
+ * Note: we currently also use this for /proc/{pid}/mounts since we don't
+ * yet support mount namespaces.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ list_t *mounts;
+ lxpr_mount_entry_t *lme;
+
+ mounts = lxpr_enumerate_mounts(zone);
+
+ /*
+	 * Now we can run through what we've extracted without holding
+	 * vfs_list_read_lock().
+ */
+ lme = list_remove_head(mounts);
+ while (lme != NULL) {
+ char *resource, *mntpt, *fstype;
+ vnode_t *vp;
+ int error;
+
+ mntpt = (char *)refstr_value(lme->lme_mntpt);
+ resource = (char *)refstr_value(lme->lme_resource);
+
+ if (mntpt == NULL || mntpt[0] == '\0') {
+ goto nextp;
+ }
+ mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
+ error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ goto nextp;
+ } else if ((vp->v_flag & VROOT) == 0 && !lme->lme_force) {
+ VN_RELE(vp);
+ goto nextp;
+ }
+ VN_RELE(vp);
+
+ if (resource != NULL && resource[0] != '\0') {
+ if (resource[0] == '/') {
+ resource = ZONE_PATH_VISIBLE(resource, zone) ?
+ ZONE_PATH_TRANSLATE(resource, zone) : mntpt;
+ }
+ } else {
+ resource = "none";
+ }
+
+ /* Make things look more like Linux. */
+ fstype = vfssw[lme->lme_fstype].vsw_name;
+ if (lxpr_clean_mntent(&mntpt, &fstype, &resource) != 0 &&
+ !lme->lme_force) {
+ goto nextp;
+ }
+
+ lxpr_uiobuf_printf(uiobuf, "%s %s %s %s 0 0\n",
+ resource, mntpt, fstype, lme->lme_mntopts);
+
+nextp:
+ refstr_rele(lme->lme_mntpt);
+ refstr_rele(lme->lme_resource);
+ kmem_free(lme->lme_mntopts, lme->lme_mntopts_len);
+ kmem_free(lme, sizeof (lxpr_mount_entry_t));
+ lme = list_remove_head(mounts);
+ }
+
+ list_destroy(mounts);
+ kmem_free(mounts, sizeof (list_t));
+}
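+
+/*
+ * For reference, each line emitted above follows the mtab layout that
+ * Linux consumers expect; a hypothetical entry would look like:
+ *
+ *	zones/myzone / zfs rw,noatime 0 0
+ *
+ * i.e. resource, mount point, fstype and options, with the trailing
+ * "0 0" standing in for the unused dump/fsck fields.
+ */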
+
+/*
+ * lxpr_read_partitions():
+ *
+ * Over the years, /proc/partitions has been made considerably smaller -- to
+ * the point that it really is only major number, minor number, number of
+ * blocks (which we report as 0), and partition name.
+ *
+ * We support this because some things want to see it to make sense of
+ * /proc/diskstats, and also because "fdisk -l" and a few other things look
+ * here to find all disks on the system.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lx_zone_data_t *lxzd;
+ lx_virt_disk_t *vd;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PARTITIONS);
+
+ lxpr_uiobuf_printf(uiobuf, "major minor #blocks name\n\n");
+
+ lxzd = ztolxzd(LXPTOZ(lxpnp));
+ if (lxzd == NULL)
+ return;
+ ASSERT(lxzd->lxzd_vdisks != NULL);
+
+ vd = list_head(lxzd->lxzd_vdisks);
+ while (vd != NULL) {
+ lxpr_uiobuf_printf(uiobuf, "%4d %7d %10d %s\n",
+ getmajor(vd->lxvd_emul_dev), getminor(vd->lxvd_emul_dev),
+ 0, vd->lxvd_name);
+ vd = list_next(lxzd->lxzd_vdisks, vd);
+ }
+}
+
+/*
+ * There aren't many actual devices inside a zone but we want to provide the
+ * major numbers for the pseudo devices that do exist, including our pts/ptm
+ * device, as well as the zvol virtual disk device. We simply hardcode the
+ * emulated major numbers that are used elsewhere in the code and that match
+ * the expected Linux major numbers. See lx devfs where some of the major
+ * numbers have no defined constants.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_devices(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_DEVICES);
+
+ lxpr_uiobuf_printf(uiobuf, "Character devices:\n");
+ lxpr_uiobuf_printf(uiobuf, "%3d /dev/tty\n", LX_TTY_MAJOR);
+ lxpr_uiobuf_printf(uiobuf, "%3d /dev/console\n", LX_TTY_MAJOR);
+ lxpr_uiobuf_printf(uiobuf, "%3d /dev/ptmx\n", LX_TTY_MAJOR);
+ lxpr_uiobuf_printf(uiobuf, "%3d ptm\n", LX_PTM_MAJOR);
+ lxpr_uiobuf_printf(uiobuf, "%3d pts\n", LX_PTS_MAJOR_MIN);
+
+ lxpr_uiobuf_printf(uiobuf, "\nBlock devices:\n");
+ lxpr_uiobuf_printf(uiobuf, "%3d zvol\n", LX_MAJOR_DISK);
+}
+
+/*
+ * lxpr_read_diskstats():
+ *
+ * See the block comment above the per-device output-generating line for the
+ * details of the format.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_diskstats(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ lx_zone_data_t *lxzd;
+ kstat_t kn;
+ int num;
+ zone_vfs_kstat_t *kip;
+ size_t size;
+ lx_virt_disk_t *vd;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_DISKSTATS);
+
+ lxzd = ztolxzd(zone);
+ if (lxzd == NULL)
+ return;
+ ASSERT(lxzd->lxzd_vdisks != NULL);
+
+ /*
+ * Use the zone_vfs kstat, which is a superset of a kstat_io_t, since
+ * it tracks IO at the zone level.
+ */
+ (void) strlcpy(kn.ks_module, "zone_vfs", sizeof (kn.ks_module));
+ (void) strlcpy(kn.ks_name, zone->zone_name, sizeof (kn.ks_name));
+ kn.ks_instance = zone->zone_id;
+
+ kip = (zone_vfs_kstat_t *)lxpr_kstat_read(&kn, B_TRUE, &size, &num,
+ zone->zone_id);
+ if (kip == NULL)
+ return;
+
+ if (size < sizeof (kstat_io_t)) {
+ kmem_free(kip, size);
+ return;
+ }
+
+ /*
+ * Because the zone vfs stats are tracked at the zone level we use
+ * the same kstat for the zone's virtual disk (the zpool) and any
+	 * zvols that might also be visible within the zone.
+ */
+ vd = list_head(lxzd->lxzd_vdisks);
+ while (vd != NULL) {
+ /*
+ * /proc/diskstats is defined to have one line of output for
+ * each block device, with each line containing the following
+ * 14 fields:
+ *
+ * 1 - major number
+		 * 2 - minor number
+ * 3 - device name
+ * 4 - reads completed successfully
+ * 5 - reads merged
+ * 6 - sectors read
+ * 7 - time spent reading (ms)
+ * 8 - writes completed
+ * 9 - writes merged
+ * 10 - sectors written
+ * 11 - time spent writing (ms)
+ * 12 - I/Os currently in progress
+ * 13 - time spent doing I/Os (ms)
+ * 14 - weighted time spent doing I/Os (ms)
+ *
+ * One small hiccup: we don't actually keep track of time
+ * spent reading vs. time spent writing -- we keep track of
+ * time waiting vs. time actually performing I/O. While we
+ * could divide the total time by the I/O mix (making the
+ * obviously wrong assumption that I/O operations all take the
+		 * same amount of time), this has the undesirable side-effect
+		 * of the reported times moving backwards over successive
+		 * reads.  Instead, we report the total time
+ * (read + write) for all three stats (read, write, total).
+ * This is also a lie of sorts, but it should be more
+ * immediately clear to the user that reads and writes are
+ * each being double-counted as the other.
+ *
+ * Since certain consumers interpret the major/minor numbers to
+ * infer device names, some translation is required to avoid
+		 * totally unexpected output.
+ */
+
+ lxpr_uiobuf_printf(uiobuf, "%4d %7d %s ",
+ getmajor(vd->lxvd_emul_dev),
+ getminor(vd->lxvd_emul_dev),
+ vd->lxvd_name);
+
+ if (vd->lxvd_type == LXVD_ZFS_DS) {
+ /*
+ * Use the zone-wide vfs stats for any zfs datasets
+ * represented via virtual devices.
+ */
+#define KV(N) kip->zv_ ## N.value.ui64
+#define NS_PER_MS (uint64_t)(NANOSEC / MILLISEC)
+ lxpr_uiobuf_printf(uiobuf,
+ "%llu %llu %llu %llu "
+ "%llu %llu %llu %llu "
+ "%llu %llu %llu\n",
+ (uint64_t)KV(reads), 0LL,
+ KV(nread) / (uint64_t)LXPR_SECTOR_SIZE,
+ (KV(rtime) + KV(wtime)) / NS_PER_MS,
+ (uint64_t)KV(writes), 0LL,
+ KV(nwritten) / (uint64_t)LXPR_SECTOR_SIZE,
+ (KV(rtime) + KV(wtime)) / NS_PER_MS,
+ (uint64_t)(KV(rcnt) + KV(wcnt)),
+ (KV(rtime) + KV(wtime)) / NS_PER_MS,
+ (KV(rlentime) + KV(wlentime)) / NS_PER_MS);
+#undef KV
+#undef NS_PER_MS
+ } else {
+ /*
+ * Report nearly-zeroed statistics for other devices.
+ *
+ * Since iostat will ignore devices which report no
+			 * successful reads or writes, a single read of one
+ * sector, taking 1ms, is reported.
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "1 0 1 1 0 0 0 0 0 0 0\n");
+ }
+
+ vd = list_next(lxzd->lxzd_vdisks, vd);
+ }
+
+ kmem_free(kip, size);
+}
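+
+/*
+ * Illustration of the KV() shorthand used above: KV(reads) expands to
+ * kip->zv_reads.value.ui64, i.e. the ui64 payload of the "reads" member
+ * of the zone_vfs kstat set.
+ */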
+
+/*
+ * lxpr_read_version(): read the contents of the "version" file.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp));
+ lx_proc_data_t *lxpd = ptolxproc(curproc);
+ char release[LX_KERN_RELEASE_MAX];
+ char version[LX_KERN_VERSION_MAX];
+
+ mutex_enter(&lxzd->lxzd_lock);
+ (void) strlcpy(release, lxzd->lxzd_kernel_release, sizeof (release));
+ (void) strlcpy(version, lxzd->lxzd_kernel_version, sizeof (version));
+ mutex_exit(&lxzd->lxzd_lock);
+
+ /* Use per-process overrides, if specified */
+ if (lxpd != NULL && lxpd->l_uname_release[0] != '\0') {
+ (void) strlcpy(release, lxpd->l_uname_release,
+ sizeof (release));
+ }
+ if (lxpd != NULL && lxpd->l_uname_version[0] != '\0') {
+ (void) strlcpy(version, lxpd->l_uname_version,
+ sizeof (version));
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%s version %s (%s version %d.%d.%d) %s\n",
+ LX_UNAME_SYSNAME, release,
+#if defined(__GNUC__)
+ "gcc", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__,
+#else
+ "cc", 1, 0, 0,
+#endif
+ version);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_vmstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+
+ ulong_t pgpgin_cum = 0;
+ ulong_t pgpgout_cum = 0;
+ ulong_t pgswapout_cum = 0;
+ ulong_t pgswapin_cum = 0;
+
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+ /* Calculate cumulative stats */
+ cp = cpstart = CPU->cpu_part->cp_cpulist;
+ do {
+		/*
+		 * Only count CPUs which are present and active.  Advance
+		 * before continuing, since a bare continue in a do-while
+		 * would re-test the same CPU forever.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+			cp = pools_enabled ? cp->cpu_next_part :
+			    cp->cpu_next;
+			continue;
+		}
+
+ pgpgin_cum += CPU_STATS(cp, vm.pgpgin);
+ pgpgout_cum += CPU_STATS(cp, vm.pgpgout);
+ pgswapin_cum += CPU_STATS(cp, vm.pgswapin);
+ pgswapout_cum += CPU_STATS(cp, vm.pgswapout);
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+ mutex_exit(&cpu_lock);
+
+ /*
+ * Needless to say, the metrics presented by vmstat are very specific
+ * to the internals of the Linux kernel. There is little per-zone
+ * information which can be translated in a meaningful way to fit the
+ * expected fields. For the time being, the output is kept sparse.
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "pgpgin %lu\n"
+ "pgpgout %lu\n"
+ "pswpin %lu\n"
+ "pswpout %lu\n",
+ pgpgin_cum,
+ pgpgout_cum,
+ pgswapin_cum,
+ pgswapout_cum);
+}
+
+/*
+ * lxpr_read_stat(): read the contents of the "stat" file.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ ulong_t idle_cum = 0;
+ ulong_t sys_cum = 0;
+ ulong_t user_cum = 0;
+ ulong_t irq_cum = 0;
+ ulong_t cpu_nrunnable_cum = 0;
+ ulong_t w_io_cum = 0;
+
+ ulong_t pgpgin_cum = 0;
+ ulong_t pgpgout_cum = 0;
+ ulong_t pgswapout_cum = 0;
+ ulong_t pgswapin_cum = 0;
+ ulong_t intr_cum = 0;
+ ulong_t pswitch_cum = 0;
+ ulong_t forks_cum = 0;
+ hrtime_t msnsecs[NCMSTATES];
+ /* is the emulated release > 2.4 */
+ boolean_t newer_than24 = lx_kern_release_cmp(LXPTOZ(lxpnp), "2.4") > 0;
+ zone_t *zone = LXPTOZ(lxpnp);
+ const char *fmtstr0, *fmtstr1;
+ /* temporary variable since scalehrtime modifies data in place */
+ hrtime_t tmptime;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_STAT);
+
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ /* Calculate cumulative stats */
+ cp = cpstart = CPU->cpu_part->cp_cpulist;
+ do {
+ int i;
+
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.  Advance before continuing so
+		 * the do-while cannot re-test the same CPU forever.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+			cp = pools_enabled ? cp->cpu_next_part :
+			    cp->cpu_next;
+			continue;
+		}
+
+ get_cpu_mstate(cp, msnsecs);
+
+ idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+ sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+ user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+ pgpgin_cum += CPU_STATS(cp, vm.pgpgin);
+ pgpgout_cum += CPU_STATS(cp, vm.pgpgout);
+ pgswapin_cum += CPU_STATS(cp, vm.pgswapin);
+ pgswapout_cum += CPU_STATS(cp, vm.pgswapout);
+
+ if (newer_than24) {
+ cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable;
+ w_io_cum += CPU_STATS(cp, sys.iowait);
+ for (i = 0; i < NCMSTATES; i++) {
+ tmptime = cp->cpu_intracct[i];
+ scalehrtime(&tmptime);
+ irq_cum += NSEC_TO_TICK(tmptime);
+ }
+ }
+
+ for (i = 0; i < PIL_MAX; i++)
+ intr_cum += CPU_STATS(cp, sys.intr[i]);
+
+ pswitch_cum += CPU_STATS(cp, sys.pswitch);
+ forks_cum += CPU_STATS(cp, sys.sysfork);
+ forks_cum += CPU_STATS(cp, sys.sysvfork);
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ if (lx_kern_release_cmp(zone, "2.6.33") >= 0) {
+ fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0 0 0 0\n";
+ fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0 0 0 0\n";
+ } else if (lx_kern_release_cmp(zone, "2.6.24") >= 0) {
+ fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0 0 0\n";
+ fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0 0 0\n";
+ } else if (lx_kern_release_cmp(zone, "2.6.11") >= 0) {
+ fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0 0\n";
+ fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0 0\n";
+ } else if (lx_kern_release_cmp(zone, "2.5.41") >= 0) {
+ fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0\n";
+ fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0\n";
+ } else {
+ /* Note: we pass an unused param to these fmt strings */
+ fmtstr0 = "cpu %lu 0 %lu %lu\n";
+ fmtstr1 = "cpu%d %lu 0 %lu %lu\n";
+ }
+
+ /* Adjust hz */
+ user_cum = HZ_TO_LX_USERHZ(user_cum);
+ sys_cum = HZ_TO_LX_USERHZ(sys_cum);
+ idle_cum = HZ_TO_LX_USERHZ(idle_cum);
+ irq_cum = HZ_TO_LX_USERHZ(irq_cum);
+
+ lxpr_uiobuf_printf(uiobuf, fmtstr0,
+ user_cum, sys_cum, idle_cum, irq_cum);
+
+ /* Do per processor stats */
+ do {
+ int i;
+
+ ulong_t idle_ticks;
+ ulong_t sys_ticks;
+ ulong_t user_ticks;
+ ulong_t irq_ticks = 0;
+
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.  Advance before continuing so
+		 * the do-while cannot re-test the same CPU forever.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+			cp = pools_enabled ? cp->cpu_next_part :
+			    cp->cpu_next;
+			continue;
+		}
+
+ get_cpu_mstate(cp, msnsecs);
+
+ idle_ticks = HZ_TO_LX_USERHZ(NSEC_TO_TICK(msnsecs[CMS_IDLE]));
+ sys_ticks = HZ_TO_LX_USERHZ(NSEC_TO_TICK(msnsecs[CMS_SYSTEM]));
+ user_ticks = HZ_TO_LX_USERHZ(NSEC_TO_TICK(msnsecs[CMS_USER]));
+
+ for (i = 0; i < NCMSTATES; i++) {
+ tmptime = cp->cpu_intracct[i];
+ scalehrtime(&tmptime);
+ irq_ticks += NSEC_TO_TICK(tmptime);
+ }
+ irq_ticks = HZ_TO_LX_USERHZ(irq_ticks);
+
+		lxpr_uiobuf_printf(uiobuf, fmtstr1, cp->cpu_id,
+ user_ticks, sys_ticks, idle_ticks, irq_ticks);
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ mutex_exit(&cpu_lock);
+
+ if (newer_than24) {
+ lxpr_uiobuf_printf(uiobuf,
+ "page %lu %lu\n"
+ "swap %lu %lu\n"
+ "intr %lu\n"
+ "ctxt %lu\n"
+ "btime %lu\n"
+ "processes %lu\n"
+ "procs_running %lu\n"
+ "procs_blocked %lu\n",
+ pgpgin_cum, pgpgout_cum,
+ pgswapin_cum, pgswapout_cum,
+ intr_cum,
+ pswitch_cum,
+ zone->zone_boot_time,
+ forks_cum,
+ cpu_nrunnable_cum,
+ w_io_cum);
+ } else {
+ lxpr_uiobuf_printf(uiobuf,
+ "page %lu %lu\n"
+ "swap %lu %lu\n"
+ "intr %lu\n"
+ "ctxt %lu\n"
+ "btime %lu\n"
+ "processes %lu\n",
+ pgpgin_cum, pgpgout_cum,
+ pgswapin_cum, pgswapout_cum,
+ intr_cum,
+ pswitch_cum,
+ zone->zone_boot_time,
+ forks_cum);
+ }
+}
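+
+/*
+ * For illustration (hypothetical tick counts), a zone emulating a
+ * >= 2.6.33 kernel would see a summary line such as:
+ *
+ *	cpu 10218 0 4431 1592710 0 12 0 0 0 0
+ *
+ * i.e. user, nice (always 0 here), system, idle, iowait (always 0),
+ * irq, then zeros for the remaining fields.
+ */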
+
+/*
+ * lxpr_read_swaps():
+ *
+ * We don't support swap files or partitions, but some programs like to look
+ * here just to check we have some swap on the system, so we lie and show
+ * our entire swap cap as one swap partition. See lxpr_read_meminfo for an
+ * explanation of why we report 0 used swap.
+ *
+ * The zone's lxzd_swap_disabled boolean controls whether or not we pretend
+ * swap space is configured.
+ *
+ * It is important to use formatting identical to the Linux implementation
+ * so that consumers do not break. See swap_show() in mm/swapfile.c.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_swaps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ boolean_t swap_enabled;
+ lx_zone_data_t *lxzd = ztolxzd(zone);
+
+ ASSERT(zone->zone_brand == &lx_brand);
+ ASSERT(lxzd != NULL);
+ swap_enabled = !lxzd->lxzd_swap_disabled;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+
+ if (swap_enabled) {
+ uint64_t totswap, usedswap;
+
+ if (zone->zone_max_swap_ctl == UINT64_MAX) {
+ totswap = (k_anoninfo.ani_max * PAGESIZE) >> 10;
+ } else {
+ mutex_enter(&zone->zone_mem_lock);
+			/* Units of kB (2^10 bytes). */
+ totswap = zone->zone_max_swap_ctl >> 10;
+ mutex_exit(&zone->zone_mem_lock);
+ }
+ usedswap = 0;
+
+ lxpr_uiobuf_printf(uiobuf, "%-40s%s\t%llu\t%llu\t%d\n",
+ "/dev/swap", "partition", totswap, usedswap, -1);
+ }
+}
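+
+/*
+ * With swap enabled and a hypothetical 2 GB swap cap, the file reads:
+ *
+ *	Filename				Type		Size	Used	Priority
+ *	/dev/swap				partition	2097152	0	-1
+ */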
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_fs_aiomax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_AIO_MAX_NR);
+ lxpr_uiobuf_printf(uiobuf, "%llu\n", LX_AIO_MAX_NR);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_fs_aionr(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ lx_zone_data_t *lxzd = ztolxzd(zone);
+ uint64_t curr;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_AIO_NR);
+ ASSERT(zone->zone_brand == &lx_brand);
+ ASSERT(lxzd != NULL);
+
+ mutex_enter(&lxzd->lxzd_lock);
+ curr = (uint64_t)(lxzd->lxzd_aio_nr);
+ mutex_exit(&lxzd->lxzd_lock);
+ lxpr_uiobuf_printf(uiobuf, "%llu\n", curr);
+}
+
+/*
+ * lxpr_read_sys_fs_filemax():
+ *
+ * The zone's total number of open files is not fixed or tunable, but we can
+ * provide a number by taking:
+ * (zone's proc limit) * (process.max-file-descriptor rctl privileged limit).
+ * The privileged rctl limit is the same as rlim_fd_max.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_fs_filemax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ uint64_t max_fh, proc_lim;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_FILEMAX);
+ proc_lim = (uint64_t)(zone->zone_nprocs_ctl == INT_MAX ?
+ maxpid : zone->zone_nprocs_ctl);
+ max_fh = proc_lim * (uint64_t)rlim_fd_max;
+ lxpr_uiobuf_printf(uiobuf, "%llu\n", max_fh);
+}
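+
+/*
+ * Worked example (hypothetical limits): a zone proc limit of 30000 and
+ * an rlim_fd_max of 65536 give 30000 * 65536 = 1966080000 as the
+ * reported maximum number of file handles.
+ */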
+
+/*
+ * lxpr_read_sys_fs_filenr():
+ *
+ * Contains 3 numbers: current number of allocated file handles (open files),
+ * number of free file handles, and max. number of file handles (same value as
+ * we use in lxpr_read_sys_fs_filemax). Note that since Linux 2.6 the "free"
+ * value is always 0, so we just do the same here. We don't keep track of the
+ * number of files in use within a zone, so we approximate that value by
+ * looking at the current "fi_nfiles" value for each process in the zone.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_fs_filenr(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ uint64_t max_fh, proc_lim, curr_files = 0;
+ int i;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_FILENR);
+ proc_lim = (uint64_t)(zone->zone_nprocs_ctl == INT_MAX ?
+ maxpid : zone->zone_nprocs_ctl);
+ max_fh = proc_lim * (uint64_t)rlim_fd_max;
+
+ for (i = 1; i < v.v_proc; i++) {
+ uint_t nfiles;
+ proc_t *p;
+ uf_info_t *fip;
+
+ mutex_enter(&pidlock);
+
+ if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL ||
+ p->p_pid == 0 || p->p_zone != zone ||
+ p == zone->zone_zsched ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+ mutex_exit(&pidlock);
+ continue;
+ }
+
+ fip = P_FINFO(p);
+ mutex_enter(&fip->fi_lock);
+ nfiles = fip->fi_nfiles;
+ mutex_exit(&fip->fi_lock);
+
+ mutex_exit(&pidlock);
+
+ curr_files += nfiles;
+ }
+
+ lxpr_uiobuf_printf(uiobuf, "%llu\t0\t%llu\n", curr_files, max_fh);
+}
+
+/*
+ * inotify tunables exported via /proc.
+ */
+extern int inotify_maxevents;
+extern int inotify_maxinstances;
+extern int inotify_maxwatches;
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *lxpnp,
+ lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxevents);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *lxpnp,
+ lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxinstances);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *lxpnp,
+ lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxwatches);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_fs_pipe_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp));
+ uint_t pipe_max;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_PIPE_MAX);
+ ASSERT(lxzd != NULL);
+
+ mutex_enter(&lxzd->lxzd_lock);
+ pipe_max = lxzd->lxzd_pipe_max_sz;
+ mutex_exit(&lxzd->lxzd_lock);
+
+ lxpr_uiobuf_printf(uiobuf, "%u\n", pipe_max);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_caplcap(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_CAPLCAP);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", LX_CAP_MAX_VALID);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_corepatt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ struct core_globals *cg;
+ refstr_t *rp;
+ corectl_path_t *ccp;
+ char tr[MAXPATHLEN];
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT);
+
+ cg = zone_getspecific(core_zone_key, zone);
+ ASSERT(cg != NULL);
+
+ /* If core dumps are disabled, return an empty string. */
+ if ((cg->core_options & CC_PROCESS_PATH) == 0) {
+ lxpr_uiobuf_printf(uiobuf, "\n");
+ return;
+ }
+
+ ccp = cg->core_default_path;
+ mutex_enter(&ccp->ccp_mtx);
+ if ((rp = ccp->ccp_path) != NULL)
+ refstr_hold(rp);
+ mutex_exit(&ccp->ccp_mtx);
+
+ if (rp == NULL) {
+ lxpr_uiobuf_printf(uiobuf, "\n");
+ return;
+ }
+
+ bzero(tr, sizeof (tr));
+ if (lxpr_core_path_s2l(refstr_value(rp), tr, sizeof (tr)) != 0) {
+ refstr_rele(rp);
+ lxpr_uiobuf_printf(uiobuf, "\n");
+ return;
+ }
+
+ refstr_rele(rp);
+ lxpr_uiobuf_printf(uiobuf, "%s\n", tr);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_hostname(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_HOSTNAME);
+ lxpr_uiobuf_printf(uiobuf, "%s\n", uts_nodename());
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_msgmax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ /*
+ * We don't have an rctl for this. See our definition for LX_MSGMAX
+ * in the user-level emulation library. Once that code moves into
+ * the kernel, we can use a common definition. This matches the
+ * value on Linux.
+ */
+ uint_t val = 8192;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMAX);
+
+ lxpr_uiobuf_printf(uiobuf, "%u\n", val);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_msgmnb(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ rctl_qty_t val;
+ proc_t *pp = curproc;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMNB);
+
+ mutex_enter(&pp->p_lock);
+ val = rctl_enforced_value(rc_process_msgmnb, pp->p_rctls, pp);
+ mutex_exit(&pp->p_lock);
+
+ lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_msgmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ rctl_qty_t val;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMNI);
+
+ mutex_enter(&curproc->p_lock);
+ val = rctl_enforced_value(rc_zone_msgmni,
+ LXPTOZ(lxpnp)->zone_rctls, curproc);
+ mutex_exit(&curproc->p_lock);
+
+ lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_NGROUPS_MAX);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", ngroups_max);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_osrel(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ lx_zone_data_t *lxzd = ztolxzd(zone);
+ char version[LX_KERN_VERSION_MAX];
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_OSREL);
+ ASSERT(zone->zone_brand == &lx_brand);
+ ASSERT(lxzd != NULL);
+
+ mutex_enter(&lxzd->lxzd_lock);
+ (void) strlcpy(version, lxzd->lxzd_kernel_version, sizeof (version));
+ mutex_exit(&lxzd->lxzd_lock);
+ lxpr_uiobuf_printf(uiobuf, "%s\n", version);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_pid_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_PID_MAX);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", maxpid);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ /*
+ * This file isn't documented on the Linux proc(5) man page but
+ * according to the blog of the author of systemd/journald (the
+ * consumer), he says:
+ * boot_id: A random ID that is regenerated on each boot. As such it
+ * can be used to identify the local machine's current boot. It's
+ * universally available on any recent Linux kernel. It's a good and
+ * safe choice if you need to identify a specific boot on a specific
+ * booted kernel.
+ *
+ * We'll just generate a random ID if necessary. On Linux the format
+ * appears to resemble a uuid but since it is not documented to be a
+ * uuid, we don't worry about that.
+ */
+ zone_t *zone = LXPTOZ(lxpnp);
+ lx_zone_data_t *lxzd = ztolxzd(zone);
+ char bootid[LX_BOOTID_LEN];
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RAND_BOOTID);
+ ASSERT(zone->zone_brand == &lx_brand);
+ ASSERT(lxzd != NULL);
+
+ mutex_enter(&lxzd->lxzd_lock);
+ if (lxzd->lxzd_bootid[0] == '\0') {
+ int i;
+
+ for (i = 0; i < 5; i++) {
+ u_longlong_t n;
+ char s[32];
+
+ (void) random_get_bytes((uint8_t *)&n, sizeof (n));
+ switch (i) {
+ case 0: (void) snprintf(s, sizeof (s), "%08llx", n);
+ s[8] = '\0';
+ break;
+ case 4: (void) snprintf(s, sizeof (s), "%012llx", n);
+ s[12] = '\0';
+ break;
+ default: (void) snprintf(s, sizeof (s), "%04llx", n);
+ s[4] = '\0';
+ break;
+ }
+ if (i > 0)
+ (void) strlcat(lxzd->lxzd_bootid, "-",
+ sizeof (lxzd->lxzd_bootid));
+ (void) strlcat(lxzd->lxzd_bootid, s,
+ sizeof (lxzd->lxzd_bootid));
+ }
+ }
+ (void) strlcpy(bootid, lxzd->lxzd_bootid, sizeof (bootid));
+ mutex_exit(&lxzd->lxzd_lock);
+
+ lxpr_uiobuf_printf(uiobuf, "%s\n", bootid);
+}
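+
+/*
+ * The generated ID has the 8-4-4-4-12 grouping of a uuid; a (made-up)
+ * sample value: 4f9a2c1e-03b7-9d42-5e6a-8c1f03b79d42
+ */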
+
+/*
+ * The amount of entropy available (in bits).
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_rand_entavl(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RAND_ENTAVL);
+ ASSERT(LXPTOZ(lxpnp)->zone_brand == &lx_brand);
+
+ lxpr_uiobuf_printf(uiobuf, "%d\n", swrand_stats.ss_entEst);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_sem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *pp = curproc;
+ zone_t *zone = LXPTOZ(lxpnp);
+ rctl_qty_t vmsl, vopm, vmni, vmns;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SEM);
+
+ mutex_enter(&pp->p_lock);
+ vmsl = rctl_enforced_value(rc_process_semmsl, pp->p_rctls, pp);
+ vopm = rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
+ vmni = rctl_enforced_value(rc_zone_semmni, zone->zone_rctls, pp);
+ mutex_exit(&pp->p_lock);
+ vmns = vmsl * vmni;
+ if (vmns < vmsl || vmns < vmni) {
+ vmns = ULLONG_MAX;
+ }
+ /*
+ * Format: semmsl semmns semopm semmni
+	 *  - semmsl: Limit on semaphores in a semaphore set
+	 *  - semmns: Limit on semaphores in all semaphore sets
+	 *  - semopm: Limit on operations in a single semop call
+	 *  - semmni: Limit on the number of semaphore sets
+ */
+ lxpr_uiobuf_printf(uiobuf, "%llu\t%llu\t%llu\t%llu\n",
+ vmsl, vmns, vopm, vmni);
+}
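+
+/*
+ * Example (hypothetical rctl values): semmsl == 512, semopm == 512 and
+ * semmni == 128 give semmns == 512 * 128 == 65536, so the file reads:
+ *
+ *	512	65536	512	128
+ */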
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_shmall(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ rctl_qty_t val;
+ zone_t *zone = LXPTOZ(lxpnp);
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMALL);
+
+ mutex_enter(&curproc->p_lock);
+ val = rctl_enforced_value(rc_zone_shmmax, zone->zone_rctls, curproc);
+ mutex_exit(&curproc->p_lock);
+
+ /* value is in pages */
+ lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)btop(val));
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_shmmax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ rctl_qty_t val;
+ zone_t *zone = LXPTOZ(lxpnp);
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMAX);
+
+ mutex_enter(&curproc->p_lock);
+ val = rctl_enforced_value(rc_zone_shmmax, zone->zone_rctls, curproc);
+ mutex_exit(&curproc->p_lock);
+
+ if (val > FOURGB)
+ val = FOURGB;
+
+ lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_shmmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ rctl_qty_t val;
+ zone_t *zone = LXPTOZ(lxpnp);
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMNI);
+
+ mutex_enter(&curproc->p_lock);
+ val = rctl_enforced_value(rc_zone_shmmni, zone->zone_rctls, curproc);
+ mutex_exit(&curproc->p_lock);
+
+ if (val > FOURGB)
+ val = FOURGB;
+
+ lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_kernel_threads_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_THREADS_MAX);
+	lxpr_uiobuf_printf(uiobuf, "%d\n",
+	    (int)LXPTOZ(lxpnp)->zone_nlwps_ctl);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_core_somaxc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_printf(uiobuf, "%d\n", SOMAXCONN);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+ lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_conn_req_max_q);
+ netstack_rele(ns);
+}
+
+/*
+ * icmp_echo_ignore_broadcasts
+ * integer; 0 or 1
+ *
+ * illumos: ndd /dev/ip ip_respond_to_echo_broadcast
+ * From the tunable guide: Control whether IPv4 responds to broadcast ICMPv4
+ * echo request. default: 1 (enabled)
+ * Not in ip(7p) man page.
+ *
+ * Note that the Linux setting is the inverse of the illumos value.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_icmp_eib(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ ip_stack_t *ipst;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_ICMP_EIB);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ ipst = ns->netstack_ip;
+ lxpr_uiobuf_printf(uiobuf, "%d\n", !ipst->ips_ip_g_resp_to_echo_bcast);
+ netstack_rele(ns);
+}
+
+/*
+ * ip_forward
+ * integer; default: 0
+ *
+ * illumos: ndd /dev/ip ip_forwarding
+ * default: 0 (disabled)
+ * Forwarding is described in the ip(7p) man page. We do not support forwarding
+ * in lx at this time, thus we do not support Linux-ABI methods for
+ * enabling/disabling forwarding, and this is always 0.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_ip_forward(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_FORWARD);
+ lxpr_uiobuf_printf(uiobuf, "0\n");
+}
+
+/*
+ * ip_local_port_range
+ *
+ * The low & high port number range.
+ * integers; default: 32768 61000
+ *
+ * illumos: tcp_smallest_anon_port & tcp_largest_anon_port
+ * Not in tcp(7p) man page.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+ lxpr_uiobuf_printf(uiobuf, "%d\t%d\n",
+ tcps->tcps_smallest_anon_port, tcps->tcps_largest_anon_port);
+ netstack_rele(ns);
+}
+
+/*
+ * tcp_fin_timeout
+ *
+ * This specifies how many seconds to wait for a final FIN packet before the
+ * socket is forcibly closed. This is strictly a violation of the TCP
+ * specification, but required to prevent denial-of-service attacks.
+ * integer; default: 60;
+ *
+ * illumos: tcp_fin_wait_2_flush_interval
+ * Not in tcp(7p) man page but see comment in uts/common/inet/tcp/tcp_input.c
+ * in the tcp_input_data() function on the use of tcp_fin_wait_2_flush_interval.
+ * The value is in milliseconds.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+ lxpr_uiobuf_printf(uiobuf, "%d\n",
+ tcps->tcps_fin_wait_2_flush_interval / 1000);
+ netstack_rele(ns);
+}
+
+/*
+ * tcp_keepalive_intvl
+ *
+ * The number of seconds between TCP keep-alive probes. default: 75
+ * Linux retries tcp_keepalive_probes (9) times before timing out.
+ *
+ * illumos:
+ * We have tcp_ka_rinterval but there is no corresponding tcps_* tunable for
+ * this. The closest is tcps_keepalive_abort_interval which specifies the
+ * time threshold for aborting a TCP connection in milliseconds. Linux retries
+ * 9 times (giving a total of 11.25 minutes), so we emulate this by dividing
+ * tcps_keepalive_abort_interval by 9.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+ lxpr_uiobuf_printf(uiobuf, "%d\n",
+ (tcps->tcps_keepalive_abort_interval / 1000) / 9);
+ netstack_rele(ns);
+}
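+
+/*
+ * Worked example (hypothetical value): a tcps_keepalive_abort_interval
+ * of 675000 ms yields 675000 / 1000 / 9 == 75 seconds, matching the
+ * Linux default probe interval noted above.
+ */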
+
+/*
+ * tcp_keepalive_time
+ *
+ * The number of seconds a connection needs to be idle before TCP begins
+ * sending out keep-alive probes. The default value is 7200 seconds (2 hours).
+ *
+ * illumos: tcp_keepalive_interval
+ * The interval for sending out the first probe in milliseconds. The default is
+ * two hours.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+ lxpr_uiobuf_printf(uiobuf, "%d\n",
+ (tcps->tcps_keepalive_interval / 1000));
+ netstack_rele(ns);
+}
+
+/*
+ * tcp_max_syn_backlog
+ *
+ * The number of half-open connections that can be kept by the backlog queue.
+ * See the Linux tcp(7) man page.
+ *
+ * illumos: tcp_conn_req_max_q0
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+ lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_conn_req_max_q0);
+ netstack_rele(ns);
+}
+
+/*
+ * tcp_retries2
+ *
+ * Controls number of TCP retries for data packets. Often tuned down for HA
+ * configurations. RFC 1122 recommends at least 100 seconds for the timeout,
+ * which, for Linux, corresponds to a value of ~8. Oracle suggests a value of
+ * 3 for a RAC configuration, as do various HA tuning guides.
+ * integer; Ubuntu 16.04 default: 15
+ *
+ * illumos: There are 4 ndd parameters that are related to this:
+ * tcp_rexmit_interval_initial: 1000
+ * tcp_rexmit_interval_min: 400
+ * tcp_rexmit_interval_max: 60000
+ * tcp_rexmit_interval_extra: 0
+ * Not in tcp(7p) man page.
+ *
+ * From the tunables guide:
+ * tcp_rexmit_interval_initial is the initial retransmission timeout (RTO) for
+ * a TCP connection in milliseconds (ms).
+ * The interval_min value is the minimum RTO in ms.
+ * The interval_max value is the maximum RTO in ms.
+ * The extra value is an extra time (in ms) to add in to the RTO.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_tcp_retry2(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+ uint_t i, retry, rx_min, rx_max;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RETRY2);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+ rx_min = tcps->tcps_rexmit_interval_min;
+ rx_max = tcps->tcps_rexmit_interval_max;
+ netstack_rele(ns);
+
+ for (i = rx_min, retry = 0; i < rx_max; retry++) {
+ i *= 2;
+ }
+
+ lxpr_uiobuf_printf(uiobuf, "%u\n", retry);
+}
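+
+/*
+ * With the default intervals noted above (min 400 ms, max 60000 ms) the
+ * doubling loop walks 400, 800, ..., 51200, 102400; eight doublings are
+ * needed before the maximum is exceeded, so "8" is reported, in line
+ * with the ~8 retries Linux derives from RFC 1122's 100-second minimum.
+ */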
+
+/*
+ * tcp_rmem and tcp_wmem
+ *
+ * Display the minimum, default, and maximum TCP receive/transmit window sizes,
+ * in bytes. See the Linux tcp(7) man page.
+ *
+ * In illumos this roughly corresponds to: tcp_recv_hiwat or tcp_xmit_hiwat,
+ * and tcp_max_buf.
+ * tcp_recv_hiwat is the default TCP receive window size
+ * tcp_xmit_hiwat is the default TCP send window size
+ * tcp_max_buf is the maximum TCP send and receive buffer size
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_tcp_rwmem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+ uint_t min;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ||
+ lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WMEM);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+
+ /* Linux defaults to a page */
+ min = MIN((lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ?
+ tcps->tcps_recv_hiwat : tcps->tcps_xmit_hiwat), PAGESIZE);
+
+ lxpr_uiobuf_printf(uiobuf, "%d\t%d\t%d\n",
+ min,
+ (lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ?
+ tcps->tcps_recv_hiwat : tcps->tcps_xmit_hiwat),
+ tcps->tcps_max_buf);
+ netstack_rele(ns);
+}
+
+/*
+ * tcp_sack
+ *
+ * Enable RFC 2018 TCP Selective Acknowledgements. Boolean, default: enabled
+ *
+ * illumos: tcp_sack_permitted
+ * tcp_sack_permitted 0 == disabled, 1 == no initiate but accept,
+ * 2 == initiate and accept. default is 2.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+ lxpr_uiobuf_printf(uiobuf, "%d\n",
+ (tcps->tcps_sack_permitted == 0 ? 0 : 1));
+ netstack_rele(ns);
+}
+
+/*
+ * tcp_window_scaling
+ *
+ * RFC 1323 TCP window scaling. This feature allows the use of a large window
+ * (> 64K) on a TCP connection. Boolean; default: enabled
+ *
+ * illumos: tcp_wscale_always
+ * If tcp_wscale_always is set to 1, the window scale option will always be
+ * set when connecting to a remote system. If tcp_wscale_always is 0, the
+ * window scale option will be set only if the user has requested a send or
+ * receive window larger than 64K. The default value is 1.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, ENXIO);
+ return;
+ }
+
+ tcps = ns->netstack_tcp;
+ lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_wscale_always);
+ netstack_rele(ns);
+}
+
+/*
+ * The /proc/sys/vm/dirty* files are (poorly) documented in the Linux
+ * source file Documentation/sysctl/vm.txt. These are various VM tunables
+ * that we'll never support, but that a few misguided apps want to inspect and
+ * modify. We simply hardcode some default values and we'll lie about write
+ * success to these files.
+ */
+static void
+lxpr_read_sys_vm_dirty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ uint_t val;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_BG_BYTES ||
+ lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_BG_RATIO ||
+ lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_BYTES ||
+ lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_EXP_CS ||
+ lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_RATIO ||
+ lxpnp->lxpr_type == LXPR_SYS_VM_DIRTYTIME_EXP_SEC ||
+ lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_WB_CS);
+
+ switch (lxpnp->lxpr_type) {
+ case LXPR_SYS_VM_DIRTY_BG_RATIO:
+ val = 10;
+ break;
+ case LXPR_SYS_VM_DIRTY_EXP_CS:
+ val = 3000;
+ break;
+ case LXPR_SYS_VM_DIRTY_RATIO:
+ val = 20;
+ break;
+ case LXPR_SYS_VM_DIRTYTIME_EXP_SEC:
+ val = 43200;
+ break;
+ case LXPR_SYS_VM_DIRTY_WB_CS:
+ val = 500;
+ break;
+ default:
+ val = 0;
+ break;
+ }
+
+ lxpr_uiobuf_printf(uiobuf, "%u\n", val);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MAX_MAP_CNT);
+ /* We don't limit mappings, just say we have a large limit. */
+ lxpr_uiobuf_printf(uiobuf, "%d\n", 16777215);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_vm_minfr_kb(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MINFR_KB);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", 0);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_vm_nhpages(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_NHUGEP);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", 0);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_OVERCOMMIT_MEM);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", 0);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_sys_vm_swappiness(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_SWAPPINESS);
+ lxpr_uiobuf_printf(uiobuf, "%d\n", 0);
+}
+
+/*
+ * lxpr_read_uptime(): read the contents of the "uptime" file.
+ *
+ * format is: "%.2lf, %.2lf",uptime_secs, idle_secs
+ * Use fixed point arithmetic to get 2 decimal places
+ */
+/* ARGSUSED */
+static void
+lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ ulong_t idle_cum = 0;
+ ulong_t cpu_count = 0;
+ ulong_t idle_s;
+ ulong_t idle_cs;
+ ulong_t up_s;
+ ulong_t up_cs;
+ hrtime_t birthtime;
+ hrtime_t centi_sec = 10000000; /* 10^7 */
+
+ ASSERT(lxpnp->lxpr_type == LXPR_UPTIME);
+
+ /* Calculate cumulative stats */
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ cp = cpstart = CPU->cpu_part->cp_cpulist;
+ do {
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.  Advance before continuing so
+		 * the do-while cannot re-test the same CPU forever.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+			cp = pools_enabled ? cp->cpu_next_part :
+			    cp->cpu_next;
+			continue;
+		}
+
+ idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle);
+ idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait);
+ cpu_count += 1;
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+ mutex_exit(&cpu_lock);
+
+ /* Getting the Zone zsched process startup time */
+ birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart;
+ up_cs = (gethrtime() - birthtime) / centi_sec;
+ up_s = up_cs / 100;
+ up_cs %= 100;
+
+ ASSERT(cpu_count > 0);
+ idle_cum /= cpu_count;
+ idle_s = idle_cum / hz;
+ idle_cs = idle_cum % hz;
+ idle_cs *= 100;
+ idle_cs /= hz;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs);
+}
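+
+/*
+ * Fixed-point example (hypothetical uptime): 123456789012 ns of uptime
+ * is 12345 centiseconds (dividing by 10^7), which splits into
+ * up_s == 123 and up_cs == 45 and prints as "123.45".
+ */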
+
+/*
+ * Report a list of each cgroup subsystem supported by our emulated cgroup fs.
+ * This needs to exist for systemd to run but for now we don't report any
+ * cgroup subsystems as being installed. The commented example below shows
+ * how to print a subsystem entry.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_cgroups(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n",
+ "#subsys_name", "hierarchy", "num_cgroups", "enabled");
+
+ /*
+ * lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n",
+ * "cpu,cpuacct", "2", "1", "1");
+ */
+}
+
+/*
+ * Report the zone boot arguments.
+ */
+static void
+lxpr_read_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ lxpr_uiobuf_printf(uiobuf, "%s\n", zone->zone_bootargs);
+}
+
+
+typedef enum {
+ LXCS_ALWAYS = 0,
+ LXCS_CPUID1_ECX,
+ LXCS_CPUID1_EDX,
+ LXCS_CPUID7_EBX,
+ LXCS_CPUID7_ECX,
+ LXCS_CPUID7_EDX,
+ LXCS_CPUIDD1_EAX,
+ LXCS_CPUIDX1_ECX,
+ LXCS_CPUIDX1_EDX,
+ LXCS_REG_MAX
+} lx_cpuinfo_source_t;
+
+typedef struct {
+ lx_cpuinfo_source_t lxcm_source;
+ uint32_t lxcm_flag;
+ const char *lxcm_name;
+} lx_cpuinfo_mapping_t;
+
+/*
+ * This listing is derived from the X86_FEATURE flags data in the Linux kernel.
+ * Some entries are missing detection routines. They remain in the list,
+ * although commented out, to preserve proper order should they be fixed later.
+ */
+lx_cpuinfo_mapping_t lx_cpuinfo_mappings[] = {
+ /* CPUID EDX: */
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_FPU, "fpu" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_VME, "vme" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_DE, "de" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PSE, "pse" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_TSC, "tsc" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MSR, "msr" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PAE, "pae" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MCE, "mce" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_CX8, "cx8" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_APIC, "apic" },
+ /* reserved */
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SEP, "sep" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MTRR, "mtrr" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PGE, "pge" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MCA, "mca" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_CMOV, "cmov" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PAT, "pat" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PSE36, "pse36" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PSN, "pn" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_CLFSH, "clflush" },
+ /* reserved */
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_DS, "dts" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_ACPI, "acpi" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MMX, "mmx" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_FXSR, "fxsr" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SSE, "sse" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SSE2, "sse2" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SS, "ss" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_HTT, "ht" },
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_TM, "tm" },
+ /* reserved */
+ { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PBE, "pbe" },
+
+ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+#if defined(__amd64)
+ { LXCS_ALWAYS, 1, "syscall" },
+#endif
+ /* Present in the Linux listing but not in recent AMD docs: "mp" */
+ { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_NX, "nx" },
+ { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_MMXamd, "mmxext" },
+ { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_FFXSR, "fxsr_opt" },
+ { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_1GPG, "pdpe1gb" },
+ { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_TSCP, "rdtscp" },
+ { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_LM, "lm" },
+ { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_3DNowx, "3dnowext" },
+ { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_3DNow, "3dnow" },
+
+ /* CPUID ECX: */
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSE3, "pni" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_PCLMULQDQ, "pclmulqdq" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_DTES64, "dtes64" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_MON, "monitor" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_DSCPL, "ds_cpl" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_VMX, "vmx" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SMX, "smx" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_EST, "est" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_TM2, "tm2" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSSE3, "ssse3" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_CID, "cid" },
+ { LXCS_CPUID1_ECX, 0x00000800, "sdbg" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_FMA, "fma" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_CX16, "cx16" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_ETPRD, "xtpr" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_PDCM, "pdcm" },
+ /* reserved */
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_PCID, "pcid" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_DCA, "dca" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSE4_1, "sse4_1" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSE4_2, "sse4_2" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_X2APIC, "x2apic" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_MOVBE, "movbe" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_POPCNT, "popcnt" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_TSCDL, "tsc_deadline_timer" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_AES, "aes" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_XSAVE, "xsave" },
+ /* osxsave */
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_AVX, "avx" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_F16C, "f16c" },
+ { LXCS_CPUID1_ECX, CPUID_INTC_ECX_RDRAND, "rdrand" },
+ /* not used */
+
+ /*
+ * Other features, Linux-defined mapping
+ * This range is used for feature bits which conflict or are synthesized
+ * Skipped:
+ * "recovery",
+ * "longrun",
+ * "lrti",
+ * "cxmmx",
+ * "k6_mtrr",
+ * "cyrix_arr",
+ * "centaur_mcr",
+ * "constant_tsc",
+ * "up",
+ * "arch_perfmon",
+ * "pebs",
+ * "bts",
+ * "rep_good",
+ * "nopl",
+ * "xtopology",
+ * "tsc_reliable",
+ * "nonstop_tsc",
+ * "extd_apicid",
+ * "amd_dcm",
+ * "aperfmperf",
+ * "eagerfpu",
+ * "nonstop_tsc_s3",
+ *
+ * "hypervisor",
+ * "rng",
+ * "rng_en",
+ * "ace",
+ * "ace_en",
+ * "ace2",
+ * "ace2_en",
+ * "phe",
+ * "phe_en",
+ * "pmm",
+ * "pmm_en",
+ */
+
+ /*
+ * More extended AMD flags: CPUID level 0x80000001, ecx, word 6
+ */
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_AHF64, "lahf_lm" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_CMP_LGCY, "cmp_legacy" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_SVM, "svm" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_EAS, "extapic" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_CR8D, "cr8_legacy" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_LZCNT, "abm" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_SSE4A, "sse4a" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_MAS, "misalignsse" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_3DNP, "3dnowprefetch" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_OSVW, "osvw" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_IBS, "ibs" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_XOP, "xop" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_SKINIT, "skinit" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_WDT, "wdt" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_LWP, "lwp" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_FMA4, "fma4" },
+ { LXCS_CPUIDX1_ECX, 0x00020000, "tce" },
+
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_NIDMSR, "nodeid_msr" },
+
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_TBM, "tbm" },
+ { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_TOPOEXT, "topoext" },
+ { LXCS_CPUIDX1_ECX, 0x00800000, "perfctr_core" },
+ { LXCS_CPUIDX1_ECX, 0x01000000, "perfctr_nb" },
+ { LXCS_CPUIDX1_ECX, 0x02000000, "bpext" },
+ { LXCS_CPUIDX1_ECX, 0x04000000, "perfctr_l2" },
+ { LXCS_CPUIDX1_ECX, 0x08000000, "mwaitx" },
+
+ /*
+ * Aux flags and virt bits.
+ * Skipped:
+ * "cpb",
+ * "epb",
+ * "hw_pstate",
+ * "proc_feedback",
+ * "intel_pt",
+ * "tpr_shadow",
+ * "vnmi",
+ * "flexpriority",
+ * "ept",
+ * "vpid",
+ * "vmmcall",
+ */
+
+ /*
+ * Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9
+ */
+ { LXCS_CPUID7_EBX, 0x00000001, "fsgsbase" },
+ { LXCS_CPUID7_EBX, 0x00000002, "tsc_adjust" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_BMI1, "bmi1" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_HLE, "hle" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX2, "avx2" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_SMEP, "smep" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_BMI2, "bmi2" },
+ { LXCS_CPUID7_EBX, 0x00000200, "erms" },
+ { LXCS_CPUID7_EBX, 0x00000400, "invpcid" },
+ { LXCS_CPUID7_EBX, 0x00000800, "rtm" },
+ { LXCS_CPUID7_EBX, 0x00001000, "cqm" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_MPX, "mpx" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512F, "avx512f" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512DQ, "avx512dq" },
+
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_RDSEED, "rdseed" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_ADX, "adx" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_SMAP, "smap" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512IFMA, "avx512ifma" },
+
+ { LXCS_CPUID7_EBX, 0x00400000, "pcommit" },
+ { LXCS_CPUID7_EBX, 0x00800000, "clflushopt" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_CLWB, "clwb" },
+
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512PF, "avx512pf" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512ER, "avx512er" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512CD, "avx512cd" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_SHA, "sha_ni" },
+
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512BW, "avx512bw" },
+ { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512VL, "avx512vl" },
+
+ /*
+ * Intel-defined CPU features, CPUID level 0x00000007:0 (ecx)
+ */
+ { LXCS_CPUID7_ECX, CPUID_INTC_ECX_7_0_AVX512VBMI, "avx512vbmi" },
+ { LXCS_CPUID7_ECX, CPUID_INTC_ECX_7_0_AVX512VPOPCDQ,
+ "avx512_vpopcntdq" },
+
+ /*
+ * Intel-defined CPU features, CPUID level 0x00000007:0 (edx)
+ */
+ { LXCS_CPUID7_EDX, CPUID_INTC_EDX_7_0_AVX5124NNIW, "avx512_4nniw" },
+ { LXCS_CPUID7_EDX, CPUID_INTC_EDX_7_0_AVX5124FMAPS, "avx512_4fmaps" },
+
+ /*
+ * Extended state features, CPUID level 0x0000000d:1 (eax)
+ */
+ { LXCS_CPUIDD1_EAX, CPUID_INTC_EAX_D_1_XSAVEOPT, "xsaveopt" },
+ { LXCS_CPUIDD1_EAX, CPUID_INTC_EAX_D_1_XSAVEC, "xsavec" },
+ { LXCS_CPUIDD1_EAX, 0x00000004, "xgetbv1" },
+ { LXCS_CPUIDD1_EAX, CPUID_INTC_EAX_D_1_XSAVES, "xsaves" },
+
+ /*
+ * Skipped:
+ * "cqm_llc",
+ * "cqm_occup_llc",
+ * "clzero",
+ */
+
+ /*
+ * Thermal and Power Management Leaf, CPUID level 0x00000006 (eax)
+ * Skipped:
+ * "dtherm",
+ * "ida",
+ * "arat",
+ * "pln",
+ * "pts",
+ * "hwp",
+ * "hwp_notify",
+ * "hwp_act_window",
+ * "hwp_epp",
+ * "hwp_pkg_req",
+ */
+
+ /*
+ * AMD SVM Feature Identification, CPUID level 0x8000000a (edx)
+ * Skipped:
+ * "npt",
+ * "lbrv",
+ * "svm_lock",
+ * "nrip_save",
+ * "tsc_scale",
+ * "vmcb_clean",
+ * "flushbyasid",
+ * "decodeassists",
+ * "pausefilter",
+ * "pfthreshold",
+ */
+};
+
+#define LX_CPUINFO_MAPPING_MAX \
+ (sizeof (lx_cpuinfo_mappings) / sizeof (lx_cpuinfo_mappings[0]))
+
+/* ARGSUSED */
+static void
+lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ int i;
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ char brandstr[CPU_IDSTRLEN];
+
+ ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO);
+
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ cp = cpstart = CPU->cpu_part->cp_cpulist;
+ do {
+ struct cpuid_regs cpr;
+ uint32_t maxeax, xmaxeax, cpuid_res[LXCS_REG_MAX] = { 0 };
+
+ cpr.cp_eax = 0;
+ maxeax = cpuid_insn(cp, &cpr);
+ cpr.cp_eax = 0x80000000;
+ xmaxeax = cpuid_insn(cp, &cpr);
+
+ cpuid_res[LXCS_ALWAYS] = 1;
+ if (maxeax >= 1) {
+ cpr.cp_eax = 1;
+ (void) cpuid_insn(cp, &cpr);
+ cpuid_res[LXCS_CPUID1_ECX] = cpr.cp_ecx;
+ cpuid_res[LXCS_CPUID1_EDX] = cpr.cp_edx;
+ }
+ if (maxeax >= 7) {
+ cpr.cp_eax = 7;
+ (void) cpuid_insn(cp, &cpr);
+ cpuid_res[LXCS_CPUID7_EBX] = cpr.cp_ebx;
+ cpuid_res[LXCS_CPUID7_ECX] = cpr.cp_ecx;
+ cpuid_res[LXCS_CPUID7_EDX] = cpr.cp_edx;
+ }
+ if (maxeax >= 0xd) {
+ cpr.cp_eax = 0xd;
+ cpr.cp_ecx = 1;
+ (void) cpuid_insn(cp, &cpr);
+ cpuid_res[LXCS_CPUIDD1_EAX] = cpr.cp_eax;
+ }
+ if (xmaxeax >= 0x80000001) {
+ cpr.cp_eax = 0x80000001;
+ (void) cpuid_insn(cp, &cpr);
+ cpuid_res[LXCS_CPUIDX1_ECX] = cpr.cp_ecx;
+ cpuid_res[LXCS_CPUIDX1_EDX] = cpr.cp_edx;
+ }
+
+ (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "processor\t: %d\n"
+ "vendor_id\t: %s\n"
+ "cpu family\t: %d\n"
+ "model\t\t: %d\n"
+ "model name\t: %s\n"
+ "stepping\t: %d\n"
+ "cpu MHz\t\t: %u.%03u\n",
+ cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp),
+ cpuid_getmodel(cp), brandstr, cpuid_getstep(cp),
+ (uint32_t)(cpu_freq_hz / 1000000),
+ ((uint32_t)(cpu_freq_hz / 1000)) % 1000);
+
+ lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n",
+ getl2cacheinfo(cp, NULL, NULL, NULL) / 1024);
+
+ if (is_x86_feature(x86_featureset, X86FSET_HTT)) {
+ /*
+ * 'siblings' is used for HT-style threads
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "physical id\t: %lu\n"
+ "siblings\t: %u\n",
+ pg_plat_hw_instance_id(cp, PGHW_CHIP),
+ cpuid_get_ncpu_per_chip(cp));
+ }
+
+ /*
+ * Since we're relatively picky about running on older hardware,
+ * we can be somewhat cavalier about the answers to these ones.
+ *
+ * In fact, given the hardware we support, we just say:
+ *
+ * fdiv_bug : no (if we're on a 64-bit kernel)
+ * hlt_bug : no
+ * f00f_bug : no
+ * coma_bug : no
+ * wp : yes (write protect in supervisor mode)
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "fdiv_bug\t: %s\n"
+ "hlt_bug \t: no\n"
+ "f00f_bug\t: no\n"
+ "coma_bug\t: no\n"
+ "fpu\t\t: %s\n"
+ "fpu_exception\t: %s\n"
+ "cpuid level\t: %d\n"
+ "flags\t\t:",
+#if defined(__i386)
+ fpu_pentium_fdivbug ? "yes" : "no",
+#else
+ "no",
+#endif /* __i386 */
+ fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no",
+ maxeax);
+
+ /* Print CPUID feature flags */
+ for (i = 0; i < LX_CPUINFO_MAPPING_MAX; i++) {
+ lx_cpuinfo_mapping_t *lxm = &lx_cpuinfo_mappings[i];
+
+ ASSERT(lxm->lxcm_source < LXCS_REG_MAX);
+ if (cpuid_res[lxm->lxcm_source] & lxm->lxcm_flag) {
+ lxpr_uiobuf_printf(uiobuf, " %s",
+ lxm->lxcm_name);
+ }
+ }
+
+ lxpr_uiobuf_printf(uiobuf, "\n\n");
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ mutex_exit(&cpu_lock);
+}
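+
+/*
+ * A worked example of the "cpu MHz" arithmetic above: with a hypothetical
+ * cpu_freq_hz of 2397583000, the whole-MHz part is
+ * 2397583000 / 1000000 = 2397 and the fractional part is
+ * (2397583000 / 1000) % 1000 = 583, so the emitted line reads
+ * "cpu MHz         : 2397.583".
+ */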
+
+/* ARGSUSED */
+static void
+lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD);
+ lxpr_uiobuf_seterr(uiobuf, EFAULT);
+}
+
+/*
+ * Report a list of file systems loaded in the kernel. We only report the ones
+ * which we support and which may be checked by various components to see if
+ * they are loaded.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_filesystems(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "autofs");
+ lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "cgroup");
+ lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "nfs");
+ lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "proc");
+ lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "sysfs");
+ lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "tmpfs");
+}
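+
+/*
+ * The resulting /proc/filesystems output is one "nodev\t<name>" line per
+ * entry above, e.g.:
+ *
+ *	nodev	autofs
+ *	nodev	cgroup
+ *	nodev	tmpfs
+ */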
+
+/*
+ * Calculate the number of links in the task dir. Some code (e.g. chromium)
+ * depends on this value being accurate.
+ */
+static uint_t
+lxpr_count_taskdir(lxpr_node_t *lxpnp)
+{
+ proc_t *p;
+ uint_t cnt;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_TASKDIR);
+
+ p = lxpr_lock(lxpnp, ZOMB_OK);
+ if (p == NULL)
+ return (0);
+
+ cnt = lxpr_count_tasks(p);
+
+ lxpr_unlock(p);
+
+ /* Add the fixed entries ("." & "..") */
+ cnt += 2;
+ return (cnt);
+}
+
+/*
+ * lxpr_getattr(): Vnode operation for VOP_GETATTR()
+ */
+static int
+lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ register lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ extern uint_t nproc;
+ int error;
+
+ /*
+ * Return the attributes of the underlying vnode if ATTR_REAL is
+ * specified, but keep fd files with the symlink permissions.
+ */
+ if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) {
+ vnode_t *rvp = lxpnp->lxpr_realvp;
+
+ /*
+ * withhold attribute information from all but the owner or root
+ */
+ if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * now fetch its attributes
+ */
+ if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * if it's a file in lx /proc/pid/fd/xx then set its
+ * mode and keep it looking like a symlink, fifo or socket
+ */
+ if (type == LXPR_PID_FD_FD) {
+ vap->va_mode = lxpnp->lxpr_mode;
+ vap->va_type = lxpnp->lxpr_realvp->v_type;
+ vap->va_size = 0;
+ vap->va_nlink = 1;
+ }
+ return (0);
+ }
+
+ /* Default attributes, which may be overridden below */
+ bzero(vap, sizeof (*vap));
+ vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time;
+ vap->va_nlink = 1;
+ vap->va_type = vp->v_type;
+ vap->va_mode = lxpnp->lxpr_mode;
+ vap->va_fsid = vp->v_vfsp->vfs_dev;
+ vap->va_blksize = DEV_BSIZE;
+ vap->va_uid = lxpnp->lxpr_uid;
+ vap->va_gid = lxpnp->lxpr_gid;
+ vap->va_nodeid = lxpnp->lxpr_ino;
+
+ switch (type) {
+ case LXPR_PROCDIR:
+ vap->va_nlink = nproc + 2 + PROCDIRFILES;
+ vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE;
+ break;
+ case LXPR_PIDDIR:
+ vap->va_nlink = PIDDIRFILES;
+ vap->va_size = PIDDIRFILES * LXPR_SDSIZE;
+ break;
+ case LXPR_PID_TASKDIR:
+ vap->va_nlink = lxpr_count_taskdir(lxpnp);
+ vap->va_size = vap->va_nlink * LXPR_SDSIZE;
+ break;
+ case LXPR_PID_TASK_IDDIR:
+ vap->va_nlink = TIDDIRFILES;
+ vap->va_size = TIDDIRFILES * LXPR_SDSIZE;
+ break;
+ case LXPR_SELF:
+ vap->va_uid = crgetruid(curproc->p_cred);
+ vap->va_gid = crgetrgid(curproc->p_cred);
+ break;
+ case LXPR_PID_FD_FD:
+ case LXPR_PID_TID_FD_FD:
+ /*
+ * Restore VLNK type for lstat-type activity.
+ * See lxpr_readlink for more details.
+ */
+ if ((flags & FOLLOW) == 0)
+ vap->va_type = VLNK;
+ break;
+ default:
+ break;
+ }
+
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
+ return (0);
+}
+
+/*
+ * lxpr_access(): Vnode operation for VOP_ACCESS()
+ */
+static int
+lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
+{
+ return (lxpr_doaccess(VTOLXP(vp), B_FALSE, mode, flags, cr, ct));
+}
+
+/*
+ * This makes up the bulk of the logic for lxpr_access. An extra parameter
+ * ('shallow') is present to differentiate checks that must pass muster against
+ * an underlying resource (lxpr_realvp) and those that are only concerned with
+ * permission to the process.
+ */
+static int
+lxpr_doaccess(lxpr_node_t *lxpnp, boolean_t shallow, int mode, int flags,
+ cred_t *cr, caller_context_t *ct)
+{
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ boolean_t allow_pid_access = B_FALSE;
+ int shift = 0;
+ proc_t *tp;
+
+ /*
+ * lx /proc is primarily a read-only file system.
+ * We handle LXPR_SYSDIR as a special case. At least 'systemd' expects
+ * access() to report /proc/sys is writable, but we can't do that in
+ * lxpr_is_writable since it breaks other code paths that check if they
+ * can write there.
+ */
+ if ((mode & VWRITE) && !lxpr_is_writable(type)) {
+ if (type != LXPR_SYSDIR)
+ return (EROFS);
+ }
+
+ if (type == LXPR_PIDDIR) {
+ return (0);
+ }
+ if (lxpnp->lxpr_pid != 0) {
+ if ((tp = lxpr_lock(lxpnp, ZOMB_OK)) == NULL) {
+ return (ENOENT);
+ }
+ if (tp == curproc || secpolicy_proc_access(cr) == 0 ||
+ priv_proc_cred_perm(cr, tp, NULL, mode) == 0) {
+ allow_pid_access = B_TRUE;
+ }
+ lxpr_unlock(tp);
+ switch (type) {
+ case LXPR_PID_CGROUP:
+ case LXPR_PID_CMDLINE:
+ case LXPR_PID_COMM:
+ case LXPR_PID_LIMITS:
+ case LXPR_PID_LOGINUID:
+ case LXPR_PID_MOUNTINFO:
+ case LXPR_PID_MOUNTS:
+ case LXPR_PID_OOM_SCR_ADJ:
+ case LXPR_PID_STAT:
+ case LXPR_PID_STATM:
+ case LXPR_PID_STATUS:
+ case LXPR_PID_TASKDIR:
+ case LXPR_PID_TASK_IDDIR:
+ case LXPR_PID_TID_CGROUP:
+ case LXPR_PID_TID_CMDLINE:
+ case LXPR_PID_TID_COMM:
+ case LXPR_PID_TID_LIMITS:
+ case LXPR_PID_TID_LOGINUID:
+ case LXPR_PID_TID_MOUNTINFO:
+ case LXPR_PID_TID_OOM_SCR_ADJ:
+ case LXPR_PID_TID_STAT:
+ case LXPR_PID_TID_STATM:
+ case LXPR_PID_TID_STATUS:
+ /*
+ * These entries are accessible to any process on the
+ * system which wishes to query them.
+ */
+ break;
+ default:
+ /*
+ * All other entries under the pid/tid hierarchy
+ * require proper authorization to be accessed.
+ */
+ if (!allow_pid_access) {
+ return (EACCES);
+ }
+ break;
+ }
+ }
+
+ /*
+ * If this entry has an underlying vnode, rely upon its access checks.
+ * Skip this if a shallow check has been requested.
+ */
+ if (lxpnp->lxpr_realvp != NULL && !shallow) {
+ return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct));
+ }
+
+ /*
+ * Allow access to those (root) possessing the correct privilege or
+ * already authorized against a pid-specific resource.
+ */
+ if (allow_pid_access || secpolicy_proc_access(cr) == 0) {
+ return (0);
+ }
+
+ /*
+ * Access check is based on only one of owner, group, public. If not
+ * owner, then check group. If not a member of the group, then check
+ * public access.
+ */
+ if (crgetuid(cr) != lxpnp->lxpr_uid) {
+ shift += 3;
+ if (!groupmember((uid_t)lxpnp->lxpr_gid, cr))
+ shift += 3;
+ }
+
+ mode &= ~(lxpnp->lxpr_mode << shift);
+
+ if (mode == 0)
+ return (0);
+
+ return (EACCES);
+}
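+
+/*
+ * A worked example of the mode-shift check above: for a node with
+ * lxpr_mode 0644 and a caller who is neither the owner nor a group member,
+ * shift reaches 6, moving the "other" permission bits up into the owner
+ * position.  A VREAD (0400) request then computes
+ * 0400 & ~(0644 << 6) == 0 and succeeds, while a VWRITE (0200) request
+ * leaves 0200 set and fails with EACCES.
+ */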
+
+/* ARGSUSED */
+static vnode_t *
+lxpr_lookup_not_a_dir(vnode_t *dp, char *comp)
+{
+ return (NULL);
+}
+
+/*
+ * lxpr_lookup(): Vnode operation for VOP_LOOKUP()
+ */
+/* ARGSUSED */
+static int
+lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ lxpr_node_t *lxpnp = VTOLXP(dp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ int error;
+
+ ASSERT(dp->v_type == VDIR);
+ ASSERT(type < LXPR_NFILES);
+
+ /*
+ * we should never get here because the lookup
+ * is done on the realvp for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR);
+
+ /*
+ * restrict lookup permission to owner or root
+ */
+ if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * Just return the parent vnode if that's where we are trying to go.
+ */
+ if (strcmp(comp, "..") == 0) {
+ VN_HOLD(lxpnp->lxpr_parent);
+ *vpp = lxpnp->lxpr_parent;
+ return (0);
+ }
+
+ /*
+ * Special handling for directory searches. Note: null component name
+ * denotes that the current directory is being searched.
+ */
+ if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) {
+ VN_HOLD(dp);
+ *vpp = dp;
+ return (0);
+ }
+
+ *vpp = (lxpr_lookup_function[type](dp, comp));
+ return ((*vpp == NULL) ? ENOENT : 0);
+}
+
+/*
+ * Do a sequential search on the given directory table
+ */
+static vnode_t *
+lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p,
+ lxpr_dirent_t *dirtab, int dirtablen)
+{
+ lxpr_node_t *lxpnp;
+ int count;
+
+ for (count = 0; count < dirtablen; count++) {
+ if (strcmp(dirtab[count].d_name, comp) == 0) {
+ lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0);
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+ return (dp);
+ }
+ }
+ return (NULL);
+}
+
+static vnode_t *
+lxpr_lookup_piddir(vnode_t *dp, char *comp)
+{
+ proc_t *p;
+
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR);
+
+ p = lxpr_lock(VTOLXP(dp), ZOMB_OK);
+ if (p == NULL)
+ return (NULL);
+
+ dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES);
+
+ lxpr_unlock(p);
+
+ return (dp);
+}
+
+/*
+ * Lookup one of the process's task IDs.
+ */
+static vnode_t *
+lxpr_lookup_taskdir(vnode_t *dp, char *comp)
+{
+ lxpr_node_t *dlxpnp = VTOLXP(dp);
+ lxpr_node_t *lxpnp;
+ proc_t *p;
+ uint_t tid;
+ int c;
+ kthread_t *t;
+
+ ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASKDIR);
+
+ /*
+ * convert the string rendition of the filename to a thread ID
+ */
+ tid = 0;
+ while ((c = *comp++) != '\0') {
+ int otid;
+ if (c < '0' || c > '9')
+ return (NULL);
+
+ otid = tid;
+ tid = 10 * tid + c - '0';
+ /* integer overflow */
+ if (tid / 10 != otid)
+ return (NULL);
+ }
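+ /*
+ * E.g. a ten-digit component such as "4294967296" wraps the 32-bit
+ * tid to 0 on its final digit; the tid / 10 != otid comparison above
+ * detects the wrap and rejects the name.
+ */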
+
+ /*
+ * get the proc to work with and lock it
+ */
+ p = lxpr_lock_pid(dlxpnp, tid, NO_ZOMB, &t);
+ if (p == NULL)
+ return (NULL);
+
+ /*
+ * Bail if this is a system process.
+ */
+ if (p->p_as == &kas) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ if (p->p_brand != &lx_brand) {
+ /*
+ * Only the main thread is visible for non-branded processes.
+ */
+ t = p->p_tlist;
+ if (tid != p->p_pid || t == NULL) {
+ t = NULL;
+ }
+ } else if (t != NULL) {
+ /*
+ * Disallow any access to aio in-kernel worker threads.
+ * To prevent a potential race while looking at the lwp data
+ * for an exiting thread, we clear the TP_KTHREAD bit in
+ * lx_cleanlwp() while the p_lock is held.
+ */
+ if ((t->t_proc_flag & TP_KTHREAD) != 0) {
+ lx_lwp_data_t *lwpd;
+
+ VERIFY((lwpd = ttolxlwp(t)) != NULL);
+ if ((lwpd->br_lwp_flags & BR_AIO_LWP) != 0) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+ }
+ }
+
+ if (t == NULL) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ /*
+ * Allocate and fill in a new lx /proc taskid node.
+ * Instead of the last arg being a fd, it is a tid.
+ */
+ lxpnp = lxpr_getnode(dp, LXPR_PID_TASK_IDDIR, p, tid);
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+ lxpr_unlock(p);
+ return (dp);
+}
+
+/*
+ * Lookup an entry within one of the process's task ID directories.
+ */
+static vnode_t *
+lxpr_lookup_task_tid_dir(vnode_t *dp, char *comp)
+{
+ lxpr_node_t *dlxpnp = VTOLXP(dp);
+ lxpr_node_t *lxpnp;
+ proc_t *p;
+ kthread_t *t;
+ int i;
+
+ ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASK_IDDIR);
+
+ /*
+ * get the proc to work with and lock it
+ */
+ p = lxpr_lock_pid(dlxpnp, dlxpnp->lxpr_desc, NO_ZOMB, &t);
+ if (p == NULL)
+ return (NULL);
+
+ /*
+ * Bail if this is a system process.
+ */
+ if (p->p_as == &kas) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ /*
+ * allocate and fill in the new lx /proc taskid dir node
+ */
+ for (i = 0; i < TIDDIRFILES; i++) {
+ if (strcmp(tiddir[i].d_name, comp) == 0) {
+ lxpnp = lxpr_getnode(dp, tiddir[i].d_type, p,
+ dlxpnp->lxpr_desc);
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+ lxpr_unlock(p);
+ return (dp);
+ }
+ }
+
+ lxpr_unlock(p);
+ return (NULL);
+}
+
+/*
+ * Lookup one of the process's open files.
+ */
+static vnode_t *
+lxpr_lookup_fddir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PID_FDDIR ||
+ VTOLXP(dp)->lxpr_type == LXPR_PID_TID_FDDIR);
+
+ return (lxpr_lookup_fdnode(dp, comp));
+}
+
+static vnode_t *
+lxpr_lookup_netdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR);
+
+ dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES);
+
+ return (dp);
+}
+
+static vnode_t *
+lxpr_lookup_procdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR);
+
+ /*
+ * We know all the names of files & dirs in our file system structure
+ * except those that are pid names. These change as pids are created/
+ * deleted etc., so we just look for a number as the first char to see
+ * if we are doing pid lookups.
+ *
+ * Don't need to check for "self" as it is implemented as a symlink
+ */
+ if (*comp >= '0' && *comp <= '9') {
+ pid_t pid = 0;
+ lxpr_node_t *lxpnp = NULL;
+ vnode_t *vp;
+ proc_t *p;
+ kthread_t *t;
+ int c;
+
+ while ((c = *comp++) != '\0')
+ pid = 10 * pid + c - '0';
+
+ /*
+ * Can't continue if the process is still loading or it doesn't
+ * really exist yet (or maybe it just died!)
+ */
+ p = lxpr_lock_pid(VTOLXP(dp), pid, ZOMB_OK, &t);
+ if (p == NULL)
+ return (NULL);
+
+ if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ /*
+ * Allocate and populate a new LX /proc node.
+ *
+ * Directory entries for non-main threads can be looked up as
+ * /proc/<tid> despite the fact that they do not appear in the
+ * readdir output. Record the lookup pid (tid) so that later
+ * operations can be aware of this context.
+ */
+ lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, pid);
+
+ lxpr_unlock(p);
+ vp = LXPTOV(lxpnp);
+ ASSERT(vp != NULL);
+
+ return (vp);
+ }
+
+ /* Lookup fixed names */
+ return (lxpr_lookup_common(dp, comp, NULL, lx_procdir, PROCDIRFILES));
+}
+
+static vnode_t *
+lxpr_lookup_sysdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYSDIR);
+ return (lxpr_lookup_common(dp, comp, NULL, sysdir, SYSDIRFILES));
+}
+
+static vnode_t *
+lxpr_lookup_sys_kerneldir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNELDIR);
+ return (lxpr_lookup_common(dp, comp, NULL, sys_kerneldir,
+ SYS_KERNELDIRFILES));
+}
+
+static vnode_t *
+lxpr_lookup_sys_kdir_randdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNEL_RANDDIR);
+ return (lxpr_lookup_common(dp, comp, NULL, sys_randdir,
+ SYS_RANDDIRFILES));
+}
+
+static vnode_t *
+lxpr_lookup_sys_netdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NETDIR);
+ return (lxpr_lookup_common(dp, comp, NULL, sys_netdir,
+ SYS_NETDIRFILES));
+}
+
+static vnode_t *
+lxpr_lookup_sys_net_coredir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_COREDIR);
+ return (lxpr_lookup_common(dp, comp, NULL, sys_net_coredir,
+ SYS_NET_COREDIRFILES));
+}
+
+static vnode_t *
+lxpr_lookup_sys_net_ipv4dir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_IPV4DIR);
+ return (lxpr_lookup_common(dp, comp, NULL, sys_net_ipv4dir,
+ SYS_NET_IPV4DIRFILES));
+}
+
+static vnode_t *
+lxpr_lookup_sys_vmdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_VMDIR);
+ return (lxpr_lookup_common(dp, comp, NULL, sys_vmdir,
+ SYS_VMDIRFILES));
+}
+
+static vnode_t *
+lxpr_lookup_sys_fsdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FSDIR);
+ return (lxpr_lookup_common(dp, comp, NULL, sys_fsdir,
+ SYS_FSDIRFILES));
+}
+
+static vnode_t *
+lxpr_lookup_sys_fs_inotifydir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FS_INOTIFYDIR);
+ return (lxpr_lookup_common(dp, comp, NULL, sys_fs_inotifydir,
+ SYS_FS_INOTIFYDIRFILES));
+}
+
+/*
+ * lxpr_readdir(): Vnode operation for VOP_READDIR()
+ */
+/* ARGSUSED */
+static int
+lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ lxpr_node_t *lxpnp = VTOLXP(dp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ ssize_t uresid;
+ off_t uoffset;
+ int error;
+
+ ASSERT(dp->v_type == VDIR);
+ ASSERT(type < LXPR_NFILES);
+
+ /*
+ * we should never get here because the readdir
+ * is done on the realvp for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR);
+
+ /*
+ * restrict readdir permission to owner or root
+ */
+ if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0)
+ return (error);
+
+ uoffset = uiop->uio_offset;
+ uresid = uiop->uio_resid;
+
+ /* can't do negative reads */
+ if (uoffset < 0 || uresid <= 0)
+ return (EINVAL);
+
+ /* can't read directory entries that don't exist! */
+ if (uoffset % LXPR_SDSIZE)
+ return (ENOENT);
+
+ return (lxpr_readdir_function[type](lxpnp, uiop, eofp));
+}
+
+/* ARGSUSED */
+static int
+lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ return (ENOTDIR);
+}
+
+/*
+ * This has the common logic for returning directory entries
+ */
+static int
+lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp,
+ lxpr_dirent_t *dirtab, int dirtablen)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+
+ oresid = uiop->uio_resid;
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /*
+ * Satisfy user request
+ */
+ while ((uresid = uiop->uio_resid) > 0) {
+ int dirindex;
+ off_t uoffset;
+ int reclen;
+ int error;
+
+ uoffset = uiop->uio_offset;
+ dirindex = (uoffset / LXPR_SDSIZE) - 2;
+
+ if (uoffset == 0) {
+
+ dirent->d_ino = lxpnp->lxpr_ino;
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '\0';
+ reclen = DIRENT64_RECLEN(1);
+
+ } else if (uoffset == LXPR_SDSIZE) {
+
+ dirent->d_ino = lxpr_parentinode(lxpnp);
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '.';
+ dirent->d_name[2] = '\0';
+ reclen = DIRENT64_RECLEN(2);
+
+ } else if (dirindex >= 0 && dirindex < dirtablen) {
+ int slen = strlen(dirtab[dirindex].d_name);
+
+ dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type,
+ lxpnp->lxpr_pid, 0);
+
+ VERIFY(slen < LXPNSIZ);
+ (void) strcpy(dirent->d_name, dirtab[dirindex].d_name);
+ reclen = DIRENT64_RECLEN(slen);
+
+ } else {
+ /* Run out of table entries */
+ if (eofp) {
+ *eofp = 1;
+ }
+ return (0);
+ }
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ /*
+ * If the size of the data to transfer is greater than that
+ * requested, then we can't do it in this transfer.
+ */
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid) {
+ return (EINVAL);
+ }
+ break;
+ }
+
+ /*
+ * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+ * by the same amount. But we want uiop->uio_offset to change
+ * in increments of LXPR_SDSIZE, which is different from the
+ * number of bytes being returned to the user. So we set
+ * uiop->uio_offset separately, ignoring what uiomove() does.
+ */
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ return (error);
+
+ uiop->uio_offset = uoffset + LXPR_SDSIZE;
+ }
+
+ /* Have run out of space, but could have just done last table entry */
+ if (eofp) {
+ *eofp =
+ (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+ return (0);
+}
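+
+/*
+ * A minimal model of the offset math used by these readdir routines, with
+ * each directory slot LXPR_SDSIZE bytes wide:
+ *
+ *	uio_offset 0            -> "."
+ *	uio_offset LXPR_SDSIZE  -> ".."
+ *	uio_offset n            -> dirtab[(n / LXPR_SDSIZE) - 2]
+ *
+ * so a table of dirtablen entries is exhausted once uio_offset reaches
+ * (dirtablen + 2) * LXPR_SDSIZE.
+ */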
+
+
+static int
+lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+ off_t uoffset;
+ zone_t *zone;
+ int error;
+ int ceof;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR);
+
+ oresid = uiop->uio_resid;
+ zone = LXPTOZ(lxpnp);
+
+ /*
+ * We return directory entries in the order: "." and ".." then the
+ * unique lxproc files, then the directories corresponding to the
+ * running processes. We have defined this as the ordering because
+ * it allows us to more easily keep track of where we are between calls
+ * to getdents(). If the number of processes changes between calls
+ * then we can't lose track of where we are in the lxproc files.
+ */
+
+ /* Do the fixed entries */
+ error = lxpr_readdir_common(lxpnp, uiop, &ceof, lx_procdir,
+ PROCDIRFILES);
+
+ /* Finished if we got an error or if we couldn't do all the table */
+ if (error != 0 || ceof == 0)
+ return (error);
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /* Do the process entries */
+ while ((uresid = uiop->uio_resid) > 0) {
+ proc_t *p;
+ pid_t pid, raw_pid;
+ int len;
+ int reclen;
+ int i;
+
+ uoffset = uiop->uio_offset;
+
+ /*
+ * Stop when entire proc table has been examined.
+ */
+ i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES;
+ if (i < 0 || i >= v.v_proc) {
+ /* Run out of table entries */
+ if (eofp) {
+ *eofp = 1;
+ }
+ return (0);
+ }
+ mutex_enter(&pidlock);
+
+ /*
+ * Skip indices for which there is no pid_entry, PIDs for
+ * which there is no corresponding process, a PID of 0, the
+ * zsched process for the zone, and anything the security
+ * policy doesn't allow us to look at.
+ */
+ if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL ||
+ p->p_pid == 0 || p->p_zone != zone ||
+ p == zone->zone_zsched ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+ mutex_exit(&pidlock);
+ goto next;
+ }
+
+ /* Translate the pid (e.g. initpid to 1) */
+ lxpr_fixpid(LXPTOZ(lxpnp), p, &pid, NULL);
+ raw_pid = p->p_pid;
+
+ ASSERT(p->p_stat != 0);
+
+ mutex_exit(&pidlock);
+
+ dirent->d_ino = lxpr_inode(LXPR_PIDDIR, raw_pid, 0);
+ len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid);
+ ASSERT(len < LXPNSIZ);
+ reclen = DIRENT64_RECLEN(len);
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ /*
+ * If the size of the data to transfer is greater than that
+ * requested, then we can't do it in this transfer.
+ */
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid)
+ return (EINVAL);
+ break;
+ }
+
+ /*
+ * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+ * by the same amount.  But we want uiop->uio_offset to change
+ * in increments of LXPR_SDSIZE, which is different from the
+ * number of bytes being returned to the user.  So we set
+ * uiop->uio_offset separately, at the bottom of the loop,
+ * ignoring what uiomove() does.
+ */
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ return (error);
+next:
+ uiop->uio_offset = uoffset + LXPR_SDSIZE;
+ }
+
+ if (eofp != NULL) {
+ *eofp = (uiop->uio_offset >=
+ ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+
+ return (0);
+}
+
+static int
+lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ proc_t *p;
+ int err;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR);
+
+ /* can't read its contents if it died */
+ if ((p = lxpr_lock(lxpnp, ZOMB_OK)) == NULL) {
+ return (ENOENT);
+ }
+ err = lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES);
+ lxpr_unlock(p);
+ return (err);
+}
+
+static int
+lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_NETDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES));
+}
+
+static int
+lxpr_readdir_taskdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+ off_t uoffset;
+ int error, ceof, tiddirsize, tasknum;
+ proc_t *p;
+ kthread_t *t;
+ boolean_t branded;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_TASKDIR);
+
+ oresid = uiop->uio_resid;
+
+ p = lxpr_lock(lxpnp, ZOMB_OK);
+ if (p == NULL) {
+ return (ENOENT);
+ }
+
+ /*
+ * Just emit static entries for system processes and zombies.
+ */
+ if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) ||
+ (p->p_as == &kas)) {
+ lxpr_unlock(p);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0));
+ }
+
+ /*
+ * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+ * going away while we iterate over its threads.
+ */
+ tiddirsize = p->p_lwpcnt;
+ branded = (p->p_brand == &lx_brand);
+ mutex_exit(&p->p_lock);
+
+ /* Do the fixed entries (in this case just "." & "..") */
+ error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0);
+
+ /* Finished if we got an error or if we couldn't do all the table */
+ if (error != 0 || ceof == 0)
+ goto out;
+
+ if ((t = p->p_tlist) == NULL) {
+ if (eofp != NULL)
+ *eofp = 1;
+ goto out;
+ }
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /*
+ * Loop until the user's request is satisfied or until all threads
+ * have been returned.
+ */
+ for (tasknum = 0; (uresid = uiop->uio_resid) > 0; tasknum++) {
+ int i, reclen, len;
+ uint_t emul_tid;
+ lx_lwp_data_t *lwpd;
+
+ uoffset = uiop->uio_offset;
+
+ /*
+ * Stop at the end of the thread list
+ */
+ i = (uoffset / LXPR_SDSIZE) - 2;
+ if (i < 0 || i >= tiddirsize) {
+ if (eofp) {
+ *eofp = 1;
+ }
+ goto out;
+ }
+
+ if (i != tasknum)
+ goto next;
+
+ if (!branded) {
+ /*
+ * Emulating the goofy Linux task model is impossible
+ * to do for native processes. We can compromise by
+ * presenting only the main thread to the consumer.
+ */
+ emul_tid = p->p_pid;
+ } else {
+ if ((lwpd = ttolxlwp(t)) == NULL) {
+ goto next;
+ }
+ /* Don't show aio kernel worker threads */
+ if ((t->t_proc_flag & TP_KTHREAD) != 0 &&
+ (lwpd->br_lwp_flags & BR_AIO_LWP) != 0) {
+ goto next;
+ }
+ emul_tid = lwpd->br_pid;
+ /*
+ * Convert pid to Linux default of 1 if we're the
+ * zone's init.
+ */
+ if (emul_tid == LXPTOZ(lxpnp)->zone_proc_initpid)
+ emul_tid = 1;
+ }
+
+ dirent->d_ino = lxpr_inode(LXPR_PID_TASK_IDDIR, p->p_pid,
+ emul_tid);
+ len = snprintf(dirent->d_name, LXPNSIZ, "%d", emul_tid);
+ ASSERT(len < LXPNSIZ);
+ reclen = DIRENT64_RECLEN(len);
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid)
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+ * by the same amount.  But we want uiop->uio_offset to change
+ * in increments of LXPR_SDSIZE, which is different from the
+ * number of bytes being returned to the user.  So we set
+ * uiop->uio_offset separately, at the bottom of the loop,
+ * ignoring what uiomove() does.
+ */
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ goto out;
+
+next:
+ uiop->uio_offset = uoffset + LXPR_SDSIZE;
+
+ if ((t = t->t_forw) == p->p_tlist || !branded) {
+ if (eofp != NULL)
+ *eofp = 1;
+ goto out;
+ }
+ }
+
+ if (eofp != NULL)
+ *eofp = 0;
+
+out:
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ return (error);
+}
+
+static int
+lxpr_readdir_task_tid_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ proc_t *p;
+ kthread_t *t;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_TASK_IDDIR);
+
+ /* Confirm that process and thread are still present */
+ p = lxpr_lock_pid(lxpnp, lxpnp->lxpr_desc, NO_ZOMB, &t);
+ if (p == NULL) {
+ return (ENOENT);
+ }
+ lxpr_unlock(p);
+
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, tiddir, TIDDIRFILES));
+}
+
+static int
+lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+ off_t uoffset;
+ int error, ceof, fddirsize;
+ proc_t *p;
+ uf_info_t *fip;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR ||
+ lxpnp->lxpr_type == LXPR_PID_TID_FDDIR);
+
+ oresid = uiop->uio_resid;
+
+ p = lxpr_lock(lxpnp, ZOMB_OK);
+ if (p == NULL)
+ return (ENOENT);
+
+ /*
+ * For exiting/exited processes or those belonging to the system, only
+ * emit the fixed entries.
+ */
+ if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) ||
+ (p->p_as == &kas)) {
+ lxpr_unlock(p);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0));
+ }
+
+ /*
+ * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+ * going away while we iterate over its fi_list.
+ */
+ mutex_exit(&p->p_lock);
+
+ /* Get open file info */
+ fip = (&(p)->p_user.u_finfo);
+ mutex_enter(&fip->fi_lock);
+ fddirsize = fip->fi_nfiles;
+
+ /* Do the fixed entries (in this case just "." & "..") */
+ error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0);
+
+ /* Finished if we got an error or if we couldn't do all the table */
+ if (error != 0 || ceof == 0)
+ goto out;
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /*
+ * Loop until user's request is satisfied or until
+ * all file descriptors have been examined.
+ */
+ for (; (uresid = uiop->uio_resid) > 0;
+ uiop->uio_offset = uoffset + LXPR_SDSIZE) {
+ int reclen;
+ int fd;
+ int len;
+
+ uoffset = uiop->uio_offset;
+
+ /*
+ * Stop at the end of the fd list
+ */
+ fd = (uoffset / LXPR_SDSIZE) - 2;
+ if (fd < 0 || fd >= fddirsize) {
+ if (eofp) {
+ *eofp = 1;
+ }
+ goto out;
+ }
+
+ if (fip->fi_list[fd].uf_file == NULL)
+ continue;
+
+ dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, p->p_pid, fd);
+ len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd);
+ ASSERT(len < LXPNSIZ);
+ reclen = DIRENT64_RECLEN(len);
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid)
+ error = EINVAL;
+ goto out;
+ }
+
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ goto out;
+ }
+
+ if (eofp != NULL) {
+ *eofp =
+ (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+
+out:
+ mutex_exit(&fip->fi_lock);
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ return (error);
+}
+
+static int
+lxpr_readdir_sysdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYSDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, sysdir, SYSDIRFILES));
+}
+
+static int
+lxpr_readdir_sys_fsdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FSDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fsdir,
+ SYS_FSDIRFILES));
+}
+
+static int
+lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFYDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fs_inotifydir,
+ SYS_FS_INOTIFYDIRFILES));
+}
+
+static int
+lxpr_readdir_sys_kerneldir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNELDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_kerneldir,
+ SYS_KERNELDIRFILES));
+}
+
+static int
+lxpr_readdir_sys_kdir_randdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RANDDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_randdir,
+ SYS_RANDDIRFILES));
+}
+
+static int
+lxpr_readdir_sys_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NETDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_netdir,
+ SYS_NETDIRFILES));
+}
+
+static int
+lxpr_readdir_sys_net_coredir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_COREDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_coredir,
+ SYS_NET_COREDIRFILES));
+}
+
+static int
+lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4DIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_ipv4dir,
+ SYS_NET_IPV4DIRFILES));
+}
+
+static int
+lxpr_readdir_sys_vmdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_VMDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_vmdir,
+ SYS_VMDIRFILES));
+}
+
+#define isdigit(c) ((c) >= '0' && (c) <= '9')
+#define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
+
+/*
+ * Obtain a numeric value from the null-terminated input string.
+ * We don't have strtok in the kernel, so tokenize this ourselves and
+ * validate the input.
+ */
+static int
+lxpr_tokenize_num(char *str, long *pv, char **ep)
+{
+ char *pstart, *pc, c, *endptr;
+ long v;
+
+ for (pc = str; isspace(*pc); pc++)
+ ;
+
+ for (pstart = pc; isdigit(*pc); pc++)
+ ;
+ if (pc == pstart || (!isspace(*pc) && *pc != '\0'))
+ return (EINVAL);
+ c = *pc;
+ *pc = '\0';
+
+ if (ddi_strtol(pstart, &endptr, 10, &v) != 0) {
+ *pc = c;
+ return (EINVAL);
+ }
+ if (*endptr != '\0') {
+ *pc = c;
+ return (EINVAL);
+ }
+
+ if (pv != NULL)
+ *pv = v;
+ if (ep != NULL)
+ *ep = ++pc;
+
+ return (0);
+}
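+
+/*
+ * A sketch of how callers chain lxpr_tokenize_num() to consume several
+ * whitespace-separated numbers from one zero-filled buffer (the values
+ * here are hypothetical):
+ *
+ *	char buf[16] = "32768 60999";
+ *	long low, high;
+ *	char *ep;
+ *
+ *	if (lxpr_tokenize_num(buf, &low, &ep) != 0 ||
+ *	    lxpr_tokenize_num(ep, &high, &ep) != 0)
+ *		return (EINVAL);
+ *	(low is now 32768, high is 60999, and *ep is '\0')
+ */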
+
+/* ARGSUSED */
+static int
+lxpr_write_tcp_property(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct, char *prop,
+ int (*xlate)(char *, int))
+{
+ int error;
+ int res = 0;
+ size_t olen;
+ char val[16]; /* big enough for a uint numeric string */
+ netstack_t *ns;
+ mod_prop_info_t *ptbl = NULL;
+ mod_prop_info_t *pinfo = NULL;
+
+ if (uio->uio_loffset != 0)
+ return (EINVAL);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ olen = uio->uio_resid;
+ if (olen > sizeof (val) - 1)
+ return (EINVAL);
+
+ bzero(val, sizeof (val));
+ error = uiomove(val, olen, UIO_WRITE, uio);
+ if (error != 0)
+ return (error);
+
+ if (val[olen - 1] == '\n')
+ val[olen - 1] = '\0';
+
+ if (val[0] == '\0') /* no input */
+ return (EINVAL);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL)
+ return (EINVAL);
+
+ if (xlate != NULL && xlate(val, sizeof (val)) != 0) {
+ netstack_rele(ns);
+ return (EINVAL);
+ }
+
+ ptbl = ns->netstack_tcp->tcps_propinfo_tbl;
+ pinfo = mod_prop_lookup(ptbl, prop, MOD_PROTO_TCP);
+ if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, val, 0) != 0)
+ res = EINVAL;
+
+ netstack_rele(ns);
+ return (res);
+}
+
+static int
+lxpr_write_sys_net_core_somaxc(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON);
+ return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+ "_conn_req_max_q", NULL));
+}
+
+static int
+lxpr_xlate_sec2ms(char *val, int size)
+{
+ long sec;
+ char *ep;
+
+ if (lxpr_tokenize_num(val, &sec, &ep) != 0)
+ return (EINVAL);
+ if (*ep != '\0')
+ return (EINVAL);
+ if (snprintf(val, size, "%ld", sec * 1000) >= size)
+ return (EINVAL);
+ return (0);
+}
+
+static int
+lxpr_xlate_ka_intvl(char *val, int size)
+{
+ long sec;
+ char *ep;
+
+ if (lxpr_tokenize_num(val, &sec, &ep) != 0)
+ return (EINVAL);
+ if (*ep != '\0')
+ return (EINVAL);
+ if (snprintf(val, size, "%ld", sec * 1000 * 9) >= size)
+ return (EINVAL);
+ return (0);
+}
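+
+/*
+ * A worked example of the keepalive translation above: Linux's
+ * tcp_keepalive_intvl is the number of seconds between probes and Linux
+ * sends 9 probes by default, while _keepalive_abort_interval is the total
+ * abort time in milliseconds, so a written value of "75" becomes
+ * 75 * 1000 * 9 = "675000".
+ */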
+
+/*
+ * Approximately translate the input count value into a reasonable
+ * _rexmit_interval_max timeout.
+ */
+static int
+lxpr_xlate_retry2(char *val, int size)
+{
+ long cnt;
+ char *ep;
+ uint_t i, rx_max;
+
+ if (lxpr_tokenize_num(val, &cnt, &ep) != 0)
+ return (EINVAL);
+ if (*ep != '\0')
+ return (EINVAL);
+
+ /*
+ * The _rexmit_interval_max is limited to 2 hours, so a count of 15
+ * or more will exceed that due to exponential backoff.
+ */
+ if (cnt > 15)
+ cnt = 15;
+
+ rx_max = 400; /* Start with default _rexmit_interval_min in ms */
+ for (i = 0; i < cnt; i++)
+ rx_max *= 2;
+
+ /*
+ * The _rexmit_interval_max is limited to 2 hours, so if we went over
+ * the limit, just use 2 hours (in ms).
+ */
+ if (rx_max > (7200 * 1000))
+ rx_max = 7200 * 1000;
+
+ if (snprintf(val, size, "%u", rx_max) >= size)
+ return (EINVAL);
+ return (0);
+}
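+
+/*
+ * Worked examples for the translation above: a written count of 5 yields
+ * 400 * 2^5 = 12800 ms, while a count of 15 doubles past the cap
+ * (400 * 2^15 = 13107200 ms) and is clamped to the 2-hour limit of
+ * 7200000 ms.
+ */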
+
+static int
+lxpr_xlate_sack(char *val, int size)
+{
+ long flag;
+ char *ep;
+
+ if (lxpr_tokenize_num(val, &flag, &ep) != 0)
+ return (EINVAL);
+ if (*ep != '\0')
+ return (EINVAL);
+ if (flag != 0 && flag != 1)
+ return (EINVAL);
+ /* see comment on lxpr_read_sys_net_ipv4_tcp_sack */
+ if (snprintf(val, size, "%d", (flag == 0 ? 0 : 2)) >= size)
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * We're updating a property on the ip stack so we can't reuse
+ * lxpr_write_tcp_property.
+ */
+/* ARGSUSED */
+static int
+lxpr_write_sys_net_ipv4_icmp_eib(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ int error;
+ size_t olen;
+ char val[16]; /* big enough for a uint numeric string */
+ long flag;
+ char *ep;
+ netstack_t *ns;
+ ip_stack_t *ipst;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_ICMP_EIB);
+
+ if (uio->uio_loffset != 0)
+ return (EINVAL);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ olen = uio->uio_resid;
+ if (olen > sizeof (val) - 1)
+ return (EINVAL);
+
+ bzero(val, sizeof (val));
+ error = uiomove(val, olen, UIO_WRITE, uio);
+ if (error != 0)
+ return (error);
+
+ if (val[olen - 1] == '\n')
+ val[olen - 1] = '\0';
+
+ if (val[0] == '\0') /* no input */
+ return (EINVAL);
+
+ if (lxpr_tokenize_num(val, &flag, &ep) != 0)
+ return (EINVAL);
+
+ if (*ep != '\0' || (flag != 0 && flag != 1))
+ return (EINVAL);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL)
+ return (EINVAL);
+
+ ipst = ns->netstack_ip;
+ ipst->ips_ip_g_resp_to_echo_bcast = !flag;
+
+ netstack_rele(ns);
+ return (0);
+}
+
+/*
+ * We expect two port numbers on a line as input for the range, and we have to
+ * set two properties on the netstack_tcp, so we can't reuse
+ * lxpr_write_tcp_property.
+ */
+/* ARGSUSED */
+static int
+lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ int res;
+ size_t olen;
+ char vals[32]; /* big enough for a line w/ 2 16-bit numeric strings */
+ char *ep;
+ long low, high;
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+ mod_prop_info_t *ptbl = NULL;
+ mod_prop_info_t *pinfo = NULL;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE);
+
+ if (uio->uio_loffset != 0)
+ return (EINVAL);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ olen = uio->uio_resid;
+ if (olen > sizeof (vals) - 1)
+ return (EINVAL);
+
+ bzero(vals, sizeof (vals));
+ res = uiomove(vals, olen, UIO_WRITE, uio);
+ if (res != 0)
+ return (res);
+
+ if (lxpr_tokenize_num(vals, &low, &ep) != 0)
+ return (EINVAL);
+
+ if (lxpr_tokenize_num(ep, &high, &ep) != 0)
+ return (EINVAL);
+
+ if (*ep != '\0') {
+ /* make sure no other tokens on the line */
+ *ep++ = '\0';
+ for (; isspace(*ep); ep++)
+ ;
+ if (*ep != '\0')
+ return (EINVAL);
+ }
+
+ if (low > high || high > 65535)
+ return (EINVAL);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL)
+ return (EINVAL);
+
+ tcps = ns->netstack_tcp;
+ if (low < tcps->tcps_smallest_nonpriv_port) {
+ netstack_rele(ns);
+ return (EINVAL);
+ }
+
+ ptbl = ns->netstack_tcp->tcps_propinfo_tbl;
+
+ (void) snprintf(vals, sizeof (vals), "%ld", low);
+ pinfo = mod_prop_lookup(ptbl, "smallest_anon_port", MOD_PROTO_TCP);
+ if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0)
+ res = EINVAL;
+
+ (void) snprintf(vals, sizeof (vals), "%ld", high);
+ pinfo = mod_prop_lookup(ptbl, "largest_anon_port", MOD_PROTO_TCP);
+ if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0)
+ res = EINVAL;
+
+ netstack_rele(ns);
+ return (res);
+}
+
+/*
+ * We expect three numbers on a line as input (the min, default, and max
+ * buffer sizes), and we have to set two properties on the netstack_tcp, so
+ * we can't reuse lxpr_write_tcp_property.
+ *
+ * See the Linux tcp(7) man page.
+ */
+/* ARGSUSED */
+static int
+lxpr_write_sys_net_ipv4_tcp_rwmem(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ int res;
+ size_t olen;
+ char vals[80]; /* big enough for a line w/ 3 numeric strings */
+ char *ep;
+ long min, def, max, min_limit;
+ netstack_t *ns;
+ tcp_stack_t *tcps;
+ mod_prop_info_t *ptbl;
+ mod_prop_info_t *pinfo;
+ char *attr;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ||
+ lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WMEM);
+
+ if (uio->uio_loffset != 0)
+ return (EINVAL);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ olen = uio->uio_resid;
+ if (olen > sizeof (vals) - 1)
+ return (EINVAL);
+
+ bzero(vals, sizeof (vals));
+ res = uiomove(vals, olen, UIO_WRITE, uio);
+ if (res != 0)
+ return (res);
+
+ if (lxpr_tokenize_num(vals, &min, &ep) != 0)
+ return (EINVAL);
+
+ if (lxpr_tokenize_num(ep, &def, &ep) != 0)
+ return (EINVAL);
+
+ if (lxpr_tokenize_num(ep, &max, &ep) != 0)
+ return (EINVAL);
+
+ if (*ep != '\0') {
+ /* make sure no other tokens on the line */
+ *ep++ = '\0';
+ for (; isspace(*ep); ep++)
+ ;
+ if (*ep != '\0')
+ return (EINVAL);
+ }
+
+ /*
+ * Ensure the numbers are valid, low to high.  The valid ranges here
+ * follow the guidance for the corresponding illumos tunables.
+ */
+ min_limit = (lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ?
+ 2048 : 4096);
+ if (min > def || def > max || min < min_limit ||
+ def > ONEGB || max < 8192)
+ return (EINVAL);
+
+ ns = lxpr_netstack(lxpnp);
+ if (ns == NULL)
+ return (EINVAL);
+
+ tcps = ns->netstack_tcp;
+
+ /* recv_hiwat and xmit_hiwat are aliased to recv_buf and send_buf. */
+ attr = (lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ?
+ "recv_buf" : "send_buf");
+
+ (void) snprintf(vals, sizeof (vals), "%ld", def);
+ ptbl = ns->netstack_tcp->tcps_propinfo_tbl;
+ pinfo = mod_prop_lookup(ptbl, attr, MOD_PROTO_TCP);
+ if (pinfo == NULL ||
+ pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0)
+ res = EINVAL;
+
+ /*
+ * Don't reduce max for one side (recv or xmit) since that impacts the
+ * other.
+ */
+ if (res == 0 && max > tcps->tcps_max_buf) {
+ (void) snprintf(vals, sizeof (vals), "%ld", max);
+ pinfo = mod_prop_lookup(ptbl, "max_buf", MOD_PROTO_TCP);
+ if (pinfo == NULL ||
+ pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0)
+ res = EINVAL;
+ }
+
+ netstack_rele(ns);
+ return (res);
+}
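+
+/*
+ * For example (hypothetical values), writing "4096 87380 6291456" to
+ * tcp_rmem sets recv_buf to the default of 87380 and, if 6291456 exceeds
+ * the current tcps_max_buf, raises max_buf to 6291456; the minimum is
+ * only range-checked here.
+ */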
+
+static int
+lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO);
+ return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+ "_fin_wait_2_flush_interval", lxpr_xlate_sec2ms));
+}
+
+static int
+lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT);
+ return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+ "_keepalive_abort_interval", lxpr_xlate_ka_intvl));
+}
+
+static int
+lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM);
+ return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+ "_keepalive_interval", lxpr_xlate_sec2ms));
+}
+
+static int
+lxpr_write_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL);
+ return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+ "_conn_req_max_q0", NULL));
+}
+
+static int
+lxpr_write_sys_net_ipv4_tcp_retry2(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RETRY2);
+ return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+ "_rexmit_interval_max", lxpr_xlate_retry2));
+}
+
+static int
+lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK);
+ return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "sack",
+ lxpr_xlate_sack));
+}
+
+static int
+lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE);
+ return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "_wscale_always",
+ NULL));
+}
+
+/* ARGSUSED */
+static int
+lxpr_write_sys_fs_pipe_max(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ int error;
+ size_t olen;
+ char val[16]; /* big enough for a uint numeric string */
+ char *ep;
+ long u;
+ size_t size;
+ lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp));
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_PIPE_MAX);
+
+ if (uio->uio_loffset != 0)
+ return (EINVAL);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ olen = uio->uio_resid;
+ if (olen > sizeof (val) - 1)
+ return (EINVAL);
+
+ bzero(val, sizeof (val));
+ error = uiomove(val, olen, UIO_WRITE, uio);
+ if (error != 0)
+ return (error);
+
+ if (lxpr_tokenize_num(val, &u, &ep) != 0)
+ return (EINVAL);
+ if (*ep != '\0')
+ return (EINVAL);
+
+ /*
+ * Bound to PAGESIZE <= input <= lx_pipe_max_limit, then round up to
+ * a page boundary.  Linux is a little more picky, rounding up to the
+ * nearest power-of-two pages.  Such strengthened behavior can be
+ * added later, if needed.
+ */
+ size = (size_t)u;
+ size = P2ROUNDUP(MIN(MAX(PAGESIZE, size), lx_pipe_max_limit), PAGESIZE);
+
+ ASSERT(size <= lx_pipe_max_limit);
+
+ mutex_enter(&lxzd->lxzd_lock);
+ lxzd->lxzd_pipe_max_sz = size;
+ mutex_exit(&lxzd->lxzd_lock);
+
+ return (0);
+}
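+
+/*
+ * A worked example of the clamping above, assuming a PAGESIZE of 4096:
+ * a written value of "5000" already falls within
+ * [PAGESIZE, lx_pipe_max_limit], and P2ROUNDUP() then rounds it up to
+ * the next page, so lxzd_pipe_max_sz becomes 8192.
+ */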
+
+/* ARGSUSED */
+static int
+lxpr_write_sys_kernel_corepatt(lxpr_node_t *lxpnp, struct uio *uio,
+ struct cred *cr, caller_context_t *ct)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ struct core_globals *cg;
+ refstr_t *rp, *nrp;
+ corectl_path_t *ccp;
+ char val[MAXPATHLEN];
+ char valtr[MAXPATHLEN];
+ size_t olen;
+ int error;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT);
+
+ cg = zone_getspecific(core_zone_key, zone);
+ ASSERT(cg != NULL);
+
+ if (secpolicy_coreadm(cr) != 0)
+ return (EPERM);
+
+ if (uio->uio_loffset != 0)
+ return (EINVAL);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ olen = uio->uio_resid;
+ if (olen > sizeof (val) - 1)
+ return (EINVAL);
+
+ bzero(val, sizeof (val));
+ error = uiomove(val, olen, UIO_WRITE, uio);
+ if (error != 0)
+ return (error);
+
+ if (val[olen - 1] == '\n')
+ val[olen - 1] = '\0';
+
+ if (val[0] == '|')
+ return (EINVAL);
+
+ if ((error = lxpr_core_path_l2s(val, valtr, sizeof (valtr))) != 0)
+ return (error);
+
+ nrp = refstr_alloc(valtr);
+
+ ccp = cg->core_default_path;
+ mutex_enter(&ccp->ccp_mtx);
+ rp = ccp->ccp_path;
+ refstr_hold((ccp->ccp_path = nrp));
+ cg->core_options |= CC_PROCESS_PATH;
+ mutex_exit(&ccp->ccp_mtx);
+
+ if (rp != NULL)
+ refstr_rele(rp);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+lxpr_write_pid_loginuid(lxpr_node_t *lxpnp, struct uio *uio, struct cred *cr,
+ caller_context_t *ct)
+{
+ int error;
+ size_t olen;
+ char val[16]; /* big enough for a uint numeric string */
+ char *ep;
+ long u;
+ proc_t *p;
+ lx_proc_data_t *pd;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID);
+
+ if (uio->uio_loffset != 0)
+ return (EINVAL);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ olen = uio->uio_resid;
+ if (olen > sizeof (val) - 1)
+ return (EINVAL);
+
+ bzero(val, sizeof (val));
+ error = uiomove(val, olen, UIO_WRITE, uio);
+ if (error != 0)
+ return (error);
+
+ if (lxpr_tokenize_num(val, &u, &ep) != 0)
+ return (EINVAL);
+ if (*ep != '\0')
+ return (EINVAL);
+
+ if ((p = lxpr_lock(lxpnp, NO_ZOMB)) == NULL)
+ return (ENXIO);
+
+ if ((pd = ptolxproc(p)) != NULL) {
+ pd->l_loginuid = (uid_t)u;
+ }
+ lxpr_unlock(p);
+
+ return (0);
+}
+
+/*
+ * lxpr_readlink(): Vnode operation for VOP_READLINK()
+ */
+/* ARGSUSED */
+static int
+lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+ char bp[MAXPATHLEN + 1];
+ size_t buflen = sizeof (bp);
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ vnode_t *rvp = lxpnp->lxpr_realvp;
+ pid_t pid;
+ int error = 0;
+
+ /*
+ * Linux does something very "clever" for /proc/<pid>/fd/<num> entries.
+ * Open FDs are represented as symlinks, the link contents
+ * corresponding to the open resource. For plain files or devices,
+ * this isn't absurd since one can dereference the symlink to query
+ * the underlying resource. For sockets or pipes, it becomes ugly in a
+ * hurry. To maintain this human-readable output, those FD symlinks
+ * point to bogus targets such as "socket:[<inodenum>]". This requires
+ * circumventing vfs since the stat/lstat behavior on those FD entries
+ * will be unusual. (A stat must retrieve information about the open
+ * socket or pipe. It cannot fail because the link contents point to
+ * an absent file.)
+ *
+ * To accomplish this, lxpr_getnode returns a vnode typed VNON for FD
+ * entries. This bypasses code paths which would normally
+ * short-circuit on symlinks and allows us to emulate the vfs behavior
+ * expected by /proc consumers.
+ */
+ if (vp->v_type != VLNK && lxpnp->lxpr_type != LXPR_PID_FD_FD)
+ return (EINVAL);
+
+ /* Try to produce a symlink name for anything that has a realvp */
+ if (rvp != NULL) {
+ error = lxpr_doaccess(lxpnp, B_TRUE, VREAD, 0, cr, ct);
+ if (error != 0)
+ return (error);
+
+ if ((error = vnodetopath(NULL, rvp, bp, buflen, cr)) != 0) {
+ /*
+ * Special handling possible for /proc/<pid>/fd/<num>
+ * Generate <type>:[<inode>] links, if allowed.
+ */
+ if (lxpnp->lxpr_type != LXPR_PID_FD_FD ||
+ lxpr_readlink_fdnode(lxpnp, bp, buflen) != 0) {
+ return (error);
+ }
+ }
+ } else {
+ switch (lxpnp->lxpr_type) {
+ case LXPR_SELF:
+ /* Translate the pid (e.g. initpid to 1) */
+ lxpr_fixpid(LXPTOZ(lxpnp), curproc, &pid, NULL);
+
+ /*
+ * Don't need to check result as every possible int
+ * will fit within MAXPATHLEN bytes.
+ */
+ (void) snprintf(bp, buflen, "%d", pid);
+ break;
+ case LXPR_PID_CURDIR:
+ case LXPR_PID_ROOTDIR:
+ case LXPR_PID_EXE:
+ return (EACCES);
+ default:
+ /*
+ * Need to return error so that nothing thinks
+ * that the symlink is empty and hence "."
+ */
+ return (EINVAL);
+ }
+ }
+
+ /* copy the link data to user space */
+ return (uiomove(bp, strlen(bp), UIO_READ, uiop));
+}
+
+
+/*
+ * lxpr_inactive(): Vnode operation for VOP_INACTIVE()
+ * Vnode is no longer referenced, deallocate the file
+ * and all its resources.
+ */
+/* ARGSUSED */
+static void
+lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ lxpr_freenode(VTOLXP(vp));
+}
+
+/*
+ * lxpr_sync(): Vnode operation for VOP_SYNC()
+ */
+static int
+lxpr_sync()
+{
+ /*
+ * Nothing to sync, but this function must never fail.
+ */
+ return (0);
+}
+
+/*
+ * lxpr_cmp(): Vnode operation for VOP_CMP()
+ */
+static int
+lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+ vnode_t *rvp;
+
+ while (vn_matchops(vp1, lxpr_vnodeops) &&
+ (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) {
+ vp1 = rvp;
+ }
+
+ while (vn_matchops(vp2, lxpr_vnodeops) &&
+ (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) {
+ vp2 = rvp;
+ }
+
+ if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops))
+ return (vp1 == vp2);
+ return (VOP_CMP(vp1, vp2, ct));
+}
+
+/*
+ * lxpr_realvp(): Vnode operation for VOP_REALVP()
+ */
+static int
+lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
+{
+ vnode_t *rvp;
+
+ if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) {
+ vp = rvp;
+ if (VOP_REALVP(vp, &rvp, ct) == 0)
+ vp = rvp;
+ }
+
+ *vpp = vp;
+ return (0);
+}
+
+/* Pollhead for fake POLLET support below */
+static struct pollhead lxpr_pollhead;
+
+/* ARGSUSED */
+static int
+lxpr_poll(vnode_t *vp, short ev, int anyyet, short *reventsp,
+ pollhead_t **phpp, caller_context_t *ct)
+{
+ *reventsp = 0;
+ if (ev & POLLIN)
+ *reventsp |= POLLIN;
+ if (ev & POLLRDNORM)
+ *reventsp |= POLLRDNORM;
+ if (ev & POLLRDBAND)
+ *reventsp |= POLLRDBAND;
+ if (ev & POLLOUT)
+ *reventsp |= POLLOUT;
+ if (ev & POLLWRBAND)
+ *reventsp |= POLLWRBAND;
+
+ /*
+ * Newer versions of systemd will monitor /proc/self/mountinfo with
+ * edge-triggered epoll (via libmount). If adding said resource to an
+ * epoll descriptor fails, as would be the expectation for a call to
+ * fs_poll when POLLET is present, then systemd will abort and the zone
+ * will fail to properly boot. Until proper pollwakeup() support is
+ * wired into lx_proc, valid POLLET support must be faked.
+ *
+ * While the only known (at this time) lx_proc resource where POLLET
+ * support is mandatory is LXPR_PID_MOUNTINFO, we cast a wide net to
+ * avoid other unexpected trouble. Normal devpoll caching (emitting a
+ * pollhead when (*reventsp == 0 && !anyyet)) is not enabled.
+ */
+ if ((ev & POLLET) != 0) {
+ *phpp = &lxpr_pollhead;
+ }
+ return (0);
+}
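+
+/*
+ * The consumer behavior being accommodated, sketched from the Linux
+ * userland side (the fds here are hypothetical):
+ *
+ *	int epfd = epoll_create1(0);
+ *	int fd = open("/proc/self/mountinfo", O_RDONLY);
+ *	struct epoll_event ev = { .events = EPOLLIN | EPOLLET };
+ *
+ *	(this EPOLL_CTL_ADD must succeed, or systemd aborts)
+ *	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
+ */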
+
+/* ARGSUSED */
+static int
+lxpr_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ int i;
+
+ for (i = 0; wr_tab[i].wft_type != LXPR_INVALID; i++) {
+ if (wr_tab[i].wft_type == type) {
+ if (wr_tab[i].wft_wrf != NULL) {
+ return (wr_tab[i].wft_wrf(lxpnp, uiop, cr, ct));
+ }
+ break;
+ }
+ }
+
+ /* pretend we wrote the whole thing */
+ uiop->uio_offset += uiop->uio_resid;
+ uiop->uio_resid = 0;
+ return (0);
+}
+
+/* Needed for writable files which are first "truncated" */
+/* ARGSUSED */
+static int
+lxpr_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
+ cred_t *cred, caller_context_t *ct)
+{
+ int error;
+
+ if (cmd != F_FREESP)
+ return (EINVAL);
+ if ((error = lxpr_access(vp, VWRITE, 0, cred, ct)) != 0)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Needed for writable files which are first "truncated". We only support
+ * truncation.
+ */
+/* ARGSUSED */
+static int
+lxpr_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ int error;
+
+ if (vap->va_mask != AT_SIZE)
+ return (EINVAL);
+ if ((error = lxpr_access(vp, VWRITE, 0, cr, ct)) != 0)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * We need to allow open with O_CREAT for the writable files.
+ */
+/* ARGSUSED */
+static int
+lxpr_create(vnode_t *dvp, char *nm, vattr_t *vap, enum vcexcl exclusive,
+ int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
+ vsecattr_t *vsecp)
+{
+ lxpr_node_t *lxpnp = VTOLXP(dvp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ vnode_t *vp = NULL;
+ int error;
+
+ ASSERT(type < LXPR_NFILES);
+
+ /*
+ * restrict create permission to owner or root
+ */
+ if ((error = lxpr_access(dvp, VEXEC, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+ if (*nm == '\0')
+ return (EPERM);
+
+ if (dvp->v_type != VDIR)
+ return (EPERM);
+
+ if (exclusive == EXCL)
+ return (EEXIST);
+
+ /*
+ * No writable files in top-level proc dir. We check this to avoid
+ * getting a non-proc node via "..".
+ */
+ if (type != LXPR_PROCDIR &&
+ lxpr_lookup(dvp, nm, &vp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) {
+ lxpr_nodetype_t ftype = VTOLXP(vp)->lxpr_type;
+ if (!lxpr_is_writable(ftype)) {
+ VN_RELE(vp);
+ vp = NULL;
+ }
+ }
+
+ if (vp != NULL) {
+ ASSERT(vp->v_type != VDIR);
+
+ /* confirm permissions against existing file */
+ if ((error = lxpr_access(vp, mode, 0, cr, ct)) != 0) {
+ VN_RELE(vp);
+ return (error);
+ }
+
+ *vpp = vp;
+ return (0);
+ }
+
+ /*
+ * Linux proc does not allow creation of additional, non-subsystem-
+ * specific files inside the hierarchy.  ENOENT is tossed when such
+ * actions are attempted.
+ */
+ return (ENOENT);
+}
diff --git a/usr/src/uts/common/brand/lx/sys/lx_acl.h b/usr/src/uts/common/brand/lx/sys/lx_acl.h
new file mode 100644
index 0000000000..1e5ab26407
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_acl.h
@@ -0,0 +1,45 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017 Joyent, Inc.
+ */
+
+#ifndef _LX_ACL_H
+#define _LX_ACL_H
+
+#include <sys/vnode.h>
+#include <sys/uio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Both fall under the 'system.' namespace */
+#define LX_XATTR_POSIX_ACL_ACCESS "posix_acl_access"
+#define LX_XATTR_POSIX_ACL_DEFAULT "posix_acl_default"
+
+enum lx_acl_type {
+ LX_ACL_ACCESS,
+ LX_ACL_DEFAULT
+};
+
+extern int lx_acl_setxattr(vnode_t *, enum lx_acl_type, void *, size_t);
+extern int lx_acl_getxattr(vnode_t *, enum lx_acl_type, void *, size_t,
+ ssize_t *);
+extern int lx_acl_removexattr(vnode_t *, enum lx_acl_type);
+extern int lx_acl_listxattr(vnode_t *, uio_t *);
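+
+/*
+ * Usage sketch (an assumption; buf and bufsize are hypothetical caller
+ * state): an xattr emulation handling "system.posix_acl_access" would be
+ * expected to route through these entry points, e.g.
+ *
+ *	ssize_t sz;
+ *	int err = lx_acl_getxattr(vp, LX_ACL_ACCESS, buf, bufsize, &sz);
+ */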
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_ACL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_audit.h b/usr/src/uts/common/brand/lx/sys/lx_audit.h
new file mode 100644
index 0000000000..76686dd9ec
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_audit.h
@@ -0,0 +1,38 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright 2018 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _LX_AUDIT_H
+#define _LX_AUDIT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void lx_audit_init(int (*)(void *, uint_t, const char *, uint_t));
+extern void lx_audit_cleanup(void);
+extern void lx_audit_stop_worker(void *, void (*)(void *, boolean_t));
+extern int lx_audit_append_rule(void *, uint_t);
+extern int lx_audit_delete_rule(void *, uint_t);
+extern void lx_audit_list_rules(void *,
+ void (*)(void *, void *, uint_t, void *, uint_t));
+extern void lx_audit_get_feature(void *, void (*)(void *, void *, uint_t));
+extern void lx_audit_get(void *, void (*)(void *, void *, uint_t));
+extern int lx_audit_set(void *, void *, uint_t, void (*cb)(void *, boolean_t));
+extern void lx_audit_emit_user_msg(uint_t, uint_t, char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_AUDIT_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs.h b/usr/src/uts/common/brand/lx/sys/lx_autofs.h
new file mode 100644
index 0000000000..17b19895f4
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_autofs.h
@@ -0,0 +1,511 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _LX_AUTOFS_H
+#define _LX_AUTOFS_H
+
+/*
+ * The lxautofs filesystem and driver exist to emulate the Linux autofs
+ * filesystem and /dev/autofs device (this code emulates both). The
+ * purpose is to provide support for the Linux "automount" automounter.
+ *
+ * The device ioctls map fairly closely to the filesystem ioctls. The device
+ * ioctls have superseded the filesystem ioctls and the automounter will
+ * use the device ioctls if the device exists.
+ *
+ * The device ioctls are used by the automounter to perform recovery
+ * in cases where the automounter is restarted while mounts are present. It
+ * also allows for better management operations when a filesystem is mounted
+ * on top of an autofs mountpoint, as in the case of an NFS direct mount on
+ * top of an autofs mount.
+ *
+ *
+ * +++ Linux automounter background.
+ *
+ * Linux has two automounters: "amd" (not used in any popular, modern distro)
+ * and "automount".
+ *
+ * "automount" is the normal Linux automounter. It utilizes a kernel
+ * filesystem (autofs) and device (/dev/autofs) to provide its functionality.
+ * Basically, it mounts the autofs filesystem at any automounter controlled
+ * mountpoint. This filesystem then intercepts and redirects lookup operations
+ * to the userland automounter process via a pipe. The pipe to the automounter
+ * is established via a mount option when the autofs filesystem is mounted or
+ * via the setpipefd ioctl if the automounter restarts. When the automounter
+ * receives a request via this pipe, it does lookups (or unmounts) to whatever
+ * backing store it's configured to use, does mkdir operations on the autofs
+ * filesystem, mounts remote NFS filesystems on any directories it manages or
+ * just created, and signals the autofs device via an ioctl to let it know
+ * that the lookup (or expire) can continue. Other management operations (such
+ * as querying expiration for unmounting) are performed using the autofs device.
+ *
+ *
+ * +++ Linux autofs documentation.
+ *
+ * Within the Linux src tree, see the file:
+ * Documentation/filesystems/autofs4-mount-control.txt
+ * This documents some of the autofs behavior and the device driver ioctls.
+ *
+ * The following URL (https://lwn.net/Articles/606960/) documents autofs in
+ * general. This patch was targeted for Documentation/filesystems/autofs4.txt,
+ * but seems to have never been integrated into the Linux src tree.
+ *
+ *
+ * +++ Linux autofs (and automount daemon) notes
+ *
+ * Since we're mimicking the behavior of the Linux autofs filesystem and
+ * device, we document some of the observed behavior here.
+ *
+ * There are multiple versions of the autofs filesystem kernel API protocol
+ * and modern implementations of the user-land automount daemon depend
+ * on v5, although the filesystem API has been superseded by the driver ioctl
+ * API, which is roughly similar.
+ *
+ * We'll describe the filesystem ioctls first, since support for those was
+ * implemented first. The device ioctls roughly correspond to the filesystem
+ * ioctls and were implemented last, but the automounter will use those
+ * ioctls, instead of the filesystem ioctls, when the device is present.
+ *
+ * Our original autofs implementation was developed in the mid-2000s around the
+ * v2 protocol, but that is currently obsolete. Our current implementation is
+ * based around the v5 protocol API. There was no autofs device support at that
+ * time.
+ *
+ * The automounter supports 3 different, mutually exclusive, mount options for
+ * each mountpoint:
+ * - indirect (this was all you got with the v2 support)
+ * - direct
+ * - offset
+ *
+ * An 'indirect' mountpoint is managed with dynamic mounts below that
+ * mountpoint. For example, if '/home' were an indirect autofs mount, then
+ * accessing a username under /home would traverse the 'lookup' code described
+ * below, cause a local subdirectory to be created, and a mount, usually NFS,
+ * onto that username subdirectory.
+ *
+ * A 'direct' mountpoint is an autofs mountpoint which will trigger the
+ * mounting of another filesystem overtop that mountpoint when accessed.
+ *
+ * An 'offset' mountpoint behaves like a 'direct' mountpoint but it is
+ * created dynamically by the automounter underneath an 'indirect' mountpoint.
+ * For example, if '/net' were an indirect autofs mountpoint and the host
+ * 'jurassic' exported two NFS filesystems; '/var/crash' and '/var/core', then
+ * accessing '/net/jurassic' would trigger the automounter to create two
+ * subdirectories; '/net/jurassic/var/crash' and '/net/jurassic/var/core'. The
+ * automounter would then mount an autofs offset mount onto each one of these
+ * directories. Accessing either of those directories would then trigger
+ * automounter to perform another mount on top, as is done with a 'direct'
+ * mount.
+ *
+ * General behavior
+ *
+ * A) Autofs allows root owned, non-automounter processes to create
+ * directories in the autofs filesystem. The autofs filesystem treats the
+ * automounter's process group as special, but it doesn't prevent root
+ * processes outside of the automounter's process group from creating new
+ * directories in the autofs filesystem.
+ *
+ * B) Autofs doesn't allow creation of any non-directory entries in the
+ * autofs filesystem. No entity can create files (e.g. /bin/touch or
+ * VOP_CREATE/VOP_SYMLINK/etc.) The only entries that can exist within
+ * the autofs filesystem are directories.
+ *
+ * C) Autofs only intercepts vop lookup operations. Notably, it does _not_
+ * intercept and re-direct vop readdir operations. This means that the
+ * observed behavior of the Linux automounter can be considerably different
+ * from that of the illumos automounter. Specifically, on illumos if an autofs
+ * mountpoint is mounted _without_ the -nobrowse option then if a user does
+ * an ls operation (which translates into a vop readdir operation) then the
+ * automounter will intercept that operation and list all the possible
+ * directories and mountpoints without actually mounting any filesystems.
+ * Essentially, all automounter managed mountpoints on Linux will behave
+ * like "-nobrowse" mountpoints on illumos. Here's an example to illustrate
+ * this. If /ws was mounted on illumos without the -nobrowse option and an
+ * auto_ws yp map was setup as the backing store for this mountpoint, then an
+ * "ls /ws" would list all the keys in the map as valid directories, but an
+ * "ls /ws" on Linux would list an empty directory.
+ *
+ * D) NFS mounts are performed by the automount process. When the automount
+ * process gets a redirected lookup request, it determines _all_ the
+ * possible remote mountpoints for that request, creates directory paths
+ * via mkdir, and mounts the remote filesystems on the newly created paths.
+ * This is described in the offset mount example above. Once the automounter
+ * completed the mounts it would signal the autofs filesystem (via an ioctl)
+ * that the lookup could continue.
+ *
+ * E.1) Autofs only redirects vop lookup operations for path entries that
+ * don't already exist in the autofs filesystem. So for the example above,
+ * an initial (after the start of the automounter) "ls /net/jurassic" would
+ * result in a request to the automounter. A subsequent "ls /net/jurassic"
+ * would not result in a request to the automounter. Even if
+ * /net/jurassic/var/crash and /net/jurassic/var/core were manually unmounted
+ * after the initial "ls /net/jurassic", a subsequent "ls /net/jurassic"
+ * would not result in a new request to the automounter.
+ *
+ * E.2) Autofs lookup requests that are sent to the automounter only include
+ * the root directory path component. So for example, after starting up
+ * the automounter if a user were to do a "ls /net/jurassic/var/crash", the
+ * initial lookup request actually sent to the automounter would just be for
+ * "jurassic" (the same request as if the user had done "ls /net/jurassic").
+ * After the initial mounting of the two offset mounts onto crash and core the
+ * lookup would continue and a final lookup request would be sent to the
+ * automounter for "crash" (but this would be on a different vfs from the
+ * /net vfs).
+ *
+ * E.3) The two statements above aren't entirely true. The Linux
+ * autofs filesystem will also redirect lookup operations for leaf
+ * directories that don't have a filesystem mounted on them. Using the
+ * example above, if a user did a "ls /net/jurassic", then manually
+ * unmounted /net/jurassic/var/crash, and then did an "ls
+ * /net/jurassic/var/crash", this would result in a request for
+ * "jurassic/var/crash" being sent to the automounter. The strange thing
+ * (a Linux bug perhaps) is that the automounter won't do anything with this
+ * request and the lookup will fail.
+ *
+ * F) The autofs filesystem communication protocol (what ioctls it supports
+ * and what data it passes to the automount process) is versioned. The
+ * userland automount daemon (as of version v5.0.7) expects v5 of the protocol
+ * (by running the AUTOFS_IOC_PROTOSUBVER ioctl), and exits if that is not
+ * supported. For v2-v5 the structure passed through the pipe always begins
+ * with a common header followed by different fields depending on the packet
+ * type. In addition the different versions support additional ioctls.
+ *
+ * v2 - basic lookup request
+ * v3 - adds expiring (umounting)
+ * v4 - adds expire multi
+ * v5 - adds missing indirect, expire indirect, missing direct & expire direct.
+ * Defines a new protocol structure layout.
+ * The v5 'missing indirect' and 'missing direct' ioctls are analogous to
+ * the v2 'missing' ioctl. These ioctls are used to initiate a mount via
+ * a lookup. The 'expire' ioctls are used by the automounter to query if
+ * it is possible to unmount the filesystem. 'direct' and 'indirect'
+ * refer to the mount option type that the automounter performed and
+ * correlate to an automounter direct or indirect map mountpoint.
+ *
+ * G) The automounter periodically issues an 'expire' ioctl to autofs to
+ * obtain the name of a mountpoint which the automounter can unmount.
+ * Unmounting is discussed in more detail below.
+ *
+ * H) The device ioctls roughly correspond to the filesystem ioctls, but
+ * instead of being tied to an autofs mountpoint vnode, they can be called at any
+ * time. The argument structure uses either a path or an autofs pipe file
+ * descriptor to indicate what is being operated on.
+ *
+ * +++ lxautofs notes
+ *
+ * 1) In general, the lxautofs filesystem tries to mimic the behavior of the
+ * Linux autofs filesystem with the following exceptions:
+ *
+ * 1.1) We don't bother to implement the E.3 functionality listed above
+ * since it doesn't appear to be of any use.
+ *
+ * 1.2) We only fully implement v2 and v5 of the autofs protocol.
+ *
+ * 2) In general, the approach taken for lxautofs is to keep it as simple
+ * as possible and to minimize its memory usage. To do this, all information
+ * about the contents of the lxautofs filesystem is mirrored in the
+ * underlying filesystem that lxautofs is mounted on and most vop operations
+ * are simply passed on to this underlying filesystem. This means we don't
+ * have to implement most of the complex operations that a full filesystem
+ * normally has to implement. It also means that most of our filesystem state
+ * (wrt the contents of the filesystem) doesn't actually have to be stored
+ * in memory, we can simply go to the underlying filesystem to get it when
+ * it's requested. For the purposes of discussion, we'll call the underlying
+ * filesystem the "backing store."
+ *
+ * The backing store is actually a directory called ".lxautofs" which is created
+ * in the directory where the lxautofs filesystem is mounted. When the
+ * lxautofs filesystem is unmounted this backing store directory is deleted.
+ * If this directory exists at mount time (perhaps the system crashed while a
+ * previous lxautofs instance was mounted at the same location) it will be
+ * deleted. There are a few implications of using a backing store worth
+ * mentioning.
+ *
+ * 2.1) lxautofs can't be mounted on a read only filesystem. If this
+ * proves to be a problem we can probably move the location of the
+ * backing store.
+ *
+ * 2.2) If the backing store filesystem runs out of space then the
+ * automounter process won't be able to create more directories and mount
+ * new filesystems. Of course, strange failures usually happen when
+ * filesystems run out of space.
+ *
+ * 3) Why aren't we using gfs? gfs has two different usage models.
+ *
+ * 3.1) I'm my own filesystem but I'm using gfs to help with managing
+ * readdir operations.
+ *
+ * 3.2) I'm a gfs filesystem and gfs is managing all my vnodes
+ *
+ * We're not using the 3.1 interfaces because we don't implement readdir
+ * ourselves. We pass all readdir operations on to the backing store
+ * filesystem and utilize its readdir implementation.
+ *
+ * We're not using the 3.2 interfaces because they are really designed for
+ * in memory filesystems where all of the filesystem state is stored in
+ * memory. They don't lend themselves to filesystems where part of the
+ * state is in memory and part of the state is on disk.
+ *
+ * For more information on gfs take a look at the block comments in the
+ * top of gfs.c
+ *
+ * 4) Unmounting
+ *
+ * The automounter has a timeout associated with each mount. It informs autofs
+ * of this timeout using the LX_AUTOFS_DEV_IOC_TIMEOUT_CMD ioctl after autofs
+ * has been mounted on the mountpoint.
+ *
+ * After the automounter has mounted something associated with the mountpoint
+ * then periodically (<timeout>/4 seconds) the automounter will issue the
+ * LX_AUTOFS_DEV_IOC_EXPIRE_CMD ioctl on the autofs mount. autofs is expected
+ * to respond with an underlying mountpoint entry which is a candidate for
+ * unmounting. The automounter will attempt to unmount the filesystem
+ * (which may fail if it is busy, since this is obviously racy) and then
+ * acknowledge the expire ioctl. The successful acknowledgement is independent
+ * of the success of unmounting the underlying filesystem.
+ *
+ * Unmount handling varies based on which type of mount the autofs was mounted
+ * with (indirect, direct or offset).
+ *
+ * To support 'indirect' mount expiration, the autofs vfs keeps track of the
+ * filesystems mounted immediately under the autofs mountpoint (in
+ * lav_mnt_list) after a lookup has completed successfully. Upon receipt of the
+ * LX_AUTOFS_DEV_IOC_EXPIRE_CMD ioctl, autofs removes the first element from
+ * the list, attempts to check if it is busy and if not, returns that mountpoint
+ * over the fifo (if busy the entry is added to the end of the list). When the
+ * ioctl is acknowledged, if the mountpoint still exists, that means the unmount
+ * failed and the entry is added at the back of the list. If there are no
+ * elements or the first one is busy, EAGAIN is returned for the 'expire' ioctl
+ * and the automounter will check again in <timeout>/4 seconds.
+ *
+ * For example, if /home is an autofs indirect mount, then there are typically
+ * many different {username}-specific NFS mounts under that /home autofs mount.
+ * autofs uses the lav_mnt_list to respond to 'expire' ioctls in a round-robin
+ * fashion so that the automounter can unmount user file systems that aren't in
+ * use.
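+ *
+ * In rough pseudocode (a sketch of the scheme above, not the literal
+ * implementation), indirect expire handling looks like:
+ *
+ *	e = remove first entry of lav_mnt_list;
+ *	if (e == NULL)
+ *		return EAGAIN;
+ *	if (e is busy) {
+ *		re-add e at the tail;
+ *		return EAGAIN;
+ *	}
+ *	send e->lxafme_path over the fifo;
+ *	on acknowledgement, if the path is still mounted (the unmount
+ *	failed), re-add e at the tail;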
+ *
+ * Expiring 'direct' mounts is similar, but since there is only a single mount,
+ * the lav_mnt_list only will have at most one entry if there is a filesystem
+ * mounted overtop of the autofs mount.
+ *
+ * Expiring 'offset' mounts is more complicated because there are at least
+ * two different autofs VFSs involved (the top-level and one for each offset
+ * mount underneath). The actual offset mount is handled exactly like a 'direct'
+ * mount. The top-level is an indirect mount and is handled in a similar way
+ * as described above for indirect mounts, but special handling is needed for
+ * each offset mount below.
+ *
+ * This can be explained using the same 'jurassic' example described earlier
+ * (/net is an autofs 'indirect' mount and the host 'jurassic' has two exported
+ * file systems; /var/crash and /var/core). If the user accesses
+ * /net/jurassic/var/crash then the automounter would setup the system so that
+ * the following mounts exist:
+ * - /net (the original autofs indirect mount which triggers everything)
+ * - /net/jurassic/var/crash (autofs offset mount)
+ * - /net/jurassic/var/crash (NFS mount on top of the autofs offset mount)
+ * - /net/jurassic/var/core (autofs offset mount)
+ *
+ * For expiration the automounter will issue the LX_AUTOFS_IOC_EXPIRE_MULTI
+ * ioctl on each autofs vfs for which something is mounted, so we would receive
+ * an expire ioctl on /net and another on /net/jurassic/var/crash. The vfs for
+ * /net will be tracking "jurassic", but we detect it is busy and won't do
+ * anything at first. The vfs for "crash" will work like a direct mount and
+ * acknowledge the expire ioctl to the automounter once that filesystem times
+ * out and is no longer busy. The automounter will then unmount the "crash"
+ * NFS mount.
+ *
+ * Once the "crash" NFS mount has been unmounted by the automounter, we're left
+ * with the two autofs offset mounts under jurassic. The automounter will not
+ * try to unmount either of those, so we have to do that. Once we get another
+ * expire ioctl on /net and check "jurassic", we'll see there are only autofs
+ * mounts under /net/jurassic. We umount those using the lx_autofs_umount_offset
+ * function and respond to the automounter expire ioctl with "jurassic", in the
+ * same way as we would for any other indirect mount.
+ *
+ * 5) Recovery
+ *
+ * If the automounter is restarted for any reason, it needs to cope with
+ * pre-existing autofs mounts, as well as other automount-initiated mounts (e.g.
+ * a direct mount on top of an autofs mountpoint). The automounter uses the
+ * /proc/mounts file to correlate mounts to the managed mountpoints. It then
+ * uses the /dev/autofs device to openmount each of the autofs devices and
+ * reinitialize them using the various dev ioctls (timeout, requester, etc.).
+ *
+ * In general, the automounter will closemount the mountpoint once it's done,
+ * but it doesn't in the case of an offset mountpoint with nothing mounted
+ * on top. In this case the automounter expects autofs to expire that mountpoint
+ * before it will closemount (so things can subsequently cleanup). We handle
+ * this special case in the expire code path.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Note that the name of the actual file system is lxautofs, not lx_autofs, but
+ * the code uses lx_autofs to prefix the various names. This is because file
+ * system names are limited to 8 characters.
+ */
+#define LX_AUTOFS_NAME "lxautofs"
+
+#define LX_AUTOFS_MINORNAME "autofs"
+
+/*
+ * Mount options supported.
+ */
+#define LX_MNTOPT_FD "fd"
+#define LX_MNTOPT_PGRP "pgrp"
+#define LX_MNTOPT_MINPROTO "minproto"
+#define LX_MNTOPT_MAXPROTO "maxproto"
+#define LX_MNTOPT_INDIRECT "indirect"
+#define LX_MNTOPT_DIRECT "direct"
+#define LX_MNTOPT_OFFSET "offset"
+
+/*
+ * Version/subversion of the Linux kernel automount protocol we support.
+ *
+ * We fully support v2 and v5. We'll return ENOTSUP for all of the ioctls we
+ * don't yet handle.
+ */
+#define LX_AUTOFS_PROTO_VERS5 5
+#define LX_AUTOFS_PROTO_SUBVERSION 2
+#define LX_AUTOFS_PROTO_VERS2 2
+
+/* packet types */
+typedef enum laph_ptype {
+ LX_AUTOFS_PTYPE_MISSING, /* 0 */
+ LX_AUTOFS_PTYPE_EXPIRE, /* 1 */
+ LX_AUTOFS_PTYPE_EXPIRE_MULTI, /* 2 */
+ LX_AUTOFS_PTYPE_MISSING_INDIR, /* 3 */
+ LX_AUTOFS_PTYPE_EXPIRE_INDIR, /* 4 */
+ LX_AUTOFS_PTYPE_MISSING_DIRECT, /* 5 */
+ LX_AUTOFS_PTYPE_EXPIRE_DIRECT /* 6 */
+} laph_ptype_t;
+
+/*
+ * Common header for all versions of the protocol.
+ */
+typedef struct lx_autofs_pkt_hdr {
+ int laph_protover; /* protocol version number */
+ laph_ptype_t laph_type;
+ int laph_id; /* every pkt must have a unique id */
+} lx_autofs_pkt_hdr_t;
+
+/*
+ * Command structure sent to automount process from lxautofs via a pipe.
+ * This structure is the same for v2-v4 of the automount protocol
+ * (the communication pipe is established at mount time).
+ */
+typedef struct lx_autofs_v2_pkt {
+ lx_autofs_pkt_hdr_t lap_hdr;
+	int lap_name_len; /* don't include newline or NUL */
+ char lap_name[256]; /* path component to lookup */
+} lx_autofs_v2_pkt_t;
+
+/* v4 multi-expire */
+typedef struct lx_autofs_v4_exp_pkt {
+ lx_autofs_pkt_hdr_t lape_hdr;
+ int lape_len;
+ char lape_name[MAXNAMELEN];
+} lx_autofs_v4_exp_pkt_t;
+
+/* v5 */
+typedef struct lx_autofs_v5_pkt {
+ lx_autofs_pkt_hdr_t lap_hdr;
+ uint32_t lap_dev;
+ uint64_t lap_ino;
+ uint32_t lap_uid;
+ uint32_t lap_gid;
+ uint32_t lap_pid;
+ uint32_t lap_tgid;
+ uint32_t lap_name_len;
+ char lap_name[256];
+} lx_autofs_v5_pkt_t;
+
+union lx_autofs_pkt {
+ lx_autofs_v2_pkt_t lap_v2;
+ lx_autofs_v5_pkt_t lap_v5;
+};
+
+#define lap_protover lap_v2.lap_hdr.laph_protover
+#define lap_type lap_v2.lap_hdr.laph_type
+#define lap_id lap_v2.lap_hdr.laph_id
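+
+/*
+ * Illustrative sketch (an assumption, not verbatim kernel code): building a
+ * v5 "missing indirect" packet before writing it up the automounter pipe.
+ * Here vfsp and name are hypothetical locals for the autofs VFS data and
+ * the path component being looked up.
+ *
+ *	union lx_autofs_pkt pkt;
+ *
+ *	bzero(&pkt, sizeof (pkt));
+ *	pkt.lap_protover = LX_AUTOFS_PROTO_VERS5;
+ *	pkt.lap_type = LX_AUTOFS_PTYPE_MISSING_INDIR;
+ *	pkt.lap_id = id_alloc(vfsp->lav_ids);
+ *	pkt.lap_v5.lap_dev = vfsp->lav_dev;
+ *	pkt.lap_v5.lap_ino = vfsp->lav_ino;
+ *	pkt.lap_v5.lap_name_len = strlen(name);
+ *	(void) strlcpy(pkt.lap_v5.lap_name, name,
+ *	    sizeof (pkt.lap_v5.lap_name));
+ */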
+
+/*
+ * Ioctls fully supported (v2 protocol).
+ */
+#define LX_AUTOFS_IOC_READY 0x00009360 /* arg: int */
+#define LX_AUTOFS_IOC_FAIL 0x00009361 /* arg: int */
+#define LX_AUTOFS_IOC_CATATONIC 0x00009362 /* arg: <none> */
+
+/*
+ * Ioctls supported (v3/v4 protocol).
+ */
+#define LX_AUTOFS_IOC_PROTOVER 0x80049363 /* arg: int */
+#define LX_AUTOFS_IOC_SETTIMEOUT 0xc0089364 /* arg: ulong_t */
+
+/*
+ * Ioctls not supported (v3/v4 protocol).
+ */
+ /* arg: lx_autofs_v3_exp_pkt_t * */
+#define LX_AUTOFS_IOC_EXPIRE 0x81109365
+
+/*
+ * Ioctls supported (v5 protocol).
+ */
+#define LX_AUTOFS_IOC_PROTOSUBVER 0x80049367 /* arg: int */
+#define LX_AUTOFS_IOC_ASKUMOUNT 0x80049370 /* arg: int */
+#define LX_AUTOFS_IOC_EXPIRE_MULTI 0x40049366 /* arg: int */
+#define LX_AUTOFS_IOC_EXPIRE_INDIRECT LX_AUTOFS_IOC_EXPIRE_MULTI
+#define LX_AUTOFS_IOC_EXPIRE_DIRECT LX_AUTOFS_IOC_EXPIRE_MULTI
+
+/*
+ * autofs device ioctls
+ */
+#define LX_AUTOFS_DEV_IOC_VERSION_CMD 0xc0189371
+#define LX_AUTOFS_DEV_IOC_PROTOVER_CMD 0xc0189372
+#define LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD 0xc0189373
+#define LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD 0xc0189374
+#define LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD 0xc0189375
+#define LX_AUTOFS_DEV_IOC_READY_CMD 0xc0189376
+#define LX_AUTOFS_DEV_IOC_FAIL_CMD 0xc0189377
+#define LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD 0xc0189378
+#define LX_AUTOFS_DEV_IOC_CATATONIC_CMD 0xc0189379
+#define LX_AUTOFS_DEV_IOC_TIMEOUT_CMD 0xc018937a
+#define LX_AUTOFS_DEV_IOC_REQUESTER_CMD 0xc018937b
+#define LX_AUTOFS_DEV_IOC_EXPIRE_CMD 0xc018937c
+#define LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD 0xc018937d
+#define LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD 0xc018937e
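+
+/*
+ * Recovery sketch (an assumption about the userland side; "arg" stands in
+ * for the Linux autofs_dev_ioctl argument structure, which is not defined
+ * in this header): a restarted automounter re-binds to a surviving autofs
+ * mount found in /proc/mounts roughly as follows:
+ *
+ *	fd = open("/dev/autofs", O_RDWR);
+ *	ioctl(fd, LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD, &arg);  (mountpoint path)
+ *	ioctl(fd, LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD, &arg);  (fresh pipe fd)
+ *	ioctl(fd, LX_AUTOFS_DEV_IOC_TIMEOUT_CMD, &arg);    (restore timeout)
+ */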
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_AUTOFS_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h
new file mode 100644
index 0000000000..39ea96d1fe
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h
@@ -0,0 +1,162 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _LX_AUTOFS_IMPL_H
+#define _LX_AUTOFS_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/file.h>
+#include <sys/id_space.h>
+#include <sys/modhash.h>
+#include <sys/vnode.h>
+
+#include <sys/lx_autofs.h>
+
+/*
+ * Space key.
+ * Used to persist data across lx_autofs filesystem module unloads.
+ */
+#define LX_AUTOFS_SPACE_KEY_UDEV LX_AUTOFS_NAME "_udev"
+
+/*
+ * Name of the backing store directory.
+ */
+#define LX_AUTOFS_BS_DIR "." LX_AUTOFS_NAME
+
+#define LX_AUTOFS_VFS_ID_HASH_SIZE 15
+#define LX_AUTOFS_VFS_PATH_HASH_SIZE 15
+#define LX_AUTOFS_VFS_VN_HASH_SIZE 15
+
+enum lx_autofs_mnttype { LXAMT_NONE, LXAMT_INDIR, LXAMT_DIRECT, LXAMT_OFFSET };
+
+typedef struct lx_autofs_mntent {
+ list_node_t lxafme_lst;
+ uint64_t lxafme_ts; /* time stamp */
+ uint_t lxafme_len;
+ char *lxafme_path;
+} lx_autofs_mntent_t;
+
+/*
+ * VFS data object.
+ */
+typedef struct lx_autofs_vfs {
+ /* Info about the underlying filesystem and backing store. */
+ vnode_t *lav_mvp;
+ char *lav_bs_name;
+ vnode_t *lav_bs_vp;
+
+ /* Info about the automounter process managing this filesystem. */
+ int lav_fd;
+ pid_t lav_pgrp;
+ file_t *lav_fifo_wr;
+ file_t *lav_fifo_rd;
+
+ /* The mount's dev and ino values for v5 protocol msg */
+ uint64_t lav_dev;
+ u_longlong_t lav_ino;
+
+ /* options from the mount */
+ enum lx_autofs_mnttype lav_mnttype;
+ int lav_min_proto;
+
+ /*
+ * ioctl-set timeout value. The automounter will perform an expire
+ * ioctl every timeout/4 seconds. We use this to expire a mount once
+ * it is inactive for the full timeout.
+ */
+ ulong_t lav_timeout;
+
+ /* ioctl-set catatonic value (prevents future mounts). */
+ boolean_t lav_catatonic;
+
+ /* Mount initiator's uid/gid for recovery handling. */
+ uid_t lav_uid;
+ gid_t lav_gid;
+
+	/* Each automount request needs a unique id. */
+ id_space_t *lav_ids;
+
+ /* All remaining structure members are protected by lav_lock. */
+ kmutex_t lav_lock;
+ /* openmount counter */
+ int lav_openmnt_cnt;
+
+ /* Hashes to keep track of outstanding automounter requests. */
+ mod_hash_t *lav_path_hash;
+ mod_hash_t *lav_id_hash;
+
+ /* We need to keep track of all our vnodes. */
+ vnode_t *lav_root;
+ mod_hash_t *lav_vn_hash;
+
+ /* list of current mounts */
+ list_t lav_mnt_list;
+} lx_autofs_vfs_t;
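+
+/*
+ * Timing sketch (an assumption): with lxafme_ts tracking when an entry was
+ * last known active, an entry only becomes an expire candidate once
+ *
+ *	now - lxafme_ts > lav_timeout
+ *
+ * even though the automounter polls with its expire ioctl every
+ * lav_timeout/4 seconds.
+ */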
+
+enum lx_autofs_callres { LXACR_NONE, LXACR_READY, LXACR_FAIL };
+
+/*
+ * Structure to keep track of automounter requests sent to user-land.
+ */
+typedef struct lx_autofs_automnt_req {
+ /* Packet that gets sent to the automounter. */
+ union lx_autofs_pkt laar_pkt;
+ int laar_pkt_size;
+
+ /* Reference count. Always updated atomically. */
+ uint_t laar_ref;
+
+ /*
+	 * Fields to track and synchronize threads waiting on a lookup.
+	 * These fields are protected by laar_lock.
+ */
+ kmutex_t laar_lock;
+ kcondvar_t laar_cv;
+ int laar_complete;
+
+ enum lx_autofs_callres laar_result;
+} lx_autofs_automnt_req_t;
+
+/*
+ * Generic stack structure.
+ */
+typedef struct stack_elem {
+ list_node_t se_list;
+ caddr_t se_ptr1;
+ caddr_t se_ptr2;
+ caddr_t se_ptr3;
+} stack_elem_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_AUTOFS_IMPL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h
new file mode 100644
index 0000000000..9c1579cc82
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h
@@ -0,0 +1,778 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _LX_BRAND_H
+#define _LX_BRAND_H
+
+#ifndef _ASM
+#include <sys/types.h>
+#include <sys/cpuvar.h>
+#include <sys/zone.h>
+#include <sys/ksocket.h>
+#include <sys/vfs.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/cpuvar.h>
+#include <sys/lx_futex.h>
+#include <sys/lx_userhz.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LX_BRANDNAME "lx"
+
+/*
+ * Brand uname info
+ */
+#define LX_UNAME_SYSNAME "Linux"
+#define LX_UNAME_RELEASE_2_6 "2.6.18"
+#define LX_UNAME_RELEASE_2_4 "2.4.21"
+#define LX_UNAME_VERSION "BrandZ virtual linux"
+#define LX_UNAME_MACHINE32 "i686"
+#define LX_UNAME_MACHINE64 "x86_64"
+
+#define LX_LIB_PATH32 "/native/usr/lib/lx_brand.so.1"
+#define LX_LIB_PATH64 "/native/usr/lib/amd64/lx_brand.so.1"
+
+#define LX_VDSO_PATH32 "/native/usr/lib/brand/lx/lx_vdso.so.1"
+#define LX_VDSO_PATH64 "/native/usr/lib/brand/lx/amd64/lx_vdso.so.1"
+
+#if defined(_LP64)
+#define LX_LIB_PATH LX_LIB_PATH64
+#define LX_UNAME_MACHINE LX_UNAME_MACHINE64
+#define LX_VDSO_PATH LX_VDSO_PATH64
+#else
+#define LX_LIB_PATH LX_LIB_PATH32
+#define LX_UNAME_MACHINE LX_UNAME_MACHINE32
+#define LX_VDSO_PATH LX_VDSO_PATH32
+#endif
+
+/*
+ * This must be large enough for both the 32-bit table and 64-bit table.
+ */
+#define LX_NSYSCALLS 358
+
+/* Highest capability we know about */
+#define LX_CAP_MAX_VALID 36
+
+/* sched attr flag values */
+#define LX_SCHED_FLAG_RESET_ON_FORK 0x1
+/*
+ * brand(2) subcommands
+ *
+ * Everything >= 128 is a brand-specific subcommand.
+ * > 192 is reserved for in-kernel emulated system calls.
+ */
+#define B_LPID_TO_SPAIR 128
+#define B_GET_CURRENT_CONTEXT 129
+#define B_EMULATION_DONE 130
+#define B_START_NFS_LOCKD 131
+#define B_BLOCK_ALL_SIGS 132
+#define B_UNBLOCK_ALL_SIGS 133
+#define B_PTRACE_CLONE_BEGIN 134
+#define B_PTRACE_STOP_FOR_OPT 135
+#define B_UNSUPPORTED 136
+#define B_STORE_ARGS 137
+#define B_GETPID 138
+#define B_JUMP_TO_LINUX 139
+#define B_ALL_SIGS_BLOCKED 140
+#define B_EXIT_AS_SIG 141
+/* formerly B_HELPER_WAITID 142 */
+#define B_HELPER_CLONE 143
+#define B_HELPER_SETGROUPS 144
+#define B_HELPER_SIGQUEUE 145
+#define B_HELPER_TGSIGQUEUE 146
+#define B_SET_NATIVE_STACK 147
+/* formerly B_SIGEV_THREAD_ID 148 */
+#define B_OVERRIDE_KERN_VER 149
+#define B_PTRACE_SIG_RETURN 150
+#define B_GET_PERSONALITY 151
+
+#ifndef _ASM
+/*
+ * Support for Linux PTRACE_SETOPTIONS handling.
+ */
+typedef enum lx_ptrace_options {
+ LX_PTRACE_O_TRACESYSGOOD = 0x0001,
+ LX_PTRACE_O_TRACEFORK = 0x0002,
+ LX_PTRACE_O_TRACEVFORK = 0x0004,
+ LX_PTRACE_O_TRACECLONE = 0x0008,
+ LX_PTRACE_O_TRACEEXEC = 0x0010,
+ LX_PTRACE_O_TRACEVFORKDONE = 0x0020,
+ LX_PTRACE_O_TRACEEXIT = 0x0040,
+ LX_PTRACE_O_TRACESECCOMP = 0x0080
+} lx_ptrace_options_t;
+
+#define LX_PTRACE_O_ALL \
+ (LX_PTRACE_O_TRACESYSGOOD | LX_PTRACE_O_TRACEFORK | \
+ LX_PTRACE_O_TRACEVFORK | LX_PTRACE_O_TRACECLONE | \
+ LX_PTRACE_O_TRACEEXEC | LX_PTRACE_O_TRACEVFORKDONE | \
+ LX_PTRACE_O_TRACEEXIT | LX_PTRACE_O_TRACESECCOMP)
+#endif /* !_ASM */
+
+/* siginfo si_status for traced events */
+#define LX_PTRACE_EVENT_FORK 0x100
+#define LX_PTRACE_EVENT_VFORK 0x200
+#define LX_PTRACE_EVENT_CLONE 0x300
+#define LX_PTRACE_EVENT_EXEC 0x400
+#define LX_PTRACE_EVENT_VFORK_DONE 0x500
+#define LX_PTRACE_EVENT_EXIT 0x600
+#define LX_PTRACE_EVENT_SECCOMP 0x700
+
+/*
+ * Brand-private values for the "pr_what" member of lwpstatus, for use with the
+ * PR_BRAND stop reason. These reasons are validated in lx_stop_notify();
+ * update it if you add new reasons here.
+ */
+#define LX_PR_SYSENTRY 1
+#define LX_PR_SYSEXIT 2
+#define LX_PR_SIGNALLED 3
+#define LX_PR_EVENT 4
+
+
+#define LX_VERSION_1 1
+#define LX_VERSION LX_VERSION_1
+
+#define LX_ATTR_KERN_RELEASE ZONE_ATTR_BRAND_ATTRS
+#define LX_ATTR_KERN_VERSION (ZONE_ATTR_BRAND_ATTRS + 1)
+#define LX_ATTR_TTY_GID (ZONE_ATTR_BRAND_ATTRS + 2)
+
+/*
+ * Aux vector containing phdr of Linux executable and ehdr of interpreter
+ * (if any), both of which are used by lx_librtld_db to ascertain r_debug.
+ * We repurpose the 4th brand-specific aux vector slot for the Linux
+ * AT_SYSINFO_EHDR entry (we modify the a_type in the brand library).
+ */
+#define AT_SUN_BRAND_LX_PHDR AT_SUN_BRAND_AUX1
+#define AT_SUN_BRAND_LX_INTERP AT_SUN_BRAND_AUX2
+#define AT_SUN_BRAND_LX_CLKTCK AT_SUN_BRAND_AUX3
+#define AT_SUN_BRAND_LX_SYSINFO_EHDR AT_SUN_BRAND_AUX4
+
+/* Aux vectors containing real/effective user/group IDs */
+#define AT_LX_UID 11
+#define AT_LX_EUID 12
+#define AT_LX_GID 13
+#define AT_LX_EGID 14
+/* Aux vector containing hz value */
+#define AT_CLKTCK 17
+/* Aux vector containing secure boolean */
+#define AT_SECURE 23
+/* Aux vector containing vDSO addr */
+#define AT_SYSINFO_EHDR 33
+
+/*
+ * Usermode emulation routines are run on an alternate stack allocated by
+ * the brand library. Every LWP in a process will incur this overhead beyond
+ * the regular thread stack:
+ */
+#define LX_NATIVE_STACK_PAGE_COUNT 64
+
+/*
+ * When returning in a new child process created with vfork(2) (or CLONE_VFORK)
+ * we discard some of the native stack to prevent corruption of the parent
+ * emulation state.
+ */
+#define LX_NATIVE_STACK_VFORK_GAP 0x3000
+
+#ifndef _ASM
+
+extern struct brand lx_brand;
+
+typedef struct lx_brand_registration {
+ uint_t lxbr_version; /* version number */
+ void *lxbr_handler; /* base address of handler */
+ uint32_t lxbr_flags; /* LX_PROC_* registration flags */
+} lx_brand_registration_t;
+
+typedef struct lx_brand_registration32 {
+ uint_t lxbr_version; /* version number */
+ uint32_t lxbr_handler; /* base address of handler */
+ uint32_t lxbr_flags; /* LX_PROC_* registration flags */
+} lx_brand_registration32_t;
+
+#endif /* _ASM */
+
+/*
+ * GDT usage
+ */
+#define GDT_TLSMIN (GDT_BRANDMIN)
+#define GDT_TLSMAX (GDT_TLSMIN + 2)
+#define LX_TLSNUM (GDT_TLSMAX - GDT_TLSMIN)
+
+#ifndef _ASM
+
+/*
+ * Stores information needed by the lx linker to launch the main
+ * lx executable.
+ */
+typedef struct lx_elf_data64 {
+ uintptr_t ed_phdr;
+ uintptr_t ed_phent;
+ uintptr_t ed_phnum;
+ uintptr_t ed_entry;
+ uintptr_t ed_base;
+ uintptr_t ed_ldentry;
+} lx_elf_data64_t;
+
+typedef struct lx_elf_data32 {
+ uint32_t ed_phdr;
+ uint32_t ed_phent;
+ uint32_t ed_phnum;
+ uint32_t ed_entry;
+ uint32_t ed_base;
+ uint32_t ed_ldentry;
+} lx_elf_data32_t;
+
+#if defined(_LP64)
+typedef lx_elf_data64_t lx_elf_data_t;
+#else
+typedef lx_elf_data32_t lx_elf_data_t;
+#endif
+
+typedef enum lx_proc_flags {
+ /* flags configurable via brandsys() and members of LX_PROC_ALL */
+ LX_PROC_INSTALL_MODE = 0x01,
+ LX_PROC_STRICT_MODE = 0x02,
+ /* internal flags */
+ LX_PROC_CHILD_DEATHSIG = 0x04,
+ LX_PROC_NO_DUMP = 0x08 /* for lx_prctl LX_PR_[GS]ET_DUMPABLE */
+} lx_proc_flags_t;
+
+#define LX_PROC_ALL (LX_PROC_INSTALL_MODE | LX_PROC_STRICT_MODE)
+
+/* Maximum length for fields of LX uname */
+#define LX_SYS_UTS_LN 65
+
+/* Max. length of kernel release string */
+#define LX_KERN_RELEASE_MAX LX_SYS_UTS_LN
+#define LX_KERN_VERSION_MAX LX_SYS_UTS_LN
+
+#ifdef _KERNEL
+
+/*
+ * Entry points for cgroup integration.
+ */
+extern void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t);
+extern void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t);
+
+#define LX_RLFAKE_LOCKS 0
+#define LX_RLFAKE_NICE 1
+#define LX_RLFAKE_RTPRIO 2
+#define LX_RLFAKE_RTTIME 3
+
+#define LX_RLFAKE_NLIMITS 4
+
+#define LX_RLIM64_INFINITY (~0ULL)
+
+typedef struct {
+ uint64_t rlim_cur;
+ uint64_t rlim_max;
+} lx_rlimit64_t;
+
+typedef struct {
+ list_node_t lx_clgrpm_link;
+ proc_t *lx_clgrpm_pp;
+} lx_clone_grp_member_t;
+
+typedef struct {
+ kmutex_t lx_clgrp_lock; /* protects cnt & member list */
+ uint_t lx_clgrp_cnt;
+ list_t lx_clgrp_members;
+} lx_clone_grp_t;
+
+/* Entries in the l_clone_grps clone-group array */
+#define LX_CLGRP_FS 0
+#define LX_CLGRP_MAX 1
+
+/* See explanation in lx_mem.c about lx_mremap */
+#define LX_REMAP_ANONCACHE_NENTRIES 4
+typedef struct lx_segmap {
+ uintptr_t lxsm_vaddr; /* virtual address of mapping */
+ size_t lxsm_size; /* size of mapping in bytes */
+ uint64_t lxsm_lru; /* LRU field for cache */
+ uint_t lxsm_flags; /* protection and attribute flags */
+} lx_segmap_t;
+
+typedef struct lx_proc_data {
+ uintptr_t l_handler; /* address of user-space handler */
+ pid_t l_ppid; /* pid of originating parent proc */
+ uid_t l_loginuid; /* /proc/{pid}/loginuid */
+ int64_t l_ptrace; /* count of process lwps observed by ptrace */
+ lx_elf_data_t l_elf_data; /* ELF data for linux executable */
+ /* signal to deliver to parent when this thread group dies */
+ int l_signal;
+ /* native signal to deliver to process when parent dies */
+ int l_parent_deathsig;
+ lx_proc_flags_t l_flags;
+
+ kmutex_t l_clone_grp_lock; /* protects the following member */
+ lx_clone_grp_t *l_clone_grps[LX_CLGRP_MAX];
+
+ lx_rlimit64_t l_fake_limits[LX_RLFAKE_NLIMITS];
+
+ kmutex_t l_io_ctx_lock; /* protects the following members */
+ uintptr_t l_io_ctxpage;
+ kcondvar_t l_io_destroy_cv;
+ uint_t l_io_ctx_cnt;
+ struct lx_io_ctx **l_io_ctxs;
+
+ /* original start/end bounds of arg/env string data */
+ uintptr_t l_args_start;
+ uintptr_t l_envs_start;
+ uintptr_t l_envs_end;
+
+ /* Override zone-wide settings for uname release and version */
+ char l_uname_release[LX_KERN_RELEASE_MAX];
+ char l_uname_version[LX_KERN_VERSION_MAX];
+
+ /* Linux process personality */
+ unsigned int l_personality;
+
+ /* VDSO location */
+ uintptr_t l_vdso;
+
+ /* mremap anon cache */
+ kmutex_t l_remap_anoncache_lock;
+ uint64_t l_remap_anoncache_generation;
+ lx_segmap_t l_remap_anoncache[LX_REMAP_ANONCACHE_NENTRIES];
+
+ /* Block all signals to all threads; used during vfork */
+ uint_t l_block_all_signals;
+} lx_proc_data_t;
+
+#endif /* _KERNEL */
+
+/*
+ * Linux process personality(2) flags stored in l_personality
+ */
+#define LX_PER_UNAME26 0x0020000
+#define LX_PER_ADDR_NO_RANDOMIZE 0x0040000
+#define LX_PER_FDPIC_FUNCPTRS 0x0080000
+#define LX_PER_MMAP_PAGE_ZERO 0x0100000
+#define LX_PER_ADDR_COMPAT_LAYOUT 0x0200000
+#define LX_PER_READ_IMPLIES_EXEC 0x0400000
+#define LX_PER_ADDR_LIMIT_32BIT 0x0800000
+#define LX_PER_SHORT_INODE 0x1000000
+#define LX_PER_WHOLE_SECONDS 0x2000000
+#define LX_PER_STICKY_TIMEOUTS 0x4000000
+#define LX_PER_ADDR_LIMIT_3GB 0x8000000
+
+#define LX_PER_LINUX 0x00
+#define LX_PER_SUNOS (0x06 | LX_PER_STICKY_TIMEOUTS)
+#define LX_PER_MASK 0xff
+
+/* max. number of aio control blocks (see lx_io_setup) allowed across zone */
+#define LX_AIO_MAX_NR 65536
+
+/*
+ * A data type big enough to bitmap all Linux possible cpus.
+ * The bitmap size is defined as 1024 cpus in the Linux 2.4 and 2.6 man pages
+ * for sched_getaffinity() and sched_setaffinity().
+ */
+#define LX_NCPU (1024)
+#define LX_AFF_ULONGS (LX_NCPU / (8 * sizeof (ulong_t)))
+typedef ulong_t lx_affmask_t[LX_AFF_ULONGS];
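+
+/*
+ * Bitmap sketch (an assumption, for clarity): cpu "n" in an lx_affmask_t is
+ * addressed like any flat array of ulong_t bit words.
+ *
+ *	lx_affmask_t mask;
+ *
+ *	bzero(mask, sizeof (mask));
+ *	mask[n / (8 * sizeof (ulong_t))] |=
+ *	    1UL << (n % (8 * sizeof (ulong_t)));
+ */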
+
+/* Length of proc boot_id string */
+#define LX_BOOTID_LEN 37
+
+/*
+ * Flag values for uc_brand_data[0] in the ucontext_t:
+ */
+#define LX_UC_STACK_NATIVE 0x00001
+#define LX_UC_STACK_BRAND 0x00002
+#define LX_UC_RESTORE_NATIVE_SP 0x00010
+#define LX_UC_FRAME_IS_SYSCALL 0x00100
+#define LX_UC_RESTART_SYSCALL 0x01000
+#define LX_UC_IGNORE_LINK 0x10000
+
+#ifdef _KERNEL
+
+typedef struct lx_lwp_data lx_lwp_data_t;
+
+/*
+ * Flag values for "lxpa_flags" on a ptrace(2) accord.
+ */
+typedef enum lx_accord_flags {
+ LX_ACC_TOMBSTONE = 0x01
+} lx_accord_flags_t;
+
+/*
+ * Flags values for "br_ptrace_flags" in the LWP-specific data.
+ */
+typedef enum lx_ptrace_flags {
+ LX_PTF_SYSCALL = 0x01, /* handling syscall or a trap */
+ LX_PTF_EXITING = 0x02,
+ LX_PTF_STOPPING = 0x04,
+ LX_PTF_INHERIT = 0x08,
+ LX_PTF_STOPPED = 0x10,
+ LX_PTF_PARENT_WAIT = 0x20,
+ LX_PTF_CLDPEND = 0x40,
+ LX_PTF_CLONING = 0x80,
+ LX_PTF_WAITPEND = 0x100,
+ LX_PTF_NOSTOP = 0x200, /* disable syscall stop event */
+ LX_PTF_INSYSCALL = 0x400 /* between syscall enter & exit */
+} lx_ptrace_flags_t;
+
+/*
+ * A ptrace(2) accord represents the relationship between a tracer LWP and the
+ * set of LWPs that it is tracing: the tracees. This data structure belongs
+ * primarily to the tracer, but is reference counted so that it may be freed by
+ * whoever references it last.
+ */
+typedef struct lx_ptrace_accord {
+ kmutex_t lxpa_lock;
+ uint_t lxpa_refcnt;
+ lx_accord_flags_t lxpa_flags;
+
+ /*
+ * The tracer must hold "pidlock" while clearing these fields for
+ * exclusion of waitid(), etc.
+ */
+ lx_lwp_data_t *lxpa_tracer;
+ kcondvar_t *lxpa_cvp;
+
+ /*
+ * The "lxpa_tracees_lock" mutex protects the tracee list.
+ */
+ kmutex_t lxpa_tracees_lock;
+ list_t lxpa_tracees;
+} lx_ptrace_accord_t;
+
+/*
+ * These values are stored in the per-LWP data for a tracee when it is attached
+ * to a tracer. They record the method that was used to attach.
+ */
+typedef enum lx_ptrace_attach {
+ LX_PTA_NONE = 0x00, /* not attached */
+ LX_PTA_ATTACH = 0x01, /* due to tracer using PTRACE_ATTACH */
+ LX_PTA_TRACEME = 0x02, /* due to child using PTRACE_TRACEME */
+ LX_PTA_INHERIT_CLONE = 0x04, /* due to PTRACE_CLONE clone(2) flag */
+ LX_PTA_INHERIT_OPTIONS = 0x08 /* due to PTRACE_SETOPTIONS options */
+} lx_ptrace_attach_t;
+
+typedef enum lx_stack_mode {
+ LX_STACK_MODE_PREINIT = 0,
+ LX_STACK_MODE_INIT,
+ LX_STACK_MODE_NATIVE,
+ LX_STACK_MODE_BRAND
+} lx_stack_mode_t;
+
+struct lx_pid {
+ pid_t lxp_spid; /* the SunOS pid and ... */
+ id_t lxp_stid; /* ... tid pair */
+ pid_t lxp_lpid; /* the corresponding linux pid */
+ time_t lxp_start; /* birthday of this pid */
+ struct pid *lxp_pidp; /* allocated pid struct */
+ proc_t *lxp_procp; /* proc_t corresponding to lxp_spid */
+ struct lx_pid *lxp_stol_next; /* link in stol hash table */
+ struct lx_pid *lxp_ltos_next; /* link in ltos hash table */
+};
+
+/*
+ * lx-specific data in the klwp_t
+ */
+struct lx_lwp_data {
+ uint_t br_lwp_flags; /* misc. flags */
+ klwp_t *br_lwp; /* back pointer to container lwp */
+ int br_signal; /* signal to send to parent when */
+ /* clone()'ed child terminates */
+ int br_exitwhy; /* reason for thread (process) exit */
+ int br_exitwhat; /* exit code / killing signal */
+ cpuset_t *br_affinitymask; /* bitmask of CPU sched affinities */
+ struct user_desc br_tls[LX_TLSNUM];
+ /* descriptors used by libc for TLS */
+ ulong_t br_lx_fsbase; /* lx fsbase for 64-bit thread ptr */
+ ulong_t br_ntv_fsbase; /* native fsbase 64-bit thread ptr */
+ ulong_t br_lx_gsbase; /* lx user-land gsbase */
+ ulong_t br_ntv_gsbase; /* native user-land gsbase */
+ pid_t br_pid; /* converted pid for this thread */
+ pid_t br_tgid; /* thread group ID for this thread */
+ pid_t br_ppid; /* parent pid for this thread */
+ id_t br_ptid; /* parent tid for this thread */
+ void *br_clear_ctidp; /* clone thread id ptr */
+ void *br_set_ctidp; /* clone thread id ptr */
+ void *br_robust_list; /* robust lock list, if any */
+
+ /* first 4 syscall args - used for auditing */
+ uintptr_t br_syscall_args[4];
+
+ /*
+ * The following struct is used by some system calls to pass extra
+ * flags into the kernel without impinging on the namespace for
+ * illumos.
+ */
+ void *br_scall_args;
+ int br_args_size; /* size in bytes of br_scall_args */
+
+ boolean_t br_waitid_emulate;
+ int br_waitid_flags;
+
+ lx_ptrace_flags_t br_ptrace_flags; /* ptrace flags for this LWP */
+ lx_ptrace_options_t br_ptrace_options; /* PTRACE_SETOPTIONS options */
+ lx_ptrace_options_t br_ptrace_clone_option; /* current clone(2) type */
+
+ lx_ptrace_attach_t br_ptrace_attach; /* how did we get attached */
+ lx_ptrace_accord_t *br_ptrace_accord; /* accord for this tracer LWP */
+ lx_ptrace_accord_t *br_ptrace_tracer; /* accord tracing this LWP */
+ list_node_t br_ptrace_linkage; /* linkage for lxpa_tracees list */
+
+ ushort_t br_ptrace_whystop; /* stop reason, 0 for no stop */
+ ushort_t br_ptrace_whatstop; /* stop sub-reason */
+
+ int32_t br_ptrace_stopsig; /* stop signal, 0 for no signal */
+ /*
+ * Track the last (native) signal number processed by a ptrace.
+ * This allows the tracee to properly handle ignored signals after
+ * the tracer has been notified and the tracee restarted.
+ */
+ int32_t br_ptrace_donesig;
+ uintptr_t br_ptrace_stopucp; /* usermode ucontext_t pointer */
+
+ uint_t br_ptrace_event;
+ ulong_t br_ptrace_eventmsg;
+
+ int br_syscall_num; /* current system call number */
+ boolean_t br_syscall_restart; /* should restart on EINTR */
+
+ /*
+ * Store the LX_STACK_MODE for this LWP, and the current extent of the
+ * native (emulation) stack. This is similar, in principle, to the
+ * sigaltstack mechanism for signal handling. We also use this mode
+ * flag to determine how to process system calls from this LWP.
+ */
+ lx_stack_mode_t br_stack_mode;
+ uintptr_t br_ntv_stack;
+ uintptr_t br_ntv_stack_current;
+
+ /*
+ * If strict mode is enabled (via LX_STRICT in the environment), any
+ * call to lx_unsupported() will set this boolean to B_TRUE. This will
+ * cause us to drop SIGSYS on the LWP as it attempts to return to
+ * usermode.
+ */
+ boolean_t br_strict_failure;
+
+ /*
+ * Some syscalls emulated in-kernel still call back out to the
+ * userspace emulation for certain functions. When that is the case,
+ * the syscall_return logic must be bypassed at the end of the
+ * in-kernel syscall code. The NORMALRETURN and JUSTRETURN constants
+ * are used to choose the behavior.
+ */
+ char br_eosys;
+
+ /*
+ * Hold a pre-allocated lx_pid structure to be used during lx_initlwp.
+ */
+ struct lx_pid *br_lpid;
+
+ /*
+ * ID of the cgroup this thread belongs to.
+ */
+ uint_t br_cgroupid;
+
+ /*
+ * When the zone is running under FSS (which is the common case) then
+ * we cannot change scheduling class, so we emulate that. By default
+ * Linux uses LX_SCHED_OTHER (which is 0) and that only supports a
+ * priority of 0, so no special initialization is needed.
+ */
+ int br_schd_class; /* emulated scheduling class */
+ int br_schd_pri; /* emulated scheduling priority */
+ uint64_t br_schd_flags; /* emulated [sg]et_attr flags */
+ uint64_t br_schd_runtime; /* emulated DEADLINE */
+ uint64_t br_schd_deadline; /* emulated DEADLINE */
+ uint64_t br_schd_period; /* emulated DEADLINE */
+
+ fwaiter_t br_fwaiter; /* futex upon which we're waiting */
+ uint_t br_clone_grp_flags; /* pending clone group */
+};
+
+/*
+ * Upper limit on br_args_size, low because this value can persist until
+ * overridden with another value, and the size is given from userland.
+ */
+#define LX_BR_ARGS_SIZE_MAX (1024)
+
+typedef enum lx_audit_enbl {
+ LXAE_DISABLED,
+ LXAE_ENABLED,
+ LXAE_LOCKED
+} lx_audit_enbl_t;
+
+/*
+ * brand specific data
+ *
+ * We currently only support a single cgroup mount in an lx zone so we only have
+ * one ptr (lxzd_cgroup) but this could be changed to a list if cgroups is ever
+ * enhanced to support different mounts with different subsystem controllers.
+ */
+typedef struct lx_zone_data {
+ kmutex_t lxzd_lock; /* protects all members */
+ char lxzd_kernel_release[LX_KERN_RELEASE_MAX];
+ char lxzd_kernel_version[LX_KERN_VERSION_MAX];
+ ksocket_t lxzd_ioctl_sock;
+ char lxzd_bootid[LX_BOOTID_LEN]; /* procfs boot_id */
+ gid_t lxzd_ttygrp; /* tty gid for pty chown */
+ vfs_t *lxzd_cgroup; /* cgroup for this zone */
+ pid_t lxzd_lockd_pid; /* pid of NFS lockd */
+ list_t *lxzd_vdisks; /* virtual disks (zvols) */
+ dev_t lxzd_zfs_dev; /* major num for zfs */
+ uint_t lxzd_aio_nr; /* see lx_aio.c */
+ uint_t lxzd_pipe_max_sz; /* pipe-max-size sysctl val */
+ boolean_t lxzd_swap_disabled; /* no fake swap in zone? */
+ lx_audit_enbl_t lxzd_audit_enabled; /* auditing? */
+ struct lx_audit_state *lxzd_audit_state; /* zone's audit state */
+} lx_zone_data_t;
+
+/* LWP br_lwp_flags values */
+#define BR_CPU_BOUND 0x0001
+#define BR_AIO_LWP 0x0002 /* aio kernel worker thread */
+
+#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t))
+#define lwptolxlwp(l) ((struct lx_lwp_data *)lwptolwpbrand(l))
+#define ttolxproc(t) \
+ (((t)->t_procp->p_brand == &lx_brand) ? \
+ (struct lx_proc_data *)(t)->t_procp->p_brand_data : NULL)
+#define ptolxproc(p) \
+ (((p)->p_brand == &lx_brand) ? \
+ (struct lx_proc_data *)(p)->p_brand_data : NULL)
+#define ztolxzd(z) \
+ (((z)->zone_brand == &lx_brand) ? \
+ (lx_zone_data_t *)(z)->zone_brand_data : NULL)
+
+/* Macro for converting to system call arguments. */
+#define LX_ARGS(scall) ((struct lx_##scall##_args *)\
+ (ttolxlwp(curthread)->br_scall_args))
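+
+/*
+ * Usage sketch ("foo" is a hypothetical syscall): a handler whose userland
+ * wrapper stashed extra arguments via the B_STORE_ARGS brand call would
+ * retrieve them as:
+ *
+ *	struct lx_foo_args *args = LX_ARGS(foo);
+ */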
+
+typedef enum lx_virt_disk_type {
+ LXVD_NONE,
+ LXVD_ZFS_DS,
+ LXVD_ZVOL
+} lx_virt_disk_type_t;
+
+typedef struct lx_virt_disk {
+ list_node_t lxvd_link;
+ char lxvd_name[MAXNAMELEN];
+ lx_virt_disk_type_t lxvd_type;
+ dev_t lxvd_emul_dev;
+ dev_t lxvd_real_dev;
+ uint64_t lxvd_volsize;
+ uint64_t lxvd_blksize;
+ char lxvd_real_name[MAXPATHLEN];
+} lx_virt_disk_t;
+
+/*
+ * Determine the upper bound on the system call number:
+ */
+#if defined(_LP64)
+#define LX_MAX_SYSCALL(lwp) \
+ ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ? \
+ lx_nsysent64 : lx_nsysent32)
+#else
+#define LX_MAX_SYSCALL(lwp) lx_nsysent32
+#endif
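+
+/*
+ * Dispatch sketch (an assumption; the exact comparison is not shown here):
+ * syscall entry is expected to range-check the trapped number against this
+ * bound before indexing the lx_sysent tables declared below, failing
+ * unknown numbers Linux-style with ENOSYS.
+ *
+ *	if (num exceeds LX_MAX_SYSCALL(lwp))
+ *		(fail the call with ENOSYS)
+ */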
+
+extern int lx_kern_release_cmp(zone_t *, const char *);
+
+extern void lx_lwp_set_native_stack_current(lx_lwp_data_t *, uintptr_t);
+extern void lx_divert(klwp_t *, uintptr_t);
+extern int lx_runexe(klwp_t *, void *);
+extern void lx_switch_to_native(klwp_t *);
+
+extern int lx_syscall_enter(void);
+extern void lx_syscall_return(klwp_t *, int, long);
+
+extern void lx_trace_sysenter(int, uintptr_t *);
+extern void lx_trace_sysreturn(int, long);
+
+extern void lx_emulate_user(klwp_t *, int, uintptr_t *);
+
+extern void lx_audit_ld(void);
+extern void lx_audit_unld(void);
+extern void lx_audit_fini(zone_t *);
+extern void lx_audit_syscall_exit(int, long);
+
+#if defined(_SYSCALL32_IMPL)
+extern void lx_emulate_user32(klwp_t *, int, uintptr_t *);
+#endif
+
+extern int lx_debug;
+#define lx_print if (lx_debug) printf
+
+/*
+ * Flags for lx_lpid_lock()
+ */
+typedef enum {
+ LXP_PRLOCK = 0x1, /* acquire PR_LOCK as part of locking */
+ LXP_ZOMBOK = 0x2 /* allow locking of zombies */
+} lx_pid_flag_t;
+
+extern void lx_pid_assign(kthread_t *, struct lx_pid *);
+extern void lx_pid_reassign(kthread_t *);
+extern void lx_pid_rele(pid_t, id_t);
+extern pid_t lx_lpid_to_spair(pid_t, pid_t *, id_t *);
+extern int lx_lpid_lock(pid_t, zone_t *, lx_pid_flag_t, proc_t **,
+ kthread_t **);
+extern pid_t lx_lwp_ppid(klwp_t *, pid_t *, id_t *);
+extern void lx_pid_init(void);
+extern void lx_pid_fini(void);
+extern void lx_acct_out(vnode_t *, int);
+
+extern uint_t lx_pipe_max_limit;
+extern uint_t lx_pipe_max_default;
+
+/*
+ * In-Kernel Linux System Call Description.
+ */
+typedef struct lx_sysent {
+ char *sy_name;
+ long (*sy_callc)();
+ char sy_flags;
+ char sy_narg;
+} lx_sysent_t;
+
+#if defined(_LP64)
+extern lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1];
+extern int lx_nsysent64;
+#endif
+extern lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1];
+extern int lx_nsysent32;
+
+#endif /* _KERNEL */
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_BRAND_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_fcntl.h b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h
new file mode 100644
index 0000000000..f82c6b867d
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h
@@ -0,0 +1,161 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_LX_FCNTL_H
+#define _SYS_LX_FCNTL_H
+
+#include <sys/vnode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Lx open/fcntl flags
+ */
+#define LX_O_RDONLY 00
+#define LX_O_WRONLY 01
+#define LX_O_RDWR 02
+#define LX_O_ACCMODE (LX_O_RDONLY | LX_O_WRONLY | LX_O_RDWR)
+#define LX_O_CREAT 0100
+#define LX_O_EXCL 0200
+#define LX_O_NOCTTY 0400
+#define LX_O_TRUNC 01000
+#define LX_O_APPEND 02000
+#define LX_O_NONBLOCK 04000
+#define LX_O_NDELAY LX_O_NONBLOCK
+#define LX_O_SYNC 010000
+#define LX_O_FSYNC LX_O_SYNC
+#define LX_O_ASYNC 020000
+#define LX_O_DIRECT 040000
+#define LX_O_LARGEFILE 0100000
+#define LX_O_DIRECTORY 0200000
+#define LX_O_NOFOLLOW 0400000
+#define LX_O_CLOEXEC 02000000
+#define LX_O_PATH 010000000
+
+#define LX_F_DUPFD 0
+#define LX_F_GETFD 1
+#define LX_F_SETFD 2
+#define LX_F_GETFL 3
+#define LX_F_SETFL 4
+#define LX_F_GETLK 5
+#define LX_F_SETLK 6
+#define LX_F_SETLKW 7
+#define LX_F_SETOWN 8
+#define LX_F_GETOWN 9
+#define LX_F_SETSIG 10
+#define LX_F_GETSIG 11
+
+#define LX_F_GETLK64 12
+#define LX_F_SETLK64 13
+#define LX_F_SETLKW64 14
+
+#define LX_F_SETLEASE 1024
+#define LX_F_GETLEASE 1025
+#define LX_F_NOTIFY 1026
+#define LX_F_CANCELLK 1029
+#define LX_F_DUPFD_CLOEXEC 1030
+#define LX_F_SETPIPE_SZ 1031
+#define LX_F_GETPIPE_SZ 1032
+
+#define LX_F_RDLCK 0
+#define LX_F_WRLCK 1
+#define LX_F_UNLCK 2
+
+/* Test for emulated O_PATH setting in file_t flags */
+#define LX_IS_O_PATH(f) (((f)->f_flag & (FREAD|FWRITE)) == 0)
+
+extern int lx_vp_at(int, char *, vnode_t **, int);
+
+/*
+ * Lx flock codes.
+ */
+#define LX_NAME_MAX 255
+#define LX_LOCK_SH 1 /* shared */
+#define LX_LOCK_EX 2 /* exclusive */
+#define LX_LOCK_NB 4 /* non-blocking */
+#define LX_LOCK_UN 8 /* unlock */
+
+/*
+ * On Linux the constants AT_REMOVEDIR and AT_EACCESS have the same value.
+ * AT_REMOVEDIR is used only by unlinkat and AT_EACCESS is used only by
+ * faccessat.
+ */
+#define LX_AT_FDCWD (-100)
+#define LX_AT_SYMLINK_NOFOLLOW 0x100
+#define LX_AT_REMOVEDIR 0x200
+#define LX_AT_EACCESS 0x200
+#define LX_AT_SYMLINK_FOLLOW 0x400
+#define LX_AT_NO_AUTOMOUNT 0x800
+#define LX_AT_EMPTY_PATH 0x1000
+
+typedef struct lx_flock {
+ short l_type;
+ short l_whence;
+ long l_start;
+ long l_len;
+ int l_pid;
+} lx_flock_t;
+
+typedef struct lx_flock64 {
+ short l_type;
+ short l_whence;
+ long long l_start;
+ long long l_len;
+ int l_pid;
+} lx_flock64_t;
+
+#if defined(_KERNEL)
+
+/*
+ * 64-bit kernel view of 32-bit usermode structs.
+ */
+#pragma pack(4)
+typedef struct lx_flock32 {
+ int16_t l_type;
+ int16_t l_whence;
+ int32_t l_start;
+ int32_t l_len;
+ int32_t l_pid;
+} lx_flock32_t;
+
+typedef struct lx_flock64_32 {
+ int16_t l_type;
+ int16_t l_whence;
+ int64_t l_start;
+ int64_t l_len;
+ int32_t l_pid;
+} lx_flock64_32_t;
+#pragma pack()
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LX_FCNTL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_futex.h b/usr/src/uts/common/brand/lx/sys/lx_futex.h
new file mode 100644
index 0000000000..7eba389218
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_futex.h
@@ -0,0 +1,143 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2017, Joyent, Inc.
+ */
+
+#ifndef _SYS_LX_FUTEX_H
+#define _SYS_LX_FUTEX_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FUTEX_WAIT 0
+#define FUTEX_WAKE 1
+#define FUTEX_FD 2
+#define FUTEX_REQUEUE 3
+#define FUTEX_CMP_REQUEUE 4
+#define FUTEX_WAKE_OP 5
+#define FUTEX_LOCK_PI 6
+#define FUTEX_UNLOCK_PI 7
+#define FUTEX_TRYLOCK_PI 8
+#define FUTEX_WAIT_BITSET 9
+#define FUTEX_WAKE_BITSET 10
+#define FUTEX_WAIT_REQUEUE_PI 11
+#define FUTEX_CMP_REQUEUE_PI 12
+#define FUTEX_MAX_CMD FUTEX_CMP_REQUEUE_PI
+
+/*
+ * Flags that can be OR'd into a futex operation.
+ */
+#define FUTEX_CMD_MASK 0x007f
+#define FUTEX_PRIVATE_FLAG 0x0080
+#define FUTEX_CLOCK_REALTIME 0x0100
+
+#define FUTEX_BITSET_MATCH_ANY 0xffffffff
+/*
+ * FUTEX_WAKE_OP operations
+ */
+#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */
+#define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */
+#define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */
+#define FUTEX_OP_ANDN 3 /* *(int *)UADDR2 &= ~OPARG; */
+#define FUTEX_OP_XOR 4 /* *(int *)UADDR2 ^= OPARG; */
+
+/*
+ * FUTEX_WAKE_OP comparison operations
+ */
+#define FUTEX_OP_CMP_EQ 0 /* if (oldval == CMPARG) wake */
+#define FUTEX_OP_CMP_NE 1 /* if (oldval != CMPARG) wake */
+#define FUTEX_OP_CMP_LT 2 /* if (oldval < CMPARG) wake */
+#define FUTEX_OP_CMP_LE 3 /* if (oldval <= CMPARG) wake */
+#define FUTEX_OP_CMP_GT 4 /* if (oldval > CMPARG) wake */
+#define FUTEX_OP_CMP_GE 5 /* if (oldval >= CMPARG) wake */
+
+/*
+ * The encoding of the FUTEX_WAKE_OP operation in 32 bits:
+ *
+ * +--+-- - --+-- - --+-- - --+-- - --+
+ * |S |OP |CMP |OPARG |CMPARG |
+ * +--+-- - --+-- - --+-- - --+-- - --+
+ * |31|30 - 28|27 - 24|23 - 12|11 - 0|
+ *
+ * The S bit denotes that the OPARG should be (1 << OPARG) instead of OPARG.
+ * (Yes, this whole thing is entirely absurd -- see the block comment in
+ * lx_futex.c for an explanation of this nonsense.) Macros to extract the
+ * various components from the operation, given the above encoding:
+ */
+#define FUTEX_OP_OP(x) (((x) >> 28) & 7)
+#define FUTEX_OP_CMP(x) (((x) >> 24) & 15)
+#define FUTEX_OP_OPARG(x) (((x) >> 31) ? (1 << (((x) << 8) >> 20)) : \
+ ((((x) << 8) >> 20)))
+#define FUTEX_OP_CMPARG(x) (((x) << 20) >> 20)
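+
+/*
+ * For example (illustrative only), the operation word
+ *
+ *	(FUTEX_OP_ADD << 28) | (FUTEX_OP_CMP_GT << 24) | (1 << 12) | 0
+ *
+ * decodes via the macros above to: atomically add 1 to *UADDR2 and, if the
+ * old value was greater than 0, also wake waiters on UADDR2.
+ */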
+
+#ifdef _KERNEL
+
+/*
+ * This structure is used to track all the threads currently waiting on a
+ * futex. There is one fwaiter_t for each blocked thread. We store all
+ * fwaiter_t's in a hash structure, indexed by the memid_t of the integer
+ * containing the futex's value.
+ *
+ * At the moment, all fwaiter_t's for a single futex are simply dumped into
+ * the hash bucket. If futex contention ever becomes a hot path, we can
+ * chain a single futex's waiters together.
+ */
+typedef struct fwaiter {
+ memid_t fw_memid; /* memid of the user-space futex */
+ kcondvar_t fw_cv; /* cond var */
+ struct fwaiter *fw_next; /* hash queue */
+ struct fwaiter *fw_prev; /* hash queue */
+ uint32_t fw_bits; /* bits waiting on */
+ pid_t fw_tid; /* for PI futexes; the waiter's tid */
+ int fw_opri; /* for PI futexes; original pri. */
+ boolean_t fw_pri_up; /* for PI futexes; pri. increased */
+ volatile int fw_woken;
+} fwaiter_t;
+
+#define FUTEX_WAITERS 0x80000000
+#define FUTEX_OWNER_DIED 0x40000000
+#define FUTEX_TID_MASK 0x3fffffff
+
+#define FUTEX_ROBUST_LOCK_PI 1
+#define FUTEX_ROBUST_LIST_LIMIT 2048
+
+extern long lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout,
+ uintptr_t addr2, int val2);
+extern void lx_futex_init(void);
+extern int lx_futex_fini(void);
+extern long lx_set_robust_list(void *listp, size_t len);
+extern long lx_get_robust_list(pid_t pid, void **listp, size_t *lenp);
+extern void lx_futex_robust_exit(uintptr_t addr, uint32_t tid);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LX_FUTEX_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_impl.h b/usr/src/uts/common/brand/lx/sys/lx_impl.h
new file mode 100644
index 0000000000..03b9d43038
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_impl.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _LX_IMPL_H
+#define _LX_IMPL_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (lx_systrace_f)(ulong_t, ulong_t, ulong_t, ulong_t, ulong_t,
+ ulong_t, ulong_t);
+
+
+extern lx_systrace_f *lx_systrace_entry_ptr;
+extern lx_systrace_f *lx_systrace_return_ptr;
+
+extern void lx_brand_systrace_enable(void);
+extern void lx_brand_systrace_disable(void);
+
+extern void lx_unsupported(char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_IMPL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_ldt.h b/usr/src/uts/common/brand/lx/sys/lx_ldt.h
new file mode 100644
index 0000000000..08d4d78efb
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_ldt.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2018 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_LINUX_LDT_H
+#define _SYS_LINUX_LDT_H
+
+#include <sys/segments.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ldt_info {
+ uint_t entry_number;
+ uint_t base_addr;
+ uint_t limit;
+ uint_t seg_32bit:1,
+ contents:2,
+ read_exec_only:1,
+ limit_in_pages:1,
+ seg_not_present:1,
+ useable:1;
+};
+
+#define LDT_INFO_EMPTY(info) \
+ ((info)->base_addr == 0 && (info)->limit == 0 && \
+ (info)->contents == 0 && (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && (info)->useable == 0)
+
+#if defined(__amd64)
+#define SETMODE(desc) (desc)->usd_long = SDP_SHORT;
+#else
+#define SETMODE(desc)
+#endif
+
+#define LDT_INFO_TO_DESC(info, desc) { \
+ USEGD_SETBASE(desc, (info)->base_addr); \
+ USEGD_SETLIMIT(desc, (info)->limit); \
+ (desc)->usd_type = ((info)->contents << 2) | \
+ ((info)->read_exec_only ^ 1) << 1 | SDT_S | SDT_A; \
+ (desc)->usd_dpl = SEL_UPL; \
+ (desc)->usd_p = (info)->seg_not_present ^ 1; \
+ (desc)->usd_def32 = (info)->seg_32bit; \
+ (desc)->usd_gran = (info)->limit_in_pages; \
+ (desc)->usd_avl = (info)->useable; \
+ SETMODE(desc); \
+}
+
+#define DESC_TO_LDT_INFO(desc, info) { \
+ bzero((info), sizeof (*(info))); \
+ (info)->base_addr = USEGD_GETBASE(desc); \
+ (info)->limit = USEGD_GETLIMIT(desc); \
+ (info)->seg_not_present = (desc)->usd_p ^ 1; \
+ (info)->contents = ((desc)->usd_type >> 2) & 3; \
+ (info)->read_exec_only = (((desc)->usd_type >> 1) & 1) ^ 1; \
+ (info)->seg_32bit = (desc)->usd_def32; \
+ (info)->limit_in_pages = (desc)->usd_gran; \
+ (info)->useable = (desc)->usd_avl; \
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LINUX_LDT_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h
new file mode 100644
index 0000000000..0418d3e9f9
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h
@@ -0,0 +1,136 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#ifndef _SYS__LX_MISC_H
+#define _SYS__LX_MISC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <sys/siginfo.h>
+#include <sys/lx_brand.h>
+
+#ifdef _KERNEL
+
+extern void lx_setrval(klwp_t *, int, int);
+extern void lx_exec();
+extern void lx_exitlwp(klwp_t *);
+extern void lx_freelwp(klwp_t *);
+extern void *lx_lwpdata_alloc(proc_t *);
+extern void lx_lwpdata_free(void *);
+extern void lx_initlwp(klwp_t *, void *);
+extern void lx_initlwp_post(klwp_t *);
+extern void lx_forklwp(klwp_t *, klwp_t *);
+
+extern void lx_affinity_forklwp(klwp_t *, klwp_t *);
+
+extern void lx_set_gdt(int, user_desc_t *);
+extern void lx_clear_gdt(int);
+
+extern longlong_t lx_nosys();
+
+extern void lx_clone_grp_create(uint_t);
+extern void lx_clone_grp_enter(uint_t, proc_t *, proc_t *);
+extern void lx_clone_grp_exit(proc_t *, boolean_t);
+extern boolean_t lx_clone_grp_member(lx_proc_data_t *, uint_t);
+extern int lx_clone_grp_walk(lx_proc_data_t *, uint_t,
+ int (*)(proc_t *, void *), void *);
+
+extern greg_t lx_fixsegreg(greg_t, model_t);
+extern uintptr_t lx_fsbase(klwp_t *, uintptr_t);
+extern void lx_exit_with_sig(proc_t *, sigqueue_t *);
+extern boolean_t lx_wait_filter(proc_t *, proc_t *);
+extern void lx_sigfd_translate(k_siginfo_t *);
+extern int stol_ksiginfo_copyout(k_siginfo_t *, void *);
+
+extern int ltos_at_flag(int, int, boolean_t);
+#if defined(_SYSCALL32_IMPL)
+extern int stol_ksiginfo32_copyout(k_siginfo_t *, void *);
+#endif
+extern void lx_read_argv_bounds(proc_t *p);
+
+typedef enum lx_regs_location {
+ LX_REG_LOC_UNAVAIL,
+ LX_REG_LOC_LWP,
+ LX_REG_LOC_UCP
+} lx_regs_location_t;
+
+extern lx_regs_location_t lx_regs_location(lx_lwp_data_t *, void **, boolean_t);
+
+
+typedef enum lx_if_action {
+ LX_IF_FROMNATIVE,
+ LX_IF_TONATIVE
+} lx_if_action_t;
+
+/* Linux ARP protocol hardware identifiers */
+#define LX_ARPHRD_ETHER 1 /* Ethernet */
+#define LX_ARPHRD_LOOPBACK 772 /* Loopback */
+#define LX_ARPHRD_VOID 0xffff /* Unknown */
+
+/* IPv6 address scope values used in /proc/net/if_inet6 */
+#define LX_IPV6_ADDR_LOOPBACK 0x0010U
+#define LX_IPV6_ADDR_LINKLOCAL 0x0020U
+#define LX_IPV6_ADDR_SITELOCAL 0x0040U
+#define LX_IPV6_ADDR_COMPATv4 0x0080U
+
+/* Maximum length of a thread name, including the NUL terminator */
+#define LX_PR_SET_NAME_NAMELEN 16
+
+extern void lx_ifname_convert(char *, lx_if_action_t);
+extern void lx_ifflags_convert(uint64_t *, lx_if_action_t);
+extern unsigned int lx_ipv6_scope_convert(const in6_addr_t *);
+extern void lx_stol_hwaddr(const struct sockaddr_dl *, struct sockaddr *,
+ int *);
+
+extern boolean_t lx_ptrace_stop(ushort_t);
+extern void lx_stop_notify(proc_t *, klwp_t *, ushort_t, ushort_t);
+extern void lx_ptrace_init(void);
+extern void lx_ptrace_fini(void);
+extern int lx_waitid_helper(idtype_t, id_t, k_siginfo_t *, int, boolean_t *,
+ int *);
+extern void lx_ptrace_exit(proc_t *, klwp_t *);
+extern void lx_ptrace_inherit_tracer(lx_lwp_data_t *, lx_lwp_data_t *);
+extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t, uintptr_t);
+extern int lx_ptrace_set_clone_inherit(int, boolean_t);
+extern int lx_sigcld_repost(proc_t *, sigqueue_t *);
+extern int lx_ptrace_issig_stop(proc_t *, klwp_t *);
+extern boolean_t lx_ptrace_sig_ignorable(proc_t *, klwp_t *, int);
+
+extern int lx_helper_clone(int64_t *, int, void *, void *, void *);
+extern int lx_helper_setgroups(int, gid_t *);
+extern int lx_helper_rt_sigqueueinfo(pid_t, int, siginfo_t *);
+extern int lx_helper_rt_tgsigqueueinfo(pid_t, pid_t, int, siginfo_t *);
+
+extern boolean_t lx_vsyscall_iscall(klwp_t *, uintptr_t, int *);
+extern void lx_vsyscall_enter(proc_t *, klwp_t *, int);
+
+extern void lx_check_strict_failure(lx_lwp_data_t *);
+
+extern boolean_t lx_is_eventfd(file_t *);
+
+extern int lx_read_common(file_t *, uio_t *, size_t *, boolean_t);
+extern int lx_write_common(file_t *, uio_t *, size_t *, boolean_t);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS__LX_MISC_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_ptm.h b/usr/src/uts/common/brand/lx/sys/lx_ptm.h
new file mode 100644
index 0000000000..74bbc939a3
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_ptm.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_PTM_LINUX_H
+#define _SYS_PTM_LINUX_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LX_PTM_DRV "lx_ptm"
+#define LX_PTM_MINOR_NODE "lx_ptmajor"
+
+#define LX_PTM_DEV_TO_PTS(dev) (getminor(dev) - 1)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_PTM_LINUX_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_siginfo.h b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h
new file mode 100644
index 0000000000..9f606b614f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h
@@ -0,0 +1,190 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LX_SIGINFO_H
+#define _LX_SIGINFO_H
+
+#include <sys/lx_types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * lx_siginfo_t lsi_code values
+ *
+ * LX_SI_ASYNCNL: Sent by asynch name lookup completion
+ * LX_SI_DETHREAD: Sent by execve() killing subsidiary threads
+ * LX_SI_SIGIO: Sent by queued SIGIO
+ * LX_SI_ASYNCIO: Sent by asynchronous I/O completion
+ * LX_SI_MESGQ: Sent by real time message queue state change
+ * LX_SI_TIMER: Sent by timer expiration
+ * LX_SI_QUEUE: Sent by sigqueue
+ * LX_SI_USER: Sent by kill, sigsend, raise, etc.
+ * LX_SI_KERNEL: Sent by kernel
+ * LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to
+ * illumos errors, if there is no translation available, this value
+ * should be used. This value should have no meaning as an si_code in
+ * illumos or Linux.
+ *
+ * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by
+ * BrandZ.
+ */
+#define LX_SI_CODE_NOT_EXIST (-61)
+#define LX_SI_ASYNCNL (-60)
+#define LX_SI_DETHREAD (-7)
+#define LX_SI_TKILL (-6)
+#define LX_SI_SIGIO (-5)
+#define LX_SI_ASYNCIO (-4)
+#define LX_SI_MESGQ (-3)
+#define LX_SI_TIMER (-2)
+#define LX_SI_QUEUE (-1)
+#define LX_SI_USER (0)
+#define LX_SI_KERNEL (0x80)
+
+#define LX_SI_MAX_SIZE 128
+#define LX_SI_PAD_SIZE_32 ((LX_SI_MAX_SIZE / sizeof (int)) - 3)
+#define LX_SI_PAD_SIZE_64 ((LX_SI_MAX_SIZE / sizeof (int)) - 4)
+
+#if defined(_LP64)
+/*
+ * Because of the odd number (3) of ints before the union, we need to account
+ * for the smaller padding needed on x64 due to the union being offset to an 8
+ * byte boundary.
+ */
+#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_64
+#else
+#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_32
+#endif
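+
+/*
+ * Worked out: LX_SI_MAX_SIZE of 128 bytes holds 32 4-byte ints.  On 32-bit,
+ * the union directly follows the 3 leading ints (offset 12), leaving
+ * 32 - 3 = 29 ints of padding.  On 64-bit, the union is 8-byte aligned and
+ * so starts at offset 16, leaving 32 - 4 = 28 ints of padding.
+ */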
+
+typedef struct lx_siginfo {
+ int lsi_signo;
+ int lsi_errno;
+ int lsi_code;
+ union {
+ int _pad[LX_SI_PAD_SIZE];
+
+ struct {
+ pid_t _pid;
+ lx_uid16_t _uid;
+ } _kill;
+
+ struct {
+ uint_t _timer1;
+ uint_t _timer2;
+ } _timer;
+
+ struct {
+ pid_t _pid;
+ lx_uid16_t _uid;
+ union sigval _sigval;
+ } _rt;
+
+ struct {
+ pid_t _pid;
+ lx_uid16_t _uid;
+ int _status;
+ clock_t _utime;
+ clock_t _stime;
+ } _sigchld;
+
+ struct {
+ void *_addr;
+ } _sigfault;
+
+ struct {
+ int _band;
+ int _fd;
+ } _sigpoll;
+ } _sifields;
+} lx_siginfo_t;
+
+#if defined(_KERNEL) && defined(_SYSCALL32_IMPL)
+/*
+ * 64-bit kernel view of the 32-bit "lx_siginfo_t" object.
+ */
+#pragma pack(4)
+typedef struct lx_siginfo32 {
+ int lsi_signo;
+ int lsi_errno;
+ int lsi_code;
+ union {
+ int _pad[LX_SI_PAD_SIZE_32];
+
+ struct {
+ pid32_t _pid;
+ lx_uid16_t _uid;
+ } _kill;
+
+ struct {
+ uint_t _timer1;
+ uint_t _timer2;
+ } _timer;
+
+ struct {
+ pid32_t _pid;
+ lx_uid16_t _uid;
+ union sigval32 _sigval;
+ } _rt;
+
+ struct {
+ pid32_t _pid;
+ lx_uid16_t _uid;
+ int _status;
+ clock32_t _utime;
+ clock32_t _stime;
+ } _sigchld;
+
+ struct {
+ caddr32_t _addr;
+ } _sigfault;
+
+ struct {
+ int _band;
+ int _fd;
+ } _sigpoll;
+ } _sifields;
+} lx_siginfo32_t;
+#pragma pack()
+#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */
+
+#define lsi_pid _sifields._kill._pid
+#define lsi_uid _sifields._kill._uid
+#define lsi_status _sifields._sigchld._status
+#define lsi_utime _sifields._sigchld._utime
+#define lsi_stime _sifields._sigchld._stime
+#define lsi_value _sifields._rt._sigval
+#define lsi_int _sifields._rt._sigval.sivalx_int
+#define lsi_ptr _sifields._rt._sigval.sivalx_ptr
+#define lsi_addr _sifields._sigfault._addr
+#define lsi_band _sifields._sigpoll._band
+#define lsi_fd _sifields._sigpoll._fd
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_SIGINFO_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_signal.h b/usr/src/uts/common/brand/lx/sys/lx_signal.h
new file mode 100644
index 0000000000..552c36238b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_signal.h
@@ -0,0 +1,32 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LX_SIGNAL_H
+#define _LX_SIGNAL_H
+
+#include <lx_signum.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void lx_ltos_sigset(lx_sigset_t *, k_sigset_t *);
+extern void lx_stol_sigset(k_sigset_t *, lx_sigset_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_SIGNAL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_socket.h b/usr/src/uts/common/brand/lx/sys/lx_socket.h
new file mode 100644
index 0000000000..99489e4d13
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_socket.h
@@ -0,0 +1,444 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
+ */
+
+#ifndef _SYS_LX_SOCKET_H
+#define _SYS_LX_SOCKET_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Linux address family definitions
+ * Some of these are not supported
+ */
+#define LX_AF_UNSPEC 0 /* Unspecified */
+#define LX_AF_UNIX 1 /* local file/pipe name */
+#define LX_AF_INET 2 /* IP protocol family */
+#define LX_AF_AX25 3 /* Amateur Radio AX.25 */
+#define LX_AF_IPX 4 /* Novell Internet Protocol */
+#define LX_AF_APPLETALK 5 /* Appletalk */
+#define LX_AF_NETROM 6 /* Amateur radio */
+#define LX_AF_BRIDGE 7 /* Multiprotocol bridge */
+#define LX_AF_ATMPVC 8 /* ATM PVCs */
+#define LX_AF_X25 9 /* X.25 */
+#define LX_AF_INET6 10 /* IP version 6 */
+#define LX_AF_ROSE 11 /* Amateur Radio X.25 */
+#define LX_AF_DECNET 12 /* DECnet */
+#define LX_AF_NETBEUI 13 /* 802.2LLC */
+#define LX_AF_SECURITY 14 /* Security callback */
+#define LX_AF_KEY 15 /* key management */
+#define LX_AF_ROUTE 16 /* Alias to emulate 4.4BSD */
+#define LX_AF_NETLINK LX_AF_ROUTE
+#define LX_AF_PACKET 17 /* Packet family */
+#define LX_AF_ASH 18 /* Ash ? */
+#define LX_AF_ECONET 19 /* Acorn Econet */
+#define LX_AF_ATMSVC 20 /* ATM SVCs */
+#define LX_AF_SNA 22 /* Linux SNA */
+#define LX_AF_IRDA 23 /* IRDA sockets */
+#define LX_AF_PPPOX 24 /* PPPoX sockets */
+#define LX_AF_WANPIPE 25 /* Wanpipe API sockets */
+#define LX_AF_LLC 26
+/* gap in Linux defines for 27 and 28 */
+#define LX_AF_CAN 29
+#define LX_AF_TIPC 30
+#define LX_AF_BLUETOOTH 31 /* Bluetooth sockets */
+#define LX_AF_IUCV 32
+#define LX_AF_RXRPC 33
+
+/* limit of AF mappings */
+#define LX_AF_MAX LX_AF_RXRPC
+
+#define AF_NOTSUPPORTED -1
+#define AF_INVAL -2
+
+/*
+ * Options for use with [gs]etsockopt at the SOL_SOCKET level.
+ */
+#define LX_SOL_SOCKET 1
+
+#define LX_SCM_RIGHTS 1
+#define LX_SCM_CRED 2
+
+#define LX_SO_DEBUG 1
+#define LX_SO_REUSEADDR 2
+#define LX_SO_TYPE 3
+#define LX_SO_ERROR 4
+#define LX_SO_DONTROUTE 5
+#define LX_SO_BROADCAST 6
+#define LX_SO_SNDBUF 7
+#define LX_SO_RCVBUF 8
+#define LX_SO_KEEPALIVE 9
+#define LX_SO_OOBINLINE 10
+#define LX_SO_NO_CHECK 11
+#define LX_SO_PRIORITY 12
+#define LX_SO_LINGER 13
+#define LX_SO_BSDCOMPAT 14
+#define LX_SO_REUSEPORT 15
+/*
+ * For Linux see unix(7) man page SO_PASSCRED description. For Illumos see
+ * socket.h(3HEAD) man page SO_RECVUCRED description.
+ */
+#define LX_SO_PASSCRED 16
+#define LX_SO_PEERCRED 17
+#define LX_SO_RCVLOWAT 18
+#define LX_SO_SNDLOWAT 19
+#define LX_SO_RCVTIMEO 20
+#define LX_SO_SNDTIMEO 21
+/* Security levels - as per NRL IPv6 - don't actually do anything */
+#define LX_SO_SECURITY_AUTHENTICATION 22
+#define LX_SO_SECURITY_ENCRYPTION_TRANSPORT 23
+#define LX_SO_SECURITY_ENCRYPTION_NETWORK 24
+#define LX_SO_BINDTODEVICE 25
+/* Socket filtering */
+#define LX_SO_ATTACH_FILTER 26
+#define LX_SO_DETACH_FILTER 27
+#define LX_SO_PEERNAME 28
+#define LX_SO_TIMESTAMP 29
+#define LX_SCM_TIMESTAMP LX_SO_TIMESTAMP
+#define LX_SO_ACCEPTCONN 30
+
+#define LX_SO_PEERSEC 31
+#define LX_SO_SNDBUFFORCE 32
+#define LX_SO_RCVBUFFORCE 33
+#define LX_SO_PASSSEC 34
+#define LX_SO_TIMESTAMPNS 35
+#define LX_SCM_TIMESTAMPNS LX_SO_TIMESTAMPNS
+#define LX_SO_MARK 36
+#define LX_SO_TIMESTAMPING 37
+#define LX_SCM_TIMESTAMPING LX_SO_TIMESTAMPING
+#define LX_SO_PROTOCOL 38
+#define LX_SO_DOMAIN 39
+#define LX_SO_RXQ_OVFL 40
+#define LX_SO_WIFI_STATUS 41
+#define LX_SCM_WIFI_STATUS LX_SO_WIFI_STATUS
+#define LX_SO_PEEK_OFF 42
+#define LX_SO_NOFCS 43
+#define LX_SO_LOCK_FILTER 44
+#define LX_SO_SELECT_ERR_QUEUE 45
+#define LX_SO_BUSY_POLL 46
+#define LX_SO_MAX_PACING_RATE 47
+#define LX_SO_BPF_EXTENSIONS 48
+
+/*
+ * Options for use with [gs]etsockopt at the RAW level.
+ * IPPROTO_RAW
+ */
+#define LX_ICMP_FILTER 1
+
+/*
+ * Options for use with [gs]etsockopt at the PACKET level.
+ * SOL_PACKET
+ */
+#define LX_SOL_PACKET 263
+
+#define LX_PACKET_ADD_MEMBERSHIP 1
+#define LX_PACKET_DROP_MEMBERSHIP 2
+#define LX_PACKET_RECV_OUTPUT 3
+#define LX_PACKET_RX_RING 5
+#define LX_PACKET_STATISTICS 6
+
+/*
+ * Options for use with [gs]etsockopt at the NETLINK level.
+ * SOL_NETLINK
+ */
+#define LX_SOL_NETLINK 270
+
+/*
+ * Linux socket type definitions
+ */
+#define LX_SOCK_STREAM 1 /* Connection-based byte streams */
+#define LX_SOCK_DGRAM 2 /* Connectionless, datagram */
+#define LX_SOCK_RAW 3 /* Raw protocol interface */
+#define LX_SOCK_RDM 4 /* Reliably-delivered message */
+#define LX_SOCK_SEQPACKET 5 /* Sequenced packet stream */
+#define LX_SOCK_PACKET 10 /* Linux specific */
+#define LX_SOCK_MAX 11
+
+/*
+ * The Linux socket type can be or-ed with other flags (e.g. SOCK_CLOEXEC).
+ */
+#define LX_SOCK_TYPE_MASK 0xf
+
+/*
+ * Linux flags for socket, socketpair and accept4. These are or-ed into the
+ * socket type value. In the Linux net.h header these come from fcntl.h (note
+ * that they are in octal in the Linux header).
+ */
+#define LX_SOCK_CLOEXEC 0x80000
+#define LX_SOCK_NONBLOCK 0x800
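+
+/*
+ * For example, a Linux socket(2) type argument of
+ * (LX_SOCK_STREAM | LX_SOCK_CLOEXEC) arrives as 0x80001;
+ * (type & LX_SOCK_TYPE_MASK) recovers LX_SOCK_STREAM and the remaining
+ * bits carry the close-on-exec request.
+ */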
+
+#define SOCK_NOTSUPPORTED -1
+#define SOCK_INVAL -2
+
+/*
+ * PF_PACKET protocol definitions.
+ */
+#define LX_ETH_P_802_3 0x0001
+#define LX_ETH_P_ALL 0x0003
+#define LX_ETH_P_802_2 0x0004
+#define LX_ETH_P_IP 0x0800
+#define LX_ETH_P_ARP 0x0806
+#define LX_ETH_P_IPV6 0x86DD
+
+/*
+ * IP Protocol levels. Some of these match the Illumos IPPROTO_* values.
+ */
+#define LX_IPPROTO_IP 0
+#define LX_IPPROTO_ICMP 1
+#define LX_IPPROTO_IGMP 2
+#define LX_IPPROTO_TCP 6
+#define LX_IPPROTO_UDP 17
+#define LX_IPPROTO_IPV6 41
+#define LX_IPPROTO_ICMPV6 58
+#define LX_IPPROTO_RAW 255
+
+/*
+ * Options for use with [gs]etsockopt at the IP level.
+ * IPPROTO_IP
+ */
+#define LX_IP_TOS 1
+#define LX_IP_TTL 2
+#define LX_IP_HDRINCL 3
+#define LX_IP_OPTIONS 4
+#define LX_IP_ROUTER_ALERT 5
+#define LX_IP_RECVOPTS 6
+#define LX_IP_RETOPTS 7
+#define LX_IP_PKTINFO 8
+#define LX_IP_PKTOPTIONS 9
+#define LX_IP_MTU_DISCOVER 10
+#define LX_IP_RECVERR 11
+#define LX_IP_RECVTTL 12
+#define LX_IP_RECVTOS 13
+#define LX_IP_MTU 14
+#define LX_IP_FREEBIND 15
+#define LX_IP_IPSEC_POLICY 16
+#define LX_IP_XFRM_POLICY 17
+#define LX_IP_PASSSEC 18
+#define LX_IP_TRANSPARENT 19
+#define LX_IP_ORIGDSTADDR 20
+#define LX_IP_MINTTL 21
+#define LX_IP_NODEFRAG 22
+/* Linux apparently leaves a gap here */
+#define LX_IP_MULTICAST_IF 32
+#define LX_IP_MULTICAST_TTL 33
+#define LX_IP_MULTICAST_LOOP 34
+#define LX_IP_ADD_MEMBERSHIP 35
+#define LX_IP_DROP_MEMBERSHIP 36
+#define LX_IP_UNBLOCK_SOURCE 37
+#define LX_IP_BLOCK_SOURCE 38
+#define LX_IP_ADD_SOURCE_MEMBERSHIP 39
+#define LX_IP_DROP_SOURCE_MEMBERSHIP 40
+#define LX_IP_MSFILTER 41
+#define LX_MCAST_JOIN_GROUP 42
+#define LX_MCAST_BLOCK_SOURCE 43
+#define LX_MCAST_UNBLOCK_SOURCE 44
+#define LX_MCAST_LEAVE_GROUP 45
+#define LX_MCAST_JOIN_SOURCE_GROUP 46
+#define LX_MCAST_LEAVE_SOURCE_GROUP 47
+#define LX_MCAST_MSFILTER 48
+#define LX_IP_MULTICAST_ALL 49
+#define LX_IP_UNICAST_IF 50
+
+/*
+ * LX_IP_MTU_DISCOVER values
+ */
+#define LX_IP_PMTUDISC_DONT 0
+#define LX_IP_PMTUDISC_WANT 1
+#define LX_IP_PMTUDISC_DO 2
+#define LX_IP_PMTUDISC_PROBE 3
+#define LX_IP_PMTUDISC_INTERFACE 4
+#define LX_IP_PMTUDISC_OMIT 5
+
+/*
+ * Options for use with [gs]etsockopt at the IPV6 level.
+ * IPPROTO_IPV6
+ */
+
+#define LX_IPV6_ADDRFORM 1
+#define LX_IPV6_2292PKTINFO 2
+#define LX_IPV6_2292HOPOPTS 3
+#define LX_IPV6_2292DSTOPTS 4
+#define LX_IPV6_2292RTHDR 5
+#define LX_IPV6_2292PKTOPTIONS 6
+#define LX_IPV6_CHECKSUM 7
+#define LX_IPV6_2292HOPLIMIT 8
+#define LX_IPV6_NEXTHOP 9
+#define LX_IPV6_AUTHHDR 10
+#define LX_IPV6_UNICAST_HOPS 16
+#define LX_IPV6_MULTICAST_IF 17
+#define LX_IPV6_MULTICAST_HOPS 18
+#define LX_IPV6_MULTICAST_LOOP 19
+#define LX_IPV6_JOIN_GROUP 20
+#define LX_IPV6_LEAVE_GROUP 21
+#define LX_IPV6_ROUTER_ALERT 22
+#define LX_IPV6_MTU_DISCOVER 23
+#define LX_IPV6_MTU 24
+#define LX_IPV6_RECVERR 25
+#define LX_IPV6_V6ONLY 26
+#define LX_IPV6_JOIN_ANYCAST 27
+#define LX_IPV6_LEAVE_ANYCAST 28
+#define LX_IPV6_IPSEC_POLICY 34
+#define LX_IPV6_XFRM_POLICY 35
+
+#define LX_IPV6_RECVPKTINFO 49
+#define LX_IPV6_PKTINFO 50
+#define LX_IPV6_RECVHOPLIMIT 51
+#define LX_IPV6_HOPLIMIT 52
+#define LX_IPV6_RECVHOPOPTS 53
+#define LX_IPV6_HOPOPTS 54
+#define LX_IPV6_RTHDRDSTOPTS 55
+#define LX_IPV6_RECVRTHDR 56
+#define LX_IPV6_RTHDR 57
+#define LX_IPV6_RECVDSTOPTS 58
+#define LX_IPV6_DSTOPTS 59
+#define LX_IPV6_RECVTCLASS 66
+#define LX_IPV6_TCLASS 67
+
+/*
+ * Options for use with [gs]etsockopt at the ICMPV6 level.
+ * IPPROTO_ICMPV6
+ */
+
+#define LX_ICMP6_FILTER 1
+
+/*
+ * Options for use with [gs]etsockopt at the TCP level.
+ * IPPROTO_TCP
+ */
+#define LX_TCP_NODELAY 1 /* Don't delay send to coalesce packets */
+#define LX_TCP_MAXSEG 2 /* Set maximum segment size */
+#define LX_TCP_CORK 3 /* Control sending of partial frames */
+#define LX_TCP_KEEPIDLE 4 /* Start keepalives after this period */
+#define LX_TCP_KEEPINTVL 5 /* Interval between keepalives */
+#define LX_TCP_KEEPCNT 6 /* Number of keepalives before death */
+#define LX_TCP_SYNCNT 7 /* Number of SYN retransmits */
+#define LX_TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */
+#define LX_TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */
+#define LX_TCP_WINDOW_CLAMP 10 /* Bound advertised window */
+#define LX_TCP_INFO 11 /* Information about this connection. */
+#define LX_TCP_QUICKACK 12 /* Block/reenable quick ACKs. */
+#define LX_TCP_CONGESTION 13 /* Congestion control algorithm */
+#define LX_TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */
+#define LX_TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts on thin streams */
+#define LX_TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */
+#define LX_TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */
+#define LX_TCP_REPAIR 19 /* TCP socket under repair */
+#define LX_TCP_REPAIR_QUEUE 20
+#define LX_TCP_QUEUE_SEQ 21
+#define LX_TCP_REPAIR_OPTIONS 22
+#define LX_TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
+#define LX_TCP_TIMESTAMP 24
+#define LX_TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes */
+
+/*
+ * Options for use with [gs]etsockopt at the IGMP level.
+ * IPPROTO_IGMP
+ */
+#define LX_IGMP_MINLEN 8
+#define LX_IGMP_MAX_HOST_REPORT_DELAY 10
+#define LX_IGMP_HOST_MEMBERSHIP_QUERY 0x11
+#define LX_IGMP_HOST_MEMBERSHIP_REPORT 0x12
+#define LX_IGMP_DVMRP 0x13
+#define LX_IGMP_PIM 0x14
+#define LX_IGMP_TRACE 0x15
+#define LX_IGMP_HOST_NEW_MEMBERSHIP_REPORT 0x16
+#define LX_IGMP_HOST_LEAVE_MESSAGE 0x17
+#define LX_IGMP_MTRACE_RESP 0x1e
+#define LX_IGMP_MTRACE 0x1f
+
+/*
+ * Linux socket flags for use with recv(2)/send(2)/recvmsg(2)/sendmsg(2)
+ */
+#define LX_MSG_OOB 0x1
+#define LX_MSG_PEEK 0x2
+#define LX_MSG_DONTROUTE 0x4
+#define LX_MSG_CTRUNC 0x8
+#define LX_MSG_PROXY 0x10
+#define LX_MSG_TRUNC 0x20
+#define LX_MSG_DONTWAIT 0x40
+#define LX_MSG_EOR 0x80
+#define LX_MSG_WAITALL 0x100
+#define LX_MSG_FIN 0x200
+#define LX_MSG_SYN 0x400
+#define LX_MSG_CONFIRM 0x800
+#define LX_MSG_RST 0x1000
+#define LX_MSG_ERRQUEUE 0x2000
+#define LX_MSG_NOSIGNAL 0x4000
+#define LX_MSG_MORE 0x8000
+#define LX_MSG_WAITFORONE 0x10000
+#define LX_MSG_FASTOPEN 0x20000000
+#define LX_MSG_CMSG_CLOEXEC 0x40000000
+
+typedef struct lx_msghdr {
+ void *msg_name; /* optional address */
+ socklen_t msg_namelen; /* size of address */
+ struct iovec *msg_iov; /* scatter/gather array */
+ size_t msg_iovlen; /* # elements in msg_iov */
+ void *msg_control; /* ancillary data */
+ size_t msg_controllen; /* ancillary data buffer len */
+ int msg_flags; /* flags on received message */
+} lx_msghdr_t;
+
+typedef struct lx_mmsghdr {
+ lx_msghdr_t msg_hdr; /* message header */
+ unsigned int msg_len; /* no. of bytes transmitted */
+} lx_mmsghdr_t;
+
+#if defined(_LP64)
+
+typedef struct lx_msghdr32 {
+ caddr32_t msg_name; /* optional address */
+ uint32_t msg_namelen; /* size of address */
+ caddr32_t msg_iov; /* scatter/gather array */
+ int32_t msg_iovlen; /* # elements in msg_iov */
+ caddr32_t msg_control; /* ancillary data */
+ uint32_t msg_controllen; /* ancillary data buffer len */
+ int32_t msg_flags; /* flags on received message */
+} lx_msghdr32_t;
+
+typedef struct lx_mmsghdr32 {
+ lx_msghdr32_t msg_hdr; /* message header */
+ unsigned int msg_len; /* no. of bytes transmitted */
+} lx_mmsghdr32_t;
+
+#endif
+
+typedef struct lx_sockaddr_in6 {
+ sa_family_t sin6_family;
+ in_port_t sin6_port;
+ uint32_t sin6_flowinfo;
+ struct in6_addr sin6_addr;
+ uint32_t sin6_scope_id; /* Depends on scope of sin6_addr */
+ /* one 32-bit field shorter than illumos */
+} lx_sockaddr_in6_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LX_SOCKET_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
new file mode 100644
index 0000000000..78fbf6e0a8
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
@@ -0,0 +1,341 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
+ */
+
+#ifndef _SYS_LINUX_SYSCALLS_H
+#define _SYS_LINUX_SYSCALLS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+extern long lx_accept();
+extern long lx_accept4();
+extern long lx_access();
+extern long lx_acct();
+extern long lx_alarm();
+extern long lx_arch_prctl();
+extern long lx_bind();
+extern long lx_brk();
+extern long lx_chdir();
+extern long lx_chmod();
+extern long lx_chown();
+extern long lx_chown16();
+extern long lx_chroot();
+extern long lx_clock_getres();
+extern long lx_clock_gettime();
+extern long lx_clock_settime();
+extern long lx_close();
+extern long lx_connect();
+extern long lx_creat();
+extern long lx_dup();
+extern long lx_dup2();
+extern long lx_dup3();
+extern long lx_epoll_create();
+extern long lx_epoll_create1();
+extern long lx_epoll_ctl();
+extern long lx_epoll_pwait();
+extern long lx_epoll_wait();
+extern long lx_eventfd();
+extern long lx_eventfd2();
+extern long lx_faccessat();
+extern long lx_fadvise64();
+extern long lx_fadvise64_32();
+extern long lx_fadvise64_64();
+extern long lx_fallocate();
+extern long lx_fallocate32();
+extern long lx_fchdir();
+extern long lx_fchmod();
+extern long lx_fchmodat();
+extern long lx_fchown();
+extern long lx_fchown16();
+extern long lx_fchownat();
+extern long lx_fcntl();
+extern long lx_fcntl64();
+extern long lx_fgetxattr();
+extern long lx_flistxattr();
+extern long lx_flock();
+extern long lx_fremovexattr();
+extern long lx_fsetxattr();
+extern long lx_fstat32();
+extern long lx_fstat64();
+extern long lx_fstatat64();
+extern long lx_futex();
+extern long lx_get_robust_list();
+extern long lx_get_thread_area();
+extern long lx_getcpu();
+extern long lx_getcwd();
+extern long lx_getdents_32();
+extern long lx_getdents_64();
+extern long lx_getdents64();
+extern long lx_getegid();
+extern long lx_getegid16();
+extern long lx_geteuid();
+extern long lx_geteuid16();
+extern long lx_getgid();
+extern long lx_getgid16();
+extern long lx_getitimer();
+extern long lx_getpeername();
+extern long lx_getpgid();
+extern long lx_getpgrp();
+extern long lx_getsockname();
+extern long lx_getpid();
+extern long lx_getppid();
+extern long lx_getpriority();
+extern long lx_getrandom();
+extern long lx_getresgid();
+extern long lx_getresgid16();
+extern long lx_getresuid();
+extern long lx_getresuid16();
+extern long lx_getrlimit();
+extern long lx_getrusage();
+extern long lx_getsid();
+extern long lx_getsockopt();
+extern long lx_gettid();
+extern long lx_gettimeofday();
+extern long lx_getuid();
+extern long lx_getuid16();
+extern long lx_getxattr();
+extern long lx_io_cancel();
+extern long lx_io_destroy();
+extern long lx_io_getevents();
+extern long lx_io_setup();
+extern long lx_io_submit();
+extern long lx_ioctl();
+extern long lx_ioprio_get();
+extern long lx_ioprio_set();
+extern long lx_kill();
+extern long lx_lchown();
+extern long lx_lchown16();
+extern long lx_lgetxattr();
+extern long lx_link();
+extern long lx_linkat();
+extern long lx_listen();
+extern long lx_llistxattr();
+extern long lx_llseek();
+extern long lx_lremovexattr();
+extern long lx_lseek32();
+extern long lx_lseek64();
+extern long lx_lsetxattr();
+extern long lx_lstat32();
+extern long lx_lstat64();
+extern long lx_listxattr();
+extern long lx_madvise();
+extern long lx_mincore();
+extern long lx_mkdir();
+extern long lx_mkdirat();
+extern long lx_mlock();
+extern long lx_mlockall();
+extern long lx_mmap();
+extern long lx_mmap2();
+extern long lx_mremap();
+extern long lx_mprotect();
+extern long lx_modify_ldt();
+extern long lx_mount();
+extern long lx_msync();
+extern long lx_munlock();
+extern long lx_munlockall();
+extern long lx_munmap();
+extern long lx_nanosleep();
+extern long lx_nice();
+extern long lx_oldgetrlimit();
+extern long lx_open();
+extern long lx_openat();
+extern long lx_pause();
+extern long lx_personality();
+extern long lx_pipe();
+extern long lx_pipe2();
+extern long lx_poll();
+extern long lx_ppoll();
+extern long lx_pread();
+extern long lx_pread32();
+extern long lx_preadv();
+extern long lx_preadv32();
+extern long lx_prctl();
+extern long lx_prlimit64();
+extern long lx_pselect();
+extern long lx_ptrace();
+extern long lx_pwrite();
+extern long lx_pwrite32();
+extern long lx_pwritev();
+extern long lx_pwritev32();
+extern long lx_read();
+extern long lx_readlink();
+extern long lx_readlinkat();
+extern long lx_readv();
+extern long lx_reboot();
+extern long lx_recv();
+extern long lx_recvmsg();
+extern long lx_recvmmsg();
+extern long lx_recvfrom();
+extern long lx_rename();
+extern long lx_renameat();
+extern long lx_sched_getaffinity();
+extern long lx_sched_getparam();
+extern long lx_sched_getscheduler();
+extern long lx_sched_getattr();
+extern long lx_sched_get_priority_max();
+extern long lx_sched_get_priority_min();
+extern long lx_sched_rr_get_interval();
+extern long lx_sched_setaffinity();
+extern long lx_sched_setattr();
+extern long lx_sched_setparam();
+extern long lx_sched_setscheduler();
+extern long lx_sched_yield();
+extern long lx_select();
+extern long lx_send();
+extern long lx_sendmsg();
+extern long lx_sendmmsg();
+extern long lx_sendto();
+extern long lx_set_robust_list();
+extern long lx_set_thread_area();
+extern long lx_set_tid_address();
+extern long lx_setdomainname();
+extern long lx_setfsuid();
+extern long lx_setfsuid16();
+extern long lx_setfsgid();
+extern long lx_setfsgid16();
+extern long lx_setgid();
+extern long lx_setgid16();
+extern long lx_sethostname();
+extern long lx_setpgid();
+extern long lx_setpriority();
+extern long lx_setregid();
+extern long lx_setregid16();
+extern long lx_setresgid();
+extern long lx_setresgid16();
+extern long lx_setresuid();
+extern long lx_setresuid16();
+extern long lx_setreuid();
+extern long lx_setreuid16();
+extern long lx_setrlimit();
+extern long lx_setsid();
+extern long lx_setuid();
+extern long lx_setuid16();
+extern long lx_setxattr();
+extern long lx_setsockopt();
+extern long lx_symlink();
+extern long lx_symlinkat();
+extern long lx_shutdown();
+extern long lx_socket();
+extern long lx_socketcall();
+extern long lx_socketpair();
+extern long lx_splice();
+extern long lx_stat32();
+extern long lx_stat64();
+extern long lx_stime();
+extern long lx_swapoff();
+extern long lx_swapon();
+extern long lx_sync();
+extern long lx_sync_file_range();
+extern long lx_syncfs();
+extern long lx_sysinfo32();
+extern long lx_sysinfo64();
+extern long lx_syslog();
+extern long lx_removexattr();
+extern long lx_tgkill();
+extern long lx_time();
+extern long lx_times();
+extern long lx_timer_create();
+extern long lx_tkill();
+extern long lx_umask();
+extern long lx_umount();
+extern long lx_umount2();
+extern long lx_uname();
+extern long lx_unlink();
+extern long lx_unlinkat();
+extern long lx_unshare();
+extern long lx_vhangup();
+extern long lx_wait4();
+extern long lx_waitid();
+extern long lx_waitpid();
+extern long lx_write();
+extern long lx_writev();
+
+#if defined(_LP64)
+/*
+ * Linux vsyscall addresses:
+ */
+#define LX_VSYS_gettimeofday (uintptr_t)0xffffffffff600000
+#define LX_VSYS_time (uintptr_t)0xffffffffff600400
+#define LX_VSYS_getcpu (uintptr_t)0xffffffffff600800
+
+#define LX_VSYSCALL_ADDR (uintptr_t)0xffffffffff600000
+#define LX_VSYSCALL_SIZE (uintptr_t)0x1000
+#endif
+
+#endif /* _KERNEL */
+
+/*
+ * System call numbers for revectoring:
+ */
+
+#if defined(__amd64)
+#define LX_SYS_close 3
+#define LX_SYS_gettimeofday 96
+#define LX_SYS_mount 165
+#define LX_SYS_time 201
+#define LX_SYS_io_setup 206
+#define LX_SYS_clock_gettime 228
+#define LX_SYS_getcpu 309
+
+#define LX_SYS32_close 6
+#define LX_SYS32_gettimeofday 78
+#define LX_SYS32_time 13
+#define LX_SYS32_mount 21
+#define LX_SYS32_clock_gettime 265
+#define LX_SYS32_io_setup 245
+#define LX_SYS32_getcpu 318
+#elif defined(__i386)
+#define LX_SYS_close 6
+#define LX_SYS_mount 21
+#define LX_SYS_gettimeofday 78
+#define LX_SYS_time 13
+#define LX_SYS_clock_gettime 265
+#define LX_SYS_io_setup 245
+#define LX_SYS_getcpu 318
+#else
+#error "Architecture not supported"
+#endif /* defined(__amd64) */
+
+/*
+ * The current code in the VDSO operates under the expectation that it will be
+ * mapped at a fixed offset from the comm page. This simplifies the act of
+ * locating said page without any other reference. The VDSO must fit within
+ * this offset, matching the same value as COMM_PAGE_ALIGN.
+ * See: uts/i86pc/sys/comm_page.h
+ */
+#define LX_VDSO_SIZE 0x4000
+#define LX_VDSO_ADDR_MASK ~(LX_VDSO_SIZE - 1)
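+
+/*
+ * Because the VDSO mapping is aligned to LX_VDSO_SIZE, an address within it
+ * can be masked back to the mapping base, e.g. (pc & LX_VDSO_ADDR_MASK);
+ * this is what makes the fixed-offset scheme described above workable.
+ */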
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LINUX_SYSCALLS_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_types.h b/usr/src/uts/common/brand/lx/sys/lx_types.h
new file mode 100644
index 0000000000..90363c8939
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_types.h
@@ -0,0 +1,144 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_LX_TYPES_H
+#define _SYS_LX_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _KERNEL
+
+#define SHRT_MIN (-32768) /* min value of a "short int" */
+#define SHRT_MAX 32767 /* max value of a "short int" */
+#define USHRT_MAX 65535 /* max of "unsigned short int" */
+#define INT_MIN (-2147483647-1) /* min value of an "int" */
+#define INT_MAX 2147483647 /* max value of an "int" */
+#define UINT_MAX 4294967295U /* max value of an "unsigned int" */
+
+#ifndef LLONG_MAX
+#define LLONG_MAX 9223372036854775807LL
+#endif
+
+#if defined(_LP64)
+#define LONG_MAX 9223372036854775807L
+#define ULONG_MAX 18446744073709551615UL
+#else
+#define LONG_MAX 2147483647L /* max value of a 32-bit "long int" */
+#define ULONG_MAX 4294967295UL /* max value of a 32-bit "ulong int" */
+#endif
+
+#endif /* !_KERNEL */
+
+
+typedef uint64_t lx_dev_t;
+typedef uint16_t lx_dev16_t;
+typedef uint32_t lx_ino_t;
+typedef uint64_t lx_ino64_t;
+typedef uint32_t lx_uid_t;
+typedef uint16_t lx_uid16_t;
+typedef uint32_t lx_gid_t;
+typedef uint16_t lx_gid16_t;
+typedef uint32_t lx_off_t;
+typedef uint64_t lx_off64_t;
+typedef uint32_t lx_blksize_t;
+typedef uint32_t lx_blkcnt_t;
+typedef uint64_t lx_blkcnt64_t;
+typedef uint32_t lx_mode_t;
+typedef uint16_t lx_mode16_t;
+
+/*
+ * Linux mangles major/minor numbers into dev_t differently than SunOS.
+ */
+#ifdef _LP64
+#define LX_MAKEDEVICE(maj, min) \
+ (((min) & 0xff) | (((maj) & 0xfff) << 8) | \
+ ((uint64_t)((min) & ~0xff) << 12) | ((uint64_t)((maj) & ~0xfff) << 32))
+
+#define LX_GETMAJOR(lx_dev) ((((lx_dev) >> 8) & 0xfff) | \
+ ((((uint64_t)(lx_dev)) >> 32) & ~0xfff))
+
+#else
+#define LX_MAKEDEVICE(maj, min) \
+ (((min) & 0xff) | (((maj) & 0xfff) << 8) | (((min) & ~0xff) << 12))
+
+#define LX_GETMAJOR(lx_dev) (((lx_dev) >> 8) & 0xfff)
+#endif
+
+#define LX_GETMINOR(lx_dev) (((lx_dev) & 0xff) | (((lx_dev) >> 12) & ~0xff))
+/* Linux supports 20 bits for the minor, and 12 bits for the major number */
+#define LX_MAXMIN 0xfffff
+#define LX_MAXMAJ 0xfff
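+
+/*
+ * For example, LX_MAKEDEVICE(203, 5) yields 0xcb05: the low 8 bits hold the
+ * minor, the next 12 bits the major, and any excess minor/major bits land
+ * in the high words.  LX_GETMAJOR() and LX_GETMINOR() invert the encoding,
+ * recovering 203 and 5 respectively.
+ */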
+
+/*
+ * Certain Linux tools care deeply about major/minor number mapping.
+ * Map virtual disks (zfs datasets, zvols, etc) into a safe reserved range.
+ */
+#define LX_MAJOR_DISK 203
+
+/* LX ptm driver major/minor number */
+#define LX_PTM_MAJOR 5
+#define LX_PTM_MINOR 2
+
+/* LX pts driver major number range */
+#define LX_PTS_MAJOR_MIN 136
+#define LX_PTS_MAJOR_MAX 143
+
+/* LX tty/cons driver major number */
+#define LX_TTY_MAJOR 5
+
+#define LX_UID16_TO_UID32(uid16) \
+ (((uid16) == (lx_uid16_t)-1) ? ((lx_uid_t)-1) : (lx_uid_t)(uid16))
+
+#define LX_GID16_TO_GID32(gid16) \
+ (((gid16) == (lx_gid16_t)-1) ? ((lx_gid_t)-1) : (lx_gid_t)(gid16))
+
+/* Overflow values default to NFS nobody. */
+
+#define UID16_OVERFLOW ((lx_uid16_t)65534)
+#define GID16_OVERFLOW ((lx_gid16_t)65534)
+
+/*
+ * All IDs with high word non-zero are converted to default overflow values to
+ * avoid inadvertent truncation to zero (root) (!).
+ */
+#define LX_UID32_TO_UID16(uid32) \
+ ((((uid32) & 0xffff0000) == 0) ? ((lx_uid16_t)(uid32)) : \
+ (((uid32) == ((lx_uid_t)-1)) ? ((lx_uid16_t)-1) : UID16_OVERFLOW))
+
+#define LX_GID32_TO_GID16(gid32) \
+ ((((gid32) & 0xffff0000) == 0) ? ((lx_gid16_t)(gid32)) : \
+ (((gid32) == ((lx_gid_t)-1)) ? ((lx_gid16_t)-1) : GID16_OVERFLOW))
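+
+/*
+ * For example, LX_UID32_TO_UID16(1000) is simply 1000, (lx_uid_t)-1 maps to
+ * (lx_uid16_t)-1, and LX_UID32_TO_UID16(70000) yields UID16_OVERFLOW since
+ * 70000 does not fit in 16 bits.
+ */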
+
+#define LX_32TO64(lo, hi) \
+ ((uint64_t)((uint64_t)(lo) | ((uint64_t)(hi) << 32)))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LX_TYPES_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_userhz.h b/usr/src/uts/common/brand/lx/sys/lx_userhz.h
new file mode 100644
index 0000000000..ebbda28698
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_userhz.h
@@ -0,0 +1,64 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _LX_USERHZ_H
+#define _LX_USERHZ_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Within the kernel, Linux implements an internal hz that they refer to as a
+ * "jiffy". Linux can be built with different hz, but on modern kernels
+ * it is frequently 250. However, Linux has a separate concept for the hz
+ * that is visible outside the kernel. This is called "USER_HZ" and is the
+ * value returned by 'sysconf(_SC_CLK_TCK)'. This is almost universally set to
+ * 100hz. Some (lazy) applications just hardcode 100hz instead of checking.
+ * To accommodate these broken applications, we always work with a USER_HZ of
+ * 100 and scale accordingly. See the Linux time(7) man page for a more
+ * detailed discussion of their behavior. See the comment in our
+ * uts/common/conf/param.c for a discussion of valid native hz values.
+ *
+ * There are a few interfaces which expose a clock_t to user-land and which
+ * need to be considered for USER_HZ adjustment.
+ * 1) The times(2) syscall. This is handled correctly.
+ * 2) The waitid(2) syscall passes a siginfo_t which contains si_stime and
+ *    si_utime. Testing waitid(2) on various Linux distributions shows that
+ *    these fields are garbage. This aligns with the Linux waitid(2) man page,
+ *    which describes the subset of the siginfo_t structure that is populated;
+ *    neither si_stime nor si_utime is listed.
+ * 3) A sigaction(2) handler can pass a siginfo_t. This is only documented to
+ *    occur when sa_flags includes SA_SIGINFO. The si_stime and si_utime are
+ * documented to only be populated when the signal is SIGCHLD. However,
+ * testing on Linux seems to show that these fields are not consistent
+ * with the corresponding times(2) data for the process, even for the
+ * SIGCHLD sigaction handler case.
+ * 4) Some fields in /proc/stat and /proc/pid/stat. See the Linux proc man
+ * page for references to sysconf(_SC_CLK_TCK).
+ *
+ * Although the siginfo_t si_stime and si_utime data for cases #2 and #3 is not
+ * consistent on Linux, we populate these fields correctly to be on the safe
+ * side.
+ */
+extern uint_t lx_hz_scale;
+#define LX_USERHZ 100
+#define HZ_TO_LX_USERHZ(x) ((x) / lx_hz_scale)
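+
+/*
+ * For instance, on a system with a native hz of 1000, lx_hz_scale is
+ * expected to be 10 (native hz / LX_USERHZ), so a native tick count of 2500
+ * is presented to Linux applications as HZ_TO_LX_USERHZ(2500) == 250
+ * USER_HZ ticks.
+ */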
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_USERHZ_H */
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_access.c b/usr/src/uts/common/brand/lx/syscall/lx_access.c
new file mode 100644
index 0000000000..8cf836cd7a
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_access.c
@@ -0,0 +1,223 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
+ * All Rights Reserved
+ *
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ *
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred_impl.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/file.h>
+#include <fs/fs_subr.h>
+#include <c2/audit.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+
+/*
+ * Determine accessibility of file.
+ */
+
+#define E_OK 010 /* use effective ids */
+#define R_OK 004
+#define W_OK 002
+#define X_OK 001
+
+/*
+ * Convert Linux LX_AT_* flags to SunOS AT_* flags, translating only those
+ * flags named in the 'allow' mask. Restricting the translation this way also
+ * lets LX_AT_EACCESS and LX_AT_REMOVEDIR, which share a value on Linux, be
+ * translated correctly for each caller.
+ *
+ * Some code can pass other bits in the flag word; the 'enforce' parameter
+ * controls whether those are rejected or simply ignored.
+ */
+int
+ltos_at_flag(int lflag, int allow, boolean_t enforce)
+{
+ int sflag = 0;
+
+ if ((lflag & LX_AT_EACCESS) && (allow & AT_EACCESS)) {
+ lflag &= ~LX_AT_EACCESS;
+ sflag |= AT_EACCESS;
+ }
+
+ if ((lflag & LX_AT_REMOVEDIR) && (allow & AT_REMOVEDIR)) {
+ lflag &= ~LX_AT_REMOVEDIR;
+ sflag |= AT_REMOVEDIR;
+ }
+
+ if ((lflag & LX_AT_SYMLINK_NOFOLLOW) && (allow & AT_SYMLINK_NOFOLLOW)) {
+ lflag &= ~LX_AT_SYMLINK_NOFOLLOW;
+ sflag |= AT_SYMLINK_NOFOLLOW;
+ }
+
+ /* right now SunOS doesn't have a _FOLLOW flag, so use a fake one */
+ if ((lflag & LX_AT_SYMLINK_FOLLOW) && (allow & LX_AT_SYMLINK_FOLLOW)) {
+ lflag &= ~LX_AT_SYMLINK_FOLLOW;
+ sflag |= LX_AT_SYMLINK_FOLLOW;
+ }
+
+ /* If lflag is not zero then some flags did not hit the above code. */
+ if (enforce && lflag)
+ return (-1);
+
+ return (sflag);
+}
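+
+/*
+ * Illustrative use (hypothetical values): a caller emulating faccessat(2)
+ * invokes ltos_at_flag(lflag, AT_EACCESS, B_FALSE). An lflag of
+ * LX_AT_EACCESS yields AT_EACCESS; any stray bits are silently dropped
+ * because enforce is B_FALSE, whereas the same stray bits with
+ * enforce == B_TRUE would make the function return -1.
+ */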
+
+/*
+ * For illumos, access() does this:
+ * If the process has appropriate privileges, an implementation may indicate
+ * success for X_OK even if none of the execute file permission bits are set.
+ *
+ * But for Linux, access() does this:
+ * If the calling process is privileged (i.e., its real UID is zero), then
+ * an X_OK check is successful for a regular file if execute permission is
+ * enabled for any of the file owner, group, or other.
+ *
+ * Linux used to behave more like illumos on older kernels:
+ * In kernel 2.4 (and earlier) there is some strangeness in the handling
+ * of X_OK tests for superuser. If all categories of execute permission
+ * are disabled for a nondirectory file, then the only access() test that
+ * returns -1 is when mode is specified as just X_OK; if R_OK or W_OK is
+ * also specified in mode, then access() returns 0 for such files.
+ *
+ * So we need to handle the case where a privileged process is checking for
+ * X_OK but none of the execute bits are set on the file. We'll keep the old
+ * 2.4 behavior for 2.4 emulation but use the new behavior for any other
+ * kernel rev.
+ */
+static int
+lx_common_access(char *fname, int fmode, vnode_t *startvp)
+{
+ vnode_t *vp;
+ cred_t *tmpcr;
+ int error;
+ int mode;
+ cred_t *cr;
+ int estale_retry = 0;
+
+ if (fmode & ~(E_OK|R_OK|W_OK|X_OK))
+ return (EINVAL);
+
+ mode = ((fmode & (R_OK|W_OK|X_OK)) << 6);
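+ /*
+ * The shift above maps the user-supplied rwx bits into the owner
+ * position that VOP_ACCESS() expects, e.g. R_OK|X_OK (005)
+ * becomes 0500.
+ */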
+
+ cr = CRED();
+
+ /* OK to use effective uid/gid, i.e., no need to crdup(CRED())? */
+ if ((fmode & E_OK) != 0 ||
+ (cr->cr_uid == cr->cr_ruid && cr->cr_gid == cr->cr_rgid)) {
+ tmpcr = cr;
+ crhold(tmpcr);
+ } else {
+ tmpcr = crdup(cr);
+ tmpcr->cr_uid = cr->cr_ruid;
+ tmpcr->cr_gid = cr->cr_rgid;
+ tmpcr->cr_ruid = cr->cr_uid;
+ tmpcr->cr_rgid = cr->cr_gid;
+ }
+
+lookup:
+ if ((error = lookupnameatcred(fname, UIO_USERSPACE, FOLLOW, NULLVPP,
+ &vp, startvp, tmpcr)) != 0) {
+ if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
+ goto lookup;
+ crfree(tmpcr);
+ return (error);
+ }
+
+ if (mode != 0) {
+ error = VOP_ACCESS(vp, mode, 0, tmpcr, NULL);
+ if (error != 0) {
+ if ((error == ESTALE) &&
+ fs_need_estale_retry(estale_retry++)) {
+ VN_RELE(vp);
+ goto lookup;
+ }
+
+ } else if ((fmode & X_OK) != 0 && cr->cr_ruid == 0 &&
+ lx_kern_release_cmp(curproc->p_zone, "2.4.0") > 0) {
+ /* check for incorrect execute success */
+ vattr_t va;
+
+ va.va_mask = AT_MODE;
+ if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) == 0) {
+ mode_t m = VTTOIF(va.va_type) | va.va_mode;
+
+ if ((m & S_IFMT) == S_IFREG &&
+ !(m & (S_IXUSR | S_IXGRP | S_IXOTH))) {
+ /* no execute bits set in the mode */
+ error = EACCES;
+ }
+ }
+ }
+ }
+
+ crfree(tmpcr);
+ VN_RELE(vp);
+ return (error);
+}
+
+int
+lx_faccessat(int atfd, char *fname, int fmode, int flag)
+{
+ vnode_t *startvp;
+ int error;
+
+ if (atfd == LX_AT_FDCWD)
+ atfd = AT_FDCWD;
+
+ if ((flag = ltos_at_flag(flag, AT_EACCESS, B_FALSE)) < 0)
+ return (set_errno(EINVAL));
+
+ if (fname == NULL)
+ return (set_errno(EFAULT));
+ if ((error = fgetstartvp(atfd, fname, &startvp)) != 0)
+ return (set_errno(error));
+ if (AU_AUDITING() && startvp != NULL)
+ audit_setfsat_path(1);
+
+ /* Do not allow E_OK unless AT_EACCESS flag is set */
+ if ((flag & AT_EACCESS) == 0)
+ fmode &= ~E_OK;
+
+ error = lx_common_access(fname, fmode, startvp);
+ if (startvp != NULL)
+ VN_RELE(startvp);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+int
+lx_access(char *fname, int fmode)
+{
+ return (lx_faccessat(LX_AT_FDCWD, fname, fmode, 0));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_aio.c b/usr/src/uts/common/brand/lx/syscall/lx_aio.c
new file mode 100644
index 0000000000..c821e72538
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_aio.c
@@ -0,0 +1,1345 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+/*
+ * Linux aio syscall support.
+ *
+ * The Linux story around the io_* syscalls is very confusing. The io_* syscalls
+ * are not exposed via glibc and in fact, glibc seems to implement its own aio
+ * without using the io_* syscalls at all. However, there is the libaio library
+ * which uses the io_* syscalls, although its implementation of the io_*
+ * functions (with the same names!) is different from the syscalls themselves,
+ * and it uses different definitions for some of the structures involved.
+ *
+ * These syscalls are documented to use an aio_context_t for the context
+ * parameter. On Linux this is a ulong_t. The contexts live in the kernel
+ * address space and are looked up using the aio_context_t parameter. However,
+ * the Linux libaio library, which is a consumer of the io_* syscalls, abuses
+ * the context by assuming it can be used as a pointer into memory that is
+ * mapped into the process. To accommodate this abomination we map a page of
+ * anonymous memory and expose the context to user-land as a pointer offset
+ * into that page. The page itself is never used by our code and our internal
+ * context ID is simply an integer we calculate based on the page pointer
+ * offset.
+ *
+ * Most applications never use aio, so we don't want an implementation that
+ * adds overhead to every process, but on the other hand, when an application is
+ * using aio, it is for performance reasons and we want to be as efficient as
+ * possible. In particular, we don't want to dynamically allocate resources
+ * in the paths that enqueue I/O. Instead, we pre-allocate the resources
+ * we may need when the application performs the io_setup call and keep the
+ * io_submit and io_getevents calls streamlined.
+ *
+ * The general approach here is inspired by the native aio support provided by
+ * libc in user-land. We have worker threads that pick up pending work from
+ * the context "lxioctx_pending" list and synchronously issue the operation in
+ * the control block. When the operation completes, the thread places the
+ * control block into the context "lxioctx_done" list for later consumption by
+ * io_getevents. The thread will then attempt to service another pending
+ * operation or wait for more work to arrive.
+ *
+ * The control blocks on the pending or done lists are referenced by an
+ * lx_io_elem_t struct. This simply holds a pointer to the user-land control
+ * block and the result of the operation. These elements are pre-allocated at
+ * io_setup time and stored on the context "lxioctx_free" list.
+ *
+ * io_submit pulls elements off of the free list, places them on the pending
+ * list and kicks a worker thread to run. io_getevents pulls elements off of
+ * the done list, sets up an event to return, and places the elements back
+ * onto the free list.
+ *
+ * The worker threads are pre-allocated at io_setup time. These are LWP's
+ * that are part of the process, but never leave the kernel. The number of
+ * LWP's is allocated based on the nr_events argument to io_setup. Because
+ * this argument can theoretically be large (up to LX_AIO_MAX_NR), we want to
+ * pre-allocate enough threads to get good I/O concurrency, but not overdo it.
+ * For a small nr_events (<= lx_aio_base_workers) we pre-allocate as many
+ * threads as nr_events so that all of the I/O can run in parallel. Once
+ * we exceed lx_aio_base_workers, we scale up the number of threads by 2, until
+ * we hit the maximum at lx_aio_max_workers. See the code in io_setup for more
+ * information.
+ *
+ * Because the worker threads never leave the kernel, they are marked with the
+ * TP_KTHREAD bit so that /proc operations essentially ignore them. We also tag
+ * the brand lwp flags with the BR_AIO_LWP bit so that these threads never
+ * appear in the lx /proc. Aside from servicing aio submissions, the worker
+ * threads don't participate in most application-initiated operations. Forking
+ * is a special case for the workers. The Linux fork(2) and vfork(2) behavior
+ * always forks only a single thread; the caller. However, during cfork() the
+ * system attempts to quiesce all threads by calling holdlwps(). The workers
+ * check for SHOLDFORK and SHOLDFORK1 in their loops and suspend themselves,
+ * a la holdlwp(), if the process forks.
+ *
+ * It is hard to make any generalized statements about how the aio syscalls
+ * are used in production. MySQL is one of the more popular consumers of aio
+ * and in the default configuration it will create 10 contexts with a capacity
+ * of 256 I/Os (io_setup nr_events) and 1 context with a capacity of 100 I/Os.
+ * Another application we've seen will create 8 contexts, each with a capacity
+ * of 128 I/Os. In practice 1-7 was the typical number of in-flight I/Os.
+ *
+ * The default configuration for MySQL uses 4 read and 4 write threads. Each
+ * thread has an associated context. MySQL also allocates 3 additional contexts,
+ * so in the default configuration it will only use 11, but the number of
+ * read and write threads can be tuned up to a maximum of 64. We can expand
+ * a process's number of contexts up to a maximum of LX_IOCTX_CNT_MAX, which
+ * is significantly more than we've ever seen in use.
+ *
+ * According to www.kernel.org/doc/Documentation/sysctl/fs.txt, the
+ * /proc/sys/fs entries for aio are:
+ * - aio-nr: The total of all nr_events values specified on the io_setup
+ * call for every active context.
+ * - aio-max-nr: The upper limit for aio-nr
+ * aio-nr is tracked as a zone-wide value. We keep aio-max-nr limited to
+ * LX_AIO_MAX_NR, which matches Linux and provides plenty of headroom for the
+ * zone.
+ */
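+
+/*
+ * For orientation, a minimal sketch (hypothetical user-land code) of the
+ * raw Linux syscall sequence these handlers serve, invoked directly
+ * rather than through libaio:
+ *
+ *	aio_context_t ctx = 0;
+ *	struct iocb cb = { .aio_lio_opcode = IOCB_CMD_PREAD, ... };
+ *	struct iocb *cbs[1] = { &cb };
+ *	struct io_event ev[1];
+ *
+ *	syscall(SYS_io_setup, 128, &ctx);
+ *	syscall(SYS_io_submit, ctx, 1, cbs);
+ *	syscall(SYS_io_getevents, ctx, 1, 1, ev, NULL);
+ *	syscall(SYS_io_destroy, ctx);
+ *
+ * which map to lx_io_setup, lx_io_submit, lx_io_getevents and
+ * lx_io_destroy below.
+ */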
+
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+#include <sys/brand.h>
+#include <sys/sysmacros.h>
+#include <sys/sdt.h>
+#include <sys/procfs.h>
+#include <sys/eventfd.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_syscalls.h>
+#include <sys/lx_misc.h>
+#include <lx_errno.h>
+
+/* These constants match Linux */
+#define LX_IOCB_FLAG_RESFD 0x0001
+#define LX_IOCB_CMD_PREAD 0
+#define LX_IOCB_CMD_PWRITE 1
+#define LX_IOCB_CMD_FSYNC 2
+#define LX_IOCB_CMD_FDSYNC 3
+#define LX_IOCB_CMD_PREADX 4
+#define LX_IOCB_CMD_POLL 5
+#define LX_IOCB_CMD_NOOP 6
+#define LX_IOCB_CMD_PREADV 7
+#define LX_IOCB_CMD_PWRITEV 8
+
+#define LX_KIOCB_KEY 0
+
+/*
+ * Base and max. number of contexts/process. Note that we currently map one
+ * page to manage the user-level context ID, so that code must be adjusted if
+ * LX_IOCTX_CNT_MAX is ever enlarged. Currently, this is the limit for the
+ * number of 64-bit pointers in one 4k page.
+ */
+#define LX_IOCTX_CNT_BASE 16
+#define LX_IOCTX_CNT_MAX 512
+
+/*
+ * Max number of control block pointers, or lx_io_event_t's, to allocate on the
+ * stack in io_submit or io_getevents.
+ */
+#define MAX_ALLOC_ON_STACK 128
+#define alloca(x) __builtin_alloca(x)
+extern void *__builtin_alloca(size_t);
+
+/* The context is an offset within the ctxpage we mapped */
+#define CTXID_TO_PTR(L, I) ((L)->l_io_ctxpage + ((I) * sizeof (uintptr_t)))
+#define PTR_TO_CTXID(L, P) ((int)((uintptr_t)(P) - (L)->l_io_ctxpage) / \
+ sizeof (uintptr_t))
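+
+/*
+ * Worked example (hypothetical address): if l_io_ctxpage is mapped at
+ * 0xfe0000, context slot 3 is exposed to user-land as the aio_context_t
+ * 0xfe0018 (an offset of 3 * sizeof (uintptr_t), i.e. 24 bytes, into the
+ * page), and PTR_TO_CTXID() converts that value back to slot 3.
+ */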
+
+typedef ulong_t lx_aio_context_t;
+
+uint_t lx_aio_base_workers = 16; /* num threads/context before scaling */
+uint_t lx_aio_max_workers = 32; /* upper limit on threads/context */
+
+/*
+ * Internal representation of an aio context.
+ */
+typedef struct lx_io_ctx {
+ boolean_t lxioctx_shutdown; /* context is being destroyed */
+ uint_t lxioctx_maxn; /* nr_events from io_setup */
+ uint_t lxioctx_in_use; /* reference counter */
+ kmutex_t lxioctx_f_lock; /* free list lock */
+ uint_t lxioctx_free_cnt; /* num. elements in free list */
+ list_t lxioctx_free; /* free list */
+ kmutex_t lxioctx_p_lock; /* pending list lock */
+ kcondvar_t lxioctx_pending_cv; /* pending list cv */
+ list_t lxioctx_pending; /* pending list */
+ kmutex_t lxioctx_d_lock; /* done list lock */
+ kcondvar_t lxioctx_done_cv; /* done list cv */
+ uint_t lxioctx_done_cnt; /* num. elements in done list */
+ list_t lxioctx_done; /* done list */
+} lx_io_ctx_t;
+
+/*
+ * Linux binary definition of an I/O event.
+ */
+typedef struct lx_io_event {
+ uint64_t lxioe_data; /* data payload */
+ uint64_t lxioe_object; /* object of origin */
+ int64_t lxioe_res; /* result code */
+ int64_t lxioe_res2; /* "secondary" result (WTF?) */
+} lx_io_event_t;
+
+/*
+ * Linux binary definition of an I/O control block.
+ */
+typedef struct lx_iocb {
+ uint64_t lxiocb_data; /* data payload */
+ uint32_t lxiocb_key; /* must be LX_KIOCB_KEY (!) */
+ uint32_t lxiocb_reserved1;
+ uint16_t lxiocb_op; /* operation */
+ int16_t lxiocb_reqprio; /* request priority */
+ uint32_t lxiocb_fd; /* file descriptor */
+ uint64_t lxiocb_buf; /* data buffer */
+ uint64_t lxiocb_nbytes; /* number of bytes */
+ int64_t lxiocb_offset; /* offset in file */
+ uint64_t lxiocb_reserved2;
+ uint32_t lxiocb_flags; /* LX_IOCB_FLAG_* flags */
+ uint32_t lxiocb_resfd; /* eventfd fd, if any */
+} lx_iocb_t;
+
+typedef struct lx_io_elem {
+ list_node_t lxioelem_link;
+ uint16_t lxioelem_op; /* operation */
+ uint16_t lxioelem_flags; /* bits from lxiocb_flags */
+ int lxioelem_fd; /* file descriptor */
+ file_t *lxioelem_fp; /* getf() file pointer */
+ int lxioelem_resfd; /* RESFD file descriptor */
+ file_t *lxioelem_resfp; /* RESFD getf() file pointer */
+ void *lxioelem_buf; /* data buffer */
+ uint64_t lxioelem_nbytes; /* number of bytes */
+ int64_t lxioelem_offset; /* offset in file */
+ uint64_t lxioelem_data;
+ ssize_t lxioelem_res;
+ void *lxioelem_cbp; /* ptr to iocb in userspace */
+} lx_io_elem_t;
+
+/* From lx_rw.c */
+extern ssize_t lx_pread_fp(file_t *, void *, size_t, off64_t);
+extern ssize_t lx_pwrite_fp(file_t *, void *, size_t, off64_t);
+
+/* From common/syscall/rw.c */
+extern int fdsync(int, int);
+/* From common/os/grow.c */
+extern caddr_t smmap64(caddr_t, size_t, int, int, int, off_t);
+
+/*
+ * Given an aio_context ID, return our internal context pointer with an
+ * additional ref. count, or NULL if cp not found.
+ */
+static lx_io_ctx_t *
+lx_io_cp_hold(lx_aio_context_t cid)
+{
+ int id;
+ lx_proc_data_t *lxpd = ptolxproc(curproc);
+ lx_io_ctx_t *cp;
+
+ mutex_enter(&lxpd->l_io_ctx_lock);
+
+ if (lxpd->l_io_ctxs == NULL) {
+ ASSERT(lxpd->l_io_ctx_cnt == 0);
+ ASSERT(lxpd->l_io_ctxpage == NULL);
+ goto bad;
+ }
+
+ id = PTR_TO_CTXID(lxpd, cid);
+ if (id < 0 || id >= lxpd->l_io_ctx_cnt)
+ goto bad;
+
+ if ((cp = lxpd->l_io_ctxs[id]) == NULL)
+ goto bad;
+
+ if (cp->lxioctx_shutdown)
+ goto bad;
+
+ atomic_inc_32(&cp->lxioctx_in_use);
+ mutex_exit(&lxpd->l_io_ctx_lock);
+ return (cp);
+
+bad:
+ mutex_exit(&lxpd->l_io_ctx_lock);
+ return (NULL);
+}
+
+/*
+ * Release a hold on the context and clean up the context if it was the last
+ * hold.
+ */
+static void
+lx_io_cp_rele(lx_io_ctx_t *cp)
+{
+ lx_proc_data_t *lxpd = ptolxproc(curproc);
+ lx_zone_data_t *lxzd;
+ int i;
+ lx_io_elem_t *ep;
+
+ mutex_enter(&lxpd->l_io_ctx_lock);
+ ASSERT(cp->lxioctx_in_use >= 1);
+ if (cp->lxioctx_in_use > 1) {
+ atomic_dec_32(&cp->lxioctx_in_use);
+ /* wake all threads waiting on context rele */
+ cv_broadcast(&lxpd->l_io_destroy_cv);
+ mutex_exit(&lxpd->l_io_ctx_lock);
+ return;
+ }
+
+ /*
+ * We hold the last ref.
+ */
+ for (i = 0; i < lxpd->l_io_ctx_cnt; i++) {
+ if (lxpd->l_io_ctxs[i] == cp) {
+ lxpd->l_io_ctxs[i] = NULL;
+ break;
+ }
+ }
+ ASSERT(i < lxpd->l_io_ctx_cnt);
+ /* wake all threads waiting on context destruction */
+ cv_broadcast(&lxpd->l_io_destroy_cv);
+ ASSERT(cp->lxioctx_shutdown == B_TRUE);
+
+ mutex_exit(&lxpd->l_io_ctx_lock);
+
+ /* can now decrement the zone's overall aio counter */
+ lxzd = ztolxzd(curproc->p_zone);
+ mutex_enter(&lxzd->lxzd_lock);
+ VERIFY(cp->lxioctx_maxn <= lxzd->lxzd_aio_nr);
+ lxzd->lxzd_aio_nr -= cp->lxioctx_maxn;
+ mutex_exit(&lxzd->lxzd_lock);
+
+ /*
+ * We have the only pointer to the context now. Free all
+ * elements from all three queues and the context itself.
+ */
+ while ((ep = list_remove_head(&cp->lxioctx_free)) != NULL) {
+ kmem_free(ep, sizeof (lx_io_elem_t));
+ }
+
+ /*
+ * During io_submit() we use getf() to get/validate the file pointer
+ * for the file descriptor in each control block. We do not releasef()
+ * the fd, but instead pass along the fd and file pointer to the worker
+ * threads. In order to manage this hand-off we use clear_active_fd()
+ * in the syscall path and then in our thread which takes over the file
+ * descriptor, we use a combination of set_active_fd() and releasef().
+ * Because our thread that is taking ownership of the fd has not called
+ * getf(), we first call set_active_fd(-1) to reserve a slot in the
+ * active fd array for ourselves.
+ */
+ set_active_fd(-1);
+ while ((ep = list_remove_head(&cp->lxioctx_pending)) != NULL) {
+ set_active_fd(ep->lxioelem_fd);
+ releasef(ep->lxioelem_fd);
+
+ if (ep->lxioelem_flags & LX_IOCB_FLAG_RESFD) {
+ set_active_fd(ep->lxioelem_resfd);
+ releasef(ep->lxioelem_resfd);
+ }
+
+ kmem_free(ep, sizeof (lx_io_elem_t));
+ }
+
+ while ((ep = list_remove_head(&cp->lxioctx_done)) != NULL) {
+ kmem_free(ep, sizeof (lx_io_elem_t));
+ }
+
+ ASSERT(list_is_empty(&cp->lxioctx_free));
+ list_destroy(&cp->lxioctx_free);
+ ASSERT(list_is_empty(&cp->lxioctx_pending));
+ list_destroy(&cp->lxioctx_pending);
+ ASSERT(list_is_empty(&cp->lxioctx_done));
+ list_destroy(&cp->lxioctx_done);
+
+ kmem_free(cp, sizeof (lx_io_ctx_t));
+}
+
+/*
+ * Called by a worker thread to perform the operation specified in the control
+ * block.
+ *
+ * Linux returns a negative errno in the event "lxioelem_res" field as the
+ * result of a failed operation. We do the same.
+ */
+static void
+lx_io_do_op(lx_io_elem_t *ep)
+{
+ int err;
+ int64_t res = 0;
+
+ set_active_fd(ep->lxioelem_fd);
+
+ ttolwp(curthread)->lwp_errno = 0;
+ switch (ep->lxioelem_op) {
+ case LX_IOCB_CMD_FSYNC:
+ case LX_IOCB_CMD_FDSYNC:
+ /*
+ * Note that Linux always returns EINVAL for these two
+ * operations. This is apparently because nothing in Linux
+ * defines the 'aio_fsync' function. Thus, it is unlikely any
+ * application will actually submit these.
+ *
+ * This is basically fdsync(), but we already have the fp.
+ */
+ err = VOP_FSYNC(ep->lxioelem_fp->f_vnode,
+ (ep->lxioelem_op == LX_IOCB_CMD_FSYNC) ? FSYNC : FDSYNC,
+ ep->lxioelem_fp->f_cred, NULL);
+ if (err != 0) {
+ (void) set_errno(err);
+ }
+
+ break;
+
+ case LX_IOCB_CMD_PREAD:
+ res = lx_pread_fp(ep->lxioelem_fp, ep->lxioelem_buf,
+ ep->lxioelem_nbytes, ep->lxioelem_offset);
+ break;
+
+ case LX_IOCB_CMD_PWRITE:
+ res = lx_pwrite_fp(ep->lxioelem_fp, ep->lxioelem_buf,
+ ep->lxioelem_nbytes, ep->lxioelem_offset);
+ break;
+
+ default:
+ /* We validated the op at io_submit syscall time */
+ VERIFY(0);
+ break;
+ }
+ if (ttolwp(curthread)->lwp_errno != 0)
+ res = -lx_errno(ttolwp(curthread)->lwp_errno, EINVAL);
+
+ ep->lxioelem_res = res;
+
+ releasef(ep->lxioelem_fd);
+ ep->lxioelem_fd = 0;
+ ep->lxioelem_fp = NULL;
+}
+
+/*
+ * The operation has either completed or been cancelled. Finalize the handling
+ * and move the operation onto the "done" queue.
+ */
+static void
+lx_io_finish_op(lx_io_ctx_t *cp, lx_io_elem_t *ep, boolean_t do_event)
+{
+ boolean_t do_resfd;
+ int resfd = 0;
+ file_t *resfp = NULL;
+
+ if (ep->lxioelem_flags & LX_IOCB_FLAG_RESFD) {
+ do_resfd = B_TRUE;
+ resfd = ep->lxioelem_resfd;
+ resfp = ep->lxioelem_resfp;
+ } else {
+ do_resfd = B_FALSE;
+ }
+
+ ep->lxioelem_flags = 0;
+ ep->lxioelem_resfd = 0;
+ ep->lxioelem_resfp = NULL;
+
+ mutex_enter(&cp->lxioctx_d_lock);
+ list_insert_tail(&cp->lxioctx_done, ep);
+ cp->lxioctx_done_cnt++;
+ cv_signal(&cp->lxioctx_done_cv);
+ mutex_exit(&cp->lxioctx_d_lock);
+
+ /* Update the eventfd if necessary */
+ if (do_resfd) {
+ vnode_t *vp = resfp->f_vnode;
+ uint64_t val = 1;
+
+ set_active_fd(resfd);
+
+ if (do_event) {
+ /*
+ * Eventfd notifications from AIO are special in that
+ * they are not expected to block. This interface allows
+ * the eventfd value to reach (but not cross) the
+ * overflow value.
+ */
+ (void) VOP_IOCTL(vp, EVENTFDIOC_POST, (intptr_t)&val,
+ FKIOCTL, resfp->f_cred, NULL, NULL);
+ }
+
+ releasef(resfd);
+ }
+}
+
+/*
+ * First check if this worker needs to quit due to shutdown or exit. Return
+ * true in this case.
+ *
+ * Then check if our process is forking. In this case it expects all LWPs to be
+ * stopped first. For the worker threads, a stop equivalent to holdlwp() is
+ * necessary before the fork can proceed.
+ *
+ * It is common to check p_flag outside of p_lock (see issig) and we want to
+ * avoid making p_lock any hotter since this is called in the worker main loops.
+ */
+static boolean_t
+lx_io_worker_chk_status(lx_io_ctx_t *cp, boolean_t locked)
+{
+ if (cp->lxioctx_shutdown)
+ return (B_TRUE);
+
+ if (curproc->p_flag & (SEXITLWPS | SKILLED)) {
+ cp->lxioctx_shutdown = B_TRUE;
+ return (B_TRUE);
+ }
+
+ if (curproc->p_flag & (SHOLDFORK | SHOLDFORK1)) {
+ if (locked)
+ mutex_exit(&cp->lxioctx_p_lock);
+
+ mutex_enter(&curproc->p_lock);
+ stop(PR_SUSPENDED, SUSPEND_NORMAL);
+ mutex_exit(&curproc->p_lock);
+
+ if (locked)
+ mutex_enter(&cp->lxioctx_p_lock);
+
+ if (cp->lxioctx_shutdown)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Worker thread - pull work off the pending queue, perform the operation and
+ * place the result on the done queue. Do this as long as work is pending, then
+ * wait for more.
+ */
+static void
+lx_io_worker(void *a)
+{
+ lx_io_ctx_t *cp = (lx_io_ctx_t *)a;
+ lx_io_elem_t *ep;
+
+ set_active_fd(-1); /* See comment in lx_io_cp_rele */
+
+ while (!cp->lxioctx_shutdown) {
+ mutex_enter(&cp->lxioctx_p_lock);
+ if (list_is_empty(&cp->lxioctx_pending)) {
+ /*
+ * This must be cv_wait_sig, as opposed to cv_wait, so
+ * that pokelwps works correctly on these threads.
+ *
+ * The worker threads have all of their signals held,
+ * so a cv_wait_sig return of 0 here only occurs while
+ * we're shutting down.
+ */
+ if (cv_wait_sig(&cp->lxioctx_pending_cv,
+ &cp->lxioctx_p_lock) == 0)
+ cp->lxioctx_shutdown = B_TRUE;
+ }
+
+ if (lx_io_worker_chk_status(cp, B_TRUE)) {
+ mutex_exit(&cp->lxioctx_p_lock);
+ break;
+ }
+
+ ep = list_remove_head(&cp->lxioctx_pending);
+ mutex_exit(&cp->lxioctx_p_lock);
+
+ while (ep != NULL) {
+ lx_io_do_op(ep);
+
+ lx_io_finish_op(cp, ep, B_TRUE);
+
+ if (lx_io_worker_chk_status(cp, B_FALSE))
+ break;
+
+ mutex_enter(&cp->lxioctx_p_lock);
+ ep = list_remove_head(&cp->lxioctx_pending);
+ mutex_exit(&cp->lxioctx_p_lock);
+ }
+ }
+
+ lx_io_cp_rele(cp);
+
+ ASSERT(curthread->t_lwp != NULL);
+ mutex_enter(&curproc->p_lock);
+ lwp_exit();
+}
+
+/*
+ * LTP passes -1 for nr_events but we're limited by LX_AIO_MAX_NR anyway.
+ */
+long
+lx_io_setup(uint_t nr_events, void *ctxp)
+{
+ int i, slot;
+ proc_t *p = curproc;
+ lx_proc_data_t *lxpd = ptolxproc(p);
+ lx_zone_data_t *lxzd = ztolxzd(p->p_zone);
+ lx_io_ctx_t *cp;
+ lx_io_elem_t *ep;
+ uintptr_t cid;
+ uint_t nworkers;
+ k_sigset_t hold_set;
+
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ uintptr32_t cid32;
+
+ if (copyin(ctxp, &cid32, sizeof (cid32)) != 0)
+ return (set_errno(EFAULT));
+ cid = (uintptr_t)cid32;
+ } else
+#endif
+ if (copyin(ctxp, &cid, sizeof (cid)) != 0)
+ return (set_errno(EFAULT));
+
+ /* The cid in user-land must be NULL to start */
+ if (cid != NULL || nr_events > LX_AIO_MAX_NR)
+ return (set_errno(EINVAL));
+
+ mutex_enter(&lxzd->lxzd_lock);
+ if ((nr_events + lxzd->lxzd_aio_nr) > LX_AIO_MAX_NR) {
+ mutex_exit(&lxzd->lxzd_lock);
+ return (set_errno(EAGAIN));
+ }
+ lxzd->lxzd_aio_nr += nr_events;
+ mutex_exit(&lxzd->lxzd_lock);
+
+ /* Find a free slot */
+ mutex_enter(&lxpd->l_io_ctx_lock);
+ if (lxpd->l_io_ctxs == NULL) {
+ /*
+ * First use of aio, allocate a context array and a page
+ * in our address space to use for context ID handling.
+ */
+ uintptr_t ctxpage;
+
+ ASSERT(lxpd->l_io_ctx_cnt == 0);
+ ASSERT(lxpd->l_io_ctxpage == NULL);
+
+ ttolwp(curthread)->lwp_errno = 0;
+ ctxpage = (uintptr_t)smmap64(0, PAGESIZE, PROT_READ,
+ MAP_SHARED | MAP_ANON, -1, 0);
+ if (ttolwp(curthread)->lwp_errno != 0) {
+ mutex_exit(&lxpd->l_io_ctx_lock);
+ return (set_errno(ENOMEM));
+ }
+
+ lxpd->l_io_ctxpage = ctxpage;
+ lxpd->l_io_ctx_cnt = LX_IOCTX_CNT_BASE;
+ lxpd->l_io_ctxs = kmem_zalloc(lxpd->l_io_ctx_cnt *
+ sizeof (lx_io_ctx_t *), KM_SLEEP);
+ slot = 0;
+ } else {
+ ASSERT(lxpd->l_io_ctx_cnt > 0);
+ for (slot = 0; slot < lxpd->l_io_ctx_cnt; slot++) {
+ if (lxpd->l_io_ctxs[slot] == NULL)
+ break;
+ }
+
+ if (slot == lxpd->l_io_ctx_cnt) {
+ /* Double our context array up to the max. */
+ const uint_t new_cnt = lxpd->l_io_ctx_cnt * 2;
+ const uint_t old_size = lxpd->l_io_ctx_cnt *
+ sizeof (lx_io_ctx_t *);
+ const uint_t new_size = new_cnt *
+ sizeof (lx_io_ctx_t *);
+ struct lx_io_ctx **old_array = lxpd->l_io_ctxs;
+
+ if (new_cnt > LX_IOCTX_CNT_MAX) {
+ mutex_exit(&lxpd->l_io_ctx_lock);
+ mutex_enter(&lxzd->lxzd_lock);
+ lxzd->lxzd_aio_nr -= nr_events;
+ mutex_exit(&lxzd->lxzd_lock);
+ return (set_errno(ENOMEM));
+ }
+
+ /* See big theory comment explaining context ID. */
+ VERIFY(PAGESIZE >= new_size);
+ lxpd->l_io_ctxs = kmem_zalloc(new_size, KM_SLEEP);
+
+ bcopy(old_array, lxpd->l_io_ctxs, old_size);
+ kmem_free(old_array, old_size);
+ lxpd->l_io_ctx_cnt = new_cnt;
+
+ /* note: 'slot' is now valid in the new array */
+ }
+ }
+
+ cp = kmem_zalloc(sizeof (lx_io_ctx_t), KM_SLEEP);
+ list_create(&cp->lxioctx_free, sizeof (lx_io_elem_t),
+ offsetof(lx_io_elem_t, lxioelem_link));
+ list_create(&cp->lxioctx_pending, sizeof (lx_io_elem_t),
+ offsetof(lx_io_elem_t, lxioelem_link));
+ list_create(&cp->lxioctx_done, sizeof (lx_io_elem_t),
+ offsetof(lx_io_elem_t, lxioelem_link));
+ mutex_init(&cp->lxioctx_f_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&cp->lxioctx_p_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&cp->lxioctx_d_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&cp->lxioctx_pending_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&cp->lxioctx_done_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Add a hold on this context until we're done setting up */
+ cp->lxioctx_in_use = 1;
+ lxpd->l_io_ctxs[slot] = cp;
+
+ cid = CTXID_TO_PTR(lxpd, slot);
+
+ mutex_exit(&lxpd->l_io_ctx_lock);
+
+ /*
+ * Finish setting up the context.
+ *
+ * The context is in the l_io_ctxs array now, so it is potentially
+ * visible to other threads. However, we have a hold so it cannot be
+ * destroyed, and both lxioctx_free_cnt and lxioctx_maxn are still 0,
+ * so nothing can be submitted to this context yet either.
+ */
+
+ /* Setup the free list of internal control block elements */
+ for (i = 0; i < nr_events; i++) {
+ ep = kmem_zalloc(sizeof (lx_io_elem_t), KM_SLEEP);
+ list_insert_head(&cp->lxioctx_free, ep);
+ }
+
+ /*
+ * Pre-allocate the worker threads at setup time.
+ *
+ * Based on how much concurrent input we may be given, we want enough
+ * worker threads to get good parallelism but we also want to taper off
+ * and cap at our upper limit. Our zone's ZFS I/O limit may also come
+ * into play when we're pumping lots of I/O in parallel.
+ *
+ * Note: a possible enhancement here would be to also limit the number
+ * of worker threads based on the zone's cpu-cap. That is, if the
+ * cap is low, we might not want too many worker threads.
+ */
+ if (nr_events <= lx_aio_base_workers) {
+ nworkers = nr_events;
+ } else {
+ /* scale up until we hit the max */
+ nworkers = (nr_events / 2) + (lx_aio_base_workers / 2);
+ if (nworkers > lx_aio_max_workers)
+ nworkers = lx_aio_max_workers;
+ }
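+
+ /*
+ * With the default tunables the above works out as: an nr_events of 8
+ * gets 8 workers, 48 gets (48 / 2) + (16 / 2) == 32, and larger
+ * values stay capped at lx_aio_max_workers.
+ */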
+
+ sigfillset(&hold_set);
+ for (i = 0; i < nworkers; i++) {
+ klwp_t *l;
+ kthread_t *t;
+
+ /*
+ * Note that this lwp will not "stop at sys_rtt" as described
+ * on lwp_create. This lwp will run entirely in the kernel as
+ * a worker thread serving aio requests.
+ */
+ l = lwp_create(lx_io_worker, (void *)cp, 0, p, TS_STOPPED,
+ minclsyspri - 1, &hold_set, curthread->t_cid, 0);
+ if (l == NULL) {
+ if (i == 0) {
+ /*
+ * Uh-oh - we can't create a single worker.
+ * Release our hold which will cleanup.
+ */
+ cp->lxioctx_shutdown = B_TRUE;
+ mutex_enter(&lxpd->l_io_ctx_lock);
+ cp->lxioctx_maxn = nr_events;
+ mutex_exit(&lxpd->l_io_ctx_lock);
+ lx_io_cp_rele(cp);
+ return (set_errno(ENOMEM));
+ } else {
+ /*
+ * No new lwp but we already have at least 1
+ * worker so don't fail entire syscall.
+ */
+ break;
+ }
+ }
+
+ atomic_inc_32(&cp->lxioctx_in_use);
+
+ /*
+ * Mark it as an in-kernel thread, an lx AIO worker LWP, and
+ * set it running.
+ */
+ t = lwptot(l);
+ mutex_enter(&curproc->p_lock);
+ t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
+ lwptolxlwp(l)->br_lwp_flags |= BR_AIO_LWP;
+ lwp_create_done(t);
+ mutex_exit(&curproc->p_lock);
+ }
+
+ /*
+ * io_submit can occur once lxioctx_free_cnt and lxioctx_maxn are
+ * non-zero.
+ */
+ mutex_enter(&lxpd->l_io_ctx_lock);
+ cp->lxioctx_maxn = cp->lxioctx_free_cnt = nr_events;
+ mutex_exit(&lxpd->l_io_ctx_lock);
+ /* Release our hold, worker thread refs keep ctx alive. */
+ lx_io_cp_rele(cp);
+
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ uintptr32_t cid32 = (uintptr32_t)cid;
+
+ if (copyout(&cid32, ctxp, sizeof (cid32)) != 0) {
+ /* Since we did a copyin above, this shouldn't fail */
+ (void) lx_io_destroy(cid);
+ return (set_errno(EFAULT));
+ }
+ } else
+#endif
+ if (copyout(&cid, ctxp, sizeof (cid)) != 0) {
+ /* Since we did a copyin above, this shouldn't fail */
+ (void) lx_io_destroy(cid);
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
+
+long
+lx_io_submit(lx_aio_context_t cid, const long nr, uintptr_t **bpp)
+{
+ uint_t i = 0;
+ int err = 0;
+ const size_t sz = nr * sizeof (uintptr_t);
+ lx_io_ctx_t *cp;
+ lx_io_elem_t *ep;
+ lx_iocb_t **iocbpp;
+
+ if ((cp = lx_io_cp_hold(cid)) == NULL)
+ return (set_errno(EINVAL));
+
+ if (nr == 0) {
+ lx_io_cp_rele(cp);
+ return (0);
+ }
+
+ if (nr < 0 || nr > cp->lxioctx_maxn) {
+ lx_io_cp_rele(cp);
+ return (set_errno(EINVAL));
+ }
+
+ if (nr > MAX_ALLOC_ON_STACK) {
+ iocbpp = (lx_iocb_t **)kmem_alloc(sz, KM_NOSLEEP);
+ if (iocbpp == NULL) {
+ lx_io_cp_rele(cp);
+ return (set_errno(EAGAIN));
+ }
+ } else {
+ iocbpp = (lx_iocb_t **)alloca(sz);
+ }
+
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ uintptr32_t *iocbpp32;
+
+ if (copyin(bpp, iocbpp, nr * sizeof (uintptr32_t)) != 0) {
+ lx_io_cp_rele(cp);
+ err = EFAULT;
+ goto out;
+ }
+
+ /*
+ * Zero-extend the 32-bit pointers to proper size. This is
+ * performed "in reverse" so it can be done in-place, rather
+ * than with an additional translation copy.
+ */
+ iocbpp32 = (uintptr32_t *)iocbpp;
+ i = nr;
+ do {
+ i--;
+ iocbpp[i] = (lx_iocb_t *)(uintptr_t)iocbpp32[i];
+ } while (i != 0);
+ } else
+#endif
+ if (copyin(bpp, iocbpp, nr * sizeof (uintptr_t)) != 0) {
+ lx_io_cp_rele(cp);
+ err = EFAULT;
+ goto out;
+ }
+
+ /* We need to return an error if we're unable to process any of them */
+ mutex_enter(&cp->lxioctx_f_lock);
+ if (cp->lxioctx_free_cnt == 0) {
+ mutex_exit(&cp->lxioctx_f_lock);
+ lx_io_cp_rele(cp);
+ err = EAGAIN;
+ goto out;
+ }
+ mutex_exit(&cp->lxioctx_f_lock);
+
+ for (i = 0; i < nr; i++) {
+ lx_iocb_t cb;
+ file_t *fp, *resfp = NULL;
+
+ if (cp->lxioctx_shutdown)
+ break;
+
+ if (copyin(iocbpp[i], &cb, sizeof (lx_iocb_t)) != 0) {
+ err = EFAULT;
+ break;
+ }
+
+ /* There is only one valid flag */
+ if (cb.lxiocb_flags & ~LX_IOCB_FLAG_RESFD) {
+ err = EINVAL;
+ break;
+ }
+
+ switch (cb.lxiocb_op) {
+ case LX_IOCB_CMD_FSYNC:
+ case LX_IOCB_CMD_FDSYNC:
+ case LX_IOCB_CMD_PREAD:
+ case LX_IOCB_CMD_PWRITE:
+ break;
+
+ /*
+ * We don't support asynchronous preadv and pwritev (an
+ * asynchronous scatter/gather being a somewhat odd
+ * notion to begin with); we return EINVAL for that
+ * case, which the caller should be able to deal with.
+ * We also return EINVAL for LX_IOCB_CMD_NOOP or any
+ * unrecognized opcode.
+ */
+ default:
+ err = EINVAL;
+ break;
+ }
+ if (err != 0)
+ break;
+
+ /* Validate fd */
+ if ((fp = getf(cb.lxiocb_fd)) == NULL) {
+ err = EBADF;
+ break;
+ }
+
+ if (cb.lxiocb_op == LX_IOCB_CMD_PREAD &&
+ (fp->f_flag & FREAD) == 0) {
+ err = EBADF;
+ releasef(cb.lxiocb_fd);
+ break;
+ } else if (cb.lxiocb_op == LX_IOCB_CMD_PWRITE &&
+ (fp->f_flag & FWRITE) == 0) {
+ err = EBADF;
+ releasef(cb.lxiocb_fd);
+ break;
+ }
+
+ /*
+ * A character device is a bit complicated. Linux seems to
+ * accept these on some devices (e.g. /dev/zero) but not
+ * others (e.g. /proc/self/fd/0). This might be related to
+ * the device being seek-able, but a simple seek-set to the
+ * current offset will succeed for us on a pty. For now we
+ * handle this by rejecting the device if it is a stream.
+ *
+ * If it is a pipe (VFIFO) or directory (VDIR), we error here
+ * as does Linux. If it is a socket (VSOCK), it's ok here but
+ * we will post ESPIPE when processing the I/O CB, as does
+ * Linux. We also error on our other types: VDOOR, VPROC,
+ * VPORT, VBAD.
+ */
+ if (fp->f_vnode->v_type == VCHR) {
+ if (fp->f_vnode->v_stream != NULL) {
+ err = EINVAL;
+ releasef(cb.lxiocb_fd);
+ break;
+ }
+ } else if (fp->f_vnode->v_type != VREG &&
+ fp->f_vnode->v_type != VBLK &&
+ fp->f_vnode->v_type != VSOCK) {
+ err = EINVAL;
+ releasef(cb.lxiocb_fd);
+ break;
+ }
+
+ if (cb.lxiocb_flags & LX_IOCB_FLAG_RESFD) {
+ if ((resfp = getf(cb.lxiocb_resfd)) == NULL ||
+ !lx_is_eventfd(resfp)) {
+ err = EINVAL;
+ releasef(cb.lxiocb_fd);
+ if (resfp != NULL)
+ releasef(cb.lxiocb_resfd);
+ break;
+ }
+ }
+
+ mutex_enter(&cp->lxioctx_f_lock);
+ if (cp->lxioctx_free_cnt == 0) {
+ mutex_exit(&cp->lxioctx_f_lock);
+ releasef(cb.lxiocb_fd);
+ if (cb.lxiocb_flags & LX_IOCB_FLAG_RESFD) {
+ releasef(cb.lxiocb_resfd);
+ }
+ if (i == 0) {
+ /*
+ * Another thread used all of the free entries
+ * after the check preceding this loop. Since
+ * we did nothing, we must return an error.
+ */
+ err = EAGAIN;
+ }
+ break;
+ }
+ ep = list_remove_head(&cp->lxioctx_free);
+ cp->lxioctx_free_cnt--;
+ ASSERT(ep != NULL);
+ mutex_exit(&cp->lxioctx_f_lock);
+
+ ep->lxioelem_op = cb.lxiocb_op;
+ ep->lxioelem_fd = cb.lxiocb_fd;
+ ep->lxioelem_fp = fp;
+ ep->lxioelem_buf = (void *)(uintptr_t)cb.lxiocb_buf;
+ ep->lxioelem_nbytes = cb.lxiocb_nbytes;
+ ep->lxioelem_offset = cb.lxiocb_offset;
+ ep->lxioelem_data = cb.lxiocb_data;
+ ep->lxioelem_cbp = iocbpp[i];
+
+ /* Hang on to the fp but setup to hand it off to a worker */
+ clear_active_fd(cb.lxiocb_fd);
+
+ if (cb.lxiocb_flags & LX_IOCB_FLAG_RESFD) {
+ ep->lxioelem_flags = LX_IOCB_FLAG_RESFD;
+ ep->lxioelem_resfd = cb.lxiocb_resfd;
+ ep->lxioelem_resfp = resfp;
+ clear_active_fd(cb.lxiocb_resfd);
+ }
+
+ mutex_enter(&cp->lxioctx_p_lock);
+ list_insert_tail(&cp->lxioctx_pending, ep);
+ cv_signal(&cp->lxioctx_pending_cv);
+ mutex_exit(&cp->lxioctx_p_lock);
+ }
+
+ lx_io_cp_rele(cp);
+
+out:
+ if (nr > MAX_ALLOC_ON_STACK) {
+ kmem_free(iocbpp, sz);
+ }
+ if (i == 0 && err != 0)
+ return (set_errno(err));
+
+ return (i);
+}
+
+long
+lx_io_getevents(lx_aio_context_t cid, long min_nr, const long nr,
+ lx_io_event_t *events, timespec_t *timeoutp)
+{
+ int i;
+ lx_io_ctx_t *cp;
+ const size_t sz = nr * sizeof (lx_io_event_t);
+ timespec_t timeout, *tp;
+ lx_io_event_t *out;
+
+ if ((cp = lx_io_cp_hold(cid)) == NULL)
+ return (set_errno(EINVAL));
+
+ if (min_nr < 0 || min_nr > cp->lxioctx_maxn ||
+ nr < 0 || nr > cp->lxioctx_maxn) {
+ lx_io_cp_rele(cp);
+ return (set_errno(EINVAL));
+ }
+
+ if (nr == 0) {
+ lx_io_cp_rele(cp);
+ return (0);
+ }
+
+ if (events == NULL) {
+ lx_io_cp_rele(cp);
+ return (set_errno(EFAULT));
+ }
+
+ if (timeoutp == NULL) {
+ tp = NULL;
+ } else {
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(timeoutp, &timeout, sizeof (timestruc_t))) {
+ lx_io_cp_rele(cp);
+ return (EFAULT);
+ }
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ timestruc32_t timeout32;
+ if (copyin(timeoutp, &timeout32,
+ sizeof (timestruc32_t))) {
+ lx_io_cp_rele(cp);
+ return (EFAULT);
+ }
+ timeout.tv_sec = (time_t)timeout32.tv_sec;
+ timeout.tv_nsec = timeout32.tv_nsec;
+ }
+#endif
+
+ if (itimerspecfix(&timeout)) {
+ lx_io_cp_rele(cp);
+ return (EINVAL);
+ }
+
+ tp = &timeout;
+ if (timeout.tv_sec == 0 && timeout.tv_nsec == 0) {
+ /*
+ * A timeout of 0:0 is like a poll; we return however
+ * many events are ready, irrespective of the passed
+ * min_nr.
+ */
+ min_nr = 0;
+ } else {
+ timestruc_t now;
+
+ /*
+ * We're given a relative time; add it to the current
+ * time to derive an absolute time.
+ */
+ gethrestime(&now);
+ timespecadd(tp, &now);
+ }
+ }
+
+ out = kmem_zalloc(sz, KM_SLEEP);
+
+ /*
+ * A min_nr of 0 is like a poll even if given a NULL timeout; we return
+ * however many events are ready.
+ */
+ if (min_nr > 0) {
+ mutex_enter(&cp->lxioctx_d_lock);
+ while (!cp->lxioctx_shutdown && cp->lxioctx_done_cnt < min_nr) {
+ int r;
+
+ r = cv_waituntil_sig(&cp->lxioctx_done_cv,
+ &cp->lxioctx_d_lock, tp, timechanged);
+ if (r < 0) {
+ /* timeout */
+ mutex_exit(&cp->lxioctx_d_lock);
+ lx_io_cp_rele(cp);
+ kmem_free(out, sz);
+ return (0);
+ } else if (r == 0) {
+ /* interrupted */
+ mutex_exit(&cp->lxioctx_d_lock);
+ lx_io_cp_rele(cp);
+ kmem_free(out, sz);
+ return (set_errno(EINTR));
+ }
+
+ /*
+ * Signalled that something was queued up. Check if
+ * there are now enough or if we have to wait for more.
+ */
+ }
+ ASSERT(cp->lxioctx_done_cnt >= min_nr || cp->lxioctx_shutdown);
+ mutex_exit(&cp->lxioctx_d_lock);
+ }
+
+ /*
+ * For each done control block, move it into the Linux event we return.
+ * As we're doing this, we also move it from the done list to the
+ * free list.
+ */
+ for (i = 0; i < nr && !cp->lxioctx_shutdown; i++) {
+ lx_io_event_t *lxe;
+ lx_io_elem_t *ep;
+
+ lxe = &out[i];
+
+ mutex_enter(&cp->lxioctx_d_lock);
+ if (cp->lxioctx_done_cnt == 0) {
+ mutex_exit(&cp->lxioctx_d_lock);
+ break;
+ }
+
+ ep = list_remove_head(&cp->lxioctx_done);
+ cp->lxioctx_done_cnt--;
+ mutex_exit(&cp->lxioctx_d_lock);
+
+ lxe->lxioe_data = ep->lxioelem_data;
+ lxe->lxioe_object = (uint64_t)(uintptr_t)ep->lxioelem_cbp;
+ lxe->lxioe_res = ep->lxioelem_res;
+ lxe->lxioe_res2 = 0;
+
+ /* Put it back on the free list */
+ ep->lxioelem_cbp = NULL;
+ ep->lxioelem_data = 0;
+ ep->lxioelem_res = 0;
+ mutex_enter(&cp->lxioctx_f_lock);
+ list_insert_head(&cp->lxioctx_free, ep);
+ cp->lxioctx_free_cnt++;
+ mutex_exit(&cp->lxioctx_f_lock);
+ }
+
+ lx_io_cp_rele(cp);
+
+ /*
+ * Note: Linux seems to push the events back into the queue if the
+ * copyout fails. Since this error is due to an application bug, it
+ * seems unlikely we need to worry about it, but we can revisit this
+ * if it is ever seen to be an issue.
+ */
+ if (i > 0 && copyout(out, events, i * sizeof (lx_io_event_t)) != 0) {
+ kmem_free(out, sz);
+ return (set_errno(EFAULT));
+ }
+
+ kmem_free(out, sz);
+ return (i);
+}
+
+/*
+ * Linux never returns 0 from io_cancel. A successful cancellation will return
+ * EINPROGRESS and the result for the cancelled operation will be available via
+ * a normal io_getevents call. The third parameter (the "result") to this
+ * syscall is unused. Note that currently the Linux man pages are incorrect
+ * about this behavior. Also note that in Linux, only the USB driver currently
+ * supports aio cancellation, so callers will almost always get EINVAL when they
+ * attempt to cancel an IO on Linux.
+ */
+/*ARGSUSED*/
+long
+lx_io_cancel(lx_aio_context_t cid, lx_iocb_t *iocbp, lx_io_event_t *result)
+{
+ lx_io_ctx_t *cp;
+ lx_io_elem_t *ep;
+ uint32_t buf;
+
+ /*
+ * The Linux io_cancel copies in a field from the iocb in order to
+ * locate the matching kernel-internal structure. To appease the LTP
+ * test case which exercises this, a similar copy is performed here.
+ */
+ if (copyin(iocbp, &buf, sizeof (buf)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ if ((cp = lx_io_cp_hold(cid)) == NULL)
+ return (set_errno(EINVAL));
+
+ /* Try to pull the CB off the pending list */
+ mutex_enter(&cp->lxioctx_p_lock);
+ ep = list_head(&cp->lxioctx_pending);
+ while (ep != NULL) {
+ if (ep->lxioelem_cbp == iocbp) {
+ list_remove(&cp->lxioctx_pending, ep);
+ break;
+ }
+ ep = list_next(&cp->lxioctx_pending, ep);
+ }
+ mutex_exit(&cp->lxioctx_p_lock);
+
+ if (ep == NULL) {
+ lx_io_cp_rele(cp);
+ return (set_errno(EAGAIN));
+ }
+
+ set_active_fd(-1); /* See comment in lx_io_cp_rele */
+ set_active_fd(ep->lxioelem_fd);
+ releasef(ep->lxioelem_fd);
+ ep->lxioelem_fd = 0;
+ ep->lxioelem_fp = NULL;
+ ep->lxioelem_res = -lx_errno(EINTR, EINTR);
+
+ lx_io_finish_op(cp, ep, B_FALSE);
+ lx_io_cp_rele(cp);
+
+ return (set_errno(EINPROGRESS));
+}
+
+long
+lx_io_destroy(lx_aio_context_t cid)
+{
+ lx_proc_data_t *lxpd = ptolxproc(curproc);
+ lx_io_ctx_t *cp;
+ int cnt = 0;
+
+ if ((cp = lx_io_cp_hold(cid)) == NULL)
+ return (set_errno(EINVAL));
+
+ mutex_enter(&lxpd->l_io_ctx_lock);
+ cp->lxioctx_shutdown = B_TRUE;
+
+ /*
+ * Wait for the worker threads and any blocked io_getevents threads to
+ * exit. We have a hold and our rele will cleanup after all other holds
+ * are released.
+ */
+ ASSERT(cp->lxioctx_in_use >= 1);
+ while (cp->lxioctx_in_use > 1) {
+ DTRACE_PROBE2(lx__io__destroy, lx_io_ctx_t *, cp, int, cnt);
+ cv_broadcast(&cp->lxioctx_pending_cv);
+ cv_broadcast(&cp->lxioctx_done_cv);
+
+ /*
+ * Each worker has a hold. We want to let those threads finish
+ * up and exit.
+ */
+ cv_wait(&lxpd->l_io_destroy_cv, &lxpd->l_io_ctx_lock);
+ cnt++;
+ }
+
+ mutex_exit(&lxpd->l_io_ctx_lock);
+ lx_io_cp_rele(cp);
+ return (0);
+}
+
+/*
+ * Called at proc fork to clear contexts from child. We don't bother to unmap
+ * l_io_ctxpage since the vast majority of processes will immediately exec and
+ * cause an unmapping. If the child does not exec, there will simply be a
+ * single shared page in its address space, so no additional anonymous memory
+ * is consumed.
+ */
+void
+lx_io_clear(lx_proc_data_t *cpd)
+{
+ cpd->l_io_ctxs = NULL;
+ cpd->l_io_ctx_cnt = 0;
+ cpd->l_io_ctxpage = NULL;
+}
+
+/*
+ * Called via lx_proc_exit to cleanup any existing io context array. All
+ * worker threads should have already exited by this point, so all contexts
+ * should already be deleted.
+ */
+void
+lx_io_cleanup(proc_t *p)
+{
+ lx_proc_data_t *lxpd;
+ int i;
+
+ mutex_enter(&p->p_lock);
+ VERIFY((lxpd = ptolxproc(p)) != NULL);
+ mutex_exit(&p->p_lock);
+
+ mutex_enter(&lxpd->l_io_ctx_lock);
+ if (lxpd->l_io_ctxs == NULL) {
+ ASSERT(lxpd->l_io_ctx_cnt == 0);
+ mutex_exit(&lxpd->l_io_ctx_lock);
+ return;
+ }
+
+ ASSERT(lxpd->l_io_ctx_cnt > 0);
+ for (i = 0; i < lxpd->l_io_ctx_cnt; i++) {
+ ASSERT(lxpd->l_io_ctxs[i] == NULL);
+ }
+
+ kmem_free(lxpd->l_io_ctxs, lxpd->l_io_ctx_cnt * sizeof (lx_io_ctx_t *));
+ lxpd->l_io_ctxs = NULL;
+ lxpd->l_io_ctx_cnt = 0;
+ mutex_exit(&lxpd->l_io_ctx_lock);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_brk.c b/usr/src/uts/common/brand/lx/syscall/lx_brk.c
new file mode 100644
index 0000000000..d46e442759
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_brk.c
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/errno.h>
+
+/* From usr/src/uts/common/os/grow.c */
+extern intptr_t brk(caddr_t);
+
+long
+lx_brk(caddr_t nva)
+{
+ if (nva != 0) {
+ (void) brk(nva);
+
+ /*
+ * Despite claims to the contrary in the man page, when Linux
+ * brk(2) fails, errno is left unchanged.
+ */
+ ttolwp(curthread)->lwp_errno = 0;
+ }
+
+ /*
+ * When ASLR was integrated, our internal brk(2) was updated to return
+ * the current brk when arg0 == 0. Using the function yields an
+ * equivalent result to manually calculating the brk, but also
+ * serializes with changes to the process AS.
+ */
+ return ((long)brk((caddr_t)0));
+}
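+
+/*
+ * An illustrative consequence of the above: a failed lx_brk() request
+ * still returns the (unchanged) current break rather than -1, so Linux
+ * callers such as glibc detect failure by comparing the returned break
+ * against the requested address instead of consulting errno.
+ */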
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chmod.c b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c
new file mode 100644
index 0000000000..7783b97cb0
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c
@@ -0,0 +1,107 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/thread.h>
+#include <sys/klwp.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+
+long
+lx_vn_chmod(vnode_t *vp, int mode)
+{
+ vattr_t vattr;
+
+ vattr.va_mode = mode & MODEMASK;
+ vattr.va_mask = AT_MODE;
+
+ if (vn_is_readonly(vp)) {
+ return (EROFS);
+ }
+ return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL));
+}
+
+static long
+lx_fchmodat_wrapper(int fd, char *path, int mode)
+{
+ long error;
+ vnode_t *vp;
+
+ if ((error = lx_vp_at(fd, path, &vp, 0)) != 0) {
+ lx_proc_data_t *pd = ttolxproc(curthread);
+
+ /*
+ * If the process is in "install mode", return success
+ * if the operation failed due to an absent file.
+ */
+ if (error == ENOENT &&
+ (pd->l_flags & LX_PROC_INSTALL_MODE)) {
+ return (0);
+ }
+ return (set_errno(error));
+ }
+
+ error = lx_vn_chmod(vp, mode);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_fchmodat(int fd, char *path, int mode)
+{
+ return (lx_fchmodat_wrapper(fd, path, mode));
+}
+
+long
+lx_fchmod(int fd, int mode)
+{
+ file_t *fp;
+ vnode_t *vp;
+ long error;
+
+ /*
+ * In order to do proper O_PATH handling, lx_fchmod cannot leverage
+ * lx_fchmodat with a NULL path since the desired behavior differs.
+ */
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ if (LX_IS_O_PATH(fp)) {
+ releasef(fd);
+ return (set_errno(EBADF));
+ }
+ vp = fp->f_vnode;
+ VN_HOLD(vp);
+ releasef(fd);
+
+ error = lx_vn_chmod(vp, mode);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_chmod(char *path, int mode)
+{
+ return (lx_fchmodat_wrapper(LX_AT_FDCWD, path, mode));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chown.c b/usr/src/uts/common/brand/lx/syscall/lx_chown.c
new file mode 100644
index 0000000000..830fba0a73
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_chown.c
@@ -0,0 +1,180 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/zone.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_types.h>
+
+long
+lx_vn_chown(vnode_t *vp, uid_t uid, gid_t gid)
+{
+ vattr_t vattr;
+ zone_t *zone = crgetzone(CRED());
+
+ if ((uid != (uid_t)-1 && !VALID_UID(uid, zone)) ||
+ (gid != (gid_t)-1 && !VALID_GID(gid, zone))) {
+ return (EINVAL);
+ }
+ vattr.va_uid = uid;
+ vattr.va_gid = gid;
+ vattr.va_mask = 0;
+ if (vattr.va_uid != -1)
+ vattr.va_mask |= AT_UID;
+ if (vattr.va_gid != -1)
+ vattr.va_mask |= AT_GID;
+
+ if (vn_is_readonly(vp)) {
+ return (EROFS);
+ }
+ return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL));
+}
+
+long
+lx_fchownat_wrapper(int fd, char *path, uid_t uid, gid_t gid, int native_flag)
+{
+ long error;
+ vnode_t *vp;
+
+ if ((error = lx_vp_at(fd, path, &vp, native_flag)) != 0) {
+ lx_proc_data_t *pd = ttolxproc(curthread);
+
+ /*
+ * If the process is in "install mode", return success
+ * if the operation failed due to an absent file.
+ */
+ if (error == ENOENT &&
+ (pd->l_flags & LX_PROC_INSTALL_MODE)) {
+ return (0);
+ }
+ return (set_errno(error));
+ }
+
+ error = lx_vn_chown(vp, uid, gid);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_fchown_wrapper(int fd, uid_t uid, gid_t gid)
+{
+ file_t *fp;
+ vnode_t *vp;
+ long error;
+
+ /*
+ * In order to do proper O_PATH handling, lx_fchown cannot leverage
+ * lx_fchownat with a NULL path since the desired behavior differs.
+ */
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ if (LX_IS_O_PATH(fp)) {
+ releasef(fd);
+ return (set_errno(EBADF));
+ }
+ vp = fp->f_vnode;
+ VN_HOLD(vp);
+ releasef(fd);
+
+ error = lx_vn_chown(vp, uid, gid);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag)
+{
+ int native_flag = 0;
+
+ if (flag & LX_AT_EMPTY_PATH) {
+ char c;
+
+ /*
+ * According to fchownat(2), when AT_EMPTY_PATH is set: "if
+ * path is an empty string, operate on the file referred to by
+ * fd". We pass NULL in place of the empty string, which
+ * causes fchownat() to operate on the fd we passed without an
+ * additional lookup.
+ */
+ if (copyin(path, &c, sizeof (c)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ if (c == '\0') {
+ path = NULL;
+ }
+
+ flag &= ~LX_AT_EMPTY_PATH;
+ }
+ if (flag & LX_AT_SYMLINK_NOFOLLOW) {
+ flag &= ~LX_AT_SYMLINK_NOFOLLOW;
+ native_flag |= AT_SYMLINK_NOFOLLOW;
+ }
+ if (flag != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ return (lx_fchownat_wrapper(fd, path, uid, gid, native_flag));
+}
+
+long
+lx_fchown(int fd, uid_t uid, gid_t gid)
+{
+ return (lx_fchown_wrapper(fd, uid, gid));
+}
+
+long
+lx_lchown(char *path, uid_t uid, gid_t gid)
+{
+ return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid,
+ AT_SYMLINK_NOFOLLOW));
+}
+
+long
+lx_chown(char *path, uid_t uid, gid_t gid)
+{
+ return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, 0));
+}
+
+long
+lx_fchown16(int fd, lx_uid16_t uid, lx_gid16_t gid)
+{
+ return (lx_fchown_wrapper(fd, LX_UID16_TO_UID32(uid),
+ LX_GID16_TO_GID32(gid)));
+}
+
+long
+lx_lchown16(char *path, uid_t uid, gid_t gid)
+{
+ return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid),
+ LX_GID16_TO_GID32(gid), AT_SYMLINK_NOFOLLOW));
+}
+
+long
+lx_chown16(char *path, lx_uid16_t uid, lx_gid16_t gid)
+{
+ return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid),
+ LX_GID16_TO_GID32(gid), 0));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c
new file mode 100644
index 0000000000..4e00e90b1a
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c
@@ -0,0 +1,513 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * [This comment omits the 'LX_' prefix on the clone flag names.]
+ *
+ * The vast majority of clone calls result in the creation of a new process or
+ * a new thread. Both of these map easily from Linux to our native code. For
+ * these calls, the user-level brand library uses a brand call to hook into the
+ * lx_helper_clone function for the required in-kernel support.
+ *
+ * A fork will typically provide these clone flags:
+ * CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID
+ *
+ * A new thread will use our SHARED_AS macro which has the flags:
+ * CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM
+ *
+ * In rare cases an application will attempt to use a subset of the SHARED_AS
+ * flags in order to implement some sharing between two processes without using
+ * a true thread. Because we do not have native support for this concept, the
+ * lx brand implements the notion of a 'clone-group'. This is a set of
+ * processes which share a subset of the allowed SHARED_AS flags. The lx brand
+ * syscalls implement the appropriate sharing for each flag. A clone-group is
+ * only instantiated in the rare case that a subset of the SHARED_AS flags are
+ * used with clone.
+ *
+ * The following set of flags could theoretically be supported, although most
+ * are not implemented at this time. The user-level brand library will validate
+ * that a supported subset of the flags are being used, or error if not. We
+ * also re-validate in the kernel.
+ *
+ * CLONE_FILES: share the file descriptor table
+ * CLONE_FS: share the filesystem information (root of the filesystem, the
+ * CWD, and the umask)
+ * CLONE_SIGHAND: share the table of signal handlers
+ * CLONE_THREAD: share the thread group
+ * CLONE_VM: share the address space
+ *
+ * At this time, only those flags defined in CLONE_GRP_SUBSET (CLONE_FS) are
+ * implemented.
+ *
+ * When a clone-group is in use, the lx_proc_data_t`l_clone_grps array will
+ * hold groups of processes sharing the attributes relevant to the clone flag.
+ * Each supported flag can have an associated group list in the array.
+ *
+ * On the first clone, a new lx_clone_grp_t struct will be created. This struct
+ * holds a pointer to each process in the group. A reference to that group is
+ * held in the appropriate slot in l_clone_grps. The struct is created for
+ * the parent process by lx_clone_grp_create() and then the child process will
+ * associate itself with the group(s) using lx_clone_grp_enter().
+ *
+ * Each syscall acting upon attributes relevant to a clone-group must include
+ * logic to do so properly. The syscalls will use lx_clone_grp_member() to
+ * determine if clone-group handling is required, and use lx_clone_grp_walk()
+ * to walk the list of processes in the group and apply the provided callback
+ * to each process.
+ *
+ * The following example illustrates how a common clone group would be used,
+ * as processes clone with the same set of CLONE_* flags.
+ * A clones B with CLONE_FS
+ * B clones C with CLONE_FS
+ * When A clones B, a new clone group is created and saved in the LX_CLGRP_FS
+ * slot in the l_clone_grps array on both A and B. When B clones, since a group
+ * already exists, C is added to the group and the group is saved in the
+ * LX_CLGRP_FS slot on C.
+ *
+ * The following example illustrates how two common clone groups would be used,
+ * as processes clone with the same set of CLONE_* flags.
+ * A clones B with CLONE_FS|CLONE_THREAD
+ * A new clone group is created and saved in the LX_CLGRP_FS slot in the
+ * l_clone_grps array on both A and B. A second clone group is created and
+ * saved in the LX_CLGRP_THREAD slot on both A and B (note that LX_CLGRP_THREAD
+ * is not implemented at this time).
+ *
+ * The following example illustrates how different clone groups would be used,
+ * as processes clone with different sets of CLONE_* flags.
+ * A clones B with CLONE_FS
+ * B clones C with CLONE_THREAD
+ * C clones D with CLONE_FS
+ * In this example, only A&B and C&D should share their FS information. B&C
+ * have to be in two clone groups. When A clones, a new clone group is created
+ * and saved in the LX_CLGRP_FS slot in the l_clone_grps array on both A and B.
+ * When B clones, a new clone group is created and saved in the LX_CLGRP_THREAD
+ * slot on both B and C (note that LX_CLGRP_THREAD is not implemented at this
+ * time). When C clones, a new clone group is created and saved in the
+ * LX_CLGRP_FS slot on both C and D.
+ *
+ * When a process exits, it removes itself from any groups to which it belongs.
+ * When the last process exits a group, it is cleaned up.
+ *
+ * If clone-groups were commonly used, this implementation would be inefficient
+ * and unwieldy, but since they are so rare a straightforward list-based
+ * approach is adequate.
+ *
+ * During group creation, the l_clone_grp_lock is taken first to ensure that
+ * only one group is created per flag; otherwise, the group's own
+ * lx_clgrp_lock is the only lock protecting the member list.
+ *
+ * Note: Despite the locking, there is still a subtle race that can occur in
+ * this code. This occurs if a process has two threads and one of them is about
+ * to execute a clone-group aware syscall (e.g. chdir), while the other thread
+ * is forking to create a new clone-group. In theory the child process could be
+ * created, but not yet in the group. The syscall in the first thread could
+ * thus miss the new process. For example, the first thread might chdir the
+ * parent, but because the child process was already created, yet not in the
+ * clone-group, the child would not be chdir-ed.
+ */
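+
+/*
+ * Illustrative sketch (not part of the original comment): assuming glibc's
+ * clone(2) wrapper, a Linux program would instantiate an LX_CLGRP_FS
+ * clone-group by passing a SHARED_AS subset such as CLONE_FS on its own:
+ *
+ *	#define _GNU_SOURCE
+ *	#include <sched.h>
+ *	#include <signal.h>
+ *
+ *	static char stack[65536];
+ *
+ *	static int
+ *	child(void *arg)
+ *	{
+ *		return (0);
+ *	}
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		pid_t pid = clone(child, stack + sizeof (stack),
+ *		    CLONE_FS | SIGCHLD, NULL);
+ *		return (pid == -1);
+ *	}
+ *
+ * Parent and child then share their fs information, so a chdir(2) or
+ * umask(2) in either process is applied to both by the clone-group aware
+ * syscalls.
+ */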
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_ldt.h>
+#include <sys/lx_misc.h>
+#include <lx_signum.h>
+#include <lx_syscall.h>
+#include <sys/x86_archext.h>
+#include <sys/controlregs.h>
+
+/*
+ * We currently only support a single clone-group (CLONE_FS) but the design
+ * allows for future expansion by expanding the lx_proc_data_t`l_clone_grps
+ * array.
+ */
+static int
+lx_clone_flag2grp(uint_t flag)
+{
+ if (flag & LX_CLONE_FS)
+ return (LX_CLGRP_FS);
+
+ return (-1);
+}
+
+/*
+ * Note: this function has the side effect of clearing the flags.
+ */
+static int
+lx_clone_flags_iter(uint_t *fp)
+{
+ if (*fp & LX_CLONE_FS) {
+ *fp &= ~LX_CLONE_FS;
+ return (LX_CLGRP_FS);
+ }
+
+ return (-1);
+}
+
+/*
+ * Setup the current process in the proper clone-group(s) and record the
+ * clone-group flags on the lwp so that we can join the child process to the
+ * group during lx_forklwp().
+ */
+void
+lx_clone_grp_create(uint_t flags)
+{
+ int offset;
+ lx_proc_data_t *plproc = ttolxproc(curthread);
+ lx_lwp_data_t *ldp = (lx_lwp_data_t *)ttolwp(curthread)->lwp_brand;
+ lx_clone_grp_t **cgps;
+ lx_clone_grp_t *cgp;
+ lx_clone_grp_member_t *mp;
+
+ if (!LX_IS_CLONE_GRP(flags))
+ return;
+
+ ldp->br_clone_grp_flags = flags & LX_CLONE_GRP_SUBSET;
+
+ cgps = plproc->l_clone_grps;
+ /*
+ * We take the top-level mutex during create to ensure we only create
+ * one group per flag.
+ */
+ mutex_enter(&plproc->l_clone_grp_lock);
+ while ((offset = lx_clone_flags_iter(&flags)) != -1) {
+ cgp = cgps[offset];
+
+ /*
+ * If we already have a clone-group list for this flag then
+ * there is nothing to do.
+ */
+ if (cgp != NULL)
+ continue;
+
+ /*
+ * Create a new clone-group. If it ever becomes an issue, we
+ * could preallocate this memory before taking
+ * l_clone_grp_lock.
+ */
+ cgp = kmem_alloc(sizeof (lx_clone_grp_t), KM_SLEEP);
+ mutex_init(&cgp->lx_clgrp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cgp->lx_clgrp_cnt = 1;
+ list_create(&cgp->lx_clgrp_members,
+ sizeof (lx_clone_grp_member_t),
+ offsetof(lx_clone_grp_member_t, lx_clgrpm_link));
+
+ mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP);
+ mp->lx_clgrpm_pp = curproc;
+ list_insert_tail(&cgp->lx_clgrp_members, mp);
+
+ /* Attach group to our proc */
+ plproc->l_clone_grps[offset] = cgp;
+ }
+ mutex_exit(&plproc->l_clone_grp_lock);
+}
+
+/*
+ * Add the child process to the proper parent clone-group(s).
+ *
+ * Called from lx_forklwp, thus there is no need to have any locking for the
+ * destination proc. This is always run in the thread context of the source
+ * thread, and the destination thread is always newly created and not referred
+ * to from anywhere else. The source process should have already created the
+ * clone group(s) that we need to place the child into via lx_clone_grp_create.
+ */
+void
+lx_clone_grp_enter(uint_t flags, proc_t *srcp, proc_t *dstp)
+{
+ int offset;
+ lx_proc_data_t *plproc = ptolxproc(srcp);
+ lx_proc_data_t *clproc = ptolxproc(dstp);
+ lx_clone_grp_t **cgps;
+ lx_clone_grp_t *cgp;
+ lx_clone_grp_member_t *mp;
+
+ cgps = plproc->l_clone_grps;
+ while ((offset = lx_clone_flags_iter(&flags)) != -1) {
+ cgp = cgps[offset];
+
+ /*
+ * Parent should already have a clone-group list for this flag.
+ * The child joins that group.
+ */
+ VERIFY(cgp != NULL);
+
+ mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP);
+ mp->lx_clgrpm_pp = dstp;
+
+ mutex_enter(&cgp->lx_clgrp_lock);
+ list_insert_tail(&cgp->lx_clgrp_members, mp);
+ cgp->lx_clgrp_cnt++;
+ clproc->l_clone_grps[offset] = cgp;
+ mutex_exit(&cgp->lx_clgrp_lock);
+ }
+}
+
+/*
+ * The process is exiting or we're exec-ing a native app. In the unlikely event
+ * it is in a clone-group, remove it from the group and perform any necessary
+ * cleanup. Normally we're called from lx_proc_exit(), so we know we're the
+ * last lwp in the process, but we can also be called from lx_clearbrand() when
+ * exec-ing a native application, in which case we know the lwp(s) are stopped.
+ * (It is possible to have multiple lwps if we branded the process but the
+ * exec failed; those lwps were branded only as part of the exec and will be
+ * de-branded.)
+ */
+void
+lx_clone_grp_exit(proc_t *p, boolean_t lwps_ok)
+{
+ int i;
+ lx_proc_data_t *plproc = ptolxproc(p);
+ lx_clone_grp_t **cgps;
+
+ ASSERT(!MUTEX_HELD(&p->p_lock));
+ ASSERT(plproc != NULL);
+
+ if (!lwps_ok)
+ VERIFY(p->p_lwpcnt <= 1);
+
+ cgps = plproc->l_clone_grps;
+ for (i = 0; i < LX_CLGRP_MAX; i++) {
+ lx_clone_grp_t *cgp;
+ lx_clone_grp_member_t *mp;
+ boolean_t found;
+
+ cgp = cgps[i];
+ if (cgp == NULL)
+ continue;
+
+ /*
+ * The rare case when this process belongs to a clone-group.
+ */
+
+ mutex_enter(&cgp->lx_clgrp_lock);
+
+ /* First remove ourselves from the group. */
+ found = B_FALSE;
+ mp = list_head(&cgp->lx_clgrp_members);
+ while (mp != NULL) {
+ if (mp->lx_clgrpm_pp == p) {
+ found = B_TRUE;
+ list_remove(&cgp->lx_clgrp_members, mp);
+ kmem_free(mp, sizeof (lx_clone_grp_member_t));
+ ASSERT(cgp->lx_clgrp_cnt > 0);
+ cgp->lx_clgrp_cnt--;
+ plproc->l_clone_grps[i] = NULL;
+ break;
+ }
+ mp = list_next(&cgp->lx_clgrp_members, mp);
+ }
+ VERIFY(found);
+
+ if (cgp->lx_clgrp_cnt > 0) {
+ mutex_exit(&cgp->lx_clgrp_lock);
+ continue;
+ }
+
+ /*
+ * cgp->lx_clgrp_cnt == 0
+ *
+ * We were the last member, so the group is now empty; finish
+ * cleanup now.
+ */
+ ASSERT(plproc->l_clone_grps[i] == NULL);
+ mutex_exit(&cgp->lx_clgrp_lock);
+
+ /* Delete the group since there are no more references to it. */
+ VERIFY(list_is_empty(&cgp->lx_clgrp_members));
+
+ list_destroy(&cgp->lx_clgrp_members);
+ mutex_destroy(&cgp->lx_clgrp_lock);
+ kmem_free(cgp, sizeof (lx_clone_grp_t));
+ }
+}
+
+/*
+ * Return true in the rare case that the process is a member of a clone-group
+ * with the specific flag set. Entries are only ever added to the array (and
+ * removed only when this process exits), so we don't need to take
+ * l_clone_grp_lock to read it.
+ */
+boolean_t
+lx_clone_grp_member(lx_proc_data_t *dp, uint_t flag)
+{
+ int offset;
+
+ if ((offset = lx_clone_flag2grp(flag)) == -1)
+ return (B_FALSE);
+
+ if (dp->l_clone_grps[offset] != NULL) {
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Walk all of the processes in the clone-group list and apply the callback
+ * to each. Because we're holding the group list lock (lx_clgrp_lock) none of
+ * the processes can exit, but that is the only locking guarantee made by this
+ * function itself.
+ */
+int
+lx_clone_grp_walk(lx_proc_data_t *dp, uint_t flag, int (*cb)(proc_t *, void *),
+ void *arg)
+{
+ int offset;
+ lx_clone_grp_t *cgp;
+ lx_clone_grp_member_t *mp;
+ int res, rv = 0;
+
+ ASSERT(dp != NULL);
+ /* We should not be called unless we belong to a group */
+ VERIFY((offset = lx_clone_flag2grp(flag)) != -1);
+ VERIFY(dp->l_clone_grps[offset] != NULL);
+
+ cgp = dp->l_clone_grps[offset];
+ mutex_enter(&cgp->lx_clgrp_lock);
+
+ mp = list_head(&cgp->lx_clgrp_members);
+ while (mp != NULL) {
+ res = cb(mp->lx_clgrpm_pp, arg);
+ /* return the first error we see, but try all procs */
+ if (res != 0 && rv == 0)
+ rv = res;
+ mp = list_next(&cgp->lx_clgrp_members, mp);
+ }
+
+ mutex_exit(&cgp->lx_clgrp_lock);
+
+ return (rv);
+}
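+
+/*
+ * Illustrative sketch (hypothetical callback, not part of this change): a
+ * clone-group aware syscall such as chdir(2) would apply itself to every
+ * member roughly as follows, where cg_chdir_cb() is a hypothetical callback
+ * that applies the new cwd to a single process and returns 0 or an errno:
+ *
+ *	lx_proc_data_t *lproc = ttolxproc(curthread);
+ *
+ *	if (lx_clone_grp_member(lproc, LX_CLONE_FS)) {
+ *		err = lx_clone_grp_walk(lproc, LX_CLONE_FS,
+ *		    cg_chdir_cb, (void *)new_cwd);
+ *	}
+ */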
+
+/*
+ * Our lwp has already been created at this point, so this routine is
+ * responsible for setting up all the state needed to track this as a
+ * Linux cloned thread.
+ */
+/* ARGSUSED */
+int
+lx_helper_clone(int64_t *rval, int flags, void *ptidp, void *tls, void *ctidp)
+{
+ struct lx_lwp_data *lwpd = ttolxlwp(curthread);
+ struct lx_proc_data *lproc = ttolxproc(curthread);
+ struct ldt_info info;
+ struct user_desc descr;
+ int tls_index;
+ int entry = -1;
+ int signo;
+
+ signo = flags & LX_CSIGNAL;
+ if (signo < 0 || signo > LX_NSIG)
+ return (set_errno(EINVAL));
+
+ if (!(flags & LX_CLONE_THREAD)) {
+ lproc->l_signal = signo;
+ } else {
+ if (flags & LX_CLONE_SETTLS) {
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ if (copyin((caddr_t)tls, &info, sizeof (info)))
+ return (set_errno(EFAULT));
+
+ if (LDT_INFO_EMPTY(&info))
+ return (set_errno(EINVAL));
+
+ entry = info.entry_number;
+ if (entry < GDT_TLSMIN || entry > GDT_TLSMAX)
+ return (set_errno(EINVAL));
+
+ tls_index = entry - GDT_TLSMIN;
+
+ /*
+ * Convert the user-space structure into a real
+ * x86 descriptor and copy it into this LWP's
+ * TLS array. We also load it into the GDT.
+ */
+ LDT_INFO_TO_DESC(&info, &descr);
+ bcopy(&descr, &lwpd->br_tls[tls_index],
+ sizeof (descr));
+ lx_set_gdt(entry, &lwpd->br_tls[tls_index]);
+ } else {
+ /*
+ * Set the Linux %fsbase for this LWP. We will
+ * restore it the next time we return to Linux
+ * via setcontext()/lx_restorecontext().
+ */
+ lwpd->br_lx_fsbase = (uintptr_t)tls;
+ }
+ }
+
+ lwpd->br_clear_ctidp =
+ (flags & LX_CLONE_CHILD_CLEARTID) ? ctidp : NULL;
+
+ if (signo && ! (flags & LX_CLONE_DETACH))
+ lwpd->br_signal = signo;
+ else
+ lwpd->br_signal = 0;
+
+ if (flags & LX_CLONE_THREAD)
+ lwpd->br_tgid = curthread->t_procp->p_pid;
+
+ if (flags & LX_CLONE_PARENT)
+ lwpd->br_ppid = 0;
+
+ if ((flags & LX_CLONE_CHILD_SETTID) && (ctidp != NULL) &&
+ (suword32(ctidp, lwpd->br_pid) != 0)) {
+ if (entry >= 0)
+ lx_clear_gdt(entry);
+ return (set_errno(EFAULT));
+ }
+ if ((flags & LX_CLONE_PARENT_SETTID) && (ptidp != NULL) &&
+ (suword32(ptidp, lwpd->br_pid) != 0)) {
+ if (entry >= 0)
+ lx_clear_gdt(entry);
+ return (set_errno(EFAULT));
+ }
+ }
+
+ *rval = lwpd->br_pid;
+ return (0);
+}
+
+long
+lx_set_tid_address(int *tidp)
+{
+ struct lx_lwp_data *lwpd = ttolxlwp(curthread);
+ long rv;
+
+ lwpd->br_clear_ctidp = tidp;
+
+ if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) {
+ rv = 1;
+ } else {
+ rv = lwpd->br_pid;
+ }
+
+ return (rv);
+}
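+
+/*
+ * Usage sketch (assumption, not taken from this change): Linux's C library
+ * typically invokes this once during startup, roughly as
+ *
+ *	#include <sys/syscall.h>
+ *	#include <unistd.h>
+ *
+ *	static int tid_field;
+ *	long tid = syscall(SYS_set_tid_address, &tid_field);
+ *
+ * so that the kernel clears and futex-wakes tid_field on thread exit (the
+ * br_clear_ctidp mechanism above).
+ */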
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_close.c b/usr/src/uts/common/brand/lx/syscall/lx_close.c
new file mode 100644
index 0000000000..5d1a1605c1
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_close.c
@@ -0,0 +1,30 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/brand.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_syscalls.h>
+
+
+extern int close(int);
+
+long
+lx_close(int fdes)
+{
+ return (close(fdes));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_cpu.c b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c
new file mode 100644
index 0000000000..b0a92394dc
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c
@@ -0,0 +1,36 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_impl.h>
+
+/*
+ * We support neither the second argument (NUMA node), nor the third (obsolete
+ * pre-2.6.24 caching functionality which was ultimately broken).
+ */
+/* ARGSUSED1 */
+long
+lx_getcpu(unsigned int *cpu, uintptr_t p2, uintptr_t p3)
+{
+ unsigned int curcpu = curthread->t_cpu->cpu_id;
+
+ if (copyout(&curcpu, cpu, sizeof (curcpu)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_dup.c b/usr/src/uts/common/brand/lx/syscall/lx_dup.c
new file mode 100644
index 0000000000..d0f513753c
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_dup.c
@@ -0,0 +1,53 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_misc.h>
+
+/* From usr/src/uts/common/syscall/fcntl.c */
+extern int fcntl(int, int, intptr_t);
+
+long
+lx_dup(int fd)
+{
+ return (fcntl(fd, F_DUPFD, 0));
+}
+
+long
+lx_dup2(int oldfd, int newfd)
+{
+ return (fcntl(oldfd, F_DUP2FD, newfd));
+}
+
+long
+lx_dup3(int oldfd, int newfd, int flags)
+{
+ int rc;
+
+ /* The only valid flag is O_CLOEXEC. */
+ if (flags & ~LX_O_CLOEXEC)
+ return (set_errno(EINVAL));
+
+ /* Only F_DUP2FD_CLOEXEC returns EINVAL when the fds are the same */
+ if (oldfd == newfd)
+ return (set_errno(EINVAL));
+
+ rc = fcntl(oldfd, (flags == 0) ? F_DUP2FD : F_DUP2FD_CLOEXEC, newfd);
+ return (rc);
+}
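+
+/*
+ * Usage note (illustrative, not from this change): the oldfd == newfd check
+ * above is what differentiates
+ *
+ *	dup3(fd, fd, 0);	(fails with EINVAL)
+ *	dup2(fd, fd);		(succeeds and returns fd)
+ *
+ * matching the documented Linux semantics of dup3(2).
+ */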
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_epoll.c b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c
new file mode 100644
index 0000000000..47688dad6a
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c
@@ -0,0 +1,303 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/epoll.h>
+#include <sys/devpoll.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/vnode.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/lx_signal.h>
+
+static major_t devpoll_major = 0;
+
+static boolean_t
+lx_epoll_isvalid(file_t *fp)
+{
+ vnode_t *vp = fp->f_vnode;
+
+ if (vp->v_type == VCHR && getmajor(vp->v_rdev) == devpoll_major)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+long
+lx_epoll_create1(int flags)
+{
+ int err, fd, rv;
+ int fmode = FREAD | FWRITE;
+ boolean_t cloexec = B_FALSE;
+ vnode_t *vp = NULL;
+ file_t *fp = NULL;
+
+ if (flags & EPOLL_CLOEXEC) {
+ cloexec = B_TRUE;
+ flags &= ~EPOLL_CLOEXEC;
+ }
+ if (flags != 0) {
+ /* No other flags accepted at this time */
+ return (set_errno(EINVAL));
+ }
+
+ if (falloc((vnode_t *)NULL, fmode, &fp, &fd) != 0) {
+ err = EMFILE;
+ goto error;
+ }
+ if (ldi_vp_from_name("/devices/pseudo/poll@0:poll", &vp) != 0) {
+ err = ENOENT;
+ goto error;
+ }
+ if ((err = VOP_OPEN(&vp, fmode | FKLYR, CRED(), NULL)) != 0) {
+ goto error;
+ }
+ err = VOP_IOCTL(vp, DP_EPOLLCOMPAT, 0, fmode, CRED(), &rv, NULL);
+ if (err != 0) {
+ (void) VOP_CLOSE(vp, fmode, 0, 0, CRED(), NULL);
+ goto error;
+ }
+
+ devpoll_major = getmajor(vp->v_rdev);
+
+ fp->f_vnode = vp;
+ mutex_exit(&fp->f_tlock);
+ setf(fd, fp);
+ if (cloexec) {
+ f_setfd(fd, FD_CLOEXEC);
+ }
+ return (fd);
+
+error:
+ if (fp != NULL) {
+ setf(fd, NULL);
+ unfalloc(fp);
+ }
+ if (vp != NULL) {
+ VN_RELE(vp);
+ }
+ return (set_errno(err));
+}
+
+long
+lx_epoll_create(int size)
+{
+ if (size <= 0) {
+ return (set_errno(EINVAL));
+ }
+
+ return (lx_epoll_create1(0));
+}
+
+/* Match values from libc implementation */
+#define EPOLLIGNORED (EPOLLMSG | EPOLLWAKEUP)
+#define EPOLLSWIZZLED \
+ (EPOLLRDHUP | EPOLLONESHOT | EPOLLET | EPOLLWRBAND | EPOLLWRNORM)
+#define EPOLL_TIMEOUT_CLAMP(t) (((t) < -1) ? -1 : (t))
+
+long
+lx_epoll_ctl(int fd, int op, int pfd, void *event)
+{
+ epoll_event_t epevent;
+ dvpoll_epollfd_t dpevent[2];
+ file_t *fp;
+ iovec_t aiov;
+ uio_t auio;
+ uint32_t events, ev = 0;
+ int error = 0, i = 0;
+
+ dpevent[i].dpep_pollfd.fd = pfd;
+ switch (op) {
+ case EPOLL_CTL_DEL:
+ dpevent[i].dpep_pollfd.events = POLLREMOVE;
+ break;
+
+ case EPOLL_CTL_MOD:
+ /*
+ * In the modify case, we pass down two events: one to
+ * remove the event and another to add it back.
+ */
+ dpevent[i++].dpep_pollfd.events = POLLREMOVE;
+ dpevent[i].dpep_pollfd.fd = pfd;
+ /* FALLTHROUGH */
+
+ case EPOLL_CTL_ADD:
+ if (copyin(event, &epevent, sizeof (epevent)) != 0)
+ return (set_errno(EFAULT));
+
+ /*
+ * Mask off the events that we ignore, and then swizzle the
+ * events for which our values differ from their epoll(7)
+ * equivalents.
+ */
+ events = epevent.events;
+ ev = events & ~(EPOLLIGNORED | EPOLLSWIZZLED);
+
+ if (events & EPOLLRDHUP)
+ ev |= POLLRDHUP;
+ if (events & EPOLLET)
+ ev |= POLLET;
+ if (events & EPOLLONESHOT)
+ ev |= POLLONESHOT;
+ if (events & EPOLLWRNORM)
+ ev |= POLLWRNORM;
+ if (events & EPOLLWRBAND)
+ ev |= POLLWRBAND;
+
+ dpevent[i].dpep_data = epevent.data.u64;
+ dpevent[i].dpep_pollfd.events = ev;
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ } else if (!lx_epoll_isvalid(fp)) {
+ releasef(fd);
+ return (set_errno(EINVAL));
+ }
+
+ aiov.iov_base = (void *)dpevent;
+ aiov.iov_len = sizeof (dvpoll_epollfd_t) * (i + 1);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = aiov.iov_len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_loffset = 0;
+ auio.uio_fmode = fp->f_flag;
+
+ error = VOP_WRITE(fp->f_vnode, &auio, 1, fp->f_cred, NULL);
+
+ releasef(fd);
+
+ switch (error) {
+ case 0:
+ return (0);
+
+ case EBADF:
+ case EEXIST:
+ case EINVAL:
+ case ENOENT:
+ case ENOMEM:
+ case ENOSPC:
+ case EPERM:
+ /*
+ * Legal errors should pass straight through.
+ */
+ return (set_errno(error));
+
+ case ELOOP:
+ /*
+ * In the case of descriptor loops, /dev/poll emits a more
+ * descriptive error than Linux epoll consumers would expect.
+ */
+ return (set_errno(EINVAL));
+
+ default:
+ /*
+ * While devpoll itself should not emit unexpected errors, it
+ * is possible that a VOP_POLL handler might. There is little
+ * choice but to map these unexpected errors to something which
+ * is valid for epoll_ctl.
+ */
+ return (set_errno(ENOMEM));
+ }
+}
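+
+/*
+ * Illustrative caller sketch (assumption, not from this change): a Linux
+ * program registering an edge-triggered read watch reaches the
+ * EPOLL_CTL_ADD case above via something like
+ *
+ *	struct epoll_event ev = { .events = EPOLLIN | EPOLLET };
+ *	ev.data.fd = sock;
+ *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev);
+ *
+ * Here EPOLLIN passes through unchanged while EPOLLET is swizzled to the
+ * native POLLET before being written to the underlying /dev/poll device.
+ */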
+
+long
+lx_epoll_wait(int fd, void *events, int maxevents, int timeout)
+{
+ struct dvpoll arg;
+ file_t *fp;
+ int rv = 0, error, flag;
+
+ if (maxevents <= 0) {
+ return (set_errno(EINVAL));
+ }
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ } else if (!lx_epoll_isvalid(fp)) {
+ releasef(fd);
+ return (set_errno(EINVAL));
+ }
+
+ arg.dp_nfds = maxevents;
+ arg.dp_timeout = EPOLL_TIMEOUT_CLAMP(timeout);
+ arg.dp_fds = (pollfd_t *)events;
+ flag = fp->f_flag | DATAMODEL_NATIVE | FKIOCTL;
+ error = VOP_IOCTL(fp->f_vnode, DP_POLL, (uintptr_t)&arg, flag,
+ fp->f_cred, &rv, NULL);
+
+ releasef(fd);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (rv);
+}
+
+long
+lx_epoll_pwait(int fd, void *events, int maxevents, int timeout, void *sigmask)
+{
+ struct dvpoll arg;
+ file_t *fp;
+ int rv = 0, error, flag;
+ k_sigset_t ksig;
+
+ if (maxevents <= 0) {
+ return (set_errno(EINVAL));
+ }
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ } else if (!lx_epoll_isvalid(fp)) {
+ releasef(fd);
+ return (set_errno(EINVAL));
+ }
+ if (sigmask != NULL) {
+ lx_sigset_t lsig;
+
+ if (copyin(sigmask, &lsig, sizeof (lsig)) != 0) {
+ releasef(fd);
+ return (set_errno(EFAULT));
+ }
+ lx_ltos_sigset(&lsig, &ksig);
+ arg.dp_setp = (sigset_t *)&ksig;
+ } else {
+ arg.dp_setp = NULL;
+ }
+
+ arg.dp_nfds = maxevents;
+ arg.dp_timeout = EPOLL_TIMEOUT_CLAMP(timeout);
+ arg.dp_fds = (pollfd_t *)events;
+ flag = fp->f_flag | DATAMODEL_NATIVE | FKIOCTL;
+ error = VOP_IOCTL(fp->f_vnode, DP_PPOLL, (uintptr_t)&arg, flag,
+ fp->f_cred, &rv, NULL);
+
+ releasef(fd);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (rv);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_eventfd.c b/usr/src/uts/common/brand/lx/syscall/lx_eventfd.c
new file mode 100644
index 0000000000..21205aa18a
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_eventfd.c
@@ -0,0 +1,126 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/vnode.h>
+#include <sys/eventfd.h>
+
+static major_t eventfd_major = 0;
+
+/* io_submit uses this to validate control block eventfd descriptors */
+boolean_t
+lx_is_eventfd(file_t *fp)
+{
+ vnode_t *vp = fp->f_vnode;
+
+ if (vp->v_type == VCHR && getmajor(vp->v_rdev) == eventfd_major)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+long
+lx_eventfd2(uint_t initval, int flags)
+{
+ int err, fd;
+ int fmode = FREAD | FWRITE;
+ vnode_t *vp = NULL;
+ file_t *fp = NULL;
+
+ if (flags & ~(EFD_NONBLOCK | EFD_CLOEXEC | EFD_SEMAPHORE))
+ return (set_errno(EINVAL));
+
+ if (flags & EFD_NONBLOCK)
+ fmode |= FNONBLOCK;
+
+ if (falloc((vnode_t *)NULL, fmode, &fp, &fd) != 0)
+ return (set_errno(EMFILE));
+
+ if (ldi_vp_from_name("/dev/eventfd", &vp) != 0) {
+ /*
+ * If /dev/eventfd is not available then it is less jarring to
+ * Linux programs to tell them that the system call is not
+ * supported than to report an error (ENOENT) they are not
+ * expecting.
+ */
+ err = ENOTSUP;
+ goto error;
+ }
+ if ((err = VOP_OPEN(&vp, fmode | FKLYR, CRED(), NULL)) != 0) {
+ VN_RELE(vp);
+ vp = NULL;
+ goto error;
+ }
+
+ if (flags & EFD_SEMAPHORE) {
+ int rv;
+
+ if ((err = VOP_IOCTL(vp, EVENTFDIOC_SEMAPHORE, 0, fmode, CRED(),
+ &rv, NULL)) != 0)
+ goto error;
+ }
+
+ if (initval != 0) {
+ uint64_t val = initval;
+ struct uio auio;
+ struct iovec aiov;
+
+ /* write initial value */
+ aiov.iov_base = (caddr_t)&val;
+ aiov.iov_len = sizeof (val);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = 0;
+ auio.uio_offset = 0;
+ auio.uio_resid = sizeof (val);
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_fmode = FWRITE;
+
+ if ((err = VOP_WRITE(vp, &auio, FWRITE, CRED(), NULL)) != 0)
+ goto error;
+ }
+
+ eventfd_major = getmajor(vp->v_rdev);
+
+ fp->f_vnode = vp;
+ mutex_exit(&fp->f_tlock);
+ setf(fd, fp);
+ if (flags & EFD_CLOEXEC) {
+ f_setfd(fd, FD_CLOEXEC);
+ }
+ return (fd);
+
+error:
+ if (fp != NULL) {
+ setf(fd, NULL);
+ unfalloc(fp);
+ }
+ if (vp != NULL) {
+ (void) VOP_CLOSE(vp, fmode, 0, 0, CRED(), NULL);
+ VN_RELE(vp);
+ }
+ return (set_errno(err));
+}
+
+long
+lx_eventfd(uint_t val)
+{
+ return (lx_eventfd2(val, 0));
+}
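+
+/*
+ * Usage sketch (assumption): the semantics are intended to match Linux
+ * eventfd(2). For example, in semaphore mode:
+ *
+ *	#include <sys/eventfd.h>
+ *
+ *	int efd = eventfd(3, EFD_SEMAPHORE);
+ *	eventfd_t v;
+ *	(void) eventfd_read(efd, &v);	(v is now 1; the count drops to 2)
+ *
+ * The initial count of 3 is established by the VOP_WRITE of initval above.
+ */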
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fadvise.c b/usr/src/uts/common/brand/lx/syscall/lx_fadvise.c
new file mode 100644
index 0000000000..61f9b936f2
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_fadvise.c
@@ -0,0 +1,103 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/fcntl.h>
+#include <sys/lx_misc.h>
+
+/*
+ * Based on illumos posix_fadvise, which does nothing. The only difference is
+ * that on Linux an fd referring to a pipe or FIFO returns ESPIPE. The Linux
+ * POSIX_FADV_* values are the same as the illumos values. Note how the 32-bit
+ * glibc calls fadvise64: the offset is a 64-bit value, but the length is not.
+ * fadvise64_64 passes both the offset and length as 64-bit values. The 64-bit
+ * fadvise64 caller always passes 64-bit values for the offset and length.
+ */
+
+/*
+ * This is the fadvise64 function used by 64-bit callers, and by 32-bit callers
+ * after they have adjusted their arguments.
+ */
+/* ARGSUSED */
+int
+lx_fadvise64(int fd, off64_t offset, off64_t len, int advice)
+{
+ file_t *fp;
+ boolean_t is_fifo;
+
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ case POSIX_FADV_NOREUSE:
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ if (len < 0)
+ return (set_errno(EINVAL));
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+ is_fifo = (fp->f_vnode->v_type == VFIFO);
+ releasef(fd);
+
+ if (is_fifo)
+ return (set_errno(ESPIPE));
+
+ return (0);
+}
+
+/*
+ * This is the fadvise64 function used by 32-bit callers. Linux passes the
+ * 64-bit offset by concatenating consecutive arguments. We must perform the
+ * same conversion here.
+ */
+long
+lx_fadvise64_32(int fd, uint32_t off_lo, uint32_t off_hi, int32_t len,
+ int advice)
+{
+ off64_t offset;
+
+ offset = off_hi;
+ offset = offset << 32;
+ offset |= off_lo;
+
+ return (lx_fadvise64(fd, offset, (off64_t)len, advice));
+}
+
+/*
+ * This function is only used by 32-bit callers. Linux passes the 64-bit offset
+ * and length by concatenating consecutive arguments. We must perform the same
+ * conversion here.
+ */
+long
+lx_fadvise64_64(int fd, uint32_t off_lo, uint32_t off_hi, uint32_t len_lo,
+ uint32_t len_hi, int advice)
+{
+ off64_t offset;
+ off64_t len;
+
+ offset = off_hi;
+ offset = offset << 32;
+ offset |= off_lo;
+ len = len_hi;
+ len = len << 32;
+ len |= len_lo;
+
+ return (lx_fadvise64(fd, offset, len, advice));
+}
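+
+/*
+ * Worked example (for illustration only): a 32-bit caller passing the
+ * 64-bit offset 0x180000000 (6 GiB) supplies off_lo = 0x80000000 and
+ * off_hi = 0x1; the shifts above reassemble this as
+ * ((off64_t)0x1 << 32) | 0x80000000 == 0x180000000.
+ */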
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c
new file mode 100644
index 0000000000..338e4399fe
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c
@@ -0,0 +1,251 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/zone.h>
+#include <sys/types.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/nbmlock.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+#include <sys/sdt.h>
+
+extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
+
+#define LX_FALLOC_FL_KEEP_SIZE 0x01
+#define LX_FALLOC_FL_PUNCH_HOLE 0x02
+#define LX_FALLOC_FL_NO_HIDE_STALE 0x04
+#define LX_FALLOC_FL_COLLAPSE_RANGE 0x08
+#define LX_FALLOC_FL_ZERO_RANGE 0x10
+
+#define LX_FALLOC_VALID (LX_FALLOC_FL_KEEP_SIZE | LX_FALLOC_FL_PUNCH_HOLE | \
+ LX_FALLOC_FL_NO_HIDE_STALE | LX_FALLOC_FL_COLLAPSE_RANGE | \
+ LX_FALLOC_FL_ZERO_RANGE)
+
+#define LX_FALLOC_UNSUPP (LX_FALLOC_FL_NO_HIDE_STALE | \
+ LX_FALLOC_FL_COLLAPSE_RANGE)
+
+long
+lx_fallocate(int fd, int mode, off_t offset, off_t len)
+{
+ int error = 0;
+ file_t *fp;
+ vnode_t *vp;
+ int64_t tot;
+ struct flock64 bf;
+ vattr_t vattr;
+ u_offset_t f_offset;
+ boolean_t in_crit = B_FALSE;
+
+ /*
+ * Error checking is in a specific order to make LTP happy.
+ */
+
+ tot = offset + len;
+ if (tot > (LLONG_MAX / (int64_t)1024))
+ return (set_errno(EFBIG));
+
+ if (mode & LX_FALLOC_UNSUPP)
+ return (set_errno(EOPNOTSUPP));
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ goto done;
+ }
+
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ error = EINVAL;
+ goto done;
+ }
+
+ if (offset < 0 || len <= 0) {
+ error = EINVAL;
+ goto done;
+ }
+
+ if (tot < 0LL) {
+ error = EFBIG;
+ goto done;
+ }
+
+ if ((mode & ~LX_FALLOC_VALID) != 0) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * If this is the only flag then we don't actually do any work.
+ */
+ if (mode == LX_FALLOC_FL_KEEP_SIZE)
+ goto done;
+
+ bzero(&bf, sizeof (bf));
+
+ vattr.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
+ goto done;
+
+ if (mode == 0) {
+ /* Nothing to do if not extending the file */
+ if (vattr.va_size >= tot)
+ goto done;
+
+ /* Extend the file. */
+ bf.l_start = (off64_t)tot;
+ bf.l_len = (off64_t)0;
+
+ } else if (mode & LX_FALLOC_FL_PUNCH_HOLE) {
+ /*
+ * Deallocate space in the file.
+ */
+ if ((mode & LX_FALLOC_FL_KEEP_SIZE) == 0) {
+ /* this flag is required with punch hole */
+ error = EINVAL;
+ goto done;
+ }
+
+ if (mode &
+ ~(LX_FALLOC_FL_PUNCH_HOLE | LX_FALLOC_FL_KEEP_SIZE)) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /* Make sure we don't extend since keep_size is set. */
+ if (vattr.va_size < tot) {
+ if (offset > vattr.va_size)
+ goto done;
+ len = (off_t)vattr.va_size - offset;
+ }
+
+ bf.l_start = (off64_t)offset;
+ bf.l_len = (off64_t)len;
+
+ } else if (mode & LX_FALLOC_FL_ZERO_RANGE) {
+ /*
+ * Zero out the space in the file.
+ */
+ if (mode &
+ ~(LX_FALLOC_FL_ZERO_RANGE | LX_FALLOC_FL_KEEP_SIZE)) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /* Make sure we don't extend when keep_size is set. */
+ if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) {
+ if (offset > vattr.va_size)
+ goto done;
+ len = vattr.va_size - offset;
+ }
+
+ bf.l_start = (off64_t)offset;
+ bf.l_len = (off64_t)len;
+ } else {
+ /* We should have already handled all flags */
+ VERIFY(0);
+ }
+
+ /*
+ * Check for locks in the range.
+ */
+ f_offset = fp->f_offset;
+ error = flock_check(vp, &bf, f_offset, MAXOFF_T);
+ if (error != 0)
+ goto done;
+
+ /*
+ * Check for conflicting non-blocking mandatory locks.
+ * We need to get the size again under nbl_start_crit.
+ */
+ if (nbl_need_check(vp)) {
+ u_offset_t begin;
+ ssize_t length;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = B_TRUE;
+ vattr.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
+ goto done;
+
+ /*
+ * Make sure we don't extend when keep_size is set.
+ */
+ if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) {
+ ASSERT(mode & (LX_FALLOC_FL_PUNCH_HOLE |
+ LX_FALLOC_FL_ZERO_RANGE));
+
+ /*
+ * If the size grew we can short-circuit the rest of
+ * the work, otherwise adjust bf for the vop_space
+ * call.
+ */
+ if (offset >= vattr.va_size)
+ goto done;
+ len = vattr.va_size - offset;
+ bf.l_len = (off64_t)len;
+ }
+
+ if (offset > vattr.va_size) {
+ begin = vattr.va_size;
+ length = offset - vattr.va_size;
+ } else {
+ begin = offset;
+ length = vattr.va_size - offset;
+ }
+
+ if (nbl_conflict(vp, NBL_WRITE, begin, length, 0, NULL)) {
+ error = EACCES;
+ goto done;
+ }
+ }
+
+ error = VOP_SPACE(vp, F_FREESP, &bf, 0, f_offset, fp->f_cred, NULL);
+
+done:
+ if (in_crit)
+ nbl_end_crit(vp);
+
+ releasef(fd);
+ if (error != 0)
+ return (set_errno(error));
+
+ return (0);
+}
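+
+/*
+ * Caller sketch (assumption, not from this change): the PUNCH_HOLE path
+ * above corresponds to the Linux idiom
+ *
+ *	#define _GNU_SOURCE
+ *	#include <fcntl.h>
+ *
+ *	(void) fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ *	    offset, len);
+ *
+ * KEEP_SIZE is mandatory alongside PUNCH_HOLE on Linux, which is why the
+ * PUNCH_HOLE branch returns EINVAL when it is absent.
+ */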
+
+long
+lx_fallocate32(int fd, int mode, uint32_t offl, uint32_t offh, uint32_t lenl,
+ uint32_t lenh)
+{
+ int64_t offset = 0, len = 0;
+
+ /*
+ * From 32-bit callers, Linux passes the 64-bit offset and len by
+ * concatenating consecutive arguments. We must perform the same
+ * conversion here.
+ */
+ offset = offh;
+ offset = offset << 32;
+ offset |= offl;
+ len = lenh;
+ len = len << 32;
+ len |= lenl;
+
+ return (lx_fallocate(fd, mode, (off_t)offset, (off_t)len));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c
new file mode 100644
index 0000000000..a5406c0a4f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c
@@ -0,0 +1,701 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/zone.h>
+#include <sys/types.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/cmn_err.h>
+#include <sys/pathname.h>
+#include <sys/policy.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_socket.h>
+#include <sys/brand.h>
+#include <sys/fs/fifonode.h>
+#include <sys/strsubr.h>
+#include <sys/stream.h>
+#include <sys/flock.h>
+
+extern int fcntl(int, int, intptr_t);
+extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
+extern int lx_pipe_setsz(stdata_t *, uint_t, boolean_t);
+
+int
+lx_vp_at(int fd, char *upath, vnode_t **vpp, int flag)
+{
+ vnode_t *startvp;
+ int error;
+
+ if (fd == LX_AT_FDCWD) {
+ fd = AT_FDCWD;
+ }
+
+ if ((error = fgetstartvp(fd, upath, &startvp)) != 0) {
+ return (error);
+ }
+
+ if (upath != NULL) {
+ uio_seg_t seg = UIO_USERSPACE;
+
+ error = lookupnameat(upath, seg,
+ (flag == AT_SYMLINK_NOFOLLOW) ? NO_FOLLOW : FOLLOW,
+ NULLVPP, vpp, startvp);
+ if (startvp != NULL) {
+ VN_RELE(startvp);
+ }
+ return (error);
+ } else {
+ /* VN_HOLD was established in fgetstartvp */
+ *vpp = startvp;
+ VERIFY(*vpp);
+ return (0);
+ }
+}
+
+#define LTOS_FLOCK(l, s) \
+{ \
+ s->l_type = ltos_type(l->l_type); \
+ s->l_whence = l->l_whence; \
+ s->l_start = l->l_start; \
+ s->l_len = l->l_len; \
+ s->l_sysid = 0; /* not defined in linux */ \
+ s->l_pid = (pid_t)l->l_pid; \
+}
+
+#define STOL_FLOCK(s, l) \
+{ \
+ l->l_type = stol_type(s->l_type); \
+ l->l_whence = s->l_whence; \
+ l->l_start = s->l_start; \
+ l->l_len = s->l_len; \
+ l->l_pid = (int)s->l_pid; \
+}
+
+static short
+ltos_type(short l_type)
+{
+ switch (l_type) {
+ case LX_F_RDLCK:
+ return (F_RDLCK);
+ case LX_F_WRLCK:
+ return (F_WRLCK);
+ case LX_F_UNLCK:
+ return (F_UNLCK);
+ default:
+ return (-1);
+ }
+}
+
+static short
+stol_type(short l_type)
+{
+ switch (l_type) {
+ case F_RDLCK:
+ return (LX_F_RDLCK);
+ case F_WRLCK:
+ return (LX_F_WRLCK);
+ case F_UNLCK:
+ return (LX_F_UNLCK);
+ default:
+ /* can't ever happen */
+ return (0);
+ }
+}
+
+static void
+ltos_flock(struct lx_flock *l, struct flock64 *s)
+{
+ LTOS_FLOCK(l, s)
+}
+
+static void
+stol_flock(struct flock64 *s, struct lx_flock *l)
+{
+ STOL_FLOCK(s, l)
+}
+
+static void
+ltos_flock64(struct lx_flock64_32 *l, struct flock64 *s)
+{
+ LTOS_FLOCK(l, s)
+}
+
+static void
+stol_flock64(struct flock64 *s, struct lx_flock64_32 *l)
+{
+ STOL_FLOCK(s, l)
+}
+
+static int
+lx_fcntl_getfl(int fd)
+{
+ int retval;
+ int rc;
+
+ retval = fcntl(fd, F_GETFL, 0);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ return (ttolwp(curthread)->lwp_errno);
+
+ if ((retval & O_ACCMODE) == O_RDONLY)
+ rc = LX_O_RDONLY;
+ else if ((retval & O_ACCMODE) == O_WRONLY)
+ rc = LX_O_WRONLY;
+ else
+ rc = LX_O_RDWR;
+ /* O_NDELAY != O_NONBLOCK, so we need to check for both */
+ if (retval & O_NDELAY)
+ rc |= LX_O_NDELAY;
+ if (retval & O_NONBLOCK)
+ rc |= LX_O_NONBLOCK;
+ if (retval & O_APPEND)
+ rc |= LX_O_APPEND;
+ if (retval & O_SYNC)
+ rc |= LX_O_SYNC;
+ if (retval & O_LARGEFILE)
+ rc |= LX_O_LARGEFILE;
+ if (retval & FASYNC)
+ rc |= LX_O_ASYNC;
+
+ return (rc);
+}
+
+#define LX_SETFL_MASK (O_NONBLOCK | O_APPEND | O_SYNC | FASYNC)
+
+static int
+lx_fcntl_setfl(int fd, ulong_t arg)
+{
+ int flags;
+
+ /*
+ * When performing fcntl(F_SETFL), only certain flags are
+ * allowed to be manipulated. A mask is used to preserve
+ * other flags, such as those which are specified during
+ * open(2). The mask on Linux excludes O_LARGEFILE from
+ * being manipulated, whereas illumos expects the flag to
+ * be set. In order to properly preserve the O_LARGEFILE
+ * (FOFFMAX) state, we must first query for it via
+ * fcntl(F_GETFL) so that the value can be carried
+ * through.
+ */
+ flags = fcntl(fd, F_GETFL, 0);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ return (ttolwp(curthread)->lwp_errno);
+
+ flags &= ~LX_SETFL_MASK;
+
+ /* LX_O_NDELAY == LX_O_NONBLOCK, so we only check for one */
+ if (arg & LX_O_NDELAY)
+ flags |= O_NONBLOCK;
+ if (arg & LX_O_APPEND)
+ flags |= O_APPEND;
+ if (arg & LX_O_SYNC)
+ flags |= O_SYNC;
+ if (arg & LX_O_ASYNC)
+ flags |= FASYNC;
+
+ return (fcntl(fd, F_SETFL, flags));
+}
+
+static int
+lx_fcntl_pipesz(int fd, int cmd, ulong_t arg)
+{
+ file_t *fp;
+ vnode_t *vp;
+ stdata_t *str;
+ int err = 0, res = 0;
+
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VFIFO || vp->v_op != fifo_vnodeops) {
+ err = EBADF;
+ goto out;
+ }
+ VERIFY((str = vp->v_stream) != NULL);
+
+ if (cmd == LX_F_SETPIPE_SZ) {
+ err = lx_pipe_setsz(str, (uint_t)arg, B_FALSE);
+ } else if (cmd == LX_F_GETPIPE_SZ) {
+ size_t val;
+
+ err = strqget(RD(str->sd_wrq), QHIWAT, 0, &val);
+ res = val;
+ } else {
+ /* NOTREACHED */
+ ASSERT(0);
+ }
+
+out:
+ releasef(fd);
+ if (err != 0) {
+ return (set_errno(err));
+ }
+ return (res);
+}
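+
+/*
+ * Caller sketch (assumption, not from this change): Linux programs reach
+ * this via fcntl(2), for example
+ *
+ *	#define _GNU_SOURCE
+ *	#include <fcntl.h>
+ *
+ *	(void) fcntl(pfd[1], F_SETPIPE_SZ, 1048576);
+ *	int sz = fcntl(pfd[1], F_GETPIPE_SZ);
+ *
+ * which we translate to and from the STREAMS high-water mark (QHIWAT) on
+ * the FIFO.
+ */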
+
+static int
+lx_fcntl_common(int fd, int cmd, ulong_t arg)
+{
+ int rc = 0;
+ pid_t pid;
+ int error;
+ int rv;
+ int32_t flag;
+ file_t *fp;
+
+ /*
+ * We depend on the call to fcntl to set the errno if necessary.
+ */
+ ttolwp(curthread)->lwp_errno = 0;
+
+ switch (cmd) {
+ case LX_F_SETSIG:
+ case LX_F_GETSIG:
+ case LX_F_SETLEASE:
+ case LX_F_GETLEASE:
+ case LX_F_NOTIFY:
+ case LX_F_CANCELLK:
+ {
+ char buf[80];
+
+ (void) snprintf(buf, sizeof (buf),
+ "unsupported fcntl command: %d", cmd);
+ lx_unsupported(buf);
+ }
+ return (set_errno(ENOTSUP));
+
+ case LX_F_DUPFD:
+ rc = fcntl(fd, F_DUPFD, arg);
+ break;
+
+ case LX_F_DUPFD_CLOEXEC:
+ rc = fcntl(fd, F_DUPFD_CLOEXEC, arg);
+ break;
+
+ case LX_F_GETFD:
+ rc = fcntl(fd, F_GETFD, 0);
+ break;
+
+ case LX_F_SETFD:
+ rc = fcntl(fd, F_SETFD, arg);
+ break;
+
+ case LX_F_GETFL:
+ rc = lx_fcntl_getfl(fd);
+ break;
+
+ case LX_F_SETFL:
+ rc = lx_fcntl_setfl(fd, arg);
+ break;
+
+ case LX_F_SETOWN:
+ pid = (pid_t)arg;
+ if (pid == 1) {
+ /* Setown for the init process uses the real pid. */
+ pid = curzone->zone_proc_initpid;
+ }
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+
+ rv = 0;
+
+ flag = fp->f_flag | get_udatamodel() | FKIOCTL;
+ error = VOP_IOCTL(fp->f_vnode, FIOSETOWN, (intptr_t)&pid,
+ flag, CRED(), &rv, NULL);
+ releasef(fd);
+ if (error != 0) {
+ /*
+ * On illumos F_SETOWN is only defined for sockets, but
+ * some apps hardcode to do this fcntl on other devices
+ * (e.g. /dev/tty) to setup signal handling. If the
+ * app is only setting itself to be the signal
+ * handler, we pretend to succeed.
+ */
+ if (error != EINVAL ||
+ curthread->t_procp->p_pid != pid) {
+ return (set_errno(error));
+ }
+ }
+
+ rc = 0;
+ break;
+
+ case LX_F_GETOWN:
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+
+ rv = 0;
+
+ flag = fp->f_flag | get_udatamodel() | FKIOCTL;
+ error = VOP_IOCTL(fp->f_vnode, FIOGETOWN, (intptr_t)&pid,
+ flag, CRED(), &rv, NULL);
+ releasef(fd);
+ if (error != 0)
+ return (set_errno(error));
+
+ if (pid == curzone->zone_proc_initpid) {
+ /* Getown for the init process returns 1. */
+ pid = 1;
+ }
+
+ rc = pid;
+ break;
+
+ case LX_F_SETPIPE_SZ:
+ case LX_F_GETPIPE_SZ:
+ rc = lx_fcntl_pipesz(fd, cmd, arg);
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ return (rc);
+}
+
+static int
+lx_fcntl_lock_cmd_to_s(int lx_cmd)
+{
+ switch (lx_cmd) {
+ case LX_F_GETLK:
+ return (F_GETLK);
+ case LX_F_SETLK:
+ return (F_SETLK);
+ case LX_F_SETLKW:
+ return (F_SETLKW);
+ case LX_F_GETLK64:
+ return (F_GETLK64);
+ case LX_F_SETLK64:
+ return (F_SETLK64);
+ case LX_F_SETLKW64:
+ return (F_SETLKW64);
+ default:
+ VERIFY(0);
+ /*NOTREACHED*/
+ return (0);
+ }
+}
+
+/*
+ * This is a pain, but we can't reuse the fcntl code for locking since it does
+ * its own copyin/copyout for the flock struct. Because we have to convert the
+ * struct, we must do our own copyin/copyout, so we replicate the fcntl code
+ * for these lock commands. Luckily it's not much.
+ */
+static int
+lx_fcntl_lock(int fd, int lx_cmd, void *arg)
+{
+ int cmd;
+ int error = 0;
+ file_t *fp;
+ vnode_t *vp;
+ int flag;
+ offset_t maxoffset;
+ u_offset_t offset;
+ model_t datamodel;
+ lx_flock_t lxflk;
+ lx_flock64_32_t lxflk64;
+ struct flock64 bf;
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+
+ maxoffset = MAXOFF_T;
+ datamodel = DATAMODEL_NATIVE;
+#if defined(_SYSCALL32_IMPL)
+ if ((datamodel = get_udatamodel()) == DATAMODEL_ILP32)
+ maxoffset = MAXOFF32_T;
+#endif
+ vp = fp->f_vnode;
+ flag = fp->f_flag;
+ offset = fp->f_offset;
+
+ cmd = lx_fcntl_lock_cmd_to_s(lx_cmd);
+
+ switch (cmd) {
+ case F_GETLK:
+ case F_SETLK:
+ case F_SETLKW:
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyin(arg, &lxflk, sizeof (lx_flock_t)) != 0) {
+ error = EFAULT;
+ break;
+ }
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ lx_flock32_t lxflk32;
+
+ if (copyin(arg, &lxflk32, sizeof (lxflk32)) != 0) {
+ error = EFAULT;
+ break;
+ }
+
+ lxflk.l_type = lxflk32.l_type;
+ lxflk.l_whence = lxflk32.l_whence;
+ lxflk.l_start = (off64_t)lxflk32.l_start;
+ lxflk.l_len = (off64_t)lxflk32.l_len;
+ lxflk.l_pid = lxflk32.l_pid;
+ }
+#endif /* _SYSCALL32_IMPL */
+
+ ltos_flock(&lxflk, &bf);
+
+ if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0)
+ break;
+
+ if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL,
+ fp->f_cred, NULL)) != 0) {
+ if (cmd == F_SETLKW && error == EINTR) {
+ ttolxlwp(curthread)->br_syscall_restart =
+ B_TRUE;
+ }
+ break;
+ }
+
+ if (cmd != F_GETLK)
+ break;
+
+ /*
+ * The command is GETLK, return result.
+ */
+ stol_flock(&bf, &lxflk);
+
+ /*
+ * If no lock is found, only the type field is changed.
+ */
+ if (lxflk.l_type == LX_F_UNLCK) {
+ /* l_type always first entry, always a short */
+ if (copyout(&lxflk.l_type, &((lx_flock_t *)arg)->l_type,
+ sizeof (lxflk.l_type)))
+ error = EFAULT;
+ break;
+ }
+
+ if (bf.l_start > maxoffset || bf.l_len > maxoffset) {
+ error = EOVERFLOW;
+ break;
+ }
+
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyout(&lxflk, arg, sizeof (lxflk)) != 0) {
+ error = EFAULT;
+ break;
+ }
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ lx_flock32_t lxflk32;
+
+ if (bf.l_start > MAXOFF32_T || bf.l_len > MAXOFF32_T) {
+ error = EOVERFLOW;
+ break;
+ }
+
+ lxflk32.l_type = lxflk.l_type;
+ lxflk32.l_whence = lxflk.l_whence;
+ lxflk32.l_start = lxflk.l_start;
+ lxflk32.l_len = lxflk.l_len;
+ lxflk32.l_pid = lxflk.l_pid;
+
+ if (copyout(&lxflk32, arg, sizeof (lxflk32)) != 0) {
+ error = EFAULT;
+ break;
+ }
+ }
+#endif /* _SYSCALL32_IMPL */
+ break;
+
+ case F_GETLK64:
+ case F_SETLK64:
+ case F_SETLKW64:
+ /*
+ * Large File support is only used for ILP32 apps.
+ */
+ if (datamodel != DATAMODEL_ILP32) {
+ error = EINVAL;
+ break;
+ }
+
+ if (cmd == F_GETLK64)
+ cmd = F_GETLK;
+ else if (cmd == F_SETLK64)
+ cmd = F_SETLK;
+ else if (cmd == F_SETLKW64)
+ cmd = F_SETLKW;
+
+ if (copyin(arg, &lxflk64, sizeof (lxflk64)) != 0) {
+ error = EFAULT;
+ break;
+ }
+
+ ltos_flock64(&lxflk64, &bf);
+
+ if ((error = flock_check(vp, &bf, offset, MAXOFFSET_T)) != 0)
+ break;
+
+ if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL,
+ fp->f_cred, NULL)) != 0)
+ break;
+
+ if (cmd != F_GETLK)
+ break;
+
+ /*
+ * The command is GETLK, return result.
+ */
+ stol_flock64(&bf, &lxflk64);
+
+ /*
+ * If no lock is found, only the type field is changed.
+ */
+ if (lxflk64.l_type == LX_F_UNLCK) {
+ /* l_type always first entry, always a short */
+ if (copyout(&lxflk64.l_type,
+ &((lx_flock64_t *)arg)->l_type,
+ sizeof (lxflk64.l_type)))
+ error = EFAULT;
+ break;
+ }
+
+ if (bf.l_start > maxoffset || bf.l_len > maxoffset) {
+ error = EOVERFLOW;
+ break;
+ }
+
+ if (copyout(&lxflk64, arg, sizeof (lxflk64)) != 0) {
+ error = EFAULT;
+ break;
+ }
+ break;
+ }
+
+ releasef(fd);
+ if (error)
+ return (set_errno(error));
+
+ return (0);
+}
+
+long
+lx_fcntl(int fd, int cmd, intptr_t arg)
+{
+ switch (cmd) {
+ case LX_F_GETLK64:
+ case LX_F_SETLK64:
+ case LX_F_SETLKW64:
+ /* The 64-bit fcntl commands must go through fcntl64(). */
+ return (set_errno(EINVAL));
+
+ case LX_F_GETLK:
+ case LX_F_SETLK:
+ case LX_F_SETLKW:
+ return (lx_fcntl_lock(fd, cmd, (void *)arg));
+
+ default:
+ return (lx_fcntl_common(fd, cmd, arg));
+ }
+}
+
+long
+lx_fcntl64(int fd, int cmd, intptr_t arg)
+{
+ switch (cmd) {
+ case LX_F_GETLK:
+ case LX_F_SETLK:
+ case LX_F_SETLKW:
+ case LX_F_GETLK64:
+ case LX_F_SETLKW64:
+ case LX_F_SETLK64:
+ return (lx_fcntl_lock(fd, cmd, (void *)arg));
+
+ default:
+ return (lx_fcntl_common(fd, cmd, (ulong_t)arg));
+ }
+}
+
+/*
+ * Apply or remove an advisory lock on the entire file. F_FLOCK and F_FLOCKW
+ * are OFD-style locks. For more information, see the comment on ofdlock().
+ */
+long
+lx_flock(int fd, int op)
+{
+ int cmd;
+ int error;
+ flock64_t bf;
+ file_t *fp;
+
+ if (op & LX_LOCK_NB) {
+ cmd = F_FLOCK;
+ op &= ~LX_LOCK_NB;
+ } else {
+ cmd = F_FLOCKW;
+ }
+
+ switch (op) {
+ case LX_LOCK_UN:
+ bf.l_type = F_UNLCK;
+ break;
+ case LX_LOCK_SH:
+ bf.l_type = F_RDLCK;
+ break;
+ case LX_LOCK_EX:
+ bf.l_type = F_WRLCK;
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ bf.l_whence = 0;
+ bf.l_start = 0;
+ bf.l_len = 0;
+ bf.l_sysid = 0;
+ bf.l_pid = 0;
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+
+ /*
+ * See the locking comment in fcntl.c. In summary, the *_frlock
+ * functions in the various file systems basically do some validation,
+ * then funnel everything through the fs_frlock function. For OFD-style
+ * locks, fs_frlock will do nothing. Once control returns here, we call
+ * the ofdlock function to do the actual locking.
+ */
+ error = VOP_FRLOCK(fp->f_vnode, cmd, &bf, fp->f_flag, fp->f_offset,
+ NULL, fp->f_cred, NULL);
+ if (error != 0) {
+ releasef(fd);
+ return (set_errno(error));
+ }
+ error = ofdlock(fp, cmd, &bf, fp->f_flag, fp->f_offset);
+ if (error != 0) {
+ if (cmd == F_FLOCKW && error == EINTR)
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ (void) set_errno(error);
+ }
+ releasef(fd);
+ return (error);
+}
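+
+/*
+ * Caller sketch (assumption, not from this change): the op decoding above
+ * supports the usual Linux pattern
+ *
+ *	#include <sys/file.h>
+ *	#include <errno.h>
+ *
+ *	if (flock(fd, LOCK_EX | LOCK_NB) == -1 && errno == EWOULDBLOCK)
+ *		...;	(lock is contended)
+ *
+ * where LOCK_NB selects the non-blocking F_FLOCK command and LOCK_EX maps
+ * to an F_WRLCK covering the entire file (l_whence/l_start/l_len of 0).
+ */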
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
new file mode 100644
index 0000000000..2bf65748c0
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
@@ -0,0 +1,1665 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/debug.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <vm/page.h>
+#include <sys/priv.h>
+#include <sys/mman.h>
+#include <sys/timer.h>
+#include <sys/condvar.h>
+#include <sys/inttypes.h>
+#include <sys/cmn_err.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_futex.h>
+#include <sys/lx_impl.h>
+#include <sys/sdt.h>
+
+/*
+ * Futexes are a Linux-specific implementation of inter-process mutexes.
+ * They are designed to use shared memory for simple, uncontested
+ * operations, and rely on the kernel to resolve any contention issues.
+ *
+ * Most of the information in this section comes from the paper "Futexes
+ * Are Tricky", by Ulrich Drepper. This paper is currently available at:
+ * http://people.redhat.com/~drepper/futex.pdf.
+ *
+ * A futex itself is a 4-byte integer, which must be 4-byte aligned. The
+ * value of this integer is expected to be modified using user-level atomic
+ * operations. For the original, simple futexes, the futex(4) design itself did
+ * not impose any semantic constraints on the value stored in the futex; it is
+ * up to the application to define its own protocol. For the newer,
+ * priority-inheritance (PI) futexes, the value is 0 or the TID of the holder,
+ * as defined in futex(2).
+ *
+ * When the application decides that kernel intervention is required, it
+ * will use the futex(2) system call. Originally there were 5 different
+ * operations that could be performed on a futex, using this system call, but
+ * that has subsequently been extended. Since this interface has evolved over
+ * time, there are several different prototypes available to the user.
+ * Fortunately, there is only a single kernel-level interface:
+ *
+ * long sys_futex(void *futex1, int cmd, int val1,
+ * struct timespec *timeout, void *futex2, int val2)
+ *
+ * The kernel-level operations that may be performed on a simple futex are:
+ *
+ * FUTEX_WAIT
+ *
+ * Atomically verify that futex1 contains the value val1. If it
+ * doesn't, return EWOULDBLOCK. If it does contain the expected
+ * value, the thread will sleep until somebody performs a FUTEX_WAKE
+ * on the futex. The caller may also specify a timeout, indicating
+ * the maximum time the thread should sleep. If the timer expires,
+ * the call returns ETIMEDOUT. If the thread is awoken with a signal,
+ * the call returns EINTR. Otherwise, the call returns 0.
+ *
+ * FUTEX_WAKE
+ *
+ * Wake up val1 processes that are waiting on futex1. The call
+ * returns the number of blocked threads that were woken up.
+ *
+ * FUTEX_WAIT_BITSET/FUTEX_WAKE_BITSET
+ *
+ * Similar to FUTEX_WAIT/FUTEX_WAKE, but each takes an additional argument
+ * denoting a bit vector, with wakers only waking waiters that match in
+ * one or more bits. These semantics are dubious enough, but the
+ * interface has an inconsistency that is glaring even by the
+ * embarrassingly low standards that Linux sets for itself: the timeout
+ * argument to FUTEX_WAIT_BITSET is absolute, not relative as it is for
+ * FUTEX_WAIT. And as if that weren't enough unnecessary complexity,
+ * the caller may specify this absolute timeout to be against either
+ * CLOCK_MONOTONIC or CLOCK_REALTIME -- but only for FUTEX_WAIT_BITSET,
+ * of course!
+ *
+ * FUTEX_WAKE_OP
+ *
+ * The implementation of a conditional variable in terms of futexes
+ * actually uses two futexes: one to assure sequential access and one to
+ * represent the condition variable. This implementation gives rise to a
+ * particular performance problem whereby a thread is awoken on the futex
+ * that represents the condition variable only to have to (potentially)
+ * immediately wait on the futex that protects the condition variable.
+ * (Do not confuse the futex that serves to protect the condition variable
+ * with the pthread_mutex_t associated with pthread_cond_t -- which
+ * represents a third futex.) To (over)solve this problem, FUTEX_WAKE_OP
+ * was invented, which performs an atomic compare-and-exchange on a
+ * second address in a specified fashion (that is, with a specified
+ * operation). Here are the possible operations (OPARG is defined
+ * to be a 12-bit value embedded in the operation):
+ *
+ * - FUTEX_OP_SET: Sets the value at the second address to OPARG
+ * - FUTEX_OP_ADD: Adds OPARG to the value
+ * - FUTEX_OP_OR: OR's the value with OPARG
+ * - FUTEX_OP_ANDN: Performs a negated AND of the value with OPARG
+ * - FUTEX_OP_XOR: XOR's the value with OPARG
+ *
+ * After this compare-and-exchange on the second address, a FUTEX_WAKE is
+ * performed on the first address and -- if the compare-and-exchange
+ * matches a specified result based on a specified comparison operation --
+ * a FUTEX_WAKE is performed on the second address. Here are the possible
+ * comparison operations:
+ *
+ * - FUTEX_OP_CMP_EQ: If old value is CMPARG, wake
+ * - FUTEX_OP_CMP_NE: If old value is not equal to CMPARG, wake
+ * - FUTEX_OP_CMP_LT: If old value is less than CMPARG, wake
+ * - FUTEX_OP_CMP_LE: If old value is less than or equal to CMPARG, wake
+ * - FUTEX_OP_CMP_GT: If old value is greater than CMPARG, wake
+ * - FUTEX_OP_CMP_GE: If old value is greater than or equal to CMPARG, wake
+ *
+ * As a practical matter, the only way that this is used (or, some might
+ * argue, is usable) is by the implementation of pthread_cond_signal(),
+ * which uses FUTEX_WAKE_OP to -- in a single system call -- unlock the
+ * futex that protects the condition variable and wake the futex that
+ * represents the condition variable. The second wake-up is conditional
+ * because the futex that protects the condition variable (rather than the
+ * one that represents it) may or may not have waiters. Given that this
+ * is the use case, FUTEX_WAKE_OP is falsely generic: despite allowing for
+ * five different kinds of operations and six different kinds of
+ * comparison operations, in practice only one is used. (Namely, setting
+ * to 0 and waking if the old value is greater than 1 -- which denotes
+ * that waiters are present and the wakeup should be performed.) Moreover,
+ * because FUTEX_WAKE_OP does not (and cannot) optimize anything in the
+ * case that the pthread_mutex_t associated with the pthread_cond_t is
+ * held at the time of a pthread_cond_signal(), this entire mechanism is
+ * essentially for naught in this case. As one can imagine (and can
+ * verify on just about any source base that uses pthread_cond_signal()),
+ * it is overwhelmingly the common case that the lock associated with the
+ * pthread_cond_t is held at the time of pthread_cond_signal(), assuring
+ * that the problem that all of this complexity was designed to solve
+ * isn't, in fact, solved because the signalled thread simply wakes up
+ * only to block again on the held mutex. Cue a slow clap!
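+ *
+ * To make the encoding concrete, here is a sketch of the lone case
+ * observed in practice: a pthread_cond_signal() built on FUTEX_WAKE_OP
+ * would issue something like the following (the futex() shim and the
+ * cv_ field names are illustrative; FUTEX_OP is the Linux encoding
+ * macro):
+ *
+ *     int val3 = FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 1);
+ *     (void) futex(&cv->cv_futex, FUTEX_WAKE_OP, 1, 1,
+ *         &cv->cv_lock, val3);
+ *
+ * This wakes one waiter on cv_futex, atomically sets cv_lock to 0, and
+ * wakes one waiter on cv_lock only if its old value was greater than 1.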
+ *
+ * FUTEX_CMP_REQUEUE
+ *
+ * If the value stored in futex1 matches the expected value passed
+ * in val3, wake up to val1 threads that are waiting on futex1.
+ * Otherwise, return EAGAIN.
+ *
+ * If there are more than val1 threads waiting on the futex, remove
+ * the remaining threads from this futex, and requeue them on futex2.
+ * The caller can limit the number of threads being requeued by
+ * encoding an integer value in the position usually used for the
+ * timeout pointer.
+ *
+ * The call returns the number of blocked threads that were woken up
+ * or requeued.
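+ *
+ * As a sketch of the intended use, a pthread_cond_broadcast() built
+ * on this would wake a single waiter and requeue the rest onto the
+ * futex of the associated mutex, avoiding a thundering herd (the
+ * futex() shim and field names are illustrative):
+ *
+ *     (void) futex(&cv->cv_futex, FUTEX_CMP_REQUEUE, 1,
+ *         (void *)INT_MAX, &m->m_futex, cv_value);
+ *
+ * Here INT_MAX rides in the timeout-pointer slot as the requeue limit,
+ * and cv_value is the expected value of cv_futex.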
+ *
+ * FUTEX_REQUEUE
+ *
+ * Identical to FUTEX_CMP_REQUEUE except that it performs no val3
+ * comparison. This command has been declared broken and obsolete,
+ * but we still need to support it.
+ *
+ * FUTEX_FD
+ *
+ * Return a file descriptor, which can be used to refer to the futex.
+ * This operation was broken by design, and was blessedly removed in
+ * Linux 2.6.26 ("because it was inherently racy"); it should go without
+ * saying that we don't support this operation.
+ *
+ * The kernel-level operations that may be performed on a PI futex are:
+ *
+ * FUTEX_LOCK_PI
+ *
+ * Called after a user-land attempt to acquire the lock using an atomic
+ * instruction failed because the futex had a nonzero value (the current
+ * holder's TID). Once enqueued, the thread sleeps until FUTEX_UNLOCK_PI
+ * is called on the futex, or the timeout expires. The timeout argument to
+ * FUTEX_LOCK_PI is absolute, unlike FUTEX_WAIT's, and its clock cannot
+ * be selected as it can with FUTEX_WAIT_BITSET!
+ *
+ * FUTEX_TRYLOCK_PI
+ *
+ * Similar to FUTEX_LOCK_PI but can be used for error recovery as
+ * described in futex(2).
+ *
+ * FUTEX_UNLOCK_PI
+ *
+ * Called when user-land cannot atomically release the lock because
+ * there are waiting threads. This will wake the highest priority waiting
+ * thread.
+ *
+ * FUTEX_CMP_REQUEUE_PI
+ *
+ * Not implemented at this time.
+ *
+ * FUTEX_WAIT_REQUEUE_PI
+ *
+ * Not implemented at this time.
+ *
+ * Priority Inheritance
+ *
+ * Our general approach to priority inheritance recognizes the fact that the
+ * application is almost certainly not a real-time process running on dedicated
+ * hardware. The zone is most likely running in a multi-tenant environment under
+ * FSS, in spite of whatever scheduling class the Linux application thinks it is
+ * using. Thus, we make our best effort to handle priority inheritance. When a
+ * thread must block on a PI futex, it may increase the scheduling priority of
+ * the futex holder to match the blocking thread. The futex holder's original
+ * priority will be restored when it unlocks the futex.
+ *
+ * This approach does not always handle transitive priority inheritance. For
+ * example, three threads at Low, Medium and High priority:
+ * L holds futex X
+ * M holds futex Y and became enqueued on X (M bumped L's priority to M)
+ * H enqueues on Y and bumps priority of M to H, but never bumps L's priority
+ * (which is currently M) up to H
+ * In practice this scenario is uncommon and likely still performs
+ * reasonably well under a multi-tenant, FSS configuration. Also note that if
+ * H enqueues on Y before M enqueues on X, then L will have its priority
+ * raised to H when M enqueues on X.
+ *
+ * PI Futex Cleanup
+ *
+ * Futex cleanup can occur when a thread exits unexpectedly while holding one
+ * or more futexes. Normally this is done via a "robust" futex, and cleanup of
+ * a robust PI futex works in the same way as a non-PI robust futex (see
+ * lx_futex_robust_exit). On Linux, in the case of a non-robust PI futex,
+ * cleanup can still occur because the futex is associated with a real-time
+ * mutex inside the kernel (see the futex(2) man page for more details). For lx
+ * we are not using anything similar. When a thread exits, lx_futex_robust_exit
+ * will be called, but we would have to iterate every hash bucket, and every
+ * futex in the chain, to look for futexes held by the exiting thread. This
+ * would be very expensive and would occur whether or not the thread held any
+ * futexes. Thus, at this time we don't set the FUTEX_OWNER_DIED bit on
+ * non-robust PI futexes when their holder exits.
+ * In practice this does not seem to be a serious limitation since user-level
+ * code generally appears to use robust futexes, but this may need to be
+ * revisited if it is observed to be an issue.
+ */
+
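+/*
+ * For orientation, below is a minimal user-land sketch (excluded from
+ * compilation; it is not part of this module) of the basic WAIT/WAKE
+ * handshake that the code in this file services. The futex() shim shown is
+ * the usual syscall(2) wrapper.
+ */
+#if 0
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <stdatomic.h>
+#include <stdint.h>
+#include <time.h>
+#include <unistd.h>
+
+static long
+futex(uint32_t *uaddr, int op, uint32_t val, const struct timespec *ts,
+ uint32_t *uaddr2, uint32_t val3)
+{
+ return (syscall(SYS_futex, uaddr, op, val, ts, uaddr2, val3));
+}
+
+static _Atomic uint32_t fword = 0;
+
+static void
+waiter(void)
+{
+ /* Sleep only while fword still holds the value we expect. */
+ while (atomic_load(&fword) == 0)
+ (void) futex((uint32_t *)&fword, FUTEX_WAIT, 0, NULL, NULL, 0);
+}
+
+static void
+waker(void)
+{
+ atomic_store(&fword, 1);
+ (void) futex((uint32_t *)&fword, FUTEX_WAKE, 1, NULL, NULL, 0);
+}
+#endif
+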
+/*
+ * The structure of the robust_list, as set with the set_robust_list() system
+ * call. See lx_futex_robust_exit(), below, for details.
+ */
+typedef struct futex_robust_list {
+ uintptr_t frl_head; /* list of robust locks held */
+ uint64_t frl_offset; /* offset of lock word within a lock */
+ uintptr_t frl_pending; /* pending operation */
+} futex_robust_list_t;
+
+#if defined(_SYSCALL32_IMPL)
+
+#pragma pack(4)
+typedef struct futex_robust_list32 {
+ uint32_t frl_head; /* list of robust locks held */
+ uint32_t frl_offset; /* offset of lock word within a lock */
+ uint32_t frl_pending; /* pending operation */
+} futex_robust_list32_t;
+#pragma pack()
+
+#endif
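+
+/*
+ * For reference, a user-land thread registers its robust list roughly as
+ * follows (a sketch, excluded from compilation; struct robust_list_head is
+ * the Linux declaration of which the structures above are our kernel-side
+ * view):
+ */
+#if 0
+#include <linux/futex.h>
+#include <stddef.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+static struct robust_list_head head = {
+ .list = { &head.list }, /* empty circular list */
+ .futex_offset = 0, /* lock word at offset 0 of each entry */
+ .list_op_pending = NULL,
+};
+
+static void
+register_robust_list(void)
+{
+ (void) syscall(SYS_set_robust_list, &head, sizeof (head));
+}
+#endif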
+
+#define MEMID_COPY(s, d) \
+ { (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; }
+#define MEMID_EQUAL(s, d) \
+ ((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1])
+
+/*
+ * Because collisions on this hash table can be a source of negative
+ * scalability, we make it pretty large: 4,096 entries -- 64K. If this
+ * size is found to be insufficient, the size should be made dynamic.
+ * (Making it dynamic will be delicate because the per-chain locking will
+ * necessitate memory retiring or similar; see the 2008 ACM Queue article
+ * "Real-world concurrency" for details on this technique.)
+ */
+#define HASH_SHIFT_SZ 12
+#define HASH_SIZE (1 << HASH_SHIFT_SZ)
+#define HASH_FUNC(id) \
+ ((((uintptr_t)((id)->val[1]) >> 3) + \
+ ((uintptr_t)((id)->val[1]) >> (3 + HASH_SHIFT_SZ)) + \
+ ((uintptr_t)((id)->val[1]) >> (3 + 2 * HASH_SHIFT_SZ)) + \
+ ((uintptr_t)((id)->val[0]) >> 3) + \
+ ((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) + \
+ ((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) & \
+ (HASH_SIZE - 1))
+
+/*
+ * A small, invalid value we can compare against to find the highest scheduling
+ * priority.
+ */
+#define BELOW_MINPRI INT_MIN
+
+/*
+ * We place the per-chain lock next to the pointer to the chain itself.
+ * When compared to an array of orthogonal locks, this reduces false sharing
+ * (though adjacent entries can still be falsely shared -- just not as many),
+ * while having the additional bonus of increasing locality.
+ */
+typedef struct futex_hash {
+ kmutex_t fh_lock;
+ fwaiter_t *fh_waiters;
+} futex_hash_t;
+
+static futex_hash_t futex_hash[HASH_SIZE];
+
+static void
+futex_hashin(fwaiter_t *fwp)
+{
+ int index;
+
+ index = HASH_FUNC(&fwp->fw_memid);
+ ASSERT(MUTEX_HELD(&futex_hash[index].fh_lock));
+
+ fwp->fw_prev = NULL;
+ fwp->fw_next = futex_hash[index].fh_waiters;
+ if (fwp->fw_next)
+ fwp->fw_next->fw_prev = fwp;
+ futex_hash[index].fh_waiters = fwp;
+}
+
+static void
+futex_hashout(fwaiter_t *fwp)
+{
+ int index;
+
+ index = HASH_FUNC(&fwp->fw_memid);
+ ASSERT(MUTEX_HELD(&futex_hash[index].fh_lock));
+
+ if (fwp->fw_prev)
+ fwp->fw_prev->fw_next = fwp->fw_next;
+ if (fwp->fw_next)
+ fwp->fw_next->fw_prev = fwp->fw_prev;
+ if (futex_hash[index].fh_waiters == fwp)
+ futex_hash[index].fh_waiters = fwp->fw_next;
+
+ fwp->fw_prev = NULL;
+ fwp->fw_next = NULL;
+}
+
+/*
+ * Go to sleep until somebody does a WAKE operation on this futex, we get a
+ * signal, or the timeout expires.
+ */
+static int
+futex_wait(memid_t *memid, caddr_t addr,
+ int val, timespec_t *timeout, uint32_t bits, boolean_t hrtime)
+{
+ kthread_t *t = curthread;
+ lx_lwp_data_t *lwpd = ttolxlwp(t);
+ fwaiter_t *fwp = &lwpd->br_fwaiter;
+ int err, ret;
+ int32_t curval;
+ int index;
+
+ /*
+ * The LMS_USER_LOCK micro state becomes valid if we sleep; otherwise
+ * our time will accrue against LMS_SYSTEM. Use of this micro state
+ * is modelled on lwp_mutex_timedlock(), a native analogue of
+ * futex_wait().
+ */
+ (void) new_mstate(t, LMS_USER_LOCK);
+
+ fwp->fw_woken = 0;
+ fwp->fw_bits = bits;
+ fwp->fw_tid = 0;
+
+ MEMID_COPY(memid, &fwp->fw_memid);
+ cv_init(&fwp->fw_cv, NULL, CV_DEFAULT, NULL);
+
+ index = HASH_FUNC(&fwp->fw_memid);
+ mutex_enter(&futex_hash[index].fh_lock);
+
+ if (fuword32(addr, (uint32_t *)&curval)) {
+ err = set_errno(EFAULT);
+ goto out;
+ }
+ if (curval != val) {
+ err = set_errno(EWOULDBLOCK);
+ goto out;
+ }
+
+ futex_hashin(fwp);
+
+ err = 0;
+ while ((fwp->fw_woken == 0) && (err == 0)) {
+ /*
+ * If hrtime is set, we interpret timeout to be absolute and
+ * CLOCK_MONOTONIC-based; otherwise we treat it as absolute
+ * and CLOCK_REALTIME-based. (Strictly speaking -- or at least
+ * in as much as the term "strictly" means anything in the
+ * semantic shambles that is Linux -- FUTEX_WAIT defines its
+ * timeout to be CLOCK_MONOTONIC-based but limited by system
+ * clock interval; we treat these semantics as effectively
+ * CLOCK_REALTIME.)
+ */
+ if (hrtime) {
+ ret = cv_timedwait_sig_hrtime(&fwp->fw_cv,
+ &futex_hash[index].fh_lock, ts2hrt(timeout));
+ } else {
+ ret = cv_waituntil_sig(&fwp->fw_cv,
+ &futex_hash[index].fh_lock, timeout, timechanged);
+ }
+
+ if (ret < 0) {
+ err = set_errno(ETIMEDOUT);
+ } else if (ret == 0) {
+ /*
+ * According to signal(7), a futex(2) call with the
+ * FUTEX_WAIT operation is restartable.
+ */
+ ttolxlwp(t)->br_syscall_restart = B_TRUE;
+ err = set_errno(EINTR);
+ }
+ }
+
+ /*
+ * The futex is normally hashed out in wakeup. If we timed out or
+ * got a signal, we need to hash it out here instead.
+ */
+ if (fwp->fw_woken == 0)
+ futex_hashout(fwp);
+
+out:
+ mutex_exit(&futex_hash[index].fh_lock);
+
+ return (err);
+}
+
+/*
+ * Wake up to wake_threads threads that are blocked on the futex at memid.
+ */
+static int
+futex_wake(memid_t *memid, int wake_threads, uint32_t mask)
+{
+ fwaiter_t *fwp, *next;
+ int index;
+ int ret = 0;
+
+ index = HASH_FUNC(memid);
+
+ mutex_enter(&futex_hash[index].fh_lock);
+
+ for (fwp = futex_hash[index].fh_waiters;
+ fwp != NULL && ret < wake_threads; fwp = next) {
+ next = fwp->fw_next;
+ if (MEMID_EQUAL(&fwp->fw_memid, memid)) {
+ if (fwp->fw_tid != 0) {
+ /*
+ * A PI waiter. It is invalid to mix PI and
+ * non-PI usage on the same futex.
+ */
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (set_errno(EINVAL));
+ }
+
+ if ((fwp->fw_bits & mask)) {
+ futex_hashout(fwp);
+ fwp->fw_woken = 1;
+ cv_signal(&fwp->fw_cv);
+ ret++;
+ }
+ }
+ }
+
+ mutex_exit(&futex_hash[index].fh_lock);
+
+ return (ret);
+}
+
+static int
+futex_wake_op_execute(int32_t *addr, int32_t val3)
+{
+ int32_t op = FUTEX_OP_OP(val3);
+ int32_t cmp = FUTEX_OP_CMP(val3);
+ int32_t cmparg = FUTEX_OP_CMPARG(val3);
+ int32_t oparg, oldval, newval;
+ label_t ljb;
+ int rval;
+
+ if ((uintptr_t)addr >= KERNELBASE)
+ return (-EFAULT);
+
+ if (on_fault(&ljb))
+ return (-EFAULT);
+
+ oparg = FUTEX_OP_OPARG(val3);
+
+ do {
+ oldval = *addr;
+ newval = oparg;
+
+ switch (op) {
+ case FUTEX_OP_SET:
+ break;
+
+ case FUTEX_OP_ADD:
+ newval += oparg;
+ break;
+
+ case FUTEX_OP_OR:
+ newval |= oparg;
+ break;
+
+ case FUTEX_OP_ANDN:
+ newval &= ~oparg;
+ break;
+
+ case FUTEX_OP_XOR:
+ newval ^= oparg;
+ break;
+
+ default:
+ no_fault();
+ return (-EINVAL);
+ }
+ } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval);
+
+ no_fault();
+
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ rval = (oldval == cmparg);
+ break;
+
+ case FUTEX_OP_CMP_NE:
+ rval = (oldval != cmparg);
+ break;
+
+ case FUTEX_OP_CMP_LT:
+ rval = (oldval < cmparg);
+ break;
+
+ case FUTEX_OP_CMP_LE:
+ rval = (oldval <= cmparg);
+ break;
+
+ case FUTEX_OP_CMP_GT:
+ rval = (oldval > cmparg);
+ break;
+
+ case FUTEX_OP_CMP_GE:
+ rval = (oldval >= cmparg);
+ break;
+
+ default:
+ return (-EINVAL);
+ }
+
+ return (rval);
+}
+
+static int
+futex_wake_op(memid_t *memid, caddr_t addr2, memid_t *memid2,
+ int wake_threads, int wake_threads2, int val3)
+{
+ kmutex_t *l1, *l2;
+ int ret = 0, ret2 = 0, wake;
+ fwaiter_t *fwp, *next;
+ int index1, index2;
+
+ index1 = HASH_FUNC(memid);
+ index2 = HASH_FUNC(memid2);
+
+ if (index1 == index2) {
+ l1 = &futex_hash[index1].fh_lock;
+ l2 = NULL;
+ } else if (index1 < index2) {
+ l1 = &futex_hash[index1].fh_lock;
+ l2 = &futex_hash[index2].fh_lock;
+ } else {
+ l1 = &futex_hash[index2].fh_lock;
+ l2 = &futex_hash[index1].fh_lock;
+ }
+
+ mutex_enter(l1);
+ if (l2 != NULL)
+ mutex_enter(l2);
+
+ /* LINTED: alignment */
+ if ((wake = futex_wake_op_execute((int32_t *)addr2, val3)) < 0) {
+ (void) set_errno(-wake); /* convert back to positive errno */
+ ret = -1;
+ goto out;
+ }
+
+ for (fwp = futex_hash[index1].fh_waiters; fwp != NULL; fwp = next) {
+ next = fwp->fw_next;
+ if (!MEMID_EQUAL(&fwp->fw_memid, memid))
+ continue;
+
+ if (fwp->fw_tid != 0) {
+ /*
+ * A PI waiter. It is invalid to mix PI and non-PI
+ * usage on the same futex.
+ */
+ (void) set_errno(EINVAL);
+ ret = -1;
+ goto out;
+ }
+
+ futex_hashout(fwp);
+ fwp->fw_woken = 1;
+ cv_signal(&fwp->fw_cv);
+ if (++ret >= wake_threads) {
+ break;
+ }
+ }
+
+ if (!wake)
+ goto out;
+
+ for (fwp = futex_hash[index2].fh_waiters; fwp != NULL; fwp = next) {
+ next = fwp->fw_next;
+ if (!MEMID_EQUAL(&fwp->fw_memid, memid2))
+ continue;
+
+ if (fwp->fw_tid != 0) {
+ /*
+ * A PI waiter. It is invalid to mix PI and non-PI
+ * usage on the same futex.
+ */
+ (void) set_errno(EINVAL);
+ ret = -1;
+ goto out;
+ }
+
+ futex_hashout(fwp);
+ fwp->fw_woken = 1;
+ cv_signal(&fwp->fw_cv);
+ if (++ret2 >= wake_threads2) {
+ break;
+ }
+ }
+
+ ret += ret2;
+out:
+ if (l2 != NULL)
+ mutex_exit(l2);
+ mutex_exit(l1);
+
+ return (ret);
+}
+
+/*
+ * Wake up to wake_threads waiting on the futex at memid. If there are
+ * more than that many threads waiting, requeue the remaining threads on
+ * the futex at requeue_memid.
+ */
+static int
+futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads,
+ ulong_t requeue_threads, caddr_t addr, int *cmpval)
+{
+ fwaiter_t *fwp, *next;
+ int index1, index2;
+ int ret = 0;
+ int32_t curval;
+ kmutex_t *l1, *l2;
+
+ /*
+ * To ensure that we don't miss a wakeup if the value of cmpval
+ * changes, we need to grab locks on both the original and new hash
+ * buckets. To avoid deadlock, we always grab the lower-indexed
+ * lock first.
+ */
+ index1 = HASH_FUNC(memid);
+ index2 = HASH_FUNC(requeue_memid);
+
+ if (index1 == index2) {
+ l1 = &futex_hash[index1].fh_lock;
+ l2 = NULL;
+ } else if (index1 < index2) {
+ l1 = &futex_hash[index1].fh_lock;
+ l2 = &futex_hash[index2].fh_lock;
+ } else {
+ l1 = &futex_hash[index2].fh_lock;
+ l2 = &futex_hash[index1].fh_lock;
+ }
+
+ mutex_enter(l1);
+ if (l2 != NULL)
+ mutex_enter(l2);
+
+ if (cmpval != NULL) {
+ if (fuword32(addr, (uint32_t *)&curval)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
+ for (fwp = futex_hash[index1].fh_waiters; fwp != NULL; fwp = next) {
+ next = fwp->fw_next;
+ if (!MEMID_EQUAL(&fwp->fw_memid, memid))
+ continue;
+
+ futex_hashout(fwp);
+ if (ret++ < wake_threads) {
+ fwp->fw_woken = 1;
+ cv_signal(&fwp->fw_cv);
+ } else {
+ MEMID_COPY(requeue_memid, &fwp->fw_memid);
+ futex_hashin(fwp);
+
+ if ((ret - wake_threads) >= requeue_threads)
+ break;
+ }
+ }
+
+out:
+ if (l2 != NULL)
+ mutex_exit(l2);
+ mutex_exit(l1);
+
+ if (ret < 0)
+ return (set_errno(-ret));
+ return (ret);
+}
+
+/*
+ * Copy in the timeout provided by the application and convert it to an
+ * absolute timeout. Sadly, this is complicated by the different timeout
+ * semantics of FUTEX_WAIT vs. FUTEX_WAIT_BITSET vs. FUTEX_LOCK_PI. (Yes, you
+ * read that correctly; all three of these have different timeout semantics;
+ * see the block comment at the top of the file for commentary on this
+ * inanity.) This function doesn't attempt to clean up all of these
+ * differences, however; we will only copy the timer value in, perform some
+ * basic sanity checking, and (if it's an operation operating on a relative
+ * time, which is to say FUTEX_WAIT) adjust it to be absolute. All other
+ * nuances (namely, the resolution and clock of the timeout) are left up to
+ * the caller.
+ */
+static int
+get_timeout(void *lx_timeout, timestruc_t *timeout, int cmd)
+{
+ timestruc_t now;
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(lx_timeout, timeout, sizeof (timestruc_t)))
+ return (EFAULT);
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ timestruc32_t timeout32;
+ if (copyin(lx_timeout, &timeout32, sizeof (timestruc32_t)))
+ return (EFAULT);
+ timeout->tv_sec = (time_t)timeout32.tv_sec;
+ timeout->tv_nsec = timeout32.tv_nsec;
+ }
+#endif
+ if (itimerspecfix(timeout))
+ return (EINVAL);
+
+ if (cmd == FUTEX_WAIT) {
+ /*
+ * We've been given a relative time; add it to the current
+ * time to derive an absolute time.
+ */
+ gethrestime(&now);
+ timespecadd(timeout, &now);
+ }
+
+ return (0);
+}
+
+/*
+ * Attempt to take the futex. If currently held, enqueue (sleep) on the futex
+ * until a thread performs futex_unlock_pi, we get a signal, or the timeout
+ * expires. If 'is_trylock' is true and the futex is currently held, return
+ * EAGAIN immediately.
+ */
+static int
+futex_lock_pi(memid_t *memid, uint32_t *addr, timespec_t *timeout,
+ boolean_t is_trylock)
+{
+ kthread_t *t = curthread;
+ lx_lwp_data_t *lwpd = ttolxlwp(t);
+ fwaiter_t *fwp = &lwpd->br_fwaiter;
+ fwaiter_t *f_fwp;
+ int fpri, mypri;
+ int err;
+ int index;
+ /* volatile to silence gcc clobber warning for longjmp */
+ volatile pid_t mytid;
+ pid_t ftid; /* current futex holder tid */
+ proc_t *fproc = NULL; /* current futex holder proc */
+ kthread_t *fthrd; /* current futex holder thread */
+ volatile uint32_t oldval;
+
+ if ((uintptr_t)addr >= KERNELBASE)
+ return (set_errno(EFAULT));
+
+ mytid = (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid);
+
+ /*
+ * Have to take mutex first to prevent the following race with unlock:
+ * a) T1 sees a tid in the futex and atomically sets FUTEX_WAITERS.
+ * b) T2 calls unlock, sees there are waiters, but since nothing is in
+ * the queue yet, it simply returns with the futex now containing 0.
+ * c) T1 proceeds to enqueue itself.
+ * At this point nothing will ever wake T1.
+ */
+ index = HASH_FUNC(memid);
+ mutex_enter(&futex_hash[index].fh_lock);
+
+ /* It would be very unusual to actually loop here. */
+ oldval = 0;
+ /* CONSTCOND */
+ while (1) {
+ uint32_t curval;
+ label_t ljb;
+
+ if (on_fault(&ljb)) {
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * We optimistically try to set our tid on the off chance that
+ * the futex was released after we initiated the syscall. That
+ * may work but it is the unlikely path and is usually just our
+ * way of getting the current value. This also handles the
+ * retry in the case when the futex only has the high bits set.
+ */
+ curval = atomic_cas_32(addr, oldval, mytid);
+ if (oldval == curval) {
+ no_fault();
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (0);
+ }
+
+ oldval = curval;
+ ftid = oldval & FUTEX_TID_MASK;
+ /* high bits were only ones set, so we retry to set our tid */
+ if (ftid == 0) {
+ no_fault();
+ continue;
+ }
+
+ if (ftid == mytid) {
+ no_fault();
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (set_errno(EDEADLK));
+ }
+
+ /* The futex is currently held by another thread. */
+ if (is_trylock) {
+ no_fault();
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (set_errno(EAGAIN));
+ }
+
+ curval = atomic_cas_32(addr, oldval, oldval | FUTEX_WAITERS);
+ no_fault();
+ if (curval == oldval) {
+ /*
+ * We set the WAITERS bit so now we can enqueue our
+ * thread on the mutex. This is the typical path.
+ */
+ oldval |= FUTEX_WAITERS;
+ break;
+ }
+
+ /*
+ * The rare case when a change snuck into the window between
+ * first getting the futex value and updating it; retry.
+ */
+ oldval = 0;
+ }
+
+ /*
+ * Determine if the current futex holder's priority needs to inherit
+ * our priority (only if it should be increased).
+ *
+ * If a non-branded proc is sharing this futex(!?) then we don't
+ * interact with it. This seems like it would only occur maliciously.
+ * That proc will never be able to call futex(2) to unlock the futex.
+ * We just return ESRCH for this invalid case.
+ *
+ * Otherwise, get the holder's priority and if necessary, bump it up to
+ * our level.
+ */
+ mutex_enter(&curproc->p_lock);
+ (void) CL_DOPRIO(curthread, kcred, 0, &mypri);
+ mutex_exit(&curproc->p_lock);
+
+ if (lx_lpid_lock(ftid, curzone, 0, &fproc, &fthrd) != 0) {
+ label_t ljb;
+
+ if (on_fault(&ljb) == 0) {
+ (void) atomic_cas_32(addr, oldval,
+ oldval | FUTEX_OWNER_DIED);
+ }
+ no_fault();
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (set_errno(ESRCH));
+ }
+ if (!PROC_IS_BRANDED(fproc)) {
+ mutex_exit(&fproc->p_lock);
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (set_errno(ESRCH));
+ }
+
+ ASSERT(MUTEX_HELD(&fproc->p_lock));
+ (void) CL_DOPRIO(fthrd, kcred, 0, &fpri);
+
+ f_fwp = &lwptolxlwp(ttolwp(fthrd))->br_fwaiter;
+ if (mypri > fpri) {
+ /* Save holder's current pri if not already bumped up */
+ if (!f_fwp->fw_pri_up)
+ f_fwp->fw_opri = fpri;
+ f_fwp->fw_pri_up = B_TRUE;
+ DTRACE_PROBE2(futex__lck__pri, int, mypri, int, fpri);
+ CL_DOPRIO(fthrd, kcred, mypri - fpri, &fpri);
+ }
+
+ /*
+ * If we haven't already been bumped by some other thread then
+ * record our pri at time of enqueue.
+ */
+ if (!fwp->fw_pri_up) {
+ fwp->fw_opri = mypri;
+ }
+ mutex_exit(&fproc->p_lock);
+
+ /*
+ * Enqueue our thread on the mutex. This is similar to futex_wait().
+ * See futex_wait() for LMS_USER_LOCK state description.
+ */
+ (void) new_mstate(t, LMS_USER_LOCK);
+
+ fwp->fw_woken = 0;
+ fwp->fw_bits = 0;
+ fwp->fw_tid = mytid;
+ MEMID_COPY(memid, &fwp->fw_memid);
+ cv_init(&fwp->fw_cv, NULL, CV_DEFAULT, NULL);
+
+ futex_hashin(fwp);
+
+ err = 0;
+ while (fwp->fw_woken == 0 && err == 0) {
+ int ret;
+
+ ret = cv_waituntil_sig(&fwp->fw_cv, &futex_hash[index].fh_lock,
+ timeout, timechanged);
+ if (ret < 0) {
+ err = set_errno(ETIMEDOUT);
+ } else if (ret == 0) {
+ /* EINTR is not valid for futex_lock_pi */
+ err = set_errno(EAGAIN);
+ }
+ }
+
+ /*
+ * The futex is normally hashed out in futex_unlock_pi. If we timed out
+ * or got a signal, we need to hash it out here instead.
+ */
+ if (fwp->fw_woken == 0)
+ futex_hashout(fwp);
+
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (err);
+}
+
+/*
+ * This must be a separate function to prevent compiler complaints about
+ * clobbering variables via longjmp (on_fault). When setting the new owner we
+ * must preserve the current WAITERS and OWNER_DIED bits.
+ */
+static int
+futex_unlock_pi_waiter(fwaiter_t *fnd_fwp, uint32_t *addr, uint32_t curval)
+{
+ label_t ljb;
+ pid_t tid;
+
+ if (on_fault(&ljb)) {
+ return (EFAULT);
+ }
+
+ /* No waiter on this futex; again, not normal, but not an error. */
+ if (fnd_fwp == NULL) {
+ int res = 0;
+ if (atomic_cas_32(addr, curval,
+ 0 | (curval & FUTEX_OWNER_DIED)) != curval)
+ res = EINVAL;
+ no_fault();
+ return (res);
+ }
+
+ tid = fnd_fwp->fw_tid | (curval & (FUTEX_WAITERS | FUTEX_OWNER_DIED));
+ if (atomic_cas_32(addr, curval, tid) != curval) {
+ /*
+ * The value was changed behind our back, return an error and
+ * don't dequeue the waiter.
+ */
+ no_fault();
+ return (EINVAL);
+ }
+
+ no_fault();
+
+ futex_hashout(fnd_fwp);
+ fnd_fwp->fw_woken = 1;
+ cv_signal(&fnd_fwp->fw_cv);
+
+ return (0);
+}
+
+/*
+ * Paired with futex_lock_pi; wake up highest priority thread that is blocked
+ * on the futex at memid. A non-zero 'clean_tid' argument is used for a PI
+ * futex during robust or trylock cleanup when the calling thread may not own
+ * the futex. During cleanup we check that the futex contains the expected
+ * tid to avoid cleanup races.
+ */
+static int
+futex_unlock_pi(memid_t *memid, uint32_t *addr, pid_t clean_tid)
+{
+ kthread_t *t = curthread;
+ lx_lwp_data_t *lwpd = ttolxlwp(t);
+ fwaiter_t *fwp, *fnd_fwp;
+ uint32_t curval;
+ pid_t mytid;
+ pid_t holder_tid;
+ int index;
+ int hipri;
+ int res;
+
+ if ((uintptr_t)addr >= KERNELBASE)
+ return (EFAULT);
+
+ mytid = (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid);
+
+ /* See comment in futex_lock_pi for why we take the mutex first. */
+ index = HASH_FUNC(memid);
+ mutex_enter(&futex_hash[index].fh_lock);
+
+ if (fuword32(addr, &curval)) {
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (EFAULT);
+ }
+
+ holder_tid = curval & FUTEX_TID_MASK;
+ if (clean_tid == 0) {
+ /* Not cleaning up so we must hold the futex */
+ if (holder_tid != mytid) {
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (EPERM);
+ }
+ } else {
+ /*
+ * We're doing cleanup but we want to check if another thread
+ * already did the cleanup due to a race before we took the
+ * futex_hash.fh_lock.
+ *
+ * There are two possible cases here:
+ * 1) During robust cleanup we already cleared the dead tid
+ * from the futex and set the FUTEX_OWNER_DIED bit.
+ * 2) During trylock cleanup we want to be sure the tid we
+ * saw in the futex before we took the futex_hash lock
+ * is still there and that we did not race with another
+ * trylock also doing cleanup.
+ */
+ DTRACE_PROBE2(futex__unl__clean, int, curval, int, clean_tid);
+ if ((curval & FUTEX_OWNER_DIED) != 0) {
+ if (holder_tid != 0) {
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (0);
+ }
+ } else if (holder_tid != clean_tid) {
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (0);
+ }
+ }
+
+ /*
+ * If necessary, restore our old priority. Since we only ever bump up
+ * the priority, our incr should be negative, but we allow for the
+ * case where the priority was lowered in some other way while we held
+ * the futex. Also, we only reset our priority on a true unlock, not
+ * when cleaning up, as indicated by clean_tid.
+ */
+ if (clean_tid == 0) {
+ fwp = &lwpd->br_fwaiter;
+ if (fwp->fw_pri_up) {
+ int curpri;
+ int incr;
+
+ mutex_enter(&curproc->p_lock);
+ CL_DOPRIO(curthread, kcred, 0, &curpri);
+ DTRACE_PROBE2(futex__unl__pri, int, fwp->fw_opri,
+ int, curpri);
+ incr = fwp->fw_opri - curpri;
+ if (incr < 0) {
+ CL_DOPRIO(curthread, kcred, incr, &curpri);
+ }
+ mutex_exit(&curproc->p_lock);
+ fwp->fw_pri_up = B_FALSE;
+ }
+ }
+
+ /*
+ * Normally an application wouldn't make the syscall if the WAITERS
+ * bit is not set, but we also come through here on robust and trylock
+ * cleanup. Preserve the OWNER_DIED bit even though there are no
+ * waiters and we're just clearing the tid.
+ */
+ if ((curval & FUTEX_WAITERS) == 0) {
+ label_t fjb;
+
+ res = 0;
+
+ if (on_fault(&fjb)) {
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (EFAULT);
+ }
+ if (atomic_cas_32(addr, curval,
+ 0 | (curval & FUTEX_OWNER_DIED)) != curval) {
+ res = EINVAL;
+ }
+
+ no_fault();
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (res);
+ }
+
+ /* Find the highest priority waiter. */
+ hipri = BELOW_MINPRI;
+ fnd_fwp = NULL;
+ for (fwp = futex_hash[index].fh_waiters; fwp != NULL;
+ fwp = fwp->fw_next) {
+ if (MEMID_EQUAL(&fwp->fw_memid, memid)) {
+ if (fwp->fw_tid == 0) {
+ /*
+ * A non-PI waiter. It is invalid to mix PI and
+ * non-PI usage on the same futex.
+ */
+ no_fault();
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (EINVAL);
+ }
+ /*
+ * Because futex_hashin inserts at the head of the list
+ * we want to find the oldest entry with the highest
+ * priority (hence >=).
+ */
+ if (fwp->fw_opri >= hipri) {
+ fnd_fwp = fwp;
+ hipri = fwp->fw_opri;
+ }
+ }
+ }
+
+ res = futex_unlock_pi_waiter(fnd_fwp, addr, curval);
+ mutex_exit(&futex_hash[index].fh_lock);
+ return (res);
+}
+
+/*
+ * Handle the case where the futex holder is gone and try to recover. Trylock
+ * will never enqueue on the futex and must return EAGAIN if it is held by
+ * a live process.
+ */
+static int
+futex_trylock_pi(memid_t *memid, uint32_t *addr)
+{
+ uint32_t curval;
+ pid_t ftid; /* current futex holder tid */
+ proc_t *fproc = NULL; /* current futex holder proc */
+ kthread_t *fthrd; /* current futex holder thread */
+
+ if ((uintptr_t)addr >= KERNELBASE)
+ return (set_errno(EFAULT));
+
+ if (fuword32(addr, &curval))
+ return (set_errno(EFAULT));
+
+ /* The futex is free, use the normal flow. */
+ if (curval == 0)
+ return (futex_lock_pi(memid, addr, NULL, B_TRUE));
+
+ /* Determine if the current futex holder is still alive. */
+ ftid = curval & FUTEX_TID_MASK;
+ if (lx_lpid_lock(ftid, curzone, 0, &fproc, &fthrd) == 0) {
+ mutex_exit(&fproc->p_lock);
+ } else {
+ /*
+ * The current holder is gone. Unlock then take the lock.
+ * Ignore any error that may result from two threads racing to
+ * cleanup.
+ */
+ (void) futex_unlock_pi(memid, addr, ftid);
+ }
+ return (futex_lock_pi(memid, addr, NULL, B_TRUE));
+}
+
+long
+lx_futex(uintptr_t addr, int op, int val, uintptr_t lx_timeout,
+ uintptr_t addr2, int val3)
+{
+ struct as *as = curproc->p_as;
+ memid_t memid, memid2;
+ timestruc_t timeout;
+ timestruc_t *tptr = NULL;
+ int val2 = 0;
+ int rval = 0;
+ int cmd = op & FUTEX_CMD_MASK;
+ int private = op & FUTEX_PRIVATE_FLAG;
+ char dmsg[32];
+
+ /* must be aligned on int boundary */
+ if (addr & 0x3)
+ return (set_errno(EINVAL));
+
+ /* Sanity check the futex command */
+ if (cmd < 0 || cmd > FUTEX_MAX_CMD)
+ return (set_errno(EINVAL));
+
+ if (cmd == FUTEX_FD) {
+ /*
+ * FUTEX_FD was sentenced to death for grievous crimes of
+ * semantics against humanity; it has been ripped out of Linux
+ * and will never be supported by us.
+ */
+ (void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd);
+ lx_unsupported(dmsg);
+ return (set_errno(ENOSYS));
+ }
+
+ switch (cmd) {
+ case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_CMP_REQUEUE_PI:
+ /*
+ * These are operations that we don't currently support, but
+ * may well need to in the future. For now, callers need to
+ * deal with these being missing -- but if and as that changes,
+ * they may well need to be implemented.
+ */
+ (void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd);
+ lx_unsupported(dmsg);
+ return (set_errno(ENOSYS));
+ }
+
+ if ((op & FUTEX_CLOCK_REALTIME) && cmd != FUTEX_WAIT_BITSET) {
+ /*
+ * Linux only allows FUTEX_CLOCK_REALTIME to be set on the
+ * FUTEX_WAIT_BITSET and FUTEX_WAIT_REQUEUE_PI commands.
+ */
+ return (set_errno(ENOSYS));
+ }
+
+ /* Copy in the timeout structure from userspace. */
+ if ((cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_LOCK_PI) && lx_timeout != NULL) {
+ rval = get_timeout((timespec_t *)lx_timeout, &timeout, cmd);
+
+ if (rval != 0)
+ return (set_errno(rval));
+ tptr = &timeout;
+ }
+
+ switch (cmd) {
+ case FUTEX_REQUEUE:
+ case FUTEX_CMP_REQUEUE:
+ case FUTEX_WAKE_OP:
+ /*
+ * lx_timeout is nominally a pointer to a userspace address.
+ * For several commands, however, it actually contains
+ * an additional integer parameter. This is horrible, and
+ * the people who did this to us should be sorry.
+ */
+ val2 = (int)lx_timeout;
+ }
+
+ /*
+ * Translate the process-specific, user-space futex virtual
+ * address(es) to a universal memid. If the private bit is set, we
+ * can just use our as plus the virtual address, saving quite a bit
+ * of effort.
+ */
+ if (private) {
+ memid.val[0] = (uintptr_t)as;
+ memid.val[1] = (uintptr_t)addr;
+ } else {
+ rval = as_getmemid(as, (void *)addr, &memid);
+ if (rval != 0)
+ return (set_errno(rval));
+ }
+
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_WAKE_OP) {
+ if (addr2 & 0x3)
+ return (set_errno(EINVAL));
+
+ if (private) {
+ memid2.val[0] = (uintptr_t)as;
+ memid2.val[1] = (uintptr_t)addr2;
+ } else {
+ rval = as_getmemid(as, (void *)addr2, &memid2);
+ if (rval)
+ return (set_errno(rval));
+ }
+ }
+
+ switch (cmd) {
+ case FUTEX_WAIT:
+ rval = futex_wait(&memid, (void *)addr, val,
+ tptr, FUTEX_BITSET_MATCH_ANY, B_FALSE);
+ break;
+
+ case FUTEX_WAIT_BITSET:
+ rval = futex_wait(&memid, (void *)addr, val, tptr, val3,
+ (op & FUTEX_CLOCK_REALTIME) ? B_FALSE : B_TRUE);
+ break;
+
+ case FUTEX_WAKE:
+ rval = futex_wake(&memid, val, FUTEX_BITSET_MATCH_ANY);
+ break;
+
+ case FUTEX_WAKE_BITSET:
+ rval = futex_wake(&memid, val, val3);
+ break;
+
+ case FUTEX_WAKE_OP:
+ rval = futex_wake_op(&memid, (void *)addr2, &memid2,
+ val, val2, val3);
+ break;
+
+ case FUTEX_CMP_REQUEUE:
+ case FUTEX_REQUEUE:
+ rval = futex_requeue(&memid, &memid2, val,
+ val2, (void *)addr2, &val3);
+
+ break;
+
+ case FUTEX_LOCK_PI:
+ rval = futex_lock_pi(&memid, (uint32_t *)addr, tptr, B_FALSE);
+ break;
+
+ case FUTEX_TRYLOCK_PI:
+ rval = futex_trylock_pi(&memid, (uint32_t *)addr);
+ break;
+
+ case FUTEX_UNLOCK_PI:
+ rval = futex_unlock_pi(&memid, (uint32_t *)addr, 0);
+ if (rval != 0)
+ (void) set_errno(rval);
+ break;
+ }
+
+ return (rval);
+}
+
+/*
+ * Wake the next waiter if the thread holding the futex has exited without
+ * releasing the futex.
+ */
+static void
+futex_robust_wake(memid_t *memid, uint32_t tid)
+{
+ fwaiter_t *fwp;
+ int index;
+
+ index = HASH_FUNC(memid);
+
+ mutex_enter(&futex_hash[index].fh_lock);
+
+ for (fwp = futex_hash[index].fh_waiters; fwp != NULL;
+ fwp = fwp->fw_next) {
+ if (MEMID_EQUAL(&fwp->fw_memid, memid))
+ break;
+ }
+
+ if (fwp != NULL) {
+ if (fwp->fw_tid != 0) {
+ /*
+ * This is a PI futex and there is a waiter; unlock the
+ * futex in cleanup mode. Ignore errors, which are very
+ * unlikely, but could happen if the futex was in an
+ * unexpected state due to some other cleanup, such as
+ * might happen with a concurrent trylock call.
+ */
+ mutex_exit(&futex_hash[index].fh_lock);
+ (void) futex_unlock_pi(memid,
+ (uint32_t *)(uintptr_t)memid->val[1], tid);
+ return;
+ }
+
+ /* non-PI futex, just wake it */
+ futex_hashout(fwp);
+ fwp->fw_woken = 1;
+ cv_signal(&fwp->fw_cv);
+ }
+
+ mutex_exit(&futex_hash[index].fh_lock);
+}
+
+/*
+ * Does the dirty work of actually dropping a held robust lock in the event
+ * of the untimely death of the owner; see lx_futex_robust_exit(), below.
+ */
+static void
+lx_futex_robust_drop(uintptr_t addr, uint32_t tid)
+{
+ memid_t memid;
+ uint32_t oldval, newval;
+
+ VERIFY(addr + sizeof (uint32_t) < KERNELBASE);
+
+ do {
+ fuword32_noerr((void *)addr, &oldval);
+
+ if ((oldval & FUTEX_TID_MASK) != tid)
+ return;
+
+ newval = (oldval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval);
+
+ /*
+ * We have now denoted that this lock's owner is dead; we need to
+ * wake any waiters.
+ */
+ if (as_getmemid(curproc->p_as, (void *)addr, &memid) != 0)
+ return;
+
+ futex_robust_wake(&memid, tid);
+}
+
+/*
+ * Called when a thread is exiting. The role of the kernel is very clearly
+ * spelled out in the Linux design document entitled robust-futex-ABI.txt:
+ * we must (carefully!) iterate over the list of held locks pointed to by
+ * the robust list head; for each lock, we'll check to see if the calling
+ * (exiting) thread is the owner, and if so, denote that the lock is dead
+ * and wake any waiters. (The "pending" field of the head points to a lock
+ * that is in transition; it should be dropped if held.) If there are any
+ * errors through here at all (including memory operations), we abort the
+ * entire operation.
+ */
+void
+lx_futex_robust_exit(uintptr_t addr, uint32_t tid)
+{
+ futex_robust_list_t list;
+ uintptr_t entry, next;
+ model_t model = get_udatamodel();
+ int length = 0;
+ label_t ljb;
+
+ if (on_fault(&ljb))
+ return;
+
+ if (addr + sizeof (futex_robust_list_t) >= KERNELBASE)
+ goto out;
+
+ if (model == DATAMODEL_NATIVE) {
+ copyin_noerr((void *)addr, &list, sizeof (list));
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ futex_robust_list32_t list32;
+
+ copyin_noerr((void *)addr, &list32, sizeof (list32));
+ list.frl_head = list32.frl_head;
+ list.frl_offset = list32.frl_offset;
+ list.frl_pending = list32.frl_pending;
+ }
+#endif
+
+ /*
+ * Strip off the PI bit, if any.
+ */
+ entry = list.frl_head & ~FUTEX_ROBUST_LOCK_PI;
+
+ while (entry != addr && length++ < FUTEX_ROBUST_LIST_LIMIT) {
+ if (entry + list.frl_offset + sizeof (uint32_t) >= KERNELBASE)
+ goto out;
+
+ if (model == DATAMODEL_NATIVE) {
+ fulword_noerr((void *)entry, &next);
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ uint32_t next32;
+ fuword32_noerr((void *)entry, &next32);
+ next = next32;
+ }
+#endif
+
+ /*
+ * Drop the robust mutex -- but only if our pending lock didn't
+ * somehow sneak on there.
+ */
+ if (entry != list.frl_pending)
+ lx_futex_robust_drop(entry + list.frl_offset, tid);
+
+ entry = next & ~FUTEX_ROBUST_LOCK_PI;
+ }
+
+ /*
+ * Finally, drop the pending lock if there is one.
+ */
+ if (list.frl_pending != NULL && list.frl_pending +
+ list.frl_offset + sizeof (uint32_t) < KERNELBASE)
+ lx_futex_robust_drop(list.frl_pending + list.frl_offset, tid);
+
+out:
+ no_fault();
+}
+
+long
+lx_set_robust_list(void *listp, size_t len)
+{
+ proc_t *p = curproc;
+ klwp_t *lwp = ttolwp(curthread);
+ struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (len != sizeof (futex_robust_list_t))
+ return (set_errno(EINVAL));
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ if (len != sizeof (futex_robust_list32_t))
+ return (set_errno(EINVAL));
+ }
+#endif
+
+ /*
+ * To assure that we are serialized with respect to any racing call
+ * to lx_get_robust_list(), we lock ourselves to set the value. (Note
+ * that sprunlock() drops p_lock.)
+ */
+ mutex_enter(&p->p_lock);
+ sprlock_proc(p);
+ lwpd->br_robust_list = listp;
+ sprunlock(p);
+
+ return (0);
+}
+
+long
+lx_get_robust_list(pid_t pid, void **listp, size_t *lenp)
+{
+ model_t model = get_udatamodel();
+ proc_t *rproc;
+ kthread_t *rthr;
+ klwp_t *rlwp;
+ lx_lwp_data_t *rlwpd;
+ void *list;
+ int err = 0;
+
+ if (pid == 0) {
+ /*
+ * A pid of 0 denotes the current thread; we lock the current
+ * process even though it isn't strictly necessary (we can't
+ * race with set_robust_list() because a thread may only set
+ * its robust list on itself).
+ */
+ rproc = curproc;
+ rlwpd = lwptolxlwp(ttolwp(curthread));
+ mutex_enter(&curproc->p_lock);
+ sprlock_proc(rproc);
+ } else {
+ if (lx_lpid_lock(pid, curzone, LXP_PRLOCK, &rproc,
+ &rthr) != 0) {
+ return (set_errno(ESRCH));
+ }
+
+ if (rproc->p_model != model ||
+ (rlwp = ttolwp(rthr)) == NULL ||
+ (rlwpd = lwptolxlwp(rlwp)) == NULL) {
+ /*
+ * The target process does not match our data model, or
+ * we couldn't find the LWP, or the target process is
+ * not branded.
+ */
+ err = ESRCH;
+ goto out;
+ }
+ }
+
+ if (curproc != rproc &&
+ priv_proc_cred_perm(curproc->p_cred, rproc, NULL, VREAD) != 0) {
+ /*
+ * We don't have the permission to examine the target.
+ */
+ err = EPERM;
+ goto out;
+ }
+
+ list = rlwpd->br_robust_list;
+
+out:
+ sprunlock(rproc);
+
+ if (err != 0)
+ return (set_errno(err));
+
+ if (model == DATAMODEL_NATIVE) {
+ if (sulword(listp, (uintptr_t)list) != 0)
+ return (set_errno(EFAULT));
+
+ if (sulword(lenp, sizeof (futex_robust_list_t)) != 0)
+ return (set_errno(EFAULT));
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ if (suword32(listp, (uint32_t)(uintptr_t)list) != 0)
+ return (set_errno(EFAULT));
+
+ if (suword32(lenp, sizeof (futex_robust_list32_t)) != 0)
+ return (set_errno(EFAULT));
+ }
+#endif
+
+ return (0);
+}
+
+void
+lx_futex_init(void)
+{
+ int i;
+
+ for (i = 0; i < HASH_SIZE; i++)
+ mutex_init(&futex_hash[i].fh_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+int
+lx_futex_fini(void)
+{
+ int i, err;
+
+ err = 0;
+ for (i = 0; (err == 0) && (i < HASH_SIZE); i++) {
+ mutex_enter(&futex_hash[i].fh_lock);
+ if (futex_hash[i].fh_waiters != NULL)
+ err = EBUSY;
+ mutex_exit(&futex_hash[i].fh_lock);
+ }
+ return (err);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c
new file mode 100644
index 0000000000..275a781fa0
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c
@@ -0,0 +1,52 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/pathname.h>
+
+/*
+ * getcwd() - Linux syscall semantics are slightly different; we need to return
+ * the length of the pathname copied (+1 for the terminating NUL byte).
+ */
+long
+lx_getcwd(char *buf, int size)
+{
+ int len;
+ int error;
+ vnode_t *vp;
+ char path[MAXPATHLEN + 1];
+
+ mutex_enter(&curproc->p_lock);
+ vp = PTOU(curproc)->u_cdir;
+ VN_HOLD(vp);
+ mutex_exit(&curproc->p_lock);
+ if ((error = vnodetopath(NULL, vp, path, sizeof (path), CRED())) != 0) {
+ VN_RELE(vp);
+ return (set_errno(error));
+ }
+ VN_RELE(vp);
+
+ len = strlen(path) + 1;
+ if (len > size)
+ return (set_errno(ERANGE));
+
+ if (copyout(path, buf, len) != 0)
+ return (set_errno(EFAULT));
+
+ return (len);
+}
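+
+/*
+ * A user-land sketch (excluded from compilation) of the semantics above:
+ * the raw Linux getcwd syscall returns the number of bytes copied,
+ * including the terminating NUL, rather than a pointer as libc does.
+ */
+#if 0
+#include <sys/syscall.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+ char buf[1024];
+ long len = syscall(SYS_getcwd, buf, sizeof (buf));
+
+ /* For a working directory of "/tmp", len is 5: four chars plus NUL. */
+ (void) printf("%s: %ld bytes\n", buf, len);
+ return (0);
+}
+#endif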
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getdents.c b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c
new file mode 100644
index 0000000000..5bde892aea
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c
@@ -0,0 +1,416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/filio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/inttypes.h>
+#include <sys/vnode.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+
+#include <sys/lx_types.h>
+#include <sys/lx_misc.h>
+
+#define LX_NAMEMAX 256
+
+#define LX_GETDENTS_MAX_BUFSZ 65536
+
+/*
+ * See the comment in our lx_sysfs VFS code for a detailed explanation around
+ * the handling of 'd_type' here.
+ */
+#define LX_DT_UNKNOWN 0
+#define LX_DT_FIFO 1
+#define LX_DT_CHR 2
+#define LX_DT_DIR 4
+#define LX_DT_BLK 6
+#define LX_DT_REG 8
+#define LX_DT_LNK 10
+#define LX_DT_SOCK 12
+
+/*
+ * Set by lx_sysfs when it loads. lx_sysfs depends on the lx_brand module,
+ * so our module must load first and define the variables that lx_sysfs
+ * later fills in.
+ */
+int lx_sysfs_vfs_type;
+int (*lx_sysfs_vtype)(ino_t);
+
+/*
+ * Because the Linux dirent has an extra field (d_type), it's possible that
+ * each entry will be 8 bytes larger (and aligned to 8 bytes) due to padding.
+ * To prevent overrun during translation, the illumos-native buffer is sized
+ * pessimistically.
+ */
+#define LTOS_GETDENTS_BUFSZ(bufsz, datasz) \
+ (((bufsz) / (((datasz) + 15) & ~7)) * sizeof (struct dirent))
+
+/*
+ * The Linux d_type field lives at offset (d_reclen - 1); see the Linux
+ * getdents(2) man page. This macro assumes d_reclen is already set correctly.
+ */
+#define LX_DTYPE(l) *(((char *)l) + (l->d_reclen - 1))
+
+/*
+ * The record must be long enough to house the d_name string, its null
+ * terminator, and the d_type field; it is then padded to the nearest
+ * 8-byte boundary.
+ */
+#define LX_RECLEN(l, t) \
+ ((offsetof(t, d_name) + 2 + (l) + 7) & ~7)
+
+/*
+ * The bytes after the d_name string, up through d_reclen, must be zeroed;
+ * this includes the byte that NUL-terminates d_name.
+ */
+#define LX_ZEROLEN(l, t) \
+ (LX_RECLEN(l, t) - \
+ ((offsetof(t, d_name) + (l))))
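+
+/*
+ * A worked example of the two macros above: for a five-character name in
+ * struct lx_dirent_32 (defined below, with d_name at byte offset 10),
+ * LX_RECLEN is (10 + 2 + 5 + 7) & ~7 = 24, and LX_ZEROLEN is
+ * 24 - (10 + 5) = 9 -- covering the NUL terminator, the alignment padding,
+ * and the trailing d_type byte.
+ */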
+
+/* The output format of getdents differs if the caller is 32 or 64 bit. */
+struct lx_dirent_32 {
+ uint32_t d_ino;
+ int32_t d_off;
+ ushort_t d_reclen;
+ char d_name[1];
+ uchar_t d_type;
+};
+
+struct lx_dirent_64 {
+ uint64_t d_ino;
+ int64_t d_off;
+ ushort_t d_reclen;
+ char d_name[1];
+ uchar_t d_type;
+};
+
+static long
+lx_getdents_common(int fd, caddr_t uptr, size_t count,
+ unsigned int lx_size, int (*outcb)(caddr_t, caddr_t, int, boolean_t))
+{
+ vnode_t *vp;
+ boolean_t is_sysfs = B_FALSE;
+ file_t *fp;
+ struct uio auio;
+ struct iovec aiov;
+ int error, at_eof;
+ int sbufsz, lbufsz, bufsz;
+ void *lbuf, *sbuf;
+ size_t outb = 0;
+
+ if (count < lx_size) {
+ return (set_errno(EINVAL));
+ }
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VDIR) {
+ releasef(fd);
+ return (set_errno(ENOTDIR));
+ }
+ if (!(fp->f_flag & FREAD)) {
+ releasef(fd);
+ return (set_errno(EBADF));
+ }
+
+ if (vp->v_vfsp->vfs_fstype == lx_sysfs_vfs_type) {
+ is_sysfs = B_TRUE;
+ }
+
+ if (count > LX_GETDENTS_MAX_BUFSZ) {
+ /*
+ * If the target buffer passed to us is huge, keep the
+ * translation buffers moderate in size. Iteration will be
+ * used to fill the request.
+ */
+ lbufsz = LX_GETDENTS_MAX_BUFSZ;
+ sbufsz = LTOS_GETDENTS_BUFSZ(LX_GETDENTS_MAX_BUFSZ, lx_size);
+ } else if (count < (lx_size + MAXPATHLEN)) {
+ /*
+ * If the target buffer is tiny, allocate a Linux-format buffer
+ * big enough to hold at least one max-length row while keeping
+ * the illumos-format buffer pessimistic in size.
+ *
+ * Assuming the buffer is truly tiny, it's likely that the
+ * result will not fit and an EINVAL will be tossed.
+ */
+ lbufsz = (lx_size + MAXPATHLEN);
+ sbufsz = MAX((LTOS_GETDENTS_BUFSZ(count, lx_size)),
+ sizeof (struct dirent));
+ } else {
+ lbufsz = count;
+ sbufsz = LTOS_GETDENTS_BUFSZ(count, lx_size);
+ }
+ bufsz = sbufsz;
+ lbuf = kmem_alloc(lbufsz, KM_SLEEP);
+ sbuf = kmem_alloc(sbufsz, KM_SLEEP);
+
+ aiov.iov_base = sbuf;
+ aiov.iov_len = sbufsz;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = fp->f_offset;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = sbufsz;
+ auio.uio_fmode = 0;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ /*
+ * Since we use a conservative buffer allocation for the differing
+ * struct sizing and Linux places fewer limits on getdents buffers in
+ * general, there's a chance we'll undershoot on the record count.
+ * When this happens, we can simply repeat the READDIR operation until
+ * the available records are exhausted or we've filled the user buffer.
+ */
+ do {
+ int res;
+
+ (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &at_eof, NULL, 0);
+ VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
+ if (error != 0 || auio.uio_resid == sbufsz) {
+ break;
+ }
+ res = outcb(sbuf, lbuf, bufsz - auio.uio_resid, is_sysfs);
+ VERIFY(res <= lbufsz);
+ if (res == 0) {
+ /* no records to copyout from this batch */
+ break;
+ } else if (res > count) {
+ /*
+ * For very small buffer sizes, it's possible that a
+ * single record is too large due to a long filename.
+ */
+ error = EINVAL;
+ break;
+ }
+
+ VERIFY(outb + res <= count);
+ if (copyout(lbuf, (void *)(uptr + outb), res) != 0) {
+ error = EFAULT;
+ break;
+ }
+ outb += res;
+
+ /*
+ * We undershot the request buffer.
+ * Reset for another READDIR, taking care not to overshoot.
+ */
+ bufsz = MIN(sbufsz, LTOS_GETDENTS_BUFSZ(count - outb, lx_size));
+ auio.uio_resid = bufsz;
+ aiov.iov_len = bufsz;
+ aiov.iov_base = sbuf;
+
+ /*
+ * Continued progress is allowed only if EOF has not been
+ * reached and there is enough remaining buffer space to hold
+ * an entry with a max-length filename.
+ */
+ } while (at_eof == 0 && (count - outb) >= (lx_size + MAXPATHLEN));
+
+ kmem_free(lbuf, lbufsz);
+ kmem_free(sbuf, sbufsz);
+
+ if (error) {
+ releasef(fd);
+ return (set_errno(error));
+ }
+
+ fp->f_offset = auio.uio_loffset;
+ releasef(fd);
+ return (outb);
+}
+
+static int
+lx_get_sysfs_dtype(ino_t ino)
+{
+ vtype_t vt;
+
+ vt = lx_sysfs_vtype(ino);
+
+ switch (vt) {
+ case VREG: return (LX_DT_REG);
+ case VDIR: return (LX_DT_DIR);
+ case VBLK: return (LX_DT_BLK);
+ case VCHR: return (LX_DT_CHR);
+ case VLNK: return (LX_DT_LNK);
+ case VFIFO: return (LX_DT_FIFO);
+ case VSOCK: return (LX_DT_SOCK);
+ default: return (LX_DT_UNKNOWN);
+ }
+}
+
+static int
+lx_getdents_format32(caddr_t sbuf, caddr_t lbuf, int len, boolean_t is_sysfs)
+{
+ struct dirent *sd;
+ struct lx_dirent_32 *ld;
+ int namelen;
+ int size = 0;
+
+ while (len > 0) {
+ /* LINTED: alignment */
+ sd = (struct dirent *)sbuf;
+ /* LINTED: alignment */
+ ld = (struct lx_dirent_32 *)lbuf;
+ namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1);
+
+ ld->d_ino = sd->d_ino;
+ ld->d_off = sd->d_off;
+ (void) strncpy(ld->d_name, sd->d_name, namelen);
+ ld->d_name[namelen] = 0;
+ ld->d_reclen = (ushort_t)LX_RECLEN(namelen,
+ struct lx_dirent_32);
+ /* Zero out any alignment padding and d_type */
+ bzero(ld->d_name + namelen,
+ LX_ZEROLEN(namelen, struct lx_dirent_32));
+
+ if (is_sysfs) {
+ LX_DTYPE(ld) = lx_get_sysfs_dtype(ld->d_ino);
+ }
+
+ len -= sd->d_reclen;
+ size += ld->d_reclen;
+ sbuf += sd->d_reclen;
+ lbuf += ld->d_reclen;
+ }
+ return (size);
+}
+
+static int
+lx_getdents_format64(caddr_t sbuf, caddr_t lbuf, int len, boolean_t is_sysfs)
+{
+ struct dirent *sd;
+ struct lx_dirent_64 *ld;
+ int namelen;
+ int size = 0;
+
+ while (len > 0) {
+ /* LINTED: alignment */
+ sd = (struct dirent *)sbuf;
+ /* LINTED: alignment */
+ ld = (struct lx_dirent_64 *)lbuf;
+ namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1);
+
+ ld->d_ino = sd->d_ino;
+ ld->d_off = sd->d_off;
+ (void) strncpy(ld->d_name, sd->d_name, namelen);
+ ld->d_name[namelen] = 0;
+ ld->d_reclen = (ushort_t)LX_RECLEN(namelen,
+ struct lx_dirent_64);
+ /* Zero out any alignment padding and d_type */
+ bzero(ld->d_name + namelen,
+ LX_ZEROLEN(namelen, struct lx_dirent_64));
+
+ if (is_sysfs) {
+ LX_DTYPE(ld) = lx_get_sysfs_dtype(ld->d_ino);
+ }
+
+ len -= sd->d_reclen;
+ size += ld->d_reclen;
+ sbuf += sd->d_reclen;
+ lbuf += ld->d_reclen;
+ }
+ return (size);
+}
+
+long
+lx_getdents_32(int fd, caddr_t buf, size_t count)
+{
+ return (lx_getdents_common(fd, buf, count,
+ sizeof (struct lx_dirent_32), lx_getdents_format32));
+}
+
+long
+lx_getdents_64(int fd, caddr_t buf, size_t count)
+{
+ return (lx_getdents_common(fd, buf, count,
+ sizeof (struct lx_dirent_64), lx_getdents_format64));
+}
+
+struct lx_dirent64 {
+ uint64_t d_ino;
+ int64_t d_off;
+ ushort_t d_reclen;
+ uchar_t d_type;
+ char d_name[1];
+};
+
+#define LX_RECLEN64(namelen) \
+ ((offsetof(struct lx_dirent64, d_name) + 1 + (namelen) + 7) & ~7)
+
+#define LX_ZEROLEN64(namelen) \
+ (LX_RECLEN64(namelen) - \
+ ((offsetof(struct lx_dirent64, d_name) + (namelen))))
+
+static int
+lx_getdents64_format(caddr_t sbuf, caddr_t lbuf, int len, boolean_t is_sysfs)
+{
+ struct dirent *sd;
+ struct lx_dirent64 *ld;
+ int namelen;
+ int size = 0;
+
+ while (len > 0) {
+ /* LINTED: alignment */
+ sd = (struct dirent *)sbuf;
+ /* LINTED: alignment */
+ ld = (struct lx_dirent64 *)lbuf;
+ namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1);
+
+ ld->d_ino = sd->d_ino;
+ ld->d_off = sd->d_off;
+ ld->d_type = LX_DT_UNKNOWN;
+ (void) strncpy(ld->d_name, sd->d_name, namelen);
+ ld->d_name[namelen] = 0;
+ ld->d_reclen = (ushort_t)LX_RECLEN64(namelen);
+ /* Zero out any alignment padding */
+ bzero(ld->d_name + namelen, LX_ZEROLEN64(namelen));
+
+ if (is_sysfs) {
+ ld->d_type = lx_get_sysfs_dtype(ld->d_ino);
+ }
+
+ len -= sd->d_reclen;
+ size += ld->d_reclen;
+ sbuf += sd->d_reclen;
+ lbuf += ld->d_reclen;
+ }
+ return (size);
+}
+
+
+long
+lx_getdents64(int fd, caddr_t buf, size_t count)
+{
+ return (lx_getdents_common(fd, buf, count,
+ sizeof (struct lx_dirent64), lx_getdents64_format));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
new file mode 100644
index 0000000000..0ebd93304e
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/zone.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+/*
+ * return the pid
+ */
+long
+lx_getpid(void)
+{
+ lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+ long rv;
+
+ if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) {
+ rv = 1;
+ } else {
+ VERIFY(lwpd != NULL);
+
+ rv = lwpd->br_tgid;
+ }
+
+ return (rv);
+}
+
+/*
+ * return the parent pid
+ */
+long
+lx_getppid(void)
+{
+ return (lx_lwp_ppid(ttolwp(curthread), NULL, NULL));
+}
+
+/*
+ * return the thread id
+ */
+long
+lx_gettid(void)
+{
+ lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+
+ return (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c
new file mode 100644
index 0000000000..acc4073483
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c
@@ -0,0 +1,33 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+/*
+ * From "uts/common/syscall/getrandom.c":
+ */
+extern int getrandom(void *, size_t, int);
+
+long
+lx_getrandom(void *bufp, size_t buflen, int flags)
+{
+ /*
+ * According to signal(7), calls to getrandom(2) are restartable.
+ */
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+
+ return (getrandom(bufp, buflen, flags));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_id.c b/usr/src/uts/common/brand/lx/syscall/lx_id.c
new file mode 100644
index 0000000000..67f0fc9e5e
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_id.c
@@ -0,0 +1,509 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/zone.h>
+#include <sys/cred.h>
+#include <sys/cred_impl.h>
+#include <sys/policy.h>
+#include <sys/lx_types.h>
+
+#define LX_NGROUPS_MAX 32
+
+/* From usr/src/uts/common/syscall/gid.c & uid.c */
+extern int setgid(gid_t);
+extern int setregid(gid_t, gid_t);
+extern int setreuid(uid_t, uid_t);
+extern int setuid(uid_t);
+
+/* From usr/src/uts/common/syscall/groups.c */
+extern int setgroups(int, gid_t *);
+
+long
+lx_getegid(void)
+{
+ return (crgetgid(CRED()));
+}
+
+long
+lx_getegid16(void)
+{
+ return ((int)LX_GID32_TO_GID16(crgetgid(CRED())));
+}
+
+long
+lx_geteuid(void)
+{
+ return (crgetuid(CRED()));
+}
+
+long
+lx_geteuid16(void)
+{
+ return ((int)LX_UID32_TO_UID16(crgetuid(CRED())));
+}
+
+long
+lx_getgid(void)
+{
+ return (crgetrgid(CRED()));
+}
+
+long
+lx_getgid16(void)
+{
+ return ((int)LX_GID32_TO_GID16(crgetrgid(CRED())));
+}
+
+long
+lx_getuid(void)
+{
+ return (crgetruid(CRED()));
+}
+
+long
+lx_getuid16(void)
+{
+ return ((int)LX_UID32_TO_UID16(crgetruid(CRED())));
+}
+
+long
+lx_setgid(gid_t gid)
+{
+ return (setgid(gid));
+}
+
+long
+lx_setgid16(lx_gid16_t gid)
+{
+ return (setgid(LX_GID16_TO_GID32(gid)));
+}
+
+long
+lx_setregid(gid_t rgid, gid_t egid)
+{
+ return (setregid(rgid, egid));
+}
+
+long
+lx_setregid16(lx_gid16_t rgid, lx_gid16_t egid)
+{
+	return (setregid(LX_GID16_TO_GID32(rgid), LX_GID16_TO_GID32(egid)));
+}
+
+long
+lx_setreuid(uid_t ruid, uid_t euid)
+{
+ return (setreuid(ruid, euid));
+}
+
+long
+lx_setreuid16(lx_uid16_t ruid, lx_uid16_t euid)
+{
+ return (setreuid(LX_UID16_TO_UID32(ruid), LX_UID16_TO_UID32(euid)));
+}
+
+long
+lx_setuid(uid_t uid)
+{
+ return (setuid(uid));
+}
+
+long
+lx_setuid16(lx_uid16_t uid)
+{
+ return (setuid(LX_UID16_TO_UID32(uid)));
+}
+
+/*
+ * This function is based on setreuid in common/syscall/uid.c and exists
+ * because illumos does not have a way to explicitly set the saved uid (suid)
+ * from any other system call.
+ */
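+/*
+ * As on Linux, an argument of -1 leaves the corresponding ID unchanged;
+ * e.g. lx_setresuid(-1, uid, -1) modifies only the effective uid.
+ */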
+long
+lx_setresuid(lx_uid_t ruid, lx_uid_t euid, lx_uid_t suid)
+{
+ proc_t *p;
+ int error = 0;
+ int do_nocd = 0;
+ int uidchge = 0;
+ uid_t oldruid = ruid;
+ cred_t *cr, *newcr;
+ zoneid_t zoneid = getzoneid();
+
+ if ((ruid != -1 && (ruid > MAXUID)) ||
+ (euid != -1 && (euid > MAXUID)) ||
+ (suid != -1 && (suid > MAXUID))) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * Need to pre-allocate the new cred structure before grabbing
+ * the p_crlock mutex.
+ */
+ newcr = cralloc();
+
+ p = ttoproc(curthread);
+
+retry:
+ mutex_enter(&p->p_crlock);
+ cr = p->p_cred;
+
+ if (ruid != -1 &&
+ ruid != cr->cr_ruid && ruid != cr->cr_uid &&
+ ruid != cr->cr_suid && secpolicy_allow_setid(cr, ruid, B_FALSE)) {
+ error = EPERM;
+ } else if (euid != -1 &&
+ euid != cr->cr_ruid && euid != cr->cr_uid &&
+ euid != cr->cr_suid && secpolicy_allow_setid(cr, euid, B_FALSE)) {
+ error = EPERM;
+ } else if (suid != -1 &&
+ suid != cr->cr_ruid && suid != cr->cr_uid &&
+ suid != cr->cr_suid && secpolicy_allow_setid(cr, suid, B_FALSE)) {
+ error = EPERM;
+ } else {
+ if (!uidchge && ruid != -1 && cr->cr_ruid != ruid) {
+ /*
+ * The ruid of the process is going to change. In order
+ * to avoid a race condition involving the
+ * process count associated with the newly given ruid,
+ * we increment the count before assigning the
+ * credential to the process.
+ * To do that, we'll have to take pidlock, so we first
+ * release p_crlock.
+ */
+ mutex_exit(&p->p_crlock);
+ uidchge = 1;
+ mutex_enter(&pidlock);
+ upcount_inc(ruid, zoneid);
+ mutex_exit(&pidlock);
+ /*
+ * As we released p_crlock we can't rely on the cr
+ * we read. So retry the whole thing.
+ */
+ goto retry;
+ }
+ crhold(cr);
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+
+ if (euid != -1)
+ newcr->cr_uid = euid;
+ if (suid != -1)
+ newcr->cr_suid = suid;
+ if (ruid != -1) {
+ oldruid = newcr->cr_ruid;
+ newcr->cr_ruid = ruid;
+ ASSERT(ruid != oldruid ? uidchge : 1);
+ }
+
+ /*
+ * A process that gives up its privilege
+ * must be marked to produce no core dump.
+ */
+ if ((cr->cr_uid != newcr->cr_uid ||
+ cr->cr_ruid != newcr->cr_ruid ||
+ cr->cr_suid != newcr->cr_suid))
+ do_nocd = 1;
+
+ crfree(cr);
+ }
+ mutex_exit(&p->p_crlock);
+
+ /*
+ * We decrement the number of processes associated with the oldruid
+ * to match the increment above, even if the ruid of the process
+	 * did not change or an error occurred (oldruid == ruid).
+ */
+ if (uidchge) {
+ ASSERT(oldruid != -1 && ruid != -1);
+ mutex_enter(&pidlock);
+ upcount_dec(oldruid, zoneid);
+ mutex_exit(&pidlock);
+ }
+
+ if (error == 0) {
+ if (do_nocd) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ }
+ crset(p, newcr); /* broadcast to process threads */
+ goto done;
+ }
+ crfree(newcr);
+done:
+ if (error)
+ return (set_errno(error));
+ else
+ return (0);
+}
+
+long
+lx_setresuid16(lx_uid16_t ruid16, lx_uid16_t euid16, lx_uid16_t suid16)
+{
+ long rval;
+
+ rval = lx_setresuid(
+ LX_UID16_TO_UID32(ruid16),
+ LX_UID16_TO_UID32(euid16),
+ LX_UID16_TO_UID32(suid16));
+
+ return (rval);
+}
+
+/*
+ * This function is based on setregid in common/syscall/gid.c
+ */
+long
+lx_setresgid(lx_gid_t rgid, lx_gid_t egid, lx_gid_t sgid)
+{
+ proc_t *p;
+ int error = 0;
+ int do_nocd = 0;
+ cred_t *cr, *newcr;
+
+ if ((rgid != -1 && (rgid > MAXUID)) ||
+ (egid != -1 && (egid > MAXUID)) ||
+ (sgid != -1 && (sgid > MAXUID))) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * Need to pre-allocate the new cred structure before grabbing
+ * the p_crlock mutex.
+ */
+ newcr = cralloc();
+
+ p = ttoproc(curthread);
+ mutex_enter(&p->p_crlock);
+ cr = p->p_cred;
+
+ if (rgid != -1 &&
+ rgid != cr->cr_rgid && rgid != cr->cr_gid &&
+ rgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+ error = EPERM;
+ } else if (egid != -1 &&
+ egid != cr->cr_rgid && egid != cr->cr_gid &&
+ egid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+ error = EPERM;
+ } else if (sgid != -1 &&
+ sgid != cr->cr_rgid && sgid != cr->cr_gid &&
+ sgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+ error = EPERM;
+ } else {
+ crhold(cr);
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+
+ if (egid != -1)
+ newcr->cr_gid = egid;
+ if (sgid != -1)
+ newcr->cr_sgid = sgid;
+ if (rgid != -1)
+ newcr->cr_rgid = rgid;
+
+ /*
+ * A process that gives up its privilege
+ * must be marked to produce no core dump.
+ */
+ if ((cr->cr_gid != newcr->cr_gid ||
+ cr->cr_rgid != newcr->cr_rgid ||
+ cr->cr_sgid != newcr->cr_sgid))
+ do_nocd = 1;
+
+ crfree(cr);
+ }
+ mutex_exit(&p->p_crlock);
+
+ if (error == 0) {
+ if (do_nocd) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ }
+ crset(p, newcr); /* broadcast to process threads */
+ goto done;
+ }
+ crfree(newcr);
+done:
+ if (error)
+ return (set_errno(error));
+ else
+ return (0);
+}
+
+long
+lx_setresgid16(lx_gid16_t rgid16, lx_gid16_t egid16, lx_gid16_t sgid16)
+{
+ long rval;
+
+ rval = lx_setresgid(
+ LX_GID16_TO_GID32(rgid16),
+ LX_GID16_TO_GID32(egid16),
+ LX_GID16_TO_GID32(sgid16));
+
+ return (rval);
+}
+
+/*
+ * Linux defines NGROUPS_MAX to be 32, but on illumos it is only 16. We employ
+ * the terrible hack below so that tests may proceed, if only on DEBUG kernels.
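+ * On a DEBUG kernel, a request for more than ngroups_max (but at most
+ * LX_NGROUPS_MAX) groups is silently truncated to ngroups_max rather than
+ * failing with EINVAL.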
+ */
+int
+lx_helper_setgroups(int ngroups, gid_t *grouplist)
+{
+#ifdef DEBUG
+ if (ngroups > ngroups_max && ngroups <= LX_NGROUPS_MAX)
+ ngroups = ngroups_max;
+#endif /* DEBUG */
+
+ return (setgroups(ngroups, grouplist));
+}
+
+long
+lx_getresuid(lx_uid_t *ruid, lx_uid_t *euid, lx_uid_t *suid)
+{
+ lx_uid_t lx_ruid, lx_euid, lx_suid;
+ cred_t *cr = CRED();
+
+ lx_ruid = (lx_uid_t)crgetruid(cr);
+ lx_euid = (lx_uid_t)crgetuid(cr);
+ lx_suid = (lx_uid_t)crgetsuid(cr);
+
+ if (copyout(&lx_ruid, (void *)ruid, sizeof (lx_uid_t)) != 0)
+ return (set_errno(EFAULT));
+ if (copyout(&lx_euid, (void *)euid, sizeof (lx_uid_t)) != 0)
+ return (set_errno(EFAULT));
+ if (copyout(&lx_suid, (void *)suid, sizeof (lx_uid_t)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+long
+lx_getresuid16(lx_uid16_t *ruid16, lx_uid16_t *euid16, lx_uid16_t *suid16)
+{
+ lx_uid16_t lx_ruid16, lx_euid16, lx_suid16;
+ cred_t *cr = CRED();
+
+ lx_ruid16 = LX_UID32_TO_UID16((lx_uid_t)crgetruid(cr));
+ lx_euid16 = LX_UID32_TO_UID16((lx_uid_t)crgetuid(cr));
+ lx_suid16 = LX_UID32_TO_UID16((lx_uid_t)crgetsuid(cr));
+
+ if (copyout(&lx_ruid16, (void *)ruid16, sizeof (lx_uid16_t)) != 0)
+ return (set_errno(EFAULT));
+ if (copyout(&lx_euid16, (void *)euid16, sizeof (lx_uid16_t)) != 0)
+ return (set_errno(EFAULT));
+ if (copyout(&lx_suid16, (void *)suid16, sizeof (lx_uid16_t)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+long
+lx_getresgid(lx_gid_t *rgid, lx_gid_t *egid, lx_gid_t *sgid)
+{
+ lx_gid_t lx_rgid, lx_egid, lx_sgid;
+ cred_t *cr = CRED();
+
+ lx_rgid = (lx_gid_t)crgetrgid(cr);
+ lx_egid = (lx_gid_t)crgetgid(cr);
+ lx_sgid = (lx_gid_t)crgetsgid(cr);
+
+ if (copyout(&lx_rgid, (void *)rgid, sizeof (lx_gid_t)) != 0)
+ return (set_errno(EFAULT));
+ if (copyout(&lx_egid, (void *)egid, sizeof (lx_gid_t)) != 0)
+ return (set_errno(EFAULT));
+ if (copyout(&lx_sgid, (void *)sgid, sizeof (lx_gid_t)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+long
+lx_getresgid16(lx_gid16_t *rgid16, lx_gid16_t *egid16, lx_gid16_t *sgid16)
+{
+ lx_gid16_t lx_rgid16, lx_egid16, lx_sgid16;
+ cred_t *cr = CRED();
+
+ lx_rgid16 = LX_GID32_TO_GID16((lx_gid_t)crgetrgid(cr));
+ lx_egid16 = LX_GID32_TO_GID16((lx_gid_t)crgetgid(cr));
+ lx_sgid16 = LX_GID32_TO_GID16((lx_gid_t)crgetsgid(cr));
+
+ if (copyout(&lx_rgid16, (void *)rgid16, sizeof (lx_gid16_t)) != 0)
+ return (set_errno(EFAULT));
+ if (copyout(&lx_egid16, (void *)egid16, sizeof (lx_gid16_t)) != 0)
+ return (set_errno(EFAULT));
+ if (copyout(&lx_sgid16, (void *)sgid16, sizeof (lx_gid16_t)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+/*
+ * The lx brand cannot support the setfs[ug]id16/setfs[ug]id calls as that
+ * would require significant rework of the illumos privilege mechanisms, so
+ * instead return the current effective [ug]id.
+ *
+ * In Linux, fsids track effective IDs, so returning the effective IDs works
+ * as a substitute; returning the current value also denotes failure of the
+ * call if the caller had specified something different. We don't need to
+ * worry about setting error codes because the Linux calls don't set any.
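+ *
+ * For example, a caller issuing setfsuid(1000) while the effective uid is
+ * 0 simply gets 0 back, which it must interpret as "no change occurred".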
+ */
+/*ARGSUSED*/
+long
+lx_setfsuid16(uid_t fsuid16)
+{
+ return ((int)LX_UID32_TO_UID16(crgetuid(CRED())));
+}
+
+/*ARGSUSED*/
+long
+lx_setfsgid16(gid_t fsgid16)
+{
+ return ((int)LX_GID32_TO_GID16(crgetgid(CRED())));
+}
+
+/*ARGSUSED*/
+long
+lx_setfsuid(uid_t fsuid)
+{
+ return (crgetuid(CRED()));
+}
+
+/*ARGSUSED*/
+long
+lx_setfsgid(gid_t fsgid)
+{
+ return (crgetgid(CRED()));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c
new file mode 100644
index 0000000000..9d8d88d6f6
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c
@@ -0,0 +1,1865 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/vnode.h>
+#include <sys/fcntl.h>
+#include <sys/termio.h>
+#include <sys/termios.h>
+#include <sys/ptyvar.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <sys/sockio.h>
+#include <sys/stropts.h>
+#include <sys/ptms.h>
+#include <sys/cred.h>
+#include <sys/cred_impl.h>
+#include <sys/sysmacros.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_ptm.h>
+#include <sys/brand.h>
+#include <sys/sunddi.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/session.h>
+#include <sys/kmem.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <net/if_arp.h>
+#include <sys/ioccom.h>
+#include <sys/dtrace.h>
+#include <sys/ethernet.h>
+#include <sys/dlpi.h>
+#include <sys/lx_autofs.h>
+#include <sys/netstack.h>
+#include <inet/ip.h>
+#include <inet/ip_if.h>
+#include <sys/dkio.h>
+#include <sys/sdt.h>
+
+/*
+ * Linux ioctl types
+ */
+#define LX_IOC_TYPE_HD 0x03
+#define LX_IOC_TYPE_BLK 0x12
+#define LX_IOC_TYPE_FD 0x54
+#define LX_IOC_TYPE_DTRACE 0x68
+#define LX_IOC_TYPE_SOCK 0x89
+#define LX_IOC_TYPE_AUTOFS 0x93
+
+/*
+ * Supported ioctls
+ */
+#define LX_HDIO_GETGEO 0x0301
+#define LX_BLKGETSIZE 0x1260
+#define LX_BLKSSZGET 0x1268
+#define LX_BLKGETSIZE64 0x80081272
+#define LX_TCGETS 0x5401
+#define LX_TCSETS 0x5402
+#define LX_TCSETSW 0x5403
+#define LX_TCSETSF 0x5404
+#define LX_TCGETA 0x5405
+#define LX_TCSETA 0x5406
+#define LX_TCSETAW 0x5407
+#define LX_TCSETAF 0x5408
+#define LX_TCSBRK 0x5409
+#define LX_TCXONC 0x540a
+#define LX_TCFLSH 0x540b
+#define LX_TIOCEXCL 0x540c
+#define LX_TIOCNXCL 0x540d
+#define LX_TIOCSCTTY 0x540e
+#define LX_TIOCGPGRP 0x540f
+#define LX_TIOCSPGRP 0x5410
+#define LX_TIOCOUTQ 0x5411
+#define LX_TIOCSTI 0x5412
+#define LX_TIOCGWINSZ 0x5413
+#define LX_TIOCSWINSZ 0x5414
+#define LX_TIOCMGET 0x5415
+#define LX_TIOCMBIS 0x5416
+#define LX_TIOCMBIC 0x5417
+#define LX_TIOCMSET 0x5418
+#define LX_TIOCGSOFTCAR 0x5419
+#define LX_TIOCSSOFTCAR 0x541a
+#define LX_FIONREAD 0x541b
+#define LX_TIOCPKT 0x5420
+#define LX_FIONBIO 0x5421
+#define LX_TIOCNOTTY 0x5422
+#define LX_TIOCSETD 0x5423
+#define LX_TIOCGETD 0x5424
+#define LX_TCSBRKP 0x5425
+#define LX_TIOCGSID 0x5429
+#define LX_TIOCGPTN 0x80045430
+#define LX_TIOCSPTLCK 0x40045431
+#define LX_FIONCLEX 0x5450
+#define LX_FIOCLEX 0x5451
+#define LX_FIOASYNC 0x5452
+#define LX_FIOSETOWN 0x8901
+#define LX_SIOCSPGRP 0x8902
+#define LX_FIOGETOWN 0x8903
+#define LX_SIOCGPGRP 0x8904
+#define LX_SIOCATMARK 0x8905
+#define LX_SIOCGSTAMP 0x8906
+#define LX_SIOCADDRT 0x890b
+#define LX_SIOCDELRT 0x890c
+#define LX_SIOCRTMSG 0x890d
+#define LX_SIOCGIFNAME 0x8910
+#define LX_SIOCSIFLINK 0x8911
+#define LX_SIOCGIFCONF 0x8912
+#define LX_SIOCGIFFLAGS 0x8913
+#define LX_SIOCSIFFLAGS 0x8914
+#define LX_SIOCGIFADDR 0x8915
+#define LX_SIOCSIFADDR 0x8916
+#define LX_SIOCGIFDSTADDR 0x8917
+#define LX_SIOCSIFDSTADDR 0x8918
+#define LX_SIOCGIFBRDADDR 0x8919
+#define LX_SIOCSIFBRDADDR 0x891a
+#define LX_SIOCGIFNETMASK 0x891b
+#define LX_SIOCSIFNETMASK 0x891c
+#define LX_SIOCGIFMETRIC 0x891d
+#define LX_SIOCSIFMETRIC 0x891e
+#define LX_SIOCGIFMEM 0x891f
+#define LX_SIOCSIFMEM 0x8920
+#define LX_SIOCGIFMTU 0x8921
+#define LX_SIOCSIFMTU 0x8922
+#define LX_SIOCSIFNAME 0x8923
+#define LX_SIOCSIFHWADDR 0x8924
+#define LX_SIOCGIFENCAP 0x8925
+#define LX_SIOCSIFENCAP 0x8926
+#define LX_SIOCGIFHWADDR 0x8927
+#define LX_SIOCGIFSLAVE 0x8929
+#define LX_SIOCSIFSLAVE 0x8930
+#define LX_SIOCADDMULTI 0x8931
+#define LX_SIOCDELMULTI 0x8932
+#define LX_SIOCGIFINDEX 0x8933
+#define LX_SIOCSIFPFLAGS 0x8934
+#define LX_SIOCGIFPFLAGS 0x8935
+#define LX_SIOCDIFADDR 0x8936
+#define LX_SIOCSIFHWBROADCAST 0x8937
+#define LX_SIOCGIFCOUNT 0x8938
+#define LX_SIOCGIFBR 0x8940
+#define LX_SIOCSIFBR 0x8941
+#define LX_SIOCGIFTXQLEN 0x8942
+#define LX_SIOCSIFTXQLEN 0x8943
+#define LX_SIOCETHTOOL 0x8946
+#define LX_SIOCGMIIPHY 0x8947
+#define LX_SIOCGMIIREG 0x8948
+#define LX_SIOCSMIIREG 0x8949
+#define LX_SIOCWANDEV 0x894a
+#define LX_SIOCOUTQNSD 0x894b
+#define LX_SIOCDARP 0x8953
+#define LX_SIOCGARP 0x8954
+#define LX_SIOCSARP 0x8955
+#define LX_SIOCDRARP 0x8960
+#define LX_SIOCGRARP 0x8961
+#define LX_SIOCSRARP 0x8962
+#define LX_SIOCGIFMAP 0x8970
+#define LX_SIOCSIFMAP 0x8971
+#define LX_SIOCADDDLCI 0x8980
+#define LX_SIOCDELDLCI 0x8981
+#define LX_SIOCGIFVLAN 0x8982
+#define LX_SIOCSIFVLAN 0x8983
+#define LX_SIOCBONDENSLAVE 0x8990
+#define LX_SIOCBONDRELEASE 0x8991
+#define LX_SIOCBONDSETHWADDR 0x8992
+#define LX_SIOCBONDSLAVEINFOQUERY 0x8993
+#define LX_SIOCBONDINFOQUERY 0x8994
+#define LX_SIOCBONDCHANGEACTIVE 0x8995
+#define LX_SIOCBRADDBR 0x89a0
+#define LX_SIOCBRDELBR 0x89a1
+#define LX_SIOCBRADDIF 0x89a2
+#define LX_SIOCBRDELIF 0x89a3
+#define LX_SIOCSHWTSTAMP 0x89b0
+#define LX_SIOCGHWTSTAMP 0x89b1
+#define LX_SIOCDEVPRIVATE 0x89f0
+#define LX_SIOCPROTOPRIVATE 0x89e0
+
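+/*
+ * FLUSER() passes the caller's data model along with the file flags so that
+ * the driver can interpret a user-space ioctl argument correctly, while
+ * FLFAKE() adds FKIOCTL so that the argument is treated as a kernel buffer.
+ */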
+#define	FLUSER(fp)	((fp)->f_flag | get_udatamodel())
+#define	FLFAKE(fp)	((fp)->f_flag | FKIOCTL)
+
+/*
+ * LX_NCC must be different from LX_NCCS since while the termio and termios
+ * structures may look similar they are fundamentally different sizes and
+ * have different members.
+ */
+#define LX_NCC 8
+#define LX_NCCS 19
+
+struct lx_termio {
+ unsigned short c_iflag; /* input mode flags */
+ unsigned short c_oflag; /* output mode flags */
+ unsigned short c_cflag; /* control mode flags */
+ unsigned short c_lflag; /* local mode flags */
+ unsigned char c_line; /* line discipline */
+ unsigned char c_cc[LX_NCC]; /* control characters */
+};
+
+struct lx_termios {
+ uint32_t c_iflag; /* input mode flags */
+ uint32_t c_oflag; /* output mode flags */
+ uint32_t c_cflag; /* control mode flags */
+ uint32_t c_lflag; /* local mode flags */
+ unsigned char c_line; /* line discipline */
+ unsigned char c_cc[LX_NCCS]; /* control characters */
+};
+
+/*
+ * c_cc characters which are valid for lx_termio and lx_termios
+ */
+#define LX_VINTR 0
+#define LX_VQUIT 1
+#define LX_VERASE 2
+#define LX_VKILL 3
+#define LX_VEOF 4
+#define LX_VTIME 5
+#define LX_VMIN 6
+#define LX_VSWTC 7
+
+/*
+ * c_cc characters which are valid for lx_termios
+ */
+#define LX_VSTART 8
+#define LX_VSTOP 9
+#define LX_VSUSP 10
+#define LX_VEOL 11
+#define LX_VREPRINT 12
+#define LX_VDISCARD 13
+#define LX_VWERASE 14
+#define LX_VLNEXT 15
+#define LX_VEOL2 16
+
+/*
+ * Defaults needed for SunOS to Linux format conversion.
+ * See INIT_C_CC in linux-stable/include/asm-generic/termios.h
+ */
+#define LX_DEF_VTIME 0
+#define LX_DEF_VMIN 1
+#define LX_DEF_VEOF '\004'
+#define LX_DEF_VEOL 0
+
+/* VSD key for lx_cc information */
+static uint_t lx_ioctl_vsd = 0;
+
+
+/* Terminal helpers */
+
+static void
+l2s_termios(struct lx_termios *l_tios, struct termios *s_tios)
+{
+ ASSERT((l_tios != NULL) && (s_tios != NULL));
+
+ bzero(s_tios, sizeof (*s_tios));
+
+ s_tios->c_iflag = l_tios->c_iflag;
+ s_tios->c_oflag = l_tios->c_oflag;
+ s_tios->c_cflag = l_tios->c_cflag;
+ s_tios->c_lflag = l_tios->c_lflag;
+
+ if (s_tios->c_lflag & ICANON) {
+ s_tios->c_cc[VEOF] = l_tios->c_cc[LX_VEOF];
+ s_tios->c_cc[VEOL] = l_tios->c_cc[LX_VEOL];
+ } else {
+ s_tios->c_cc[VMIN] = l_tios->c_cc[LX_VMIN];
+ s_tios->c_cc[VTIME] = l_tios->c_cc[LX_VTIME];
+ }
+
+ s_tios->c_cc[VEOL2] = l_tios->c_cc[LX_VEOL2];
+ s_tios->c_cc[VERASE] = l_tios->c_cc[LX_VERASE];
+ s_tios->c_cc[VKILL] = l_tios->c_cc[LX_VKILL];
+ s_tios->c_cc[VREPRINT] = l_tios->c_cc[LX_VREPRINT];
+ s_tios->c_cc[VLNEXT] = l_tios->c_cc[LX_VLNEXT];
+ s_tios->c_cc[VWERASE] = l_tios->c_cc[LX_VWERASE];
+ s_tios->c_cc[VINTR] = l_tios->c_cc[LX_VINTR];
+ s_tios->c_cc[VQUIT] = l_tios->c_cc[LX_VQUIT];
+ s_tios->c_cc[VSWTCH] = l_tios->c_cc[LX_VSWTC];
+ s_tios->c_cc[VSTART] = l_tios->c_cc[LX_VSTART];
+ s_tios->c_cc[VSTOP] = l_tios->c_cc[LX_VSTOP];
+ s_tios->c_cc[VSUSP] = l_tios->c_cc[LX_VSUSP];
+ s_tios->c_cc[VDISCARD] = l_tios->c_cc[LX_VDISCARD];
+}
+
+static void
+l2s_termio(struct lx_termio *l_tio, struct termio *s_tio)
+{
+ ASSERT((l_tio != NULL) && (s_tio != NULL));
+
+ bzero(s_tio, sizeof (*s_tio));
+
+ s_tio->c_iflag = l_tio->c_iflag;
+ s_tio->c_oflag = l_tio->c_oflag;
+ s_tio->c_cflag = l_tio->c_cflag;
+ s_tio->c_lflag = l_tio->c_lflag;
+
+ if (s_tio->c_lflag & ICANON) {
+ s_tio->c_cc[VEOF] = l_tio->c_cc[LX_VEOF];
+ } else {
+ s_tio->c_cc[VMIN] = l_tio->c_cc[LX_VMIN];
+ s_tio->c_cc[VTIME] = l_tio->c_cc[LX_VTIME];
+ }
+
+ s_tio->c_cc[VINTR] = l_tio->c_cc[LX_VINTR];
+ s_tio->c_cc[VQUIT] = l_tio->c_cc[LX_VQUIT];
+ s_tio->c_cc[VERASE] = l_tio->c_cc[LX_VERASE];
+ s_tio->c_cc[VKILL] = l_tio->c_cc[LX_VKILL];
+ s_tio->c_cc[VSWTCH] = l_tio->c_cc[LX_VSWTC];
+}
+
+static void
+termios2lx_cc(struct lx_termios *l_tios, struct lx_cc *lio)
+{
+ ASSERT((l_tios != NULL) && (lio != NULL));
+
+ bzero(lio, sizeof (*lio));
+
+ lio->veof = l_tios->c_cc[LX_VEOF];
+ lio->veol = l_tios->c_cc[LX_VEOL];
+ lio->vmin = l_tios->c_cc[LX_VMIN];
+ lio->vtime = l_tios->c_cc[LX_VTIME];
+}
+
+static void
+termio2lx_cc(struct lx_termio *l_tio, struct lx_cc *lio)
+{
+ ASSERT((l_tio != NULL) && (lio != NULL));
+
+ bzero(lio, sizeof (*lio));
+
+ lio->veof = l_tio->c_cc[LX_VEOF];
+ lio->veol = 0;
+ lio->vmin = l_tio->c_cc[LX_VMIN];
+ lio->vtime = l_tio->c_cc[LX_VTIME];
+}
+
+static void
+s2l_termios(struct termios *s_tios, struct lx_termios *l_tios)
+{
+ ASSERT((s_tios != NULL) && (l_tios != NULL));
+
+ bzero(l_tios, sizeof (*l_tios));
+
+ l_tios->c_iflag = s_tios->c_iflag;
+ l_tios->c_oflag = s_tios->c_oflag;
+ l_tios->c_cflag = s_tios->c_cflag;
+ l_tios->c_lflag = s_tios->c_lflag;
+
+ /*
+ * Since use of the VMIN/VTIME and VEOF/VEOL control characters is
+ * mutually exclusive (determined by ICANON), SunOS aliases them in the
+ * c_cc field in termio/termios. Linux does not perform this aliasing,
+ * so it expects that the default values are present regardless of
+ * ICANON status.
+ *
+ * These defaults can be overridden later by any values stored via the
+ * lx_cc mechanism.
+ */
+ if (s_tios->c_lflag & ICANON) {
+ l_tios->c_cc[LX_VEOF] = s_tios->c_cc[VEOF];
+ l_tios->c_cc[LX_VEOL] = s_tios->c_cc[VEOL];
+ l_tios->c_cc[LX_VTIME] = LX_DEF_VTIME;
+ l_tios->c_cc[LX_VMIN] = LX_DEF_VMIN;
+
+ } else {
+ l_tios->c_cc[LX_VMIN] = s_tios->c_cc[VMIN];
+ l_tios->c_cc[LX_VTIME] = s_tios->c_cc[VTIME];
+ l_tios->c_cc[LX_VEOF] = LX_DEF_VEOF;
+ l_tios->c_cc[LX_VEOL] = LX_DEF_VEOL;
+ }
+
+ l_tios->c_cc[LX_VEOL2] = s_tios->c_cc[VEOL2];
+ l_tios->c_cc[LX_VERASE] = s_tios->c_cc[VERASE];
+ l_tios->c_cc[LX_VKILL] = s_tios->c_cc[VKILL];
+ l_tios->c_cc[LX_VREPRINT] = s_tios->c_cc[VREPRINT];
+ l_tios->c_cc[LX_VLNEXT] = s_tios->c_cc[VLNEXT];
+ l_tios->c_cc[LX_VWERASE] = s_tios->c_cc[VWERASE];
+ l_tios->c_cc[LX_VINTR] = s_tios->c_cc[VINTR];
+ l_tios->c_cc[LX_VQUIT] = s_tios->c_cc[VQUIT];
+ l_tios->c_cc[LX_VSWTC] = s_tios->c_cc[VSWTCH];
+ l_tios->c_cc[LX_VSTART] = s_tios->c_cc[VSTART];
+ l_tios->c_cc[LX_VSTOP] = s_tios->c_cc[VSTOP];
+ l_tios->c_cc[LX_VSUSP] = s_tios->c_cc[VSUSP];
+ l_tios->c_cc[LX_VDISCARD] = s_tios->c_cc[VDISCARD];
+}
+
+static void
+s2l_termio(struct termio *s_tio, struct lx_termio *l_tio)
+{
+ ASSERT((s_tio != NULL) && (l_tio != NULL));
+
+ bzero(l_tio, sizeof (*l_tio));
+
+ l_tio->c_iflag = s_tio->c_iflag;
+ l_tio->c_oflag = s_tio->c_oflag;
+ l_tio->c_cflag = s_tio->c_cflag;
+ l_tio->c_lflag = s_tio->c_lflag;
+
+ if (s_tio->c_lflag & ICANON) {
+ l_tio->c_cc[LX_VEOF] = s_tio->c_cc[VEOF];
+ l_tio->c_cc[LX_VTIME] = LX_DEF_VTIME;
+ l_tio->c_cc[LX_VMIN] = LX_DEF_VMIN;
+ } else {
+ l_tio->c_cc[LX_VMIN] = s_tio->c_cc[VMIN];
+ l_tio->c_cc[LX_VTIME] = s_tio->c_cc[VTIME];
+ l_tio->c_cc[LX_VEOF] = LX_DEF_VEOF;
+ }
+
+ l_tio->c_cc[LX_VINTR] = s_tio->c_cc[VINTR];
+ l_tio->c_cc[LX_VQUIT] = s_tio->c_cc[VQUIT];
+ l_tio->c_cc[LX_VERASE] = s_tio->c_cc[VERASE];
+ l_tio->c_cc[LX_VKILL] = s_tio->c_cc[VKILL];
+ l_tio->c_cc[LX_VSWTC] = s_tio->c_cc[VSWTCH];
+}
+
+static void
+set_lx_cc(vnode_t *vp, struct lx_cc *lio)
+{
+ struct lx_cc *cur;
+ /*
+ * Linux expects that the termio/termios control characters are
+ * preserved more strictly than illumos supports. In order to preserve
+ * the illusion that the characters are maintained, they are stored as
+ * vnode-specific data.
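+	 * The saved values are merged back into the results of later TCGETS
+	 * and TCGETA translations (see ict_tcgets_native() and ict_tcgeta()).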
+ */
+ mutex_enter(&vp->v_vsd_lock);
+ cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd);
+ if (cur == NULL) {
+ cur = kmem_alloc(sizeof (struct lx_cc), KM_SLEEP);
+ bcopy(lio, cur, sizeof (struct lx_cc));
+ (void) vsd_set(vp, lx_ioctl_vsd, cur);
+ } else {
+ bcopy(lio, cur, sizeof (struct lx_cc));
+ }
+ mutex_exit(&vp->v_vsd_lock);
+}
+
+static int
+get_lx_cc(vnode_t *vp, struct lx_cc *lio)
+{
+ struct lx_cc *cur;
+ int rv = 1;
+ mutex_enter(&vp->v_vsd_lock);
+ cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd);
+ if (cur != NULL) {
+ bcopy(cur, lio, sizeof (*lio));
+ rv = 0;
+ }
+ mutex_exit(&vp->v_vsd_lock);
+ return (rv);
+}
+
+/* Socket helpers */
+
+typedef struct lx_ifreq32 {
+ char ifr_name[IFNAMSIZ];
+ union {
+ struct sockaddr ifru_addr;
+ } ifr_ifrn;
+} lx_ifreq32_t;
+
+typedef struct lx_ifreq64 {
+ char ifr_name[IFNAMSIZ];
+ union {
+ struct sockaddr ifru_addr;
+ /* pad this out to the Linux size */
+ uint64_t ifmap[3];
+ } ifr_ifrn;
+} lx_ifreq64_t;
+
+typedef struct lx_ifconf32 {
+ int32_t if_len;
+ caddr32_t if_buf;
+} lx_ifconf32_t;
+
+typedef struct lx_ifconf64 {
+ int32_t if_len;
+ caddr_t if_buf;
+} lx_ifconf64_t;
+
+
+/* Generic translators */
+
+/* ARGSUSED */
+static int
+ict_pass(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ int error = 0;
+ int rv;
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv,
+ NULL);
+ return ((error != 0) ? set_errno(error) : 0);
+}
+
+/* ARGSUSED */
+static int
+ict_fionbio(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ vnode_t *vp;
+ int32_t iflag, flags;
+ int error;
+
+ if (copyin((caddr_t)arg, &iflag, sizeof (iflag)))
+ return (set_errno(EFAULT));
+
+ mutex_enter(&fp->f_tlock);
+ vp = fp->f_vnode;
+ flags = fp->f_flag;
+	/* Map Linux FIONBIO semantics onto the FNONBLOCK file flag */
+ if (iflag)
+ flags |= FNONBLOCK;
+ else
+ flags &= ~FNONBLOCK;
+ /* push the flag down */
+ error = VOP_SETFL(vp, fp->f_flag, flags, fp->f_cred, NULL);
+ fp->f_flag = flags;
+ mutex_exit(&fp->f_tlock);
+ return ((error != 0) ? set_errno(error) : 0);
+}
+
+/* ARGSUSED */
+static int
+ict_fionread(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ vnode_t *vp;
+ struct vattr vattr;
+ int error = 0;
+ int rv;
+ /*
+ * offset is int32_t because that is what FIONREAD is defined in terms
+ * of. We cap at INT_MAX as in other cases for this ioctl.
+ */
+ int32_t offset;
+
+ vp = fp->f_vnode;
+
+ if (vp->v_type == VREG || vp->v_type == VDIR) {
+ vattr.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred, NULL);
+ if (error != 0)
+ return (set_errno(error));
+ offset = MIN(vattr.va_size - fp->f_offset, INT_MAX);
+ if (copyout(&offset, (caddr_t)arg, sizeof (offset)))
+ return (set_errno(EFAULT));
+ } else {
+ error = VOP_IOCTL(vp, FIONREAD, arg, FLUSER(fp), fp->f_cred,
+ &rv, NULL);
+ if (error)
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+/*
+ * hard disk-related translators
+ *
+ * Note that the normal disk ioctls only work for VCHR devices. See spec_ioctl
+ * which will return ENOTTY for a VBLK device. However, fdisk, etc. expect to
+ * work with block devices.
+ *
+ * We expect a zvol to be the primary block device we're interacting with and
+ * we use the zone's lxzd_vdisks list to handle zvols specifically.
+ */
+
+typedef struct lx_hd_geom {
+ unsigned char heads;
+ unsigned char sectors;
+ unsigned short cylinders;
+ unsigned long start;
+} lx_hd_geom_t;
+
+/*
+ * Return the volsize and blksize for the correct virtual "disk" for the zone.
+ * Only these two values are returned in 'vdp' within this code.
+ *
+ * A virtual "disk" can be a zvol visible within the zone, but most zones are
+ * not configured with a delegated dataset necessary to make zvols visible.
+ *
+ * To make various applications happy, lx also pretends that our root filesystem
+ * (normally within the zone's dataset) lives on a virtual disk. We have a
+ * /dev/zfsds0 symlink which points at /dev/zfs. This appears in various places
+ * to give the illusion of root's disk. For example, see:
+ * /proc/partitions
+ * /sys/block/zfsds0
+ * /sys/devices/zfs/zfsds0
+ * If an application issues the various LX_HDIO_GETGEO, LX_BLKGETSIZE*, or
+ * LX_BLKSSZGET ioctls on /dev/zfs (that is, minor number 0), we want to return
+ * something sane. In this case, we return the total size (which is normally
+ * limited by a quota) of the dataset that the zone root lives on.
+ */
+static boolean_t
+lx_lookup_zdsk_info(lx_zone_data_t *lxzd, dev_t dev, lx_virt_disk_t *vdp)
+{
+ lx_virt_disk_t *vd;
+
+ /* Handle /dev/zfs */
+ if (getminor(dev) == 0) {
+ struct statvfs64 sv;
+
+ if (VFS_STATVFS(curzone->zone_rootvp->v_vfsp, &sv) == 0) {
+ vdp->lxvd_volsize = sv.f_blocks * sv.f_frsize;
+ vdp->lxvd_blksize = sv.f_frsize;
+ } else {
+ vdp->lxvd_volsize = 0;
+ /* always set to prevent potential divide-by-zero */
+ vdp->lxvd_blksize = 512;
+ }
+
+ return (B_TRUE);
+ }
+
+ vd = list_head(lxzd->lxzd_vdisks);
+ while (vd != NULL) {
+ if (vd->lxvd_type == LXVD_ZVOL && vd->lxvd_real_dev == dev) {
+ bzero(vdp, sizeof (*vdp));
+ vdp->lxvd_volsize = vd->lxvd_volsize;
+ vdp->lxvd_blksize = vd->lxvd_blksize;
+ return (B_TRUE);
+ }
+ vd = list_next(lxzd->lxzd_vdisks, vd);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * See zvol_ioctl() which always fails for DKIOCGGEOM. The geometry for a
+ * zvol (or really any modern disk) is made up, so we do that here as well.
+ */
+/* ARGSUSED */
+static int
+ict_hdgetgeo(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ lx_hd_geom_t lx_geom;
+ lx_zone_data_t *lxzd;
+
+ if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK)
+ return (set_errno(EINVAL));
+
+ lxzd = ztolxzd(curproc->p_zone);
+ ASSERT(lxzd != NULL);
+ ASSERT(lxzd->lxzd_vdisks != NULL);
+
+ if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) {
+ lx_virt_disk_t vd;
+
+ if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd) ||
+ vd.lxvd_volsize == 0 || vd.lxvd_blksize == 0) {
+ /* should only happen if new zvol */
+ bzero(&lx_geom, sizeof (lx_geom));
+ } else {
+ const diskaddr_t blks =
+ MAX(1, vd.lxvd_volsize / vd.lxvd_blksize);
+
+ /*
+ * Attempt to conjure up a Cylinder-Head-Sector
+ * geometry for the given virtual disk size.
+ */
+ if (blks <= (63*16*65535)) {
+ /*
+ * Use traditional BIOS-style geometry for
+ * adequately small disks.
+ */
+ lx_geom.sectors = 63;
+ lx_geom.heads = 16;
+ lx_geom.cylinders = MAX(1, (blks / (63 * 16)));
+ } else if (blks <= (64*32*65535)) {
+ /* 1MB per cylinder for 512-byte sectors */
+ lx_geom.sectors = 64;
+ lx_geom.heads = 32;
+ lx_geom.cylinders = (blks / (64 * 32));
+ } else {
+ /*
+ * Max out the geometry sizing for large disks.
+ * This may not be adequate for truely huge
+				 * This may not be adequate for truly huge
+ * for those with a 512-byte blocksize), but it
+ * is the best we can do with the given struct.
+ */
+ lx_geom.sectors = 255;
+ lx_geom.heads = 255;
+ lx_geom.cylinders = MIN(65535,
+ (blks / (255*255)));
+ }
+ lx_geom.start = 0;
+ }
+ } else {
+ int res, rv;
+ struct dk_geom geom;
+
+ res = VOP_IOCTL(fp->f_vnode, DKIOCGGEOM, (intptr_t)&geom,
+ fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+ if (res > 0)
+ return (set_errno(res));
+
+ lx_geom.heads = geom.dkg_nhead;
+ lx_geom.sectors = geom.dkg_nsect;
+ lx_geom.cylinders = geom.dkg_ncyl;
+ lx_geom.start = 0;
+ }
+
+ if (copyout(&lx_geom, (caddr_t)arg, sizeof (lx_geom)))
+ return (set_errno(EFAULT));
+ return (0);
+}
+
+/*
+ * Per the Linux sd(4) man page, get the number of sectors. The linux/fs.h
+ * header says these are 512-byte sectors.
+ */
+/* ARGSUSED */
+static int
+ict_blkgetsize(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ diskaddr_t tot;
+ lx_zone_data_t *lxzd;
+
+ if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK)
+ return (set_errno(EINVAL));
+
+ lxzd = ztolxzd(curproc->p_zone);
+ ASSERT(lxzd != NULL);
+ ASSERT(lxzd->lxzd_vdisks != NULL);
+
+ if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) {
+ lx_virt_disk_t vd;
+
+ if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd)) {
+ /* should only happen if new zvol */
+ tot = 0;
+ } else {
+ tot = vd.lxvd_volsize / 512;
+ }
+ } else {
+ int res, rv;
+ struct dk_minfo minfo;
+
+ res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo,
+ fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+ if (res > 0)
+ return (set_errno(res));
+
+ tot = minfo.dki_capacity;
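+		/* dki_capacity is in dki_lbsize units; scale to 512b sectors */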
+ if (minfo.dki_lbsize > 512) {
+ uint_t bsize = minfo.dki_lbsize / 512;
+
+ tot *= bsize;
+ }
+ }
+
+ if (copyout(&tot, (caddr_t)arg, sizeof (long)))
+ return (set_errno(EFAULT));
+ return (0);
+}
+
+/*
+ * Get the sector size (i.e. the logical block size).
+ */
+/* ARGSUSED */
+static int
+ict_blkgetssize(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ uint_t bsize;
+ lx_zone_data_t *lxzd;
+
+ if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK)
+ return (set_errno(EINVAL));
+
+ lxzd = ztolxzd(curproc->p_zone);
+ ASSERT(lxzd != NULL);
+ ASSERT(lxzd->lxzd_vdisks != NULL);
+
+ if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) {
+ lx_virt_disk_t vd;
+
+ if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd)) {
+ /* should only happen if new zvol */
+ bsize = 0;
+ } else {
+ bsize = (uint_t)vd.lxvd_blksize;
+ }
+ } else {
+ int res, rv;
+ struct dk_minfo minfo;
+
+ res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo,
+ fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+ if (res > 0)
+ return (set_errno(res));
+
+ bsize = (uint_t)minfo.dki_lbsize;
+ }
+
+ if (copyout(&bsize, (caddr_t)arg, sizeof (bsize)))
+ return (set_errno(EFAULT));
+ return (0);
+}
+
+/*
+ * Get the size. The linux/fs.h header says it is in bytes.
+ */
+/* ARGSUSED */
+static int
+ict_blkgetsize64(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ uint64_t tot;
+ lx_zone_data_t *lxzd;
+
+ if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK)
+ return (set_errno(EINVAL));
+
+ lxzd = ztolxzd(curproc->p_zone);
+ ASSERT(lxzd != NULL);
+ ASSERT(lxzd->lxzd_vdisks != NULL);
+
+ if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) {
+ lx_virt_disk_t vd;
+
+ if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd)) {
+ /* should only happen if new zvol */
+ tot = 0;
+ } else {
+ tot = vd.lxvd_volsize;
+ }
+ } else {
+ int res, rv;
+ struct dk_minfo minfo;
+
+ res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo,
+ fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+ if (res > 0)
+ return (set_errno(res));
+
+ tot = minfo.dki_capacity * minfo.dki_lbsize;
+ }
+
+ if (copyout(&tot, (caddr_t)arg, sizeof (uint64_t)))
+ return (set_errno(EFAULT));
+ return (0);
+}
+
+/* Terminal-related translators */
+
+/* ARGSUSED */
+static int
+ict_tcsets(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ struct lx_termios l_tios;
+ struct termios s_tios;
+ struct lx_cc lio;
+ int error, rv;
+
+ ASSERT(cmd == TCSETS || cmd == TCSETSW || cmd == TCSETSF);
+
+ if (copyin((struct lx_termios *)arg, &l_tios, sizeof (l_tios)) != 0)
+ return (set_errno(EFAULT));
+ termios2lx_cc(&l_tios, &lio);
+ l2s_termios(&l_tios, &s_tios);
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios,
+ FLFAKE(fp), fp->f_cred, &rv, NULL);
+ if (error)
+ return (set_errno(error));
+ /* preserve lx_cc */
+ set_lx_cc(fp->f_vnode, &lio);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ict_tcseta(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ struct lx_termio l_tio;
+ struct termio s_tio;
+ struct lx_cc lio;
+ int error, rv;
+
+ ASSERT(cmd == TCSETA || cmd == TCSETAW || cmd == TCSETAF);
+
+ if (copyin((struct lx_termio *)arg, &l_tio, sizeof (l_tio)) != 0)
+ return (set_errno(EFAULT));
+ l2s_termio(&l_tio, &s_tio);
+ termio2lx_cc(&l_tio, &lio);
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio,
+ FLFAKE(fp), fp->f_cred, &rv, NULL);
+ if (error)
+ return (set_errno(error));
+ /* preserve lx_cc */
+ set_lx_cc(fp->f_vnode, &lio);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ict_tcgets_ptm(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ struct lx_termios l_tios;
+ struct termios s_tios, *s_tiosd;
+ uint_t s_tiosl;
+
+	/*
+	 * Rather than querying the device, return the system default termios
+	 * settings taken from the "ttymodes" property.
+	 */
+ if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
+ DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&s_tiosd,
+ &s_tiosl) != DDI_SUCCESS)
+		return (set_errno(EIO));
+ ASSERT(s_tiosl == sizeof (*s_tiosd));
+ bcopy(s_tiosd, &s_tios, sizeof (s_tios));
+ ddi_prop_free(s_tiosd);
+
+ /* Now munge the data to how Linux wants it. */
+ s2l_termios(&s_tios, &l_tios);
+ if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ict_tcgets_native(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ struct lx_termios l_tios;
+ struct termios s_tios;
+ struct lx_cc lio;
+ int error, rv;
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios,
+ FLFAKE(fp), fp->f_cred, &rv, NULL);
+ if (error)
+ return (set_errno(error));
+
+ /* Now munge the data to how Linux wants it. */
+ s2l_termios(&s_tios, &l_tios);
+
+ /* return preserved lx_cc */
+ if (get_lx_cc(fp->f_vnode, &lio) == 0) {
+ l_tios.c_cc[LX_VEOF] = lio.veof;
+ l_tios.c_cc[LX_VEOL] = lio.veol;
+ l_tios.c_cc[LX_VMIN] = lio.vmin;
+ l_tios.c_cc[LX_VTIME] = lio.vtime;
+ }
+
+ if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ict_tcgets(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ if (getmajor(fp->f_vnode->v_rdev) == ddi_name_to_major(LX_PTM_DRV))
+ return (ict_tcgets_ptm(fp, cmd, arg, lxcmd));
+ else
+ return (ict_tcgets_native(fp, cmd, arg, lxcmd));
+}
+
+/* ARGSUSED */
+static int
+ict_tcgeta(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ struct lx_termio l_tio;
+ struct termio s_tio;
+ struct lx_cc lio;
+ int error, rv;
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio,
+ FLFAKE(fp), fp->f_cred, &rv, NULL);
+ if (error)
+ return (set_errno(error));
+
+ s2l_termio(&s_tio, &l_tio);
+ /* return preserved lx_cc */
+ if (get_lx_cc(fp->f_vnode, &lio) == 0) {
+ l_tio.c_cc[LX_VEOF] = lio.veof;
+ l_tio.c_cc[LX_VMIN] = lio.vmin;
+ l_tio.c_cc[LX_VTIME] = lio.vtime;
+ }
+
+ if (copyout(&l_tio, (struct lx_termios *)arg, sizeof (l_tio)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ict_tiocspgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ pid_t lpid, spid, tid;
+ int error, rv;
+
+	/* The Linux pid must be converted to its illumos equivalent */
+ if (copyin((pid_t *)arg, &lpid, sizeof (lpid)) < 0)
+ return (set_errno(EFAULT));
+ if (lx_lpid_to_spair(lpid, &spid, &tid) < 0)
+ return (set_errno(EPERM));
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spid,
+	    fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+ return ((error != 0) ? set_errno(error) : 0);
+}
+
+/* ARGSUSED */
+static int
+ict_tcsbrkp(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ int rv, error;
+ /* use null duration to emulate TCSBRKP */
+ int dur = 0;
+ error = VOP_IOCTL(fp->f_vnode, TCSBRK, (intptr_t)&dur,
+ FLFAKE(fp), fp->f_cred, &rv, NULL);
+ return ((error != 0) ? set_errno(error) : 0);
+}
+
+/* ARGSUSED */
+static int
+ict_tiocgpgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ pid_t spgrp;
+ int error, rv;
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spgrp, FLFAKE(fp),
+ fp->f_cred, &rv, NULL);
+ if (error == 0) {
+ if (spgrp == curproc->p_zone->zone_proc_initpid) {
+ spgrp = 1;
+ }
+ if (copyout(&spgrp, (caddr_t)arg, sizeof (spgrp))) {
+ return (set_errno(EFAULT));
+ }
+ }
+ return ((error != 0) ? set_errno(error) : 0);
+}
+
+/* ARGSUSED */
+static int
+ict_sptlock(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ struct strioctl istr;
+ int error, rv;
+
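+	/*
+	 * The lock-flag argument is ignored and the slave side is simply
+	 * unlocked; glibc's unlockpt() issues TIOCSPTLCK with a zero flag,
+	 * which is the case that matters for pty setup.
+	 */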
+ istr.ic_cmd = UNLKPT;
+ istr.ic_len = 0;
+ istr.ic_timout = 0;
+ istr.ic_dp = NULL;
+ error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr,
+	    fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+ /*
+ * The success/fail return values are different between Linux
+ * and illumos. Linux expects 0 or -1. Illumos can return
+ * positive number on success.
+ */
+ return ((error != 0) ? set_errno(error) : 0);
+}
+
+/* ARGSUSED */
+static int
+ict_gptn(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ struct strioctl istr;
+ cred_t *cr;
+ pt_own_t pto;
+ int error, rv;
+ int ptyno;
+ lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone);
+
+ /* This operation is only valid for the lx_ptm device. */
+ if (getmajor(fp->f_vnode->v_rdev) != ddi_name_to_major(LX_PTM_DRV))
+ return (set_errno(ENOTTY));
+
+ cr = CRED();
+ pto.pto_ruid = cr->cr_uid;
+ /*
+ * Both Linux and our native code (see grantpt() in native libc)
+ * prefer assigning the "tty" gid to the new pty. On Linux this is
+ * done by udev. Since we're in the kernel we cannot lookup the gid, so
+ * we rely on the lx_support program to initialize the value in the
+ * zone data at boot time.
+ */
+ if (lxzd->lxzd_ttygrp == 0) {
+ pto.pto_rgid = cr->cr_gid;
+ } else {
+ pto.pto_rgid = lxzd->lxzd_ttygrp;
+ }
+
+ istr.ic_cmd = OWNERPT;
+ istr.ic_len = sizeof (pto);
+ istr.ic_timout = 0;
+ istr.ic_dp = (char *)&pto;
+ error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr,
+ FLFAKE(fp), fp->f_cred, &rv, NULL);
+
+ if (error)
+		return (set_errno((error == ENOTTY) ? error : EACCES));
+
+ ptyno = getminor(fp->f_vnode->v_rdev) - 1;
+ if (copyout(&ptyno, (caddr_t)arg, sizeof (ptyno)))
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ict_tiocgwinsz(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ int error, rv;
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv,
+ NULL);
+
+ /*
+	 * A few Linux libcs (e.g. musl) implement isatty() using the
+	 * TIOCGWINSZ ioctl, and some applications do the same thing directly.
+	 * On Linux that ioctl returns a size of 0x0 for dumb terminals, while
+	 * on illumos it fails when the winsize is all zeros (see the handling
+	 * for TIOCGWINSZ in ptem's ptioc()). To emulate the Linux behavior,
+	 * use the native ioctl check that we do for isatty and return a size
+	 * of 0x0 if that succeeds.
+ */
+ if (error == EINVAL) {
+ int err;
+ struct termio s_tio;
+
+ err = VOP_IOCTL(fp->f_vnode, TCGETA, (intptr_t)&s_tio,
+ FLFAKE(fp), fp->f_cred, &rv, NULL);
+
+ if (err == 0) {
+ struct winsize w;
+
+ bzero(&w, sizeof (w));
+ if (copyout(&w, (struct winsize *)arg, sizeof (w)) != 0)
+ return (set_errno(EFAULT));
+ return (0);
+ }
+ }
+
+ if (error != 0)
+ return (set_errno(error));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ict_tiocsctty(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ pid_t ttysid, mysid;
+ int error, rv;
+ proc_t *p = curproc;
+
+ /* getsid */
+ mutex_enter(&p->p_splock);
+ mysid = p->p_sessp->s_sid;
+ mutex_exit(&p->p_splock);
+
+ /*
+ * Report success if we already control the tty.
+ * If no one controls it, TIOCSCTTY will change that later.
+ */
+ error = VOP_IOCTL(fp->f_vnode, TIOCGSID, (intptr_t)&ttysid,
+ FLFAKE(fp), fp->f_cred, &rv, NULL);
+ if (error == 0 && ttysid == mysid)
+ return (0);
+
+ /*
+ * Need to make sure we're a session leader, otherwise the
+ * TIOCSCTTY ioctl will fail.
+ */
+ mutex_enter(&pidlock);
+ if (p->p_sessp->s_sidp != p->p_pidp && !pgmembers(p->p_pid)) {
+ mutex_exit(&pidlock);
+ sess_create();
+ } else {
+ mutex_exit(&pidlock);
+ }
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, 0, FLUSER(fp),
+ fp->f_cred, &rv, NULL);
+ return ((error != 0) ? set_errno(error) : 0);
+}
+
+/* Socket-related translators */
+
+/* ARGSUSED */
+static int
+ict_siocatmark(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ vnode_t *vp = fp->f_vnode;
+ int error, rv;
+ /*
+	 * Linux expects SIOCATMARK on a UDP socket to return ENOTTY, while
+	 * illumos allows it. (Linux prior to 2.6.39 returned EINVAL instead.)
+ */
+ if (vp->v_type != VSOCK || VTOSO(vp)->so_type != SOCK_STREAM)
+ return (set_errno(ENOTTY));
+
+ error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv,
+ NULL);
+ if (error)
+ return (set_errno(error));
+
+ return (0);
+}
+
+static int
+ict_if_ioctl(vnode_t *vn, int cmd, intptr_t arg, int flags, cred_t *cred)
+{
+ int error, rv;
+ lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone);
+ ksocket_t ks;
+
+ ASSERT(lxzd != NULL);
+
+ /*
+ * For ioctls of this type, we are strict about address family
+ * whereas Linux is lenient. This strictness can be avoided by using
+ * an internal AF_INET ksocket, which we use if the family is anything
+ * but AF_PACKET.
+ */
+ if (vn->v_type == VSOCK && VTOSO(vn)->so_family == AF_PACKET)
+ return (VOP_IOCTL(vn, cmd, arg, flags, cred, &rv, NULL));
+
+ mutex_enter(&lxzd->lxzd_lock);
+ ks = lxzd->lxzd_ioctl_sock;
+ if (ks == NULL) {
+ /*
+ * Linux is not at all picky about address family when it comes
+ * to supporting interface-related ioctls. To mimic this
+ * behavior, we'll attempt those ioctls against a ksocket
+ * configured for that purpose.
+ */
+ (void) ksocket_socket(&lxzd->lxzd_ioctl_sock, AF_INET,
+ SOCK_DGRAM, 0, 0, curproc->p_zone->zone_kcred);
+ ks = lxzd->lxzd_ioctl_sock;
+ }
+ mutex_exit(&lxzd->lxzd_lock);
+
+ if (ks != NULL) {
+ error = ksocket_ioctl(ks, cmd, arg, &rv, cred);
+ } else {
+ error = VOP_IOCTL(vn, cmd, arg, flags, cred, &rv, NULL);
+ }
+
+ return (error);
+}
+
+static int
+ict_sioghwaddr(file_t *fp, struct lifreq *lreq)
+{
+ struct sockaddr_dl *sdl = (struct sockaddr_dl *)&lreq->lifr_addr;
+ struct sockaddr hwaddr;
+ int error, size;
+
+ error = ict_if_ioctl(fp->f_vnode, SIOCGLIFHWADDR, (intptr_t)lreq,
+ FLFAKE(fp), fp->f_cred);
+
+ if (error == EADDRNOTAVAIL &&
+ strncmp(lreq->lifr_name, "lo", 2) == 0) {
+ /* Emulate success on suspected loopbacks */
+ sdl->sdl_type = DL_LOOP;
+ sdl->sdl_alen = ETHERADDRL;
+ bzero(LLADDR(sdl), sdl->sdl_alen);
+ error = 0;
+ }
+
+ if (error == 0) {
+ bzero(&hwaddr, sizeof (hwaddr));
+ lx_stol_hwaddr(sdl, &hwaddr, &size);
+ bcopy(&hwaddr, &lreq->lifr_addr,
+ size + sizeof (sdl->sdl_family));
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+ict_siocgifname(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ struct ifreq req;
+ int len;
+ char name[LIFNAMSIZ];
+ netstack_t *ns;
+ ip_stack_t *ipst;
+ phyint_t *phyi;
+
+ if (fp->f_vnode->v_type != VSOCK) {
+ return (set_errno(EINVAL));
+ }
+
+ len = (curproc->p_model == DATAMODEL_LP64) ? sizeof (lx_ifreq64_t) :
+ sizeof (lx_ifreq32_t);
+ if (copyin((struct ifreq *)arg, &req, len) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * Since Linux calls this ioctl on all sorts of sockets, perform the
+ * interface name lookup manually.
+ */
+ if ((ns = netstack_get_current()) == NULL) {
+ return (set_errno(EINVAL));
+ }
+ ipst = ns->netstack_ip;
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
+ (void *) &req.ifr_index, NULL);
+ if (phyi != NULL) {
+ (void) strncpy(name, phyi->phyint_name, LIFNAMSIZ);
+ lx_ifname_convert(name, LX_IF_FROMNATIVE);
+ } else {
+ name[0] = '\0';
+ }
+
+ rw_exit(&ipst->ips_ill_g_lock);
+ netstack_rele(ns);
+
+ if (strlen(name) != 0) {
+ /* Truncate for ifreq and copyout */
+ (void) strncpy(req.ifr_name, name, IFNAMSIZ);
+ if (copyout(&req, (struct ifreq *)arg, len) != 0) {
+ return (set_errno(EFAULT));
+ }
+ return (0);
+ }
+
+ return (set_errno(EINVAL));
+}
+
+/* ARGSUSED */
+static int
+ict_siolifreq(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ struct ifreq req;
+ struct lifreq lreq;
+ int error, len;
+
+ /* Convert from Linux ifreq to illumos lifreq */
+ if (curproc->p_model == DATAMODEL_LP64)
+ len = sizeof (lx_ifreq64_t);
+ else
+ len = sizeof (lx_ifreq32_t);
+ if (copyin((struct ifreq *)arg, &req, len) != 0)
+ return (set_errno(EFAULT));
+ bzero(&lreq, sizeof (lreq));
+ (void) strncpy(lreq.lifr_name, req.ifr_name, IFNAMSIZ);
+ bcopy(&req.ifr_ifru, &lreq.lifr_lifru, len - IFNAMSIZ);
+ lx_ifname_convert(lreq.lifr_name, LX_IF_TONATIVE);
+
+ switch (cmd) {
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFMETRIC:
+ case SIOCSIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCSIFMTU:
+ /*
+ * Convert cmd from SIO*IF* to SIO*LIF*.
+ * This is needed since Linux allows ifreq operations on ipv6
+ * sockets where illumos does not.
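+		 * Each SIO*LIF* command is defined with an ordinal exactly
+		 * 100 above its SIO*IF* counterpart, which is what the
+		 * "+ 100" arithmetic below relies on.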
+ */
+ cmd = ((cmd & IOC_INOUT) |
+ _IOW('i', ((cmd & 0xff) + 100), struct lifreq));
+ error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+ FLFAKE(fp), fp->f_cred);
+ break;
+ case SIOCGIFINDEX:
+ cmd = SIOCGLIFINDEX;
+ error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+ FLFAKE(fp), fp->f_cred);
+ break;
+ case SIOCGIFFLAGS:
+ cmd = SIOCGLIFFLAGS;
+ error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+ FLFAKE(fp), fp->f_cred);
+ if (error == 0)
+ lx_ifflags_convert(&lreq.lifr_flags, LX_IF_FROMNATIVE);
+ break;
+ case SIOCSIFFLAGS:
+ cmd = SIOCSLIFFLAGS;
+ lx_ifflags_convert(&lreq.lifr_flags, LX_IF_TONATIVE);
+ error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+ FLFAKE(fp), fp->f_cred);
+ break;
+ case SIOCGIFHWADDR:
+ error = ict_sioghwaddr(fp, &lreq);
+ break;
+ case LX_SIOCGIFTXQLEN:
+ /*
+ * Illumos lacks the notion of txqlen. Confirm the provided
+ * interface is valid with SIOCGLIFINDEX and return a fake
+ * txqlen of 1. Loopback devices will report txqlen of 0.
+ */
+ if (strncmp(lreq.lifr_name, "lo", 2) == 0) {
+ lreq.lifr_index = 0;
+ error = 0;
+ break;
+ }
+ cmd = SIOCGLIFINDEX;
+ error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+ FLFAKE(fp), fp->f_cred);
+ if (error == 0) {
+ /* lifr_index aliases to the qlen field */
+ lreq.lifr_index = 1;
+ }
+ break;
+ case LX_SIOCSIFHWADDR:
+ /*
+ * We're not going to support SIOCSIFHWADDR, but we need to be
+ * able to check the result of the copyin first to see if the
+ * command should have returned EFAULT.
+ */
+ default:
+ error = EINVAL;
+ }
+
+ if (error != 0)
+ return (set_errno(error));
+
+ /* Convert back to a Linux ifreq */
+ lx_ifname_convert(lreq.lifr_name, LX_IF_FROMNATIVE);
+ bzero(&req, sizeof (req));
+ (void) strncpy(req.ifr_name, lreq.lifr_name, IFNAMSIZ);
+ bcopy(&lreq.lifr_lifru, &req.ifr_ifru, len - IFNAMSIZ);
+
+ if (copyout(&req, (struct lifreq *)arg, len) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+ict_siocgifconf32(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ lx_ifconf32_t conf;
+ lx_ifreq32_t *oreq;
+ struct ifconf sconf;
+ int ifcount, error, i, buf_len;
+
+ if (copyin((lx_ifconf32_t *)arg, &conf, sizeof (conf)) != 0)
+ return (set_errno(EFAULT));
+
+ /* They want to know how many interfaces there are. */
+ if (conf.if_len <= 0 || conf.if_buf == NULL) {
+ error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM,
+ (intptr_t)&ifcount, FLFAKE(fp), fp->f_cred);
+ if (error != 0)
+ return (set_errno(error));
+
+ conf.if_len = ifcount * sizeof (lx_ifreq32_t);
+
+ if (copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0)
+ return (set_errno(EFAULT));
+ return (0);
+ } else {
+ ifcount = conf.if_len / sizeof (lx_ifreq32_t);
+ }
+
+ /* Get interface configuration list. */
+ sconf.ifc_len = ifcount * sizeof (struct ifreq);
+ sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP);
+
+ error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp),
+ fp->f_cred);
+ if (error != 0) {
+ kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq));
+ return (set_errno(error));
+ }
+
+ /* Convert data to Linux format & rename interfaces */
+ buf_len = ifcount * sizeof (lx_ifreq32_t);
+ oreq = (lx_ifreq32_t *)kmem_alloc(buf_len, KM_SLEEP);
+ for (i = 0; i < sconf.ifc_len / sizeof (struct ifreq); i++) {
+ bcopy(&sconf.ifc_req[i], oreq + i, sizeof (lx_ifreq32_t));
+ lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE);
+ }
+ conf.if_len = i * sizeof (*oreq);
+ kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq));
+
+ error = 0;
+ if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 ||
+ copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0)
+ error = set_errno(EFAULT);
+
+ kmem_free(oreq, buf_len);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+ict_siocgifconf64(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ lx_ifconf64_t conf;
+ lx_ifreq64_t *oreq;
+ struct ifconf sconf;
+ int ifcount, error, i, buf_len;
+
+ if (copyin((lx_ifconf64_t *)arg, &conf, sizeof (conf)) != 0)
+ return (set_errno(EFAULT));
+
+ /* They want to know how many interfaces there are. */
+ if (conf.if_len <= 0 || conf.if_buf == NULL) {
+ error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM,
+ (intptr_t)&ifcount, FLFAKE(fp), fp->f_cred);
+ if (error != 0)
+ return (set_errno(error));
+
+ conf.if_len = ifcount * sizeof (lx_ifreq64_t);
+
+ if (copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0)
+ return (set_errno(EFAULT));
+ return (0);
+ } else {
+ ifcount = conf.if_len / sizeof (lx_ifreq64_t);
+ }
+
+ /* Get interface configuration list. */
+ sconf.ifc_len = ifcount * sizeof (struct ifreq);
+ sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP);
+
+ error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp),
+ fp->f_cred);
+ if (error != 0) {
+ kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq));
+ return (set_errno(error));
+ }
+
+ /* Convert data to Linux format & rename interfaces */
+ buf_len = ifcount * sizeof (lx_ifreq64_t);
+ oreq = (lx_ifreq64_t *)kmem_alloc(buf_len, KM_SLEEP);
+ for (i = 0; i < sconf.ifc_len / sizeof (struct ifreq); i++) {
+ bcopy(&sconf.ifc_req[i], oreq + i, sizeof (lx_ifreq64_t));
+ lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE);
+ }
+ conf.if_len = i * sizeof (*oreq);
+ kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq));
+
+ error = 0;
+ if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 ||
+ copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0)
+ error = set_errno(EFAULT);
+
+ kmem_free(oreq, buf_len);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+ict_siocgifconf(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ if (curproc->p_model == DATAMODEL_LP64)
+ return (ict_siocgifconf64(fp, cmd, arg, lxcmd));
+ else
+ return (ict_siocgifconf32(fp, cmd, arg, lxcmd));
+}
+
+/*
+ * Unfortunately some of the autofs ioctls want to return a positive integer
+ * result which does not indicate an error. To minimize disruption in the
+ * rest of the code, we'll treat a positive return as an errno and a negative
+ * return as the non-error return (which we then negate).
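+ *
+ * For example, a driver result of -5 is handed back to the caller as the
+ * value 5, while a result of 5 is treated as errno 5 (EIO).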
+ */
+/* ARGSUSED */
+static int
+ict_autofs(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+ int res = 0;
+ int rv;
+
+ res = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv,
+ NULL);
+ if (res > 0)
+ return (set_errno(res));
+ if (res == 0)
+ return (0);
+ return (-res);
+}
+
+/* Structure used to define an ioctl translator. */
+typedef struct lx_ioc_cmd_translator {
+ int lict_lxcmd;
+ int lict_cmd;
+ int (*lict_func)(file_t *fp, int cmd, intptr_t arg, int lxcmd);
+} lx_ioc_cmd_translator_t;
+
+#define LX_IOC_CMD_TRANSLATOR_PASS(ioc_cmd_sym) \
+ { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass },
+
+#define LX_IOC_CMD_TRANSLATOR_FILTER(ioc_cmd_sym, ioct_handler) \
+ { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler },
+
+#define LX_IOC_CMD_TRANSLATOR_CUSTOM(ioc_cmd_sym, ioct_handler) \
+ { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler },
+
+#define LX_IOC_CMD_TRANSLATOR_PTHRU(ioc_cmd_sym) \
+ { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass },
+
+#define LX_IOC_CMD_TRANSLATOR_END \
+ {0, 0, NULL}
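+
+/*
+ * For example, LX_IOC_CMD_TRANSLATOR_FILTER(FIONREAD, ict_fionread) expands
+ * to { LX_FIONREAD, FIONREAD, ict_fionread }, pairing the Linux command
+ * value with its native counterpart and the handler used to translate
+ * between the two.
+ */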
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_fd[] = {
+ LX_IOC_CMD_TRANSLATOR_FILTER(FIONBIO, ict_fionbio)
+ LX_IOC_CMD_TRANSLATOR_FILTER(FIONREAD, ict_fionread)
+ LX_IOC_CMD_TRANSLATOR_PASS(FIOASYNC)
+
+ /* streams related */
+ LX_IOC_CMD_TRANSLATOR_PASS(TCXONC)
+ LX_IOC_CMD_TRANSLATOR_PASS(TCFLSH)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCEXCL)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCNXCL)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCSTI)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCSWINSZ)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIS)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIC)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCMSET)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCSETD)
+ LX_IOC_CMD_TRANSLATOR_PASS(TCSBRK)
+
+ /* terminal related */
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCGETD)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCGSID)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCNOTTY)
+ LX_IOC_CMD_TRANSLATOR_PASS(TIOCPKT)
+
+ LX_IOC_CMD_TRANSLATOR_FILTER(TCSETS, ict_tcsets)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSW, ict_tcsets)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSF, ict_tcsets)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TCSETA, ict_tcseta)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAW, ict_tcseta)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAF, ict_tcseta)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TCGETS, ict_tcgets)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TCGETA, ict_tcgeta)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGWINSZ, ict_tiocgwinsz)
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TCSBRKP, ict_tcsbrkp)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSPGRP, ict_tiocspgrp)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGPGRP, ict_tiocgpgrp)
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCSPTLCK, ict_sptlock)
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCGPTN, ict_gptn)
+ LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSCTTY, ict_tiocsctty)
+
+ LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_socket[] = {
+ LX_IOC_CMD_TRANSLATOR_PASS(FIOGETOWN)
+
+ LX_IOC_CMD_TRANSLATOR_PASS(SIOCSPGRP)
+ LX_IOC_CMD_TRANSLATOR_PASS(SIOCGPGRP)
+ LX_IOC_CMD_TRANSLATOR_PASS(SIOCGSTAMP)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCATMARK, ict_siocatmark)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFFLAGS, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFFLAGS, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFADDR, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFADDR, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFDSTADDR, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFDSTADDR, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFBRDADDR, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFBRDADDR, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFNETMASK, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFNETMASK, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMETRIC, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMETRIC, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMTU, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMTU, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFHWADDR, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCSIFHWADDR, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFINDEX, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFTXQLEN, ict_siolifreq)
+ LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFCONF, ict_siocgifconf)
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFNAME, ict_siocgifname)
+
+ LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_dtrace[] = {
+ LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_REMOVE)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADDDOF)
+
+ LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_autofs[] = {
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_READY)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_FAIL)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_CATATONIC)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOVER)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_SETTIMEOUT)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE_MULTI)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOSUBVER)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_ASKUMOUNT)
+
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_VERSION_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOVER_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_READY_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_FAIL_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CATATONIC_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_TIMEOUT_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_REQUESTER_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_EXPIRE_CMD)
+ LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD)
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD,
+ ict_autofs)
+
+ LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_hd[] = {
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_HDIO_GETGEO, ict_hdgetgeo)
+
+ LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_blk[] = {
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE, ict_blkgetsize)
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKSSZGET, ict_blkgetssize)
+ LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE64, ict_blkgetsize64)
+
+ LX_IOC_CMD_TRANSLATOR_END
+};
+
+/*
+ * Linux only restarts ioctls for "slow" devices. This includes terminals,
+ * pipes, and sockets. If additional "slow" devices are discovered in the
+ * future, they can be added here as well.
+ */
+static boolean_t
+lx_ioctl_is_slow_dev(file_t *fp)
+{
+ int rv;
+ struct termio s_tio;
+ vtype_t vt = fp->f_vnode->v_type;
+
+ if (vt == VFIFO || vt == VSOCK)
+ return (B_TRUE);
+
+ /* Check if it's a terminal using the isatty() approach. */
+ if (vt == VCHR && VOP_IOCTL(fp->f_vnode, TCGETA, (intptr_t)&s_tio,
+ FLFAKE(fp), fp->f_cred, &rv, NULL) == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static void
+lx_ioctl_vsd_free(void *data)
+{
+ kmem_free(data, sizeof (struct lx_cc));
+}
+
+void
+lx_ioctl_init()
+{
+ vsd_create(&lx_ioctl_vsd, lx_ioctl_vsd_free);
+}
+
+void
+lx_ioctl_fini()
+{
+ vsd_destroy(&lx_ioctl_vsd);
+}
+
+long
+lx_ioctl(int fdes, int cmd, intptr_t arg)
+{
+ file_t *fp;
+ int res = 0, error = ENOTTY;
+ lx_ioc_cmd_translator_t *ict = NULL;
+
+ if (cmd == LX_FIOCLEX || cmd == LX_FIONCLEX) {
+ res = f_setfd_error(fdes, (cmd == LX_FIOCLEX) ? FD_CLOEXEC : 0);
+ return ((res != 0) ? set_errno(res) : 0);
+ }
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+
+ switch ((cmd & 0xff00) >> 8) {
+ case LX_IOC_TYPE_FD:
+ ict = lx_ioc_xlate_fd;
+ break;
+
+ case LX_IOC_TYPE_DTRACE:
+ ict = lx_ioc_xlate_dtrace;
+ break;
+
+ case LX_IOC_TYPE_SOCK:
+ ict = lx_ioc_xlate_socket;
+ error = EOPNOTSUPP;
+ break;
+
+ case LX_IOC_TYPE_AUTOFS:
+ ict = lx_ioc_xlate_autofs;
+ break;
+
+ case LX_IOC_TYPE_BLK:
+ ict = lx_ioc_xlate_blk;
+ break;
+
+ case LX_IOC_TYPE_HD:
+ ict = lx_ioc_xlate_hd;
+ break;
+
+ default:
+ releasef(fdes);
+ return (set_errno(ENOTTY));
+ }
+
+ /*
+ * Today, none of the ioctls supported by the emulation possess
+ * overlapping cmd values. Because of that, no type interrogation of
+ * the fd is done before executing specific ioctl emulation. It's
+ * assumed that the vnode-specific logic called by the emulation
+ * function will reject ioctl commands not supported by the fd.
+ */
+ VERIFY(ict != NULL);
+ while (ict->lict_func != NULL) {
+ if (ict->lict_lxcmd == cmd)
+ break;
+ ict++;
+ }
+ if (ict->lict_func == NULL) {
+ releasef(fdes);
+ return (set_errno(error));
+ }
+
+ res = ict->lict_func(fp, ict->lict_cmd, arg, ict->lict_lxcmd);
+
+ if (ttolwp(curthread)->lwp_errno == EINTR && lx_ioctl_is_slow_dev(fp))
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+
+ releasef(fdes);
+ return (res);
+}
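+
+/*
+ * A worked sketch of the dispatch above, with hypothetical values: a Linux
+ * ioctl(fd, TIOCGWINSZ, &ws) arrives with cmd 0x5413, so (cmd & 0xff00) >> 8
+ * yields 0x54; assuming that is LX_IOC_TYPE_FD, lx_ioc_xlate_fd is walked
+ * until the LX_TIOCGWINSZ entry matches and ict_tiocgwinsz() is invoked
+ * with the native TIOCGWINSZ value.
+ */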
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c
new file mode 100644
index 0000000000..13397e199e
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c
@@ -0,0 +1,66 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/lx_brand.h>
+
+/* 'which' values. */
+#define LX_IOPRIO_WHO_PROCESS 1
+#define LX_IOPRIO_WHO_PGRP 2
+#define LX_IOPRIO_WHO_USER 3
+
+/*
+ * The possible values for the class. We report best effort (BE) as the class
+ * in use.
+ */
+#define LX_IOPRIO_CLASS_RT 1
+#define LX_IOPRIO_CLASS_BE 2
+#define LX_IOPRIO_CLASS_IDLE 3
+
+/* Macro to determine the class from the input mask */
+#define LX_IOPRIO_PRIO_CLASS(m) ((m) >> 13)
+
+/* ARGSUSED */
+long
+lx_ioprio_get(int which, int who)
+{
+ if (which < LX_IOPRIO_WHO_PROCESS || which > LX_IOPRIO_WHO_USER)
+ return (set_errno(EINVAL));
+
+ return (LX_IOPRIO_CLASS_BE);
+}
+
+/*
+ * We allow setting any valid class, even though it's ignored.
+ * We ignore the 'who' parameter, which means we do not search for the
+ * specified target in order to return a specific errno in the case that
+ * the target does not exist.
+ */
+/* ARGSUSED */
+long
+lx_ioprio_set(int which, int who, int mask)
+{
+ int class;
+
+ if (which < LX_IOPRIO_WHO_PROCESS || which > LX_IOPRIO_WHO_USER)
+ return (set_errno(EINVAL));
+
+ class = LX_IOPRIO_PRIO_CLASS(mask);
+ if (class < LX_IOPRIO_CLASS_RT || class > LX_IOPRIO_CLASS_IDLE)
+ return (set_errno(EINVAL));
+
+ return (0);
+}
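+
+/*
+ * A sketch of the Linux mask layout assumed by LX_IOPRIO_PRIO_CLASS():
+ * Linux builds the value roughly as ((class) << 13) | (data), so shifting
+ * right by 13 recovers the class, e.g.
+ *
+ *	ioprio_set(IOPRIO_WHO_PROCESS, 0,
+ *	    (IOPRIO_CLASS_BE << 13) | 4);	(BE class, priority 4)
+ */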
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_kill.c b/usr/src/uts/common/brand/lx/syscall/lx_kill.c
new file mode 100644
index 0000000000..6fefbde705
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_kill.c
@@ -0,0 +1,408 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/thread.h>
+#include <sys/signal.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <lx_signum.h>
+#include <sys/contract/process_impl.h>
+
+extern int kill(pid_t, int);
+
+/*
+ * Check if it is legal to send this signal to the init process. Linux
+ * kill(2) semantics dictate that no _unhandled_ signal may be sent to pid
+ * 1.
+ */
+static int
+lx_init_sig_check(int sig, pid_t pid)
+{
+ proc_t *p;
+ int rv = 0;
+
+ mutex_enter(&pidlock);
+ if ((p = prfind(pid)) == NULL || p->p_stat == SIDL) {
+ rv = ESRCH;
+ } else if (sig != 0) {
+ if (sigismember(&cantmask, sig)) {
+ rv = EPERM;
+ } else {
+ mutex_enter(&p->p_lock);
+ if (PTOU(p)->u_signal[sig-1] == SIG_DFL ||
+ PTOU(p)->u_signal[sig-1] == SIG_IGN) {
+ rv = EPERM;
+ }
+ mutex_exit(&p->p_lock);
+ }
+ }
+ mutex_exit(&pidlock);
+
+ return (rv);
+}
+
+static long
+lx_thrkill(pid_t tgid, pid_t pid, int lx_sig, boolean_t tgkill)
+{
+ kthread_t *t;
+ proc_t *pp, *cp = curproc;
+ sigqueue_t *sqp;
+ int sig, rv;
+
+ /*
+ * Unlike kill(2), Linux tkill(2) doesn't allow signals to
+ * be sent to process IDs <= 0 as it doesn't overlay any special
+ * semantics on the pid.
+ */
+ if ((pid <= 0) || ((lx_sig < 0) || (lx_sig > LX_NSIG)) ||
+ ((sig = ltos_signo[lx_sig]) < 0))
+ return (set_errno(EINVAL));
+
+ /*
+ * If the Linux pid is 1, translate the pid to the actual init
+ * pid for the zone. Note that Linux dictates that no unhandled
+ * signals may be sent to init, so check for that, too.
+ *
+ * Otherwise, extract the tid and real pid from the Linux pid.
+ */
+ if (pid == 1) {
+ pid_t initpid;
+
+ initpid = cp->p_zone->zone_proc_initpid;
+ if ((rv = lx_init_sig_check(sig, initpid)) != 0) {
+ return (set_errno(rv));
+ }
+ }
+ sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
+ /*
+ * Find the process for the passed pid...
+ */
+ if (lx_lpid_lock(pid, curzone, 0, &pp, &t) != 0) {
+ rv = set_errno(ESRCH);
+ goto free_and_exit;
+ }
+
+ /*
+ * Make sure the thread group matches the thread.
+ */
+ if (tgkill) {
+ if ((pid == 1 && tgid != 1) ||
+ (pid != 1 && tgid != pp->p_pid)) {
+ mutex_exit(&pp->p_lock);
+ rv = set_errno(ESRCH);
+ goto free_and_exit;
+ }
+ }
+
+ /*
+ * Deny permission to send the signal if either of the following
+ * is true:
+ *
+ * + The signal is SIGCONT and the target pid is not in the same
+ * session as the sender
+ *
+ * + prochasprocperm() shows the user lacks sufficient permission
+ * to send the signal to the target pid
+ */
+ if (((sig == SIGCONT) && (pp->p_sessp != cp->p_sessp)) ||
+ (!prochasprocperm(pp, cp, CRED()))) {
+ mutex_exit(&pp->p_lock);
+ rv = set_errno(EPERM);
+ goto free_and_exit;
+ }
+
+ /* a signal of 0 means just check for the existence of the thread */
+ if (lx_sig == 0) {
+ mutex_exit(&pp->p_lock);
+ rv = 0;
+ goto free_and_exit;
+ }
+
+ sqp->sq_info.si_signo = sig;
+ sqp->sq_info.si_code = SI_LWP;
+ sqp->sq_info.si_pid = cp->p_pid;
+ sqp->sq_info.si_zoneid = getzoneid();
+ sqp->sq_info.si_uid = crgetruid(CRED());
+ sigaddqa(pp, t, sqp);
+
+ mutex_exit(&pp->p_lock);
+
+ return (0);
+
+free_and_exit:
+ kmem_free(sqp, sizeof (sigqueue_t));
+ return (rv);
+}
+
+long
+lx_tgkill(pid_t tgid, pid_t pid, int lx_sig)
+{
+ return (lx_thrkill(tgid, pid, lx_sig, B_TRUE));
+}
+
+long
+lx_tkill(pid_t pid, int lx_sig)
+{
+ return (lx_thrkill(0, pid, lx_sig, B_FALSE));
+}
+
+long
+lx_kill(pid_t lx_pid, int lx_sig)
+{
+ pid_t s_pid, initpid;
+ sigsend_t v;
+ zone_t *zone = curzone;
+ struct proc *p;
+ int err, sig, nfound;
+
+ if ((lx_sig < 0) || (lx_sig > LX_NSIG) ||
+ ((sig = ltos_signo[lx_sig]) < 0))
+ return (set_errno(EINVAL));
+
+ initpid = zone->zone_proc_initpid;
+ if (lx_pid == 0 || lx_pid == -1) {
+ s_pid = 0;
+ } else if (lx_pid > 0) {
+ /*
+		 * Translation for individual processes (including pid 1) is
+		 * handled by lx_lpid_to_spair.
+ */
+ if (lx_lpid_to_spair(lx_pid, &s_pid, NULL) != 0) {
+ /*
+			 * If we didn't find this pid, that means it doesn't
+ * exist in this zone.
+ */
+ return (set_errno(ESRCH));
+ }
+ } else {
+ ASSERT(lx_pid < 0);
+ if (lx_lpid_to_spair(-lx_pid, &s_pid, NULL) != 0) {
+ /*
+ * If we didn't find this pid it means that the
+			 * If we didn't find this pid, it means that the
+			 * process group leader doesn't exist in this zone.
+			 * In this case, assuming that the Linux pid is
+			 * the same as the Solaris pid will get us the
+			 * correct behavior.
+ s_pid = -lx_pid;
+ }
+ }
+
+ /*
+ * Check that it is legal for this signal to be sent to init
+ */
+ if (s_pid == initpid && (err = lx_init_sig_check(sig, s_pid)) != 0)
+ return (set_errno(err));
+
+ /*
+ * For individual processes, kill() semantics are the same between
+ * Solaris and Linux.
+ */
+ if (lx_pid >= 0)
+ return (kill(s_pid, sig));
+
+ /*
+ * In Solaris, sending a signal to -pid means "send a signal to
+ * everyone in process group pid." In Linux it means "send a
+ * signal to everyone in the group other than init." Sending a
+ * signal to -1 means "send a signal to every process except init
+ * and myself."
+ */
+
+ bzero(&v, sizeof (v));
+ v.sig = sig;
+ v.checkperm = 1;
+ v.sicode = SI_USER;
+ err = 0;
+
+ mutex_enter(&pidlock);
+
+ p = (lx_pid == -1) ? practive : pgfind(s_pid);
+ nfound = 0;
+ while (err == 0 && p != NULL) {
+ if ((p->p_zone == zone) && (p->p_stat != SIDL) &&
+ (p->p_pid != initpid) && (lx_pid < -1 || p != curproc)) {
+ nfound++;
+ err = sigsendproc(p, &v);
+ }
+
+ p = (lx_pid == -1) ? p->p_next : p->p_pglink;
+ }
+ mutex_exit(&pidlock);
+
+ /*
+ * If we found no processes, we'll return ESRCH -- but unlike our
+ * native kill(2), we do not return EPERM if processes are found but
+ * we did not have permission to send any of them a signal.
+ */
+ if (nfound == 0)
+ err = ESRCH;
+
+ return (err ? set_errno(err) : 0);
+}
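+
+/*
+ * A user-level sketch of the semantics emulated above, as seen inside an
+ * lx zone:
+ *
+ *	kill(-pgid, SIGTERM);	(every member of pgid except init)
+ *	kill(-1, SIGTERM);	(everything except init and the caller)
+ *	kill(1, SIGKILL);	(EPERM: SIGKILL cannot be sent to init)
+ */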
+
+/*
+ * This handles the unusual case where the user sends a non-queueable signal
+ * through rt_sigqueueinfo. Signals sent with codes that indicate they are
+ * queueable are sent through the sigqueue syscall via the user-level function
+ * lx_rt_sigqueueinfo().
+ */
+int
+lx_helper_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo)
+{
+ proc_t *target_proc;
+ pid_t s_pid;
+ zone_t *zone = curproc->p_zone;
+ sigsend_t send;
+ int err;
+ siginfo_t kinfo;
+
+ if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0)
+ return (set_errno(EFAULT));
+ /* Unlike in lx_kill, this process id must be exact, no negatives. */
+ if (tgid == 0)
+ return (set_errno(ESRCH));
+ if (tgid < 0)
+ return (set_errno(EINVAL));
+ /*
+ * Translate init directly, otherwise use the convenient utility
+	 * Translate init directly; otherwise use the convenient utility
+ * only need the solaris pid, and not the lwp id.
+ */
+ if (tgid == 1) {
+ s_pid = zone->zone_proc_initpid;
+ } else {
+ if (lx_lpid_to_spair(tgid, &s_pid, NULL) != 0) {
+ /*
+			 * If we didn't find this pid, that means it doesn't
+ * exist in this zone.
+ */
+ return (set_errno(ESRCH));
+ }
+ }
+ /*
+	 * We shouldn't have queueable signals here; those are sent elsewhere by
+ * the usermode handler for this emulated call.
+ */
+ if (!SI_CANQUEUE(kinfo.si_code)) {
+ return (set_errno(EINVAL));
+ }
+ /* Since our signal shouldn't queue, we just call sigsendproc(). */
+ bzero(&send, sizeof (send));
+ send.sig = sig;
+ send.checkperm = 1;
+ send.sicode = kinfo.si_code;
+ send.value = kinfo.si_value;
+
+ mutex_enter(&pidlock);
+ target_proc = prfind(s_pid);
+ err = 0;
+ if (target_proc != NULL) {
+ err = sigsendproc(target_proc, &send);
+ if (err == 0 && send.perm == 0)
+ err = EPERM;
+ } else {
+ err = ESRCH;
+ }
+ mutex_exit(&pidlock);
+
+ return (err ? set_errno(err) : 0);
+}
+
+/*
+ * Unlike the above function, this handles all system calls to rt_tgsigqueue
+ * regardless of si_code.
+ */
+int
+lx_helper_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo)
+{
+ int err;
+ proc_t *p = NULL;
+ kthread_t *t;
+ sigqueue_t *sqp;
+ siginfo_t kinfo;
+
+ if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
+
+ if (lx_lpid_lock(tid, curzone, 0, &p, &t) != 0) {
+ err = ESRCH;
+ goto errout;
+ }
+
+ /*
+ * For group leaders, the SunOS pid == Linux pid, so the SunOS leader
+ * pid should be the same as the tgid. Because the tgid comes in via
+ * the syscall, we need to check for an invalid value.
+ */
+ if (p->p_pid != tgid) {
+ err = EINVAL;
+ goto errout;
+ }
+
+ /*
+ * In order to match the Linux behavior of emitting ESRCH errors before
+ * confirming that the signal is valid, this check _must_ be performed
+ * after the target process/thread is located.
+ */
+ if (sig < 0 || sig >= NSIG) {
+ err = EINVAL;
+ goto errout;
+ }
+
+ /*
+ * To merely check for the existence of a thread, the caller will pass
+ * a signal value of 0.
+ */
+ if (sig != 0) {
+ ASSERT(sqp != NULL);
+
+ sqp->sq_info.si_signo = sig;
+ sqp->sq_info.si_code = kinfo.si_code;
+ sqp->sq_info.si_pid = p->p_pid;
+ sqp->sq_info.si_ctid = PRCTID(p);
+ sqp->sq_info.si_zoneid = getzoneid();
+ sqp->sq_info.si_uid = crgetruid(CRED());
+ sigaddqa(p, t, sqp);
+ }
+ mutex_exit(&p->p_lock);
+ return (0);
+
+errout:
+ if (p != NULL) {
+ mutex_exit(&p->p_lock);
+ }
+ kmem_free(sqp, sizeof (sigqueue_t));
+ return (set_errno(err));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_link.c b/usr/src/uts/common/brand/lx/syscall/lx_link.c
new file mode 100644
index 0000000000..4ebf491d23
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_link.c
@@ -0,0 +1,194 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/fcntl.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/vnode.h>
+#include <sys/systm.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_misc.h>
+
+#define LX_LINK_ALLOWED (LX_AT_SYMLINK_FOLLOW | LX_AT_EMPTY_PATH)
+
+/* From "uts/common/syscall/stat.c" */
+extern int cstatat_getvp(int, char *, int, vnode_t **, cred_t **);
+/* From uts/common/syscall/unlink.c */
+extern int unlinkat(int, char *, int);
+/* From uts/common/syscall/symlink.c */
+extern int symlinkat(char *, int, char *);
+/* From uts/common/syscall/readlink.c */
+extern ssize_t readlinkat(int, char *, char *, size_t);
+
+static long
+lx_link_common(int ffd, char *from, int tfd, char *to, int flags)
+{
+ int error;
+ vnode_t *fsvp = NULL, *tsvp = NULL;
+ enum symfollow follow = NO_FOLLOW;
+
+ if ((flags & ~LX_LINK_ALLOWED) != 0) {
+ return (set_errno(EINVAL));
+ }
+ if ((flags & LX_AT_EMPTY_PATH) == 0) {
+ char c;
+
+ /*
+ * Check that both 'from' and 'to' names are non-empty if
+ * AT_EMPTY_PATH is not set.
+ */
+ if (copyin(from, &c, sizeof (c)) != 0) {
+ return (set_errno(EFAULT));
+ } else if (c == '\0') {
+ return (set_errno(ENOENT));
+ }
+ if (copyin(to, &c, sizeof (c)) != 0) {
+ return (set_errno(EFAULT));
+ } else if (c == '\0') {
+ return (set_errno(ENOENT));
+ }
+
+ /*
+ * XXX: When our support for LX capabilities improves, ENOENT
+ * should be thrown when a process lacking CAP_DAC_READ_SEARCH
+ * attempts to use the AT_EMPTY_PATH flag.
+ */
+ }
+ if ((flags & LX_AT_SYMLINK_FOLLOW) != 0) {
+ follow = FOLLOW;
+ }
+
+ if ((error = fgetstartvp(ffd, from, &fsvp)) != 0) {
+ goto out;
+ }
+ if ((error = fgetstartvp(tfd, to, &tsvp)) != 0) {
+ goto out;
+ }
+ error = vn_linkat(fsvp, from, follow, tsvp, to, UIO_USERSPACE);
+
+out:
+ if (fsvp != NULL) {
+ VN_RELE(fsvp);
+ }
+ if (tsvp != NULL) {
+ VN_RELE(tsvp);
+ }
+ if (error) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_link(char *from, char *to)
+{
+ return (lx_link_common(AT_FDCWD, from, AT_FDCWD, to, 0));
+}
+
+long
+lx_linkat(int ffd, char *from, int tfd, char *to, int flags)
+{
+ ffd = (ffd == LX_AT_FDCWD) ? AT_FDCWD : ffd;
+ tfd = (tfd == LX_AT_FDCWD) ? AT_FDCWD : tfd;
+
+ return (lx_link_common(ffd, from, tfd, to, flags));
+}
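+
+/*
+ * A user-level sketch of the AT_EMPTY_PATH case accepted above: Linux
+ * (given CAP_DAC_READ_SEARCH) lets a caller link the object an fd refers
+ * to by passing an empty path, e.g.
+ *
+ *	int fd = open("/tmp/src", O_RDWR);
+ *	linkat(fd, "", AT_FDCWD, "/tmp/dst", AT_EMPTY_PATH);
+ */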
+
+static boolean_t
+lx_isdir(int atfd, char *path)
+{
+ cred_t *cr = NULL;
+ vnode_t *vp = NULL;
+ boolean_t is_dir;
+
+ if (cstatat_getvp(atfd, path, NO_FOLLOW, &vp, &cr) != 0)
+ return (B_FALSE);
+
+ is_dir = (vp->v_type == VDIR);
+ VN_RELE(vp);
+
+ return (is_dir);
+}
+
+long
+lx_unlink(char *path)
+{
+ int err;
+
+ if ((err = unlinkat(AT_FDCWD, path, 0)) == EPERM) {
+ /* On Linux, an unlink of a dir returns EISDIR, not EPERM. */
+ if (lx_isdir(AT_FDCWD, path))
+ return (set_errno(EISDIR));
+ }
+
+ return (err);
+}
+
+long
+lx_unlinkat(int atfd, char *path, int flag)
+{
+ int err;
+
+ if (atfd == LX_AT_FDCWD)
+ atfd = AT_FDCWD;
+
+ if ((flag = ltos_at_flag(flag, AT_REMOVEDIR, B_TRUE)) < 0)
+ return (set_errno(EINVAL));
+
+ err = unlinkat(atfd, path, flag);
+ if (err == EPERM && !(flag & AT_REMOVEDIR)) {
+ /* On Linux, an unlink of a dir returns EISDIR, not EPERM. */
+ if (lx_isdir(atfd, path))
+ return (set_errno(EISDIR));
+ }
+
+ return (err);
+}
+
+long
+lx_symlink(char *name1, char *name2)
+{
+ return (symlinkat(name1, AT_FDCWD, name2));
+}
+
+long
+lx_symlinkat(char *name1, int atfd, char *name2)
+{
+ if (atfd == LX_AT_FDCWD)
+ atfd = AT_FDCWD;
+
+ return (symlinkat(name1, atfd, name2));
+}
+
+long
+lx_readlink(char *path, char *buf, size_t bufsize)
+{
+ if (bufsize <= 0)
+ return (set_errno(EINVAL));
+
+ return (readlinkat(AT_FDCWD, path, buf, bufsize));
+}
+
+long
+lx_readlinkat(int atfd, char *path, char *buf, size_t bufsize)
+{
+ if (bufsize <= 0)
+ return (set_errno(EINVAL));
+
+ if (atfd == LX_AT_FDCWD)
+ atfd = AT_FDCWD;
+
+ return (readlinkat(atfd, path, buf, bufsize));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_lseek.c b/usr/src/uts/common/brand/lx/syscall/lx_lseek.c
new file mode 100644
index 0000000000..3ac32a2faf
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_lseek.c
@@ -0,0 +1,82 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/errno.h>
+#include <sys/debug.h>
+
+
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+
+/* from uts/common/syscalls/lseek.c */
+extern offset_t llseek32(int32_t, uint32_t, uint32_t, int);
+extern off32_t lseek32(int32_t, off32_t, int32_t);
+
+long
+lx_llseek(int fd, uint32_t off_high, uint32_t off_low, void *out, int whence)
+{
+ offset_t res;
+
+ ASSERT(get_udatamodel() == DATAMODEL_ILP32);
+ res = llseek32(fd, off_low, off_high, whence);
+ if (ttolwp(curthread)->lwp_errno == 0) {
+ if (copyout(&res, out, sizeof (offset_t)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ }
+ return (ttolwp(curthread)->lwp_errno);
+}
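+
+/*
+ * A sketch of the caller's side: Linux _llseek() carries the 64-bit offset
+ * as two 32-bit halves and returns the result indirectly, e.g.
+ *
+ *	loff_t res;
+ *	syscall(SYS__llseek, fd, (uint32_t)(off >> 32), (uint32_t)off,
+ *	    &res, SEEK_SET);
+ */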
+
+
+long
+lx_lseek32(int fd, off32_t offset, int whence)
+{
+ offset_t res;
+ const uint32_t hival = (offset < 0) ? (uint32_t)-1 : 0;
+
+ /*
+ * When returning EOVERFLOW for an offset which is outside the bounds
+ * of an off32_t, Linux will still perform the actual seek before
+ * yielding EOVERFLOW.
+ *
+ * In order to emulate that behavior, an llseek bound to the 64-bit
+ * boundary is used. The overflow can then be reported after the
+ * successful seek.
+ */
+ ASSERT(get_udatamodel() == DATAMODEL_ILP32);
+ res = llseek32(fd, (uint32_t)offset, hival, whence);
+ if (ttolwp(curthread)->lwp_errno == 0 && res > MAXOFF32_T) {
+ return (set_errno(EOVERFLOW));
+ }
+	return (res);
+}
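+
+/*
+ * An observable sketch of that behavior on a file larger than 2GB under
+ * ILP32:
+ *
+ *	off_t r = lseek(fd, 0, SEEK_END);
+ *	(r == -1 with errno == EOVERFLOW, yet a subsequent read() shows
+ *	the offset really did move to end-of-file)
+ */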
+#endif /* defined(_SYSCALL32_IMPL) || defined(_ILP32) */
+
+#if defined(_LP64)
+
+/* from uts/common/syscalls/lseek.c */
+extern off_t lseek64(int, off_t, int);
+
+long
+lx_lseek64(int fd, off_t offset, int whence)
+{
+ ASSERT(get_udatamodel() == DATAMODEL_LP64);
+ return (lseek64(fd, offset, whence));
+}
+
+#endif /* defined(_LP64) */
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mem.c b/usr/src/uts/common/brand/lx/syscall/lx_mem.c
new file mode 100644
index 0000000000..cc756717f1
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_mem.c
@@ -0,0 +1,1118 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/policy.h>
+#include <sys/lx_brand.h>
+#include <sys/fcntl.h>
+#include <sys/pathname.h>
+#include <vm/seg_vn.h>
+#include <vm/seg_spt.h>
+#include <sys/shm_impl.h>
+#include <vm/as.h>
+
+/* From uts/common/os/grow.c */
+extern int mprotect(caddr_t, size_t, int);
+extern caddr_t smmap64(caddr_t, size_t, int, int, int, off_t);
+extern int munmap(caddr_t, size_t);
+/* From uts/common/syscall/close.c */
+extern int close(int);
+/* From uts/common/fs/proc/prsubr.c */
+extern uint_t pr_getprot(struct seg *, int, void **, caddr_t *, caddr_t *,
+ caddr_t);
+/* From uts/common/vm/seg_spt.c */
+extern struct seg_ops segspt_shmops;
+/* From uts/common/syscall/memcntl.c */
+extern int memcntl(caddr_t, size_t, int, caddr_t, int, int);
+/* From uts/common/os/grow.c */
+extern int smmap_common(caddr_t *, size_t, int, int, struct file *, offset_t);
+
+/*
+ * After Linux 2.6.8, an unprivileged process can lock memory up to its
+ * RLIMIT_MEMLOCK resource limit.
+ *
+ * memcntl() assumes we have PRIV_PROC_LOCK_MEMORY; without it, the check in
+ * secpolicy_lock_memory() will fail when we attempt to lock memory. Thus,
+ * to support the Linux semantics, we bypass memcntl() and perform the locking
+ * operations directly.
+ */
+
+#define LX_MADV_NORMAL 0
+#define LX_MADV_RANDOM 1
+#define LX_MADV_SEQUENTIAL 2
+#define LX_MADV_WILLNEED 3
+#define LX_MADV_DONTNEED 4
+#define LX_MADV_FREE 8
+#define LX_MADV_REMOVE 9
+#define LX_MADV_DONTFORK 10
+#define LX_MADV_DOFORK 11
+#define LX_MADV_MERGEABLE 12
+#define LX_MADV_UNMERGEABLE 13
+#define LX_MADV_HUGEPAGE 14
+#define LX_MADV_NOHUGEPAGE 15
+#define LX_MADV_DONTDUMP 16
+#define LX_MADV_DODUMP 17
+
+#define LX_VALID_MSYNC (MS_ASYNC|MS_INVALIDATE|MS_SYNC)
+
+#define LX_PROT_GROWSDOWN 0x01000000
+#define LX_PROT_GROWSUP 0x02000000
+
+/* Internal segment map flags */
+#define LX_SM_READ 0x01
+#define LX_SM_WRITE 0x02
+#define LX_SM_EXEC 0x04
+#define LX_SM_SHM 0x08
+#define LX_SM_ANON 0x10
+#define LX_SM_SHARED 0x20
+#define LX_SM_NORESERVE 0x40
+
+/* For convenience */
+#define LX_PROT_GROWMASK (LX_PROT_GROWSUP|LX_PROT_GROWSDOWN)
+
+/* From lx_rlimit.c */
+extern void lx_get_rctl(char *, struct rlimit64 *);
+
+static int
+lx_mlock_common(int op, uintptr_t addr, size_t len)
+{
+ int err;
+ struct as *as = curproc->p_as;
+ const uintptr_t align_addr = addr & (uintptr_t)PAGEMASK;
+ const size_t align_len = P2ROUNDUP(len + (addr & PAGEOFFSET), PAGESIZE);
+
+ if (len == 0) {
+ /* Linux short-circuits to success on zero length */
+ return (0);
+ } else if ((align_addr + align_len) <= align_addr) {
+ /* Catch overflow (including when aligning len) */
+ return (set_errno(EINVAL));
+ }
+
+ err = as_ctl(as, (caddr_t)align_addr, align_len, op, 0, 0, NULL, 0);
+ if (err == EAGAIN)
+ err = ENOMEM;
+ return (err == 0 ? 0 : set_errno(err));
+}
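+
+/*
+ * A worked example of the alignment above (assuming 4K pages): a call of
+ * lx_mlock_common(MC_LOCK, 0x1234, 0x100) computes align_addr = 0x1000 and
+ * align_len = P2ROUNDUP(0x100 + 0x234, 0x1000) = 0x1000, locking exactly
+ * the one page containing the requested byte range.
+ */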
+
+int
+lx_mlock(uintptr_t addr, size_t len)
+{
+ int err;
+
+ /*
+	 * If the caller is not privileged and either the limit is 0, or
+ * the kernel version is earlier than 2.6.9, then fail with EPERM. See
+ * LTP mlock2.c.
+ */
+ if ((err = secpolicy_lock_memory(CRED())) != 0) {
+ struct rlimit64 rlim64;
+
+ lx_get_rctl("process.max-locked-memory", &rlim64);
+ if (rlim64.rlim_cur == 0 ||
+ lx_kern_release_cmp(curzone, "2.6.9") < 0)
+ return (set_errno(err));
+ }
+
+ return (lx_mlock_common(MC_LOCK, addr, len));
+}
+
+int
+lx_munlock(uintptr_t addr, size_t len)
+{
+ return (lx_mlock_common(MC_UNLOCK, addr, len));
+}
+
+int
+lx_mlockall(int flags)
+{
+ int err;
+ struct as *as = curproc->p_as;
+
+ /*
+	 * If the caller is not privileged and either the limit is 0, or
+ * the kernel version is earlier than 2.6.9, then fail with EPERM. See
+ * LTP mlockall2.c.
+ */
+ if ((err = secpolicy_lock_memory(CRED())) != 0) {
+ struct rlimit64 rlim64;
+
+ lx_get_rctl("process.max-locked-memory", &rlim64);
+ if (rlim64.rlim_cur == 0 ||
+ lx_kern_release_cmp(curzone, "2.6.9") < 0)
+ return (set_errno(err));
+ }
+
+ if ((flags & ~(MCL_FUTURE | MCL_CURRENT)) || flags == 0)
+ return (set_errno(EINVAL));
+
+ err = as_ctl(as, 0, 0, MC_LOCKAS, 0, (uintptr_t)flags, NULL, 0);
+ if (err == EAGAIN)
+ err = ENOMEM;
+ return (err == 0 ? 0 : set_errno(err));
+}
+
+int
+lx_munlockall(void)
+{
+ int err;
+ struct as *as = curproc->p_as;
+
+ if (lx_kern_release_cmp(curzone, "2.6.9") < 0) {
+ if ((err = secpolicy_lock_memory(CRED())) != 0)
+ return (set_errno(err));
+ }
+
+ err = as_ctl(as, 0, 0, MC_UNLOCKAS, 0, 0, NULL, 0);
+ return (err == 0 ? 0 : set_errno(err));
+}
+
+int
+lx_msync(uintptr_t addr, size_t len, int flags)
+{
+ const size_t align_len = P2ROUNDUP(len, PAGESIZE);
+
+ if ((addr & PAGEOFFSET) != 0 ||
+ (flags & ~LX_VALID_MSYNC) != 0) {
+ return (set_errno(EINVAL));
+ } else if (len == 0) {
+ /* Linux short-circuits to success on zero length */
+ return (0);
+ } else if ((addr + align_len) < addr) {
+ /* Catch overflow (including when aligning len) */
+ return (set_errno(ENOMEM));
+ }
+
+ return (memcntl((caddr_t)addr, align_len, MC_SYNC,
+ (caddr_t)(uintptr_t)flags, 0, 0));
+}
+
+int
+lx_madvise(uintptr_t addr, size_t len, int advice)
+{
+ int err;
+ const size_t align_len = P2ROUNDUP(len, PAGESIZE);
+
+ switch (advice) {
+ case LX_MADV_REMOVE:
+ /* approximately similar */
+ advice = MADV_FREE;
+ break;
+
+ case LX_MADV_DONTNEED:
+ /*
+ * On Linux, MADV_DONTNEED implies an immediate purge of the
+ * specified region. This is spuriously different from
+ * (nearly) every other Unix, having apparently been done to
+ * mimic the semantics on Digital Unix (!). This is bad enough
+ * (MADV_FREE both has better semantics and results in better
+ * performance), but it gets worse: Linux applications (and
+ * notably, jemalloc) have managed to depend on the busted
+ * semantics of MADV_DONTNEED on Linux. We implement these
+ * semantics via MADV_PURGE -- and we translate our advice
+ * accordingly.
+ */
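+		/*
+		 * A user-level sketch of the Linux behavior in question:
+		 *
+		 *	p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+		 *	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		 *	p[0] = 'x';
+		 *	madvise(p, sz, MADV_DONTNEED);
+		 *	(p[0] now reads back as 0: the pages are purged
+		 *	immediately rather than merely marked reclaimable)
+		 */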
+ advice = MADV_PURGE;
+ break;
+
+ case LX_MADV_FREE:
+ advice = MADV_FREE;
+ break;
+
+ case LX_MADV_NORMAL:
+ case LX_MADV_RANDOM:
+ case LX_MADV_SEQUENTIAL:
+ case LX_MADV_WILLNEED:
+ /* These map directly to the illumos values */
+ break;
+
+ case LX_MADV_DONTFORK:
+ case LX_MADV_DOFORK:
+ case LX_MADV_HUGEPAGE:
+ case LX_MADV_NOHUGEPAGE:
+ case LX_MADV_DONTDUMP:
+ case LX_MADV_DODUMP:
+ /* harmless to pretend these work */
+ return (0);
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ if ((addr & PAGEOFFSET) != 0) {
+ return (set_errno(EINVAL));
+ } else if (len == 0) {
+ /* Linux short-circuits to success on zero length */
+ return (0);
+ } else if ((addr + align_len) <= addr) {
+ /*
+ * Catch overflow (including when aligning len). Unlike
+ * similar syscalls, this is an EINVAL failure for madvise(2).
+ */
+ return (set_errno(EINVAL));
+ }
+
+ err = memcntl((caddr_t)addr, align_len, MC_ADVISE,
+ (caddr_t)(intptr_t)advice, 0, 0);
+ if (err == EBUSY) {
+ if (advice != MADV_PURGE) {
+ return (set_errno(EINVAL));
+ }
+ /*
+ * If we received an EBUSY from a MADV_PURGE, we will now try
+ * again with a MADV_DONTNEED: there are conditions (namely,
+ * with locked mappings that haven't yet been faulted in) where
+ * MADV_PURGE will fail but MADV_DONTNEED will succeed. If
+ * this succeeds, we'll call the operation a success; if not,
+ * we'll kick back EINVAL.
+ */
+ advice = MADV_DONTNEED;
+ err = memcntl((caddr_t)addr, align_len, MC_ADVISE,
+ (caddr_t)(intptr_t)advice, 0, 0);
+ if (err != 0) {
+ return (set_errno(EINVAL));
+ }
+ /* Clear the old errno since success was eventually achieved. */
+ ttolwp(curthread)->lwp_errno = 0;
+ }
+ return (err);
+}
+
+int
+lx_mprotect(uintptr_t addr, size_t len, int prot)
+{
+ const size_t align_len = P2ROUNDUP(len, PAGESIZE);
+
+ /*
+ * The flags for native mprotect(2) are essentially the same as those
+ * on Linux, with the exception of PROT_GROWSUP/PROT_GROWSDOWN, for
+ * which there is no native analog. Those flags are presently ignored,
+ * unless they are both present, which represents an invalid argument.
+ */
+ if ((prot & LX_PROT_GROWMASK) == LX_PROT_GROWMASK) {
+ return (set_errno(EINVAL));
+ }
+ prot &= ~(LX_PROT_GROWMASK);
+
+ if ((addr & PAGEOFFSET) != 0) {
+ return (set_errno(EINVAL));
+ } else if (len == 0) {
+ /* Linux short-circuits to success on zero length */
+ return (0);
+ } else if ((addr + align_len) <= addr) {
+ /* Catch overflow (including when aligning len) */
+ return (set_errno(ENOMEM));
+ }
+
+ return (mprotect((void *)addr, align_len, prot));
+}
+
+/*
+ * There are two forms of mmap, mmap() and mmap2(). The only difference is that
+ * the final argument to mmap2() specifies the offset in pages, not bytes. Also,
+ * mmap2 is 32-bit only.
+ *
+ * Linux has a number of additional flags, but they are all deprecated. We also
+ * ignore the MAP_GROWSDOWN flag, which has no equivalent on Solaris.
+ *
+ * The Linux mmap() returns ENOMEM in some cases where illumos returns
+ * EOVERFLOW, so we translate the errno as necessary.
+ */
+
+#define LX_MAP_ANONYMOUS 0x00020
+#define LX_MAP_LOCKED 0x02000
+#define LX_MAP_NORESERVE 0x04000
+#define LX_MAP_32BIT 0x00040
+
+#define ONE_GB 0x40000000
+
+static void lx_remap_anoncache_invalidate(uintptr_t, size_t);
+
+static int
+lx_ltos_mmap_flags(int flags)
+{
+ int new_flags;
+
+ new_flags = flags & (MAP_TYPE | MAP_FIXED);
+
+ if (flags & LX_MAP_ANONYMOUS)
+ new_flags |= MAP_ANONYMOUS;
+ if (flags & LX_MAP_NORESERVE)
+ new_flags |= MAP_NORESERVE;
+
+#if defined(_LP64)
+ if (flags & LX_MAP_32BIT)
+ new_flags |= MAP_32BIT;
+#endif
+
+ return (new_flags);
+}
+
+static void *
+lx_mmap_common(void *addr, size_t len, int prot, int flags, int fd, off64_t off)
+{
+ caddr_t ret;
+ lx_proc_data_t *lxpd = ptolxproc(curproc);
+
+ /*
+ * Under Linux, the file descriptor is ignored when mapping zfod
+	 * anonymous memory. On illumos, we want the fd set to -1 for the
+ * same functionality.
+ */
+ if (flags & LX_MAP_ANONYMOUS)
+ fd = -1;
+
+ /*
+ * We refuse, as a matter of principle, to overcommit memory.
+ * Unfortunately, several bits of important and popular software expect
+ * to be able to pre-allocate large amounts of virtual memory but then
+ * probably never use it. One particularly bad example of this
+ * practice is golang. Another is the JVM.
+ *
+ * In the interest of running software, unsafe or not, we fudge
+ * something vaguely similar to overcommit by permanently enabling
+ * MAP_NORESERVE unless MAP_LOCKED was requested:
+ */
+ if (!(flags & LX_MAP_LOCKED)) {
+ flags |= LX_MAP_NORESERVE;
+ }
+
+ /*
+ * This is totally insane. The NOTES section in the linux mmap(2) man
+ * page claims that on some architectures, read protection may
+ * automatically include exec protection. It has been observed on a
+ * native linux system that the /proc/<pid>/maps file does indeed
+ * show that segments mmap'd from userland (such as libraries mapped in
+ * by the dynamic linker) all have exec the permission set, even for
+	 * by the dynamic linker) all have the exec permission set, even for
+ *
+ * This insanity is tempered by the fact that the behavior is disabled
+ * for ELF binaries bearing a PT_GNU_STACK header which lacks PF_X
+ * (which most do). Such a header will clear the READ_IMPLIES_EXEC
+ * flag from the process personality.
+ */
+ if (prot & PROT_READ) {
+ if ((lxpd->l_personality & LX_PER_READ_IMPLIES_EXEC) != 0) {
+ prot |= PROT_EXEC;
+ }
+ }
+
+ ret = smmap64(addr, len, prot, lx_ltos_mmap_flags(flags), fd, off);
+ if (ttolwp(curthread)->lwp_errno != 0) {
+ if (ttolwp(curthread)->lwp_errno == EOVERFLOW)
+ (void) set_errno(ENOMEM);
+ return ((void *)-1);
+ }
+
+ if (flags & LX_MAP_LOCKED) {
+ (void) lx_mlock_common(MC_LOCK, (uintptr_t)ret, len);
+ /* clear any errno from mlock */
+ ttolwp(curthread)->lwp_errno = 0;
+ }
+
+ /*
+ * We have a new mapping; invalidate any cached anonymous regions that
+ * overlap(ped) with it.
+ */
+ mutex_enter(&lxpd->l_remap_anoncache_lock);
+ lx_remap_anoncache_invalidate((uintptr_t)ret, len);
+ mutex_exit(&lxpd->l_remap_anoncache_lock);
+
+ return (ret);
+}
+
+long
+lx_mmap(void *addr, size_t len, int prot, int flags, int fd, off64_t off)
+{
+ return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd, off));
+}
+
+long
+lx_mmap2(void *addr, size_t len, int prot, int flags,
+ int fd, off_t off)
+{
+ return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd,
+ (off64_t)off * PAGESIZE));
+}
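+
+/*
+ * A sketch of why mmap2() exists: with 4K pages, a 32-bit caller issuing
+ * mmap2(NULL, len, prot, flags, fd, 0x100000) maps from byte offset
+ * 0x100000 * 4096 = 4GB, a position unreachable through the byte-granular
+ * 32-bit mmap() offset argument.
+ */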
+
+long
+lx_munmap(void *addr, size_t len)
+{
+ lx_proc_data_t *lxpd = ptolxproc(curproc);
+
+ /*
+ * Invalidate any cached anonymous regions that overlap(ped) with it.
+ */
+ mutex_enter(&lxpd->l_remap_anoncache_lock);
+ lx_remap_anoncache_invalidate((uintptr_t)addr, len);
+ mutex_exit(&lxpd->l_remap_anoncache_lock);
+
+ return (munmap(addr, len));
+}
+
+#define LX_MREMAP_MAYMOVE 1 /* mapping can be moved */
+#define LX_MREMAP_FIXED 2 /* address is fixed */
+
+/*
+ * Unfortunately, the Linux mremap() manpage contains a statement that is, at
+ * best, grossly oversimplified: that mremap() "can be used to implement a
+ * very efficient realloc(3)." To the degree this is true at all, it is only
+ * true narrowly (namely, when large buffers are being expanded but can't be
+ * expanded in place due to virtual address space restrictions) -- but
+ * apparently, someone took this very literally, because variants of glibc
+ * appear to simply implement realloc() in terms of mremap(). This is
+ * unfortunate because absent intelligent usage, it forces realloc() to have
+ * an unnecessary interaction with the VM system for small expansions -- and if
+ * realloc() is itself abused (e.g., if a consumer repeatedly expands and
+ * contracts the same memory buffer), the net result can be less efficient
+ * than a much more naive realloc() implementation. And if native Linux is
+ * suboptimal in this case, we are deeply pathological, having not
+ * historically supported mremap() for anonymous mappings at all. To make
+ * this at least palatable, we not only support remap for anonymous mappings
+ * (see lx_remap_anon(), below), we also cache the metadata associated with
+ * these anonymous remappings to reduce the need to search our address space.
+ * We implement the anonymous metadata cache with l_remap_anoncache, an LRU
+ * cache of lx_segmap_t's that correspond to anonymous segments that have been
+ * resized (only anonymous mappings that have been remapped are cached). The
+ * cache is part of the process's lx-brand-specific data.
+ */
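+
+/*
+ * A sketch of the glibc pattern described above (hypothetical sizes):
+ *
+ *	p = malloc(8 << 20);
+ *	p = realloc(p, 16 << 20);
+ *
+ * where the realloc() may become mremap(old, 8M, 16M, MREMAP_MAYMOVE)
+ * under the covers -- which is why repeated grow/shrink cycles of one
+ * buffer land on this path so often.
+ */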
+
+/*
+ * Search our address space (as) mappings to find the specified mapping. This
+ * is derived from the procfs prgetmap() code. We implement the "reserved"
+ * behavior on the segment so as to accommodate the case where an mmap()'d and
+ * then ftruncate()'d file is being mremap()'d: we use the size of the
+ * mapping itself (which we need in order to validate old_size).
+ *
+ * Return 0 if the mapping is found, or an errno if there is a problem or the
+ * mapping is not found. On success, we populate the mp, vpp and offp
+ * parameters with the results.
+ */
+static int
+lx_get_mapping(uintptr_t find_addr, size_t find_size, lx_segmap_t *mp,
+ vnode_t **vpp, offset_t *offp)
+{
+ struct as *as = curproc->p_as;
+ struct seg *seg;
+ uint_t prot;
+ caddr_t saddr, eaddr, naddr;
+
+ /* pr_getprot asserts that the as is held as a writer */
+ AS_LOCK_ENTER(as, RW_WRITER);
+
+ seg = as_segat(as, (caddr_t)find_addr);
+ if (seg == NULL || (seg->s_flags & S_HOLE) != 0) {
+ AS_LOCK_EXIT(as);
+ return (EFAULT);
+ }
+
+ /*
+ * We're interested in the reserved space, so we use the size of the
+ * segment itself.
+ */
+ eaddr = seg->s_base + seg->s_size;
+ for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
+ uintptr_t vaddr;
+ size_t size;
+ struct vnode *vp;
+ void *tmp = NULL;
+
+ prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
+ if (saddr == naddr)
+ continue;
+
+ vaddr = (uintptr_t)saddr;
+ size = (uintptr_t)naddr - (uintptr_t)saddr;
+
+ if (vaddr == find_addr && find_size < size &&
+ (find_size & PAGEOFFSET) != 0) {
+ /*
+ * We found a mapping but the size being requested is
+ * less than the mapping and not a multiple of our page
+ * size. If it is an anonymous mapping, that likely
+ * means the application did the initial mmap with this
+ * odd size. We'll round up to the next page boundary
+ * in this case.
+ */
+ if (seg->s_ops == &segspt_shmops ||
+ (seg->s_ops == &segvn_ops &&
+ (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
+ vp == NULL))) {
+ /*
+ * It's anonymous, round up the size.
+ */
+ find_size = ptob(btopr(find_size));
+ }
+ }
+
+ /* Check if mapping matches our arguments */
+ if (vaddr == find_addr && size == find_size) {
+ struct vattr vattr;
+
+ mp->lxsm_vaddr = vaddr;
+ mp->lxsm_size = size;
+ mp->lxsm_flags = 0;
+
+ *offp = SEGOP_GETOFFSET(seg, saddr);
+
+ if (prot & PROT_READ)
+ mp->lxsm_flags |= LX_SM_READ;
+ if (prot & PROT_WRITE)
+ mp->lxsm_flags |= LX_SM_WRITE;
+ if (prot & PROT_EXEC)
+ mp->lxsm_flags |= LX_SM_EXEC;
+ if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
+ mp->lxsm_flags |= LX_SM_SHARED;
+ if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
+ mp->lxsm_flags |= LX_SM_NORESERVE;
+ if (seg->s_ops == &segspt_shmops ||
+ (seg->s_ops == &segvn_ops &&
+ (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
+ vp == NULL)))
+ mp->lxsm_flags |= LX_SM_ANON;
+
+ if (seg->s_ops == &segspt_shmops) {
+ mp->lxsm_flags |= LX_SM_SHM;
+ } else if ((mp->lxsm_flags & LX_SM_SHARED) &&
+ curproc->p_segacct && shmgetid(curproc,
+ seg->s_base) != SHMID_NONE) {
+ mp->lxsm_flags |= LX_SM_SHM;
+ }
+
+ vattr.va_mask = AT_FSID | AT_NODEID;
+ if (seg->s_ops == &segvn_ops &&
+ SEGOP_GETVP(seg, saddr, &vp) == 0 &&
+ vp != NULL && vp->v_type == VREG &&
+ VOP_GETATTR(vp, &vattr, 0, CRED(),
+ NULL) == 0) {
+ VN_HOLD(vp);
+ *vpp = vp;
+ } else {
+ *vpp = NULL;
+ }
+
+ AS_LOCK_EXIT(as);
+ return (0);
+ }
+
+ if (vaddr <= find_addr &&
+ find_addr + find_size < vaddr + size) {
+ /*
+ * We have a mismatch, but our specified range is a
+ * subset of the actual segment; this is EINVAL.
+ */
+ AS_LOCK_EXIT(as);
+ DTRACE_PROBE2(lx__mremap__badsubset, caddr_t,
+ vaddr, size_t, size);
+ return (EINVAL);
+ }
+ }
+
+ AS_LOCK_EXIT(as);
+ return (EFAULT);
+}
+
+static void
+lx_remap_anoncache_invalidate(uintptr_t addr, size_t size)
+{
+ lx_proc_data_t *lxpd = ptolxproc(curproc);
+ uint_t i;
+
+ ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock));
+
+ if (lxpd->l_remap_anoncache_generation == 0)
+ return;
+
+ for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+ lx_segmap_t *map = &lxpd->l_remap_anoncache[i];
+
+ /*
+ * If the ranges overlap at all, we zap it.
+ */
+ if (addr < map->lxsm_vaddr + map->lxsm_size &&
+ map->lxsm_vaddr < addr + size) {
+ bzero(map, sizeof (lx_segmap_t));
+ }
+ }
+}
+
+static void
+lx_remap_anoncache_load(lx_segmap_t *map, size_t size)
+{
+ lx_proc_data_t *lxpd = ptolxproc(curproc);
+ uint64_t oldest = UINT64_MAX;
+ lx_segmap_t *evict = NULL;
+ uint_t i;
+
+ ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock));
+
+ for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+ lx_segmap_t *cp = &lxpd->l_remap_anoncache[i];
+
+ if (cp->lxsm_vaddr == map->lxsm_vaddr) {
+ /*
+ * We're already in the cache -- we just need to update
+ * our LRU field and size to reflect the hit.
+ */
+ cp->lxsm_lru = lxpd->l_remap_anoncache_generation++;
+ cp->lxsm_size = size;
+ return;
+ }
+
+ if (cp->lxsm_vaddr == 0) {
+ evict = cp;
+ break;
+ }
+
+ if (cp->lxsm_lru < oldest) {
+ oldest = cp->lxsm_lru;
+ evict = cp;
+ }
+ }
+
+ /* Update the entry we're evicting */
+ ASSERT(evict != NULL);
+ evict->lxsm_vaddr = map->lxsm_vaddr;
+ evict->lxsm_size = size;
+ evict->lxsm_flags = map->lxsm_flags;
+ evict->lxsm_lru = lxpd->l_remap_anoncache_generation++;
+}
+
+static int lx_u2u_copy(void *, void *, size_t);
+
+/*
+ * As part of lx_remap() (see below) and to accommodate heavy realloc() use
+ * cases (see the discussion of the l_remap_anoncache, above), we allow
+ * anonymous segments to be "remapped" in that we are willing to truncate them
+ * or append to them (as much as that's allowed by virtual address space
+ * usage). If we fall out of these cases, we take the more expensive option
+ * of actually copying the data to a new segment -- but we locate the address
+ * in a portion of the address space that should give us plenty of VA space to
+ * expand.
+ *
+ * We return the address of the mapping or set errno if there is a problem.
+ */
+static long
+lx_remap_anon(lx_segmap_t *mapin, size_t new_size, uint_t flags,
+ uintptr_t new_addr)
+{
+ lx_segmap_t m;
+ int mflags = MAP_ANON;
+ int prot = 0;
+ void *addr, *hint = NULL;
+
+ ASSERT(MUTEX_HELD(&ptolxproc(curproc)->l_remap_anoncache_lock));
+
+ /*
+ * Make a copy of the input lx_segmap_t argument since it might be
+ * a reference into the anon cache, and we're manipulating cache
+ * entries during this function.
+ */
+ m = *mapin;
+
+ /*
+ * If our new size is less than our old size and we're either not
+ * being ordered to move it or the address we're being ordered to
+ * move it to is our current address, we can just act as Procrustes
+ * and chop off anything larger than the new size.
+ */
+ if (new_size < m.lxsm_size && (!(flags & LX_MREMAP_FIXED) ||
+ new_addr == m.lxsm_vaddr)) {
+ if (munmap((void *)(m.lxsm_vaddr + new_size),
+ m.lxsm_size - new_size) != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ lx_remap_anoncache_load(&m, new_size);
+ return (m.lxsm_vaddr);
+ }
+
+ if (m.lxsm_flags & LX_SM_SHM)
+ return (set_errno(EINVAL));
+
+ if (m.lxsm_flags & LX_SM_WRITE)
+ prot |= PROT_WRITE;
+
+ if (m.lxsm_flags & LX_SM_READ)
+ prot |= PROT_READ;
+
+ if (m.lxsm_flags & LX_SM_EXEC)
+ prot |= PROT_EXEC;
+
+ mflags |= (m.lxsm_flags & LX_SM_SHARED) ? MAP_SHARED : MAP_PRIVATE;
+
+ if (m.lxsm_flags & LX_SM_NORESERVE)
+ mflags |= MAP_NORESERVE;
+
+ /*
+ * If we're not being told where to move it, let's try to expand our
+ * mapping in place by adding a fixed mapping after it.
+ */
+ if (!(flags & LX_MREMAP_FIXED)) {
+ void *tmp_addr = (void *)(m.lxsm_vaddr + m.lxsm_size);
+
+ ASSERT(new_size > m.lxsm_size);
+ addr = smmap64(tmp_addr, new_size - m.lxsm_size, prot,
+ mflags, -1, 0);
+ if (ttolwp(curthread)->lwp_errno != 0) {
+ /* There is no place to mmap some extra anon */
+ return (set_errno(EINVAL));
+ }
+
+ if (addr == tmp_addr) {
+ /* The expansion worked */
+ lx_remap_anoncache_load(&m, new_size);
+ return (m.lxsm_vaddr);
+ }
+
+ /*
+ * Our advisory address was not followed -- which, as a
+ * practical matter, means that the range conflicted with an
+ * extant mapping. Unmap wherever our attempted expansion
+ * landed, and drop into the relocation case.
+ */
+ (void) munmap(addr, new_size - m.lxsm_size);
+ }
+
+ lx_remap_anoncache_invalidate(m.lxsm_vaddr, m.lxsm_size);
+
+ /*
+ * If we're here, we actually need to move this mapping -- so if we
+ * can't move it, we're done.
+ */
+ if (!(flags & LX_MREMAP_MAYMOVE))
+ return (set_errno(ENOMEM));
+
+ /*
+	 * If this is a shared anonymous mapping, we can't relocate it.
+ */
+ if (m.lxsm_flags & LX_SM_SHARED)
+ return (set_errno(EINVAL));
+
+ if (flags & LX_MREMAP_FIXED) {
+ mflags |= MAP_FIXED;
+ hint = (void *)new_addr;
+ } else {
+ /*
+ * Search our address space for a gap to remap into. To give
+ * ourselves plenty of room for further mremap() expansion,
+ * we'll multiply our new size by 16 and look for a gap at
+ * least that big. Historically we looked for an empty gap
+ * around the 2GB region, so we start our search for the lowest
+ * gap in that vicinity.
+ */
+ caddr_t base;
+ size_t upper;
+
+ base = (caddr_t)ONE_GB;
+ upper = (uintptr_t)USERLIMIT - (uintptr_t)base;
+
+ if (as_gap(curproc->p_as, (new_size << 4UL), &base, &upper,
+ AH_LO, NULL) != -1)
+ hint = base;
+ }
+
+ addr = smmap64(hint, new_size, prot, mflags, -1, 0);
+ if (ttolwp(curthread)->lwp_errno != 0) {
+ return (ttolwp(curthread)->lwp_errno);
+ }
+
+ if (lx_u2u_copy((void *)m.lxsm_vaddr, addr, m.lxsm_size) != 0) {
+ /* We couldn't complete the relocation, backout & fail */
+ (void) munmap(addr, new_size);
+ return (set_errno(ENOMEM));
+ }
+
+ (void) munmap((void *)m.lxsm_vaddr, m.lxsm_size);
+
+ /*
+ * Add the relocated mapping to the cache.
+ */
+ m.lxsm_vaddr = (uintptr_t)addr;
+ lx_remap_anoncache_load(&m, new_size);
+
+ return ((long)addr);
+}
+
+/*
+ * We don't have a native mremap() (nor do we particularly want one), so
+ * we emulate it strictly in lx. The idea is simple: we just want to
+ * mmap() the underlying object with the new size and rip down the old mapping.
+ * However, this is slightly complicated because we don't actually have the
+ * file descriptor that corresponds to the resized mapping. So to get a file
+ * descriptor, we may have to search our address space for the mapping and use
+ * the associated vnode to create a file descriptor. Assuming that this
+ * succeeds, we then mmap() it and rip down the original mapping. There are
+ * clearly many reasons why this might fail; absent a more apt errno (e.g.,
+ * ENOMEM in some cases), we return EINVAL to denote these cases.
+ */
+long
+lx_mremap(uintptr_t old_addr, size_t old_size, size_t new_size, int flags,
+ uintptr_t new_addr)
+{
+ int prot = 0, oflags, mflags = 0, i, res;
+ lx_segmap_t map, *mp;
+ int rval = 0;
+ lx_proc_data_t *lxpd;
+ offset_t off;
+ struct vnode *vp = NULL;
+ file_t *fp;
+ caddr_t naddr;
+
+ if (flags & LX_MREMAP_FIXED) {
+ /* MREMAP_FIXED requires MREMAP_MAYMOVE */
+ if ((flags & LX_MREMAP_MAYMOVE) == 0)
+ return (set_errno(EINVAL));
+
+ if (new_addr & PAGEOFFSET)
+ return (set_errno(EINVAL));
+
+ mflags |= MAP_FIXED;
+ } else {
+ if (new_size == old_size)
+ return (old_addr);
+
+ /* new_addr is optional and only valid when LX_MREMAP_FIXED. */
+ new_addr = NULL;
+ }
+
+ if (old_addr & PAGEOFFSET)
+ return (set_errno(EINVAL));
+
+ if (new_size == 0)
+ return (set_errno(EINVAL));
+
+ /*
+ * First consult the anoncache; if we find the segment there, we'll
+ * drop straight into lx_remap_anon() and save ourself the pain of
+ * searching our address space.
+ */
+ lxpd = ptolxproc(curproc);
+ mutex_enter(&lxpd->l_remap_anoncache_lock);
+
+ for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+ long rv;
+
+ mp = &lxpd->l_remap_anoncache[i];
+
+ if (mp->lxsm_vaddr != old_addr)
+ continue;
+
+ if (mp->lxsm_size != old_size)
+ continue;
+
+ /*
+ * lx_remap_anon will either:
+ * a) expand/contract in place, returning old_addr
+ * b) relocate & expand the mapping, returning a new address
+ * c) there will be an error of some sort and errno will be set
+ */
+ rv = lx_remap_anon(mp, new_size, flags, new_addr);
+ mutex_exit(&lxpd->l_remap_anoncache_lock);
+ return (rv);
+ }
+
+ mutex_exit(&lxpd->l_remap_anoncache_lock);
+
+ /*
+ * Search our address space to find the specified mapping.
+ */
+ if ((res = lx_get_mapping(old_addr, old_size, &map, &vp, &off)) > 0)
+ return (set_errno(res));
+
+ /*
+ * We found the mapping.
+ */
+ mp = &map;
+ DTRACE_PROBE1(lx__mremap__seg, lx_segmap_t *, mp);
+
+ if (mp->lxsm_flags & LX_SM_SHM) {
+ /*
+ * If this is either ISM or System V shared memory, we're not
+ * going to remap it.
+ */
+ rval = set_errno(EINVAL);
+ goto out;
+ }
+
+ if (mp->lxsm_flags & LX_SM_ANON) {
+ /*
+ * This is an anonymous mapping -- which is the one case in
+ * which we perform something that approaches a true remap.
+ */
+ long rv;
+
+ if (vp != NULL)
+ VN_RELE(vp);
+ mutex_enter(&lxpd->l_remap_anoncache_lock);
+ rv = lx_remap_anon(mp, new_size, flags, new_addr);
+ mutex_exit(&lxpd->l_remap_anoncache_lock);
+ return (rv);
+ }
+
+ /* The rest of the code is for a 'named' mapping */
+
+ if (!(flags & LX_MREMAP_MAYMOVE)) {
+ /*
+ * If we're not allowed to move this mapping, we're going to
+ * act as if we can't expand it.
+ */
+ rval = set_errno(ENOMEM);
+ goto out;
+ }
+
+ if (!(mp->lxsm_flags & LX_SM_SHARED)) {
+ /*
+ * If this is a private mapping, we're not going to remap it.
+ */
+ rval = set_errno(EINVAL);
+ goto out;
+ }
+
+ oflags = (mp->lxsm_flags & LX_SM_WRITE) ? (FWRITE | FREAD) : FREAD;
+ if (vp == NULL) {
+ /*
+ * If vp is NULL, the path might not exist. We're going to kick
+ * it back with EINVAL.
+ */
+ rval = set_errno(EINVAL);
+ goto out;
+ }
+
+ /* falloc cannot fail with a NULL fdp. */
+ VERIFY0(falloc(vp, oflags, &fp, NULL));
+ mutex_exit(&fp->f_tlock);
+
+ if (mp->lxsm_flags & LX_SM_WRITE)
+ prot |= PROT_WRITE;
+
+ if (mp->lxsm_flags & LX_SM_READ)
+ prot |= PROT_READ;
+
+ if (mp->lxsm_flags & LX_SM_EXEC)
+ prot |= PROT_EXEC;
+
+ mflags |= MAP_SHARED;
+
+ /*
+ * We're using smmap_common to pass the fp directly, instead of
+ * initializing a temporary file descriptor for smmap64(), so as to
+ * prevent any inadvertent use of that temporary fd within the
+ * application.
+ */
+ naddr = (caddr_t)new_addr;
+ rval = smmap_common(&naddr, new_size, prot, mflags, fp, off);
+
+ mutex_enter(&fp->f_tlock);
+ unfalloc(fp);
+
+ if (rval != 0) {
+ rval = set_errno(ENOMEM);
+ goto out;
+ }
+
+ /*
+ * Our mapping succeeded; we're now going to rip down the old mapping.
+ */
+ (void) munmap((void *)old_addr, old_size);
+
+out:
+ if (vp != NULL)
+ VN_RELE(vp);
+
+ if (rval == 0)
+ return ((long)naddr);
+ return ((long)rval);
+}
+
+#pragma GCC diagnostic ignored "-Wclobbered"
+/*
+ * During mremap we had to relocate the initial anonymous mapping to a new
+ * location (a new anonymous mapping). Copy the user-level data from the first
+ * mapping to the second mapping.
+ *
+ * We have to lock both sides to ensure there is no fault. We do this in 16MB
+ * chunks and do not concern ourselves with the zone's
+ * max-locked-memory rctl.
+ *
+ * Keep this function at the end since we're disabling the compiler's "clobber"
+ * check due to the on_fault call.
+ */
+static int
+lx_u2u_copy(void *src, void *dst, size_t len)
+{
+ size_t mlen;
+ caddr_t sp, dp;
+ int err;
+ page_t **ppa_src, **ppa_dst;
+ label_t ljb;
+ struct as *p_as = curproc->p_as;
+
+ /* Both sides should be page aligned since they're from smmap64 */
+ ASSERT(((uintptr_t)src & PAGEOFFSET) == 0);
+ ASSERT(((uintptr_t)dst & PAGEOFFSET) == 0);
+ /* Both came from mmap, so they should be valid user pointers */
+ ASSERT((uintptr_t)src < USERLIMIT && (uintptr_t)dst < USERLIMIT);
+
+ sp = src;
+ dp = dst;
+
+ do {
+ mlen = MIN(len, 16 * 1024 * 1024);
+
+ err = as_pagelock(p_as, &ppa_src, sp, mlen, S_READ);
+ if (err != 0) {
+ return (err);
+ }
+ err = as_pagelock(p_as, &ppa_dst, dp, mlen, S_WRITE);
+ if (err != 0) {
+ as_pageunlock(p_as, ppa_src, sp, mlen, S_READ);
+ return (err);
+ }
+
+ DTRACE_PROBE3(lx__mremap__copy, void *, sp, void *, dp,
+ size_t, mlen);
+
+ /* on_fault calls smap_disable */
+ if (on_fault(&ljb)) {
+ /*
+ * Given that the pages are locked and smap is disabled,
+ * we really should never get here. If we somehow do
+ * get here, the copy fails just as if we could not
+ * lock the pages to begin with.
+ */
+ as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE);
+ as_pageunlock(p_as, ppa_src, sp, mlen, S_READ);
+ return (EFAULT);
+ }
+ ucopy(sp, dp, mlen);
+ no_fault(); /* calls smap_enable */
+
+ as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE);
+ as_pageunlock(p_as, ppa_src, sp, mlen, S_READ);
+
+ len -= mlen;
+ sp += mlen;
+ dp += mlen;
+ } while (len > 0);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c
new file mode 100644
index 0000000000..5245b32870
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c
@@ -0,0 +1,495 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/systeminfo.h>
+#include <sys/fcntl.h>
+#include <sys/resource.h>
+#include <sys/uadmin.h>
+#include <sys/lx_misc.h>
+#include <lx_syscall.h>
+
+#define LINUX_REBOOT_MAGIC1 0xfee1dead
+#define LINUX_REBOOT_MAGIC2 672274793
+#define LINUX_REBOOT_MAGIC2A 85072278
+#define LINUX_REBOOT_MAGIC2B 369367448
+#define LINUX_REBOOT_MAGIC2C 537993216
+
+#define LINUX_REBOOT_CMD_RESTART 0x1234567
+#define LINUX_REBOOT_CMD_HALT 0xcdef0123
+#define LINUX_REBOOT_CMD_CAD_ON 0x89abcdef
+#define LINUX_REBOOT_CMD_CAD_OFF 0
+#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc
+#define LINUX_REBOOT_CMD_RESTART2 0xa1b2c3d4
+#define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2
+#define LINUX_REBOOT_CMD_KEXEC 0x45584543
+
+#define LX_RUSAGE_SELF 0
+#define LX_RUSAGE_CHILDREN (-1)
+#define LX_RUSAGE_BOTH (-2)
+#define LX_RUSAGE_THREAD 1
+
+#define LX_SWAP_PRIOMASK 0x7fff
+#define LX_SWAP_PREFER 0x8000
+#define LX_SWAP_DISCARD 0x10000
+#define LX_SWAP_DISCARD_ONCE 0x20000
+#define LX_SWAP_DISCARD_PAGES 0x40000
+
+#define LX_SWAP_ALL (LX_SWAP_DISCARD_PAGES | \
+ LX_SWAP_DISCARD_ONCE | \
+ LX_SWAP_DISCARD | \
+ LX_SWAP_PREFER | LX_SWAP_PRIOMASK)
+
+/* From uts/common/fs/vfs.c */
+extern void vfs_sync(int);
+/* From uts/common/os/grow.c */
+extern int mincore(caddr_t, size_t, char *);
+extern int munmap(caddr_t, size_t);
+/* From uts/common/os/session.c */
+extern int vhangup();
+/* From uts/common/syscall/alarm.c */
+extern int alarm(int);
+/* From uts/common/syscall/chdir.c */
+extern int chdir(char *);
+extern int chroot(char *);
+extern int fchdir(int);
+/* From uts/common/syscall/nice.c */
+extern int nice(int);
+/* From uts/common/syscall/open.c */
+extern int open(char *, int, int);
+/* From uts/common/syscall/pause.c */
+extern int pause();
+/* From uts/common/syscall/rusagesys.c */
+extern int rusagesys(int, void *, void *, void *, void *);
+/* From uts/common/syscall/systeminfo.c */
+extern long systeminfo(int, char *, long);
+/* From uts/common/syscall/timers.c */
+extern int getitimer(uint_t, struct itimerval *);
+/* From uts/common/syscall/time.c */
+extern int stime(time_t);
+/* From uts/common/syscall/uadmin.c */
+extern int uadmin(int, int, uintptr_t);
+/* From uts/common/syscall/chdir.c */
+extern int chdir_proc(proc_t *, vnode_t *, boolean_t, boolean_t);
+/* From uts/common/fs/lookup.c */
+extern int lookupname(char *, enum uio_seg, int, vnode_t **, vnode_t **);
+/* From uts/common/fs/fs_subr.c */
+extern int fs_need_estale_retry(int);
+/* From uts/common/os/acct.c */
+extern int sysacct(char *);
+
+/* The callback arguments when handling a FS clone group. */
+typedef struct {
+ vnode_t *lcfa_vp;
+ boolean_t lcfa_type;
+ boolean_t lcfa_traverse;
+} lx_clone_fs_arg_t;
+
+long
+lx_alarm(int seconds)
+{
+ return (alarm(seconds));
+}
+
+static int
+lx_clone_fs_cb(proc_t *pp, void *arg)
+{
+ lx_clone_fs_arg_t *ap = (lx_clone_fs_arg_t *)arg;
+ int err;
+
+ /*
+ * Either:
+ * A) The initial lookupname() from lx_clone_fs_do_group() will have
+ * added a hold on the vnode to ensure its existence throughout the
+ * walk.
+ * B) We added a hold in fchdir.
+ * We need to add another hold for each process in the group.
+ */
+ VN_HOLD(ap->lcfa_vp);
+ if ((err = chdir_proc(pp, ap->lcfa_vp, ap->lcfa_type,
+ ap->lcfa_traverse)) != 0) {
+ /* if we failed, chdir_proc already did a rele on vp */
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Check to see if the process is in a CLONE_FS clone group. Return false
+ * if not (the normal case), otherwise perform the setup, do the group walk
+ * and return true.
+ */
+static boolean_t
+lx_clone_fs_do_group(char *path, boolean_t is_chroot, int *errp)
+{
+ lx_proc_data_t *lproc = ttolxproc(curthread);
+ vnode_t *vp;
+ lx_clone_fs_arg_t arg;
+ int err;
+ int estale_retry = 0;
+
+ if (!lx_clone_grp_member(lproc, LX_CLONE_FS))
+ return (B_FALSE);
+
+ /* Handle the rare case of being in a CLONE_FS clone group */
+
+retry:
+ err = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+ if (err != 0) {
+ if (err == ESTALE && fs_need_estale_retry(estale_retry++))
+ goto retry;
+ *errp = err;
+ return (B_TRUE);
+ }
+
+ arg.lcfa_vp = vp;
+ arg.lcfa_type = is_chroot;
+ arg.lcfa_traverse = B_TRUE;
+
+ /*
+ * We use the VN_HOLD from the lookup to guarantee vp exists for the
+ * entire walk.
+ */
+ err = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_fs_cb,
+ (void *)&arg);
+ VN_RELE(vp);
+ *errp = err;
+ return (B_TRUE);
+}
+
+long
+lx_chdir(char *path)
+{
+ int err;
+
+ /* Handle the rare case of being in a CLONE_FS clone group */
+ if (lx_clone_fs_do_group(path, B_FALSE, &err))
+ return ((err != 0) ? set_errno(err) : 0);
+
+ return (chdir(path));
+}
+
+long
+lx_chroot(char *path)
+{
+ int err;
+
+ /* Handle the rare case of being in a CLONE_FS clone group */
+ if (lx_clone_fs_do_group(path, B_TRUE, &err))
+ return ((err != 0) ? set_errno(err) : 0);
+
+ return (chroot(path));
+}
+
+long
+lx_creat(char *path, mode_t mode)
+{
+ return (open(path, O_WRONLY | O_CREAT | O_TRUNC, mode));
+}
+
+long
+lx_fchdir(int fd)
+{
+ lx_proc_data_t *lproc = ttolxproc(curthread);
+
+ if (lx_clone_grp_member(lproc, LX_CLONE_FS)) {
+ /* Handle the rare case of being in a CLONE_FS clone group */
+ file_t *fp;
+ vnode_t *vp;
+ lx_clone_fs_arg_t arg;
+ int err;
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+ vp = fp->f_vnode;
+ VN_HOLD(vp);
+ releasef(fd);
+
+ arg.lcfa_vp = vp;
+ arg.lcfa_type = B_FALSE;
+ arg.lcfa_traverse = B_FALSE;
+
+ /*
+ * We use the VN_HOLD above to guarantee vp exists for the
+ * entire walk.
+ */
+ err = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_fs_cb,
+ (void *)&arg);
+ VN_RELE(vp);
+ if (err)
+ return (set_errno(err));
+ return (0);
+ }
+
+ return (fchdir(fd));
+}
+
+long
+lx_getitimer(int which, struct itimerval *value)
+{
+ return (getitimer(which, value));
+}
+
+/* Linux and illumos have the same rusage structures. */
+long
+lx_getrusage(int who, struct rusage *rup)
+{
+ int code;
+
+ switch (who) {
+ case LX_RUSAGE_SELF:
+ code = _RUSAGESYS_GETRUSAGE;
+ break;
+ case LX_RUSAGE_CHILDREN:
+ code = _RUSAGESYS_GETRUSAGE_CHLD;
+ break;
+ case LX_RUSAGE_THREAD:
+ code = _RUSAGESYS_GETRUSAGE_LWP;
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ return (rusagesys(code, rup, NULL, NULL, NULL));
+}
+
+long
+lx_mincore(caddr_t addr, size_t len, char *vec)
+{
+ int r;
+
+ r = mincore(addr, len, vec);
+ if (r == EINVAL) {
+ /*
+ * LTP mincore01 expects mincore with a huge len to fail with
+ * ENOMEM on a modern kernel, although on Linux 2.6.11 and
+ * earlier, it will return EINVAL.
+ */
+ if (lx_kern_release_cmp(curzone, "2.6.11") > 0 && (long)len < 0)
+ return (set_errno(ENOMEM));
+ }
+ return (r);
+}
+
+long
+lx_nice(int incr)
+{
+ return (nice(incr));
+}
+
+long
+lx_pause(void)
+{
+ return (pause());
+}
+
+/*ARGSUSED*/
+long
+lx_reboot(int magic1, int magic2, uint_t flag, uintptr_t p4)
+{
+ if (magic1 != LINUX_REBOOT_MAGIC1)
+ return (set_errno(EINVAL));
+
+ switch (magic2) {
+ case LINUX_REBOOT_MAGIC2:
+ case LINUX_REBOOT_MAGIC2A:
+ case LINUX_REBOOT_MAGIC2B:
+ case LINUX_REBOOT_MAGIC2C:
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Once we have better Linux capabilities(7) support we should check
+ * CAP_SYS_BOOT instead.
+ */
+ if (crgetuid(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ switch (flag) {
+ case LINUX_REBOOT_CMD_CAD_ON:
+ case LINUX_REBOOT_CMD_CAD_OFF:
+ /* ignored */
+ return (0);
+
+ case LINUX_REBOOT_CMD_POWER_OFF:
+ case LINUX_REBOOT_CMD_HALT:
+ return (uadmin(A_SHUTDOWN, AD_HALT, NULL));
+
+ case LINUX_REBOOT_CMD_RESTART:
+ case LINUX_REBOOT_CMD_RESTART2:
+ /* RESTART2 may need more work */
+ return (uadmin(A_SHUTDOWN, AD_BOOT, NULL));
+
+ default:
+ return (set_errno(EINVAL));
+ }
+}
+
+long
+lx_setdomainname(char *name, long len)
+{
+ if (len < 0 || len >= LX_SYS_UTS_LN)
+ return (set_errno(EINVAL));
+
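+	/*
+	 * systeminfo() reports failure via set_errno() (i.e. lwp_errno)
+	 * rather than through its return value, so clear lwp_errno and
+	 * re-check it around the call; lx_sethostname() below uses the
+	 * same pattern.
+	 */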
+ ttolwp(curthread)->lwp_errno = 0;
+ (void) systeminfo(SI_SET_SRPC_DOMAIN, name, len);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ return (ttolwp(curthread)->lwp_errno);
+ return (0);
+}
+
+long
+lx_sethostname(char *name, size_t len)
+{
+ ttolwp(curthread)->lwp_errno = 0;
+ (void) systeminfo(SI_SET_HOSTNAME, name, len);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ return (ttolwp(curthread)->lwp_errno);
+ return (0);
+}
+
+long
+lx_stime(time_t *tp)
+{
+ time_t time;
+
+ if (copyin(tp, &time, sizeof (time)) != 0)
+ return (set_errno(EFAULT));
+
+ return (stime(time));
+}
+
+long
+lx_sync(void)
+{
+ vfs_sync(0);
+ return (0);
+}
+
+/*
+ * For syslog, since there is no Linux kernel and nothing to log, we simply
+ * emulate a kernel buffer (LOG_BUF_LEN) of 0 bytes and only handle errors for
+ * bad input. All actions except 3 and 10 require CAP_SYS_ADMIN or CAP_SYSLOG;
+ * without full capabilities support, for now we simply perform an euid check.
+ */
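+/*
+ * For reference, the Linux syslog(2) action values validated below are
+ * (names per the Linux man page): 0 CLOSE, 1 OPEN, 2 READ, 3 READ_ALL,
+ * 4 READ_CLEAR, 5 CLEAR, 6 CONSOLE_OFF, 7 CONSOLE_ON, 8 CONSOLE_LEVEL,
+ * 9 SIZE_UNREAD and 10 SIZE_BUFFER.
+ */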
+long
+lx_syslog(int type, char *bufp, int len)
+{
+ if (type < 0 || type > 10)
+ return (set_errno(EINVAL));
+
+ if (type != 3 && type != 10 && crgetuid(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ if (type >= 2 && type <= 4 && (bufp == NULL || len < 0))
+ return (set_errno(EINVAL));
+
+ if (type == 8 && (len < 1 || len > 8))
+ return (set_errno(EINVAL));
+
+ return (0);
+}
+
+long
+lx_vhangup(void)
+{
+ if (crgetuid(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ /*
+ * The native vhangup code does nothing except check for the sys_config
+ * privilege. Eventually we will want to check our emulation of the
+ * Linux CAP_SYS_TTY_CONFIG capability here but, since we have already
+ * checked that our process is root, for now we just succeed.
+ */
+ return (0);
+}
+
+long
+lx_acct(char *p)
+{
+ return (sysacct(p));
+}
+
+/*
+ * Support for Linux namespaces is not yet implemented. Normally we would
+ * simply return ENOSYS for this. However, "systemd" uses mount namespaces to
+ * provide the PrivateTmp feature for some services. Use of this feature is
+ * becoming common and these services will fail to run without namespace
+ * support. "systemd" has a fallback to allow these types of services to run if
+ * it sees either EACCES or EPERM when it tries to set up the namespace. Until
+ * we have namespace support, we return EPERM to work around this issue.
+ */
+/*ARGSUSED*/
+long
+lx_unshare(int flags)
+{
+ return (set_errno(EPERM));
+}
+
+/*
+ * The whole idea of "swap space" within a zone is a complete fabrication.
+ * However, some apps expect to be able to see swap space data in the /proc
+ * files, while other apps actually don't want there to be any swap space
+ * configured. We use the swapon/off syscalls to allow this visibility to be
+ * controlled from within the zone itself. Note that the "swapon" CLI tends to
+ * do a lot of additional validation which will fail within a zone.
+ *
+ * Once we have better Linux capabilities(7) support we should check
+ * CAP_SYS_ADMIN instead of uid == 0.
+ */
+long
+lx_swapoff(char *path)
+{
+ char buf[MAXPATHLEN];
+ size_t len;
+ lx_zone_data_t *lxzd;
+
+ /* Simple validation of the argument */
+ if (copyinstr(path, buf, sizeof (buf), &len) != 0)
+ return (set_errno(EFAULT));
+ if (crgetuid(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ lxzd = ztolxzd(curzone);
+ ASSERT(lxzd != NULL);
+
+ lxzd->lxzd_swap_disabled = B_TRUE;
+ return (0);
+}
+
+long
+lx_swapon(char *path, int flags)
+{
+ char buf[MAXPATHLEN];
+ size_t len;
+ lx_zone_data_t *lxzd;
+
+ /* Simple validation of the arguments */
+ if (copyinstr(path, buf, sizeof (buf), &len) != 0)
+ return (set_errno(EFAULT));
+ if (flags & ~LX_SWAP_ALL)
+ return (set_errno(EINVAL));
+ if (crgetuid(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ lxzd = ztolxzd(curzone);
+ ASSERT(lxzd != NULL);
+
+ lxzd->lxzd_swap_disabled = B_FALSE;
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c
new file mode 100644
index 0000000000..2f29f56d5f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c
@@ -0,0 +1,38 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/fcntl.h>
+#include <sys/lx_fcntl.h>
+
+/*
+ * From "uts/common/syscall/mkdir.c":
+ */
+extern int mkdirat(int, char *, int);
+
+long
+lx_mkdirat(int fd, char *dname, int dmode)
+{
+ if (fd == LX_AT_FDCWD) {
+ fd = AT_FDCWD;
+ }
+
+ return (mkdirat(fd, dname, dmode));
+}
+
+long
+lx_mkdir(char *dname, int dmode)
+{
+ return (mkdirat(AT_FDCWD, dname, dmode));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c
new file mode 100644
index 0000000000..aa6e12a7d8
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c
@@ -0,0 +1,121 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/segments.h>
+#include <sys/archsystm.h>
+#include <sys/proc.h>
+#include <sys/sysi86.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_ldt.h>
+
+/*
+ * Read the ldt_info structure in from the Linux app, convert it to an ssd
+ * structure, and then call setdscr() to do all the heavy lifting.
+ */
+static int
+write_ldt(void *data, ulong_t count)
+{
+ user_desc_t usd;
+ struct ssd ssd;
+ struct ldt_info ldt_inf;
+ proc_t *pp = curthread->t_procp;
+ int err;
+
+ if (count != sizeof (ldt_inf))
+ return (set_errno(EINVAL));
+
+ if (copyin(data, &ldt_inf, sizeof (ldt_inf)))
+ return (set_errno(EFAULT));
+
+ if (ldt_inf.entry_number >= MAXNLDT)
+ return (set_errno(EINVAL));
+
+ LDT_INFO_TO_DESC(&ldt_inf, &usd);
+ usd_to_ssd(&usd, &ssd, SEL_LDT(ldt_inf.entry_number));
+
+ /*
+ * Get everyone into a safe state before changing the LDT.
+ */
+ if (!holdlwps(SHOLDFORK1))
+ return (set_errno(EINTR));
+
+ err = setdscr(&ssd);
+
+ /*
+ * Release the hounds!
+ */
+ mutex_enter(&pp->p_lock);
+ continuelwps(pp);
+ mutex_exit(&pp->p_lock);
+
+ return (err ? set_errno(err) : 0);
+}
+
+static int
+read_ldt(void *uptr, ulong_t count)
+{
+ proc_t *pp = curproc;
+ int bytes;
+
+ if (pp->p_ldt == NULL)
+ return (0);
+
+ bytes = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
+ if (bytes > count)
+ bytes = count;
+
+ if (copyout(pp->p_ldt, uptr, bytes))
+ return (set_errno(EFAULT));
+
+ return (bytes);
+}
+
+long
+lx_modify_ldt(int op, void *data, ulong_t count)
+{
+ int rval;
+
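+	/*
+	 * Linux multiplexes modify_ldt(2) on its first argument: 0 reads
+	 * the LDT and 1 writes an entry. (The newer Linux sub-functions,
+	 * 2 and 0x11, are not emulated and fall into the ENOSYS arm.)
+	 */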
+ switch (op) {
+ case 0:
+ rval = read_ldt(data, count);
+ break;
+
+ case 1:
+ rval = write_ldt(data, count);
+ break;
+
+ default:
+ rval = set_errno(ENOSYS);
+ break;
+ }
+
+ return (rval);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mount.c b/usr/src/uts/common/brand/lx/syscall/lx_mount.c
new file mode 100644
index 0000000000..2524e9044a
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_mount.c
@@ -0,0 +1,675 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/ctype.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/pathname.h>
+#include <sys/types.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_syscalls.h>
+#include <sys/lx_autofs.h>
+
+#define tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x))
+
+/*
+ * mount(2) is significantly different between Linux and illumos. One of the
+ * main differences is between the set of flags. Some flags on Linux can be
+ * translated to an illumos equivalent, some are converted to a
+ * filesystem-specific option, while others have no equivalent whatsoever.
+ *
+ * Another big difference is that mounting NFS is fully handled in the kernel on
+ * Linux whereas on illumos a lot of preliminary work is done by the NFS mount
+ * command before calling mount(2). As a simplification, we forward NFS
+ * mount calls back out to the user-level library which does the same kind of
+ * preliminary processing that is done by the native user-level NFS mount code.
+ */
+#define LX_MS_MGC_VAL 0xC0ED0000
+#define LX_MS_RDONLY 0x00000001
+#define LX_MS_NOSUID 0x00000002
+#define LX_MS_NODEV 0x00000004
+#define LX_MS_NOEXEC 0x00000008
+#define LX_MS_SYNCHRONOUS 0x00000010
+#define LX_MS_REMOUNT 0x00000020
+#define LX_MS_MANDLOCK 0x00000040
+#define LX_MS_NOATIME 0x00000400
+#define LX_MS_NODIRATIME 0x00000800
+#define LX_MS_BIND 0x00001000
+#define LX_MS_MOVE 0x00002000
+#define LX_MS_REC 0x00004000
+#define LX_MS_SILENT 0x00008000
+#define LX_MS_POSIXACL 0x00010000
+#define LX_MS_UNBINDABLE 0x00020000
+#define LX_MS_PRIVATE 0x00040000
+#define LX_MS_SLAVE 0x00080000
+#define LX_MS_SHARED 0x00100000
+#define LX_MS_RELATIME 0x00200000
+#define LX_MS_KERNMOUNT 0x00400000
+#define LX_MS_I_VERSION 0x00800000
+#define LX_MS_STRICTATIME 0x01000000
+#define LX_MS_LAZYTIME 0x02000000
+
+/* Linux kernel-internal flags - ignored if passed in */
+#define LX_MS_NOSEC 0x10000000
+#define LX_MS_BORN 0x20000000
+#define LX_MS_ACTIVE 0x40000000
+#define LX_MS_NOUSER 0x80000000
+
+#define LX_MS_SUPPORTED (LX_MS_MGC_VAL | \
+ LX_MS_RDONLY | LX_MS_NOSUID | \
+ LX_MS_NODEV | LX_MS_NOEXEC | \
+ LX_MS_REMOUNT | LX_MS_NOATIME | \
+ LX_MS_BIND | LX_MS_SILENT | \
+ LX_MS_STRICTATIME | LX_MS_NOSEC | \
+ LX_MS_BORN | LX_MS_ACTIVE | LX_MS_NOUSER)
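+
+/*
+ * As an illustrative example: a Linux call such as
+ * mount("swap", "/tmp", "tmpfs", MS_NOSUID | MS_NODEV, "size=64m") arrives
+ * here with LX_MS_NOSUID and LX_MS_NODEV set. NOSUID converts to the
+ * illumos MS_NOSUID mount flag, NODEV becomes the "nodev" option string,
+ * and "mode=1777" is appended to match the Linux tmpfs default; see
+ * lx_mount() below.
+ */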
+
+/*
+ * support definitions
+ */
+typedef enum mount_opt_type {
+ MOUNT_OPT_INVALID = 0,
+ MOUNT_OPT_NORMAL = 1, /* option value: none */
+ MOUNT_OPT_UINT = 2, /* option value: unsigned int */
+ MOUNT_OPT_PASSTHRU = 3 /* option value: validated downstream */
+} mount_opt_type_t;
+
+typedef struct mount_opt {
+ char *mo_name;
+ mount_opt_type_t mo_type;
+} mount_opt_t;
+
+/* From uts/common/syscall/umount.c */
+extern int umount2(char *, int);
+
+/* From lx_chown.c */
+extern long lx_vn_chown(vnode_t *, uid_t, gid_t);
+
+/*
+ * Globals
+ */
+static mount_opt_t lofs_options[] = {
+ { NULL, MOUNT_OPT_INVALID }
+};
+
+static mount_opt_t lx_proc_options[] = {
+ { NULL, MOUNT_OPT_INVALID }
+};
+
+static mount_opt_t lx_sysfs_options[] = {
+ { NULL, MOUNT_OPT_INVALID }
+};
+
+static mount_opt_t lx_tmpfs_options[] = {
+ { "size", MOUNT_OPT_PASSTHRU },
+ { "mode", MOUNT_OPT_UINT },
+ { "uid", MOUNT_OPT_UINT },
+ { "gid", MOUNT_OPT_UINT },
+ { NULL, MOUNT_OPT_INVALID }
+};
+
+static mount_opt_t lx_autofs_options[] = {
+ { LX_MNTOPT_FD, MOUNT_OPT_UINT },
+ { LX_MNTOPT_PGRP, MOUNT_OPT_UINT },
+ { LX_MNTOPT_MINPROTO, MOUNT_OPT_UINT },
+ { LX_MNTOPT_MAXPROTO, MOUNT_OPT_UINT },
+ { LX_MNTOPT_INDIRECT, MOUNT_OPT_NORMAL },
+ { LX_MNTOPT_DIRECT, MOUNT_OPT_NORMAL },
+ { LX_MNTOPT_OFFSET, MOUNT_OPT_NORMAL },
+ { NULL, MOUNT_OPT_INVALID }
+};
+
+static const char *lx_common_mnt_opts[] = {
+ "exec",
+ "noexec",
+ "devices",
+ "nodevices",
+ "dev",
+ "nodev",
+ "suid",
+ "nosuid",
+ NULL
+};
+
+/*
+ * Check the mount options.
+ *
+ * On illumos all mount option verification is done by the user-level mount
+ * command. Invalid options are simply ignored by domount(). Thus, we check
+ * here for invalid/unsupported options.
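+ *
+ * For example (illustrative strings only): for tmpfs the option string
+ * "size=512m,uid=0,nodev" verifies cleanly, while "size=" (an empty
+ * value) and "quota" (an unrecognized option) are rejected with EINVAL.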
+ */
+static int
+lx_mnt_opt_verify(char *opts, mount_opt_t *mop)
+{
+ int opts_len = strlen(opts);
+ char *opt, *tp;
+ int opt_len, i;
+ boolean_t last = B_FALSE;
+
+ ASSERT((opts != NULL) && (mop != NULL));
+
+ /* If no options were specified, nothing to do. */
+ if (opts_len == 0)
+ return (0);
+
+ /* If no options are allowed, fail. */
+ if (mop[0].mo_name == NULL)
+ return (ENOTSUP);
+
+ /* Don't accept leading or trailing ','. */
+ if ((opts[0] == ',') || (opts[opts_len - 1] == ','))
+ return (EINVAL);
+
+ /* Don't accept sequential ','. */
+ for (i = 1; i < opts_len; i++) {
+ if ((opts[i - 1] == ',') && (opts[i] == ','))
+ return (EINVAL);
+ }
+
+ /*
+ * Verify each prop one at a time. There is no strtok in the kernel but
+ * it's easy to tokenize the entry ourselves.
+ */
+ opt = opts;
+ for (tp = opt; *tp != ',' && *tp != '\0'; tp++)
+ ;
+ if (*tp == ',') {
+ *tp = '\0';
+ } else {
+ last = B_TRUE;
+ }
+ for (;;) {
+ opt_len = strlen(opt);
+
+ /* Check common options we support on all filesystems */
+ for (i = 0; lx_common_mnt_opts[i] != NULL; i++) {
+ if (strcmp(opt, lx_common_mnt_opts[i]) == 0)
+ goto next_opt;
+ }
+
+ /* Check for matching option/value pair. */
+ for (i = 0; mop[i].mo_name != NULL; i++) {
+ char *ovalue;
+ int ovalue_len, mo_len;
+
+ /* If the option is too short, don't bother comparing */
+ mo_len = strlen(mop[i].mo_name);
+ if (opt_len < mo_len) {
+ /* Keep trying to find a match. */
+ continue;
+ }
+
+ /* Compare the option to an allowed option. */
+ if (strncmp(mop[i].mo_name, opt, mo_len) != 0) {
+ /* Keep trying to find a match. */
+ continue;
+ }
+
+ if (mop[i].mo_type == MOUNT_OPT_NORMAL) {
+ /* The option doesn't take a value. */
+ if (opt_len == mo_len) {
+ /* This option is ok. */
+ break;
+ } else {
+ /* Keep trying to find a match. */
+ continue;
+ }
+ }
+
+ /* This option takes a value. */
+ if ((opt_len == mo_len) || (opt[mo_len] != '=')) {
+ /* Keep trying to find a match. */
+ continue;
+ }
+
+ /* We have an option match. Verify option value. */
+ ovalue = &opt[mo_len] + 1;
+ ovalue_len = strlen(ovalue);
+
+ /* Value can't be zero length string. */
+ if (ovalue_len == 0) {
+ goto bad;
+ }
+
+ if (mop[i].mo_type == MOUNT_OPT_UINT) {
+ int j;
+ /* Verify that value is an unsigned int. */
+ for (j = 0; j < ovalue_len; j++) {
+ if (!ISDIGIT(ovalue[j])) {
+ goto bad;
+ }
+ }
+ } else if (mop[i].mo_type == MOUNT_OPT_PASSTHRU) {
+ /* Filesystem will do its own validation. */
+ break;
+ } else {
+ /* Unknown option type specified. */
+ goto bad;
+ }
+
+ /* The option is ok. */
+ break;
+ }
+
+ /* If there were no matches this is an unsupported option. */
+ if (mop[i].mo_name == NULL) {
+ goto bad;
+ }
+
+next_opt:
+ /*
+ * This option is ok, either we're done or move on to the next
+ * option.
+ */
+ if (last)
+ break;
+
+ *tp = ',';
+ opt = tp + 1;
+ for (tp = opt; *tp != ',' && *tp != '\0'; tp++)
+ ;
+ if (*tp == ',') {
+ *tp = '\0';
+ } else {
+ last = B_TRUE;
+ }
+ }
+
+ /* We verified all the options. */
+ return (0);
+
+bad:
+ if (!last) {
+ *tp = ',';
+ }
+ return (EINVAL);
+}
+
+/*
+ * Remove an option from the string and save it in the provided buffer.
+ * The option string should have already been verified as valid.
+ * Return 0 if not present, -1 on error, and 1 if the option was present
+ * and removed.
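+ *
+ * For example (illustrative values): removing "uid=" from
+ * "size=512m,uid=104,gid=4" copies "uid=104" into retstr and leaves
+ * "size=512m,gid=4" behind in opts.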
+ */
+static int
+lx_mnt_opt_rm(char *opts, char *rmopt, char *retstr, int retlen)
+{
+ int opts_len = strlen(opts);
+ char *optstart, *optend;
+ int optlen;
+
+ ASSERT((opts != NULL) && (rmopt != NULL));
+
+ retstr[0] = '\0';
+
+ /* If no options were specified, there's no problem. */
+ if (opts_len == 0)
+ return (0);
+
+ if ((optstart = strstr(opts, rmopt)) == NULL)
+ return (0);
+
+ for (optend = optstart; *optend != ',' && *optend != '\0'; optend++)
+ ;
+
+ /*LINTED*/
+ optlen = optend - optstart;
+ if (optlen >= retlen)
+ return (-1);
+ (void) strncpy(retstr, optstart, optlen);
+ retstr[optlen] = '\0';
+
+ if (*optend == ',')
+ optend++;
+
+ optlen = strlen(optend) + 1;
+ bcopy(optend, optstart, optlen);
+
+ if (*optstart == '\0' && optstart != opts) {
+ /* removed the last opt; it had a preceding opt, so remove the comma */
+ *(optstart - 1) = '\0';
+ }
+
+ return (1);
+}
+
+static int
+lx_mnt_opt_val(char *opt, int *valp)
+{
+ char *op, *ep;
+ long lval;
+
+ if ((op = strchr(opt, '=')) == NULL)
+ return (-1);
+
+ op++;
+ if (!ISDIGIT(*op))
+ return (-1);
+
+ if (ddi_strtoul(op, &ep, 10, (ulong_t *)&lval) != 0 || lval > INT_MAX) {
+ return (-1);
+ }
+
+ if (*ep != '\0')
+ return (-1);
+
+ *valp = (int)lval;
+ return (0);
+}
+
+static int
+lx_mnt_add_opt(char *option, char *buf, size_t buf_size)
+{
+ char *fmt_str = NULL;
+ size_t len;
+
+ ASSERT((option != NULL) && (strlen(option) > 0));
+ ASSERT((buf != NULL) && (buf_size > 0));
+
+ if (buf[0] == '\0') {
+ fmt_str = "%s";
+ } else {
+ fmt_str = ",%s";
+ }
+
+ len = strlen(buf);
+ VERIFY(len <= buf_size);
+ buf_size -= len;
+ buf += len;
+
+ if (snprintf(buf, buf_size, fmt_str, option) > (buf_size - 1))
+ return (EOVERFLOW);
+ return (0);
+}
+
+static int
+lx_mnt_copyin_arg(const char *from, char *to, size_t len)
+{
+ size_t slen;
+ int rv;
+
+ rv = copyinstr(from, to, len, &slen);
+ if (rv == ENAMETOOLONG || slen == len)
+ return (ENAMETOOLONG);
+ if (rv != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+long
+lx_mount(const char *sourcep, const char *targetp, const char *fstypep,
+ uint_t flags, const void *datap)
+{
+ char fstype[16];
+ char source[MAXPATHLEN];
+ char target[MAXPATHLEN];
+ char options[MAX_MNTOPT_STR];
+ int sflags, rv;
+ struct mounta ma, *map = &ma;
+ vfs_t *vfsp;
+ vnode_t *vp = NULL;
+ int uid = -1;
+ int gid = -1;
+
+ if ((rv = lx_mnt_copyin_arg(fstypep, fstype, sizeof (fstype))) != 0) {
+ if (rv == ENAMETOOLONG)
+ return (set_errno(ENODEV));
+ return (set_errno(rv));
+ }
+
+ /*
+ * Vector back out to userland emulation for NFS.
+ */
+ if (strcmp(fstype, "nfs") == 0 || strcmp(fstype, "nfs4") == 0) {
+ uintptr_t uargs[5] = {(uintptr_t)sourcep, (uintptr_t)targetp,
+ (uintptr_t)fstypep, (uintptr_t)flags, (uintptr_t)datap};
+
+ /* The userspace emulation will do the lx_syscall_return() */
+ ttolxlwp(curthread)->br_eosys = JUSTRETURN;
+
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ lx_emulate_user32(ttolwp(curthread), LX_SYS32_mount,
+ uargs);
+ } else
+#endif
+ {
+ lx_emulate_user(ttolwp(curthread), LX_SYS_mount, uargs);
+ }
+ return (0);
+ }
+
+ sflags = MS_SYSSPACE | MS_OPTIONSTR;
+ options[0] = '\0';
+
+ /* Copy in parameters that are always present. */
+ if ((rv = lx_mnt_copyin_arg(sourcep, source, sizeof (source))) != 0)
+ return (set_errno(rv));
+
+ if ((rv = lx_mnt_copyin_arg(targetp, target, sizeof (target))) != 0)
+ return (set_errno(rv));
+
+ /*
+ * While SunOS is picky about mount(2) target paths being absolute,
+ * Linux is not so strict. To accommodate this looser requirement,
+ * we must look up the full path.
+ */
+ if (target[0] != '/') {
+ vnode_t *vp;
+
+ if ((rv = lookupnameatcred(target, UIO_SYSSPACE, FOLLOW,
+ NULLVPP, &vp, NULL, CRED())) != 0)
+ return (set_errno(rv));
+
+ rv = vnodetopath(NULL, vp, target, MAXPATHLEN, CRED());
+ VN_RELE(vp);
+ if (rv != 0)
+ return (set_errno(rv));
+ }
+
+ /* Make sure we support the requested mount flags. */
+ if ((flags & ~LX_MS_SUPPORTED) != 0)
+ return (set_errno(ENOTSUP));
+
+ /* Copy in Linux mount options. */
+ if (datap != NULL &&
+ (rv = lx_mnt_copyin_arg(datap, options, sizeof (options))) != 0)
+ return (set_errno(rv));
+
+ /* Do filesystem specific mount work. */
+ if (flags & LX_MS_BIND) {
+ /* If MS_BIND is set, we turn this into a lofs mount. */
+ (void) strcpy(fstype, "lofs");
+
+ /* Verify Linux mount options. */
+ if ((rv = lx_mnt_opt_verify(options, lofs_options)) != 0)
+ return (set_errno(rv));
+ } else if (strcmp(fstype, "tmpfs") == 0) {
+ char idstr[64];
+
+ /* Verify Linux mount options. */
+ if ((rv = lx_mnt_opt_verify(options, lx_tmpfs_options)) != 0)
+ return (set_errno(rv));
+
+ /*
+ * Linux defaults to mode=1777 for tmpfs mounts.
+ */
+ if (strstr(options, "mode=") == NULL) {
+ if (options[0] != '\0')
+ (void) strlcat(options, ",", sizeof (options));
+ (void) strlcat(options, "mode=1777", sizeof (options));
+ }
+
+ switch (lx_mnt_opt_rm(options, "uid=", idstr, sizeof (idstr))) {
+ case 0:
+ uid = -1;
+ break;
+ case 1:
+ if (lx_mnt_opt_val(idstr, &uid) < 0)
+ return (set_errno(EINVAL));
+ break;
+ default:
+ return (set_errno(E2BIG));
+ }
+ switch (lx_mnt_opt_rm(options, "gid=", idstr, sizeof (idstr))) {
+ case 0:
+ gid = -1;
+ break;
+ case 1:
+ if (lx_mnt_opt_val(idstr, &gid) < 0)
+ return (set_errno(EINVAL));
+ break;
+ default:
+ return (set_errno(E2BIG));
+ }
+
+ /*
+ * Linux seems to always allow overlay mounts. We allow this
+ * everywhere except under /dev where it interferes with device
+ * emulation.
+ */
+ if (strcmp(target, "/dev") != 0 &&
+ strncmp(target, "/dev/", 5) != 0)
+ sflags |= MS_OVERLAY;
+ } else if (strcmp(fstype, "proc") == 0) {
+ /* Translate proc mount requests to lx_proc requests. */
+ (void) strcpy(fstype, "lx_proc");
+
+ /* Verify Linux mount options. */
+ if ((rv = lx_mnt_opt_verify(options, lx_proc_options)) != 0)
+ return (set_errno(rv));
+ } else if (strcmp(fstype, "sysfs") == 0) {
+ /* Translate sysfs mount requests to lx_sysfs requests. */
+ (void) strcpy(fstype, "lx_sysfs");
+
+ /* Verify Linux mount options. */
+ if ((rv = lx_mnt_opt_verify(options, lx_sysfs_options)) != 0)
+ return (set_errno(rv));
+ } else if (strcmp(fstype, "cgroup") == 0) {
+ /* Translate cgroup mount requests to lx_cgroup requests. */
+ (void) strcpy(fstype, "lx_cgroup");
+
+ /*
+ * Currently don't verify Linux mount options since we can
+ * have a subsystem string provided.
+ */
+ } else if (strcmp(fstype, "autofs") == 0) {
+ /* Translate autofs mount requests to lxautofs requests. */
+ (void) strcpy(fstype, LX_AUTOFS_NAME);
+
+ /* Verify Linux mount options. */
+ if ((rv = lx_mnt_opt_verify(options, lx_autofs_options)) != 0)
+ return (set_errno(rv));
+
+ /* Linux seems to always allow overlay mounts */
+ sflags |= MS_OVERLAY;
+ } else {
+ return (set_errno(ENODEV));
+ }
+
+ /* Convert some Linux flags to illumos flags. */
+ if (flags & LX_MS_RDONLY)
+ sflags |= MS_RDONLY;
+ if (flags & LX_MS_NOSUID)
+ sflags |= MS_NOSUID;
+ if (flags & LX_MS_REMOUNT)
+ sflags |= MS_REMOUNT;
+
+ /*
+ * Convert some Linux flags to illumos option strings.
+ */
+ if (flags & LX_MS_STRICTATIME) {
+ /*
+ * The "strictatime" mount option ensures that none of the
+ * weaker atime-related mode options are in effect.
+ */
+ flags &= ~(LX_MS_RELATIME | LX_MS_NOATIME);
+ }
+ if ((flags & LX_MS_NODEV) &&
+ (rv = lx_mnt_add_opt("nodev", options, sizeof (options))) != 0)
+ return (set_errno(rv));
+ if ((flags & LX_MS_NOEXEC) &&
+ (rv = lx_mnt_add_opt("noexec", options, sizeof (options))) != 0)
+ return (set_errno(rv));
+ if ((flags & LX_MS_NOATIME) &&
+ (rv = lx_mnt_add_opt("noatime", options, sizeof (options))) != 0)
+ return (set_errno(rv));
+
+ if ((rv = lookupname(target, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0)
+ return (set_errno(rv));
+
+ /* If mounting proc over itself, just return ok */
+ if (strcmp(fstype, "lx_proc") == 0 && strcmp("lx_proc",
+ vfssw[vp->v_vfsp->vfs_fstype].vsw_name) == 0) {
+ VN_RELE(vp);
+ return (0);
+ }
+
+ map->spec = source;
+ map->dir = target;
+ map->flags = sflags;
+ map->fstype = fstype;
+ map->dataptr = NULL;
+ map->datalen = 0;
+ map->optptr = options;
+ map->optlen = sizeof (options);
+
+ rv = domount(NULL, map, vp, CRED(), &vfsp);
+ VN_RELE(vp);
+ if (rv != 0)
+ return (set_errno(rv));
+
+ VFS_RELE(vfsp);
+ if (strcmp(fstype, "tmpfs") == 0 && (uid != -1 || gid != -1)) {
+ /* Handle tmpfs uid/gid mount options. */
+ if (lookupname(target, UIO_SYSSPACE, FOLLOW, NULLVPP,
+ &vp) == 0) {
+ (void) lx_vn_chown(vp, (uid_t)uid, (gid_t)gid);
+ VN_RELE(vp);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * umount() is identical to illumos, though implemented on top of umount2().
+ */
+long
+lx_umount(char *path)
+{
+ return (umount2(path, 0));
+}
+
+/*
+ * The Linux umount2() system call is identical to illumos but has a different
+ * value for MNT_FORCE (the logical equivalent to MS_FORCE).
+ */
+#define LX_MNT_FORCE 0x1
+
+long
+lx_umount2(char *path, int flg)
+{
+ int flags = 0;
+
+ if (flg & ~LX_MNT_FORCE)
+ return (set_errno(EINVAL));
+
+ if (flg & LX_MNT_FORCE)
+ flags |= MS_FORCE;
+
+ return (umount2(path, flags));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_open.c b/usr/src/uts/common/brand/lx/syscall/lx_open.c
new file mode 100644
index 0000000000..4ee355eb70
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_open.c
@@ -0,0 +1,288 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/inttypes.h>
+#include <sys/mutex.h>
+
+#include <sys/lx_types.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_misc.h>
+#include <sys/brand.h>
+
+extern int fcntl(int, int, intptr_t);
+extern int openat(int, char *, int, int);
+extern int open(char *, int, int);
+extern int close(int);
+extern int cioctl(file_t *, int, intptr_t, int *);
+extern int lookupnameat(char *, enum uio_seg, int, vnode_t **, vnode_t **,
+ vnode_t *);
+
+
+static int
+ltos_open_flags(int input)
+{
+ int flags;
+
+ if (input & LX_O_PATH) {
+ input &= (LX_O_DIRECTORY | LX_O_NOFOLLOW | LX_O_CLOEXEC);
+ }
+
+ /* This depends on the Linux ACCMODE flags being the same as SunOS. */
+ flags = (input & LX_O_ACCMODE);
+
+ if (input & LX_O_CREAT) {
+ flags |= O_CREAT;
+ }
+
+ if (input & LX_O_EXCL)
+ flags |= O_EXCL;
+ if (input & LX_O_NOCTTY)
+ flags |= O_NOCTTY;
+ if (input & LX_O_TRUNC)
+ flags |= O_TRUNC;
+ if (input & LX_O_APPEND)
+ flags |= O_APPEND;
+ if (input & LX_O_NONBLOCK)
+ flags |= O_NONBLOCK;
+ if (input & LX_O_SYNC)
+ flags |= O_SYNC;
+ if (input & LX_O_LARGEFILE)
+ flags |= O_LARGEFILE;
+ if (input & LX_O_NOFOLLOW)
+ flags |= O_NOFOLLOW;
+ if (input & LX_O_CLOEXEC)
+ flags |= O_CLOEXEC;
+
+ /*
+ * Linux uses the LX_O_DIRECT flag to do raw, synchronous I/O to the
+ * device backing the fd in question. Illumos doesn't have similar
+ * functionality, but we can attempt to simulate it using the flags
+ * (O_RSYNC|O_SYNC) and directio(3C).
+ *
+ * The LX_O_DIRECT flag also requires that the transfer size and
+ * alignment of I/O buffers be a multiple of the logical block size for
+ * the underlying file system, but frankly there isn't an easy way to
+ * support that functionality without doing something like adding an
+ * fcntl(2) flag to denote LX_O_DIRECT mode.
+ *
+ * Since LX_O_DIRECT is merely a performance advisory, we'll just
+ * emulate what we can and trust that the only applications expecting
+ * an error when performing I/O from a misaligned buffer or when
+ * passing a transfer size is not a multiple of the underlying file
+ * system block size will be test suites.
+ */
+ if (input & LX_O_DIRECT)
+ flags |= (O_RSYNC|O_SYNC);
+
+ return (flags);
+}
+
+#define LX_POSTPROCESS_OPTS (LX_O_DIRECT | LX_O_ASYNC | LX_O_PATH)
+
+static int
+lx_open_postprocess(int fd, int fmode)
+{
+ file_t *fp;
+ int rv, error = 0;
+
+ if ((fmode & LX_POSTPROCESS_OPTS) == 0) {
+ /* Skip out early, if possible */
+ return (0);
+ }
+
+ if ((fp = getf(fd)) == NULL) {
+ /*
+ * It is possible that this fd was closed by the time we
+ * arrived here if someone is hammering away with close().
+ */
+ return (EIO);
+ }
+
+ if (fmode & LX_O_DIRECT && error == 0) {
+ (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON,
+ fp->f_flag, fp->f_cred, &rv, NULL);
+ }
+
+ if (fmode & LX_O_ASYNC && error == 0) {
+ if ((error = VOP_SETFL(fp->f_vnode, fp->f_flag, FASYNC,
+ fp->f_cred, NULL)) == 0) {
+ mutex_enter(&fp->f_tlock);
+ fp->f_flag |= FASYNC;
+ mutex_exit(&fp->f_tlock);
+ }
+ }
+
+ if (fmode & LX_O_PATH && error == 0) {
+ /*
+ * While the O_PATH flag has no direct analog in SunOS, it is
+ * emulated by removing both FREAD and FWRITE from f_flag.
+ * This causes read(2) and write(2) to fail with EBADF and can be
+ * checked for in other syscalls to trigger the correct behavior
+ * there.
+ */
+ mutex_enter(&fp->f_tlock);
+ fp->f_flag &= ~(FREAD|FWRITE);
+ mutex_exit(&fp->f_tlock);
+ }
+
+ releasef(fd);
+ if (error != 0) {
+ (void) closeandsetf(fd, NULL);
+ }
+ return (error);
+}
+
+long
+lx_openat(int atfd, char *path, int fmode, int cmode)
+{
+ int flags, fd, error;
+ mode_t mode = 0;
+
+ if (atfd == LX_AT_FDCWD)
+ atfd = AT_FDCWD;
+
+ flags = ltos_open_flags(fmode);
+
+ /*
+ * We use the FSEARCH flag to make sure this is a directory. We have to
+ * explicitly add 1 to emulate the FREAD/FWRITE mapping of the OPENMODE
+ * macro since it won't get set via OPENMODE when FSEARCH is used.
+ */
+ if (fmode & LX_O_DIRECTORY) {
+ flags |= FSEARCH;
+ flags++;
+ }
+
+ if (flags & O_CREAT)
+ mode = (mode_t)cmode;
+
+ ttolwp(curthread)->lwp_errno = 0;
+ fd = openat(atfd, path, flags, mode);
+ if (ttolwp(curthread)->lwp_errno != 0) {
+ if ((fmode & LX_O_DIRECTORY) &&
+ ttolwp(curthread)->lwp_errno != ENOTDIR) {
+ /*
+ * We got an error trying to open a file as a directory.
+ * We need to determine if we should return the original
+ * error or ENOTDIR.
+ */
+ vnode_t *startvp;
+ vnode_t *vp;
+ int oerror, error = 0;
+
+ oerror = ttolwp(curthread)->lwp_errno;
+
+ if (atfd == AT_FDCWD) {
+ /* regular open */
+ startvp = NULL;
+ } else {
+ char startchar;
+
+ if (copyin(path, &startchar, sizeof (char)))
+ return (set_errno(oerror));
+
+ /* if startchar is / then startfd is ignored */
+ if (startchar == '/') {
+ startvp = NULL;
+ } else {
+ file_t *startfp;
+
+ if ((startfp = getf(atfd)) == NULL)
+ return (set_errno(oerror));
+ startvp = startfp->f_vnode;
+ VN_HOLD(startvp);
+ releasef(atfd);
+ }
+ }
+
+ if (lookupnameat(path, UIO_USERSPACE,
+ (fmode & LX_O_NOFOLLOW) ? NO_FOLLOW : FOLLOW,
+ NULLVPP, &vp, startvp) != 0) {
+ if (startvp != NULL)
+ VN_RELE(startvp);
+ return (set_errno(oerror));
+ }
+
+ if (startvp != NULL)
+ VN_RELE(startvp);
+
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+
+ VN_RELE(vp);
+ if (error != 0)
+ return (set_errno(ENOTDIR));
+
+ (void) set_errno(oerror);
+ } else if ((fmode & LX_O_NOFOLLOW) && (fmode & LX_O_PATH) &&
+ ttolwp(curthread)->lwp_errno == ELOOP) {
+ /*
+ * On Linux, if O_NOFOLLOW and O_PATH are set together
+ * and the target is a symbolic link, then openat
+ * should return a file descriptor referring to the
+ * symbolic link.
+ *
+ * This file descriptor can be used with fchownat(2),
+ * fstatat(2), linkat(2), and readlinkat(2) alongside
+ * an empty pathname.
+ *
+ * We do not have a way to return such a file
+ * descriptor in illumos so open it without NO_FOLLOW
+ * and allow the postprocess to emulate O_PATH by
+ * removing the read and write flags.
+ * This is enough to keep recent systemd happy,
+ * although any attempt to use the fd for the above
+ * listed calls without a pathname will fail or modify
+ * the symlink target.
+ */
+ return (lx_openat(atfd, path, fmode & ~LX_O_NOFOLLOW,
+ cmode));
+ }
+
+ if (ttolwp(curthread)->lwp_errno == EINTR)
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+
+ return (ttolwp(curthread)->lwp_errno);
+ }
+
+ if ((error = lx_open_postprocess(fd, fmode)) != 0) {
+ return (set_errno(error));
+ }
+ return (fd);
+}
+
+long
+lx_open(char *path, int fmode, int cmode)
+{
+ return (lx_openat(LX_AT_FDCWD, path, fmode, cmode));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_personality.c b/usr/src/uts/common/brand/lx/syscall/lx_personality.c
new file mode 100644
index 0000000000..e7aa945b50
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_personality.c
@@ -0,0 +1,112 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/brand.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+
+
+/*
+ * These flags are for what Linux calls "bug emulation".
+ * (Descriptions from the personality(2) Linux man page.)
+ *
+ * Flags which are currently actionable in LX:
+ * - READ_IMPLIES_EXEC (since Linux 2.6.8)
+ * With this flag set, PROT_READ implies PROT_EXEC for mmap(2).
+ *
+ * Flags which are currently accepted but ignored:
+ * - UNAME26 (since Linux 3.1)
+ * Have uname(2) report a 2.6.40+ version number rather than a 3.x version
+ * number. Added as a stopgap measure to support broken applications that
+ * could not handle the kernel version-numbering switch from 2.6.x to 3.x.
+ *
+ * - ADDR_NO_RANDOMIZE (since Linux 2.6.12)
+ * With this flag set, disable address-space-layout randomization.
+ *
+ * - FDPIC_FUNCPTRS (since Linux 2.6.11)
+ * User-space function pointers to signal handlers point (on certain
+ * architectures) to descriptors.
+ *
+ * - MMAP_PAGE_ZERO (since Linux 2.4.0)
+ * Map page 0 as read-only (to support binaries that depend on this SVr4
+ * behavior).
+ *
+ * - ADDR_COMPAT_LAYOUT (since Linux 2.6.9)
+ * With this flag set, provide legacy virtual address space layout.
+ *
+ * - ADDR_LIMIT_32BIT (since Linux 2.2)
+ * Limit the address space to 32 bits.
+ *
+ * - SHORT_INODE (since Linux 2.4.0)
+ * No effects(?).
+ *
+ * - WHOLE_SECONDS (since Linux 1.2.0)
+ * No effects(?).
+ *
+ * - STICKY_TIMEOUTS (since Linux 1.2.0)
+ * With this flag set, select(2), pselect(2), and ppoll(2) do not modify the
+ * returned timeout argument when interrupted by a signal handler.
+ *
+ * - ADDR_LIMIT_3GB (since Linux 2.4.0)
+ * With this flag set, use 0xc0000000 as the offset at which to search a
+ * virtual memory chunk on mmap(2); otherwise use 0xffffe000.
+ */
+
+#define LX_PER_GET 0xffffffff
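+
+/*
+ * Per the Linux personality(2) convention, an argument of 0xffffffff
+ * queries the current personality without changing it; that is the
+ * LX_PER_GET case handled below.
+ */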
+
+long
+lx_personality(unsigned int arg)
+{
+ lx_proc_data_t *lxpd = ptolxproc(curproc);
+ unsigned int result = 0;
+
+ mutex_enter(&curproc->p_lock);
+ result = lxpd->l_personality;
+
+ if (arg == LX_PER_GET) {
+ mutex_exit(&curproc->p_lock);
+ return (result);
+ }
+
+ /*
+ * Prevent changes to the personality if the process is undergoing an
+ * exec. This will allow elfexec and friends to manipulate the
+ * personality without hindrance.
+ */
+ if ((curproc->p_flag & P_PR_EXEC) != 0) {
+ mutex_exit(&curproc->p_lock);
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Keep tabs when a non-Linux personality is set. This is silently
+ * allowed to succeed, even though the emulation required is almost
+ * certainly missing.
+ */
+ if ((arg & LX_PER_MASK) != LX_PER_LINUX) {
+ char buf[64];
+
+ (void) snprintf(buf, sizeof (buf), "invalid personality: %02X",
+ arg & LX_PER_MASK);
+ lx_unsupported(buf);
+ }
+
+ lxpd->l_personality = arg;
+ mutex_exit(&curproc->p_lock);
+ return (result);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pgrp.c b/usr/src/uts/common/brand/lx/syscall/lx_pgrp.c
new file mode 100644
index 0000000000..2acd9d431e
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_pgrp.c
@@ -0,0 +1,189 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/lx_misc.h>
+
+#define LX_INIT_PGID 1
+#define LX_INIT_SID 1
+
+/* From uts/common/syscall/pgrpsys.c */
+extern int setpgrp(int, int, int);
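+
+/*
+ * The native setpgrp() entry point multiplexes the entire pgrpsys family
+ * on its first argument, as used by the wrappers below: 0 getpgrp,
+ * 2 getsid, 3 setsid, 4 getpgid and 5 setpgid (1, the native setpgrp,
+ * is unused here).
+ */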
+
+long
+lx_getpgrp(void)
+{
+ int pg;
+
+ /* getpgrp() */
+ pg = setpgrp(0, 0, 0);
+
+ /*
+ * If the pgrp is that of the init process, return the value Linux
+ * expects.
+ */
+ if (pg == curzone->zone_proc_initpid)
+ return (LX_INIT_PGID);
+
+ return (pg);
+}
+
+long
+lx_getpgid(int pid)
+{
+ pid_t spid;
+ int tid;
+ int pg;
+
+ if (pid < 0)
+ return (set_errno(ESRCH));
+
+ /*
+ * If the supplied pid matches that of the init process, return the pgid
+ * Linux expects.
+ */
+ if (pid == curzone->zone_proc_initpid)
+ return (LX_INIT_PGID);
+
+ if (pid == 0) {
+ spid = curproc->p_pid;
+ } else if (lx_lpid_to_spair(pid, &spid, &tid) < 0) {
+ return (set_errno(ESRCH));
+ }
+
+ /* getpgid() */
+ ttolwp(curthread)->lwp_errno = 0;
+ pg = setpgrp(4, spid, 0);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ return (ttolwp(curthread)->lwp_errno);
+
+ /*
+ * If the pgid is that of the init process, return the value Linux
+ * expects.
+ */
+ if (pg == curzone->zone_proc_initpid)
+ return (LX_INIT_PGID);
+
+ return (pg);
+}
+
+long
+lx_setpgid(pid_t pid, pid_t pgid)
+{
+ pid_t spid, spgid;
+ int tid;
+ int pg;
+ int ret;
+
+ if (pid < 0)
+ return (set_errno(ESRCH));
+
+ if (pgid < 0)
+ return (set_errno(EINVAL));
+
+ if (pid == 0) {
+ spid = curproc->p_pid;
+ } else if (lx_lpid_to_spair(pid, &spid, &tid) < 0) {
+ return (set_errno(ESRCH));
+ }
+
+ if (pgid == 0) {
+ spgid = spid;
+ } else if (lx_lpid_to_spair(pgid, &spgid, &tid) < 0) {
+ return (set_errno(ESRCH));
+ }
+
+ /* setpgid() */
+ ret = setpgrp(5, spid, spgid);
+
+ if (ret == EPERM) {
+ /*
+ * On Linux, when calling setpgid with a desired pgid that is
+ * equal to the current pgid of the process, no error is
+ * emitted. This differs slightly from illumos, which would
+ * return EPERM. To emulate the Linux behavior, we check
+ * specifically for matching pgids.
+ */
+
+ /* getpgid() */
+ ttolwp(curthread)->lwp_errno = 0;
+ pg = setpgrp(4, spid, 0);
+ if (ttolwp(curthread)->lwp_errno == 0 && spgid == pg)
+ return (0);
+ return (set_errno(EPERM));
+ }
+
+ return (ret);
+}
+
+long
+lx_getsid(int pid)
+{
+ pid_t spid;
+ int tid;
+ int sid;
+
+ if (pid < 0)
+ return (set_errno(ESRCH));
+
+ /*
+ * If the supplied pid matches that of the init process, return the sid
+ * Linux expects.
+ */
+ if (pid == curzone->zone_proc_initpid)
+ return (LX_INIT_SID);
+
+ if (pid == 0) {
+ spid = curproc->p_pid;
+ } else if (lx_lpid_to_spair(pid, &spid, &tid) < 0) {
+ return (set_errno(ESRCH));
+ }
+
+ /* getsid() */
+ ttolwp(curthread)->lwp_errno = 0;
+ sid = setpgrp(2, spid, 0);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ return (ttolwp(curthread)->lwp_errno);
+
+
+ /*
+ * If the sid is that of the init process, return the value Linux
+ * expects.
+ */
+ if (sid == curzone->zone_proc_initpid)
+ return (LX_INIT_SID);
+
+ return (sid);
+}
+
+long
+lx_setsid(void)
+{
+ int sid;
+
+ /* setsid() */
+ ttolwp(curthread)->lwp_errno = 0;
+ sid = setpgrp(3, 0, 0);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ return (ttolwp(curthread)->lwp_errno);
+
+ /*
+ * If the sid is that of the init process, return the value Linux
+ * expects.
+ */
+ if (sid == curzone->zone_proc_initpid)
+ return (LX_INIT_SID);
+
+ return (sid);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c
new file mode 100644
index 0000000000..96959e40df
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c
@@ -0,0 +1,309 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. All Rights Reserved.
+ *
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/zone.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/cred.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/errno.h>
+#include <sys/debug.h>
+#include <sys/fs/fifonode.h>
+#include <sys/fcntl.h>
+#include <sys/policy.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+#include <sys/sysmacros.h>
+
+#define LX_DEFAULT_PIPE_SIZE 65536
+
+/*
+ * Our default value for fs.pipe-size-max mirrors Linux. The enforced maximum
+ * is meant to provide some sort of upper bound on pipe buffer sizing. Its
+ * value was chosen somewhat arbitrarily.
+ */
+uint_t lx_pipe_max_default = 1048576;
+uint_t lx_pipe_max_limit = 8388608;
+
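+/*
+ * Adjust the high-water mark on one or both stream heads backing a pipe.
+ * This is called with is_init set from the initial pipe setup in
+ * lx_hd_pipe() below; other callers (e.g. an F_SETPIPE_SZ fcntl
+ * emulation, not shown here) are expected to pass B_FALSE so that the
+ * EPERM checks apply.
+ */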
+int
+lx_pipe_setsz(stdata_t *str, uint_t size, boolean_t is_init)
+{
+ int err;
+ stdata_t *mate;
+ lx_zone_data_t *lxzd = ztolxzd(curzone);
+ uint_t max_size = lxzd->lxzd_pipe_max_sz;
+ fifonode_t *fnp1, *fnp2;
+
+ size = P2ROUNDUP(size, PAGESIZE);
+ if (size == 0) {
+ return (EINVAL);
+ } else if (size > max_size && secpolicy_resource(CRED()) != 0) {
+ if (!is_init) {
+ return (EPERM);
+ }
+ /*
+ * If the size limit is breached during initial pipe setup,
+ * simply clamp it to the maximum. On Linux kernels prior to
+ * 4.9, this clamping would not occur and it would be possible
+ * to open a pipe with the default buffer size even if it
+ * exceeded the sysctl limit. Rather than trigger behavior
+ * here based on the configured kernel version, it is applied
+ * to all callers.
+ */
+ size = max_size;
+ ASSERT(max_size <= lx_pipe_max_limit);
+ } else if (size > lx_pipe_max_limit) {
+ /*
+ * Unlike Linux, we do maintain a global hard cap on pipe
+ * buffer limits.
+ */
+ return (EPERM);
+ }
+
+ if (!STRMATED(str)) {
+ err = strqset(RD(str->sd_wrq), QHIWAT, 0, (intptr_t)size);
+ if (err == 0) {
+ fnp1 = VTOF(str->sd_vnode);
+ mutex_enter(&fnp1->fn_lock->flk_lock);
+ fnp1->fn_hiwat = size;
+ mutex_exit(&fnp1->fn_lock->flk_lock);
+ }
+ return (err);
+ }
+
+ /*
+ * Ensure consistent order so the set operation is always attempted on
+ * the "higher" stream first.
+ */
+ if (str > str->sd_mate) {
+ VERIFY((mate = str->sd_mate) != NULL);
+ } else {
+ mate = str;
+ VERIFY((str = mate->sd_mate) != NULL);
+ }
+
+ /*
+ * While it is unfortunate that an error could occur for the latter
+ * half of the stream pair, there is little to be done about it aside
+ * from reporting the failure.
+ */
+ if ((err = strqset(RD(str->sd_wrq), QHIWAT, 0, (intptr_t)size)) == 0) {
+ err = strqset(RD(mate->sd_wrq), QHIWAT, 0, (intptr_t)size);
+ }
+
+ if (err == 0) {
+ fnp1 = VTOF(str->sd_vnode);
+ fnp2 = VTOF(str->sd_mate->sd_vnode);
+
+ /*
+ * See fnode_constructor. Both sides should have the same
+ * lock. We expect our callers to ensure that the vnodes
+ * are VFIFO and have v_op == fifovnops.
+ */
+ ASSERT(str->sd_vnode->v_type == VFIFO);
+ ASSERT(str->sd_mate->sd_vnode->v_type == VFIFO);
+ ASSERT(fnp1->fn_lock == fnp2->fn_lock);
+
+ mutex_enter(&fnp1->fn_lock->flk_lock);
+
+ fnp1->fn_hiwat = size;
+ fnp2->fn_hiwat = size;
+
+ mutex_exit(&fnp1->fn_lock->flk_lock);
+ }
+
+ return (err);
+}
+
+/*
+ * Based on native pipe(2) system call, except that the pipe is half-duplex.
+ */
+static int
+lx_hd_pipe(intptr_t arg, int flags)
+{
+ vnode_t *vp1, *vp2;
+ struct file *fp1, *fp2;
+ int error = 0;
+ int flag1, flag2, iflags;
+ int fd1, fd2;
+ stdata_t *str;
+
+ /*
+ * Validate allowed flags.
+ */
+ if ((flags & ~(FCLOEXEC|FNONBLOCK)) != 0) {
+ return (set_errno(EINVAL));
+ }
+ /*
+ * Allocate and initialize two vnodes.
+ */
+ makepipe(&vp1, &vp2);
+
+ /*
+ * Allocate and initialize two file table entries and two
+ * file pointers. The first file pointer is open for read and the
+ * second is open for write.
+ */
+ if ((error = falloc(vp1, FREAD, &fp1, &fd1)) != 0) {
+ VN_RELE(vp1);
+ VN_RELE(vp2);
+ return (set_errno(error));
+ }
+
+ if ((error = falloc(vp2, FWRITE, &fp2, &fd2)) != 0)
+ goto out2;
+
+ /*
+ * Create two stream heads and attach to each vnode.
+ */
+ if ((error = fifo_stropen(&vp1, FREAD, fp1->f_cred, 0, 0)) != 0)
+ goto out;
+
+ if ((error = fifo_stropen(&vp2, FWRITE, fp2->f_cred, 0, 0)) != 0) {
+ (void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0,
+ fp1->f_cred, NULL);
+ goto out;
+ }
+
+ strmate(vp1, vp2);
+
+ VTOF(vp1)->fn_ino = VTOF(vp2)->fn_ino = fifogetid();
+
+ /*
+ * Attempt to set pipe buffer sizes to expected value.
+ */
+ VERIFY((str = vp1->v_stream) != NULL);
+ (void) lx_pipe_setsz(str, LX_DEFAULT_PIPE_SIZE, B_TRUE);
+
+ /*
+ * Set the O_NONBLOCK flag if requested.
+ */
+ if (flags & FNONBLOCK) {
+ flag1 = fp1->f_flag;
+ flag2 = fp2->f_flag;
+ iflags = flags & FNONBLOCK;
+
+ if ((error = VOP_SETFL(vp1, flag1, iflags, fp1->f_cred,
+ NULL)) != 0) {
+ goto out_vop_close;
+ }
+ fp1->f_flag |= iflags;
+
+ if ((error = VOP_SETFL(vp2, flag2, iflags, fp2->f_cred,
+ NULL)) != 0) {
+ goto out_vop_close;
+ }
+ fp2->f_flag |= iflags;
+ }
+
+ /*
+ * Return the file descriptors to the user. They now
+ * point to two different vnodes which have different
+ * stream heads.
+ */
+ if (copyout(&fd1, &((int *)arg)[0], sizeof (int)) ||
+ copyout(&fd2, &((int *)arg)[1], sizeof (int))) {
+ error = EFAULT;
+ goto out_vop_close;
+ }
+
+ /*
+ * Now fill in the entries that falloc reserved
+ */
+ mutex_exit(&fp1->f_tlock);
+ mutex_exit(&fp2->f_tlock);
+ setf(fd1, fp1);
+ setf(fd2, fp2);
+
+ /*
+ * Optionally set the FCLOEXEC flag
+ */
+ if ((flags & FCLOEXEC) != 0) {
+ f_setfd(fd1, FD_CLOEXEC);
+ f_setfd(fd2, FD_CLOEXEC);
+ }
+
+ return (0);
+out_vop_close:
+ (void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0, fp1->f_cred, NULL);
+ (void) VOP_CLOSE(vp2, FWRITE, 1, (offset_t)0, fp2->f_cred, NULL);
+out:
+ setf(fd2, NULL);
+ unfalloc(fp2);
+out2:
+ setf(fd1, NULL);
+ unfalloc(fp1);
+ VN_RELE(vp1);
+ VN_RELE(vp2);
+ return (set_errno(error));
+}
+
+/*
+ * pipe(2) system call.
+ */
+long
+lx_pipe(intptr_t arg)
+{
+ return (lx_hd_pipe(arg, 0));
+}
+
+/*
+ * pipe2(2) system call.
+ */
+long
+lx_pipe2(intptr_t arg, int lxflags)
+{
+ int flags = 0;
+
+ /*
+ * Validate allowed flags.
+ */
+ if ((lxflags & ~(LX_O_NONBLOCK | LX_O_CLOEXEC)) != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Convert from Linux flags to illumos flags.
+ */
+ if (lxflags & LX_O_NONBLOCK) {
+ flags |= FNONBLOCK;
+ }
+ if (lxflags & LX_O_CLOEXEC) {
+ flags |= FCLOEXEC;
+ }
+
+ return (lx_hd_pipe(arg, flags));
+}
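+
+/*
+ * Illustrative sketch, not part of this patch: minimal Linux userland use
+ * of the pipe2(2) entry point emulated above.  Flags outside O_NONBLOCK
+ * and O_CLOEXEC draw EINVAL, matching the check in lx_pipe2.
+ */
+#if 0
+#define	_GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fds[2];
+
+	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) != 0)
+		return (1);
+	/* An unsupported flag such as O_APPEND is rejected outright. */
+	if (pipe2(fds, O_APPEND) != 0 && errno == EINVAL)
+		(void) printf("unsupported flag rejected\n");
+	return (0);
+}
+#endif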
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_poll.c b/usr/src/uts/common/brand/lx/syscall/lx_poll.c
new file mode 100644
index 0000000000..e54130aff1
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_poll.c
@@ -0,0 +1,786 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/sunddi.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/poll_impl.h>
+#include <sys/schedctl.h>
+#include <sys/lx_signal.h>
+
+/*
+ * Max number of FDs that can be given to poll() or select() before we return
+ * EINVAL (the Linux man page documents this limit as {OPEN_MAX}; we default
+ * lx_poll_max_fds to that value).
+ */
+int lx_poll_max_fds = 1048576;
+
+/* From uts/common/syscall/poll.c */
+extern int poll_copyin(pollstate_t *, pollfd_t *, nfds_t);
+extern int poll_common(pollstate_t *, pollfd_t *, nfds_t, timespec_t *, int *);
+
+/*
+ * These events are identical between Linux and SunOS
+ */
+#define LX_POLLIN 0x001
+#define LX_POLLPRI 0x002
+#define LX_POLLOUT 0x004
+#define LX_POLLERR 0x008
+#define LX_POLLHUP 0x010
+#define LX_POLLNVAL 0x020
+#define LX_POLLRDNORM 0x040
+#define LX_POLLRDBAND 0x080
+
+#define LX_POLL_COMMON_EVENTS (LX_POLLIN | LX_POLLPRI | LX_POLLOUT | \
+ LX_POLLERR | LX_POLLHUP | LX_POLLNVAL | LX_POLLRDNORM | LX_POLLRDBAND)
+
+/*
+ * These events differ between Linux and SunOS
+ */
+#define LX_POLLWRNORM 0x0100
+#define LX_POLLWRBAND 0x0200
+#define LX_POLLRDHUP 0x2000
+
+#define LX_POLL_SUPPORTED_EVENTS \
+ (LX_POLL_COMMON_EVENTS | LX_POLLWRNORM | LX_POLLWRBAND | LX_POLLRDHUP)
+
+static int
+lx_poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, short *oldevt)
+{
+ int i, error = 0;
+ pollfd_t *pollfdp;
+
+ if ((error = poll_copyin(ps, fds, nfds)) != 0) {
+ return (error);
+ }
+ pollfdp = ps->ps_pollfd;
+
+ /* Convert the Linux events bitmask into SunOS equivalent. */
+ for (i = 0; i < nfds; i++) {
+ short lx_events = pollfdp[i].events;
+ short events;
+
+ /*
+ * If the caller is polling for an unsupported event, we
+ * have to bail out.
+ */
+ if (lx_events & ~LX_POLL_SUPPORTED_EVENTS) {
+ return (ENOTSUP);
+ }
+
+ events = lx_events & LX_POLL_COMMON_EVENTS;
+ if (lx_events & LX_POLLWRNORM)
+ events |= POLLWRNORM;
+ if (lx_events & LX_POLLWRBAND)
+ events |= POLLWRBAND;
+ if (lx_events & LX_POLLRDHUP)
+ events |= POLLRDHUP;
+ pollfdp[i].events = events;
+ oldevt[i] = lx_events;
+ }
+ return (0);
+}
+
+static int
+lx_poll_copyout(pollfd_t *pollfdp, pollfd_t *fds, nfds_t nfds, short *oldevt)
+{
+ int i;
+
+ /*
+	 * Convert the SunOS revents bitmask into its Linux equivalent and
+	 * restore the cached events field, which was swizzled by
+	 * lx_poll_copyin.
+ */
+ for (i = 0; i < nfds; i++) {
+ short revents = pollfdp[i].revents;
+ short lx_revents = revents & LX_POLL_COMMON_EVENTS;
+ short orig_events = oldevt[i];
+
+ if (revents & POLLWRBAND)
+ lx_revents |= LX_POLLWRBAND;
+ if (revents & POLLRDHUP)
+ lx_revents |= LX_POLLRDHUP;
+ /*
+		 * Because POLLOUT and POLLWRNORM are natively defined as the
+		 * same value, care must be taken when translating them to
+		 * Linux, where they differ.
+ */
+ if (revents & POLLOUT) {
+ if ((orig_events & LX_POLLOUT) == 0)
+ lx_revents &= ~LX_POLLOUT;
+ if (orig_events & LX_POLLWRNORM)
+ lx_revents |= LX_POLLWRNORM;
+ }
+
+ pollfdp[i].revents = lx_revents;
+ pollfdp[i].events = orig_events;
+ }
+
+ if (copyout(pollfdp, fds, sizeof (pollfd_t) * nfds) != 0)
+ return (EFAULT);
+
+ return (0);
+}
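+
+/*
+ * Illustrative sketch, not part of this patch: why the POLLOUT/POLLWRNORM
+ * care above matters.  On Linux the two are distinct bits and poll(2) only
+ * reports the ones the caller asked for; natively they share a value, so
+ * lx_poll_copyout must reconstruct the caller's intent from the saved
+ * events.
+ */
+#if 0
+#define	_GNU_SOURCE
+#include <poll.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fds[2];
+	struct pollfd pfd;
+
+	if (pipe(fds) != 0)
+		return (1);
+	pfd.fd = fds[1];		/* write side: immediately writable */
+	pfd.events = POLLWRNORM;	/* ask for WRNORM, not POLLOUT */
+	if (poll(&pfd, 1, 0) == 1) {
+		/* Expect POLLWRNORM set and POLLOUT clear in revents. */
+		(void) printf("revents=0x%x POLLOUT=0x%x POLLWRNORM=0x%x\n",
+		    pfd.revents, POLLOUT, POLLWRNORM);
+	}
+	return (0);
+}
+#endif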
+
+static long
+lx_poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ pollstate_t *ps = NULL;
+ pollfd_t *pollfdp = NULL;
+ short *oldevt = NULL;
+ int error = 0, fdcnt = 0;
+
+ /*
+ * Reset our signal mask, if requested.
+ */
+ if (ksetp != NULL) {
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(t);
+ lwp->lwp_sigoldmask = t->t_hold;
+ t->t_hold = *ksetp;
+ t->t_flag |= T_TOMASK;
+ /*
+ * Call cv_reltimedwait_sig() just to check for signals.
+ * We will return immediately with either 0 or -1.
+ */
+ if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
+ TR_CLOCK_TICK)) {
+ mutex_exit(&p->p_lock);
+ error = EINTR;
+ goto pollout;
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * Initialize pollstate and copy in pollfd data if present.
+ */
+ if (nfds != 0) {
+ /*
+ * Cap the number of FDs they can give us so we don't go
+ * allocating a huge chunk of memory. Note that this is *not*
+ * the RLIMIT_NOFILE rctl.
+ */
+ if (nfds > lx_poll_max_fds) {
+ error = EINVAL;
+ goto pollout;
+ }
+
+ /*
+		 * Need to allocate memory for pollstate before anything else
+		 * because the mutex and cv are created in this space.
+ */
+ ps = pollstate_create();
+ if (ps->ps_pcache == NULL)
+ ps->ps_pcache = pcache_alloc();
+
+ /*
+ * Certain event types which are distinct on Linux are aliased
+		 * against each other on illumos. In order to properly translate
+ * back into the Linux format, the original events of interest
+ * are stored in 'oldevt' for use during lx_poll_copyout.
+ */
+ oldevt = kmem_alloc(nfds * sizeof (short), KM_SLEEP);
+ if ((error = lx_poll_copyin(ps, fds, nfds, oldevt)) != 0)
+ goto pollout;
+ pollfdp = ps->ps_pollfd;
+
+ /*
+ * The Linux poll(2) implicitly polls for POLLERR and POLLHUP
+ * in addition to any other events specified for the file
+ * descriptors in question. It does not modify pollfd_t`events
+ * to reflect that fact when performing a later copyout.
+ */
+ ps->ps_implicit_ev = POLLERR | POLLHUP;
+ }
+
+ /*
+ * Perform the actual poll.
+ */
+ error = poll_common(ps, fds, nfds, tsp, &fdcnt);
+
+ /*
+ * Clear implicit event interest, if needed.
+ */
+ if (ps != NULL) {
+ ps->ps_implicit_ev = 0;
+ }
+
+pollout:
+ /*
+ * If we changed the signal mask but we received no signal then restore
+ * the signal mask. Otherwise psig() will deal with the signal mask.
+ */
+ if (ksetp != NULL) {
+ mutex_enter(&p->p_lock);
+ if (lwp->lwp_cursig == 0) {
+ t->t_hold = lwp->lwp_sigoldmask;
+ t->t_flag &= ~T_TOMASK;
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * Copy out the events and return the fdcnt to the user.
+ */
+ if (nfds != 0 && error == 0) {
+ error = lx_poll_copyout(pollfdp, fds, nfds, oldevt);
+ }
+ if (oldevt != NULL) {
+ kmem_free(oldevt, nfds * sizeof (short));
+ }
+ if (error) {
+ return (set_errno(error));
+ }
+ return (fdcnt);
+}
+
+long
+lx_poll(pollfd_t *fds, nfds_t nfds, int timeout)
+{
+ timespec_t ts, *tsp = NULL;
+
+ if (timeout >= 0) {
+ ts.tv_sec = timeout / MILLISEC;
+ ts.tv_nsec = (timeout % MILLISEC) * MICROSEC;
+ tsp = &ts;
+ }
+
+ return (lx_poll_common(fds, nfds, tsp, NULL));
+}
+
+long
+lx_ppoll(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, lx_sigset_t *setp)
+{
+ timespec_t ts, *tsp = NULL;
+ k_sigset_t kset, *ksetp = NULL;
+
+ /*
+ * Copy in timeout and sigmask.
+ */
+ if (timeoutp != NULL) {
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(timeoutp, &ts, sizeof (ts)))
+ return (set_errno(EFAULT));
+ } else {
+ timespec32_t ts32;
+
+ if (copyin(timeoutp, &ts32, sizeof (ts32)))
+ return (set_errno(EFAULT));
+ TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+ }
+
+ if (itimerspecfix(&ts))
+ return (set_errno(EINVAL));
+ tsp = &ts;
+ }
+ if (setp != NULL) {
+ lx_sigset_t lset;
+
+ if (copyin(setp, &lset, sizeof (lset)))
+ return (set_errno(EFAULT));
+ lx_ltos_sigset(&lset, &kset);
+ ksetp = &kset;
+ }
+
+ return (lx_poll_common(fds, nfds, tsp, ksetp));
+}
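+
+/*
+ * Illustrative sketch, not part of this patch: ppoll(2) as emulated above
+ * installs the caller's sigmask for the duration of the wait, so a signal
+ * kept blocked outside the call can still interrupt it atomically.  A real
+ * program would install a SIGUSR1 handler first.
+ */
+#if 0
+#define	_GNU_SOURCE
+#include <poll.h>
+#include <signal.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+	sigset_t blockset, waitset;
+	struct timespec ts = { 5, 0 };
+
+	/* Keep SIGUSR1 blocked except while sitting in ppoll(). */
+	(void) sigemptyset(&blockset);
+	(void) sigaddset(&blockset, SIGUSR1);
+	(void) sigprocmask(SIG_BLOCK, &blockset, NULL);
+
+	(void) sigemptyset(&waitset);	/* unblock everything in the call */
+	if (ppoll(NULL, 0, &ts, &waitset) < 0)
+		perror("ppoll");
+	return (0);
+}
+#endif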
+
+typedef struct lx_select_buf_s {
+ long *lsb_rfds;
+ long *lsb_wfds;
+ long *lsb_efds;
+ unsigned int lsb_size;
+} lx_select_buf_t;
+
+/*
+ * Size (in bytes) of buffer appropriate for fd_set copyin/copyout.
+ * Linux uses buffers of 'long' to accomplish this.
+ */
+#define LX_FD_SET_BYTES (sizeof (long))
+#define LX_FD_SET_BITS (8 * LX_FD_SET_BYTES)
+#define LX_FD_SET_SIZE(nfds) \
+ ((((nfds) + (LX_FD_SET_BITS - 1)) / LX_FD_SET_BITS) * LX_FD_SET_BYTES)
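+
+/*
+ * Worked example of the sizing arithmetic above, assuming an _LP64 kernel
+ * (8-byte longs, so LX_FD_SET_BITS is 64) and that CTASSERT is in scope
+ * via sys/debug.h: a 100-fd set rounds up to two longs, i.e. 16 bytes.
+ */
+#if defined(_LP64)
+CTASSERT(LX_FD_SET_SIZE(64) == 8);
+CTASSERT(LX_FD_SET_SIZE(65) == 16);
+CTASSERT(LX_FD_SET_SIZE(100) == 16);
+#endif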
+
+static int
+lx_select_copyin(pollstate_t *ps, lx_select_buf_t *sbuf, int nfds,
+ long *rfds, long *wfds, long *efds)
+{
+ int n;
+ long *in, *out, *ex;
+ long absent = 0;
+ pollfd_t *pfd;
+ nfds_t old_nfds;
+
+ /*
+ * Just like pollsys and lx_poll, attempt to reuse ps_pollfd if it is
+ * appropriately sized. See poll_copyin for more detail.
+ */
+ old_nfds = ps->ps_nfds;
+ if (nfds != old_nfds) {
+ kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
+ pfd = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
+ ps->ps_pollfd = pfd;
+ ps->ps_nfds = nfds;
+ } else {
+ pfd = ps->ps_pollfd;
+ }
+
+ if (rfds != NULL) {
+ if (copyin(rfds, sbuf->lsb_rfds, sbuf->lsb_size) != 0) {
+ return (EFAULT);
+ }
+ }
+ if (wfds != NULL) {
+ if (copyin(wfds, sbuf->lsb_wfds, sbuf->lsb_size) != 0) {
+ return (EFAULT);
+ }
+ }
+ if (efds != NULL) {
+ if (copyin(efds, sbuf->lsb_efds, sbuf->lsb_size) != 0) {
+ return (EFAULT);
+ }
+ }
+
+ /*
+	 * For each fd, if any bits are set, convert them into the appropriate
+ * pollfd struct. (Derived from libc's select logic)
+ */
+ in = (rfds != NULL) ? sbuf->lsb_rfds : &absent;
+ out = (wfds != NULL) ? sbuf->lsb_wfds : &absent;
+ ex = (efds != NULL) ? sbuf->lsb_efds : &absent;
+ for (n = 0; n < nfds; n += LX_FD_SET_BITS) {
+ unsigned long b, m, j;
+
+ b = (unsigned long)(*in | *out | *ex);
+ m = 1;
+ for (j = 0; j < LX_FD_SET_BITS; j++) {
+ int fd = n + j;
+
+ if (fd >= nfds)
+ return (0);
+ pfd->events = 0;
+ if (b & 1) {
+ pfd->fd = fd;
+ if (*in & m)
+ pfd->events |= POLLRDNORM;
+ if (*out & m)
+ pfd->events |= POLLWRNORM;
+ if (*ex & m)
+ pfd->events |= POLLRDBAND;
+ } else {
+ pfd->fd = -1;
+ }
+ pfd++;
+ b >>= 1;
+ m <<= 1;
+ }
+
+ if (rfds != NULL)
+ in++;
+ if (wfds != NULL)
+ out++;
+ if (efds != NULL)
+ ex++;
+ }
+ return (0);
+}
+
+static int
+lx_select_copyout(pollfd_t *pollfdp, lx_select_buf_t *sbuf, int nfds,
+ long *rfds, long *wfds, long *efds, int *fdcnt)
+{
+ int n;
+ pollfd_t *pfd;
+ long rv = 0;
+
+ /*
+ * If poll did not find any fds of interest, we can just zero out the
+ * fd_set fields for copyout.
+ */
+ if (*fdcnt == 0) {
+ if (rfds != NULL) {
+ bzero(sbuf->lsb_rfds, sbuf->lsb_size);
+ }
+ if (wfds != NULL) {
+ bzero(sbuf->lsb_wfds, sbuf->lsb_size);
+ }
+ if (efds != NULL) {
+ bzero(sbuf->lsb_efds, sbuf->lsb_size);
+ }
+ goto copyout;
+ }
+
+ /*
+	 * For each fd, if any bits are set, convert them into the appropriate
+ * pollfd struct. (Derived from libc's select logic)
+ */
+ pfd = pollfdp;
+ for (n = 0; n < nfds; n += LX_FD_SET_BITS) {
+ unsigned long m, j;
+ long in = 0, out = 0, ex = 0;
+
+ m = 1;
+ for (j = 0; j < LX_FD_SET_BITS; j++) {
+ if ((n + j) >= nfds)
+ break;
+ if (pfd->revents != 0) {
+ if (pfd->revents & POLLNVAL) {
+ return (EBADF);
+ }
+ if (pfd->revents & POLLRDNORM) {
+ in |= m;
+ rv++;
+ }
+ if (pfd->revents & POLLWRNORM) {
+ out |= m;
+ rv++;
+ }
+ if (pfd->revents & POLLRDBAND) {
+ ex |= m;
+ rv++;
+ }
+ /*
+ * Only set this bit on return if we asked
+ * about input conditions.
+ */
+ if ((pfd->revents & (POLLHUP|POLLERR)) &&
+ (pfd->events & POLLRDNORM)) {
+ if ((in & m) == 0) {
+ /* wasn't already set */
+ rv++;
+ }
+ in |= m;
+ }
+ /*
+ * Only set this bit on return if we asked
+ * about output conditions.
+ */
+ if ((pfd->revents & (POLLHUP|POLLERR)) &&
+ (pfd->events & POLLWRNORM)) {
+ if ((out & m) == 0) {
+ /* wasn't already set */
+ rv++;
+ }
+ out |= m;
+ }
+ /*
+ * Only set this bit on return if we asked
+				 * about exception conditions.
+ */
+ if ((pfd->revents & (POLLHUP|POLLERR)) &&
+ (pfd->events & POLLRDBAND)) {
+ if ((ex & m) == 0) {
+ /* wasn't already set */
+ rv++;
+ }
+ ex |= m;
+ }
+ }
+ m <<= 1;
+ pfd++;
+ }
+ if (rfds != NULL)
+ sbuf->lsb_rfds[n / LX_FD_SET_BITS] = in;
+ if (wfds != NULL)
+ sbuf->lsb_wfds[n / LX_FD_SET_BITS] = out;
+ if (efds != NULL)
+ sbuf->lsb_efds[n / LX_FD_SET_BITS] = ex;
+ }
+
+copyout:
+ if (rfds != NULL) {
+ if (copyout(sbuf->lsb_rfds, rfds, sbuf->lsb_size) != 0) {
+ return (EFAULT);
+ }
+ }
+ if (wfds != NULL) {
+ if (copyout(sbuf->lsb_wfds, wfds, sbuf->lsb_size) != 0) {
+ return (EFAULT);
+ }
+ }
+ if (efds != NULL) {
+ if (copyout(sbuf->lsb_efds, efds, sbuf->lsb_size) != 0) {
+ return (EFAULT);
+ }
+ }
+ *fdcnt = rv;
+ return (0);
+}
+
+static long
+lx_select_common(int nfds, long *rfds, long *wfds, long *efds,
+ timespec_t *tsp, k_sigset_t *ksetp)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ pollstate_t *ps = NULL;
+ pollfd_t *pollfdp = NULL, *fake_fds = NULL;
+ lx_select_buf_t sbuf = {0};
+ int error = 0, fdcnt = 0;
+
+ if (nfds < 0) {
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Reset our signal mask, if requested.
+ */
+ if (ksetp != NULL) {
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(t);
+ lwp->lwp_sigoldmask = t->t_hold;
+ t->t_hold = *ksetp;
+ t->t_flag |= T_TOMASK;
+ /*
+ * Call cv_reltimedwait_sig() just to check for signals.
+ * We will return immediately with either 0 or -1.
+ */
+ if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
+ TR_CLOCK_TICK)) {
+ mutex_exit(&p->p_lock);
+ error = EINTR;
+ goto out;
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * Because poll caching uses the userspace pollfd_t pointer to verify
+ * cache reuse validity, a simulated value must be supplied when
+ * emulating Linux select(2). The first non-NULL pointer from
+ * rfds/wfds/efds is used for this purpose.
+ */
+ if (rfds != NULL) {
+ fake_fds = (pollfd_t *)rfds;
+ } else if (wfds != NULL) {
+ fake_fds = (pollfd_t *)wfds;
+ } else if (efds != NULL) {
+ fake_fds = (pollfd_t *)efds;
+ } else {
+ /*
+ * A non-zero nfds was supplied but all three fd_set pointers
+ * were null. Fall back to doing a simple timeout.
+ */
+ nfds = 0;
+ }
+
+ /*
+ * Initialize pollstate and copy in pollfd data if present.
+ */
+ if (nfds != 0) {
+ /*
+ * Cap the number of FDs they can give us so we don't go
+ * allocating a huge chunk of memory. Note that this is *not*
+ * the RLIMIT_NOFILE rctl.
+ */
+ if (nfds > lx_poll_max_fds) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+		 * Need to allocate memory for pollstate before anything else
+		 * because the mutex and cv are created in this space.
+ */
+ ps = pollstate_create();
+ if (ps->ps_pcache == NULL)
+ ps->ps_pcache = pcache_alloc();
+
+ sbuf.lsb_size = LX_FD_SET_SIZE(nfds);
+ if (rfds != NULL)
+ sbuf.lsb_rfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP);
+ if (wfds != NULL)
+ sbuf.lsb_wfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP);
+ if (efds != NULL)
+ sbuf.lsb_efds = kmem_alloc(sbuf.lsb_size, KM_SLEEP);
+
+ error = lx_select_copyin(ps, &sbuf, nfds, rfds, wfds, efds);
+ if (error != 0) {
+ goto out;
+ }
+
+ pollfdp = ps->ps_pollfd;
+ }
+
+ /*
+ * Perform the actual poll.
+ */
+ error = poll_common(ps, fake_fds, (nfds_t)nfds, tsp, &fdcnt);
+
+out:
+ /*
+ * If we changed the signal mask but we received no signal then restore
+ * the signal mask. Otherwise psig() will deal with the signal mask.
+ */
+ if (ksetp != NULL) {
+ mutex_enter(&p->p_lock);
+ if (lwp->lwp_cursig == 0) {
+ t->t_hold = lwp->lwp_sigoldmask;
+ t->t_flag &= ~T_TOMASK;
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * Copy out the events and return the fdcnt to the user.
+ */
+ if (error == 0 && nfds != 0) {
+ error = lx_select_copyout(pollfdp, &sbuf, nfds, rfds, wfds,
+ efds, &fdcnt);
+ }
+ if (sbuf.lsb_size != 0) {
+ if (sbuf.lsb_rfds != NULL)
+ kmem_free(sbuf.lsb_rfds, sbuf.lsb_size);
+ if (sbuf.lsb_wfds != NULL)
+ kmem_free(sbuf.lsb_wfds, sbuf.lsb_size);
+ if (sbuf.lsb_efds != NULL)
+ kmem_free(sbuf.lsb_efds, sbuf.lsb_size);
+ }
+ if (error) {
+ return (set_errno(error));
+ }
+ return (fdcnt);
+}
+
+long
+lx_select(int nfds, long *rfds, long *wfds, long *efds,
+ struct timeval *timeoutp)
+{
+ timespec_t ts, *tsp = NULL;
+
+ if (timeoutp != NULL) {
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ struct timeval tv;
+
+ if (copyin(timeoutp, &tv, sizeof (tv)))
+ return (set_errno(EFAULT));
+ ts.tv_sec = tv.tv_sec;
+ ts.tv_nsec = tv.tv_usec * (NANOSEC / MICROSEC);
+ } else {
+ struct timeval32 tv32;
+
+ if (copyin(timeoutp, &tv32, sizeof (tv32)))
+ return (set_errno(EFAULT));
+ ts.tv_sec = tv32.tv_sec;
+ ts.tv_nsec = tv32.tv_usec * (NANOSEC / MICROSEC);
+ }
+
+ if (itimerspecfix(&ts))
+ return (set_errno(EINVAL));
+ tsp = &ts;
+ }
+
+ return (lx_select_common(nfds, rfds, wfds, efds, tsp, NULL));
+}
+
+typedef struct {
+ uintptr_t lpsa_addr;
+ unsigned long lpsa_len;
+} lx_pselect_sig_arg_t;
+
+#if defined(_LP64)
+typedef struct {
+ caddr32_t lpsa_addr;
+ uint32_t lpsa_len;
+} lx_pselect_sig_arg32_t;
+#endif /* defined(_LP64) */
+
+long
+lx_pselect(int nfds, long *rfds, long *wfds, long *efds,
+ timespec_t *timeoutp, void *setp)
+{
+ timespec_t ts, *tsp = NULL;
+ k_sigset_t kset, *ksetp = NULL;
+
+ /*
+ * Copy in timeout and sigmask.
+ */
+ if (timeoutp != NULL) {
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(timeoutp, &ts, sizeof (ts)))
+ return (set_errno(EFAULT));
+ } else {
+ timespec32_t ts32;
+
+ if (copyin(timeoutp, &ts32, sizeof (ts32)))
+ return (set_errno(EFAULT));
+ TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+ }
+
+ if (itimerspecfix(&ts))
+ return (set_errno(EINVAL));
+ tsp = &ts;
+ }
+ if (setp != NULL) {
+ lx_sigset_t lset, *sigaddr = NULL;
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ lx_pselect_sig_arg_t lpsa;
+
+ if (copyin(setp, &lpsa, sizeof (lpsa)) != 0)
+ return (set_errno(EFAULT));
+ /*
+ * Linux forces a size to be passed only so it can
+ * check that it's the size of a sigset_t.
+ */
+ if (lpsa.lpsa_len != sizeof (lx_sigset_t))
+ return (set_errno(EINVAL));
+
+ sigaddr = (lx_sigset_t *)lpsa.lpsa_addr;
+ }
+#if defined(_LP64)
+ else {
+ lx_pselect_sig_arg32_t lpsa32;
+
+ if (copyin(setp, &lpsa32, sizeof (lpsa32)) != 0)
+ return (set_errno(EFAULT));
+ /*
+ * Linux forces a size to be passed only so it can
+ * check that it's the size of a sigset_t.
+ */
+ if (lpsa32.lpsa_len != sizeof (lx_sigset_t))
+ return (set_errno(EINVAL));
+
+ sigaddr = (lx_sigset_t *)(uint64_t)lpsa32.lpsa_addr;
+ }
+#endif /* defined(_LP64) */
+
+ /* This is where we check if the sigset is *really* NULL. */
+ if (sigaddr != NULL) {
+ if (copyin(sigaddr, &lset, sizeof (lset)) != 0)
+ return (set_errno(EFAULT));
+
+ lx_ltos_sigset(&lset, &kset);
+ ksetp = &kset;
+ }
+ }
+
+ return (lx_select_common(nfds, rfds, wfds, efds, tsp, ksetp));
+}
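+
+/*
+ * Illustrative sketch, not part of this patch: the raw Linux pselect6
+ * syscall passes its sigmask as a pointer to an {address, size} pair,
+ * which is what the lx_pselect_sig_arg_t copyin above unpacks.  glibc
+ * normally hides this; the sketch invokes the syscall directly to show
+ * the wire format.  The 8-byte sigset size assumes x86-64.
+ */
+#if 0
+#define	_GNU_SOURCE
+#include <signal.h>
+#include <stddef.h>
+#include <sys/select.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	sigset_t set;
+	struct {
+		const sigset_t *ss;
+		size_t ss_len;
+	} sigarg;
+	struct timespec ts = { 1, 0 };
+
+	(void) sigemptyset(&set);
+	sigarg.ss = &set;
+	sigarg.ss_len = 8;	/* size of the kernel's sigset_t on x86-64 */
+	return (syscall(SYS_pselect6, 0, NULL, NULL, NULL, &ts, &sigarg));
+}
+#endif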
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_prctl.c b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c
new file mode 100644
index 0000000000..a8b3c3422c
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c
@@ -0,0 +1,288 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/priv.h>
+#include <sys/brand.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_misc.h>
+#include <lx_signum.h>
+
+#define LX_PR_SET_PDEATHSIG 1
+#define LX_PR_GET_PDEATHSIG 2
+#define LX_PR_GET_DUMPABLE 3
+#define LX_PR_SET_DUMPABLE 4
+#define LX_PR_GET_UNALIGN 5
+#define LX_PR_SET_UNALIGN 6
+#define LX_PR_GET_KEEPCAPS 7
+#define LX_PR_SET_KEEPCAPS 8
+#define LX_PR_GET_FPEMU 9
+#define LX_PR_SET_FPEMU 10
+#define LX_PR_GET_FPEXC 11
+#define LX_PR_SET_FPEXC 12
+#define LX_PR_GET_TIMING 13
+#define LX_PR_SET_TIMING 14
+#define LX_PR_SET_NAME 15
+#define LX_PR_GET_NAME 16
+#define LX_PR_GET_ENDIAN 19
+#define LX_PR_SET_ENDIAN 20
+#define LX_PR_GET_SECCOMP 21
+#define LX_PR_SET_SECCOMP 22
+#define LX_PR_CAPBSET_READ 23
+#define LX_PR_CAPBSET_DROP 24
+#define LX_PR_GET_TSC 25
+#define LX_PR_SET_TSC 26
+#define LX_PR_GET_SECUREBITS 27
+#define LX_PR_SET_SECUREBITS 28
+#define LX_PR_SET_TIMERSLACK 29
+#define LX_PR_GET_TIMERSLACK 30
+#define LX_PR_TASK_PERF_EVENTS_DISABLE 31
+#define LX_PR_TASK_PERF_EVENTS_ENABLE 32
+#define LX_PR_MCE_KILL 33
+#define LX_PR_MCE_KILL_GET 34
+#define LX_PR_SET_MM 35
+#define LX_PR_SET_CHILD_SUBREAPER 36
+#define LX_PR_GET_CHILD_SUBREAPER 37
+#define LX_PR_SET_NO_NEW_PRIVS 38
+#define LX_PR_GET_NO_NEW_PRIVS 39
+#define LX_PR_GET_TID_ADDRESS 40
+#define LX_PR_SET_THP_DISABLE 41
+#define LX_PR_GET_THP_DISABLE 42
+
+long
+lx_prctl(int opt, uintptr_t data)
+{
+ long err;
+ char ebuf[64];
+
+ switch (opt) {
+ case LX_PR_GET_DUMPABLE: {
+ /* Only track in brand data - could hook into SNOCD later */
+ lx_proc_data_t *lxpd;
+ int val;
+
+ mutex_enter(&curproc->p_lock);
+ VERIFY((lxpd = ptolxproc(curproc)) != NULL);
+ val = lxpd->l_flags & LX_PROC_NO_DUMP;
+ mutex_exit(&curproc->p_lock);
+
+ return (val == 0);
+ }
+
+ case LX_PR_SET_DUMPABLE: {
+ lx_proc_data_t *lxpd;
+
+ if (data != 0 && data != 1) {
+ return (set_errno(EINVAL));
+ }
+
+ mutex_enter(&curproc->p_lock);
+ VERIFY((lxpd = ptolxproc(curproc)) != NULL);
+ if (data == 0) {
+ lxpd->l_flags |= LX_PROC_NO_DUMP;
+ } else {
+ lxpd->l_flags &= ~LX_PROC_NO_DUMP;
+ }
+ mutex_exit(&curproc->p_lock);
+
+ return (0);
+ }
+
+ case LX_PR_GET_SECUREBITS: {
+ /* Our bits are always 0 */
+ return (0);
+ }
+
+ case LX_PR_SET_SECUREBITS: {
+ /* Ignore setting any bits from arg2 */
+ return (0);
+ }
+
+ case LX_PR_SET_KEEPCAPS: {
+ /*
+ * The closest illumos analog to SET_KEEPCAPS is the PRIV_AWARE
+ * flag. There are probably some cases where it's not exactly
+ * the same, but this will do for a first try.
+ */
+ if (data == 0) {
+ err = setpflags(PRIV_AWARE_RESET, 1, NULL);
+ } else {
+ err = setpflags(PRIV_AWARE, 1, NULL);
+ }
+
+ if (err != 0) {
+ return (set_errno(err));
+ }
+ return (0);
+ }
+
+ case LX_PR_GET_NAME: {
+ /*
+ * We allow longer thread names than Linux for compatibility
+ * with other OSes (Solaris, NetBSD) that also allow larger
+ * names. We just truncate (with NUL termination) if
+ * the name is longer.
+ */
+ char name[LX_PR_SET_NAME_NAMELEN] = { 0 };
+ kthread_t *t = curthread;
+
+ mutex_enter(&ttoproc(t)->p_lock);
+ if (t->t_name != NULL) {
+ (void) strlcpy(name, t->t_name, sizeof (name));
+ }
+ mutex_exit(&ttoproc(t)->p_lock);
+
+ /*
+ * FWIW, the prctl(2) manpage says that the user-supplied
+ * buffer should be at least 16 (LX_PR_SET_NAME_NAMELEN) bytes
+ * long.
+ */
+ if (copyout(name, (void *)data, LX_PR_SET_NAME_NAMELEN) != 0) {
+ return (set_errno(EFAULT));
+ }
+ return (0);
+ }
+
+ case LX_PR_SET_NAME: {
+ char name[LX_PR_SET_NAME_NAMELEN] = { 0 };
+ kthread_t *t = curthread;
+ proc_t *p = ttoproc(t);
+ int ret;
+
+ ret = copyinstr((const char *)data, name, sizeof (name), NULL);
+ /*
+		 * prctl(2) explicitly states that over-length strings are
+		 * silently truncated.
+ */
+ if (ret != 0 && ret != ENAMETOOLONG) {
+ return (set_errno(EFAULT));
+ }
+ name[LX_PR_SET_NAME_NAMELEN - 1] = '\0';
+
+ if ((ret = thread_setname(t, name)) != 0) {
+ return (set_errno(ret));
+ }
+
+ /*
+ * In Linux, PR_SET_NAME sets the name of the thread, not the
+ * process. Due to the historical quirks of Linux's asinine
+ * thread model, this name is effectively the name of the
+ * process (as visible via ps(1)) if the thread is the first of
+ * its task group. The first thread is therefore special, and
+ * to best mimic Linux semantics we set the thread name, and if
+ * we are setting LWP 1, we also update the name of the process.
+ */
+ if (t->t_tid != 1) {
+ return (0);
+ }
+
+ /*
+ * We are currently choosing to not allow an empty thread
+ * name to clear p->p_user.u_comm and p->p_user.u_psargs.
+		 * This is a slight divergence from Linux behavior (which
+ * allows this) so that we can preserve the original command.
+ */
+ if (strlen(name) == 0) {
+ return (0);
+ }
+
+ /*
+ * We explicitly use t->t_name here instead of name in case
+ * a thread has come in between the above thread_setname()
+ * call and the setting of u_comm/u_psargs below. On Linux,
+ * one can also change the name of a thread (either itself or
+ * another thread in the same process) via writing to /proc, so
+ * while racy, this is no worse than what might happen on
+ * Linux.
+ */
+ mutex_enter(&p->p_lock);
+ (void) strncpy(p->p_user.u_comm, t->t_name, MAXCOMLEN + 1);
+ (void) strncpy(p->p_user.u_psargs, t->t_name, PSARGSZ);
+ mutex_exit(&p->p_lock);
+ return (0);
+ }
+
+ case LX_PR_GET_PDEATHSIG: {
+ int sig;
+ lx_proc_data_t *lxpd;
+
+ mutex_enter(&curproc->p_lock);
+ VERIFY((lxpd = ptolxproc(curproc)) != NULL);
+ sig = lxpd->l_parent_deathsig;
+ mutex_exit(&curproc->p_lock);
+
+ return (sig);
+ }
+
+ case LX_PR_SET_PDEATHSIG: {
+ int sig = lx_ltos_signo((int)data, 0);
+ proc_t *pp = NULL;
+ lx_proc_data_t *lxpd;
+
+ if (sig == 0 && data != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ mutex_enter(&pidlock);
+	/* Set the signal on ourselves */
+ mutex_enter(&curproc->p_lock);
+ VERIFY((lxpd = ptolxproc(curproc)) != NULL);
+ lxpd->l_parent_deathsig = sig;
+ pp = curproc->p_parent;
+ mutex_exit(&curproc->p_lock);
+
+ /* Configure parent to potentially signal children on death */
+ mutex_enter(&pp->p_lock);
+ if (PROC_IS_BRANDED(pp)) {
+ VERIFY((lxpd = ptolxproc(pp)) != NULL);
+ /*
+ * Mark the parent as having children which wish to be
+ * signaled on death of parent.
+ */
+ lxpd->l_flags |= LX_PROC_CHILD_DEATHSIG;
+ } else {
+ /*
+ * If the parent is not a branded process, the needed
+ * hooks to facilitate this mechanism will not fire
+ * when it dies. We lie about success in this case.
+ */
+ /* EMPTY */
+ }
+ mutex_exit(&pp->p_lock);
+ mutex_exit(&pidlock);
+ return (0);
+ }
+
+ case LX_PR_CAPBSET_DROP: {
+ /*
+		 * On recent versions of Linux, the login svc drops
+		 * capabilities; if that fails, the svc dies and is restarted
+		 * by systemd.
+ * For now we pretend dropping capabilities succeeded.
+ */
+ return (0);
+ }
+
+ default:
+ break;
+ }
+
+ (void) snprintf(ebuf, 64, "prctl option %d", opt);
+ lx_unsupported(ebuf);
+ return (set_errno(EINVAL));
+}
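+
+/*
+ * Illustrative sketch, not part of this patch: the PR_SET_NAME/PR_GET_NAME
+ * pair handled above.  Linux silently truncates the name to 16 bytes
+ * (including the NUL), and the name set on the first thread doubles as the
+ * process name visible to ps(1).
+ */
+#if 0
+#include <stdio.h>
+#include <sys/prctl.h>
+
+int
+main(void)
+{
+	char name[16];
+
+	/* Longer than 15 characters: the stored name is truncated. */
+	(void) prctl(PR_SET_NAME, "averyverylongthreadname");
+	(void) prctl(PR_GET_NAME, name);
+	(void) printf("%s\n", name);	/* prints "averyverylongth" */
+	return (0);
+}
+#endif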
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_priority.c b/usr/src/uts/common/brand/lx/syscall/lx_priority.c
new file mode 100644
index 0000000000..44c60b66bf
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_priority.c
@@ -0,0 +1,192 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/procset.h>
+#include <sys/resource.h>
+#include <sys/priocntl.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+/* From uts/common/disp/priocntl.c */
+extern int donice(procset_t *, pcnice_t *);
+
+/*
+ * The Linux syscall returns priorities in the range (highest) 40-1 (lowest)
+ * and glibc then adjusts these to the range -20 to 19.
+ */
+long
+lx_getpriority(int which, id_t who)
+{
+ int rval;
+ idtype_t idtype;
+ id_t id, lid;
+ pcnice_t pcnice;
+ procset_t procset;
+
+ switch (which) {
+ case PRIO_PROCESS:
+ idtype = P_PID;
+ if (who > 0 && lx_lpid_to_spair(who, &who, &lid) < 0)
+ return (set_errno(ESRCH));
+ break;
+ case PRIO_PGRP:
+ idtype = P_PGID;
+ break;
+ case PRIO_USER:
+ idtype = P_UID;
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ /* Linux fails with a different errno on a negative id */
+ if (who < 0)
+ return (set_errno(ESRCH));
+
+ id = (who == 0 ? P_MYID : who);
+
+ pcnice.pc_val = 0;
+ pcnice.pc_op = PC_GETNICE;
+
+ setprocset(&procset, POP_AND, idtype, id, P_ALL, 0);
+
+ rval = donice(&procset, &pcnice);
+ if (rval != 0) {
+ if (which == PRIO_PROCESS &&
+ (who == curproc->p_pid || who == 0) &&
+ strcmp(sclass[curthread->t_cid].cl_name, "RT") == 0) {
+ /*
+ * donice() will always return EINVAL if we're in the
+ * RT class. The zone won't be able to put itself or any
+ * of its processes into RT, but if we put the whole
+ * zone into RT via the scheduling-class property, then
+			 * getpriority would always fail. This breaks PAM and
+			 * prevents any login. Just pretend to be the highest
+ * priority.
+ */
+ return (40);
+ }
+
+ /*
+		 * Linux does not return EINVAL for invalid 'who' values; it
+ * returns ESRCH instead. We already validated 'which' above.
+ */
+ if (rval == EINVAL)
+ rval = ESRCH;
+ return (set_errno(rval));
+ }
+
+ /*
+ * The return value of the getpriority syscall is biased by 20 to avoid
+ * returning negative values when successful (-20 internally is our
+ * highest priority and 19 is our lowest).
+ */
+ return (20 - pcnice.pc_val);
+}
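+
+/*
+ * Illustrative sketch, not part of this patch: the 20-bias described
+ * above.  The raw Linux getpriority syscall returns 20 - nice (1..40) and
+ * glibc subtracts that back out before handing the caller the familiar
+ * -20..19 value.
+ */
+#if 0
+#include <stdio.h>
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	long raw = syscall(SYS_getpriority, PRIO_PROCESS, 0);
+	int nice = getpriority(PRIO_PROCESS, 0);	/* glibc wrapper */
+
+	/* For an un-niced process: raw == 20, nice == 0. */
+	(void) printf("raw=%ld nice=%d (raw == 20 - nice)\n", raw, nice);
+	return (0);
+}
+#endif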
+
+/*
+ * Return EPERM if the current process is not allowed to operate on the target
+ * process (which is part of the procset for setpriority).
+ */
+/* ARGSUSED */
+static int
+lx_chk_pripriv(proc_t *pp, char *dummy)
+{
+ ASSERT(MUTEX_HELD(&pidlock));
+ mutex_enter(&pp->p_lock);
+ if (!prochasprocperm(pp, curproc, CRED())) {
+ mutex_exit(&pp->p_lock);
+ return (EPERM);
+ }
+ mutex_exit(&pp->p_lock);
+ return (0);
+}
+
+long
+lx_setpriority(int which, id_t who, int prio)
+{
+ int rval;
+ idtype_t idtype;
+ id_t id, lid;
+ pcnice_t pcnice;
+ procset_t procset;
+
+ switch (which) {
+ case PRIO_PROCESS:
+ idtype = P_PID;
+ if (who > 0 && lx_lpid_to_spair(who, &who, &lid) < 0)
+ return (set_errno(ESRCH));
+ break;
+ case PRIO_PGRP:
+ idtype = P_PGID;
+ break;
+ case PRIO_USER:
+ idtype = P_UID;
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ /* Linux fails with a different errno on a negative id */
+ if (who < 0)
+ return (set_errno(ESRCH));
+
+ id = (who == 0 ? P_MYID : who);
+
+ if (prio > NZERO - 1) {
+ prio = NZERO - 1;
+ } else if (prio < -NZERO) {
+ prio = -NZERO;
+ }
+
+ pcnice.pc_val = prio;
+ pcnice.pc_op = PC_SETNICE;
+
+ setprocset(&procset, POP_AND, idtype, id, P_ALL, 0);
+
+ rval = donice(&procset, &pcnice);
+ if (rval != 0) {
+ /*
+ * Once we fully support Linux capabilities, we should update
+ * the following check to look at the CAP_SYS_NICE capability.
+ */
+ if (rval == EPERM && crgetuid(CRED()) != 0) {
+ /*
+ * donice() returns EPERM under two conditions:
+ * 1) if either the real or eff. uid don't match
+ * 2) we lack the privileges to raise the priority
+ *
+ * However, setpriority() must return a different errno
+ * based on the following:
+ * EPERM - real or eff. uid did not match
+ * EACCES - trying to increase priority
+ *
+ * We use lx_chk_pripriv to determine which case we hit.
+ *
+ * Note that the native setpriority(3C) code has the
+ * same race on re-checking.
+ */
+ if (dotoprocs(&procset, lx_chk_pripriv, NULL) != EPERM)
+ rval = EACCES;
+ }
+
+ return (set_errno(rval));
+ }
+
+ return (0);
+}
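+
+/*
+ * Illustrative sketch, not part of this patch: the EPERM/EACCES
+ * distinction made above.  On Linux, an unprivileged process that owns the
+ * target but tries to raise its priority typically gets EACCES, while
+ * operating on a process it does not own gets EPERM.
+ */
+#if 0
+#include <errno.h>
+#include <stdio.h>
+#include <sys/resource.h>
+
+int
+main(void)
+{
+	/* Raising our own priority without privilege: expect EACCES. */
+	if (setpriority(PRIO_PROCESS, 0, -10) != 0)
+		(void) printf("raise: errno=%d (EACCES=%d)\n", errno, EACCES);
+	/* Renicing init, which we do not own: expect EPERM. */
+	if (setpriority(PRIO_PROCESS, 1, 5) != 0)
+		(void) printf("other: errno=%d (EPERM=%d)\n", errno, EPERM);
+	return (0);
+}
+#endif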
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rename.c b/usr/src/uts/common/brand/lx/syscall/lx_rename.c
new file mode 100644
index 0000000000..2fad627771
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rename.c
@@ -0,0 +1,39 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/fcntl.h>
+#include <sys/lx_fcntl.h>
+
+/* From uts/common/syscall/rename.c */
+extern int rename(char *, char *);
+extern int renameat(int, char *, int, char *);
+
+long
+lx_rename(char *p1, char *p2)
+{
+ return (rename(p1, p2));
+}
+
+long
+lx_renameat(int atfd1, char *p1, int atfd2, char *p2)
+{
+ if (atfd1 == LX_AT_FDCWD)
+ atfd1 = AT_FDCWD;
+
+ if (atfd2 == LX_AT_FDCWD)
+ atfd2 = AT_FDCWD;
+
+ return (renameat(atfd1, p1, atfd2, p2));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c
new file mode 100644
index 0000000000..30fa996615
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c
@@ -0,0 +1,587 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/zone.h>
+#include <sys/cpuvar.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+
+#define LX_RLIMIT_CPU 0
+#define LX_RLIMIT_FSIZE 1
+#define LX_RLIMIT_DATA 2
+#define LX_RLIMIT_STACK 3
+#define LX_RLIMIT_CORE 4
+#define LX_RLIMIT_RSS 5
+#define LX_RLIMIT_NPROC 6
+#define LX_RLIMIT_NOFILE 7
+#define LX_RLIMIT_MEMLOCK 8
+#define LX_RLIMIT_AS 9
+#define LX_RLIMIT_LOCKS 10 /* NA limit on locks, early 2.4 only */
+#define LX_RLIMIT_SIGPENDING 11
+#define LX_RLIMIT_MSGQUEUE 12
+#define LX_RLIMIT_NICE 13 /* NA ceiling for nice */
+#define LX_RLIMIT_RTPRIO 14 /* NA ceiling on the RT priority */
+#define LX_RLIMIT_RTTIME 15 /* NA cpu limit for RT proc. */
+
+#define LX_RLIMIT_NLIMITS 16
+
+#define RCTL_INFINITE(x) \
+ ((x->rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \
+ (x->rcv_flagaction & RCTL_GLOBAL_INFINITE))
+
+typedef struct {
+ ulong_t rlim_cur;
+ ulong_t rlim_max;
+} lx_rlimit_t;
+
+typedef struct {
+ uint32_t rlim_cur;
+ uint32_t rlim_max;
+} lx_rlimit32_t;
+
+/*
+ * Linux supports many of the same resources that we do, but on illumos these
+ * are rctls. Instead of using rlimit, we use rctls for all of the limits.
+ * This table is used to translate Linux rlimit keys into the illumos legacy
+ * rlimit. We then primarily use the rctl/rlimit compatibility code to
+ * manage these.
+ */
+static int l_to_r[LX_RLIMIT_NLIMITS] = {
+ RLIMIT_CPU, /* 0 CPU */
+ RLIMIT_FSIZE, /* 1 FSIZE */
+ RLIMIT_DATA, /* 2 DATA */
+ RLIMIT_STACK, /* 3 STACK */
+ RLIMIT_CORE, /* 4 CORE */
+ -1, /* 5 RSS */
+ -1, /* 6 NPROC */
+ RLIMIT_NOFILE, /* 7 NOFILE */
+ -1, /* 8 MEMLOCK */
+ RLIMIT_AS, /* 9 AS */
+ -1, /* 10 LOCKS */
+ -1, /* 11 SIGPENDING */
+ -1, /* 12 MSGQUEUE */
+ -1, /* 13 NICE */
+ -1, /* 14 RTPRIO */
+ -1 /* 15 RTTIME */
+};
+
+/*
+ * Magic value Linux uses to indicate infinity
+ */
+#define LX_RLIM_INFINITY_N ULONG_MAX
+
+void
+lx_get_rctl(char *nm, struct rlimit64 *rlp64)
+{
+ rctl_hndl_t hndl;
+ rctl_val_t *oval, *nval;
+
+ rlp64->rlim_cur = RLIM_INFINITY;
+ rlp64->rlim_max = RLIM_INFINITY;
+
+ nval = kmem_alloc(sizeof (rctl_val_t), KM_SLEEP);
+ mutex_enter(&curproc->p_lock);
+
+ hndl = rctl_hndl_lookup(nm);
+ oval = NULL;
+ while ((hndl != -1) && rctl_local_get(hndl, oval, nval, curproc) == 0) {
+ oval = nval;
+ switch (nval->rcv_privilege) {
+ case RCPRIV_BASIC:
+ if (!RCTL_INFINITE(nval))
+ rlp64->rlim_cur = nval->rcv_value;
+ break;
+ case RCPRIV_PRIVILEGED:
+ if (!RCTL_INFINITE(nval))
+ rlp64->rlim_max = nval->rcv_value;
+ break;
+ }
+ }
+
+ mutex_exit(&curproc->p_lock);
+ kmem_free(nval, sizeof (rctl_val_t));
+
+ if (rlp64->rlim_cur == RLIM_INFINITY &&
+ rlp64->rlim_max != RLIM_INFINITY)
+ rlp64->rlim_cur = rlp64->rlim_max;
+}
+
+static int
+lx_getrlimit_common(int lx_resource, uint64_t *rlim_curp, uint64_t *rlim_maxp)
+{
+ lx_proc_data_t *pd = ptolxproc(curproc);
+ int resource;
+ int64_t cur = -1;
+ boolean_t cur_inf = B_FALSE;
+ int64_t max = -1;
+ boolean_t max_inf = B_FALSE;
+ struct rlimit64 rlim64;
+
+ if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS)
+ return (EINVAL);
+
+ switch (lx_resource) {
+ case LX_RLIMIT_LOCKS:
+ rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur;
+ rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max;
+ break;
+
+ case LX_RLIMIT_NICE:
+ rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur;
+ rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max;
+ break;
+
+ case LX_RLIMIT_RTPRIO:
+ rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur;
+ rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max;
+ break;
+
+ case LX_RLIMIT_RTTIME:
+ rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur;
+ rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max;
+ break;
+
+ case LX_RLIMIT_RSS:
+ /* zone.max-physical-memory */
+ zone_get_physmem_data(curzone->zone_id,
+ (pgcnt_t *)&rlim64.rlim_cur,
+ (pgcnt_t *)&rlim64.rlim_max); /* max is dummy variable */
+ rlim64.rlim_cur = rlim64.rlim_max = ptob(rlim64.rlim_cur);
+
+ break;
+
+ case LX_RLIMIT_NPROC:
+ /* zone.max-lwps */
+ rlim64.rlim_cur = rlim64.rlim_max = curzone->zone_nlwps_ctl;
+ break;
+
+ case LX_RLIMIT_MEMLOCK:
+ lx_get_rctl("process.max-locked-memory", &rlim64);
+
+ /* If unlimited, use zone.max-locked-memory */
+ if (rlim64.rlim_max == RLIM64_INFINITY)
+ rlim64.rlim_max = curzone->zone_locked_mem_ctl;
+ if (rlim64.rlim_cur == RLIM64_INFINITY)
+ rlim64.rlim_cur = curzone->zone_locked_mem_ctl;
+ break;
+
+ case LX_RLIMIT_SIGPENDING:
+ lx_get_rctl("process.max-sigqueue-size", &rlim64);
+ break;
+
+ case LX_RLIMIT_MSGQUEUE:
+ lx_get_rctl("process.max-msg-messages", &rlim64);
+ break;
+
+ default:
+ resource = l_to_r[lx_resource];
+
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_rlimit_get(rctlproc_legacy[resource], curproc,
+ &rlim64);
+ mutex_exit(&curproc->p_lock);
+ break;
+ }
+
+ if (rlim64.rlim_cur == RLIM64_INFINITY) {
+ cur = LX_RLIM_INFINITY_N;
+ } else {
+ cur = rlim64.rlim_cur;
+ }
+ if (rlim64.rlim_max == RLIM64_INFINITY) {
+ max = LX_RLIM_INFINITY_N;
+ } else {
+ max = rlim64.rlim_max;
+ }
+
+ if (lx_resource == LX_RLIMIT_STACK && cur > INT_MAX) {
+ /*
+ * Stunningly, Linux has somehow managed to confuse the concept
+ * of a "limit" with that of a "default" -- and the value of
+ * RLIMIT_STACK is used by NPTL as the _default_ stack size if
+ * it isn't specified. (!!) Even for a system that prides
+ * itself on slapdash castles of junk, this is an amazingly
+ * willful act of incompetence -- and one that is gleefully
+ * confessed in the pthread_create() man page: "if the
+ * RLIMIT_STACK soft resource limit at the time the program
+ * started has any value other than 'unlimited', then it
+ * determines the default stack size of new threads." A
+ * typical stack limit for us is 32TB; if it needs to be said,
+ * setting the default stack size to be 32TB doesn't work so
+ * well! Of course, glibc dropping a deuce in its pants
+ * becomes our problem -- so to prevent smelly accidents we
+ * tell Linux that any stack limit over the old (32-bit) values
+ * for infinity are just infinitely large.
+ */
+ cur_inf = B_TRUE;
+ max_inf = B_TRUE;
+ }
+
+ if (cur_inf) {
+ *rlim_curp = LX_RLIM64_INFINITY;
+ } else {
+ *rlim_curp = cur;
+ }
+
+ if (max_inf) {
+ *rlim_maxp = LX_RLIM64_INFINITY;
+ } else {
+ *rlim_maxp = max;
+ }
+
+ return (0);
+}
+
+/*
+ * This is the 'new' getrlimit, variously called getrlimit or ugetrlimit
+ * in Linux headers and code. The only difference between this and the old
+ * getrlimit (variously called getrlimit or old_getrlimit) is the value of
+ * RLIM_INFINITY, which is smaller for the older version. Modern code will
+ * use this version by default.
+ */
+long
+lx_getrlimit(int resource, lx_rlimit_t *rlp)
+{
+ int rv;
+ lx_rlimit_t rl;
+ uint64_t rlim_cur, rlim_max;
+
+ rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max);
+ if (rv != 0)
+ return (set_errno(rv));
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (rlim_cur == LX_RLIM64_INFINITY)
+ rl.rlim_cur = LX_RLIM_INFINITY_N;
+ else if (rlim_cur > LX_RLIM_INFINITY_N)
+ rl.rlim_cur = LX_RLIM_INFINITY_N;
+ else
+ rl.rlim_cur = (ulong_t)rlim_cur;
+
+ if (rlim_max == LX_RLIM64_INFINITY)
+ rl.rlim_max = LX_RLIM_INFINITY_N;
+ else if (rlim_max > LX_RLIM_INFINITY_N)
+ rl.rlim_max = LX_RLIM_INFINITY_N;
+ else
+ rl.rlim_max = (ulong_t)rlim_max;
+
+ if (copyout(&rl, rlp, sizeof (rl)) != 0)
+ return (set_errno(EFAULT));
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ lx_rlimit32_t rl32;
+
+ if (rlim_cur > UINT_MAX)
+ rl.rlim_cur = UINT_MAX;
+ else
+ rl.rlim_cur = (ulong_t)rlim_cur;
+
+ if (rlim_max > UINT_MAX)
+ rl.rlim_max = UINT_MAX;
+ else
+ rl.rlim_max = (ulong_t)rlim_max;
+
+ rl32.rlim_cur = rl.rlim_cur;
+ rl32.rlim_max = rl.rlim_max;
+
+ if (copyout(&rl32, rlp, sizeof (rl32)) != 0)
+ return (set_errno(EFAULT));
+ }
+#endif
+
+ return (0);
+}
+
+/*
+ * This is the 'old' getrlimit, variously called getrlimit or old_getrlimit
+ * in Linux headers and code. The only difference between this and the new
+ * getrlimit (variously called getrlimit or ugetrlimit) is the value of
+ * RLIM_INFINITY, which is smaller for the older version.
+ *
+ * This is only used for 32-bit code.
+ */
+long
+lx_oldgetrlimit(int resource, lx_rlimit_t *rlp)
+{
+ int rv;
+ lx_rlimit32_t rl32;
+ uint64_t rlim_cur, rlim_max;
+
+ rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max);
+ if (rv != 0)
+ return (set_errno(rv));
+
+ if (rlim_cur > INT_MAX)
+ rl32.rlim_cur = INT_MAX;
+ else
+ rl32.rlim_cur = (ulong_t)rlim_cur;
+
+ if (rlim_max > INT_MAX)
+ rl32.rlim_max = INT_MAX;
+ else
+		rl32.rlim_max = (ulong_t)rlim_max;
+
+ if (copyout(&rl32, rlp, sizeof (rl32)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+static int
+lx_set_rctl(char *nm, struct rlimit64 *rlp64)
+{
+ int err;
+ rctl_hndl_t hndl;
+ rctl_alloc_gp_t *gp;
+
+ gp = rctl_rlimit_set_prealloc(1);
+
+ mutex_enter(&curproc->p_lock);
+
+ hndl = rctl_hndl_lookup(nm);
+
+ /*
+	 * We're not supposed to do this, but since we want all our rctls to
+	 * behave like rlimits, we take advantage of this function to set
+	 * them up that way.
+ */
+ err = rctl_rlimit_set(hndl, curproc, rlp64, gp, RCTL_LOCAL_DENY, 0,
+ CRED());
+
+ mutex_exit(&curproc->p_lock);
+
+ rctl_prealloc_destroy(gp);
+
+ return (err);
+}
+
+static int
+lx_setrlimit_common(int lx_resource, uint64_t rlim_cur, uint64_t rlim_max)
+{
+ lx_proc_data_t *pd = ptolxproc(curproc);
+ int err;
+ int resource;
+ rctl_alloc_gp_t *gp;
+ struct rlimit64 rl64;
+
+ if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS)
+ return (EINVAL);
+
+ switch (lx_resource) {
+ case LX_RLIMIT_LOCKS:
+ pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = rlim_cur;
+ pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = rlim_max;
+ break;
+
+ case LX_RLIMIT_NICE:
+ pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = rlim_cur;
+ pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = rlim_max;
+ break;
+
+ case LX_RLIMIT_RTPRIO:
+ pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = rlim_cur;
+ pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = rlim_max;
+ break;
+
+ case LX_RLIMIT_RTTIME:
+ pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = rlim_cur;
+ pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = rlim_max;
+ break;
+
+ case LX_RLIMIT_RSS:
+ /*
+ * zone.max-physical-memory
+ * Since we're emulating the value via a zone rctl, we can't
+ * set that from within the zone. Lie and say we set the value.
+ */
+ break;
+
+ case LX_RLIMIT_NPROC:
+ /*
+ * zone.max-lwps
+ * Since we're emulating the value via a zone rctl, we can't
+ * set that from within the zone. Lie and say we set the value.
+ */
+ break;
+
+ case LX_RLIMIT_MEMLOCK:
+ /*
+ * We allow setting to unlimited (LX_RLIM_INFINITY_N). The zone
+ * limit will always apply.
+ */
+ rl64.rlim_cur = rlim_cur;
+ rl64.rlim_max = rlim_max;
+ err = lx_set_rctl("process.max-locked-memory", &rl64);
+ if (err != 0)
+ return (set_errno(err));
+ break;
+
+ case LX_RLIMIT_SIGPENDING:
+ /*
+		 * On Ubuntu at least, the login and sshd processes expect to
+		 * set this limit to 16k, and login will fail if that cannot
+		 * be done. On illumos we have a system limit of 8k and
+		 * normally the privileged limit is 512. We simply pretend
+		 * this works so that login can proceed.
+ */
+ if (rlim_max > 8192)
+ return (0);
+
+ rl64.rlim_cur = rlim_cur;
+ rl64.rlim_max = rlim_max;
+ if ((err = lx_set_rctl("process.max-sigqueue-size", &rl64))
+ != 0)
+ return (set_errno(err));
+ break;
+
+ case LX_RLIMIT_MSGQUEUE:
+ rl64.rlim_cur = rlim_cur;
+ rl64.rlim_max = rlim_max;
+ if ((err = lx_set_rctl("process.max-msg-messages", &rl64)) != 0)
+ return (set_errno(err));
+ break;
+
+ default:
+ resource = l_to_r[lx_resource];
+
+ /*
+		 * Linux limits the max number of open files to 1m, and there
+		 * is a test for this.
+ */
+ if (lx_resource == LX_RLIMIT_NOFILE && rlim_max > (1024 * 1024))
+ return (EPERM);
+
+ rl64.rlim_cur = rlim_cur;
+ rl64.rlim_max = rlim_max;
+ gp = rctl_rlimit_set_prealloc(1);
+
+ mutex_enter(&curproc->p_lock);
+ err = rctl_rlimit_set(rctlproc_legacy[resource], curproc,
+ &rl64, gp, rctlproc_flags[resource],
+ rctlproc_signals[resource], CRED());
+ mutex_exit(&curproc->p_lock);
+
+ rctl_prealloc_destroy(gp);
+ if (err != 0)
+ return (set_errno(err));
+ break;
+ }
+
+ return (0);
+}
+
+long
+lx_setrlimit(int resource, lx_rlimit_t *rlp)
+{
+ int rv;
+ lx_rlimit_t rl;
+ uint64_t rlim_cur, rlim_max;
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(rlp, &rl, sizeof (rl)) != 0)
+ return (set_errno(EFAULT));
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ lx_rlimit32_t rl32;
+
+ if (copyin(rlp, &rl32, sizeof (rl32)) != 0)
+ return (set_errno(EFAULT));
+
+ rl.rlim_cur = rl32.rlim_cur;
+ rl.rlim_max = rl32.rlim_max;
+ }
+#endif
+
+ if ((rl.rlim_max != LX_RLIM_INFINITY_N &&
+ rl.rlim_cur == LX_RLIM_INFINITY_N) ||
+ rl.rlim_cur > rl.rlim_max)
+ return (set_errno(EINVAL));
+
+ if (rl.rlim_cur == LX_RLIM_INFINITY_N)
+ rlim_cur = LX_RLIM64_INFINITY;
+ else
+ rlim_cur = rl.rlim_cur;
+
+ if (rl.rlim_max == LX_RLIM_INFINITY_N)
+ rlim_max = LX_RLIM64_INFINITY;
+ else
+ rlim_max = rl.rlim_max;
+
+ rv = lx_setrlimit_common(resource, rlim_cur, rlim_max);
+ if (rv != 0)
+ return (set_errno(rv));
+ return (0);
+}
+
+/*
+ * From the man page:
+ * The Linux-specific prlimit() system call combines and extends the
+ * functionality of setrlimit() and getrlimit(). It can be used to both set
+ * and get the resource limits of an arbitrary process.
+ *
+ * If pid is 0, then the call applies to the calling process.
+ */
+long
+lx_prlimit64(pid_t pid, int resource, lx_rlimit64_t *nrlp, lx_rlimit64_t *orlp)
+{
+ int rv;
+ lx_rlimit64_t nrl, orl;
+
+ if (pid != 0) {
+ /* XXX TBD if needed */
+ char buf[80];
+
+ (void) snprintf(buf, sizeof (buf),
+ "setting prlimit %d for another process\n", resource);
+ lx_unsupported(buf);
+		return (set_errno(ENOTSUP));
+ }
+
+ if (orlp != NULL) {
+ /* we first get the current limits */
+ rv = lx_getrlimit_common(resource, &orl.rlim_cur,
+ &orl.rlim_max);
+ if (rv != 0)
+ return (set_errno(rv));
+ }
+
+ if (nrlp != NULL) {
+ if (copyin(nrlp, &nrl, sizeof (nrl)) != 0)
+ return (set_errno(EFAULT));
+
+ if ((nrl.rlim_max != LX_RLIM64_INFINITY &&
+ nrl.rlim_cur == LX_RLIM64_INFINITY) ||
+ nrl.rlim_cur > nrl.rlim_max)
+ return (set_errno(EINVAL));
+
+ rv = lx_setrlimit_common(resource, nrl.rlim_cur, nrl.rlim_max);
+ if (rv != 0)
+ return (set_errno(rv));
+ }
+
+ if (orlp != NULL) {
+ /* now return the original limits, if necessary */
+ if (copyout(&orl, orlp, sizeof (orl)) != 0)
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
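+
+/*
+ * Illustrative sketch, not part of this patch: glibc's prlimit(2) wrapper
+ * over the syscall emulated above.  With pid 0 it atomically swaps in a
+ * new limit and hands back the old one; a nonzero pid would currently hit
+ * the lx_unsupported path.
+ */
+#if 0
+#define	_GNU_SOURCE
+#include <stdio.h>
+#include <sys/resource.h>
+
+int
+main(void)
+{
+	struct rlimit nrl = { 1024, 4096 }, orl;
+
+	if (prlimit(0, RLIMIT_NOFILE, &nrl, &orl) != 0)
+		return (1);
+	(void) printf("old: cur=%llu max=%llu\n",
+	    (unsigned long long)orl.rlim_cur,
+	    (unsigned long long)orl.rlim_max);
+	return (0);
+}
+#endif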
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rw.c b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
new file mode 100644
index 0000000000..34aafcaf5d
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
@@ -0,0 +1,956 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/vnode.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/nbmlock.h>
+#include <sys/limits.h>
+
+/* uts/common/syscall/rw.c */
+extern size_t copyout_max_cached;
+
+/* Common routines */
+
+static int
+lx_iovec_copyin(void *uiovp, int iovcnt, iovec_t *kiovp, ssize_t *count)
+{
+#ifdef _SYSCALL32_IMPL
+ /*
+ * 32-bit callers need to have their iovec expanded, while ensuring
+ * that they can't move more than 2Gbytes of data in a single call.
+ */
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ int aiov32len = 0;
+ ssize32_t total32 = 0;
+ int i;
+
+ if (iovcnt > IOV_MAX_STACK) {
+ aiov32len = iovcnt * sizeof (iovec32_t);
+ aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
+ }
+
+ if (copyin(uiovp, aiov32, iovcnt * sizeof (iovec32_t))) {
+ if (aiov32len != 0) {
+ kmem_free(aiov32, aiov32len);
+ }
+ return (EFAULT);
+ }
+
+ for (i = 0; i < iovcnt; i++) {
+ ssize32_t iovlen32 = aiov32[i].iov_len;
+ total32 += iovlen32;
+ if (iovlen32 < 0 || total32 < 0) {
+ if (aiov32len != 0) {
+ kmem_free(aiov32, aiov32len);
+ }
+ return (EINVAL);
+ }
+ kiovp[i].iov_len = iovlen32;
+ kiovp[i].iov_base =
+ (caddr_t)(uintptr_t)aiov32[i].iov_base;
+ /* Linux does a basic sanity test on the address */
+ if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT32) {
+ if (aiov32len != 0) {
+ kmem_free(aiov32, aiov32len);
+ }
+ return (EFAULT);
+ }
+ }
+ *count = total32;
+
+ if (aiov32len != 0)
+ kmem_free(aiov32, aiov32len);
+ } else
+#endif
+ {
+ ssize_t total = 0;
+ int i;
+
+ if (copyin(uiovp, kiovp, iovcnt * sizeof (iovec_t)))
+ return (EFAULT);
+ for (i = 0; i < iovcnt; i++) {
+ ssize_t iovlen = kiovp[i].iov_len;
+ total += iovlen;
+ if (iovlen < 0 || total < 0) {
+ return (EINVAL);
+ }
+ /* Linux does a basic sanity test on the address */
+ if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT) {
+ return (EFAULT);
+ }
+ }
+ *count = total;
+ }
+ return (0);
+}
+
+int
+lx_read_common(file_t *fp, uio_t *uiop, size_t *nread, boolean_t positioned)
+{
+ vnode_t *vp = fp->f_vnode;
+ int error = 0, rwflag = 0, ioflag;
+ ssize_t count = uiop->uio_resid;
+ size_t rcount = 0;
+ struct cpu *cp;
+ boolean_t in_crit = B_FALSE;
+
+ if (fp->f_vnode->v_type == VDIR) {
+ return (EISDIR);
+ }
+ if (positioned &&
+ (fp->f_vnode->v_type == VFIFO || fp->f_vnode->v_type == VSOCK)) {
+ return (ESPIPE);
+ }
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = B_TRUE;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_READ, uiop->uio_offset, count, svmand,
+ NULL) != 0) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+ /*
+ * For non-positioned reads, recheck offset/count validity inside
+	 * VOP_RWLOCK to prevent filesize from changing during validation.
+ */
+ if (!positioned) {
+ u_offset_t uoffset = (u_offset_t)(ulong_t)fp->f_offset;
+
+ if ((vp->v_type == VREG) && (uoffset >= OFFSET_MAX(fp))) {
+ struct vattr va;
+
+ va.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL);
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ if (error != 0)
+ goto out;
+			/* We have to return EOF if the offset is >= file size. */
+ if (uoffset >= va.va_size)
+ goto out;
+ /*
+			 * The offset is at or beyond maxoff, so we return
+			 * EOVERFLOW.
+ */
+ error = EOVERFLOW;
+ goto out;
+ }
+ if ((vp->v_type == VREG) &&
+ (uoffset + count > OFFSET_MAX(fp))) {
+ count = (ssize_t)(OFFSET_MAX(fp) - uoffset);
+ uiop->uio_resid = count;
+ }
+ uiop->uio_offset = uoffset;
+ }
+ ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+ /* If read sync is not asked for, filter sync flags */
+ if ((ioflag & FRSYNC) == 0)
+ ioflag &= ~(FSYNC|FDSYNC);
+ error = VOP_READ(vp, uiop, ioflag, fp->f_cred, NULL);
+ rcount = count - uiop->uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, sysread, 1);
+ CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)rcount);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)rcount;
+ /* Store offset for non-positioned reads */
+ if (!positioned) {
+ if (vp->v_type == VFIFO) {
+ /* Backward compatibility */
+ fp->f_offset = rcount;
+ } else if (((fp->f_flag & FAPPEND) == 0) ||
+ (vp->v_type != VREG) || (count != 0)) {
+ /* POSIX */
+ fp->f_offset = uiop->uio_loffset;
+ }
+ }
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ *nread = rcount;
+ return (error);
+}
+
+int
+lx_write_common(file_t *fp, uio_t *uiop, size_t *nwrite, boolean_t positioned)
+{
+ vnode_t *vp = fp->f_vnode;
+ int error = 0, rwflag = 1, ioflag;
+ ssize_t count = uiop->uio_resid;
+ size_t wcount = 0;
+ struct cpu *cp;
+ boolean_t in_crit = B_FALSE;
+
+ if (positioned &&
+ (fp->f_vnode->v_type == VFIFO || fp->f_vnode->v_type == VSOCK)) {
+ return (ESPIPE);
+ }
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = B_TRUE;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_WRITE, uiop->uio_loffset, count,
+ svmand, NULL) != 0) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+
+ if (!positioned) {
+ /*
+ * For non-positioned writes, the value of fp->f_offset is
+ * re-queried while inside VOP_RWLOCK. This ensures that other
+ * writes which alter the filesize will be taken into account.
+ */
+ uiop->uio_loffset = fp->f_offset;
+ ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+ } else {
+ /*
+ * In a senseless departure from POSIX, positioned write calls
+ * on Linux do _not_ ignore the O_APPEND flag.
+ */
+ ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+ }
+ if (vp->v_type == VREG) {
+ u_offset_t fileoff = (u_offset_t)(ulong_t)uiop->uio_loffset;
+
+ if (fileoff >= curproc->p_fsz_ctl) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+ curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
+ mutex_exit(&curproc->p_lock);
+ error = EFBIG;
+ goto out;
+ }
+ if (fileoff >= OFFSET_MAX(fp)) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ error = EFBIG;
+ goto out;
+ }
+ if (fileoff + count > OFFSET_MAX(fp)) {
+ count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
+ uiop->uio_resid = count;
+ }
+ }
+
+ error = VOP_WRITE(vp, uiop, ioflag, fp->f_cred, NULL);
+ wcount = count - uiop->uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, syswrite, 1);
+ CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)wcount);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)wcount;
+
+ /* Store offset for non-positioned writes */
+ if (!positioned) {
+ if (vp->v_type == VFIFO) {
+ /* Backward compatibility */
+ fp->f_offset = wcount;
+ } else if (((fp->f_flag & FAPPEND) == 0) ||
+ (vp->v_type != VREG) || (count != 0)) {
+ /* POSIX */
+ fp->f_offset = uiop->uio_loffset;
+ }
+ }
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ *nwrite = wcount;
+ return (error);
+}
+
+/*
+ * The Linux routines for reading and writing data from file descriptors
+ * behave differently from their SunOS counterparts in a few key ways:
+ *
+ * - Passing an iovcnt of 0 to the vectored functions results in an error on
+ *   SunOS, but on Linux it yields a return value of 0.
+ *
+ * - If any data is successfully read or written, Linux will return success.
+ *   This is unlike SunOS, which would return an error code for the entire
+ *   operation in cases where vectors had gone unprocessed.
+ *
+ * - Breaking from POSIX, Linux positioned writes (pwrite/pwritev) will obey
+ *   the O_APPEND flag if it is set on the descriptor.
+ */
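+
+/*
+ * Illustrative sketch, not part of this patch: the third difference above.
+ * On Linux, pwrite(2) to an O_APPEND descriptor appends instead of writing
+ * at the requested offset, a quirk documented in the pwrite(2) man page
+ * that the emulation must reproduce.
+ */
+#if 0
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd = open("demo.txt", O_RDWR | O_CREAT | O_TRUNC | O_APPEND,
+	    0644);
+
+	(void) write(fd, "AAAA", 4);
+	/* Offset 0 is ignored; "BBBB" lands at the end of the file. */
+	(void) pwrite(fd, "BBBB", 4, 0);
+	(void) printf("size: %lld\n", (long long)lseek(fd, 0, SEEK_END));
+	(void) close(fd);
+	return (0);	/* prints "size: 8" on Linux */
+}
+#endif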
+
+ssize_t
+lx_read(int fdes, void *cbuf, size_t ccount)
+{
+ struct uio auio;
+ struct iovec aiov;
+ file_t *fp;
+ ssize_t count = (ssize_t)ccount;
+ size_t nread = 0;
+ int fflag, error = 0;
+
+ if (count < 0)
+ return (set_errno(EINVAL));
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & FREAD) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_vnode->v_type == VREG && count == 0) {
+ goto out;
+ }
+
+ aiov.iov_base = cbuf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = fp->f_offset;
+ auio.uio_resid = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = fflag;
+ if (count <= copyout_max_cached)
+ auio.uio_extflg = UIO_COPY_CACHED;
+ else
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ error = lx_read_common(fp, &auio, &nread, B_FALSE);
+
+ if (error == EINTR) {
+ if (nread != 0) {
+ error = 0;
+ } else {
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+ }
+out:
+ releasef(fdes);
+ if (error != 0)
+ return (set_errno(error));
+ return ((ssize_t)nread);
+}
+
+ssize_t
+lx_write(int fdes, void *cbuf, size_t ccount)
+{
+ struct uio auio;
+ struct iovec aiov;
+ file_t *fp;
+ ssize_t count = (ssize_t)ccount;
+ size_t nwrite = 0;
+ int fflag, error = 0;
+
+ if (count < 0)
+ return (set_errno(EINVAL));
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_vnode->v_type == VREG && count == 0) {
+ goto out;
+ }
+
+ aiov.iov_base = cbuf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = fp->f_offset;
+ auio.uio_resid = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ error = lx_write_common(fp, &auio, &nwrite, B_FALSE);
+
+ if (error == EINTR) {
+ if (nwrite != 0) {
+ error = 0;
+ } else {
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+ }
+out:
+ releasef(fdes);
+ if (error != 0)
+ return (set_errno(error));
+ return (nwrite);
+}
+
+ssize_t
+lx_readv(int fdes, struct iovec *iovp, int iovcnt)
+{
+ struct uio auio;
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ int aiovlen = 0;
+ file_t *fp;
+ ssize_t count;
+ size_t nread = 0;
+ int fflag, error = 0;
+
+ if (iovcnt < 0 || iovcnt > IOV_MAX) {
+ return (set_errno(EINVAL));
+ } else if (iovcnt == 0) {
+ return (0);
+ }
+
+ if (iovcnt > IOV_MAX_STACK) {
+ aiovlen = iovcnt * sizeof (iovec_t);
+ aiov = kmem_alloc(aiovlen, KM_SLEEP);
+ }
+ if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ return (set_errno(error));
+ }
+
+ if ((fp = getf(fdes)) == NULL) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ return (set_errno(EBADF));
+ }
+ if (((fflag = fp->f_flag) & FREAD) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_vnode->v_type == VREG && count == 0) {
+ goto out;
+ }
+
+ auio.uio_iov = aiov;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_loffset = fp->f_offset;
+ auio.uio_resid = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = fflag;
+ if (count <= copyout_max_cached)
+ auio.uio_extflg = UIO_COPY_CACHED;
+ else
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ error = lx_read_common(fp, &auio, &nread, B_FALSE);
+
+ if (error != 0) {
+ if (nread != 0) {
+ error = 0;
+ } else if (error == EINTR) {
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+ }
+out:
+ releasef(fdes);
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (nread);
+}
+
+ssize_t
+lx_writev(int fdes, struct iovec *iovp, int iovcnt)
+{
+ struct uio auio;
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ int aiovlen = 0;
+ file_t *fp;
+ ssize_t count;
+ size_t nwrite = 0;
+ int fflag, error = 0;
+
+ if (iovcnt < 0 || iovcnt > IOV_MAX) {
+ return (set_errno(EINVAL));
+ } else if (iovcnt == 0) {
+ return (0);
+ }
+
+ if (iovcnt > IOV_MAX_STACK) {
+ aiovlen = iovcnt * sizeof (iovec_t);
+ aiov = kmem_alloc(aiovlen, KM_SLEEP);
+ }
+ if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ return (set_errno(error));
+ }
+
+ if ((fp = getf(fdes)) == NULL) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ return (set_errno(EBADF));
+ }
+ if (((fflag = fp->f_flag) & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_vnode->v_type == VREG && count == 0) {
+ goto out;
+ }
+
+ auio.uio_iov = aiov;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_loffset = fp->f_offset;
+ auio.uio_resid = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ error = lx_write_common(fp, &auio, &nwrite, B_FALSE);
+
+ if (error != 0) {
+ if (nwrite != 0) {
+ error = 0;
+ } else if (error == EINTR) {
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+ }
+out:
+ releasef(fdes);
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (nwrite);
+}
+
+ssize_t
+lx_pread_fp(file_t *fp, void *cbuf, size_t ccount, off64_t offset)
+{
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t count = (ssize_t)ccount;
+ size_t nread = 0;
+ int fflag, error = 0;
+
+ if (count < 0)
+ return (set_errno(EINVAL));
+ if (((fflag = fp->f_flag) & FREAD) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_vnode->v_type == VREG) {
+ u_offset_t fileoff = (u_offset_t)offset;
+
+ if (count == 0)
+ goto out;
+ /*
+ * Return EINVAL if an invalid offset comes to pread.
+ * A negative offset from userspace will cause this error.
+ */
+ if (fileoff > MAXOFFSET_T) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * Limit offset such that we don't read or write
+ * a file beyond the maximum offset representable in
+ * an off_t structure.
+ */
+ if (fileoff + count > MAXOFFSET_T)
+ count = (ssize_t)((offset_t)MAXOFFSET_T - fileoff);
+ }
+
+ aiov.iov_base = cbuf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = offset;
+ auio.uio_resid = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ error = lx_read_common(fp, &auio, &nread, B_TRUE);
+
+ if (error == EINTR) {
+ if (nread != 0) {
+ error = 0;
+ } else {
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+ }
+out:
+ if (error) {
+ return (set_errno(error));
+ }
+ return ((ssize_t)nread);
+}
+
+ssize_t
+lx_pread(int fdes, void *cbuf, size_t ccount, off64_t offset)
+{
+ file_t *fp;
+ size_t nread;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+
+ nread = lx_pread_fp(fp, cbuf, ccount, offset);
+ releasef(fdes);
+ return (nread);
+}
+
+ssize_t
+lx_pwrite_fp(file_t *fp, void *cbuf, size_t ccount, off64_t offset)
+{
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t count = (ssize_t)ccount;
+ size_t nwrite = 0;
+ int fflag, error = 0;
+
+ if (count < 0)
+ return (set_errno(EINVAL));
+ if (((fflag = fp->f_flag) & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_vnode->v_type == VREG) {
+ u_offset_t fileoff = (u_offset_t)offset;
+
+ if (count == 0)
+ goto out;
+ /*
+ * return EINVAL for offsets that cannot be
+ * represented in an off_t.
+ */
+ if (fileoff > MAXOFFSET_T) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * Take appropriate action if we are trying to write above the
+ * resource limit.
+ */
+ if (fileoff >= curproc->p_fsz_ctl) {
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+ curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
+ mutex_exit(&curproc->p_lock);
+
+ error = EFBIG;
+ goto out;
+ }
+ /*
+ * Don't allow pwrite to cause file sizes to exceed maxoffset.
+ */
+ if (fileoff == MAXOFFSET_T) {
+ error = EFBIG;
+ goto out;
+ }
+ if (fileoff + count > MAXOFFSET_T)
+ count = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
+ }
+
+ aiov.iov_base = cbuf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = offset;
+ auio.uio_resid = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ error = lx_write_common(fp, &auio, &nwrite, B_TRUE);
+
+ if (error == EINTR) {
+ if (nwrite != 0) {
+ error = 0;
+ } else {
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+ }
+out:
+ if (error) {
+ return (set_errno(error));
+ }
+ return (nwrite);
+}
+
+ssize_t
+lx_pwrite(int fdes, void *cbuf, size_t ccount, off64_t offset)
+{
+ file_t *fp;
+ size_t nwrite;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+
+ nwrite = lx_pwrite_fp(fp, cbuf, ccount, offset);
+ releasef(fdes);
+ return (nwrite);
+}
+
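+/*
+ * The 32-bit entry points receive the 64-bit file offset split across two
+ * 32-bit arguments. LX_32TO64 reassembles them, low word first, so that,
+ * for example, off_lo = 0x1000 and off_hi = 0x2 yield the 64-bit offset
+ * 0x200001000.
+ */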
+ssize_t
+lx_pread32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo,
+ uint32_t off_hi)
+{
+ return (lx_pread(fdes, cbuf, ccount, LX_32TO64(off_lo, off_hi)));
+}
+
+ssize_t
+lx_pwrite32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo,
+ uint32_t off_hi)
+{
+ return (lx_pwrite(fdes, cbuf, ccount, LX_32TO64(off_lo, off_hi)));
+}
+
+ssize_t
+lx_preadv(int fdes, void *iovp, int iovcnt, off64_t offset)
+{
+ struct uio auio;
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ int aiovlen = 0;
+ file_t *fp;
+ ssize_t count;
+ size_t nread = 0;
+ int fflag, error = 0;
+
+ if (iovcnt < 0 || iovcnt > IOV_MAX) {
+ return (set_errno(EINVAL));
+ } else if (iovcnt == 0) {
+ return (0);
+ }
+
+ if (iovcnt > IOV_MAX_STACK) {
+ aiovlen = iovcnt * sizeof (iovec_t);
+ aiov = kmem_alloc(aiovlen, KM_SLEEP);
+ }
+ if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ return (set_errno(error));
+ }
+
+ if ((fp = getf(fdes)) == NULL) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ return (set_errno(EBADF));
+ }
+ if (((fflag = fp->f_flag) & FREAD) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_vnode->v_type == VREG) {
+ u_offset_t fileoff = (u_offset_t)offset;
+
+ if (count == 0)
+ goto out;
+ /*
+ * Return EINVAL if an invalid offset comes to preadv.
+ * A negative offset from userspace will cause this error.
+ */
+ if (fileoff > MAXOFFSET_T) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * Limit offset such that we don't read or write a file beyond
+ * the maximum offset representable in an off_t structure.
+ */
+ if (fileoff + count > MAXOFFSET_T)
+ count = (ssize_t)((offset_t)MAXOFFSET_T - fileoff);
+ }
+
+ auio.uio_iov = aiov;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_loffset = offset;
+ auio.uio_resid = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = fflag;
+ if (count <= copyout_max_cached)
+ auio.uio_extflg = UIO_COPY_CACHED;
+ else
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ error = lx_read_common(fp, &auio, &nread, B_TRUE);
+
+ if (error != 0) {
+ if (nread != 0) {
+ error = 0;
+ } else if (error == EINTR) {
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+ }
+out:
+ releasef(fdes);
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (nread);
+}
+
+ssize_t
+lx_pwritev(int fdes, void *iovp, int iovcnt, off64_t offset)
+{
+ struct uio auio;
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ int aiovlen = 0;
+ file_t *fp;
+ ssize_t count;
+ size_t nwrite = 0;
+ int fflag, error = 0;
+
+ if (iovcnt < 0 || iovcnt > IOV_MAX) {
+ return (set_errno(EINVAL));
+ } else if (iovcnt == 0) {
+ return (0);
+ }
+
+ if (iovcnt > IOV_MAX_STACK) {
+ aiovlen = iovcnt * sizeof (iovec_t);
+ aiov = kmem_alloc(aiovlen, KM_SLEEP);
+ }
+ if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ return (set_errno(error));
+ }
+
+ if ((fp = getf(fdes)) == NULL) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ return (set_errno(EBADF));
+ }
+ if (((fflag = fp->f_flag) & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_vnode->v_type == VREG) {
+ u_offset_t fileoff = (u_offset_t)offset;
+
+ if (count == 0)
+ goto out;
+ /*
+ * Return EINVAL if an invalid offset comes to pwritev.
+ * A negative offset from userspace will cause this error.
+ */
+ if (fileoff > MAXOFFSET_T) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * Take appropriate action if we are trying to write above the
+ * resource limit.
+ */
+ if (fileoff >= curproc->p_fsz_ctl) {
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+ curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
+ mutex_exit(&curproc->p_lock);
+
+ error = EFBIG;
+ goto out;
+ }
+ /*
+ * Don't allow pwritev to cause file sizes to exceed maxoffset.
+ */
+ if (fileoff == MAXOFFSET_T) {
+ error = EFBIG;
+ goto out;
+ }
+ /*
+ * Limit offset such that we don't read or write a file beyond
+ * the maximum offset representable in an off_t structure.
+ */
+ if (fileoff + count > MAXOFFSET_T)
+ count = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
+ }
+
+ auio.uio_iov = aiov;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_loffset = offset;
+ auio.uio_resid = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ error = lx_write_common(fp, &auio, &nwrite, B_TRUE);
+
+ if (error != 0) {
+ if (nwrite != 0) {
+ error = 0;
+ } else if (error == EINTR) {
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+ }
+out:
+ releasef(fdes);
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (nwrite);
+}
+
+ssize_t
+lx_preadv32(int fdes, void *iovp, int iovcnt, uint32_t off_lo, uint32_t off_hi)
+{
+ return (lx_preadv(fdes, iovp, iovcnt, LX_32TO64(off_lo, off_hi)));
+}
+
+ssize_t
+lx_pwritev32(int fdes, void *iovp, int iovcnt, uint32_t off_lo,
+ uint32_t off_hi)
+{
+ return (lx_pwritev(fdes, iovp, iovcnt, LX_32TO64(off_lo, off_hi)));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sched.c b/usr/src/uts/common/brand/lx/syscall/lx_sched.c
new file mode 100644
index 0000000000..6d4904a5fe
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sched.c
@@ -0,0 +1,1161 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Emulation for scheduling related syscalls.
+ *
+ * Under a typical zone configuration the zones will always be running under
+ * FSS so that no single zone can monopolize the system. Zones do not have the
+ * privilege to leave FSS (for the obvious reason that this would violate the
+ * global zone resource management policies). Thus, we typically cannot
+ * emulate the sched_* syscalls using our other native scheduling classes.
+ * In this common case we simply track the scheduler settings on the lwp's
+ * lx brand structure and also adjust the lwp priority within the valid
+ * range to approximate the intended effect.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/cpu.h>
+#include <sys/rtpriocntl.h>
+#include <sys/tspriocntl.h>
+#include <sys/processor.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/sysmacros.h>
+#include <sys/policy.h>
+#include <sys/procset.h>
+#include <sys/priocntl.h>
+
+typedef int l_pid_t;
+
+extern int yield();
+extern long priocntl_common(int, procset_t *, int, caddr_t, caddr_t, uio_seg_t);
+
+static int lx_sched_setprocset(procset_t *, l_pid_t);
+static long lx_do_priocntlsys(int, procset_t *, void *);
+
+#define BITS_PER_BYTE 8
+
+/*
+ * Linux scheduler policies.
+ */
+#define LX_SCHED_OTHER 0
+#define LX_SCHED_FIFO 1
+#define LX_SCHED_RR 2
+#define LX_SCHED_BATCH 3
+#define LX_SCHED_IDLE 5
+#define LX_SCHED_DEADLINE 6
+
+/*
+ * Linux scheduler priority ranges.
+ */
+#define LX_SCHED_PRIORITY_MIN_OTHER 0
+#define LX_SCHED_PRIORITY_MAX_OTHER 0
+#define LX_SCHED_PRIORITY_MIN_RRFIFO 1
+#define LX_SCHED_PRIORITY_MAX_RRFIFO 99
+
+#define MAXPRI 60 /* See FSS_MAXUPRI */
+
+/*
+ * When emulating scheduling priorities (e.g. under FSS) we'll do the best we
+ * can by adjusting the thread's priority within our range.
+ */
+static int lx_emul_pri_map[] = {
+ 0, /* LX_SCHED_OTHER */
+ MAXPRI, /* LX_SCHED_FIFO */
+ MAXPRI - 1, /* LX_SCHED_RR */
+ -MAXPRI + 1, /* LX_SCHED_BATCH */
+ 0, /* UNUSED */
+ -MAXPRI, /* LX_SCHED_IDLE */
+ MAXPRI /* LX_SCHED_DEADLINE */
+};
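+
+/*
+ * For example, a thread set to LX_SCHED_BATCH while confined to FSS has its
+ * user priority adjusted by -MAXPRI + 1 (i.e. -59) via CL_DOPRIO, pushing it
+ * toward the bottom of the valid range without leaving the class.
+ */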
+
+/*
+ * Determine if we should emulate the sched_* syscalls. A zone is almost always
+ * going to be running under FSS in any kind of production configuration, and
+ * FSS is currently the only class that zone processes lack the privilege to
+ * leave. Instead of checking for FSS explicitly, we generalize the check
+ * using CL_CANEXIT.
+ */
+#define EMUL_SCHED() (CL_CANEXIT(curthread, CRED()) != 0)
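+
+/*
+ * CL_CANEXIT returns nonzero when the calling thread lacks the privilege to
+ * leave its scheduling class, so EMUL_SCHED() is true precisely in the
+ * FSS-confined case described above.
+ */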
+
+struct lx_sched_param {
+ int lx_sched_prio;
+};
+
+typedef struct lx_sched_attr {
+ uint32_t lx_size;
+
+ uint32_t lx_sched_policy;
+ uint64_t lx_sched_flags;
+
+ /* For LX_SCHED_OTHER or LX_SCHED_BATCH */
+ int lx_sched_nice;
+
+ /* For LX_SCHED_FIFO or LX_SCHED_RR */
+ uint32_t lx_sched_priority;
+
+ /* For LX_SCHED_DEADLINE */
+ uint64_t lx_sched_runtime;
+ uint64_t lx_sched_deadline;
+ uint64_t lx_sched_period;
+} lx_sched_attr_t;
+
+long
+lx_sched_yield(void)
+{
+ yield();
+
+ return (0);
+}
+
+static void
+ltos_cpuset(lx_affmask_t *lmask, cpuset_t *smask)
+{
+ /* NOTE: fix this code if NCPU is ever made > LX_NCPU */
+
+ cpuset_zero(smask);
+ for (int i = 0; i < NCPU; i++) {
+ if (BT_TEST(*lmask, i)) {
+ cpuset_add(smask, i);
+ }
+ }
+}
+
+static void
+stol_cpuset(cpuset_t *smask, lx_affmask_t *lmask)
+{
+ /* NOTE: fix this code if NCPU is ever made > LX_NCPU */
+
+ bzero(lmask, sizeof (*lmask));
+ for (int i = 0; i < NCPU; i++) {
+ if (cpu_in_set(smask, i)) {
+ BT_SET(*lmask, i);
+ }
+ }
+}
+
+/*
+ * Find and lock a process for lx_sched_* operations.
+ * Sets 'pp' and 'tp' on success, with P_PR_LOCK set and p_lock held; the
+ * caller is responsible for dropping both via sprunlock(). The target
+ * process must be branded.
+ */
+static int
+lx_sched_pidlock(l_pid_t pid, proc_t **pp, kthread_t **tp, boolean_t is_write)
+{
+ proc_t *p;
+ kthread_t *t = NULL;
+ int err = 0;
+
+ if (pid < 0) {
+ return (EINVAL);
+ }
+ if (pid == 0) {
+ p = curproc;
+ ASSERT(PROC_IS_BRANDED(p));
+ mutex_enter(&p->p_lock);
+ sprlock_proc(p);
+
+ *tp = curthread;
+ *pp = p;
+ return (0);
+ }
+
+ if (lx_lpid_lock((pid_t)pid, curzone, LXP_PRLOCK, &p, &t) != 0) {
+ return (ESRCH);
+ }
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ if (!(PROC_IS_BRANDED(p))) {
+ sprunlock(p);
+ return (EPERM);
+ }
+
+ if (is_write) {
+ cred_t *cr = CRED();
+
+ /*
+ * To perform a sched_* operation on a thread outside of the
+ * current process, either the euid/egid of the target must
+ * match, or the calling process must hold CAP_SYS_NICE.
+ * (PRIV_PROC_PRIOUP maps to CAP_SYS_NICE)
+ */
+ if (secpolicy_raisepriority(cr) != 0) {
+ err = 0;
+ mutex_exit(&p->p_lock);
+ mutex_enter(&p->p_crlock);
+ if (crgetuid(cr) != crgetuid(p->p_cred) ||
+ crgetgid(cr) != crgetgid(p->p_cred)) {
+ err = EPERM;
+ }
+ mutex_exit(&p->p_crlock);
+ mutex_enter(&p->p_lock);
+ if (err != 0) {
+ sprunlock(p);
+ return (err);
+ }
+ }
+ }
+ *pp = p;
+ *tp = t;
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (0);
+}
+
+long
+lx_sched_getaffinity(l_pid_t pid, unsigned int len, void *maskp)
+{
+ proc_t *p;
+ kthread_t *tp = NULL;
+ lx_lwp_data_t *lwpd;
+ int err;
+ unsigned int pmin, pmax, compare_size;
+ lx_affmask_t lmask;
+ cpuset_t *smask;
+
+ /*
+ * The mask length must be a multiple of the caller's native word
+ * size; this boundary requirement matches Linux's behavior.
+ */
+ switch (get_udatamodel()) {
+ case DATAMODEL_ILP32:
+ compare_size = sizeof (uint32_t);
+ break;
+ default:
+ compare_size = sizeof (ulong_t);
+ break;
+ }
+ if ((len & (compare_size - 1)) != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ smask = cpuset_alloc(KM_SLEEP);
+ if ((err = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) {
+ cpuset_free(smask);
+ return (set_errno(err));
+ }
+
+ mutex_exit(&p->p_lock);
+ mutex_enter(&cpu_lock);
+ mutex_enter(&p->p_lock);
+ /*
+ * Grab the existing affinity mask and constrain it by the current
+ * set of active CPUs (which may have changed since it was assigned).
+ */
+ lwpd = ttolxlwp(tp);
+ cpuset_or(smask, lwpd->br_affinitymask);
+ cpuset_and(smask, &cpu_active_set);
+ sprunlock(p);
+ mutex_exit(&cpu_lock);
+
+ cpuset_bounds(smask, &pmin, &pmax);
+ stol_cpuset(smask, &lmask);
+ cpuset_free(smask);
+
+ /*
+ * This check is performed this late purely out of convenience. If
+ * the need arises, it could be moved earlier in order to match
+ * Linux error ordering.
+ */
+ if (pmax >= (len * BITS_PER_BYTE)) {
+ return (set_errno(EINVAL));
+ }
+
+ len = MIN(len, sizeof (lx_affmask_t));
+ if (copyout(&lmask, maskp, len) != 0) {
+ return (set_errno(EFAULT));
+ }
+ return (len);
+}
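+
+/*
+ * Note that, as with the raw Linux syscall, sched_getaffinity returns the
+ * number of mask bytes copied out rather than 0 on success; the glibc
+ * wrapper hides this from its callers.
+ */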
+
+long
+lx_sched_setaffinity(l_pid_t pid, unsigned int len, void *maskp)
+{
+ proc_t *p;
+ kthread_t *tp = NULL;
+ lx_lwp_data_t *lwpd;
+ int err;
+ unsigned int pmin, pmax;
+ lx_affmask_t lmask;
+ cpuset_t *smask;
+
+ if (pid < 0) {
+ return (set_errno(EINVAL));
+ }
+
+ if (len < sizeof (lmask)) {
+ bzero(&lmask, sizeof (lmask));
+ } else if (len > sizeof (lmask)) {
+ len = sizeof (lmask);
+ }
+ if (copyin(maskp, &lmask, len) != 0) {
+ return (set_errno(EFAULT));
+ }
+ smask = cpuset_alloc(KM_SLEEP);
+ ltos_cpuset(&lmask, smask);
+ if ((err = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0) {
+ cpuset_free(smask);
+ return (set_errno(err));
+ }
+
+ /*
+ * Constrain the mask to currently active CPUs.
+ */
+ mutex_exit(&p->p_lock);
+ mutex_enter(&cpu_lock);
+ mutex_enter(&p->p_lock);
+ lwpd = ttolxlwp(tp);
+
+ cpuset_and(smask, &cpu_active_set);
+ if (cpuset_isnull(smask)) {
+ err = EINVAL;
+ goto out;
+ }
+ if (cpuset_isequal(lwpd->br_affinitymask, smask)) {
+ err = 0;
+ goto out;
+ }
+
+ /*
+ * If one (and only one) CPU is selected in the affinity mask, bind the
+ * thread to that CPU.
+ */
+ cpuset_bounds(smask, &pmin, &pmax);
+ VERIFY(pmin != CPUSET_NOTINSET);
+ if (pmin == pmax) {
+ processorid_t obind;
+
+ (void) cpu_bind_thread(tp, pmin, &obind, &err);
+ if (err != 0) {
+ goto out;
+ }
+ } else {
+ /*
+ * If the thread transitions away from a single-CPU mask, it
+ * should be unbound from that processor.
+ */
+ cpuset_bounds(lwpd->br_affinitymask, &pmin, &pmax);
+ if (pmin == pmax) {
+ processorid_t obind;
+ (void) cpu_bind_thread(tp, PBIND_NONE, &obind, &err);
+ }
+ }
+ cpuset_zero(lwpd->br_affinitymask);
+ cpuset_or(lwpd->br_affinitymask, smask);
+ err = 0;
+
+out:
+ mutex_exit(&cpu_lock);
+ sprunlock(p);
+ cpuset_free(smask);
+ if (err != 0) {
+ return (set_errno(err));
+ }
+ return (0);
+}
+
+void
+lx_affinity_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
+{
+ proc_t *pp = lwptoproc(srclwp);
+ lx_lwp_data_t *slwpd = lwptolxlwp(srclwp);
+ lx_lwp_data_t *dlwpd = lwptolxlwp(dstlwp);
+
+ /*
+ * Copy over the affinity mask. This could be enhanced in the future
+ * to perform single-CPU binding like sched_setaffinity.
+ */
+ mutex_enter(&pp->p_lock);
+ cpuset_zero(dlwpd->br_affinitymask);
+ cpuset_or(dlwpd->br_affinitymask, slwpd->br_affinitymask);
+ mutex_exit(&pp->p_lock);
+}
+
+long
+lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ procset_t procset;
+ procset_t procset_cid;
+ pcparms_t pcparm;
+ pcinfo_t pcinfo;
+ struct lx_sched_param sched_param;
+ tsparms_t *tsp;
+ int prio, maxupri;
+ int rv;
+
+ if (pid < 0 || param == NULL)
+ return (set_errno(EINVAL));
+
+ if (copyin(param, &sched_param, sizeof (sched_param)))
+ return (set_errno(EFAULT));
+
+ prio = sched_param.lx_sched_prio;
+
+ if (EMUL_SCHED()) {
+ proc_t *p;
+ kthread_t *tp = NULL;
+ int incr;
+ lx_lwp_data_t *lwpd;
+
+ switch (policy) {
+ case LX_SCHED_OTHER:
+ case LX_SCHED_BATCH:
+ case LX_SCHED_IDLE:
+ case LX_SCHED_DEADLINE:
+ if (prio != LX_SCHED_PRIORITY_MIN_OTHER)
+ return (set_errno(EINVAL));
+ break;
+ case LX_SCHED_FIFO:
+ case LX_SCHED_RR:
+ if (crgetuid(CRED()) != 0)
+ return (set_errno(EPERM));
+ if (prio < LX_SCHED_PRIORITY_MIN_RRFIFO ||
+ prio > LX_SCHED_PRIORITY_MAX_RRFIFO)
+ return (set_errno(EINVAL));
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ /* Find and operate on the target lwp. */
+ if ((rv = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0)
+ return (set_errno(rv));
+
+ lwpd = lwptolxlwp(ttolwp(tp));
+ if (lwpd->br_schd_class == LX_SCHED_IDLE &&
+ policy != LX_SCHED_IDLE && crgetuid(CRED()) != 0) {
+
+ sprunlock(p);
+ return (set_errno(EPERM));
+ }
+
+ lwpd->br_schd_class = policy;
+ lwpd->br_schd_pri = prio;
+
+ ASSERT(policy <= LX_SCHED_DEADLINE);
+ incr = lx_emul_pri_map[policy];
+
+ CL_DOPRIO(tp, CRED(), incr, &rv);
+
+ sprunlock(p);
+ return (0);
+ }
+
+ if ((rv = lx_sched_setprocset(&procset, pid)))
+ return (rv);
+
+ /* get the class id */
+ pcparm.pc_cid = PC_CLNULL;
+ (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ /* get the current policy */
+ bzero(&pcinfo, sizeof (pcinfo));
+ pcinfo.pc_cid = pcparm.pc_cid;
+ (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ if (policy < 0) {
+ if (strcmp(pcinfo.pc_clname, "TS") == 0) {
+ policy = LX_SCHED_OTHER;
+ } else if (strcmp(pcinfo.pc_clname, "RT") == 0) {
+ policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
+ RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
+ } else {
+ return (set_errno(EINVAL));
+ }
+ }
+
+ bzero(&pcinfo, sizeof (pcinfo));
+ bzero(&pcparm, sizeof (pcparm));
+ setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0);
+ switch (policy) {
+ case LX_SCHED_FIFO:
+ case LX_SCHED_RR:
+ (void) strcpy(pcinfo.pc_clname, "RT");
+ (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ if (prio < 0 ||
+ prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri)
+ return (set_errno(EINVAL));
+ pcparm.pc_cid = pcinfo.pc_cid;
+ ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
+ ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs =
+ policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF;
+ break;
+
+ case LX_SCHED_OTHER:
+ (void) strcpy(pcinfo.pc_clname, "TS");
+ (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri;
+ if (prio > maxupri || prio < -maxupri)
+ return (set_errno(EINVAL));
+
+ pcparm.pc_cid = pcinfo.pc_cid;
+ tsp = (tsparms_t *)pcparm.pc_clparms;
+ tsp->ts_upri = prio;
+ tsp->ts_uprilim = TS_NOCHANGE;
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * finally set scheduling policy and parameters
+ */
+ (void) lx_do_priocntlsys(PC_SETPARMS, &procset, &pcparm);
+
+ return (0);
+}
+
+long
+lx_sched_getscheduler(l_pid_t pid)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ procset_t procset;
+ pcparms_t pcparm;
+ pcinfo_t pcinfo;
+ int policy;
+ int rv;
+
+ if (pid < 0)
+ return (set_errno(EINVAL));
+
+ if (EMUL_SCHED()) {
+ proc_t *p;
+ kthread_t *tp = NULL;
+
+ /* Find and operate on the target lwp. */
+ if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0)
+ return (set_errno(rv));
+
+ policy = lwptolxlwp(ttolwp(tp))->br_schd_class;
+ sprunlock(p);
+
+ return (policy);
+ }
+
+ if ((rv = lx_sched_setprocset(&procset, pid)))
+ return (rv);
+
+ /*
+ * get the class id
+ */
+ pcparm.pc_cid = PC_CLNULL;
+ (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ /*
+ * get the class info and identify the equivalent linux policy
+ */
+ bzero(&pcinfo, sizeof (pcinfo));
+ pcinfo.pc_cid = pcparm.pc_cid;
+ (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ if (strcmp(pcinfo.pc_clname, "TS") == 0) {
+ policy = LX_SCHED_OTHER;
+ } else if (strcmp(pcinfo.pc_clname, "RT") == 0) {
+ policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
+ RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
+ } else {
+ policy = set_errno(EINVAL);
+ }
+
+ return (policy);
+}
+
+long
+lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ procset_t procset;
+ procset_t procset_cid;
+ pcparms_t pcparm;
+ pcinfo_t pcinfo;
+ struct lx_sched_param sched_param;
+ tsparms_t *tsp;
+ int policy;
+ int prio, maxupri;
+ int rv;
+
+ if (pid < 0 || param == NULL)
+ return (set_errno(EINVAL));
+
+ if (copyin(param, &sched_param, sizeof (sched_param)))
+ return (set_errno(EFAULT));
+
+ prio = sched_param.lx_sched_prio;
+
+ if (EMUL_SCHED()) {
+ proc_t *p;
+ kthread_t *tp = NULL;
+ int incr;
+
+ /* Find and operate on the target lwp. */
+ if ((rv = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0)
+ return (set_errno(rv));
+
+ policy = lwptolxlwp(ttolwp(tp))->br_schd_class;
+ switch (policy) {
+ case LX_SCHED_OTHER:
+ case LX_SCHED_BATCH:
+ case LX_SCHED_IDLE:
+ case LX_SCHED_DEADLINE:
+ if (prio != LX_SCHED_PRIORITY_MIN_OTHER) {
+ sprunlock(p);
+ return (set_errno(EINVAL));
+ }
+ break;
+ case LX_SCHED_FIFO:
+ case LX_SCHED_RR:
+ if (crgetuid(CRED()) != 0) {
+ sprunlock(p);
+ return (set_errno(EPERM));
+ }
+ if (prio < LX_SCHED_PRIORITY_MIN_RRFIFO ||
+ prio > LX_SCHED_PRIORITY_MAX_RRFIFO) {
+ sprunlock(p);
+ return (set_errno(EINVAL));
+ }
+ break;
+ default:
+ /* this shouldn't happen */
+ ASSERT(0);
+ sprunlock(p);
+ return (set_errno(EINVAL));
+ }
+
+ lwptolxlwp(ttolwp(tp))->br_schd_pri = prio;
+
+ ASSERT(policy <= LX_SCHED_DEADLINE);
+ incr = lx_emul_pri_map[policy];
+
+ CL_DOPRIO(tp, CRED(), incr, &rv);
+ sprunlock(p);
+ return (0);
+ }
+
+ if ((rv = lx_sched_setprocset(&procset, pid)))
+ return (rv);
+
+ /*
+ * get the class id
+ */
+ pcparm.pc_cid = PC_CLNULL;
+ (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ /*
+ * get the current policy
+ */
+ bzero(&pcinfo, sizeof (pcinfo));
+ pcinfo.pc_cid = pcparm.pc_cid;
+ (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ if (strcmp(pcinfo.pc_clname, "TS") == 0)
+ policy = LX_SCHED_OTHER;
+ else if (strcmp(pcinfo.pc_clname, "RT") == 0)
+ policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
+ RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
+ else
+ return (set_errno(EINVAL));
+
+ bzero(&pcinfo, sizeof (pcinfo));
+ bzero(&pcparm, sizeof (pcparm));
+ setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0);
+ switch (policy) {
+ case LX_SCHED_FIFO:
+ case LX_SCHED_RR:
+ (void) strcpy(pcinfo.pc_clname, "RT");
+ (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ if (prio < 0 ||
+ prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri)
+ return (set_errno(EINVAL));
+ pcparm.pc_cid = pcinfo.pc_cid;
+ ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
+ ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs =
+ policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF;
+ break;
+
+ case LX_SCHED_OTHER:
+ (void) strcpy(pcinfo.pc_clname, "TS");
+ (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri;
+ if (prio > maxupri || prio < -maxupri)
+ return (set_errno(EINVAL));
+
+ pcparm.pc_cid = pcinfo.pc_cid;
+ tsp = (tsparms_t *)pcparm.pc_clparms;
+ tsp->ts_upri = prio;
+ tsp->ts_uprilim = TS_NOCHANGE;
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * finally set scheduling policy and parameters
+ */
+ (void) lx_do_priocntlsys(PC_SETPARMS, &procset, &pcparm);
+
+ return (0);
+}
+
+long
+lx_sched_getparam(l_pid_t pid, struct lx_sched_param *param)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ struct lx_sched_param local_param;
+ procset_t procset;
+ pcparms_t pcparm;
+ pcinfo_t pcinfo;
+ tsinfo_t *tsi;
+ int prio, scale;
+ int rv;
+
+ if (pid < 0 || param == NULL)
+ return (set_errno(EINVAL));
+
+ if (EMUL_SCHED()) {
+ proc_t *p;
+ kthread_t *tp = NULL;
+
+ /* Find and operate on the target lwp. */
+ if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0)
+ return (set_errno(rv));
+
+ local_param.lx_sched_prio = lwptolxlwp(ttolwp(tp))->br_schd_pri;
+ sprunlock(p);
+ if (copyout(&local_param, param, sizeof (local_param)))
+ return (set_errno(EFAULT));
+
+ return (0);
+ }
+
+ if ((rv = lx_sched_setprocset(&procset, pid)))
+ return (rv);
+
+ /*
+ * get the class id
+ */
+ pcparm.pc_cid = PC_CLNULL;
+ (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ /*
+ * get the class info and identify the equivalent linux policy
+ */
+ bzero(&pcinfo, sizeof (pcinfo));
+ pcinfo.pc_cid = pcparm.pc_cid;
+ (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ bzero(&local_param, sizeof (local_param));
+ if (strcmp(pcinfo.pc_clname, "TS") == 0) {
+ /*
+ * It is unclear whether this is needed, since the priority
+ * cannot be changed from zero anyway.
+ */
+ tsi = (tsinfo_t *)pcinfo.pc_clinfo;
+ prio = ((tsparms_t *)pcparm.pc_clparms)->ts_upri;
+ scale = tsi->ts_maxupri;
+ if (scale == 0)
+ local_param.lx_sched_prio = 0;
+ else
+ local_param.lx_sched_prio = -(prio * 20) / scale;
+ } else if (strcmp(pcinfo.pc_clname, "RT") == 0) {
+ local_param.lx_sched_prio =
+ ((rtparms_t *)pcparm.pc_clparms)->rt_pri;
+ } else {
+ rv = set_errno(EINVAL);
+ }
+
+ if (rv == 0)
+ if (copyout(&local_param, param, sizeof (local_param)))
+ return (set_errno(EFAULT));
+
+ return (rv);
+}
+
+long
+lx_sched_rr_get_interval(l_pid_t pid, struct timespec *ival)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ struct timespec interval;
+ procset_t procset;
+ pcparms_t pcparm;
+ pcinfo_t pcinfo;
+ int rv;
+
+ if (pid < 0)
+ return (set_errno(EINVAL));
+
+ if (EMUL_SCHED()) {
+ int policy;
+ proc_t *p;
+ kthread_t *tp = NULL;
+
+ /* Find and operate on the target lwp. */
+ if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0)
+ return (set_errno(rv));
+
+ policy = lwptolxlwp(ttolwp(tp))->br_schd_class;
+ sprunlock(p);
+
+ interval.tv_sec = 0;
+ if (policy == LX_SCHED_RR) {
+ /* Use a made-up value similar to Linux */
+ interval.tv_nsec = 100000000;
+ } else {
+ interval.tv_nsec = 0;
+ }
+
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ timespec32_t t32;
+
+ /*
+ * A timespec may overflow a 32-bit representation,
+ * but EOVERFLOW is not documented as an acceptable
+ * error for sched_rr_get_interval. Such an occurrence
+ * would be exceptionally weird for the RR interval.
+ */
+ TIMESPEC_TO_TIMESPEC32(&t32, &interval);
+
+ if (copyout(&t32, ival, sizeof (t32)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ }
+ else
+#endif
+ {
+ if (copyout(&interval, ival, sizeof (interval)))
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+ }
+
+ if ((rv = lx_sched_setprocset(&procset, pid)))
+ return (rv);
+
+ /*
+ * get the class id
+ */
+ pcparm.pc_cid = PC_CLNULL;
+ (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ /*
+ * get the class info and identify the equivalent linux policy
+ */
+ bzero(&pcinfo, sizeof (pcinfo));
+ pcinfo.pc_cid = pcparm.pc_cid;
+ (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ /*
+ * get the RT class id
+ */
+ setprocset(&procset, POP_AND, P_PID, 0, P_ALL, 0);
+ bzero(&pcinfo, sizeof (pcinfo));
+ (void) strcpy(pcinfo.pc_clname, "RT");
+ (void) lx_do_priocntlsys(PC_GETCID, &procset, &pcinfo);
+ if (lwp->lwp_errno)
+ return (lwp->lwp_errno);
+
+ /*
+ * Contrary to what the man page says, you don't have to be in RR to
+ * get this interval.
+ */
+ if (((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs != RT_TQINF) {
+ interval.tv_sec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqsecs;
+ interval.tv_nsec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs;
+
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ timespec32_t t32;
+
+ /*
+ * Like above, the 32-bit EOVERFLOW check is not
+ * appropriate here.
+ */
+ TIMESPEC_TO_TIMESPEC32(&t32, &interval);
+
+ if (copyout(&t32, ival, sizeof (t32)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ }
+ else
+#endif
+ {
+ if (copyout(&interval, ival, sizeof (interval)))
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+ }
+
+ return (set_errno(EINVAL));
+}
+
+long
+lx_sched_get_priority_min(uintptr_t policy)
+{
+ /*
+ * Linux scheduling priorities are not alterable, so there is no
+ * illumos translation necessary.
+ */
+ switch (policy) {
+ case LX_SCHED_FIFO:
+ case LX_SCHED_RR:
+ return (LX_SCHED_PRIORITY_MIN_RRFIFO);
+ case LX_SCHED_OTHER:
+ case LX_SCHED_BATCH:
+ case LX_SCHED_IDLE:
+ case LX_SCHED_DEADLINE:
+ return (LX_SCHED_PRIORITY_MIN_OTHER);
+ default:
+ break;
+ }
+ return (set_errno(EINVAL));
+}
+
+long
+lx_sched_get_priority_max(uintptr_t policy)
+{
+ /*
+ * Linux scheduling priorities are not alterable, so there is no
+ * illumos translation necessary.
+ */
+ switch (policy) {
+ case LX_SCHED_FIFO:
+ case LX_SCHED_RR:
+ return (LX_SCHED_PRIORITY_MAX_RRFIFO);
+ case LX_SCHED_OTHER:
+ case LX_SCHED_BATCH:
+ case LX_SCHED_IDLE:
+ case LX_SCHED_DEADLINE:
+ return (LX_SCHED_PRIORITY_MAX_OTHER);
+ default:
+ break;
+ }
+ return (set_errno(EINVAL));
+}
+
+long
+lx_sched_setattr(l_pid_t pid, lx_sched_attr_t *attr, uint32_t flags)
+{
+ int rv;
+ uint32_t lx_size;
+ lx_sched_attr_t local_attr;
+ uint64_t flg;
+
+ if (pid < 0 || attr == NULL || flags != 0)
+ return (set_errno(EINVAL));
+
+ if (copyin(attr, &lx_size, sizeof (lx_size)))
+ return (set_errno(EFAULT));
+
+ if (lx_size > sizeof (local_attr))
+ return (set_errno(E2BIG));
+
+ bzero(&local_attr, sizeof (local_attr));
+ if (copyin(attr, &local_attr, lx_size))
+ return (set_errno(EFAULT));
+
+ flg = local_attr.lx_sched_flags;
+ if ((flg & ~LX_SCHED_FLAG_RESET_ON_FORK) != 0)
+ return (set_errno(EINVAL));
+
+ if (EMUL_SCHED()) {
+ int policy;
+ proc_t *p;
+ kthread_t *tp = NULL;
+ int incr;
+ lx_lwp_data_t *lwpd;
+
+ /* Find and operate on the target lwp. */
+ if ((rv = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0)
+ return (set_errno(rv));
+
+ policy = local_attr.lx_sched_policy;
+
+ switch (policy) {
+ case LX_SCHED_OTHER:
+ case LX_SCHED_BATCH:
+ case LX_SCHED_IDLE:
+ break;
+ case LX_SCHED_FIFO:
+ case LX_SCHED_RR:
+ if (crgetuid(CRED()) != 0) {
+ sprunlock(p);
+ return (set_errno(EPERM));
+ }
+ if (local_attr.lx_sched_priority <
+ LX_SCHED_PRIORITY_MIN_RRFIFO ||
+ local_attr.lx_sched_priority >
+ LX_SCHED_PRIORITY_MAX_RRFIFO) {
+ sprunlock(p);
+ return (set_errno(EINVAL));
+ }
+ break;
+
+ case LX_SCHED_DEADLINE:
+ if (crgetuid(CRED()) != 0) {
+ sprunlock(p);
+ return (set_errno(EPERM));
+ }
+ break;
+ default:
+ sprunlock(p);
+ return (set_errno(EINVAL));
+ }
+
+ lwpd = lwptolxlwp(ttolwp(tp));
+ lwpd->br_schd_class = policy;
+ lwpd->br_schd_flags = flg;
+ lwpd->br_schd_pri = local_attr.lx_sched_priority;
+
+ lwpd->br_schd_runtime = local_attr.lx_sched_runtime;
+ lwpd->br_schd_deadline = local_attr.lx_sched_deadline;
+ lwpd->br_schd_period = local_attr.lx_sched_period;
+
+ ASSERT(policy <= LX_SCHED_DEADLINE);
+ incr = lx_emul_pri_map[policy];
+
+ CL_DOPRIO(tp, CRED(), incr, &rv);
+ sprunlock(p);
+ return (0);
+ }
+
+ /* Currently not supported under other classes */
+ return (set_errno(ENOSYS));
+}
+
+long
+lx_sched_getattr(l_pid_t pid, lx_sched_attr_t *attr, uint32_t size,
+ uint32_t flags)
+{
+ lx_sched_attr_t local_attr;
+ int rv;
+
+ if (pid < 0 || attr == NULL || flags != 0 || size < sizeof (local_attr))
+ return (set_errno(EINVAL));
+
+ bzero(&local_attr, sizeof (local_attr));
+ if (EMUL_SCHED()) {
+ proc_t *p;
+ kthread_t *tp = NULL;
+ lx_lwp_data_t *lwpd;
+
+ /* Find and operate on the target lwp. */
+ if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0)
+ return (set_errno(rv));
+
+ lwpd = lwptolxlwp(ttolwp(tp));
+ local_attr.lx_sched_policy = lwpd->br_schd_class;
+ local_attr.lx_sched_priority = lwpd->br_schd_pri;
+ local_attr.lx_sched_flags = lwpd->br_schd_flags;
+
+ local_attr.lx_sched_runtime = lwpd->br_schd_runtime;
+ local_attr.lx_sched_deadline = lwpd->br_schd_deadline;
+ local_attr.lx_sched_period = lwpd->br_schd_period;
+
+ sprunlock(p);
+
+ local_attr.lx_size = sizeof (lx_sched_attr_t);
+
+ if (copyout(&local_attr, attr, sizeof (local_attr)))
+ return (set_errno(EFAULT));
+
+ return (0);
+ }
+
+ /* Currently not supported under other classes */
+ return (set_errno(ENOSYS));
+}
+
+static int
+lx_sched_setprocset(procset_t *procset, l_pid_t pid)
+{
+ id_t lid, rid;
+ idtype_t lidtype, ridtype;
+
+ /*
+ * define the target lwp
+ */
+ if (pid == 0)
+ pid = curproc->p_pid;
+
+ if (lx_lpid_to_spair(pid, &pid, &lid) < 0)
+ return (set_errno(ESRCH));
+ rid = 0;
+ ridtype = P_ALL;
+ lidtype = P_LWPID;
+
+ setprocset(procset, POP_AND, lidtype, lid, ridtype, rid);
+
+ return (0);
+}
+
+static long
+lx_do_priocntlsys(int cmd, procset_t *procset, void *arg)
+{
+ return (priocntl_common(PC_VERSION, procset, cmd, (caddr_t)arg, 0,
+ UIO_SYSSPACE));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_socket.c b/usr/src/uts/common/brand/lx/syscall/lx_socket.c
new file mode 100644
index 0000000000..a95e220ea2
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_socket.c
@@ -0,0 +1,4537 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/sockio.h>
+#include <sys/thread.h>
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kmem.h>
+#include <sys/un.h>
+#include <sys/sunddi.h>
+#include <sys/cred.h>
+#include <sys/ucred.h>
+#include <sys/model.h>
+#include <sys/brand.h>
+#include <sys/vmsystm.h>
+#include <sys/limits.h>
+#include <sys/fcntl.h>
+#include <sys/sysmacros.h>
+#include <netpacket/packet.h>
+#include <sockcommon.h>
+#include <socktpi_impl.h>
+#include <netinet/udp.h>
+#include <sys/sdt.h>
+#include <netinet/tcp.h>
+#include <netinet/igmp.h>
+#include <netinet/icmp6.h>
+#include <inet/tcp_impl.h>
+#include <lx_errno.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_socket.h>
+#include <sys/lx_types.h>
+#include <sys/lx_impl.h>
+
+/* From uts/common/fs/sockfs/socksyscalls.c */
+extern int listen(int, int, int);
+extern int shutdown(int, int, int);
+
+typedef struct lx_ucred {
+ pid_t lxu_pid;
+ lx_uid_t lxu_uid;
+ lx_gid_t lxu_gid;
+} lx_ucred_t;
+
+typedef struct lx_socket_aux_data {
+ kmutex_t lxsad_lock;
+ enum lxsad_status_t {
+ LXSS_NONE = 0,
+ LXSS_CONNECTING,
+ LXSS_CONNECTED
+ } lxsad_status;
+ uint_t lxsad_flags;
+} lx_socket_aux_data_t;
+
+#define LX_SS_MAXSIZE 128
+
+typedef struct lx_sockaddr_storage {
+ unsigned short lxss_family;
+ char lxdata[LX_SS_MAXSIZE - sizeof (unsigned short)];
+} lx_sockaddr_storage_t;
+
+typedef struct lx_group_req {
+ uint32_t lxgr_interface;
+#ifdef _LP64
+ /* On 64-bit linux kernels, gr_interface is padded by 4 bytes. */
+ uint32_t _lxgr_pad;
+#endif
+ lx_sockaddr_storage_t lxgr_group;
+} lx_group_req_t;
+
+#if defined(_SYSCALL32_IMPL)
+
+typedef struct lx_group_req32 {
+ uint32_t lxgr_interface;
+ lx_sockaddr_storage_t lxgr_group;
+} lx_group_req32_t;
+
+#endif /* defined(_SYSCALL32_IMPL) */
+
+/* lxsad_flags */
+#define LXSAD_FL_STRCRED 0x1
+#define LXSAD_FL_EMULSEQPKT 0x2
+
+static lx_socket_aux_data_t *lx_sad_acquire(vnode_t *);
+
+/* VSD key for lx-specific socket information */
+static uint_t lx_socket_vsd = 0;
+
+/* Convenience enum to enforce translation direction */
+typedef enum lx_xlate_dir {
+ SUNOS_TO_LX,
+ LX_TO_SUNOS
+} lx_xlate_dir_t;
+
+/* enum for getpeername/getsockname handling */
+typedef enum lx_getname_type {
+ LX_GETPEERNAME,
+ LX_GETSOCKNAME
+} lx_getname_type_t;
+
+/*
+ * What follows is a series of tables we use to translate Linux constants
+ * into equivalent illumos constants and back again. I wish this were
+ * cleaner, more programmatic, and generally nicer. Sadly, life is messy,
+ * and Unix networking even more so.
+ */
+static const int ltos_family[LX_AF_MAX + 1] = {
+ AF_UNSPEC, /* LX_AF_UNSPEC */
+ AF_UNIX, /* LX_AF_UNIX */
+ AF_INET, /* LX_AF_INET */
+ AF_NOTSUPPORTED, /* LX_AF_AX25 */
+ AF_NOTSUPPORTED, /* LX_AF_IPX */
+ AF_NOTSUPPORTED, /* LX_AF_APPLETALK */
+ AF_NOTSUPPORTED, /* LX_AF_NETROM */
+ AF_NOTSUPPORTED, /* LX_AF_BRIDGE */
+ AF_NOTSUPPORTED, /* LX_AF_ATMPVC */
+ AF_NOTSUPPORTED, /* LX_AF_X25 */
+ AF_INET6, /* LX_AF_INET6 */
+ AF_NOTSUPPORTED, /* LX_AF_ROSE */
+ AF_NOTSUPPORTED, /* LX_AF_DECNET */
+ AF_NOTSUPPORTED, /* LX_AF_NETBEUI */
+ AF_NOTSUPPORTED, /* LX_AF_SECURITY */
+ AF_NOTSUPPORTED, /* LX_AF_KEY */
+ AF_LX_NETLINK, /* LX_AF_NETLINK */
+ AF_PACKET, /* LX_AF_PACKET */
+ AF_NOTSUPPORTED, /* LX_AF_ASH */
+ AF_NOTSUPPORTED, /* LX_AF_ECONET */
+ AF_NOTSUPPORTED, /* LX_AF_ATMSVC */
+ AF_NOTSUPPORTED, /* LX_AF_RDS */
+ AF_NOTSUPPORTED, /* LX_AF_SNA */
+ AF_NOTSUPPORTED, /* LX_AF_IRDA */
+ AF_NOTSUPPORTED, /* LX_AF_PPOX */
+ AF_NOTSUPPORTED, /* LX_AF_WANPIPE */
+ AF_NOTSUPPORTED, /* LX_AF_LLC */
+ AF_NOTSUPPORTED, /* NONE */
+ AF_NOTSUPPORTED, /* NONE */
+ AF_NOTSUPPORTED, /* LX_AF_CAN */
+ AF_NOTSUPPORTED, /* LX_AF_TIPC */
+ AF_NOTSUPPORTED, /* LX_AF_BLUETOOTH */
+ AF_NOTSUPPORTED, /* LX_AF_IUCV */
+ AF_NOTSUPPORTED /* LX_AF_RXRPC */
+ /* LX_AF_ISDN */
+ /* LX_AF_PHONET */
+ /* LX_AF_IEEE802154 */
+ /* LX_AF_CAIF */
+ /* LX_AF_ALG */
+ /* LX_AF_NFC */
+ /* LX_AF_VSOCK */
+};
+
+static const int stol_family[LX_AF_MAX + 1] = {
+ AF_UNSPEC, /* AF_UNSPEC */
+ AF_UNIX, /* AF_UNIX */
+ AF_INET, /* AF_INET */
+ AF_NOTSUPPORTED, /* AF_IMPLINK */
+ AF_NOTSUPPORTED, /* AF_PUP */
+ AF_NOTSUPPORTED, /* AF_CHAOS */
+ AF_NOTSUPPORTED, /* AF_NS */
+ AF_NOTSUPPORTED, /* AF_NBS */
+ AF_NOTSUPPORTED, /* AF_ECMA */
+ AF_NOTSUPPORTED, /* AF_DATAKIT */
+ AF_NOTSUPPORTED, /* AF_CCITT */
+ AF_NOTSUPPORTED, /* AF_SNA */
+ AF_NOTSUPPORTED, /* AF_DECNET */
+ AF_NOTSUPPORTED, /* AF_DLI */
+ AF_NOTSUPPORTED, /* AF_LAT */
+ AF_NOTSUPPORTED, /* AF_HYLINK */
+ AF_NOTSUPPORTED, /* AF_APPLETALK */
+ AF_NOTSUPPORTED, /* AF_NIT */
+ AF_NOTSUPPORTED, /* AF_802 */
+ AF_NOTSUPPORTED, /* AF_OSI */
+ AF_NOTSUPPORTED, /* AF_X25 */
+ AF_NOTSUPPORTED, /* AF_OSINET */
+ AF_NOTSUPPORTED, /* AF_GOSIP */
+ AF_NOTSUPPORTED, /* AF_IPX */
+ AF_NOTSUPPORTED, /* AF_ROUTE */
+ AF_NOTSUPPORTED, /* AF_LINK */
+ LX_AF_INET6, /* AF_INET6 */
+ AF_NOTSUPPORTED, /* AF_KEY */
+ AF_NOTSUPPORTED, /* AF_NCA */
+ AF_NOTSUPPORTED, /* AF_POLICY */
+ AF_NOTSUPPORTED, /* AF_INET_OFFLOAD */
+ AF_NOTSUPPORTED, /* AF_TRILL */
+ LX_AF_PACKET, /* AF_PACKET */
+ LX_AF_NETLINK /* AF_LX_NETLINK */
+};
+
+#define LTOS_FAMILY(d) ((d) <= LX_AF_MAX ? ltos_family[(d)] : AF_INVAL)
+#define STOL_FAMILY(d) ((d) <= LX_AF_MAX ? stol_family[(d)] : AF_INVAL)
+
+
+static const int ltos_socktype[LX_SOCK_PACKET + 1] = {
+ SOCK_NOTSUPPORTED, SOCK_STREAM, SOCK_DGRAM, SOCK_RAW,
+ SOCK_RDM, SOCK_SEQPACKET, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED,
+ SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED
+};
+
+static const int stol_socktype[SOCK_SEQPACKET + 1] = {
+ SOCK_NOTSUPPORTED, LX_SOCK_DGRAM, LX_SOCK_STREAM, SOCK_NOTSUPPORTED,
+ LX_SOCK_RAW, LX_SOCK_RDM, LX_SOCK_SEQPACKET
+};
+
+#define LTOS_SOCKTYPE(t) \
+ ((t) <= LX_SOCK_PACKET ? ltos_socktype[(t)] : SOCK_INVAL)
+#define STOL_SOCKTYPE(t) \
+ ((t) <= SOCK_SEQPACKET ? stol_socktype[(t)] : SOCK_INVAL)
+
+
+/*
+ * This string is used to prefix all abstract namespace Unix sockets, i.e.
+ * all abstract namespace sockets are converted to regular sockets in the
+ * /tmp directory with .ABSK_ prefixed to their names.
+ */
+#define ABST_PRFX "/tmp/.ABSK_"
+#define ABST_PRFX_LEN (sizeof (ABST_PRFX) - 1)
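+
+/*
+ * For example, a hypothetical abstract address of "\0mysock" would be
+ * translated to the path "/tmp/.ABSK_mysock", with any embedded '\0' or '/'
+ * bytes in the name rewritten to '_' (see ltos_sockaddr_ux() below).
+ */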
+
+#define DATAFILT "datafilt"
+
+typedef enum {
+ lxa_none,
+ lxa_abstract,
+ lxa_devlog
+} lx_addr_type_t;
+
+static int
+ltos_pkt_proto(int protocol)
+{
+ switch (ntohs(protocol)) {
+ case LX_ETH_P_802_2:
+ return (ETH_P_802_2);
+ case LX_ETH_P_IP:
+ return (ETH_P_IP);
+ case LX_ETH_P_ARP:
+ return (ETH_P_ARP);
+ case LX_ETH_P_IPV6:
+ return (ETH_P_IPV6);
+ case LX_ETH_P_ALL:
+ case LX_ETH_P_802_3:
+ return (ETH_P_ALL);
+ default:
+ return (-1);
+ }
+}
+
+
+typedef struct lx_flag_map {
+ enum {
+ LXFM_MAP,
+ LXFM_IGNORE,
+ LXFM_UNSUP
+ } lxfm_action;
+ int lxfm_sunos_flag;
+ int lxfm_linux_flag;
+ char *lxfm_name;
+} lx_flag_map_t;
+
+static lx_flag_map_t lx_flag_map_tbl[] = {
+ { LXFM_MAP, MSG_OOB, LX_MSG_OOB, NULL },
+ { LXFM_MAP, MSG_PEEK, LX_MSG_PEEK, NULL },
+ { LXFM_MAP, MSG_DONTROUTE, LX_MSG_DONTROUTE, NULL },
+ { LXFM_MAP, MSG_CTRUNC, LX_MSG_CTRUNC, NULL },
+ { LXFM_MAP, MSG_TRUNC, LX_MSG_TRUNC, NULL },
+ { LXFM_MAP, MSG_DONTWAIT, LX_MSG_DONTWAIT, NULL },
+ { LXFM_MAP, MSG_EOR, LX_MSG_EOR, NULL },
+ { LXFM_MAP, MSG_WAITALL, LX_MSG_WAITALL, NULL },
+ /* MSG_CONFIRM is safe to ignore */
+ { LXFM_IGNORE, 0, LX_MSG_CONFIRM, NULL },
+ /*
+ * The NOSIGNAL and CMSG_CLOEXEC flags are handled by the emulation
+ * outside of the flag-conversion routine.
+ */
+ { LXFM_IGNORE, 0, LX_MSG_NOSIGNAL, NULL },
+ { LXFM_IGNORE, 0, LX_MSG_CMSG_CLOEXEC, NULL },
+ { LXFM_UNSUP, LX_MSG_PROXY, 0, "MSG_PROXY" },
+ { LXFM_UNSUP, LX_MSG_FIN, 0, "MSG_FIN" },
+ { LXFM_UNSUP, LX_MSG_SYN, 0, "MSG_SYN" },
+ { LXFM_UNSUP, LX_MSG_RST, 0, "MSG_RST" },
+ { LXFM_UNSUP, LX_MSG_ERRQUEUE, 0, "MSG_ERRQUEUE" },
+ { LXFM_UNSUP, LX_MSG_MORE, 0, "MSG_MORE" },
+ { LXFM_UNSUP, LX_MSG_WAITFORONE, 0, "MSG_WAITFORONE" },
+ { LXFM_UNSUP, LX_MSG_FASTOPEN, 0, "MSG_FASTOPEN" },
+};
+
+#define LX_FLAG_MAP_MAX \
+ (sizeof (lx_flag_map_tbl) / sizeof (lx_flag_map_tbl[0]))
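+
+/*
+ * As an example of the table-driven translation below,
+ * lx_xlate_sock_flags(LX_MSG_PEEK | LX_MSG_CONFIRM, LX_TO_SUNOS) maps
+ * LX_MSG_PEEK to MSG_PEEK and silently ignores LX_MSG_CONFIRM, yielding
+ * MSG_PEEK with nothing left over to report as unsupported.
+ */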
+
+#define LX_UNSUP_BUFSZ 64
+
+static int
+lx_xlate_sock_flags(int inflags, lx_xlate_dir_t dir)
+{
+ int i, outflags = 0;
+ char buf[LX_UNSUP_BUFSZ];
+
+ VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS);
+
+ for (i = 0; i < LX_FLAG_MAP_MAX; i++) {
+ lx_flag_map_t *map = &lx_flag_map_tbl[i];
+ int match, out;
+
+ if (dir == SUNOS_TO_LX) {
+ match = inflags & map->lxfm_sunos_flag;
+ out = map->lxfm_linux_flag;
+ } else {
+ match = inflags & map->lxfm_linux_flag;
+ out = map->lxfm_sunos_flag;
+ }
+ switch (map->lxfm_action) {
+ case LXFM_MAP:
+ if (match != 0) {
+ inflags &= ~(match);
+ outflags |= out;
+ }
+ break;
+ case LXFM_IGNORE:
+ if (match != 0) {
+ inflags &= ~(match);
+ }
+ break;
+ case LXFM_UNSUP:
+ if (match != 0) {
+ (void) snprintf(buf, LX_UNSUP_BUFSZ,
+ "unsupported sock flag %s", map->lxfm_name);
+ lx_unsupported(buf);
+ }
+ }
+ }
+ if (inflags != 0) {
+ (void) snprintf(buf, LX_UNSUP_BUFSZ,
+ "unsupported sock flags 0x%08x", inflags);
+ lx_unsupported(buf);
+ }
+
+ return (outflags);
+}
+
+typedef enum lx_sun_type {
+ LX_SUN_NORMAL,
+ LX_SUN_ABSTRACT,
+} lx_sun_type_t;
+
+static void
+ltos_sockaddr_ux(const struct sockaddr *inaddr, const socklen_t inlen,
+ struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type)
+{
+ struct sockaddr_un buf;
+ /* Calculate size of (sun_family + any padding) in sockaddr */
+ int sizediff = (sizeof (buf) - sizeof (buf.sun_path));
+ int len = inlen - sizediff;
+
+ VERIFY(len > 0);
+ VERIFY(len <= sizeof (buf.sun_path));
+ bzero(&buf, sizeof (buf));
+
+ if (inaddr->sa_data[0] == '\0') {
+ /*
+ * Linux supports abstract Unix sockets, which are simply
+ * sockets that do not exist on the file system. These sockets
+ * are denoted by beginning the path with a NUL character. To
+ * support these, we strip out the leading NUL character and
+ * change the path to point to a real place in the /tmp
+ * directory, by prepending ABST_PRFX and replacing all illegal
+ * characters with '_'.
+ *
+ * Since these sockets are supposed to exist outside the
+ * filesystem, they must be cleaned up after use. This removal
+ * is performed during bind().
+ */
+ int idx, odx;
+
+ /* Add our abstract prefix */
+ (void) strcpy(buf.sun_path, ABST_PRFX);
+ for (idx = 1, odx = ABST_PRFX_LEN;
+ idx < len && odx < sizeof (buf.sun_path);
+ idx++, odx++) {
+ char c = inaddr->sa_data[idx];
+ if (c == '\0' || c == '/') {
+ buf.sun_path[odx] = '_';
+ } else {
+ buf.sun_path[odx] = c;
+ }
+ }
+
+ /*
+ * Since abstract socket addresses might not be NUL terminated,
+ * we must explicitly NUL terminate the translated path.
+ * Care is taken not to overflow the buffer.
+ */
+ if (odx == sizeof (buf.sun_path)) {
+ buf.sun_path[odx - 1] = '\0';
+ } else {
+ buf.sun_path[odx] = '\0';
+ }
+
+ if (sun_type != NULL) {
+ *sun_type = LX_SUN_ABSTRACT;
+ }
+ } else {
+ /* Copy the address directly, minding termination */
+ (void) strncpy(buf.sun_path, inaddr->sa_data, len);
+ len = strnlen(buf.sun_path, len);
+ if (len == sizeof (buf.sun_path)) {
+ buf.sun_path[len - 1] = '\0';
+ } else {
+ VERIFY(len < sizeof (buf.sun_path));
+ buf.sun_path[len] = '\0';
+ }
+
+ if (sun_type != NULL) {
+ *sun_type = LX_SUN_NORMAL;
+ }
+ }
+ buf.sun_family = AF_UNIX;
+ *outlen = strlen(buf.sun_path) + 1 + sizediff;
+ VERIFY(*outlen <= sizeof (struct sockaddr_un));
+
+ *outaddr = kmem_alloc(*outlen, KM_SLEEP);
+ bcopy(&buf, *outaddr, *outlen);
+}
+
+/*
+ * Copy in a Linux-native socket address from userspace and convert it into
+ * illumos format. When successful, it will allocate an appropriately sized
+ * struct to be freed by the caller.
+ */
+static long
+ltos_sockaddr_copyin(const struct sockaddr *inaddr, const socklen_t inlen,
+ struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type)
+{
+ sa_family_t family;
+ struct sockaddr *laddr;
+ struct sockaddr_ll *sal;
+ int proto, error = 0;
+
+ VERIFY(inaddr != NULL);
+
+ if (inlen < sizeof (sa_family_t) ||
+ inlen > sizeof (struct sockaddr_storage)) {
+ return (EINVAL);
+ }
+ laddr = kmem_alloc(inlen, KM_SLEEP);
+ if (copyin(inaddr, laddr, inlen) != 0) {
+ kmem_free(laddr, inlen);
+ return (EFAULT);
+ }
+
+ family = LTOS_FAMILY(laddr->sa_family);
+ switch (family) {
+ case (sa_family_t)AF_NOTSUPPORTED:
+ error = EPROTONOSUPPORT;
+ break;
+
+ case (sa_family_t)AF_INVAL:
+ error = EAFNOSUPPORT;
+ break;
+
+ case AF_UNIX:
+ if (inlen < sizeof (sa_family_t) + 2 ||
+ inlen > sizeof (struct sockaddr_un)) {
+ error = EINVAL;
+ break;
+ }
+ ltos_sockaddr_ux(laddr, inlen, outaddr, outlen,
+ sun_type);
+
+ /* AF_UNIX bypasses the standard copy logic */
+ kmem_free(laddr, inlen);
+ return (0);
+
+ case AF_PACKET:
+ if (inlen < sizeof (struct sockaddr_ll)) {
+ error = EINVAL;
+ break;
+ }
+ *outlen = sizeof (struct sockaddr_ll);
+
+ /* sll_protocol must be translated */
+ /* LINTED: alignment */
+ sal = (struct sockaddr_ll *)laddr;
+ proto = ltos_pkt_proto(sal->sll_protocol);
+ if (proto < 0) {
+ error = EINVAL;
+ }
+ sal->sll_protocol = proto;
+ break;
+
+ case AF_INET:
+ if (inlen < sizeof (struct sockaddr)) {
+ error = EINVAL;
+ break;
+ }
+ *outlen = sizeof (struct sockaddr);
+ break;
+
+ case AF_INET6:
+ /*
+ * The illumos sockaddr_in6 has one more 32-bit field
+ * than the Linux version. We simply zero that field
+ * via kmem_zalloc.
+ */
+ if (inlen < sizeof (lx_sockaddr_in6_t)) {
+ error = EINVAL;
+ break;
+ }
+ *outlen = sizeof (struct sockaddr_in6);
+ *outaddr = (struct sockaddr *)kmem_zalloc(*outlen,
+ KM_SLEEP);
+ bcopy(laddr, *outaddr, sizeof (lx_sockaddr_in6_t));
+ (*outaddr)->sa_family = AF_INET6;
+ /* AF_INET6 bypasses the standard copy logic */
+ kmem_free(laddr, inlen);
+ return (0);
+
+ default:
+ *outlen = inlen;
+ }
+
+ if (error == 0) {
+ /*
+ * For most address families, just copying into a sockaddr of
+ * the correct size and updating sa_family is adequate.
+ */
+ VERIFY(inlen >= *outlen);
+
+ *outaddr = (struct sockaddr *)kmem_zalloc(*outlen, KM_SLEEP);
+ bcopy(laddr, *outaddr, *outlen);
+ (*outaddr)->sa_family = family;
+ }
+ kmem_free(laddr, inlen);
+ return (error);
+}
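+
+/*
+ * A worked example of the AF_INET6 case above, assuming the usual structure
+ * layouts: the Linux sockaddr_in6 carries sin6_family, sin6_port,
+ * sin6_flowinfo, sin6_addr and sin6_scope_id (28 bytes), while the illumos
+ * structure appends one more 32-bit field (32 bytes). Allocating *outlen
+ * with kmem_zalloc and copying only sizeof (lx_sockaddr_in6_t) bytes leaves
+ * exactly that trailing field zeroed.
+ */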
+
+/*
+ * Convert an illumos-native socket address into Linux format and copy it out
+ * to userspace.
+ */
+static long
+stol_sockaddr_copyout(struct sockaddr *inaddr, socklen_t inlen,
+ struct sockaddr *outaddr, void *outlenp, socklen_t orig)
+{
+ socklen_t size = inlen;
+ struct sockaddr_storage buf;
+ struct sockaddr *bufaddr;
+
+ /*
+ * Either we were passed a valid sockaddr (with length) or the length
+ * is set to 0.
+ */
+ VERIFY(inaddr != NULL || inlen == 0);
+
+ if (inlen == 0) {
+ goto finish;
+ }
+
+ switch (inaddr->sa_family) {
+ case AF_INET:
+ if (inlen != sizeof (struct sockaddr)) {
+ return (EINVAL);
+ }
+ break;
+
+ case AF_INET6:
+ if (inlen != sizeof (struct sockaddr_in6)) {
+ return (EINVAL);
+ }
+ /*
+		 * The Linux sockaddr_in6 is shorter than the illumos version.
+		 * Truncate the extra field on the way out.
+		 */
+		size = sizeof (lx_sockaddr_in6_t);
+		inlen = sizeof (lx_sockaddr_in6_t);
+ break;
+
+ case AF_UNIX:
+ if (inlen > sizeof (struct sockaddr_un)) {
+ return (EINVAL);
+ }
+
+ /*
+ * On Linux an empty AF_UNIX address is returned as NULL, which
+ * means setting the returned length to only encompass the
+ * address family part of the buffer. However, some code also
+ * references the address portion of the buffer and uses it,
+ * even though the returned length has been shortened. Thus, we
+ * clear the buffer to ensure that the address portion is NULL.
+ */
+ if (inaddr->sa_data[0] == '\0') {
+ bzero(&buf, sizeof (buf));
+ inlen = sizeof (inaddr->sa_family);
+ }
+ break;
+
+ case (sa_family_t)AF_NOTSUPPORTED:
+ return (EPROTONOSUPPORT);
+
+ case (sa_family_t)AF_INVAL:
+ return (EAFNOSUPPORT);
+
+ default:
+ break;
+ }
+
+ /*
+ * The input should be smaller than sockaddr_storage, the largest
+ * sockaddr we support.
+ */
+ VERIFY(inlen <= sizeof (buf));
+
+ bufaddr = (struct sockaddr *)&buf;
+ bcopy(inaddr, bufaddr, inlen);
+ bufaddr->sa_family = STOL_FAMILY(bufaddr->sa_family);
+
+ /*
+ * It is possible that userspace passed us a smaller buffer than we
+ * hope to output. When this is the case, we will truncate our output
+ * to the max size of their buffer but report the true size of the
+ * sockaddr when outputting the outlen value.
+ */
+ size = (orig < size) ? orig : size;
+
+ if (copyout(bufaddr, outaddr, size) != 0) {
+ return (EFAULT);
+ }
+
+finish:
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ int32_t len32 = (int32_t)inlen;
+ if (copyout(&len32, outlenp, sizeof (len32)) != 0) {
+ return (EFAULT);
+ }
+ } else
+#endif /* defined(_LP64) */
+ {
+ if (copyout(&inlen, outlenp, sizeof (inlen)) != 0) {
+ return (EFAULT);
+ }
+ }
+
+ return (0);
+}
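+
+/*
+ * For example (userspace sketch, illustrative only): a Linux caller that
+ * passes a too-small buffer, say via getpeername(2) on an AF_INET socket,
+ * still learns the true address size:
+ *
+ *	struct sockaddr_in peer;
+ *	socklen_t len = 4;		(deliberately undersized)
+ *
+ *	getpeername(fd, (struct sockaddr *)&peer, &len);
+ *
+ * Only 4 bytes are copied out, but len reads back as 16, the full size of
+ * the AF_INET sockaddr, per the truncation rules above.
+ */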
+
+typedef struct lx_cmsg_xlate {
+ int lcx_sunos_level;
+ int lcx_sunos_type;
+ int (*lcx_stol_conv)(struct cmsghdr *, struct cmsghdr *);
+ int lcx_linux_level;
+ int lcx_linux_type;
+ int (*lcx_ltos_conv)(struct cmsghdr *, struct cmsghdr *);
+} lx_cmsg_xlate_t;
+
+static int cmsg_conv_generic(struct cmsghdr *, struct cmsghdr *);
+static int stol_conv_ucred(struct cmsghdr *, struct cmsghdr *);
+static int ltos_conv_ucred(struct cmsghdr *, struct cmsghdr *);
+static int stol_conv_recvttl(struct cmsghdr *, struct cmsghdr *);
+
+/*
+ * Table describing SunOS <-> Linux cmsg translation mappings.
+ * Certain types (IP_RECVTTL) are only converted in one direction and are
+ * indicated by one of the translation functions being set to NULL.
+ */
+static lx_cmsg_xlate_t lx_cmsg_xlate_tbl[] = {
+ { SOL_SOCKET, SCM_RIGHTS, cmsg_conv_generic,
+ LX_SOL_SOCKET, LX_SCM_RIGHTS, cmsg_conv_generic },
+ { SOL_SOCKET, SCM_UCRED, stol_conv_ucred,
+ LX_SOL_SOCKET, LX_SCM_CRED, ltos_conv_ucred },
+ { SOL_SOCKET, SCM_TIMESTAMP, cmsg_conv_generic,
+ LX_SOL_SOCKET, LX_SCM_TIMESTAMP, cmsg_conv_generic },
+ { IPPROTO_IP, IP_PKTINFO, cmsg_conv_generic,
+ LX_IPPROTO_IP, LX_IP_PKTINFO, cmsg_conv_generic },
+ { IPPROTO_IP, IP_RECVTTL, stol_conv_recvttl,
+ LX_IPPROTO_IP, LX_IP_TTL, NULL },
+ { IPPROTO_IP, IP_TTL, cmsg_conv_generic,
+ LX_IPPROTO_IP, LX_IP_TTL, cmsg_conv_generic },
+ { IPPROTO_IPV6, IPV6_HOPLIMIT, cmsg_conv_generic,
+ LX_IPPROTO_IPV6, LX_IPV6_HOPLIMIT, cmsg_conv_generic },
+ { IPPROTO_IPV6, IPV6_PKTINFO, cmsg_conv_generic,
+ LX_IPPROTO_IPV6, LX_IPV6_PKTINFO, cmsg_conv_generic }
+};
+
+#define LX_MAX_CMSG_XLATE \
+ (sizeof (lx_cmsg_xlate_tbl) / sizeof (lx_cmsg_xlate_tbl[0]))
+
+#if defined(_LP64)
+
+typedef struct {
+ int64_t cmsg_len;
+ int32_t cmsg_level;
+ int32_t cmsg_type;
+} lx_cmsghdr64_t;
+
+/* The alignment/padding for 64bit Linux cmsghdr is not the same. */
+#define LX_CMSG64_ALIGNMENT 8
+#define ISALIGNED_LX_CMSG64(addr) \
+ (((uintptr_t)(addr) & (LX_CMSG64_ALIGNMENT - 1)) == 0)
+#define ROUNDUP_LX_CMSG64_LEN(len) \
+ (((len) + LX_CMSG64_ALIGNMENT - 1) & ~(LX_CMSG64_ALIGNMENT - 1))
+
+#define LX_CMSG64_IS_ALIGNED(m) \
+ (((uintptr_t)(m) & (_CMSG_DATA_ALIGNMENT - 1)) == 0)
+#define LX_CMSG64_DATA(c) ((unsigned char *)(((lx_cmsghdr64_t *)(c)) + 1))
+/*
+ * LX_CMSG64_VALID is closely derived from CMSG_VALID with one particularly
+ * important addition. Since cmsg_len is 64bit, (cmsg + cmsg_len) is checked
+ * against the start address as well. This prevents bogus inputs from wrapping
+ * around the address space.
+ */
+#define LX_CMSG64_VALID(cmsg, start, end) \
+ (ISALIGNED_LX_CMSG64(cmsg) && \
+ ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \
+ ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \
+ ((cmsg)->cmsg_len >= sizeof (lx_cmsghdr64_t)) && \
+ ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)) && \
+ ((uintptr_t)(cmsg) + (cmsg)->cmsg_len >= (uintptr_t)(start)))
+#define LX_CMSG64_NEXT(cmsg) \
+ (lx_cmsghdr64_t *)((uintptr_t)(cmsg) + \
+ ROUNDUP_LX_CMSG64_LEN((cmsg)->cmsg_len))
+#define LX_CMSG64_DIFF sizeof (uint32_t)
+
+#endif /* defined(_LP64) */
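+
+/*
+ * A worked example of LX_CMSG64_DIFF, assuming the usual LP64 layouts: the
+ * Linux 64-bit cmsghdr above is 16 bytes (a 64-bit cmsg_len followed by two
+ * 32-bit fields), while the native cmsghdr is 12 bytes (all three fields
+ * 32-bit):
+ *
+ *	Linux:	| cmsg_len (8) | cmsg_level (4) | cmsg_type (4) | data ...
+ *	native:	| cmsg_len (4) | cmsg_level (4) | cmsg_type (4) | data ...
+ *
+ * Each header therefore grows or shrinks by LX_CMSG64_DIFF (4) bytes as it
+ * crosses the boundary, which is why the conversion loops below adjust
+ * cmsg_len by that constant.
+ */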
+
+/*
+ * convert ucred_s to lx_ucred.
+ */
+static int
+stol_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg)
+{
+ /*
+ * Format the data correctly in the omsg buffer.
+ */
+ if (omsg != NULL) {
+ struct ucred_s *scred;
+ prcred_t *cr;
+ lx_ucred_t lcred;
+
+ scred = (struct ucred_s *)CMSG_CONTENT(inmsg);
+ lcred.lxu_pid = scred->uc_pid;
+ /* LINTED: alignment */
+ cr = UCCRED(scred);
+ if (cr != NULL) {
+ lcred.lxu_uid = cr->pr_euid;
+ lcred.lxu_gid = cr->pr_egid;
+ } else {
+ lcred.lxu_uid = lcred.lxu_gid = 0;
+ }
+
+ bcopy(&lcred, CMSG_CONTENT(omsg), sizeof (lx_ucred_t));
+ }
+
+ return (sizeof (struct cmsghdr) + sizeof (lx_ucred_t));
+}
+
+static int
+ltos_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg)
+{
+ if (omsg != NULL) {
+ struct ucred_s *uc;
+ prcred_t *pc;
+ lx_ucred_t *lcred;
+
+ uc = (struct ucred_s *)CMSG_CONTENT(omsg);
+ /* LINTED: alignment */
+ pc = (prcred_t *)((char *)uc + sizeof (struct ucred_s));
+
+ uc->uc_credoff = sizeof (struct ucred_s);
+
+ lcred = (lx_ucred_t *)CMSG_CONTENT(inmsg);
+
+ uc->uc_pid = lcred->lxu_pid;
+ pc->pr_euid = lcred->lxu_uid;
+ pc->pr_egid = lcred->lxu_gid;
+ }
+
+ return (sizeof (struct cmsghdr) + sizeof (struct ucred_s) +
+ sizeof (prcred_t));
+}
+
+static int
+stol_conv_recvttl(struct cmsghdr *inmsg, struct cmsghdr *omsg)
+{
+ /*
+ * SunOS communicates the TTL of incoming packets via IP_RECVTTL using
+ * a uint8_t value instead of IP_TTL using an int. This conversion is
+ * only needed in the one direction since Linux does not handle
+ * IP_RECVTTL in the sendmsg path.
+ */
+ if (omsg != NULL) {
+ uint8_t *inttl = (uint8_t *)CMSG_CONTENT(inmsg);
+ int *ottl = (int *)CMSG_CONTENT(omsg);
+
+ *ottl = (int)*inttl;
+ }
+
+ return (sizeof (struct cmsghdr) + sizeof (int));
+}
+
+static int
+cmsg_conv_generic(struct cmsghdr *inmsg, struct cmsghdr *omsg)
+{
+ if (omsg != NULL) {
+ size_t data_len;
+
+ data_len = inmsg->cmsg_len - sizeof (struct cmsghdr);
+ bcopy(CMSG_CONTENT(inmsg), CMSG_CONTENT(omsg), data_len);
+ }
+
+ return (inmsg->cmsg_len);
+}
+
+static int
+lx_xlate_cmsg(struct cmsghdr *inmsg, struct cmsghdr *omsg, lx_xlate_dir_t dir)
+{
+ int i;
+ int len;
+
+ VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS);
+
+ for (i = 0; i < LX_MAX_CMSG_XLATE; i++) {
+ lx_cmsg_xlate_t *xlate = &lx_cmsg_xlate_tbl[i];
+ if (dir == LX_TO_SUNOS &&
+ inmsg->cmsg_level == xlate->lcx_linux_level &&
+ inmsg->cmsg_type == xlate->lcx_linux_type &&
+ xlate->lcx_ltos_conv != NULL) {
+ len = xlate->lcx_ltos_conv(inmsg, omsg);
+ if (omsg != NULL) {
+ omsg->cmsg_len = len;
+ omsg->cmsg_level = xlate->lcx_sunos_level;
+ omsg->cmsg_type = xlate->lcx_sunos_type;
+ }
+ return (len);
+ } else if (dir == SUNOS_TO_LX &&
+ inmsg->cmsg_level == xlate->lcx_sunos_level &&
+ inmsg->cmsg_type == xlate->lcx_sunos_type &&
+ xlate->lcx_stol_conv != NULL) {
+ len = xlate->lcx_stol_conv(inmsg, omsg);
+ if (omsg != NULL) {
+ omsg->cmsg_len = len;
+ omsg->cmsg_level = xlate->lcx_linux_level;
+ omsg->cmsg_type = xlate->lcx_linux_type;
+ }
+ return (len);
+ }
+ }
+ /*
+ * The Linux man page for sendmsg does not define a specific error for
+ * unsupported cmsgs. While EOPNOTSUPP is meant to indicate bad values for
+ * passed flags, it appears to be the next closest choice.
+ */
+ return (-EOPNOTSUPP);
+}
+
+static long
+ltos_cmsgs_copyin(void *addr, socklen_t inlen, void **outmsg,
+ socklen_t *outlenp)
+{
+ void *inbuf, *obuf;
+ struct cmsghdr *inmsg, *omsg;
+ int slen = 0;
+
+ if (inlen < sizeof (struct cmsghdr) || inlen > SO_MAXARGSIZE) {
+ return (EINVAL);
+ }
+
+#if defined(_LP64)
+ if (get_udatamodel() == DATAMODEL_NATIVE &&
+ inlen < sizeof (lx_cmsghdr64_t)) {
+ /* The size requirements are more strict for 64bit. */
+ return (EINVAL);
+ }
+#endif /* defined(_LP64) */
+
+ inbuf = kmem_alloc(inlen, KM_SLEEP);
+ if (copyin(addr, inbuf, inlen) != 0) {
+ kmem_free(inbuf, inlen);
+ return (EFAULT);
+ }
+
+#if defined(_LP64)
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ /*
+ * Linux cmsg headers are longer than illumos under x86_64.
+ * Convert to regular cmsgs first.
+ */
+ lx_cmsghdr64_t *lmsg;
+ struct cmsghdr *smsg;
+ void *newbuf;
+ int len = 0;
+
+ /* Inventory the new cmsg size */
+ for (lmsg = (lx_cmsghdr64_t *)inbuf;
+ LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0;
+ lmsg = LX_CMSG64_NEXT(lmsg)) {
+ len += ROUNDUP_cmsglen(lmsg->cmsg_len - LX_CMSG64_DIFF);
+ }
+
+ VERIFY(len < inlen);
+ if (len == 0) {
+ /* Input was bogus, so we can give up early. */
+ kmem_free(inbuf, inlen);
+ *outmsg = NULL;
+ *outlenp = 0;
+ return (EINVAL);
+ }
+
+ newbuf = kmem_alloc(len, KM_SLEEP);
+
+ for (lmsg = (lx_cmsghdr64_t *)inbuf,
+ smsg = (struct cmsghdr *)newbuf;
+ LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0;
+ lmsg = LX_CMSG64_NEXT(lmsg), smsg = CMSG_NEXT(smsg)) {
+ smsg->cmsg_level = lmsg->cmsg_level;
+ smsg->cmsg_type = lmsg->cmsg_type;
+ smsg->cmsg_len = lmsg->cmsg_len - LX_CMSG64_DIFF;
+
+ /* The above length measurement should ensure this */
+ ASSERT(CMSG_VALID(smsg, newbuf,
+ (uintptr_t)newbuf + len));
+
+ bcopy(LX_CMSG64_DATA(lmsg), CMSG_CONTENT(smsg),
+ smsg->cmsg_len - sizeof (*smsg));
+ }
+
+ kmem_free(inbuf, inlen);
+ inbuf = newbuf;
+ inlen = len;
+ }
+#endif /* defined(_LP64) */
+
+ /*
+ * Now determine how much space we need for the conversion.
+ */
+ for (inmsg = (struct cmsghdr *)inbuf;
+ CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0;
+ inmsg = CMSG_NEXT(inmsg)) {
+ int sz;
+
+ if ((sz = lx_xlate_cmsg(inmsg, NULL, LX_TO_SUNOS)) < 0) {
+ /* unsupported msg */
+ kmem_free(inbuf, inlen);
+ return (-sz);
+ }
+
+ slen += ROUNDUP_cmsglen(sz);
+ }
+
+ obuf = kmem_zalloc(slen, KM_SLEEP);
+
+ /*
+ * Now do the conversion.
+ */
+ for (inmsg = (struct cmsghdr *)inbuf, omsg = (struct cmsghdr *)obuf;
+ CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0;
+ inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) {
+ VERIFY(lx_xlate_cmsg(inmsg, omsg, LX_TO_SUNOS) >= 0);
+ }
+
+ kmem_free(inbuf, inlen);
+ *outmsg = obuf;
+ *outlenp = slen;
+ return (0);
+}
+
+static long
+stol_cmsgs_copyout(void *input, socklen_t inlen, void *addr,
+ void *outlenp, socklen_t orig_outlen)
+{
+ void *obuf;
+ struct cmsghdr *inmsg, *omsg;
+ int error = 0;
+ socklen_t lx_len = 0;
+#if defined(_LP64)
+ model_t model = get_udatamodel();
+#endif
+
+ if (inlen == 0) {
+ /* Simply output the zero controllen */
+ goto finish;
+ }
+
+ VERIFY(inlen >= sizeof (struct cmsghdr));
+
+ /*
+ * First determine how much space we need for the conversion and
+ * make sure the caller has provided at least that much space to return
+ * results.
+ */
+ for (inmsg = (struct cmsghdr *)input;
+ CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0;
+ inmsg = CMSG_NEXT(inmsg)) {
+ int sz;
+
+ if ((sz = lx_xlate_cmsg(inmsg, NULL, SUNOS_TO_LX)) < 0) {
+ /* unsupported msg */
+ return (-sz);
+ }
+
+#if defined(_LP64)
+ if (model == DATAMODEL_NATIVE) {
+ /*
+ * The converted 64-bit cmsgs require an additional 4
+ * bytes of header space and must be aligned to 8 bytes
+ * (instead of the typical 4 for x86)
+ */
+ sz = ROUNDUP_LX_CMSG64_LEN(sz + LX_CMSG64_DIFF);
+ } else
+#endif /* defined(_LP64) */
+ {
+ /*
+ * The converted 32-bit cmsgs do not require additional
+ * header space or padding for Linux conversion.
+ */
+ sz = ROUNDUP_cmsglen(sz);
+ }
+
+ /*
+ * Unlike SunOS, Linux requires that the last cmsg be
+ * adequately padded for alignment.
+ */
+ lx_len += sz;
+ }
+
+ if (lx_len > orig_outlen || addr == NULL) {
+ /* This will be interpreted by the caller */
+ error = EMSGSIZE;
+ lx_len = 0;
+ goto finish;
+ }
+
+ /*
+ * Since cmsgs are often padded to an aligned size, kmem_zalloc is
+ * necessary to prevent leaking the contents of uninitialized memory.
+ */
+ obuf = kmem_zalloc(lx_len, KM_SLEEP);
+
+ /*
+ * Convert the msgs.
+ */
+ for (inmsg = (struct cmsghdr *)input, omsg = (struct cmsghdr *)obuf;
+ CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0;
+ inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) {
+ VERIFY(lx_xlate_cmsg(inmsg, omsg, SUNOS_TO_LX) >= 0);
+ }
+
+#if defined(_LP64)
+ if (model == DATAMODEL_NATIVE) {
+ /* Linux cmsg headers are longer than illumos under x86_64. */
+ struct cmsghdr *smsg;
+ lx_cmsghdr64_t *lmsg;
+ void *newbuf;
+
+ /*
+		 * Once again, kmem_zalloc is needed to avoid leaking the
+		 * contents of uninitialized memory.
+ */
+ newbuf = kmem_zalloc(lx_len, KM_SLEEP);
+ for (smsg = (struct cmsghdr *)obuf,
+ lmsg = (lx_cmsghdr64_t *)newbuf;
+ CMSG_VALID(smsg, obuf, (uintptr_t)obuf + inlen) != 0;
+ smsg = CMSG_NEXT(smsg), lmsg = LX_CMSG64_NEXT(lmsg)) {
+ lmsg->cmsg_level = smsg->cmsg_level;
+ lmsg->cmsg_type = smsg->cmsg_type;
+ lmsg->cmsg_len = smsg->cmsg_len + LX_CMSG64_DIFF;
+
+ ASSERT(LX_CMSG64_VALID(lmsg, newbuf,
+ (uintptr_t)newbuf + lx_len) != 0);
+
+ bcopy(CMSG_CONTENT(smsg), LX_CMSG64_DATA(lmsg),
+ smsg->cmsg_len - sizeof (*smsg));
+ }
+
+ kmem_free(obuf, lx_len);
+ obuf = newbuf;
+ }
+#endif /* defined(_LP64) */
+
+ if (copyout(obuf, addr, lx_len) != 0) {
+ kmem_free(obuf, lx_len);
+ return (EFAULT);
+ }
+ kmem_free(obuf, lx_len);
+
+finish:
+ if (outlenp != NULL) {
+#if defined(_LP64)
+ if (model != DATAMODEL_NATIVE) {
+ int32_t len32 = (int32_t)lx_len;
+ if (copyout(&len32, outlenp, sizeof (len32)) != 0) {
+ return (EFAULT);
+ }
+ } else
+#endif /* defined(_LP64) */
+ {
+ if (copyout(&lx_len, outlenp, sizeof (lx_len)) != 0) {
+ return (EFAULT);
+ }
+ }
+ }
+ return (error);
+}
+
+static void
+lx_cmsg_set_cloexec(void *input, socklen_t inlen)
+{
+ struct cmsghdr *inmsg;
+
+ if (inlen == 0) {
+ return;
+ }
+
+ for (inmsg = (struct cmsghdr *)input;
+ CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0;
+ inmsg = CMSG_NEXT(inmsg)) {
+ if (inmsg->cmsg_level == SOL_SOCKET &&
+ inmsg->cmsg_type == SCM_RIGHTS) {
+ int *fds = (int *)CMSG_CONTENT(inmsg);
+ int i, num = (int)CMSG_CONTENTLEN(inmsg) / sizeof (int);
+
+ for (i = 0; i < num; i++) {
+ char flags;
+ file_t *fp;
+
+ fp = getf(fds[i]);
+ if (fp == NULL) {
+ /*
+ * It is possible that a received fd
+ * will already have been closed if a
+ * thread in the local process is
+ * indiscriminately issuing close(2)
+ * calls while the message is being
+ * received. If that is the case, no
+ * further processing of the fd is
+ * needed. It will still be passed
+ * up in the cmsg even though the
+ * caller chose to close it already.
+ */
+ continue;
+ }
+
+ flags = f_getfd(fds[i]);
+ flags |= FD_CLOEXEC;
+ f_setfd(fds[i], flags);
+ releasef(fds[i]);
+ }
+ }
+ }
+}
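+
+/*
+ * Userspace sketch of the behavior above (illustrative only): a Linux
+ * process receiving descriptors via SCM_RIGHTS can ask that they arrive
+ * with close-on-exec already set:
+ *
+ *	struct msghdr msg;
+ *	char cbuf[CMSG_SPACE(sizeof (int))];
+ *
+ *	(populate msg; msg.msg_control = cbuf, etc.)
+ *	recvmsg(fd, &msg, MSG_CMSG_CLOEXEC);
+ *
+ * Each descriptor unpacked from the SCM_RIGHTS payload is flagged
+ * FD_CLOEXEC by the loop above before the cmsgs are translated for Linux.
+ */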
+
+static int
+lx_cmsg_try_ucred(sonode_t *so, struct nmsghdr *msg, socklen_t origlen)
+{
+ lx_socket_aux_data_t *sad;
+ struct cmsghdr *cmsg = NULL;
+ int msgsize;
+ cred_t *cred;
+
+ if (origlen == 0) {
+ return (0);
+ }
+ sad = lx_sad_acquire(SOTOV(so));
+ if ((sad->lxsad_flags & LXSAD_FL_STRCRED) == 0) {
+ mutex_exit(&sad->lxsad_lock);
+ return (0);
+ }
+ mutex_exit(&sad->lxsad_lock);
+
+ mutex_enter(&so->so_lock);
+ if (so->so_peercred == NULL) {
+ mutex_exit(&so->so_lock);
+ return (0);
+ }
+ crhold(cred = so->so_peercred);
+ mutex_exit(&so->so_lock);
+
+ msgsize = ucredminsize(cred) + sizeof (struct cmsghdr);
+ if (msg->msg_control == NULL) {
+ msg->msg_controllen = msgsize;
+ msg->msg_control = cmsg = kmem_zalloc(msgsize, KM_SLEEP);
+ } else {
+ /*
+ * The so_recvmsg operation may have allocated a msg_control
+ * buffer which precisely fits all returned cmsgs. We must
+ * manually verify the length of that cmsg data and reallocate
+ * the buffer if it lacks the necessary space.
+ */
+ uintptr_t start = (uintptr_t)msg->msg_control;
+ uintptr_t end = start + msg->msg_controllen;
+
+ ASSERT(msg->msg_controllen > 0);
+ cmsg = (struct cmsghdr *)msg->msg_control;
+ while (CMSG_VALID(cmsg, start, end) != 0) {
+ if (cmsg->cmsg_level == SOL_SOCKET &&
+ cmsg->cmsg_type == SCM_UCRED) {
+ /*
+ * If some later code change results in a ucred
+				 * being attached anyway, there is no need for
+				 * us to do it manually.
+ */
+ crfree(cred);
+ return (0);
+ }
+ cmsg = CMSG_NEXT(cmsg);
+ }
+ if (((uintptr_t)cmsg + msgsize) > end) {
+ socklen_t offset = (uintptr_t)cmsg - start;
+ socklen_t newsize = offset + msgsize;
+ void *newbuf;
+
+ if (newsize < msg->msg_controllen) {
+ /* size overflow, bail */
+ crfree(cred);
+ return (-1);
+ }
+ newbuf = kmem_alloc(newsize, KM_SLEEP);
+ bcopy(msg->msg_control, newbuf, msg->msg_controllen);
+ kmem_free(msg->msg_control, msg->msg_controllen);
+
+ msg->msg_control = newbuf;
+ msg->msg_controllen = newsize;
+ cmsg = (struct cmsghdr *)((uintptr_t)newbuf + offset);
+ }
+ }
+
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_UCRED;
+ cmsg->cmsg_len = msgsize;
+ (void) cred2ucred(cred, so->so_cpid, CMSG_CONTENT(cmsg), CRED());
+ crfree(cred);
+ return (0);
+}
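+
+/*
+ * For illustration (userspace view, not part of this module): the path
+ * above services Linux consumers that enable credential passing on a
+ * stream socket and expect credentials on every receive:
+ *
+ *	int on = 1;
+ *
+ *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof (on));
+ *	recvmsg(fd, &msg, 0);
+ *
+ * The LXSAD_FL_STRCRED flag checked above is set when the emulated
+ * SO_PASSCRED is enabled (see lx_setsockopt_socket), and the peer ucred is
+ * appended only when sockfs did not already produce an SCM_UCRED control.
+ */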
+
+static lx_socket_aux_data_t *
+lx_sad_acquire(vnode_t *vp)
+{
+ lx_socket_aux_data_t *cur, *created;
+
+ mutex_enter(&vp->v_vsd_lock);
+ cur = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd);
+ if (cur == NULL) {
+ /* perform our allocation carefully */
+ mutex_exit(&vp->v_vsd_lock);
+
+ created = (lx_socket_aux_data_t *)kmem_zalloc(
+ sizeof (*created), KM_SLEEP);
+
+ mutex_enter(&vp->v_vsd_lock);
+ cur = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd);
+ if (cur == NULL) {
+ mutex_init(&created->lxsad_lock, NULL, MUTEX_DEFAULT,
+ NULL);
+ (void) vsd_set(vp, lx_socket_vsd, created);
+ cur = created;
+ } else {
+ kmem_free(created, sizeof (*created));
+ }
+ }
+ mutex_exit(&vp->v_vsd_lock);
+ mutex_enter(&cur->lxsad_lock);
+ return (cur);
+}
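+
+/*
+ * Note that lx_sad_acquire() returns with lxsad_lock held; callers in this
+ * file follow the pattern:
+ *
+ *	sad = lx_sad_acquire(SOTOV(so));
+ *	(inspect or update sad fields)
+ *	mutex_exit(&sad->lxsad_lock);
+ *
+ * The drop-and-retry dance around kmem_zalloc above is the usual
+ * double-checked allocation: v_vsd_lock is released across the KM_SLEEP
+ * allocation, so a racing thread may install its own copy first, in which
+ * case the loser frees the spare.
+ */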
+
+static int
+lx_convert_pkt_proto(int protocol)
+{
+ switch (ntohs(protocol)) {
+ case LX_ETH_P_802_2:
+ return (ETH_P_802_2);
+ case LX_ETH_P_IP:
+ return (ETH_P_IP);
+ case LX_ETH_P_ARP:
+ return (ETH_P_ARP);
+ case LX_ETH_P_IPV6:
+ return (ETH_P_IPV6);
+ case LX_ETH_P_ALL:
+ case LX_ETH_P_802_3:
+ return (ETH_P_ALL);
+ default:
+ return (-1);
+ }
+}
+
+static int
+lx_convert_sock_args(int in_dom, int in_type, int in_proto, int *out_dom,
+ int *out_type, int *out_options, int *out_proto)
+{
+ int domain, type, options;
+
+ if (in_dom < 0 || in_type < 0 || in_proto < 0)
+ return (EINVAL);
+
+ domain = LTOS_FAMILY(in_dom);
+ if (domain == AF_NOTSUPPORTED || domain == AF_UNSPEC)
+ return (EAFNOSUPPORT);
+ if (domain == AF_INVAL)
+ return (EINVAL);
+
+ type = LTOS_SOCKTYPE(in_type & LX_SOCK_TYPE_MASK);
+ if (type == SOCK_INVAL)
+ return (EINVAL);
+ /*
+ * Linux does not allow the app to specify IP Protocol for raw sockets.
+ * SunOS does, so bail out here.
+ */
+ if (type == SOCK_NOTSUPPORTED ||
+ (domain == AF_INET && type == SOCK_RAW && in_proto == IPPROTO_IP)) {
+ if (lx_kern_release_cmp(curzone, "2.6.15") < 0) {
+ /*
+ * Use error appropriate for kernel version.
+ * See lx_socket_create for more detail.
+ */
+ return (ESOCKTNOSUPPORT);
+ }
+ return (EPROTONOSUPPORT);
+ }
+
+ options = 0;
+ in_type &= ~(LX_SOCK_TYPE_MASK);
+ if (in_type & LX_SOCK_NONBLOCK) {
+ in_type ^= LX_SOCK_NONBLOCK;
+ options |= SOCK_NONBLOCK;
+ }
+ if (in_type & LX_SOCK_CLOEXEC) {
+ in_type ^= LX_SOCK_CLOEXEC;
+ options |= SOCK_CLOEXEC;
+ }
+ if (in_type != 0) {
+ return (EINVAL);
+ }
+
+ /* Protocol definitions for PF_PACKET differ between Linux and SunOS */
+ if (domain == PF_PACKET &&
+ (in_proto = lx_convert_pkt_proto(in_proto)) < 0)
+ return (EINVAL);
+
+ *out_dom = domain;
+ *out_type = type;
+ *out_options = options;
+ *out_proto = in_proto;
+ return (0);
+}
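+
+/*
+ * A worked example of the translation above (illustrative only): a Linux
+ * call of
+ *
+ *	socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);
+ *
+ * arrives with the type and modifier bits packed together.
+ * LX_SOCK_TYPE_MASK isolates SOCK_STREAM for LTOS_SOCKTYPE(), the two
+ * modifier bits are peeled off into the options word, and any bit left
+ * over after that is rejected with EINVAL.
+ */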
+
+/*
+ * For restartable socket syscall handling, the relevant syscalls are only
+ * restarted when a timeout is not set on the socket.
+ */
+static void
+lx_sock_syscall_restart(sonode_t *so, boolean_t recv)
+{
+ if (recv) {
+ if (so->so_rcvtimeo != 0)
+ return;
+ } else {
+ if (so->so_sndtimeo != 0)
+ return;
+ }
+
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+}
+
+static int
+lx_socket_create(int domain, int type, int protocol, int options, file_t **fpp,
+ int *fdp)
+{
+ sonode_t *so;
+ vnode_t *vp;
+ file_t *fp;
+ int err, fd;
+
+ /* logic cloned from so_socket */
+ so = socket_create(domain, type, protocol, NULL, NULL, SOCKET_SLEEP,
+ SOV_DEFAULT, CRED(), &err);
+
+ if (so == NULL) {
+ switch (err) {
+ case EPROTOTYPE:
+ case EPROTONOSUPPORT:
+ if (lx_kern_release_cmp(curzone, "2.6.15") < 0) {
+ /*
+ * Linux changed its socket error behavior in
+ * versions 2.6.15 and later. See git commit
+ * 86c8f9d158f68538a971a47206a46a22c7479bac in
+ * the Linux repository.
+ *
+ * LTP presently checks for version 2.6.16.
+ */
+ return (ESOCKTNOSUPPORT);
+ }
+ return (EPROTONOSUPPORT);
+ default:
+ return (err);
+ }
+ }
+
+ /* Allocate a file descriptor for the socket */
+ vp = SOTOV(so);
+ if ((err = falloc(vp, FWRITE|FREAD, &fp, &fd)) != 0) {
+ (void) socket_close(so, 0, CRED());
+ socket_destroy(so);
+ return (err);
+ }
+
+ /*
+ * Linux programs do not tolerate errors appearing from asynchronous
+ * events (such as ICMP messages arriving). Setting SM_DEFERERR will
+ * prevent checking/delivery of such errors.
+ */
+ so->so_mode |= SM_DEFERERR;
+
+ /* Now fill in the entries that falloc reserved */
+ if (options & SOCK_NONBLOCK) {
+ so->so_state |= SS_NONBLOCK;
+ fp->f_flag |= FNONBLOCK;
+ }
+ mutex_exit(&fp->f_tlock);
+ *fpp = fp;
+ *fdp = fd;
+ return (0);
+}
+
+static void
+lx_socket_destroy(file_t *fp, int fd)
+{
+ sonode_t *so = VTOSO(fp->f_vnode);
+
+ setf(fd, NULL);
+
+ mutex_enter(&fp->f_tlock);
+ unfalloc(fp);
+
+ (void) socket_close(so, 0, CRED());
+ socket_destroy(so);
+}
+
+long
+lx_socket(int domain, int type, int protocol)
+{
+ int error, options, fd = -1;
+ file_t *fp = NULL;
+
+ if ((error = lx_convert_sock_args(domain, type, protocol, &domain,
+ &type, &options, &protocol)) != 0) {
+ return (set_errno(error));
+ }
+
+ error = lx_socket_create(domain, type, protocol, options, &fp, &fd);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+
+ setf(fd, fp);
+ if ((options & SOCK_CLOEXEC) != 0) {
+ f_setfd(fd, FD_CLOEXEC);
+ }
+ return (fd);
+}
+
+long
+lx_bind(long sock, uintptr_t name, socklen_t namelen)
+{
+ struct sonode *so;
+ struct sockaddr *addr = NULL;
+ socklen_t len = 0;
+ file_t *fp;
+ int error;
+ lx_sun_type_t sun_type;
+ boolean_t not_sock = B_FALSE;
+
+ if ((so = getsonode(sock, &error, &fp)) == NULL) {
+ return (set_errno(error));
+ }
+
+ if (namelen != 0) {
+ error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen,
+ &addr, &len, &sun_type);
+ if (error != 0) {
+ releasef(sock);
+ return (set_errno(error));
+ }
+ }
+
+ if (addr != NULL && addr->sa_family == AF_UNIX) {
+ vnode_t *vp;
+
+ error = so_ux_lookup(so, (struct sockaddr_un *)addr, B_TRUE,
+ &vp);
+ if (error == 0) {
+ /* A valid socket exists and is open at this address. */
+ VN_RELE(vp);
+ } else {
+ /* Keep track of paths which are not valid sockets. */
+ if (error == ENOTSOCK) {
+ not_sock = B_TRUE;
+ }
+
+ /*
+ * When binding to an abstract namespace address or
+ * /dev/log, implicit clean-up must occur if there is
+ * not a valid socket at the specififed address. See
+			 * not a valid socket at the specified address. See
+ * socket types act differently.
+ */
+ if (sun_type == LX_SUN_ABSTRACT) {
+ (void) vn_removeat(NULL, addr->sa_data,
+ UIO_SYSSPACE, RMFILE);
+ }
+ }
+ }
+
+ error = socket_bind(so, addr, len, _SOBIND_XPG4_2, CRED());
+
+ /*
+ * Linux returns EADDRINUSE for attempts to bind to Unix domain
+ * sockets that aren't sockets.
+ */
+ if (error == EINVAL && addr != NULL && addr->sa_family == AF_UNIX &&
+ not_sock == B_TRUE) {
+ error = EADDRINUSE;
+ }
+
+ releasef(sock);
+
+ if (addr != NULL) {
+ kmem_free(addr, len);
+ }
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_connect(long sock, uintptr_t name, socklen_t namelen)
+{
+ struct sonode *so;
+ struct sockaddr *addr = NULL;
+ lx_socket_aux_data_t *sad = NULL;
+ socklen_t len = 0;
+ file_t *fp;
+ int error;
+
+ if ((so = getsonode(sock, &error, &fp)) == NULL) {
+ return (set_errno(error));
+ }
+
+ /*
+ * Ensure the name is sized appropriately before we alloc memory and
+ * copy it in from userspace. We need at least the address family to
+ * make later sizing decisions.
+ */
+ if (namelen != 0) {
+ error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen,
+ &addr, &len, NULL);
+ if (error != 0) {
+ releasef(sock);
+ return (set_errno(error));
+ }
+ }
+
+ error = socket_connect(so, addr, len, fp->f_flag,
+ _SOCONNECT_XPG4_2, CRED());
+
+ if (error == EINTR)
+ lx_sock_syscall_restart(so, B_FALSE);
+
+ /*
+ * Linux connect(2) behavior is rather strange when using the
+ * O_NONBLOCK flag. The first call will return EINPROGRESS, as
+ * expected. Provided that is successful, a second call to connect
+ * will return 0 instead of EISCONN. Subsequent connect calls will
+ * return EISCONN.
+ */
+ if ((fp->f_flag & FNONBLOCK) != 0 && error != 0) {
+ sad = lx_sad_acquire(SOTOV(so));
+ if (error == EISCONN &&
+ sad->lxsad_status == LXSS_CONNECTING) {
+ /* Report the one success */
+ sad->lxsad_status = LXSS_CONNECTED;
+ error = 0;
+ } else if (error == EINPROGRESS) {
+ sad->lxsad_status = LXSS_CONNECTING;
+ }
+ mutex_exit(&sad->lxsad_lock);
+ }
+
+ /*
+ * When connecting to a UDP socket, configure it so that future
+ * sendto/sendmsg operations are allowed to specify a destination
+	 * address. See the POSIX spec for sendto(2). Linux allows this while
+ * illumos would return EISCONN if the option is not set.
+ */
+ if (error == 0 && so->so_protocol == IPPROTO_UDP &&
+ (so->so_family == AF_INET || so->so_family == AF_INET6)) {
+ int val = 1;
+
+ DTRACE_PROBE(lx__connect__udp);
+ (void) socket_setsockopt(so, IPPROTO_UDP, UDP_SND_TO_CONNECTED,
+ &val, sizeof (val), CRED());
+ }
+
+ releasef(sock);
+
+ if (addr != NULL) {
+ kmem_free(addr, len);
+ }
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
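+
+/*
+ * Userspace sketch of the non-blocking connect emulation above
+ * (illustrative only):
+ *
+ *	fcntl(fd, F_SETFL, O_NONBLOCK);
+ *	connect(fd, &addr, len);	returns -1, errno == EINPROGRESS
+ *	connect(fd, &addr, len);	returns 0 (the one emulated success)
+ *	connect(fd, &addr, len);	returns -1, errno == EISCONN
+ *
+ * The LXSS_CONNECTING -> LXSS_CONNECTED transition in lxsad_status is what
+ * converts the second call's EISCONN into success.
+ */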
+
+/*
+ * Custom version of socket_recvmsg for error-handling overrides.
+ */
+static int
+lx_socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ cred_t *cr)
+{
+ int error;
+ ssize_t orig_resid = uiop->uio_resid;
+
+ /*
+ * Do not bypass the cache when reading data, as the application
+ * is likely to access the data shortly.
+ */
+ uiop->uio_extflg |= UIO_COPY_CACHED;
+
+ error = SOP_RECVMSG(so, msg, uiop, cr);
+
+ switch (error) {
+ case EINTR:
+ /* EAGAIN is EWOULDBLOCK */
+ case EWOULDBLOCK:
+ /* We did a partial read */
+ if (uiop->uio_resid != orig_resid)
+ error = 0;
+ break;
+ case ENOTCONN:
+ /*
+ * The rules are different for non-blocking sockets which are
+ * still in the process of making a connection
+ */
+ if ((msg->msg_flags & MSG_DONTWAIT) != 0 ||
+ (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) {
+ error = EAGAIN;
+ }
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+static long
+lx_recv_common(int sock, struct nmsghdr *msg, xuio_t *xuiop, int flags,
+ void *namelenp, void *controllenp, void *flagsp)
+{
+ struct sonode *so;
+ file_t *fp;
+ void *name;
+ socklen_t namelen;
+ void *control;
+ socklen_t controllen;
+ ssize_t len;
+ int error;
+ boolean_t fd_cloexec;
+ boolean_t is_peek_trunc;
+
+ if ((so = getsonode(sock, &error, &fp)) == NULL) {
+ return (set_errno(error));
+ }
+
+ fd_cloexec = ((flags & LX_MSG_CMSG_CLOEXEC) != 0);
+ flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS);
+ is_peek_trunc = (flags & (MSG_PEEK|MSG_TRUNC)) == (MSG_PEEK|MSG_TRUNC);
+ len = xuiop->xu_uio.uio_resid;
+ xuiop->xu_uio.uio_fmode = fp->f_flag;
+ xuiop->xu_uio.uio_extflg = UIO_COPY_CACHED;
+
+ /*
+ * Linux accepts MSG_TRUNC as an input flag, unlike SunOS and many
+	 * other UNIX systems. When combined with MSG_PEEK, it causes
+ * recvmsg to return the size of the waiting message, regardless of
+ * buffer size. This behavior is commonly used with a 0-length buffer
+ * to interrogate the size of a queued message prior to allocating a
+ * buffer for it.
+ *
+ * In order to support this functionality, a custom XUIO type is used
+ * to communicate the total message size out from the depths of sockfs.
+ */
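+	/*
+	 * For example (userspace view): a datagram consumer may size its
+	 * receive buffer with a zero-length peek before the real receive:
+	 *
+	 *	ssize_t sz = recv(fd, NULL, 0, MSG_PEEK | MSG_TRUNC);
+	 *	buf = malloc(sz);
+	 *	recv(fd, buf, sz, 0);
+	 */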
+ if (is_peek_trunc) {
+ xuiop->xu_uio.uio_extflg |= UIO_XUIO;
+ xuiop->xu_type = UIOTYPE_PEEKSIZE;
+ xuiop->xu_ext.xu_ps.xu_ps_set = B_FALSE;
+ xuiop->xu_ext.xu_ps.xu_ps_size = 0;
+ }
+
+ name = msg->msg_name;
+ namelen = msg->msg_namelen;
+ control = msg->msg_control;
+ controllen = msg->msg_controllen;
+
+ /*
+ * socket_recvmsg will allocate these if needed.
+ * NULL them out to prevent any confusion.
+ */
+ msg->msg_name = NULL;
+ msg->msg_control = NULL;
+
+ msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
+ MSG_DONTWAIT);
+ /* Default to XPG4.2 operation */
+ msg->msg_flags |= MSG_XPG4_2;
+
+ error = lx_socket_recvmsg(so, msg, (struct uio *)xuiop, CRED());
+ if (error) {
+ if (error == EINTR)
+ lx_sock_syscall_restart(so, B_TRUE);
+ releasef(sock);
+ return (set_errno(error));
+ }
+ lwp_stat_update(LWP_STAT_MSGRCV, 1);
+ releasef(sock);
+
+ if (namelen != 0) {
+ error = stol_sockaddr_copyout(msg->msg_name, msg->msg_namelen,
+ name, namelenp, namelen);
+
+ if (msg->msg_namelen != 0) {
+ kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
+ msg->msg_namelen = 0;
+ }
+
+ /*
+ * Errors during copyout of the name are not a concern to Linux
+		 * callers at this point in the syscall.
+ */
+ if (error != 0 && error != EFAULT) {
+ goto err;
+ }
+ }
+
+ if (controllen != 0) {
+ if (fd_cloexec) {
+ /*
+ * If CLOEXEC needs to set on file descriptors passed
+ * via SCM_RIGHTS, do so before formatting the cmsgs
+ * for Linux.
+ */
+ lx_cmsg_set_cloexec(msg->msg_control,
+ msg->msg_controllen);
+ }
+ if (so->so_family == AF_UNIX &&
+ (so->so_mode & SM_CONNREQUIRED) != 0) {
+ /*
+ * It may be necessary to append a SCM_UCRED cmsg to
+ * the controls if SO_PASSCRED is set on a
+ * connection-oriented AF_UNIX socket.
+ *
+ * See lx_setsockopt_socket for more details.
+ */
+ if (lx_cmsg_try_ucred(so, msg, controllen) != 0) {
+ msg->msg_flags |= MSG_CTRUNC;
+ }
+ }
+
+ error = stol_cmsgs_copyout(msg->msg_control,
+ msg->msg_controllen, control, controllenp, controllen);
+
+ if (error != 0) {
+ /*
+ * If there was an error during cmsg translation or
+ * copyout, we need to clean up any FDs that are being
+ * passed back via SCM_RIGHTS. This prevents us from
+ * leaking those open files.
+ */
+ so_closefds(msg->msg_control, msg->msg_controllen, 0,
+ 0);
+
+ /*
+ * An error during cmsg_copyout means we had
+ * _something_ to process.
+ */
+ VERIFY(msg->msg_controllen != 0);
+
+ kmem_free(msg->msg_control,
+ (size_t)msg->msg_controllen);
+ msg->msg_controllen = 0;
+
+ if (error == EMSGSIZE) {
+ /* Communicate that messages were truncated */
+ msg->msg_flags |= MSG_CTRUNC;
+ error = 0;
+ } else {
+ goto err;
+ }
+ } else if (msg->msg_controllen != 0) {
+ kmem_free(msg->msg_control,
+ (size_t)msg->msg_controllen);
+ msg->msg_controllen = 0;
+ }
+ }
+
+ if (flagsp != NULL) {
+ int flags;
+
+ /* Clear internal flag. */
+ flags = msg->msg_flags & ~MSG_XPG4_2;
+ flags = lx_xlate_sock_flags(flags, SUNOS_TO_LX);
+
+		if (copyout(&flags, flagsp, sizeof (flags)) != 0) {
+ error = EFAULT;
+ goto err;
+ }
+ }
+
+ /*
+ * If both MSG_PEEK|MSG_TRUNC were set on the input flags and the
+ * socket layer was able to calculate the total message size for us,
+ * return that instead of the copied size.
+ */
+ if (is_peek_trunc && xuiop->xu_ext.xu_ps.xu_ps_set == B_TRUE) {
+ return (xuiop->xu_ext.xu_ps.xu_ps_size);
+ }
+
+ return (len - xuiop->xu_uio.uio_resid);
+
+err:
+ if (msg->msg_controllen != 0) {
+ /* Prevent FD leakage (see above) */
+ so_closefds(msg->msg_control, msg->msg_controllen, 0, 0);
+ kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
+ }
+ if (msg->msg_namelen != 0) {
+ kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
+ }
+ return (set_errno(error));
+}
+
+long
+lx_recv(int sock, void *buffer, size_t len, int flags)
+{
+ struct nmsghdr smsg;
+ xuio_t xuio;
+ struct iovec uiov;
+
+ if ((ssize_t)len < 0) {
+ /*
+ * The input len is unsigned, so limit it to SSIZE_MAX since
+ * the return value is signed.
+ */
+ return (set_errno(EINVAL));
+ }
+
+ uiov.iov_base = buffer;
+ uiov.iov_len = len;
+ xuio.xu_uio.uio_loffset = 0;
+ xuio.xu_uio.uio_iov = &uiov;
+ xuio.xu_uio.uio_iovcnt = 1;
+ xuio.xu_uio.uio_resid = len;
+ xuio.xu_uio.uio_segflg = UIO_USERSPACE;
+ xuio.xu_uio.uio_limit = 0;
+
+ smsg.msg_namelen = 0;
+ smsg.msg_controllen = 0;
+ smsg.msg_flags = 0;
+ return (lx_recv_common(sock, &smsg, &xuio, flags, NULL, NULL, NULL));
+}
+
+long
+lx_recvfrom(int sock, void *buffer, size_t len, int flags,
+ struct sockaddr *srcaddr, socklen_t *addrlenp)
+{
+ struct nmsghdr smsg;
+ xuio_t xuio;
+ struct iovec uiov;
+
+ if ((ssize_t)len < 0) {
+ /* Keep len reasonably limited (see lx_recv) */
+ return (set_errno(EINVAL));
+ }
+
+ uiov.iov_base = buffer;
+ uiov.iov_len = len;
+ xuio.xu_uio.uio_loffset = 0;
+ xuio.xu_uio.uio_iov = &uiov;
+ xuio.xu_uio.uio_iovcnt = 1;
+ xuio.xu_uio.uio_resid = len;
+ xuio.xu_uio.uio_segflg = UIO_USERSPACE;
+ xuio.xu_uio.uio_limit = 0;
+
+ smsg.msg_name = (char *)srcaddr;
+ if (addrlenp != NULL && srcaddr != NULL) {
+ /*
+ * Despite addrlenp being defined as a socklen_t *, Linux
+ * treats it internally as an int *. Certain LTP tests depend
+ * upon this behavior, so we must emulate it as well.
+ */
+ int namelen;
+
+ if (copyin(addrlenp, &namelen, sizeof (namelen)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ if (namelen < 0) {
+ return (set_errno(EINVAL));
+ }
+ smsg.msg_namelen = namelen;
+ } else {
+ smsg.msg_namelen = 0;
+ }
+ smsg.msg_controllen = 0;
+ smsg.msg_flags = 0;
+
+ return (lx_recv_common(sock, &smsg, &xuio, flags, addrlenp, NULL,
+ NULL));
+}
+
+long
+lx_recvmsg(int sock, void *msg, int flags)
+{
+ struct nmsghdr smsg;
+ xuio_t xuio;
+ struct iovec luiov[IOV_MAX_STACK], *uiov;
+ int i, iovcnt, iovsize;
+ long res;
+ ssize_t len = 0;
+ void *namelenp, *controllenp, *flagsp;
+
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ lx_msghdr32_t lmsg32;
+ if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name;
+ smsg.msg_namelen = lmsg32.msg_namelen;
+ smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov;
+ smsg.msg_iovlen = lmsg32.msg_iovlen;
+ smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control;
+ smsg.msg_controllen = lmsg32.msg_controllen;
+ smsg.msg_flags = lmsg32.msg_flags;
+
+ namelenp = &((lx_msghdr32_t *)msg)->msg_namelen;
+ controllenp = &((lx_msghdr32_t *)msg)->msg_controllen;
+ flagsp = &((lx_msghdr32_t *)msg)->msg_flags;
+ } else
+#endif /* defined(_LP64) */
+ {
+ lx_msghdr_t lmsg;
+ if (copyin(msg, &lmsg, sizeof (lmsg)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ smsg.msg_name = lmsg.msg_name;
+ smsg.msg_namelen = lmsg.msg_namelen;
+ smsg.msg_iov = lmsg.msg_iov;
+ smsg.msg_iovlen = lmsg.msg_iovlen;
+ smsg.msg_control = lmsg.msg_control;
+ smsg.msg_controllen = lmsg.msg_controllen;
+ smsg.msg_flags = lmsg.msg_flags;
+
+ namelenp = &((lx_msghdr_t *)msg)->msg_namelen;
+ controllenp = &((lx_msghdr_t *)msg)->msg_controllen;
+ flagsp = &((lx_msghdr_t *)msg)->msg_flags;
+ }
+
+ iovcnt = smsg.msg_iovlen;
+ if (iovcnt < 0 || iovcnt > IOV_MAX) {
+ return (set_errno(EMSGSIZE));
+ }
+ if (iovcnt > IOV_MAX_STACK) {
+ iovsize = iovcnt * sizeof (struct iovec);
+ uiov = kmem_alloc(iovsize, KM_SLEEP);
+ } else if (iovcnt > 0) {
+ iovsize = 0;
+ uiov = luiov;
+ } else {
+ iovsize = 0;
+ uiov = NULL;
+ goto noiov;
+ }
+
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ /* convert from 32bit iovec structs */
+ struct iovec32 luiov32[IOV_MAX_STACK], *uiov32;
+ ssize_t iov32size;
+ ssize32_t count32;
+
+ iov32size = iovcnt * sizeof (struct iovec32);
+ if (iovsize != 0) {
+ uiov32 = kmem_alloc(iov32size, KM_SLEEP);
+ } else {
+ uiov32 = luiov32;
+ }
+
+ if (copyin((struct iovec32 *)smsg.msg_iov, uiov32, iov32size)) {
+ if (iovsize != 0) {
+ kmem_free(uiov32, iov32size);
+ kmem_free(uiov, iovsize);
+ }
+
+ return (set_errno(EFAULT));
+ }
+
+ count32 = 0;
+ for (i = 0; i < iovcnt; i++) {
+ ssize32_t iovlen32;
+
+ iovlen32 = uiov32[i].iov_len;
+ count32 += iovlen32;
+ if (iovlen32 < 0 || count32 < 0) {
+ if (iovsize != 0) {
+ kmem_free(uiov32, iov32size);
+ kmem_free(uiov, iovsize);
+ }
+
+ return (set_errno(EINVAL));
+ }
+
+ uiov[i].iov_len = iovlen32;
+ uiov[i].iov_base =
+ (caddr_t)(uintptr_t)uiov32[i].iov_base;
+ }
+ len = count32;
+
+ if (iovsize != 0) {
+ kmem_free(uiov32, iov32size);
+ }
+ } else
+#endif /* defined(_LP64) */
+ {
+ if (copyin(smsg.msg_iov, uiov,
+ iovcnt * sizeof (struct iovec)) != 0) {
+ if (iovsize != 0) {
+ kmem_free(uiov, iovsize);
+ }
+ return (set_errno(EFAULT));
+ }
+
+ len = 0;
+ for (i = 0; i < iovcnt; i++) {
+ ssize_t iovlen = uiov[i].iov_len;
+ len += iovlen;
+ if (iovlen < 0 || len < 0) {
+ if (iovsize != 0) {
+ kmem_free(uiov, iovsize);
+ }
+ return (set_errno(EINVAL));
+ }
+ }
+ }
+
+noiov:
+ /* Since the iovec is passed via the uio, NULL it out in the msg */
+ smsg.msg_iov = NULL;
+
+ xuio.xu_uio.uio_loffset = 0;
+ xuio.xu_uio.uio_iov = uiov;
+ xuio.xu_uio.uio_iovcnt = iovcnt;
+ xuio.xu_uio.uio_resid = len;
+ xuio.xu_uio.uio_segflg = UIO_USERSPACE;
+ xuio.xu_uio.uio_limit = 0;
+
+ res = lx_recv_common(sock, &smsg, &xuio, flags, namelenp, controllenp,
+ flagsp);
+
+ if (iovsize != 0) {
+ kmem_free(uiov, iovsize);
+ }
+
+ return (res);
+}
+
+long
+lx_recvmmsg(int sock, void *msg, uint_t vlen, int flags, timespec_t *timeoutp)
+{
+ hrtime_t deadline = 0;
+ uint_t rcvd = 0;
+ long ret = 0;
+ boolean_t waitforone;
+
+ waitforone = ((flags & LX_MSG_WAITFORONE) != 0);
+ flags &= ~LX_MSG_WAITFORONE;
+
+ /*
+ * We want to limit the work that a thread calling recvmmsg() can
+ * perform in the kernel so that it cannot accrue too high a priority.
+ * Artificially capping vlen means that the thread will return to
+ * userspace after processing at most IOV_MAX messages, giving the
+ * system a chance to reset the thread priority.
+ *
+ * Linux does not cap vlen here and recvmmsg() is expected to return
+	 * once vlen messages have been received, a timeout occurs, or an
+ * error is encountered; the artificial cap adds another case.
+ *
+ * It is possible that returning "early" in this emulation will
+	 * cause problems with some applications; however, a properly written
+ * recvmmsg() consumer should consume only the received datagrams
+ * and try again if it wants more. This may need revisiting in the
+ * future.
+ */
+ if (vlen > IOV_MAX)
+ vlen = IOV_MAX;
+
+ if (timeoutp != NULL) {
+ timespec_t timeout;
+ uhrtime_t utime = (uhrtime_t)gethrtime();
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(timeoutp, &timeout, sizeof (timestruc_t)))
+ return (set_errno(EFAULT));
+ } else {
+ timestruc32_t timeout32;
+ if (copyin(timeoutp, &timeout32,
+ sizeof (timestruc32_t)))
+ return (set_errno(EFAULT));
+ timeout.tv_sec = (time_t)timeout32.tv_sec;
+ timeout.tv_nsec = timeout32.tv_nsec;
+ }
+
+ if (itimerspecfix(&timeout))
+ return (set_errno(EINVAL));
+
+ /*
+ * Make sure that deadline will not overflow. itimerspecfix()
+ * has already checked for negative values and too big a value
+ * in tv_nsec
+		 * in tv_nsec.
+ if (timeout.tv_sec >= HRTIME_MAX / NANOSEC)
+ return (set_errno(EINVAL));
+
+ utime += timeout.tv_sec * NANOSEC;
+ utime += timeout.tv_nsec;
+
+ if (utime > HRTIME_MAX)
+ return (set_errno(EINVAL));
+
+ deadline = (hrtime_t)utime;
+ }
+
+ for (rcvd = 0; rcvd < vlen; rcvd++) {
+ uint_t *ptr;
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ lx_mmsghdr_t *hdr = (lx_mmsghdr_t *)msg;
+ hdr += rcvd;
+ ret = lx_recvmsg(sock, (lx_msghdr_t *)hdr, flags);
+ ptr = &hdr->msg_len;
+ } else {
+ lx_mmsghdr32_t *hdr = (lx_mmsghdr32_t *)msg;
+ hdr += rcvd;
+ ret = lx_recvmsg(sock, (lx_msghdr32_t *)hdr, flags);
+ ptr = &hdr->msg_len;
+ }
+ if (ttolwp(curthread)->lwp_errno != 0)
+ break;
+		(void) copyout(&ret, ptr, sizeof (*ptr));
+ /*
+ * If MSG_WAITFORONE is set, set MSG_DONTWAIT after the
+ * first packet has been received.
+ */
+ if (waitforone) {
+ flags |= LX_MSG_DONTWAIT;
+ waitforone = B_FALSE;
+ }
+ /*
+ * The Linux man page documents the timeout option as
+ * only being checked after each datagram is received.
+ * The man page does not document ETIMEDOUT as a return
+ * code so we do not set an errno.
+ */
+ if (deadline > 0 && gethrtime() >= deadline)
+ break;
+ }
+
+ if (rcvd > 0) {
+ /*
+ * Any error code is deliberately discarded if any message
+ * was successfully received.
+ */
+ ttolwp(curthread)->lwp_errno = 0;
+ return (rcvd);
+ }
+
+ return (ret);
+}
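+
+/*
+ * Userspace sketch of the MSG_WAITFORONE handling above (illustrative
+ * only): the first datagram may block, the rest are collected
+ * opportunistically:
+ *
+ *	struct mmsghdr msgs[8];
+ *
+ *	(populate msgs)
+ *	n = recvmmsg(fd, msgs, 8, MSG_WAITFORONE, NULL);
+ *
+ * The call blocks for msgs[0] and fills msgs[1..n-1] only from data already
+ * queued, matching the loop above, which ORs in MSG_DONTWAIT once the first
+ * message has been received.
+ */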
+
+/*
+ * Custom version of socket_sendmsg for error-handling overrides.
+ */
+static int
+lx_socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ cred_t *cr, boolean_t nosig)
+{
+ int error = 0;
+ ssize_t orig_resid = uiop->uio_resid;
+
+ /*
+ * Do not bypass the cache if we are doing a local (AF_UNIX) write.
+ */
+ if (so->so_family == AF_UNIX) {
+ uiop->uio_extflg |= UIO_COPY_CACHED;
+ } else {
+ uiop->uio_extflg &= ~UIO_COPY_CACHED;
+ }
+
+ error = SOP_SENDMSG(so, msg, uiop, cr);
+
+ switch (error) {
+ case EINTR:
+ case ENOMEM:
+ /* EAGAIN is EWOULDBLOCK */
+ case EWOULDBLOCK:
+ /* We did a partial send */
+ if (uiop->uio_resid != orig_resid) {
+ error = 0;
+ }
+ break;
+
+ case ENOTCONN:
+ /*
+ * The rules are different for non-blocking sockets which are
+ * still in the process of making a connection
+ */
+ if ((msg->msg_flags & MSG_DONTWAIT) != 0 ||
+ (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) {
+ error = EAGAIN;
+ break;
+ }
+
+ /* Appease LTP and match behavior detailed in the man page */
+ error = EPIPE;
+ /* FALLTHROUGH */
+ case EPIPE:
+ if (nosig == B_FALSE) {
+ tsignal(curthread, SIGPIPE);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return (error);
+}
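+
+/*
+ * Userspace sketch (illustrative only): the nosig handling above implements
+ * the Linux MSG_NOSIGNAL flag, which suppresses SIGPIPE delivery in favor
+ * of the bare EPIPE return:
+ *
+ *	send(fd, buf, len, MSG_NOSIGNAL);	returns -1, errno == EPIPE,
+ *						and no SIGPIPE is raised
+ */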
+
+static long
+lx_send_common(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
+{
+ struct sonode *so;
+ file_t *fp;
+ struct sockaddr *name = NULL;
+ socklen_t namelen;
+ void *control = NULL;
+ socklen_t controllen;
+ ssize_t len = 0;
+ int error;
+ boolean_t nosig;
+
+ if ((so = getsonode(sock, &error, &fp)) == NULL) {
+ return (set_errno(error));
+ }
+
+ uiop->uio_fmode = fp->f_flag;
+
+ /* Allocate and copyin name and control */
+ if (msg->msg_name != NULL && msg->msg_namelen != 0) {
+ ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+
+ error = ltos_sockaddr_copyin((struct sockaddr *)msg->msg_name,
+ msg->msg_namelen, &name, &namelen, NULL);
+ if (error != 0) {
+ goto done;
+ }
+		/* ltos_sockaddr_copyin NUL-terminates addresses for AF_UNIX */
+ msg->msg_namelen = namelen;
+ msg->msg_name = name;
+ } else {
+ msg->msg_name = name = NULL;
+ msg->msg_namelen = namelen = 0;
+ }
+
+ if (msg->msg_control != NULL && msg->msg_controllen != 0) {
+ /*
+ * Verify that the length is not excessive to prevent
+ * an application from consuming all of kernel memory.
+ */
+ if (msg->msg_controllen > SO_MAXARGSIZE) {
+ error = EINVAL;
+ goto done;
+ }
+ if ((error = ltos_cmsgs_copyin(msg->msg_control,
+ msg->msg_controllen, &control, &controllen)) != 0) {
+ goto done;
+ }
+ msg->msg_control = control;
+ msg->msg_controllen = controllen;
+ } else {
+ msg->msg_control = control = NULL;
+ msg->msg_controllen = controllen = 0;
+ }
+
+ len = uiop->uio_resid;
+ msg->msg_flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS);
+ /* Default to XPG4.2 operation */
+ msg->msg_flags |= MSG_XPG4_2;
+ nosig = ((flags & LX_MSG_NOSIGNAL) != 0);
+
+ error = lx_socket_sendmsg(so, msg, uiop, CRED(), nosig);
+ if (error == EINTR)
+ lx_sock_syscall_restart(so, B_FALSE);
+done:
+ if (control != NULL) {
+ kmem_free(control, controllen);
+ }
+ if (name != NULL) {
+ kmem_free(name, namelen);
+ }
+ if (error != 0) {
+ releasef(sock);
+ return (set_errno(error));
+ }
+ lwp_stat_update(LWP_STAT_MSGSND, 1);
+ releasef(sock);
+ return (len - uiop->uio_resid);
+}
+
+/*
+ * For both send and sendto Linux evaluates errors in a different order than
+ * we do internally. Specifically it will check the buffer address before
+ * checking if the socket is connected. This can lead to a different errno on
+ * us vs. Linux (seen with LTP) but we don't bother to emulate this.
+ */
+long
+lx_send(int sock, void *buffer, size_t len, int flags)
+{
+ struct nmsghdr smsg;
+ struct uio auio;
+ struct iovec aiov[1];
+
+ if ((ssize_t)len < 0) {
+ /* Keep len reasonably limited (see lx_recv) */
+ return (set_errno(EINVAL));
+ }
+
+ aiov[0].iov_base = buffer;
+ aiov[0].iov_len = len;
+ auio.uio_loffset = 0;
+ auio.uio_iov = aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_limit = 0;
+
+ smsg.msg_name = NULL;
+ smsg.msg_control = NULL;
+ return (lx_send_common(sock, &smsg, &auio, flags));
+}
+
+long
+lx_sendto(int sock, void *buffer, size_t len, int flags,
+ struct sockaddr *dstaddr, socklen_t addrlen)
+{
+ struct nmsghdr smsg;
+ struct uio auio;
+ struct iovec aiov[1];
+
+ if ((ssize_t)len < 0) {
+ /* Keep len reasonably limited (see lx_recv) */
+ return (set_errno(EINVAL));
+ }
+
+ aiov[0].iov_base = buffer;
+ aiov[0].iov_len = len;
+ auio.uio_loffset = 0;
+ auio.uio_iov = aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_limit = 0;
+
+ smsg.msg_name = (char *)dstaddr;
+ smsg.msg_namelen = addrlen;
+ smsg.msg_control = NULL;
+ return (lx_send_common(sock, &smsg, &auio, flags));
+}
+
+long
+lx_sendmsg(int sock, void *msg, int flags)
+{
+ struct nmsghdr smsg;
+ struct uio auio;
+ struct iovec buf[IOV_MAX_STACK], *aiov;
+ int i, iovcnt, iovsize;
+ long res;
+ ssize_t len = 0;
+
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ lx_msghdr32_t lmsg32;
+ if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name;
+ smsg.msg_namelen = lmsg32.msg_namelen;
+ smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov;
+ smsg.msg_iovlen = lmsg32.msg_iovlen;
+ smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control;
+ smsg.msg_controllen = lmsg32.msg_controllen;
+ smsg.msg_flags = lmsg32.msg_flags;
+ } else
+#endif /* defined(_LP64) */
+ {
+ lx_msghdr_t lmsg;
+ if (copyin(msg, &lmsg, sizeof (lmsg)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ smsg.msg_name = lmsg.msg_name;
+ smsg.msg_namelen = lmsg.msg_namelen;
+ smsg.msg_iov = lmsg.msg_iov;
+ smsg.msg_iovlen = lmsg.msg_iovlen;
+ smsg.msg_control = lmsg.msg_control;
+ smsg.msg_controllen = lmsg.msg_controllen;
+ smsg.msg_flags = lmsg.msg_flags;
+ }
+
+ iovcnt = smsg.msg_iovlen;
+ if (iovcnt <= 0 || iovcnt > IOV_MAX) {
+ return (set_errno(EMSGSIZE));
+ }
+ if (iovcnt > IOV_MAX_STACK) {
+ iovsize = iovcnt * sizeof (struct iovec);
+ aiov = kmem_alloc(iovsize, KM_SLEEP);
+ } else {
+ iovsize = 0;
+ aiov = buf;
+ }
+
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ /* convert from 32bit iovec structs */
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ ssize_t iov32size;
+ ssize32_t count32;
+
+ iov32size = iovcnt * sizeof (struct iovec32);
+ if (iovsize != 0) {
+ aiov32 = kmem_alloc(iov32size, KM_SLEEP);
+ }
+
+ if (copyin((struct iovec32 *)smsg.msg_iov, aiov32, iov32size)) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
+ return (set_errno(EFAULT));
+ }
+
+ count32 = 0;
+ for (i = 0; i < iovcnt; i++) {
+ ssize32_t iovlen32;
+
+ iovlen32 = aiov32[i].iov_len;
+ count32 += iovlen32;
+ if (iovlen32 < 0 || count32 < 0) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
+ return (set_errno(EINVAL));
+ }
+
+ aiov[i].iov_len = iovlen32;
+ aiov[i].iov_base =
+ (caddr_t)(uintptr_t)aiov32[i].iov_base;
+ }
+ len = count32;
+
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ }
+ } else
+#endif /* defined(_LP64) */
+ {
+ if (copyin(smsg.msg_iov, aiov,
+ iovcnt * sizeof (struct iovec)) != 0) {
+ if (iovsize != 0) {
+ kmem_free(aiov, iovsize);
+ }
+ return (set_errno(EFAULT));
+ }
+
+ len = 0;
+ for (i = 0; i < iovcnt; i++) {
+ ssize_t iovlen = aiov[i].iov_len;
+
+ len += iovlen;
+ if (iovlen < 0 || len < 0) {
+ if (iovsize != 0) {
+ kmem_free(aiov, iovsize);
+ }
+ return (set_errno(EINVAL));
+ }
+ }
+ }
+ /* Since the iovec is passed via the uio, NULL it out in the msg */
+ smsg.msg_iov = NULL;
+
+ auio.uio_loffset = 0;
+ auio.uio_iov = aiov;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_limit = 0;
+
+ res = lx_send_common(sock, &smsg, &auio, flags);
+
+ if (iovsize != 0) {
+ kmem_free(aiov, iovsize);
+ }
+
+ return (res);
+}
+
+long
+lx_sendmmsg(int sock, void *msg, uint_t vlen, int flags)
+{
+ long ret = 0;
+ uint_t sent = 0;
+
+ /*
+ * Linux caps vlen to UIO_MAXIOV (1024).
+ */
+ if (vlen > IOV_MAX)
+ vlen = IOV_MAX;
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ lx_mmsghdr_t *hdr = msg;
+
+ for (sent = 0; sent < vlen; sent++, hdr++) {
+ ret = lx_sendmsg(sock, (lx_msghdr_t *)hdr, flags);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ break;
+			(void) copyout(&ret, &hdr->msg_len,
+			    sizeof (hdr->msg_len));
+ }
+ } else {
+ lx_mmsghdr32_t *hdr = msg;
+
+ for (sent = 0; sent < vlen; sent++, hdr++) {
+ ret = lx_sendmsg(sock, (lx_msghdr32_t *)hdr, flags);
+ if (ttolwp(curthread)->lwp_errno != 0)
+ break;
+			(void) copyout(&ret, &hdr->msg_len,
+			    sizeof (hdr->msg_len));
+ }
+ }
+
+ if (sent > 0) {
+ /*
+ * Any error code is deliberately discarded if any message
+ * was successfully sent.
+ */
+ ttolwp(curthread)->lwp_errno = 0;
+ return (sent);
+ }
+
+ return (ret);
+}
+
+/*
+ * Linux socket option type definitions
+ *
+ * The protocol `levels` are well defined (see in.h). The option values are
+ * not so well defined. Linux often uses different values than illumos,
+ * although they mean the same thing. For example, IP_TOS in Linux is
+ * defined as value 1 but in illumos it is defined as value 3. This table
+ * maps each protocol level to its options, translating the option values
+ * between Linux and illumos in both directions. Hence the complexity.
+ *
+ * For a certain subset of sockopts, Linux will implicitly truncate optval
+ * input, so long as optlen meets a minimum size. Because illumos is strict
+ * about optlen, we must cap optlen for those options.
+ */
+
+typedef struct lx_sockopt_map {
+	const int lsm_opt;		/* illumos-native equivalent */
+ const int lsm_lcap; /* Cap optlen to this size. (Ignored if 0) */
+} lx_sockopt_map_t;
+
+typedef struct lx_proto_opts {
+ const lx_sockopt_map_t *lpo_entries; /* Linux->SunOS map entries */
+ unsigned int lpo_max; /* max entries in table */
+} lx_proto_opts_t;
+
+#define OPTNOTSUP -1 /* we don't support it */
+
+#define PROTO_SOCKOPTS(opts) \
+ { (opts), sizeof ((opts)) / sizeof ((opts)[0]) }
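+
+/*
+ * For example: a Linux setsockopt(fd, IPPROTO_IP, IP_TOS, &tos, 4) arrives
+ * with the Linux optname value 1; indexing ltos_ip_sockopts[1] below yields
+ * { IP_TOS, sizeof (int) }, i.e. the native IP_TOS (value 3) with optlen
+ * capped to an int, and that translated pair is what the setsockopt
+ * emulation hands to the native socket layer.
+ */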
+
+/* Shorten name so the columns can line up */
+#define IP_MREQ_SZ sizeof (struct ip_mreq)
+
+static const lx_sockopt_map_t ltos_ip_sockopts[LX_IP_UNICAST_IF + 1] = {
+ { OPTNOTSUP, 0 },
+ { IP_TOS, sizeof (int) }, /* IP_TOS */
+ { IP_TTL, sizeof (int) }, /* IP_TTL */
+ { IP_HDRINCL, sizeof (int) }, /* IP_HDRINCL */
+ { IP_OPTIONS, 0 }, /* IP_OPTIONS */
+ { OPTNOTSUP, 0 }, /* IP_ROUTER_ALERT */
+ { IP_RECVOPTS, sizeof (int) }, /* IP_RECVOPTS */
+ { IP_RETOPTS, sizeof (int) }, /* IP_RETOPTS */
+ { IP_PKTINFO, sizeof (int) }, /* IP_PKTINFO */
+ { OPTNOTSUP, 0 }, /* IP_PKTOPTIONS */
+ { OPTNOTSUP, 0 }, /* IP_MTUDISCOVER */
+ { OPTNOTSUP, 0 }, /* IP_RECVERR */
+ { IP_RECVTTL, sizeof (int) }, /* IP_RECVTTL */
+ { OPTNOTSUP, 0 }, /* IP_RECVTOS */
+ { OPTNOTSUP, 0 }, /* IP_MTU */
+ { OPTNOTSUP, 0 }, /* IP_FREEBIND */
+ { OPTNOTSUP, 0 }, /* IP_IPSEC_POLICY */
+ { OPTNOTSUP, 0 }, /* IP_XFRM_POLICY */
+ { OPTNOTSUP, 0 }, /* IP_PASSSEC */
+ { OPTNOTSUP, 0 }, /* IP_TRANSPARENT */
+ { OPTNOTSUP, 0 }, /* IP_ORIGDSTADDR */
+ { OPTNOTSUP, 0 }, /* IP_MINTTL */
+ { OPTNOTSUP, 0 }, /* IP_NODEFRAG */
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { IP_MULTICAST_IF, sizeof (int) }, /* IP_MULTICAST_IF */
+ { IP_MULTICAST_TTL, sizeof (int) }, /* IP_MULTICAST_TTL */
+ { IP_MULTICAST_LOOP, sizeof (int) }, /* IP_MULTICAST_LOOP */
+ { IP_ADD_MEMBERSHIP, IP_MREQ_SZ }, /* IP_ADD_MEMBERSHIP */
+ { IP_DROP_MEMBERSHIP, IP_MREQ_SZ }, /* IP_DROP_MEMBERSHIP */
+ { IP_UNBLOCK_SOURCE, 0 }, /* IP_UNBLOCK_SOURCE */
+ { IP_BLOCK_SOURCE, 0 }, /* IP_BLOCK_SOURCE */
+ { IP_ADD_SOURCE_MEMBERSHIP, 0 }, /* IP_ADD_SOURCE_MEMBERSHIP */
+ { OPTNOTSUP, 0 }, /* IP_DROP_SOURCE_MEMBERSHIP */
+ { OPTNOTSUP, 0 }, /* IP_MSFILTER */
+ { MCAST_JOIN_GROUP, 0 }, /* MCAST_JOIN_GROUP */
+ { OPTNOTSUP, 0 }, /* MCAST_BLOCK_SOURCE */
+ { OPTNOTSUP, 0 }, /* MCAST_UNBLOCK_SOURCE */
+ { MCAST_LEAVE_GROUP, 0 }, /* MCAST_LEAVE_GROUP */
+ { OPTNOTSUP, 0 }, /* MCAST_JOIN_SOURCE_GROUP */
+ { OPTNOTSUP, 0 }, /* MCAST_LEAVE_SOURCE_GROUP */
+ { OPTNOTSUP, 0 }, /* MCAST_MSFILTER */
+ { OPTNOTSUP, 0 }, /* IP_MULTICAST_ALL */
+ { OPTNOTSUP, 0 } /* IP_UNICAST_IF */
+};
+
+/* Shorten name so the columns can line up */
+#define IP6_MREQ_SZ sizeof (struct ipv6_mreq)
+
+static const lx_sockopt_map_t ltos_ipv6_sockopts[LX_IPV6_TCLASS + 1] = {
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 }, /* IPV6_ADDRFORM */
+ { OPTNOTSUP, 0 }, /* IPV6_2292PKTINFO */
+ { OPTNOTSUP, 0 }, /* IPV6_2292HOPOPTS */
+ { OPTNOTSUP, 0 }, /* IPV6_2292DSTOPTS */
+ { OPTNOTSUP, 0 }, /* IPV6_2292RTHDR */
+ { OPTNOTSUP, 0 }, /* IPV6_2292PKTOPTIONS */
+ { IPV6_CHECKSUM, sizeof (int) }, /* IPV6_CHECKSUM */
+ { OPTNOTSUP, 0 }, /* IPV6_2292HOPLIMIT */
+ { OPTNOTSUP, 0 }, /* IPV6_NEXTHOP */
+ { OPTNOTSUP, 0 }, /* IPV6_AUTHHDR */
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { IPV6_UNICAST_HOPS, sizeof (int) }, /* IPV6_UNICAST_HOPS */
+ { IPV6_MULTICAST_IF, sizeof (int) }, /* IPV6_MULTICAST_IF */
+ { IPV6_MULTICAST_HOPS, sizeof (int) }, /* IPV6_MULTICAST_HOPS */
+ { IPV6_MULTICAST_LOOP, sizeof (int) }, /* IPV6_MULTICAST_LOOP */
+ { IPV6_ADD_MEMBERSHIP, IP6_MREQ_SZ }, /* IPV6_JOIN_GROUP */
+ { IPV6_DROP_MEMBERSHIP, IP6_MREQ_SZ }, /* IPV6_LEAVE_GROUP */
+ { OPTNOTSUP, 0 }, /* IPV6_ROUTER_ALERT */
+ { OPTNOTSUP, 0 }, /* IPV6_MTU_DISCOVER */
+ { OPTNOTSUP, 0 }, /* IPV6_MTU */
+ { OPTNOTSUP, 0 }, /* IPV6_RECVERR */
+ { IPV6_V6ONLY, sizeof (int) }, /* IPV6_V6ONLY */
+ { OPTNOTSUP, 0 }, /* IPV6_JOIN_ANYCAST */
+ { OPTNOTSUP, 0 }, /* IPV6_LEAVE_ANYCAST */
+ { OPTNOTSUP, 0 }, /* IPV6_IPSEC_POLICY */
+ { OPTNOTSUP, 0 }, /* IPV6_XFRM_POLICY */
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { MCAST_JOIN_GROUP, 0 }, /* MCAST_JOIN_GROUP */
+ { OPTNOTSUP, 0 }, /* MCAST_BLOCK_SOURCE */
+ { OPTNOTSUP, 0 }, /* MCAST_UNBLOCK_SOURCE */
+ { MCAST_LEAVE_GROUP, 0 }, /* MCAST_LEAVE_GROUP */
+ { OPTNOTSUP, 0 }, /* MCAST_JOIN_SOURCE_GROUP */
+ { OPTNOTSUP, 0 }, /* MCAST_LEAVE_SOURCE_GROUP */
+ { OPTNOTSUP, 0 }, /* MCAST_MSFILTER */
+ { IPV6_RECVPKTINFO, sizeof (int) }, /* IPV6_RECVPKTINFO */
+ { IPV6_PKTINFO, 0 }, /* IPV6_PKTINFO */
+ { IPV6_RECVHOPLIMIT, sizeof (int) }, /* IPV6_RECVHOPLIMIT */
+ { IPV6_HOPLIMIT, 0 }, /* IPV6_HOPLIMIT */
+ { OPTNOTSUP, 0 }, /* IPV6_RECVHOPOPTS */
+ { OPTNOTSUP, 0 }, /* IPV6_HOPOPTS */
+ { OPTNOTSUP, 0 }, /* IPV6_RTHDRDSTOPTS */
+ { OPTNOTSUP, 0 }, /* IPV6_RECVRTHDR */
+ { OPTNOTSUP, 0 }, /* IPV6_RTHDR */
+ { OPTNOTSUP, 0 }, /* IPV6_RECVDSTOPTS */
+ { OPTNOTSUP, 0 }, /* IPV6_DSTOPTS */
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 }, /* IPV6_RECVTCLASS */
+ { IPV6_TCLASS, sizeof (int) } /* IPV6_TCLASS */
+};
+
+static const lx_sockopt_map_t ltos_icmpv6_sockopts[LX_ICMP6_FILTER + 1] = {
+ { OPTNOTSUP, 0 },
+ { ICMP6_FILTER, 0 } /* ICMP6_FILTER */
+};
+
+/*
+ * Options marked as "in code" in their comment are handled in the
+ * lx_setsockopt_tcp() and lx_getsockopt_tcp() functions.
+ *
+ * For the Linux TCP_SYNCNT option (the number of SYN retransmits) we emulate
+ * that by interpreting the two connection interval settings:
+ * TCP_CONN_NOTIFY_THRESHOLD
+ * tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval
+ * TCP_CONN_ABORT_THRESHOLD
+ * tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval
+ * The system (re)transmits a SYN and performs a doubling backoff from the
+ * first timer until it passes the second timer. We determine the SYN count
+ * from these two values. Normally it will be 5. Also see the TCPS_SYN_SENT
+ * case in tcp_timer(); a tcp_second_ctimer_threshold value of 0 means to
+ * retransmit SYN indefinitely.
+ *
+ * For the Linux TCP_USER_TIMEOUT option we use our TCP_ABORT_THRESHOLD since
+ * this seems to be the closest match. This value is the
+ * tcp_second_timer_threshold, which gets initialized to the
+ * tcp_ip_abort_interval value. The tunable guide describes this as:
+ * For a given TCP connection, if TCP has been retransmitting for
+ * tcp_ip_abort_interval period of time and it has not received any
+ * acknowledgment from the other endpoint during this period, TCP closes
+ * this connection.
+ * The value is in milliseconds, which matches TCP_USER_TIMEOUT.
+ */
+static const lx_sockopt_map_t ltos_tcp_sockopts[LX_TCP_NOTSENT_LOWAT + 1] = {
+ { OPTNOTSUP, 0 },
+ { TCP_NODELAY, sizeof (int) }, /* TCP_NODELAY */
+ { TCP_MAXSEG, sizeof (int) }, /* TCP_MAXSEG - in code */
+ { TCP_CORK, sizeof (int) }, /* TCP_CORK */
+ { TCP_KEEPIDLE, sizeof (int) }, /* TCP_KEEPIDLE */
+ { TCP_KEEPINTVL, sizeof (int) }, /* TCP_KEEPINTVL */
+ { TCP_KEEPCNT, sizeof (int) }, /* TCP_KEEPCNT */
+ { OPTNOTSUP, 0 }, /* TCP_SYNCNT - in code */
+ { TCP_LINGER2, sizeof (int) }, /* TCP_LINGER2 */
+ { OPTNOTSUP, 0 }, /* TCP_DEFER_ACCEPT - in code */
+ { OPTNOTSUP, 0 }, /* TCP_WINDOW_CLAMP - in code */
+ { OPTNOTSUP, 0 }, /* TCP_INFO */
+ { OPTNOTSUP, 0 }, /* TCP_QUICKACK - in code */
+ { OPTNOTSUP, 0 }, /* TCP_CONGESTION */
+ { OPTNOTSUP, 0 }, /* TCP_MD5SIG */
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 }, /* TCP_THIN_LINEAR_TIMEOUTS */
+ { OPTNOTSUP, 0 }, /* TCP_THIN_DUPACK */
+ { TCP_ABORT_THRESHOLD, sizeof (int) }, /* TCP_USER_TIMEOUT */
+ { OPTNOTSUP, 0 }, /* TCP_REPAIR */
+ { OPTNOTSUP, 0 }, /* TCP_REPAIR_QUEUE */
+ { OPTNOTSUP, 0 }, /* TCP_QUEUE_SEQ */
+ { OPTNOTSUP, 0 }, /* TCP_REPAIR_OPTIONS */
+ { OPTNOTSUP, 0 }, /* TCP_FASTOPEN */
+ { OPTNOTSUP, 0 }, /* TCP_TIMESTAMP */
+ { OPTNOTSUP, 0 } /* TCP_NOTSENT_LOWAT */
+};
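+
+/*
+ * An illustrative example of the TCP_SYNCNT derivation described above,
+ * assuming the usual default cintervals of 10 seconds (notify) and 180
+ * seconds (abort): the doubling backoff runs 10000 -> 20000 -> 40000 ->
+ * 80000 -> 160000 -> 320000 ms, passing the 180000 ms abort threshold
+ * after the fifth doubling, so TCP_SYNCNT reads back as the expected 5.
+ */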
+
+static const lx_sockopt_map_t ltos_igmp_sockopts[IGMP_MTRACE + 1] = {
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { IGMP_MINLEN, 0 }, /* IGMP_MINLEN */
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { IGMP_MEMBERSHIP_QUERY, 0 }, /* IGMP_HOST_MEMBERSHIP_QUERY */
+ { IGMP_V1_MEMBERSHIP_REPORT, 0 }, /* IGMP_HOST_MEMBERSHIP_REPORT */
+ { IGMP_DVMRP, 0 }, /* IGMP_DVMRP */
+ { IGMP_PIM, 0 }, /* IGMP_PIM */
+ { OPTNOTSUP, 0 }, /* IGMP_TRACE */
+ { IGMP_V2_MEMBERSHIP_REPORT, 0 }, /* IGMPV2_HOST_MEMBERSHIP_REPORT */
+ { IGMP_V2_LEAVE_GROUP, 0 }, /* IGMP_HOST_LEAVE_MESSAGE */
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 },
+ { IGMP_MTRACE_RESP, 0 }, /* IGMP_MTRACE_RESP */
+ { IGMP_MTRACE, 0 } /* IGMP_MTRACE */
+};
+
+static const lx_sockopt_map_t ltos_socket_sockopts[LX_SO_BPF_EXTENSIONS + 1] = {
+ { OPTNOTSUP, 0 },
+ { SO_DEBUG, sizeof (int) }, /* SO_DEBUG */
+ { SO_REUSEADDR, sizeof (int) }, /* SO_REUSEADDR */
+ { SO_TYPE, 0 }, /* SO_TYPE */
+ { SO_ERROR, 0 }, /* SO_ERROR */
+ { SO_DONTROUTE, sizeof (int) }, /* SO_DONTROUTE */
+ { SO_BROADCAST, sizeof (int) }, /* SO_BROADCAST */
+ { SO_SNDBUF, sizeof (int) }, /* SO_SNDBUF */
+ { SO_RCVBUF, sizeof (int) }, /* SO_RCVBUF */
+ { SO_KEEPALIVE, sizeof (int) }, /* SO_KEEPALIVE */
+ { SO_OOBINLINE, sizeof (int) }, /* SO_OOBINLINE */
+ { OPTNOTSUP, 0 }, /* SO_NO_CHECK */
+ { OPTNOTSUP, 0 }, /* SO_PRIORITY */
+ { SO_LINGER, 0 }, /* SO_LINGER */
+ { OPTNOTSUP, 0 }, /* SO_BSDCOMPAT */
+ { SO_REUSEPORT, sizeof (int) }, /* SO_REUSEPORT */
+ { SO_RECVUCRED, sizeof (int) }, /* SO_PASSCRED */
+ { OPTNOTSUP, 0 }, /* SO_PEERCRED */
+ { SO_RCVLOWAT, sizeof (int) }, /* SO_RCVLOWAT */
+ { SO_SNDLOWAT, sizeof (int) }, /* SO_SNDLOWAT */
+ { SO_RCVTIMEO, 0 }, /* SO_RCVTIMEO */
+ { SO_SNDTIMEO, 0 }, /* SO_SNDTIMEO */
+ { OPTNOTSUP, 0 }, /* SO_SECURITY_AUTHENTICATION */
+ { OPTNOTSUP, 0 }, /* SO_SECURITY_ENCRYPTION_TRANSPORT */
+ { OPTNOTSUP, 0 }, /* SO_SECURITY_ENCRYPTION_NETWORK */
+ { OPTNOTSUP, 0 }, /* SO_BINDTODEVICE */
+ { SO_ATTACH_FILTER, 0 }, /* SO_ATTACH_FILTER */
+ { SO_DETACH_FILTER, 0 }, /* SO_DETACH_FILTER */
+ { OPTNOTSUP, 0 }, /* SO_PEERNAME */
+ { SO_TIMESTAMP, sizeof (int) }, /* SO_TIMESTAMP */
+ { SO_ACCEPTCONN, 0 }, /* SO_ACCEPTCONN */
+ { OPTNOTSUP, 0 }, /* SO_PEERSEC */
+ { SO_SNDBUF, sizeof (int) }, /* SO_SNDBUFFORCE */
+ { SO_RCVBUF, sizeof (int) }, /* SO_RCVBUFFORCE */
+ { OPTNOTSUP, 0 }, /* SO_PASSSEC */
+ { OPTNOTSUP, 0 }, /* SO_TIMESTAMPNS */
+ { OPTNOTSUP, 0 }, /* SO_MARK */
+ { OPTNOTSUP, 0 }, /* SO_TIMESTAMPING */
+ { SO_PROTOTYPE, 0 }, /* SO_PROTOCOL */
+ { SO_DOMAIN, 0 }, /* SO_DOMAIN */
+ { OPTNOTSUP, 0 }, /* SO_RXQ_OVFL */
+ { OPTNOTSUP, 0 }, /* SO_WIFI_STATUS */
+ { OPTNOTSUP, 0 }, /* SO_PEEK_OFF */
+ { OPTNOTSUP, 0 }, /* SO_NOFCS */
+ { OPTNOTSUP, 0 }, /* SO_LOCK_FILTER */
+ { OPTNOTSUP, 0 }, /* SO_SELECT_ERR_QUEUE */
+ { OPTNOTSUP, 0 }, /* SO_BUSY_POLL */
+ { OPTNOTSUP, 0 }, /* SO_MAX_PACING_RATE */
+ { OPTNOTSUP, 0 } /* SO_BPF_EXTENSIONS */
+};
+
+static const lx_sockopt_map_t ltos_raw_sockopts[LX_ICMP_FILTER + 1] = {
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 } /* ICMP_FILTER */
+};
+
+static const lx_sockopt_map_t ltos_packet_sockopts[LX_PACKET_STATISTICS + 1] = {
+ { OPTNOTSUP, 0 },
+ { PACKET_ADD_MEMBERSHIP, 0 }, /* PACKET_ADD_MEMBERSHIP */
+ { PACKET_DROP_MEMBERSHIP, 0 }, /* PACKET_DROP_MEMBERSHIP */
+ { OPTNOTSUP, 0 }, /* PACKET_RECV_OUTPUT */
+ { OPTNOTSUP, 0 },
+ { OPTNOTSUP, 0 }, /* PACKET_RX_RING */
+ { PACKET_STATISTICS, 0 } /* PACKET_STATISTICS */
+};
+
+/* Needed for SO_ATTACH_FILTER */
+struct lx_bpf_program {
+ unsigned short bf_len;
+ caddr_t bf_insns;
+};
+
+/* Invert filter fields as Linux expects */
+#define LX_ICMP6_FILTER_INVERT(filterp) ( \
+ ((filterp)->__icmp6_filt[0] ^= 0xFFFFFFFFU), \
+ ((filterp)->__icmp6_filt[1] ^= 0xFFFFFFFFU), \
+ ((filterp)->__icmp6_filt[2] ^= 0xFFFFFFFFU), \
+ ((filterp)->__icmp6_filt[3] ^= 0xFFFFFFFFU), \
+ ((filterp)->__icmp6_filt[4] ^= 0xFFFFFFFFU), \
+ ((filterp)->__icmp6_filt[5] ^= 0xFFFFFFFFU), \
+ ((filterp)->__icmp6_filt[6] ^= 0xFFFFFFFFU), \
+ ((filterp)->__icmp6_filt[7] ^= 0xFFFFFFFFU))
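+
+/*
+ * For example, an all-ones filter word from Linux (where, as we
+ * understand the Linux convention, a set bit means "block") becomes an
+ * all-zeroes word after the XOR above, which natively means that none
+ * of those ICMPv6 types will pass.
+ */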
+
+static boolean_t
+lx_sockopt_lookup(lx_proto_opts_t tbl, int *optname, socklen_t *optlen)
+{
+ const lx_sockopt_map_t *entry;
+
+ if (*optname > tbl.lpo_max) {
+ return (B_FALSE);
+ }
+ entry = &tbl.lpo_entries[*optname];
+ if (entry->lsm_opt == OPTNOTSUP) {
+ return (B_FALSE);
+ }
+ *optname = entry->lsm_opt;
+ /* Truncate the optlen if needed/allowed */
+ if (entry->lsm_lcap != 0 && *optlen > entry->lsm_lcap) {
+ *optlen = entry->lsm_lcap;
+ }
+ return (B_TRUE);
+}
+
+static int
+lx_mcast_common(sonode_t *so, int level, int optname, void *optval,
+ socklen_t optlen)
+{
+ int error;
+ struct group_req gr;
+ lx_sockaddr_storage_t *lxss;
+
+ ASSERT(optname == LX_MCAST_JOIN_GROUP ||
+ optname == LX_MCAST_LEAVE_GROUP);
+
+ /*
+ * For MCAST_JOIN_GROUP and MCAST_LEAVE_GROUP, Linux uses a
+ * gr_group that has a different size from the native gr_group.
+ * We need to translate to the native gr_group, taking special
+ * care to do the right thing when a 32-bit program makes a call
+ * into a 64-bit kernel.
+ */
+
+ bzero(&gr, sizeof (gr));
+
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ if (optlen != sizeof (lx_group_req32_t)) {
+ return (EINVAL);
+ }
+
+ lx_group_req32_t *lxgr = optval;
+
+ /* use the 32-bit type */
+ gr.gr_interface = lxgr->lxgr_interface;
+ lxss = &lxgr->lxgr_group;
+ } else
+#endif /* defined(_SYSCALL32_IMPL) */
+ {
+ if (optlen != sizeof (lx_group_req_t)) {
+ return (EINVAL);
+ }
+
+ lx_group_req_t *lxgr = optval;
+
+ gr.gr_interface = lxgr->lxgr_interface;
+ lxss = &lxgr->lxgr_group;
+ }
+
+ bcopy(lxss, &gr.gr_group, sizeof (*lxss));
+ gr.gr_group.ss_family = LTOS_FAMILY(lxss->lxss_family);
+
+ optlen = sizeof (gr);
+ optname = (optname == LX_MCAST_JOIN_GROUP) ?
+ MCAST_JOIN_GROUP : MCAST_LEAVE_GROUP;
+
+ error = socket_setsockopt(so, level, optname, &gr,
+ optlen, CRED());
+ return (error);
+}
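+
+/*
+ * A hypothetical Linux caller taking the lx_mcast_common() path above
+ * (with sin6 a previously filled-in struct sockaddr_in6, and "eth0"
+ * standing in for some interface) might look like:
+ *
+ *	struct group_req greq;
+ *
+ *	greq.gr_interface = if_nametoindex("eth0");
+ *	memcpy(&greq.gr_group, &sin6, sizeof (sin6));
+ *	setsockopt(s, IPPROTO_IPV6, MCAST_JOIN_GROUP, &greq, sizeof (greq));
+ *
+ * The Linux group_req arrives here as an lx_group_req_t (or its 32-bit
+ * variant) and is rebuilt as a native struct group_req before being
+ * handed to socket_setsockopt().
+ */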
+
+static int
+lx_setsockopt_ip(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+ int error;
+ int *intval = (int *)optval;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ip_sockopts);
+
+ switch (optname) {
+ case LX_IP_RECVERR:
+ /*
+ * Ping sets this option to receive errors on raw sockets.
+ * Currently we just ignore it to make ping happy. From the
+ * Linux ip.7 man page:
+ *
+ * For raw sockets, IP_RECVERR enables passing of all
+ * received ICMP errors to the application.
+ *
+ * Programs known to depend upon this:
+ * - ping
+ * - traceroute
+ * - mount.nfs
+ */
+ return (0);
+
+ case LX_IP_MTU_DISCOVER: {
+ int val;
+
+ /*
+ * We translate Linux's IP_MTU_DISCOVER into our IP_DONTFRAG,
+ * allowing this to be a byte or an integer and observing the
+ * inverted sense of the two relative to one another (and
+ * translating accordingly).
+ */
+ if (optlen < sizeof (int)) {
+ val = *((uint8_t *)optval);
+ } else {
+ val = *((int *)optval);
+ }
+
+ switch (val) {
+ case LX_IP_PMTUDISC_DONT:
+ val = 1;
+ break;
+
+ case LX_IP_PMTUDISC_DO:
+ case LX_IP_PMTUDISC_WANT:
+ val = 0;
+ break;
+
+ default:
+ return (EOPNOTSUPP);
+ }
+
+ error = socket_setsockopt(so, IPPROTO_IP, IP_DONTFRAG,
+ &val, sizeof (val), CRED());
+ return (error);
+ }
+
+ case LX_IP_MULTICAST_TTL:
+ case LX_IP_MULTICAST_LOOP:
+ /*
+ * For IP_MULTICAST_TTL and IP_MULTICAST_LOOP, Linux defines
+ * the option value to be an integer while we define it to be
+ * an unsigned character. To prevent the kernel from spitting
+ * back an error on an illegal length, verify that the option
+ * value does not exceed UCHAR_MAX before truncating optlen.
+ */
+ if (optlen <= 0 || optlen > sizeof (int) ||
+ *intval > UINT8_MAX) {
+ return (EINVAL);
+ }
+ optlen = sizeof (uchar_t);
+ break;
+
+ case LX_MCAST_JOIN_GROUP:
+ case LX_MCAST_LEAVE_GROUP:
+ error = lx_mcast_common(so, IPPROTO_IP, optname, optval,
+ optlen);
+ return (error);
+ default:
+ break;
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_setsockopt(so, IPPROTO_IP, optname, optval, optlen,
+ CRED());
+ return (error);
+}
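+
+/*
+ * As an example of the IP_MTU_DISCOVER translation in lx_setsockopt_ip()
+ * above, a Linux program issuing
+ *
+ *	int val = IP_PMTUDISC_DONT;
+ *	(void) setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof (val));
+ *
+ * results in the native IP_DONTFRAG option being set to 1, while
+ * IP_PMTUDISC_DO and IP_PMTUDISC_WANT both map to 0, reflecting the
+ * inverted sense of the two options.
+ */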
+
+static int
+lx_setsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+ int error;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts);
+
+ switch (optname) {
+ case LX_IPV6_MTU:
+ /*
+ * There isn't a good translation for IPV6_MTU, and certain
+ * apps, such as bind9, will bail if it cannot be set. For now
+ * we simply lie and report success.
+ */
+ return (0);
+ case LX_MCAST_JOIN_GROUP:
+ case LX_MCAST_LEAVE_GROUP:
+ error = lx_mcast_common(so, IPPROTO_IPV6, optname, optval,
+ optlen);
+ return (error);
+ default:
+ break;
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+ return (ENOPROTOOPT);
+ }
+ error = socket_setsockopt(so, IPPROTO_IPV6, optname, optval, optlen,
+ CRED());
+ return (error);
+}
+
+static int
+lx_setsockopt_icmpv6(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+ int error;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts);
+
+ if (optname == LX_ICMP6_FILTER && optval != NULL) {
+ /*
+ * Surprise! The input to ICMP6_FILTER on Linux is inverted
+ * when compared to illumos.
+ */
+ if (optlen != sizeof (icmp6_filter_t)) {
+ return (EINVAL);
+ }
+ LX_ICMP6_FILTER_INVERT((icmp6_filter_t *)optval);
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+ return (ENOPROTOOPT);
+ }
+ error = socket_setsockopt(so, IPPROTO_ICMPV6, optname, optval, optlen,
+ CRED());
+ return (error);
+}
+
+static int
+lx_setsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+ int error;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts);
+ cred_t *cr = CRED();
+ uint32_t rto_max, abrt_thresh;
+ boolean_t abrt_changed = B_FALSE, rto_max_changed = B_FALSE;
+
+ if (optname == LX_TCP_WINDOW_CLAMP || optname == LX_TCP_QUICKACK) {
+ /* It appears safe to lie and say we did these. */
+ return (0);
+ }
+
+ if (optname == LX_TCP_MAXSEG) {
+ /*
+ * We can get, but not set, TCP_MAXSEG. However, it appears
+ * safe to lie and say we did this. A future extension might
+ * be to allow setting this before a connection is established.
+ */
+ return (0);
+ }
+
+ if (optname == LX_TCP_SYNCNT) {
+ int intval;
+ uint64_t syn_last_backoff;
+ uint_t syn_cnt, syn_backoff, len;
+
+ /*
+ * See the comment above the ltos_tcp_sockopts table for an
+ * explanation of the TCP_SYNCNT emulation.
+ */
+ if (optlen != sizeof (int)) {
+ return (EINVAL);
+ }
+ intval = *(int *)optval;
+ if (intval > 255) {
+ return (EINVAL);
+ }
+
+ len = sizeof (syn_backoff);
+ error = socket_getsockopt(so, IPPROTO_TCP,
+ TCP_CONN_NOTIFY_THRESHOLD, &syn_backoff, &len, 0, cr);
+ if (error != 0)
+ return (error);
+
+ syn_last_backoff = syn_backoff;
+ for (syn_cnt = 0; syn_cnt < intval; syn_cnt++) {
+ syn_last_backoff *= 2;
+ /*
+ * Since the tcps_ip_abort_cinterval is in milliseconds and
+ * stored as a uint_t, it's basically impossible to get
+ * up to the Linux limit of 255 SYN retries due to the
+ * doubling on the backoff.
+ */
+ if (syn_last_backoff > UINT_MAX) {
+ return (EINVAL);
+ }
+ }
+
+ syn_backoff = (uint_t)syn_last_backoff;
+ error = socket_setsockopt(so, IPPROTO_TCP,
+ TCP_CONN_ABORT_THRESHOLD, &syn_backoff, len, cr);
+ return (error);
+ }
+
+ if (optname == LX_TCP_DEFER_ACCEPT) {
+ int *intval;
+ char *dfp;
+
+ /*
+ * Emulate TCP_DEFER_ACCEPT using the datafilt(7M) socket
+ * filter. We can't emulate the timeout aspect, so treat any
+ * non-zero value as enabling and zero as disabling.
+ */
+ if (optlen != sizeof (int)) {
+ return (EINVAL);
+ }
+ intval = (int *)optval;
+
+ /*
+ * socket_setsockopt asserts that the optval is aligned, so
+ * we use kmem_alloc to ensure this.
+ */
+ dfp = (char *)kmem_alloc(sizeof (DATAFILT), KM_SLEEP);
+ (void) strcpy(dfp, DATAFILT);
+
+ if (*intval > 0) {
+ error = socket_setsockopt(so, SOL_FILTER, FIL_ATTACH,
+ dfp, 9, cr);
+ if (error == EEXIST) {
+ error = 0;
+ }
+ } else {
+ error = socket_setsockopt(so, SOL_FILTER, FIL_DETACH,
+ dfp, 9, cr);
+ if (error == ENXIO) {
+ error = 0;
+ }
+ }
+ kmem_free(dfp, sizeof (DATAFILT));
+ return (error);
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ if (optname == TCP_KEEPINTVL) {
+ /*
+ * When setting TCP_KEEPINTVL there is an unfortunate set of
+ * dependencies. TCP_KEEPINTVL must be <= TCP_RTO_MAX and
+ * TCP_RTO_MAX must be <= TCP_ABORT_THRESHOLD. Thus, we may
+ * have to increase one or both of these in order to increase
+ * TCP_KEEPINTVL. Note that TCP_KEEPINTVL is passed in seconds
+ * but TCP_RTO_MAX and TCP_ABORT_THRESHOLD are in milliseconds.
+ * Also note that we currently make no attempt to handle
+ * concurrent application threads simultaneously changing
+ * TCP_KEEPINTVL, since that is unlikely. We could revisit
+ * locking if it ever becomes an issue.
+ */
+ uint32_t new_val = *(uint_t *)optval * 1000;
+ uint32_t len;
+
+ /*
+ * Linux limits this to 32k, so we do too. However, anything
+ * over 2 hours (7200000 ms) will fail anyway due to the
+ * system-wide default (see "_rexmit_interval_max" in
+ * tcp_tunables.c). Our 2 hour default seems reasonable as a
+ * practical limit for now.
+ */
+ if (*(uint_t *)optval > SHRT_MAX)
+ return (EINVAL);
+
+ len = sizeof (rto_max);
+ if ((error = socket_getsockopt(so, IPPROTO_TCP, TCP_RTO_MAX,
+ &rto_max, &len, 0, cr)) != 0)
+ return (error);
+ len = sizeof (abrt_thresh);
+ if ((error = socket_getsockopt(so, IPPROTO_TCP,
+ TCP_ABORT_THRESHOLD, &abrt_thresh, &len, 0, cr)) != 0)
+ return (error);
+
+ if (new_val > abrt_thresh) {
+ error = socket_setsockopt(so, IPPROTO_TCP,
+ TCP_ABORT_THRESHOLD, &new_val, sizeof (new_val),
+ cr);
+ if (error != 0)
+ goto fail;
+ abrt_changed = B_TRUE;
+ }
+ if (new_val > rto_max) {
+ error = socket_setsockopt(so, IPPROTO_TCP,
+ TCP_RTO_MAX, &new_val, sizeof (new_val), cr);
+ if (error != 0)
+ goto fail;
+ rto_max_changed = B_TRUE;
+ }
+ }
+
+ error = socket_setsockopt(so, IPPROTO_TCP, optname, optval, optlen, cr);
+
+fail:
+ if (error != 0 && optname == TCP_KEEPINTVL) {
+ /*
+ * If changing TCP_KEEPINTVL failed then we may need to
+ * restore the previous values for TCP_ABORT_THRESHOLD and
+ * TCP_RTO_MAX.
+ */
+ if (rto_max_changed) {
+ (void) socket_setsockopt(so, IPPROTO_TCP,
+ TCP_RTO_MAX, &rto_max,
+ sizeof (rto_max), cr);
+ }
+ if (abrt_changed) {
+ (void) socket_setsockopt(so, IPPROTO_TCP,
+ TCP_ABORT_THRESHOLD, &abrt_thresh,
+ sizeof (abrt_thresh), cr);
+ }
+ }
+
+ return (error);
+}
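+
+/*
+ * To illustrate the TCP_KEEPINTVL dependency handling above with made-up
+ * but plausible numbers: setting TCP_KEEPINTVL to 75 seconds becomes
+ * new_val = 75000 ms. If TCP_ABORT_THRESHOLD is currently 480000 ms and
+ * TCP_RTO_MAX is 60000 ms, only TCP_RTO_MAX needs to be raised (to
+ * 75000 ms) before the TCP_KEEPINTVL setting itself can succeed; if the
+ * final setsockopt fails, the saved TCP_RTO_MAX value is restored.
+ */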
+
+static int
+lx_setsockopt_socket(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+ int error;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts);
+ struct lx_bpf_program *lbp;
+ int *intval;
+ struct bpf_program bp;
+
+ switch (optname) {
+ case LX_SO_BSDCOMPAT:
+ /* Linux ignores this option. */
+ return (0);
+
+ case LX_SO_TIMESTAMP:
+ /*
+ * SO_TIMESTAMP is not supported on AF_UNIX sockets, but apps
+ * use such sockets for logging and the like, so pretend this
+ * worked.
+ */
+ if (so->so_family == AF_UNIX) {
+ return (0);
+ }
+ break;
+
+ case LX_SO_ATTACH_FILTER:
+ /*
+ * Convert bpf program struct
+ */
+ if (optlen != sizeof (struct lx_bpf_program)) {
+ return (EINVAL);
+ }
+ lbp = (struct lx_bpf_program *)optval;
+ bp.bf_len = lbp->bf_len;
+ /* LINTED: alignment */
+ bp.bf_insns = (struct bpf_insn *)lbp->bf_insns;
+ optval = &bp;
+ break;
+
+ case LX_SO_PASSSEC:
+ /*
+ * SO_PASSSEC is very similar to SO_PASSCRED (emulated by
+ * SO_RECVUCRED) in that it requests that cmsgs containing
+ * identity information be attached to received messages.
+ * Instead of ucred information, security-module-specific
+ * information such as an SELinux label is expected.
+ *
+ * Since LX does not support SELinux at all today, the
+ * option is silently accepted.
+ */
+ return (0);
+
+ case LX_SO_PASSCRED:
+ /*
+ * In many cases, the Linux SO_PASSCRED is mapped to the SunOS
+ * SO_RECVUCRED to enable the passing of peer credential
+ * information via received cmsgs. One exception is for
+ * connection-oriented AF_UNIX sockets which do not yet support
+ * that option. Instead, we track the setting internally and,
+ * when there is appropriate cmsg space, emulate the credential
+ * passing by querying the STREAMS ioctl.
+ *
+ * Note: this approach is broken for the case when a process
+ * sets up a Unix-domain socket with SO_PASSCRED, then forks
+ * one or more children, and expects to use the cmsg cred to
+ * accurately know which child pid sent the message (currently
+ * a pid is recorded when the socket is connected, not for each
+ * msg sent). getpeerucred(3c) suffers from the same problem.
+ * We have a workaround in lx_socketpair (use DGRAM if
+ * SEQPACKET), but the general case requires enhancing our
+ * streams support to allow passing credential cmsgs on a
+ * connection-oriented Unix socket.
+ */
+ if (so->so_family == AF_UNIX &&
+ (so->so_mode & SM_CONNREQUIRED) != 0) {
+ lx_socket_aux_data_t *sad;
+
+ if (optlen != sizeof (int)) {
+ return (EINVAL);
+ }
+ intval = (int *)optval;
+ sad = lx_sad_acquire(SOTOV(so));
+ if (*intval == 0) {
+ sad->lxsad_flags &= ~LXSAD_FL_STRCRED;
+ } else {
+ sad->lxsad_flags |= LXSAD_FL_STRCRED;
+ }
+ mutex_exit(&sad->lxsad_lock);
+ return (0);
+ }
+ break;
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_setsockopt(so, SOL_SOCKET, optname, optval, optlen,
+ CRED());
+ return (error);
+}
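+
+/*
+ * As a sketch of the SO_ATTACH_FILTER conversion above, a Linux caller
+ * attaching a trivial classic-BPF filter (one that accepts every packet)
+ * might do:
+ *
+ *	struct sock_filter insns[] = {
+ *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
+ *	};
+ *	struct sock_fprog prog = { 1, insns };
+ *	(void) setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &prog,
+ *	    sizeof (prog));
+ *
+ * The Linux sock_fprog corresponds to lx_bpf_program here; only the
+ * length and instruction pointer are repacked into a native struct
+ * bpf_program before the option is passed through.
+ */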
+
+static int
+lx_setsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+ int error;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts);
+
+ switch (optname) {
+ case LX_ICMP_FILTER:
+ /*
+ * This option is currently ignored to appease ping.
+ */
+ return (0);
+
+ case LX_IPV6_CHECKSUM:
+ /*
+ * Ping6 tries to set the IPV6_CHECKSUM offset in a way that
+ * illumos won't allow. Quietly ignore this to prevent it from
+ * complaining.
+ */
+ return (0);
+
+ default:
+ break;
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_setsockopt(so, IPPROTO_RAW, optname, optval, optlen,
+ CRED());
+ return (error);
+}
+
+static int
+lx_setsockopt_packet(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+ int error;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts);
+ struct packet_mreq *mr;
+
+ switch (optname) {
+ case LX_PACKET_ADD_MEMBERSHIP:
+ case LX_PACKET_DROP_MEMBERSHIP:
+ /* Convert Linux mr_type to illumos */
+ if (optlen != sizeof (struct packet_mreq)) {
+ return (EINVAL);
+ }
+ mr = (struct packet_mreq *)optval;
+ if (--mr->mr_type > PACKET_MR_ALLMULTI)
+ return (EINVAL);
+ optval = mr;
+ break;
+
+ default:
+ break;
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_setsockopt(so, SOL_PACKET, optname, optval, optlen,
+ CRED());
+ return (error);
+}
+
+static int
+lx_setsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+ int error;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts);
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_setsockopt(so, IPPROTO_IGMP, optname, optval, optlen,
+ CRED());
+ return (error);
+}
+
+static int
+lx_getsockopt_ip(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+ int error = 0;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ip_sockopts);
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_getsockopt(so, IPPROTO_IP, optname, optval, optlen, 0,
+ CRED());
+ return (error);
+}
+
+static int
+lx_getsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+ int error = 0;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts);
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_getsockopt(so, IPPROTO_IPV6, optname, optval, optlen, 0,
+ CRED());
+ return (error);
+}
+
+static int
+lx_getsockopt_icmpv6(sonode_t *so, int optname, void *optval,
+ socklen_t *optlen)
+{
+ int error = 0;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts);
+
+ if (optname == LX_ICMP6_FILTER) {
+ error = socket_getsockopt(so, IPPROTO_ICMPV6, ICMP6_FILTER,
+ optval, optlen, 0, CRED());
+
+ /*
+ * ICMP6_FILTER is inverted on Linux. Make it so before copying
+ * back to the caller's buffer.
+ */
+ if (error == 0) {
+ LX_ICMP6_FILTER_INVERT((icmp6_filter_t *)optval);
+ }
+ return (error);
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_getsockopt(so, IPPROTO_ICMPV6, optname, optval, optlen,
+ 0, CRED());
+ return (error);
+}
+
+static int
+lx_getsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+ int error = 0;
+ cred_t *cr = CRED();
+ int *intval = (int *)optval;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts);
+
+ switch (optname) {
+ case LX_TCP_WINDOW_CLAMP:
+ case LX_TCP_QUICKACK:
+ /*
+ * We do not support these options but some apps rely on them.
+ * Rather than return an error we just return 0. This isn't
+ * exactly a lie, since the options really aren't set, but it's
+ * not the whole truth either. Fortunately, we aren't under
+ * oath.
+ */
+ if (*optlen < sizeof (int)) {
+ error = EINVAL;
+ } else {
+ *intval = 0;
+ }
+ *optlen = sizeof (int);
+ return (error);
+
+ case LX_TCP_SYNCNT:
+ /*
+ * See the comment above the ltos_tcp_sockopts table for an
+ * explanation of the TCP_SYNCNT emulation.
+ */
+ if (*optlen < sizeof (int)) {
+ error = EINVAL;
+ } else {
+ uint_t syn_cnt, syn_backoff, syn_abortconn, len;
+
+ len = sizeof (syn_backoff);
+ error = socket_getsockopt(so, IPPROTO_TCP,
+ TCP_CONN_NOTIFY_THRESHOLD, &syn_backoff, &len, 0,
+ cr);
+ if (error != 0)
+ return (error);
+ error = socket_getsockopt(so, IPPROTO_TCP,
+ TCP_CONN_ABORT_THRESHOLD, &syn_abortconn, &len, 0,
+ cr);
+ if (error != 0)
+ return (error);
+
+ syn_cnt = 0;
+ while (syn_backoff < syn_abortconn) {
+ syn_cnt++;
+ syn_backoff *= 2;
+ }
+ if (syn_cnt > 255) /* clamp to Linux limit */
+ syn_cnt = 255;
+
+ *intval = syn_cnt;
+ *optlen = sizeof (int);
+ }
+
+ return (error);
+
+ case LX_TCP_DEFER_ACCEPT:
+ /*
+ * We do support TCP_DEFER_ACCEPT using the datafilt(7M) socket
+ * filter, but we don't emulate the timeout aspect, so report
+ * the filter's presence as 1 and its absence as 0.
+ */
+ if (*optlen < sizeof (int)) {
+ error = EINVAL;
+ } else {
+ struct fil_info fi[10];
+ int i;
+ socklen_t len = sizeof (fi);
+
+ if ((error = socket_getsockopt(so, SOL_FILTER,
+ FIL_LIST, fi, &len, 0, cr)) != 0) {
+ *optlen = sizeof (int);
+ return (error);
+ }
+
+ *intval = 0;
+ len = len / sizeof (struct fil_info);
+ for (i = 0; i < len; i++) {
+ if (fi[i].fi_flags == FILF_PROG &&
+ strcmp(fi[i].fi_name, "datafilt") == 0) {
+ *intval = 1;
+ break;
+ }
+ }
+ }
+ *optlen = sizeof (int);
+ return (error);
+ default:
+ break;
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_getsockopt(so, IPPROTO_TCP, optname, optval, optlen, 0,
+ cr);
+ return (error);
+}
+
+static int
+lx_getsockopt_socket(sonode_t *so, int optname, void *optval,
+ socklen_t *optlen)
+{
+ int error = 0;
+ int *intval = (int *)optval;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts);
+
+ switch (optname) {
+ case LX_SO_TYPE:
+ /*
+ * Special handling for connectionless AF_UNIX sockets.
+ * See lx_socketpair for more details.
+ */
+ if (so->so_family == AF_UNIX &&
+ (so->so_mode & SM_CONNREQUIRED) == 0) {
+ lx_socket_aux_data_t *sad;
+
+ if (*optlen < sizeof (int))
+ return (EINVAL);
+ sad = lx_sad_acquire(SOTOV(so));
+ if ((sad->lxsad_flags & LXSAD_FL_EMULSEQPKT) != 0) {
+ *intval = LX_SOCK_SEQPACKET;
+ *optlen = sizeof (int);
+ mutex_exit(&sad->lxsad_lock);
+ return (0);
+ }
+ mutex_exit(&sad->lxsad_lock);
+ }
+ break;
+
+ case LX_SO_PASSSEC:
+ /*
+ * Communicate a value of 0 since SELinux-related functionality
+ * is not supported.
+ */
+ if (*optlen < sizeof (int)) {
+ error = EINVAL;
+ } else {
+ *intval = 0;
+ }
+ *optlen = sizeof (int);
+ return (error);
+
+ case LX_SO_PASSCRED:
+ /*
+ * Special handling for connection-oriented AF_UNIX sockets.
+ * See lx_setsockopt_socket for more details.
+ */
+ if (so->so_family == AF_UNIX &&
+ (so->so_mode & SM_CONNREQUIRED) != 0) {
+ lx_socket_aux_data_t *sad;
+
+ if (*optlen < sizeof (int)) {
+ return (EINVAL);
+ }
+ sad = lx_sad_acquire(SOTOV(so));
+ *intval = ((sad->lxsad_flags & LXSAD_FL_STRCRED) == 0 ?
+ 0 : 1);
+ *optlen = sizeof (int);
+ mutex_exit(&sad->lxsad_lock);
+ return (0);
+ }
+ break;
+
+ case LX_SO_PEERCRED:
+ if (*optlen < sizeof (struct lx_ucred)) {
+ error = EINVAL;
+ } else {
+ struct lx_ucred *lcred = (struct lx_ucred *)optval;
+
+ mutex_enter(&so->so_lock);
+ if ((so->so_mode & SM_CONNREQUIRED) == 0) {
+ error = ENOTSUP;
+ } else if (so->so_peercred == NULL) {
+ error = EINVAL;
+ } else {
+ lcred->lxu_uid = crgetuid(so->so_peercred);
+ lcred->lxu_gid = crgetgid(so->so_peercred);
+ lcred->lxu_pid = so->so_cpid;
+ }
+ mutex_exit(&so->so_lock);
+ }
+ *optlen = sizeof (struct lx_ucred);
+ return (error);
+
+ default:
+ break;
+ }
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_getsockopt(so, SOL_SOCKET, optname, optval, optlen, 0,
+ CRED());
+
+ if (error == 0) {
+ switch (optname) {
+ case SO_TYPE:
+ /* translate our type back to Linux */
+ *intval = STOL_SOCKTYPE(*intval);
+ break;
+
+ case SO_ERROR:
+ *intval = lx_errno(*intval, EINVAL);
+ break;
+ default:
+ break;
+ }
+ }
+ return (error);
+}
+
+static int
+lx_getsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+ int error = 0;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts);
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_getsockopt(so, IPPROTO_RAW, optname, optval, optlen, 0,
+ CRED());
+ return (error);
+}
+
+static int
+lx_getsockopt_packet(sonode_t *so, int optname, void *optval,
+ socklen_t *optlen)
+{
+ int error = 0;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts);
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_getsockopt(so, SOL_PACKET, optname, optval, optlen, 0,
+ CRED());
+ return (error);
+}
+
+static int
+lx_getsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+ int error = 0;
+ lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts);
+
+ if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+ return (ENOPROTOOPT);
+ }
+
+ error = socket_getsockopt(so, IPPROTO_IGMP, optname, optval, optlen, 0,
+ CRED());
+ return (error);
+}
+
+long
+lx_setsockopt(int sock, int level, int optname, void *optval, socklen_t optlen)
+{
+ struct sonode *so;
+ file_t *fp;
+ int buflen = 0;
+ intptr_t stkbuf[2];
+ void *optbuf = stkbuf;
+ int error = 0;
+
+ if (optlen != 0) {
+ if (optlen > SO_MAXARGSIZE) {
+ return (set_errno(EINVAL));
+ }
+ if (optlen > sizeof (stkbuf)) {
+ buflen = optlen;
+ optbuf = kmem_alloc(optlen, KM_SLEEP);
+ } else {
+ /*
+ * Zero the on-stack buffer to avoid poisoning smaller
+ * optvals with stack garbage.
+ */
+ stkbuf[0] = 0;
+ stkbuf[1] = 0;
+ }
+ if (copyin(optval, optbuf, optlen) != 0) {
+ if (buflen != 0) {
+ kmem_free(optbuf, buflen);
+ }
+ return (set_errno(EFAULT));
+ }
+ } else {
+ optbuf = NULL;
+ }
+ if ((so = getsonode(sock, &error, &fp)) == NULL) {
+ if (buflen != 0) {
+ kmem_free(optbuf, buflen);
+ }
+ return (set_errno(error));
+ }
+
+ switch (level) {
+ case LX_IPPROTO_IP:
+ error = lx_setsockopt_ip(so, optname, optbuf, optlen);
+ break;
+ case LX_IPPROTO_IPV6:
+ error = lx_setsockopt_ipv6(so, optname, optbuf, optlen);
+ break;
+ case LX_IPPROTO_ICMPV6:
+ error = lx_setsockopt_icmpv6(so, optname, optbuf, optlen);
+ break;
+ case LX_IPPROTO_TCP:
+ error = lx_setsockopt_tcp(so, optname, optbuf, optlen);
+ break;
+ case LX_SOL_SOCKET:
+ error = lx_setsockopt_socket(so, optname, optbuf, optlen);
+ break;
+ case LX_IPPROTO_RAW:
+ error = lx_setsockopt_raw(so, optname, optbuf, optlen);
+ break;
+ case LX_SOL_PACKET:
+ error = lx_setsockopt_packet(so, optname, optbuf, optlen);
+ break;
+ case LX_IPPROTO_IGMP:
+ error = lx_setsockopt_igmp(so, optname, optbuf, optlen);
+ break;
+ case LX_SOL_NETLINK:
+ /*
+ * Since our netlink implementation is modeled after Linux,
+ * sockopts can be passed directly through.
+ */
+ error = socket_setsockopt(so, LX_SOL_NETLINK, optname, optval,
+ optlen, CRED());
+ break;
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+
+ if (error == ENOPROTOOPT) {
+ char buf[LX_UNSUP_BUFSZ];
+
+ (void) snprintf(buf, LX_UNSUP_BUFSZ, "setsockopt(%d, %d)",
+ level, optname);
+ lx_unsupported(buf);
+ }
+ if (buflen != 0) {
+ kmem_free(optbuf, buflen);
+ }
+ releasef(sock);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_getsockopt(int sock, int level, int optname, void *optval,
+ socklen_t *optlenp)
+{
+ struct sonode *so;
+ file_t *fp;
+ int error = 0, buflen = 0;
+ socklen_t optlen;
+ intptr_t stkbuf[2];
+ void *optbuf = stkbuf;
+
+ if (copyin(optlenp, &optlen, sizeof (optlen)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ if (optlen != 0) {
+ if (optlen > SO_MAXARGSIZE) {
+ return (set_errno(EINVAL));
+ }
+ if (optlen > sizeof (stkbuf)) {
+ buflen = optlen;
+ optbuf = kmem_zalloc(optlen, KM_SLEEP);
+ } else {
+ /* zero the on-stack buffer, just in case */
+ stkbuf[0] = 0;
+ stkbuf[1] = 0;
+ }
+ } else {
+ optbuf = NULL;
+ }
+ if ((so = getsonode(sock, &error, &fp)) == NULL) {
+ if (buflen != 0) {
+ kmem_free(optbuf, buflen);
+ }
+ return (set_errno(error));
+ }
+
+ switch (level) {
+ case LX_IPPROTO_IP:
+ error = lx_getsockopt_ip(so, optname, optbuf, &optlen);
+ break;
+ case LX_IPPROTO_IPV6:
+ error = lx_getsockopt_ipv6(so, optname, optbuf, &optlen);
+ break;
+ case LX_IPPROTO_ICMPV6:
+ error = lx_getsockopt_icmpv6(so, optname, optbuf, &optlen);
+ break;
+ case LX_IPPROTO_TCP:
+ error = lx_getsockopt_tcp(so, optname, optbuf, &optlen);
+ break;
+ case LX_SOL_SOCKET:
+ error = lx_getsockopt_socket(so, optname, optbuf, &optlen);
+ break;
+ case LX_IPPROTO_RAW:
+ error = lx_getsockopt_raw(so, optname, optbuf, &optlen);
+ break;
+ case LX_SOL_PACKET:
+ error = lx_getsockopt_packet(so, optname, optbuf, &optlen);
+ break;
+ case LX_IPPROTO_IGMP:
+ error = lx_getsockopt_igmp(so, optname, optbuf, &optlen);
+ break;
+ case LX_SOL_NETLINK:
+ /*
+ * Since our netlink implementation is modeled after Linux,
+ * sockopts can be passed directly through.
+ */
+ error = socket_getsockopt(so, LX_SOL_NETLINK, optname, optval,
+ &optlen, 0, CRED());
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ if (error == ENOPROTOOPT) {
+ char buf[LX_UNSUP_BUFSZ];
+
+ (void) snprintf(buf, LX_UNSUP_BUFSZ, "getsockopt(%d, %d)",
+ level, optname);
+ lx_unsupported(buf);
+ }
+ if (copyout(&optlen, optlenp, sizeof (optlen)) != 0) {
+ error = EFAULT;
+ }
+ if (error == 0 && optlen > 0) {
+ VERIFY(optlen <= sizeof (stkbuf) || optlen <= buflen);
+ if (copyout(optbuf, optval, optlen) != 0) {
+ error = EFAULT;
+ }
+ }
+ if (buflen != 0) {
+ kmem_free(optbuf, buflen);
+ }
+ releasef(sock);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_getname_common(lx_getname_type_t type, int sockfd, void *np, int *nlp)
+{
+ struct sockaddr_storage buf;
+ struct sockaddr *name = (struct sockaddr *)&buf;
+ socklen_t namelen, namelen_orig;
+ int err, tmp;
+ struct sonode *so;
+
+ /* We need to validate the name address up front to pass LTP. */
+ if (copyin(np, &tmp, sizeof (tmp)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyin(nlp, &namelen, sizeof (socklen_t)) != 0)
+ return (set_errno(EFAULT));
+ namelen_orig = namelen;
+
+ /* LTP can pass -1 */
+ if ((int)namelen < 0)
+ return (set_errno(EINVAL));
+
+ if ((so = getsonode(sockfd, &err, NULL)) == NULL)
+ return (set_errno(err));
+
+ bzero(&buf, sizeof (buf));
+ namelen = sizeof (struct sockaddr_storage);
+ if (type == LX_GETPEERNAME) {
+ err = socket_getpeername(so, name, &namelen, B_FALSE, CRED());
+ } else {
+ err = socket_getsockname(so, name, &namelen, CRED());
+ }
+
+ if (err == 0) {
+ ASSERT(namelen <= so->so_max_addr_len);
+ err = stol_sockaddr_copyout(name, namelen,
+ (struct sockaddr *)np, (socklen_t *)nlp, namelen_orig);
+ }
+
+ releasef(sockfd);
+ return (err != 0 ? set_errno(err) : 0);
+}
+
+long
+lx_getpeername(int sockfd, void *np, int *nlp)
+{
+ return (lx_getname_common(LX_GETPEERNAME, sockfd, np, nlp));
+}
+
+long
+lx_getsockname(int sockfd, void *np, int *nlp)
+{
+ return (lx_getname_common(LX_GETSOCKNAME, sockfd, np, nlp));
+}
+
+static int
+lx_accept_common(int sock, struct sockaddr *name, socklen_t *nlp, int flags)
+{
+ struct sonode *so;
+ file_t *fp;
+ int error;
+ socklen_t namelen;
+ struct sonode *nso;
+ struct vnode *nvp;
+ struct file *nfp;
+ int nfd;
+ int arg;
+
+ if (flags & ~(LX_SOCK_CLOEXEC | LX_SOCK_NONBLOCK)) {
+ return (set_errno(EINVAL));
+ }
+
+ if ((so = getsonode(sock, &error, &fp)) == NULL)
+ return (set_errno(error));
+
+ if (name != NULL) {
+ /*
+ * The Linux man page says that -1 is returned and errno is set
+ * to EFAULT if the "name" address is bad, but it is silent on
+ * what to set errno to if the "namelen" address is bad.
+ * LTP expects EINVAL.
+ *
+ * Note that we must first check the name pointer, as the Linux
+ * docs state nothing is copied out if the "name" pointer is
+ * NULL. If it is NULL, we don't care about the namelen
+ * pointer's value or about dereferencing it.
+ */
+ if (copyin(nlp, &namelen, sizeof (namelen))) {
+ releasef(sock);
+ return (set_errno(EINVAL));
+ }
+ if (namelen == 0) {
+ name = NULL;
+ }
+ } else {
+ namelen = 0;
+ }
+
+ /*
+ * Allocate the user fd before calling socket_accept() so that
+ * EMFILE errors are caught up front.
+ */
+ if ((error = falloc(NULL, FWRITE|FREAD, &nfp, &nfd)) != 0) {
+ eprintsoline(so, EMFILE);
+ releasef(sock);
+ return (set_errno(error));
+ }
+ if ((error = socket_accept(so, fp->f_flag, CRED(), &nso)) != 0) {
+ if (error == EINTR)
+ lx_sock_syscall_restart(so, B_TRUE);
+ setf(nfd, NULL);
+ unfalloc(nfp);
+ releasef(sock);
+ return (set_errno(error));
+ }
+
+ nvp = SOTOV(nso);
+
+ if (namelen != 0) {
+ socklen_t addrlen = sizeof (struct sockaddr_storage);
+ struct sockaddr_storage buf;
+ struct sockaddr *addrp = (struct sockaddr *)&buf;
+
+ if ((error = socket_getpeername(nso, addrp, &addrlen, B_TRUE,
+ CRED())) == 0) {
+ error = stol_sockaddr_copyout(addrp, addrlen,
+ name, nlp, namelen);
+ /*
+ * Logic might dictate that we should check if we can
+ * write to the namelen pointer earlier so we don't
+ * accept a pending connection only to fail the call
+ * because we can't write the namelen value back out.
+ * However, testing shows Linux does indeed fail the
+ * call after accepting the connection so we must
+ * behave in a compatible manner.
+ */
+ } else {
+ ASSERT(error == EINVAL || error == ENOTCONN);
+ error = ECONNABORTED;
+ }
+ }
+
+ if (error != 0) {
+ setf(nfd, NULL);
+ unfalloc(nfp);
+ (void) socket_close(nso, 0, CRED());
+ socket_destroy(nso);
+ releasef(sock);
+ return (set_errno(error));
+ }
+
+ /* Fill in the entries that falloc reserved */
+ nfp->f_vnode = nvp;
+ mutex_exit(&nfp->f_tlock);
+ setf(nfd, nfp);
+
+ /* Act on LX_SOCK_CLOEXEC from flags */
+ if (flags & LX_SOCK_CLOEXEC) {
+ f_setfd(nfd, FD_CLOEXEC);
+ }
+
+ /*
+ * In Linux, accept()ed sockets do not inherit anything set by fcntl(),
+ * so either explicitly set the flags or filter those out.
+ *
+ * The VOP_SETFL code is a simplification of the F_SETFL code in
+ * fcntl(). Ignore any errors from VOP_SETFL.
+ */
+ arg = 0;
+ if (flags & LX_SOCK_NONBLOCK)
+ arg |= FNONBLOCK;
+
+ error = VOP_SETFL(nvp, nfp->f_flag, arg, nfp->f_cred, NULL);
+ if (error != 0) {
+ eprintsoline(so, error);
+ error = 0;
+ } else {
+ mutex_enter(&nfp->f_tlock);
+ nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
+ nfp->f_flag |= arg;
+ mutex_exit(&nfp->f_tlock);
+ }
+
+ releasef(sock);
+ return (nfd);
+}
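+
+/*
+ * For example, a Linux call such as
+ *
+ *	int c = accept4(s, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC);
+ *
+ * arrives here with both LX_SOCK_* flags set: FD_CLOEXEC is applied via
+ * f_setfd() and FNONBLOCK via the VOP_SETFL path above, while status
+ * flags set on the listening socket are deliberately not inherited.
+ */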
+
+long
+lx_accept(int sockfd, void *np, int *nlp)
+{
+ return (lx_accept_common(sockfd, (struct sockaddr *)np,
+ (socklen_t *)nlp, 0));
+}
+
+long
+lx_accept4(int sockfd, void *np, int *nlp, int flags)
+{
+ return (lx_accept_common(sockfd, (struct sockaddr *)np,
+ (socklen_t *)nlp, flags));
+}
+
+long
+lx_listen(int sockfd, int backlog)
+{
+ return (listen(sockfd, backlog, 0));
+}
+
+long
+lx_shutdown(int sockfd, int how)
+{
+ return (shutdown(sockfd, how, 0));
+}
+
+/*
+ * Connect two sockets together for a socketpair. This is derived from
+ * so_socketpair, but forgoes the task of dealing with file descriptors.
+ */
+static int
+lx_socketpair_connect(file_t *fp1, file_t *fp2)
+{
+ sonode_t *so1, *so2;
+ sotpi_info_t *sti1, *sti2;
+ struct sockaddr_ux name;
+ int error;
+
+ so1 = VTOSO(fp1->f_vnode);
+ so2 = VTOSO(fp2->f_vnode);
+ sti1 = SOTOTPI(so1);
+ sti2 = SOTOTPI(so2);
+
+ VERIFY(so1->so_ops == &sotpi_sonodeops &&
+ so2->so_ops == &sotpi_sonodeops);
+
+ if (so1->so_type == SOCK_DGRAM) {
+ /*
+ * Bind both sockets and connect them with each other.
+ */
+ error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
+ if (error) {
+ return (error);
+ }
+ error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
+ if (error) {
+ return (error);
+ }
+ name.sou_family = AF_UNIX;
+ name.sou_addr = sti2->sti_ux_laddr;
+ error = socket_connect(so1, (struct sockaddr *)&name,
+ (socklen_t)sizeof (name), 0, _SOCONNECT_NOXLATE, CRED());
+ if (error) {
+ return (error);
+ }
+ name.sou_addr = sti1->sti_ux_laddr;
+ error = socket_connect(so2, (struct sockaddr *)&name,
+ (socklen_t)sizeof (name), 0, _SOCONNECT_NOXLATE, CRED());
+ return (error);
+ } else {
+ sonode_t *nso;
+
+ /*
+ * Bind both sockets, with 'so1' being a listener. Connect
+ * 'so2' to 'so1', doing so as nonblocking to avoid waiting for
+ * soaccept to complete. Accept the connection on 'so1',
+ * replacing the socket/vnode in 'fp1' with the new connection.
+ *
+ * We could simply call socket_listen() here (which would do the
+ * binding automatically) if the code didn't rely on passing
+ * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
+ */
+ error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
+ _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR, CRED());
+ if (error) {
+ return (error);
+ }
+ error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
+ if (error) {
+ return (error);
+ }
+
+ name.sou_family = AF_UNIX;
+ name.sou_addr = sti1->sti_ux_laddr;
+ error = socket_connect(so2,
+ (struct sockaddr *)&name,
+ (socklen_t)sizeof (name),
+ FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
+ if (error != 0 && error != EINPROGRESS) {
+ return (error);
+ }
+
+ error = socket_accept(so1, 0, CRED(), &nso);
+ if (error) {
+ return (error);
+ }
+
+ /* Wait for so2 to reach SS_CONNECTED. */
+ mutex_enter(&so2->so_lock);
+ error = sowaitconnected(so2, 0, 0);
+ mutex_exit(&so2->so_lock);
+ if (error != 0) {
+ (void) socket_close(nso, 0, CRED());
+ socket_destroy(nso);
+ return (error);
+ }
+
+ (void) socket_close(so1, 0, CRED());
+ socket_destroy(so1);
+ fp1->f_vnode = SOTOV(nso);
+ }
+ return (0);
+}
+
+long
+lx_socketpair(int domain, int type, int protocol, int *sv)
+{
+ int err, options, fds[2];
+ file_t *fps[2];
+ boolean_t emul_seqp = B_FALSE;
+
+ /*
+ * For the special case of SOCK_SEQPACKET for AF_UNIX, we want to treat
+ * this as a SOCK_DGRAM. The semantics are similar, but our native code
+ * will not pass cmsg creds over a connection-oriented socket, unlike a
+ * connectionless one. Some Linux code depends on this for Unix-domain
+ * sockets. In particular, a sockopt of SO_PASSCRED, which we map into
+ * our native SO_RECVUCRED, must work across fork so that the correct
+ * pid of the sender is available in the cmsg. See the comment in
+ * lx_setsockopt_socket().
+ */
+ if (domain == LX_AF_UNIX && type == LX_SOCK_SEQPACKET) {
+ type = LX_SOCK_DGRAM;
+ emul_seqp = B_TRUE;
+ }
+
+ if ((err = lx_convert_sock_args(domain, type, protocol, &domain, &type,
+ &options, &protocol)) != 0) {
+ return (set_errno(err));
+ }
+
+ if ((err = lx_socket_create(domain, type, protocol, options, &fps[0],
+ &fds[0])) != 0) {
+ return (set_errno(err));
+ }
+
+ /*
+ * While it seems silly to check the family after socket creation, this
+ * is done to appease LTP when it tries some outlandish combinations of
+ * domain/type/protocol. The socket_create function is relied upon to
+ * emit the expected errors.
+ */
+ if (VTOSO(fps[0]->f_vnode)->so_family != AF_UNIX) {
+ lx_socket_destroy(fps[0], fds[0]);
+ return (set_errno(EOPNOTSUPP));
+ }
+
+ if ((err = lx_socket_create(domain, type, protocol, options, &fps[1],
+ &fds[1])) != 0) {
+ lx_socket_destroy(fps[0], fds[0]);
+ return (set_errno(err));
+ }
+
+ err = lx_socketpair_connect(fps[0], fps[1]);
+ if (err != 0) {
+ lx_socket_destroy(fps[0], fds[0]);
+ lx_socket_destroy(fps[1], fds[1]);
+ return (set_errno(err));
+ }
+
+ if (emul_seqp) {
+ int i;
+ for (i = 0; i < 2; i++) {
+ sonode_t *so = VTOSO(fps[i]->f_vnode);
+ lx_socket_aux_data_t *sad = lx_sad_acquire(SOTOV(so));
+ sad->lxsad_flags |= LXSAD_FL_EMULSEQPKT;
+ mutex_exit(&sad->lxsad_lock);
+ }
+ }
+
+ setf(fds[0], fps[0]);
+ setf(fds[1], fps[1]);
+
+ if ((options & SOCK_CLOEXEC) != 0) {
+ f_setfd(fds[0], FD_CLOEXEC);
+ f_setfd(fds[1], FD_CLOEXEC);
+ }
+ if (copyout(fds, sv, sizeof (fds)) != 0) {
+ (void) closeandsetf(fds[0], NULL);
+ (void) closeandsetf(fds[1], NULL);
+ return (set_errno(EFAULT));
+ }
+ return (0);
+}
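+
+/*
+ * As an example of the SOCK_SEQPACKET emulation above, a Linux call like
+ *
+ *	int sv[2];
+ *	(void) socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv);
+ *
+ * actually yields a connected SOCK_DGRAM pair with LXSAD_FL_EMULSEQPKT
+ * set on both ends, so that a later getsockopt(SO_TYPE) still reports
+ * SOCK_SEQPACKET (see lx_getsockopt_socket()).
+ */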
+
+
+#if defined(_SYSCALL32_IMPL)
+
+#define LX_SYS_SOCKETCALL 102
+#define LX_SOCKETCALL_MAX 20
+
+typedef long (*lx_sockfn_t)();
+
+static struct {
+ lx_sockfn_t s_fn; /* Function implementing the subcommand */
+ int s_nargs; /* Number of arguments the function takes */
+} lx_socketcall_fns[] = {
+ lx_socket, 3, /* socket */
+ lx_bind, 3, /* bind */
+ lx_connect, 3, /* connect */
+ lx_listen, 2, /* listen */
+ lx_accept, 3, /* accept */
+ lx_getsockname, 3, /* getsockname */
+ lx_getpeername, 3, /* getpeername */
+ lx_socketpair, 4, /* socketpair */
+ lx_send, 4, /* send */
+ lx_recv, 4, /* recv */
+ lx_sendto, 6, /* sendto */
+ lx_recvfrom, 6, /* recvfrom */
+ lx_shutdown, 2, /* shutdown */
+ lx_setsockopt, 5, /* setsockopt */
+ lx_getsockopt, 5, /* getsockopt */
+ lx_sendmsg, 3, /* sendmsg */
+ lx_recvmsg, 3, /* recvmsg */
+ lx_accept4, 4, /* accept4 */
+ lx_recvmmsg, 5, /* recvmmsg */
+ lx_sendmmsg, 4 /* sendmmsg */
+};
+
+long
+lx_socketcall(long p1, uint32_t *p2)
+{
+ int subcmd, i;
+ unsigned long args[6] = { 0, 0, 0, 0, 0, 0 };
+
+ /* incoming subcmds are 1-indexed */
+ subcmd = (int)p1 - 1;
+
+ if (subcmd < 0 || subcmd >= LX_SOCKETCALL_MAX ||
+ lx_socketcall_fns[subcmd].s_fn == NULL) {
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Copy the arguments to the subcommand in from the app's address
+ * space, returning EFAULT if we get a bogus pointer.
+ */
+ for (i = 0; i < lx_socketcall_fns[subcmd].s_nargs; i++) {
+ uint32_t arg;
+
+ if (copyin(&p2[i], &arg, sizeof (uint32_t)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ args[i] = (unsigned long)arg;
+ }
+
+ return ((lx_socketcall_fns[subcmd].s_fn)(args[0], args[1], args[2],
+ args[3], args[4], args[5]));
+}
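+
+/*
+ * For instance, a 32-bit Linux binary creating a socket through this
+ * multiplexed entry point issues socketcall(1, args) with
+ * args = { AF_INET, SOCK_STREAM, 0 }; subcmd 1 maps to index 0 here,
+ * three 32-bit arguments are copied in, and lx_socket() is invoked
+ * with them.
+ */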
+
+#endif /* defined(_SYSCALL32_IMPL) */
+
+static void
+lx_socket_vsd_free(void *data)
+{
+ lx_socket_aux_data_t *entry;
+
+ entry = (lx_socket_aux_data_t *)data;
+ mutex_destroy(&entry->lxsad_lock);
+ kmem_free(entry, sizeof (*entry));
+}
+
+void
+lx_socket_init()
+{
+ vsd_create(&lx_socket_vsd, lx_socket_vsd_free);
+}
+
+void
+lx_socket_fini()
+{
+ vsd_destroy(&lx_socket_vsd);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_splice.c b/usr/src/uts/common/brand/lx/syscall/lx_splice.c
new file mode 100644
index 0000000000..64db538413
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_splice.c
@@ -0,0 +1,491 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017, Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/sunddi.h>
+#include <sys/fs/fifonode.h>
+#include <sys/strsun.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_signal.h>
+
+/* Splice flags */
+#define LX_SPLICE_F_MOVE 0x01
+#define LX_SPLICE_F_NONBLOCK 0x02
+#define LX_SPLICE_F_MORE 0x04
+#define LX_SPLICE_F_GIFT 0x08
+
+/*
+ * Use a max buffer size of 32k. This is a good compromise between doing I/O in
+ * large chunks, the limit on how much data we can write into an lx pipe by
+ * default (LX_DEFAULT_PIPE_SIZE), and how much kernel memory we'll allocate.
+ */
+#define LX_SPL_BUF_SIZE (32 * 1024)
+
+/*
+ * We only want to read as much from the input fd as we can write into the
+ * output fd, up to our buffer size. Figure out what that quantity is.
+ * Note that len steadily decreases toward 0, which is what normally
+ * terminates the splice loop.
+ */
+static size_t
+lx_spl_wr_sz(file_t *fp_out, u_offset_t fileoff, size_t bsz, size_t len,
+ boolean_t first)
+{
+ size_t sz;
+
+ sz = MIN(bsz, len);
+ if (fp_out->f_vnode->v_type == VFIFO) {
+ /*
+ * If there are no readers on the pipe, or if the write would
+ * go over the high water mark, return 0. Note that the first
+ * write into a pipe is expected to block if we're over the
+ * high water mark.
+ */
+ fifonode_t *fn_dest = VTOF(fp_out->f_vnode)->fn_dest;
+ fifolock_t *fn_lock = fn_dest->fn_lock;
+
+ mutex_enter(&fn_lock->flk_lock);
+ if (fn_dest->fn_rcnt == 0) {
+ sz = 0;
+ } else if (!first &&
+ (sz + fn_dest->fn_count) > fn_dest->fn_hiwat) {
+ sz = 0;
+ }
+ mutex_exit(&fn_lock->flk_lock);
+ } else if (fp_out->f_vnode->v_type == VREG) {
+ if (fileoff >= curproc->p_fsz_ctl ||
+ fileoff >= OFFSET_MAX(fp_out)) {
+ sz = 0;
+ } else {
+ sz = MIN(sz, (size_t)curproc->p_fsz_ctl - fileoff);
+ sz = MIN(sz, (size_t)OFFSET_MAX(fp_out) - fileoff);
+ }
+ }
+
+ /*
+ * if (fp_out->f_vnode->v_type == VSOCK)
+ *
+ * There is no good way to determine if a socket is "full". A write for
+ * the different protocol implementations can return EWOULDBLOCK under
+ * different conditions, none of which we can easily check for in
+ * advance.
+ */
+
+ return (sz);
+}
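+
+/*
+ * For example, splicing len = 100 KB through the 32 KB buffer into a
+ * regular file (with no file-size limit in the way) yields successive
+ * write sizes of 32K, 32K, 32K and 4K from this function before len
+ * reaches 0 and the loop in lx_splice() terminates.
+ */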
+
+/*
+ * The splice read function handles "reading" from a pipe and passes everything
+ * else along to our normal VOP_READ code path.
+ *
+ * When we have a pipe as our input, we don't want to consume the data out
+ * of the pipe until the write has succeeded. This aligns more closely with
+ * the Linux behavior when a write error occurs. Thus, when a pipe is the input
+ * and we got some data, we return with the fifo flagged as FIFORDBLOCK. This
+ * ensures that the data we're writing cannot be consumed by another thread
+ * until we consume it ourselves.
+ *
+ * The pipe "read" code here is derived from the fifo I_PEEK code.
+ */
+static int
+lx_spl_read(file_t *fp, uio_t *uiop, size_t *nread, boolean_t pipe_in,
+ boolean_t rd_pos)
+{
+ fifonode_t *fnp;
+ fifolock_t *fn_lock;
+ int count;
+ mblk_t *bp;
+
+ if (!pipe_in)
+ return (lx_read_common(fp, uiop, nread, rd_pos));
+
+ ASSERT(fp->f_vnode->v_type == VFIFO);
+ fnp = VTOF(fp->f_vnode);
+ fn_lock = fnp->fn_lock;
+ *nread = 0;
+
+ mutex_enter(&fn_lock->flk_lock);
+
+ /*
+ * If the pipe has been switched to socket mode, this implies an
+ * internal programmatic error. The same is true if it is switched
+ * to socket mode while we drop the lock to set the stayfast flag.
+ */
+ if ((fnp->fn_flag & FIFOFAST) == 0 || !fifo_stayfast_enter(fnp)) {
+ mutex_exit(&fn_lock->flk_lock);
+ return (EBADF);
+ }
+
+ while (fnp->fn_count == 0 || (fnp->fn_flag & FIFORDBLOCK) != 0) {
+ fifonode_t *fn_dest = fnp->fn_dest;
+
+ /* No writer, EOF */
+ if (fn_dest->fn_wcnt == 0 || fn_dest->fn_rcnt == 0) {
+ fifo_stayfast_exit(fnp);
+ mutex_exit(&fn_lock->flk_lock);
+ return (0);
+ }
+
+ /* If non-blocking, return EAGAIN otherwise 0. */
+ if (uiop->uio_fmode & (FNDELAY|FNONBLOCK)) {
+ fifo_stayfast_exit(fnp);
+ mutex_exit(&fn_lock->flk_lock);
+ if (uiop->uio_fmode & FNONBLOCK)
+ return (EAGAIN);
+ return (0);
+ }
+
+ /* Wait for data */
+ fnp->fn_flag |= FIFOWANTR;
+ if (!cv_wait_sig_swap(&fnp->fn_wait_cv, &fn_lock->flk_lock)) {
+ fifo_stayfast_exit(fnp);
+ mutex_exit(&fn_lock->flk_lock);
+ return (EINTR);
+ }
+ }
+
+ VERIFY((fnp->fn_flag & FIFORDBLOCK) == 0);
+ VERIFY((fnp->fn_flag & FIFOSTAYFAST) != 0);
+
+ /* Get up to our read size or whatever is currently available. */
+ count = MIN(uiop->uio_resid, fnp->fn_count);
+ ASSERT(count > 0);
+ *nread = count;
+ bp = fnp->fn_mp;
+ while (count > 0) {
+ uint_t cnt = MIN(uiop->uio_resid, MBLKL(bp));
+
+ /*
+ * We have the input pipe locked and we know there is data
+ * available to consume. We're doing a UIO_SYSSPACE move into
+ * an internal buffer that we allocated in lx_splice() so
+ * this should never fail.
+ */
+ VERIFY(uiomove((char *)bp->b_rptr, cnt, UIO_READ, uiop) == 0);
+ count -= cnt;
+ bp = bp->b_cont;
+ }
+
+ fnp->fn_flag |= FIFORDBLOCK;
+
+ mutex_exit(&fn_lock->flk_lock);
+ return (0);
+}
+
+/*
+ * We've already "read" the data out of the pipe without actually consuming it.
+ * Here we update the pipe to consume the data and discard it. This is derived
+ * from the fifo_read code, except that we already know the amount of data
+ * in the pipe to consume and we don't have to actually move any data.
+ */
+static void
+lx_spl_consume(file_t *fp, uint_t count)
+{
+ fifonode_t *fnp, *fn_dest;
+ fifolock_t *fn_lock;
+
+ ASSERT(fp->f_vnode->v_type == VFIFO);
+
+ fnp = VTOF(fp->f_vnode);
+ fn_lock = fnp->fn_lock;
+
+ mutex_enter(&fn_lock->flk_lock);
+ VERIFY(fnp->fn_count >= count);
+
+ while (count > 0) {
+ int bpsize = MBLKL(fnp->fn_mp);
+ int decr_size = MIN(bpsize, count);
+
+ fnp->fn_count -= decr_size;
+ if (bpsize <= decr_size) {
+ mblk_t *bp = fnp->fn_mp;
+ fnp->fn_mp = fnp->fn_mp->b_cont;
+ freeb(bp);
+ } else {
+ fnp->fn_mp->b_rptr += decr_size;
+ }
+
+ count -= decr_size;
+ }
+
+ fnp->fn_flag &= ~FIFORDBLOCK;
+ fifo_stayfast_exit(fnp);
+
+ fifo_wakereader(fnp, fn_lock);
+
+ /*
+ * Wake up any blocked writers, processes sleeping on POLLWRNORM, or
+ * processes waiting for SIGPOLL.
+ */
+ fn_dest = fnp->fn_dest;
+ if (fn_dest->fn_flag & (FIFOWANTW | FIFOHIWATW) &&
+ fnp->fn_count < fn_dest->fn_hiwat) {
+ fifo_wakewriter(fn_dest, fn_lock);
+ }
+
+ /* Update the vnode access time */
+ fnp->fn_atime = fnp->fn_dest->fn_atime = gethrestime_sec();
+
+ mutex_exit(&fn_lock->flk_lock);
+}
+
+/*
+ * Transfer data from the input file descriptor to the output file descriptor
+ * without leaving the kernel. On Linux this is limited by its kernel
+ * implementation, which forces at least one of the file descriptors to be a
+ * pipe. Our implementation is likely quite different from the Linux
+ * one, which appears to play some VM tricks with shared pages from the pipe
+ * code. Instead, our implementation uses our normal VOP_READ/VOP_WRITE
+ * operations to internally move the data while using a single uio buffer. We
+ * implement the additional Linux behavior around the various checks and
+ * limitations.
+ *
+ * One key point on the read side is how we handle an input pipe. We don't
+ * want to consume the data out of the pipe until the write has succeeded.
+ * This aligns more closely with the Linux behavior when a write error occurs.
+ * The lx_spl_read() and lx_spl_consume() functions are used to handle this
+ * case.
+ */
+long
+lx_splice(int fd_in, off_t *off_in, int fd_out, off_t *off_out, size_t len,
+ uint_t flags)
+{
+ int error = 0;
+ file_t *fp_in = NULL, *fp_out = NULL;
+ boolean_t found_pipe = B_FALSE, rd_pos = B_FALSE, wr_pos = B_FALSE;
+ boolean_t first = B_TRUE, pipe_in = B_FALSE;
+ iovec_t iov;
+ uio_t uio;
+ void *buf = NULL;
+ off_t r_off = 0, w_off = 0;
+ ushort_t r_flag, w_flag;
+ size_t bsize = 0, wr_sz, nread, nwrite, total = 0;
+
+ /*
+ * Start by validating the inputs.
+ *
+ * Linux doesn't bother to check for valid flags, so neither do we.
+ * Also, aside from SPLICE_F_NONBLOCK, we ignore the rest of the
+ * flags since they're just hints to the Linux kernel implementation
+ * and have no effect on the proper functioning of the syscall.
+ */
+
+ if (len == 0)
+ return (0);
+
+ if ((fp_in = getf(fd_in)) == NULL) {
+ error = EBADF;
+ goto done;
+ }
+ switch (fp_in->f_vnode->v_type) {
+ case VFIFO:
+ /* A fifo that is not in fast mode does not count as a pipe */
+ if (((VTOF(fp_in->f_vnode))->fn_flag & FIFOFAST) != 0) {
+ found_pipe = B_TRUE;
+ pipe_in = B_TRUE;
+ }
+ /*FALLTHROUGH*/
+ case VSOCK:
+ if (off_in != NULL) {
+ error = ESPIPE;
+ goto done;
+ }
+ break;
+ case VREG:
+ case VBLK:
+ case VCHR:
+ case VPROC:
+ if (off_in != NULL) {
+ if (copyin(off_in, &r_off, sizeof (r_off)) != 0) {
+ error = EFAULT;
+ goto done;
+ }
+ rd_pos = B_TRUE;
+ }
+ break;
+ default:
+ error = EBADF;
+ goto done;
+ }
+ r_flag = fp_in->f_flag;
+ if ((r_flag & FREAD) == 0) {
+ error = EBADF;
+ goto done;
+ }
+
+ if ((fp_out = getf(fd_out)) == NULL) {
+ error = EBADF;
+ goto done;
+ }
+ switch (fp_out->f_vnode->v_type) {
+ case VFIFO:
+ found_pipe = B_TRUE;
+ /* Splicing to ourself returns EINVAL on Linux */
+ if (pipe_in) {
+ fifonode_t *fnp = VTOF(fp_in->f_vnode);
+ if (VTOF(fp_out->f_vnode) == fnp->fn_dest) {
+ error = EINVAL;
+ goto done;
+ }
+ }
+ /*FALLTHROUGH*/
+ case VSOCK:
+ if (off_out != NULL) {
+ error = ESPIPE;
+ goto done;
+ }
+ break;
+ case VREG:
+ case VBLK:
+ case VCHR:
+ case VPROC:
+ if (off_out != NULL) {
+ if (copyin(off_out, &w_off, sizeof (w_off)) != 0) {
+ error = EFAULT;
+ goto done;
+ }
+ wr_pos = B_TRUE;
+ }
+ break;
+ default:
+ error = EBADF;
+ goto done;
+ }
+ w_flag = fp_out->f_flag;
+ if ((w_flag & FWRITE) == 0) {
+ error = EBADF;
+ goto done;
+ }
+ /* Appending is invalid for output fd in splice */
+ if ((w_flag & FAPPEND) != 0) {
+ error = EINVAL;
+ goto done;
+ }
+
+ if (!found_pipe) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+	 * Check for non-blocking pipe operations. If there is no data in
+	 * the input pipe, or the output pipe is full, return EAGAIN.
+ */
+ if (flags & LX_SPLICE_F_NONBLOCK) {
+ fifonode_t *fn_dest;
+
+ if (fp_in->f_vnode->v_type == VFIFO) {
+ fn_dest = VTOF(fp_in->f_vnode)->fn_dest;
+ if (fn_dest->fn_count == 0) {
+ error = EAGAIN;
+ goto done;
+ }
+ }
+ if (fp_out->f_vnode->v_type == VFIFO) {
+ fn_dest = VTOF(fp_out->f_vnode)->fn_dest;
+ fifolock_t *fn_lock = fn_dest->fn_lock;
+ mutex_enter(&fn_lock->flk_lock);
+ if (fn_dest->fn_count >= fn_dest->fn_hiwat) {
+ mutex_exit(&fn_lock->flk_lock);
+ error = EAGAIN;
+ goto done;
+ }
+ mutex_exit(&fn_lock->flk_lock);
+ }
+ }
+
+ bsize = MIN(LX_SPL_BUF_SIZE, len);
+
+ buf = kmem_alloc(bsize, KM_SLEEP);
+ bzero(&uio, sizeof (uio));
+ uio.uio_iovcnt = 1;
+ uio.uio_iov = &iov;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_llimit = curproc->p_fsz_ctl;
+
+ /*
+ * Loop reading data from fd_in and writing to fd_out. This is
+ * controlled by how much of the requested data we can actually write,
+ * particularly when the destination is a pipe. This matches the Linux
+ * behavior, which may terminate earlier than the full 'len' if the
+ * pipe fills up. However, we need to block when writing into a full
+ * pipe on the first iteration of the loop. We already checked above
+ * for a full output pipe when non-blocking.
+ */
+ while ((wr_sz = lx_spl_wr_sz(fp_out, w_off, bsize, len, first)) > 0) {
+ first = B_FALSE;
+
+ /* (re)setup for a read */
+		uio.uio_resid = iov.iov_len = wr_sz; /* read at most writable */
+ iov.iov_base = buf;
+ uio.uio_offset = r_off;
+ uio.uio_extflg = UIO_COPY_CACHED;
+ uio.uio_fmode = r_flag;
+ error = lx_spl_read(fp_in, &uio, &nread, pipe_in, rd_pos);
+ if (error != 0 || nread == 0)
+ break;
+ r_off = uio.uio_offset;
+
+ /* Setup and perform a write from the same buffer */
+ uio.uio_resid = iov.iov_len = nread;
+ iov.iov_base = buf;
+ uio.uio_offset = w_off;
+ uio.uio_extflg = UIO_COPY_DEFAULT;
+ uio.uio_fmode = w_flag;
+ error = lx_write_common(fp_out, &uio, &nwrite, wr_pos);
+ if (error != 0) {
+ if (pipe_in) {
+ /* Need to unblock reading from the fifo. */
+ fifonode_t *fnp = VTOF(fp_in->f_vnode);
+
+ mutex_enter(&fnp->fn_lock->flk_lock);
+ fnp->fn_flag &= ~FIFORDBLOCK;
+ fifo_stayfast_exit(fnp);
+ fifo_wakereader(fnp, fnp->fn_lock);
+ mutex_exit(&fnp->fn_lock->flk_lock);
+ }
+ break;
+ }
+ w_off = uio.uio_offset;
+
+ /*
+ * If input is a pipe, then we can consume the amount of data
+ * out of the pipe that we successfully wrote.
+ */
+ if (pipe_in)
+ lx_spl_consume(fp_in, nwrite);
+
+ total += nwrite;
+ len -= nwrite;
+ }
+
+done:
+ if (buf != NULL)
+ kmem_free(buf, bsize);
+ if (fp_in != NULL)
+ releasef(fd_in);
+ if (fp_out != NULL)
+ releasef(fd_out);
+ if (error != 0)
+ return (set_errno(error));
+
+ return (total);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_stat.c b/usr/src/uts/common/brand/lx/syscall/lx_stat.c
new file mode 100644
index 0000000000..9af0080138
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_stat.c
@@ -0,0 +1,486 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/model.h>
+#include <sys/mode.h>
+#include <sys/stat.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_types.h>
+#include <sys/lx_impl.h>
+#include <sys/brand.h>
+#include <sys/ddi.h>
+
+/* From "uts/common/syscall/stat.c" */
+extern int cstatat_getvp(int, char *, int, vnode_t **, cred_t **);
+
+typedef struct lx_timespec32 {
+ int32_t ts_sec;
+ int32_t ts_nsec;
+} lx_timespec32_t;
+
+typedef struct lx_timespec64 {
+ int64_t ts_sec;
+ int64_t ts_nsec;
+} lx_timespec64_t;
+
+struct lx_stat32 {
+ uint16_t st_dev;
+ uint16_t st_pad1;
+ uint32_t st_ino;
+ uint16_t st_mode;
+ uint16_t st_nlink;
+ uint16_t st_uid;
+ uint16_t st_gid;
+ uint16_t st_rdev;
+ uint16_t st_pad2;
+ uint32_t st_size;
+ uint32_t st_blksize;
+ uint32_t st_blocks;
+ lx_timespec32_t st_atime;
+ lx_timespec32_t st_mtime;
+ lx_timespec32_t st_ctime;
+ uint32_t st_pad3;
+ uint32_t st_pad4;
+};
+
+#pragma pack(4)
+struct lx_stat64_32 {
+ uint64_t st_dev;
+ uint32_t st_pad1;
+ uint32_t st_small_ino;
+ uint32_t st_mode;
+ uint32_t st_nlink;
+ uint32_t st_uid;
+ uint32_t st_gid;
+ uint64_t st_rdev;
+ uint32_t st_pad2;
+ uint64_t st_size;
+ uint32_t st_blksize;
+ uint64_t st_blocks;
+ lx_timespec32_t st_atime;
+ lx_timespec32_t st_mtime;
+ lx_timespec32_t st_ctime;
+ uint64_t st_ino;
+};
+#pragma pack()
+
+#if defined(_LP64)
+struct lx_stat64_64 {
+ uint64_t st_dev;
+ uint64_t st_ino;
+ uint64_t st_nlink; /* yes, the order really is */
+ uint32_t st_mode; /* different for these two */
+ uint32_t st_uid;
+ uint32_t st_gid;
+ uint32_t st_pad0;
+ uint64_t st_rdev;
+ int64_t st_size;
+ int64_t st_blksize;
+ int64_t st_blocks;
+ lx_timespec64_t st_atime;
+ lx_timespec64_t st_mtime;
+ lx_timespec64_t st_ctime;
+ int64_t st_unused[3];
+};
+#endif /* defined(_LP64) */
+
+typedef enum lx_stat_fmt {
+ LXF_STAT32,
+ LXF_STAT64_32,
+ LXF_STAT64_64
+} lx_stat_fmt_t;
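+
+/*
+ * A compile-time sanity sketch (assuming the CTASSERT macro from
+ * <sys/debug.h>; not part of this change).  These sizes match the Linux
+ * x86 "struct stat"/"struct stat64" layouts the structures above mirror:
+ *
+ *	CTASSERT(sizeof (struct lx_stat32) == 64);
+ *	CTASSERT(sizeof (struct lx_stat64_32) == 96);
+ *	#if defined(_LP64)
+ *	CTASSERT(sizeof (struct lx_stat64_64) == 144);
+ *	#endif
+ */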
+
+static void
+lx_stat_xlate_dev(vattr_t *vattr)
+{
+ lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone);
+ dev_t dev;
+ lx_virt_disk_t *vd;
+ boolean_t is_dev;
+
+ if (S_ISCHR(vattr->va_mode) || S_ISBLK(vattr->va_mode)) {
+ dev = vattr->va_rdev;
+ is_dev = B_TRUE;
+ } else {
+ dev = vattr->va_fsid;
+ is_dev = B_FALSE;
+ }
+
+ /*
+ * See if this is the /dev/zfs device. If it is, the device number has
+ * already been converted to Linux format in the lx devfs so we have
+ * to check for that and not a native major/minor style.
+ */
+ if (S_ISCHR(vattr->va_mode) &&
+ LX_GETMAJOR(dev) == getmajor(lxzd->lxzd_zfs_dev) &&
+ LX_GETMINOR(dev) == 0) {
+ /*
+ * We use the /dev/zfs device as a placeholder for our in-zone
+ * fabricated /dev/zfsds0 device that we're pretending / is
+ * mounted on. lx_zone_get_zfsds has pre-allocated this
+ * entry in the emulated device list. Reset dev so we can
+ * properly match in the following loop.
+ */
+ dev = curproc->p_zone->zone_rootvp->v_vfsp->vfs_dev;
+ }
+
+ /* Substitute emulated major/minor on zvols or mounted datasets. */
+ vd = list_head(lxzd->lxzd_vdisks);
+ while (vd != NULL) {
+ if (vd->lxvd_real_dev == dev) {
+ dev = vd->lxvd_emul_dev;
+ /*
+ * We only update rdev for matching zfds/zvol devices
+ * so that the other devices are unchanged.
+ */
+ if (is_dev) {
+ vattr->va_rdev = LX_MAKEDEVICE(getmajor(dev),
+ getminor(dev));
+ }
+ break;
+ }
+ vd = list_next(lxzd->lxzd_vdisks, vd);
+ }
+
+ /* Mangle st_dev into expected format */
+ vattr->va_fsid = LX_MAKEDEVICE(getmajor(dev), getminor(dev));
+}
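+
+/*
+ * For reference, a hedged sketch of the Linux device-number layout that
+ * LX_MAKEDEVICE is assumed to produce: the minor is split around a 12-bit
+ * major, i.e.
+ *
+ *	dev = (minor & 0xff) | ((major & 0xfff) << 8) |
+ *	    ((minor & ~0xff) << 12);
+ *
+ * so major 8, minor 1 yields 0x801, the familiar Linux /dev/sda1 number.
+ */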
+
+static long
+lx_stat_common(vnode_t *vp, cred_t *cr, void *outp, lx_stat_fmt_t fmt,
+ int follow)
+{
+ vattr_t vattr;
+ mode_t mode;
+ int error, flags;
+
+ /*
+ * When symlink following is desired, the ATTR_REAL flag is necessary
+ * to circumvent some of the weird behavior present in filesystems like
+ * lx_proc.
+ */
+ flags = (follow == FOLLOW) ? ATTR_REAL : 0;
+
+ vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, flags, cr, NULL)) != 0) {
+ return (error);
+ }
+
+ mode = VTTOIF(vattr.va_type) | vattr.va_mode;
+ if ((mode & S_IFMT) == S_IFBLK) {
+ /* Linux seems to report a 0 st_size for all block devices */
+ vattr.va_size = 0;
+ }
+ if (vattr.va_rdev == NODEV) {
+ /* Linux leaves st_rdev zeroed when it is absent */
+ vattr.va_rdev = 0;
+ }
+
+ lx_stat_xlate_dev(&vattr);
+
+ if (fmt == LXF_STAT32) {
+ struct lx_stat32 sb;
+
+ if (vattr.va_fsid > USHRT_MAX || vattr.va_rdev > USHRT_MAX ||
+ vattr.va_nlink > USHRT_MAX || vattr.va_size > INT_MAX) {
+ return (EOVERFLOW);
+ }
+
+ bzero(&sb, sizeof (sb));
+ sb.st_dev = vattr.va_fsid;
+ sb.st_ino = vattr.va_nodeid;
+ sb.st_mode = mode;
+ sb.st_nlink = vattr.va_nlink;
+ sb.st_uid = LX_UID32_TO_UID16(vattr.va_uid);
+ sb.st_gid = LX_GID32_TO_GID16(vattr.va_gid);
+ sb.st_rdev = vattr.va_rdev;
+ sb.st_size = vattr.va_size;
+ sb.st_blksize = vattr.va_blksize;
+ sb.st_blocks = vattr.va_nblocks;
+ sb.st_atime.ts_sec = vattr.va_atime.tv_sec;
+ sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec;
+ sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec;
+ sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec;
+ sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec;
+ sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec;
+ if (copyout(&sb, outp, sizeof (sb)) != 0) {
+ return (EFAULT);
+ }
+ return (0);
+ } else if (fmt == LXF_STAT64_32) {
+ struct lx_stat64_32 sb;
+
+ bzero(&sb, sizeof (sb));
+ sb.st_dev = vattr.va_fsid;
+ sb.st_ino = vattr.va_nodeid;
+ sb.st_small_ino = (vattr.va_nodeid & UINT_MAX);
+ sb.st_mode = mode;
+ sb.st_nlink = vattr.va_nlink;
+ sb.st_uid = vattr.va_uid;
+ sb.st_gid = vattr.va_gid;
+ sb.st_rdev = vattr.va_rdev;
+ sb.st_size = vattr.va_size;
+ sb.st_blksize = vattr.va_blksize;
+ sb.st_blocks = vattr.va_nblocks;
+ sb.st_atime.ts_sec = vattr.va_atime.tv_sec;
+ sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec;
+ sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec;
+ sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec;
+ sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec;
+ sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec;
+ if (copyout(&sb, outp, sizeof (sb)) != 0) {
+ return (EFAULT);
+ }
+ return (0);
+ } else if (fmt == LXF_STAT64_64) {
+#if defined(_LP64)
+ struct lx_stat64_64 sb;
+
+ bzero(&sb, sizeof (sb));
+ sb.st_dev = vattr.va_fsid;
+ sb.st_ino = vattr.va_nodeid;
+ sb.st_mode = mode;
+ sb.st_nlink = vattr.va_nlink;
+ sb.st_uid = vattr.va_uid;
+ sb.st_gid = vattr.va_gid;
+ sb.st_rdev = vattr.va_rdev;
+ sb.st_size = vattr.va_size;
+ sb.st_blksize = vattr.va_blksize;
+ sb.st_blocks = vattr.va_nblocks;
+ sb.st_atime.ts_sec = vattr.va_atime.tv_sec;
+ sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec;
+ sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec;
+ sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec;
+ sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec;
+ sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec;
+ if (copyout(&sb, outp, sizeof (sb)) != 0) {
+ return (EFAULT);
+ }
+ return (0);
+#else
+ /* Invalid output format on 32-bit */
+ VERIFY(0);
+#endif
+ }
+
+ /* Invalid output format */
+ VERIFY(0);
+ return (0);
+}
+
+long
+lx_stat32(char *name, void *outp)
+{
+ vnode_t *vp = NULL;
+ cred_t *cr = NULL;
+ int error;
+
+ if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) {
+ return (set_errno(error));
+ }
+ error = lx_stat_common(vp, cr, outp, LXF_STAT32, FOLLOW);
+ VN_RELE(vp);
+ crfree(cr);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_fstat32(int fd, void *outp)
+{
+ file_t *fp;
+ int error;
+
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ error = lx_stat_common(fp->f_vnode, fp->f_cred, outp, LXF_STAT32,
+ FOLLOW);
+ releasef(fd);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_lstat32(char *name, void *outp)
+{
+ vnode_t *vp = NULL;
+ cred_t *cr = NULL;
+ int error;
+
+ if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) {
+ return (set_errno(error));
+ }
+ error = lx_stat_common(vp, cr, outp, LXF_STAT32, NO_FOLLOW);
+ VN_RELE(vp);
+ crfree(cr);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_stat64(char *name, void *outp)
+{
+ vnode_t *vp = NULL;
+ cred_t *cr = NULL;
+ model_t model = get_udatamodel();
+ int error;
+
+ if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) {
+ return (set_errno(error));
+ }
+ error = lx_stat_common(vp, cr, outp,
+ (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32, FOLLOW);
+ VN_RELE(vp);
+ crfree(cr);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_fstat64(int fd, void *outp)
+{
+ file_t *fp;
+ model_t model = get_udatamodel();
+ int error;
+
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ error = lx_stat_common(fp->f_vnode, fp->f_cred, outp,
+ (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32, FOLLOW);
+ releasef(fd);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+#define LX_FSTATAT_ALLOWED (LX_AT_SYMLINK_NOFOLLOW | LX_AT_EMPTY_PATH | \
+ LX_AT_NO_AUTOMOUNT)
+
+long
+lx_fstatat64(int fd, char *name, void *outp, int flag)
+{
+ vnode_t *vp = NULL;
+ cred_t *cr = NULL;
+ model_t model = get_udatamodel();
+ enum symfollow follow = FOLLOW;
+ int error;
+ char c;
+
+ if (fd == LX_AT_FDCWD) {
+ fd = AT_FDCWD;
+ }
+ if ((flag & ~LX_FSTATAT_ALLOWED) != 0) {
+ return (set_errno(EINVAL));
+ }
+ if ((flag & LX_AT_NO_AUTOMOUNT) != 0) {
+ /*
+ * While AT_NO_AUTOMOUNT is a legal flag for fstatat64, it is
+ * not yet supported by lx_autofs.
+ */
+ lx_unsupported("fstatat(AT_NO_AUTOMOUNT)");
+ return (set_errno(EINVAL));
+ }
+ if ((flag & LX_AT_SYMLINK_NOFOLLOW) != 0) {
+ follow = NO_FOLLOW;
+ }
+
+ if (copyin(name, &c, sizeof (c)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ if (c == '\0') {
+ if ((flag & LX_AT_EMPTY_PATH) == 0) {
+ return (set_errno(ENOENT));
+ }
+
+ /*
+		 * When AT_EMPTY_PATH is set and an empty string has been
+ * passed for the name parameter, direct the lookup against the
+ * vnode for that fd.
+ */
+ if (fd == AT_FDCWD) {
+ mutex_enter(&curproc->p_lock);
+ vp = PTOU(curproc)->u_cdir;
+ VN_HOLD(vp);
+ mutex_exit(&curproc->p_lock);
+ cr = CRED();
+ crhold(cr);
+ } else {
+ file_t *fp;
+
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ vp = fp->f_vnode;
+ VN_HOLD(vp);
+ cr = fp->f_cred;
+ crhold(cr);
+ releasef(fd);
+ }
+ } else {
+ if ((error = cstatat_getvp(fd, name, follow, &vp, &cr)) != 0) {
+ return (set_errno(error));
+ }
+ }
+
+ error = lx_stat_common(vp, cr, outp,
+ (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32, follow);
+ VN_RELE(vp);
+ crfree(cr);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_lstat64(char *name, void *outp)
+{
+ vnode_t *vp = NULL;
+ cred_t *cr = NULL;
+ model_t model = get_udatamodel();
+ int error;
+
+ if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) {
+ return (set_errno(error));
+ }
+ error = lx_stat_common(vp, cr, outp,
+ (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32,
+ NO_FOLLOW);
+ VN_RELE(vp);
+ crfree(cr);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sync.c b/usr/src/uts/common/brand/lx/syscall/lx_sync.c
new file mode 100644
index 0000000000..614afca0b0
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sync.c
@@ -0,0 +1,86 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+
+long
+lx_syncfs(int fd)
+{
+ file_t *fp;
+ vfs_t *vfsp;
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+
+ vfsp = fp->f_vnode->v_vfsp;
+ releasef(fd);
+
+ (void) (vfsp->vfs_op->vfs_sync)(vfsp, 0, CRED());
+
+ return (0);
+}
+
+#define LX_SYNC_FILE_RANGE_WAIT_BEFORE 0x1
+#define LX_SYNC_FILE_RANGE_WRITE 0x2
+#define LX_SYNC_FILE_RANGE_WAIT_AFTER 0x4
+
+#define LX_SYNC_FILE_RANGE_VALID (LX_SYNC_FILE_RANGE_WAIT_BEFORE | \
+ LX_SYNC_FILE_RANGE_WRITE | LX_SYNC_FILE_RANGE_WAIT_AFTER)
+
+long
+lx_sync_file_range(int fd, off_t offset, off_t nbytes, int flags)
+{
+ file_t *fp;
+ int error, sflags = 0;
+
+ if ((flags & ~LX_SYNC_FILE_RANGE_VALID) != 0)
+ return (set_errno(EINVAL));
+ if (offset < 0 || nbytes < 0)
+ return (set_errno(EINVAL));
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+
+ /*
+ * Since sync_file_range is implemented in terms of VOP_PUTPAGE, both
+ * SYNC_FILE_RANGE_WAIT flags are treated as forcing synchronous
+ * operation. While this differs from the Linux behavior where
+ * BEFORE/AFTER are distinct, it achieves an adequate level of safety
+ * since the requested data is synced out at the end of the call.
+ */
+ if ((flags & (LX_SYNC_FILE_RANGE_WAIT_BEFORE |
+ LX_SYNC_FILE_RANGE_WAIT_AFTER)) == 0) {
+ sflags |= B_ASYNC;
+ }
+
+ error = VOP_PUTPAGE(fp->f_vnode, offset, nbytes, sflags, CRED(), NULL);
+ if (error == ENOSYS) {
+ error = ESPIPE;
+ }
+
+ releasef(fd);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
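+
+/*
+ * Illustrative only (a sketch, not part of this change): a typical Linux
+ * caller flushes a dirty range and waits for completion like this; the
+ * file name is hypothetical.
+ *
+ *	#define _GNU_SOURCE
+ *	#include <fcntl.h>
+ *
+ *	int fd = open("/tmp/data", O_WRONLY);
+ *	(void) sync_file_range(fd, 0, 1 << 20,
+ *	    SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER);
+ */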
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
new file mode 100644
index 0000000000..052ad322a7
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
@@ -0,0 +1,207 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <vm/anon.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/zone.h>
+#include <sys/time.h>
+
+typedef struct lx_sysinfo {
+ int64_t si_uptime; /* Seconds since boot */
+ uint64_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */
+ uint64_t si_totalram; /* Total memory size */
+ uint64_t si_freeram; /* Available memory */
+ uint64_t si_sharedram; /* Shared memory */
+ uint64_t si_bufferram; /* Buffer memory */
+ uint64_t si_totalswap; /* Total swap space */
+ uint64_t si_freeswap; /* Avail swap space */
+ uint16_t si_procs; /* Process count */
+ uint16_t si_pad; /* Padding */
+ uint64_t si_totalhigh; /* High memory size */
+ uint64_t si_freehigh; /* Avail high memory */
+ uint32_t si_mem_unit; /* Unit size of memory fields */
+} lx_sysinfo_t;
+
+#if defined(_SYSCALL32_IMPL)
+/*
+ * 64-bit kernel view of the 32-bit usermode struct.
+ */
+#pragma pack(4)
+typedef struct lx_sysinfo32 {
+ int32_t si_uptime; /* Seconds since boot */
+ uint32_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */
+ uint32_t si_totalram; /* Total memory size */
+ uint32_t si_freeram; /* Available memory */
+ uint32_t si_sharedram; /* Shared memory */
+ uint32_t si_bufferram; /* Buffer memory */
+ uint32_t si_totalswap; /* Total swap space */
+ uint32_t si_freeswap; /* Avail swap space */
+ uint16_t si_procs; /* Process count */
+ uint16_t si_pad; /* Padding */
+ uint32_t si_totalhigh; /* High memory size */
+ uint32_t si_freehigh; /* Avail high memory */
+ uint32_t si_mem_unit; /* Unit size of memory fields */
+ char __si_pad[8];
+} lx_sysinfo32_t;
+#pragma pack()
+#endif
+
+extern pgcnt_t swapfs_minfree;
+
+static void
+lx_sysinfo_common(lx_sysinfo_t *si)
+{
+ zone_t *zone = curzone;
+ pgcnt_t zphysmem, zfreemem;
+ ulong_t ztotswap, zfreeswap;
+
+ si->si_uptime = gethrestime_sec() - zone->zone_boot_time;
+
+ si->si_loads[0] = zone->zone_hp_avenrun[0];
+ si->si_loads[1] = zone->zone_hp_avenrun[1];
+ si->si_loads[2] = zone->zone_hp_avenrun[2];
+
+ /*
+	 * In Linux each thread looks like a process, so we conflate the
+ * two in this stat as well.
+ */
+ si->si_procs = (int32_t)zone->zone_nlwps;
+
+ zone_get_physmem_data(zone->zone_id, &zphysmem, &zfreemem);
+
+ if (zone->zone_max_swap_ctl == UINT64_MAX) {
+ ztotswap = k_anoninfo.ani_max;
+ zfreeswap = k_anoninfo.ani_free;
+ } else {
+ /*
+ * See the comment in swapctl for a description of how free is
+ * calculated within a zone.
+ */
+ rctl_qty_t used;
+ spgcnt_t avail;
+ uint64_t max;
+
+ avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
+ max = k_anoninfo.ani_max + k_anoninfo.ani_mem_resv + avail;
+
+ mutex_enter(&zone->zone_mem_lock);
+ ztotswap = btop(zone->zone_max_swap_ctl);
+ used = btop(zone->zone_max_swap);
+ mutex_exit(&zone->zone_mem_lock);
+
+ zfreeswap = MIN(ztotswap, max) - used;
+ }
+
+ /*
+	 * If the maximum memory stat is less than 2^20 pages (i.e. 4GB),
+ * then we report the result in bytes. Otherwise we use pages.
+ * Once we start supporting >1TB systems/zones, we'll need a third
+ * option.
+ */
+ if (MAX(zphysmem, ztotswap) < 1024 * 1024) {
+ si->si_totalram = ptob(zphysmem);
+ si->si_freeram = ptob(zfreemem);
+ si->si_totalswap = ptob(ztotswap);
+ si->si_freeswap = ptob(zfreeswap);
+ si->si_mem_unit = 1;
+ } else {
+ si->si_totalram = zphysmem;
+ si->si_freeram = zfreemem;
+ si->si_totalswap = ztotswap;
+ si->si_freeswap = zfreeswap;
+ si->si_mem_unit = PAGESIZE;
+ }
+ si->si_bufferram = 0;
+ si->si_sharedram = 0;
+
+ /*
+ * These two stats refer to high physical memory. If an
+ * application running in a Linux zone cares about this, then
+ * either it or we are broken.
+ */
+ si->si_totalhigh = 0;
+ si->si_freehigh = 0;
+}
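+
+/*
+ * Illustrative only (a sketch, not part of this change): Linux callers are
+ * expected to scale the memory fields by si_mem_unit, which the logic above
+ * sets to either 1 (values in bytes) or PAGESIZE (values in pages):
+ *
+ *	#include <sys/sysinfo.h>
+ *
+ *	struct sysinfo si;
+ *	if (sysinfo(&si) == 0) {
+ *		unsigned long long tot =
+ *		    (unsigned long long)si.totalram * si.mem_unit;
+ *	}
+ */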
+
+long
+lx_sysinfo64(caddr_t sip)
+{
+ lx_sysinfo_t si;
+
+ bzero(&si, sizeof (si));
+ lx_sysinfo_common(&si);
+
+ if (copyout(&si, sip, sizeof (si)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
+
+#if defined(_SYSCALL32_IMPL)
+long
+lx_sysinfo32(caddr_t sip)
+{
+ lx_sysinfo_t si;
+ lx_sysinfo32_t si32;
+ int i;
+
+ lx_sysinfo_common(&si);
+
+ /*
+ * Convert the lx_sysinfo_t into the legacy 32-bit view:
+ */
+ bzero(&si32, sizeof (si32));
+ si32.si_uptime = si.si_uptime;
+
+ for (i = 0; i < 3; i++) {
+ if ((si.si_loads[i]) > 0x7fffffff)
+ si32.si_loads[i] = 0x7fffffff;
+ else
+ si32.si_loads[i] = si.si_loads[i];
+ }
+
+ si32.si_procs = si.si_procs;
+ si32.si_totalram = si.si_totalram;
+ si32.si_freeram = si.si_freeram;
+ si32.si_totalswap = si.si_totalswap;
+ si32.si_freeswap = si.si_freeswap;
+ si32.si_mem_unit = si.si_mem_unit;
+
+ si32.si_bufferram = si.si_bufferram;
+ si32.si_sharedram = si.si_sharedram;
+
+ si32.si_totalhigh = si.si_totalhigh;
+ si32.si_freehigh = si.si_freehigh;
+
+ if (copyout(&si32, sip, sizeof (si32)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
+#endif
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c
new file mode 100644
index 0000000000..a84c17e139
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/cpuvar.h>
+#include <sys/archsystm.h>
+#include <sys/proc.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_ldt.h>
+#include <sys/lx_misc.h>
+#include <sys/x86_archext.h>
+#include <sys/controlregs.h>
+#include <lx_syscall.h>
+
+/* ARGSUSED */
+long
+lx_arch_prctl(int code, ulong_t addr)
+{
+#if defined(__amd64)
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *llwp = lwptolxlwp(lwp);
+ pcb_t *pcb = &lwp->lwp_pcb;
+
+ switch (code) {
+ case LX_ARCH_GET_FS:
+ if (copyout(&llwp->br_lx_fsbase, (void *)addr,
+ sizeof (llwp->br_lx_fsbase)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ break;
+
+ case LX_ARCH_SET_FS:
+ llwp->br_lx_fsbase = addr;
+
+ kpreempt_disable();
+ if (pcb->pcb_fsbase != llwp->br_lx_fsbase) {
+ pcb->pcb_fsbase = llwp->br_lx_fsbase;
+
+ /*
+ * Ensure we go out via update_sregs.
+ */
+ PCB_SET_UPDATE_SEGS(pcb);
+ }
+ kpreempt_enable();
+ break;
+
+ case LX_ARCH_GET_GS:
+ if (copyout(&llwp->br_lx_gsbase, (void *)addr,
+ sizeof (llwp->br_lx_gsbase)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ break;
+
+ case LX_ARCH_SET_GS:
+ llwp->br_lx_gsbase = addr;
+
+ kpreempt_disable();
+ if (pcb->pcb_gsbase != llwp->br_lx_gsbase) {
+ pcb->pcb_gsbase = llwp->br_lx_gsbase;
+
+ /*
+ * Ensure we go out via update_sregs.
+ */
+ PCB_SET_UPDATE_SEGS(pcb);
+ }
+ kpreempt_enable();
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+#endif
+
+ return (0);
+}
+
+long
+lx_get_thread_area(struct ldt_info *inf)
+{
+ struct lx_lwp_data *jlwp = ttolxlwp(curthread);
+ struct ldt_info ldt_inf;
+ user_desc_t *dscrp;
+ int entry;
+
+ if (fuword32(&inf->entry_number, (uint32_t *)&entry))
+ return (set_errno(EFAULT));
+
+ if (entry < GDT_TLSMIN || entry > GDT_TLSMAX)
+ return (set_errno(EINVAL));
+
+ dscrp = jlwp->br_tls + entry - GDT_TLSMIN;
+
+ /*
+	 * Convert the Solaris LDT entry to the Linux format expected by
+	 * the caller.
+ */
+ DESC_TO_LDT_INFO(dscrp, &ldt_inf);
+ ldt_inf.entry_number = entry;
+
+ if (copyout(&ldt_inf, inf, sizeof (struct ldt_info)))
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+long
+lx_set_thread_area(struct ldt_info *inf)
+{
+ struct lx_lwp_data *jlwp = ttolxlwp(curthread);
+ struct ldt_info ldt_inf;
+ user_desc_t *dscrp;
+ int entry;
+ int i;
+
+ if (copyin(inf, &ldt_inf, sizeof (ldt_inf)))
+ return (set_errno(EFAULT));
+
+ entry = ldt_inf.entry_number;
+ if (entry == -1) {
+ /*
+ * Find an empty entry in the tls for this thread.
+ * The casts assume each user_desc_t entry is 8 bytes.
+ */
+ for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) {
+ if (((uint_t *)dscrp)[0] == 0 &&
+ ((uint_t *)dscrp)[1] == 0)
+ break;
+ }
+
+ if (i < LX_TLSNUM) {
+ /*
+ * found one
+ */
+ entry = i + GDT_TLSMIN;
+ if (suword32(&inf->entry_number, entry))
+ return (set_errno(EFAULT));
+ } else {
+ return (set_errno(ESRCH));
+ }
+ }
+
+ if (entry < GDT_TLSMIN || entry > GDT_TLSMAX)
+ return (set_errno(EINVAL));
+
+ /*
+	 * Convert the Linux LDT info to a standard Intel descriptor.
+ */
+ dscrp = jlwp->br_tls + entry - GDT_TLSMIN;
+
+ if (LDT_INFO_EMPTY(&ldt_inf)) {
+ ((uint_t *)dscrp)[0] = 0;
+ ((uint_t *)dscrp)[1] = 0;
+ } else {
+ LDT_INFO_TO_DESC(&ldt_inf, dscrp);
+ }
+
+ /*
+	 * Update the GDT with the new descriptor.
+ */
+ kpreempt_disable();
+
+ for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++)
+ lx_set_gdt(GDT_TLSMIN + i, dscrp);
+
+ kpreempt_enable();
+
+ return (0);
+}
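+
+/*
+ * Illustrative only (a sketch, not part of this change; "tls_block" and
+ * "slot" are hypothetical): 32-bit Linux callers conventionally pass
+ * entry_number = -1 to let the kernel pick a free TLS slot, then read the
+ * chosen slot back:
+ *
+ *	struct user_desc ud = { 0 };
+ *	ud.entry_number = -1;
+ *	ud.base_addr = (unsigned int)(uintptr_t)tls_block;
+ *	ud.limit = 0xfffff;
+ *	ud.seg_32bit = 1;
+ *	if (syscall(SYS_set_thread_area, &ud) == 0)
+ *		slot = ud.entry_number;
+ */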
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_time.c b/usr/src/uts/common/brand/lx/syscall/lx_time.c
new file mode 100644
index 0000000000..b9bc8e5ab4
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_time.c
@@ -0,0 +1,72 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017, Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/times.h>
+#include <sys/msacct.h>
+#include <sys/lx_userhz.h>
+
+/* See the comment on LX_USERHZ for more details. */
+#define LX_NSEC_PER_USERHZ (NANOSEC / LX_USERHZ)
+#define NSEC_TO_LX_USERHZ(nsec) ((nsec) / LX_NSEC_PER_USERHZ)
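+
+/*
+ * Worked example (assuming the customary LX_USERHZ of 100): one tick is
+ * 10ms, so LX_NSEC_PER_USERHZ is 10,000,000 and 2.5 seconds of CPU time
+ * (2,500,000,000 ns) converts to 250 USER_HZ ticks.
+ */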
+
+/*
+ * Our times(2) implementation is based on the native times(2), but with
+ * the necessary scaling to adjust to USER_HZ. Also, Linux avoids writing
+ * to a NULL tp, whereas our native code returns EFAULT.
+ */
+long
+lx_times(struct tms *tp)
+{
+ proc_t *p = curproc;
+ struct tms p_time;
+ clock_t ret_lbolt;
+
+ mutex_enter(&p->p_lock);
+ p_time.tms_utime =
+ (clock_t)NSEC_TO_LX_USERHZ(mstate_aggr_state(p, LMS_USER));
+ p_time.tms_stime =
+ (clock_t)NSEC_TO_LX_USERHZ(mstate_aggr_state(p, LMS_SYSTEM));
+ p_time.tms_cutime = HZ_TO_LX_USERHZ(p->p_cutime);
+ p_time.tms_cstime = HZ_TO_LX_USERHZ(p->p_cstime);
+ mutex_exit(&p->p_lock);
+
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ struct tms32 t32;
+
+ t32.tms_utime = p_time.tms_utime;
+ t32.tms_stime = p_time.tms_stime;
+ t32.tms_cutime = p_time.tms_cutime;
+ t32.tms_cstime = p_time.tms_cstime;
+
+ if (tp != NULL && copyout(&t32, tp, sizeof (t32)) != 0)
+ return (set_errno(EFAULT));
+
+ ret_lbolt = ddi_get_lbolt();
+ return ((clock32_t)HZ_TO_LX_USERHZ(ret_lbolt));
+ } else
+#endif /* _SYSCALL32_IMPL */
+ {
+ if (tp != NULL && copyout(&p_time, tp, sizeof (p_time)) != 0)
+ return (set_errno(EFAULT));
+
+ ret_lbolt = ddi_get_lbolt();
+ return (HZ_TO_LX_USERHZ(ret_lbolt));
+ }
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_timer.c b/usr/src/uts/common/brand/lx/syscall/lx_timer.c
new file mode 100644
index 0000000000..279bdbddc7
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_timer.c
@@ -0,0 +1,637 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * The illumos kernel provides two clock backends: CLOCK_REALTIME, the
+ * adjustable system wall clock; and CLOCK_HIGHRES, the monotonically
+ * increasing time source that is not subject to drift or adjustment. By
+ * contrast, the Linux kernel is furnished with an overabundance of narrowly
+ * differentiated clock types.
+ *
+ * Fortunately, most of the commonly used Linux clock types are either similar
+ * enough to the native clock backends that they can be directly mapped, or
+ * represent queries to the per-process and per-LWP microstate counters.
+ *
+ * CLOCK_BOOTTIME is identical to CLOCK_MONOTONIC, except that it takes into
+ * account time that the system is suspended. Since that is uninteresting to
+ * us, we treat it the same.
+ */
+
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/cmn_err.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+#include <lx_signum.h>
+
+/*
+ * From "uts/common/os/timer.c":
+ */
+extern int clock_settime(clockid_t, timespec_t *);
+extern int clock_gettime(clockid_t, timespec_t *);
+extern int clock_getres(clockid_t, timespec_t *);
+extern int nanosleep(timespec_t *, timespec_t *);
+
+
+static int lx_emul_clock_getres(clockid_t, timespec_t *);
+static int lx_emul_clock_gettime(clockid_t, timespec_t *);
+static int lx_emul_clock_settime(clockid_t, timespec_t *);
+
+typedef struct lx_clock_backend {
+ clockid_t lclk_ntv_id;
+ int (*lclk_clock_getres)(clockid_t, timespec_t *);
+ int (*lclk_clock_gettime)(clockid_t, timespec_t *);
+ int (*lclk_clock_settime)(clockid_t, timespec_t *);
+} lx_clock_backend_t;
+
+/*
+ * NOTE: The Linux man pages state this structure is obsolete and is
+ * unsupported, so it is declared here for sizing purposes only.
+ */
+struct lx_timezone {
+ int tz_minuteswest; /* minutes W of Greenwich */
+ int tz_dsttime; /* type of dst correction */
+};
+
+/*
+ * Use the native clock_* system call implementation, but with a translated
+ * clock identifier:
+ */
+#define NATIVE(ntv_id) \
+ { ntv_id, clock_getres, clock_gettime, clock_settime }
+
+/*
+ * This backend is not supported, so we provide an emulation handler:
+ */
+#define EMUL(ntv_id) \
+ { ntv_id, lx_emul_clock_getres, lx_emul_clock_gettime, \
+ lx_emul_clock_settime }
+
+static lx_clock_backend_t lx_clock_backends[] = {
+ NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME */
+ NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC */
+ EMUL(CLOCK_PROCESS_CPUTIME_ID), /* LX_CLOCK_PROCESS_CPUTIME_ID */
+ EMUL(CLOCK_THREAD_CPUTIME_ID), /* LX_CLOCK_THREAD_CPUTIME_ID */
+ NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC_RAW */
+ NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME_COARSE */
+ NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC_COARSE */
+ NATIVE(CLOCK_HIGHRES) /* LX_CLOCK_BOOTTIME */
+};
+
+#define LX_CLOCK_MAX \
+ (sizeof (lx_clock_backends) / sizeof (lx_clock_backends[0]))
+#define LX_CLOCK_BACKEND(clk) (((clk) < LX_CLOCK_MAX && (clk) >= 0) ? \
+ &lx_clock_backends[(clk)] : NULL)
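+
+/*
+ * Informational note (not part of this change): the table above is indexed
+ * directly by the Linux clockid_t value, which on Linux runs from
+ * CLOCK_REALTIME (0) through CLOCK_BOOTTIME (7) in the order of the
+ * per-entry comments.  For example, a Linux clock_gettime(4, &ts) call
+ * (CLOCK_MONOTONIC_RAW) is served by the native CLOCK_HIGHRES backend.
+ */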
+
+/*
+ * Linux defines the size of the sigevent structure to be 64 bytes. In order
+ * to meet that definition, the trailing union includes a member which pads it
+ * out to the desired length for the given architecture.
+ */
+#define LX_SIGEV_PAD_SIZE ((64 - \
+ (sizeof (int) * 2 + sizeof (union sigval))) / sizeof (int))
+
+typedef struct {
+ union sigval lx_sigev_value;
+ int lx_sigev_signo;
+ int lx_sigev_notify;
+ union {
+ int lx_pad[LX_SIGEV_PAD_SIZE];
+ int lx_tid;
+ struct {
+ void (*lx_notify_function)(union sigval);
+ void *lx_notify_attribute;
+ } lx_sigev_thread;
+ } lx_sigev_un;
+} lx_sigevent_t;
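+
+/*
+ * A compile-time sanity sketch (assuming the CTASSERT macro from
+ * <sys/debug.h>; not part of this change) for the 64-byte Linux layout
+ * described above:
+ *
+ *	CTASSERT(sizeof (lx_sigevent_t) == 64);
+ */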
+
+
+#ifdef _SYSCALL32_IMPL
+
+#define LX_SIGEV32_PAD_SIZE ((64 - \
+ (sizeof (int) * 2 + sizeof (union sigval32))) / sizeof (int))
+
+typedef struct {
+ union sigval32 lx_sigev_value;
+ int lx_sigev_signo;
+ int lx_sigev_notify;
+ union {
+ int lx_pad[LX_SIGEV32_PAD_SIZE];
+ int lx_tid;
+ struct {
+ caddr32_t lx_notify_function;
+ caddr32_t lx_notify_attribute;
+ } lx_sigev_thread;
+ } lx_sigev_un;
+} lx_sigevent32_t;
+
+#endif /* _SYSCALL32_IMPL */
+
+#define LX_SIGEV_SIGNAL 0
+#define LX_SIGEV_NONE 1
+#define LX_SIGEV_THREAD 2
+#define LX_SIGEV_THREAD_ID 4
+
+/*
+ * Access private SIGEV_THREAD_ID callback state in itimer_t
+ */
+#define LX_SIGEV_THREAD_ID_LPID(it) ((it)->it_cb_data[0])
+#define LX_SIGEV_THREAD_ID_TID(it) ((it)->it_cb_data[1])
+
+
+/* ARGSUSED */
+static int
+lx_emul_clock_settime(clockid_t clock, timespec_t *tp)
+{
+ return (set_errno(EINVAL));
+}
+
+static int
+lx_emul_clock_gettime(clockid_t clock, timespec_t *tp)
+{
+ timespec_t t;
+
+ switch (clock) {
+ case CLOCK_PROCESS_CPUTIME_ID: {
+ proc_t *p = ttoproc(curthread);
+ hrtime_t snsecs, unsecs;
+
+ /*
+ * Based on getrusage() in "rusagesys.c":
+ */
+ mutex_enter(&p->p_lock);
+ unsecs = mstate_aggr_state(p, LMS_USER);
+ snsecs = mstate_aggr_state(p, LMS_SYSTEM);
+ mutex_exit(&p->p_lock);
+
+ hrt2ts(unsecs + snsecs, &t);
+ break;
+ }
+
+ case CLOCK_THREAD_CPUTIME_ID: {
+ klwp_t *lwp = ttolwp(curthread);
+ struct mstate *ms = &lwp->lwp_mstate;
+ hrtime_t snsecs, unsecs;
+
+ /*
+ * Based on getrusage_lwp() in "rusagesys.c":
+ */
+ unsecs = ms->ms_acct[LMS_USER];
+ snsecs = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
+
+ scalehrtime(&unsecs);
+ scalehrtime(&snsecs);
+
+ hrt2ts(unsecs + snsecs, &t);
+ break;
+ }
+
+ default:
+ return (set_errno(EINVAL));
+ }
+
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ timespec32_t t32;
+
+ if (TIMESPEC_OVERFLOW(&t)) {
+ return (set_errno(EOVERFLOW));
+ }
+ TIMESPEC_TO_TIMESPEC32(&t32, &t);
+
+ if (copyout(&t32, tp, sizeof (t32)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+ }
+#endif
+
+ if (copyout(&t, tp, sizeof (t)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
+
+static int
+lx_emul_clock_getres(clockid_t clock, timespec_t *tp)
+{
+ timespec_t t;
+
+ if (tp == NULL) {
+ return (0);
+ }
+
+ switch (clock) {
+ case CLOCK_PROCESS_CPUTIME_ID:
+ case CLOCK_THREAD_CPUTIME_ID:
+ /*
+ * These clock backends return microstate accounting values for
+ * the LWP or the entire process. The Linux kernel claims they
+ * have nanosecond resolution; so will we.
+ */
+ t.tv_sec = 0;
+ t.tv_nsec = 1;
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ timespec32_t t32;
+
+ if (TIMESPEC_OVERFLOW(&t)) {
+ return (set_errno(EOVERFLOW));
+ }
+ TIMESPEC_TO_TIMESPEC32(&t32, &t);
+
+ if (copyout(&t32, tp, sizeof (t32)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+ }
+#endif
+
+ if (copyout(&t, tp, sizeof (t)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
+
+static void
+lx_clock_unsupported(int clock)
+{
+ char buf[100];
+
+ (void) snprintf(buf, sizeof (buf), "unsupported clock: %d", clock);
+ lx_unsupported(buf);
+}
+
+long
+lx_clock_settime(int clock, timespec_t *tp)
+{
+ lx_clock_backend_t *backend;
+
+ if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) {
+ lx_clock_unsupported(clock);
+ return (set_errno(EINVAL));
+ }
+
+ return (backend->lclk_clock_settime(backend->lclk_ntv_id, tp));
+}
+
+long
+lx_clock_gettime(int clock, timespec_t *tp)
+{
+ lx_clock_backend_t *backend;
+
+ if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) {
+ lx_clock_unsupported(clock);
+ return (set_errno(EINVAL));
+ }
+
+ return (backend->lclk_clock_gettime(backend->lclk_ntv_id, tp));
+}
+
+long
+lx_clock_getres(int clock, timespec_t *tp)
+{
+ lx_clock_backend_t *backend;
+
+ if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) {
+ lx_clock_unsupported(clock);
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * It is important this check is performed after the clock
+ * check. Both glibc and musl, in their clock_getcpuclockid(),
+ * use clock_getres() with a NULL tp to validate a clock
+ * value. Performing the tp check before the clock check could
+ * indicate a valid clock to libc when it shouldn't.
+ */
+ if (tp == NULL) {
+ return (0);
+ }
+
+ return (backend->lclk_clock_getres(backend->lclk_ntv_id, tp));
+}
+
+static int
+lx_ltos_sigev(lx_sigevent_t *lev, struct sigevent *sev)
+{
+ bzero(sev, sizeof (*sev));
+
+ switch (lev->lx_sigev_notify) {
+ case LX_SIGEV_NONE:
+ sev->sigev_notify = SIGEV_NONE;
+ break;
+
+ case LX_SIGEV_SIGNAL:
+ case LX_SIGEV_THREAD_ID:
+ sev->sigev_notify = SIGEV_SIGNAL;
+ break;
+
+ case LX_SIGEV_THREAD:
+ /*
+ * Just as in illumos, SIGEV_THREAD handling is performed in
+ * userspace with the help of SIGEV_SIGNAL/SIGEV_THREAD_ID.
+ *
+ * It's not expected to make an appearance in the syscall.
+ */
+ default:
+ return (EINVAL);
+ }
+
+ sev->sigev_signo = lx_ltos_signo(lev->lx_sigev_signo, 0);
+ sev->sigev_value = lev->lx_sigev_value;
+
+ /* Ensure SIGEV_SIGNAL has a valid signo to work with. */
+ if (sev->sigev_notify == SIGEV_SIGNAL && sev->sigev_signo == 0) {
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+lx_sigev_copyin(lx_sigevent_t *userp, lx_sigevent_t *levp)
+{
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ lx_sigevent32_t lev32;
+
+ if (copyin(userp, &lev32, sizeof (lev32)) != 0) {
+ return (EFAULT);
+ }
+ levp->lx_sigev_value.sival_int = lev32.lx_sigev_value.sival_int;
+ levp->lx_sigev_signo = lev32.lx_sigev_signo;
+ levp->lx_sigev_notify = lev32.lx_sigev_notify;
+ levp->lx_sigev_un.lx_tid = lev32.lx_sigev_un.lx_tid;
+ } else
+#endif /* _SYSCALL32_IMPL */
+ {
+ if (copyin(userp, levp, sizeof (lx_sigevent_t)) != 0) {
+ return (EFAULT);
+ }
+ }
+ return (0);
+}
+
+static void
+lx_sigev_thread_fire(itimer_t *it)
+{
+ proc_t *p = it->it_proc;
+ pid_t lpid = (pid_t)LX_SIGEV_THREAD_ID_LPID(it);
+ id_t tid = (id_t)LX_SIGEV_THREAD_ID_TID(it);
+ lwpdir_t *ld;
+
+ ASSERT(MUTEX_HELD(&it->it_mutex));
+ ASSERT(it->it_pending == 0);
+ ASSERT(it->it_flags & IT_SIGNAL);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ ld = lwp_hash_lookup(p, tid);
+ if (ld != NULL) {
+ lx_lwp_data_t *lwpd;
+ kthread_t *t;
+
+ t = ld->ld_entry->le_thread;
+ lwpd = ttolxlwp(t);
+ if (lwpd != NULL && lwpd->br_pid == lpid) {
+ /*
+ * A thread matching the LX pid is still present in the
+ * process. Send a targeted signal as requested.
+ */
+ it->it_pending = 1;
+ mutex_exit(&it->it_mutex);
+ sigaddqa(p, t, it->it_sigq);
+ return;
+ }
+ }
+
+ mutex_exit(&it->it_mutex);
+}
+
+long
+lx_timer_create(int clock, lx_sigevent_t *sevp, timer_t *tidp)
+{
+ int error;
+ lx_sigevent_t lev;
+ struct sigevent sev;
+ clock_backend_t *backend = NULL;
+ proc_t *p = curproc;
+ itimer_t *itp;
+ timer_t tid;
+
+ if (clock == -2) {
+ /*
+ * A change was made to the old userspace timer emulation to
+ * handle this specific clock ID for MapR. It was wrongly
+		 * mapped to CLOCK_REALTIME rather than the
+		 * CLOCK_THREAD_CPUTIME_ID it actually corresponds to. Until
+		 * the CLOCK_*_CPUTIME_ID timers can
+ * be emulated, the admittedly incorrect mapping will remain.
+ */
+ backend = clock_get_backend(CLOCK_REALTIME);
+ } else {
+ lx_clock_backend_t *lback = LX_CLOCK_BACKEND(clock);
+
+ if (lback != NULL) {
+ backend = clock_get_backend(lback->lclk_ntv_id);
+ }
+ }
+ if (backend == NULL) {
+ return (set_errno(EINVAL));
+ }
+
+ /* We have to convert the Linux sigevent layout to the illumos layout */
+ if (sevp != NULL) {
+ if ((error = lx_sigev_copyin(sevp, &lev)) != 0) {
+ return (set_errno(error));
+ }
+ if ((error = lx_ltos_sigev(&lev, &sev)) != 0) {
+ return (set_errno(error));
+ }
+ } else {
+ bzero(&sev, sizeof (sev));
+ sev.sigev_notify = SIGEV_SIGNAL;
+ sev.sigev_signo = SIGALRM;
+ }
+
+ if ((error = timer_setup(backend, &sev, NULL, &itp, &tid)) != 0) {
+ return (set_errno(error));
+ }
+
+ /*
+ * The SIGEV_THREAD_ID notification method in Linux allows the caller
+ * to target a specific thread to receive the signal. The IT_CALLBACK
+ * timer functionality is used to fulfill this need. After translating
+ * the LX pid to a SunOS thread ID (ensuring it exists in the current
+ * process), those IDs are attached to the timer along with the custom
+ * lx_sigev_thread_fire callback. This targets the signal notification
+ * properly when the timer fires.
+ */
+ if (lev.lx_sigev_notify == LX_SIGEV_THREAD_ID) {
+ pid_t lpid, spid;
+ id_t stid;
+
+ lpid = (pid_t)lev.lx_sigev_un.lx_tid;
+ if (lx_lpid_to_spair(lpid, &spid, &stid) != 0 ||
+ spid != curproc->p_pid) {
+ error = EINVAL;
+ goto err;
+ }
+
+ itp->it_flags |= IT_CALLBACK;
+ itp->it_cb_func = lx_sigev_thread_fire;
+ LX_SIGEV_THREAD_ID_LPID(itp) = lpid;
+ LX_SIGEV_THREAD_ID_TID(itp) = stid;
+ }
+
+ /*
+ * When the sigevent is not specified, its sigev_value field is
+ * expected to be populated with the timer ID.
+ */
+ if (sevp == NULL) {
+ itp->it_sigq->sq_info.si_value.sival_int = tid;
+ }
+
+ if (copyout(&tid, tidp, sizeof (timer_t)) != 0) {
+ error = EFAULT;
+ goto err;
+ }
+
+ timer_release(p, itp);
+ return (0);
+
+err:
+ timer_delete_grabbed(p, tid, itp);
+ return (set_errno(error));
+}
+
+long
+lx_gettimeofday(struct timeval *tvp, struct lx_timezone *tzp)
+{
+ struct lx_timezone tz;
+
+ bzero(&tz, sizeof (tz));
+
+ /*
+ * We want to be similar to libc which just does a fasttrap to
+ * gethrestime and simply converts that result. We follow how uniqtime
+ * does the conversion but we can't use that code since it does some
+ * extra work which can cause the result to bounce around based on which
+ * CPU we run on.
+ */
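+	/*
+	 * The shift sequence below is a division by 1000 done without a
+	 * divide: it approximates usec = nsec * 1.024 and then shifts
+	 * right by 10, i.e. nsec * (1.024 / 1024) == nsec / 1000.
+	 */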
+ if (tvp != NULL) {
+ struct timeval tv;
+ timestruc_t ts;
+ int usec, nsec;
+
+ gethrestime(&ts);
+ nsec = ts.tv_nsec;
+ usec = nsec + (nsec >> 2);
+ usec = nsec + (usec >> 1);
+ usec = nsec + (usec >> 2);
+ usec = nsec + (usec >> 4);
+ usec = nsec - (usec >> 3);
+ usec = nsec + (usec >> 2);
+ usec = nsec + (usec >> 3);
+ usec = nsec + (usec >> 4);
+ usec = nsec + (usec >> 1);
+ usec = nsec + (usec >> 6);
+ usec = usec >> 10;
+
+ tv.tv_sec = ts.tv_sec;
+ tv.tv_usec = usec;
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyout(&tv, tvp, sizeof (tv)) != 0)
+ return (set_errno(EFAULT));
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ struct timeval32 tv32;
+
+ if (TIMEVAL_OVERFLOW(&tv))
+ return (set_errno(EOVERFLOW));
+ TIMEVAL_TO_TIMEVAL32(&tv32, &tv);
+
+ if (copyout(&tv32, tvp, sizeof (tv32)))
+ return (set_errno(EFAULT));
+ }
+#endif
+ }
+
+ /*
+ * The Linux man page states use of the second parameter is obsolete,
+ * but gettimeofday(2) should still return EFAULT if it is set
+ * to a bad non-NULL pointer (sigh...)
+ */
+ if (tzp != NULL && copyout(&tz, tzp, sizeof (tz)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+/*
+ * On Linux a bad buffer will set errno to EFAULT, and on illumos the failure
+ * mode is documented as "undefined."
+ */
+long
+lx_time(time_t *tp)
+{
+ timestruc_t ts;
+ struct timeval tv;
+
+ gethrestime(&ts);
+ tv.tv_sec = ts.tv_sec;
+ tv.tv_usec = 0;
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (tp != NULL &&
+ copyout(&tv.tv_sec, tp, sizeof (tv.tv_sec)) != 0)
+ return (set_errno(EFAULT));
+
+ return (tv.tv_sec);
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ struct timeval32 tv32;
+
+ if (TIMEVAL_OVERFLOW(&tv))
+ return (set_errno(EOVERFLOW));
+ TIMEVAL_TO_TIMEVAL32(&tv32, &tv);
+
+ if (tp != NULL &&
+ copyout(&tv32.tv_sec, tp, sizeof (tv32.tv_sec)))
+ return (set_errno(EFAULT));
+
+ return (tv32.tv_sec);
+ }
+#endif /* _SYSCALL32_IMPL */
+ /* NOTREACHED */
+}
+
+long
+lx_nanosleep(timespec_t *rqtp, timespec_t *rmtp)
+{
+ return (nanosleep(rqtp, rmtp));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_umask.c b/usr/src/uts/common/brand/lx/syscall/lx_umask.c
new file mode 100644
index 0000000000..cb5e4ed232
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_umask.c
@@ -0,0 +1,52 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/lx_misc.h>
+#include <lx_syscall.h>
+
+/* From usr/src/uts/common/syscall/umask.c */
+extern int umask(int);
+
+/*
+ * Just do what umask() does, but for the given process.
+ */
+static int
+lx_clone_umask_cb(proc_t *pp, void *arg)
+{
+ mode_t cmask = (mode_t)(intptr_t)arg;
+ mode_t orig;
+
+ orig = PTOU(pp)->u_cmask;
+ PTOU(pp)->u_cmask = (mode_t)(cmask & PERMMASK);
+ return ((int)orig);
+}
+
+long
+lx_umask(mode_t cmask)
+{
+ lx_proc_data_t *lproc = ttolxproc(curthread);
+
+ /* Handle the rare case of being in a CLONE_FS clone group */
+ if (lx_clone_grp_member(lproc, LX_CLONE_FS)) {
+ int omask;
+
+ omask = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_umask_cb,
+ (void *)(intptr_t)cmask);
+ return (omask);
+ }
+
+ return (umask(cmask));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_uname.c b/usr/src/uts/common/brand/lx/syscall/lx_uname.c
new file mode 100644
index 0000000000..2d18408eaa
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_uname.c
@@ -0,0 +1,82 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+
+struct lx_utsname {
+ char lxu_sysname[LX_SYS_UTS_LN];
+ char lxu_nodename[LX_SYS_UTS_LN];
+ char lxu_release[LX_SYS_UTS_LN];
+ char lxu_version[LX_SYS_UTS_LN];
+ char lxu_machine[LX_SYS_UTS_LN];
+ char lxu_domainname[LX_SYS_UTS_LN];
+};
+
+long
+lx_uname(void *uptr)
+{
+ proc_t *p = curproc;
+ lx_proc_data_t *lxpd = ptolxproc(p);
+ lx_zone_data_t *lxzd = ztolxzd(p->p_zone);
+ struct lx_utsname un;
+
+ bzero(&un, sizeof (un));
+
+ (void) strlcpy(un.lxu_sysname, LX_UNAME_SYSNAME, LX_SYS_UTS_LN);
+ (void) strlcpy(un.lxu_nodename, p->p_zone->zone_nodename,
+ LX_SYS_UTS_LN);
+
+ mutex_enter(&lxzd->lxzd_lock);
+
+ if (lxpd->l_uname_release[0] != '\0') {
+ (void) strlcpy(un.lxu_release, lxpd->l_uname_release,
+ LX_SYS_UTS_LN);
+ } else {
+ (void) strlcpy(un.lxu_release, lxzd->lxzd_kernel_release,
+ LX_SYS_UTS_LN);
+ }
+ if (lxpd->l_uname_version[0] != '\0') {
+ (void) strlcpy(un.lxu_version, lxpd->l_uname_version,
+ LX_SYS_UTS_LN);
+ } else {
+ (void) strlcpy(un.lxu_version, lxzd->lxzd_kernel_version,
+ LX_SYS_UTS_LN);
+ }
+
+ mutex_exit(&lxzd->lxzd_lock);
+
+ if (get_udatamodel() == DATAMODEL_LP64) {
+ (void) strlcpy(un.lxu_machine, LX_UNAME_MACHINE64,
+ LX_SYS_UTS_LN);
+ } else {
+ (void) strlcpy(un.lxu_machine, LX_UNAME_MACHINE32,
+ LX_SYS_UTS_LN);
+ }
+ (void) strlcpy(un.lxu_domainname, p->p_zone->zone_domain,
+ LX_SYS_UTS_LN);
+
+ if (copyout(&un, uptr, sizeof (un)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_wait.c b/usr/src/uts/common/brand/lx/syscall/lx_wait.c
new file mode 100644
index 0000000000..e8358f9f69
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_wait.c
@@ -0,0 +1,377 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * wait() family of functions.
+ *
+ * The first minor difference between the Linux and Solaris family of wait()
+ * calls is that the values for WNOHANG and WUNTRACED are different. Thankfully,
+ * the exit status values are identical between the two implementations.
+ *
+ * Things get very different and very complicated when we introduce the Linux
+ * threading model. Under Linux, both threads and child processes are
+ * represented as processes. However, the behavior of wait() with respect to
+ * each child varies according to the flags given to clone()
+ *
+ * SIGCHLD The SIGCHLD signal should be sent on termination
+ * CLONE_THREAD The child shares the same thread group as the parent
+ * CLONE_DETACHED The parent receives no notification when the child exits
+ *
+ * The following flags control the Linux behavior w.r.t. the above attributes:
+ *
+ * __WALL Wait on all children, regardless of type
+ * __WCLONE Wait only on non-SIGCHLD children
+ * __WNOTHREAD Don't wait on children of other threads in this group
+ *
+ * The following chart shows whether wait() returns when the child exits:
+ *
+ * default __WCLONE __WALL
+ * no SIGCHLD - X X
+ * SIGCHLD X - X
+ *
+ * The following chart shows whether wait() returns when the grandchild exits:
+ *
+ * default __WNOTHREAD
+ * no CLONE_THREAD - -
+ * CLONE_THREAD X -
+ *
+ * The CLONE_DETACHED flag is universal - when the child exits, no state is
+ * stored and wait() has no effect.
+ *
+ * XXX Support the above combination of options, or some reasonable subset that
+ * covers at least fork() and pthread_create().
+ */
+
+#include <sys/wait.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/lx_misc.h>
+#include <lx_signum.h>
+#include <lx_errno.h>
+#include <lx_syscall.h>
+
+/*
+ * From "uts/common/os/exit.c" and "uts/common/syscall/rusagesys.c":
+ */
+extern int waitid(idtype_t, id_t, k_siginfo_t *, int);
+extern int rusagesys(int, void *, void *, void *, void *);
+
+/*
+ * Convert between Linux options and Solaris options, returning -1 if any
+ * invalid flags are found.
+ */
+#define LX_WNOHANG 0x00000001
+#define LX_WUNTRACED 0x00000002
+#define LX_WSTOPPED LX_WUNTRACED
+#define LX_WEXITED 0x00000004
+#define LX_WCONTINUED 0x00000008
+#define LX_WNOWAIT 0x01000000
+
+#define LX_WNOTHREAD 0x20000000
+#define LX_WALL 0x40000000
+#define LX_WCLONE 0x80000000
+
+#define LX_P_ALL 0x0
+#define LX_P_PID 0x1
+#define LX_P_GID 0x2
+
+/*
+ * Split the passed waitpid/waitid options into two separate variables:
+ * those for the native illumos waitid(2), and the extra Linux-specific
+ * options we will handle in our brand-specific code.
+ */
+static int
+ltos_options(uintptr_t options, int *native_options, int *extra_options)
+{
+ int newoptions = 0;
+
+ if (((options) & ~(LX_WNOHANG | LX_WUNTRACED | LX_WEXITED |
+ LX_WCONTINUED | LX_WNOWAIT | LX_WNOTHREAD | LX_WALL |
+ LX_WCLONE)) != 0) {
+ return (-1);
+ }
+
+ *extra_options = options & (LX_WNOTHREAD | LX_WALL | LX_WCLONE);
+
+ if (options & LX_WNOHANG)
+ newoptions |= WNOHANG;
+ if (options & LX_WUNTRACED)
+ newoptions |= WUNTRACED;
+ if (options & LX_WEXITED)
+ newoptions |= WEXITED;
+ if (options & LX_WCONTINUED)
+ newoptions |= WCONTINUED;
+ if (options & LX_WNOWAIT)
+ newoptions |= WNOWAIT;
+
+ /*
+ * The trapped option is implicit on Linux.
+ */
+ newoptions |= WTRAPPED;
+
+ *native_options = newoptions;
+ return (0);
+}
+
+static int
+lx_wstat(int code, int status)
+{
+ int stat = 0;
+
+ switch (code) {
+ case CLD_EXITED:
+ stat = status << 8;
+ break;
+ case CLD_DUMPED:
+ stat = lx_stol_signo(status, SIGKILL) | WCOREFLG;
+ break;
+ case CLD_KILLED:
+ stat = lx_stol_signo(status, SIGKILL);
+ break;
+ case CLD_TRAPPED:
+ case CLD_STOPPED:
+ stat = (lx_stol_status(status, SIGKILL) << 8) | WSTOPFLG;
+ break;
+ case CLD_CONTINUED:
+ stat = WCONTFLG;
+ break;
+ }
+
+ return (stat);
+}
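+
+/*
+ * Worked examples (informational, not part of this change) of the shared
+ * status encoding produced above:
+ *
+ *	exit(3)			-> 0x0300	(code << 8)
+ *	killed by SIGKILL	-> 0x0009	(signo)
+ *	SIGKILL plus core dump	-> 0x0089	(signo | WCOREFLG)
+ *	stopped by SIGTSTP	-> 0x147f	(signo << 8 | WSTOPFLG)
+ *	continued		-> 0xffff	(WCONTFLG)
+ */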
+
+static int
+lx_call_waitid(idtype_t idtype, id_t id, k_siginfo_t *sip, int native_options,
+ int extra_options)
+{
+ lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+ int error;
+
+ /*
+ * Our brand-specific waitid helper only understands a subset of
+ * the possible idtypes. Ensure we keep to that subset here:
+ */
+ if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) {
+ return (EINVAL);
+ }
+
+ /*
+ * Enable the return of emulated ptrace(2) stop conditions
+ * through lx_waitid_helper, and stash the Linux-specific
+ * extra waitid() flags.
+ */
+ lwpd->br_waitid_emulate = B_TRUE;
+ lwpd->br_waitid_flags = extra_options;
+
+ if ((error = waitid(idtype, id, sip, native_options)) == EINTR) {
+ /*
+ * According to signal(7), the wait4(2), waitid(2), and
+ * waitpid(2) system calls are restartable.
+ */
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+
+ lwpd->br_waitid_emulate = B_FALSE;
+ lwpd->br_waitid_flags = 0;
+
+ return (error);
+}
+
+long
+lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
+{
+ k_siginfo_t info = { 0 };
+ idtype_t idtype;
+ id_t id;
+ int status = 0;
+ pid_t pid = (pid_t)p1;
+ int error;
+ int native_options, extra_options;
+ int *statusp = (int *)p2;
+ void *rup = (void *)p4;
+
+ if (ltos_options(p3, &native_options, &extra_options) == -1) {
+ return (set_errno(EINVAL));
+ }
+
+ if (pid > maxpid) {
+ return (set_errno(ECHILD));
+ }
+
+ /*
+ * While not listed as a valid return code, Linux's wait4(2) does,
+ * in fact, get an EFAULT if either the status pointer or rusage
+ * pointer is invalid. Since a failed waitpid should leave the
+ * child process in a state where a future wait4(2) will succeed, we
+ * check them by copying out the values their buffers originally
+ * contained. (We need to do this as a failed system call should
+ * never affect the contents of a passed buffer.)
+ *
+ * This will fail if the buffers in question are write-only.
+ */
+ if (statusp != NULL) {
+ if (copyin(statusp, &status, sizeof (status)) != 0 ||
+ copyout(&status, statusp, sizeof (status)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ }
+
+ /*
+ * Do the same check for the "struct rusage" pointer, which differs
+ * in size for 32- and 64-bit processes.
+ */
+ if (rup != NULL) {
+ struct rusage ru;
+ void *krup = &ru;
+ size_t rusz = sizeof (ru);
+#if defined(_SYSCALL32_IMPL)
+ struct rusage32 ru32;
+
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ krup = &ru32;
+ rusz = sizeof (ru32);
+ }
+#endif
+
+ if (copyin(rup, krup, rusz) != 0 ||
+ copyout(krup, rup, rusz) != 0) {
+ return (set_errno(EFAULT));
+ }
+ }
+
+ if (pid < -1) {
+ idtype = P_PGID;
+ id = -pid;
+ } else if (pid == -1) {
+ idtype = P_ALL;
+ id = 0;
+ } else if (pid == 0) {
+ idtype = P_PGID;
+ mutex_enter(&pidlock);
+ id = curproc->p_pgrp;
+ mutex_exit(&pidlock);
+ } else {
+ idtype = P_PID;
+ id = pid;
+ }
+
+ native_options |= (WEXITED | WTRAPPED);
+
+ if ((error = lx_call_waitid(idtype, id, &info, native_options,
+ extra_options)) != 0) {
+ return (set_errno(error));
+ }
+
+ /*
+	 * If the WNOHANG flag was specified and no child was found, return 0.
+ */
+ if ((native_options & WNOHANG) && info.si_pid == 0) {
+ return (0);
+ }
+
+ status = lx_wstat(info.si_code, info.si_status);
+
+ /*
+	 * Unfortunately, if this attempt to copy out either the status or the
+ * rusage fails, the process will be in an inconsistent state as
+ * subsequent calls to wait for the same child will fail where they
+ * should succeed on a Linux system. This, however, is rather
+ * unlikely since we tested the validity of both above.
+ */
+ if (statusp != NULL) {
+ if (copyout(&status, statusp, sizeof (status)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ }
+
+ if (rup != NULL) {
+ if ((error = rusagesys(_RUSAGESYS_GETRUSAGE_CHLD, rup, NULL,
+ NULL, NULL)) != 0) {
+ return (set_errno(error));
+ }
+ }
+
+ return (info.si_pid);
+}
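+
+/*
+ * The idtype/id selection above follows the Linux wait4(2) convention for
+ * the pid argument; restated here for reference:
+ *
+ *	wait4(-123, ...)	any child in process group 123
+ *	wait4(-1, ...)		any child
+ *	wait4(0, ...)		any child in the caller's process group
+ *	wait4(456, ...)		the child with pid 456
+ */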
+
+long
+lx_waitpid(uintptr_t p1, uintptr_t p2, uintptr_t p3)
+{
+ return (lx_wait4(p1, p2, p3, NULL));
+}
+
+long
+lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt)
+{
+ int error;
+ int native_options, extra_options;
+ k_siginfo_t info = { 0 };
+
+ if (ltos_options(opt, &native_options, &extra_options) == -1) {
+ return (set_errno(EINVAL));
+ }
+
+ if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0) {
+ return (set_errno(EINVAL));
+ }
+
+ switch (idtype) {
+ case LX_P_ALL:
+ idtype = P_ALL;
+ break;
+ case LX_P_PID:
+ idtype = P_PID;
+ break;
+ case LX_P_GID:
+ idtype = P_PGID;
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ if ((error = lx_call_waitid(idtype, id, &info, native_options,
+ extra_options)) != 0) {
+ return (set_errno(error));
+ }
+
+ /*
+	 * If the WNOHANG flag was specified and no child was found, return 0.
+ */
+ if ((native_options & WNOHANG) && info.si_pid == 0) {
+ return (0);
+ }
+
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ return (stol_ksiginfo32_copyout(&info, (void *)infop));
+ } else
+#endif
+ {
+ return (stol_ksiginfo_copyout(&info, (void *)infop));
+ }
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_xattr.c b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c
new file mode 100644
index 0000000000..19bf9a4ebb
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c
@@ -0,0 +1,519 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/vnode.h>
+#include <sys/pathname.h>
+#include <sys/lx_acl.h>
+
+
+#define LX_XATTR_NAME_MAX 255
+#define LX_XATTR_SIZE_MAX 65536
+#define LX_XATTR_LIST_MAX 65536
+
+#define LX_XATTR_FLAG_CREATE 0x1
+#define LX_XATTR_FLAG_REPLACE 0x2
+#define LX_XATTR_FLAGS_VALID (LX_XATTR_FLAG_CREATE | LX_XATTR_FLAG_REPLACE)
+
+enum lx_xattr_ns {
+ LX_XATTR_NS_SECURITY,
+ LX_XATTR_NS_SYSTEM,
+ LX_XATTR_NS_TRUSTED,
+ LX_XATTR_NS_USER,
+ LX_XATTR_NS_INVALID /* Catch-all for invalid namespaces */
+};
+
+/* Present under the 'security.' namespace */
+#define LX_XATTR_CAPABILITY "capability"
+
+typedef struct lx_xattr_ns_list {
+ const char *lxnl_name;
+ unsigned lxnl_len;
+ enum lx_xattr_ns lxnl_ns;
+} lx_xattr_ns_list_t;
+
+static lx_xattr_ns_list_t lx_xattr_namespaces[] = {
+ { "user.", 5, LX_XATTR_NS_USER },
+ { "system.", 7, LX_XATTR_NS_SYSTEM },
+ { "trusted.", 8, LX_XATTR_NS_TRUSTED },
+ { "security.", 9, LX_XATTR_NS_SECURITY },
+ { NULL, 0, LX_XATTR_NS_INVALID }
+};
+
+static int
+lx_xattr_parse(const char *name, size_t nlen, const char **key)
+{
+ lx_xattr_ns_list_t *lxn = lx_xattr_namespaces;
+
+ for (; lxn->lxnl_name != NULL; lxn++) {
+ if (nlen < lxn->lxnl_len) {
+ continue;
+ }
+ if (strncmp(lxn->lxnl_name, name, lxn->lxnl_len) == 0) {
+ *key = name + (lxn->lxnl_len);
+ return (lxn->lxnl_ns);
+ }
+ }
+
+ *key = name;
+ return (LX_XATTR_NS_INVALID);
+}
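+
+/*
+ * For example, given the table above,
+ * lx_xattr_parse("user.foo", sizeof ("user.foo"), &key) returns
+ * LX_XATTR_NS_USER with key pointing at "foo", while a name with no
+ * recognized prefix, e.g. "bogus.foo", yields LX_XATTR_NS_INVALID with
+ * key left pointing at the full name.
+ */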
+
+/*
+ * *xattr() family of functions.
+ *
+ * These are largely unimplemented. In most cases we return EOPNOTSUPP rather
+ * than using NOSYS_NO_EQUIV, so as to avoid unwanted stderr output from ls(1).
+ *
+ * Note that CRED() is used instead of f_cred in the f*xattr functions. This
+ * is intentional as Linux does not have the same notion of per-fd credentials.
+ */
+
+/* ARGSUSED */
+static int
+lx_setxattr_common(vnode_t *vp, char *name, void *value, size_t sz, int flags)
+{
+ int error, type;
+ char name_buf[LX_XATTR_NAME_MAX + 1];
+ const char *key;
+ size_t name_len;
+ void *buf = NULL;
+
+ if ((flags & ~LX_XATTR_FLAGS_VALID) != 0) {
+ return (EINVAL);
+ }
+ error = copyinstr(name, name_buf, sizeof (name_buf), &name_len);
+ if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) {
+ return (ERANGE);
+ } else if (error != 0) {
+ return (EFAULT);
+ }
+
+ type = lx_xattr_parse(name_buf, name_len, &key);
+
+ if (sz != 0) {
+ if (sz > LX_XATTR_SIZE_MAX) {
+ return (E2BIG);
+ }
+ buf = kmem_alloc(sz, KM_SLEEP);
+ if (copyin(value, buf, sz) != 0) {
+ kmem_free(buf, sz);
+ return (EFAULT);
+ }
+ }
+
+ error = EOPNOTSUPP;
+ switch (type) {
+ case LX_XATTR_NS_SECURITY:
+ /*
+ * In order to keep package management software happy, despite
+ * lacking support for file-based Linux capabilities via
+ * xattrs, we fake success when root attempts a setxattr on
+ * that attribute.
+ */
+ if (crgetuid(CRED()) == 0 &&
+ strcmp(key, LX_XATTR_CAPABILITY) == 0) {
+ error = 0;
+ }
+ break;
+ case LX_XATTR_NS_SYSTEM:
+ if (strcmp(key, LX_XATTR_POSIX_ACL_ACCESS) == 0) {
+ error = lx_acl_setxattr(vp, LX_ACL_ACCESS, buf, sz);
+ } else if (strcmp(key, LX_XATTR_POSIX_ACL_DEFAULT) == 0) {
+ error = lx_acl_setxattr(vp, LX_ACL_DEFAULT, buf, sz);
+		}
+		break;
+	default:
+ break;
+ }
+
+ if (buf != NULL) {
+ kmem_free(buf, sz);
+ }
+ return (error);
+}
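+
+/*
+ * For example, a root-driven package install performing
+ * setxattr(path, "security.capability", &cap, sz, 0) lands in the
+ * LX_XATTR_NS_SECURITY case above and is reported as successful, even
+ * though no capability data is actually stored.
+ */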
+
+/* ARGSUSED */
+static int
+lx_getxattr_common(vnode_t *vp, char *name, char *value, size_t sz,
+ ssize_t *osz)
+{
+ int error, type;
+ char name_buf[LX_XATTR_NAME_MAX + 1];
+ const char *key;
+ size_t name_len;
+ void *buf = NULL;
+
+ error = copyinstr(name, name_buf, sizeof (name_buf), &name_len);
+ if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) {
+ return (ERANGE);
+ } else if (error != 0) {
+ return (EFAULT);
+ }
+ if (sz != 0) {
+ if (sz > LX_XATTR_SIZE_MAX) {
+ sz = LX_XATTR_SIZE_MAX;
+ }
+ buf = kmem_alloc(sz, KM_SLEEP);
+ }
+
+ type = lx_xattr_parse(name_buf, name_len, &key);
+
+ error = EOPNOTSUPP;
+ switch (type) {
+ case LX_XATTR_NS_SYSTEM:
+ if (strcmp(key, LX_XATTR_POSIX_ACL_ACCESS) == 0) {
+ error = lx_acl_getxattr(vp, LX_ACL_ACCESS, buf, sz,
+ osz);
+ } else if (strcmp(key, LX_XATTR_POSIX_ACL_DEFAULT) == 0) {
+ error = lx_acl_getxattr(vp, LX_ACL_DEFAULT, buf, sz,
+ osz);
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (error == 0 && buf != NULL) {
+ VERIFY(*osz <= sz);
+
+ if (copyout(buf, value, *osz) != 0) {
+ error = EFAULT;
+ }
+ }
+ if (buf != NULL) {
+ kmem_free(buf, sz);
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+lx_listxattr_common(vnode_t *vp, void *value, size_t size, ssize_t *osize)
+{
+ struct uio auio;
+ struct iovec aiov;
+ int err = 0;
+
+ aiov.iov_base = value;
+ aiov.iov_len = size;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = 0;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_resid = size;
+ auio.uio_fmode = 0;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ /*
+	 * Call into all of the currently implemented listxattr routines
+	 * (each of which may be a no-op).
+ */
+ err = lx_acl_listxattr(vp, &auio);
+
+ if (err == 0) {
+ *osize = size - auio.uio_resid;
+ }
+
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+lx_removexattr_common(vnode_t *vp, char *name)
+{
+ int error, type;
+ char name_buf[LX_XATTR_NAME_MAX + 1];
+ const char *key;
+ size_t name_len;
+
+ error = copyinstr(name, name_buf, sizeof (name_buf), &name_len);
+ if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) {
+ return (ERANGE);
+ } else if (error != 0) {
+ return (EFAULT);
+ }
+
+ type = lx_xattr_parse(name_buf, name_len, &key);
+
+ error = EOPNOTSUPP;
+ switch (type) {
+ case LX_XATTR_NS_SYSTEM:
+ if (strcmp(key, LX_XATTR_POSIX_ACL_ACCESS) == 0) {
+ error = lx_acl_removexattr(vp, LX_ACL_ACCESS);
+ } else if (strcmp(key, LX_XATTR_POSIX_ACL_DEFAULT) == 0) {
+ error = lx_acl_removexattr(vp, LX_ACL_DEFAULT);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return (error);
+}
+
+long
+lx_setxattr(char *path, char *name, void *value, size_t size, int flags)
+{
+ int error;
+ vnode_t *vp = NULL;
+
+ error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+
+ error = lx_setxattr_common(vp, name, value, size, flags);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_lsetxattr(char *path, char *name, void *value, size_t size, int flags)
+{
+ int error;
+ vnode_t *vp = NULL;
+
+ error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+
+ error = lx_setxattr_common(vp, name, value, size, flags);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+long
+lx_fsetxattr(int fd, char *name, void *value, size_t size, int flags)
+{
+ int error;
+ file_t *fp;
+
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+
+ error = lx_setxattr_common(fp->f_vnode, name, value, size, flags);
+ releasef(fd);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+ssize_t
+lx_getxattr(char *path, char *name, void *value, size_t size)
+{
+ int error;
+ vnode_t *vp = NULL;
+ ssize_t osize;
+
+ error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+
+ error = lx_getxattr_common(vp, name, value, size, &osize);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (osize);
+}
+
+ssize_t
+lx_lgetxattr(char *path, char *name, void *value, size_t size)
+{
+ int error;
+ vnode_t *vp = NULL;
+ ssize_t osize;
+
+ error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+
+ error = lx_getxattr_common(vp, name, value, size, &osize);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (osize);
+}
+
+ssize_t
+lx_fgetxattr(int fd, char *name, void *value, size_t size)
+{
+ int error;
+ file_t *fp;
+ ssize_t osize;
+
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+
+ /*
+	 * When a file is opened with O_PATH, we clear the read/write flags,
+	 * and fgetxattr(2) is then expected to fail with EBADF.
+ */
+ if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
+ releasef(fd);
+ return (set_errno(EBADF));
+ }
+
+ error = lx_getxattr_common(fp->f_vnode, name, value, size, &osize);
+ releasef(fd);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (osize);
+}
+
+ssize_t
+lx_listxattr(char *path, char *list, size_t size)
+{
+ int error;
+ vnode_t *vp = NULL;
+ ssize_t osize;
+
+ error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+
+ error = lx_listxattr_common(vp, list, size, &osize);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (osize);
+}
+
+ssize_t
+lx_llistxattr(char *path, char *list, size_t size)
+{
+ int error;
+ vnode_t *vp = NULL;
+ ssize_t osize;
+
+ error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+
+ error = lx_listxattr_common(vp, list, size, &osize);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (osize);
+}
+
+ssize_t
+lx_flistxattr(int fd, char *list, size_t size)
+{
+ int error;
+ file_t *fp;
+ ssize_t osize;
+
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+
+ error = lx_listxattr_common(fp->f_vnode, list, size, &osize);
+ releasef(fd);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (osize);
+}
+
+int
+lx_removexattr(char *path, char *name)
+{
+ int error;
+ vnode_t *vp = NULL;
+
+ error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+
+ error = lx_removexattr_common(vp, name);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+int
+lx_lremovexattr(char *path, char *name)
+{
+ int error;
+ vnode_t *vp = NULL;
+
+ error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp);
+ if (error != 0) {
+ return (set_errno(error));
+ }
+
+ error = lx_removexattr_common(vp, name);
+ VN_RELE(vp);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+int
+lx_fremovexattr(int fd, char *name)
+{
+ int error;
+ file_t *fp;
+
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+
+ error = lx_removexattr_common(fp->f_vnode, name);
+ releasef(fd);
+
+ if (error != 0) {
+ return (set_errno(error));
+ }
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h
new file mode 100644
index 0000000000..f34ed31dcb
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h
@@ -0,0 +1,198 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#ifndef _LXSYSFS_H
+#define _LXSYSFS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * lx_sysfs.h: declarations, data structures and macros for lx_sysfs
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/user.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/dnlc.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+#include <sys/netstack.h>
+#include <inet/ip.h>
+#include <inet/ip_if.h>
+
+/*
+ * Convert a vnode into an lxsys_mnt_t
+ */
+#define VTOLXSM(vp) ((lxsys_mnt_t *)(vp)->v_vfsp->vfs_data)
+
+/*
+ * convert a vnode into an lxsys_node
+ */
+#define VTOLXS(vp) ((lxsys_node_t *)(vp)->v_data)
+
+/*
+ * convert a lxsys_node into a vnode
+ */
+#define LXSTOV(lxsnp) ((lxsnp)->lxsys_vnode)
+
+/*
+ * convert a lxsys_node into zone for fs
+ */
+#define LXSTOZ(lxsnp) \
+ (((lxsys_mnt_t *)(lxsnp)->lxsys_vnode->v_vfsp->vfs_data)->lxsysm_zone)
+
+#define LXSNSIZ 256 /* max size of lx /sys file name entries */
+
+/*
+ * Pretend that a directory entry takes 16 bytes
+ */
+#define LXSYS_SDSIZE 16
+
+/* Root sysfs lxsys_instance */
+#define LXSYS_INST_ROOT 0
+
+/*
+ * Node/file types for lx /sys files
+ * (directories and files contained therein).
+ */
+typedef enum lxsys_nodetype {
+ LXSYS_NONE, /* None-type to keep inodes non-zero */
+ LXSYS_STATIC, /* Statically defined entries */
+ LXSYS_CLASS_NET, /* /sys/class/net/<iface> */
+ LXSYS_DEV_NET, /* /sys/devices/virtual/net/<iface> */
+ LXSYS_BLOCK, /* /sys/block/<dev> */
+ LXSYS_DEV_ZFS, /* /sys/devices/zfs/<dev> */
+ LXSYS_DEV_SYS_CPU, /* /sys/devices/system/cpu/<cpu> */
+ LXSYS_DEV_SYS_CPUINFO, /* /sys/devices/system/cpu/cpuN/<info> */
+ LXSYS_DEV_SYS_NODE, /* /sys/devices/system/node/node0/<info> */
+ LXSYS_MAXTYPE, /* type limit */
+} lxsys_nodetype_t;
+
+/*
+ * external dirent characteristics
+ */
+typedef struct {
+ unsigned int d_idnum;
+ char *d_name;
+} lxsys_dirent_t;
+
+typedef struct {
+ unsigned int dl_instance;
+ lxsys_dirent_t *dl_list;
+ int dl_length;
+} lxsys_dirlookup_t;
+
+/*
+ * This is the lx sysfs private data object
+ * which is attached to v_data in the vnode structure
+ */
+struct lxsys_node;
+typedef struct lxsys_node lxsys_node_t;
+struct lxsys_node {
+ lxsys_nodetype_t lxsys_type; /* type ID of node */
+ unsigned int lxsys_instance; /* instance ID node */
+ unsigned int lxsys_endpoint; /* endpoint ID node */
+ vnode_t *lxsys_vnode; /* vnode for the node */
+ vnode_t *lxsys_parentvp; /* parent directory */
+ lxsys_node_t *lxsys_next; /* next list entry */
+ timestruc_t lxsys_time; /* creation time */
+ mode_t lxsys_mode; /* file mode bits */
+ uid_t lxsys_uid; /* file owner */
+ gid_t lxsys_gid; /* file group owner */
+ ino_t lxsys_ino; /* node id */
+};
+
+/*
+ * This is the lxsysfs private data object
+ * which is attached to vfs_data in the vfs structure
+ */
+typedef struct lxsys_mnt {
+ kmutex_t lxsysm_lock; /* protects fields */
+ lxsys_node_t *lxsysm_node; /* node at root of sys mount */
+ zone_t *lxsysm_zone; /* zone for this mount */
+} lxsys_mnt_t;
+
+extern vnodeops_t *lxsys_vnodeops;
+
+typedef struct mounta mounta_t;
+
+extern void lxsys_initnodecache();
+extern void lxsys_fininodecache();
+extern ino_t lxsys_inode(lxsys_nodetype_t, unsigned int, unsigned int);
+extern ino_t lxsys_parentinode(lxsys_node_t *);
+extern lxsys_node_t *lxsys_getnode(vnode_t *, lxsys_nodetype_t, unsigned int,
+ unsigned int);
+extern lxsys_node_t *lxsys_getnode_static(vnode_t *, unsigned int);
+extern void lxsys_freenode(lxsys_node_t *);
+
+extern netstack_t *lxsys_netstack(lxsys_node_t *);
+extern ill_t *lxsys_find_ill(ip_stack_t *, uint_t);
+
+extern int lxsys_ino_get_type(ino_t);
+
+typedef struct lxpr_uiobuf {
+ uio_t *uiop;
+ char *buffer;
+ uint32_t bufsize;
+ char *pos;
+ size_t beg;
+ int error;
+} lxsys_uiobuf_t;
+
+extern lxsys_uiobuf_t *lxsys_uiobuf_new(uio_t *);
+extern void lxsys_uiobuf_free(lxsys_uiobuf_t *);
+extern void lxsys_uiobuf_seterr(lxsys_uiobuf_t *, int);
+extern int lxsys_uiobuf_flush(lxsys_uiobuf_t *);
+extern void lxsys_uiobuf_write(lxsys_uiobuf_t *, const char *, size_t);
+extern void lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifndef islower
+#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z'))
+#endif
+#ifndef toupper
+#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x))
+#endif
+
+#endif /* _LXSYSFS_H */
diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c
new file mode 100644
index 0000000000..69234ddbaa
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c
@@ -0,0 +1,443 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * lx_syssubr.c: Various functions for the /sys vnodeops.
+ */
+
+#include <sys/varargs.h>
+
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <sys/prsystm.h>
+
+#include "lx_sysfs.h"
+
+#define LXSYSCACHE_NAME "lxsys_cache"
+
+static int lxsys_node_constructor(void *, void *, int);
+static void lxsys_node_destructor(void *, void *);
+
+static kmem_cache_t *lxsys_node_cache;
+
+void
+lxsys_initnodecache()
+{
+ lxsys_node_cache = kmem_cache_create(LXSYSCACHE_NAME,
+ sizeof (lxsys_node_t), 0,
+ lxsys_node_constructor, lxsys_node_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+lxsys_fininodecache()
+{
+ kmem_cache_destroy(lxsys_node_cache);
+}
+
+/* ARGSUSED */
+static int
+lxsys_node_constructor(void *buf, void *un, int kmflags)
+{
+ lxsys_node_t *lxsnp = buf;
+ vnode_t *vp;
+
+ vp = lxsnp->lxsys_vnode = vn_alloc(kmflags);
+ if (vp == NULL)
+ return (-1);
+
+ (void) vn_setops(vp, lxsys_vnodeops);
+ vp->v_data = lxsnp;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+lxsys_node_destructor(void *buf, void *un)
+{
+ lxsys_node_t *lxsnp = buf;
+
+ vn_free(LXSTOV(lxsnp));
+}
+
+/*
+ * Calculate an inode number
+ *
+ * This takes various bits of info and munges them
+ * to give the inode number for an lxsys node
+ */
+ino_t
+lxsys_inode(lxsys_nodetype_t type, unsigned int instance,
+ unsigned int endpoint)
+{
+ /*
+ * Sysfs Inode format:
+ * 0000AABBBBCC
+ *
+ * AA - TYPE
+ * BBBB - INSTANCE
+ * CC - ENDPOINT
+ */
+ ASSERT(instance <= 0xffff);
+ ASSERT(endpoint <= 0xff);
+
+	return ((ino_t)(type << 24) | (instance << 8) | endpoint);
+}
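+
+/*
+ * For example, the /sys/class/net symlink for an interface with ifindex 3
+ * (type LXSYS_CLASS_NET == 2 in the lxsys_nodetype_t enum) encodes as:
+ *
+ *	lxsys_inode(LXSYS_CLASS_NET, 3, 0)
+ *	    == (2 << 24) | (3 << 8) | 0 == 0x02000300
+ */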
+
+/*
+ * Return inode number of parent (directory)
+ */
+ino_t
+lxsys_parentinode(lxsys_node_t *lxsnp)
+{
+ /*
+	 * If the input node is the root then the parent inode is the
+	 * mounted-on inode, so just return our own inode number.
+ */
+ if (lxsnp->lxsys_type == LXSYS_STATIC &&
+ lxsnp->lxsys_instance == LXSYS_INST_ROOT) {
+ return (lxsnp->lxsys_ino);
+ } else {
+ return (VTOLXS(lxsnp->lxsys_parentvp)->lxsys_ino);
+ }
+}
+
+/*
+ * Allocate a new lxsys node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxsys_node_t *
+lxsys_getnode(vnode_t *dp, lxsys_nodetype_t type, unsigned int instance,
+ unsigned int endpoint)
+{
+ lxsys_node_t *lxsnp;
+ vnode_t *vp;
+ timestruc_t now;
+
+ /*
+	 * Allocate a new node. It is deallocated in vop_inactive().
+ */
+ lxsnp = kmem_cache_alloc(lxsys_node_cache, KM_SLEEP);
+
+ /*
+ * Set defaults (may be overridden below)
+ */
+ gethrestime(&now);
+ lxsnp->lxsys_type = type;
+ lxsnp->lxsys_instance = instance;
+ lxsnp->lxsys_endpoint = endpoint;
+ lxsnp->lxsys_next = NULL;
+ lxsnp->lxsys_parentvp = dp;
+ VN_HOLD(dp);
+
+ lxsnp->lxsys_time = now;
+ lxsnp->lxsys_uid = lxsnp->lxsys_gid = 0;
+ lxsnp->lxsys_ino = lxsys_inode(type, instance, endpoint);
+
+ /* initialize the vnode data */
+ vp = lxsnp->lxsys_vnode;
+ vn_reinit(vp);
+ vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+ vp->v_vfsp = dp->v_vfsp;
+
+ /*
+ * Default to a directory with open permissions.
+ * Specific components will override this
+ */
+ if (type == LXSYS_STATIC && instance == LXSYS_INST_ROOT) {
+ vp->v_flag |= VROOT;
+ }
+ vp->v_type = VDIR;
+ lxsnp->lxsys_mode = 0555;
+
+ return (lxsnp);
+}
+
+lxsys_node_t *
+lxsys_getnode_static(vnode_t *dp, unsigned int instance)
+{
+ lxsys_mnt_t *lxsm = VTOLXSM(dp);
+ lxsys_node_t *lnp, *tail = NULL;
+
+ mutex_enter(&lxsm->lxsysm_lock);
+ for (lnp = lxsm->lxsysm_node; lnp != NULL; lnp = lnp->lxsys_next) {
+ if (lnp->lxsys_instance == instance) {
+ VERIFY(lnp->lxsys_parentvp == dp);
+
+ VN_HOLD(lnp->lxsys_vnode);
+ mutex_exit(&lxsm->lxsysm_lock);
+ return (lnp);
+ } else if (lnp->lxsys_next == NULL) {
+ /* Found no match by the end of the list */
+ tail = lnp;
+ break;
+ }
+ }
+
+ tail->lxsys_next = lxsys_getnode(dp, LXSYS_STATIC, instance, 0);
+ lnp = tail->lxsys_next;
+ /* Allow mounts on static entries */
+ LXSTOV(lnp)->v_flag &= (~VNOMOUNT);
+ mutex_exit(&lxsm->lxsysm_lock);
+ return (lnp);
+}
+
+/* Clean up persistence for static lxsys_node */
+int
+lxsys_freenode_static(lxsys_node_t *lnp)
+{
+ lxsys_node_t *plnp;
+ vnode_t *vp = LXSTOV(lnp);
+ lxsys_mnt_t *lxsm = VTOLXSM(vp);
+
+ if (lnp->lxsys_instance == LXSYS_INST_ROOT) {
+ /*
+ * The root vnode does not need special cleanup since it
+ * anchors the list and is freed by lxsys_unmount.
+ */
+ return (0);
+ }
+
+ mutex_enter(&lxsm->lxsysm_lock);
+
+ /*
+ * It is possible that a different process acquired a fresh reference
+ * to this vnode via lookup while we were waiting on the lxsysm_lock.
+ * To avoid freeing the vnode out from under them, we will double-check
+ * v_count and bail from the fop_inactive if it was grabbed.
+ */
+ mutex_enter(&vp->v_lock);
+ if (vp->v_count != 1) {
+ VERIFY(vp->v_count > 0);
+
+ /* Release our hold before bailing out of lxsys_inactive */
+ vp->v_count--;
+
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&lxsm->lxsysm_lock);
+ return (-1);
+ }
+ mutex_exit(&vp->v_lock);
+
+ /* search for the record pointing to lnp */
+ plnp = lxsm->lxsysm_node;
+ while (plnp != NULL && plnp->lxsys_next != lnp) {
+ plnp = plnp->lxsys_next;
+ }
+ /* entry should always be found */
+ VERIFY(plnp != NULL);
+ plnp->lxsys_next = lnp->lxsys_next;
+
+ mutex_exit(&lxsm->lxsysm_lock);
+ return (0);
+}
+
+/*
+ * Free the storage obtained from lxsys_getnode().
+ */
+void
+lxsys_freenode(lxsys_node_t *lxsnp)
+{
+ vnode_t *vp = LXSTOV(lxsnp);
+
+ VERIFY(vp != NULL);
+
+ if (lxsnp->lxsys_type == LXSYS_STATIC) {
+ if (lxsys_freenode_static(lxsnp) != 0) {
+ return;
+ }
+ }
+
+ /*
+ * delete any association with parent vp
+ */
+ if (lxsnp->lxsys_parentvp != NULL)
+ VN_RELE(lxsnp->lxsys_parentvp);
+
+ /*
+ * Release the lxsysnode.
+ */
+ kmem_cache_free(lxsys_node_cache, lxsnp);
+}
+
+/*
+ * Get the netstack associated with this lxsys mount
+ */
+netstack_t *
+lxsys_netstack(lxsys_node_t *lnp)
+{
+ zone_t *zone = VTOLXSM(LXSTOV(lnp))->lxsysm_zone;
+
+ return (netstack_hold_if_active(zone->zone_netstack));
+}
+
+ill_t *
+lxsys_find_ill(ip_stack_t *ipst, uint_t ifindex)
+{
+ ill_t *ill;
+ phyint_t *phyi;
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
+ (void *) &ifindex, NULL);
+ if (phyi != NULL) {
+ /*
+ * Since interface information presented via /sys is not
+ * specific to IPv4 or IPv6, an ill reference from either
+ * protocol will be adequate. Check both, starting with IPv4
+ * for a valid reference to use.
+ */
+		for (ill = phyi->phyint_illv4; ;
+		    ill = phyi->phyint_illv6) {
+			if (ill != NULL) {
+				mutex_enter(&ill->ill_lock);
+				if (!ILL_IS_CONDEMNED(ill)) {
+					ill_refhold_locked(ill);
+					mutex_exit(&ill->ill_lock);
+					rw_exit(&ipst->ips_ill_g_lock);
+					return (ill);
+				}
+				mutex_exit(&ill->ill_lock);
+			}
+			/* Stop once the v6 ill has been considered. */
+			if (ill == phyi->phyint_illv6)
+				break;
+		}
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ return (NULL);
+}
+
+#define LXSYSUIOBUFSZ 4096
+
+lxsys_uiobuf_t *
+lxsys_uiobuf_new(uio_t *uiop)
+{
+ /* Allocate memory for both lxsys_uiobuf and output buffer */
+ int bufsize = LXSYSUIOBUFSZ;
+ lxsys_uiobuf_t *uiobuf =
+ kmem_alloc(sizeof (lxsys_uiobuf_t) + bufsize, KM_SLEEP);
+
+ uiobuf->uiop = uiop;
+ uiobuf->buffer = (char *)&uiobuf[1];
+ uiobuf->bufsize = bufsize;
+ uiobuf->pos = uiobuf->buffer;
+ uiobuf->beg = 0;
+ uiobuf->error = 0;
+
+ return (uiobuf);
+}
+
+void
+lxsys_uiobuf_free(lxsys_uiobuf_t *uiobuf)
+{
+ ASSERT(uiobuf != NULL);
+ ASSERT(uiobuf->pos == uiobuf->buffer);
+
+ kmem_free(uiobuf, sizeof (lxsys_uiobuf_t) + uiobuf->bufsize);
+}
+
+void
+lxsys_uiobuf_seterr(lxsys_uiobuf_t *uiobuf, int err)
+{
+ ASSERT(uiobuf->error == 0);
+
+ uiobuf->error = err;
+}
+
+int
+lxsys_uiobuf_flush(lxsys_uiobuf_t *uiobuf)
+{
+ off_t off = uiobuf->uiop->uio_offset;
+ caddr_t uaddr = uiobuf->buffer;
+ size_t beg = uiobuf->beg;
+ size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+ if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ ASSERT(off >= beg);
+
+ if (beg + size > off && off >= 0)
+ uiobuf->error =
+ uiomove(uaddr + (off - beg), size - (off - beg),
+ UIO_READ, uiobuf->uiop);
+
+ uiobuf->beg += size;
+ }
+
+ uiobuf->pos = uaddr;
+
+ return (uiobuf->error);
+}
+
+void
+lxsys_uiobuf_write(lxsys_uiobuf_t *uiobuf, const char *buf, size_t size)
+{
+ /* While we can still carry on */
+ while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ uintptr_t remain = (uintptr_t)uiobuf->bufsize -
+ ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer);
+
+ /* Enough space in buffer? */
+ if (remain >= size) {
+ bcopy(buf, uiobuf->pos, size);
+ uiobuf->pos += size;
+ return;
+ }
+
+ /* Not enough space, so copy all we can and try again */
+ bcopy(buf, uiobuf->pos, remain);
+ uiobuf->pos += remain;
+ (void) lxsys_uiobuf_flush(uiobuf);
+ buf += remain;
+ size -= remain;
+ }
+}
+
+#define TYPBUFFSIZE 256
+
+void
+lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...)
+{
+ va_list args;
+ char buff[TYPBUFFSIZE];
+ int len;
+ char *buffer;
+
+ /* Can we still do any output */
+ if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0)
+ return;
+
+ va_start(args, fmt);
+
+ /* Try using stack allocated buffer */
+ len = vsnprintf(buff, TYPBUFFSIZE, fmt, args);
+ if (len < TYPBUFFSIZE) {
+ va_end(args);
+ lxsys_uiobuf_write(uiobuf, buff, len);
+ return;
+ }
+
+ /* Not enough space in pre-allocated buffer */
+ buffer = kmem_alloc(len + 1, KM_SLEEP);
+
+ /*
+ * We know we allocated the correct amount of space
+ * so no check on the return value
+ */
+ (void) vsnprintf(buffer, len+1, fmt, args);
+ lxsys_uiobuf_write(uiobuf, buffer, len);
+ va_end(args);
+ kmem_free(buffer, len+1);
+}
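+
+/*
+ * A typical consumer drives the uiobuf interface from a read handler,
+ * roughly as sketched below (cf. lxsys_read() in lx_sysvnops.c); errors
+ * recorded with lxsys_uiobuf_seterr() surface through the final flush:
+ *
+ *	lxsys_uiobuf_t *luio = lxsys_uiobuf_new(uiop);
+ *	lxsys_uiobuf_printf(luio, "%u\n", value);
+ *	error = lxsys_uiobuf_flush(luio);
+ *	lxsys_uiobuf_free(luio);
+ */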
diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c
new file mode 100644
index 0000000000..fddc1e0234
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c
@@ -0,0 +1,365 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+/*
+ * lxsysvfsops.c: vfs operations for lx sysfs.
+ *
+ * sysfs has a close relationship with the lx getdents(2) syscall. This is
+ * necessary so that the getdents code can populate the 'd_type' entries
+ * during a sysfs readdir operation. The glibc code which accesses sysfs
+ * (specifically the 'cpu' subtree) expects dirents to have the d_type field
+ * populated. One problematic consumer is java, which becomes unstable if it
+ * gets the incorrect data from glibc. When sysfs loads, it populates the
+ * lx_sysfs_vfs_type and lx_sysfs_vtype variables defined in lx_getdents.c.
+ * The getdents code can then call into sysfs to determine the d_type for any
+ * given inode directory entry.
+ */
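+
+/*
+ * For reference, the vnode types returned through the lx_sysfs_vtype hook
+ * correspond to the Linux d_type values that glibc expects in dirents:
+ * VDIR -> DT_DIR (4), VREG -> DT_REG (8), VLNK -> DT_LNK (10). The actual
+ * translation is performed by the getdents emulation; this module only
+ * supplies the ino-to-vnode-type callback (lxsys_ino_get_type).
+ */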
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/mount.h>
+#include <sys/bitmap.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/lx_impl.h>
+
+#include "lx_sysfs.h"
+
+/* Module level parameters */
+static int lxsysfstype;
+static dev_t lxsysdev;
+static kmutex_t lxsys_mount_lock;
+
+extern int lx_sysfs_vfs_type;
+extern int (*lx_sysfs_vtype)(ino_t);
+
+static int lxsys_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *);
+static int lxsys_unmount(vfs_t *, int, cred_t *);
+static int lxsys_root(vfs_t *, vnode_t **);
+static int lxsys_statvfs(vfs_t *, statvfs64_t *);
+static int lxsys_init(int, char *);
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "lx_sysfs",
+ lxsys_init,
+ VSW_ZMOUNT,
+ NULL
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+ &mod_fsops, "lx brand sysfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, (void *)&modlfs, NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int retval;
+
+ /*
+ * attempt to unload the module
+ */
+ if ((retval = mod_remove(&modlinkage)) != 0)
+ goto done;
+
+ lx_sysfs_vfs_type = 0;
+ lx_sysfs_vtype = NULL;
+
+ /*
+ * destroy lxsys_node cache
+ */
+ lxsys_fininodecache();
+
+ /*
+ * clean out the vfsops and vnodeops
+ */
+ (void) vfs_freevfsops_by_type(lxsysfstype);
+ vn_freevnodeops(lxsys_vnodeops);
+
+ mutex_destroy(&lxsys_mount_lock);
+done:
+ return (retval);
+}
+
+static int
+lxsys_init(int fstype, char *name)
+{
+ static const fs_operation_def_t lxsys_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = lxsys_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = lxsys_unmount },
+ VFSNAME_ROOT, { .vfs_root = lxsys_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = lxsys_statvfs },
+ NULL, NULL
+ };
+ extern const fs_operation_def_t lxsys_vnodeops_template[];
+ int error;
+ major_t dev;
+
+ lx_sysfs_vtype = lxsys_ino_get_type;
+ lx_sysfs_vfs_type = lxsysfstype = fstype;
+ ASSERT(lxsysfstype != 0);
+
+ mutex_init(&lxsys_mount_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Associate VFS ops vector with this fstype.
+ */
+ error = vfs_setfsops(fstype, lxsys_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "lxsys_init: bad vfs ops template");
+ return (error);
+ }
+
+ /*
+ * Set up vnode ops vector too.
+ */
+ error = vn_make_ops(name, lxsys_vnodeops_template, &lxsys_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "lxsys_init: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+ * Assign a unique "device" number (used by stat(2)).
+ */
+ if ((dev = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN, "lxsys_init: can't get unique device number");
+ dev = 0;
+ }
+
+ /*
+ * Make the pseudo device
+ */
+ lxsysdev = makedevice(dev, 0);
+
+ /*
+ * Initialise cache for lxsys_nodes
+ */
+ lxsys_initnodecache();
+
+ return (0);
+}
+
+static int
+lxsys_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr)
+{
+ lxsys_mnt_t *lxsys_mnt;
+ zone_t *zone = curproc->p_zone;
+
+ /*
+ * must be root to mount
+ */
+ if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+ return (EPERM);
+
+ /*
+ * mount point must be a directory
+ */
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (zone == global_zone) {
+ zone_t *mntzone;
+
+ mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
+ zone_rele(mntzone);
+ if (zone != mntzone)
+ return (EBUSY);
+ }
+
+ /*
+ * Having the resource be anything but "lxsys" doesn't make sense
+ */
+ vfs_setresource(vfsp, "lxsys", 0);
+
+ lxsys_mnt = kmem_alloc(sizeof (*lxsys_mnt), KM_SLEEP);
+
+ mutex_enter(&lxsys_mount_lock);
+
+ /*
+ * Ensure we don't allow overlaying mounts
+ */
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ mutex_exit(&lxsys_mount_lock);
+		kmem_free(lxsys_mnt, sizeof (*lxsys_mnt));
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ mutex_init(&lxsys_mnt->lxsysm_lock, NULL, MUTEX_DEFAULT, NULL);
+ zone_hold(lxsys_mnt->lxsysm_zone = zone);
+
+ /* Arbitrarily set the parent vnode to the mounted over directory */
+ lxsys_mnt->lxsysm_node = lxsys_getnode(mvp, LXSYS_STATIC,
+ LXSYS_INST_ROOT, 0);
+ lxsys_mnt->lxsysm_node->lxsys_next = NULL;
+
+ /* Correctly set the fs for the root node */
+ lxsys_mnt->lxsysm_node->lxsys_vnode->v_vfsp = vfsp;
+
+ vfs_make_fsid(&vfsp->vfs_fsid, lxsysdev, lxsysfstype);
+ vfsp->vfs_bsize = DEV_BSIZE;
+ vfsp->vfs_fstype = lxsysfstype;
+ vfsp->vfs_data = (caddr_t)lxsys_mnt;
+ vfsp->vfs_dev = lxsysdev;
+
+ mutex_exit(&lxsys_mount_lock);
+
+ return (0);
+}
+
+static int
+lxsys_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+ lxsys_mnt_t *lxsys_mnt = (lxsys_mnt_t *)vfsp->vfs_data;
+ lxsys_node_t *lnp;
+ vnode_t *vp;
+ int count;
+
+ VERIFY(lxsys_mnt != NULL);
+
+ mutex_enter(&lxsys_mount_lock);
+
+ /* must be root to unmount */
+ if (secpolicy_fs_unmount(cr, vfsp) != 0) {
+ mutex_exit(&lxsys_mount_lock);
+ return (EPERM);
+ }
+
+ /* forced unmount is not supported by this fs */
+ if (flag & MS_FORCE) {
+ mutex_exit(&lxsys_mount_lock);
+ return (ENOTSUP);
+ }
+
+ /* Ensure that no vnodes are in use on this mount point. */
+ lnp = lxsys_mnt->lxsysm_node;
+ vp = LXSTOV(lnp);
+ mutex_enter(&vp->v_lock);
+ count = vp->v_count;
+ mutex_exit(&vp->v_lock);
+ if (count > 1) {
+ mutex_exit(&lxsys_mount_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * If there are no references to the root vnode the list of persistent
+ * static vnodes should be empty
+ */
+ VERIFY(lnp->lxsys_next == NULL);
+
+ (void) dnlc_purge_vfsp(vfsp, 0);
+
+ lxsys_mnt->lxsysm_node = NULL;
+ lxsys_freenode(lnp);
+ zone_rele(lxsys_mnt->lxsysm_zone);
+ vfsp->vfs_data = NULL;
+ kmem_free(lxsys_mnt, sizeof (*lxsys_mnt));
+
+ mutex_exit(&lxsys_mount_lock);
+
+ return (0);
+}
+
+static int
+lxsys_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ lxsys_mnt_t *lxsm = (lxsys_mnt_t *)vfsp->vfs_data;
+ vnode_t *vp;
+
+ VERIFY(lxsm != NULL);
+ VERIFY(lxsm->lxsysm_node != NULL);
+
+ vp = LXSTOV(lxsm->lxsysm_node);
+ VN_HOLD(vp);
+ *vpp = vp;
+
+ return (0);
+}
+
+static int
+lxsys_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+ dev32_t d32;
+
+ bzero((caddr_t)sp, sizeof (*sp));
+ sp->f_bsize = DEV_BSIZE;
+ sp->f_frsize = DEV_BSIZE;
+ sp->f_blocks = (fsblkcnt64_t)0;
+ sp->f_bfree = (fsblkcnt64_t)0;
+ sp->f_bavail = (fsblkcnt64_t)0;
+ sp->f_files = (fsfilcnt64_t)3;
+ sp->f_ffree = (fsfilcnt64_t)0; /* none */
+ sp->f_favail = (fsfilcnt64_t)0; /* none */
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sp->f_fsid = d32;
+ /* It is guaranteed that vsw_name will fit in f_basetype */
+ (void) strcpy(sp->f_basetype, vfssw[lxsysfstype].vsw_name);
+ sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sp->f_namemax = 64; /* quite arbitrary */
+ bzero(sp->f_fstr, sizeof (sp->f_fstr));
+
+ /* We know f_fstr is 32 chars */
+ (void) strcpy(sp->f_fstr, "/sys");
+ (void) strcpy(&sp->f_fstr[6], "/sys");
+
+ return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c
new file mode 100644
index 0000000000..10c99baa7b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c
@@ -0,0 +1,2165 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+/*
+ * lx_sysfs -- a Linux-compatible /sys for the LX brand
+ */
+
+#include <vm/seg_vn.h>
+#include <sys/sdt.h>
+#include <sys/strlog.h>
+#include <sys/stropts.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_brand.h>
+#include <sys/x86_archext.h>
+#include <sys/archsystm.h>
+#include <sys/fp.h>
+#include <sys/pool_pset.h>
+#include <sys/pset.h>
+#include <sys/zone.h>
+#include <sys/pghw.h>
+#include <sys/vfs_opreg.h>
+#include <sys/param.h>
+#include <sys/utsname.h>
+#include <sys/lx_misc.h>
+#include <sys/brand.h>
+#include <sys/cred_impl.h>
+#include <sys/tihdr.h>
+#include <sys/sunddi.h>
+#include <sys/vnode.h>
+#include <sys/netstack.h>
+#include <sys/ethernet.h>
+#include <inet/ip_arp.h>
+
+#include "lx_sysfs.h"
+
+/*
+ * Pointer to the vnode ops vector for this fs.
+ * This is instantiated in lxsys_init() in lx_sysvfsops.c
+ */
+vnodeops_t *lxsys_vnodeops;
+
+static int lxsys_open(vnode_t **, int, cred_t *, caller_context_t *);
+static int lxsys_close(vnode_t *, int, int, offset_t, cred_t *,
+ caller_context_t *);
+static int lxsys_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxsys_getattr(vnode_t *, vattr_t *, int, cred_t *,
+ caller_context_t *);
+static int lxsys_access(vnode_t *, int, int, cred_t *, caller_context_t *);
+static int lxsys_lookup(vnode_t *, char *, vnode_t **,
+ pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *,
+ pathname_t *);
+static int lxsys_readdir(vnode_t *, uio_t *, cred_t *, int *,
+ caller_context_t *, int);
+static int lxsys_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *);
+static int lxsys_cmp(vnode_t *, vnode_t *, caller_context_t *);
+static int lxsys_sync(void);
+static void lxsys_inactive(vnode_t *, cred_t *, caller_context_t *);
+
+static vnode_t *lxsys_lookup_static(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_class_netdir(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_virtual_netdir(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_blockdir(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_zfsdir(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_syscpu(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_syscpuinfo(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_sysnode(lxsys_node_t *, char *);
+
+static int lxsys_read_static(lxsys_node_t *, lxsys_uiobuf_t *);
+static int lxsys_read_devices_virtual_net(lxsys_node_t *, lxsys_uiobuf_t *);
+static int lxsys_read_devices_zfs_block(lxsys_node_t *, lxsys_uiobuf_t *);
+static int lxsys_read_devices_syscpu(lxsys_node_t *, lxsys_uiobuf_t *);
+static int lxsys_read_devices_sysnode(lxsys_node_t *, lxsys_uiobuf_t *);
+
+static int lxsys_readdir_devices_syscpu(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_devices_syscpuinfo(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_devices_sysnode(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_static(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_class_netdir(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_devices_virtual_netdir(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_blockdir(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_devices_zfsdir(lxsys_node_t *, uio_t *, int *);
+
+static int lxsys_readlink_class_net(lxsys_node_t *, char *, size_t);
+static int lxsys_readlink_block(lxsys_node_t *, char *, size_t);
+
+/*
+ * The lx /sys vnode operations vector
+ */
+const fs_operation_def_t lxsys_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = lxsys_open },
+ VOPNAME_CLOSE, { .vop_close = lxsys_close },
+ VOPNAME_READ, { .vop_read = lxsys_read },
+ VOPNAME_GETATTR, { .vop_getattr = lxsys_getattr },
+ VOPNAME_ACCESS, { .vop_access = lxsys_access },
+ VOPNAME_LOOKUP, { .vop_lookup = lxsys_lookup },
+ VOPNAME_READDIR, { .vop_readdir = lxsys_readdir },
+ VOPNAME_READLINK, { .vop_readlink = lxsys_readlink },
+ VOPNAME_FSYNC, { .error = lxsys_sync },
+ VOPNAME_SEEK, { .error = lxsys_sync },
+ VOPNAME_INACTIVE, { .vop_inactive = lxsys_inactive },
+ VOPNAME_CMP, { .vop_cmp = lxsys_cmp },
+ NULL, NULL
+};
+
+typedef enum lxsys_cpu_state {
+ LXSYS_CPU_ON, /* online */
+ LXSYS_CPU_OFF, /* offline */
+ LXSYS_CPU_ANY, /* don't care */
+} lxsys_cpu_state_t;
+
+static void lxsys_format_cpu(char *, int, lxsys_cpu_state_t);
+
+/*
+ * Sysfs Inode format:
+ * 0000AABBBBCC
+ *
+ * AA - TYPE
+ * BBBB - INSTANCE
+ * CC - ENDPOINT
+ *
+ * Where TYPE is one of:
+ * 1 - SYS_STATIC
+ * 2 - SYS_CLASS_NET
+ * 3 - SYS_DEV_NET
+ * 4 - SYS_BLOCK
+ * 5 - SYS_DEV_ZFS
+ * 6 - SYS_DEV_SYS_CPU
+ * 7 - SYS_DEV_SYS_CPUINFO
+ * 8 - SYS_DEV_SYS_NODE
+ *
+ * Static entries will have assigned INSTANCE identifiers:
+ * - 0x00: /sys
+ * - 0x01: /sys/class
+ * - 0x02: /sys/devices
+ * - 0x03: /sys/fs
+ * - 0x04: /sys/class/net
+ * - 0x05: /sys/devices/virtual
+ * - 0x06: /sys/devices/system
+ * - 0x07: /sys/fs/cgroup
+ * - 0x08: /sys/devices/virtual/net
+ * - 0x09: /sys/block
+ * - 0x0a: /sys/devices/zfs
+ * - 0x0b: /sys/devices/system/cpu
+ * - 0x0c: /sys/devices/system/node
+ * - 0x0d: /sys/bus
+ *
+ * Dynamic /sys/class/net/<interface> symlinks will use an INSTANCE derived
+ * from the corresponding ifindex.
+ *
+ * Dynamic /sys/devices/virtual/net/<interface>/<entries> directories will use
+ * an INSTANCE derived from the ifindex and statically assigned ENDPOINT IDs
+ * for the contained entries.
+ *
+ * Dynamic /sys/block/<dev> symlinks will use an INSTANCE derived from the
+ * device major and instance from records listed in kstat or zvols.
+ *
+ * Dynamic /sys/devices/zfs/<dev> directories will use an INSTANCE derived from
+ * the emulated minor number.
+ *
+ * Semi-static/Dynamic /sys/devices/system/cpu contains the fixed 'kernel_max',
+ * 'offline', 'online', 'possible', and 'present' files, and a dynamic set of
+ * cpuN subdirectories. All of these are dynamic nodes.
+ *
+ * Static /sys/devices/system/node/node0 currently only contains a
+ * static cpulist file, but will likely need future dynamic entries for cpuN
+ * symlinks, and perhaps other static files. By only providing 'node0' we
+ * pretend that there is only a single NUMA node available to a zone (trying to
+ * be NUMA-aware inside a zone is generally not going to work anyway).
+ * If dynamic entries are added under node0, it must be converted to the
+ * semi-static/dynamic approach as used under /sys/devices/system/cpu.
+ *
+ * The dyn_ino_type table must be updated whenever a new static instance is
+ * defined.
+ */
+
+#define LXSYS_INST_CLASSDIR 0x1
+#define LXSYS_INST_DEVICESDIR 0x2
+#define LXSYS_INST_FSDIR 0x3
+#define LXSYS_INST_CLASS_NETDIR 0x4
+#define LXSYS_INST_DEVICES_VIRTUALDIR 0x5
+#define LXSYS_INST_DEVICES_SYSTEMDIR 0x6
+#define LXSYS_INST_FS_CGROUPDIR 0x7
+#define LXSYS_INST_DEVICES_VIRTUAL_NETDIR 0x8
+#define LXSYS_INST_BLOCKDIR 0x9
+#define LXSYS_INST_DEVICES_ZFSDIR 0xa
+#define LXSYS_INST_DEVICES_SYSCPU 0xb
+#define LXSYS_INST_DEVICES_SYSNODE 0xc
+#define LXSYS_INST_BUSDIR 0xd
+#define LXSYS_INST_MAX LXSYS_INST_BUSDIR /* limit */
+
+/*
+ * These are of dynamic type (LXSYS_DEV_SYS_CPU), but essentially fixed
+ * instances. Under /sys/devices/system/cpu we have: kernel_max, offline,
+ * online, possible and present. We also have a dynamic set of cpuN subdirs.
+ * The cpuN subdirs are actually of type LXSYS_DEV_SYS_CPUINFO, so we can use
+ * the following instance IDs for the fixed files.
+ */
+#define LXSYS_INST_DEV_SYSCPU_KMAX 0x1
+#define LXSYS_INST_DEV_SYSCPU_OFFLINE 0x2
+#define LXSYS_INST_DEV_SYSCPU_ONLINE 0x3
+#define LXSYS_INST_DEV_SYSCPU_POSSIBLE 0x4
+#define LXSYS_INST_DEV_SYSCPU_PRESENT 0x5
+
+/*
+ * This array is used for directory inode correction in lxsys_readdir_common
+ * when a directory's static-type entry is actually a dynamic-type.
+ */
+static int dyn_ino_type[] = {
+ 0, /* invalid */
+ 0, /* LXSYS_INST_CLASSDIR */
+ 0, /* LXSYS_INST_DEVICESDIR */
+ 0, /* LXSYS_INST_FSDIR */
+ LXSYS_CLASS_NET, /* LXSYS_INST_CLASS_NETDIR */
+ 0, /* LXSYS_INST_DEVICES_VIRTUALDIR */
+ 0, /* LXSYS_INST_DEVICES_SYSTEMDIR */
+ 0, /* LXSYS_INST_FS_CGROUPDIR */
+	LXSYS_DEV_NET,		/* LXSYS_INST_DEVICES_VIRTUAL_NETDIR */
+ LXSYS_BLOCK, /* LXSYS_INST_BLOCKDIR */
+ LXSYS_DEV_ZFS, /* LXSYS_INST_DEVICES_ZFSDIR */
+ LXSYS_DEV_SYS_CPU, /* LXSYS_INST_DEVICES_SYSCPU */
+	LXSYS_DEV_SYS_NODE,	/* LXSYS_INST_DEVICES_SYSNODE */
+ 0, /* LXSYS_INST_BUSDIR */
+};
+#define DYN_INO_LEN \
+ (sizeof (dyn_ino_type) / sizeof ((dyn_ino_type)[0]))
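+
+/*
+ * For example, the "net" entry under /sys/class is defined with the static
+ * instance LXSYS_INST_CLASS_NETDIR (0x4), but looking it up produces a
+ * LXSYS_CLASS_NET directory; dyn_ino_type[0x4] == LXSYS_CLASS_NET allows
+ * readdir to report an inode (and thus a d_type) consistent with that
+ * lookup.
+ */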
+
+/*
+ * file contents of an lx /sys directory.
+ */
+static lxsys_dirent_t dirlist_root[] = {
+ { LXSYS_INST_BLOCKDIR, "block" },
+ { LXSYS_INST_BUSDIR, "bus" },
+ { LXSYS_INST_CLASSDIR, "class" },
+ { LXSYS_INST_DEVICESDIR, "devices" },
+ { LXSYS_INST_FSDIR, "fs" }
+};
+static lxsys_dirent_t dirlist_class[] = {
+ { LXSYS_INST_CLASS_NETDIR, "net" }
+};
+static lxsys_dirent_t dirlist_fs[] = {
+ { LXSYS_INST_FS_CGROUPDIR, "cgroup" }
+};
+static lxsys_dirent_t dirlist_devices[] = {
+ { LXSYS_INST_DEVICES_SYSTEMDIR, "system" },
+ { LXSYS_INST_DEVICES_VIRTUALDIR, "virtual" },
+ { LXSYS_INST_DEVICES_ZFSDIR, "zfs" }
+};
+static lxsys_dirent_t dirlist_devices_virtual[] = {
+ { LXSYS_INST_DEVICES_VIRTUAL_NETDIR, "net" }
+};
+
+static lxsys_dirent_t dirlist_devices_system[] = {
+ { LXSYS_INST_DEVICES_SYSCPU, "cpu" },
+ { LXSYS_INST_DEVICES_SYSNODE, "node" }
+};
+
+#define LXSYS_ENDP_NET_ADDRESS 1
+#define LXSYS_ENDP_NET_ADDRLEN 2
+#define LXSYS_ENDP_NET_FLAGS 3
+#define LXSYS_ENDP_NET_IFINDEX 4
+#define LXSYS_ENDP_NET_MTU 5
+#define LXSYS_ENDP_NET_TXQLEN 6
+#define LXSYS_ENDP_NET_TYPE 7
+
+#define LXSYS_ENDP_BLOCK_DEVICE 1
+
+#define LXSYS_ENDP_NODE_CPULIST 1
+#define LXSYS_ENDP_NODE_CPUMAP 2
+
+static lxsys_dirent_t dirlist_devices_virtual_net[] = {
+ { LXSYS_ENDP_NET_ADDRESS, "address" },
+ { LXSYS_ENDP_NET_ADDRLEN, "addr_len" },
+ { LXSYS_ENDP_NET_FLAGS, "flags" },
+ { LXSYS_ENDP_NET_IFINDEX, "ifindex" },
+ { LXSYS_ENDP_NET_MTU, "mtu" },
+ { LXSYS_ENDP_NET_TXQLEN, "tx_queue_len" },
+ { LXSYS_ENDP_NET_TYPE, "type" }
+};
+
+static lxsys_dirent_t dirlist_devices_zfs_block[] = {
+ { LXSYS_ENDP_BLOCK_DEVICE, "device" }
+};
+
+static lxsys_dirent_t dirlist_devices_sysnode[] = {
+ { LXSYS_ENDP_NODE_CPULIST, "cpulist" },
+ { LXSYS_ENDP_NODE_CPUMAP, "cpumap" }
+};
+
+#define SYSDIRLISTSZ(l) (sizeof (l) / sizeof ((l)[0]))
+
+#define SYSDLENT(i, l) { i, l, SYSDIRLISTSZ(l) }
+static lxsys_dirlookup_t lxsys_dirlookup[] = {
+ SYSDLENT(LXSYS_INST_ROOT, dirlist_root),
+ SYSDLENT(LXSYS_INST_CLASSDIR, dirlist_class),
+ SYSDLENT(LXSYS_INST_FSDIR, dirlist_fs),
+ { LXSYS_INST_FS_CGROUPDIR, NULL, 0 },
+ SYSDLENT(LXSYS_INST_DEVICESDIR, dirlist_devices),
+ SYSDLENT(LXSYS_INST_DEVICES_SYSTEMDIR, dirlist_devices_system),
+ SYSDLENT(LXSYS_INST_DEVICES_VIRTUALDIR, dirlist_devices_virtual),
+ SYSDLENT(LXSYS_INST_DEVICES_SYSNODE, dirlist_devices_sysnode),
+ { LXSYS_INST_BUSDIR, NULL, 0 },
+};
+
+/*
+ * Array of lookup functions, indexed by lx /sys file type.
+ */
+static vnode_t *(*lxsys_lookup_function[LXSYS_MAXTYPE])() = {
+ NULL, /* LXSYS_NONE */
+ lxsys_lookup_static, /* LXSYS_STATIC */
+ lxsys_lookup_class_netdir, /* LXSYS_CLASS_NET */
+ lxsys_lookup_devices_virtual_netdir, /* LXSYS_DEV_NET */
+ lxsys_lookup_blockdir, /* LXSYS_BLOCK */
+ lxsys_lookup_devices_zfsdir, /* LXSYS_DEV_ZFS */
+ lxsys_lookup_devices_syscpu, /* LXSYS_DEV_SYS_CPU */
+ lxsys_lookup_devices_syscpuinfo, /* LXSYS_DEV_SYS_CPUINFO */
+ lxsys_lookup_devices_sysnode, /* LXSYS_DEV_SYS_NODE */
+};
+
+/*
+ * Array of readdir functions, indexed by /sys file type.
+ */
+static int (*lxsys_readdir_function[LXSYS_MAXTYPE])() = {
+ NULL, /* LXSYS_NONE */
+ lxsys_readdir_static, /* LXSYS_STATIC */
+ lxsys_readdir_class_netdir, /* LXSYS_CLASS_NET */
+ lxsys_readdir_devices_virtual_netdir, /* LXSYS_DEV_NET */
+ lxsys_readdir_blockdir, /* LXSYS_BLOCK */
+ lxsys_readdir_devices_zfsdir, /* LXSYS_DEV_ZFS */
+ lxsys_readdir_devices_syscpu, /* LXSYS_DEV_SYS_CPU */
+ lxsys_readdir_devices_syscpuinfo, /* LXSYS_DEV_SYS_CPUINFO */
+ lxsys_readdir_devices_sysnode, /* LXSYS_DEV_SYS_NODE */
+};
+
+/*
+ * Array of read functions, indexed by /sys file type.
+ */
+static int (*lxsys_read_function[LXSYS_MAXTYPE])() = {
+ NULL, /* LXSYS_NONE */
+ lxsys_read_static, /* LXSYS_STATIC */
+ NULL, /* LXSYS_CLASS_NET */
+ lxsys_read_devices_virtual_net, /* LXSYS_DEV_NET */
+ NULL, /* LXSYS_BLOCK */
+ lxsys_read_devices_zfs_block, /* LXSYS_DEV_ZFS */
+ lxsys_read_devices_syscpu, /* LXSYS_DEV_SYS_CPU */
+ NULL, /* LXSYS_DEV_SYS_CPUINFO */
+ lxsys_read_devices_sysnode, /* LXSYS_DEV_SYS_NODE */
+};
+
+/*
+ * Array of readlink functions, indexed by /sys file type.
+ */
+static int (*lxsys_readlink_function[LXSYS_MAXTYPE])() = {
+ NULL, /* LXSYS_NONE */
+ NULL, /* LXSYS_STATIC */
+ lxsys_readlink_class_net, /* LXSYS_CLASS_NET */
+ NULL, /* LXSYS_DEV_NET */
+ lxsys_readlink_block, /* LXSYS_BLOCK */
+ NULL, /* LXSYS_DEV_ZFS */
+ NULL, /* LXSYS_DEV_SYS_CPU */
+ NULL, /* LXSYS_DEV_SYS_CPUINFO */
+ NULL, /* LXSYS_DEV_SYS_NODE */
+};
+
+/*
+ * Given one of our inodes, return the vnode type.
+ *
+ * lxsys_getnode will always set the vnode type to VDIR. It expects the
+ * caller (normally the lookup functions) to fix the type. Those same rules are
+ * encoded here for our inode-to-type translation.
+ */
+int
+lxsys_ino_get_type(ino_t ino)
+{
+ lxsys_nodetype_t type;
+ unsigned int instance;
+ unsigned int endpoint;
+
+ type = (ino & 0xff000000) >> 24;
+ instance = (ino & 0xffff00) >> 8;
+ endpoint = (ino & 0xff);
+
+	if (type == LXSYS_STATIC && instance > LXSYS_INST_MAX)
+		return (VNON);
+
+ /* Validate non-static node types */
+ if (type != LXSYS_STATIC &&
+ (type <= LXSYS_STATIC || type >= LXSYS_MAXTYPE)) {
+ return (VNON);
+ }
+
+ if (type != LXSYS_STATIC) {
+ /* Non-static node types */
+ switch (type) {
+ case LXSYS_CLASS_NET:
+ if (instance != 0) {
+ return (VLNK);
+ }
+ break;
+ case LXSYS_DEV_NET:
+ /*
+ * /sys/devices/virtual/net usually has the eth0 and
+ * lo directories. Each network device directory is an
+			 * lo directories. Each network device directory is an
+			 * instance with a 0 endpoint. The files within
+ */
+ if (endpoint != 0) {
+ return (VREG);
+ }
+ break;
+ case LXSYS_BLOCK:
+ if (instance != 0) {
+ return (VLNK);
+ }
+ break;
+ case LXSYS_DEV_ZFS:
+ /*
+ * /sys/devices/zfs usually has the zfsds0 directory
+ * instance with a 0 endpoint. The device file within
+ * that directory has a non-0 endpoint.
+ */
+ if (endpoint != 0) {
+ return (VREG);
+ }
+ break;
+ case LXSYS_DEV_SYS_CPU:
+ if (instance != 0) {
+ return (VREG);
+ }
+ break;
+ case LXSYS_DEV_SYS_CPUINFO:
+ /*
+ * There is an instance of /sys/devices/system/cpu/cpuN
+ * for each CPU. These have an instance per CPU and
+ * currently the endpoint is 0 since there is nothing
+ * underneath the cpuN subdirectories. Future
+ * regular file entries are likely to be added there.
+ */
+ if (endpoint != 0) {
+ return (VREG);
+ }
+ break;
+ case LXSYS_DEV_SYS_NODE:
+ /*
+ * /sys/devices/system/node has the node0 directory
+ * instance with a 0 endpoint. The cpulist file within
+ * that directory has a non-0 endpoint.
+ */
+ if (endpoint != 0) {
+ return (VREG);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ return (VDIR);
+}
+
+/*
+ * lxsys_open(): Vnode operation for VOP_OPEN()
+ */
+/* ARGSUSED */
+static int
+lxsys_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ /*
+ * We only allow reading in this file system
+ */
+ if (flag & FWRITE)
+ return (EROFS);
+
+ return (0);
+}
+
+/*
+ * lxsys_close(): Vnode operation for VOP_CLOSE()
+ */
+/* ARGSUSED */
+static int
+lxsys_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+ return (0);
+}
+
+/*
+ * lxsys_read(): Vnode operation for VOP_READ()
+ * All we currently have in this fs are directories.
+ */
+/* ARGSUSED */
+static int
+lxsys_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxsys_node_t *lnp = VTOLXS(vp);
+ lxsys_nodetype_t type = lnp->lxsys_type;
+ int (*rlfunc)();
+ int error;
+ lxsys_uiobuf_t *luio;
+
+ VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE);
+
+ if (vp->v_type == VDIR) {
+ return (EISDIR);
+ }
+
+ rlfunc = lxsys_read_function[type];
+ if (rlfunc != NULL) {
+ luio = lxsys_uiobuf_new(uiop);
+ if ((error = rlfunc(lnp, luio)) == 0) {
+ error = lxsys_uiobuf_flush(luio);
+ }
+ lxsys_uiobuf_free(luio);
+ } else {
+ error = EIO;
+ }
+
+ return (error);
+}
+
+/*
+ * lxsys_getattr(): Vnode operation for VOP_GETATTR()
+ */
+/* ARGSUSED */
+static int
+lxsys_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ register lxsys_node_t *lxsnp = VTOLXS(vp);
+
+	/* Default attributes, which may be overridden below */
+ bzero(vap, sizeof (*vap));
+ vap->va_atime = vap->va_mtime = vap->va_ctime = lxsnp->lxsys_time;
+ vap->va_nlink = 1;
+ vap->va_type = vp->v_type;
+ vap->va_mode = lxsnp->lxsys_mode;
+ vap->va_fsid = vp->v_vfsp->vfs_dev;
+ vap->va_blksize = DEV_BSIZE;
+ vap->va_uid = lxsnp->lxsys_uid;
+ vap->va_gid = lxsnp->lxsys_gid;
+ vap->va_nodeid = lxsnp->lxsys_ino;
+
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
+ return (0);
+}
+
+/*
+ * lxsys_access(): Vnode operation for VOP_ACCESS()
+ */
+/* ARGSUSED */
+static int
+lxsys_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
+{
+ lxsys_node_t *lxsnp = VTOLXS(vp);
+ int shift = 0;
+
+ /*
+	 * Although our lx sysfs is basically a read-only file system, Linux
+	 * expects it to be writable, so we can't simply fail if (mode & VWRITE)
+	 * is set.
+ */
+
+ /* If user is root allow access regardless of permission bits */
+ if (secpolicy_proc_access(cr) == 0)
+ return (0);
+
+ /*
+ * Access check is based on only one of owner, group, public. If not
+ * owner, then check group. If not a member of the group, then check
+ * public access.
+ */
+ if (crgetuid(cr) != lxsnp->lxsys_uid) {
+ shift += 3;
+ if (!groupmember((uid_t)lxsnp->lxsys_gid, cr))
+ shift += 3;
+ }
+
+ mode &= ~(lxsnp->lxsys_mode << shift);
+
+ if (mode == 0)
+ return (0);
+
+ return (EACCES);
+}
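+
+/*
+ * Editorial aside -- a worked example of the class-shift check above
+ * (illustrative only). Permission bits live in octal triplets (owner 0700,
+ * group 0070, other 0007) while the requested mode arrives in the owner
+ * position (VREAD = 0400, VWRITE = 0200, VEXEC = 0100). Shifting the file
+ * mode left by 3 for each rejected class lines the relevant triplet up
+ * under the request:
+ *
+ *	lxsys_mode = 0444, caller is neither owner nor group member:
+ *	shift = 6, 0444 << 6 = 044400, requested mode = VREAD = 0400
+ *	0400 & ~044400 == 0, so read access is granted
+ */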
+
+/*
+ * lxsys_lookup(): Vnode operation for VOP_LOOKUP()
+ */
+/* ARGSUSED */
+static int
+lxsys_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ lxsys_node_t *lxsnp = VTOLXS(dp);
+ lxsys_nodetype_t type = lxsnp->lxsys_type;
+ int error;
+
+ VERIFY(dp->v_type == VDIR);
+ VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE);
+
+ /*
+	 * Ensure the caller has search permission on the directory
+	 * (lxsys_access() grants root unconditionally)
+ */
+ if ((error = lxsys_access(dp, VEXEC, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * Just return the parent vnode if that's where we are trying to go.
+ */
+ if (strcmp(comp, "..") == 0) {
+ VN_HOLD(lxsnp->lxsys_parentvp);
+ *vpp = lxsnp->lxsys_parentvp;
+ return (0);
+ }
+
+ /*
+ * Special handling for directory searches. Note: null component name
+ * denotes that the current directory is being searched.
+ */
+ if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) {
+ VN_HOLD(dp);
+ *vpp = dp;
+ return (0);
+ }
+
+ *vpp = (lxsys_lookup_function[type](lxsnp, comp));
+ return ((*vpp == NULL) ? ENOENT : 0);
+}
+
+static lxsys_node_t *
+lxsys_lookup_disk(lxsys_node_t *ldp, char *comp, lxsys_nodetype_t type)
+{
+ lxsys_node_t *lnp = NULL;
+ lx_zone_data_t *lxzdata;
+ lx_virt_disk_t *vd;
+
+ lxzdata = ztolxzd(curproc->p_zone);
+ if (lxzdata == NULL)
+ return (NULL);
+ ASSERT(lxzdata->lxzd_vdisks != NULL);
+
+ vd = list_head(lxzdata->lxzd_vdisks);
+ while (vd != NULL) {
+ int inst = getminor(vd->lxvd_emul_dev) & 0xffff;
+
+ if (strcmp(vd->lxvd_name, comp) == 0 && inst != 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode, type, inst, 0);
+ break;
+ }
+
+ vd = list_next(lxzdata->lxzd_vdisks, vd);
+ }
+
+ return (lnp);
+}
+
+static vnode_t *
+lxsys_lookup_static(lxsys_node_t *ldp, char *comp)
+{
+ lxsys_dirent_t *dirent = NULL;
+ int i, len = 0;
+
+ for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) {
+ if (ldp->lxsys_instance == lxsys_dirlookup[i].dl_instance) {
+ dirent = lxsys_dirlookup[i].dl_list;
+ len = lxsys_dirlookup[i].dl_length;
+ break;
+ }
+ }
+ if (dirent == NULL) {
+ return (NULL);
+ }
+
+ for (i = 0; i < len; i++) {
+ if (strncmp(comp, dirent[i].d_name, MAXPATHLEN) == 0) {
+ lxsys_nodetype_t node_type = ldp->lxsys_type;
+ unsigned int node_instance = 0;
+ lxsys_node_t *lnp;
+
+ switch (dirent[i].d_idnum) {
+ case LXSYS_INST_BLOCKDIR:
+ node_type = LXSYS_BLOCK;
+ break;
+ case LXSYS_INST_CLASS_NETDIR:
+ node_type = LXSYS_CLASS_NET;
+ break;
+ case LXSYS_INST_DEVICES_VIRTUAL_NETDIR:
+ node_type = LXSYS_DEV_NET;
+ break;
+ case LXSYS_INST_DEVICES_ZFSDIR:
+ node_type = LXSYS_DEV_ZFS;
+ break;
+ case LXSYS_INST_DEVICES_SYSCPU:
+ node_type = LXSYS_DEV_SYS_CPU;
+ break;
+ case LXSYS_INST_DEVICES_SYSNODE:
+ node_type = LXSYS_DEV_SYS_NODE;
+ break;
+ default:
+ /* Another static node */
+ node_instance = dirent[i].d_idnum;
+ }
+ if (node_type == LXSYS_STATIC) {
+ lnp = lxsys_getnode_static(ldp->lxsys_vnode,
+ node_instance);
+ } else {
+ lnp = lxsys_getnode(ldp->lxsys_vnode,
+ node_type, node_instance, 0);
+ }
+ return (lnp->lxsys_vnode);
+ }
+ }
+ return (NULL);
+}
+
+static vnode_t *
+lxsys_lookup_class_netdir(lxsys_node_t *ldp, char *comp)
+{
+ vnode_t *result = NULL;
+ lxsys_node_t *lnp;
+ netstack_t *ns;
+ ip_stack_t *ipst;
+ avl_tree_t *phytree;
+ phyint_t *phyi;
+ char ifname[LIFNAMSIZ];
+
+ if (ldp->lxsys_type != LXSYS_CLASS_NET ||
+ ldp->lxsys_instance != 0) {
+ /* Lookups only allowed at directory level */
+ return (NULL);
+ }
+
+ (void) strncpy(ifname, comp, LIFNAMSIZ);
+ lx_ifname_convert(ifname, LX_IF_TONATIVE);
+
+ if ((ns = lxsys_netstack(ldp)) == NULL) {
+ return (NULL);
+ }
+ ipst = ns->netstack_ip;
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
+ phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name;
+ phyi = avl_find(phytree, ifname, NULL);
+ if (phyi != NULL) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type,
+ phyi->phyint_ifindex, 0);
+ result = lnp->lxsys_vnode;
+ result->v_type = VLNK;
+ }
+
+ rw_exit(&ipst->ips_ill_g_lock);
+ netstack_rele(ns);
+
+ return (result);
+}
+
+static vnode_t *
+lxsys_lookup_devices_virtual_netdir(lxsys_node_t *ldp, char *comp)
+{
+ lxsys_node_t *lnp;
+
+ if (ldp->lxsys_instance == 0) {
+ /* top-level interface listing */
+ vnode_t *result = NULL;
+ netstack_t *ns;
+ ip_stack_t *ipst;
+ avl_tree_t *phytree;
+ phyint_t *phyi;
+ char ifname[LIFNAMSIZ];
+
+ (void) strncpy(ifname, comp, LIFNAMSIZ);
+ lx_ifname_convert(ifname, LX_IF_TONATIVE);
+
+ if ((ns = lxsys_netstack(ldp)) == NULL) {
+ return (NULL);
+ }
+ ipst = ns->netstack_ip;
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
+ phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name;
+ phyi = avl_find(phytree, ifname, NULL);
+ if (phyi != NULL) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type,
+ phyi->phyint_ifindex, 0);
+ result = lnp->lxsys_vnode;
+ }
+
+ rw_exit(&ipst->ips_ill_g_lock);
+ netstack_rele(ns);
+
+ return (result);
+ } else if (ldp->lxsys_endpoint == 0) {
+ /* interface-level sub-item listing */
+ int i, size;
+ lxsys_dirent_t *dirent;
+
+ size = SYSDIRLISTSZ(dirlist_devices_virtual_net);
+ for (i = 0; i < size; i++) {
+ dirent = &dirlist_devices_virtual_net[i];
+ if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode,
+ ldp->lxsys_type, ldp->lxsys_instance,
+ dirent->d_idnum);
+ lnp->lxsys_vnode->v_type = VREG;
+ lnp->lxsys_mode = 0444;
+ return (lnp->lxsys_vnode);
+ }
+ }
+ }
+
+ return (NULL);
+}
+
+static vnode_t *
+lxsys_lookup_blockdir(lxsys_node_t *ldp, char *comp)
+{
+ lxsys_node_t *lnp;
+
+ if (ldp->lxsys_instance == 0) {
+ /* top-level dev listing */
+ lnp = lxsys_lookup_disk(ldp, comp, LXSYS_BLOCK);
+
+ if (lnp != NULL) {
+ lnp->lxsys_vnode->v_type = VLNK;
+ return (lnp->lxsys_vnode);
+ }
+ }
+
+ return (NULL);
+}
+
+static vnode_t *
+lxsys_lookup_devices_zfsdir(lxsys_node_t *ldp, char *comp)
+{
+ lxsys_node_t *lnp;
+
+ if (ldp->lxsys_instance == 0) {
+ /* top-level dev listing */
+ lnp = lxsys_lookup_disk(ldp, comp, LXSYS_DEV_ZFS);
+
+ if (lnp != NULL) {
+ return (lnp->lxsys_vnode);
+ }
+ } else if (ldp->lxsys_endpoint == 0) {
+ /* disk-level sub-item listing */
+ int i, size;
+ lxsys_dirent_t *dirent;
+
+ /*
+		 * All of these entries are currently presented as regular
+		 * files, but on a real Linux system some are subdirs. This
+		 * should be fixed when we populate the directory for real.
+ */
+ size = SYSDIRLISTSZ(dirlist_devices_zfs_block);
+ for (i = 0; i < size; i++) {
+ dirent = &dirlist_devices_zfs_block[i];
+ if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode,
+ ldp->lxsys_type, ldp->lxsys_instance,
+ dirent->d_idnum);
+ lnp->lxsys_vnode->v_type = VREG;
+ lnp->lxsys_mode = 0444;
+ return (lnp->lxsys_vnode);
+ }
+ }
+ }
+
+ return (NULL);
+}
+
+static vnode_t *
+lxsys_lookup_devices_syscpu(lxsys_node_t *ldp, char *comp)
+{
+ lxsys_node_t *lnp = NULL;
+
+ if (ldp->lxsys_instance == 0) {
+ /* top-level cpu listing */
+
+ /* If fixed entry */
+ if (strcmp(comp, "kernel_max") == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU,
+ LXSYS_INST_DEV_SYSCPU_KMAX, 0);
+ lnp->lxsys_vnode->v_type = VREG;
+ lnp->lxsys_mode = 0444;
+ } else if (strcmp(comp, "offline") == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU,
+ LXSYS_INST_DEV_SYSCPU_OFFLINE, 0);
+ lnp->lxsys_vnode->v_type = VREG;
+ lnp->lxsys_mode = 0444;
+ } else if (strcmp(comp, "online") == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU,
+ LXSYS_INST_DEV_SYSCPU_ONLINE, 0);
+ lnp->lxsys_vnode->v_type = VREG;
+ lnp->lxsys_mode = 0444;
+ } else if (strcmp(comp, "possible") == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU,
+ LXSYS_INST_DEV_SYSCPU_POSSIBLE, 0);
+ lnp->lxsys_vnode->v_type = VREG;
+ lnp->lxsys_mode = 0444;
+ } else if (strcmp(comp, "present") == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU,
+ LXSYS_INST_DEV_SYSCPU_PRESENT, 0);
+ lnp->lxsys_vnode->v_type = VREG;
+ lnp->lxsys_mode = 0444;
+ } else {
+ /* Else dynamic cpuN entry */
+ cpuset_t *avail; /* all installed CPUs */
+ uint_t i, avlo, avhi;
+
+ avail = cpuset_alloc(KM_SLEEP);
+ cpuset_all(avail);
+
+ /* Take a snapshot of the available set */
+ mutex_enter(&cpu_lock);
+ cpuset_and(avail, &cpu_available);
+ mutex_exit(&cpu_lock);
+
+ cpuset_bounds(avail, &avlo, &avhi);
+
+ for (i = avlo; i <= avhi; i++) {
+ char cpunm[16];
+
+ if (!cpu_in_set(avail, i))
+ continue;
+
+ (void) snprintf(cpunm, sizeof (cpunm), "cpu%u",
+ i);
+
+ if (strcmp(comp, cpunm) == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode,
+ LXSYS_DEV_SYS_CPUINFO, i + 1, 0);
+ break;
+ }
+ }
+ cpuset_free(avail);
+ }
+
+ if (lnp != NULL) {
+ return (lnp->lxsys_vnode);
+ }
+ } else if (ldp->lxsys_endpoint == 0) {
+ /* cpu-level sub-item listing, currently empty */
+ /* EMPTY */
+ }
+
+ return (NULL);
+}
+
+/* ARGSUSED */
+static vnode_t *
+lxsys_lookup_devices_syscpuinfo(lxsys_node_t *ldp, char *comp)
+{
+ return (NULL);
+}
+
+static vnode_t *
+lxsys_lookup_devices_sysnode(lxsys_node_t *ldp, char *comp)
+{
+ lxsys_node_t *lnp = NULL;
+
+ if (ldp->lxsys_instance == 0) {
+ /*
+ * The system is presently represented as a single node,
+ * regardless of any NUMA topology which exists.
+ * The instances are offset by 1 to account for the top level
+ * directory occupying instance 0.
+ */
+ if (strcmp(comp, "node0") == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type,
+ 1, 0);
+ return (lnp->lxsys_vnode);
+ }
+ } else {
+ /* interface-level sub-item listing */
+ int i, size;
+ lxsys_dirent_t *dirent;
+
+ size = SYSDIRLISTSZ(dirlist_devices_sysnode);
+ for (i = 0; i < size; i++) {
+ dirent = &dirlist_devices_sysnode[i];
+ if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) {
+ lnp = lxsys_getnode(ldp->lxsys_vnode,
+ ldp->lxsys_type, ldp->lxsys_instance,
+ dirent->d_idnum);
+ lnp->lxsys_vnode->v_type = VREG;
+ lnp->lxsys_mode = 0444;
+ return (lnp->lxsys_vnode);
+ }
+ }
+ }
+
+ return (NULL);
+}
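+
+/*
+ * Editorial aside (illustrative): with the offset-by-1 scheme above,
+ * "node0" is instance 1, so the node returned here carries the same
+ * inode as the lxsys_inode(LXSYS_DEV_SYS_NODE, 1, 0) value emitted for
+ * the "node0" dirent by lxsys_readdir_devices_sysnode() below --
+ * 0x08000100, assuming LXSYS_DEV_SYS_NODE == 8 per the dispatch-table
+ * order.
+ */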
+
+static int
+lxsys_read_devices_virtual_net(lxsys_node_t *lnp, lxsys_uiobuf_t *luio)
+{
+ netstack_t *ns;
+ ill_t *ill;
+ uint_t ifindex = lnp->lxsys_instance;
+ uint8_t *addr;
+ uint64_t flags;
+ int error = 0;
+
+ if (ifindex == 0 || lnp->lxsys_endpoint == 0) {
+ return (EISDIR);
+ }
+
+ if ((ns = lxsys_netstack(lnp)) == NULL) {
+ return (EIO);
+ }
+
+ ill = lxsys_find_ill(ns->netstack_ip, ifindex);
+ if (ill == NULL) {
+ netstack_rele(ns);
+ return (EIO);
+ }
+
+ switch (lnp->lxsys_endpoint) {
+ case LXSYS_ENDP_NET_ADDRESS:
+ if (ill->ill_phys_addr_length != ETHERADDRL) {
+ lxsys_uiobuf_printf(luio, "00:00:00:00:00:00\n");
+ break;
+ }
+ addr = ill->ill_phys_addr;
+ lxsys_uiobuf_printf(luio,
+ "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx\n",
+ addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]);
+ break;
+ case LXSYS_ENDP_NET_ADDRLEN:
+ lxsys_uiobuf_printf(luio, "%u\n",
+ IS_LOOPBACK(ill) ? ETHERADDRL : ill->ill_phys_addr_length);
+ break;
+ case LXSYS_ENDP_NET_FLAGS:
+ flags = (ill->ill_flags | ill->ill_ipif->ipif_flags |
+ ill->ill_phyint->phyint_flags) & 0xffff;
+ lx_ifflags_convert(&flags, LX_IF_FROMNATIVE);
+ lxsys_uiobuf_printf(luio, "0x%x\n", flags);
+ break;
+ case LXSYS_ENDP_NET_IFINDEX:
+ lxsys_uiobuf_printf(luio, "%u\n", ifindex);
+ break;
+ case LXSYS_ENDP_NET_MTU:
+ lxsys_uiobuf_printf(luio, "%u\n", ill->ill_mtu);
+ break;
+ case LXSYS_ENDP_NET_TXQLEN:
+ /* perpetuate the txqlen lie */
+ if (IS_LOOPBACK(ill)) {
+ lxsys_uiobuf_printf(luio, "0\n");
+ } else {
+ lxsys_uiobuf_printf(luio, "1\n");
+ }
+ break;
+ case LXSYS_ENDP_NET_TYPE:
+ lxsys_uiobuf_printf(luio, "%u\n",
+ IS_LOOPBACK(ill) ? LX_ARPHRD_LOOPBACK :
+ arp_hw_type(ill->ill_mactype));
+ break;
+ default:
+ error = EIO;
+ }
+
+ ill_refrele(ill);
+ netstack_rele(ns);
+ return (error);
+}
+
+/* ARGSUSED1 */
+static int
+lxsys_read_devices_zfs_block(lxsys_node_t *lnp, lxsys_uiobuf_t *luio)
+{
+ uint_t dskindex = lnp->lxsys_instance;
+
+ if (dskindex == 0 || lnp->lxsys_endpoint == 0) {
+ return (EISDIR);
+ }
+
+ return (EIO);
+}
+
+/*
+ * In the Linux src tree, see ABI/stable/sysfs-devices-node.
+ *
+ * For the 'cpumap' file, each CPU is treated as a bit, then those are
+ * accumulated and printed as a hex digit, with CPU0 as the rightmost bit.
+ * Each set of 8 digits (i.e. 32 CPUs) is then delimited with a comma.
+ * Since we are emulating a single NUMA group, all of our CPUs will be listed
+ * in this file. For example, a 48 CPU system would look like:
+ * 00000000,00000000,00000000,00000000,00000000,00000000,0000ffff,ffffffff
+ * It comes out this way because the map is sized to NCPU, which is currently
+ * defined to be 256 (the 'kernel_max' file itself reports NCPU - 1).
+ */
+static int
+lxsys_read_devices_sysnode(lxsys_node_t *lnp, lxsys_uiobuf_t *luio)
+{
+ if (lnp->lxsys_instance == 1) {
+ char outbuf[256];
+
+ if (lnp->lxsys_endpoint == LXSYS_ENDP_NODE_CPULIST) {
+ /* Show the range of CPUs */
+ lxsys_format_cpu(outbuf, sizeof (outbuf),
+ LXSYS_CPU_ANY);
+ } else if (lnp->lxsys_endpoint == LXSYS_ENDP_NODE_CPUMAP) {
+ int i;
+ uint_t j, ndigits;
+ cpuset_t *avail; /* all installed CPUs */
+
+ avail = cpuset_alloc(KM_SLEEP);
+ cpuset_all(avail);
+
+ /* Take a snapshot of the available set */
+ mutex_enter(&cpu_lock);
+ cpuset_and(avail, &cpu_available);
+ mutex_exit(&cpu_lock);
+
+ outbuf[0] = '\0';
+ ndigits = 0;
+ for (i = NCPU - 1; i >= 0; i -= 4) {
+ char buf[8];
+ int cnt = 3;
+ uint_t digit = 0;
+
+ for (j = i; cnt >= 0; j--, cnt--) {
+ if (cpu_in_set(avail, j))
+ digit |= 1 << cnt;
+ }
+ (void) snprintf(buf, sizeof (buf), "%x", digit);
+ if (ndigits == 8) {
+ (void) strlcat(outbuf, ",",
+ sizeof (outbuf));
+ ndigits = 0;
+ }
+ (void) strlcat(outbuf, buf, sizeof (outbuf));
+ ndigits++;
+ }
+
+ cpuset_free(avail);
+ } else {
+ return (EISDIR);
+ }
+
+ lxsys_uiobuf_printf(luio, "%s\n", outbuf);
+ return (0);
+ }
+ return (EISDIR);
+}
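+
+/*
+ * Editorial aside -- a user-space analogue of the cpumap loop above (a
+ * sketch only; fmt_cpumap() is an invented name, ncpu is assumed to be a
+ * multiple of 4, and CPUs 0 through avail - 1 are assumed installed):
+ *
+ *	static void
+ *	fmt_cpumap(char *out, size_t olen, int ncpu, int avail)
+ *	{
+ *		int i, cnt, ndigits = 0;
+ *
+ *		out[0] = '\0';
+ *		for (i = ncpu - 1; i >= 0; i -= 4) {
+ *			char buf[8];
+ *			int digit = 0;
+ *
+ *			for (cnt = 3; cnt >= 0; cnt--) {
+ *				if (i - (3 - cnt) < avail)
+ *					digit |= 1 << cnt;
+ *			}
+ *			(void) snprintf(buf, sizeof (buf), "%x", digit);
+ *			if (ndigits == 8) {
+ *				(void) strlcat(out, ",", olen);
+ *				ndigits = 0;
+ *			}
+ *			(void) strlcat(out, buf, olen);
+ *			ndigits++;
+ *		}
+ *	}
+ *
+ * fmt_cpumap(out, sizeof (out), 256, 48) reproduces the 48-CPU example
+ * string from the block comment above.
+ */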
+
+static void
+lxsys_format_range(char *buf, int blen, boolean_t *first, uint_t start,
+ uint_t cnt)
+{
+ char tmp[256];
+ char *delim;
+
+ if (cnt == 0)
+ return;
+
+ if (*first) {
+ *first = B_FALSE;
+ delim = "";
+ } else {
+ delim = ",";
+ }
+ if (cnt > 1) {
+ (void) snprintf(tmp, sizeof (tmp), "%s%u-%u", delim, start,
+ start + cnt - 1);
+ } else {
+ (void) snprintf(tmp, sizeof (tmp), "%s%u", delim, start);
+ }
+ (void) strlcat(buf, tmp, blen);
+}
+
+/*
+ * Format a string of which CPUs are online, offline, or don't care (depending
+ * on chk_state), and which would be formatted like this:
+ * 0-31
+ * or
+ * 0-12,14,20-31
+ */
+static void
+lxsys_format_cpu(char *buf, int blen, lxsys_cpu_state_t chk_state)
+{
+ uint_t start, cnt, avlo, avhi;
+ boolean_t first = B_TRUE;
+ cpuset_t *active; /* CPUs online */
+ cpuset_t *avail; /* all installed CPUs */
+
+ active = cpuset_alloc(KM_SLEEP);
+ avail = cpuset_alloc(KM_SLEEP);
+ cpuset_all(active);
+ cpuset_all(avail);
+
+ /* Take a snapshot of the available and active sets */
+ mutex_enter(&cpu_lock);
+ cpuset_and(avail, &cpu_available);
+ cpuset_and(active, &cpu_active_set);
+ mutex_exit(&cpu_lock);
+
+ cpuset_bounds(avail, &avlo, &avhi);
+
+ buf[0] = '\0';
+ if (chk_state == LXSYS_CPU_ANY) {
+ start = avlo;
+		cnt = avhi - avlo + 1;
+ } else {
+ uint_t i;
+ boolean_t incl_cpu = B_TRUE;
+
+ start = 0;
+ cnt = 0;
+ for (i = avlo; i <= avhi; i++) {
+ if (chk_state == LXSYS_CPU_ON) {
+ if (!cpu_in_set(active, i))
+ incl_cpu = B_FALSE;
+ } else {
+ if (cpu_in_set(active, i))
+ incl_cpu = B_FALSE;
+ }
+
+ if (incl_cpu && cpu_in_set(avail, i)) {
+ cnt++;
+ } else {
+ /*
+ * Note: this may print nothing if our 'cnt'
+ * is 0, but we advance 'start' properly so we
+ * handle the next range of elements we're
+ * looking for.
+ */
+ lxsys_format_range(buf, blen, &first, start,
+ cnt);
+ start += cnt + 1;
+ cnt = 0;
+ incl_cpu = B_TRUE;
+ }
+ }
+ }
+
+ cpuset_free(avail);
+ cpuset_free(active);
+
+ lxsys_format_range(buf, blen, &first, start, cnt);
+}
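+
+/*
+ * Editorial aside -- how successive lxsys_format_range() calls compose the
+ * example string above (illustrative):
+ *
+ *	boolean_t first = B_TRUE;
+ *	char buf[64] = "";
+ *
+ *	lxsys_format_range(buf, sizeof (buf), &first, 0, 13);	now "0-12"
+ *	lxsys_format_range(buf, sizeof (buf), &first, 14, 1);	now "0-12,14"
+ *	lxsys_format_range(buf, sizeof (buf), &first, 20, 12);	now "0-12,14,20-31"
+ */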
+
+static int
+lxsys_read_devices_syscpu(lxsys_node_t *lnp, lxsys_uiobuf_t *luio)
+{
+ uint_t inst = lnp->lxsys_instance;
+ char outbuf[256];
+
+ /*
+ * For 'kernel_max', 'offline', 'online', 'possible', and 'present',
+	 * see the Documentation/cputopology.txt file in the Linux src tree.
+ */
+ if (inst == LXSYS_INST_DEV_SYSCPU_KMAX) {
+ lxsys_uiobuf_printf(luio, "%d\n", NCPU - 1);
+ return (0);
+ }
+
+ if (inst == LXSYS_INST_DEV_SYSCPU_OFFLINE) {
+ lxsys_format_cpu(outbuf, sizeof (outbuf), LXSYS_CPU_OFF);
+ lxsys_uiobuf_printf(luio, "%s\n", outbuf);
+ return (0);
+ }
+
+ if (inst == LXSYS_INST_DEV_SYSCPU_ONLINE) {
+ lxsys_format_cpu(outbuf, sizeof (outbuf), LXSYS_CPU_ON);
+ lxsys_uiobuf_printf(luio, "%s\n", outbuf);
+ return (0);
+ }
+
+ if (inst == LXSYS_INST_DEV_SYSCPU_POSSIBLE ||
+ inst == LXSYS_INST_DEV_SYSCPU_PRESENT) {
+ lxsys_format_cpu(outbuf, sizeof (outbuf), LXSYS_CPU_ANY);
+ lxsys_uiobuf_printf(luio, "%s\n", outbuf);
+ return (0);
+ }
+
+ /* All other nodes are directories */
+ return (EISDIR);
+}
+
+/* ARGSUSED */
+static int
+lxsys_read_static(lxsys_node_t *lnp, lxsys_uiobuf_t *luio)
+{
+ /* All static nodes are directories */
+ return (EISDIR);
+}
+
+/*
+ * lxsys_readdir(): Vnode operation for VOP_READDIR()
+ */
+/* ARGSUSED */
+static int
+lxsys_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ lxsys_node_t *lxsnp = VTOLXS(dp);
+ lxsys_nodetype_t type = lxsnp->lxsys_type;
+ ssize_t uresid;
+ off_t uoffset;
+ int error, leof;
+
+ ASSERT(dp->v_type == VDIR);
+ VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE);
+
+ /*
+	 * Ensure the caller has read permission on the directory
+	 * (lxsys_access() grants root unconditionally)
+ */
+ if ((error = lxsys_access(dp, VREAD, 0, cr, ct)) != 0)
+ return (error);
+
+ uoffset = uiop->uio_offset;
+ uresid = uiop->uio_resid;
+
+ /* can't do negative reads */
+ if (uoffset < 0 || uresid <= 0)
+ return (EINVAL);
+
+ /* can't read directory entries that don't exist! */
+ if (uoffset % LXSYS_SDSIZE)
+ return (ENOENT);
+
+ /* Free lower functions from having to check eofp == NULL */
+ if (eofp == NULL) {
+ eofp = &leof;
+ }
+
+ return (lxsys_readdir_function[lxsnp->lxsys_type](lxsnp, uiop, eofp));
+}
+
+static int
+lxsys_dirent_out(dirent64_t *d, ushort_t n, struct uio *uio)
+{
+ int error;
+ off_t offset = uio->uio_offset;
+
+ /*
+ * uiomove() updates both uiop->uio_resid and uiop->uio_offset by the
+ * same amount. But we want uiop->uio_offset to change in increments
+ * of LXSYS_SDSIZE, which is different from the number of bytes being
+ * returned to the user. To accomplish this, we set uiop->uio_offset
+ * separately on success, overriding what uiomove() does.
+ */
+ d->d_off = (off64_t)(offset + LXSYS_SDSIZE);
+ d->d_reclen = n;
+ if ((error = uiomove(d, n, UIO_READ, uio)) != 0) {
+ return (error);
+ }
+ uio->uio_offset = offset + LXSYS_SDSIZE;
+ return (0);
+}
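+
+/*
+ * Editorial aside (illustrative): because of the fixed stride enforced
+ * here, every directory slot is LXSYS_SDSIZE wide no matter what d_reclen
+ * was copied out, so readdir offsets map to entries as
+ *
+ *	offset 0			"."
+ *	offset LXSYS_SDSIZE		".."
+ *	offset (i + 2) * LXSYS_SDSIZE	dirtab[i]
+ *
+ * which is why the readdir functions compute
+ * dirindex = (uoffset / LXSYS_SDSIZE) - 2, and why lxsys_readdir() rejects
+ * offsets that are not multiples of LXSYS_SDSIZE with ENOENT.
+ */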
+
+/*
+ * This has the common logic for returning directory entries
+ */
+static int
+lxsys_readdir_common(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp,
+ lxsys_dirent_t *dirtab, int dirtablen)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+
+ oresid = uiop->uio_resid;
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /* Satisfy user request */
+ while ((uresid = uiop->uio_resid) > 0) {
+ int dirindex;
+ off_t uoffset;
+ int reclen;
+ int error;
+
+ uoffset = uiop->uio_offset;
+ dirindex = (uoffset / LXSYS_SDSIZE) - 2;
+
+ if (uoffset == 0) {
+
+ dirent->d_ino = lxsnp->lxsys_ino;
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '\0';
+ reclen = DIRENT64_RECLEN(1);
+
+ } else if (uoffset == LXSYS_SDSIZE) {
+
+ dirent->d_ino = lxsys_parentinode(lxsnp);
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '.';
+ dirent->d_name[2] = '\0';
+ reclen = DIRENT64_RECLEN(2);
+
+ } else if (dirindex >= 0 && dirindex < dirtablen) {
+
+ int slen = strlen(dirtab[dirindex].d_name);
+ int idnum, ino_type = 0;
+
+ idnum = dirtab[dirindex].d_idnum;
+ if (idnum > 0 && idnum < DYN_INO_LEN)
+ ino_type = dyn_ino_type[idnum];
+
+ if (ino_type != 0) {
+ /*
+ * Correct the inode for static directories
+ * which contain non-static lxsys_nodetype_t's.
+ */
+ dirent->d_ino = lxsys_inode(ino_type, 0, 0);
+ DTRACE_PROBE3(lxsys__fix__inode,
+ char *, dirtab[dirindex].d_name,
+ int, ino_type, int, dirent->d_ino);
+ } else {
+ dirent->d_ino = lxsys_inode(LXSYS_STATIC,
+ idnum, 0);
+ }
+
+ (void) strcpy(dirent->d_name, dirtab[dirindex].d_name);
+ reclen = DIRENT64_RECLEN(slen);
+
+ } else {
+ /* Run out of table entries */
+ *eofp = 1;
+ return (0);
+ }
+
+ /*
+ * If the size of the data to transfer is greater than the
+ * user-provided buffer, we cannot continue.
+ */
+ if (reclen > uresid) {
+ /* Error if no entries have been returned yet. */
+ if (uresid == oresid) {
+ return (EINVAL);
+ }
+ break;
+ }
+
+ if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+ return (error);
+ }
+ }
+
+ /* Have run out of space, but could have just done last table entry */
+ *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 1 : 0;
+ return (0);
+}
+
+static int
+lxsys_readdir_subdir(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp,
+ lxsys_dirent_t *dirtab, int dirtablen)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+
+ VERIFY(dirtab != NULL || dirtablen == 0);
+
+ oresid = uiop->uio_resid;
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /* Satisfy user request */
+ while ((uresid = uiop->uio_resid) > 0) {
+ int dirindex;
+ off_t uoffset;
+ int reclen;
+ int error;
+
+ uoffset = uiop->uio_offset;
+ dirindex = (uoffset / LXSYS_SDSIZE) - 2;
+
+ if (uoffset == 0) {
+
+ dirent->d_ino = lxsnp->lxsys_ino;
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '\0';
+ reclen = DIRENT64_RECLEN(1);
+
+ } else if (uoffset == LXSYS_SDSIZE) {
+
+ dirent->d_ino = lxsys_parentinode(lxsnp);
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '.';
+ dirent->d_name[2] = '\0';
+ reclen = DIRENT64_RECLEN(2);
+
+ } else if (dirindex >= 0 && dirindex < dirtablen) {
+
+ int slen = strlen(dirtab[dirindex].d_name);
+
+ dirent->d_ino = lxsys_inode(lxsnp->lxsys_type,
+ lxsnp->lxsys_instance, dirtab[dirindex].d_idnum);
+ (void) strcpy(dirent->d_name, dirtab[dirindex].d_name);
+ reclen = DIRENT64_RECLEN(slen);
+
+ } else {
+ /* Run out of table entries */
+ *eofp = 1;
+ return (0);
+ }
+
+ /*
+ * If the size of the data to transfer is greater than the
+ * user-provided buffer, we cannot continue.
+ */
+ if (reclen > uresid) {
+ /* Error if no entries have been returned yet. */
+ if (uresid == oresid) {
+ return (EINVAL);
+ }
+ break;
+ }
+
+ if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+ return (error);
+ }
+ }
+
+ /* Have run out of space, but could have just done last table entry */
+ *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 1 : 0;
+ return (0);
+}
+
+static int
+lxsys_readdir_ifaces(lxsys_node_t *ldp, struct uio *uiop, int *eofp,
+ lxsys_nodetype_t type)
+{
+ longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid, uresid;
+ netstack_t *ns;
+ ip_stack_t *ipst;
+ avl_tree_t *phytree;
+ phyint_t *phyi;
+ int error, i;
+
+ /* Emit "." and ".." entries */
+ oresid = uiop->uio_resid;
+ error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0);
+ if (error != 0 || *eofp == 0) {
+ return (error);
+ }
+
+ if ((ns = lxsys_netstack(ldp)) == NULL) {
+ *eofp = 1;
+ return (0);
+ }
+ ipst = ns->netstack_ip;
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index;
+ phyi = avl_first(phytree);
+ if (phyi == NULL) {
+ *eofp = 1;
+ }
+ bzero(bp, sizeof (bp));
+
+ /*
+	 * Skip records already consumed at this offset; the "- 2" below
+	 * accounts for the "." and ".." entries already emitted.
+ */
+ for (i = (uiop->uio_offset/LXSYS_SDSIZE) - 2; i > 0; i--) {
+ if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) {
+ *eofp = 1;
+ break;
+ }
+ }
+
+ while ((uresid = uiop->uio_resid) > 0 && phyi != NULL) {
+ uint_t ifindex;
+ int reclen;
+
+ ifindex = phyi->phyint_ifindex;
+ (void) strncpy(dirent->d_name, phyi->phyint_name, LIFNAMSIZ);
+ lx_ifname_convert(dirent->d_name, LX_IF_FROMNATIVE);
+ dirent->d_ino = lxsys_inode(type, ifindex, 0);
+ reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+ if (reclen > uresid) {
+ if (uresid == oresid) {
+ /* Not enough space for one record */
+ error = EINVAL;
+ }
+ break;
+ }
+ if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+ break;
+ }
+
+ if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) {
+ *eofp = 1;
+ break;
+ }
+ }
+
+ rw_exit(&ipst->ips_ill_g_lock);
+ netstack_rele(ns);
+ return (error);
+}
+
+static int
+lxsys_readdir_disks(lxsys_node_t *ldp, struct uio *uiop, int *eofp,
+ lxsys_nodetype_t type)
+{
+ longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid, uresid;
+ int skip, error;
+ int reclen;
+ uint_t instance;
+ lx_zone_data_t *lxzdata;
+ lx_virt_disk_t *vd;
+
+ /* Emit "." and ".." entries */
+ oresid = uiop->uio_resid;
+ error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0);
+ if (error != 0 || *eofp == 0) {
+ return (error);
+ }
+
+ skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2;
+
+ lxzdata = ztolxzd(curproc->p_zone);
+ if (lxzdata == NULL)
+ return (EINVAL);
+ ASSERT(lxzdata->lxzd_vdisks != NULL);
+
+ vd = list_head(lxzdata->lxzd_vdisks);
+ while (vd != NULL) {
+ if (skip > 0) {
+ skip--;
+ goto next;
+ }
+
+ if (strnlen(vd->lxvd_name, sizeof (vd->lxvd_name)) > LXSNSIZ)
+ goto next;
+
+ (void) strncpy(dirent->d_name, vd->lxvd_name, LXSNSIZ);
+
+ instance = getminor(vd->lxvd_emul_dev) & 0xffff;
+ if (instance == 0)
+ goto next;
+
+ dirent->d_ino = lxsys_inode(type, instance, 0);
+ reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+ uresid = uiop->uio_resid;
+ if (reclen > uresid) {
+ if (uresid == oresid) {
+ /* Not enough space for one record */
+ error = EINVAL;
+ }
+ break;
+ }
+ if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+ break;
+ }
+
+next:
+ vd = list_next(lxzdata->lxzd_vdisks, vd);
+ }
+
+ /* Indicate EOF if we reached the end of the virtual disks. */
+ if (vd == NULL) {
+ *eofp = 1;
+ }
+
+ return (error);
+}
+
+static int
+lxsys_readdir_static(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+ lxsys_dirent_t *dirent = NULL;
+ int i, len = 0;
+ boolean_t found = B_FALSE;
+
+ for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) {
+ if (lnp->lxsys_instance == lxsys_dirlookup[i].dl_instance) {
+ dirent = lxsys_dirlookup[i].dl_list;
+ len = lxsys_dirlookup[i].dl_length;
+ found = B_TRUE;
+ break;
+ }
+ }
+
+ if (!found) {
+ return (ENOTDIR);
+ }
+
+ return (lxsys_readdir_common(lnp, uiop, eofp, dirent, len));
+}
+
+static int
+lxsys_readdir_class_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+ if (lnp->lxsys_type != LXSYS_CLASS_NET ||
+ lnp->lxsys_instance != 0) {
+ /*
+ * Since /sys/class/net contains only symlinks, readdir
+ * operations should not be performed anywhere except the top
+ * level (instance == 0).
+ */
+ return (ENOTDIR);
+ }
+
+ return (lxsys_readdir_ifaces(lnp, uiop, eofp, LXSYS_CLASS_NET));
+}
+
+static int
+lxsys_readdir_devices_virtual_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+ int error;
+
+ if (lnp->lxsys_instance == 0) {
+ /* top-level interface listing */
+ error = lxsys_readdir_ifaces(lnp, uiop, eofp,
+ LXSYS_DEV_NET);
+ } else if (lnp->lxsys_endpoint == 0) {
+ /* interface-level sub-item listing */
+ error = lxsys_readdir_subdir(lnp, uiop, eofp,
+ dirlist_devices_virtual_net,
+ SYSDIRLISTSZ(dirlist_devices_virtual_net));
+ } else {
+ /* there shouldn't be subdirs below this */
+ error = ENOTDIR;
+ }
+
+ return (error);
+}
+
+static int
+lxsys_readdir_blockdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+ if (lnp->lxsys_type != LXSYS_BLOCK ||
+ lnp->lxsys_instance != 0) {
+ /*
+ * Since /sys/block contains only symlinks, readdir operations
+ * should not be performed anywhere except the top level
+ * (instance == 0).
+ */
+ return (ENOTDIR);
+ }
+
+ return (lxsys_readdir_disks(lnp, uiop, eofp, LXSYS_BLOCK));
+}
+
+static int
+lxsys_readdir_devices_zfsdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+ int error;
+
+ if (lnp->lxsys_instance == 0) {
+ /* top-level dev listing */
+ error = lxsys_readdir_disks(lnp, uiop, eofp,
+ LXSYS_DEV_ZFS);
+ } else if (lnp->lxsys_endpoint == 0) {
+ /* disk-level sub-item listing */
+ error = lxsys_readdir_subdir(lnp, uiop, eofp,
+ dirlist_devices_zfs_block,
+ SYSDIRLISTSZ(dirlist_devices_zfs_block));
+ } else {
+ /*
+		 * Currently there shouldn't be subdirs below this, but on a
+		 * real Linux system some of these entries are subdirs. This
+		 * should be fixed when we populate the directory for real.
+ */
+ error = ENOTDIR;
+ }
+
+ return (error);
+}
+
+/* Handle fixed entries within the cpu directory. */
+static int
+lxsys_do_sub_cpu(struct uio *uiop, ssize_t oresid, dirent64_t *dirent,
+ char *nm, int inst, int *errp)
+{
+ int reclen;
+ ssize_t uresid;
+
+ (void) strncpy(dirent->d_name, nm, LXSNSIZ);
+
+ dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_CPU, inst, 0);
+ reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+ uresid = uiop->uio_resid;
+ if (reclen > uresid) {
+ if (uresid == oresid) {
+ /* Not enough space for one record */
+ *errp = EINVAL;
+ }
+ return (-1);
+ }
+ if ((*errp = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+ return (-1);
+ }
+
+ return (0);
+}
+
+static int
+lxsys_readdir_cpu(lxsys_node_t *ldp, struct uio *uiop, int *eofp)
+{
+ longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid, uresid;
+ int skip, error;
+ int reclen;
+ cpuset_t *avail;
+ uint_t i, avlo, avhi;
+
+ /* Emit "." and ".." entries */
+ oresid = uiop->uio_resid;
+ error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0);
+ if (error != 0 || *eofp == 0) {
+ return (error);
+ }
+
+ skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2;
+
+ /* Fixed entries */
+ if (skip > 0) {
+ skip--;
+ } else {
+ if (lxsys_do_sub_cpu(uiop, oresid, dirent, "kernel_max",
+ LXSYS_INST_DEV_SYSCPU_KMAX, &error) != 0)
+ goto done;
+
+ if (lxsys_do_sub_cpu(uiop, oresid, dirent, "offline",
+ LXSYS_INST_DEV_SYSCPU_OFFLINE, &error) != 0)
+ goto done;
+
+ if (lxsys_do_sub_cpu(uiop, oresid, dirent, "online",
+ LXSYS_INST_DEV_SYSCPU_ONLINE, &error) != 0)
+ goto done;
+
+ if (lxsys_do_sub_cpu(uiop, oresid, dirent, "possible",
+ LXSYS_INST_DEV_SYSCPU_POSSIBLE, &error) != 0)
+ goto done;
+
+ if (lxsys_do_sub_cpu(uiop, oresid, dirent, "present",
+ LXSYS_INST_DEV_SYSCPU_PRESENT, &error) != 0)
+ goto done;
+ }
+
+ avail = cpuset_alloc(KM_SLEEP);
+ cpuset_all(avail);
+
+ /* Take a snapshot of the available set */
+ mutex_enter(&cpu_lock);
+ cpuset_and(avail, &cpu_available);
+ mutex_exit(&cpu_lock);
+
+ cpuset_bounds(avail, &avlo, &avhi);
+
+ /* Output dynamic CPU info */
+ for (i = avlo; i <= avhi; i++) {
+ char cpunm[16];
+
+ if (skip > 0) {
+ skip--;
+ continue;
+ }
+
+ if (!cpu_in_set(avail, i))
+ continue;
+
+ (void) snprintf(cpunm, sizeof (cpunm), "cpu%u", i);
+ (void) strncpy(dirent->d_name, cpunm, LXSNSIZ);
+
+ dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_CPUINFO, i + 1, 0);
+ reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+ uresid = uiop->uio_resid;
+ if (reclen > uresid) {
+ if (uresid == oresid) {
+ /* Not enough space for one record */
+ error = EINVAL;
+ }
+ break;
+ }
+ if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+ break;
+ }
+ }
+ cpuset_free(avail);
+
+	/* Indicate EOF once we have walked past the end of the CPU list. */
+	if (i > avhi) {
+ *eofp = 1;
+ }
+
+done:
+ return (error);
+}
+
+static int
+lxsys_readdir_devices_syscpu(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+ int error;
+
+ if (lnp->lxsys_instance == 0) {
+ /* top-level cpu listing */
+ error = lxsys_readdir_cpu(lnp, uiop, eofp);
+ } else if (lnp->lxsys_endpoint == 0) {
+ /* cpu-level sub-item listing */
+ error = lxsys_readdir_subdir(lnp, uiop, eofp, NULL, 0);
+ } else {
+ /*
+		 * Currently there shouldn't be subdirs below this, but on a
+		 * real Linux system some of these entries are subdirs. This
+		 * should be fixed when we populate the directory for real.
+ */
+ error = ENOTDIR;
+ }
+
+ return (error);
+}
+
+static int
+lxsys_readdir_devices_syscpuinfo(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+ int error;
+
+ if (lnp->lxsys_type != LXSYS_DEV_SYS_CPUINFO) {
+ /*
+ * Since /sys/devices/system/cpu/cpuN is empty, readdir
+ * operations should not be performed anywhere except the top
+ * level.
+ */
+ return (ENOTDIR);
+ }
+
+ /*
+ * Emit "." and ".." entries
+ * All cpuN directories are currently empty.
+ */
+ error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0);
+ if (error != 0 || *eofp == 0) {
+ return (error);
+ }
+
+ /* Indicate EOF */
+ *eofp = 1;
+
+ return (error);
+}
+
+static int
+lxsys_readdir_devices_sysnode(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+ int error;
+
+ if (lnp->lxsys_instance == 0) {
+ /* top-level node listing */
+ longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid, uresid;
+ int reclen, skip;
+
+ /* Emit "." and ".." entries */
+ oresid = uiop->uio_resid;
+ error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0);
+ if (error != 0 || *eofp == 0) {
+ return (error);
+ }
+ skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2;
+
+ /* Fixed entries */
+ if (skip > 0) {
+ skip--;
+ } else {
+ (void) strncpy(dirent->d_name, "node0", LXSNSIZ);
+
+ dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_NODE,
+ 1, 0);
+ reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+ uresid = uiop->uio_resid;
+ if (reclen > uresid) {
+ if (uresid == oresid) {
+ /* Not enough space for one record */
+ return (EINVAL);
+ }
+ return (0);
+ }
+ error = lxsys_dirent_out(dirent, reclen, uiop);
+ }
+ /* Indicate EOF */
+ if (error == 0) {
+ *eofp = 1;
+ }
+ } else if (lnp->lxsys_endpoint == 0) {
+ /* node-level sub-item listing */
+ error = lxsys_readdir_subdir(lnp, uiop, eofp,
+ dirlist_devices_sysnode,
+ SYSDIRLISTSZ(dirlist_devices_sysnode));
+ } else {
+ /* there shouldn't be subdirs below this */
+ error = ENOTDIR;
+ }
+
+ return (error);
+}
+
+/*
+ * lxsys_readlink(): Vnode operation for VOP_READLINK()
+ */
+/* ARGSUSED */
+static int
+lxsys_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+ char buf[MAXPATHLEN + 1];
+ lxsys_node_t *lnp = VTOLXS(vp);
+ lxsys_nodetype_t type = lnp->lxsys_type;
+ int (*rlfunc)();
+ int error;
+
+ VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE);
+
+ if (vp->v_type != VLNK) {
+ return (EINVAL);
+ }
+
+ rlfunc = lxsys_readlink_function[lnp->lxsys_type];
+ if (rlfunc != NULL) {
+ if ((error = rlfunc(lnp, buf, sizeof (buf))) == 0) {
+ error = uiomove(buf, strlen(buf), UIO_READ, uiop);
+ }
+ } else {
+ error = EINVAL;
+ }
+
+ return (error);
+}
+
+static int
+lxsys_readlink_class_net(lxsys_node_t *lnp, char *buf, size_t len)
+{
+ netstack_t *ns;
+ ip_stack_t *ipst;
+ avl_tree_t *phytree;
+ phyint_t *phyi;
+ uint_t ifindex;
+ char ifname[LIFNAMSIZ];
+ int error = EINVAL;
+
+ if ((ifindex = lnp->lxsys_instance) == 0) {
+ return (error);
+ }
+
+ if ((ns = lxsys_netstack(lnp)) == NULL) {
+ return (error);
+ }
+ ipst = ns->netstack_ip;
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
+ phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index;
+ phyi = avl_find(phytree, &ifindex, NULL);
+ if (phyi != NULL) {
+ (void) strncpy(ifname, phyi->phyint_name, LIFNAMSIZ);
+ lx_ifname_convert(ifname, LX_IF_FROMNATIVE);
+ (void) snprintf(buf, len, "/sys/devices/virtual/net/%s",
+ ifname);
+ error = 0;
+ }
+
+ rw_exit(&ipst->ips_ill_g_lock);
+ netstack_rele(ns);
+ return (error);
+}
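+
+/*
+ * Editorial aside -- what a Linux guest sees for the symlink above (a
+ * user-space sketch; the interface name is hypothetical):
+ *
+ *	char tgt[MAXPATHLEN];
+ *	ssize_t n;
+ *
+ *	n = readlink("/sys/class/net/eth0", tgt, sizeof (tgt) - 1);
+ *	if (n >= 0)
+ *		tgt[n] = '\0';	(readlink(2) does not NUL-terminate)
+ *
+ * tgt is then "/sys/devices/virtual/net/eth0", per the snprintf above.
+ */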
+
+static int
+lxsys_readlink_block(lxsys_node_t *lnp, char *buf, size_t len)
+{
+ int inst, error = EINVAL;
+ lx_zone_data_t *lxzdata;
+ lx_virt_disk_t *vd;
+
+ if ((inst = lnp->lxsys_instance) == 0) {
+ return (error);
+ }
+
+ lxzdata = ztolxzd(curproc->p_zone);
+ if (lxzdata == NULL)
+ return (error);
+ ASSERT(lxzdata->lxzd_vdisks != NULL);
+
+ vd = list_head(lxzdata->lxzd_vdisks);
+ while (vd != NULL) {
+ int vinst = getminor(vd->lxvd_emul_dev) & 0xffff;
+
+ if (vinst == inst) {
+ (void) snprintf(buf, len,
+ "../devices/zfs/%s", vd->lxvd_name);
+ error = 0;
+ break;
+ }
+ vd = list_next(lxzdata->lxzd_vdisks, vd);
+ }
+
+ return (error);
+}
+
+/*
+ * lxsys_inactive(): Vnode operation for VOP_INACTIVE()
+ * Vnode is no longer referenced, deallocate the file
+ * and all its resources.
+ */
+/* ARGSUSED */
+static void
+lxsys_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ lxsys_freenode(VTOLXS(vp));
+}
+
+/*
+ * lxsys_sync(): Vnode operation for VOP_SYNC()
+ */
+static int
+lxsys_sync()
+{
+ /*
+ * Nothing to sync but this function must never fail
+ */
+ return (0);
+}
+
+/*
+ * lxsys_cmp(): Vnode operation for VOP_CMP()
+ */
+static int
+lxsys_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+ if (vn_matchops(vp1, lxsys_vnodeops) ||
+ vn_matchops(vp2, lxsys_vnodeops))
+ return (vp1 == vp2);
+ return (VOP_CMP(vp1, vp2, ct));
+}
diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c
index d61928d578..ebdabce2b5 100644
--- a/usr/src/uts/common/brand/sn1/sn1_brand.c
+++ b/usr/src/uts/common/brand/sn1/sn1_brand.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/errno.h>
@@ -42,43 +43,69 @@
char *sn1_emulation_table = NULL;
-void sn1_init_brand_data(zone_t *);
+void sn1_init_brand_data(zone_t *, kmutex_t *);
void sn1_free_brand_data(zone_t *);
void sn1_setbrand(proc_t *);
int sn1_getattr(zone_t *, int, void *, size_t *);
int sn1_setattr(zone_t *, int, void *, size_t);
int sn1_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
- uintptr_t, uintptr_t, uintptr_t);
+ uintptr_t);
void sn1_copy_procdata(proc_t *, proc_t *);
-void sn1_proc_exit(struct proc *, klwp_t *);
+void sn1_proc_exit(struct proc *);
void sn1_exec();
-int sn1_initlwp(klwp_t *);
+void sn1_initlwp(klwp_t *, void *);
void sn1_forklwp(klwp_t *, klwp_t *);
void sn1_freelwp(klwp_t *);
void sn1_lwpexit(klwp_t *);
int sn1_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
- long *, int, caddr_t, cred_t *, int);
+ size_t *, int, caddr_t, cred_t *, int *);
/* sn1 brand */
struct brand_ops sn1_brops = {
- sn1_init_brand_data,
- sn1_free_brand_data,
- sn1_brandsys,
- sn1_setbrand,
- sn1_getattr,
- sn1_setattr,
- sn1_copy_procdata,
- sn1_proc_exit,
- sn1_exec,
- lwp_setrval,
- sn1_initlwp,
- sn1_forklwp,
- sn1_freelwp,
- sn1_lwpexit,
- sn1_elfexec,
- NULL,
- NULL,
- NSIG,
+ sn1_init_brand_data, /* b_init_brand_data */
+ sn1_free_brand_data, /* b_free_brand_data */
+ sn1_brandsys, /* b_brandsys */
+ sn1_setbrand, /* b_setbrand */
+ sn1_getattr, /* b_getattr */
+ sn1_setattr, /* b_setattr */
+ sn1_copy_procdata, /* b_copy_procdata */
+ sn1_proc_exit, /* b_proc_exit */
+ sn1_exec, /* b_exec */
+ lwp_setrval, /* b_lwp_setrval */
+ NULL, /* b_lwpdata_alloc */
+ NULL, /* b_lwpdata_free */
+ sn1_initlwp, /* b_initlwp */
+ NULL, /* b_initlwp_post */
+ sn1_forklwp, /* b_forklwp */
+ sn1_freelwp, /* b_freelwp */
+ sn1_lwpexit, /* b_lwpexit */
+ sn1_elfexec, /* b_elfexec */
+ NULL, /* b_sigset_native_to_brand */
+ NULL, /* b_sigset_brand_to_native */
+ NULL, /* b_sigfd_translate */
+ NSIG, /* b_nsig */
+ NULL, /* b_exit_with_sig */
+ NULL, /* b_wait_filter */
+ NULL, /* b_native_exec */
+ NULL, /* b_map32limit */
+ NULL, /* b_stop_notify */
+ NULL, /* b_waitid_helper */
+ NULL, /* b_sigcld_repost */
+ NULL, /* b_issig_stop */
+ NULL, /* b_sig_ignorable */
+ NULL, /* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+ NULL, /* b_savecontext32 */
+#endif
+ NULL, /* b_restorecontext */
+ NULL, /* b_sendsig_stack */
+ NULL, /* b_sendsig */
+ NULL, /* b_setid_clear */
+ NULL, /* b_pagefault */
+ B_TRUE, /* b_intp_parse_arg */
+ NULL, /* b_clearbrand */
+ NULL, /* b_rpc_statd */
+ NULL /* b_acct_out */
};
#ifdef sparc
@@ -94,9 +121,12 @@ struct brand_mach_ops sn1_mops = {
struct brand_mach_ops sn1_mops = {
sn1_brand_sysenter_callback,
+ NULL,
sn1_brand_int91_callback,
sn1_brand_syscall_callback,
- sn1_brand_syscall32_callback
+ sn1_brand_syscall32_callback,
+ NULL,
+ NULL
};
#else /* ! __amd64 */
@@ -104,7 +134,10 @@ struct brand_mach_ops sn1_mops = {
struct brand_mach_ops sn1_mops = {
sn1_brand_sysenter_callback,
NULL,
+ NULL,
sn1_brand_syscall_callback,
+ NULL,
+ NULL,
NULL
};
#endif /* __amd64 */
@@ -115,7 +148,8 @@ struct brand sn1_brand = {
BRAND_VER_1,
"sn1",
&sn1_brops,
- &sn1_mops
+ &sn1_mops,
+ sizeof (brand_proc_data_t),
};
static struct modlbrand modlbrand = {
@@ -148,10 +182,10 @@ sn1_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
return (EINVAL);
}
-/*ARGSUSED*/
+/* ARGSUSED5 */
int
sn1_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
- uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
+ uintptr_t arg3, uintptr_t arg4)
{
int res;
@@ -171,9 +205,9 @@ sn1_copy_procdata(proc_t *child, proc_t *parent)
}
void
-sn1_proc_exit(struct proc *p, klwp_t *l)
+sn1_proc_exit(struct proc *p)
{
- brand_solaris_proc_exit(p, l, &sn1_brand);
+ brand_solaris_proc_exit(p, &sn1_brand);
}
void
@@ -182,10 +216,10 @@ sn1_exec()
brand_solaris_exec(&sn1_brand);
}
-int
-sn1_initlwp(klwp_t *l)
+void
+sn1_initlwp(klwp_t *l, void *bd)
{
- return (brand_solaris_initlwp(l, &sn1_brand));
+ brand_solaris_initlwp(l, &sn1_brand);
}
void
@@ -214,18 +248,18 @@ sn1_free_brand_data(zone_t *zone)
/*ARGSUSED*/
void
-sn1_init_brand_data(zone_t *zone)
+sn1_init_brand_data(zone_t *zone, kmutex_t *zsl)
{
}
int
sn1_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
- int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
- int brand_action)
+ int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
+ int *brand_action)
{
return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz,
setid, exec_file, cred, brand_action, &sn1_brand, SN1_BRANDNAME,
- SN1_LIB, SN1_LIB32, SN1_LINKER, SN1_LINKER32));
+ SN1_LIB, SN1_LIB32));
}
int
diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.h b/usr/src/uts/common/brand/sn1/sn1_brand.h
index b487745e21..fef9dc128b 100644
--- a/usr/src/uts/common/brand/sn1/sn1_brand.h
+++ b/usr/src/uts/common/brand/sn1/sn1_brand.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#ifndef _SN1_BRAND_H
@@ -37,20 +38,14 @@ extern "C" {
#define SN1_VERSION SN1_VERSION_1
#define SN1_LIB_NAME "sn1_brand.so.1"
-#define SN1_LINKER_NAME "ld.so.1"
#define SN1_LIB32 BRAND_NATIVE_DIR "usr/lib/" SN1_LIB_NAME
-#define SN1_LINKER32 "/lib/" SN1_LINKER_NAME
-
#define SN1_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" SN1_LIB_NAME
-#define SN1_LINKER64 "/lib/64/" SN1_LINKER_NAME
#if defined(_LP64)
#define SN1_LIB SN1_LIB64
-#define SN1_LINKER SN1_LINKER64
#else /* !_LP64 */
#define SN1_LIB SN1_LIB32
-#define SN1_LINKER SN1_LINKER32
#endif /* !_LP64 */
#if defined(_KERNEL)
diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c
index 0841f02e51..4de7cbcc05 100644
--- a/usr/src/uts/common/brand/solaris10/s10_brand.c
+++ b/usr/src/uts/common/brand/solaris10/s10_brand.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2018, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/errno.h>
@@ -46,45 +46,71 @@
char *s10_emulation_table = NULL;
-void s10_init_brand_data(zone_t *);
+void s10_init_brand_data(zone_t *, kmutex_t *);
void s10_free_brand_data(zone_t *);
void s10_setbrand(proc_t *);
int s10_getattr(zone_t *, int, void *, size_t *);
int s10_setattr(zone_t *, int, void *, size_t);
int s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
- uintptr_t, uintptr_t, uintptr_t);
+ uintptr_t);
void s10_copy_procdata(proc_t *, proc_t *);
-void s10_proc_exit(struct proc *, klwp_t *);
+void s10_proc_exit(struct proc *);
void s10_exec();
-int s10_initlwp(klwp_t *);
+void s10_initlwp(klwp_t *, void *);
void s10_forklwp(klwp_t *, klwp_t *);
void s10_freelwp(klwp_t *);
void s10_lwpexit(klwp_t *);
int s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
- long *, int, caddr_t, cred_t *, int);
+ size_t *, int, caddr_t, cred_t *, int *);
void s10_sigset_native_to_s10(sigset_t *);
void s10_sigset_s10_to_native(sigset_t *);
/* s10 brand */
struct brand_ops s10_brops = {
- s10_init_brand_data,
- s10_free_brand_data,
- s10_brandsys,
- s10_setbrand,
- s10_getattr,
- s10_setattr,
- s10_copy_procdata,
- s10_proc_exit,
- s10_exec,
- lwp_setrval,
- s10_initlwp,
- s10_forklwp,
- s10_freelwp,
- s10_lwpexit,
- s10_elfexec,
- s10_sigset_native_to_s10,
- s10_sigset_s10_to_native,
- S10_NSIG,
+ s10_init_brand_data, /* b_init_brand_data */
+ s10_free_brand_data, /* b_free_brand_data */
+ s10_brandsys, /* b_brandsys */
+ s10_setbrand, /* b_setbrand */
+ s10_getattr, /* b_getattr */
+ s10_setattr, /* b_setattr */
+ s10_copy_procdata, /* b_copy_procdata */
+ s10_proc_exit, /* b_proc_exit */
+ s10_exec, /* b_exec */
+ lwp_setrval, /* b_lwp_setrval */
+ NULL, /* b_lwpdata_alloc */
+ NULL, /* b_lwpdata_free */
+ s10_initlwp, /* b_initlwp */
+ NULL, /* b_initlwp_post */
+ s10_forklwp, /* b_forklwp */
+ s10_freelwp, /* b_freelwp */
+ s10_lwpexit, /* b_lwpexit */
+ s10_elfexec, /* b_elfexec */
+ s10_sigset_native_to_s10, /* b_sigset_native_to_brand */
+ s10_sigset_s10_to_native, /* b_sigset_brand_to_native */
+ NULL, /* b_sigfd_translate */
+ S10_NSIG, /* b_nsig */
+ NULL, /* b_exit_with_sig */
+ NULL, /* b_wait_filter */
+ NULL, /* b_native_exec */
+ NULL, /* b_map32limit */
+ NULL, /* b_stop_notify */
+ NULL, /* b_waitid_helper */
+ NULL, /* b_sigcld_repost */
+ NULL, /* b_issig_stop */
+ NULL, /* b_sig_ignorable */
+ NULL, /* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+ NULL, /* b_savecontext32 */
+#endif
+ NULL, /* b_restorecontext */
+ NULL, /* b_sendsig_stack */
+ NULL, /* b_sendsig */
+ NULL, /* b_setid_clear */
+ NULL, /* b_pagefault */
+ B_TRUE, /* b_intp_parse_arg */
+ NULL, /* b_clearbrand */
+ NULL, /* b_rpc_statd */
+ NULL /* b_acct_out */
};
#ifdef sparc
@@ -100,9 +126,12 @@ struct brand_mach_ops s10_mops = {
struct brand_mach_ops s10_mops = {
s10_brand_sysenter_callback,
+ NULL,
s10_brand_int91_callback,
s10_brand_syscall_callback,
- s10_brand_syscall32_callback
+ s10_brand_syscall32_callback,
+ NULL,
+ NULL
};
#else /* ! __amd64 */
@@ -110,7 +139,10 @@ struct brand_mach_ops s10_mops = {
struct brand_mach_ops s10_mops = {
s10_brand_sysenter_callback,
NULL,
+ NULL,
s10_brand_syscall_callback,
+ NULL,
+ NULL,
NULL
};
#endif /* __amd64 */
@@ -121,7 +153,8 @@ struct brand s10_brand = {
BRAND_VER_1,
"solaris10",
&s10_brops,
- &s10_mops
+ &s10_mops,
+ sizeof (brand_proc_data_t),
};
static struct modlbrand modlbrand = {
@@ -250,10 +283,10 @@ s10_native(void *cmd, void *args)
return (0);
}
-/*ARGSUSED*/
+/* ARGSUSED5 */
int
s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
- uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
+ uintptr_t arg3, uintptr_t arg4)
{
proc_t *p = curproc;
int res;
@@ -327,9 +360,9 @@ s10_copy_procdata(proc_t *child, proc_t *parent)
}
void
-s10_proc_exit(struct proc *p, klwp_t *l)
+s10_proc_exit(struct proc *p)
{
- brand_solaris_proc_exit(p, l, &s10_brand);
+ brand_solaris_proc_exit(p, &s10_brand);
}
void
@@ -338,10 +371,10 @@ s10_exec()
brand_solaris_exec(&s10_brand);
}
-int
-s10_initlwp(klwp_t *l)
+void
+s10_initlwp(klwp_t *l, void *bd)
{
- return (brand_solaris_initlwp(l, &s10_brand));
+ brand_solaris_initlwp(l, &s10_brand);
}
void
@@ -381,7 +414,7 @@ s10_free_brand_data(zone_t *zone)
}
void
-s10_init_brand_data(zone_t *zone)
+s10_init_brand_data(zone_t *zone, kmutex_t *zsl)
{
ASSERT(zone->zone_brand == &s10_brand);
ASSERT(zone->zone_brand_data == NULL);
@@ -390,12 +423,12 @@ s10_init_brand_data(zone_t *zone)
int
s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
- int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
- int brand_action)
+ int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
+ int *brand_action)
{
return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz,
setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME,
- S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32));
+ S10_LIB, S10_LIB32));
}
void
diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.h b/usr/src/uts/common/brand/solaris10/s10_brand.h
index 11f9853f48..ffef485e12 100644
--- a/usr/src/uts/common/brand/solaris10/s10_brand.h
+++ b/usr/src/uts/common/brand/solaris10/s10_brand.h
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#ifndef _S10_BRAND_H
@@ -42,17 +43,12 @@ extern "C" {
#define S10_LINKER_NAME "ld.so.1"
#define S10_LIB32 BRAND_NATIVE_DIR "usr/lib/" S10_LIB_NAME
-#define S10_LINKER32 "/lib/" S10_LINKER_NAME
-
#define S10_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" S10_LIB_NAME
-#define S10_LINKER64 "/lib/64/" S10_LINKER_NAME
#if defined(_LP64)
#define S10_LIB S10_LIB64
-#define S10_LINKER S10_LINKER64
#else /* !_LP64 */
#define S10_LIB S10_LIB32
-#define S10_LINKER S10_LINKER32
#endif /* !_LP64 */
/*
diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c
index 64227a3998..1120748b98 100644
--- a/usr/src/uts/common/conf/param.c
+++ b/usr/src/uts/common/conf/param.c
@@ -22,6 +22,7 @@
/*
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
* Copyright 2012 Milan Jurik. All rights reserved.
*/
@@ -559,8 +560,8 @@ char *isa_list = architecture;
static pgcnt_t original_physmem = 0;
#define MIN_DEFAULT_MAXUSERS 8u
-#define MAX_DEFAULT_MAXUSERS 2048u
-#define MAX_MAXUSERS 4096u
+#define MAX_DEFAULT_MAXUSERS 10000u
+#define MAX_MAXUSERS 20000u
void
param_preset(void)
@@ -572,7 +573,7 @@ void
param_calc(int platform_max_nprocs)
{
/*
- * Default to about one "user" per megabyte, taking into
+ * Default to about one "user" per 8MB, taking into
* account both physical and virtual constraints.
* Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT)
* converts pages to megs without integer overflow.
@@ -586,8 +587,9 @@ param_calc(int platform_max_nprocs)
if (maxusers == 0) {
pgcnt_t physmegs = physmem >> (20 - PAGESHIFT);
pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20;
- maxusers = MIN(MAX(MIN(physmegs, virtmegs),
- MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS);
+ maxusers = MIN(physmegs, virtmegs) >> 3; /* divide by 8 */
+ maxusers = MAX(maxusers, MIN_DEFAULT_MAXUSERS);
+ maxusers = MIN(maxusers, MAX_DEFAULT_MAXUSERS);
}
if (maxusers > MAX_MAXUSERS) {
maxusers = MAX_MAXUSERS;
@@ -604,15 +606,26 @@ param_calc(int platform_max_nprocs)
/*
* We need to dynamically change any variables now so that
- * the setting of maxusers and pidmax propagate to the other
+ * the setting of maxusers and maxpid propagate to the other
* variables that are dependent on them.
*/
if (reserved_procs == 0)
reserved_procs = 5;
- if (pidmax < reserved_procs || pidmax > MAX_MAXPID)
+ if (pidmax < reserved_procs || pidmax > MAX_MAXPID) {
maxpid = MAX_MAXPID;
- else
+ } else {
+ /*
+	 * If pidmax has not been explicitly set in /etc/system, then
+ * increase it to the maximum on larger machines. We choose a
+ * 128GB memory size as the threshold to increase pidmax.
+ */
+ if (pidmax == DEFAULT_MAXPID) {
+ if (physmem > (btop(128ULL * 0x40000000ULL))) {
+ pidmax = MAX_MAXPID;
+ }
+ }
maxpid = pidmax;
+ }
/*
* This allows platform-dependent code to constrain the maximum
diff --git a/usr/src/uts/common/contract/process.c b/usr/src/uts/common/contract/process.c
index 9fd23fdb61..e46cbd3abf 100644
--- a/usr/src/uts/common/contract/process.c
+++ b/usr/src/uts/common/contract/process.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/mutex.h>
@@ -955,6 +956,18 @@ contract_process_exit(cont_process_t *ctp, proc_t *p, int exitstatus)
(void) cte_publish_all(ct, event, nvl, NULL);
mutex_enter(&ct->ct_lock);
}
+
+ /*
+ * CT_PR_EV_EXIT is not part of the CT_PR_ALLFATAL definition since
+ * we never allow including this in the fatal set via a user-land
+ * application, but we do allow CT_PR_EV_EXIT in the contract's fatal
+	 * set for a process set up for zone init. See zone_start_init().
+ */
+ if (EVFATALP(ctp, CT_PR_EV_EXIT)) {
+ ASSERT(MUTEX_HELD(&ct->ct_lock));
+ contract_process_kill(ct, p, B_TRUE);
+ }
+
if (empty) {
/*
* Send EMPTY message.
@@ -1057,6 +1070,17 @@ contract_process_fork(ctmpl_process_t *rtmpl, proc_t *cp, proc_t *pp,
event->cte_type = CT_PR_EV_FORK;
(void) cte_publish_all(ct, event, nvl, NULL);
}
+
+ /*
+ * Because the CT_PR_KEEP_EXEC flag is meant to be used by applications
+ * which are not contract aware, we can assume that these applications
+ * will never explicitly abandon the child's new contract. Thus, we
+ * abandon it now.
+ */
+ if (ctp->conp_params & CT_PR_KEEP_EXEC) {
+ (void) contract_abandon(ct, pp, 1);
+ }
+
return (ctp);
}
diff --git a/usr/src/uts/common/crypto/api/kcf_random.c b/usr/src/uts/common/crypto/api/kcf_random.c
index 64f9e4e68d..2a51830e6e 100644
--- a/usr/src/uts/common/crypto/api/kcf_random.c
+++ b/usr/src/uts/common/crypto/api/kcf_random.c
@@ -70,6 +70,7 @@
#include <sys/cpuvar.h>
#include <sys/taskq.h>
#include <rng/fips_random.h>
+#include <sys/strlog.h>
#define RNDPOOLSIZE 1024 /* Pool size in bytes */
#define MINEXTRACTBYTES 20
@@ -933,7 +934,8 @@ rnd_handler(void *arg)
int len = 0;
if (!rng_prov_found && rng_ok_to_log) {
- cmn_err(CE_WARN, "No randomness provider enabled for "
+ (void) strlog(0, 0, 0, SL_NOTE,
+ "No randomness provider enabled for "
"/dev/random. Use cryptoadm(1M) to enable a provider.");
rng_ok_to_log = B_FALSE;
}
diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c
index 9e079a079e..ec9df915c5 100644
--- a/usr/src/uts/common/crypto/core/kcf_sched.c
+++ b/usr/src/uts/common/crypto/core/kcf_sched.c
@@ -1027,9 +1027,9 @@ kcfpool_svc(void *arg)
case 0:
case -1:
/*
- * Woke up with no work to do. Check
- * if this thread should exit. We keep
- * at least kcf_minthreads.
+ * Woke up with no work to do. Check if we
+ * should lwp_exit() (which won't return). We
+ * keep at least kcf_minthreads.
*/
if (kcfpool->kp_threads > kcf_minthreads) {
KCF_ATOMIC_DECR(kcfpool->kp_threads);
diff --git a/usr/src/uts/common/crypto/io/dprov.c b/usr/src/uts/common/crypto/io/dprov.c
index 5b4e23dca9..806bbef280 100644
--- a/usr/src/uts/common/crypto/io/dprov.c
+++ b/usr/src/uts/common/crypto/io/dprov.c
@@ -221,6 +221,8 @@ typedef enum dprov_mech_type {
SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */
SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */
SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */
+ SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */
+ SHA512_256_MECH_INFO_TYPE, /* SUN_CKM_SHA512_256 */
DES_CBC_MECH_INFO_TYPE, /* SUN_CKM_DES_CBC */
DES3_CBC_MECH_INFO_TYPE, /* SUN_CKM_DES3_CBC */
@@ -430,6 +432,14 @@ static crypto_mech_info_t dprov_mech_info_tab[] = {
CRYPTO_FG_ENCRYPT_MAC_ATOMIC | CRYPTO_FG_MAC_DECRYPT_ATOMIC,
SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA512_224 */
+ {SUN_CKM_SHA512_224, SHA512_224_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, 0, 0,
+ CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA512_256 */
+ {SUN_CKM_SHA512_256, SHA512_256_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, 0, 0,
+ CRYPTO_KEYSIZE_UNIT_IN_BITS},
/* DES-CBC */
{SUN_CKM_DES_CBC, DES_CBC_MECH_INFO_TYPE,
CRYPTO_FG_ENCRYPT | CRYPTO_FG_DECRYPT | CRYPTO_FG_ENCRYPT_MAC |
@@ -1948,7 +1958,9 @@ dprov_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
mechanism->cm_type != SHA1_MECH_INFO_TYPE &&
mechanism->cm_type != SHA256_MECH_INFO_TYPE &&
mechanism->cm_type != SHA384_MECH_INFO_TYPE &&
- mechanism->cm_type != SHA512_MECH_INFO_TYPE) {
+ mechanism->cm_type != SHA512_MECH_INFO_TYPE &&
+ mechanism->cm_type != SHA512_224_MECH_INFO_TYPE &&
+ mechanism->cm_type != SHA512_256_MECH_INFO_TYPE) {
cmn_err(CE_WARN, "dprov_digest_init: unexpected mech type "
"0x%llx\n", (unsigned long long)mechanism->cm_type);
return (CRYPTO_MECHANISM_INVALID);
diff --git a/usr/src/uts/common/crypto/io/sha2_mod.c b/usr/src/uts/common/crypto/io/sha2_mod.c
index 23c73d1909..186c0c3240 100644
--- a/usr/src/uts/common/crypto/io/sha2_mod.c
+++ b/usr/src/uts/common/crypto/io/sha2_mod.c
@@ -128,7 +128,15 @@ static crypto_mech_info_t sha2_mech_info_tab[] = {
{SUN_CKM_SHA512_HMAC_GENERAL, SHA512_HMAC_GEN_MECH_INFO_TYPE,
CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
- CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA512_224 */
+ {SUN_CKM_SHA512_224, SHA512_224_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA512_256 */
+ {SUN_CKM_SHA512_256, SHA512_256_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}
};
static void sha2_provider_status(crypto_provider_handle_t, uint_t *);
@@ -593,6 +601,12 @@ sha2_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest,
case SHA512_MECH_INFO_TYPE:
sha_digest_len = SHA512_DIGEST_LENGTH;
break;
+ case SHA512_224_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_224_DIGEST_LENGTH;
+ break;
+ case SHA512_256_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_256_DIGEST_LENGTH;
+ break;
default:
return (CRYPTO_MECHANISM_INVALID);
}
@@ -722,6 +736,12 @@ sha2_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest,
case SHA512_MECH_INFO_TYPE:
sha_digest_len = SHA512_DIGEST_LENGTH;
break;
+ case SHA512_224_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_224_DIGEST_LENGTH;
+ break;
+ case SHA512_256_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_256_DIGEST_LENGTH;
+ break;
default:
return (CRYPTO_MECHANISM_INVALID);
}
@@ -909,6 +929,19 @@ sha2_mac_init_ctx(sha2_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes)
}
+static boolean_t
+sha2_is_general_hmech(const crypto_mechanism_t *mechanism)
+{
+ switch (mechanism->cm_type) {
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ return (B_TRUE);
+ default:
+ return (B_FALSE);
+ }
+}
+
/*
*/
static int
@@ -979,7 +1012,7 @@ sha2_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
/*
* Get the mechanism parameters, if applicable.
*/
- if (mechanism->cm_type % 3 == 2) {
+ if (sha2_is_general_hmech(mechanism)) {
if (mechanism->cm_param == NULL ||
mechanism->cm_param_len != sizeof (ulong_t))
ret = CRYPTO_MECHANISM_PARAM_INVALID;
@@ -1214,7 +1247,7 @@ sha2_mac_atomic(crypto_provider_handle_t provider,
}
/* get the mechanism parameters, if applicable */
- if ((mechanism->cm_type % 3) == 2) {
+ if (sha2_is_general_hmech(mechanism)) {
if (mechanism->cm_param == NULL ||
mechanism->cm_param_len != sizeof (ulong_t)) {
ret = CRYPTO_MECHANISM_PARAM_INVALID;
@@ -1356,7 +1389,7 @@ sha2_mac_verify_atomic(crypto_provider_handle_t provider,
}
/* get the mechanism parameters, if applicable */
- if (mechanism->cm_type % 3 == 2) {
+ if (sha2_is_general_hmech(mechanism)) {
if (mechanism->cm_param == NULL ||
mechanism->cm_param_len != sizeof (ulong_t)) {
ret = CRYPTO_MECHANISM_PARAM_INVALID;
@@ -1592,17 +1625,32 @@ sha2_free_context(crypto_ctx_t *ctx)
if (ctx->cc_provider_private == NULL)
return (CRYPTO_SUCCESS);
- /*
- * We have to free either SHA2 or SHA2-HMAC contexts, which
- * have different lengths.
- *
- * Note: Below is dependent on the mechanism ordering.
- */
-
- if (PROV_SHA2_CTX(ctx)->sc_mech_type % 3 == 0)
+ switch (PROV_SHA2_CTX(ctx)->sc_mech_type) {
+ case SHA256_MECH_INFO_TYPE:
+ case SHA384_MECH_INFO_TYPE:
+ case SHA512_MECH_INFO_TYPE:
+ case SHA512_224_MECH_INFO_TYPE:
+ case SHA512_256_MECH_INFO_TYPE:
ctx_len = sizeof (sha2_ctx_t);
- else
+ break;
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
ctx_len = sizeof (sha2_hmac_ctx_t);
+ break;
+ default:
+ /*
+ * If we get here, someone forgot to update the above list
+ * when adding a new mechanism. Without the correct ctx_len
+ * we will corrupt the heap when calling kmem_free, so panic
+ * now and make it easier to identify the problem.
+ */
+ panic("Unknown SHA2 mechanism %d",
+ PROV_SHA2_CTX(ctx)->sc_mech_type);
+ }
bzero(ctx->cc_provider_private, ctx_len);
kmem_free(ctx->cc_provider_private, ctx_len);
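SHA-512/224 and SHA-512/256 are SHA-512 computations with distinct initialization vectors and truncated output — 28 and 32 bytes, matching the SHA512_224_DIGEST_LENGTH and SHA512_256_DIGEST_LENGTH values used above. A hedged user-land sketch using libmd's SHA2 entry points (the SHA512_256 init selector is assumed to be exported via <sys/sha2.h> alongside this change; compile with -lmd):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <sys/sha2.h>

int
main(void)
{
	SHA2_CTX ctx;
	uint8_t digest[SHA512_256_DIGEST_LENGTH];	/* 32 bytes */
	const char *msg = "abc";
	int i;

	SHA2Init(SHA512_256, &ctx);	/* selector assumed, see above */
	SHA2Update(&ctx, msg, strlen(msg));
	SHA2Final(digest, &ctx);

	for (i = 0; i < SHA512_256_DIGEST_LENGTH; i++)
		(void) printf("%02x", digest[i]);
	(void) printf("\n");
	return (0);
}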
diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c
index 0196b15dae..80b5340543 100644
--- a/usr/src/uts/common/disp/cmt.c
+++ b/usr/src/uts/common/disp/cmt.c
@@ -201,13 +201,15 @@ pg_cmt_cpu_startup(cpu_t *cp)
/*
* Return non-zero if thread can migrate between "from" and "to"
- * without a performance penalty
+ * without a performance penalty. On virtually any CPU, this is true only
+ * if "from" and "to" share a core; sharing just the last-level cache is
+ * insufficient to make migration possible without penalty.
*/
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
- if (from->cpu_physid->cpu_cacheid ==
- to->cpu_physid->cpu_cacheid)
+ if (from->cpu_physid->cpu_coreid ==
+ to->cpu_physid->cpu_coreid)
return (1);
return (0);
}
diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c
index 46f53faab6..2a4365ff73 100644
--- a/usr/src/uts/common/disp/cpucaps.c
+++ b/usr/src/uts/common/disp/cpucaps.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2013 Joyent, Inc. All rights reserved.
*/
#include <sys/disp.h>
@@ -74,6 +75,32 @@
* Putting threads on wait queues in random places while running in the
* kernel might lead to all kinds of locking problems.
*
+ * Bursting
+ * ========
+ *
+ * CPU bursting occurs when the CPU usage is over the baseline but under the
+ * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant
+ * environment so that we know how much CPU is allocated for a tenant under
+ * normal utilization. We can then track how much time a zone is spending
+ * over the "normal" CPU utilization expected for that zone using the
+ * "above_base_sec" kstat. This kstat is cumulative.
+ *
+ * If the zone has a burst limit (zone.cpu-burst-time) then the zone can
+ * burst for that period of time (in seconds) before the effective cap is
+ * lowered to the baseline. Once the effective cap is lowered, the zone
+ * will run at the baseline for the burst limit before the effective cap is
+ * raised again to the full value. This will allow the zone to burst again.
+ * We can watch this behavior using the kstats. The "effective" kstat shows
+ * which cap is being used, the baseline value or the burst value. The
+ * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the
+ * "bursting_sec" kstat shows how many seconds the zone has currently been
+ * bursting. When the CPU load is continuously greater than the baseline,
+ * bursting_sec will increase, up to the burst_limit_sec value, then the
+ * effective kstat will drop to the baseline and the bursting_sec value will
+ * decrease until it hits 0, at which time the effective kstat will return to
+ * the full burst value and the bursting_sec value will begin to increase
+ * again.
+ *
* Accounting
* ==========
*
@@ -203,18 +230,28 @@ static void caps_update();
*/
struct cap_kstat {
kstat_named_t cap_value;
+ kstat_named_t cap_baseline;
+ kstat_named_t cap_effective;
+ kstat_named_t cap_burst_limit;
+ kstat_named_t cap_bursting;
kstat_named_t cap_usage;
kstat_named_t cap_nwait;
kstat_named_t cap_below;
kstat_named_t cap_above;
+ kstat_named_t cap_above_base;
kstat_named_t cap_maxusage;
kstat_named_t cap_zonename;
} cap_kstat = {
{ "value", KSTAT_DATA_UINT64 },
+ { "baseline", KSTAT_DATA_UINT64 },
+ { "effective", KSTAT_DATA_UINT64 },
+ { "burst_limit_sec", KSTAT_DATA_UINT64 },
+ { "bursting_sec", KSTAT_DATA_UINT64 },
{ "usage", KSTAT_DATA_UINT64 },
{ "nwait", KSTAT_DATA_UINT64 },
{ "below_sec", KSTAT_DATA_UINT64 },
{ "above_sec", KSTAT_DATA_UINT64 },
+ { "above_base_sec", KSTAT_DATA_UINT64 },
{ "maxusage", KSTAT_DATA_UINT64 },
{ "zonename", KSTAT_DATA_STRING },
};
@@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
cap->cap_below = cap->cap_above = 0;
cap->cap_maxusage = 0;
cap->cap_usage = 0;
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
waitq_unblock(&cap->cap_waitq);
if (CPUCAPS_OFF()) {
cpucaps_enabled = B_TRUE;
@@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap)
ASSERT(CAP_ENABLED(cap));
waitq_block(&cap->cap_waitq);
+
+ /* do this first to avoid race with cap_kstat_update */
+ if (cap->cap_kstat != NULL) {
+ kstat_delete(cap->cap_kstat);
+ cap->cap_kstat = NULL;
+ }
+
list_remove(l, cap);
if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
cpucaps_enabled = B_FALSE;
cpucaps_clock_callout = NULL;
}
- cap->cap_value = 0;
+ cap->cap_value = cap->cap_chk_value = 0;
cap->cap_project = NULL;
cap->cap_zone = NULL;
- if (cap->cap_kstat != NULL) {
- kstat_delete(cap->cap_kstat);
- cap->cap_kstat = NULL;
- }
-
}
/*
@@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
* The waitq_isempty check is performed without the waitq lock. If a new thread
* is placed on the waitq right after the check, it will be picked up during the
* next invocation of cap_poke_waitq().
+ *
+ * Called once per tick for zones.
*/
/* ARGSUSED */
static void
@@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
ASSERT(MUTEX_HELD(&caps_lock));
- if (cap->cap_usage >= cap->cap_value) {
+ if (cap->cap_base != 0) {
+ /*
+ * Because of the way usage is calculated and decayed, it's
+ * possible for the zone to be slightly over its cap, but we
+ * don't want to count that after we have reduced the effective
+ * cap to the baseline. That way the zone will be able to
+ * burst again after the burst_limit has expired.
+ */
+ if (cap->cap_usage > cap->cap_base &&
+ cap->cap_chk_value == cap->cap_value) {
+ cap->cap_above_base++;
+
+ /*
+ * If bursting is limited and we've been bursting
+ * longer than we're supposed to, then set the
+ * effective cap to the baseline.
+ */
+ if (cap->cap_burst_limit != 0) {
+ cap->cap_bursting++;
+ if (cap->cap_bursting >= cap->cap_burst_limit)
+ cap->cap_chk_value = cap->cap_base;
+ }
+ } else if (cap->cap_bursting > 0) {
+ /*
+ * We're not bursting now, but we were, decay the
+ * bursting timer.
+ */
+ cap->cap_bursting--;
+ /*
+ * Reset the effective cap once we decay to 0 so we
+ * can burst again.
+ */
+ if (cap->cap_bursting == 0 &&
+ cap->cap_chk_value != cap->cap_value)
+ cap->cap_chk_value = cap->cap_value;
+ }
+ }
+
+ if (cap->cap_usage >= cap->cap_chk_value) {
cap->cap_above++;
} else {
waitq_t *wq = &cap->cap_waitq;
cap->cap_below++;
- if (!waitq_isempty(wq))
- waitq_runone(wq);
+ if (!waitq_isempty(wq)) {
+ int i, ndequeue, p;
+
+ /*
+ * Since this function is only called once per tick,
+ * we can hit a situation where we have artificially
+ * limited the project/zone below its cap. This would
+ * happen if we have multiple threads queued up but
+ * only dequeued one thread/tick. To avoid this we
+ * dequeue multiple threads, calculated based on the
+ * usage percentage of the cap. It is possible that we
+ * could dequeue too many threads and some of them
+ * might be put back on the wait queue quickly, but
+ * since we know that threads are on the wait queue
+ * because we're capping, we know that there are unused
+ * CPU cycles anyway, so this extra work would not
+ * hurt. Also, the ndequeue number is only an upper
+ * bound and we might dequeue less, depending on how
+ * many threads are actually in the wait queue. The
+ * ndequeue values are empirically derived and could be
+ * adjusted or calculated in another way if necessary.
+ */
+ p = (int)((100 * cap->cap_usage) / cap->cap_chk_value);
+ if (p >= 98)
+ ndequeue = 10;
+ else if (p >= 95)
+ ndequeue = 20;
+ else if (p >= 90)
+ ndequeue = 40;
+ else if (p >= 85)
+ ndequeue = 80;
+ else
+ ndequeue = 160;
+
+ for (i = 0; i < ndequeue; i++) {
+ waitq_runone(wq);
+ if (waitq_isempty(wq))
+ break;
+ }
+ DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i);
+ }
}
}
@@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
* Remove all projects in this zone without caps
* from the capped_projects list.
*/
- if (project_cap->cap_value == MAX_USAGE) {
+ if (project_cap->cap_chk_value == MAX_USAGE) {
cap_project_disable(kpj);
}
} else if (CAP_DISABLED(project_cap)) {
/*
* Add the project to capped_projects list.
*/
- ASSERT(project_cap->cap_value == 0);
+ ASSERT(project_cap->cap_chk_value == 0);
cap_project_enable(kpj, MAX_USAGE);
}
mutex_exit(&caps_lock);
@@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
/*
* No state transitions, just change the value
*/
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
}
ASSERT(MUTEX_HELD(&caps_lock));
@@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
}
/*
+ * Set zone's base cpu value to base_val
+ */
+int
+cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= MAXCAP);
+ if (base_val > MAXCAP)
+ base_val = MAXCAP;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = base_val * cap_tick_cost;
+ if (value < 0 || value > cap->cap_value)
+ value = 0;
+
+ cap->cap_base = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
+ * Set zone's maximum burst time in seconds. A burst time of 0 means that
+ * the zone can run over its baseline indefinitely.
+ */
+int
+cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= INT_MAX);
+ /* Treat the default as 0 - no limit */
+ if (base_val == INT_MAX)
+ base_val = 0;
+ if (base_val > INT_MAX)
+ base_val = INT_MAX;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = SEC_TO_TICK(base_val);
+ if (value < 0)
+ value = 0;
+
+ cap->cap_burst_limit = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
* The project is going away so disable its cap.
*/
void
@@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
if (CAP_DISABLED(cap))
cap_project_enable(kpj, value);
else
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
} else if (CAP_ENABLED(cap)) {
/*
* User requested to drop a cap on the project. If it is part of
@@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
* otherwise disable the cap.
*/
if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
- cap->cap_value = MAX_USAGE;
+ cap->cap_value = cap->cap_chk_value = MAX_USAGE;
} else {
cap_project_disable(kpj);
}
@@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone)
}
/*
+ * Get current zone baseline.
+ */
+rctl_qty_t
+cpucaps_zone_get_base(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0);
+}
+
+/*
+ * Get current zone maximum burst time.
+ */
+rctl_qty_t
+cpucaps_zone_get_burst_time(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0);
+}
+
+/*
* Charge project of thread t the time thread t spent on CPU since previously
* adjusted.
*
@@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
project_cap = kpj->kpj_cpucap;
- if (project_cap->cap_usage >= project_cap->cap_value) {
+ if (project_cap->cap_usage >= project_cap->cap_chk_value) {
t->t_schedflag |= TS_PROJWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_PROJWAITQ) {
@@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
} else {
cpucap_t *zone_cap = zone->zone_cpucap;
- if (zone_cap->cap_usage >= zone_cap->cap_value) {
+ if (zone_cap->cap_usage >= zone_cap->cap_chk_value) {
t->t_schedflag |= TS_ZONEWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_ZONEWAITQ) {
@@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t)
/*
* Convert internal cap statistics into values exported by cap kstat.
+ * Note that the kstat is held throughout this function but caps_lock is not.
*/
static int
cap_kstat_update(kstat_t *ksp, int rw)
@@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_value.value.ui64 =
ROUND_SCALE(cap->cap_value, cap_tick_cost);
+ capsp->cap_baseline.value.ui64 =
+ ROUND_SCALE(cap->cap_base, cap_tick_cost);
+ capsp->cap_effective.value.ui64 =
+ ROUND_SCALE(cap->cap_chk_value, cap_tick_cost);
+ capsp->cap_burst_limit.value.ui64 =
+ ROUND_SCALE(cap->cap_burst_limit, tick_sec);
capsp->cap_usage.value.ui64 =
ROUND_SCALE(cap->cap_usage, cap_tick_cost);
capsp->cap_maxusage.value.ui64 =
@@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
+ capsp->cap_above_base.value.ui64 =
+ ROUND_SCALE(cap->cap_above_base, tick_sec);
+ capsp->cap_bursting.value.ui64 =
+ ROUND_SCALE(cap->cap_bursting, tick_sec);
kstat_named_setstr(&capsp->cap_zonename, zonename);
return (0);
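The new cap kstats can be read from user land with libkstat; a minimal sketch follows. The module/name pair "caps"/"cpucaps_zone_<zoneid>" is assumed from the existing cpucaps kstat naming; compile with -lkstat:

#include <stdio.h>
#include <kstat.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* "cpucaps_zone_0" is an assumed instance name; adjust the zone id. */
	ksp = kstat_lookup(kc, "caps", -1, "cpucaps_zone_0");
	if (ksp == NULL || kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (1);
	}

	if ((kn = kstat_data_lookup(ksp, "effective")) != NULL)
		(void) printf("effective cap: %llu\n",
		    (u_longlong_t)kn->value.ui64);
	if ((kn = kstat_data_lookup(ksp, "bursting_sec")) != NULL)
		(void) printf("bursting for: %llu seconds\n",
		    (u_longlong_t)kn->value.ui64);

	(void) kstat_close(kc);
	return (0);
}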
diff --git a/usr/src/uts/common/disp/cpupart.c b/usr/src/uts/common/disp/cpupart.c
index 8de1f5cc37..123776a123 100644
--- a/usr/src/uts/common/disp/cpupart.c
+++ b/usr/src/uts/common/disp/cpupart.c
@@ -20,6 +20,8 @@
*/
/*
* Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
@@ -325,7 +327,7 @@ cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
kthread_t *t;
int move_threads = 1;
lgrp_id_t lgrpid;
- proc_t *p;
+ proc_t *p;
int lgrp_diff_lpl;
lpl_t *cpu_lpl;
int ret;
@@ -570,8 +572,8 @@ again:
/* Update CPU last ran on if it was this CPU */
if (t->t_cpu == cp && t->t_cpupart == oldpp &&
t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
}
t = t->t_forw;
} while (t != p->p_tlist);
@@ -623,8 +625,8 @@ again:
/* Update CPU last ran on if it was this CPU */
if (t->t_cpu == cp && t->t_cpupart == oldpp &&
t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
- t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
}
t = t->t_next;
@@ -879,7 +881,7 @@ cpupart_create(psetid_t *psid)
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
- void *projbuf, *zonebuf;
+ void *projbuf, *zonebuf;
kthread_t *t;
proc_t *p;
int err = 0;
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index 0c2c0b4993..4898a18bf2 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2018, Joyent, Inc. All rights reserved.
+ */
+
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
@@ -56,6 +60,7 @@
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/archsystm.h>
+#include <sys/ht.h>
#include <vm/as.h>
@@ -105,7 +110,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri);
/*
* If this is set, only interrupt threads will cause kernel preemptions.
* This is done by changing the value of kpreemptpri. kpreemptpri
- * will either be the max sysclass pri + 1 or the min interrupt pri.
+ * will either be the max sysclass pri or the min interrupt pri.
*/
int only_intr_kpreempt;
@@ -252,7 +257,23 @@ dispinit(void)
maxglobpri = cl_maxglobpri;
}
}
- kpreemptpri = (pri_t)v.v_maxsyspri + 1;
+
+ /*
+ * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
+ * to say, maxclsyspri + 1. However, over time, the system has used
+ * more and more asynchronous kernel threads, with an increasing number
+ * of these doing work on direct behalf of higher-level software (e.g.,
+ * network processing). This has led to potential priority inversions:
+ * threads doing low-priority lengthy kernel work can effectively
+ * delay kernel-level processing of higher-priority data. To minimize
+ * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
+ * the kernel that runs at maxclsyspri will therefore induce kernel
+ * preemption, and this priority should be used if/when an asynchronous
+ * thread (or, as is often the case, task queue) is performing a task
+ * on behalf of higher-level software (or any task that is otherwise
+ * latency-sensitive).
+ */
+ kpreemptpri = (pri_t)v.v_maxsyspri;
if (kpqpri == KPQPRI)
kpqpri = kpreemptpri;
@@ -1115,15 +1136,13 @@ swtch_to(kthread_t *next)
*/
}
-#define CPU_IDLING(pri) ((pri) == -1)
-
static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
int call_poke_cpu = 0;
pri_t cpupri = cp->cpu_dispatch_pri;
- if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
+ if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
"CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
@@ -1219,17 +1238,17 @@ setbackdq(kthread_t *tp)
/*
* We'll generally let this thread continue to run where
* it last ran...but will consider migration if:
- * - We thread probably doesn't have much cache warmth.
+ * - The thread probably doesn't have much cache warmth.
+ * - HT exclusion would prefer us to run elsewhere.
* - The CPU where it last ran is the target of an offline
* request.
- * - The thread last ran outside it's home lgroup.
+ * - The thread last ran outside its home lgroup.
*/
if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
- (tp->t_cpu == cpu_inmotion)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
- } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- self ? tp->t_cpu : NULL);
+ !ht_should_run(tp, tp->t_cpu) ||
+ (tp->t_cpu == cpu_inmotion) ||
+ !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
+ cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
} else {
cp = tp->t_cpu;
}
@@ -1258,7 +1277,8 @@ setbackdq(kthread_t *tp)
newcp = cp->cpu_next_part;
}
- if (RUNQ_LEN(newcp, tpri) < qlen) {
+ if (ht_should_run(tp, newcp) &&
+ RUNQ_LEN(newcp, tpri) < qlen) {
DTRACE_PROBE3(runq__balance,
kthread_t *, tp,
cpu_t *, cp, cpu_t *, newcp);
@@ -1269,8 +1289,8 @@ setbackdq(kthread_t *tp)
/*
* Migrate to a cpu in the new partition.
*/
- cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
- tp->t_lpl, tp->t_pri, NULL);
+ cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
+ tp->t_pri);
}
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
@@ -1407,7 +1427,7 @@ setfrontdq(kthread_t *tp)
/*
* We'll generally let this thread continue to run
* where it last ran, but will consider migration if:
- * - The thread last ran outside it's home lgroup.
+ * - The thread last ran outside its home lgroup.
* - The CPU where it last ran is the target of an
* offline request (a thread_nomigrate() on the in
* motion CPU relies on this when forcing a preempt).
@@ -1415,21 +1435,18 @@ setfrontdq(kthread_t *tp)
* it last ran, and it is considered not likely to
* have significant cache warmth.
*/
- if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
- (cp == cpu_inmotion)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- (tp == curthread) ? cp : NULL);
- } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
- (!THREAD_HAS_CACHE_WARMTH(tp))) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- NULL);
+ if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
+ cp == cpu_inmotion ||
+ (tpri < cp->cpu_disp->disp_maxrunpri &&
+ !THREAD_HAS_CACHE_WARMTH(tp))) {
+ cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
}
} else {
/*
* Migrate to a cpu in the new partition.
*/
cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
- tp->t_lpl, tp->t_pri, NULL);
+ tp, tp->t_pri);
}
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
@@ -1580,7 +1597,7 @@ setkpdq(kthread_t *tp, int borf)
/* migrate to a cpu in the new partition */
cp = tp->t_cpupart->cp_cpulist;
}
- cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
+ cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
disp_lock_enter_high(&cp->cpu_disp->disp_lock);
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
@@ -2258,7 +2275,7 @@ disp_getbest(disp_t *dp)
* placed earlier.
*/
if (tcp == NULL ||
- pri >= minclsyspri ||
+ (pri >= minclsyspri && tp->t_procp == &p0) ||
tp->t_cpu != tcp)
break;
@@ -2553,80 +2570,85 @@ disp_cpu_inactive(cpu_t *cp)
}
/*
- * disp_lowpri_cpu - find CPU running the lowest priority thread.
- * The hint passed in is used as a starting point so we don't favor
- * CPU 0 or any other CPU. The caller should pass in the most recently
- * used CPU for the thread.
+ * Return a score rating this CPU for running this thread: lower is better.
+ *
+ * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
+ * curcpu (as that's our own priority).
+ *
+ * If a cpu is the target of an offline request, then try to avoid it.
*
- * The lgroup and priority are used to determine the best CPU to run on
- * in a NUMA machine. The lgroup specifies which CPUs are closest while
- * the thread priority will indicate whether the thread will actually run
- * there. To pick the best CPU, the CPUs inside and outside of the given
- * lgroup which are running the lowest priority threads are found. The
- * remote CPU is chosen only if the thread will not run locally on a CPU
- * within the lgroup, but will run on the remote CPU. If the thread
- * cannot immediately run on any CPU, the best local CPU will be chosen.
+ * Otherwise we'll use double the effective dispatcher priority for the CPU.
+ *
+ * We do this so ht_adjust_cpu_score() can increment the score if needed,
+ * without ending up overriding a dispatcher priority.
+ */
+static pri_t
+cpu_score(cpu_t *cp, kthread_t *tp)
+{
+ pri_t score;
+
+ if (tp == curthread && cp == curthread->t_cpu)
+ score = 2 * CPU_IDLE_PRI;
+ else if (cp == cpu_inmotion)
+ score = SHRT_MAX;
+ else
+ score = 2 * cp->cpu_dispatch_pri;
+
+ if (2 * cp->cpu_disp->disp_maxrunpri > score)
+ score = 2 * cp->cpu_disp->disp_maxrunpri;
+ if (2 * cp->cpu_chosen_level > score)
+ score = 2 * cp->cpu_chosen_level;
+
+ return (ht_adjust_cpu_score(tp, cp, score));
+}
+
+/*
+ * disp_lowpri_cpu - find a suitable CPU to run the given thread.
*
- * The lpl specified also identifies the cpu partition from which
- * disp_lowpri_cpu should select a CPU.
+ * We are looking for a CPU with an effective dispatch priority lower than the
+ * thread's, so that the thread will run immediately rather than be enqueued.
+ * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
+ * If we don't find an available CPU there, we will expand our search to include
+ * wider locality levels. (Note these groups are already divided by CPU
+ * partition.)
*
- * curcpu is used to indicate that disp_lowpri_cpu is being called on
- * behalf of the current thread. (curthread is looking for a new cpu)
- * In this case, cpu_dispatch_pri for this thread's cpu should be
- * ignored.
+ * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
+ * the best home CPU we found.
*
- * If a cpu is the target of an offline request then try to avoid it.
+ * The hint passed in is used as a starting point so we don't favor CPU 0 or any
+ * other CPU. The caller should pass in the most recently used CPU for the
+ * thread; it's of course possible that this CPU isn't in the home lgroup.
*
- * This function must be called at either high SPL, or with preemption
- * disabled, so that the "hint" CPU cannot be removed from the online
- * CPU list while we are traversing it.
+ * This function must be called at either high SPL, or with preemption disabled,
+ * so that the "hint" CPU cannot be removed from the online CPU list while we
+ * are traversing it.
*/
cpu_t *
-disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
+disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
{
cpu_t *bestcpu;
cpu_t *besthomecpu;
cpu_t *cp, *cpstart;
- pri_t bestpri;
- pri_t cpupri;
-
klgrpset_t done;
- klgrpset_t cur_set;
lpl_t *lpl_iter, *lpl_leaf;
- int i;
- /*
- * Scan for a CPU currently running the lowest priority thread.
- * Cannot get cpu_lock here because it is adaptive.
- * We do not require lock on CPU list.
- */
ASSERT(hint != NULL);
- ASSERT(lpl != NULL);
- ASSERT(lpl->lpl_ncpu > 0);
+ ASSERT(tp->t_lpl->lpl_ncpu > 0);
- /*
- * First examine local CPUs. Note that it's possible the hint CPU
- * passed in in remote to the specified home lgroup. If our priority
- * isn't sufficient enough such that we can run immediately at home,
- * then examine CPUs remote to our home lgroup.
- * We would like to give preference to CPUs closest to "home".
- * If we can't find a CPU where we'll run at a given level
- * of locality, we expand our search to include the next level.
- */
bestcpu = besthomecpu = NULL;
klgrpset_clear(done);
- /* start with lpl we were passed */
- lpl_iter = lpl;
+ lpl_iter = tp->t_lpl;
do {
+ pri_t best = SHRT_MAX;
+ klgrpset_t cur_set;
- bestpri = SHRT_MAX;
klgrpset_clear(cur_set);
- for (i = 0; i < lpl_iter->lpl_nrset; i++) {
+ for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
lpl_leaf = lpl_iter->lpl_rset[i];
if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
continue;
@@ -2639,34 +2661,25 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
cp = cpstart = lpl_leaf->lpl_cpus;
do {
- if (cp == curcpu)
- cpupri = -1;
- else if (cp == cpu_inmotion)
- cpupri = SHRT_MAX;
- else
- cpupri = cp->cpu_dispatch_pri;
- if (cp->cpu_disp->disp_maxrunpri > cpupri)
- cpupri = cp->cpu_disp->disp_maxrunpri;
- if (cp->cpu_chosen_level > cpupri)
- cpupri = cp->cpu_chosen_level;
- if (cpupri < bestpri) {
- if (CPU_IDLING(cpupri)) {
- ASSERT((cp->cpu_flags &
- CPU_QUIESCED) == 0);
- return (cp);
- }
+ pri_t score = cpu_score(cp, tp);
+
+ if (score < best) {
+ best = score;
bestcpu = cp;
- bestpri = cpupri;
+
+ /* An idle CPU: we're done. */
+ if (score / 2 == CPU_IDLE_PRI)
+ goto out;
}
} while ((cp = cp->cpu_next_lpl) != cpstart);
}
- if (bestcpu && (tpri > bestpri)) {
- ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
- return (bestcpu);
- }
+ if (bestcpu != NULL && tpri > (best / 2))
+ goto out;
+
if (besthomecpu == NULL)
besthomecpu = bestcpu;
+
/*
* Add the lgrps we just considered to the "done" set
*/
@@ -2678,8 +2691,11 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
* The specified priority isn't high enough to run immediately
* anywhere, so just return the best CPU from the home lgroup.
*/
- ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
- return (besthomecpu);
+ bestcpu = besthomecpu;
+
+out:
+ ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
+ return (bestcpu);
}
/*
@@ -2699,3 +2715,19 @@ static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}
+
+cpu_t *
+disp_choose_best_cpu(void)
+{
+ kthread_t *t = curthread;
+ cpu_t *curcpu = CPU;
+
+ ASSERT(t->t_preempt > 0);
+ ASSERT(t->t_state == TS_ONPROC);
+ ASSERT(t->t_schedflag & TS_VCPU);
+
+ if (ht_should_run(t, curcpu))
+ return (curcpu);
+
+ return (disp_lowpri_cpu(curcpu, t, t->t_pri));
+}
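A note on the doubling in cpu_score(): because scores are twice the effective dispatch priority, ht_adjust_cpu_score() can add a small penalty to de-prefer a sibling-contended CPU without that nudge ever outweighing a genuine one-level priority difference, and tpri > (best / 2) still compares true priorities. A standalone toy model of the idea (not the kernel's code; the +1 penalty is a stand-in for whatever ht_adjust_cpu_score() applies):

#include <stdio.h>

/*
 * Toy model: scores are 2 * dispatch priority (lower is better), so a +1
 * HT penalty can break a tie between equal-priority CPUs but can never
 * outweigh a genuine one-level priority difference, which costs +2.
 */
static int
score(int dispatch_pri, int ht_penalty)
{
	return (2 * dispatch_pri + ht_penalty);
}

int
main(void)
{
	/* Two CPUs at priority 10: the HT-contended one loses the tie. */
	(void) printf("%d vs %d\n", score(10, 0), score(10, 1)); /* 20, 21 */

	/* A penalized priority-9 CPU still beats a clean priority-10 CPU. */
	(void) printf("%d vs %d\n", score(9, 1), score(10, 0)); /* 19, 20 */
	return (0);
}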
diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c
index adb70871e2..5b190242e6 100644
--- a/usr/src/uts/common/disp/fx.c
+++ b/usr/src/uts/common/disp/fx.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -71,16 +71,6 @@ static struct modlinkage modlinkage = {
};
-/*
- * control flags (kparms->fx_cflags).
- */
-#define FX_DOUPRILIM 0x01 /* change user priority limit */
-#define FX_DOUPRI 0x02 /* change user priority */
-#define FX_DOTQ 0x04 /* change FX time quantum */
-
-
-#define FXMAXUPRI 60 /* maximum user priority setting */
-
#define FX_MAX_UNPRIV_PRI 0 /* maximum unpriviledge priority */
/*
diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c
index 5412df83f5..60e870ba28 100644
--- a/usr/src/uts/common/disp/priocntl.c
+++ b/usr/src/uts/common/disp/priocntl.c
@@ -114,7 +114,7 @@ copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap, uio_seg_t seg)
#endif
-static int donice(procset_t *, pcnice_t *);
+int donice(procset_t *, pcnice_t *);
static int doprio(procset_t *, pcprio_t *);
static int proccmp(proc_t *, struct pcmpargs *);
static int setparms(proc_t *, struct stprmargs *);
@@ -991,7 +991,7 @@ setprocnice(proc_t *pp, pcnice_t *pcnice)
/*
* Update the nice value of the specified LWP or set of processes.
*/
-static int
+int
donice(procset_t *procset, pcnice_t *pcnice)
{
int err_proc = 0;
diff --git a/usr/src/uts/common/disp/rt.c b/usr/src/uts/common/disp/rt.c
index f87f8c56ce..115e42ccb8 100644
--- a/usr/src/uts/common/disp/rt.c
+++ b/usr/src/uts/common/disp/rt.c
@@ -22,7 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -103,13 +103,6 @@ _info(struct modinfo *modinfop)
pri_t rt_maxpri = RTMAXPRI; /* maximum real-time priority */
rtdpent_t *rt_dptbl; /* real-time dispatcher parameter table */
-/*
- * control flags (kparms->rt_cflags).
- */
-#define RT_DOPRI 0x01 /* change priority */
-#define RT_DOTQ 0x02 /* change RT time quantum */
-#define RT_DOSIG 0x04 /* change RT time quantum signal */
-
static int rt_admin(caddr_t, cred_t *);
static int rt_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
static int rt_fork(kthread_t *, kthread_t *, void *);
diff --git a/usr/src/uts/common/disp/rt_dptbl.c b/usr/src/uts/common/disp/rt_dptbl.c
index 768b499ef2..cc88ed72fc 100644
--- a/usr/src/uts/common/disp/rt_dptbl.c
+++ b/usr/src/uts/common/disp/rt_dptbl.c
@@ -28,8 +28,6 @@
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
/* All Rights Reserved */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/proc.h>
#include <sys/priocntl.h>
#include <sys/class.h>
@@ -70,8 +68,6 @@ _info(struct modinfo *modinfop)
return (mod_info(&modlinkage, modinfop));
}
-#define RTGPPRIO0 100 /* Global priority for RT priority 0 */
-
rtdpent_t config_rt_dptbl[] = {
/* prilevel Time quantum */
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index 854b33798d..d576738e75 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -75,6 +75,11 @@
#include <sys/cpucaps.h>
#include <sys/kiconv.h>
#include <sys/ctype.h>
+#include <sys/ht.h>
+
+#ifndef STACK_GROWTH_DOWN
+#error Stacks do not grow downward; 3b2 zombie attack detected!
+#endif
struct kmem_cache *thread_cache; /* cache of free threads */
struct kmem_cache *lwp_cache; /* cache of free lwps */
@@ -373,7 +378,7 @@ thread_create(
if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
cmn_err(CE_PANIC, "thread_create: proposed stack size"
" too small to hold thread.");
-#ifdef STACK_GROWTH_DOWN
+
stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
stksize &= -PTR24_ALIGN; /* make thread aligned */
t = (kthread_t *)(stk + stksize);
@@ -382,13 +387,6 @@ thread_create(
audit_thread_create(t);
t->t_stk = stk + stksize;
t->t_stkbase = stk;
-#else /* stack grows to larger addresses */
- stksize -= SA(sizeof (kthread_t));
- t = (kthread_t *)(stk);
- bzero(t, sizeof (kthread_t));
- t->t_stk = stk + sizeof (kthread_t);
- t->t_stkbase = stk + stksize + sizeof (kthread_t);
-#endif /* STACK_GROWTH_DOWN */
t->t_flag |= T_TALLOCSTK;
t->t_swap = stk;
} else {
@@ -401,13 +399,8 @@ thread_create(
* Initialize t_stk to the kernel stack pointer to use
* upon entry to the kernel
*/
-#ifdef STACK_GROWTH_DOWN
t->t_stk = stk + stksize;
t->t_stkbase = stk;
-#else
- t->t_stk = stk; /* 3b2-like */
- t->t_stkbase = stk + stksize;
-#endif /* STACK_GROWTH_DOWN */
}
if (kmem_stackinfo != 0) {
@@ -487,15 +480,9 @@ thread_create(
curthread->t_prev = t;
/*
- * Threads should never have a NULL t_cpu pointer so assign it
- * here. If the thread is being created with state TS_RUN a
- * better CPU may be chosen when it is placed on the run queue.
- *
- * We need to keep kernel preemption disabled when setting all
- * three fields to keep them in sync. Also, always create in
- * the default partition since that's where kernel threads go
- * (if this isn't a kernel thread, t_cpupart will be changed
- * in lwp_create before setting the thread runnable).
+ * We'll always create in the default partition since that's where
+ * kernel threads go (we'll change this later if needed, in
+ * lwp_create()).
*/
t->t_cpupart = &cp_default;
@@ -504,20 +491,23 @@ thread_create(
* Since the kernel does not (presently) allocate its memory
* in a locality aware fashion, the root is an appropriate home.
* If this thread is later associated with an lwp, it will have
- * it's lgroup re-assigned at that time.
+ * its lgroup re-assigned at that time.
*/
lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1);
/*
- * Inherit the current cpu. If this cpu isn't part of the chosen
- * lgroup, a new cpu will be chosen by cpu_choose when the thread
- * is ready to run.
+ * If the current CPU is in the default cpupart, use it. Otherwise,
+ * pick one that is; before entering the dispatcher code, we'll
+ * make sure to keep the invariant that ->t_cpu is set. (In fact, we
+ * rely on this, in ht_should_run(), in the call tree of
+ * disp_lowpri_cpu().)
*/
- if (CPU->cpu_part == &cp_default)
+ if (CPU->cpu_part == &cp_default) {
t->t_cpu = CPU;
- else
- t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
- t->t_pri, NULL);
+ } else {
+ t->t_cpu = cp_default.cp_cpulist;
+ t->t_cpu = disp_lowpri_cpu(t->t_cpu, t, t->t_pri);
+ }
t->t_disp_queue = t->t_cpu->cpu_disp;
kpreempt_enable();
@@ -590,6 +580,9 @@ thread_exit(void)
if ((t->t_proc_flag & TP_ZTHREAD) != 0)
cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
+ if ((t->t_flag & T_SPLITSTK) != 0)
+ cmn_err(CE_PANIC, "thread_exit: called when stack is split");
+
tsd_exit(); /* Clean up this thread's TSD */
kcpc_passivate(); /* clean up performance counter state */
@@ -870,12 +863,12 @@ thread_zone_destroy(zoneid_t zoneid, void *unused)
/*
* Guard against race condition in mutex_owner_running:
- * thread=owner(mutex)
- * <interrupt>
- * thread exits mutex
- * thread exits
- * thread reaped
- * thread struct freed
+ * thread=owner(mutex)
+ * <interrupt>
+ * thread exits mutex
+ * thread exits
+ * thread reaped
+ * thread struct freed
* cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
* A cross call to all cpus will cause the interrupt handler
* to reset the PC if it is in mutex_owner_running, refreshing
@@ -932,12 +925,12 @@ thread_reaper()
/*
* Guard against race condition in mutex_owner_running:
- * thread=owner(mutex)
- * <interrupt>
- * thread exits mutex
- * thread exits
- * thread reaped
- * thread struct freed
+ * thread=owner(mutex)
+ * <interrupt>
+ * thread exits mutex
+ * thread exits
+ * thread reaped
+ * thread struct freed
* cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
* A cross call to all cpus will cause the interrupt handler
* to reset the PC if it is in mutex_owner_running, refreshing
@@ -1055,8 +1048,44 @@ installctx(
ctx->exit_op = exit;
ctx->free_op = free;
ctx->arg = arg;
- ctx->next = t->t_ctx;
+ ctx->save_ts = 0;
+ ctx->restore_ts = 0;
+
+ /*
+ * Keep ctxops in a doubly-linked list to allow traversal in both
+ * directions. Using only the newest-to-oldest ordering was adequate
+ * previously, but reversing the order for restore_op actions is
+ * necessary if later-added ctxops depend on earlier ones.
+ *
+ * One example of such a dependency: Hypervisor software handling the
+ * guest FPU expects to save the guest FPU state before host FPU
+ * handling occurs, and consequently to run its guest restore logic
+ * _after_ the host FPU state has been restored.
+ *
+ * The t_ctx member points to the most recently added ctxop or is NULL
+ * if no ctxops are associated with the thread. The 'next' pointers
+ * form a loop of the ctxops in newest-to-oldest order. The 'prev'
+ * pointers form a loop in the reverse direction, where t_ctx->prev is
+ * the oldest entry associated with the thread.
+ *
+ * The protection of kpreempt_disable is required to safely perform the
+ * list insertion, since there are inconsistent states between some of
+ * the pointer assignments.
+ */
+ kpreempt_disable();
+ if (t->t_ctx == NULL) {
+ ctx->next = ctx;
+ ctx->prev = ctx;
+ } else {
+ struct ctxop *head = t->t_ctx, *tail = t->t_ctx->prev;
+
+ ctx->next = head;
+ ctx->prev = tail;
+ head->prev = ctx;
+ tail->next = ctx;
+ }
t->t_ctx = ctx;
+ kpreempt_enable();
}
/*
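A standalone sketch of the list discipline installctx() now maintains — t_ctx points at the newest entry, 'next' walks newest-to-oldest (the savectx() order), and head->prev reaches the oldest entry, from which 'prev' walks oldest-to-newest (the restorectx() order). The names are illustrative, not the kernel's:

#include <stdio.h>

typedef struct node {
	struct node *next;	/* toward older entries */
	struct node *prev;	/* toward newer entries */
	int id;
} node_t;

/* Insert at head, making 'n' the newest entry (mirrors installctx()). */
static node_t *
insert_head(node_t *head, node_t *n)
{
	if (head == NULL) {
		n->next = n->prev = n;
	} else {
		node_t *tail = head->prev;

		n->next = head;
		n->prev = tail;
		head->prev = n;
		tail->next = n;
	}
	return (n);
}

int
main(void)
{
	node_t a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	node_t *head = NULL, *n;

	head = insert_head(head, &a);
	head = insert_head(head, &b);
	head = insert_head(head, &c);

	/* Forward (save order): newest to oldest -> 3 2 1 */
	n = head;
	do {
		(void) printf("%d ", n->id);
	} while ((n = n->next) != head);
	(void) printf("\n");

	/* Reverse (restore order): oldest to newest -> 1 2 3 */
	n = head->prev;
	do {
		(void) printf("%d ", n->id);
	} while ((n = n->prev) != head->prev);
	(void) printf("\n");
	return (0);
}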
@@ -1073,7 +1102,7 @@ removectx(
void (*exit)(void *),
void (*free)(void *, int))
{
- struct ctxop *ctx, *prev_ctx;
+ struct ctxop *ctx, *head;
/*
* The incoming kthread_t (which is the thread for which the
@@ -1098,17 +1127,31 @@ removectx(
* and the target thread from racing with each other during lwp exit.
*/
mutex_enter(&t->t_ctx_lock);
- prev_ctx = NULL;
kpreempt_disable();
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
+
+ if (t->t_ctx == NULL) {
+ mutex_exit(&t->t_ctx_lock);
+ kpreempt_enable();
+ return (0);
+ }
+
+ ctx = head = t->t_ctx;
+ do {
if (ctx->save_op == save && ctx->restore_op == restore &&
ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
ctx->exit_op == exit && ctx->free_op == free &&
ctx->arg == arg) {
- if (prev_ctx)
- prev_ctx->next = ctx->next;
- else
+ ctx->prev->next = ctx->next;
+ ctx->next->prev = ctx->prev;
+ if (ctx->next == ctx) {
+ /* last remaining item */
+ t->t_ctx = NULL;
+ } else if (ctx == t->t_ctx) {
+ /* fix up head of list */
t->t_ctx = ctx->next;
+ }
+ ctx->next = ctx->prev = NULL;
+
mutex_exit(&t->t_ctx_lock);
if (ctx->free_op != NULL)
(ctx->free_op)(ctx->arg, 0);
@@ -1116,44 +1159,70 @@ removectx(
kpreempt_enable();
return (1);
}
- prev_ctx = ctx;
- }
+
+ ctx = ctx->next;
+ } while (ctx != head);
+
mutex_exit(&t->t_ctx_lock);
kpreempt_enable();
-
return (0);
}
void
savectx(kthread_t *t)
{
- struct ctxop *ctx;
-
ASSERT(t == curthread);
- for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
- if (ctx->save_op != NULL)
- (ctx->save_op)(ctx->arg);
+
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->save_op != NULL) {
+ ctx->save_ts = gethrtime_unscaled();
+ (ctx->save_op)(ctx->arg);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
void
restorectx(kthread_t *t)
{
- struct ctxop *ctx;
-
ASSERT(t == curthread);
- for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
- if (ctx->restore_op != NULL)
- (ctx->restore_op)(ctx->arg);
+
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *tail;
+
+ /* Backward traversal (starting at the tail) */
+ ctx = tail = t->t_ctx->prev;
+ do {
+ if (ctx->restore_op != NULL) {
+ ctx->restore_ts = gethrtime_unscaled();
+ (ctx->restore_op)(ctx->arg);
+ }
+ ctx = ctx->prev;
+ } while (ctx != tail);
+ }
}
void
forkctx(kthread_t *t, kthread_t *ct)
{
- struct ctxop *ctx;
-
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
- if (ctx->fork_op != NULL)
- (ctx->fork_op)(t, ct);
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->fork_op != NULL) {
+ (ctx->fork_op)(t, ct);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
/*
@@ -1164,11 +1233,18 @@ forkctx(kthread_t *t, kthread_t *ct)
void
lwp_createctx(kthread_t *t, kthread_t *ct)
{
- struct ctxop *ctx;
-
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
- if (ctx->lwp_create_op != NULL)
- (ctx->lwp_create_op)(t, ct);
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->lwp_create_op != NULL) {
+ (ctx->lwp_create_op)(t, ct);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
/*
@@ -1181,11 +1257,18 @@ lwp_createctx(kthread_t *t, kthread_t *ct)
void
exitctx(kthread_t *t)
{
- struct ctxop *ctx;
-
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
- if (ctx->exit_op != NULL)
- (ctx->exit_op)(t);
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->exit_op != NULL) {
+ (ctx->exit_op)(t);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
/*
@@ -1195,14 +1278,21 @@ exitctx(kthread_t *t)
void
freectx(kthread_t *t, int isexec)
{
- struct ctxop *ctx;
-
kpreempt_disable();
- while ((ctx = t->t_ctx) != NULL) {
- t->t_ctx = ctx->next;
- if (ctx->free_op != NULL)
- (ctx->free_op)(ctx->arg, isexec);
- kmem_free(ctx, sizeof (struct ctxop));
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ ctx = head = t->t_ctx;
+ t->t_ctx = NULL;
+ do {
+ struct ctxop *next = ctx->next;
+
+ if (ctx->free_op != NULL) {
+ (ctx->free_op)(ctx->arg, isexec);
+ }
+ kmem_free(ctx, sizeof (struct ctxop));
+ ctx = next;
+ } while (ctx != head);
}
kpreempt_enable();
}
@@ -1217,17 +1307,22 @@ freectx(kthread_t *t, int isexec)
void
freectx_ctx(struct ctxop *ctx)
{
- struct ctxop *nctx;
+ struct ctxop *head = ctx;
ASSERT(ctx != NULL);
kpreempt_disable();
+
do {
- nctx = ctx->next;
- if (ctx->free_op != NULL)
+ struct ctxop *next = ctx->next;
+
+ if (ctx->free_op != NULL) {
(ctx->free_op)(ctx->arg, 0);
+ }
kmem_free(ctx, sizeof (struct ctxop));
- } while ((ctx = nctx) != NULL);
+ ctx = next;
+ } while (ctx != head);
kpreempt_enable();
}
@@ -1326,6 +1421,8 @@ thread_unpin()
itp = t->t_intr; /* interrupted thread */
t->t_intr = NULL; /* clear interrupt ptr */
+ ht_end_intr();
+
/*
* Get state from interrupt thread for the one
* it interrupted.
@@ -1422,7 +1519,7 @@ thread_create_intr(struct cpu *cp)
static kmutex_t tsd_mutex; /* linked list spin lock */
static uint_t tsd_nkeys; /* size of destructor array */
/* per-key destructor funcs */
-static void (**tsd_destructor)(void *);
+static void (**tsd_destructor)(void *);
/* list of tsd_thread's */
static struct tsd_thread *tsd_list;
@@ -1889,6 +1986,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
return (on_rq);
}
+
+/*
+ * There are occasions in the kernel when we need much more stack than we
+ * allocate by default, but we do not wish to have that work done
+ * asynchronously by another thread. To accommodate these scenarios, we allow
+ * for a split stack (also known as a "segmented stack") whereby a new stack
+ * is dynamically allocated and the current thread jumps onto it for purposes
+ * of executing the specified function. After the specified function returns,
+ * the stack is deallocated and control is returned to the caller. This
+ * functionality is implemented by thread_splitstack(), below; there are a few
+ * constraints on its use:
+ *
+ * - The caller must be in a context where it is safe to block for memory.
+ * - The caller cannot be in a t_onfault context.
+ * - The called function must not call thread_exit() while on the split stack.
+ *
+ * The code will explicitly panic if these constraints are violated. Notably,
+ * however, thread_splitstack() _can_ be called on a split stack -- there
+ * is no limit to the level that split stacks can nest.
+ *
+ * When the stack is split, it is constructed such that stack backtraces
+ * from kernel debuggers continue to function -- though note that DTrace's
+ * stack() action and stackdepth function will only show the stack up to and
+ * including thread_splitstack_run(); DTrace explicitly bounds itself to
+ * pointers that exist within the current declared stack as a safety
+ * mechanism.
+ */
+void
+thread_splitstack(void (*func)(void *), void *arg, size_t stksize)
+{
+ kthread_t *t = curthread;
+ caddr_t ostk, ostkbase, stk;
+ ushort_t otflag;
+
+ if (t->t_onfault != NULL)
+ panic("thread_splitstack: called with non-NULL t_onfault");
+
+ ostk = t->t_stk;
+ ostkbase = t->t_stkbase;
+ otflag = t->t_flag;
+
+ stksize = roundup(stksize, PAGESIZE);
+
+ if (stksize < default_stksize)
+ stksize = default_stksize;
+
+ if (stksize == default_stksize) {
+ stk = (caddr_t)segkp_cache_get(segkp_thread);
+ } else {
+ stksize = roundup(stksize, PAGESIZE);
+ stk = (caddr_t)segkp_get(segkp, stksize,
+ (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
+ }
+
+ /*
+ * We're going to lock ourselves before we set T_SPLITSTK to ensure
+ * that we're not swapped out in the meantime. (Note that we don't
+ * bother to set t_swap, as we're not going to be swapped out.)
+ */
+ thread_lock(t);
+
+ if (!(otflag & T_SPLITSTK))
+ t->t_flag |= T_SPLITSTK;
+
+ t->t_stk = stk + stksize;
+ t->t_stkbase = stk;
+
+ thread_unlock(t);
+
+ /*
+ * Now actually run on the new (split) stack...
+ */
+ thread_splitstack_run(t->t_stk, func, arg);
+
+ /*
+ * We're back onto our own stack; lock ourselves and restore our
+ * pre-split state.
+ */
+ thread_lock(t);
+
+ t->t_stk = ostk;
+ t->t_stkbase = ostkbase;
+
+ if (!(otflag & T_SPLITSTK))
+ t->t_flag &= ~T_SPLITSTK;
+
+ thread_unlock(t);
+
+ /*
+ * Now that we are entirely back on our own stack, call back into
+ * the platform layer to perform any platform-specific cleanup.
+ */
+ thread_splitstack_cleanup();
+
+ segkp_release(segkp, stk);
+}
+
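As a usage illustration only (deep_work is hypothetical, not part of this change): a caller that must run a stack-hungry routine synchronously, subject to the constraints above, might look like this:

/*
 * Hypothetical in-kernel caller (deep_work is illustrative only): run a
 * stack-hungry routine on a dynamically allocated 64K stack, blocking
 * until it returns. Per the constraints above, the caller must be able
 * to block for memory and must not be in a t_onfault context.
 */
static void
deep_work(void *arg)
{
	char big[32 * 1024];	/* reckless on the default kernel stack */

	bzero(big, sizeof (big));
	/* ... real work using 'big' and 'arg' ... */
}

static void
run_deep_work(void *arg)
{
	thread_splitstack(deep_work, arg, 64 * 1024);
	/* The split stack has already been released when we return here. */
}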
/*
* Tunable kmem_stackinfo is set, fill the kernel thread stack with a
* specific pattern.
diff --git a/usr/src/uts/common/disp/thread_intr.c b/usr/src/uts/common/disp/thread_intr.c
index 67ccc6922f..c840bdf31a 100644
--- a/usr/src/uts/common/disp/thread_intr.c
+++ b/usr/src/uts/common/disp/thread_intr.c
@@ -23,19 +23,10 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
/*
- * FILE NOTICE BEGIN
- *
- * This file should not be modified. If you wish to modify it or have it
- * modified, please contact Sun Microsystems at <LFI149367@-sun-.-com->
- * (without anti-spam dashes)
- *
- * FILE NOTICE END
+ * Copyright 2015, Joyent, Inc.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/cpuvar.h>
#include <sys/stack.h>
#include <vm/seg_kp.h>
@@ -44,6 +35,17 @@
#include <sys/sysmacros.h>
/*
+ * Use a slightly larger stack size for interrupt threads than the default.
+ * This is useful for cases where the networking stack may do an rx and a tx
+ * in the context of a single interrupt, which, when combined with various
+ * promisc hooks that need memory, can bring us dangerously close to the edge
+ * of the traditional stack sizes. This is only a few pages more than a
+ * traditional stack and, given that we don't have that many interrupt
+ * threads, the memory cost ends up being more than worthwhile.
+ */
+#define LL_INTR_STKSZ (32 * 1024)
+
+/*
* Create and initialize an interrupt thread.
*/
static void
@@ -51,7 +53,7 @@ thread_create_intr(cpu_t *cp)
{
kthread_t *tp;
- tp = thread_create(NULL, 0,
+ tp = thread_create(NULL, LL_INTR_STKSZ,
(void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
/*
@@ -97,9 +99,12 @@ thread_create_intr(cpu_t *cp)
}
/*
- * Allocate a given number of interrupt threads for a given CPU.
- * These threads will get freed by cpu_destroy_bound_threads()
- * when CPU gets unconfigured.
+ * Allocate a given number of interrupt threads for a given CPU. These threads
+ * will get freed by cpu_destroy_bound_threads() when CPU gets unconfigured.
+ *
+ * Note, high level interrupts are always serviced using cpu_intr_stack and are
+ * not allowed to block. Low level interrupts or soft-interrupts use the
+ * kthread_t's that we create through the calls to thread_create_intr().
*/
void
cpu_intr_alloc(cpu_t *cp, int n)
@@ -110,6 +115,6 @@ cpu_intr_alloc(cpu_t *cp, int n)
thread_create_intr(cp);
cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE,
- KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
- INTR_STACK_SIZE - SA(MINFRAME);
+ KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
+ INTR_STACK_SIZE - SA(MINFRAME);
}
diff --git a/usr/src/uts/common/dtrace/dtrace.c b/usr/src/uts/common/dtrace/dtrace.c
index 61cfc43693..8d5ccdc64b 100644
--- a/usr/src/uts/common/dtrace/dtrace.c
+++ b/usr/src/uts/common/dtrace/dtrace.c
@@ -7770,7 +7770,7 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
priv = DTRACE_PRIV_ALL;
} else {
*uidp = crgetuid(cr);
- *zoneidp = crgetzoneid(cr);
+ *zoneidp = crgetzonedid(cr);
priv = 0;
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
@@ -8266,7 +8266,7 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
provider->dtpv_priv.dtpp_flags = priv;
if (cr != NULL) {
provider->dtpv_priv.dtpp_uid = crgetuid(cr);
- provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
+ provider->dtpv_priv.dtpp_zoneid = crgetzonedid(cr);
}
provider->dtpv_pops = *pops;
@@ -8877,6 +8877,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
uint32_t priv;
uid_t uid;
zoneid_t zoneid;
+ dtrace_state_t *state = enab->dten_vstate->dtvs_state;
ASSERT(MUTEX_HELD(&dtrace_lock));
dtrace_ecb_create_cache = NULL;
@@ -8891,8 +8892,22 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
}
dtrace_probekey(desc, &pkey);
- dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
- &priv, &uid, &zoneid);
+ dtrace_cred2priv(state->dts_cred.dcr_cred, &priv, &uid, &zoneid);
+
+ if ((priv & DTRACE_PRIV_ZONEOWNER) &&
+ state->dts_options[DTRACEOPT_ZONE] != DTRACEOPT_UNSET) {
+ /*
+ * If we have the privilege of instrumenting all zones but we
+ * have been told to instrument but one, we will spoof this up
+ * depriving ourselves of DTRACE_PRIV_ZONEOWNER for purposes
+ * of dtrace_match(). (Note that DTRACEOPT_ZONE is not for
+ * security but rather for performance: it allows the global
+ * zone to instrument USDT probes in a local zone without
+ * requiring all zones to be instrumented.)
+ */
+ priv &= ~DTRACE_PRIV_ZONEOWNER;
+ zoneid = state->dts_options[DTRACEOPT_ZONE];
+ }
return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
enab));
diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c
index 157acc25fc..3d350ff278 100644
--- a/usr/src/uts/common/dtrace/sdt_subr.c
+++ b/usr/src/uts/common/dtrace/sdt_subr.c
@@ -97,6 +97,10 @@ static dtrace_pattr_t iscsi_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
};
+/*
+ * When adding a new provider you must add it before sdt as sdt is a catch all
+ * for remaining probes.
+ */
sdt_provider_t sdt_providers[] = {
{ "vtrace", "__vtrace_", &vtrace_attr },
{ "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER },
@@ -117,6 +121,7 @@ sdt_provider_t sdt_providers[] = {
{ "fc", "__fc_", &fc_attr },
{ "srp", "__srp_", &fc_attr },
{ "sysevent", "__sysevent_", &stab_attr },
+ { "vnd", "__vnd_", &stab_attr },
{ "sdt", NULL, &sdt_attr },
{ NULL }
};
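The ordering constraint matters because provider resolution is first-match:
each probe's function prefix is compared against the table entries in turn,
and the "sdt" entry, whose prefix is NULL, claims anything that no earlier
provider claimed. A sketch of that walk (field names illustrative, assuming
the usual sdt_provider_t layout):

    static const sdt_provider_t *
    sdt_lookup(const char *func)
    {
            const sdt_provider_t *p;

            /* First matching prefix wins; a NULL prefix matches all. */
            for (p = sdt_providers; p->sdtp_name != NULL; p++) {
                    if (p->sdtp_prefix == NULL ||
                        strncmp(func, p->sdtp_prefix,
                        strlen(p->sdtp_prefix)) == 0)
                            return (p);
            }
            return (NULL);
    }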
@@ -1151,6 +1156,34 @@ sdt_argdesc_t sdt_args[] = {
{ "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *",
"fc_port_info_t *" },
+ { "vnd", "flow-blocked", 0, 0, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "flow-blocked", 1, 1, "uint64_t", "uint64_t" },
+ { "vnd", "flow-blocked", 2, 2, "uintptr_t", "uintptr_t" },
+ { "vnd", "flow-resumed", 0, 0, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "flow-resumed", 1, 1, "uint64_t", "uint64_t" },
+ { "vnd", "flow-resumed", 2, 2, "uintptr_t", "uintptr_t" },
+ { "vnd", "drop-in", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "drop-in", 1, 1, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "drop-in", 2, 2, "mblk_t *", "etherinfo_t *" },
+ { "vnd", "drop-in", 3, 3, "const char *", "const char *" },
+ { "vnd", "drop-out", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "drop-out", 1, 1, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "drop-out", 2, 2, "mblk_t *", "etherinfo_t *" },
+ { "vnd", "drop-out", 3, 3, "const char *", "const char *" },
+ { "vnd", "drop-ctl", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "drop-ctl", 1, 1, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "drop-ctl", 2, 2, "mblk_t *", "etherinfo_t *" },
+ { "vnd", "drop-ctl", 3, 3, "const char *", "const char *" },
+ { "vnd", "send", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "send", 1, 1, "void *", "csinfo_t *" },
+ { "vnd", "send", 2, 2, "void *", "ipinfo_t *" },
+ { "vnd", "send", 3, 3, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "send", 4, 4, "mblk_t *", "etherinfo_t *" },
+ { "vnd", "recv", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "recv", 1, 1, "void *", "csinfo_t *" },
+ { "vnd", "recv", 2, 2, "void *", "ipinfo_t *" },
+ { "vnd", "recv", 3, 3, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "recv", 4, 4, "mblk_t *", "etherinfo_t *" },
{ NULL }
};
diff --git a/usr/src/uts/common/exec/aout/aout.c b/usr/src/uts/common/exec/aout/aout.c
index fc45bd9544..5dbb2ed28c 100644
--- a/usr/src/uts/common/exec/aout/aout.c
+++ b/usr/src/uts/common/exec/aout/aout.c
@@ -22,6 +22,7 @@
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2011 Bayard G. Bell. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -54,7 +55,7 @@
static int aoutexec(vnode_t *vp, execa_t *uap, uarg_t *args,
intpdata_t *idatap, int level, long *execsz, int setid,
- caddr_t exec_file, cred_t *cred, int brand_action);
+ caddr_t exec_file, cred_t *cred, int *brand_action);
static int get_aout_head(struct vnode **vpp, struct exdata *edp, long *execsz,
int *isdyn);
static int aoutcore(vnode_t *vp, proc_t *pp, cred_t *credp,
@@ -130,7 +131,7 @@ _info(struct modinfo *modinfop)
static int
aoutexec(vnode_t *vp, struct execa *uap, struct uarg *args,
struct intpdata *idatap, int level, long *execsz, int setid,
- caddr_t exec_file, cred_t *cred, int brand_action)
+ caddr_t exec_file, cred_t *cred, int *brand_action)
{
auxv32_t auxflags_auxv32;
int error;
diff --git a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c
index 9e6b6bf69e..a4078bb351 100644
--- a/usr/src/uts/common/exec/elf/elf.c
+++ b/usr/src/uts/common/exec/elf/elf.c
@@ -80,15 +80,32 @@ extern volatile size_t aslr_max_brk_skew;
#define ORIGIN_STR "ORIGIN"
#define ORIGIN_STR_SIZE 6
-static int getelfhead(vnode_t *, cred_t *, Ehdr *, int *, int *, int *);
-static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, int, caddr_t *,
- ssize_t *);
-static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, int, int, caddr_t *,
- ssize_t *, caddr_t *, ssize_t *);
-static size_t elfsize(Ehdr *, int, caddr_t, uintptr_t *);
-static int mapelfexec(vnode_t *, Ehdr *, int, caddr_t,
- Phdr **, Phdr **, Phdr **, Phdr **, Phdr *,
- caddr_t *, caddr_t *, intptr_t *, intptr_t *, size_t, long *, size_t *);
+static int getelfhead(vnode_t *, cred_t *, Ehdr *, uint_t *, uint_t *,
+ uint_t *);
+static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, uint_t, caddr_t *,
+ size_t *);
+static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, uint_t, uint_t,
+ caddr_t *, size_t *, caddr_t *, size_t *);
+static size_t elfsize(const Ehdr *, uint_t, const caddr_t, uintptr_t *);
+static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **,
+ Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *,
+ size_t, size_t *, size_t *);
+
+#ifdef _ELF32_COMPAT
+/* Link against the non-compat instances when compiling the 32-bit version. */
+extern size_t elf_datasz_max;
+extern void elf_ctx_resize_scratch(elf_core_ctx_t *, size_t);
+extern uint_t elf_nphdr_max;
+extern uint_t elf_nshdr_max;
+extern size_t elf_shstrtab_max;
+#else
+size_t elf_datasz_max = 1 * 1024 * 1024;
+uint_t elf_nphdr_max = 1000;
+uint_t elf_nshdr_max = 10000;
+size_t elf_shstrtab_max = 100 * 1024;
+#endif
+
typedef enum {
STR_CTF,
@@ -110,8 +127,8 @@ static const char *shstrtab_data[] = {
};
typedef struct shstrtab {
- int sst_ndx[STR_NUM];
- int sst_cur;
+ uint_t sst_ndx[STR_NUM];
+ uint_t sst_cur;
} shstrtab_t;
static void
@@ -121,10 +138,10 @@ shstrtab_init(shstrtab_t *s)
s->sst_cur = 1;
}
-static int
+static uint_t
shstrtab_ndx(shstrtab_t *s, shstrtype_t type)
{
- int ret;
+ uint_t ret;
if ((ret = s->sst_ndx[type]) != 0)
return (ret);
@@ -144,7 +161,7 @@ shstrtab_size(const shstrtab_t *s)
static void
shstrtab_dump(const shstrtab_t *s, char *buf)
{
- int i, ndx;
+ uint_t i, ndx;
*buf = '\0';
for (i = 0; i < STR_NUM; i++) {
@@ -206,31 +223,54 @@ handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
return (0);
}
+
+#ifndef _ELF32_COMPAT
+void
+elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz)
+{
+ size_t target = MIN(sz, elf_datasz_max);
+
+ if (target > ctx->ecc_bufsz) {
+ if (ctx->ecc_buf != NULL) {
+ kmem_free(ctx->ecc_buf, ctx->ecc_bufsz);
+ }
+ ctx->ecc_buf = kmem_alloc(target, KM_SLEEP);
+ ctx->ecc_bufsz = target;
+ }
+}
+#endif /* _ELF32_COMPAT */
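Because elf_ctx_resize_scratch() clamps its request to elf_datasz_max rather
than failing, a caller can never assume the buffer reached the requested
size; bulk copies must loop in ecc_bufsz-sized chunks, as elf_copy_scn()
below does. A hedged usage sketch:

    /*
     * Hypothetical caller: stream 'len' bytes through the scratch
     * buffer, which may remain smaller than 'len' after the clamp.
     */
    size_t off = 0;

    elf_ctx_resize_scratch(ctx, len);
    while (off < len) {
            const size_t chunk = MIN(len - off, ctx->ecc_bufsz);

            /* ... read 'chunk' bytes at 'off' into ctx->ecc_buf, */
            /* ... write them out, then advance. */
            off += chunk;
    }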
+
/*
- * Map in the executable pointed to by vp. Returns 0 on success.
+ * Map in the executable pointed to by vp. Returns 0 on success. Note that
+ * this function currently has the maximum number of arguments allowed by
+ * modstubs on x86 (MAXNARG)! Do _not_ add to this function signature without
+ * adding to MAXNARG. (Better yet, do not add to this monster of a function
+ * signature!)
*/
int
mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
- intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
- caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
+ intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
+ caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
{
- size_t len;
+ size_t len, phdrsize;
struct vattr vat;
caddr_t phdrbase = NULL;
- ssize_t phdrsize;
- int nshdrs, shstrndx, nphdrs;
+ uint_t nshdrs, shstrndx, nphdrs;
int error = 0;
Phdr *uphdr = NULL;
Phdr *junk = NULL;
Phdr *dynphdr = NULL;
Phdr *dtrphdr = NULL;
- uintptr_t lddata;
- long execsz;
- intptr_t minaddr;
+ char *interp = NULL;
+ uintptr_t lddata, minaddr;
+ size_t execsz;
if (lddatap != NULL)
*lddatap = 0;
+ if (minaddrp != NULL)
+ *minaddrp = 0;
+
if (error = execpermissions(vp, &vat, args)) {
uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
return (error);
@@ -256,25 +296,91 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
&junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
len, &execsz, brksize)) {
uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
+ if (uphdr != NULL && uphdr->p_flags == 0)
+ kmem_free(uphdr, sizeof (Phdr));
kmem_free(phdrbase, phdrsize);
return (error);
}
+ if (minaddrp != NULL)
+ *minaddrp = minaddr;
+
/*
- * Inform our caller if the executable needs an interpreter.
+ * If the executable requires an interpreter, determine its name.
*/
- *interp = (dynphdr == NULL) ? 0 : 1;
+ if (dynphdr != NULL) {
+ ssize_t resid;
+
+ if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
+ uprintf("%s: Invalid interpreter\n", exec_file);
+ kmem_free(phdrbase, phdrsize);
+ return (ENOEXEC);
+ }
+
+ interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ if ((error = vn_rdwr(UIO_READ, vp, interp,
+ (ssize_t)dynphdr->p_filesz,
+ (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
+ (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
+ interp[dynphdr->p_filesz - 1] != '\0') {
+ uprintf("%s: Cannot obtain interpreter pathname\n",
+ exec_file);
+ kmem_free(interp, MAXPATHLEN);
+ kmem_free(phdrbase, phdrsize);
+ return (error != 0 ? error : ENOEXEC);
+ }
+ }
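The checks above encode three requirements on a PT_INTERP payload: it fits in
MAXPATHLEN, it reads back completely (resid == 0), and it is NUL-terminated
on disk. Condensed, assuming 'buf' holds the 'filesz' bytes just read and
'resid' is the residual count reported by vn_rdwr():

    /*
     * Reject an interpreter path that is empty, oversized,
     * short-read, or missing its terminating NUL.
     */
    if (filesz == 0 || filesz > MAXPATHLEN ||
        resid != 0 || buf[filesz - 1] != '\0')
            return (ENOEXEC);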
/*
* If this is a statically linked executable, voffset should indicate
* the address of the executable itself (it normally holds the address
* of the interpreter).
*/
- if (ehdr->e_type == ET_EXEC && *interp == 0)
+ if (ehdr->e_type == ET_EXEC && interp == NULL)
*voffset = minaddr;
+ /*
+ * If the caller has asked for the interpreter name, return it (it's
+ * up to the caller to free it); if the caller hasn't asked for it,
+ * free it ourselves.
+ */
+ if (interpp != NULL) {
+ *interpp = interp;
+ } else if (interp != NULL) {
+ kmem_free(interp, MAXPATHLEN);
+ }
+
if (uphdr != NULL) {
*uphdr_vaddr = uphdr->p_vaddr;
+
+ if (uphdr->p_flags == 0)
+ kmem_free(uphdr, sizeof (Phdr));
+ } else if (ehdr->e_type == ET_DYN) {
+ /*
+ * If we don't have a uphdr, we'll apply the logic found
+ * in mapelfexec() and use the p_vaddr of the first PT_LOAD
+ * segment as the base address of the object.
+ */
+ const Phdr *phdr = (Phdr *)phdrbase;
+ const uint_t hsize = ehdr->e_phentsize;
+ uint_t i;
+
+ for (i = nphdrs; i > 0; i--) {
+ if (phdr->p_type == PT_LOAD) {
+ *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
+ ehdr->e_phoff;
+ break;
+ }
+
+ phdr = (Phdr *)((caddr_t)phdr + hsize);
+ }
+
+ /*
+ * If we don't have a PT_LOAD segment, we should have returned
+ * ENOEXEC when elfsize() returned 0, above.
+ */
+ VERIFY(i > 0);
} else {
*uphdr_vaddr = (Addr)-1;
}
@@ -286,14 +392,14 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
/*ARGSUSED*/
int
elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
- int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
- int brand_action)
+ int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
+ int *brand_action)
{
caddr_t phdrbase = NULL;
caddr_t bssbase = 0;
caddr_t brkbase = 0;
size_t brksize = 0;
- ssize_t dlnsize;
+ size_t dlnsize, nsize = 0;
aux_entry_t *aux;
int error;
ssize_t resid;
@@ -305,20 +411,19 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
Phdr *uphdr = NULL;
Phdr *junk = NULL;
size_t len;
+ size_t postfixsize = 0;
size_t i;
- ssize_t phdrsize;
- int postfixsize = 0;
- int hsize;
Phdr *phdrp;
Phdr *dataphdrp = NULL;
Phdr *dtrphdr;
Phdr *capphdr = NULL;
Cap *cap = NULL;
- ssize_t capsize;
+ size_t capsize;
int hasu = 0;
int hasauxv = 0;
int hasintp = 0;
int branded = 0;
+ int dynuphdr = 0;
struct proc *p = ttoproc(curthread);
struct user *up = PTOU(p);
@@ -331,7 +436,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
struct execenv exenv;
} *bigwad; /* kmem_alloc this behemoth so we don't blow stack */
Ehdr *ehdrp;
- int nshdrs, shstrndx, nphdrs;
+ uint_t nshdrs, shstrndx, nphdrs;
+ size_t phdrsize;
char *dlnp;
char *pathbufp;
rlim64_t limit;
@@ -373,7 +479,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
} else {
args->to_model = DATAMODEL_LP64;
- args->stk_prot &= ~PROT_EXEC;
+ if (!args->stk_prot_override) {
+ args->stk_prot &= ~PROT_EXEC;
+ }
#if defined(__i386) || defined(__amd64)
args->dat_prot &= ~PROT_EXEC;
#endif
@@ -385,11 +493,25 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
#endif /* _LP64 */
/*
- * We delay invoking the brand callback until we've figured out
- * what kind of elf binary we're trying to run, 32-bit or 64-bit.
- * We do this because now the brand library can just check
- * args->to_model to see if the target is 32-bit or 64-bit without
- * having do duplicate all the code above.
+ * We delay invoking the brand callback until we've figured out what
+ * kind of elf binary we're trying to run, 32-bit or 64-bit. We do this
+ * because now the brand library can just check args->to_model to see if
+ * the target is 32-bit or 64-bit without having to duplicate all the
+ * code above.
+ *
+ * We also give the brand a chance to indicate that based on the ELF
+ * OSABI of the target binary it should become unbranded and optionally
+ * indicate that it should be treated as existing in a specific prefix.
+ *
+ * Note that if a brand opts to go down this route it does not actually
+ * end up being debranded. In other words, future programs that exec
+ * will still be considered for branding unless this escape hatch is
+ * used. Consider the lx brand, for example: if a user runs
+ * /native/usr/sbin/dtrace -c /bin/ls, both the isaexec wrapper and the
+ * DTrace executable under /native will take this escape hatch and be run
+ * and interpreted using the normal system call table; however, the
+ * execution of a non-illumos binary in the form of /bin/ls will still
+ * be branded and be subject to all of the normal actions of the brand.
*
* The level checks associated with brand handling below are used to
* prevent a loop since the brand elfexec function typically comes back
@@ -397,8 +519,20 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
* handling in the #! interpreter code will increment the level before
* calling gexec to run the final elfexec interpreter.
*/
+ if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
+ (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
+ if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
+ &args->brand_nroot) == B_TRUE) {
+ ASSERT(ehdrp->e_ident[EI_OSABI]);
+ *brand_action = EBA_NATIVE;
+ /* Add one for the trailing '/' in the path */
+ if (args->brand_nroot != NULL)
+ nsize = strlen(args->brand_nroot) + 1;
+ }
+ }
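A brand opts into this escape hatch by supplying b_native_exec; a
hypothetical hook matching the call above (the brand name and root shown here
are illustrative, not the lx brand's actual implementation) might look like:

    /*
     * Treat any binary carrying the illumos OSABI as native, rooted
     * under /native; everything else stays branded.
     */
    static boolean_t
    mybrand_native_exec(uint8_t osabi, char **nrootp)
    {
            if (osabi == ELFOSABI_SOLARIS) {
                    *nrootp = "/native";
                    return (B_TRUE);
            }
            return (B_FALSE);
    }

With brand_nroot set, the interpreter path later becomes the prefix plus the
PT_INTERP contents, which is why nsize reserves one extra byte for the
joining '/'.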
+
if ((level <= INTP_MAXDEPTH) &&
- (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
+ (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
error = BROP(p)->b_elfexec(vp, uap, args,
idatap, level + 1, execsz, setid, exec_file, cred,
brand_action);
@@ -411,7 +545,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
* determine any non-default stack protections,
* and still have this code be machine independent.
*/
- hsize = ehdrp->e_phentsize;
+ const uint_t hsize = ehdrp->e_phentsize;
phdrp = (Phdr *)phdrbase;
for (i = nphdrs; i > 0; i--) {
switch (phdrp->p_type) {
@@ -472,14 +606,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
* AT_BASE
* AT_FLAGS
* AT_PAGESZ
+ * AT_RANDOM (added in stk_copyout)
* AT_SUN_AUXFLAGS
* AT_SUN_HWCAP
* AT_SUN_HWCAP2
- * AT_SUN_PLATFORM (added in stk_copyout)
- * AT_SUN_EXECNAME (added in stk_copyout)
+ * AT_SUN_PLATFORM (added in stk_copyout)
+ * AT_SUN_EXECNAME (added in stk_copyout)
* AT_NULL
*
- * total == 9
+ * total == 10
*/
if (hasintp && hasu) {
/*
@@ -494,7 +629,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
*
* total = 5
*/
- args->auxsize = (9 + 5) * sizeof (aux_entry_t);
+ args->auxsize = (10 + 5) * sizeof (aux_entry_t);
} else if (hasintp) {
/*
* Has PT_INTERP but no PT_PHDR
@@ -504,9 +639,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
*
* total = 2
*/
- args->auxsize = (9 + 2) * sizeof (aux_entry_t);
+ args->auxsize = (10 + 2) * sizeof (aux_entry_t);
} else {
- args->auxsize = 9 * sizeof (aux_entry_t);
+ args->auxsize = 10 * sizeof (aux_entry_t);
}
} else {
args->auxsize = 0;
@@ -520,6 +655,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
args->auxsize += sizeof (aux_entry_t);
/*
+ * If this is a native binary that's been given a modified interpreter
+ * root, inform it that the native system exists at that root.
+ */
+ if (args->brand_nroot != NULL) {
+ args->auxsize += sizeof (aux_entry_t);
+ }
+
+ /*
* On supported kernels (x86_64) make room in the auxv for the
* AT_SUN_COMMPAGE entry. This will go unpopulated on i86xpv systems
* which do not provide such functionality.
@@ -531,13 +675,24 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
args->auxsize += 3 * sizeof (aux_entry_t);
#endif /* defined(__amd64) */
- if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
+ /*
+ * If we have user credentials, we'll supply the following entries:
+ * AT_SUN_UID
+ * AT_SUN_RUID
+ * AT_SUN_GID
+ * AT_SUN_RGID
+ */
+ if (cred != NULL) {
+ args->auxsize += 4 * sizeof (aux_entry_t);
+ }
+
+ if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
branded = 1;
/*
- * We will be adding 4 entries to the aux vectors. One for
- * the the brandname and 3 for the brand specific aux vectors.
+ * We will be adding 5 entries to the aux vectors. One for
+ * the brandname and 4 for the brand-specific aux vectors.
*/
- args->auxsize += 4 * sizeof (aux_entry_t);
+ args->auxsize += 5 * sizeof (aux_entry_t);
}
/* If the binary has an explicit ASLR flag, it must be honoured */
@@ -566,7 +721,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
(ssize_t)dynsize, (offset_t)(dynoffset + i),
UIO_SYSSPACE, 0, (rlim64_t)0,
- CRED(), &resid)) != 0) {
+ CRED(), NULL)) != 0) {
uprintf("%s: cannot read .dynamic section\n",
exec_file);
goto out;
@@ -594,13 +749,13 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
if (capphdr != NULL &&
(capsize = capphdr->p_filesz) > 0 &&
capsize <= 16 * sizeof (*cap)) {
- int ncaps = capsize / sizeof (*cap);
+ const uint_t ncaps = capsize / sizeof (*cap);
Cap *cp;
cap = kmem_alloc(capsize, KM_SLEEP);
if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
- capsize, (offset_t)capphdr->p_offset,
- UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
+ (ssize_t)capsize, (offset_t)capphdr->p_offset,
+ UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), NULL)) != 0) {
uprintf("%s: Cannot read capabilities section\n",
exec_file);
goto out;
@@ -618,7 +773,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
aux = bigwad->elfargs;
/*
* Move args to the user's stack.
- * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries.
+ * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM
+ * aux entries.
*/
if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
if (error == -1) {
@@ -640,10 +796,19 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
dtrphdr = NULL;
- if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
+ error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
&stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
- len, execsz, &brksize)) != 0)
+ len, execsz, &brksize);
+ /*
+ * Our uphdr has been dynamically allocated if (and only if) its
+ * program header flags are clear. To avoid leaks, this must be
+ * checked regardless of whether mapelfexec() emitted an error.
+ */
+ dynuphdr = (uphdr != NULL && uphdr->p_flags == 0);
+
+ if (error != 0) {
goto bad;
+ }
if (uphdr != NULL && intphdr == NULL)
goto bad;
@@ -659,17 +824,28 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
char *p;
struct vnode *nvp;
- dlnsize = intphdr->p_filesz;
+ dlnsize = intphdr->p_filesz + nsize;
- if (dlnsize > MAXPATHLEN || dlnsize <= 0)
+ /*
+ * Make sure none of the component pieces of dlnsize result in
+ * an oversized or zeroed result.
+ */
+ if (intphdr->p_filesz > MAXPATHLEN || dlnsize > MAXPATHLEN ||
+ dlnsize == 0 || dlnsize < intphdr->p_filesz) {
goto bad;
+ }
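The guard above is the standard unsigned-wrap idiom: dlnsize is computed as
p_filesz + nsize in size_t arithmetic, and a wrapped sum is strictly smaller
than either operand. Distilled (a sketch, not kernel code):

    /* B_TRUE if a + b would wrap in size_t arithmetic. */
    static boolean_t
    size_add_wraps(size_t a, size_t b)
    {
            return ((a + b < a) ? B_TRUE : B_FALSE);
    }

The same pattern reappears below when *execsz is accumulated across PT_LOAD
segments.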
+
+ if (nsize != 0) {
+ bcopy(args->brand_nroot, dlnp, nsize - 1);
+ dlnp[nsize - 1] = '/';
+ }
/*
* Read in "interpreter" pathname.
*/
- if ((error = vn_rdwr(UIO_READ, vp, dlnp, intphdr->p_filesz,
- (offset_t)intphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
- CRED(), &resid)) != 0) {
+ if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
+ (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset,
+ UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
uprintf("%s: Cannot obtain interpreter pathname\n",
exec_file);
goto bad;
@@ -814,9 +990,10 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
dtrphdr = NULL;
- error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk,
+ error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
&junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
execsz, NULL);
+
if (error || junk != NULL) {
VN_RELE(nvp);
uprintf("%s: Cannot map %s\n", exec_file, dlnp);
@@ -849,8 +1026,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
#endif /* defined(__amd64) */
/*
- * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
- * exec_args()
+ * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were
+ * filled in via exec_args()
*/
ADDAUX(aux, AT_BASE, voffset)
ADDAUX(aux, AT_FLAGS, at_flags)
@@ -878,7 +1055,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
* malicious user within the zone from crafting a wrapper to
* run native suid commands with unsecure libraries interposed.
*/
- if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
+ if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
(setid &= ~EXECSETID_SETID) != 0))
auxf &= ~AF_SUN_SETUGID;
@@ -893,6 +1070,17 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
/*
+ * Record information about the real and effective user and
+ * group IDs.
+ */
+ if (cred != NULL) {
+ ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
+ ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
+ ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
+ ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
+ }
+
+ /*
* Hardware capability flag word (performance hints)
* Used for choosing faster library routines.
* (Potentially different between 32-bit and 64-bit ABIs)
@@ -921,6 +1109,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
+ ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
}
/*
@@ -952,7 +1141,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
#endif /* defined(__amd64) */
ADDAUX(aux, AT_NULL, 0)
- postfixsize = (char *)aux - (char *)bigwad->elfargs;
+ postfixsize = (uintptr_t)aux - (uintptr_t)bigwad->elfargs;
/*
* We make assumptions above when we determine how many aux
@@ -963,8 +1152,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
* We detect that now and error out.
*/
if (postfixsize != args->auxsize) {
- DTRACE_PROBE2(elfexec_badaux, int, postfixsize,
- int, args->auxsize);
+ DTRACE_PROBE2(elfexec_badaux, size_t, postfixsize,
+ size_t, args->auxsize);
goto bad;
}
ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
@@ -992,7 +1181,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
bzero(up->u_auxv, sizeof (up->u_auxv));
up->u_commpagep = args->commpage;
if (postfixsize) {
- int num_auxv;
+ size_t num_auxv;
/*
* Copy the aux vector to the user stack.
@@ -1057,6 +1246,8 @@ bad:
if (error == 0)
error = ENOEXEC;
out:
+ if (dynuphdr)
+ kmem_free(uphdr, sizeof (Phdr));
if (phdrbase != NULL)
kmem_free(phdrbase, phdrsize);
if (cap != NULL)
@@ -1069,32 +1260,23 @@ out:
* Compute the memory size requirement for the ELF file.
*/
static size_t
-elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata)
+elfsize(const Ehdr *ehdrp, uint_t nphdrs, const caddr_t phdrbase,
+ uintptr_t *lddata)
{
- size_t len;
- Phdr *phdrp = (Phdr *)phdrbase;
- int hsize = ehdrp->e_phentsize;
- int first = 1;
- int dfirst = 1; /* first data segment */
- uintptr_t loaddr = 0;
+ const Phdr *phdrp = (Phdr *)phdrbase;
+ const uint_t hsize = ehdrp->e_phentsize;
+ boolean_t dfirst = B_TRUE;
+ uintptr_t loaddr = UINTPTR_MAX;
uintptr_t hiaddr = 0;
- uintptr_t lo, hi;
- int i;
+ uint_t i;
for (i = nphdrs; i > 0; i--) {
if (phdrp->p_type == PT_LOAD) {
- lo = phdrp->p_vaddr;
- hi = lo + phdrp->p_memsz;
- if (first) {
- loaddr = lo;
- hiaddr = hi;
- first = 0;
- } else {
- if (loaddr > lo)
- loaddr = lo;
- if (hiaddr < hi)
- hiaddr = hi;
- }
+ const uintptr_t lo = phdrp->p_vaddr;
+ const uintptr_t hi = lo + phdrp->p_memsz;
+
+ loaddr = MIN(lo, loaddr);
+ hiaddr = MAX(hi, hiaddr);
/*
* save the address of the first data segment
@@ -1104,16 +1286,18 @@ elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata)
if ((lddata != NULL) && dfirst &&
(phdrp->p_flags & PF_W)) {
*lddata = lo;
- dfirst = 0;
+ dfirst = B_FALSE;
}
}
phdrp = (Phdr *)((caddr_t)phdrp + hsize);
}
- len = hiaddr - (loaddr & PAGEMASK);
- len = roundup(len, PAGESIZE);
+ if (hiaddr <= loaddr) {
+ /* No non-zero PT_LOAD segment found */
+ return (0);
+ }
- return (len);
+ return (roundup(hiaddr - (loaddr & PAGEMASK), PAGESIZE));
}
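As a concrete check of the computation above, assuming 4 KiB pages and two
PT_LOAD segments spanning [0x400000, 0x401200) and [0x601000, 0x602010):

    #include <stdio.h>

    #define PAGESIZE        4096UL
    #define PAGEMASK        (~(PAGESIZE - 1))

    int
    main(void)
    {
            unsigned long loaddr = 0x400000;  /* lowest p_vaddr */
            unsigned long hiaddr = 0x602010;  /* highest p_vaddr + p_memsz */
            unsigned long len = hiaddr - (loaddr & PAGEMASK);

            len = (len + PAGESIZE - 1) & PAGEMASK;  /* roundup() */
            printf("0x%lx\n", len);                 /* prints 0x203000 */
            return (0);
    }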
/*
@@ -1123,8 +1307,8 @@ elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata)
* EINVAL Format recognized but execution not supported
*/
static int
-getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
- int *nphdrs)
+getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs,
+ uint_t *shstrndx, uint_t *nphdrs)
{
int error;
ssize_t resid;
@@ -1133,10 +1317,10 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
* We got here by the first two bytes in ident,
* now read the entire ELF header.
*/
- if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr,
- sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0,
- (rlim64_t)0, credp, &resid)) != 0)
+ if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, sizeof (Ehdr),
+ (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid)) != 0) {
return (error);
+ }
/*
* Since a separate version is compiled for handling 32-bit and
@@ -1145,8 +1329,9 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
*/
if (resid != 0 ||
ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
- ehdr->e_ident[EI_MAG3] != ELFMAG3)
+ ehdr->e_ident[EI_MAG3] != ELFMAG3) {
return (ENOEXEC);
+ }
if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
#if defined(_ILP32) || defined(_ELF32_COMPAT)
@@ -1155,8 +1340,9 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
#endif
!elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
- ehdr->e_flags))
+ ehdr->e_flags)) {
return (EINVAL);
+ }
*nshdrs = ehdr->e_shnum;
*shstrndx = ehdr->e_shstrndx;
@@ -1164,7 +1350,7 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
/*
* If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
- * to read in the section header at index zero to acces the true
+ * to read in the section header at index zero to access the true
* values for those fields.
*/
if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
@@ -1176,7 +1362,7 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
- (rlim64_t)0, credp, &resid)) != 0)
+ (rlim64_t)0, credp, NULL)) != 0)
return (error);
if (*nshdrs == 0)
@@ -1190,33 +1376,29 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
return (0);
}
-#ifdef _ELF32_COMPAT
-extern size_t elf_nphdr_max;
+/*
+ * We use members through p_flags on 32-bit files and p_memsz on 64-bit files,
+ * so e_phentsize must be at least large enough to include those members.
+ */
+#if !defined(_LP64) || defined(_ELF32_COMPAT)
+#define MINPHENTSZ (offsetof(Phdr, p_flags) + \
+ sizeof (((Phdr *)NULL)->p_flags))
#else
-size_t elf_nphdr_max = 1000;
+#define MINPHENTSZ (offsetof(Phdr, p_memsz) + \
+ sizeof (((Phdr *)NULL)->p_memsz))
#endif
static int
-getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs,
- caddr_t *phbasep, ssize_t *phsizep)
+getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nphdrs,
+ caddr_t *phbasep, size_t *phsizep)
{
- ssize_t resid, minsize;
int err;
/*
- * Since we're going to be using e_phentsize to iterate down the
- * array of program headers, it must be 8-byte aligned or else
- * a we might cause a misaligned access. We use all members through
- * p_flags on 32-bit ELF files and p_memsz on 64-bit ELF files so
- * e_phentsize must be at least large enough to include those
- * members.
+ * Ensure that e_phentsize is large enough for required fields to be
+ * accessible and will maintain 8-byte alignment.
*/
-#if !defined(_LP64) || defined(_ELF32_COMPAT)
- minsize = offsetof(Phdr, p_flags) + sizeof (((Phdr *)NULL)->p_flags);
-#else
- minsize = offsetof(Phdr, p_memsz) + sizeof (((Phdr *)NULL)->p_memsz);
-#endif
- if (ehdr->e_phentsize < minsize || (ehdr->e_phentsize & 3))
+ if (ehdr->e_phentsize < MINPHENTSZ || (ehdr->e_phentsize & 3))
return (EINVAL);
*phsizep = nphdrs * ehdr->e_phentsize;
@@ -1228,9 +1410,9 @@ getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs,
*phbasep = kmem_alloc(*phsizep, KM_SLEEP);
}
- if ((err = vn_rdwr(UIO_READ, vp, *phbasep, *phsizep,
+ if ((err = vn_rdwr(UIO_READ, vp, *phbasep, (ssize_t)*phsizep,
(offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
- credp, &resid)) != 0) {
+ credp, NULL)) != 0) {
kmem_free(*phbasep, *phsizep);
*phbasep = NULL;
return (err);
@@ -1239,21 +1421,14 @@ getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs,
return (0);
}
-#ifdef _ELF32_COMPAT
-extern size_t elf_nshdr_max;
-extern size_t elf_shstrtab_max;
-#else
-size_t elf_nshdr_max = 10000;
-size_t elf_shstrtab_max = 100 * 1024;
-#endif
-
+#define MINSHDRSZ (offsetof(Shdr, sh_entsize) + \
+ sizeof (((Shdr *)NULL)->sh_entsize))
static int
-getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
- int nshdrs, int shstrndx, caddr_t *shbasep, ssize_t *shsizep,
- char **shstrbasep, ssize_t *shstrsizep)
+getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nshdrs,
+ uint_t shstrndx, caddr_t *shbasep, size_t *shsizep, char **shstrbasep,
+ size_t *shstrsizep)
{
- ssize_t resid, minsize;
int err;
Shdr *shdr;
@@ -1265,9 +1440,8 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
* must be at least large enough to include that member. The index
* of the string table section must also be valid.
*/
- minsize = offsetof(Shdr, sh_entsize) + sizeof (shdr->sh_entsize);
- if (ehdr->e_shentsize < minsize || (ehdr->e_shentsize & 3) ||
- shstrndx >= nshdrs)
+ if (ehdr->e_shentsize < MINSHDRSZ || (ehdr->e_shentsize & 3) ||
+ nshdrs == 0 || shstrndx >= nshdrs)
return (EINVAL);
*shsizep = nshdrs * ehdr->e_shentsize;
@@ -1279,16 +1453,16 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
*shbasep = kmem_alloc(*shsizep, KM_SLEEP);
}
- if ((err = vn_rdwr(UIO_READ, vp, *shbasep, *shsizep,
+ if ((err = vn_rdwr(UIO_READ, vp, *shbasep, (ssize_t)*shsizep,
(offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
- credp, &resid)) != 0) {
+ credp, NULL)) != 0) {
kmem_free(*shbasep, *shsizep);
return (err);
}
/*
- * Pull the section string table out of the vnode; fail if the size
- * is zero.
+ * Grab the section string table. Walking through the shdrs is
+ * pointless if their names cannot be interrogated.
*/
shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
if ((*shstrsizep = shdr->sh_size) == 0) {
@@ -1306,9 +1480,9 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
*shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
}
- if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, *shstrsizep,
+ if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, (ssize_t)*shstrsizep,
(offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
- credp, &resid)) != 0) {
+ credp, NULL)) != 0) {
kmem_free(*shbasep, *shsizep);
kmem_free(*shstrbasep, *shstrsizep);
return (err);
@@ -1323,11 +1497,29 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
return (0);
}
+
+int
+elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, uint_t *nphdrs,
+ caddr_t *phbasep, size_t *phsizep)
+{
+ int error;
+ uint_t nshdrs, shstrndx;
+
+ if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
+ nphdrs)) != 0 ||
+ (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
+ phsizep)) != 0) {
+ return (error);
+ }
+ return (0);
+}
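A hypothetical caller of the new elfreadhdr() helper, following the same
ownership rule elfexec() applies to phdrbase: on success the caller owns the
program-header buffer and must free it with the returned size.

    Ehdr ehdr;
    uint_t nphdrs;
    caddr_t phbase;
    size_t phsize;

    if (elfreadhdr(vp, credp, &ehdr, &nphdrs, &phbase, &phsize) == 0) {
            /* ... walk the nphdrs program headers at phbase ... */
            kmem_free(phbase, phsize);
    }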
+
static int
mapelfexec(
vnode_t *vp,
Ehdr *ehdr,
- int nphdrs,
+ uint_t nphdrs,
caddr_t phdrbase,
Phdr **uphdr,
Phdr **intphdr,
@@ -1337,23 +1529,25 @@ mapelfexec(
caddr_t *bssbase,
caddr_t *brkbase,
intptr_t *voffset,
- intptr_t *minaddr,
+ uintptr_t *minaddrp,
size_t len,
- long *execsz,
+ size_t *execsz,
size_t *brksize)
{
Phdr *phdr;
- int i, prot, error;
+ int error, page, prot, lastprot = 0;
caddr_t addr = NULL;
- size_t zfodsz;
- int ptload = 0;
- int page;
+ caddr_t minaddr = (caddr_t)UINTPTR_MAX;
+ uint_t i;
+ size_t zfodsz, memsz;
+ boolean_t ptload = B_FALSE;
off_t offset;
- int hsize = ehdr->e_phentsize;
- caddr_t mintmp = (caddr_t)-1;
+ const uint_t hsize = ehdr->e_phentsize;
+ uintptr_t lastaddr = 0;
extern int use_brk_lpg;
if (ehdr->e_type == ET_DYN) {
+ caddr_t vaddr;
secflagset_t flags = 0;
/*
* Obtain the virtual address of a hole in the
@@ -1365,34 +1559,74 @@ mapelfexec(
map_addr(&addr, len, (offset_t)0, 1, flags);
if (addr == NULL)
return (ENOMEM);
- *voffset = (intptr_t)addr;
/*
- * Calculate the minimum vaddr so it can be subtracted out.
- * According to the ELF specification, since PT_LOAD sections
- * must be sorted by increasing p_vaddr values, this is
- * guaranteed to be the first PT_LOAD section.
+ * Despite the fact that mmapobj(2) refuses to load them, we
+ * need to support executing ET_DYN objects that have a
+ * non-NULL p_vaddr. When found in the wild, these objects
+ * are likely to be due to an old (and largely obviated) Linux
+ * facility, prelink(8), that rewrites shared objects to
+ * prefer specific (disjoint) virtual address ranges. (Yes,
+ * this is putatively for performance -- and yes, it has
+ * limited applicability, many edge conditions and grisly
+ * failure modes; even for Linux, it's insane.) As ELF
+ * mandates that the PT_LOAD segments be in p_vaddr order, we
+ * find the lowest p_vaddr by finding the first PT_LOAD
+ * segment.
*/
phdr = (Phdr *)phdrbase;
for (i = nphdrs; i > 0; i--) {
if (phdr->p_type == PT_LOAD) {
- *voffset -= (uintptr_t)phdr->p_vaddr;
+ addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
break;
}
phdr = (Phdr *)((caddr_t)phdr + hsize);
}
+ /*
+ * We have a non-zero p_vaddr in the first PT_LOAD segment --
+ * presumably because we're directly executing a prelink(8)'d
+ * ld-linux.so. While we could correctly execute such an
+ * object without locating it at its desired p_vaddr (it is,
+ * after all, still relocatable), our inner antiquarian
+ * derives a perverse pleasure in accommodating the steampunk
+ * prelink(8) contraption -- goggles on!
+ */
+ if ((vaddr = addr) != NULL) {
+ if (as_gap(curproc->p_as, len, &addr, &len,
+ AH_LO, NULL) == -1 || addr != vaddr) {
+ addr = NULL;
+ }
+ }
+
+ if (addr == NULL) {
+ /*
+ * We either have a NULL p_vaddr (the common case, by
+ * many orders of magnitude) or we have a non-NULL
+ * p_vaddr and we were unable to obtain the specified
+ * VA range (presumably because it's an illegal
+ * address). Either way, obtain an address in which
+ * to map the interpreter.
+ */
+ map_addr(&addr, len, (offset_t)0, 1, 0);
+ if (addr == NULL)
+ return (ENOMEM);
+ }
+
+ /*
+ * Our voffset is the difference between where we landed and
+ * where we wanted to be.
+ */
+ *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
} else {
*voffset = 0;
}
+
phdr = (Phdr *)phdrbase;
for (i = nphdrs; i > 0; i--) {
switch (phdr->p_type) {
case PT_LOAD:
- if ((*intphdr != NULL) && (*uphdr == NULL))
- return (0);
-
- ptload = 1;
+ ptload = B_TRUE;
prot = PROT_USER;
if (phdr->p_flags & PF_R)
prot |= PROT_READ;
@@ -1403,12 +1637,84 @@ mapelfexec(
addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
+ if ((*intphdr != NULL) && uphdr != NULL &&
+ (*uphdr == NULL)) {
+ /*
+ * The PT_PHDR program header is, strictly
+ * speaking, optional. If we find that this
+ * is missing, we will determine the location
+ * of the program headers based on the address
+ * of the lowest PT_LOAD segment (namely, this
+ * one): we subtract the p_offset to get to
+ * the ELF header and then add back the program
+ * header offset to get to the program headers.
+ * We then cons up a Phdr that corresponds to
+ * the (missing) PT_PHDR, setting the flags
+ * to 0 to denote that this is artificial and
+ * should (must) be freed by the caller.
+ */
+ Phdr *cons;
+
+ cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
+
+ cons->p_flags = 0;
+ cons->p_type = PT_PHDR;
+ cons->p_vaddr = ((uintptr_t)addr -
+ phdr->p_offset) + ehdr->e_phoff;
+
+ *uphdr = cons;
+ }
+
+ /*
+ * The ELF spec dictates that p_filesz may not be
+ * larger than p_memsz in PT_LOAD segments.
+ */
+ if (phdr->p_filesz > phdr->p_memsz) {
+ error = EINVAL;
+ goto bad;
+ }
+
/*
* Keep track of the segment with the lowest starting
* address.
*/
- if (addr < mintmp)
- mintmp = addr;
+ if (addr < minaddr)
+ minaddr = addr;
+
+ /*
+ * Segments need not correspond to page boundaries:
+ * they are permitted to share a page. If two PT_LOAD
+ * segments share the same page, and the permissions
+ * of the segments differ, the behavior is historically
+ * that the permissions of the latter segment are used
+ * for the page that the two segments share. This is
+ * also historically a non-issue: binaries generated
+ * by most anything will make sure that two PT_LOAD
+ * segments with differing permissions don't actually
+ * share any pages. However, there exist some crazy
+ * things out there (including at least an obscure
+ * Portuguese teaching language called G-Portugol) that
+ * actually do the wrong thing and expect it to work:
+ * they have a segment with execute permission share
+ * a page with a subsequent segment that does not
+ * have execute permissions and expect the resulting
+ * shared page to in fact be executable. To accommodate
+ * such broken link editors, we take advantage of a
+ * latitude explicitly granted to the loader: it is
+ * permitted to make _any_ PT_LOAD segment executable
+ * (provided that it is readable or writable). If we
+ * see that we're sharing a page and that the previous
+ * page was executable, we will add execute permissions
+ * to our segment.
+ */
+ if (btop(lastaddr) == btop((uintptr_t)addr) &&
+ (phdr->p_flags & (PF_R | PF_W)) &&
+ (lastprot & PROT_EXEC)) {
+ prot |= PROT_EXEC;
+ }
+
+ lastaddr = (uintptr_t)addr + phdr->p_filesz;
+ lastprot = prot;
zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
@@ -1428,14 +1734,22 @@ mapelfexec(
if (brksize != NULL && use_brk_lpg &&
zfodsz != 0 && phdr == dataphdrp &&
(prot & PROT_WRITE)) {
- size_t tlen = P2NPHASE((uintptr_t)addr +
+ const size_t tlen = P2NPHASE((uintptr_t)addr +
phdr->p_filesz, PAGESIZE);
if (zfodsz > tlen) {
+ const caddr_t taddr = addr +
+ phdr->p_filesz + tlen;
+
+ /*
+ * Since a hole in the AS large enough
+ * for this object as calculated by
+ * elfsize() is available, we do not
+ * need to fear overflow for 'taddr'.
+ */
curproc->p_brkpageszc =
page_szc(map_pgsz(MAPPGSZ_HEAP,
- curproc, addr + phdr->p_filesz +
- tlen, zfodsz - tlen, 0));
+ curproc, taddr, zfodsz - tlen, 0));
}
}
@@ -1477,12 +1791,31 @@ mapelfexec(
*brkbase = addr + phdr->p_memsz;
}
- *execsz += btopr(phdr->p_memsz);
+ memsz = btopr(phdr->p_memsz);
+ if ((*execsz + memsz) < *execsz) {
+ error = ENOMEM;
+ goto bad;
+ }
+ *execsz += memsz;
break;
case PT_INTERP:
- if (ptload)
- goto bad;
+ /*
+ * The ELF specification is unequivocal about the
+ * PT_INTERP program header with respect to any PT_LOAD
+ * program header: "If it is present, it must precede
+ * any loadable segment entry." Linux, however, makes
+ * no attempt to enforce this -- which has allowed some
+ * binary editing tools to get away with generating
+ * invalid ELF binaries in which PT_INTERP
+ * occurs after the first PT_LOAD program header. This
+ * is unfortunate (and of course, disappointing) but
+ * it's no worse than that: there is no reason that we
+ * can't process the PT_INTERP entry (if present) after
+ * one or more PT_LOAD entries. We therefore
+ * deliberately do not check ptload here and always
+ * store intphdr as the PT_INTERP program header.
+ */
*intphdr = phdr;
break;
@@ -1491,9 +1824,12 @@ mapelfexec(
break;
case PT_PHDR:
- if (ptload)
+ if (ptload || phdr->p_flags == 0)
goto bad;
- *uphdr = phdr;
+
+ if (uphdr != NULL)
+ *uphdr = phdr;
+
break;
case PT_NULL:
@@ -1512,9 +1848,9 @@ mapelfexec(
phdr = (Phdr *)((caddr_t)phdr + hsize);
}
- if (minaddr != NULL) {
- ASSERT(mintmp != (caddr_t)-1);
- *minaddr = (intptr_t)mintmp;
+ if (minaddrp != NULL) {
+ ASSERT(minaddr != (caddr_t)UINTPTR_MAX);
+ *minaddrp = (uintptr_t)minaddr;
}
if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
@@ -1586,24 +1922,39 @@ elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
return (0);
}
+
/*
* Copy the section data from one vnode to the section of another vnode.
*/
static void
-copy_scn(Shdr *src, vnode_t *src_vp, Shdr *dst, vnode_t *dst_vp, Off *doffset,
- void *buf, size_t size, cred_t *credp, rlim64_t rlimit)
+elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst)
{
- ssize_t resid;
- size_t len, n = src->sh_size;
- offset_t off = 0;
+ size_t n = src->sh_size;
+ u_offset_t off = 0;
+ const u_offset_t soff = src->sh_offset;
+ const u_offset_t doff = ctx->ecc_doffset;
+ void *buf = ctx->ecc_buf;
+ vnode_t *dst_vp = ctx->ecc_vp;
+ cred_t *credp = ctx->ecc_credp;
+
+ /* Protect the copy loop below from overflow on the offsets */
+ if (n > OFF_MAX || (n + soff) > OFF_MAX || (n + doff) > OFF_MAX ||
+ (n + soff) < n || (n + doff) < n) {
+ dst->sh_size = 0;
+ dst->sh_offset = 0;
+ return;
+ }
while (n != 0) {
- len = MIN(size, n);
- if (vn_rdwr(UIO_READ, src_vp, buf, len, src->sh_offset + off,
+ const size_t len = MIN(ctx->ecc_bufsz, n);
+ ssize_t resid;
+
+ if (vn_rdwr(UIO_READ, src_vp, buf, (ssize_t)len,
+ (offset_t)(soff + off),
UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
- resid >= len ||
- core_write(dst_vp, UIO_SYSSPACE, *doffset + off,
- buf, len - resid, rlimit, credp) != 0) {
+ resid >= len || resid < 0 ||
+ core_write(dst_vp, UIO_SYSSPACE, (offset_t)(doff + off),
+ buf, len - resid, ctx->ecc_rlimit, credp) != 0) {
dst->sh_size = 0;
dst->sh_offset = 0;
return;
@@ -1615,62 +1966,222 @@ copy_scn(Shdr *src, vnode_t *src_vp, Shdr *dst, vnode_t *dst_vp, Off *doffset,
off += len - resid;
}
- *doffset += src->sh_size;
+ ctx->ecc_doffset += src->sh_size;
}
-#ifdef _ELF32_COMPAT
-extern size_t elf_datasz_max;
-#else
-size_t elf_datasz_max = 1 * 1024 * 1024;
-#endif
+/*
+ * Walk sections for a given ELF object, counting (or copying) those of
+ * interest (CTF, symtab, strtab).
+ */
+static uint_t
+elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr,
+ Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab)
+{
+ Ehdr ehdr;
+ const core_content_t content = ctx->ecc_content;
+ cred_t *credp = ctx->ecc_credp;
+ Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL;
+ uintptr_t off = 0;
+ uint_t nshdrs, shstrndx, nphdrs, count = 0;
+ u_offset_t *doffp = &ctx->ecc_doffset;
+ boolean_t ctf_link = B_FALSE;
+ caddr_t shbase;
+ size_t shsize, shstrsize;
+ char *shstrbase;
+
+ if ((content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) == 0) {
+ return (0);
+ }
+
+ if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, &nphdrs) != 0 ||
+ getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, &shbase, &shsize,
+ &shstrbase, &shstrsize) != 0) {
+ return (0);
+ }
+
+ /* Starting at index 1 skips SHT_NULL which is expected at index 0 */
+ off = ehdr.e_shentsize;
+ for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) {
+ Shdr *shdr, *symchk = NULL, *strchk;
+ const char *name;
+
+ shdr = (Shdr *)(shbase + off);
+ if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL)
+ continue;
+
+ name = shstrbase + shdr->sh_name;
+
+ if (ctf == NULL &&
+ (content & CC_CONTENT_CTF) != 0 &&
+ strcmp(name, shstrtab_data[STR_CTF]) == 0) {
+ ctf = shdr;
+ if (ctf->sh_link != 0 && ctf->sh_link < nshdrs) {
+ /* check linked symtab below */
+ symchk = (Shdr *)(shbase +
+ shdr->sh_link * ehdr.e_shentsize);
+ ctf_link = B_TRUE;
+ } else {
+ continue;
+ }
+ } else if (symtab == NULL &&
+ (content & CC_CONTENT_SYMTAB) != 0 &&
+ strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) {
+ symchk = shdr;
+ } else {
+ continue;
+ }
+
+ ASSERT(symchk != NULL);
+ if ((symchk->sh_type != SHT_DYNSYM &&
+ symchk->sh_type != SHT_SYMTAB) ||
+ symchk->sh_link == 0 || symchk->sh_link >= nshdrs) {
+ ctf_link = B_FALSE;
+ continue;
+ }
+ strchk = (Shdr *)(shbase + symchk->sh_link * ehdr.e_shentsize);
+ if (strchk->sh_type != SHT_STRTAB) {
+ ctf_link = B_FALSE;
+ continue;
+ }
+ symtab = symchk;
+ strtab = strchk;
+
+ if (symtab != NULL && ctf != NULL) {
+ /* No other shdrs are of interest at this point */
+ break;
+ }
+ }
+
+ if (ctf != NULL)
+ count += 1;
+ if (symtab != NULL)
+ count += 2;
+ if (v == NULL || count == 0 || count > remain) {
+ count = MIN(count, remain);
+ goto done;
+ }
+
+ /* output CTF section */
+ if (ctf != NULL) {
+ elf_ctx_resize_scratch(ctx, ctf->sh_size);
+
+ v[idx].sh_name = shstrtab_ndx(shstrtab, STR_CTF);
+ v[idx].sh_addr = (Addr)(uintptr_t)saddr;
+ v[idx].sh_type = SHT_PROGBITS;
+ v[idx].sh_addralign = 4;
+ *doffp = roundup(*doffp, v[idx].sh_addralign);
+ v[idx].sh_offset = *doffp;
+ v[idx].sh_size = ctf->sh_size;
+
+ if (ctf_link) {
+ /*
+ * The linked symtab (and strtab) will be output
+ * immediately after this CTF section. Its shdr index
+ * directly follows this one.
+ */
+ v[idx].sh_link = idx + 1;
+ ASSERT(symtab != NULL);
+ } else {
+ v[idx].sh_link = 0;
+ }
+ elf_copy_scn(ctx, ctf, mvp, &v[idx]);
+ idx++;
+ }
+
+ /* output SYMTAB/STRTAB sections */
+ if (symtab != NULL) {
+ uint_t symtab_name, strtab_name;
+
+ elf_ctx_resize_scratch(ctx,
+ MAX(symtab->sh_size, strtab->sh_size));
+
+ if (symtab->sh_type == SHT_DYNSYM) {
+ symtab_name = shstrtab_ndx(shstrtab, STR_DYNSYM);
+ strtab_name = shstrtab_ndx(shstrtab, STR_DYNSTR);
+ } else {
+ symtab_name = shstrtab_ndx(shstrtab, STR_SYMTAB);
+ strtab_name = shstrtab_ndx(shstrtab, STR_STRTAB);
+ }
+
+ v[idx].sh_name = symtab_name;
+ v[idx].sh_type = symtab->sh_type;
+ v[idx].sh_addr = symtab->sh_addr;
+ if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
+ v[idx].sh_addr += (Addr)(uintptr_t)saddr;
+ v[idx].sh_addralign = symtab->sh_addralign;
+ *doffp = roundup(*doffp, v[idx].sh_addralign);
+ v[idx].sh_offset = *doffp;
+ v[idx].sh_size = symtab->sh_size;
+ v[idx].sh_link = idx + 1;
+ v[idx].sh_entsize = symtab->sh_entsize;
+ v[idx].sh_info = symtab->sh_info;
+
+ elf_copy_scn(ctx, symtab, mvp, &v[idx]);
+ idx++;
+
+ v[idx].sh_name = strtab_name;
+ v[idx].sh_type = SHT_STRTAB;
+ v[idx].sh_flags = SHF_STRINGS;
+ v[idx].sh_addr = strtab->sh_addr;
+ if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
+ v[idx].sh_addr += (Addr)(uintptr_t)saddr;
+ v[idx].sh_addralign = strtab->sh_addralign;
+ *doffp = roundup(*doffp, v[idx].sh_addralign);
+ v[idx].sh_offset = *doffp;
+ v[idx].sh_size = strtab->sh_size;
+
+ elf_copy_scn(ctx, strtab, mvp, &v[idx]);
+ idx++;
+ }
+
+done:
+ kmem_free(shstrbase, shstrsize);
+ kmem_free(shbase, shsize);
+ return (count);
+}
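When a CTF section and its linked symtab are both emitted, the code above
relies on fixed adjacency: the symtab shdr is written immediately after the
CTF shdr and the strtab immediately after that, so sh_link values can be
assigned positionally. In assertion form, with all three present at base
index i:

    assert(v[i].sh_link == i + 1);      /* CTF -> linked symtab */
    assert(v[i + 1].sh_link == i + 2);  /* symtab -> its strtab */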
/*
- * This function processes mappings that correspond to load objects to
- * examine their respective sections for elfcore(). It's called once with
- * v set to NULL to count the number of sections that we're going to need
- * and then again with v set to some allocated buffer that we fill in with
- * all the section data.
+ * Walk mappings in process address space, examining those which correspond to
+ * loaded objects. It is called twice from elfcore: Once to simply count
+ * relevant sections, and again later to copy those sections once an adequate
+ * buffer has been allocated for the shdr details.
*/
static int
-process_scns(core_content_t content, proc_t *p, cred_t *credp, vnode_t *vp,
- Shdr *v, int nv, rlim64_t rlimit, Off *doffsetp, int *nshdrsp)
+elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp)
{
vnode_t *lastvp = NULL;
struct seg *seg;
- int i, j;
- void *data = NULL;
- size_t datasz = 0;
+ uint_t idx = 0, remain;
shstrtab_t shstrtab;
- struct as *as = p->p_as;
+ struct as *as = ctx->ecc_p->p_as;
int error = 0;
- if (v != NULL)
+ ASSERT(AS_WRITE_HELD(as));
+
+ if (v != NULL) {
+ ASSERT(nv != 0);
+
shstrtab_init(&shstrtab);
+ remain = nv;
+ } else {
+ ASSERT(nv == 0);
- i = 1;
+ /*
+ * The shdrs are being counted rather than output into a buffer.
+ * Leave room for two entries: the SHT_NULL at
+ * index 0 and the shstrtab at the end.
+ */
+ remain = UINT_MAX - 2;
+ }
+
+ /* Per the ELF spec, shdr index 0 is reserved. */
+ idx = 1;
for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
- uint_t prot;
vnode_t *mvp;
void *tmp = NULL;
- caddr_t saddr = seg->s_base;
- caddr_t naddr;
- caddr_t eaddr;
+ caddr_t saddr = seg->s_base, naddr, eaddr;
size_t segsize;
-
- Ehdr ehdr;
- int nshdrs, shstrndx, nphdrs;
- caddr_t shbase;
- ssize_t shsize;
- char *shstrbase;
- ssize_t shstrsize;
-
- Shdr *shdr;
- const char *name;
- size_t sz;
- uintptr_t off;
-
- int ctf_ndx = 0;
- int symtab_ndx = 0;
+ uint_t count, prot;
/*
* Since we're just looking for text segments of load
@@ -1696,222 +2207,51 @@ process_scns(core_content_t content, proc_t *p, cred_t *credp, vnode_t *vp,
if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
continue;
- if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx,
- &nphdrs) != 0 ||
- getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx,
- &shbase, &shsize, &shstrbase, &shstrsize) != 0)
- continue;
-
- off = ehdr.e_shentsize;
- for (j = 1; j < nshdrs; j++, off += ehdr.e_shentsize) {
- Shdr *symtab = NULL, *strtab;
-
- shdr = (Shdr *)(shbase + off);
-
- if (shdr->sh_name >= shstrsize)
- continue;
-
- name = shstrbase + shdr->sh_name;
-
- if (strcmp(name, shstrtab_data[STR_CTF]) == 0) {
- if ((content & CC_CONTENT_CTF) == 0 ||
- ctf_ndx != 0)
- continue;
-
- if (shdr->sh_link > 0 &&
- shdr->sh_link < nshdrs) {
- symtab = (Shdr *)(shbase +
- shdr->sh_link * ehdr.e_shentsize);
- }
-
- if (v != NULL && i < nv - 1) {
- if (shdr->sh_size > datasz &&
- shdr->sh_size <= elf_datasz_max) {
- if (data != NULL)
- kmem_free(data, datasz);
-
- datasz = shdr->sh_size;
- data = kmem_alloc(datasz,
- KM_SLEEP);
- }
-
- v[i].sh_name = shstrtab_ndx(&shstrtab,
- STR_CTF);
- v[i].sh_addr = (Addr)(uintptr_t)saddr;
- v[i].sh_type = SHT_PROGBITS;
- v[i].sh_addralign = 4;
- *doffsetp = roundup(*doffsetp,
- v[i].sh_addralign);
- v[i].sh_offset = *doffsetp;
- v[i].sh_size = shdr->sh_size;
- if (symtab == NULL) {
- v[i].sh_link = 0;
- } else if (symtab->sh_type ==
- SHT_SYMTAB &&
- symtab_ndx != 0) {
- v[i].sh_link =
- symtab_ndx;
- } else {
- v[i].sh_link = i + 1;
- }
-
- copy_scn(shdr, mvp, &v[i], vp,
- doffsetp, data, datasz, credp,
- rlimit);
- }
-
- ctf_ndx = i++;
-
- /*
- * We've already dumped the symtab.
- */
- if (symtab != NULL &&
- symtab->sh_type == SHT_SYMTAB &&
- symtab_ndx != 0)
- continue;
-
- } else if (strcmp(name,
- shstrtab_data[STR_SYMTAB]) == 0) {
- if ((content & CC_CONTENT_SYMTAB) == 0 ||
- symtab != 0)
- continue;
-
- symtab = shdr;
- }
-
- if (symtab != NULL) {
- if ((symtab->sh_type != SHT_DYNSYM &&
- symtab->sh_type != SHT_SYMTAB) ||
- symtab->sh_link == 0 ||
- symtab->sh_link >= nshdrs)
- continue;
-
- strtab = (Shdr *)(shbase +
- symtab->sh_link * ehdr.e_shentsize);
-
- if (strtab->sh_type != SHT_STRTAB)
- continue;
-
- if (v != NULL && i < nv - 2) {
- sz = MAX(symtab->sh_size,
- strtab->sh_size);
- if (sz > datasz &&
- sz <= elf_datasz_max) {
- if (data != NULL)
- kmem_free(data, datasz);
-
- datasz = sz;
- data = kmem_alloc(datasz,
- KM_SLEEP);
- }
-
- if (symtab->sh_type == SHT_DYNSYM) {
- v[i].sh_name = shstrtab_ndx(
- &shstrtab, STR_DYNSYM);
- v[i + 1].sh_name = shstrtab_ndx(
- &shstrtab, STR_DYNSTR);
- } else {
- v[i].sh_name = shstrtab_ndx(
- &shstrtab, STR_SYMTAB);
- v[i + 1].sh_name = shstrtab_ndx(
- &shstrtab, STR_STRTAB);
- }
-
- v[i].sh_type = symtab->sh_type;
- v[i].sh_addr = symtab->sh_addr;
- if (ehdr.e_type == ET_DYN ||
- v[i].sh_addr == 0)
- v[i].sh_addr +=
- (Addr)(uintptr_t)saddr;
- v[i].sh_addralign =
- symtab->sh_addralign;
- *doffsetp = roundup(*doffsetp,
- v[i].sh_addralign);
- v[i].sh_offset = *doffsetp;
- v[i].sh_size = symtab->sh_size;
- v[i].sh_link = i + 1;
- v[i].sh_entsize = symtab->sh_entsize;
- v[i].sh_info = symtab->sh_info;
-
- copy_scn(symtab, mvp, &v[i], vp,
- doffsetp, data, datasz, credp,
- rlimit);
-
- v[i + 1].sh_type = SHT_STRTAB;
- v[i + 1].sh_flags = SHF_STRINGS;
- v[i + 1].sh_addr = symtab->sh_addr;
- if (ehdr.e_type == ET_DYN ||
- v[i + 1].sh_addr == 0)
- v[i + 1].sh_addr +=
- (Addr)(uintptr_t)saddr;
- v[i + 1].sh_addralign =
- strtab->sh_addralign;
- *doffsetp = roundup(*doffsetp,
- v[i + 1].sh_addralign);
- v[i + 1].sh_offset = *doffsetp;
- v[i + 1].sh_size = strtab->sh_size;
-
- copy_scn(strtab, mvp, &v[i + 1], vp,
- doffsetp, data, datasz, credp,
- rlimit);
- }
+ count = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain,
+ &shstrtab);
- if (symtab->sh_type == SHT_SYMTAB)
- symtab_ndx = i;
- i += 2;
- }
- }
-
- kmem_free(shstrbase, shstrsize);
- kmem_free(shbase, shsize);
+ ASSERT(count <= remain);
+ ASSERT(v == NULL || (idx + count) < nv);
+ remain -= count;
+ idx += count;
lastvp = mvp;
}
if (v == NULL) {
- if (i == 1)
+ if (idx == 1) {
*nshdrsp = 0;
- else
- *nshdrsp = i + 1;
- goto done;
+ } else {
+ /* Include room for the shstrtab at the end */
+ *nshdrsp = idx + 1;
+ }
+ return (0);
}
- if (i != nv - 1) {
+ if (idx != nv - 1) {
cmn_err(CE_WARN, "elfcore: core dump failed for "
- "process %d; address space is changing", p->p_pid);
- error = EIO;
- goto done;
+ "process %d; address space is changing",
+ ctx->ecc_p->p_pid);
+ return (EIO);
}
- v[i].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB);
- v[i].sh_size = shstrtab_size(&shstrtab);
- v[i].sh_addralign = 1;
- *doffsetp = roundup(*doffsetp, v[i].sh_addralign);
- v[i].sh_offset = *doffsetp;
- v[i].sh_flags = SHF_STRINGS;
- v[i].sh_type = SHT_STRTAB;
-
- if (v[i].sh_size > datasz) {
- if (data != NULL)
- kmem_free(data, datasz);
-
- datasz = v[i].sh_size;
- data = kmem_alloc(datasz,
- KM_SLEEP);
+ v[idx].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB);
+ v[idx].sh_size = shstrtab_size(&shstrtab);
+ v[idx].sh_addralign = 1;
+ v[idx].sh_offset = ctx->ecc_doffset;
+ v[idx].sh_flags = SHF_STRINGS;
+ v[idx].sh_type = SHT_STRTAB;
+
+ elf_ctx_resize_scratch(ctx, v[idx].sh_size);
+ VERIFY3U(ctx->ecc_bufsz, >=, v[idx].sh_size);
+ shstrtab_dump(&shstrtab, ctx->ecc_buf);
+
+ error = core_write(ctx->ecc_vp, UIO_SYSSPACE, ctx->ecc_doffset,
+ ctx->ecc_buf, v[idx].sh_size, ctx->ecc_rlimit, ctx->ecc_credp);
+ if (error == 0) {
+ ctx->ecc_doffset += v[idx].sh_size;
}
- shstrtab_dump(&shstrtab, data);
-
- if ((error = core_write(vp, UIO_SYSSPACE, *doffsetp,
- data, v[i].sh_size, rlimit, credp)) != 0)
- goto done;
-
- *doffsetp += v[i].sh_size;
-
-done:
- if (data != NULL)
- kmem_free(data, datasz);
-
return (error);
}
@@ -1919,27 +2259,30 @@ int
elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
core_content_t content)
{
- offset_t poffset, soffset;
- Off doffset;
- int error, i, nphdrs, nshdrs;
- int overflow = 0;
+ u_offset_t poffset, soffset, doffset;
+ int error;
+ uint_t i, nphdrs, nshdrs;
struct seg *seg;
struct as *as = p->p_as;
- union {
- Ehdr ehdr;
- Phdr phdr[1];
- Shdr shdr[1];
- } *bigwad;
- size_t bigsize;
- size_t phdrsz, shdrsz;
+ void *bigwad;
+ size_t bigsize, phdrsz, shdrsz;
Ehdr *ehdr;
- Phdr *v;
- caddr_t brkbase;
- size_t brksize;
- caddr_t stkbase;
- size_t stksize;
- int ntries = 0;
+ Phdr *phdr;
+ Shdr shdr0;
+ caddr_t brkbase, stkbase;
+ size_t brksize, stksize;
+ boolean_t overflowed = B_FALSE, retried = B_FALSE;
klwp_t *lwp = ttolwp(curthread);
+ elf_core_ctx_t ctx = {
+ .ecc_vp = vp,
+ .ecc_p = p,
+ .ecc_credp = credp,
+ .ecc_rlimit = rlimit,
+ .ecc_content = content,
+ .ecc_doffset = 0,
+ .ecc_buf = NULL,
+ .ecc_bufsz = 0
+ };
top:
/*
@@ -1957,28 +2300,32 @@ top:
*/
nshdrs = 0;
if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) {
- (void) process_scns(content, p, credp, NULL, NULL, 0, 0,
- NULL, &nshdrs);
+ VERIFY0(elf_process_scns(&ctx, NULL, 0, &nshdrs));
}
AS_LOCK_EXIT(as);
- ASSERT(nshdrs == 0 || nshdrs > 1);
-
/*
- * The core file contents may required zero section headers, but if
+ * The core file contents may require zero section headers, but if
* we overflow the 16 bits allotted to the program header count in
* the ELF header, we'll need that program header at index zero.
*/
- if (nshdrs == 0 && nphdrs >= PN_XNUM)
+ if (nshdrs == 0 && nphdrs >= PN_XNUM) {
nshdrs = 1;
+ }
+ /*
+ * Allocate a buffer which is sized adequately to hold the ehdr, phdrs
+ * or shdrs needed to produce the core file. It is used for the three
+ * tasks sequentially, not simultaneously, so it does not need space
+ * for all three at once, only for the largest of them.
+ */
+ VERIFY(nphdrs >= 2);
phdrsz = nphdrs * sizeof (Phdr);
shdrsz = nshdrs * sizeof (Shdr);
-
- bigsize = MAX(sizeof (*bigwad), MAX(phdrsz, shdrsz));
+ bigsize = MAX(sizeof (Ehdr), MAX(phdrsz, shdrsz));
bigwad = kmem_alloc(bigsize, KM_SLEEP);
- ehdr = &bigwad->ehdr;
+ ehdr = (Ehdr *)bigwad;
bzero(ehdr, sizeof (*ehdr));
ehdr->e_ident[EI_MAG0] = ELFMAG0;
@@ -2014,6 +2361,11 @@ top:
#endif /* !defined(_LP64) || defined(_ELF32_COMPAT) */
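+
+	/* Core file layout: ehdr, then phdrs, then shdrs, then segment data */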
+ poffset = sizeof (Ehdr);
+ soffset = sizeof (Ehdr) + phdrsz;
+ doffset = sizeof (Ehdr) + phdrsz + shdrsz;
+ bzero(&shdr0, sizeof (shdr0));
+
/*
* If the count of program headers or section headers or the index
* of the section string table can't fit in the mere 16 bits
@@ -2021,50 +2373,52 @@ top:
* extended formats and put the real values in the section header
* as index 0.
*/
- ehdr->e_version = EV_CURRENT;
- ehdr->e_ehsize = sizeof (Ehdr);
-
- if (nphdrs >= PN_XNUM)
+ if (nphdrs >= PN_XNUM) {
ehdr->e_phnum = PN_XNUM;
- else
+ shdr0.sh_info = nphdrs;
+ } else {
ehdr->e_phnum = (unsigned short)nphdrs;
-
- ehdr->e_phoff = sizeof (Ehdr);
- ehdr->e_phentsize = sizeof (Phdr);
+ }
if (nshdrs > 0) {
- if (nshdrs >= SHN_LORESERVE)
+ if (nshdrs >= SHN_LORESERVE) {
ehdr->e_shnum = 0;
- else
+ shdr0.sh_size = nshdrs;
+ } else {
ehdr->e_shnum = (unsigned short)nshdrs;
+ }
- if (nshdrs - 1 >= SHN_LORESERVE)
+ if (nshdrs - 1 >= SHN_LORESERVE) {
ehdr->e_shstrndx = SHN_XINDEX;
- else
+ shdr0.sh_link = nshdrs - 1;
+ } else {
ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
+ }
- ehdr->e_shoff = ehdr->e_phoff + ehdr->e_phentsize * nphdrs;
+ ehdr->e_shoff = soffset;
ehdr->e_shentsize = sizeof (Shdr);
}
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_ehsize = sizeof (Ehdr);
+ ehdr->e_phoff = poffset;
+ ehdr->e_phentsize = sizeof (Phdr);
+
if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
- sizeof (Ehdr), rlimit, credp))
+ sizeof (Ehdr), rlimit, credp)) {
goto done;
+ }
- poffset = sizeof (Ehdr);
- soffset = sizeof (Ehdr) + phdrsz;
- doffset = sizeof (Ehdr) + phdrsz + shdrsz;
-
- v = &bigwad->phdr[0];
- bzero(v, phdrsz);
+ phdr = (Phdr *)bigwad;
+ bzero(phdr, phdrsz);
- setup_old_note_header(&v[0], p);
- v[0].p_offset = doffset = roundup(doffset, sizeof (Word));
- doffset += v[0].p_filesz;
+ setup_old_note_header(&phdr[0], p);
+ phdr[0].p_offset = doffset = roundup(doffset, sizeof (Word));
+ doffset += phdr[0].p_filesz;
- setup_note_header(&v[1], p);
- v[1].p_offset = doffset = roundup(doffset, sizeof (Word));
- doffset += v[1].p_filesz;
+ setup_note_header(&phdr[1], p);
+ phdr[1].p_offset = doffset = roundup(doffset, sizeof (Word));
+ doffset += phdr[1].p_filesz;
mutex_enter(&p->p_lock);
@@ -2096,21 +2450,23 @@ top:
prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
- if ((size = (size_t)(naddr - saddr)) == 0)
- continue;
- if (i == nphdrs) {
- overflow++;
+ if ((size = (size_t)(naddr - saddr)) == 0) {
+ ASSERT(tmp == NULL);
continue;
+ } else if (i == nphdrs) {
+ pr_getprot_done(&tmp);
+ overflowed = B_TRUE;
+ break;
}
- v[i].p_type = PT_LOAD;
- v[i].p_vaddr = (Addr)(uintptr_t)saddr;
- v[i].p_memsz = size;
+ phdr[i].p_type = PT_LOAD;
+ phdr[i].p_vaddr = (Addr)(uintptr_t)saddr;
+ phdr[i].p_memsz = size;
if (prot & PROT_READ)
- v[i].p_flags |= PF_R;
+ phdr[i].p_flags |= PF_R;
if (prot & PROT_WRITE)
- v[i].p_flags |= PF_W;
+ phdr[i].p_flags |= PF_W;
if (prot & PROT_EXEC)
- v[i].p_flags |= PF_X;
+ phdr[i].p_flags |= PF_X;
/*
* Figure out which mappings to include in the core.
@@ -2172,20 +2528,23 @@ top:
}
doffset = roundup(doffset, sizeof (Word));
- v[i].p_offset = doffset;
- v[i].p_filesz = size;
+ phdr[i].p_offset = doffset;
+ phdr[i].p_filesz = size;
doffset += size;
exclude:
i++;
}
- ASSERT(tmp == NULL);
+ VERIFY(tmp == NULL);
+ if (overflowed)
+ break;
}
AS_LOCK_EXIT(as);
- if (overflow || i != nphdrs) {
- if (ntries++ == 0) {
+ if (overflowed || i != nphdrs) {
+ if (!retried) {
+ retried = B_TRUE;
+ overflowed = B_FALSE;
kmem_free(bigwad, bigsize);
- overflow = 0;
goto top;
}
cmn_err(CE_WARN, "elfcore: core dump failed for "
@@ -2195,23 +2554,25 @@ exclude:
}
if ((error = core_write(vp, UIO_SYSSPACE, poffset,
- v, phdrsz, rlimit, credp)) != 0)
+ phdr, phdrsz, rlimit, credp)) != 0) {
goto done;
+ }
- if ((error = write_old_elfnotes(p, sig, vp, v[0].p_offset, rlimit,
- credp)) != 0)
+ if ((error = write_old_elfnotes(p, sig, vp, phdr[0].p_offset, rlimit,
+ credp)) != 0) {
goto done;
-
- if ((error = write_elfnotes(p, sig, vp, v[1].p_offset, rlimit,
- credp, content)) != 0)
+ }
+ if ((error = write_elfnotes(p, sig, vp, phdr[1].p_offset, rlimit,
+ credp, content)) != 0) {
goto done;
+ }
for (i = 2; i < nphdrs; i++) {
prkillinfo_t killinfo;
sigqueue_t *sq;
int sig, j;
- if (v[i].p_filesz == 0)
+ if (phdr[i].p_filesz == 0)
continue;
/*
@@ -2222,8 +2583,8 @@ exclude:
* this from mappings that were excluded due to the core file
* content settings.
*/
- if ((error = core_seg(p, vp, v[i].p_offset,
- (caddr_t)(uintptr_t)v[i].p_vaddr, v[i].p_filesz,
+ if ((error = core_seg(p, vp, phdr[i].p_offset,
+ (caddr_t)(uintptr_t)phdr[i].p_vaddr, phdr[i].p_filesz,
rlimit, credp)) == 0) {
continue;
}
@@ -2236,14 +2597,14 @@ exclude:
* bytes. This undocumented interface will let us
* understand the nature of the failure.
*/
- (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
+ (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
&error, sizeof (error), rlimit, credp);
- v[i].p_filesz = 0;
- v[i].p_flags |= PF_SUNW_FAILURE;
+ phdr[i].p_filesz = 0;
+ phdr[i].p_flags |= PF_SUNW_FAILURE;
if ((error = core_write(vp, UIO_SYSSPACE,
- poffset + sizeof (v[i]) * i, &v[i], sizeof (v[i]),
- rlimit, credp)) != 0)
+ poffset + sizeof (Phdr) * i, &phdr[i],
+ sizeof (Phdr), rlimit, credp)) != 0)
goto done;
continue;
@@ -2285,15 +2646,15 @@ exclude:
}
#endif
- (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
+ (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
&killinfo, sizeof (killinfo), rlimit, credp);
/*
* For the segment on which we took the signal, indicate that
* its data now refers to a siginfo.
*/
- v[i].p_filesz = 0;
- v[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
+ phdr[i].p_filesz = 0;
+ phdr[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
PF_SUNW_SIGINFO;
/*
@@ -2301,50 +2662,46 @@ exclude:
* is due to a signal.
*/
for (j = i + 1; j < nphdrs; j++) {
- v[j].p_filesz = 0;
- v[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
+ phdr[j].p_filesz = 0;
+ phdr[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
}
/*
* Finally, write out our modified program headers.
*/
if ((error = core_write(vp, UIO_SYSSPACE,
- poffset + sizeof (v[i]) * i, &v[i],
- sizeof (v[i]) * (nphdrs - i), rlimit, credp)) != 0)
+ poffset + sizeof (Phdr) * i, &phdr[i],
+ sizeof (Phdr) * (nphdrs - i), rlimit, credp)) != 0) {
goto done;
+ }
break;
}
if (nshdrs > 0) {
- bzero(&bigwad->shdr[0], shdrsz);
-
- if (nshdrs >= SHN_LORESERVE)
- bigwad->shdr[0].sh_size = nshdrs;
-
- if (nshdrs - 1 >= SHN_LORESERVE)
- bigwad->shdr[0].sh_link = nshdrs - 1;
-
- if (nphdrs >= PN_XNUM)
- bigwad->shdr[0].sh_info = nphdrs;
+ Shdr *shdr = (Shdr *)bigwad;
+ bzero(shdr, shdrsz);
if (nshdrs > 1) {
+ ctx.ecc_doffset = doffset;
AS_LOCK_ENTER(as, RW_WRITER);
- if ((error = process_scns(content, p, credp, vp,
- &bigwad->shdr[0], nshdrs, rlimit, &doffset,
- NULL)) != 0) {
- AS_LOCK_EXIT(as);
+ error = elf_process_scns(&ctx, shdr, nshdrs, NULL);
+ AS_LOCK_EXIT(as);
+ if (error != 0) {
goto done;
}
- AS_LOCK_EXIT(as);
}
+ /* Copy any extended format data destined for the first shdr */
+ bcopy(&shdr0, shdr, sizeof (shdr0));
- if ((error = core_write(vp, UIO_SYSSPACE, soffset,
- &bigwad->shdr[0], shdrsz, rlimit, credp)) != 0)
- goto done;
+ error = core_write(vp, UIO_SYSSPACE, soffset, shdr, shdrsz,
+ rlimit, credp);
}
done:
+ if (ctx.ecc_bufsz != 0) {
+ kmem_free(ctx.ecc_buf, ctx.ecc_bufsz);
+ }
kmem_free(bigwad, bigsize);
return (error);
}
@@ -2369,9 +2726,9 @@ static struct modlexec modlexec = {
#ifdef _LP64
extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
- intpdata_t *idatap, int level, long *execsz,
+ intpdata_t *idatap, int level, size_t *execsz,
int setid, caddr_t exec_file, cred_t *cred,
- int brand_action);
+ int *brand_action);
extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
rlim64_t rlimit, int sig, core_content_t content);
diff --git a/usr/src/uts/common/exec/elf/elf_impl.h b/usr/src/uts/common/exec/elf/elf_impl.h
index 010d5e6256..504cf84dd2 100644
--- a/usr/src/uts/common/exec/elf/elf_impl.h
+++ b/usr/src/uts/common/exec/elf/elf_impl.h
@@ -22,12 +22,13 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
#ifndef _ELF_ELF_IMPL_H
#define _ELF_ELF_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -71,6 +72,17 @@ typedef struct {
char name[8];
} Note;
+typedef struct {
+ vnode_t *ecc_vp;
+ proc_t *ecc_p;
+ cred_t *ecc_credp;
+ rlim64_t ecc_rlimit;
+ core_content_t ecc_content;
+ u_offset_t ecc_doffset;
+ void *ecc_buf;
+ size_t ecc_bufsz;
+} elf_core_ctx_t;
+
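+/*
+ * Illustrative sketch (not part of this change): consumers thread this
+ * context through in place of the long argument lists used previously. A
+ * writer that advances the data offset might look roughly like this
+ * (elf_ctx_write is a hypothetical name):
+ *
+ *	static int
+ *	elf_ctx_write(elf_core_ctx_t *ctx, void *buf, size_t len)
+ *	{
+ *		int err = core_write(ctx->ecc_vp, UIO_SYSSPACE,
+ *		    ctx->ecc_doffset, buf, len, ctx->ecc_rlimit,
+ *		    ctx->ecc_credp);
+ *
+ *		if (err == 0)
+ *			ctx->ecc_doffset += len;
+ *		return (err);
+ *	}
+ */
+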
#ifdef _ELF32_COMPAT
/*
* These are defined only for the 32-bit compatibility
@@ -79,6 +91,7 @@ typedef struct {
#define elfexec elf32exec
#define elfnote elf32note
#define elfcore elf32core
+#define elfreadhdr elf32readhdr
#define mapexec_brand mapexec32_brand
#define setup_note_header setup_note_header32
#define write_elfnotes write_elfnotes32
diff --git a/usr/src/uts/common/exec/elf/elf_notes.c b/usr/src/uts/common/exec/elf/elf_notes.c
index fbc87fea66..6a024d0d1f 100644
--- a/usr/src/uts/common/exec/elf/elf_notes.c
+++ b/usr/src/uts/common/exec/elf/elf_notes.c
@@ -337,11 +337,13 @@ write_elfnotes(proc_t *p, int sig, vnode_t *vp, offset_t offset,
/* open file table */
+ mutex_enter(&p->p_lock);
vroot = PTOU(p)->u_rdir;
if (vroot == NULL)
vroot = rootdir;
VN_HOLD(vroot);
+ mutex_exit(&p->p_lock);
fip = P_FINFO(p);
diff --git a/usr/src/uts/common/exec/intp/intp.c b/usr/src/uts/common/exec/intp/intp.c
index 269ba86b1b..388d913ea0 100644
--- a/usr/src/uts/common/exec/intp/intp.c
+++ b/usr/src/uts/common/exec/intp/intp.c
@@ -22,6 +22,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2012 Milan Jurik. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1988 AT&T */
@@ -47,6 +48,7 @@
#include <sys/kmem.h>
#include <sys/note.h>
#include <sys/sdt.h>
+#include <sys/brand.h>
/*
* This is the loadable module wrapper.
@@ -54,7 +56,7 @@
#include <sys/modctl.h>
extern int intpexec(struct vnode *, struct execa *, struct uarg *,
- struct intpdata *, int, long *, int, caddr_t, struct cred *, int);
+ struct intpdata *, int, size_t *, int, caddr_t, struct cred *, int *);
static struct execsw esw = {
intpmagicstr,
@@ -126,13 +128,20 @@ getintphead(struct vnode *vp, struct intpdata *idatap)
*cp = '\0';
/*
- * Locate the beginning and end of the interpreter name.
- * In addition to the name, one additional argument may
- * optionally be included here, to be prepended to the
- * arguments provided on the command line. Thus, for
- * example, you can say
+ * Locate the beginning and end of the interpreter name. Historically,
+ * for illumos and its predecessors, in addition to the name, one
+ * additional argument may optionally be included here, to be prepended
+ * to the arguments provided on the command line. Thus, for example,
+ * you can say
*
* #! /usr/bin/awk -f
+ *
+ * However, handling of interpreter arguments varies across operating
+ * systems and other systems allow more than one argument. In
+ * particular, Linux allows more than one and delivers all arguments
+ * as a single string (argv[1] is "-arg1 -arg2 ..."). We support this
+ * style of argument handling as a brand-specific option (setting
+ * b_intp_parse_arg to B_FALSE).
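+	 *
+	 * For example, given the line "#! /bin/interp -a -b c" (a
+	 * hypothetical interpreter), the historical parse yields
+	 * intp_arg[0] = "-a", terminating at the first space, while the
+	 * Linux-style parse leaves intp_arg[0] = "-a -b c" as a single
+	 * string for the interpreter to split.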
*/
for (cp = &linep[2]; *cp == ' '; cp++)
;
@@ -151,9 +160,12 @@ getintphead(struct vnode *vp, struct intpdata *idatap)
idatap->intp_arg[0] = NULL;
else {
idatap->intp_arg[0] = cp;
- while (*cp && *cp != ' ')
- cp++;
- *cp = '\0';
+ if (!PROC_IS_BRANDED(curproc) ||
+ BROP(curproc)->b_intp_parse_arg) {
+ while (*cp && *cp != ' ')
+ cp++;
+ *cp = '\0';
+ }
}
}
return (0);
@@ -184,13 +196,12 @@ intpexec(
struct uarg *args,
struct intpdata *idatap,
int level,
- long *execsz,
+ size_t *execsz,
int setid,
caddr_t exec_file,
struct cred *cred,
- int brand_action)
+ int *brand_action)
{
- _NOTE(ARGUNUSED(brand_action))
vnode_t *nvp;
int error = 0;
struct intpdata idata;
@@ -281,7 +292,7 @@ intpexec(
}
error = gexec(&nvp, uap, args, &idata, ++level, execsz, exec_file, cred,
- EBA_NONE);
+ brand_action);
if (!error) {
/*
diff --git a/usr/src/uts/common/exec/java/java.c b/usr/src/uts/common/exec/java/java.c
index fdc327dcbb..a61a6f105f 100644
--- a/usr/src/uts/common/exec/java/java.c
+++ b/usr/src/uts/common/exec/java/java.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -84,8 +85,8 @@ char *jexec_arg = "-jar";
/*ARGSUSED3*/
static int
javaexec(vnode_t *vp, struct execa *uap, struct uarg *args,
- struct intpdata *idatap, int level, long *execsz, int setid,
- caddr_t execfile, cred_t *cred, int brand_action)
+ struct intpdata *idatap, int level, size_t *execsz, int setid,
+ caddr_t execfile, cred_t *cred, int *brand_action)
{
struct intpdata idata;
int error;
diff --git a/usr/src/uts/common/exec/shbin/shbin.c b/usr/src/uts/common/exec/shbin/shbin.c
index ee5060a07e..7b653a4c98 100644
--- a/usr/src/uts/common/exec/shbin/shbin.c
+++ b/usr/src/uts/common/exec/shbin/shbin.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -54,11 +55,11 @@ shbinexec(
struct uarg *args,
struct intpdata *idatap,
int level,
- long *execsz,
+ size_t *execsz,
int setid,
caddr_t exec_file,
struct cred *cred,
- int brand_action);
+ int *brand_action);
#define SHBIN_CNTL(x) ((x)&037)
#define SHBINMAGIC_LEN 4
@@ -158,11 +159,11 @@ shbinexec(
struct uarg *args,
struct intpdata *idatap,
int level,
- long *execsz,
+ size_t *execsz,
int setid,
caddr_t exec_file,
struct cred *cred,
- int brand_action)
+ int *brand_action)
{
_NOTE(ARGUNUSED(brand_action))
vnode_t *nvp;
diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c
index a426eeaf10..ce08e3697b 100644
--- a/usr/src/uts/common/fs/dev/sdev_netops.c
+++ b/usr/src/uts/common/fs/dev/sdev_netops.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2018, Joyent, Inc. All rights reserved.
*/
/*
@@ -41,8 +42,102 @@
#include <sys/zone.h>
#include <sys/dls.h>
+static const char *devnet_zpath = "/dev/net/zone/";
struct vnodeops *devnet_vnodeops;
+static zoneid_t
+devnet_nodetozone(sdev_node_t *dv)
+{
+ char *zname = NULL, *dup;
+ zone_t *zone;
+ int duplen;
+ zoneid_t zid;
+
+ /*
+	 * If in a non-global zone, always return its zid no matter what the
+ * node is.
+ */
+ zid = getzoneid();
+ if (zid != GLOBAL_ZONEID)
+ return (zid);
+
+ /*
+	 * If it doesn't have /dev/net/zone/, then it can't be a specific zone
+	 * we're targeting.
+ */
+ if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0)
+ return (GLOBAL_ZONEID);
+
+ if (dv->sdev_vnode->v_type == VDIR) {
+ zone = zone_find_by_name(dv->sdev_name);
+ } else {
+ /* Non directories have the form /dev/net/zone/%z/%s */
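+		/*
+		 * e.g. for the hypothetical path "/dev/net/zone/foo/net0",
+		 * the first strrchr() truncates the copy at the final '/'
+		 * and the second then isolates the zone name "foo".
+		 */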
+ dup = strdup(dv->sdev_path);
+ duplen = strlen(dup);
+ zname = strrchr(dup, '/');
+ *zname = '\0';
+ zname--;
+ zname = strrchr(dup, '/');
+ zname++;
+ zone = zone_find_by_name(zname);
+ kmem_free(dup, duplen + 1);
+ }
+ if (zone == NULL)
+ return (GLOBAL_ZONEID);
+ zid = zone->zone_id;
+ zone_rele(zone);
+ return (zid);
+}
+
+static int
+devnet_mkdir(struct sdev_node *ddv, char *name)
+{
+ sdev_node_t *dv;
+ struct vattr va;
+ int ret;
+
+ ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+ dv = sdev_cache_lookup(ddv, name);
+ if (dv != NULL) {
+ SDEV_SIMPLE_RELE(dv);
+ return (EEXIST);
+ }
+
+ va = *sdev_getdefault_attr(VDIR);
+ gethrestime(&va.va_atime);
+ va.va_mtime = va.va_atime;
+ va.va_ctime = va.va_atime;
+
+ ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY);
+ if (ret != 0)
+ return (ret);
+ SDEV_SIMPLE_RELE(dv);
+ return (0);
+}
+
+/*
+ * We basically need to walk down the directory path to determine what we should
+ * do. At the top level of /dev/net, only the directory /dev/net/zone is valid,
+ * and it is always valid. Following on that, /dev/net/zone/%zonename is valid
+ * if and only if we can look up that zone name; if the lookup fails, or it's
+ * some other name, then it's SDEV_VTOR_INVALID.
+ */
+static int
+devnet_dirvalidate(struct sdev_node *dv)
+{
+ zone_t *zonep;
+ char *path = "/dev/net/zone";
+
+ if (strcmp(path, dv->sdev_path) == 0)
+ return (SDEV_VTOR_VALID);
+
+ zonep = zone_find_by_name(dv->sdev_name);
+ if (zonep == NULL)
+ return (SDEV_VTOR_INVALID);
+ zone_rele(zonep);
+ return (SDEV_VTOR_VALID);
+}
+
/*
* Check if a net sdev_node is still valid - i.e. it represents a current
* network link.
@@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv)
ASSERT(dv->sdev_state == SDEV_READY);
- if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0)
+ if (dv->sdev_vnode->v_type == VDIR)
+ return (devnet_dirvalidate(dv));
+
+ if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) {
+ ASSERT(SDEV_IS_GLOBAL(dv));
+ zoneid = devnet_nodetozone(dv);
+ } else {
+ zoneid = getzoneid();
+ }
+
+ if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0)
return (SDEV_VTOR_INVALID);
- if (SDEV_IS_GLOBAL(dv))
+ if (zoneid == GLOBAL_ZONEID)
return (SDEV_VTOR_VALID);
- zoneid = getzoneid();
return (zone_check_datalink(&zoneid, linkid) == 0 ?
SDEV_VTOR_VALID : SDEV_VTOR_INVALID);
}
@@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv)
* a net entry when the node is not found in the cache.
*/
static int
-devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp)
+devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp,
+ zoneid_t zid)
{
timestruc_t now;
dev_t dev;
int error;
- if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) {
+ if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) {
sdcmn_err12(("devnet_create_rvp: not a valid vanity name "
"network node: %s\n", nm));
return (error);
@@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
struct sdev_node *ddv = VTOSDEV(dvp);
struct sdev_node *dv = NULL;
dls_dl_handle_t ddh = NULL;
+ zone_t *zone;
struct vattr vattr;
int nmlen;
int error = ENOENT;
@@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
if (SDEVTOV(ddv)->v_type != VDIR)
return (ENOTDIR);
+ if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID)
+ return (EPERM);
+
/*
* Empty name or ., return node itself.
*/
@@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
rw_enter(&ddv->sdev_contents, RW_WRITER);
/*
+ * ZOMBIED parent does not allow new node creation, bail out early.
+ */
+ if (ddv->sdev_state == SDEV_ZOMBIE)
+ goto failed;
+
+ /*
* directory cache lookup:
*/
if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) {
@@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
goto found;
}
+ if (SDEV_IS_GLOBAL(ddv)) {
+ /*
+ * Check for /dev/net/zone
+ */
+ if (strcmp("zone", nm) == 0 && strcmp("/dev/net",
+ ddv->sdev_path) == 0) {
+ (void) devnet_mkdir(ddv, nm);
+ dv = sdev_cache_lookup(ddv, nm);
+ ASSERT(dv != NULL);
+ goto found;
+ }
+
+ /*
+ * Check for /dev/net/zone/%z. We can't use devnet_zpath due to
+ * its trailing slash.
+ */
+ if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) {
+ zone = zone_find_by_name(nm);
+ if (zone == NULL)
+ goto failed;
+ (void) devnet_mkdir(ddv, nm);
+ zone_rele(zone);
+ dv = sdev_cache_lookup(ddv, nm);
+ ASSERT(dv != NULL);
+ goto found;
+ }
+ } else if (strcmp("/dev/net", ddv->sdev_path) != 0) {
+ goto failed;
+ }
+
/*
- * ZOMBIED parent does not allow new node creation, bail out early.
+	 * We didn't find what we were looking for, and what to do about that
+	 * depends on which directory we're in.
*/
- if (ddv->sdev_state == SDEV_ZOMBIE)
- goto failed;
- error = devnet_create_rvp(nm, &vattr, &ddh);
+ error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv));
if (error != 0)
goto failed;
@@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg)
if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL)
goto found;
- if (devnet_create_rvp(link, &vattr, &ddh) != 0)
+ if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0)
return (0);
ASSERT(ddh != NULL);
@@ -244,16 +388,77 @@ found:
return (0);
}
+/*
+ * Fill in all the entries for the current zone.
+ */
static void
-devnet_filldir(struct sdev_node *ddv)
+devnet_fillzone(struct sdev_node *ddv, zoneid_t zid)
{
- sdev_node_t *dv, *next;
datalink_id_t linkid;
+ ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+ if (zid == GLOBAL_ZONEID) {
+ ASSERT(SDEV_IS_GLOBAL(ddv));
+ linkid = DATALINK_INVALID_LINKID;
+ do {
+ linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL,
+ DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE);
+ if (linkid != DATALINK_INVALID_LINKID)
+ (void) devnet_filldir_datalink(linkid, ddv);
+ } while (linkid != DATALINK_INVALID_LINKID);
+ } else {
+ (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv);
+ }
+}
+
+/*
+ * Callback for zone_walk when filling up /dev/net/zone/...
+ */
+static int
+devnet_fillzdir_cb(zone_t *zonep, void *arg)
+{
+ sdev_node_t *ddv = arg;
+
+ ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+ (void) devnet_mkdir(ddv, zonep->zone_name);
+ return (0);
+}
+
+/*
+ * Fill in a directory that isn't the top level /dev/net.
+ */
+static void
+devnet_fillzdir(struct sdev_node *ddv)
+{
+ zone_t *zonep;
+ char *path = "/dev/net/zone";
+
+ if (strcmp(path, ddv->sdev_path) == 0) {
+ (void) zone_walk(devnet_fillzdir_cb, ddv);
+ return;
+ }
+
+ zonep = zone_find_by_name(ddv->sdev_name);
+ if (zonep == NULL)
+ return;
+ devnet_fillzone(ddv, zonep->zone_id);
+ zone_rele(zonep);
+}
+
+static void
+devnet_filldir(struct sdev_node *ddv)
+{
+ int ret;
+ sdev_node_t *dv, *next;
+
ASSERT(RW_READ_HELD(&ddv->sdev_contents));
if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
rw_exit(&ddv->sdev_contents);
rw_enter(&ddv->sdev_contents, RW_WRITER);
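+		/* Check for a zombie transition that raced our lock upgrade */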
+ if (ddv->sdev_state == SDEV_ZOMBIE) {
+ rw_exit(&ddv->sdev_contents);
+ return;
+ }
}
for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) {
@@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv)
if (SDEVTOV(dv)->v_count > 0)
continue;
+
SDEV_HOLD(dv);
+
+ /*
+ * Clean out everything underneath before we remove ourselves.
+ */
+ if (SDEVTOV(dv)->v_type == VDIR) {
+ ret = sdev_cleandir(dv, NULL, 0);
+ ASSERT(ret == 0);
+ }
/* remove the cache node */
(void) sdev_cache_update(ddv, &dv, dv->sdev_name,
SDEV_CACHE_DELETE);
SDEV_RELE(dv);
}
+ if (strcmp(ddv->sdev_path, "/dev/net") != 0) {
+ devnet_fillzdir(ddv);
+ goto done;
+ }
+
if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild())
goto done;
if (SDEV_IS_GLOBAL(ddv)) {
- linkid = DATALINK_INVALID_LINKID;
- do {
- linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL,
- DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE);
- if (linkid != DATALINK_INVALID_LINKID)
- (void) devnet_filldir_datalink(linkid, ddv);
- } while (linkid != DATALINK_INVALID_LINKID);
+ devnet_fillzone(ddv, GLOBAL_ZONEID);
+ (void) devnet_mkdir(ddv, "zone");
} else {
- (void) zone_datalink_walk(getzoneid(),
- devnet_filldir_datalink, ddv);
+ devnet_fillzone(ddv, getzoneid());
}
ddv->sdev_flags &= ~SDEV_BUILD;
-
done:
rw_downgrade(&ddv->sdev_contents);
}
@@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
ASSERT(sdvp);
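+	/* Don't let a global-zone caller read a non-global instance */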
+ if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+ return (EPERM);
+
if (uiop->uio_offset == 0)
devnet_filldir(sdvp);
diff --git a/usr/src/uts/common/fs/dev/sdev_plugin.c b/usr/src/uts/common/fs/dev/sdev_plugin.c
new file mode 100644
index 0000000000..6e1618dc3c
--- /dev/null
+++ b/usr/src/uts/common/fs/dev/sdev_plugin.c
@@ -0,0 +1,948 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+/*
+ * Dynamic directory plugin interface for sdev.
+ *
+ * The sdev plugin interface provides a simple means to create a dynamic
+ * directory based on in-kernel state. Traditionally, dynamic directories were
+ * built into sdev itself. While these legacy plugins are useful, it makes
+ * more sense for these pieces of functionality to live with the individual
+ * drivers.
+ *
+ * The plugin interface requires consumers to implement three operations and
+ * provides a series of callbacks that can be made in the context of those
+ * operations to interrogate the sdev_node_t without having to leak
+ * implementation details of the sdev_node_t. These operations are:
+ *
+ * o spo_validate
+ *
+ * Given a particular node, determine whether this entry is still valid.
+ * Here, plugins should use the name and the dev_t associated with the
+ * node to verify that it matches something that still exists.
+ *
+ * o spo_filldir
+ *
+ * Fill all the entries inside a directory. Note that some of these entries
+ * may already exist.
+ *
+ * o spo_inactive
+ *
+ * The given node is no longer being used. This allows the consumer to
+ * potentially tear down anything that was being held open related to this.
+ * Note that this only fires when the given sdev_node_t becomes a zombie.
+ *
+ * During these callbacks a consumer is not allowed to register or unregister a
+ * plugin, including their own. They may call the sdev_ctx style functions. All
+ * callbacks fire in a context where blocking is allowed (e.g. the spl is below
+ * LOCK_LEVEL).
+ *
+ * When a plugin is added, we create its directory in the global zone. By
+ * doing that, we ensure that nothing is already there and that nothing else
+ * can come along and try to create something without our knowledge. We only
+ * have to create it in the GZ, not in every other instance of sdev, for two
+ * reasons: first, an instance of sdev that isn't at /dev does not have
+ * dynamic directories; second, any instance of sdev present in a non-global
+ * zone cannot create anything itself. Therefore, if a directory is absent
+ * from the global zone's instance of sdev, we know we're good to go.
+ *
+ * Lock Ordering
+ * -------------
+ *
+ * The global sdev_plugin_lock must be held before any of the individual
+ * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held,
+ * it is not legal to take any holds on any sdev_node_t or to grab the
+ * sdev_node_t`contents_lock in any way.
+ */
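+
+/*
+ * Illustrative sketch (not part of this change): a minimal consumer of this
+ * interface, using hypothetical "mydev" callback names, might register
+ * itself roughly as follows:
+ *
+ *	static sdev_plugin_ops_t mydev_sdev_ops = {
+ *		.spo_version = 1,
+ *		.spo_flags = 0,
+ *		.spo_validate = mydev_sdev_validate,
+ *		.spo_filldir = mydev_sdev_filldir,
+ *		.spo_inactive = mydev_sdev_inactive
+ *	};
+ *
+ *	int err;
+ *	sdev_plugin_hdl_t hdl;
+ *
+ *	hdl = sdev_plugin_register("mydev", &mydev_sdev_ops, &err);
+ *	if (hdl == NULL)
+ *		return (err);
+ */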
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/fs/sdev_impl.h>
+#include <sys/fs/sdev_plugin.h>
+#include <fs/fs_subr.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ksynch.h>
+#include <sys/sysmacros.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+
+kmutex_t sdev_plugin_lock;
+list_t sdev_plugin_list;
+kmem_cache_t *sdev_plugin_cache;
+struct vnodeops *sdev_plugin_vnops;
+
+#define SDEV_PLUGIN_NAMELEN 64
+
+typedef struct sdev_plugin {
+ list_node_t sp_link;
+ char sp_name[SDEV_PLUGIN_NAMELEN]; /* E */
+ int sp_nflags; /* E */
+ struct vnodeops *sp_vnops; /* E */
+ sdev_plugin_ops_t *sp_pops; /* E */
+ boolean_t sp_islegacy; /* E */
+ int (*sp_lvtor)(sdev_node_t *); /* E */
+ kmutex_t sp_lock; /* Protects everything below */
+ kcondvar_t sp_nodecv;
+ size_t sp_nnodes;
+} sdev_plugin_t;
+
+/* ARGSUSED */
+static int
+sdev_plugin_cache_constructor(void *buf, void *arg, int tags)
+{
+ sdev_plugin_t *spp = buf;
+ mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0);
+ cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+sdev_plugin_cache_destructor(void *buf, void *arg)
+{
+ sdev_plugin_t *spp = buf;
+ cv_destroy(&spp->sp_nodecv);
+ mutex_destroy(&spp->sp_lock);
+}
+
+enum vtype
+sdev_ctx_vtype(sdev_ctx_t ctx)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ return (sdp->sdev_vnode->v_type);
+}
+
+const char *
+sdev_ctx_path(sdev_ctx_t ctx)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ return (sdp->sdev_path);
+}
+
+const char *
+sdev_ctx_name(sdev_ctx_t ctx)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ return (sdp->sdev_name);
+}
+
+int
+sdev_ctx_minor(sdev_ctx_t ctx, minor_t *minorp)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ ASSERT(minorp != NULL);
+ if (sdp->sdev_vnode->v_type == VCHR ||
+ sdp->sdev_vnode->v_type == VBLK) {
+ *minorp = getminor(sdp->sdev_vnode->v_rdev);
+ return (0);
+ }
+
+ return (ENODEV);
+}
+
+/*
+ * Currently we only support passing through a single flag -- SDEV_IS_GLOBAL.
+ */
+sdev_ctx_flags_t
+sdev_ctx_flags(sdev_ctx_t ctx)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ return (sdp->sdev_flags & SDEV_GLOBAL);
+}
+
+/*
+ * Use the same rules as zones for a name: alphanumeric characters plus
+ * '-', '_', and '.'.
+ */
+static int
+sdev_plugin_name_isvalid(const char *c, int buflen)
+{
+ int i;
+
+ for (i = 0; i < buflen; i++, c++) {
+ if (*c == '\0')
+ return (1);
+
+ if (!isalnum(*c) && *c != '-' && *c != '_' && *c != '.')
+ return (0);
+ }
+ /* Never found a null terminator */
+ return (0);
+}
+
+static int
+sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name,
+ vattr_t *vap)
+{
+ int ret;
+ sdev_node_t *svp;
+
+ ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+ ASSERT(spp != NULL);
+ svp = sdev_cache_lookup(sdvp, name);
+ if (svp != NULL) {
+ SDEV_SIMPLE_RELE(svp);
+ return (EEXIST);
+ }
+
+ ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred,
+ SDEV_READY);
+ if (ret != 0)
+ return (ret);
+ SDEV_SIMPLE_RELE(svp);
+
+ return (0);
+}
+
+/*
+ * Plugin node creation callbacks
+ */
+int
+sdev_plugin_mkdir(sdev_ctx_t ctx, char *name)
+{
+ sdev_node_t *sdvp;
+ timestruc_t now;
+ struct vattr vap;
+
+ if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
+ return (EINVAL);
+
+ sdvp = (sdev_node_t *)ctx;
+ ASSERT(sdvp->sdev_private != NULL);
+ ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+
+ vap = *sdev_getdefault_attr(VDIR);
+ gethrestime(&now);
+ vap.va_atime = now;
+ vap.va_mtime = now;
+ vap.va_ctime = now;
+
+ return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
+}
+
+int
+sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev)
+{
+ sdev_node_t *sdvp;
+ timestruc_t now;
+ struct vattr vap;
+ mode_t type = mode & S_IFMT;
+ mode_t access = mode & S_IAMB;
+
+ if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
+ return (EINVAL);
+
+ sdvp = (sdev_node_t *)ctx;
+ ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+
+ /*
+ * Ensure only type and user/group/other permission bits are present.
+ * Do not allow setuid, setgid, etc.
+ */
+ if ((mode & ~(S_IFMT | S_IAMB)) != 0)
+ return (EINVAL);
+
+ /* Disallow types other than character and block devices */
+ if (type != S_IFCHR && type != S_IFBLK)
+ return (EINVAL);
+
+ /* Disallow execute bits */
+ if ((access & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0)
+ return (EINVAL);
+
+ /* No bits other than 0666 in access */
+ ASSERT((access &
+ ~(S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) == 0);
+
+ /* Default to relatively safe access bits if none specified. */
+ if (access == 0)
+ access = 0600;
+
+ ASSERT(sdvp->sdev_private != NULL);
+
+ vap = *sdev_getdefault_attr(type == S_IFCHR ? VCHR : VBLK);
+ gethrestime(&now);
+ vap.va_atime = now;
+ vap.va_mtime = now;
+ vap.va_ctime = now;
+ vap.va_rdev = dev;
+ vap.va_mode = type | access;
+
+ /* Despite the similar name, this is in fact a different function */
+ return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
+}
+
+static int
+sdev_plugin_validate(sdev_node_t *sdp)
+{
+ int ret;
+ sdev_plugin_t *spp;
+
+ ASSERT(sdp->sdev_private != NULL);
+ spp = sdp->sdev_private;
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ ASSERT(spp->sp_pops != NULL);
+ rw_enter(&sdp->sdev_contents, RW_READER);
+ ret = spp->sp_pops->spo_validate((uintptr_t)sdp);
+ rw_exit(&sdp->sdev_contents);
+ return (ret);
+}
+
+static void
+sdev_plugin_validate_dir(sdev_node_t *sdvp)
+{
+ int ret;
+ sdev_node_t *svp, *next;
+
+ ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+
+ for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) {
+
+ next = SDEV_NEXT_ENTRY(sdvp, svp);
+ ASSERT(svp->sdev_state != SDEV_ZOMBIE);
+ /* skip nodes that aren't ready */
+ if (svp->sdev_state == SDEV_INIT)
+ continue;
+
+ switch (sdev_plugin_validate(svp)) {
+ case SDEV_VTOR_VALID:
+ case SDEV_VTOR_SKIP:
+ continue;
+ case SDEV_VTOR_INVALID:
+ case SDEV_VTOR_STALE:
+ break;
+ }
+
+ SDEV_HOLD(svp);
+
+ /*
+ * Clean out everything underneath this node before we
+ * remove it.
+ */
+ if (svp->sdev_vnode->v_type == VDIR) {
+ ret = sdev_cleandir(svp, NULL, 0);
+ ASSERT(ret == 0);
+ }
+ /* remove the cache node */
+ (void) sdev_cache_update(sdvp, &svp, svp->sdev_name,
+ SDEV_CACHE_DELETE);
+ SDEV_RELE(svp);
+ }
+}
+
+/* ARGSUSED */
+static int
+sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
+ int *eofp, caller_context_t *ct_unused, int flags_unused)
+{
+ int ret;
+ sdev_node_t *sdvp = VTOSDEV(dvp);
+ sdev_plugin_t *spp;
+
+ ASSERT(RW_READ_HELD(&sdvp->sdev_contents));
+
+	/* Sanity check we're not a zombie before we do anything else */
+ if (sdvp->sdev_state == SDEV_ZOMBIE)
+ return (ENOENT);
+
+ spp = sdvp->sdev_private;
+ ASSERT(spp != NULL);
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ ASSERT(spp->sp_pops != NULL);
+
+ if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+ return (EPERM);
+
+ if (uiop->uio_offset == 0) {
+ /*
+ * We upgrade to a write lock and grab the plugin's lock along
+ * the way. We're almost certainly going to get creation
+ * callbacks, so this is the only safe way to go.
+ */
+ if (rw_tryupgrade(&sdvp->sdev_contents) == 0) {
+ rw_exit(&sdvp->sdev_contents);
+ rw_enter(&sdvp->sdev_contents, RW_WRITER);
+ if (sdvp->sdev_state == SDEV_ZOMBIE) {
+ rw_downgrade(&sdvp->sdev_contents);
+ return (ENOENT);
+ }
+ }
+
+ sdev_plugin_validate_dir(sdvp);
+ ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
+ rw_downgrade(&sdvp->sdev_contents);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
+}
+
+/*
+ * If we don't have a callback function that returns a failure, then sdev will
+ * try to create a node for us which violates all of our basic assertions. To
+ * work around that we create our own callback for devname_lookup_func which
+ * always returns ENOENT as at this point either it was created with the filldir
+ * callback or it was not.
+ */
+/*ARGSUSED*/
+static int
+sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred,
+ void *unused, char *unused2)
+{
+ return (ENOENT);
+}
+
+/* ARGSUSED */
+static int
+sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
+ struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
+ caller_context_t *ct, int *direntflags, pathname_t *realpnp)
+{
+ int ret;
+ sdev_node_t *sdvp;
+ sdev_plugin_t *spp;
+
+ /* execute access is required to search the directory */
+ if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
+ return (ret);
+
+ sdvp = VTOSDEV(dvp);
+ spp = sdvp->sdev_private;
+ ASSERT(spp != NULL);
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ ASSERT(spp->sp_pops != NULL);
+
+ if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+ return (EPERM);
+
+ /*
+ * Go straight for the write lock.
+ */
+ rw_enter(&sdvp->sdev_contents, RW_WRITER);
+ if (sdvp->sdev_state == SDEV_ZOMBIE) {
+ rw_exit(&sdvp->sdev_contents);
+ return (ENOENT);
+ }
+ sdev_plugin_validate_dir(sdvp);
+ ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
+ rw_exit(&sdvp->sdev_contents);
+ if (ret != 0)
+ return (ret);
+
+ return (devname_lookup_func(sdvp, nm, vpp, cred,
+ sdev_plugin_vop_lookup_cb, SDEV_VATTR));
+}
+
+/*
+ * sdev is not a good citizen. We get inactive callbacks whenever a vnode's
+ * count goes to zero, even though the node isn't necessarily a zombie yet. As
+ * such, to make things easier for consumers, we only fire the inactive
+ * callback when the node becomes a zombie and thus will be torn down here.
+ */
+static void
+sdev_plugin_vop_inactive_cb(struct vnode *dvp)
+{
+ sdev_node_t *sdp = VTOSDEV(dvp);
+ sdev_plugin_t *spp = sdp->sdev_private;
+
+ rw_enter(&sdp->sdev_contents, RW_READER);
+ if (sdp->sdev_state != SDEV_ZOMBIE) {
+ rw_exit(&sdp->sdev_contents);
+ return;
+ }
+ spp->sp_pops->spo_inactive((uintptr_t)sdp);
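+	/* Drop our node count and wake any waiting sdev_plugin_unregister() */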
+ mutex_enter(&spp->sp_lock);
+ VERIFY(spp->sp_nnodes > 0);
+ spp->sp_nnodes--;
+ cv_signal(&spp->sp_nodecv);
+ mutex_exit(&spp->sp_lock);
+ rw_exit(&sdp->sdev_contents);
+}
+
+/*ARGSUSED*/
+static void
+sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred,
+ caller_context_t *ct)
+{
+ sdev_node_t *sdp = VTOSDEV(dvp);
+ sdev_plugin_t *spp = sdp->sdev_private;
+ ASSERT(sdp->sdev_private != NULL);
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb);
+}
+
+const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = {
+ VOPNAME_READDIR, { .vop_readdir = sdev_plugin_vop_readdir },
+ VOPNAME_LOOKUP, { .vop_lookup = sdev_plugin_vop_lookup },
+ VOPNAME_INACTIVE, { .vop_inactive = sdev_plugin_vop_inactive },
+ VOPNAME_CREATE, { .error = fs_nosys },
+ VOPNAME_REMOVE, { .error = fs_nosys },
+ VOPNAME_MKDIR, { .error = fs_nosys },
+ VOPNAME_RMDIR, { .error = fs_nosys },
+ VOPNAME_SYMLINK, { .error = fs_nosys },
+ VOPNAME_SETSECATTR, { .error = fs_nosys },
+ NULL, NULL
+};
+
+/*
+ * construct a new template with overrides from vtab
+ */
+static fs_operation_def_t *
+sdev_merge_vtab(const fs_operation_def_t tab[])
+{
+ fs_operation_def_t *new;
+ const fs_operation_def_t *tab_entry;
+
+ /* make a copy of standard vnode ops table */
+ new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
+ bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
+
+ /* replace the overrides from tab */
+ for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
+ fs_operation_def_t *std_entry = new;
+ while (std_entry->name) {
+ if (strcmp(tab_entry->name, std_entry->name) == 0) {
+ std_entry->func = tab_entry->func;
+ break;
+ }
+ std_entry++;
+ }
+ }
+
+ return (new);
+}
+
+/* free memory allocated by sdev_merge_vtab */
+static void
+sdev_free_vtab(fs_operation_def_t *new)
+{
+ kmem_free(new, sdev_vnodeops_tbl_size);
+}
+
+/*
+ * Register a new plugin.
+ */
+sdev_plugin_hdl_t
+sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp)
+{
+ char buf[sizeof ("dev")] = "";
+ struct pathname pn = { 0 };
+ sdev_plugin_t *spp, *iter;
+ vnode_t *vp, *nvp;
+ sdev_node_t *sdp, *slp;
+ timestruc_t now;
+ struct vattr vap;
+ int ret, err;
+
+ /*
+ * Some consumers don't care about why they failed. To keep the code
+ * simple, we'll just pretend they gave us something.
+ */
+ if (errp == NULL)
+ errp = &err;
+
+ if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ if (ops->spo_version != 1) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ if (ops->spo_validate == NULL || ops->spo_filldir == NULL ||
+ ops->spo_inactive == NULL) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
+ (void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN);
+
+ spp->sp_pops = ops;
+ spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR;
+ if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE)
+ spp->sp_nflags |= SDEV_NO_NCACHE;
+ if (ops->spo_flags & SDEV_PLUGIN_SUBDIR)
+ spp->sp_nflags |= SDEV_SUBDIR;
+ spp->sp_vnops = sdev_plugin_vnops;
+ spp->sp_islegacy = B_FALSE;
+ spp->sp_lvtor = NULL;
+ spp->sp_nnodes = 0;
+
+ /*
+ * Make sure our /dev entry is unique and install it. We also need to
+ * go through and grab the sdev root node as we cannot grab any sdev
+ * node locks once we've grabbed the sdev_plugin_lock. We effectively
+ * assert that if a directory is not present in the GZ's /dev, then it
+ * doesn't exist in any of the local zones.
+ *
+ * Note that we may be in NGZ context: during a prof_filldir(".../dev/")
+ * enumeration, for example. So we have to dig as deep as lookuppnvp()
+ * to make sure we really get to the global /dev (i.e. escape both
+ * CRED() and ->u_rdir).
+ */
+ pn_get_buf("dev", UIO_SYSSPACE, &pn, buf, sizeof (buf));
+ VN_HOLD(rootdir);
+ ret = lookuppnvp(&pn, NULL, NO_FOLLOW, NULLVPP,
+ &vp, rootdir, rootdir, kcred);
+
+ if (ret != 0) {
+ *errp = ret;
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (NULL);
+ }
+ /* Make sure we have the real vnode */
+ if (VOP_REALVP(vp, &nvp, NULL) == 0) {
+ VN_HOLD(nvp);
+ VN_RELE(vp);
+ vp = nvp;
+ nvp = NULL;
+ }
+ VERIFY(vp->v_op == sdev_vnodeops);
+ sdp = VTOSDEV(vp);
+ rw_enter(&sdp->sdev_contents, RW_WRITER);
+ slp = sdev_cache_lookup(sdp, spp->sp_name);
+ if (slp != NULL) {
+ SDEV_RELE(slp);
+ rw_exit(&sdp->sdev_contents);
+ VN_RELE(vp);
+ *errp = EEXIST;
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (NULL);
+ }
+
+ mutex_enter(&sdev_plugin_lock);
+ for (iter = list_head(&sdev_plugin_list); iter != NULL;
+ iter = list_next(&sdev_plugin_list, iter)) {
+ if (strcmp(spp->sp_name, iter->sp_name) == 0) {
+ mutex_exit(&sdev_plugin_lock);
+ rw_exit(&sdp->sdev_contents);
+ VN_RELE(vp);
+ *errp = EEXIST;
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (NULL);
+ }
+ }
+
+ list_insert_tail(&sdev_plugin_list, spp);
+ mutex_exit(&sdev_plugin_lock);
+
+ /*
+ * Now go ahead and create the top level directory for the global zone.
+ */
+ vap = *sdev_getdefault_attr(VDIR);
+ gethrestime(&now);
+ vap.va_atime = now;
+ vap.va_mtime = now;
+ vap.va_ctime = now;
+
+ (void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap);
+
+ rw_exit(&sdp->sdev_contents);
+ VN_RELE(vp);
+
+ *errp = 0;
+
+ return ((sdev_plugin_hdl_t)spp);
+}
+
+static void
+sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg)
+{
+ sdev_plugin_t *spp = arg;
+ sdev_node_t *sdp;
+
+ rw_enter(&rdp->sdev_contents, RW_WRITER);
+ sdp = sdev_cache_lookup(rdp, spp->sp_name);
+ /* If it doesn't exist, we're done here */
+ if (sdp == NULL) {
+ rw_exit(&rdp->sdev_contents);
+ return;
+ }
+
+ /*
+ * We first delete the directory before recursively marking everything
+ * else stale. This ordering should ensure that we don't accidentally
+ * miss anything.
+ */
+ sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE);
+ sdev_stale(sdp);
+ SDEV_RELE(sdp);
+ rw_exit(&rdp->sdev_contents);
+}
+
+int sdev_plugin_unregister_allowed;
+
+/*
+ * Remove a plugin. This blocks until every node has become a zombie, thus
+ * guaranteeing the caller that nothing will call into the plugin again once
+ * this call returns. While the call is in progress, the plugin may still be
+ * called into. Note that while this is ongoing, it will block other mounts.
+ *
+ * NB: this is not safe when used from detach() context - we will be DEVI_BUSY,
+ * and other sdev threads may be waiting for this. Only use the override if
+ * willing to risk it.
+ */
+int
+sdev_plugin_unregister(sdev_plugin_hdl_t hdl)
+{
+ sdev_plugin_t *spp = (sdev_plugin_t *)hdl;
+ if (spp->sp_islegacy)
+ return (EINVAL);
+
+ if (!sdev_plugin_unregister_allowed)
+ return (EBUSY);
+
+ mutex_enter(&sdev_plugin_lock);
+ list_remove(&sdev_plugin_list, spp);
+ mutex_exit(&sdev_plugin_lock);
+
+ sdev_mnt_walk(sdev_plugin_unregister_cb, spp);
+ mutex_enter(&spp->sp_lock);
+ while (spp->sp_nnodes > 0)
+ cv_wait(&spp->sp_nodecv, &spp->sp_lock);
+ mutex_exit(&spp->sp_lock);
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (0);
+}
+
+/*
+ * Register an old sdev style plugin to deal with what used to be in the vtab.
+ */
+static int
+sdev_plugin_register_legacy(struct sdev_vop_table *vtp)
+{
+ sdev_plugin_t *spp;
+
+ spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
+ (void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN);
+ spp->sp_islegacy = B_TRUE;
+ spp->sp_pops = NULL;
+ spp->sp_nflags = vtp->vt_flags;
+ spp->sp_lvtor = vtp->vt_vtor;
+ spp->sp_nnodes = 0;
+
+ if (vtp->vt_service != NULL) {
+ fs_operation_def_t *templ;
+ templ = sdev_merge_vtab(vtp->vt_service);
+ if (vn_make_ops(vtp->vt_name,
+ (const fs_operation_def_t *)templ,
+ &spp->sp_vnops) != 0) {
+ cmn_err(CE_WARN, "%s: malformed vnode ops\n",
+ vtp->vt_name);
+ sdev_free_vtab(templ);
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (1);
+ }
+
+ if (vtp->vt_global_vops) {
+ *(vtp->vt_global_vops) = spp->sp_vnops;
+ }
+
+ sdev_free_vtab(templ);
+ } else {
+ spp->sp_vnops = sdev_vnodeops;
+ }
+
+ /*
+	 * No need to check for EEXIST here. These are loaded as part of
+	 * sdev's initialization function. Further, we don't have to create
+	 * them, as that's taken care of in sdev's mount for the GZ.
+ */
+ mutex_enter(&sdev_plugin_lock);
+ list_insert_tail(&sdev_plugin_list, spp);
+ mutex_exit(&sdev_plugin_lock);
+
+ return (0);
+}
+
+/*
+ * We need to match off of the sdev_path, not the sdev_name. We are only allowed
+ * to exist directly under /dev.
+ */
+static sdev_plugin_t *
+sdev_match(sdev_node_t *dv)
+{
+ int vlen;
+ const char *path;
+ sdev_plugin_t *spp;
+
+ if (strlen(dv->sdev_path) <= 5)
+ return (NULL);
+
+ if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
+ return (NULL);
+ path = dv->sdev_path + 5;
+
+ mutex_enter(&sdev_plugin_lock);
+
+ for (spp = list_head(&sdev_plugin_list); spp != NULL;
+ spp = list_next(&sdev_plugin_list, spp)) {
+ if (strcmp(spp->sp_name, path) == 0) {
+ mutex_exit(&sdev_plugin_lock);
+ return (spp);
+ }
+
+ if (spp->sp_nflags & SDEV_SUBDIR) {
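+			/*
+			 * SDEV_SUBDIR plugins also claim names below their
+			 * directory, e.g. paths of the form "zvol/dsk/...".
+			 */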
+ vlen = strlen(spp->sp_name);
+ if ((strncmp(spp->sp_name, path,
+ vlen - 1) == 0) && path[vlen] == '/') {
+ mutex_exit(&sdev_plugin_lock);
+ return (spp);
+ }
+
+ }
+ }
+
+ mutex_exit(&sdev_plugin_lock);
+ return (NULL);
+}
+
+void
+sdev_set_no_negcache(sdev_node_t *dv)
+{
+ char *path;
+ sdev_plugin_t *spp;
+
+ ASSERT(dv->sdev_path);
+ path = dv->sdev_path + strlen("/dev/");
+
+ mutex_enter(&sdev_plugin_lock);
+ for (spp = list_head(&sdev_plugin_list); spp != NULL;
+ spp = list_next(&sdev_plugin_list, spp)) {
+ if (strcmp(spp->sp_name, path) == 0) {
+ if (spp->sp_nflags & SDEV_NO_NCACHE)
+ dv->sdev_flags |= SDEV_NO_NCACHE;
+ break;
+ }
+ }
+ mutex_exit(&sdev_plugin_lock);
+}
+
+struct vnodeops *
+sdev_get_vop(sdev_node_t *dv)
+{
+ char *path;
+ sdev_plugin_t *spp;
+
+ path = dv->sdev_path;
+ ASSERT(path);
+
+ /* gets the relative path to /dev/ */
+ path += 5;
+
+ if ((spp = sdev_match(dv)) != NULL) {
+ dv->sdev_flags |= spp->sp_nflags;
+ if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
+ (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
+ dv->sdev_flags |= SDEV_PERSIST;
+ return (spp->sp_vnops);
+ }
+
+ /* child inherits the persistence of the parent */
+ if (SDEV_IS_PERSIST(dv->sdev_dotdot))
+ dv->sdev_flags |= SDEV_PERSIST;
+ return (sdev_vnodeops);
+}
+
+void *
+sdev_get_vtor(sdev_node_t *dv)
+{
+ sdev_plugin_t *spp;
+
+ if (dv->sdev_private == NULL) {
+ spp = sdev_match(dv);
+ if (spp == NULL)
+ return (NULL);
+ } else {
+ spp = dv->sdev_private;
+ }
+
+ if (spp->sp_islegacy)
+ return ((void *)spp->sp_lvtor);
+ else
+ return ((void *)sdev_plugin_validate);
+}
+
+void
+sdev_plugin_nodeready(sdev_node_t *sdp)
+{
+ sdev_plugin_t *spp;
+
+ ASSERT(RW_WRITE_HELD(&sdp->sdev_contents));
+ ASSERT(sdp->sdev_private == NULL);
+
+ spp = sdev_match(sdp);
+ if (spp == NULL)
+ return;
+ if (spp->sp_islegacy)
+ return;
+ sdp->sdev_private = spp;
+ mutex_enter(&spp->sp_lock);
+ spp->sp_nnodes++;
+ mutex_exit(&spp->sp_lock);
+}
+
+int
+sdev_plugin_init(void)
+{
+ sdev_vop_table_t *vtp;
+ fs_operation_def_t *templ;
+
+ sdev_plugin_cache = kmem_cache_create("sdev_plugin",
+ sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor,
+ sdev_plugin_cache_destructor, NULL, NULL, NULL, 0);
+ if (sdev_plugin_cache == NULL)
+ return (1);
+ mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&sdev_plugin_list, sizeof (sdev_plugin_t),
+ offsetof(sdev_plugin_t, sp_link));
+
+ /*
+ * Register all of the legacy vnops
+ */
+ for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++)
+ if (sdev_plugin_register_legacy(vtp) != 0)
+ return (1);
+
+ templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl);
+ if (vn_make_ops("sdev_plugin",
+ (const fs_operation_def_t *)templ,
+ &sdev_plugin_vnops) != 0) {
+ sdev_free_vtab(templ);
+ return (1);
+ }
+
+ sdev_free_vtab(templ);
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c
index d810dd9a31..42a3874b95 100644
--- a/usr/src/uts/common/fs/dev/sdev_subr.c
+++ b/usr/src/uts/common/fs/dev/sdev_subr.c
@@ -151,12 +151,6 @@ vattr_t sdev_vattr_chr = {
kmem_cache_t *sdev_node_cache; /* sdev_node cache */
int devtype; /* fstype */
-/* static */
-static struct vnodeops *sdev_get_vop(struct sdev_node *);
-static void sdev_set_no_negcache(struct sdev_node *);
-static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
-static void sdev_free_vtab(fs_operation_def_t *);
-
static void
sdev_prof_free(struct sdev_node *dv)
{
@@ -314,6 +308,7 @@ sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
/* overwritten for VLNK nodes */
dv->sdev_symlink = NULL;
+ list_link_init(&dv->sdev_plist);
vp = SDEVTOV(dv);
vn_reinit(vp);
@@ -402,6 +397,7 @@ sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
} else {
dv->sdev_nlink = 1;
}
+ sdev_plugin_nodeready(dv);
if (!(SDEV_IS_GLOBAL(dv))) {
dv->sdev_origin = (struct sdev_node *)args;
@@ -498,37 +494,22 @@ sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
return (dv);
}
-/* directory dependent vop table */
-struct sdev_vop_table {
- char *vt_name; /* subdirectory name */
- const fs_operation_def_t *vt_service; /* vnodeops table */
- struct vnodeops *vt_vops; /* constructed vop */
- struct vnodeops **vt_global_vops; /* global container for vop */
- int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */
- int vt_flags;
-};
-
-/*
- * A nice improvement would be to provide a plug-in mechanism
- * for this table instead of a const table.
- */
-static struct sdev_vop_table vtab[] =
-{
- { "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
+struct sdev_vop_table vtab[] = {
+ { "pts", devpts_vnodeops_tbl, &devpts_vnodeops, devpts_validate,
SDEV_DYNAMIC | SDEV_VTOR },
- { "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
+ { "vt", devvt_vnodeops_tbl, &devvt_vnodeops, devvt_validate,
SDEV_DYNAMIC | SDEV_VTOR },
- { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
+ { "zvol", devzvol_vnodeops_tbl, &devzvol_vnodeops,
devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
- { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },
+ { "zcons", NULL, NULL, NULL, SDEV_NO_NCACHE },
- { "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
- SDEV_DYNAMIC | SDEV_VTOR },
+ { "net", devnet_vnodeops_tbl, &devnet_vnodeops, devnet_validate,
+ SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
- { "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
+ { "ipnet", devipnet_vnodeops_tbl, &devipnet_vnodeops,
devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },
/*
@@ -543,132 +524,14 @@ static struct sdev_vop_table vtab[] =
* preventing a mkdir.
*/
- { "lofi", NULL, NULL, NULL, NULL,
+ { "lofi", NULL, NULL, NULL,
SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
- { "rlofi", NULL, NULL, NULL, NULL,
+ { "rlofi", NULL, NULL, NULL,
SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
- { NULL, NULL, NULL, NULL, NULL, 0}
+ { NULL, NULL, NULL, NULL, 0}
};
-/*
- * We need to match off of the sdev_path, not the sdev_name. We are only allowed
- * to exist directly under /dev.
- */
-struct sdev_vop_table *
-sdev_match(struct sdev_node *dv)
-{
- int vlen;
- int i;
- const char *path;
-
- if (strlen(dv->sdev_path) <= 5)
- return (NULL);
-
- if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
- return (NULL);
- path = dv->sdev_path + 5;
-
- for (i = 0; vtab[i].vt_name; i++) {
- if (strcmp(vtab[i].vt_name, path) == 0)
- return (&vtab[i]);
- if (vtab[i].vt_flags & SDEV_SUBDIR) {
- vlen = strlen(vtab[i].vt_name);
- if ((strncmp(vtab[i].vt_name, path,
- vlen - 1) == 0) && path[vlen] == '/')
- return (&vtab[i]);
- }
-
- }
- return (NULL);
-}
-
-/*
- * sets a directory's vnodeops if the directory is in the vtab;
- */
-static struct vnodeops *
-sdev_get_vop(struct sdev_node *dv)
-{
- struct sdev_vop_table *vtp;
- char *path;
-
- path = dv->sdev_path;
- ASSERT(path);
-
- /* gets the relative path to /dev/ */
- path += 5;
-
- /* gets the vtab entry it matches */
- if ((vtp = sdev_match(dv)) != NULL) {
- dv->sdev_flags |= vtp->vt_flags;
- if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
- (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
- dv->sdev_flags |= SDEV_PERSIST;
-
- if (vtp->vt_vops) {
- if (vtp->vt_global_vops)
- *(vtp->vt_global_vops) = vtp->vt_vops;
-
- return (vtp->vt_vops);
- }
-
- if (vtp->vt_service) {
- fs_operation_def_t *templ;
- templ = sdev_merge_vtab(vtp->vt_service);
- if (vn_make_ops(vtp->vt_name,
- (const fs_operation_def_t *)templ,
- &vtp->vt_vops) != 0) {
- cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
- vtp->vt_name);
- /*NOTREACHED*/
- }
- if (vtp->vt_global_vops) {
- *(vtp->vt_global_vops) = vtp->vt_vops;
- }
- sdev_free_vtab(templ);
-
- return (vtp->vt_vops);
- }
-
- return (sdev_vnodeops);
- }
-
- /* child inherits the persistence of the parent */
- if (SDEV_IS_PERSIST(dv->sdev_dotdot))
- dv->sdev_flags |= SDEV_PERSIST;
-
- return (sdev_vnodeops);
-}
-
-static void
-sdev_set_no_negcache(struct sdev_node *dv)
-{
- int i;
- char *path;
-
- ASSERT(dv->sdev_path);
- path = dv->sdev_path + strlen("/dev/");
-
- for (i = 0; vtab[i].vt_name; i++) {
- if (strcmp(vtab[i].vt_name, path) == 0) {
- if (vtab[i].vt_flags & SDEV_NO_NCACHE)
- dv->sdev_flags |= SDEV_NO_NCACHE;
- break;
- }
- }
-}
-
-void *
-sdev_get_vtor(struct sdev_node *dv)
-{
- struct sdev_vop_table *vtp;
-
- vtp = sdev_match(dv);
- if (vtp)
- return ((void *)vtp->vt_vtor);
- else
- return (NULL);
-}
/*
* Build the base root inode
@@ -948,8 +811,11 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
dv->sdev_path = NULL;
}
- if (!SDEV_IS_GLOBAL(dv))
+ if (!SDEV_IS_GLOBAL(dv)) {
sdev_prof_free(dv);
+ if (dv->sdev_vnode->v_type != VLNK && dv->sdev_origin != NULL)
+ SDEV_RELE(dv->sdev_origin);
+ }
if (SDEVTOV(dv)->v_type == VDIR) {
ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
@@ -963,6 +829,7 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
(void) memset((void *)&dv->sdev_instance_data, 0,
sizeof (dv->sdev_instance_data));
vn_invalid(SDEVTOV(dv));
+ dv->sdev_private = NULL;
kmem_cache_free(sdev_node_cache, dv);
}
@@ -2945,46 +2812,6 @@ sdev_modctl_devexists(const char *path)
return (error);
}
-extern int sdev_vnodeops_tbl_size;
-
-/*
- * construct a new template with overrides from vtab
- */
-static fs_operation_def_t *
-sdev_merge_vtab(const fs_operation_def_t tab[])
-{
- fs_operation_def_t *new;
- const fs_operation_def_t *tab_entry;
-
- /* make a copy of standard vnode ops table */
- new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
- bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
-
- /* replace the overrides from tab */
- for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
- fs_operation_def_t *std_entry = new;
- while (std_entry->name) {
- if (strcmp(tab_entry->name, std_entry->name) == 0) {
- std_entry->func = tab_entry->func;
- break;
- }
- std_entry++;
- }
- if (std_entry->name == NULL)
- cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
- tab_entry->name);
- }
-
- return (new);
-}
-
-/* free memory allocated by sdev_merge_vtab */
-static void
-sdev_free_vtab(fs_operation_def_t *new)
-{
- kmem_free(new, sdev_vnodeops_tbl_size);
-}
-
/*
* a generic setattr() function
*
diff --git a/usr/src/uts/common/fs/dev/sdev_vfsops.c b/usr/src/uts/common/fs/dev/sdev_vfsops.c
index d81702185e..55b388c2d4 100644
--- a/usr/src/uts/common/fs/dev/sdev_vfsops.c
+++ b/usr/src/uts/common/fs/dev/sdev_vfsops.c
@@ -173,7 +173,13 @@ devinit(int fstype, char *name)
if ((devmajor = getudev()) == (major_t)-1) {
cmn_err(CE_WARN, "%s: can't get unique dev", sdev_vfssw.name);
- return (1);
+ return (ENXIO);
+ }
+
+ if (sdev_plugin_init() != 0) {
+ cmn_err(CE_WARN, "%s: failed to set init plugin subsystem",
+ sdev_vfssw.name);
+ return (EIO);
}
/* initialize negative cache */
@@ -350,6 +356,7 @@ sdev_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap,
ASSERT(sdev_origins);
dv->sdev_flags &= ~SDEV_GLOBAL;
dv->sdev_origin = sdev_origins->sdev_root;
+ SDEV_HOLD(dv->sdev_origin);
} else {
sdev_ncache_setup();
rw_enter(&dv->sdev_contents, RW_WRITER);
@@ -527,3 +534,17 @@ sdev_mntinfo_rele(struct sdev_data *mntinfo)
mutex_exit(&vp->v_lock);
mutex_exit(&sdev_lock);
}
+
+void
+sdev_mnt_walk(void (*func)(struct sdev_node *, void *), void *arg)
+{
+ struct sdev_data *mntinfo;
+
+ mutex_enter(&sdev_lock);
+ mntinfo = sdev_mntinfo;
+ while (mntinfo != NULL) {
+ func(mntinfo->sdev_root, arg);
+ mntinfo = mntinfo->sdev_next;
+ }
+ mutex_exit(&sdev_lock);
+}
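
The new sdev_mnt_walk() visits the root sdev_node of every mounted /dev
instance while holding sdev_lock. A minimal sketch of a hypothetical
caller follows; the callback name and counter are illustrative, not part
of this change, and because the callback runs under sdev_lock it must
not attempt to re-acquire that lock:

	static void
	sdev_count_cb(struct sdev_node *dv, void *arg)
	{
		/* dv is a per-mount root node; arg carries caller state */
		(*(int *)arg)++;
	}

	int nmounts = 0;
	sdev_mnt_walk(sdev_count_cb, &nmounts);
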
diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c
index 79ebd8b2e5..5a00242482 100644
--- a/usr/src/uts/common/fs/dev/sdev_vnops.c
+++ b/usr/src/uts/common/fs/dev/sdev_vnops.c
@@ -22,7 +22,7 @@
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2018, Joyent, Inc.
*/
/*
@@ -372,7 +372,7 @@ sdev_close(struct vnode *vp, int flag, int count,
/*ARGSUSED*/
static int
sdev_read(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred,
- struct caller_context *ct)
+ struct caller_context *ct)
{
struct sdev_node *dv = (struct sdev_node *)VTOSDEV(vp);
int error;
@@ -399,7 +399,7 @@ sdev_read(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred,
/*ARGSUSED*/
static int
sdev_write(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred,
- struct caller_context *ct)
+ struct caller_context *ct)
{
struct sdev_node *dv = VTOSDEV(vp);
int error = 0;
@@ -582,7 +582,9 @@ sdev_self_access(sdev_node_t *dv, int mode, int flags, struct cred *cr,
{
int ret;
+ ASSERT(RW_READ_HELD(&dv->sdev_contents));
ASSERT(dv->sdev_attr || dv->sdev_attrvp);
+
if (dv->sdev_attrvp) {
ret = VOP_ACCESS(dv->sdev_attrvp, mode, flags, cr, ct);
} else if (dv->sdev_attr) {
@@ -892,6 +894,9 @@ sdev_remove(struct vnode *dvp, char *nm, struct cred *cred,
}
}
+ if (error == 0)
+ i_ddi_di_cache_invalidate();
+
return (error);
}
@@ -1216,6 +1221,7 @@ sdev_symlink(struct vnode *dvp, char *lnm, struct vattr *tva,
sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
if (SDEV_IS_GLOBAL(parent))
atomic_inc_ulong(&parent->sdev_gdir_gen);
+ i_ddi_di_cache_invalidate();
/* wake up other threads blocked on looking up this node */
mutex_enter(&self->sdev_lookup_lock);
@@ -1288,6 +1294,7 @@ sdev_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
if (SDEV_IS_GLOBAL(parent))
atomic_inc_ulong(&parent->sdev_gdir_gen);
+ i_ddi_di_cache_invalidate();
/* wake up other threads blocked on looking up this node */
mutex_enter(&self->sdev_lookup_lock);
@@ -1403,6 +1410,9 @@ sdev_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
}
+ if (error == 0)
+ i_ddi_di_cache_invalidate();
+
return (error);
}
@@ -1438,32 +1448,24 @@ sdev_readlink(struct vnode *vp, struct uio *uiop, struct cred *cred,
/*ARGSUSED4*/
static int
-sdev_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
+sdev_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
caller_context_t *ct, int flags)
{
- struct sdev_node *parent = VTOSDEV(dvp);
+ struct sdev_node *dv = VTOSDEV(vp);
int error;
+ VERIFY(RW_READ_HELD(&dv->sdev_contents));
+
/*
- * We must check that we have execute access to search the directory --
- * but because our sdev_contents lock is already held as a reader (the
- * caller must have done a VOP_RWLOCK()), we call directly into the
- * underlying access routine if sdev_attr is non-NULL.
+ * We can't recursively take ->sdev_contents via an indirect
+ * VOP_ACCESS(), but we don't need to use that anyway.
*/
- if (parent->sdev_attr != NULL) {
- VERIFY(RW_READ_HELD(&parent->sdev_contents));
-
- if (sdev_unlocked_access(parent, VEXEC, cred) != 0)
- return (EACCES);
- } else {
- if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
- return (error);
- }
+ if ((error = sdev_self_access(dv, VEXEC, 0, cred, ct)) != 0)
+ return (error);
- ASSERT(parent);
- if (!SDEV_IS_GLOBAL(parent))
- prof_filldir(parent);
- return (devname_readdir_func(dvp, uiop, cred, eofp, SDEV_BROWSE));
+ if (!SDEV_IS_GLOBAL(dv))
+ prof_filldir(dv);
+ return (devname_readdir_func(vp, uiop, cred, eofp, SDEV_BROWSE));
}
/*ARGSUSED1*/
diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c
index 8f22ef32f0..e236eb3f72 100644
--- a/usr/src/uts/common/fs/dev/sdev_zvolops.c
+++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c
@@ -472,8 +472,10 @@ devzvol_create_pool_dirs(struct vnode *dvp)
ASSERT(dvp->v_count > 0);
rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
NULL, kcred, NULL, 0, NULL);
- /* should either work, or not be visible from a zone */
- ASSERT(rc == 0 || rc == ENOENT);
+ /*
+ * This should either work, or fail with an error if the pool
+ * should not be visible from the zone or is disallowed there.
+ */
if (rc == 0)
VN_RELE(vp);
pools++;
diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c
index ca0952642a..50633859ce 100644
--- a/usr/src/uts/common/fs/fem.c
+++ b/usr/src/uts/common/fs/fem.c
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
@@ -33,11 +37,12 @@
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
-
#include <sys/fem.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/vfs_opreg.h>
+#include <sys/stack.h>
+#include <sys/archsystm.h>
#define NNODES_DEFAULT 8 /* Default number of nodes in a fem_list */
/*
@@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1)
}
#endif
+/*
+ * File event monitoring handoffs
+ *
+ * File event monitoring relies on being able to inject stack frames between
+ * vnode consumers and the underlying file systems. This becomes problematic
+ * when there exist many monitors, as kernel stack depth is finite. The model
+ * very much encodes this injected frame: the flow of control deliberately
+ * lies with the monitor, not with the monitoring system. While we could
+ * conceivably address this by allowing each subsystem to install at most
+ * one monitor per vnode (and impose on subsystems that they handle any
+ * of their own consumer multiplexing internally), this in fact exports a
+ * substantial amount of run-time complexity to deal with an uncommon case
+ * (and, it must be said, assumes a small number of consuming subsystems).
+ * To allow our abstraction to remain clean, we instead check our remaining
+ * stack in every vnext_*() call; if the amount of stack remaining is lower
+ * than a threshold (fem_stack_needed), we call thread_splitstack() to carry
+ * on the execution of the monitors and the underlying vnode operation on a
+ * split stack. Because we can only pass a single argument to our split stack
+ * function, we must marshal our arguments, the mechanics of which are somewhat
+ * ornate in terms of the code: to marshal in a type-safe manner, we define a
+ * baton that is a union of payload structures for each kind of operation,
+ * loading the per-operation payload explicitly and calling into common handoff
+ * code that itself calls thread_splitstack(). The function passed to
+ * thread_splitstack() is a per-entry point function that continues monitor
+ * processing given the specified (marshalled) arguments. While this method
+ * is a little verbose to implement, it has the advantage of being relatively
+ * robust (that is, broadly type-safe) while imposing minimal burden on each
+ * vnext_*() entry point.
+ *
+ * In terms of the implementation:
+ *
+ * - The FEM_BATON_n macros define the per-entry point baton structures
+ * - The fem_baton_payload_t contains the union of these structures
+ * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point
+ * - The FEM_VNEXTn macros constitute the per-handoff entry point
+ *
+ * Note that we don't use variadic macros -- we define a variant of these
+ * macros for each of our relevant argument counts. This may seem overly
+ * explicit, but it is deliberate: the object here is to minimize the
+ * future maintenance burden by minimizing the likelihood of introduced
+ * error -- not to minimize the number of characters in this source file.
+ */
+
+#ifndef STACK_GROWTH_DOWN
+#error Downward stack growth assumed.
+#endif
+
+int fem_stack_toodeep;
+uintptr_t fem_stack_needed = 8 * 1024;
+size_t fem_handoff_stacksize = 128 * 1024;
+
+#define FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
+ (uintptr_t)curthread->t_stkbase < fem_stack_needed)
+
+#define FEM_BATON_1(what, t0, l0) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ } fb_##what
+
+#define FEM_BATON_2(what, t0, l0, t1, l1) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ } fb_##what
+
+#define FEM_BATON_3(what, t0, l0, t1, l1, t2, l2) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ } fb_##what
+
+#define FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ } fb_##what
+
+#define FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ t4 fb_##what##_##l4; \
+ } fb_##what
+
+#define FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ t4 fb_##what##_##l4; \
+ t5 fb_##what##_##l5; \
+ } fb_##what
+
+#define FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \
+ t6, l6, t7, l7) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ t4 fb_##what##_##l4; \
+ t5 fb_##what##_##l5; \
+ t6 fb_##what##_##l6; \
+ t7 fb_##what##_##l7; \
+ } fb_##what
+
+#define FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \
+ t6, l6, t7, l7, t8, l8) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ t4 fb_##what##_##l4; \
+ t5 fb_##what##_##l5; \
+ t6 fb_##what##_##l6; \
+ t7 fb_##what##_##l7; \
+ t8 fb_##what##_##l8; \
+ } fb_##what
+
+typedef union {
+ FEM_BATON_2(open, int, mode, cred_t *, cr);
+ FEM_BATON_4(close, int, flag, int, count,
+ offset_t, offset, cred_t *, cr);
+ FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr);
+ FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr);
+ FEM_BATON_5(ioctl, int, cmd, intptr_t, arg,
+ int, flag, cred_t *, cr, int *, rvalp);
+ FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr);
+ FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr);
+ FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr);
+ FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr);
+ FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp,
+ pathname_t *, pnp, int, flags, vnode_t *, rdir,
+ cred_t *, cr, int *, direntflags, pathname_t *, realpnp);
+ FEM_BATON_8(create, char *, name, vattr_t *, vap,
+ vcexcl_t, excl, int, mode, vnode_t **, vpp,
+ cred_t *, cr, int, flag, vsecattr_t *, vsecp);
+ FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags);
+ FEM_BATON_4(link, vnode_t *, svp, char *, tnm,
+ cred_t *, cr, int, flags);
+ FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp,
+ char *, tnm, cred_t *, cr, int, flags);
+ FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap,
+ vnode_t **, vpp, cred_t *, cr, int, flags,
+ vsecattr_t *, vsecp);
+ FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir,
+ cred_t *, cr, int, flags);
+ FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr,
+ int *, eofp, int, flags);
+ FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap,
+ char *, target, cred_t *, cr, int, flags);
+ FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr);
+ FEM_BATON_2(fsync, int, syncflag, cred_t *, cr);
+ FEM_BATON_1(inactive, cred_t *, cr);
+ FEM_BATON_1(fid, fid_t *, fidp);
+ FEM_BATON_1(rwlock, int, write_lock);
+ FEM_BATON_1(rwunlock, int, write_lock);
+ FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp);
+ FEM_BATON_1(cmp, vnode_t *, vp2);
+ FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp,
+ int, flag, offset_t, offset, struct flk_callback *, flk_cbp,
+ cred_t *, cr);
+ FEM_BATON_5(space, int, cmd, struct flock64 *, bfp,
+ int, flag, offset_t, offset, cred_t *, cr);
+ FEM_BATON_1(realvp, vnode_t **, vpp);
+ FEM_BATON_9(getpage, offset_t, off, size_t, len,
+ uint_t *, protp, struct page **, plarr, size_t, plsz,
+ struct seg *, seg, caddr_t, addr, enum seg_rw, rw,
+ cred_t *, cr);
+ FEM_BATON_4(putpage, offset_t, off, size_t, len,
+ int, flags, cred_t *, cr);
+ FEM_BATON_8(map, offset_t, off, struct as *, as,
+ caddr_t *, addrp, size_t, len, uchar_t, prot,
+ uchar_t, maxprot, uint_t, flags, cred_t *, cr);
+ FEM_BATON_8(addmap, offset_t, off, struct as *, as,
+ caddr_t, addr, size_t, len, uchar_t, prot,
+ uchar_t, maxprot, uint_t, flags, cred_t *, cr);
+ FEM_BATON_8(delmap, offset_t, off, struct as *, as,
+ caddr_t, addr, size_t, len, uint_t, prot,
+ uint_t, maxprot, uint_t, flags, cred_t *, cr);
+ FEM_BATON_4(poll, short, events, int, anyyet,
+ short *, reventsp, struct pollhead **, phpp);
+ FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks);
+ FEM_BATON_3(pathconf, int, cmd, ulong_t *, valp, cred_t *, cr);
+ FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off,
+ size_t, io_len, int, flags, cred_t *, cr);
+ FEM_BATON_2(dumpctl, int, action, offset_t *, blkp);
+ FEM_BATON_4(dispose, struct page *, pp, int, flag,
+ int, dn, cred_t *, cr);
+ FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr);
+ FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr);
+ FEM_BATON_4(shrlock, int, cmd, struct shrlock *, shr,
+ int, flag, cred_t *, cr);
+ FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname);
+ FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag,
+ xuio_t *, xuiop, cred_t *, cr);
+ FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr);
+} fem_baton_payload_t;
+
+typedef struct {
+ fem_baton_payload_t fb_payload;
+ int (*fb_func)();
+ void (*fb_handoff)();
+ int fb_rval;
+} fem_baton_t;
+
+static int
+fem_handoff(fem_baton_t *bp)
+{
+ fem_stack_toodeep++;
+ thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize);
+
+ return (bp->fb_rval);
+}
+
+#define FEM_VNEXT3_DECL(what, a0, a1, a2) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2); \
+}
+
+#define FEM_VNEXT4_DECL(what, a0, a1, a2, a3) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3); \
+}
+
+#define FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4); \
+}
+
+#define FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5); \
+}
+
+#define FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5, \
+ bp->fb_payload.fb_##what.fb_##what##_##a6); \
+}
+
+#define FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5, \
+ bp->fb_payload.fb_##what.fb_##what##_##a6, \
+ bp->fb_payload.fb_##what.fb_##what##_##a7); \
+}
+
+#define FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5, \
+ bp->fb_payload.fb_##what.fb_##what##_##a6, \
+ bp->fb_payload.fb_##what.fb_##what##_##a7, \
+ bp->fb_payload.fb_##what.fb_##what##_##a8, \
+ bp->fb_payload.fb_##what.fb_##what##_##a9); \
+}
+
+#define FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5, \
+ bp->fb_payload.fb_##what.fb_##what##_##a6, \
+ bp->fb_payload.fb_##what.fb_##what##_##a7, \
+ bp->fb_payload.fb_##what.fb_##what##_##a8, \
+ bp->fb_payload.fb_##what.fb_##what##_##a9, \
+ bp->fb_payload.fb_##what.fb_##what##_##a10); \
+}
+
+#define FEM_VNEXT3(what, func, a0, a1, a2) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2))
+
+#define FEM_VNEXT4(what, func, a0, a1, a2, a3) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3))
+
+#define FEM_VNEXT5(what, func, a0, a1, a2, a3, a4) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4))
+
+#define FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5))
+
+#define FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5, a6))
+
+#define FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \
+ baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5, a6, a7))
+
+#define FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \
+ baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \
+ baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \
+ baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9))
+
+#define FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \
+ baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \
+ baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \
+ baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \
+ baton->fb_payload.fb_##what.fb_##what##_##a10 = a10; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10))
+
static fem_t *
fem_alloc()
{
@@ -2040,10 +2575,60 @@ static struct fs_operation_def fshead_vfs_spec[] = {
* 5. Return by invoking the base operation with the base object.
*
* for each classification, there needs to be at least one "next" operation
- * for each "head"operation.
- *
+ * for each "head" operation. Note that we also use the FEM_VNEXTn_DECL macros
+ * to define the function to run when the stack is split; see the discussion
+ * on "File event monitoring handoffs", above.
*/
+FEM_VNEXT4_DECL(open, arg0, mode, cr, ct)
+FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct)
+FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct)
+FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct)
+FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct)
+FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct)
+FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct)
+FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct)
+FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct)
+FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir,
+ cr, ct, direntflags, realpnp)
+FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)
+FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags)
+FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags)
+FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags)
+FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp)
+FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags)
+FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags)
+FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags)
+FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct)
+FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct)
+FEM_VNEXT3_DECL(fid, arg0, fidp, ct)
+FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct)
+FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct)
+FEM_VNEXT3_DECL(cmp, arg0, vp2, ct)
+FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)
+FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct)
+FEM_VNEXT3_DECL(realvp, arg0, vpp, ct)
+FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz,
+ seg, addr, rw, cr, ct)
+FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct)
+FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot,
+ flags, cr, ct)
+FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot,
+ flags, cr, ct)
+FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot,
+ flags, cr, ct)
+FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct)
+FEM_VNEXT5_DECL(dump, arg0, addr, lbdn, dblks, ct)
+FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct)
+FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct)
+FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct)
+FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct)
+FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct)
+FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct)
+FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct)
+FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct)
+FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct)
+
int
vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
{
@@ -2055,7 +2640,7 @@ vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_open, femop_open);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, mode, cr, ct));
+ FEM_VNEXT4(open, func, arg0, mode, cr, ct);
}
int
@@ -2070,7 +2655,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_close, femop_close);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, flag, count, offset, cr, ct));
+ FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct);
}
int
@@ -2085,7 +2670,7 @@ vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_read, femop_read);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, uiop, ioflag, cr, ct));
+ FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct);
}
int
@@ -2100,7 +2685,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_write, femop_write);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, uiop, ioflag, cr, ct));
+ FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct);
}
int
@@ -2115,7 +2700,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, arg, flag, cr, rvalp, ct));
+ FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct);
}
int
@@ -2130,7 +2715,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, oflags, nflags, cr, ct));
+ FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct);
}
int
@@ -2145,7 +2730,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vap, flags, cr, ct));
+ FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct);
}
int
@@ -2160,7 +2745,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vap, flags, cr, ct));
+ FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct);
}
int
@@ -2175,7 +2760,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_access, femop_access);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, mode, flags, cr, ct));
+ FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct);
}
int
@@ -2191,8 +2776,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp,
vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct,
- direntflags, realpnp));
+ FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct,
+ direntflags, realpnp);
}
int
@@ -2208,7 +2793,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl,
vsop_find(vf, &func, int, &arg0, vop_create, femop_create);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp));
+ FEM_VNEXT10(create, func, arg0, name, vap, excl,
+ mode, vpp, cr, flag, ct, vsecp);
}
int
@@ -2223,7 +2809,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct,
vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, nm, cr, ct, flags));
+ FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags);
}
int
@@ -2238,7 +2824,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_link, femop_link);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, svp, tnm, cr, ct, flags));
+ FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags);
}
int
@@ -2253,7 +2839,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_rename, femop_rename);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags));
+ FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags);
}
int
@@ -2268,7 +2854,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp,
vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp));
+ FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp);
}
int
@@ -2283,7 +2869,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, nm, cdir, cr, ct, flags));
+ FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags);
}
int
@@ -2298,7 +2884,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp,
vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, uiop, cr, eofp, ct, flags));
+ FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags);
}
int
@@ -2313,7 +2899,7 @@ vnext_symlink(femarg_t *vf, char *linkname, vattr_t *vap, char *target,
vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, linkname, vap, target, cr, ct, flags));
+ FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags);
}
int
@@ -2327,7 +2913,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, uiop, cr, ct));
+ FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct);
}
int
@@ -2341,7 +2927,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, syncflag, cr, ct));
+ FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct);
}
void
@@ -2369,7 +2955,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, fidp, ct));
+ FEM_VNEXT3(fid, func, arg0, fidp, ct);
}
int
@@ -2383,7 +2969,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, write_lock, ct));
+ FEM_VNEXT3(rwlock, func, arg0, write_lock, ct);
}
void
@@ -2411,7 +2997,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, ooff, noffp, ct));
+ FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct);
}
int
@@ -2425,7 +3011,7 @@ vnext_cmp(femarg_t *vf, vnode_t *vp2, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vp2, ct));
+ FEM_VNEXT3(cmp, func, arg0, vp2, ct);
}
int
@@ -2441,7 +3027,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag,
vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct));
+ FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct);
}
int
@@ -2456,7 +3042,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag,
vsop_find(vf, &func, int, &arg0, vop_space, femop_space);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct));
+ FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct);
}
int
@@ -2470,7 +3056,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vpp, ct));
+ FEM_VNEXT3(realvp, func, arg0, vpp, ct);
}
int
@@ -2486,8 +3072,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp,
vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw,
- cr, ct));
+ FEM_VNEXT11(getpage, func, arg0, off, len, protp,
+ plarr, plsz, seg, addr, rw, cr, ct);
}
int
@@ -2502,7 +3088,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags,
vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, len, flags, cr, ct));
+ FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct);
}
int
@@ -2518,8 +3104,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp,
vsop_find(vf, &func, int, &arg0, vop_map, femop_map);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags,
- cr, ct));
+ FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags,
+ cr, ct);
}
int
@@ -2535,8 +3121,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr,
vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags,
- cr, ct));
+ FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot,
+ flags, cr, ct);
}
int
@@ -2552,8 +3138,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr,
vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags,
- cr, ct));
+ FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot,
+ flags, cr, ct);
}
int
@@ -2568,7 +3154,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp,
vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, events, anyyet, reventsp, phpp, ct));
+ FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct);
}
int
@@ -2583,7 +3169,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks,
vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, addr, lbdn, dblks, ct));
+ FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct);
}
int
@@ -2598,7 +3184,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, valp, cr, ct));
+ FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct);
}
int
@@ -2613,7 +3199,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off,
vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct));
+ FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct);
}
int
@@ -2627,7 +3213,7 @@ vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, action, blkp, ct));
+ FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct);
}
void
@@ -2657,7 +3243,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vsap, flag, cr, ct));
+ FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct);
}
int
@@ -2672,7 +3258,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vsap, flag, cr, ct));
+ FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct);
}
int
@@ -2687,7 +3273,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag,
vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, shr, flag, cr, ct));
+ FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct);
}
int
@@ -2702,7 +3288,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname,
vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vnevent, dvp, cname, ct));
+ FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct);
}
int
@@ -2717,7 +3303,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, ioflag, xuiop, cr, ct));
+ FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct);
}
int
@@ -2731,7 +3317,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, xuiop, cr, ct));
+ FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct);
}
int
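
To make the macro machinery above concrete, here is a sketch of the
approximate preprocessor output for the fid pair:
FEM_VNEXT3_DECL(fid, arg0, fidp, ct) generates the continuation that
runs on the split stack, and FEM_VNEXT3(fid, func, arg0, fidp, ct)
forms the tail of vnext_fid(). This is an illustration derived from the
macro definitions, not code that appears in the change:

	/* continuation run on the new (split) stack */
	void
	fem_handoff_fid(fem_baton_t *bp)
	{
		bp->fb_rval = bp->fb_func(
		    bp->fb_payload.fb_fid.fb_fid_arg0,
		    bp->fb_payload.fb_fid.fb_fid_fidp,
		    bp->fb_payload.fb_fid.fb_fid_ct);
	}

	/* in vnext_fid(), after vsop_find() locates func and arg0 */
	if (FEM_TOODEEP()) {
		fem_baton_t *baton;
		int rval;

		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);
		baton->fb_payload.fb_fid.fb_fid_arg0 = arg0;
		baton->fb_payload.fb_fid.fb_fid_fidp = fidp;
		baton->fb_payload.fb_fid.fb_fid_ct = ct;
		baton->fb_handoff = fem_handoff_fid;
		baton->fb_func = func;

		rval = fem_handoff(baton);
		kmem_free(baton, sizeof (fem_baton_t));

		return (rval);
	}
	return (func(arg0, fidp, ct));
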
diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c
index 6e56000ffe..a908f91267 100644
--- a/usr/src/uts/common/fs/fifofs/fifosubr.c
+++ b/usr/src/uts/common/fs/fifofs/fifosubr.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
/*
@@ -61,7 +62,6 @@
#if FIFODEBUG
int Fifo_fastmode = 1; /* pipes/fifos will be opened in fast mode */
int Fifo_verbose = 0; /* msg when switching out of fast mode */
-int Fifohiwat = FIFOHIWAT; /* Modifiable FIFO high water mark */
#endif
/*
@@ -196,6 +196,7 @@ fnode_constructor(void *buf, void *cdrarg, int kmflags)
fnp->fn_dest = fnp;
fnp->fn_mp = NULL;
fnp->fn_count = 0;
+ fnp->fn_hiwat = FIFOHIWAT;
fnp->fn_rsynccnt = 0;
fnp->fn_wsynccnt = 0;
fnp->fn_wwaitcnt = 0;
@@ -388,11 +389,7 @@ fifoinit(int fstype, char *name)
pipe_constructor, pipe_destructor, NULL,
(void *)(sizeof (fifodata_t)), NULL, 0);
-#if FIFODEBUG
- if (Fifohiwat < FIFOHIWAT)
- Fifohiwat = FIFOHIWAT;
-#endif /* FIFODEBUG */
- fifo_strdata.qi_minfo->mi_hiwat = Fifohiwat;
+ fifo_strdata.qi_minfo->mi_hiwat = FIFOHIWAT;
return (0);
}
@@ -614,9 +611,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld)
/*
* The other end of the pipe is almost closed so
* reject any other open on this end of the pipe
- * This only happens with a pipe mounted under namefs
+ * This normally only happens with a pipe mounted under namefs, but
+ * we can also see an open via proc/fd, which should still succeed.
+ * To indicate the proc/fd case, the FKLYR flag is passed.
*/
- if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) {
+ if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) &&
+ (flag & FKLYR) == 0) {
fifo_cleanup(oldvp, flag);
cv_broadcast(&fnp->fn_wait_cv);
if (!lockheld)
@@ -1161,7 +1161,8 @@ fifo_wakewriter(fifonode_t *fn_dest, fifolock_t *fn_lock)
int fn_dflag = fn_dest->fn_flag;
ASSERT(MUTEX_HELD(&fn_lock->flk_lock));
- ASSERT(fn_dest->fn_dest->fn_count < Fifohiwat);
+ ASSERT(fn_dest->fn_dest->fn_count < fn_dest->fn_dest->fn_hiwat);
+
if ((fn_dflag & FIFOWANTW)) {
cv_broadcast(&fn_dest->fn_wait_cv);
}
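
With the high-water mark now carried per-fnode (fn_hiwat) instead of in
the removed global Fifohiwat, an individual pipe end can in principle be
tuned. A hypothetical helper, assuming fn_hiwat shares fn_count's
unsigned type and is protected by flk_lock as in fifo_wakewriter()
above:

	static void
	fifo_set_hiwat(fifonode_t *fnp, uint_t hiwat)
	{
		/* fn_hiwat is consulted by writers under flk_lock */
		mutex_enter(&fnp->fn_lock->flk_lock);
		fnp->fn_dest->fn_hiwat = hiwat;
		mutex_exit(&fnp->fn_lock->flk_lock);
	}
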
diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c
index ef8d76e8e8..c288a2eb61 100644
--- a/usr/src/uts/common/fs/fifofs/fifovnops.c
+++ b/usr/src/uts/common/fs/fifofs/fifovnops.c
@@ -28,7 +28,7 @@
*/
/*
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2017, Joyent, Inc.
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
@@ -104,10 +104,6 @@ static int fifo_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
static int fifo_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
caller_context_t *);
-/* functions local to this file */
-static boolean_t fifo_stayfast_enter(fifonode_t *);
-static void fifo_stayfast_exit(fifonode_t *);
-
/*
* Define the data structures external to this file.
*/
@@ -645,7 +641,7 @@ fifo_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *crp,
* (3) write-only FIFO with no data
* (4) no data and FNDELAY flag is set.
* Otherwise return
- * EAGAIN if FNONBLOCK is set and no data to read
+ * EAGAIN if FNONBLOCK is set and there is no data to read, or if FIFORDBLOCK is set
* EINTR if signal received while waiting for data
*
* While there is no data to read....
@@ -681,7 +677,7 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp,
* Check for data on our input queue
*/
- while (fnp->fn_count == 0) {
+ while (fnp->fn_count == 0 || (fnp->fn_flag & FIFORDBLOCK) != 0) {
/*
* No data on first attempt and no writer, then EOF
*/
@@ -731,6 +727,7 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp,
}
ASSERT(fnp->fn_mp != NULL);
+ VERIFY((fnp->fn_flag & FIFORDBLOCK) == 0);
/* For pipes copy should not bypass cache */
uiop->uio_extflg |= UIO_COPY_CACHED;
@@ -772,6 +769,18 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp,
&fn_lock->flk_lock))
goto trywake;
+ /*
+ * If another thread snuck in and began consuming
+ * data from the pipe via read-blocking
+ * (FIFORDBLOCK) while we were blocked in
+ * cv_wait, we must return a short read, since
+ * we have already consumed some of the data.
+ */
+ if ((fnp->fn_flag & FIFORDBLOCK) != 0) {
+ goto trywake;
+ }
+
if (!(fnp->fn_flag & FIFOFAST))
goto stream_mode;
}
@@ -787,11 +796,11 @@ trywake:
/*
* wake up any blocked writers, processes
* sleeping on POLLWRNORM, or processes waiting for SIGPOLL
- * Note: checking for fn_count < Fifohiwat emulates
+ * Note: checking for fn_count < fn_hiwat emulates
* STREAMS functionality when low water mark is 0
*/
if (fn_dest->fn_flag & (FIFOWANTW | FIFOHIWATW) &&
- fnp->fn_count < Fifohiwat) {
+ fnp->fn_count < fn_dest->fn_hiwat) {
fifo_wakewriter(fn_dest, fn_lock);
}
goto done;
@@ -904,7 +913,7 @@ fifo_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *crp,
/*
* check to make sure we are not over high water mark
*/
- while (fn_dest->fn_count >= Fifohiwat) {
+ while (fn_dest->fn_count >= fn_dest->fn_hiwat) {
/*
* Indicate that we have gone over high
* water mark
@@ -962,7 +971,7 @@ fifo_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *crp,
* then we must break the message up into PIPE_BUF
* chunks to stay compliant with STREAMS
*/
- if (uiop->uio_resid + fn_dest->fn_count > Fifohiwat)
+ if (uiop->uio_resid + fn_dest->fn_count > fn_dest->fn_hiwat)
size = MIN(uiop->uio_resid, PIPE_BUF);
else
size = uiop->uio_resid;
@@ -1198,7 +1207,8 @@ fifo_fastioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr,
if (arg != 0) {
goto turn_fastoff;
}
- *rvalp = (fnp->fn_dest->fn_count < Fifohiwat) ? 1 : 0;
+ *rvalp = (fnp->fn_dest->fn_count < fnp->fn_dest->fn_hiwat) ?
+ 1 : 0;
mutex_exit(&fn_lock->flk_lock);
return (0);
@@ -1817,7 +1827,7 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
retevents = POLLHUP;
} else if (events & (POLLWRNORM | POLLWRBAND)) {
if (events & POLLWRNORM) {
- if (fn_dest->fn_count < Fifohiwat)
+ if (fn_dest->fn_count < fn_dest->fn_hiwat)
retevents = POLLWRNORM;
else
fnp->fn_flag |= FIFOHIWATW;
@@ -1986,7 +1996,7 @@ fifo_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *crp,
* the lock.
* If the fifo switches into stream mode while we are waiting, return failure.
*/
-static boolean_t
+boolean_t
fifo_stayfast_enter(fifonode_t *fnp)
{
ASSERT(MUTEX_HELD(&fnp->fn_lock->flk_lock));
@@ -2008,7 +2018,7 @@ fifo_stayfast_enter(fifonode_t *fnp)
* - threads wanting to turn into stream mode waiting in fifo_fastoff(),
* - other writers threads waiting in fifo_stayfast_enter().
*/
-static void
+void
fifo_stayfast_exit(fifonode_t *fnp)
{
fifonode_t *fn_dest = fnp->fn_dest;
diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c
index 3249a574f7..e3d07b595d 100644
--- a/usr/src/uts/common/fs/fs_subr.c
+++ b/usr/src/uts/common/fs/fs_subr.c
@@ -60,6 +60,9 @@
#include <acl/acl_common.h>
#include <sys/pathname.h>
+/* required for fs_reject_epoll */
+#include <sys/poll_impl.h>
+
static callb_cpr_t *frlock_serialize_blocked(flk_cb_when_t, void *);
/*
@@ -406,10 +409,20 @@ fs_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct)
}
/*
- * Return the answer requested to poll() for non-device files.
- * Only POLLIN, POLLRDNORM, and POLLOUT are recognized.
+ * Unlike poll(2), epoll should reject attempts to add normal files or
+ * directories to a given handle. Most non-pseudo filesystems rely on
+ * fs_poll() as their implementation of polling behavior. Exceptions to that
+ * rule (ufs) can use fs_reject_epoll(), so they don't require access to the
+ * inner details of poll. Potential race conditions related to the poll module
+ * being loaded are avoided by implementing the check here in genunix.
*/
-struct pollhead fs_pollhd;
+boolean_t
+fs_reject_epoll()
+{
+ /* Check if the currently-active pollcache is epoll-enabled. */
+ return (curthread->t_pollcache != NULL &&
+ (curthread->t_pollcache->pc_flag & PC_EPOLL) != 0);
+}
/* ARGSUSED */
int
@@ -417,13 +430,12 @@ fs_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
struct pollhead **phpp, caller_context_t *ct)
{
/*
- * Reject all attempts for edge-triggered polling. These should only
- * occur when regular files are added to a /dev/poll handle which is in
- * epoll mode. The Linux epoll does not allow epoll-ing on regular
- * files at all, so rejecting EPOLLET requests is congruent with those
- * expectations.
+ * Regular filesystems should reject epollers. On the off chance that
+ * a non-epoll consumer expresses the desire for edge-triggered
+ * polling, we reject them too. Yes, the expected error for this
+ * really is EPERM.
*/
- if (events & POLLET) {
+ if (fs_reject_epoll() || (events & POLLET) != 0) {
return (EPERM);
}
@@ -438,15 +450,7 @@ fs_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
*reventsp |= POLLOUT;
if (events & POLLWRBAND)
*reventsp |= POLLWRBAND;
- /*
- * Emitting a pollhead without the intention of issuing pollwakeup()
- * calls against it is a recipe for trouble. It's only acceptable in
- * this case since the above logic matches practically all useful
- * events.
- */
- if (*reventsp == 0 && !anyyet) {
- *phpp = &fs_pollhd;
- }
+
return (0);
}
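
A filesystem that wants this ufs-style behavior can reject epoll
consumers up front in its own VOP_POLL handler before doing any work; a
hypothetical sketch (the function name is illustrative):

	static int
	myfs_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
	    struct pollhead **phpp, caller_context_t *ct)
	{
		/* Regular files and directories are not epoll-able. */
		if (fs_reject_epoll())
			return (EPERM);

		/* filesystem-specific event logic would go here */

		return (fs_poll(vp, events, anyyet, reventsp, phpp, ct));
	}
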
diff --git a/usr/src/uts/common/fs/fs_subr.h b/usr/src/uts/common/fs/fs_subr.h
index 27c9e3d830..877dc36f9c 100644
--- a/usr/src/uts/common/fs/fs_subr.h
+++ b/usr/src/uts/common/fs/fs_subr.h
@@ -24,6 +24,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#ifndef _SYS_FS_SUBR_H
@@ -95,6 +96,9 @@ extern int fs_need_estale_retry(int);
extern void fs_vscan_register(int (*av_scan)(vnode_t *, cred_t *, int));
extern int fs_vscan(vnode_t *, cred_t *, int);
+/* Helper function to detect when epoll checks VOP_POLL handlers */
+extern boolean_t fs_reject_epoll();
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
new file mode 100644
index 0000000000..05ee2c6e09
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
@@ -0,0 +1,640 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/stat.h>
+#include <sys/policy.h>
+#include <sys/fs/hyprlofs_info.h>
+
+static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op,
+ vnode_t *, hlnode_t **, cred_t *);
+static int hldiraddentry(hlnode_t *, hlnode_t *, char *);
+
+
+#define HL_HASH_SIZE 8192 /* must be power of 2 */
+#define HL_MUTEX_SIZE 64
+
+static hldirent_t *hl_hashtable[HL_HASH_SIZE];
+static kmutex_t hl_hashmutex[HL_MUTEX_SIZE];
+
+#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1))
+#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1))
+
+#define HYPRLOFS_HASH(tp, name, hash) \
+ { \
+ char Xc, *Xcp; \
+ hash = (uint_t)(uintptr_t)(tp) >> 8; \
+ for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \
+ hash = (hash << 4) + hash + (uint_t)Xc; \
+ }
+
+void
+hyprlofs_hash_init(void)
+{
+ int ix;
+
+ for (ix = 0; ix < HL_MUTEX_SIZE; ix++)
+ mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+hyprlofs_hash_in(hldirent_t *h)
+{
+ uint_t hash;
+ hldirent_t **prevpp;
+ kmutex_t *hmtx;
+
+ HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash);
+ h->hld_hash = hash;
+ prevpp = &hl_hashtable[HL_HASH_INDEX(hash)];
+ hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+ mutex_enter(hmtx);
+ h->hld_link = *prevpp;
+ *prevpp = h;
+ mutex_exit(hmtx);
+}
+
+/* Remove hldirent *h from the hash list. */
+static void
+hyprlofs_hash_out(hldirent_t *h)
+{
+ uint_t hash;
+ hldirent_t **prevpp;
+ kmutex_t *hmtx;
+
+ hash = h->hld_hash;
+ prevpp = &hl_hashtable[HL_HASH_INDEX(hash)];
+ hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+ mutex_enter(hmtx);
+ while (*prevpp != h)
+ prevpp = &(*prevpp)->hld_link;
+ *prevpp = h->hld_link;
+ mutex_exit(hmtx);
+}
+
+static hldirent_t *
+hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold,
+ hlnode_t **found)
+{
+ hldirent_t *l;
+ uint_t hash;
+ kmutex_t *hmtx;
+ hlnode_t *hnp;
+
+ HYPRLOFS_HASH(parent, name, hash);
+ hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+ mutex_enter(hmtx);
+ l = hl_hashtable[HL_HASH_INDEX(hash)];
+ while (l) {
+ if (l->hld_hash == hash && l->hld_parent == parent &&
+ strcmp(l->hld_name, name) == 0) {
+ /*
+ * Ensure that the hlnode that we put a hold on is the
+ * same one that we pass back. Thus the temp. var
+ * hnp is necessary.
+ */
+ hnp = l->hld_hlnode;
+ if (hold) {
+ ASSERT(hnp);
+ hlnode_hold(hnp);
+ }
+ if (found)
+ *found = hnp;
+ mutex_exit(hmtx);
+ return (l);
+ } else {
+ l = l->hld_link;
+ }
+ }
+ mutex_exit(hmtx);
+ return (NULL);
+}
+
+/*
+ * Search directory 'parent' for entry 'name'.
+ *
+ * The calling thread can't hold the write version of the rwlock for the
+ * directory being searched.
+ *
+ * On success *foundtp points to the found hlnode with its vnode held.
+ */
+int
+hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr)
+{
+ int error;
+
+ *foundtp = NULL;
+ if (parent->hln_type != VDIR)
+ return (ENOTDIR);
+
+ if ((error = hyprlofs_taccess(parent, VEXEC, cr)))
+ return (error);
+
+ if (*name == '\0') {
+ hlnode_hold(parent);
+ *foundtp = parent;
+ return (0);
+ }
+
+ /*
+ * Search the directory for the matching name. We need the lock
+ * protecting the hln_dir list so that it doesn't change out from
+ * underneath us. hyprlofs_hash_lookup() will pass back the hlnode
+ * with a hold on it.
+ */
+ if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) {
+ ASSERT(*foundtp);
+ return (0);
+ }
+
+ return (ENOENT);
+}
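
A sketch of the expected calling pattern, per the locking rule above;
the names are illustrative, and the hold taken by a successful lookup is
dropped with hlnode_rele():

	hlnode_t *found;
	int err;

	/* a reader lock satisfies the "no write lock" requirement */
	rw_enter(&parent->hln_rwlock, RW_READER);
	err = hyprlofs_dirlookup(parent, name, &found, cr);
	rw_exit(&parent->hln_rwlock);

	if (err == 0) {
		/* use found, then release the hold from the lookup */
		hlnode_rele(found);
	}
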
+
+/*
+ * Enter a directory entry (either a file or subdir, depending on op) for
+ * 'name' and 'hp' into directory 'dir'.
+ */
+int
+hyprlofs_direnter(
+ hlfsmount_t *hm,
+ hlnode_t *dir, /* target directory to make entry in */
+ char *name, /* name of entry */
+ enum de_op op, /* entry operation */
+ vnode_t *realvp, /* real vnode */
+ vattr_t *va,
+ hlnode_t **hpp, /* return hlnode */
+ cred_t *cr)
+{
+ hldirent_t *hdp;
+ hlnode_t *found = NULL;
+ hlnode_t *hp;
+ int error = 0;
+ char *s;
+
+ /* hln_rwlock is held to serialize direnter and dirdeletes */
+ ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ /* Don't allow '/' characters in pathname component */
+ for (s = name; *s; s++)
+ if (*s == '/')
+ return (EACCES);
+
+ if (name[0] == '\0')
+ panic("hyprlofs_direnter: NULL name");
+
+ /*
+ * This might be a "dangling detached directory". It could have been
+ * removed, but a reference to it kept in u_cwd. Don't bother searching
+ * it, and with any luck the user will get tired of dealing with us and
+ * cd to some absolute path. ufs behaves the same way.
+ */
+ if (dir->hln_nlink == 0) {
+ return (ENOENT);
+ }
+
+ /* Search for the entry. Return "found" if it exists. */
+ hdp = hyprlofs_hash_lookup(name, dir, 1, &found);
+
+ if (hdp) {
+ ASSERT(found);
+ switch (op) {
+ case DE_CREATE:
+ case DE_MKDIR:
+ if (hpp) {
+ *hpp = found;
+ error = EEXIST;
+ } else {
+ hlnode_rele(found);
+ }
+ break;
+ }
+ } else {
+
+ /*
+ * The entry does not exist. Check write perms in dir to see if
+ * entry can be created.
+ */
+ if ((error = hyprlofs_taccess(dir, VWRITE, cr)))
+ return (error);
+
+ /* Make new hlnode and directory entry as required. */
+ if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp,
+ cr)))
+ return (error);
+
+ if ((error = hldiraddentry(dir, hp, name))) {
+ /* Unmake the inode we just made. */
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+ if ((hp->hln_type) == VDIR) {
+ ASSERT(hdp == NULL);
+ /* cleanup allocs made by hyprlofs_dirinit() */
+ hyprlofs_dirtrunc(hp);
+ }
+ mutex_enter(&hp->hln_tlock);
+ hp->hln_nlink = 0;
+ mutex_exit(&hp->hln_tlock);
+ gethrestime(&hp->hln_ctime);
+ rw_exit(&hp->hln_rwlock);
+ hlnode_rele(hp);
+ hp = NULL;
+ } else if (hpp) {
+ *hpp = hp;
+ } else {
+ hlnode_rele(hp);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Delete entry hp of name "nm" from dir. Free dir entry space and decrement
+ * link count on hlnode(s).
+ */
+int
+hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op,
+ cred_t *cr)
+{
+ hldirent_t *hpdp;
+ int error;
+ size_t namelen;
+ hlnode_t *hnp;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+ ASSERT(RW_WRITE_HELD(&hp->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ if (nm[0] == '\0')
+ panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp);
+
+ /* return error if removing . or .. */
+ if (nm[0] == '.') {
+ if (nm[1] == '\0')
+ return (EINVAL);
+ if (nm[1] == '.' && nm[2] == '\0')
+ return (EEXIST); /* as in ufs */
+ }
+
+ if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0)
+ return (error);
+
+ if (dir->hln_dir == NULL)
+ return (ENOENT);
+
+ hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp);
+ if (hpdp == NULL) {
+ /*
+ * If it is gone, some other thread got here first!
+ * Return error ENOENT.
+ */
+ return (ENOENT);
+ }
+
+ /*
+ * If the hlnode in the hldirent changed (shouldn't happen since we
+ * don't support rename) then original is gone, so return that status
+ * (same as UFS).
+ */
+ if (hp != hnp)
+ return (ENOENT);
+
+ hyprlofs_hash_out(hpdp);
+
+ /* Take hpdp out of the directory list. */
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+ if (hpdp->hld_prev) {
+ hpdp->hld_prev->hld_next = hpdp->hld_next;
+ }
+ if (hpdp->hld_next) {
+ hpdp->hld_next->hld_prev = hpdp->hld_prev;
+ }
+
+ /*
+ * If the roving slot pointer happens to match hpdp, point it at the
+ * previous dirent.
+ */
+ if (dir->hln_dir->hld_prev == hpdp) {
+ dir->hln_dir->hld_prev = hpdp->hld_prev;
+ }
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+
+ /* hpdp points to the correct directory entry */
+ namelen = strlen(hpdp->hld_name) + 1;
+
+ kmem_free(hpdp, sizeof (hldirent_t) + namelen);
+ dir->hln_size -= (sizeof (hldirent_t) + namelen);
+ dir->hln_dirents--;
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+ hp->hln_ctime = now;
+
+ ASSERT(hp->hln_nlink > 0);
+ DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock);
+ if (op == DR_RMDIR && hp->hln_type == VDIR) {
+ hyprlofs_dirtrunc(hp);
+ ASSERT(hp->hln_nlink == 0);
+ }
+ return (0);
+}
+
+/*
+ * hyprlofs_dirinit initializes a dir with '.' and '..' entries, without
+ * checking permissions or taking locks.
+ */
+void
+hyprlofs_dirinit(
+ hlnode_t *parent, /* parent of directory to initialize */
+ hlnode_t *dir) /* the new directory */
+{
+ hldirent_t *dot, *dotdot;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&parent->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP);
+ dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP);
+
+ /* Initialize the entries */
+ dot->hld_hlnode = dir;
+ dot->hld_offset = 0;
+ dot->hld_name = (char *)dot + sizeof (hldirent_t);
+ dot->hld_name[0] = '.';
+ dot->hld_parent = dir;
+ hyprlofs_hash_in(dot);
+
+ dotdot->hld_hlnode = parent;
+ dotdot->hld_offset = 1;
+ dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t);
+ dotdot->hld_name[0] = '.';
+ dotdot->hld_name[1] = '.';
+ dotdot->hld_parent = dir;
+ hyprlofs_hash_in(dotdot);
+
+ /* Initialize directory entry list. */
+ dot->hld_next = dotdot;
+ dot->hld_prev = dotdot;
+ dotdot->hld_next = NULL;
+ dotdot->hld_prev = dot;
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+
+ /*
+ * Since hyprlofs_dirinit is called with both dir and parent being the
+ * same hlnode for the root vnode, we need to increment the parent's
+ * link count before we set hln_nlink = 2 below.
+ */
+ INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock);
+ parent->hln_ctime = now;
+
+ dir->hln_dir = dot;
+ dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* "." and ".." names: 2 + 3 bytes */
+ dir->hln_dirents = 2;
+ dir->hln_nlink = 2;
+}
+
+
+/*
+ * hyprlofs_dirtrunc removes all dir entries under this dir.
+ */
+void
+hyprlofs_dirtrunc(hlnode_t *dir)
+{
+ hldirent_t *hdp;
+ hlnode_t *tp;
+ size_t namelen;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ if (dir->hln_looped)
+ return;
+
+ for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) {
+ ASSERT(hdp->hld_next != hdp);
+ ASSERT(hdp->hld_prev != hdp);
+ ASSERT(hdp->hld_hlnode);
+
+ dir->hln_dir = hdp->hld_next;
+ namelen = strlen(hdp->hld_name) + 1;
+
+ /*
+ * Adjust the link counts to account for this dir entry removal.
+ */
+ tp = hdp->hld_hlnode;
+
+ ASSERT(tp->hln_nlink > 0);
+ DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock);
+
+ hyprlofs_hash_out(hdp);
+
+ kmem_free(hdp, sizeof (hldirent_t) + namelen);
+ dir->hln_size -= (sizeof (hldirent_t) + namelen);
+ dir->hln_dirents--;
+ }
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+
+ ASSERT(dir->hln_dir == NULL);
+ ASSERT(dir->hln_size == 0);
+ ASSERT(dir->hln_dirents == 0);
+}
+
+static int
+hldiraddentry(
+ hlnode_t *dir, /* target directory to make entry in */
+ hlnode_t *hp, /* new hlnode */
+ char *name)
+{
+ hldirent_t *hdp, *hpdp;
+ size_t namelen, alloc_size;
+ timestruc_t now;
+
+ /*
+ * Make sure the parent dir wasn't removed from underneath the caller.
+ */
+ if (dir->hln_dir == NULL)
+ return (ENOENT);
+
+ /* Check that everything is on the same FS. */
+ if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp)
+ return (EXDEV);
+
+ /* Alloc and init dir entry */
+ namelen = strlen(name) + 1;
+ alloc_size = namelen + sizeof (hldirent_t);
+ hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP);
+ if (hdp == NULL)
+ return (ENOSPC);
+
+ dir->hln_size += alloc_size;
+ dir->hln_dirents++;
+ hdp->hld_hlnode = hp;
+ hdp->hld_parent = dir;
+
+ /* The dir entry and its name were allocated sequentially. */
+ hdp->hld_name = (char *)hdp + sizeof (hldirent_t);
+ (void) strcpy(hdp->hld_name, name);
+
+ hyprlofs_hash_in(hdp);
+
+ /*
+ * Some utilities expect the size of a directory to remain fairly
+ * static. For example, a routine that unlinks files between calls to
+ * readdir() would otherwise see the size of the dir change underneath
+ * it, making any real byte offset into the directory invalid. To
+ * circumvent this problem, we initialize each dir entry with a phony
+ * offset, and use this offset to determine end of file in
+ * hyprlofs_readdir.
+ */
+ hpdp = dir->hln_dir->hld_prev;
+ /*
+ * Install at first empty "slot" in directory list.
+ */
+ while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset -
+ hpdp->hld_offset) <= 1) {
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+ ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset);
+ hpdp = hpdp->hld_next;
+ }
+ hdp->hld_offset = hpdp->hld_offset + 1;
+
+ /*
+ * If we're at the end of the dirent list and the offset (which is
+ * necessarily the largest offset in this dir) is more than twice the
+ * number of dirents, that means the dir is 50% holes. At this point
+ * we reset the slot pointer back to the beginning of the dir so we
+ * start using the holes. The idea is that if there are N dirents,
+ * there must also be N holes, so we can satisfy the next N creates by
+ * walking at most 2N entries; thus the average cost of a create is
+ * constant. Note that we use the first dirent's hld_prev as the roving
+ * slot pointer. This saves a word in every dirent.
+ */
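+ /*
+ * A worked example with hypothetical offsets: suppose the dir holds
+ * entries at offsets {0, 1, 7, 11} and the roving slot pointer is at
+ * the last one. The walk above falls off the end and the new entry
+ * is assigned offset 12. Since 11 > 2 * 5 (hln_dirents already counts
+ * the new entry), the dir is mostly holes, so the roving pointer is
+ * reset to the front and subsequent creates fill offsets 2, 3, 4, ...
+ */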
+ if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents)
+ dir->hln_dir->hld_prev = dir->hln_dir->hld_next;
+ else
+ dir->hln_dir->hld_prev = hdp;
+
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+
+ hdp->hld_next = hpdp->hld_next;
+ if (hdp->hld_next) {
+ hdp->hld_next->hld_prev = hdp;
+ }
+ hdp->hld_prev = hpdp;
+ hpdp->hld_next = hdp;
+
+ ASSERT(hdp->hld_next != hdp);
+ ASSERT(hdp->hld_prev != hdp);
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+
+ return (0);
+}
+
+static int
+hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op,
+ vnode_t *realvp, hlnode_t **newnode, cred_t *cr)
+{
+ hlnode_t *hp;
+ enum vtype type;
+
+ ASSERT(va != NULL);
+ ASSERT(op == DE_CREATE || op == DE_MKDIR);
+ if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
+ ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
+ return (EOVERFLOW);
+ type = va->va_type;
+ hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP);
+ hyprlofs_node_init(hm, hp, va, cr);
+
+ hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV;
+ hp->hln_vnode->v_type = type;
+ hp->hln_uid = crgetuid(cr);
+
+ /*
+ * To determine the gid of the created file:
+ * If the directory's set-gid bit is set, set the gid to the gid
+ * of the parent dir; otherwise, use the process's gid.
+ */
+ if (dir->hln_mode & VSGID)
+ hp->hln_gid = dir->hln_gid;
+ else
+ hp->hln_gid = crgetgid(cr);
+
+ /*
+ * If we're creating a dir and the parent dir has the set-GID bit set,
+ * set it on the new dir. Otherwise, if the user is neither privileged
+ * nor a member of the file's new group, clear the file's set-GID bit.
+ */
+ if (dir->hln_mode & VSGID && type == VDIR)
+ hp->hln_mode |= VSGID;
+ else {
+ if ((hp->hln_mode & VSGID) &&
+ secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0)
+ hp->hln_mode &= ~VSGID;
+ }
+
+ if (va->va_mask & AT_ATIME)
+ hp->hln_atime = va->va_atime;
+ if (va->va_mask & AT_MTIME)
+ hp->hln_mtime = va->va_mtime;
+
+ if (op == DE_MKDIR) {
+ hyprlofs_dirinit(dir, hp);
+ hp->hln_looped = 0;
+ } else {
+ hp->hln_realvp = realvp;
+ hp->hln_size = va->va_size;
+ hp->hln_looped = 1;
+ }
+
+ *newnode = hp;
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c
new file mode 100644
index 0000000000..1d857309f3
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/time.h>
+#include <sys/cmn_err.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/vfs.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/atomic.h>
+#include <sys/policy.h>
+#include <sys/fs/hyprlofs_info.h>
+
+#define MODESHIFT 3
+
+/* Initialize a hlnode and add it to file list under mount point. */
+void
+hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr)
+{
+ vnode_t *vp;
+ timestruc_t now;
+
+ ASSERT(vap != NULL);
+
+ rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL);
+ h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ h->hln_mask = 0;
+ h->hln_type = vap->va_type;
+ /* Fabricate a node id from the hlnode's address. */
+ h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3);
+ h->hln_nlink = 1;
+ h->hln_size = 0;
+
+ if (cr == NULL) {
+ h->hln_uid = vap->va_uid;
+ h->hln_gid = vap->va_gid;
+ } else {
+ h->hln_uid = crgetuid(cr);
+ h->hln_gid = crgetgid(cr);
+ }
+
+ h->hln_fsid = hm->hlm_dev;
+ h->hln_rdev = vap->va_rdev;
+ h->hln_blksize = PAGESIZE;
+ h->hln_nblocks = 0;
+ gethrestime(&now);
+ h->hln_atime = now;
+ h->hln_mtime = now;
+ h->hln_ctime = now;
+ h->hln_seq = 0;
+ h->hln_dir = NULL;
+
+ h->hln_vnode = vn_alloc(KM_SLEEP);
+ vp = HLNTOV(h);
+ vn_setops(vp, hyprlofs_vnodeops);
+ vp->v_vfsp = hm->hlm_vfsp;
+ vp->v_type = vap->va_type;
+ vp->v_rdev = vap->va_rdev;
+ vp->v_data = (caddr_t)h;
+ mutex_enter(&hm->hlm_contents);
+ /*
+ * Increment the pseudo generation number for this hlnode. Since
+ * hlnodes are allocated and freed, there really is no particular
+ * generation number for a new hlnode. Just fake it by using a
+ * counter in each file system.
+ */
+ h->hln_gen = hm->hlm_gen++;
+
+ /*
+ * Add the new hlnode to the end of the linked list of hlnodes for this
+ * hyprlofs mount. The root dir is handled specially in hyprlofs_mount.
+ */
+ if (hm->hlm_rootnode != (hlnode_t *)NULL) {
+ h->hln_forw = NULL;
+ h->hln_back = hm->hlm_rootnode->hln_back;
+ h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h;
+ }
+ mutex_exit(&hm->hlm_contents);
+ vn_exists(vp);
+}
+
+int
+hyprlofs_taccess(void *vtp, int mode, cred_t *cr)
+{
+ hlnode_t *hp = vtp;
+ int shift = 0;
+
+ /*
+ * Check access based on the owner, group, and public perms in the
+ * hlnode. Each MODESHIFT step moves from the owner triplet to the
+ * group triplet to the public triplet; shifting the mode left aligns
+ * the applicable triplet with the owner bits that
+ * secpolicy_vnode_access2() checks.
+ */
+ if (crgetuid(cr) != hp->hln_uid) {
+ shift += MODESHIFT;
+ if (groupmember(hp->hln_gid, cr) == 0)
+ shift += MODESHIFT;
+ }
+
+ return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid,
+ hp->hln_mode << shift, mode));
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
new file mode 100644
index 0000000000..c582a8cac2
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
@@ -0,0 +1,614 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Hyprlofs is a hybrid file system combining features of the tmpfs(7FS) and
+ * lofs(7FS) file systems. It is modeled on code from both of these file
+ * systems.
+ *
+ * The purpose is to create a high performance name space for files on which
+ * applications will compute. Given a large number of data files with various
+ * owners, we want to construct a view onto those files such that only a subset
+ * is visible to the applications and such that the view can be changed very
+ * quickly as compute progresses. Entries in the name space are not mounts and
+ * thus do not appear in the mnttab. Entries in the name space are allowed to
+ * refer to files on different backing file systems. Intermediate directories
+ * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes
+ * in the name space except for entries that refer to backing files ala lofs.
+ *
+ * The name space is managed via ioctls issued on the mounted file system and
+ * is mostly read-only for the compute applications. That is, applications
+ * cannot create new files in the name space. If a file is unlinked by an
+ * application, that only removes the file from the name space, the backing
+ * file remains in place. It is possible for applications to write-through to
+ * the backing files if the file system is mounted read-write.
+ *
+ * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES,
+ * and HYPRLOFS_RM_ALL ioctls on the top-level mount.
+ *
+ * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and
+ * the name(s) for the file(s) in the name space. The name(s) may be path(s)
+ * which will be relative to the root of the mount and thus cannot begin with
+ * a /. If the name is a path, it does not have to correspond to any backing
+ * path. The intermediate directories will only exist in the name space. The
+ * entry(ies) will be added to the name space.
+ *
+ * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the
+ * name space which should be removed. The name(s) may be path(s) which will
+ * be relative to the root of the mount and thus cannot begin with a /. The
+ * named entry(ies) will be removed.
+ *
+ * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space,
+ * and the HYPRLOFS_GET_ENTRIES ioctl will return the current list of
+ * mappings.
+ */
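+
+/*
+ * For illustration, a user-space caller might drive the name space roughly
+ * as follows. This is a hypothetical sketch, not part of this module: the
+ * authoritative structure and ioctl definitions live in
+ * <sys/fs/hyprlofs.h>, and the exact field types are assumed here.
+ *
+ *    char *path = "/data/file1";    (backing file path)
+ *    char *name = "a/b/file1";      (name space path, no leading '/')
+ *    hyprlofs_entry_t e;
+ *    hyprlofs_entries_t eb;
+ *    int fd;
+ *
+ *    e.hle_path = path;
+ *    e.hle_plen = strlen(path);
+ *    e.hle_name = name;
+ *    e.hle_nlen = strlen(name);
+ *    eb.hle_entries = &e;
+ *    eb.hle_len = 1;
+ *
+ *    fd = open("/my/hyprlofs/mount", O_RDONLY);
+ *    if (ioctl(fd, HYPRLOFS_ADD_ENTRIES, &eb) != 0)
+ *        (the entry was not added)
+ */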
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/debug.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <fs/fs_subr.h>
+#include <vm/page.h>
+#include <vm/anon.h>
+#include <sys/model.h>
+#include <sys/policy.h>
+
+#include <sys/fs/swapnode.h>
+#include <sys/fs/hyprlofs_info.h>
+
+static int hyprlofsfstype;
+
+/*
+ * hyprlofs vfs operations.
+ */
+static int hyprlofsinit(int, char *);
+static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
+static int hyprlofs_unmount(vfs_t *, int, cred_t *);
+static int hyprlofs_root(vfs_t *, vnode_t **);
+static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *);
+static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *);
+
+/*
+ * Loadable module wrapper
+ */
+#include <sys/modctl.h>
+
+static mntopts_t hyprlofs_mntopts = {
+ 0, NULL
+};
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "hyprlofs",
+ hyprlofsinit,
+ VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
+ &hyprlofs_mntopts
+};
+
+/*
+ * Module linkage information
+ */
+static struct modlfs modlfs = {
+ &mod_fsops, "filesystem for hyprlofs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modlfs, NULL
+};
+
+int
+_init()
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini()
+{
+ int error;
+
+ error = mod_remove(&modlinkage);
+ if (error)
+ return (error);
+ /*
+ * Tear down the operations vectors
+ */
+ (void) vfs_freevfsops_by_type(hyprlofsfstype);
+ vn_freevnodeops(hyprlofs_vnodeops);
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * The following are patchable variables limiting the amount of system
+ * resources hyprlofs can use.
+ *
+ * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can
+ * use for its data structures (e.g. hlnodes, directory entries). It is set
+ * as a percentage of physical memory which is determined when hyprlofs is
+ * first used in the system.
+ *
+ * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for
+ * the rest of the system. If the amount of free swap space in the system
+ * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon
+ * allocations will fail.
+ */
+size_t hyprlofs_maxkmem = 0;
+size_t hyprlofs_minfree = 0;
+size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */
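+
+/*
+ * Either limit can be tuned at boot time via /etc/system in the usual way,
+ * e.g. (the value below is purely illustrative; hyprlofs_minfree is in
+ * pages):
+ *
+ *    set hyprlofs:hyprlofs_minfree = 0x4000
+ */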
+
+static major_t hyprlofs_major;
+static minor_t hyprlofs_minor;
+static kmutex_t hyprlofs_minor_lock;
+
+/*
+ * initialize global hyprlofs locks and hashes when loading hyprlofs module
+ */
+static int
+hyprlofsinit(int fstype, char *name)
+{
+ static const fs_operation_def_t hl_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount },
+ VFSNAME_ROOT, { .vfs_root = hyprlofs_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs },
+ VFSNAME_VGET, { .vfs_vget = hyprlofs_vget },
+ NULL, NULL
+ };
+ int error;
+ extern void hyprlofs_hash_init(void);
+
+ hyprlofs_hash_init();
+ hyprlofsfstype = fstype;
+ ASSERT(hyprlofsfstype != 0);
+
+ error = vfs_setfsops(fstype, hl_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template");
+ return (error);
+ }
+
+ error = vn_make_ops(name, hyprlofs_vnodeops_template,
+ &hyprlofs_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+ * hyprlofs_minfree is an absolute limit on swap space which must remain
+ * available so that other processes can still execute. Set it here if
+ * it hasn't been patched.
+ */
+ if (hyprlofs_minfree == 0)
+ hyprlofs_minfree = btopr(HYPRLOFSMINFREE);
+
+ if ((hyprlofs_major = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN,
+ "hyprlofsinit: Can't get unique device number.");
+ hyprlofs_major = 0;
+ }
+ mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+static int
+hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+ hlfsmount_t *hm = NULL;
+ hlnode_t *hp;
+ struct pathname dpn;
+ int error;
+ vattr_t rattr;
+ int got_attrs;
+
+ if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+ return (error);
+ if (secpolicy_hyprlofs_control(cr) != 0)
+ return (EPERM);
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (uap->flags & MS_REMOUNT)
+ return (EBUSY);
+
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /* Having the resource be anything but "swap" doesn't make sense. */
+ vfs_setresource(vfsp, "swap", 0);
+
+ if ((error = pn_get(uap->dir,
+ (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE,
+ &dpn)) != 0)
+ goto out;
+
+ if ((hm = kmem_zalloc(sizeof (hlfsmount_t),
+ KM_NORMALPRI | KM_NOSLEEP)) == NULL) {
+ pn_free(&dpn);
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Get an available minor device number for this mount */
+ mutex_enter(&hyprlofs_minor_lock);
+ do {
+ hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32;
+ hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor);
+ } while (vfs_devismounted(hm->hlm_dev));
+ mutex_exit(&hyprlofs_minor_lock);
+
+ /*
+ * Initialize, but don't bother entering, the mutex since this
+ * hlfsmount is not on the mount list yet.
+ */
+ mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL);
+
+ hm->hlm_vfsp = vfsp;
+
+ vfsp->vfs_data = (caddr_t)hm;
+ vfsp->vfs_fstype = hyprlofsfstype;
+ vfsp->vfs_dev = hm->hlm_dev;
+ vfsp->vfs_bsize = PAGESIZE;
+ vfsp->vfs_flag |= VFS_NOTRUNC;
+ vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype);
+ hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
+ (void) strcpy(hm->hlm_mntpath, dpn.pn_path);
+
+ /* allocate and initialize root hlnode structure */
+ bzero(&rattr, sizeof (vattr_t));
+ rattr.va_mode = (mode_t)(S_IFDIR | 0777);
+ rattr.va_type = VDIR;
+ rattr.va_rdev = 0;
+ hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP);
+ hyprlofs_node_init(hm, hp, &rattr, cr);
+
+ /* Get the mode, uid, and gid from the underlying mount point. */
+ rattr.va_mask = AT_MODE|AT_UID|AT_GID;
+ got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL);
+
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+ HLNTOV(hp)->v_flag |= VROOT;
+
+ /*
+ * If the getattr succeeded, use its results, otherwise allow the
+ * previously set defaults to prevail.
+ */
+ if (got_attrs == 0) {
+ hp->hln_mode = rattr.va_mode;
+ hp->hln_uid = rattr.va_uid;
+ hp->hln_gid = rattr.va_gid;
+ }
+
+ /*
+ * Initialize linked list of hlnodes so that the back pointer of the
+ * root hlnode always points to the last one on the list and the
+ * forward pointer of the last node is null.
+ */
+ hp->hln_back = hp;
+ hp->hln_forw = NULL;
+ hp->hln_nlink = 0;
+ hm->hlm_rootnode = hp;
+
+ hyprlofs_dirinit(hp, hp);
+
+ rw_exit(&hp->hln_rwlock);
+
+ pn_free(&dpn);
+ error = 0;
+
+out:
+ return (error);
+}
+
+static int
+hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ hlnode_t *hnp, *cancel;
+ vnode_t *vp;
+ int error;
+
+ if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
+ return (error);
+ if (secpolicy_hyprlofs_control(cr) != 0)
+ return (EPERM);
+
+ /*
+ * Forced unmount is not supported by this file system,
+ * so return ENOTSUP.
+ */
+ if (flag & MS_FORCE)
+ return (ENOTSUP);
+
+ mutex_enter(&hm->hlm_contents);
+
+ /*
+ * If there are no open files, only the root node should have a
+ * reference count. With hlm_contents held, nothing can be added or
+ * removed. There may be some dirty pages. To prevent fsflush from
+ * disrupting the unmount, put a hold on each node while scanning. If
+ * we find a previously referenced node, undo the holds we have placed
+ * and return EBUSY.
+ */
+ hnp = hm->hlm_rootnode;
+ if (HLNTOV(hnp)->v_count > 1) {
+ mutex_exit(&hm->hlm_contents);
+ return (EBUSY);
+ }
+
+ for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) {
+ if ((vp = HLNTOV(hnp))->v_count > 0) {
+ cancel = hm->hlm_rootnode->hln_forw;
+ while (cancel != hnp) {
+ vp = HLNTOV(cancel);
+ ASSERT(vp->v_count > 0);
+ VN_RELE(vp);
+ cancel = cancel->hln_forw;
+ }
+ mutex_exit(&hm->hlm_contents);
+ return (EBUSY);
+ }
+ VN_HOLD(vp);
+ }
+
+ /* We can drop the mutex now because no one can find this mount */
+ mutex_exit(&hm->hlm_contents);
+
+ /*
+ * Free all alloc'd memory associated with this FS. To do this, we go
+ * through the file list twice, once to remove all the dir entries, and
+ * then to remove all the files.
+ */
+
+ /* Remove all directory entries */
+ for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) {
+ rw_enter(&hnp->hln_rwlock, RW_WRITER);
+ if (hnp->hln_type == VDIR)
+ hyprlofs_dirtrunc(hnp);
+ rw_exit(&hnp->hln_rwlock);
+ }
+
+ ASSERT(hm->hlm_rootnode);
+
+ /*
+ * All links are gone, v_count is keeping nodes in place. VN_RELE
+ * should make the node disappear, unless somebody is holding pages
+ * against it. Wait and retry until it disappears.
+ *
+ * We re-acquire the lock to prevent others who have a HOLD on a hlnode
+ * from blowing it away (in hyprlofs_inactive) while we're trying to
+ * get to it here. Once we have a HOLD on it we know it'll stick around.
+ */
+ mutex_enter(&hm->hlm_contents);
+
+ /* Remove all the files (except the rootnode) backwards. */
+ while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) {
+ mutex_exit(&hm->hlm_contents);
+ /* The link counts were already dropped in the dir-entry pass above. */
+ vp = HLNTOV(hnp);
+ VN_RELE(vp);
+ mutex_enter(&hm->hlm_contents);
+ /*
+ * It's still there after the RELE. Someone else like pageout
+ * has a hold on it so wait a bit and then try again.
+ */
+ if (hnp == hm->hlm_rootnode->hln_back) {
+ VN_HOLD(vp);
+ mutex_exit(&hm->hlm_contents);
+ delay(hz / 4);
+ mutex_enter(&hm->hlm_contents);
+ }
+ }
+ mutex_exit(&hm->hlm_contents);
+
+ VN_RELE(HLNTOV(hm->hlm_rootnode));
+
+ ASSERT(hm->hlm_mntpath);
+
+ kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1);
+
+ mutex_destroy(&hm->hlm_contents);
+ kmem_free(hm, sizeof (hlfsmount_t));
+
+ return (0);
+}
+
+/* Return the root vnode for the given vfs. */
+static int
+hyprlofs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ hlnode_t *hp = hm->hlm_rootnode;
+ vnode_t *vp;
+
+ ASSERT(hp);
+
+ vp = HLNTOV(hp);
+ VN_HOLD(vp);
+ *vpp = vp;
+ return (0);
+}
+
+static int
+hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
+{
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ ulong_t blocks;
+ dev32_t d32;
+ zoneid_t eff_zid;
+ struct zone *zp;
+
+ /*
+ * The FS may have been mounted by the GZ on behalf of the NGZ. In
+ * that case, the hlfsmount zone_id will be the global zone. We want
+ * to show the swap cap inside the zone in this case, even though the
+ * FS was mounted by the GZ.
+ */
+ if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID)
+ zp = curproc->p_zone;
+ else
+ zp = hm->hlm_vfsp->vfs_zone;
+
+ if (zp == NULL)
+ eff_zid = GLOBAL_ZONEUNIQID;
+ else
+ eff_zid = zp->zone_id;
+
+ sbp->f_bsize = PAGESIZE;
+ sbp->f_frsize = PAGESIZE;
+
+ /*
+ * Find the amount of available swap, both physical and memory-backed.
+ */
+ mutex_enter(&anoninfo_lock);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+ blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
+ mutex_exit(&anoninfo_lock);
+
+ if (blocks > hyprlofs_minfree)
+ sbp->f_bfree = blocks - hyprlofs_minfree;
+ else
+ sbp->f_bfree = 0;
+
+ sbp->f_bavail = sbp->f_bfree;
+
+ /*
+ * hyprlofs doesn't track its own usage, so the total block count is
+ * simply what's available.
+ */
+ sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
+
+ if (eff_zid != GLOBAL_ZONEUNIQID &&
+ zp->zone_max_swap_ctl != UINT64_MAX) {
+ /*
+ * If the fs is used by a NGZ with a swap cap, then report the
+ * capped size.
+ */
+ rctl_qty_t cap, used;
+ pgcnt_t pgcap, pgused;
+
+ mutex_enter(&zp->zone_mem_lock);
+ cap = zp->zone_max_swap_ctl;
+ used = zp->zone_max_swap;
+ mutex_exit(&zp->zone_mem_lock);
+
+ pgcap = btop(cap);
+ pgused = btop(used);
+
+ sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
+ sbp->f_bavail = sbp->f_bfree;
+ sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
+ }
+
+ /*
+ * This is fairly inaccurate since it doesn't take into account the
+ * names stored in the directory entries.
+ */
+ sbp->f_ffree = sbp->f_files = ptob(availrmem) /
+ (sizeof (hlnode_t) + sizeof (hldirent_t));
+
+ sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sbp->f_fsid = d32;
+ (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name);
+ (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr));
+ /* Ensure null termination. */
+ sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
+ sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sbp->f_namemax = MAXNAMELEN - 1;
+ return (0);
+}
+
+static int
+hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp)
+{
+ hlfid_t *hfid;
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ hlnode_t *hp = NULL;
+
+ hfid = (hlfid_t *)fidp;
+ *vpp = NULL;
+
+ mutex_enter(&hm->hlm_contents);
+ for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) {
+ mutex_enter(&hp->hln_tlock);
+ if (hp->hln_nodeid == hfid->hlfid_ino) {
+ /*
+ * If the gen numbers don't match we know the file
+ * won't be found since only one hlnode can have this
+ * number at a time.
+ */
+ if (hp->hln_gen != hfid->hlfid_gen ||
+ hp->hln_nlink == 0) {
+ mutex_exit(&hp->hln_tlock);
+ mutex_exit(&hm->hlm_contents);
+ return (0);
+ }
+ *vpp = (vnode_t *)HLNTOV(hp);
+
+ VN_HOLD(*vpp);
+
+ if ((hp->hln_mode & S_ISVTX) &&
+ !(hp->hln_mode & (S_IXUSR | S_IFDIR))) {
+ mutex_enter(&(*vpp)->v_lock);
+ (*vpp)->v_flag |= VISSWAP;
+ mutex_exit(&(*vpp)->v_lock);
+ }
+ mutex_exit(&hp->hln_tlock);
+ mutex_exit(&hm->hlm_contents);
+ return (0);
+ }
+ mutex_exit(&hp->hln_tlock);
+ }
+ mutex_exit(&hm->hlm_contents);
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c
new file mode 100644
index 0000000000..52dba31761
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c
@@ -0,0 +1,1450 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/flock.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/cred.h>
+#include <sys/dirent.h>
+#include <sys/pathname.h>
+#include <sys/fs/hyprlofs.h>
+#include <sys/fs/hyprlofs_info.h>
+#include <sys/mman.h>
+#include <vm/pvn.h>
+#include <sys/cmn_err.h>
+#include <sys/buf.h>
+#include <sys/policy.h>
+#include <fs/fs_subr.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *,
+ caller_context_t *);
+static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *,
+ int);
+static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int);
+static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
+ int);
+static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *,
+ int);
+
+/*
+ * This is a somewhat arbitrary upper limit on the number of entries we can
+ * pass in on a single add/rm ioctl call. This is only used to validate that
+ * the input list looks sane.
+ */
+#define MAX_IOCTL_PARAMS 100000
+
+static int
+hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ vnode_t *rvp;
+ int error;
+
+ rvp = REALVP(*vpp);
+
+ if (VTOHLN(*vpp)->hln_looped == 0)
+ return (0);
+
+ /*
+ * The node is looped back, so pass through to the real vnode. We need
+ * to hold a new reference to the real vnode since VOP_OPEN() may
+ * decide to release it.
+ */
+ VN_HOLD(rvp);
+ error = VOP_OPEN(&rvp, flag, cr, ct);
+ ASSERT(rvp->v_count > 1);
+ VN_RELE(rvp);
+
+ return (error);
+}
+
+static int
+hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+ if (VTOHLN(vp)->hln_looped == 0) {
+ cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
+ cleanshares(vp, ttoproc(curthread)->p_pid);
+ return (0);
+ }
+
+ return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct));
+}
+
+static int
+hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+ return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct));
+}
+
+static int
+hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ /* We don't support writing to non-regular files */
+ if (vp->v_type != VREG)
+ return (EINVAL);
+
+ if (vn_is_readonly(vp))
+ return (EROFS);
+
+ return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct));
+}
+
+/* ARGSUSED */
+static int
+hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag,
+ cred_t *cr, int *rvalp, caller_context_t *ct)
+{
+ uint_t len, cnt;
+ int i, error;
+ model_t model;
+ char path[MAXPATHLEN];
+ char nm[MAXPATHLEN];
+
+ /* We only support the hyprlofs ioctls on the root vnode */
+ if (!(vp->v_flag & VROOT))
+ return (ENOTTY);
+
+ /*
+ * Check if managing hyprlofs is allowed.
+ */
+ if (secpolicy_hyprlofs_control(cr) != 0)
+ return (EPERM);
+
+ if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) {
+ model = get_udatamodel();
+
+ if (model == DATAMODEL_NATIVE) {
+ hyprlofs_entries_t ebuf;
+ hyprlofs_entry_t *e;
+
+ if (copyin((void *)data, &ebuf, sizeof (ebuf)))
+ return (EFAULT);
+ cnt = ebuf.hle_len;
+ if (cnt > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+ len = sizeof (hyprlofs_entry_t) * cnt;
+
+ e = kmem_alloc(len, KM_SLEEP);
+ if (copyin((void *)(ebuf.hle_entries), e, len)) {
+ kmem_free(e, len);
+ return (EFAULT);
+ }
+
+ for (i = 0; i < cnt; i++) {
+ if (e[i].hle_nlen == 0 ||
+ e[i].hle_nlen >= sizeof (nm)) {
+ kmem_free(e, len);
+ return (EINVAL);
+ }
+
+ if (copyin(e[i].hle_name, nm, e[i].hle_nlen)
+ != 0) {
+ kmem_free(e, len);
+ return (EFAULT);
+ }
+ nm[e[i].hle_nlen] = '\0';
+
+ if (cmd == HYPRLOFS_ADD_ENTRIES) {
+ if (e[i].hle_plen == 0 ||
+ e[i].hle_plen >= sizeof (path)) {
+ kmem_free(e, len);
+ return (EINVAL);
+ }
+
+ if (copyin(e[i].hle_path, path,
+ e[i].hle_plen) != 0) {
+ kmem_free(e, len);
+ return (EFAULT);
+ }
+ path[e[i].hle_plen] = '\0';
+
+ if ((error = hyprlofs_add_entry(vp,
+ path, nm, cr, ct)) != 0) {
+ kmem_free(e, len);
+ return (error);
+ }
+ } else {
+ if ((error = hyprlofs_rm_entry(vp, nm,
+ cr, ct, flag)) != 0) {
+ kmem_free(e, len);
+ return (error);
+ }
+ }
+ }
+
+ kmem_free(e, len);
+ return (0);
+
+ } else {
+ hyprlofs_entries32_t ebuf32;
+ hyprlofs_entry32_t *e32;
+
+ if (copyin((void *)data, &ebuf32, sizeof (ebuf32)))
+ return (EFAULT);
+
+ cnt = ebuf32.hle_len;
+ if (cnt > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+ len = sizeof (hyprlofs_entry32_t) * cnt;
+
+ e32 = kmem_alloc(len, KM_SLEEP);
+ if (copyin((void *)(unsigned long)(ebuf32.hle_entries),
+ e32, len)) {
+ kmem_free(e32, len);
+ return (EFAULT);
+ }
+
+ for (i = 0; i < cnt; i++) {
+ if (e32[i].hle_nlen == 0 ||
+ e32[i].hle_nlen >= sizeof (nm)) {
+ kmem_free(e32, len);
+ return (EINVAL);
+ }
+
+ if (copyin((void *)(unsigned long)
+ e32[i].hle_name, nm,
+ e32[i].hle_nlen) != 0) {
+ kmem_free(e32, len);
+ return (EFAULT);
+ }
+ nm[e32[i].hle_nlen] = '\0';
+
+ if (cmd == HYPRLOFS_ADD_ENTRIES) {
+ if (e32[i].hle_plen == 0 ||
+ e32[i].hle_plen >= sizeof (path)) {
+ kmem_free(e32, len);
+ return (EINVAL);
+ }
+
+ if (copyin((void *)(unsigned long)
+ e32[i].hle_path, path,
+ e32[i].hle_plen) != 0) {
+ kmem_free(e32, len);
+ return (EFAULT);
+ }
+ path[e32[i].hle_plen] = '\0';
+
+ if ((error = hyprlofs_add_entry(vp,
+ path, nm, cr, ct)) != 0) {
+ kmem_free(e32, len);
+ return (error);
+ }
+ } else {
+ if ((error = hyprlofs_rm_entry(vp, nm,
+ cr, ct, flag)) != 0) {
+ kmem_free(e32, len);
+ return (error);
+ }
+ }
+ }
+
+ kmem_free(e32, len);
+ return (0);
+ }
+ }
+
+ if (cmd == HYPRLOFS_RM_ALL) {
+ return (hyprlofs_rm_all(vp, cr, ct, flag));
+ }
+
+ if (cmd == HYPRLOFS_GET_ENTRIES) {
+ return (hyprlofs_get_all(vp, data, cr, ct, flag));
+ }
+
+ return (ENOTTY);
+}
+
+static int
+hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+ vattr_t tmp_va;
+
+ if (tp->hln_looped == 1) {
+ int error;
+
+ if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr,
+ ct)) != 0)
+ return (error);
+ }
+
+ mutex_enter(&tp->hln_tlock);
+ vap->va_type = vp->v_type;
+ vap->va_mode = tp->hln_mode & MODEMASK;
+ vap->va_uid = tp->hln_uid;
+ vap->va_gid = tp->hln_gid;
+ vap->va_fsid = tp->hln_fsid;
+ vap->va_nodeid = (ino64_t)tp->hln_nodeid;
+ vap->va_nlink = tp->hln_nlink;
+ vap->va_size = (u_offset_t)tp->hln_size;
+ vap->va_atime = tp->hln_atime;
+ vap->va_mtime = tp->hln_mtime;
+ vap->va_ctime = tp->hln_ctime;
+ vap->va_blksize = PAGESIZE;
+ vap->va_rdev = tp->hln_rdev;
+ vap->va_seq = tp->hln_seq;
+
+ if (tp->hln_looped == 1) {
+ vap->va_nblocks = tmp_va.va_nblocks;
+ } else {
+ vap->va_nblocks =
+ (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
+ }
+ mutex_exit(&tp->hln_tlock);
+ return (0);
+}
+
+/*ARGSUSED4*/
+static int
+hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags,
+ cred_t *cr, caller_context_t *ct)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+ int error = 0;
+ vattr_t *get;
+ long mask;
+
+ /*
+ * Cannot set these attributes
+ */
+ if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR))
+ return (EINVAL);
+
+ mutex_enter(&tp->hln_tlock);
+
+ get = &tp->hln_attr;
+ /*
+ * Change file access modes. Must be owner or have sufficient
+ * privileges.
+ */
+ error = secpolicy_vnode_setattr(cr, vp, vap, get, flags,
+ hyprlofs_taccess, tp);
+
+ if (error)
+ goto out;
+
+ mask = vap->va_mask;
+
+ if (mask & AT_MODE) {
+ get->va_mode &= S_IFMT;
+ get->va_mode |= vap->va_mode & ~S_IFMT;
+ }
+
+ if (mask & AT_UID)
+ get->va_uid = vap->va_uid;
+ if (mask & AT_GID)
+ get->va_gid = vap->va_gid;
+ if (mask & AT_ATIME)
+ get->va_atime = vap->va_atime;
+ if (mask & AT_MTIME)
+ get->va_mtime = vap->va_mtime;
+
+ if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
+ gethrestime(&tp->hln_ctime);
+
+out:
+ mutex_exit(&tp->hln_tlock);
+ return (error);
+}
+
+static int
+hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+ int error;
+
+ if (mode & VWRITE) {
+ if (vp->v_type == VREG && vn_is_readonly(vp))
+ return (EROFS);
+ }
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct));
+
+ mutex_enter(&tp->hln_tlock);
+ error = hyprlofs_taccess(tp, mode, cr);
+ mutex_exit(&tp->hln_tlock);
+ return (error);
+}
+
+/* ARGSUSED3 */
+static int
+hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(dvp);
+ hlnode_t *ntp = NULL;
+ int error;
+
+ if (VTOHLN(dvp)->hln_looped == 1)
+ return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir,
+ cr, ct, direntflags, realpnp));
+
+ if (flags & LOOKUP_XATTR)
+ return (EINVAL);
+
+ /* A null component name is a synonym for the directory being searched. */
+ if (*nm == '\0') {
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ return (0);
+ }
+ ASSERT(tp);
+
+ if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) {
+ ASSERT(ntp);
+ *vpp = HLNTOV(ntp);
+ }
+ return (error);
+}
+
+/*
+ * Create the loopback from the hyprlofs vnode to the real vnode.
+ */
+static int
+hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap,
+ int mode, cred_t *cr, caller_context_t *ct)
+{
+ hlnode_t *parent;
+ hlfsmount_t *tm;
+ int error;
+ hlnode_t *oldtp;
+ vnode_t *vp;
+
+ parent = (hlnode_t *)VTOHLN(dvp);
+ tm = (hlfsmount_t *)VTOHLM(dvp);
+ error = 0;
+ oldtp = NULL;
+
+ if (vap->va_type == VREG && (vap->va_mode & VSVTX)) {
+ /* we don't support the sticky bit */
+ vap->va_mode &= ~VSVTX;
+ } else if (vap->va_type == VNON) {
+ return (EINVAL);
+ }
+
+ /* A null component name is a synonym for the directory being searched. */
+ if (*nm == '\0') {
+ VN_HOLD(dvp);
+ oldtp = parent;
+ } else {
+ error = hyprlofs_dirlookup(parent, nm, &oldtp, cr);
+ }
+
+ if (error == 0) { /* name found */
+ ASSERT(oldtp);
+
+ rw_enter(&oldtp->hln_rwlock, RW_WRITER);
+
+ /*
+ * An existing directory may be "created" read-only; requesting
+ * write access on it is an error.
+ */
+ if ((oldtp->hln_type == VDIR) && (mode & VWRITE))
+ error = EISDIR;
+ else {
+ error = hyprlofs_taccess(oldtp, mode, cr);
+ }
+
+ if (error) {
+ rw_exit(&oldtp->hln_rwlock);
+ hlnode_rele(oldtp);
+ return (error);
+ }
+
+ vp = HLNTOV(oldtp);
+ rw_exit(&oldtp->hln_rwlock);
+
+ if (vp->v_type == VREG) {
+ hlnode_rele(oldtp);
+ return (EEXIST);
+ }
+
+ vnevent_create(vp, ct);
+ return (0);
+ }
+
+ if (error != ENOENT)
+ return (error);
+
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL,
+ cr);
+ rw_exit(&parent->hln_rwlock);
+
+ return (error);
+}
+
+/*
+ * Create an in-memory directory based on the add-entry ioctl name.
+ * If the dir exists, return EEXIST but still also return node in vpp.
+ */
+static int
+hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr)
+{
+ hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+ hlnode_t *self = NULL;
+ hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp);
+ int error;
+
+ /*
+ * Might be a dangling directory. Catch it here, because an ENOENT return
+ * from hyprlofs_dirlookup() is a valid return.
+ */
+ if (parent->hln_nlink == 0)
+ return (ENOENT);
+
+ error = hyprlofs_dirlookup(parent, nm, &self, cr);
+ if (error == 0) {
+ ASSERT(self);
+ hlnode_rele(self);
+ /* We can't loop in under a looped in directory */
+ if (self->hln_looped)
+ return (EACCES);
+ *vpp = HLNTOV(self);
+ return (EEXIST);
+ }
+ if (error != ENOENT)
+ return (error);
+
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL,
+ va, &self, cr);
+ rw_exit(&parent->hln_rwlock);
+
+ if (error == 0 || error == EEXIST) {
+ hlnode_rele(self);
+ *vpp = HLNTOV(self);
+ }
+
+ return (error);
+}
+
+/*
+ * Loop in a file or directory into the namespace.
+ */
+static int
+hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname,
+ cred_t *cr, caller_context_t *ct)
+{
+ int error;
+ char *p, *pnm;
+ vnode_t *realvp, *dvp;
+ vattr_t va;
+
+ /*
+ * Get vnode for the real file/dir. We'll have a hold on realvp which
+ * we won't vn_rele until hyprlofs_inactive.
+ */
+ if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP,
+ &realvp)) != 0)
+ return (error);
+
+ /* no devices allowed */
+ if (IS_DEVVP(realvp)) {
+ VN_RELE(realvp);
+ return (ENODEV);
+ }
+
+ /*
+ * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS
+ * to trigger the mount of the intended filesystem. This way we loop
+ * in the intended filesystem instead of the AUTOFS filesystem.
+ */
+ if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) {
+ VN_RELE(realvp);
+ return (error);
+ }
+
+ /*
+ * We're interested in the topmost filesystem. This is especially
+ * important when fspath is a trigger AUTOFS node, since we're really
+ * interested in the filesystem AUTOFS mounted as a result of the
+ * VOP_ACCESS() call, not the AUTOFS node itself.
+ */
+ if (vn_mountedvfs(realvp) != NULL) {
+ if ((error = traverse(&realvp)) != 0) {
+ VN_RELE(realvp);
+ return (error);
+ }
+ }
+
+ va.va_type = VNON;
+ /*
+ * If the target name is a path, make sure we have all of the
+ * intermediate directories, creating them if necessary.
+ */
+ dvp = vp;
+ pnm = p = fsname;
+
+ /* path cannot be absolute */
+ if (*p == '/') {
+ VN_RELE(realvp);
+ return (EINVAL);
+ }
+
+ for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) {
+ if (va.va_type == VNON) {
+ /* use the top-level dir as the template va for mkdir */
+ if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) {
+ VN_RELE(realvp);
+ return (error);
+ }
+ }
+
+ *p = '\0';
+
+ /* Path component cannot be empty or ".." */
+ if (pnm[0] == '\0' ||
+ (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) {
+ VN_RELE(realvp);
+ return (EINVAL);
+ }
+
+ if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 &&
+ error != EEXIST) {
+ VN_RELE(realvp);
+ return (error);
+ }
+
+ *p = '/';
+ pnm = p + 1;
+ }
+
+ /* The file name is required */
+ if (pnm[0] == '\0') {
+ VN_RELE(realvp);
+ return (EINVAL);
+ }
+
+ /* Now use the real file's va as the template va */
+ if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) {
+ VN_RELE(realvp);
+ return (error);
+ }
+
+ /* Make the vnode */
+ error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct);
+ if (error != 0)
+ VN_RELE(realvp);
+ return (error);
+}
+
+/*
+ * Remove a looped in file from the namespace.
+ */
+static int
+hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ int error;
+ char *p, *pnm;
+ hlnode_t *parent;
+ hlnode_t *fndtp;
+
+ pnm = p = fsname;
+
+ /* path cannot be absolute */
+ if (*p == '/')
+ return (EINVAL);
+
+ /*
+ * If the target name is a path, get the containing dir and simple
+ * file name.
+ */
+ parent = (hlnode_t *)VTOHLN(dvp);
+ for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) {
+ *p = '\0';
+
+ /* Path component cannot be empty or ".." */
+ if (pnm[0] == '\0' ||
+ (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0'))
+ return (EINVAL);
+
+ if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0)
+ return (error);
+
+ dvp = HLNTOV(fndtp);
+ parent = fndtp;
+ pnm = p + 1;
+ }
+
+ /* The file name is required */
+ if (pnm[0] == '\0')
+ return (EINVAL);
+
+ /* Remove the entry from the parent dir */
+ return (hyprlofs_remove(dvp, pnm, cr, ct, flags));
+}
+
+/*
+ * Remove all looped in files from the namespace.
+ */
+static int
+hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ int error = 0;
+ hlnode_t *hp = (hlnode_t *)VTOHLN(dvp);
+ hldirent_t *hdp;
+
+ hlnode_hold(hp);
+
+ /*
+ * There's a window here where someone could have removed
+ * all the entries in the directory after we put a hold on
+ * the vnode. Just return.
+ */
+ if (hp->hln_dir == NULL) {
+ if (hp->hln_nlink) {
+ panic("empty directory 0x%p", (void *)hp);
+ /*NOTREACHED*/
+ }
+ goto done;
+ }
+
+ hdp = hp->hln_dir;
+ while (hdp) {
+ hlnode_t *fndhp;
+
+ if (strcmp(hdp->hld_name, ".") == 0 ||
+ strcmp(hdp->hld_name, "..") == 0) {
+ hdp = hdp->hld_next;
+ continue;
+ }
+
+ /* This holds the fndhp vnode */
+ error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr);
+ if (error != 0)
+ goto done;
+ hlnode_rele(fndhp);
+
+ if (fndhp->hln_looped == 0) {
+ /* recursively remove contents of this subdir */
+ if (fndhp->hln_type == VDIR) {
+ vnode_t *tvp = HLNTOV(fndhp);
+
+ error = hyprlofs_rm_all(tvp, cr, ct, flags);
+ if (error != 0)
+ goto done;
+ }
+ }
+
+ /* remove the entry */
+ error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags);
+ if (error != 0)
+ goto done;
+
+ hdp = hp->hln_dir;
+ }
+
+done:
+ hlnode_rele(hp);
+ return (error);
+}
+
+/*
+ * Get a list of all looped in files in the namespace.
+ */
+static int
+hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp,
+ char *prefix, uint_t *pcnt, uint_t n_max,
+ cred_t *cr, caller_context_t *ct, int flags)
+{
+ int error = 0;
+ int too_big = 0;
+ uint_t cnt;
+ uint_t len;
+ hlnode_t *hp = (hlnode_t *)VTOHLN(dvp);
+ hldirent_t *hdp;
+ char *path;
+
+ cnt = *pcnt;
+ path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ hlnode_hold(hp);
+
+ /*
+ * There's a window here where someone could have removed
+ * all the entries in the directory after we put a hold on
+ * the vnode. Just return.
+ */
+ if (hp->hln_dir == NULL) {
+ if (hp->hln_nlink) {
+ panic("empty directory 0x%p", (void *)hp);
+ /*NOTREACHED*/
+ }
+ goto done;
+ }
+
+ hdp = hp->hln_dir;
+ while (hdp) {
+ hlnode_t *fndhp;
+ vnode_t *tvp;
+
+ if (strcmp(hdp->hld_name, ".") == 0 ||
+ strcmp(hdp->hld_name, "..") == 0) {
+ hdp = hdp->hld_next;
+ continue;
+ }
+
+ /* This holds the fndhp vnode */
+ error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr);
+ if (error != 0)
+ goto done;
+ hlnode_rele(fndhp);
+
+ if (fndhp->hln_looped == 0) {
+ /* recursively get contents of this subdir */
+ VERIFY(fndhp->hln_type == VDIR);
+ tvp = HLNTOV(fndhp);
+
+ if (*prefix == '\0')
+ (void) strlcpy(path, hdp->hld_name, MAXPATHLEN);
+ else
+ (void) snprintf(path, MAXPATHLEN, "%s/%s",
+ prefix, hdp->hld_name);
+
+ error = hyprlofs_get_all_entries(tvp, hcp, path,
+ &cnt, n_max, cr, ct, flags);
+
+ if (error == E2BIG) {
+ too_big = 1;
+ error = 0;
+ }
+ if (error != 0)
+ goto done;
+ } else {
+ if (cnt < n_max) {
+ char *p;
+
+ if (*prefix == '\0')
+ (void) strlcpy(path, hdp->hld_name,
+ MAXPATHLEN);
+ else
+ (void) snprintf(path, MAXPATHLEN,
+ "%s/%s", prefix, hdp->hld_name);
+
+ len = strlen(path);
+ ASSERT(len <= MAXPATHLEN);
+ if (copyout(path, (void *)(hcp[cnt].hce_name),
+ len)) {
+ error = EFAULT;
+ goto done;
+ }
+
+ tvp = REALVP(HLNTOV(fndhp));
+ if (tvp->v_path == vn_vpath_empty) {
+ p = "<unknown>";
+ } else {
+ p = tvp->v_path;
+ }
+ len = strlen(p);
+ ASSERT(len <= MAXPATHLEN);
+ if (copyout(p, (void *)(hcp[cnt].hce_path),
+ len)) {
+ error = EFAULT;
+ goto done;
+ }
+ }
+
+ cnt++;
+ if (cnt > n_max)
+ too_big = 1;
+ }
+
+ hdp = hdp->hld_next;
+ }
+
+done:
+ hlnode_rele(hp);
+ kmem_free(path, MAXPATHLEN);
+
+ *pcnt = cnt;
+ if (error == 0 && too_big == 1)
+ error = E2BIG;
+
+ return (error);
+}
+
+/*
+ * Return a list of all looped in files in the namespace.
+ */
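+/*
+ * The caller contract, as inferred from the copyin/copyout logic (an
+ * informal sketch, not a formal interface statement): the caller supplies
+ * hce_cnt preallocated hyprlofs_curr_entry_t slots whose hce_name and
+ * hce_path fields point at zeroed MAXPATHLEN buffers, since the copyouts
+ * in hyprlofs_get_all_entries() do not write a terminating NUL. On return,
+ * hce_cnt holds the number of mappings found; E2BIG means the supplied
+ * array was too small, with hce_cnt reporting the full count.
+ */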
+static int
+hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ uint_t limit, cnt;
+ int error;
+ model_t model;
+ hyprlofs_curr_entry_t *e;
+
+ model = get_udatamodel();
+
+ if (model == DATAMODEL_NATIVE) {
+ hyprlofs_curr_entries_t ebuf;
+
+ if (copyin((void *)data, &ebuf, sizeof (ebuf)))
+ return (EFAULT);
+ limit = ebuf.hce_cnt;
+ e = ebuf.hce_entries;
+ if (limit > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+
+ } else {
+ hyprlofs_curr_entries32_t ebuf32;
+
+ if (copyin((void *)data, &ebuf32, sizeof (ebuf32)))
+ return (EFAULT);
+
+ limit = ebuf32.hce_cnt;
+ e = (hyprlofs_curr_entry_t *)(unsigned long)
+ (ebuf32.hce_entries);
+ if (limit > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+ }
+
+ cnt = 0;
+ error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct,
+ flags);
+
+ if (error == 0 || error == E2BIG) {
+ if (model == DATAMODEL_NATIVE) {
+ hyprlofs_curr_entries_t ebuf;
+
+ ebuf.hce_cnt = cnt;
+ if (copyout(&ebuf, (void *)data, sizeof (ebuf)))
+ return (EFAULT);
+
+ } else {
+ hyprlofs_curr_entries32_t ebuf32;
+
+ ebuf32.hce_cnt = cnt;
+ if (copyout(&ebuf32, (void *)data, sizeof (ebuf32)))
+ return (EFAULT);
+ }
+ }
+
+ return (error);
+}
+
+/* ARGSUSED3 */
+static int
+hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+ int error;
+ hlnode_t *hp = NULL;
+
+ /* This holds the hp vnode */
+ error = hyprlofs_dirlookup(parent, nm, &hp, cr);
+ if (error)
+ return (error);
+
+ ASSERT(hp);
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+
+ error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr);
+
+ rw_exit(&hp->hln_rwlock);
+ rw_exit(&parent->hln_rwlock);
+ vnevent_remove(HLNTOV(hp), dvp, nm, ct);
+
+ /*
+ * We've now dropped the dir link so by rele-ing our vnode we should
+ * clean up in hyprlofs_inactive.
+ */
+ hlnode_rele(hp);
+
+ return (error);
+}
+
+/* ARGSUSED4 */
+static int
+hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+ hlnode_t *self = NULL;
+ vnode_t *vp;
+ int error = 0;
+
+ /* Return error if removing . or .. */
+ if (strcmp(nm, ".") == 0)
+ return (EINVAL);
+ if (strcmp(nm, "..") == 0)
+ return (EEXIST); /* Should be ENOTEMPTY */
+ error = hyprlofs_dirlookup(parent, nm, &self, cr);
+ if (error)
+ return (error);
+
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ rw_enter(&self->hln_rwlock, RW_WRITER);
+
+ vp = HLNTOV(self);
+ if (vp == dvp || vp == cdir) {
+ error = EINVAL;
+ goto done1;
+ }
+ if (self->hln_type != VDIR) {
+ error = ENOTDIR;
+ goto done1;
+ }
+
+ /*
+ * When a dir is looped in, we only remove the in-memory dir, not the
+ * backing dir.
+ */
+ if (self->hln_looped == 0) {
+ mutex_enter(&self->hln_tlock);
+ if (self->hln_nlink > 2) {
+ mutex_exit(&self->hln_tlock);
+ error = EEXIST;
+ goto done1;
+ }
+ mutex_exit(&self->hln_tlock);
+
+ if (vn_vfswlock(vp)) {
+ error = EBUSY;
+ goto done1;
+ }
+ if (vn_mountedvfs(vp) != NULL) {
+ error = EBUSY;
+ goto done;
+ }
+
+ /*
+ * Check for an empty directory, i.e. one containing only the
+ * "." and ".." entries.
+ */
+ if (self->hln_dirents > 2) {
+ error = EEXIST; /* SIGH should be ENOTEMPTY */
+ /*
+ * Update atime because checking hln_dirents is
+ * equivalent to reading the directory
+ */
+ gethrestime(&self->hln_atime);
+ goto done;
+ }
+
+ error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr);
+ } else {
+ error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr);
+ }
+
+done:
+ if (self->hln_looped == 0)
+ vn_vfsunlock(vp);
+done1:
+ rw_exit(&self->hln_rwlock);
+ rw_exit(&parent->hln_rwlock);
+ vnevent_rmdir(HLNTOV(self), dvp, nm, ct);
+
+ /*
+ * We've now dropped the directory link, so releasing our hold on the
+ * vnode should trigger cleanup in hyprlofs_inactive().
+ */
+ hlnode_rele(self);
+
+ return (error);
+}
+
+static int
+hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+ hldirent_t *hdp;
+ int error = 0;
+ size_t namelen;
+ struct dirent64 *dp;
+ ulong_t offset;
+ ulong_t total_bytes_wanted;
+ ulong_t outcount = 0;
+ ulong_t bufsize;
+ size_t reclen;
+ caddr_t outbuf;
+
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags));
+
+ if (uiop->uio_loffset >= MAXOFF_T) {
+ if (eofp)
+ *eofp = 1;
+ return (0);
+ }
+ /* The syscall layer is assumed to have already taken hln_rwlock */
+ ASSERT(RW_READ_HELD(&hp->hln_rwlock));
+
+ if (uiop->uio_iovcnt != 1)
+ return (EINVAL);
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ /*
+ * There's a window here where someone could have removed
+ * all the entries in the directory after we put a hold on the
+ * vnode but before we grabbed the rwlock. Just return.
+ */
+ if (hp->hln_dir == NULL) {
+ if (hp->hln_nlink) {
+ panic("empty directory 0x%p", (void *)hp);
+ /*NOTREACHED*/
+ }
+ return (0);
+ }
+
+ /* Get space for multiple dir entries */
+ total_bytes_wanted = uiop->uio_iov->iov_len;
+ bufsize = total_bytes_wanted + sizeof (struct dirent64);
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+
+ dp = (struct dirent64 *)((uintptr_t)outbuf);
+
+ offset = 0;
+ hdp = hp->hln_dir;
+ while (hdp) {
+ namelen = strlen(hdp->hld_name); /* no +1 needed */
+ offset = hdp->hld_offset;
+ if (offset >= uiop->uio_offset) {
+ reclen = DIRENT64_RECLEN(namelen);
+ if (outcount + reclen > total_bytes_wanted) {
+ if (!outcount) {
+ /* Buffer too small for any entries. */
+ error = EINVAL;
+ }
+ break;
+ }
+ ASSERT(hdp->hld_hlnode != NULL);
+
+ /* zero out uninitialized bytes */
+ (void) strncpy(dp->d_name, hdp->hld_name,
+ DIRENT64_NAMELEN(reclen));
+ dp->d_reclen = (ushort_t)reclen;
+ dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid;
+ dp->d_off = (offset_t)hdp->hld_offset + 1;
+ dp = (struct dirent64 *)
+ ((uintptr_t)dp + dp->d_reclen);
+ outcount += reclen;
+ ASSERT(outcount <= bufsize);
+ }
+ hdp = hdp->hld_next;
+ }
+
+ if (!error)
+ error = uiomove(outbuf, outcount, UIO_READ, uiop);
+
+ if (!error) {
+ /*
+ * If we reached the end of the list our offset should now be
+ * just past the end.
+ */
+ if (!hdp) {
+ offset += 1;
+ if (eofp)
+ *eofp = 1;
+ } else if (eofp)
+ *eofp = 0;
+ uiop->uio_offset = offset;
+ }
+ gethrestime(&hp->hln_atime);
+ kmem_free(outbuf, bufsize);
+ return (error);
+}
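+
+/*
+ * A note on the resume semantics above: when the buffer fills, uio_offset
+ * is left at the hld_offset of the first entry that did not fit (or one
+ * past the last entry at EOF), and each emitted entry's d_off points one
+ * past its own hld_offset; a subsequent readdir therefore skips everything
+ * below uio_offset and picks up exactly where this call stopped.
+ */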
+
+static int
+hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
+{
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct));
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp);
+
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+
+ mutex_enter(&hp->hln_tlock);
+ mutex_enter(&vp->v_lock);
+ ASSERT(vp->v_count >= 1);
+
+ /*
+ * If we don't have the last hold or the link count is non-zero,
+ * there's nothing to do except drop our hold.
+ */
+ if (vp->v_count > 1 || hp->hln_nlink != 0) {
+ vp->v_count--;
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&hp->hln_tlock);
+ rw_exit(&hp->hln_rwlock);
+ return;
+ }
+
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&hp->hln_tlock);
+
+ /* release hold on the real vnode now */
+ if (hp->hln_looped == 1 && hp->hln_realvp != NULL)
+ VN_RELE(hp->hln_realvp);
+
+ /* Here's our chance to send invalid event while we're between locks */
+ vn_invalid(HLNTOV(hp));
+
+ mutex_enter(&hm->hlm_contents);
+ if (hp->hln_forw == NULL)
+ hm->hlm_rootnode->hln_back = hp->hln_back;
+ else
+ hp->hln_forw->hln_back = hp->hln_back;
+ hp->hln_back->hln_forw = hp->hln_forw;
+ mutex_exit(&hm->hlm_contents);
+ rw_exit(&hp->hln_rwlock);
+ rw_destroy(&hp->hln_rwlock);
+ mutex_destroy(&hp->hln_tlock);
+ vn_free(HLNTOV(hp));
+ kmem_free(hp, sizeof (hlnode_t));
+}
+
+static int
+hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct)
+{
+ hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+ hlfid_t *hfid;
+
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_FID(REALVP(vp), fidp, ct));
+
+ if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) {
+ fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t);
+ return (ENOSPC);
+ }
+
+ hfid = (hlfid_t *)fidp;
+ bzero(hfid, sizeof (hlfid_t));
+ hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t);
+
+ hfid->hlfid_ino = hp->hln_nodeid;
+ hfid->hlfid_gen = hp->hln_gen;
+
+ return (0);
+}
+
+static int
+hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+ page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
+ cred_t *cr, caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr,
+ rw, cr, ct));
+}
+
+int
+hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags,
+ cred_t *cr, caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct));
+}
+
+static int
+hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags,
+ cr, ct));
+}
+
+static int
+hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot,
+ flags, cr, ct));
+}
+
+static int
+hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot,
+ flags, cr, ct));
+}
+
+static int
+hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
+ offset_t offset, cred_t *cr, caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct));
+}
+
+static int
+hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
+ caller_context_t *ct)
+{
+ if (VTOHLN(vp)->hln_looped == 0)
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+
+ return (VOP_SEEK(REALVP(vp), ooff, noffp, ct));
+}
+
+static int
+hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+ hlnode_t *hp = VTOHLN(vp);
+
+ if (hp->hln_looped == 1)
+ return (VOP_RWLOCK(REALVP(vp), write_lock, ct));
+
+ if (write_lock) {
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+ } else {
+ rw_enter(&hp->hln_rwlock, RW_READER);
+ }
+ return (write_lock);
+}
+
+static void
+hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+ hlnode_t *hp = VTOHLN(vp);
+
+ if (hp->hln_looped == 1) {
+ VOP_RWUNLOCK(REALVP(vp), write_lock, ct);
+ return;
+ }
+
+ rw_exit(&hp->hln_rwlock);
+}
+
+static int
+hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+ caller_context_t *ct)
+{
+ int error;
+
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct));
+
+ switch (cmd) {
+ case _PC_XATTR_ENABLED:
+ case _PC_XATTR_EXISTS:
+ case _PC_SATTR_ENABLED:
+ case _PC_SATTR_EXISTS:
+ error = EINVAL;
+ break;
+ case _PC_TIMESTAMP_RESOLUTION:
+ /* nanosecond timestamp resolution */
+ *valp = 1L;
+ error = 0;
+ break;
+ default:
+ error = fs_pathconf(vp, cmd, valp, cr, ct);
+ }
+ return (error);
+}
+
+
+struct vnodeops *hyprlofs_vnodeops;
+
+const fs_operation_def_t hyprlofs_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = hyprlofs_open },
+ VOPNAME_CLOSE, { .vop_close = hyprlofs_close },
+ VOPNAME_READ, { .vop_read = hyprlofs_read },
+ VOPNAME_WRITE, { .vop_write = hyprlofs_write },
+ VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr },
+ VOPNAME_ACCESS, { .vop_access = hyprlofs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup },
+ VOPNAME_CREATE, { .error = fs_error },
+ VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove },
+ VOPNAME_LINK, { .error = fs_error },
+ VOPNAME_RENAME, { .error = fs_error },
+ VOPNAME_MKDIR, { .error = fs_error },
+ VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir },
+ VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir },
+ VOPNAME_SYMLINK, { .error = fs_error },
+ VOPNAME_READLINK, { .error = fs_error },
+ VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive },
+ VOPNAME_FID, { .vop_fid = hyprlofs_fid },
+ VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock },
+ VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock },
+ VOPNAME_SEEK, { .vop_seek = hyprlofs_seek },
+ VOPNAME_SPACE, { .vop_space = hyprlofs_space },
+ VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage },
+ VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage },
+ VOPNAME_MAP, { .vop_map = hyprlofs_map },
+ VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap },
+ VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap },
+ VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
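+
+/*
+ * Note that VOPNAME_CREATE, LINK, RENAME, MKDIR, SYMLINK and READLINK are
+ * all wired to fs_error above: new names can only be introduced into a
+ * hyprlofs namespace through its ioctl interface, never through ordinary
+ * file-creating system calls, although remove and rmdir still work.
+ */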
diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c
index 69c9efff97..f6910c07cf 100644
--- a/usr/src/uts/common/fs/lookup.c
+++ b/usr/src/uts/common/fs/lookup.c
@@ -21,6 +21,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Joyent, Inc.
*/
@@ -58,6 +59,7 @@
#include <sys/zone.h>
#include <sys/dnlc.h>
#include <sys/fs/snode.h>
+#include <sys/brand.h>
/* Controls whether paths are stored with vnodes. */
int vfs_vnode_path = 1;
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c
new file mode 100644
index 0000000000..24c010a463
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c
@@ -0,0 +1,526 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2017, Joyent, Inc.
+ */
+
+#include <sys/varargs.h>
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <sys/prsystm.h>
+
+#include "lxproc.h"
+
+#define LXPRCACHE_NAME "lxpr_cache"
+
+static int lxpr_node_constructor(void *, void *, int);
+static void lxpr_node_destructor(void *, void *);
+
+static kmem_cache_t *lxpr_node_cache;
+
+struct lxpr_uiobuf {
+ uio_t *uiop;
+ char *buffer;
+ uint32_t buffsize;
+ char *pos;
+ size_t beg;
+ int error;
+};
+
+int lxpr_bufsize = 4000;
+
+struct lxpr_uiobuf *
+lxpr_uiobuf_new(uio_t *uiop)
+{
+ /* Allocate memory for both lxpr_uiobuf and output buffer */
+ int bufsize = lxpr_bufsize;
+ struct lxpr_uiobuf *uiobuf =
+ kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP);
+
+ uiobuf->uiop = uiop;
+ uiobuf->buffer = (char *)&uiobuf[1];
+ uiobuf->buffsize = bufsize;
+ uiobuf->pos = uiobuf->buffer;
+ uiobuf->beg = 0;
+ uiobuf->error = 0;
+
+ return (uiobuf);
+}
+
+void
+lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf)
+{
+ ASSERT(uiobuf != NULL);
+ ASSERT(uiobuf->pos == uiobuf->buffer);
+
+ kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize);
+}
+
+void
+lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset)
+{
+ uiobuf->uiop->uio_offset = (off_t)offset;
+}
+
+void
+lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err)
+{
+ ASSERT(uiobuf->error == 0);
+
+ uiobuf->error = err;
+}
+
+int
+lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf)
+{
+ off_t off = uiobuf->uiop->uio_offset;
+ caddr_t uaddr = uiobuf->buffer;
+ size_t beg = uiobuf->beg;
+ size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+ if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ ASSERT(off >= beg);
+
+ if (beg + size > off && off >= 0)
+ uiobuf->error =
+ uiomove(uaddr + (off - beg), size - (off - beg),
+ UIO_READ, uiobuf->uiop);
+
+ uiobuf->beg += size;
+ }
+
+ uiobuf->pos = uaddr;
+
+ return (uiobuf->error);
+}
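+
+/*
+ * A worked example of the windowing above (assuming the default 4000-byte
+ * buffer): for a reader at uio_offset 6000, the first flush covers bytes
+ * [0, 4000) and moves nothing; on the next flush beg is 4000, so the
+ * uiomove starts at 6000 - 4000 = 2000 within the buffer and copies the
+ * remaining 2000 bytes. Content before the requested offset is thus
+ * regenerated on every read but never copied out.
+ */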
+
+void
+lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size)
+{
+ /* While we can still carry on */
+ while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ uintptr_t remain = (uintptr_t)uiobuf->buffsize -
+ ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer);
+
+ /* Enough space in buffer? */
+ if (remain >= size) {
+ bcopy(buf, uiobuf->pos, size);
+ uiobuf->pos += size;
+ return;
+ }
+
+ /* Not enough space, so copy all we can and try again */
+ bcopy(buf, uiobuf->pos, remain);
+ uiobuf->pos += remain;
+ (void) lxpr_uiobuf_flush(uiobuf);
+ buf += remain;
+ size -= remain;
+ }
+}
+
+#define TYPBUFFSIZE 256
+
+void
+lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...)
+{
+ va_list args;
+ char buff[TYPBUFFSIZE];
+ int len;
+ char *buffer;
+
+ /* Can we still do any output */
+ if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0)
+ return;
+
+ va_start(args, fmt);
+
+ /* Try using stack allocated buffer */
+ len = vsnprintf(buff, TYPBUFFSIZE, fmt, args);
+ if (len < TYPBUFFSIZE) {
+ va_end(args);
+ lxpr_uiobuf_write(uiobuf, buff, len);
+ return;
+ }
+
+ /* Not enough space in the stack buffer; fall back to the heap */
+ buffer = kmem_alloc(len + 1, KM_SLEEP);
+
+ /*
+ * The va_list is indeterminate after the first vsnprintf()
+ * consumed it, so restart it. We know we allocated the correct
+ * amount of space, so there is no need to check the return value.
+ */
+ va_end(args);
+ va_start(args, fmt);
+ (void) vsnprintf(buffer, len + 1, fmt, args);
+ lxpr_uiobuf_write(uiobuf, buffer, len);
+ va_end(args);
+ kmem_free(buffer, len + 1);
+}
+
+/*
+ * lxpr_lock():
+ *
+ * Lookup process from pid and return with p_plock and P_PR_LOCK held.
+ */
+proc_t *
+lxpr_lock(pid_t pid)
+{
+ proc_t *p;
+ kmutex_t *mp;
+
+ ASSERT(!MUTEX_HELD(&pidlock));
+
+ for (;;) {
+ mutex_enter(&pidlock);
+
+ /*
+ * If the pid is 1, we really want the zone's init process
+ */
+ p = prfind((pid == 1) ?
+ curproc->p_zone->zone_proc_initpid : pid);
+
+ if (p == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ return (NULL);
+ }
+
+ /*
+ * p_lock is persistent, but p itself is not -- it could
+ * vanish during cv_wait(). Load p->p_lock now so we can
+ * drop it after cv_wait() without referencing p.
+ */
+ mp = &p->p_lock;
+ mutex_enter(mp);
+
+ mutex_exit(&pidlock);
+
+ if (p->p_flag & SEXITING) {
+ /*
+ * This process is exiting -- let it go.
+ */
+ mutex_exit(mp);
+ return (NULL);
+ }
+
+ if (!(p->p_proc_flag & P_PR_LOCK))
+ break;
+
+ cv_wait(&pr_pid_cv[p->p_slot], mp);
+ mutex_exit(mp);
+ }
+
+ p->p_proc_flag |= P_PR_LOCK;
+ THREAD_KPRI_REQUEST();
+ return (p);
+}
+
+/*
+ * lxpr_unlock()
+ *
+ * Unlock locked process
+ */
+void
+lxpr_unlock(proc_t *p)
+{
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(!MUTEX_HELD(&pidlock));
+
+ cv_signal(&pr_pid_cv[p->p_slot]);
+ p->p_proc_flag &= ~P_PR_LOCK;
+ mutex_exit(&p->p_lock);
+ THREAD_KPRI_RELEASE();
+}
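+
+/*
+ * The pair above is used throughout lxpr_vnops.c in the following pattern
+ * (sketch):
+ *
+ *	proc_t *p;
+ *
+ *	if ((p = lxpr_lock(pid)) == NULL)
+ *		return;			(process gone, new, or exiting)
+ *	...examine the process under p_lock and P_PR_LOCK...
+ *	lxpr_unlock(p);
+ */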
+
+void
+lxpr_initnodecache()
+{
+ lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME,
+ sizeof (lxpr_node_t), 0,
+ lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+lxpr_fininodecache()
+{
+ kmem_cache_destroy(lxpr_node_cache);
+}
+
+/* ARGSUSED */
+static int
+lxpr_node_constructor(void *buf, void *un, int kmflags)
+{
+ lxpr_node_t *lxpnp = buf;
+ vnode_t *vp;
+
+ vp = lxpnp->lxpr_vnode = vn_alloc(kmflags);
+ if (vp == NULL)
+ return (-1);
+
+ (void) vn_setops(vp, lxpr_vnodeops);
+ vp->v_data = lxpnp;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+lxpr_node_destructor(void *buf, void *un)
+{
+ lxpr_node_t *lxpnp = buf;
+
+ vn_free(LXPTOV(lxpnp));
+}
+
+/*
+ * Calculate an inode number
+ *
+ * This takes various bits of info and munges them
+ * to give the inode number for an lxproc node
+ */
+ino_t
+lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd)
+{
+ if (pid == 1)
+ pid = curproc->p_zone->zone_proc_initpid;
+
+ switch (type) {
+ case LXPR_PIDDIR:
+ return (pid + 1);
+ case LXPR_PROCDIR:
+ return (maxpid + 2);
+ case LXPR_PID_FD_FD:
+ return (maxpid + 2 +
+ (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+ LXPR_NFILES + fd);
+ default:
+ return (maxpid + 2 +
+ (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+ type);
+ }
+}
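+
+/*
+ * In other words, each pid owns a disjoint block of inode numbers starting
+ * at maxpid + 2: a node of type t for pid P lands at
+ * maxpid + 2 + P * (LXPR_FD_PERPROC + LXPR_NFILES) + t, and the fd entries
+ * for that pid follow its LXPR_NFILES fixed nodes within the same block.
+ */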
+
+/*
+ * Return inode number of parent (directory)
+ */
+ino_t
+lxpr_parentinode(lxpr_node_t *lxpnp)
+{
+ /*
+ * If the input node is the lxproc root, the parent is the mounted-on
+ * directory, so just return the root node's own inode number.
+ */
+ if (lxpnp->lxpr_type != LXPR_PROCDIR)
+ return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino);
+ else
+ return (lxpnp->lxpr_ino);
+}
+
+/*
+ * Allocate a new lxproc node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxpr_node_t *
+lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd)
+{
+ lxpr_node_t *lxpnp;
+ vnode_t *vp;
+ user_t *up;
+ timestruc_t now;
+
+ /*
+ * Allocate a new node. It is deallocated in lxpr_inactive().
+ */
+ lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);
+
+ /*
+ * Set defaults (may be overridden below)
+ */
+ gethrestime(&now);
+ lxpnp->lxpr_type = type;
+ lxpnp->lxpr_realvp = NULL;
+ lxpnp->lxpr_parent = dp;
+ VN_HOLD(dp);
+ if (p != NULL) {
+ lxpnp->lxpr_pid = ((p->p_pid ==
+ curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid);
+
+ lxpnp->lxpr_time = PTOU(p)->u_start;
+ lxpnp->lxpr_uid = crgetruid(p->p_cred);
+ lxpnp->lxpr_gid = crgetrgid(p->p_cred);
+ lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd);
+ } else {
+ /* Pretend files without a proc belong to sched */
+ lxpnp->lxpr_pid = 0;
+ lxpnp->lxpr_time = now;
+ lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
+ lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
+ }
+
+ /* initialize the vnode data */
+ vp = lxpnp->lxpr_vnode;
+ vn_reinit(vp);
+ vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+ vp->v_vfsp = dp->v_vfsp;
+
+ /*
+ * Do node specific stuff
+ */
+ switch (type) {
+ case LXPR_PROCDIR:
+ vp->v_flag |= VROOT;
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by everyone */
+ break;
+
+ case LXPR_PID_CURDIR:
+ ASSERT(p != NULL);
+
+ /*
+ * Zombie check. p_stat is officially protected by pidlock,
+ * but we can't grab pidlock here because we already hold
+ * p_lock. Luckily if we look at the process exit code
+ * we see that p_stat only transitions from SRUN to SZOMB
+ * while p_lock is held. Aside from this, the only other
+ * p_stat transition that we need to be aware of is
+ * SIDL to SRUN, but that's not a problem since lxpr_lock()
+ * ignores nodes in the SIDL state so we'll never get a node
+ * that isn't already in the SRUN state.
+ */
+ if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
+ lxpnp->lxpr_realvp = NULL;
+ } else {
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ up = PTOU(p);
+ lxpnp->lxpr_realvp = up->u_cdir;
+ ASSERT(lxpnp->lxpr_realvp != NULL);
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+ break;
+
+ case LXPR_PID_ROOTDIR:
+ ASSERT(p != NULL);
+ /* Zombie check. see locking comment above */
+ if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
+ lxpnp->lxpr_realvp = NULL;
+ } else {
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ up = PTOU(p);
+ lxpnp->lxpr_realvp =
+ up->u_rdir != NULL ? up->u_rdir : rootdir;
+ ASSERT(lxpnp->lxpr_realvp != NULL);
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+ break;
+
+ case LXPR_PID_EXE:
+ ASSERT(p != NULL);
+ lxpnp->lxpr_realvp = p->p_exec;
+ if (lxpnp->lxpr_realvp != NULL) {
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777;
+ break;
+
+ case LXPR_SELF:
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+ break;
+
+ case LXPR_PID_FD_FD:
+ ASSERT(p != NULL);
+ /* lxpr_realvp is set after we return */
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0700; /* read-write-execute by owner only */
+ break;
+
+ case LXPR_PID_FDDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0500; /* read-search by owner only */
+ break;
+
+ case LXPR_PIDDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0511;
+ break;
+
+ case LXPR_NETDIR:
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by all */
+ break;
+
+ case LXPR_PID_ENV:
+ case LXPR_PID_MEM:
+ ASSERT(p != NULL);
+ /*FALLTHRU*/
+ case LXPR_KCORE:
+ vp->v_type = VREG;
+ lxpnp->lxpr_mode = 0400; /* read-only by owner only */
+ break;
+
+ default:
+ vp->v_type = VREG;
+ lxpnp->lxpr_mode = 0444; /* read-only by all */
+ break;
+ }
+
+ return (lxpnp);
+}
+
+
+/*
+ * Free the storage obtained from lxpr_getnode().
+ */
+void
+lxpr_freenode(lxpr_node_t *lxpnp)
+{
+ ASSERT(lxpnp != NULL);
+ ASSERT(LXPTOV(lxpnp) != NULL);
+
+ /*
+ * delete any association with realvp
+ */
+ if (lxpnp->lxpr_realvp != NULL)
+ VN_RELE(lxpnp->lxpr_realvp);
+
+ /*
+ * delete any association with parent vp
+ */
+ if (lxpnp->lxpr_parent != NULL)
+ VN_RELE(lxpnp->lxpr_parent);
+
+ /*
+ * Release the lxprnode.
+ */
+ kmem_cache_free(lxpr_node_cache, lxpnp);
+}
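+
+/*
+ * Lifecycle sketch: lookup and readdir allocate nodes with lxpr_getnode();
+ * when the vnode layer drops the last hold, lxpr_inactive() (in
+ * lxpr_vnops.c) hands the node back via lxpr_freenode(), which releases the
+ * realvp and parent holds and returns the node to lxpr_node_cache.
+ */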
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c
new file mode 100644
index 0000000000..1bb7bd3823
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/mount.h>
+#include <sys/bitmap.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+
+#include "lxproc.h"
+
+/* Module level parameters */
+static int lxprocfstype;
+static dev_t lxprocdev;
+static kmutex_t lxpr_mount_lock;
+
+int nproc_highbit; /* highbit(v.v_proc) */
+
+static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *);
+static int lxpr_unmount(vfs_t *, int, cred_t *);
+static int lxpr_root(vfs_t *, vnode_t **);
+static int lxpr_statvfs(vfs_t *, statvfs64_t *);
+static int lxpr_init(int, char *);
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "lxproc",
+ lxpr_init,
+ VSW_ZMOUNT,
+ NULL
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+ &mod_fsops, "generic linux procfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, (void *)&modlfs, NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int retval;
+
+ /*
+ * attempt to unload the module
+ */
+ if ((retval = mod_remove(&modlinkage)) != 0)
+ goto done;
+
+ /*
+ * destroy lxpr_node cache
+ */
+ lxpr_fininodecache();
+
+ /*
+ * clean out the vfsops and vnodeops
+ */
+ (void) vfs_freevfsops_by_type(lxprocfstype);
+ vn_freevnodeops(lxpr_vnodeops);
+
+ mutex_destroy(&lxpr_mount_lock);
+done:
+ return (retval);
+}
+
+static int
+lxpr_init(int fstype, char *name)
+{
+ static const fs_operation_def_t lxpr_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = lxpr_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount },
+ VFSNAME_ROOT, { .vfs_root = lxpr_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs },
+ NULL, NULL
+ };
+ extern const fs_operation_def_t lxpr_vnodeops_template[];
+ int error;
+ major_t dev;
+
+ nproc_highbit = highbit(v.v_proc);
+ lxprocfstype = fstype;
+ ASSERT(lxprocfstype != 0);
+
+ mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Associate VFS ops vector with this fstype.
+ */
+ error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "lxpr_init: bad vfs ops template");
+ return (error);
+ }
+
+ /*
+ * Set up vnode ops vector too.
+ */
+ error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "lxpr_init: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+ * Assign a unique "device" number (used by stat(2)).
+ */
+ if ((dev = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN, "lxpr_init: can't get unique device number");
+ dev = 0;
+ }
+
+ /*
+ * Make the pseudo device
+ */
+ lxprocdev = makedevice(dev, 0);
+
+ /*
+ * Initialize cache for lxpr_nodes
+ */
+ lxpr_initnodecache();
+
+ return (0);
+}
+
+static int
+lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr)
+{
+ lxpr_mnt_t *lxpr_mnt;
+ zone_t *zone = curproc->p_zone;
+ ldi_ident_t li;
+ int err;
+
+ /*
+ * must be root to mount
+ */
+ if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+ return (EPERM);
+
+ /*
+ * mount point must be a directory
+ */
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (zone == global_zone) {
+ zone_t *mntzone;
+
+ mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
+ zone_rele(mntzone);
+ if (zone != mntzone)
+ return (EBUSY);
+ }
+
+ /*
+ * Having the resource be anything but "lxproc" doesn't make sense
+ */
+ vfs_setresource(vfsp, "lxproc", 0);
+
+ lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP);
+
+ if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) {
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+ return (err);
+ }
+
+ lxpr_mnt->lxprm_li = li;
+
+ mutex_enter(&lxpr_mount_lock);
+
+ /*
+ * Ensure we don't allow overlaying mounts
+ */
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ mutex_exit(&lxpr_mount_lock);
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * allocate the first vnode
+ */
+ zone_hold(lxpr_mnt->lxprm_zone = zone);
+
+ /* Arbitrarily set the parent vnode to the mounted-over directory */
+ lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0);
+
+ /* Correctly set the fs for the root node */
+ lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp;
+
+ vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype);
+ vfsp->vfs_bsize = DEV_BSIZE;
+ vfsp->vfs_fstype = lxprocfstype;
+ vfsp->vfs_data = (caddr_t)lxpr_mnt;
+ vfsp->vfs_dev = lxprocdev;
+
+ mutex_exit(&lxpr_mount_lock);
+
+ return (0);
+}
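+
+/*
+ * A mount is typically established with something like (sketch; the mount
+ * point is arbitrary):
+ *
+ *	mount -F lxproc lxproc /lxproc
+ *
+ * Note that whatever resource the caller names, vfs_setresource() above
+ * forces it to "lxproc".
+ */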
+
+static int
+lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+ lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data;
+ vnode_t *vp;
+ int count;
+
+ ASSERT(lxpr_mnt != NULL);
+ vp = LXPTOV(lxpr_mnt->lxprm_node);
+
+ mutex_enter(&lxpr_mount_lock);
+
+ /*
+ * must be root to unmount
+ */
+ if (secpolicy_fs_unmount(cr, vfsp) != 0) {
+ mutex_exit(&lxpr_mount_lock);
+ return (EPERM);
+ }
+
+ /*
+ * forced unmount is not supported by this file system
+ */
+ if (flag & MS_FORCE) {
+ mutex_exit(&lxpr_mount_lock);
+ return (ENOTSUP);
+ }
+
+ /*
+ * Ensure that no vnodes are in use on this mount point.
+ */
+ mutex_enter(&vp->v_lock);
+ count = vp->v_count;
+ mutex_exit(&vp->v_lock);
+ if (count > 1) {
+ mutex_exit(&lxpr_mount_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * purge the dnlc cache for vnode entries
+ * associated with this file system
+ */
+ count = dnlc_purge_vfsp(vfsp, 0);
+
+ /*
+ * free up the lxprnode
+ */
+ lxpr_freenode(lxpr_mnt->lxprm_node);
+ zone_rele(lxpr_mnt->lxprm_zone);
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+
+ mutex_exit(&lxpr_mount_lock);
+
+ return (0);
+}
+
+static int
+lxpr_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node;
+ vnode_t *vp = LXPTOV(lxpnp);
+
+ VN_HOLD(vp);
+ *vpp = vp;
+ return (0);
+}
+
+static int
+lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+ int n;
+ dev32_t d32;
+ extern uint_t nproc;
+
+ n = v.v_proc - nproc;
+
+ bzero((caddr_t)sp, sizeof (*sp));
+ sp->f_bsize = DEV_BSIZE;
+ sp->f_frsize = DEV_BSIZE;
+ sp->f_blocks = (fsblkcnt64_t)0;
+ sp->f_bfree = (fsblkcnt64_t)0;
+ sp->f_bavail = (fsblkcnt64_t)0;
+ sp->f_files = (fsfilcnt64_t)v.v_proc + 2;
+ sp->f_ffree = (fsfilcnt64_t)n;
+ sp->f_favail = (fsfilcnt64_t)n;
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sp->f_fsid = d32;
+ /* It is guaranteed that vsw_name will fit in f_basetype */
+ (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name);
+ sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sp->f_namemax = 64; /* quite arbitrary */
+
+ (void) strcpy(sp->f_fstr, "lxproc");
+
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
new file mode 100644
index 0000000000..9bcc0f7e8b
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
@@ -0,0 +1,3103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+/*
+ * lxproc -- a loosely Linux-compatible /proc
+ *
+ * We have -- confusingly -- two implementations of Linux /proc. One is to
+ * support the LX brand with a Linux /proc entirely compatible with the Linux
+ * world view; the other -- this one -- is to support native (but Linux-borne)
+ * programs that wish to view the native system via the Linux /proc model. So
+ * the aspiration here is to provide something that sufficiently approximates
+ * the Linux /proc implementation for purposes of offering some compatibility
+ * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not
+ * intended to exactly mimic Linux semantics; when choosing between offering
+ * compatibility and telling the truth, we emphatically pick the truth. A
+ * particular glaring example of this is the Linux notion of "tasks" (that is,
+ * threads), which -- due to historical misadventures on Linux -- allocate their
+ * identifiers from the process identifier space. (That is, each thread has in
+ * effect a pid.) Some Linux /proc readers have come to depend on this
+ * attribute, and become confused when threads appear with proper identifiers,
+ * so we simply opt for the pre-2.6 behavior, and do not present the tasks
+ * directory at all. Similarly, when choosing between offering compatibility
+ * and remaining consistent with our broader security model, we (obviously)
+ * choose security over compatibility. In short, this is meant to be a best
+ * effort -- no more -- and as such, it should not be unified with the much
+ * more complete Linux /proc implementation found in the LX brand.
+ */
+
+#include <sys/cpupart.h>
+#include <sys/cpuvar.h>
+#include <sys/session.h>
+#include <sys/vmparam.h>
+#include <sys/mman.h>
+#include <vm/rm.h>
+#include <vm/seg_vn.h>
+#include <sys/sdt.h>
+#include <sys/strlog.h>
+#include <sys/stropts.h>
+#include <sys/cmn_err.h>
+#include <sys/x86_archext.h>
+#include <sys/archsystm.h>
+#include <sys/fp.h>
+#include <sys/pool_pset.h>
+#include <sys/pset.h>
+#include <sys/zone.h>
+#include <sys/pghw.h>
+#include <sys/vfs_opreg.h>
+
+/* Dependent on procfs */
+extern kthread_t *prchoose(proc_t *);
+
+#include "lxproc.h"
+
+extern pgcnt_t swapfs_minfree;
+extern time_t boot_time;
+
+/*
+ * Pointer to the vnode ops vector for this fs.
+ * This is instantiated in lxprinit() in lxpr_vfsops.c
+ */
+vnodeops_t *lxpr_vnodeops;
+
+static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *);
+static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *,
+ caller_context_t *);
+static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *,
+ caller_context_t *);
+static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *);
+static int lxpr_lookup(vnode_t *, char *, vnode_t **,
+ pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *,
+ pathname_t *);
+static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *,
+ caller_context_t *, int);
+static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *);
+static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *);
+static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *);
+static int lxpr_sync(void);
+static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *);
+
+static vnode_t *lxpr_lookup_procdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_piddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_fddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_netdir(vnode_t *, char *);
+
+static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *);
+
+static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t);
+static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *);
+
+/*
+ * Simple conversion
+ */
+#define btok(x) ((x) >> 10) /* bytes to kbytes */
+#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */
+
+/*
+ * The lxproc vnode operations vector
+ */
+const fs_operation_def_t lxpr_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = lxpr_open },
+ VOPNAME_CLOSE, { .vop_close = lxpr_close },
+ VOPNAME_READ, { .vop_read = lxpr_read },
+ VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr },
+ VOPNAME_ACCESS, { .vop_access = lxpr_access },
+ VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup },
+ VOPNAME_READDIR, { .vop_readdir = lxpr_readdir },
+ VOPNAME_READLINK, { .vop_readlink = lxpr_readlink },
+ VOPNAME_FSYNC, { .error = lxpr_sync },
+ VOPNAME_SEEK, { .error = lxpr_sync },
+ VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive },
+ VOPNAME_CMP, { .vop_cmp = lxpr_cmp },
+ VOPNAME_REALVP, { .vop_realvp = lxpr_realvp },
+ NULL, NULL
+};
+
+/*
+ * file contents of an lxproc directory.
+ */
+static lxpr_dirent_t lxpr_dir[] = {
+ { LXPR_CMDLINE, "cmdline" },
+ { LXPR_CPUINFO, "cpuinfo" },
+ { LXPR_DEVICES, "devices" },
+ { LXPR_DMA, "dma" },
+ { LXPR_FILESYSTEMS, "filesystems" },
+ { LXPR_INTERRUPTS, "interrupts" },
+ { LXPR_IOPORTS, "ioports" },
+ { LXPR_KCORE, "kcore" },
+ { LXPR_KMSG, "kmsg" },
+ { LXPR_LOADAVG, "loadavg" },
+ { LXPR_MEMINFO, "meminfo" },
+ { LXPR_MOUNTS, "mounts" },
+ { LXPR_NETDIR, "net" },
+ { LXPR_PARTITIONS, "partitions" },
+ { LXPR_SELF, "self" },
+ { LXPR_STAT, "stat" },
+ { LXPR_UPTIME, "uptime" },
+ { LXPR_VERSION, "version" }
+};
+
+#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0]))
+
+/*
+ * Contents of an /lxproc/<pid> directory.
+ */
+static lxpr_dirent_t piddir[] = {
+ { LXPR_PID_CMDLINE, "cmdline" },
+ { LXPR_PID_CPU, "cpu" },
+ { LXPR_PID_CURDIR, "cwd" },
+ { LXPR_PID_ENV, "environ" },
+ { LXPR_PID_EXE, "exe" },
+ { LXPR_PID_MAPS, "maps" },
+ { LXPR_PID_MEM, "mem" },
+ { LXPR_PID_ROOTDIR, "root" },
+ { LXPR_PID_STAT, "stat" },
+ { LXPR_PID_STATM, "statm" },
+ { LXPR_PID_STATUS, "status" },
+ { LXPR_PID_FDDIR, "fd" }
+};
+
+#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]))
+
+/*
+ * contents of /lxproc/net directory
+ */
+static lxpr_dirent_t netdir[] = {
+ { LXPR_NET_ARP, "arp" },
+ { LXPR_NET_DEV, "dev" },
+ { LXPR_NET_DEV_MCAST, "dev_mcast" },
+ { LXPR_NET_IGMP, "igmp" },
+ { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" },
+ { LXPR_NET_IP_MR_VIF, "ip_mr_vif" },
+ { LXPR_NET_MCFILTER, "mcfilter" },
+ { LXPR_NET_NETSTAT, "netstat" },
+ { LXPR_NET_RAW, "raw" },
+ { LXPR_NET_ROUTE, "route" },
+ { LXPR_NET_RPC, "rpc" },
+ { LXPR_NET_RT_CACHE, "rt_cache" },
+ { LXPR_NET_SOCKSTAT, "sockstat" },
+ { LXPR_NET_SNMP, "snmp" },
+ { LXPR_NET_STAT, "stat" },
+ { LXPR_NET_TCP, "tcp" },
+ { LXPR_NET_UDP, "udp" },
+ { LXPR_NET_UNIX, "unix" }
+};
+
+#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0]))
+
+/*
+ * These are the major signal number differences between Linux and native:
+ *
+ * ====================================
+ * | Number | Linux | Native |
+ * | ====== | ========= | ========== |
+ * | 7 | SIGBUS | SIGEMT |
+ * | 10 | SIGUSR1 | SIGBUS |
+ * | 12 | SIGUSR2 | SIGSYS |
+ * | 16 | SIGSTKFLT | SIGUSR1 |
+ * | 17 | SIGCHLD | SIGUSR2 |
+ * | 18 | SIGCONT | SIGCHLD |
+ * | 19 | SIGSTOP | SIGPWR |
+ * | 20 | SIGTSTP | SIGWINCH |
+ * | 21 | SIGTTIN | SIGURG |
+ * | 22 | SIGTTOU | SIGPOLL |
+ * | 23 | SIGURG | SIGSTOP |
+ * | 24 | SIGXCPU | SIGTSTP |
+ * | 25 | SIGXFSZ | SIGCONT |
+ * | 26 | SIGVTALARM | SIGTTIN |
+ * | 27 | SIGPROF | SIGTTOU |
+ * | 28 | SIGWINCH | SIGVTALARM |
+ * | 29 | SIGPOLL | SIGPROF |
+ * | 30 | SIGPWR | SIGXCPU |
+ * | 31 | SIGSYS | SIGXFSZ |
+ * ====================================
+ *
+ * Not every Linux signal maps to a native signal, nor does every native
+ * signal map to a Linux counterpart. However, when signals do map, the
+ * mapping is unique.
+ */
+static int
+lxpr_sigmap[NSIG] = {
+ 0,
+ LX_SIGHUP,
+ LX_SIGINT,
+ LX_SIGQUIT,
+ LX_SIGILL,
+ LX_SIGTRAP,
+ LX_SIGABRT,
+ LX_SIGSTKFLT,
+ LX_SIGFPE,
+ LX_SIGKILL,
+ LX_SIGBUS,
+ LX_SIGSEGV,
+ LX_SIGSYS,
+ LX_SIGPIPE,
+ LX_SIGALRM,
+ LX_SIGTERM,
+ LX_SIGUSR1,
+ LX_SIGUSR2,
+ LX_SIGCHLD,
+ LX_SIGPWR,
+ LX_SIGWINCH,
+ LX_SIGURG,
+ LX_SIGPOLL,
+ LX_SIGSTOP,
+ LX_SIGTSTP,
+ LX_SIGCONT,
+ LX_SIGTTIN,
+ LX_SIGTTOU,
+ LX_SIGVTALRM,
+ LX_SIGPROF,
+ LX_SIGXCPU,
+ LX_SIGXFSZ,
+ -1, /* 32: illumos SIGWAITING */
+ -1, /* 33: illumos SIGLWP */
+ -1, /* 34: illumos SIGFREEZE */
+ -1, /* 35: illumos SIGTHAW */
+ -1, /* 36: illumos SIGCANCEL */
+ -1, /* 37: illumos SIGLOST */
+ -1, /* 38: illumos SIGXRES */
+ -1, /* 39: illumos SIGJVM1 */
+ -1, /* 40: illumos SIGJVM2 */
+ -1, /* 41: illumos SIGINFO */
+ LX_SIGRTMIN, /* 42: illumos _SIGRTMIN */
+ LX_SIGRTMIN + 1,
+ LX_SIGRTMIN + 2,
+ LX_SIGRTMIN + 3,
+ LX_SIGRTMIN + 4,
+ LX_SIGRTMIN + 5,
+ LX_SIGRTMIN + 6,
+ LX_SIGRTMIN + 7,
+ LX_SIGRTMIN + 8,
+ LX_SIGRTMIN + 9,
+ LX_SIGRTMIN + 10,
+ LX_SIGRTMIN + 11,
+ LX_SIGRTMIN + 12,
+ LX_SIGRTMIN + 13,
+ LX_SIGRTMIN + 14,
+ LX_SIGRTMIN + 15,
+ LX_SIGRTMIN + 16,
+ LX_SIGRTMIN + 17,
+ LX_SIGRTMIN + 18,
+ LX_SIGRTMIN + 19,
+ LX_SIGRTMIN + 20,
+ LX_SIGRTMIN + 21,
+ LX_SIGRTMIN + 22,
+ LX_SIGRTMIN + 23,
+ LX_SIGRTMIN + 24,
+ LX_SIGRTMIN + 25,
+ LX_SIGRTMIN + 26,
+ LX_SIGRTMIN + 27,
+ LX_SIGRTMIN + 28,
+ LX_SIGRTMIN + 29,
+ LX_SIGRTMIN + 30,
+ LX_SIGRTMAX
+};
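+
+/*
+ * Usage sketch for the table above: a native signal sig maps to Linux
+ * signal lxpr_sigmap[sig] when that entry is positive; entries of -1 mark
+ * native signals with no Linux counterpart, which are simply dropped from
+ * any Linux-visible signal set.
+ */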
+
+/*
+ * lxpr_open(): Vnode operation for VOP_OPEN()
+ */
+static int
+lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ vnode_t *vp = *vpp;
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ vnode_t *rvp;
+ int error = 0;
+
+ /*
+ * We only allow reading in this file system.
+ */
+ if (flag & FWRITE)
+ return (EROFS);
+
+ /*
+ * If we are opening an underlying fd entry, only allow regular
+ * files; reject the open for anything else. For the other link
+ * types with a realvp (cwd, root, exe), just pass the open
+ * through to the real vnode.
+ */
+ if (lxpnp->lxpr_realvp != NULL) {
+ rvp = lxpnp->lxpr_realvp;
+
+ if (type == LXPR_PID_FD_FD && rvp->v_type != VREG) {
+ error = EACCES;
+ } else {
+ /*
+ * Need to hold rvp since VOP_OPEN() may release it.
+ */
+ VN_HOLD(rvp);
+ error = VOP_OPEN(&rvp, flag, cr, ct);
+ if (error) {
+ VN_RELE(rvp);
+ } else {
+ *vpp = rvp;
+ VN_RELE(vp);
+ }
+ }
+ }
+
+ return (error);
+}
+
+
+/*
+ * lxpr_close(): Vnode operation for VOP_CLOSE()
+ */
+/* ARGSUSED */
+static int
+lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxpr_node_t *lxpr = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpr->lxpr_type;
+
+ /*
+ * we should never get here because the close is done on the realvp
+ * for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR &&
+ type != LXPR_PID_EXE);
+
+ return (0);
+}
+
+static void (*lxpr_read_function[LXPR_NFILES])() = {
+ lxpr_read_isdir, /* /proc */
+ lxpr_read_isdir, /* /proc/<pid> */
+ lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */
+ lxpr_read_empty, /* /proc/<pid>/cpu */
+ lxpr_read_invalid, /* /proc/<pid>/cwd */
+ lxpr_read_empty, /* /proc/<pid>/environ */
+ lxpr_read_invalid, /* /proc/<pid>/exe */
+ lxpr_read_pid_maps, /* /proc/<pid>/maps */
+ lxpr_read_empty, /* /proc/<pid>/mem */
+ lxpr_read_invalid, /* /proc/<pid>/root */
+ lxpr_read_pid_stat, /* /proc/<pid>/stat */
+ lxpr_read_pid_statm, /* /proc/<pid>/statm */
+ lxpr_read_pid_status, /* /proc/<pid>/status */
+ lxpr_read_isdir, /* /proc/<pid>/fd */
+ lxpr_read_fd, /* /proc/<pid>/fd/nn */
+ lxpr_read_empty, /* /proc/cmdline */
+ lxpr_read_cpuinfo, /* /proc/cpuinfo */
+ lxpr_read_empty, /* /proc/devices */
+ lxpr_read_empty, /* /proc/dma */
+ lxpr_read_empty, /* /proc/filesystems */
+ lxpr_read_empty, /* /proc/interrupts */
+ lxpr_read_empty, /* /proc/ioports */
+ lxpr_read_empty, /* /proc/kcore */
+ lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */
+ lxpr_read_loadavg, /* /proc/loadavg */
+ lxpr_read_meminfo, /* /proc/meminfo */
+ lxpr_read_mounts, /* /proc/mounts */
+ lxpr_read_isdir, /* /proc/net */
+ lxpr_read_net_arp, /* /proc/net/arp */
+ lxpr_read_net_dev, /* /proc/net/dev */
+ lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */
+ lxpr_read_net_igmp, /* /proc/net/igmp */
+ lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */
+ lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */
+ lxpr_read_net_mcfilter, /* /proc/net/mcfilter */
+ lxpr_read_net_netstat, /* /proc/net/netstat */
+ lxpr_read_net_raw, /* /proc/net/raw */
+ lxpr_read_net_route, /* /proc/net/route */
+ lxpr_read_net_rpc, /* /proc/net/rpc */
+ lxpr_read_net_rt_cache, /* /proc/net/rt_cache */
+ lxpr_read_net_sockstat, /* /proc/net/sockstat */
+ lxpr_read_net_snmp, /* /proc/net/snmp */
+ lxpr_read_net_stat, /* /proc/net/stat */
+ lxpr_read_net_tcp, /* /proc/net/tcp */
+ lxpr_read_net_udp, /* /proc/net/udp */
+ lxpr_read_net_unix, /* /proc/net/unix */
+ lxpr_read_partitions, /* /proc/partitions */
+ lxpr_read_invalid, /* /proc/self */
+ lxpr_read_stat, /* /proc/stat */
+ lxpr_read_uptime, /* /proc/uptime */
+ lxpr_read_version, /* /proc/version */
+};
+
+/*
+ * Array of lookup functions, indexed by /lxproc file type.
+ */
+static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = {
+ lxpr_lookup_procdir, /* /proc */
+ lxpr_lookup_piddir, /* /proc/<pid> */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/root */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/status */
+ lxpr_lookup_fddir, /* /proc/<pid>/fd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */
+ lxpr_lookup_not_a_dir, /* /proc/cmdline */
+ lxpr_lookup_not_a_dir, /* /proc/cpuinfo */
+ lxpr_lookup_not_a_dir, /* /proc/devices */
+ lxpr_lookup_not_a_dir, /* /proc/dma */
+ lxpr_lookup_not_a_dir, /* /proc/filesystems */
+ lxpr_lookup_not_a_dir, /* /proc/interrupts */
+ lxpr_lookup_not_a_dir, /* /proc/ioports */
+ lxpr_lookup_not_a_dir, /* /proc/kcore */
+ lxpr_lookup_not_a_dir, /* /proc/kmsg */
+ lxpr_lookup_not_a_dir, /* /proc/loadavg */
+ lxpr_lookup_not_a_dir, /* /proc/meminfo */
+ lxpr_lookup_not_a_dir, /* /proc/mounts */
+ lxpr_lookup_netdir, /* /proc/net */
+ lxpr_lookup_not_a_dir, /* /proc/net/arp */
+ lxpr_lookup_not_a_dir, /* /proc/net/dev */
+ lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */
+ lxpr_lookup_not_a_dir, /* /proc/net/igmp */
+ lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */
+ lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */
+ lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */
+ lxpr_lookup_not_a_dir, /* /proc/net/netstat */
+ lxpr_lookup_not_a_dir, /* /proc/net/raw */
+ lxpr_lookup_not_a_dir, /* /proc/net/route */
+ lxpr_lookup_not_a_dir, /* /proc/net/rpc */
+ lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */
+ lxpr_lookup_not_a_dir, /* /proc/net/sockstat */
+ lxpr_lookup_not_a_dir, /* /proc/net/snmp */
+ lxpr_lookup_not_a_dir, /* /proc/net/stat */
+ lxpr_lookup_not_a_dir, /* /proc/net/tcp */
+ lxpr_lookup_not_a_dir, /* /proc/net/udp */
+ lxpr_lookup_not_a_dir, /* /proc/net/unix */
+ lxpr_lookup_not_a_dir, /* /proc/partitions */
+ lxpr_lookup_not_a_dir, /* /proc/self */
+ lxpr_lookup_not_a_dir, /* /proc/stat */
+ lxpr_lookup_not_a_dir, /* /proc/uptime */
+ lxpr_lookup_not_a_dir, /* /proc/version */
+};
+
+/*
+ * Array of readdir functions, indexed by /proc file type.
+ */
+static int (*lxpr_readdir_function[LXPR_NFILES])() = {
+ lxpr_readdir_procdir, /* /proc */
+ lxpr_readdir_piddir, /* /proc/<pid> */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/root */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/status */
+ lxpr_readdir_fddir, /* /proc/<pid>/fd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */
+ lxpr_readdir_not_a_dir, /* /proc/cmdline */
+ lxpr_readdir_not_a_dir, /* /proc/cpuinfo */
+ lxpr_readdir_not_a_dir, /* /proc/devices */
+ lxpr_readdir_not_a_dir, /* /proc/dma */
+ lxpr_readdir_not_a_dir, /* /proc/filesystems */
+ lxpr_readdir_not_a_dir, /* /proc/interrupts */
+ lxpr_readdir_not_a_dir, /* /proc/ioports */
+ lxpr_readdir_not_a_dir, /* /proc/kcore */
+ lxpr_readdir_not_a_dir, /* /proc/kmsg */
+ lxpr_readdir_not_a_dir, /* /proc/loadavg */
+ lxpr_readdir_not_a_dir, /* /proc/meminfo */
+ lxpr_readdir_not_a_dir, /* /proc/mounts */
+ lxpr_readdir_netdir, /* /proc/net */
+ lxpr_readdir_not_a_dir, /* /proc/net/arp */
+ lxpr_readdir_not_a_dir, /* /proc/net/dev */
+ lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */
+ lxpr_readdir_not_a_dir, /* /proc/net/igmp */
+ lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */
+ lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */
+ lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */
+ lxpr_readdir_not_a_dir, /* /proc/net/netstat */
+ lxpr_readdir_not_a_dir, /* /proc/net/raw */
+ lxpr_readdir_not_a_dir, /* /proc/net/route */
+ lxpr_readdir_not_a_dir, /* /proc/net/rpc */
+ lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */
+ lxpr_readdir_not_a_dir, /* /proc/net/sockstat */
+ lxpr_readdir_not_a_dir, /* /proc/net/snmp */
+ lxpr_readdir_not_a_dir, /* /proc/net/stat */
+ lxpr_readdir_not_a_dir, /* /proc/net/tcp */
+ lxpr_readdir_not_a_dir, /* /proc/net/udp */
+ lxpr_readdir_not_a_dir, /* /proc/net/unix */
+ lxpr_readdir_not_a_dir, /* /proc/partitions */
+ lxpr_readdir_not_a_dir, /* /proc/self */
+ lxpr_readdir_not_a_dir, /* /proc/stat */
+ lxpr_readdir_not_a_dir, /* /proc/uptime */
+ lxpr_readdir_not_a_dir, /* /proc/version */
+};
+
+
+/*
+ * lxpr_read(): Vnode operation for VOP_READ()
+ *
+ * Since every file readable in lxproc is human-readable text rather than a
+ * binary structure, no separate read variants are needed for 32- and 64-bit
+ * reader process models.
+ */
+/* ARGSUSED */
+static int
+lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop);
+ int error;
+
+ ASSERT(type < LXPR_NFILES);
+
+ if (type == LXPR_KMSG) {
+ ldi_ident_t li = VTOLXPM(vp)->lxprm_li;
+ ldi_handle_t ldih;
+ struct strioctl str;
+ int rv;
+
+ /*
+ * Open the zone's console device using the layered driver
+ * interface.
+ */
+ if ((error = ldi_open_by_name("/dev/log", FREAD, cr,
+ &ldih, li)) != 0) {
+ lxpr_uiobuf_free(uiobuf);
+ return (error);
+ }
+
+ /*
+ * Send an ioctl to the underlying console device, letting it
+ * know we're interested in getting console messages.
+ */
+ str.ic_cmd = I_CONSLOG;
+ str.ic_timout = 0;
+ str.ic_len = 0;
+ str.ic_dp = NULL;
+ if ((error = ldi_ioctl(ldih, I_STR,
+ (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) {
+ /* don't leak the LDI handle or the uiobuf on error */
+ (void) ldi_close(ldih, FREAD, cr);
+ lxpr_uiobuf_free(uiobuf);
+ return (error);
+ }
+
+ lxpr_read_kmsg(lxpnp, uiobuf, ldih);
+
+ if ((error = ldi_close(ldih, FREAD, cr)) != 0) {
+ lxpr_uiobuf_free(uiobuf);
+ return (error);
+ }
+ } else {
+ lxpr_read_function[type](lxpnp, uiobuf);
+ }
+
+ error = lxpr_uiobuf_flush(uiobuf);
+ lxpr_uiobuf_free(uiobuf);
+
+ return (error);
+}
+
+/*
+ * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty()
+ *
+ * Various special case reads:
+ * - trying to read a directory
+ * - invalid file (used to mean a file that should be implemented,
+ * but isn't yet)
+ * - empty file
+ * - wait to be able to read a file that will never have anything to read
+ */
+/* ARGSUSED */
+static void
+lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_seterr(uiobuf, EISDIR);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_pid_cmdline():
+ *
+ * This is not precisely compatible with Linux: the Linux cmdline returns argv
+ * with the correct separation using \0 between the arguments, but we cannot do
+ * that without copying the real argv from the correct process context. This
+ * is too difficult to attempt so we pretend that the entire cmdline is just
+ * argv[0]. This is good enough for ps and htop to display correctly, but might
+ * cause some other things not to work correctly.
+ */
+static void
+lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ char *buf;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ buf = PTOU(p)->u_argv != 0 ? PTOU(p)->u_psargs : PTOU(p)->u_comm;
+
+ lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1);
+ lxpr_unlock(p);
+}
+
+/*
+ * lxpr_read_pid_maps(): memory map file
+ */
+static void
+lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ struct as *as;
+ struct seg *seg;
+ char *buf;
+ int buflen = MAXPATHLEN;
+ struct print_data {
+ caddr_t saddr;
+ caddr_t eaddr;
+ int type;
+ char prot[5];
+ uint32_t offset;
+ vnode_t *vp;
+ struct print_data *next;
+ } *print_head = NULL;
+ struct print_data **print_tail = &print_head;
+ struct print_data *pbuf;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ as = p->p_as;
+
+ if (as == &kas) {
+ lxpr_unlock(p);
+ return;
+ }
+
+ mutex_exit(&p->p_lock);
+
+ /* Iterate over all segments in the address space */
+ AS_LOCK_ENTER(as, RW_READER);
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+ vnode_t *vp;
+ uint_t protbits;
+
+ if ((seg->s_flags & S_HOLE) != 0) {
+ continue;
+ }
+
+ pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP);
+
+ pbuf->saddr = seg->s_base;
+ pbuf->eaddr = seg->s_base+seg->s_size;
+ pbuf->type = SEGOP_GETTYPE(seg, seg->s_base);
+
+ /*
+ * Cheat and only use the protection bits of the first page
+ * in the segment
+ */
+ (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot));
+ (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits);
+
+ if (protbits & PROT_READ) pbuf->prot[0] = 'r';
+ if (protbits & PROT_WRITE) pbuf->prot[1] = 'w';
+ if (protbits & PROT_EXEC) pbuf->prot[2] = 'x';
+ if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's';
+ else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p';
+
+ if (seg->s_ops == &segvn_ops &&
+ SEGOP_GETVP(seg, seg->s_base, &vp) == 0 &&
+ vp != NULL && vp->v_type == VREG) {
+ VN_HOLD(vp);
+ pbuf->vp = vp;
+ } else {
+ pbuf->vp = NULL;
+ }
+
+ pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr);
+
+ pbuf->next = NULL;
+ *print_tail = pbuf;
+ print_tail = &pbuf->next;
+ }
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+
+ buf = kmem_alloc(buflen, KM_SLEEP);
+
+ /* print the data we've extracted */
+ pbuf = print_head;
+ while (pbuf != NULL) {
+ struct print_data *pbuf_next;
+ vattr_t vattr;
+
+ int maj = 0;
+ int min = 0;
+ u_longlong_t inode = 0;
+
+ *buf = '\0';
+ if (pbuf->vp != NULL) {
+ vattr.va_mask = AT_FSID | AT_NODEID;
+ if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(),
+ NULL) == 0) {
+ maj = getmajor(vattr.va_fsid);
+ min = getminor(vattr.va_fsid);
+ inode = vattr.va_nodeid;
+ }
+ (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED());
+ VN_RELE(pbuf->vp);
+ }
+
+ if (*buf != '\0') {
+ lxpr_uiobuf_printf(uiobuf,
+ "%08x-%08x %s %08x %02d:%03d %lld %s\n",
+ pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset,
+ maj, min, inode, buf);
+ } else {
+ lxpr_uiobuf_printf(uiobuf,
+ "%08x-%08x %s %08x %02d:%03d %lld\n",
+ pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset,
+ maj, min, inode);
+ }
+
+ pbuf_next = pbuf->next;
+ kmem_free(pbuf, sizeof (*pbuf));
+ pbuf = pbuf_next;
+ }
+
+ kmem_free(buf, buflen);
+}
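+
+/*
+ * Illustrative output (hypothetical values): a file-backed segment is
+ * printed by the format string above as e.g.
+ *   08048000-0804c000 r-xp 00000000 00:000 12345 /usr/bin/ls
+ * while an anonymous segment produces the same line without the
+ * trailing path.
+ */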
+
+/*
+ * lxpr_read_pid_statm(): memory status file
+ */
+static void
+lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ struct as *as;
+ size_t vsize;
+ size_t rss;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ as = p->p_as;
+
+ mutex_exit(&p->p_lock);
+
+ AS_LOCK_ENTER(as, RW_READER);
+ vsize = btopr(as->a_resvsize);
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as);
+
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%lu %lu %lu %lu %lu %lu %lu\n",
+ vsize, rss, 0l, rss, 0l, 0l, 0l);
+}
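+
+/*
+ * Field note (illustrative): Linux statm is "size resident shared text
+ * lib data dt", all in pages. Only size (vsize) and resident (rss) are
+ * real here; text is approximated by rss again and the remaining fields
+ * are reported as 0.
+ */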
+
+/*
+ * lxpr_read_pid_status(): status file
+ */
+static void
+lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ kthread_t *t;
+ user_t *up;
+ cred_t *cr;
+ const gid_t *groups;
+ int ngroups;
+ struct as *as;
+ char *status;
+ pid_t pid, ppid;
+ size_t vsize;
+ size_t rss;
+ k_sigset_t current, ignore, handle;
+ int i, lx_sig;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ pid = p->p_pid;
+
+ /*
+ * Convert pid to the Linux default of 1 if we're the zone's init
+ * process
+ */
+ if (pid == curproc->p_zone->zone_proc_initpid) {
+ pid = 1;
+ ppid = 0; /* parent pid for init is 0 */
+ } else {
+ /*
+ * Make sure not to reference parent PIDs that reside outside
+ * the zone
+ */
+ ppid = ((p->p_flag & SZONETOP)
+ ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+ /*
+ * Convert ppid to the Linux default of 1 if our parent is the
+ * zone's init process
+ */
+ if (ppid == curproc->p_zone->zone_proc_initpid)
+ ppid = 1;
+ }
+
+ t = prchoose(p);
+ if (t != NULL) {
+ switch (t->t_state) {
+ case TS_SLEEP:
+ status = "S (sleeping)";
+ break;
+ case TS_RUN:
+ case TS_ONPROC:
+ status = "R (running)";
+ break;
+ case TS_ZOMB:
+ status = "Z (zombie)";
+ break;
+ case TS_STOPPED:
+ status = "T (stopped)";
+ break;
+ default:
+ status = "! (unknown)";
+ break;
+ }
+ thread_unlock(t);
+ } else {
+ /*
+ * There is a hole in the exit code where a proc can have
+ * no threads but has yet to be flagged SZOMB. We will
+ * assume it is about to become a zombie.
+ */
+ status = "Z (zombie)";
+ }
+
+ up = PTOU(p);
+ mutex_enter(&p->p_crlock);
+ crhold(cr = p->p_cred);
+ mutex_exit(&p->p_crlock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "Name:\t%s\n"
+ "State:\t%s\n"
+ "Tgid:\t%d\n"
+ "Pid:\t%d\n"
+ "PPid:\t%d\n"
+ "TracerPid:\t%d\n"
+ "Uid:\t%u\t%u\t%u\t%u\n"
+ "Gid:\t%u\t%u\t%u\t%u\n"
+ "FDSize:\t%d\n"
+ "Groups:\t",
+ up->u_comm,
+ status,
+ pid, /* thread group id - same as pid */
+ pid,
+ ppid,
+ 0,
+ crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr),
+ crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr),
+ p->p_fno_ctl);
+
+ ngroups = crgetngroups(cr);
+ groups = crgetgroups(cr);
+ for (i = 0; i < ngroups; i++) {
+ lxpr_uiobuf_printf(uiobuf,
+ "%u ",
+ groups[i]);
+ }
+ crfree(cr);
+
+ as = p->p_as;
+ if ((p->p_stat != SZOMB) && !(p->p_flag & (SSYS | SEXITING)) &&
+ (as != &kas)) {
+ mutex_exit(&p->p_lock);
+ AS_LOCK_ENTER(as, RW_READER);
+ vsize = as->a_resvsize;
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "\n"
+ "VmSize:\t%8lu kB\n"
+ "VmLck:\t%8lu kB\n"
+ "VmRSS:\t%8lu kB\n"
+ "VmData:\t%8lu kB\n"
+ "VmStk:\t%8lu kB\n"
+ "VmExe:\t%8lu kB\n"
+ "VmLib:\t%8lu kB",
+ btok(vsize),
+ 0l,
+ ptok(rss),
+ 0l,
+ btok(p->p_stksize),
+ ptok(rss),
+ 0l);
+ }
+
+ sigemptyset(&current);
+ sigemptyset(&ignore);
+ sigemptyset(&handle);
+
+ for (i = 1; i < NSIG; i++) {
+ lx_sig = lxpr_sigmap[i];
+
+ if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) {
+ if (sigismember(&p->p_sig, i))
+ sigaddset(&current, lx_sig);
+
+ if (up->u_signal[i - 1] == SIG_IGN)
+ sigaddset(&ignore, lx_sig);
+ else if (up->u_signal[i - 1] != SIG_DFL)
+ sigaddset(&handle, lx_sig);
+ }
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "\n"
+ "SigPnd:\t%08x%08x\n"
+ "SigBlk:\t%08x%08x\n"
+ "SigIgn:\t%08x%08x\n"
+ "SigCgt:\t%08x%08x\n"
+ "CapInh:\t%016x\n"
+ "CapPrm:\t%016x\n"
+ "CapEff:\t%016x\n",
+ current.__sigbits[1], current.__sigbits[0],
+ 0, 0, /* signals blocked on a per-thread basis */
+ ignore.__sigbits[1], ignore.__sigbits[0],
+ handle.__sigbits[1], handle.__sigbits[0],
+ /* Can't do anything with linux capabilities */
+ 0,
+ 0,
+ 0);
+
+ lxpr_unlock(p);
+}
+
+
+/*
+ * lxpr_read_pid_stat(): pid stat file
+ */
+static void
+lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ kthread_t *t;
+ struct as *as;
+ char stat;
+ pid_t pid, ppid, pgpid, spid;
+ gid_t psgid;
+ dev_t psdev;
+ size_t rss, vsize;
+ int nice, pri;
+ caddr_t wchan;
+ processorid_t cpu;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ pid = p->p_pid;
+
+ /*
+ * Set Linux defaults if we're the zone's init process
+ */
+ if (pid == curproc->p_zone->zone_proc_initpid) {
+ pid = 1; /* PID for init */
+ ppid = 0; /* parent PID for init is 0 */
+ pgpid = 0; /* process group for init is 0 */
+ psgid = (gid_t)-1; /* credential GID for init is -1 */
+ spid = 0; /* session id for init is 0 */
+ psdev = 0; /* session device for init is 0 */
+ } else {
+ /*
+ * Make sure not to reference parent PIDs that reside outside
+ * the zone
+ */
+ ppid = ((p->p_flag & SZONETOP) ?
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+ /*
+ * Convert ppid to the Linux default of 1 if our parent is the
+ * zone's init process
+ */
+ if (ppid == curproc->p_zone->zone_proc_initpid)
+ ppid = 1;
+
+ pgpid = p->p_pgrp;
+
+ mutex_enter(&p->p_splock);
+ mutex_enter(&p->p_sessp->s_lock);
+ spid = p->p_sessp->s_sid;
+ psdev = p->p_sessp->s_dev;
+ if (p->p_sessp->s_cred)
+ psgid = crgetgid(p->p_sessp->s_cred);
+ else
+ psgid = crgetgid(p->p_cred);
+
+ mutex_exit(&p->p_sessp->s_lock);
+ mutex_exit(&p->p_splock);
+ }
+
+ t = prchoose(p);
+ if (t != NULL) {
+ switch (t->t_state) {
+ case TS_SLEEP:
+ stat = 'S'; break;
+ case TS_RUN:
+ case TS_ONPROC:
+ stat = 'R'; break;
+ case TS_ZOMB:
+ stat = 'Z'; break;
+ case TS_STOPPED:
+ stat = 'T'; break;
+ default:
+ stat = '!'; break;
+ }
+
+ if (CL_DONICE(t, NULL, 0, &nice) != 0)
+ nice = 0;
+
+ pri = t->t_pri;
+ wchan = t->t_wchan;
+ cpu = t->t_cpu->cpu_id;
+ thread_unlock(t);
+ } else {
+ /* Only zombies have no threads */
+ stat = 'Z';
+ nice = 0;
+ pri = 0;
+ wchan = 0;
+ cpu = 0;
+ }
+ as = p->p_as;
+ mutex_exit(&p->p_lock);
+ AS_LOCK_ENTER(as, RW_READER);
+ vsize = as->a_resvsize;
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%d (%s) %c %d %d %d %d %d "
+ "%lu %lu %lu %lu %lu "
+ "%lu %lu %ld %ld "
+ "%d %d %d "
+ "%lu "
+ "%lu "
+ "%lu %ld %llu "
+ "%lu %lu %u "
+ "%lu %lu "
+ "%lu %lu %lu %lu "
+ "%lu "
+ "%lu %lu "
+ "%d "
+ "%d"
+ "\n",
+ pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid,
+ 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */
+ p->p_utime, p->p_stime, p->p_cutime, p->p_cstime,
+ pri, nice, p->p_lwpcnt,
+ 0l, /* itrealvalue (time before next SIGALRM) */
+ PTOU(p)->u_ticks,
+ vsize, rss, p->p_vmem_ctl,
+ 0l, 0l, USRSTACK, /* startcode, endcode, startstack */
+ 0l, 0l, /* kstkesp, kstkeip */
+ 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */
+ wchan,
+ 0l, 0l, /* nswap, cnswap */
+ 0, /* exit_signal */
+ cpu);
+
+ lxpr_unlock(p);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf, "Inter-| Receive "
+ " | Transmit\n");
+ lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo"
+ " frame compressed multicast|bytes packets errs drop fifo"
+ " colls carrier compressed\n");
+
+ /*
+ * Data about each interface should go here, but it shouldn't be added
+ * unless there is an lxproc reader that actually makes use of it (and
+ * doesn't need anything else that we refuse to provide)...
+ */
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_kmsg(): read the contents of the kernel message queue. We
+ * translate this into the reception of console messages for this zone; each
+ * read copies out a single zone console message, or blocks until the next one
+ * is produced.
+ */
+
+#define LX_KMSG_PRI "<0>"
+
+static void
+lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh)
+{
+ mblk_t *mp;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_KMSG);
+
+ if (ldi_getmsg(lh, &mp, NULL) == 0) {
+ /*
+ * lxproc doesn't like successive reads to the same file
+ * descriptor unless we do an explicit rewind each time.
+ */
+ lxpr_uiobuf_seek(uiobuf, 0);
+
+ lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI,
+ mp->b_cont->b_rptr);
+
+ freemsg(mp);
+ }
+}
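+
+/*
+ * Illustrative note: each read returns at most one console message and
+ * prefixes it with the fixed priority LX_KMSG_PRI, so a zone console
+ * line "foo" is presented to the reader as "<0>foo".
+ */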
+
+/*
+ * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just
+ * enough for uptime and other simple lxproc readers to work
+ */
+extern int nthread;
+
+static void
+lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ulong_t avenrun1;
+ ulong_t avenrun5;
+ ulong_t avenrun15;
+ ulong_t avenrun1_cs;
+ ulong_t avenrun5_cs;
+ ulong_t avenrun15_cs;
+ int loadavg[3];
+ int *loadbuf;
+ cpupart_t *cp;
+ zone_t *zone = LXPTOZ(lxpnp);
+
+ uint_t nrunnable = 0;
+ rctl_qty_t nlwps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG);
+
+ mutex_enter(&cpu_lock);
+
+ /*
+ * Need to add up values over all CPU partitions. If pools are active,
+ * only report the values of the zone's partition, which by definition
+ * includes the current CPU.
+ */
+ if (pool_pset_enabled()) {
+ psetid_t psetid = zone_pset_get(curproc->p_zone);
+
+ ASSERT(curproc->p_zone != &zone0);
+ cp = CPU->cpu_part;
+
+ nrunnable = cp->cp_nrunning + cp->cp_nrunnable;
+ (void) cpupart_get_loadavg(psetid, &loadavg[0], 3);
+ loadbuf = &loadavg[0];
+ } else {
+ cp = cp_list_head;
+ do {
+ nrunnable += cp->cp_nrunning + cp->cp_nrunnable;
+ } while ((cp = cp->cp_next) != cp_list_head);
+
+ loadbuf = zone == global_zone ?
+ &avenrun[0] : zone->zone_avenrun;
+ }
+
+ /*
+ * If we're in the non-global zone, we'll report the total number of
+ * LWPs in the zone for the "nproc" parameter of /proc/loadavg,
+ * otherwise we'll just use nthread (which will include kernel threads,
+ * but should be good enough for lxproc).
+ */
+ nlwps = zone == global_zone ? nthread : zone->zone_nlwps;
+
+ mutex_exit(&cpu_lock);
+
+ avenrun1 = loadbuf[0] >> FSHIFT;
+ avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT;
+ avenrun5 = loadbuf[1] >> FSHIFT;
+ avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT;
+ avenrun15 = loadbuf[2] >> FSHIFT;
+ avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n",
+ avenrun1, avenrun1_cs,
+ avenrun5, avenrun5_cs,
+ avenrun15, avenrun15_cs,
+ nrunnable, nlwps, 0);
+}
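+
+/*
+ * Worked example (illustrative, assuming FSHIFT == 8, i.e. FSCALE ==
+ * 256): a stored average of 448 decodes as 448 >> 8 = 1 for the integer
+ * part and ((448 & 255) * 100) >> 8 = 75 for the centi part, which is
+ * printed as "1.75".
+ */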
+
+/*
+ * lxpr_read_meminfo(): read the contents of the "meminfo" file.
+ */
+static void
+lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ int global = zone == global_zone;
+ ulong_t total_mem, free_mem, total_swap, used_swap;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO);
+
+ zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem,
+ (pgcnt_t *)&free_mem);
+ total_mem = ptob(total_mem);
+ free_mem = ptob(free_mem);
+
+ if (global || zone->zone_max_swap_ctl == UINT64_MAX) {
+ total_swap = ptob(k_anoninfo.ani_max);
+ used_swap = ptob(k_anoninfo.ani_phys_resv);
+ } else {
+ mutex_enter(&zone->zone_mem_lock);
+ total_swap = zone->zone_max_swap_ctl;
+ used_swap = zone->zone_max_swap;
+ mutex_exit(&zone->zone_mem_lock);
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ " total: used: free: shared: buffers: cached:\n"
+ "Mem: %8lu %8lu %8lu %8u %8u %8u\n"
+ "Swap: %8lu %8lu %8lu\n"
+ "MemTotal: %8lu kB\n"
+ "MemFree: %8lu kB\n"
+ "MemShared: %8u kB\n"
+ "Buffers: %8u kB\n"
+ "Cached: %8u kB\n"
+ "SwapCached:%8u kB\n"
+ "Active: %8u kB\n"
+ "Inactive: %8u kB\n"
+ "HighTotal: %8u kB\n"
+ "HighFree: %8u kB\n"
+ "LowTotal: %8u kB\n"
+ "LowFree: %8u kB\n"
+ "SwapTotal: %8lu kB\n"
+ "SwapFree: %8lu kB\n",
+ total_mem, total_mem - free_mem, free_mem, 0, 0, 0,
+ total_swap, used_swap, total_swap - used_swap,
+ btok(total_mem), /* MemTotal */
+ btok(free_mem), /* MemFree */
+ 0, /* MemShared */
+ 0, /* Buffers */
+ 0, /* Cached */
+ 0, /* SwapCached */
+ 0, /* Active */
+ 0, /* Inactive */
+ 0, /* HighTotal */
+ 0, /* HighFree */
+ btok(total_mem), /* LowTotal */
+ btok(free_mem), /* LowFree */
+ btok(total_swap), /* SwapTotal */
+ btok(total_swap - used_swap)); /* SwapFree */
+}
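+
+/*
+ * Unit note (illustrative): zone_get_physmem_data() returns page
+ * counts, ptob() scales pages to bytes and btok() scales bytes to kB,
+ * so with 4 kB pages a cap of 262144 pages is reported as
+ * "MemTotal: 1048576 kB".
+ */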
+
+/*
+ * lxpr_read_mounts():
+ */
+/* ARGSUSED */
+static void
+lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ struct vfs *vfsp;
+ struct vfs *vfslist;
+ zone_t *zone = LXPTOZ(lxpnp);
+ struct print_data {
+ refstr_t *vfs_mntpt;
+ refstr_t *vfs_resource;
+ uint_t vfs_flag;
+ int vfs_fstype;
+ struct print_data *next;
+ } *print_head = NULL;
+ struct print_data **print_tail = &print_head;
+ struct print_data *printp;
+
+ vfs_list_read_lock();
+
+ if (zone == global_zone) {
+ vfsp = vfslist = rootvfs;
+ } else {
+ vfsp = vfslist = zone->zone_vfslist;
+ /*
+ * If the zone has a root entry, it will be the first in
+ * the list. If it doesn't, we conjure one up.
+ */
+ if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt),
+ zone->zone_rootpath) != 0) {
+ struct vfs *tvfsp;
+ /*
+ * The root of the zone is not a mount point. The vfs
+ * we want to report is that of the zone's root vnode.
+ */
+ tvfsp = zone->zone_rootvp->v_vfsp;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "/ / %s %s 0 0\n",
+ vfssw[tvfsp->vfs_fstype].vsw_name,
+ tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw");
+
+ }
+ if (vfslist == NULL) {
+ vfs_list_unlock();
+ return;
+ }
+ }
+
+ /*
+ * Later on we have to do a lookupname(), which can end up causing
+ * another vfs_list_read_lock() to be taken, which can lead to a
+ * deadlock. To avoid this, we extract the data we need into a local
+ * list, then walk that list without holding vfs_list_read_lock().
+ * We keep the list in the same order as the vfs_list.
+ */
+ do {
+ /* Skip mounts we shouldn't show */
+ if (vfsp->vfs_flag & VFS_NOMNTTAB) {
+ goto nextfs;
+ }
+
+ printp = kmem_alloc(sizeof (*printp), KM_SLEEP);
+ refstr_hold(vfsp->vfs_mntpt);
+ printp->vfs_mntpt = vfsp->vfs_mntpt;
+ refstr_hold(vfsp->vfs_resource);
+ printp->vfs_resource = vfsp->vfs_resource;
+ printp->vfs_flag = vfsp->vfs_flag;
+ printp->vfs_fstype = vfsp->vfs_fstype;
+ printp->next = NULL;
+
+ *print_tail = printp;
+ print_tail = &printp->next;
+
+nextfs:
+ vfsp = (zone == global_zone) ?
+ vfsp->vfs_next : vfsp->vfs_zone_next;
+
+ } while (vfsp != vfslist);
+
+ vfs_list_unlock();
+
+ /*
+ * now we can run through what we've extracted without holding
+ * vfs_list_read_lock()
+ */
+ printp = print_head;
+ while (printp != NULL) {
+ struct print_data *printp_next;
+ const char *resource;
+ char *mntpt;
+ struct vnode *vp;
+ int error;
+
+ mntpt = (char *)refstr_value(printp->vfs_mntpt);
+ resource = refstr_value(printp->vfs_resource);
+
+ if (mntpt != NULL && mntpt[0] != '\0')
+ mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
+ else
+ mntpt = "-";
+
+ error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+
+ if (error != 0)
+ goto nextp;
+
+ if (!(vp->v_flag & VROOT)) {
+ VN_RELE(vp);
+ goto nextp;
+ }
+ VN_RELE(vp);
+
+ if (resource != NULL && resource[0] != '\0') {
+ if (resource[0] == '/') {
+ resource = ZONE_PATH_VISIBLE(resource, zone) ?
+ ZONE_PATH_TRANSLATE(resource, zone) :
+ mntpt;
+ }
+ } else {
+ resource = "-";
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%s %s %s %s 0 0\n",
+ resource, mntpt, vfssw[printp->vfs_fstype].vsw_name,
+ printp->vfs_flag & VFS_RDONLY ? "ro" : "rw");
+
+nextp:
+ printp_next = printp->next;
+ refstr_rele(printp->vfs_mntpt);
+ refstr_rele(printp->vfs_resource);
+ kmem_free(printp, sizeof (*printp));
+ printp = printp_next;
+
+ }
+}
+
+/*
+ * lxpr_read_partitions():
+ *
+ * We don't support partitions in a local zone because it requires access to
+ * physical devices. But we need to fake up enough of the file to show that we
+ * have no partitions.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf,
+ "major minor #blocks name rio rmerge rsect ruse "
+ "wio wmerge wsect wuse running use aveq\n\n");
+}
+
+/*
+ * lxpr_read_version(): read the contents of the "version" file. Note that
+ * we don't lie here -- we don't pretend that we're Linux. If lxproc is to
+ * be used in a Linux-branded zone, there will need to be a mount option to
+ * indicate that Linux should be more fully mimicked.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf,
+ "%s version %s (%s version %d.%d.%d) "
+ "#%s SMP %s\n",
+ utsname.sysname, utsname.release,
+#if defined(__GNUC__)
+ "gcc",
+ __GNUC__,
+ __GNUC_MINOR__,
+ __GNUC_PATCHLEVEL__,
+#else
+ "Sun C",
+ __SUNPRO_C / 0x100,
+ (__SUNPRO_C & 0xff) / 0x10,
+ __SUNPRO_C & 0xf,
+#endif
+ utsname.version,
+ "00:00:00 00/00/00");
+}
+
+/*
+ * lxpr_read_stat(): read the contents of the "stat" file.
+ *
+ */
+/* ARGSUSED */
+static void
+lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ ulong_t idle_cum = 0;
+ ulong_t sys_cum = 0;
+ ulong_t user_cum = 0;
+ ulong_t irq_cum = 0;
+ ulong_t cpu_nrunnable_cum = 0;
+ ulong_t w_io_cum = 0;
+
+ ulong_t pgpgin_cum = 0;
+ ulong_t pgpgout_cum = 0;
+ ulong_t pgswapout_cum = 0;
+ ulong_t pgswapin_cum = 0;
+ ulong_t intr_cum = 0;
+ ulong_t pswitch_cum = 0;
+ ulong_t forks_cum = 0;
+ hrtime_t msnsecs[NCMSTATES];
+
+ /* temporary variable since scalehrtime modifies data in place */
+ hrtime_t tmptime;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_STAT);
+
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ /* Calculate cumulative stats */
+ cp = cpstart = CPU->cpu_part->cp_cpulist;
+ do {
+ int i;
+
+ /*
+ * Don't count CPUs that aren't even in the system
+ * or aren't up yet.
+ */
+ if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+ continue;
+ }
+
+ get_cpu_mstate(cp, msnsecs);
+
+ idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+ sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+ user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+ pgpgin_cum += CPU_STATS(cp, vm.pgpgin);
+ pgpgout_cum += CPU_STATS(cp, vm.pgpgout);
+ pgswapin_cum += CPU_STATS(cp, vm.pgswapin);
+ pgswapout_cum += CPU_STATS(cp, vm.pgswapout);
+
+ cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable;
+ w_io_cum += CPU_STATS(cp, sys.iowait);
+ for (i = 0; i < NCMSTATES; i++) {
+ tmptime = cp->cpu_intracct[i];
+ scalehrtime(&tmptime);
+ irq_cum += NSEC_TO_TICK(tmptime);
+ }
+
+ for (i = 0; i < PIL_MAX; i++)
+ intr_cum += CPU_STATS(cp, sys.intr[i]);
+
+ pswitch_cum += CPU_STATS(cp, sys.pswitch);
+ forks_cum += CPU_STATS(cp, sys.sysfork);
+ forks_cum += CPU_STATS(cp, sys.sysvfork);
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n",
+ user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L);
+
+ /* Do per processor stats */
+ do {
+ int i;
+
+ ulong_t idle_ticks;
+ ulong_t sys_ticks;
+ ulong_t user_ticks;
+ ulong_t irq_ticks = 0;
+
+ /*
+ * Don't count CPUs that aren't even in the system
+ * or aren't up yet.
+ */
+ if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+ continue;
+ }
+
+ get_cpu_mstate(cp, msnsecs);
+
+ idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+ sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+ user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+ for (i = 0; i < NCMSTATES; i++) {
+ tmptime = cp->cpu_intracct[i];
+ scalehrtime(&tmptime);
+ irq_ticks += NSEC_TO_TICK(tmptime);
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "cpu%d %lu %lu %lu %lu %lu %lu %lu\n",
+ cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks,
+ 0L, irq_ticks, 0L);
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ mutex_exit(&cpu_lock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "page %lu %lu\n"
+ "swap %lu %lu\n"
+ "intr %lu\n"
+ "ctxt %lu\n"
+ "btime %lu\n"
+ "processes %lu\n"
+ "procs_running %lu\n"
+ "procs_blocked %lu\n",
+ pgpgin_cum, pgpgout_cum,
+ pgswapin_cum, pgswapout_cum,
+ intr_cum,
+ pswitch_cum,
+ boot_time,
+ forks_cum,
+ cpu_nrunnable_cum,
+ w_io_cum);
+}
+
+/*
+ * lxpr_read_uptime(): read the contents of the "uptime" file.
+ *
+ * format is: "%.2lf %.2lf", uptime_secs, idle_secs
+ * Use fixed-point arithmetic to get 2 decimal places
+ */
+/* ARGSUSED */
+static void
+lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ ulong_t idle_cum = 0;
+ ulong_t cpu_count = 0;
+ ulong_t idle_s;
+ ulong_t idle_cs;
+ ulong_t up_s;
+ ulong_t up_cs;
+ hrtime_t birthtime;
+ hrtime_t centi_sec = 10000000; /* 10^7 */
+
+ ASSERT(lxpnp->lxpr_type == LXPR_UPTIME);
+
+ /* Calculate cumulative stats */
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ cp = cpstart = CPU->cpu_part->cp_cpulist;
+ do {
+ /*
+ * Don't count CPUs that aren't even in the system
+ * or aren't up yet.
+ */
+ if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+ continue;
+ }
+
+ idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle);
+ idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait);
+ cpu_count += 1;
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+ mutex_exit(&cpu_lock);
+
+ /* Get the zone's zsched process startup time */
+ birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart;
+ up_cs = (gethrtime() - birthtime) / centi_sec;
+ up_s = up_cs / 100;
+ up_cs %= 100;
+
+ ASSERT(cpu_count > 0);
+ idle_cum /= cpu_count;
+ idle_s = idle_cum / hz;
+ idle_cs = idle_cum % hz;
+ idle_cs *= 100;
+ idle_cs /= hz;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs);
+}
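+
+/*
+ * Worked example (illustrative): with hz == 100 and an average of
+ * 12345 idle ticks per CPU, idle_s = 12345 / 100 = 123 and
+ * idle_cs = ((12345 % 100) * 100) / 100 = 45, printed as "123.45".
+ */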
+
+static const char *amd_x_edx[] = {
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "syscall",
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "mp",
+ "nx", NULL, "mmxext", NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, "lm", "3dnowext", "3dnow"
+};
+
+static const char *amd_x_ecx[] = {
+ "lahf_lm", NULL, "svm", NULL,
+ "altmovcr8"
+};
+
+static const char *tm_x_edx[] = {
+ "recovery", "longrun", NULL, "lrti"
+};
+
+/*
+ * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx."
+ */
+static const char *intc_x_edx[] = {
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "syscall",
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ "nx", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, "lm", NULL, NULL
+};
+
+static const char *intc_edx[] = {
+ "fpu", "vme", "de", "pse",
+ "tsc", "msr", "pae", "mce",
+ "cx8", "apic", NULL, "sep",
+ "mtrr", "pge", "mca", "cmov",
+ "pat", "pse36", "pn", "clflush",
+ NULL, "dts", "acpi", "mmx",
+ "fxsr", "sse", "sse2", "ss",
+ "ht", "tm", "ia64", "pbe"
+};
+
+/*
+ * "sse3" on linux is called "pni" (Prescott New Instructions).
+ */
+static const char *intc_ecx[] = {
+ "pni", NULL, NULL, "monitor",
+ "ds_cpl", NULL, NULL, "est",
+ "tm2", NULL, "cid", NULL,
+ NULL, "cx16", "xtpr"
+};
+
+static void
+lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ int i;
+ uint32_t bits;
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ const char **fp;
+ char brandstr[CPU_IDSTRLEN];
+ struct cpuid_regs cpr;
+ int maxeax;
+ int std_ecx, std_edx, ext_ecx, ext_edx;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO);
+
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ cp = cpstart = CPU->cpu_part->cp_cpulist;
+ do {
+ /*
+ * This returns the maximum eax value for standard cpuid
+ * functions in eax.
+ */
+ cpr.cp_eax = 0;
+ (void) cpuid_insn(cp, &cpr);
+ maxeax = cpr.cp_eax;
+
+ /*
+ * Get standard x86 feature flags.
+ */
+ cpr.cp_eax = 1;
+ (void) cpuid_insn(cp, &cpr);
+ std_ecx = cpr.cp_ecx;
+ std_edx = cpr.cp_edx;
+
+ /*
+ * Now get extended feature flags.
+ */
+ cpr.cp_eax = 0x80000001;
+ (void) cpuid_insn(cp, &cpr);
+ ext_ecx = cpr.cp_ecx;
+ ext_edx = cpr.cp_edx;
+
+ (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "processor\t: %d\n"
+ "vendor_id\t: %s\n"
+ "cpu family\t: %d\n"
+ "model\t\t: %d\n"
+ "model name\t: %s\n"
+ "stepping\t: %d\n"
+ "cpu MHz\t\t: %u.%03u\n",
+ cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp),
+ cpuid_getmodel(cp), brandstr, cpuid_getstep(cp),
+ (uint32_t)(cpu_freq_hz / 1000000),
+ ((uint32_t)(cpu_freq_hz / 1000)) % 1000);
+
+ lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n",
+ getl2cacheinfo(cp, NULL, NULL, NULL) / 1024);
+
+ if (is_x86_feature(x86_featureset, X86FSET_HTT)) {
+ /*
+ * 'siblings' is used for HT-style threads
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "physical id\t: %lu\n"
+ "siblings\t: %u\n",
+ pg_plat_hw_instance_id(cp, PGHW_CHIP),
+ cpuid_get_ncpu_per_chip(cp));
+ }
+
+ /*
+ * Since we're relatively picky about running on older hardware,
+ * we can be somewhat cavalier about the answers to these ones.
+ *
+ * In fact, given the hardware we support, we just say:
+ *
+ * fdiv_bug : no (if we're on a 64-bit kernel)
+ * hlt_bug : no
+ * f00f_bug : no
+ * coma_bug : no
+ * wp : yes (write protect in supervisor mode)
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "fdiv_bug\t: %s\n"
+ "hlt_bug \t: no\n"
+ "f00f_bug\t: no\n"
+ "coma_bug\t: no\n"
+ "fpu\t\t: %s\n"
+ "fpu_exception\t: %s\n"
+ "cpuid level\t: %d\n"
+ "flags\t\t:",
+#if defined(__i386)
+ fpu_pentium_fdivbug ? "yes" : "no",
+#else
+ "no",
+#endif /* __i386 */
+ fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no",
+ maxeax);
+
+ for (bits = std_edx, fp = intc_edx, i = 0;
+ i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+ /*
+ * name additional features where appropriate
+ */
+ switch (x86_vendor) {
+ case X86_VENDOR_Intel:
+ for (bits = ext_edx, fp = intc_x_edx, i = 0;
+ i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+ break;
+
+ case X86_VENDOR_AMD:
+ for (bits = ext_edx, fp = amd_x_edx, i = 0;
+ i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+ for (bits = ext_ecx, fp = amd_x_ecx, i = 0;
+ i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+ break;
+
+ case X86_VENDOR_TM:
+ for (bits = ext_edx, fp = tm_x_edx, i = 0;
+ i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+ break;
+ default:
+ break;
+ }
+
+ for (bits = std_ecx, fp = intc_ecx, i = 0;
+ i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+ lxpr_uiobuf_printf(uiobuf, "\n\n");
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ mutex_exit(&cpu_lock);
+}
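+
+/*
+ * Illustrative example: if std_edx has bit 0 (fpu) and bit 25 (sse)
+ * set, the loops above append " fpu sse" to the flags line; table slots
+ * that are NULL (features Linux does not name) are simply skipped.
+ */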
+
+/* ARGSUSED */
+static void
+lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD);
+ lxpr_uiobuf_seterr(uiobuf, EFAULT);
+}
+
+/*
+ * lxpr_getattr(): Vnode operation for VOP_GETATTR()
+ */
+static int
+lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ register lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ extern uint_t nproc;
+ int error;
+
+ /*
+ * Return the attributes of the underlying vnode if ATTR_REAL is set,
+ * but keep fd files with the symlink permissions.
+ */
+ if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) {
+ vnode_t *rvp = lxpnp->lxpr_realvp;
+
+ /*
+ * withhold attribute information from anyone but owner or root
+ */
+ if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * now fetch its attributes
+ */
+ if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * if it's a file in lx /proc/pid/fd/xx then set its
+ * mode and keep it looking like a symlink
+ */
+ if (type == LXPR_PID_FD_FD) {
+ vap->va_mode = lxpnp->lxpr_mode;
+ vap->va_type = vp->v_type;
+ vap->va_size = 0;
+ vap->va_nlink = 1;
+ }
+ return (0);
+ }
+
+ /* Default attributes, that may be overridden below */
+ bzero(vap, sizeof (*vap));
+ vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time;
+ vap->va_nlink = 1;
+ vap->va_type = vp->v_type;
+ vap->va_mode = lxpnp->lxpr_mode;
+ vap->va_fsid = vp->v_vfsp->vfs_dev;
+ vap->va_blksize = DEV_BSIZE;
+ vap->va_uid = lxpnp->lxpr_uid;
+ vap->va_gid = lxpnp->lxpr_gid;
+ vap->va_nodeid = lxpnp->lxpr_ino;
+
+ switch (type) {
+ case LXPR_PROCDIR:
+ vap->va_nlink = nproc + 2 + PROCDIRFILES;
+ vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE;
+ break;
+ case LXPR_PIDDIR:
+ vap->va_nlink = PIDDIRFILES;
+ vap->va_size = PIDDIRFILES * LXPR_SDSIZE;
+ break;
+ case LXPR_SELF:
+ vap->va_uid = crgetruid(curproc->p_cred);
+ vap->va_gid = crgetrgid(curproc->p_cred);
+ break;
+ default:
+ break;
+ }
+
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
+ return (0);
+}
+
+/*
+ * lxpr_access(): Vnode operation for VOP_ACCESS()
+ */
+static int
+lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
+{
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ int shift = 0;
+ proc_t *tp;
+
+ /* lx /proc is a read only file system */
+ if (mode & VWRITE)
+ return (EROFS);
+
+ /*
+ * If this is a restricted file, check access permissions.
+ */
+ switch (lxpnp->lxpr_type) {
+ case LXPR_PIDDIR:
+ return (0);
+ case LXPR_PID_CURDIR:
+ case LXPR_PID_ENV:
+ case LXPR_PID_EXE:
+ case LXPR_PID_MAPS:
+ case LXPR_PID_MEM:
+ case LXPR_PID_ROOTDIR:
+ case LXPR_PID_FDDIR:
+ case LXPR_PID_FD_FD:
+ if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL)
+ return (ENOENT);
+ if (tp != curproc && secpolicy_proc_access(cr) != 0 &&
+ priv_proc_cred_perm(cr, tp, NULL, mode) != 0) {
+ lxpr_unlock(tp);
+ return (EACCES);
+ }
+ lxpr_unlock(tp);
+ break;
+ default:
+ break;
+ }
+
+ if (lxpnp->lxpr_realvp != NULL) {
+ /*
+ * For these we use the underlying vnode's accessibility.
+ */
+ return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct));
+ }
+
+ /* If user is root allow access regardless of permission bits */
+ if (secpolicy_proc_access(cr) == 0)
+ return (0);
+
+ /*
+ * Access check is based on only one of owner, group, public. If not
+ * owner, then check group. If not a member of the group, then check
+ * public access.
+ */
+ if (crgetuid(cr) != lxpnp->lxpr_uid) {
+ shift += 3;
+ if (!groupmember((uid_t)lxpnp->lxpr_gid, cr))
+ shift += 3;
+ }
+
+ mode &= ~(lxpnp->lxpr_mode << shift);
+
+ if (mode == 0)
+ return (0);
+
+ return (EACCES);
+}
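+
+/*
+ * Worked example (illustrative): for lxpr_mode == 0444 and a caller
+ * that is neither the owner nor a group member, shift == 6, so
+ * (0444 << 6) == 044400 and a VREAD (0400) request is cleared by the
+ * mask, leaving mode == 0 and granting access; any VWRITE request was
+ * already rejected above with EROFS.
+ */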
+
+/* ARGSUSED */
+static vnode_t *
+lxpr_lookup_not_a_dir(vnode_t *dp, char *comp)
+{
+ return (NULL);
+}
+
+/*
+ * lxpr_lookup(): Vnode operation for VOP_LOOKUP()
+ */
+/* ARGSUSED */
+static int
+lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ lxpr_node_t *lxpnp = VTOLXP(dp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ int error;
+
+ ASSERT(dp->v_type == VDIR);
+ ASSERT(type < LXPR_NFILES);
+
+ /*
+ * we should never get here because the lookup
+ * is done on the realvp for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR);
+
+ /*
+ * restrict lookup permission to owner or root
+ */
+ if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * Just return the parent vnode if that's where we are trying to go.
+ */
+ if (strcmp(comp, "..") == 0) {
+ VN_HOLD(lxpnp->lxpr_parent);
+ *vpp = lxpnp->lxpr_parent;
+ return (0);
+ }
+
+ /*
+ * Special handling for directory searches. Note: null component name
+ * denotes that the current directory is being searched.
+ */
+ if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) {
+ VN_HOLD(dp);
+ *vpp = dp;
+ return (0);
+ }
+
+ *vpp = (lxpr_lookup_function[type](dp, comp));
+ return ((*vpp == NULL) ? ENOENT : 0);
+}
+
+/*
+ * Do a sequential search on the given directory table
+ */
+static vnode_t *
+lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p,
+ lxpr_dirent_t *dirtab, int dirtablen)
+{
+ lxpr_node_t *lxpnp;
+ int count;
+
+ for (count = 0; count < dirtablen; count++) {
+ if (strcmp(dirtab[count].d_name, comp) == 0) {
+ lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0);
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+ return (dp);
+ }
+ }
+ return (NULL);
+}
+
+static vnode_t *
+lxpr_lookup_piddir(vnode_t *dp, char *comp)
+{
+ proc_t *p;
+
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR);
+
+ p = lxpr_lock(VTOLXP(dp)->lxpr_pid);
+ if (p == NULL)
+ return (NULL);
+
+ dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES);
+
+ lxpr_unlock(p);
+
+ return (dp);
+}
+
+/*
+ * Lookup one of the process's open files.
+ */
+static vnode_t *
+lxpr_lookup_fddir(vnode_t *dp, char *comp)
+{
+ lxpr_node_t *dlxpnp = VTOLXP(dp);
+ lxpr_node_t *lxpnp;
+ vnode_t *vp = NULL;
+ proc_t *p;
+ file_t *fp;
+ uint_t fd;
+ int c;
+ uf_entry_t *ufp;
+ uf_info_t *fip;
+
+ ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR);
+
+ /*
+ * convert the string rendition of the filename
+ * to a file descriptor
+ */
+ fd = 0;
+ while ((c = *comp++) != '\0') {
+ int ofd;
+ if (c < '0' || c > '9')
+ return (NULL);
+
+ ofd = fd;
+ fd = 10*fd + c - '0';
+ /* integer overflow */
+ if (fd / 10 != ofd)
+ return (NULL);
+ }
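+
+ /*
+ * Illustrative note: "27" parses to fd 27, a non-digit such as
+ * "2x" is rejected outright, and an overflowing string like
+ * "99999999999" trips the fd / 10 != ofd test, so the lookup
+ * fails instead of wrapping to a bogus descriptor.
+ */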
+
+ /*
+ * get the proc to work with and lock it
+ */
+ p = lxpr_lock(dlxpnp->lxpr_pid);
+ if (p == NULL)
+ return (NULL);
+
+ /*
+ * If the process is a zombie or system process
+ * it can't have any open files.
+ */
+ if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) ||
+ (p->p_as == &kas)) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ /*
+ * get us a fresh node/vnode
+ */
+ lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd);
+
+ /*
+ * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+ * going away while we dereference into fi_list.
+ */
+ mutex_exit(&p->p_lock);
+
+ /*
+ * get open file info
+ */
+ fip = (&(p)->p_user.u_finfo);
+ mutex_enter(&fip->fi_lock);
+
+ if (fd < fip->fi_nfiles) {
+ UF_ENTER(ufp, fip, fd);
+ /*
+ * Ensure the fd is still kosher; it may have been closed
+ * between the readdir and the lookup.
+ */
+ if (fip->fi_list[fd].uf_file == NULL) {
+ mutex_exit(&fip->fi_lock);
+ UF_EXIT(ufp);
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ lxpr_freenode(lxpnp);
+ return (NULL);
+ }
+
+ if ((fp = ufp->uf_file) != NULL)
+ vp = fp->f_vnode;
+ UF_EXIT(ufp);
+ }
+ mutex_exit(&fip->fi_lock);
+
+ if (vp == NULL) {
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ lxpr_freenode(lxpnp);
+ return (NULL);
+ } else {
+ /*
+ * Fill in the lxpr_node so future references will be able to
+ * find the underlying vnode. The vnode is held on the realvp.
+ */
+ lxpnp->lxpr_realvp = vp;
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+
+ return (dp);
+}
+
+static vnode_t *
+lxpr_lookup_netdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR);
+
+ dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES);
+
+ return (dp);
+}
+
+static vnode_t *
+lxpr_lookup_procdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR);
+
+ /*
+ * We know all the names of files & dirs in our file system structure
+ * except those that are pid names. These change as pids are created/
+ * deleted etc., so we just look for a number as the first char to see
+ * if we are doing pid lookups.
+ *
+ * Don't need to check for "self" as it is implemented as a symlink
+ */
+ if (*comp >= '0' && *comp <= '9') {
+ pid_t pid = 0;
+ lxpr_node_t *lxpnp = NULL;
+ proc_t *p;
+ int c;
+
+ while ((c = *comp++) != '\0')
+ pid = 10 * pid + c - '0';
+
+ /*
+ * Can't continue if the process is still loading or it doesn't
+ * really exist yet (or maybe it just died!)
+ */
+ p = lxpr_lock(pid);
+ if (p == NULL)
+ return (NULL);
+
+ if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ /*
+ * allocate and fill in a new lxpr node
+ */
+ lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0);
+
+ lxpr_unlock(p);
+
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+
+ return (dp);
+ }
+
+ /* Lookup fixed names */
+ return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES));
+}
+
+/*
+ * lxpr_readdir(): Vnode operation for VOP_READDIR()
+ */
+/* ARGSUSED */
+static int
+lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ lxpr_node_t *lxpnp = VTOLXP(dp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ ssize_t uresid;
+ off_t uoffset;
+ int error;
+
+ ASSERT(dp->v_type == VDIR);
+ ASSERT(type < LXPR_NFILES);
+
+ /*
+ * we should never get here because the readdir
+ * is done on the realvp for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR);
+
+ /*
+ * restrict readdir permission to owner or root
+ */
+ if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0)
+ return (error);
+
+ uoffset = uiop->uio_offset;
+ uresid = uiop->uio_resid;
+
+ /* can't do negative reads */
+ if (uoffset < 0 || uresid <= 0)
+ return (EINVAL);
+
+ /* can't read directory entries that don't exist! */
+ if (uoffset % LXPR_SDSIZE)
+ return (ENOENT);
+
+ return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp));
+}
+
+/* ARGSUSED */
+static int
+lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ return (ENOTDIR);
+}
+
+/*
+ * This has the common logic for returning directory entries
+ */
+static int
+lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp,
+ lxpr_dirent_t *dirtab, int dirtablen)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+
+ oresid = uiop->uio_resid;
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /*
+ * Satisfy user request
+ */
+ while ((uresid = uiop->uio_resid) > 0) {
+ int dirindex;
+ off_t uoffset;
+ int reclen;
+ int error;
+
+ uoffset = uiop->uio_offset;
+ dirindex = (uoffset / LXPR_SDSIZE) - 2;
+
+ if (uoffset == 0) {
+
+ dirent->d_ino = lxpnp->lxpr_ino;
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '\0';
+ reclen = DIRENT64_RECLEN(1);
+
+ } else if (uoffset == LXPR_SDSIZE) {
+
+ dirent->d_ino = lxpr_parentinode(lxpnp);
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '.';
+ dirent->d_name[2] = '\0';
+ reclen = DIRENT64_RECLEN(2);
+
+ } else if (dirindex >= 0 && dirindex < dirtablen) {
+ int slen = strlen(dirtab[dirindex].d_name);
+
+ dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type,
+ lxpnp->lxpr_pid, 0);
+
+ VERIFY(slen < LXPNSIZ);
+ (void) strcpy(dirent->d_name, dirtab[dirindex].d_name);
+ reclen = DIRENT64_RECLEN(slen);
+
+ } else {
+ /* Run out of table entries */
+ if (eofp) {
+ *eofp = 1;
+ }
+ return (0);
+ }
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ /*
+ * if the size of the data to transfer is greater
+ * than that requested, then we can't do it in this transfer.
+ */
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid) {
+ return (EINVAL);
+ }
+ break;
+ }
+
+ /*
+ * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+ * by the same amount. But we want uiop->uio_offset to change
+ * in increments of LXPR_SDSIZE, which is different from the
+ * number of bytes being returned to the user. So we set
+ * uiop->uio_offset separately, ignoring what uiomove() does.
+ */
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ return (error);
+
+ uiop->uio_offset = uoffset + LXPR_SDSIZE;
+ }
+
+ /* Have run out of space, but could have just done last table entry */
+ if (eofp) {
+ *eofp =
+ (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+ return (0);
+}
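+
+/*
+ * Offset scheme example (illustrative): directory offsets advance in
+ * fixed LXPR_SDSIZE (16-byte) steps regardless of each dirent64's
+ * actual reclen, so offset 0 is ".", offset 16 is "..", and offset
+ * 16 * (n + 2) is dirtab[n]. A subsequent getdents() can therefore
+ * resume at any multiple of 16 even though the records copied out are
+ * variable-sized.
+ */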
+
+
+static int
+lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+ off_t uoffset;
+ zoneid_t zoneid;
+ pid_t pid;
+ int error;
+ int ceof;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR);
+
+ oresid = uiop->uio_resid;
+ zoneid = LXPTOZ(lxpnp)->zone_id;
+
+ /*
+ * We return directory entries in the order: "." and ".." then the
+ * unique lxproc files, then the directories corresponding to the
+ * running processes. We have defined this as the ordering because
+ * it allows us to more easily keep track of where we are between calls
+ * to getdents(). If the number of processes changes between calls
+ * then we can't lose track of where we are in the lxproc files.
+ */
+
+ /* Do the fixed entries */
+ error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir,
+ PROCDIRFILES);
+
+ /* Finished if we got an error or if we couldn't do all the table */
+ if (error != 0 || ceof == 0)
+ return (error);
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /* Do the process entries */
+ while ((uresid = uiop->uio_resid) > 0) {
+ proc_t *p;
+ int len;
+ int reclen;
+ int i;
+
+ uoffset = uiop->uio_offset;
+
+ /*
+ * Stop when entire proc table has been examined.
+ */
+ i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES;
+ if (i < 0 || i >= v.v_proc) {
+ /* Run out of table entries */
+ if (eofp) {
+ *eofp = 1;
+ }
+ return (0);
+ }
+ mutex_enter(&pidlock);
+
+ /*
+ * Skip indices for which there is no pid_entry, PIDs for
+ * which there is no corresponding process, a PID of 0,
+ * and anything the security policy doesn't allow
+ * us to look at.
+ */
+ if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL ||
+ p->p_pid == 0 ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+ mutex_exit(&pidlock);
+ goto next;
+ }
+ mutex_exit(&pidlock);
+
+ /*
+ * Convert pid to the Linux default of 1 if we're the zone's
+ * init process, otherwise use the value from the proc
+ * structure
+ */
+ pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ?
+ p->p_pid : 1);
+
+ /*
+ * If this /proc was mounted in the global zone, view
+ * all procs; otherwise, only view zone member procs.
+ */
+ if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) {
+ goto next;
+ }
+
+ ASSERT(p->p_stat != 0);
+
+ dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0);
+ len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid);
+ ASSERT(len < LXPNSIZ);
+ reclen = DIRENT64_RECLEN(len);
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ /*
+ * if the size of the data to transfer is greater
+ * than that requested, then we can't do it in this transfer.
+ */
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid)
+ return (EINVAL);
+ break;
+ }
+
+ /*
+ * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+ * by the same amount. But we want uiop->uio_offset to change
+ * in increments of LXPR_SDSIZE, which is different from the
+ * number of bytes being returned to the user. So we set
+ * uiop->uio_offset separately, in the increment of this for
+ * the loop, ignoring what uiomove() does.
+ */
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ return (error);
+next:
+ uiop->uio_offset = uoffset + LXPR_SDSIZE;
+ }
+
+ if (eofp != NULL) {
+ *eofp = (uiop->uio_offset >=
+ ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+
+ return (0);
+}
+
+static int
+lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ proc_t *p;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR);
+
+ /* can't read its contents if it died */
+ mutex_enter(&pidlock);
+
+ p = prfind((lxpnp->lxpr_pid == 1) ?
+ curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid);
+
+ if (p == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ return (ENOENT);
+ }
+ mutex_exit(&pidlock);
+
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES));
+}
+
+static int
+lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_NETDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES));
+}
+
+static int
+lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+ off_t uoffset;
+ int error;
+ int ceof;
+ proc_t *p;
+ int fddirsize = -1;
+ uf_info_t *fip;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR);
+
+ oresid = uiop->uio_resid;
+
+ /* can't read its contents if it died */
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL)
+ return (ENOENT);
+
+ if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) ||
+ (p->p_as == &kas))
+ fddirsize = 0;
+
+ /*
+ * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+ * going away while we iterate over its fi_list.
+ */
+ mutex_exit(&p->p_lock);
+
+ /* Get open file info */
+ fip = (&(p)->p_user.u_finfo);
+ mutex_enter(&fip->fi_lock);
+
+ if (fddirsize == -1)
+ fddirsize = fip->fi_nfiles;
+
+ /* Do the fixed entries (in this case just "." & "..") */
+ error = lxpr_readdir_common(lxpnp, uiop, &ceof, NULL, 0);
+
+ /* Finished if we got an error or if we couldn't do all the table */
+ if (error != 0 || ceof == 0)
+ goto out;
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /*
+ * Loop until user's request is satisfied or until
+ * all file descriptors have been examined.
+ */
+ for (; (uresid = uiop->uio_resid) > 0;
+ uiop->uio_offset = uoffset + LXPR_SDSIZE) {
+ int reclen;
+ int fd;
+ int len;
+
+ uoffset = uiop->uio_offset;
+
+ /*
+ * Stop at the end of the fd list
+ */
+ fd = (uoffset / LXPR_SDSIZE) - 2;
+ if (fd < 0 || fd >= fddirsize) {
+ if (eofp) {
+ *eofp = 1;
+ }
+ goto out;
+ }
+
+ if (fip->fi_list[fd].uf_file == NULL)
+ continue;
+
+ dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd);
+ len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd);
+ ASSERT(len < LXPNSIZ);
+ reclen = DIRENT64_RECLEN(len);
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid)
+ error = EINVAL;
+ goto out;
+ }
+
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ goto out;
+ }
+
+ if (eofp != NULL) {
+ *eofp =
+ (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+
+out:
+ mutex_exit(&fip->fi_lock);
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ return (error);
+}
+
+
+/*
+ * lxpr_readlink(): Vnode operation for VOP_READLINK()
+ */
+/* ARGSUSED */
+static int
+lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+ char bp[MAXPATHLEN + 1];
+ size_t buflen = sizeof (bp);
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ vnode_t *rvp = lxpnp->lxpr_realvp;
+ pid_t pid;
+ int error = 0;
+
+ /* must be a symbolic link file */
+ if (vp->v_type != VLNK)
+ return (EINVAL);
+
+ /* Try to produce a symlink name for anything that has a realvp */
+ if (rvp != NULL) {
+ if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0)
+ return (error);
+ if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0)
+ return (error);
+ } else {
+ switch (lxpnp->lxpr_type) {
+ case LXPR_SELF:
+ /*
+ * Convert pid to the Linux default of 1 if we're the
+ * zone's init process
+ */
+ pid = ((curproc->p_pid !=
+ curproc->p_zone->zone_proc_initpid)
+ ? curproc->p_pid : 1);
+
+ /*
+ * Don't need to check result as every possible int
+ * will fit within MAXPATHLEN bytes.
+ */
+ (void) snprintf(bp, buflen, "%d", pid);
+ break;
+ case LXPR_PID_CURDIR:
+ case LXPR_PID_ROOTDIR:
+ case LXPR_PID_EXE:
+ return (EACCES);
+ default:
+ /*
+ * Need to return an error so that nothing thinks
+ * the symlink is empty and hence "."
+ */
+ return (EINVAL);
+ }
+ }
+
+ /* copy the link data to user space */
+ return (uiomove(bp, strlen(bp), UIO_READ, uiop));
+}
+
+/*
+ * lxpr_inactive(): Vnode operation for VOP_INACTIVE()
+ * Vnode is no longer referenced, deallocate the file
+ * and all its resources.
+ */
+/* ARGSUSED */
+static void
+lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ lxpr_freenode(VTOLXP(vp));
+}
+
+/*
+ * lxpr_sync(): Vnode operation for VOP_SYNC()
+ */
+static int
+lxpr_sync()
+{
+ /*
+ * Nothing to sync but this function must never fail
+ */
+ return (0);
+}
+
+/*
+ * lxpr_cmp(): Vnode operation for VOP_CMP()
+ */
+static int
+lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+ vnode_t *rvp;
+
+ while (vn_matchops(vp1, lxpr_vnodeops) &&
+ (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) {
+ vp1 = rvp;
+ }
+
+ while (vn_matchops(vp2, lxpr_vnodeops) &&
+ (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) {
+ vp2 = rvp;
+ }
+
+ if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops))
+ return (vp1 == vp2);
+
+ return (VOP_CMP(vp1, vp2, ct));
+}
+
+/*
+ * lxpr_realvp(): Vnode operation for VOP_REALVP()
+ */
+static int
+lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
+{
+ vnode_t *rvp;
+
+ if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) {
+ vp = rvp;
+ if (VOP_REALVP(vp, &rvp, ct) == 0)
+ vp = rvp;
+ }
+
+ *vpp = vp;
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h
new file mode 100644
index 0000000000..eadb2ccd27
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxproc.h
@@ -0,0 +1,278 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LXPROC_H
+#define _LXPROC_H
+
+#ifdef _LXPROC_BRANDED_H
+#error Attempted to include native lxproc.h after branded lx_proc.h
+#endif
+
+#define _LXPROC_NATIVE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * lxproc.h: declarations, data structures and macros for lxprocfs
+ */
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/user.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/dnlc.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+
+#define LX_SIGHUP 1
+#define LX_SIGINT 2
+#define LX_SIGQUIT 3
+#define LX_SIGILL 4
+#define LX_SIGTRAP 5
+#define LX_SIGABRT 6
+#define LX_SIGIOT 6
+#define LX_SIGBUS 7
+#define LX_SIGFPE 8
+#define LX_SIGKILL 9
+#define LX_SIGUSR1 10
+#define LX_SIGSEGV 11
+#define LX_SIGUSR2 12
+#define LX_SIGPIPE 13
+#define LX_SIGALRM 14
+#define LX_SIGTERM 15
+#define LX_SIGSTKFLT 16
+#define LX_SIGCHLD 17
+#define LX_SIGCONT 18
+#define LX_SIGSTOP 19
+#define LX_SIGTSTP 20
+#define LX_SIGTTIN 21
+#define LX_SIGTTOU 22
+#define LX_SIGURG 23
+#define LX_SIGXCPU 24
+#define LX_SIGXFSZ 25
+#define LX_SIGVTALRM 26
+#define LX_SIGPROF 27
+#define LX_SIGWINCH 28
+#define LX_SIGIO 29
+#define LX_SIGPOLL LX_SIGIO
+#define LX_SIGPWR 30
+#define LX_SIGSYS 31
+#define LX_SIGUNUSED 31
+
+#define LX_NSIG 64 /* Linux _NSIG */
+
+#define LX_SIGRTMIN 32
+#define LX_SIGRTMAX LX_NSIG
+
+/*
+ * Convert a vnode into an lxpr_mnt_t
+ */
+#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data)
+
+/*
+ * convert a vnode into an lxpr_node
+ */
+#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data)
+
+/*
+ * Convert an lxpr_node_t into a vnode
+ */
+#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode)
+
+/*
+ * Convert an lxpr_node_t into the zone for the fs
+ */
+#define LXPTOZ(lxpnp) \
+ (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone)
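
Together these macros let a vnode operation hop from a vnode to its private
node and on to the owning zone. An illustrative helper (hypothetical, for
exposition only):

	static zone_t *
	lxpr_vp_to_zone(vnode_t *vp)
	{
		lxpr_node_t *lxpnp = VTOLXP(vp);

		/* The node and vnode point at each other. */
		ASSERT(LXPTOV(lxpnp) == vp);
		return (LXPTOZ(lxpnp));
	}
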
+
+#define LXPNSIZ 256 /* max size of lx /proc file name entries */
+
+/*
+ * Pretend that a directory entry takes 16 bytes
+ */
+#define LXPR_SDSIZE 16
+
+/*
+ * Node/file types for lx /proc files
+ * (directories and files contained therein).
+ */
+typedef enum lxpr_nodetype {
+ LXPR_PROCDIR, /* /proc */
+ LXPR_PIDDIR, /* /proc/<pid> */
+ LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */
+ LXPR_PID_CPU, /* /proc/<pid>/cpu */
+ LXPR_PID_CURDIR, /* /proc/<pid>/cwd */
+ LXPR_PID_ENV, /* /proc/<pid>/environ */
+ LXPR_PID_EXE, /* /proc/<pid>/exe */
+ LXPR_PID_MAPS, /* /proc/<pid>/maps */
+ LXPR_PID_MEM, /* /proc/<pid>/mem */
+ LXPR_PID_ROOTDIR, /* /proc/<pid>/root */
+ LXPR_PID_STAT, /* /proc/<pid>/stat */
+ LXPR_PID_STATM, /* /proc/<pid>/statm */
+ LXPR_PID_STATUS, /* /proc/<pid>/status */
+ LXPR_PID_FDDIR, /* /proc/<pid>/fd */
+ LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */
+ LXPR_CMDLINE, /* /proc/cmdline */
+ LXPR_CPUINFO, /* /proc/cpuinfo */
+ LXPR_DEVICES, /* /proc/devices */
+ LXPR_DMA, /* /proc/dma */
+ LXPR_FILESYSTEMS, /* /proc/filesystems */
+ LXPR_INTERRUPTS, /* /proc/interrupts */
+ LXPR_IOPORTS, /* /proc/ioports */
+ LXPR_KCORE, /* /proc/kcore */
+ LXPR_KMSG, /* /proc/kmsg */
+ LXPR_LOADAVG, /* /proc/loadavg */
+ LXPR_MEMINFO, /* /proc/meminfo */
+ LXPR_MOUNTS, /* /proc/mounts */
+ LXPR_NETDIR, /* /proc/net */
+ LXPR_NET_ARP, /* /proc/net/arp */
+ LXPR_NET_DEV, /* /proc/net/dev */
+ LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */
+ LXPR_NET_IGMP, /* /proc/net/igmp */
+ LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */
+ LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */
+ LXPR_NET_MCFILTER, /* /proc/net/mcfilter */
+ LXPR_NET_NETSTAT, /* /proc/net/netstat */
+ LXPR_NET_RAW, /* /proc/net/raw */
+ LXPR_NET_ROUTE, /* /proc/net/route */
+ LXPR_NET_RPC, /* /proc/net/rpc */
+ LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */
+ LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */
+ LXPR_NET_SNMP, /* /proc/net/snmp */
+ LXPR_NET_STAT, /* /proc/net/stat */
+ LXPR_NET_TCP, /* /proc/net/tcp */
+ LXPR_NET_UDP, /* /proc/net/udp */
+ LXPR_NET_UNIX, /* /proc/net/unix */
+ LXPR_PARTITIONS, /* /proc/partitions */
+ LXPR_SELF, /* /proc/self */
+ LXPR_STAT, /* /proc/stat */
+ LXPR_UPTIME, /* /proc/uptime */
+ LXPR_VERSION, /* /proc/version */
+ LXPR_NFILES /* number of lx /proc file types */
+} lxpr_nodetype_t;
+
+/*
+ * Number of fds allowed for in the inode number calculation
+ * per process (if a process has more fds than this, inode
+ * numbers may be duplicated)
+ */
+#define LXPR_FD_PERPROC 2000
+
+/*
+ * external dirent characteristics
+ */
+#define LXPRMAXNAMELEN 14
+typedef struct {
+ lxpr_nodetype_t d_type;
+ char d_name[LXPRMAXNAMELEN];
+} lxpr_dirent_t;
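
A static directory table pairs each node type with the name it appears under;
a sketch of what such a table might look like (entries chosen for
illustration, names within LXPRMAXNAMELEN):

	static lxpr_dirent_t lxpr_dir_sketch[] = {
		{ LXPR_CMDLINE,		"cmdline" },
		{ LXPR_CPUINFO,		"cpuinfo" },
		{ LXPR_LOADAVG,		"loadavg" },
		{ LXPR_MEMINFO,		"meminfo" },
		{ LXPR_UPTIME,		"uptime" },
	};
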
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to v_data in the vnode structure
+ */
+typedef struct lxpr_node {
+ lxpr_nodetype_t lxpr_type; /* type of this node */
+ vnode_t *lxpr_vnode; /* vnode for the node */
+ vnode_t *lxpr_parent; /* parent directory */
+ vnode_t *lxpr_realvp; /* real vnode, file in dirs */
+ timestruc_t lxpr_time; /* creation etc time for file */
+ mode_t lxpr_mode; /* file mode bits */
+ uid_t lxpr_uid; /* file owner */
+ gid_t lxpr_gid; /* file group owner */
+ pid_t lxpr_pid; /* pid of proc referred to */
+ ino_t lxpr_ino; /* node id */
+} lxpr_node_t;
+
+struct zone; /* forward declaration */
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to vfs_data in the vfs structure
+ */
+typedef struct lxpr_mnt {
+ lxpr_node_t *lxprm_node; /* node at root of proc mount */
+ struct zone *lxprm_zone; /* zone for this mount */
+ ldi_ident_t lxprm_li; /* ident for ldi */
+} lxpr_mnt_t;
+
+extern vnodeops_t *lxpr_vnodeops;
+extern int nproc_highbit; /* highbit(v.v_nproc) */
+
+typedef struct mounta mounta_t;
+
+extern void lxpr_initnodecache(void);
+extern void lxpr_fininodecache(void);
+extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *);
+extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int);
+extern ino_t lxpr_parentinode(lxpr_node_t *);
+extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int);
+extern void lxpr_freenode(lxpr_node_t *);
+
+typedef struct lxpr_uiobuf lxpr_uiobuf_t;
+extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *);
+extern void lxpr_uiobuf_free(lxpr_uiobuf_t *);
+extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t);
+extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t);
+extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...);
+extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int);
+
+proc_t *lxpr_lock(pid_t);
+void lxpr_unlock(proc_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LXPROC_H */
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
index d6a88a97c3..f6c6b62925 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
*/
/*
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
index b7354c168a..d3b12817ba 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
@@ -29,7 +29,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -3353,10 +3353,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
if (nvp)
vnevent_rename_dest(nvp, ndvp, nnm, ct);
- if (odvp != ndvp)
- vnevent_rename_dest_dir(ndvp, ct);
ASSERT(ovp != NULL);
vnevent_rename_src(ovp, odvp, onm, ct);
+ vnevent_rename_dest_dir(ndvp, ovp, nnm, ct);
}
if (nvp) {
@@ -5523,8 +5522,13 @@ nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
va.va_size = bfp->l_start;
error = nfs3setattr(vp, &va, 0, cr);
- if (error == 0 && bfp->l_start == 0)
- vnevent_truncate(vp, ct);
+ if (error == 0) {
+ if (bfp->l_start == 0) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
} else
error = EINVAL;
}
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
index f0320aaee0..25088aafcb 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
@@ -22,6 +22,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
*/
/*
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index 4112cbee05..945d37533d 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -38,7 +38,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#include <sys/param.h>
@@ -3745,8 +3745,13 @@ nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
*/
error = nfs4setattr(vp, vap, flags, cr, NULL);
- if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0)
- vnevent_truncate(vp, ct);
+ if (error == 0 && (vap->va_mask & AT_SIZE)) {
+ if (vap->va_size == 0) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
return (error);
}
@@ -8062,8 +8067,9 @@ link_call:
* vnode if it already existed.
*/
if (error == 0) {
- vnode_t *tvp;
+ vnode_t *tvp, *tovp;
rnode4_t *trp;
+
/*
* Notify the vnode. Each links is represented by
* a different vnode, in nfsv4.
@@ -8076,23 +8082,20 @@ link_call:
vnevent_rename_dest(tvp, ndvp, nnm, ct);
}
- /*
- * if the source and destination directory are not the
- * same notify the destination directory.
- */
- if (VTOR4(odvp) != VTOR4(ndvp)) {
- trp = VTOR4(ndvp);
- tvp = ndvp;
- if (IS_SHADOW(ndvp, trp))
- tvp = RTOV4(trp);
- vnevent_rename_dest_dir(tvp, ct);
- }
-
trp = VTOR4(ovp);
- tvp = ovp;
+ tovp = ovp;
if (IS_SHADOW(ovp, trp))
+ tovp = RTOV4(trp);
+
+ vnevent_rename_src(tovp, odvp, onm, ct);
+
+ trp = VTOR4(ndvp);
+ tvp = ndvp;
+
+ if (IS_SHADOW(ndvp, trp))
tvp = RTOV4(trp);
- vnevent_rename_src(tvp, odvp, onm, ct);
+
+ vnevent_rename_dest_dir(tvp, tovp, nnm, ct);
}
if (nvp) {
@@ -10997,8 +11000,13 @@ nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
va.va_size = bfp->l_start;
error = nfs4setattr(vp, &va, 0, cr, NULL);
- if (error == 0 && bfp->l_start == 0)
- vnevent_truncate(vp, ct);
+ if (error == 0) {
+ if (bfp->l_start == 0) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
} else
error = EINVAL;
}
diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c
index 2851f8bef9..5fa0e6414f 100644
--- a/usr/src/uts/common/fs/nfs/nfs_auth.c
+++ b/usr/src/uts/common/fs/nfs/nfs_auth.c
@@ -22,6 +22,7 @@
/*
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
* Copyright (c) 2015 by Delphix. All rights reserved.
*/
@@ -561,11 +562,16 @@ retry:
*access = res.ares.auth_perm;
*srv_uid = res.ares.auth_srv_uid;
*srv_gid = res.ares.auth_srv_gid;
- *srv_gids_cnt = res.ares.auth_srv_gids.len;
- *srv_gids = kmem_alloc(*srv_gids_cnt * sizeof (gid_t),
- KM_SLEEP);
- bcopy(res.ares.auth_srv_gids.val, *srv_gids,
- *srv_gids_cnt * sizeof (gid_t));
+
+ if ((*srv_gids_cnt = res.ares.auth_srv_gids.len) != 0) {
+ *srv_gids = kmem_alloc(*srv_gids_cnt *
+ sizeof (gid_t), KM_SLEEP);
+ bcopy(res.ares.auth_srv_gids.val, *srv_gids,
+ *srv_gids_cnt * sizeof (gid_t));
+ } else {
+ *srv_gids = NULL;
+ }
+
break;
case NFSAUTH_DR_EFAIL:
@@ -1054,9 +1060,13 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
if (gid != NULL)
*gid = p->auth_srv_gid;
if (ngids != NULL && gids != NULL) {
- *ngids = p->auth_srv_ngids;
- *gids = kmem_alloc(*ngids * sizeof (gid_t), KM_SLEEP);
- bcopy(p->auth_srv_gids, *gids, *ngids * sizeof (gid_t));
+ if ((*ngids = p->auth_srv_ngids) != 0) {
+ size_t sz = *ngids * sizeof (gid_t);
+ *gids = kmem_alloc(sz, KM_SLEEP);
+ bcopy(p->auth_srv_gids, *gids, sz);
+ } else {
+ *gids = NULL;
+ }
}
access = p->auth_access;
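
Both hunks above avoid calling kmem_alloc() with a zero length when no
supplemental groups came back, returning a NULL list instead. A sketch of the
caller-side convention this implies (hypothetical caller):

	uint_t ngids;
	gid_t *gids;

	/* ... after a successful nfsauth lookup ... */
	if (ngids != 0 && gids != NULL)
		kmem_free(gids, ngids * sizeof (gid_t));
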
diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c
index 476da6685a..c6ae29d220 100644
--- a/usr/src/uts/common/fs/nfs/nfs_server.c
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c
@@ -2573,6 +2573,9 @@ nfs_srvinit(void)
{
int error;
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (EACCES);
+
error = nfs_exportinit();
if (error != 0)
return (error);
diff --git a/usr/src/uts/common/fs/nfs/nfs_sys.c b/usr/src/uts/common/fs/nfs/nfs_sys.c
index e6ff4a2e0b..b4fc9884b1 100644
--- a/usr/src/uts/common/fs/nfs/nfs_sys.c
+++ b/usr/src/uts/common/fs/nfs/nfs_sys.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All rights reserved.
@@ -247,7 +248,7 @@ nfssys(enum nfssys_op opcode, void *arg)
lsa.n_fmly = STRUCT_FGET(ulsa, n_fmly);
lsa.n_proto = STRUCT_FGET(ulsa, n_proto);
lsa.n_rdev = expldev(STRUCT_FGET(ulsa, n_rdev));
- lsa.debug = STRUCT_FGET(ulsa, debug);
+ lsa.n_v4_only = STRUCT_FGET(ulsa, n_v4_only);
lsa.timout = STRUCT_FGET(ulsa, timout);
lsa.grace = STRUCT_FGET(ulsa, grace);
lsa.retransmittimeout = STRUCT_FGET(ulsa,
diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c
index c9cc306f95..5041ebb6fe 100644
--- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All rights reserved.
diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c
index 1a1082bcb8..ee3bac484f 100644
--- a/usr/src/uts/common/fs/nfs/nfs_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c
@@ -26,7 +26,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -1174,8 +1174,13 @@ nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
error = nfssetattr(vp, vap, flags, cr);
- if (error == 0 && (mask & AT_SIZE) && vap->va_size == 0)
- vnevent_truncate(vp, ct);
+ if (error == 0 && (mask & AT_SIZE)) {
+ if (vap->va_size == 0) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
return (error);
}
@@ -2688,11 +2693,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
if (nvp)
vnevent_rename_dest(nvp, ndvp, nnm, ct);
- if (odvp != ndvp)
- vnevent_rename_dest_dir(ndvp, ct);
-
ASSERT(ovp != NULL);
vnevent_rename_src(ovp, odvp, onm, ct);
+ vnevent_rename_dest_dir(ndvp, ovp, nnm, ct);
}
if (nvp) {
@@ -4620,8 +4623,13 @@ nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
va.va_size = bfp->l_start;
error = nfssetattr(vp, &va, 0, cr);
- if (error == 0 && bfp->l_start == 0)
- vnevent_truncate(vp, ct);
+ if (error == 0) {
+ if (bfp->l_start == 0) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
} else
error = EINVAL;
}
diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c
index 976715e346..275330a0ae 100644
--- a/usr/src/uts/common/fs/pcfs/pc_dir.c
+++ b/usr/src/uts/common/fs/pcfs/pc_dir.c
@@ -22,7 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/param.h>
@@ -826,8 +826,7 @@ top:
if (error == 0) {
vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp);
- if (dp != tdp)
- vnevent_rename_dest_dir(PCTOV(tdp), ctp);
+ vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp);
}
done:
diff --git a/usr/src/uts/common/fs/pcfs/pc_vnops.c b/usr/src/uts/common/fs/pcfs/pc_vnops.c
index cb43f0fe59..b307fe11d7 100644
--- a/usr/src/uts/common/fs/pcfs/pc_vnops.c
+++ b/usr/src/uts/common/fs/pcfs/pc_vnops.c
@@ -782,8 +782,11 @@ pcfs_setattr(
if (error)
goto out;
- if (vap->va_size == 0)
+ if (vap->va_size == 0) {
vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
}
/*
* Change file modified times.
diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c
index 70f773ab55..04a2a421db 100644
--- a/usr/src/uts/common/fs/portfs/port.c
+++ b/usr/src/uts/common/fs/portfs/port.c
@@ -24,6 +24,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
@@ -1379,12 +1383,18 @@ portnowait:
if (model == DATAMODEL_NATIVE) {
eventsz = sizeof (port_event_t);
- kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
- if (kevp == NULL) {
- if (nmax > pp->port_max_list)
- nmax = pp->port_max_list;
- kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
+
+ if (nmax == 0) {
+ kevp = NULL;
+ } else {
+ kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
+ if (kevp == NULL) {
+ if (nmax > pp->port_max_list)
+ nmax = pp->port_max_list;
+ kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
+ }
}
+
results = kevp;
lev = NULL; /* start with first event in the queue */
for (nevents = 0; nevents < nmax; ) {
@@ -1421,12 +1431,18 @@ portnowait:
port_event32_t *kevp32;
eventsz = sizeof (port_event32_t);
- kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
- if (kevp32 == NULL) {
- if (nmax > pp->port_max_list)
- nmax = pp->port_max_list;
- kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
+
+ if (nmax == 0) {
+ kevp32 = NULL;
+ } else {
+ kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
+ if (kevp32 == NULL) {
+ if (nmax > pp->port_max_list)
+ nmax = pp->port_max_list;
+ kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
+ }
}
+
results = kevp32;
lev = NULL; /* start with first event in the queue */
for (nevents = 0; nevents < nmax; ) {
diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c
new file mode 100644
index 0000000000..b09a9c8afc
--- /dev/null
+++ b/usr/src/uts/common/fs/proc/prargv.c
@@ -0,0 +1,441 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/sunddi.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/sysmacros.h>
+#include <vm/as.h>
+
+/*
+ * Safely read a contiguous region of memory from 'addr' in the address space
+ * of a particular process into the supplied kernel buffer (*buf, sz).
+ * Partially mapped regions will result in a partial read terminating at the
+ * first hole in the address space. The number of bytes actually read is
+ * returned to the caller via 'rdsz'.
+ */
+int
+prreadbuf(proc_t *p, uintptr_t ustart, uint8_t *buf, size_t sz, size_t *rdsz)
+{
+ int error = 0;
+ size_t rem = sz;
+ off_t pos = 0;
+
+ if (rdsz != NULL)
+ *rdsz = 0;
+
+ while (rem != 0) {
+ uintptr_t addr = ustart + pos;
+ size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET));
+
+ if ((error = uread(p, buf + pos, len, addr)) != 0) {
+ if (error == ENXIO) {
+ /*
+ * ENXIO from uread() indicates that the page
+ * does not exist. This will simply be a
+ * partial read.
+ */
+ error = 0;
+ }
+ break;
+ }
+
+ rem -= len;
+ pos += len;
+ }
+
+ if (rdsz != NULL)
+ *rdsz = pos;
+
+ return (error);
+}
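
A caller can treat a short read as running into the first unmapped page. A
minimal sketch of such a caller (hypothetical; as in prreadargv() below,
p_lock is dropped around the I/O):

	uint8_t buf[256];
	size_t rdsz;

	if (prreadbuf(p, ustart, buf, sizeof (buf), &rdsz) == 0 &&
	    rdsz < sizeof (buf)) {
		/* The read stopped at the first hole in the as. */
	}
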
+
+/*
+ * Attempt to read the argument vector (argv) from this process. The caller
+ * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via
+ * prlock or lx_prlock).
+ *
+ * The caller must provide a buffer (buf, buflen). We will concatenate each
+ * argument string (including the NUL terminator) into this buffer. The number
+ * of characters written to this buffer (including the final NUL terminator)
+ * will be stored in 'slen'.
+ */
+int
+prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen)
+{
+ int error;
+ user_t *up;
+ struct as *as;
+ size_t pos = 0;
+ caddr_t *argv = NULL;
+ size_t argvsz = 0;
+ int i;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+
+ up = PTOU(p);
+ as = p->p_as;
+
+ if ((p->p_flag & SSYS) || as == &kas || up->u_argv == NULL) {
+ /*
+ * Return the regular psargs string to the caller.
+ */
+ bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs)));
+ buf[bufsz - 1] = '\0';
+ *slen = strlen(buf) + 1;
+
+ return (0);
+ }
+
+ /*
+ * Allocate space to store the argv array.
+ */
+ argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ?
+ sizeof (caddr32_t) : sizeof (caddr_t));
+ argv = kmem_alloc(argvsz, KM_SLEEP);
+
+ /*
+ * Extract the argv array from the target process. Drop p_lock
+ * while we do I/O to avoid deadlock with the clock thread.
+ */
+ mutex_exit(&p->p_lock);
+ if ((error = prreadbuf(p, up->u_argv, (uint8_t *)argv, argvsz,
+ NULL)) != 0) {
+ kmem_free(argv, argvsz);
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ return (-1);
+ }
+
+ /*
+ * Read each argument string from the pointers in the argv array.
+ */
+ pos = 0;
+ for (i = 0; i < up->u_argc; i++) {
+ size_t rdsz, trysz;
+ uintptr_t arg;
+ off_t j;
+ boolean_t found_nul;
+ boolean_t do_retry = B_TRUE;
+
+#ifdef _SYSCALL32_IMPL
+ if (p->p_model == DATAMODEL_ILP32) {
+ arg = (uintptr_t)((caddr32_t *)argv)[i];
+ } else {
+ arg = (uintptr_t)argv[i];
+ }
+#else
+ arg = (uintptr_t)argv[i];
+#endif
+
+ /*
+ * Stop trying to read arguments if we reach a NULL
+ * pointer in the vector.
+ */
+ if (arg == NULL)
+ break;
+
+ /*
+ * Stop reading if we have read the maximum length
+ * we can return to the user.
+ */
+ if (pos >= bufsz)
+ break;
+
+ /*
+ * Initially we try a short read, on the assumption that
+ * most individual argument strings are less than 80
+ * characters long.
+ */
+ if ((trysz = MIN(80, bufsz - pos - 1)) < 80) {
+ /*
+ * We don't have room in the target buffer for even
+ * an entire short read, so there is no need to retry
+ * with a longer read.
+ */
+ do_retry = B_FALSE;
+ }
+
+retry:
+ /*
+ * Read string data for this argument. Leave room
+ * in the buffer for a final NUL terminator.
+ */
+ if ((error = prreadbuf(p, arg, (uint8_t *)&buf[pos], trysz,
+ &rdsz)) != 0) {
+ /*
+ * There was a problem reading this string
+ * from the process. Give up.
+ */
+ break;
+ }
+
+ /*
+ * Find the NUL terminator.
+ */
+ found_nul = B_FALSE;
+ for (j = 0; j < rdsz; j++) {
+ if (buf[pos + j] == '\0') {
+ found_nul = B_TRUE;
+ break;
+ }
+ }
+
+ if (!found_nul && do_retry) {
+ /*
+ * We did not find a NUL terminator, but this
+ * was a first pass short read. Try once more
+ * with feeling.
+ */
+ trysz = bufsz - pos - 1;
+ do_retry = B_FALSE;
+ goto retry;
+ }
+
+ /*
+ * Commit the string we read to the buffer.
+ */
+ pos += j + 1;
+ if (!found_nul && pos < bufsz) {
+ /*
+ * A NUL terminator was not found; add one.
+ */
+ buf[pos++] = '\0';
+ }
+ }
+
+ /*
+ * Ensure the entire string is NUL-terminated.
+ */
+ buf[bufsz - 1] = '\0';
+
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ kmem_free(argv, argvsz);
+
+ /*
+ * If the operation was a success, return the copied string length
+ * to the caller.
+ */
+ *slen = (error == 0) ? pos : 0;
+
+ return (error);
+}
+
+/*
+ * Similar to prreadargv except reads the env vector. This is slightly more
+ * complex because there is no count for the env vector that corresponds to
+ * u_argc.
+ */
+int
+prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen)
+{
+ int error;
+ user_t *up;
+ struct as *as;
+ size_t pos = 0;
+ caddr_t *envp = NULL;
+ uintptr_t tmpp = NULL;
+ size_t envpsz = 0, rdsz = 0;
+ int i;
+ int cnt, bound;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+
+ up = PTOU(p);
+ as = p->p_as;
+
+ if ((p->p_flag & SSYS) || as == &kas || up->u_envp == NULL) {
+ /*
+ * Return empty string.
+ */
+ buf[0] = '\0';
+ *slen = 1;
+
+ return (0);
+ }
+
+ /*
+ * Drop p_lock while we do I/O to avoid deadlock with the clock thread.
+ */
+ mutex_exit(&p->p_lock);
+
+ /*
+ * We first have to count how many env entries we have. This is
+ * somewhat painful. We extract the env entries from the target process
+ * one entry at a time. Stop trying to read env entries if we reach a
+ * NULL pointer in the vector or hit our upper bound (which we take
+ * to be bufsz / 4) to ensure we don't run off the end.
+ */
+ rdsz = (p->p_model == DATAMODEL_ILP32 ?
+ sizeof (caddr32_t) : sizeof (caddr_t));
+ bound = (int)(bufsz / 4);
+ for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) {
+ caddr_t tmp = NULL;
+
+ if ((error = prreadbuf(p, tmpp, (uint8_t *)&tmp, rdsz,
+ NULL)) != 0) {
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ return (-1);
+ }
+
+ if (tmp == NULL)
+ break;
+ }
+ if (cnt == 0) {
+ /* Return empty string. */
+ buf[0] = '\0';
+ *slen = 1;
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ return (0);
+ }
+
+ /*
+ * Allocate space to store the env array.
+ */
+ envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ?
+ sizeof (caddr32_t) : sizeof (caddr_t));
+ envp = kmem_alloc(envpsz, KM_SLEEP);
+
+ /*
+ * Extract the env array from the target process.
+ */
+ if ((error = prreadbuf(p, up->u_envp, (uint8_t *)envp, envpsz,
+ NULL)) != 0) {
+ kmem_free(envp, envpsz);
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ return (-1);
+ }
+
+ /*
+ * Read each env string from the pointers in the env array.
+ */
+ pos = 0;
+ for (i = 0; i < cnt; i++) {
+ size_t rdsz, trysz;
+ uintptr_t ev;
+ off_t j;
+ boolean_t found_nul;
+ boolean_t do_retry = B_TRUE;
+
+#ifdef _SYSCALL32_IMPL
+ if (p->p_model == DATAMODEL_ILP32) {
+ ev = (uintptr_t)((caddr32_t *)envp)[i];
+ } else {
+ ev = (uintptr_t)envp[i];
+ }
+#else
+ ev = (uintptr_t)envp[i];
+#endif
+
+ /*
+ * Stop trying to read env entries if we reach a NULL
+ * pointer in the vector.
+ */
+ if (ev == NULL)
+ break;
+
+ /*
+ * Stop reading if we have read the maximum length
+ * we can return to the user.
+ */
+ if (pos >= bufsz)
+ break;
+
+ /*
+ * Initially we try a short read, on the assumption that
+ * most individual env strings are less than 80
+ * characters long.
+ */
+ if ((trysz = MIN(80, bufsz - pos - 1)) < 80) {
+ /*
+ * We don't have room in the target buffer for even
+ * an entire short read, so there is no need to retry
+ * with a longer read.
+ */
+ do_retry = B_FALSE;
+ }
+
+retry:
+ /*
+ * Read string data for this env var. Leave room
+ * in the buffer for a final NUL terminator.
+ */
+ if ((error = prreadbuf(p, ev, (uint8_t *)&buf[pos], trysz,
+ &rdsz)) != 0) {
+ /*
+ * There was a problem reading this string
+ * from the process. Give up.
+ */
+ break;
+ }
+
+ /*
+ * Find the NUL terminator.
+ */
+ found_nul = B_FALSE;
+ for (j = 0; j < rdsz; j++) {
+ if (buf[pos + j] == '\0') {
+ found_nul = B_TRUE;
+ break;
+ }
+ }
+
+ if (!found_nul && do_retry) {
+ /*
+ * We did not find a NUL terminator, but this
+ * was a first pass short read. Try once more
+ * with feeling.
+ */
+ trysz = bufsz - pos - 1;
+ do_retry = B_FALSE;
+ goto retry;
+ }
+
+ /*
+ * Commit the string we read to the buffer.
+ */
+ pos += j + 1;
+ if (!found_nul && pos < bufsz) {
+ /*
+ * A NUL terminator was not found; add one.
+ */
+ buf[pos++] = '\0';
+ }
+ }
+
+ /*
+ * Ensure the entire string is NUL-terminated.
+ */
+ buf[bufsz - 1] = '\0';
+
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ kmem_free(envp, envpsz);
+
+ /*
+ * If the operation was a success, return the copied string length
+ * to the caller.
+ */
+ *slen = (error == 0) ? pos : 0;
+
+ return (error);
+}
diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c
index 6b151a6369..07dcb1e7db 100644
--- a/usr/src/uts/common/fs/proc/prcontrol.c
+++ b/usr/src/uts/common/fs/proc/prcontrol.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip)
} else if (t->t_state == TS_STOPPED && sig == SIGKILL) {
/* If SIGKILL, set stopped lwp running */
p->p_stopsig = 0;
- t->t_schedflag |= TS_XSTART | TS_PSTART;
+ t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
t->t_dtrace_stop = 0;
setrun_locked(t);
}
@@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr)
return (EPERM);
if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id)
return (EINVAL);
- if ((zptr = zone_find_by_id(zoneid)) == NULL)
- return (EINVAL);
+ /*
+ * We cannot hold p_lock when we call zone_find_by_id since that can
+ * lead to a deadlock. zone_find_by_id() takes zonehash_lock.
+ * zone_enter() can hold the zonehash_lock and needs p_lock when it
+ * calls task_join.
+ */
mutex_exit(&p->p_lock);
+ if ((zptr = zone_find_by_id(zoneid)) == NULL) {
+ mutex_enter(&p->p_lock);
+ return (EINVAL);
+ }
mutex_enter(&p->p_crlock);
oldcred = p->p_cred;
crhold(oldcred);
diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h
index de816d49e7..706e3ad14d 100644
--- a/usr/src/uts/common/fs/proc/prdata.h
+++ b/usr/src/uts/common/fs/proc/prdata.h
@@ -123,6 +123,7 @@ typedef enum prnodetype {
#if defined(__i386) || defined(__amd64)
PR_LDT, /* /proc/<pid>/ldt */
#endif
+ PR_ARGV, /* /proc/<pid>/argv */
PR_USAGE, /* /proc/<pid>/usage */
PR_LUSAGE, /* /proc/<pid>/lusage */
PR_PAGEDATA, /* /proc/<pid>/pagedata */
@@ -349,6 +350,8 @@ extern int pr_unset(proc_t *, long);
extern void pr_sethold(prnode_t *, sigset_t *);
extern void pr_setfault(proc_t *, fltset_t *);
extern int prusrio(proc_t *, enum uio_rw, struct uio *, int);
+extern int prreadargv(proc_t *, char *, size_t, size_t *);
+extern int prreadenvv(proc_t *, char *, size_t, size_t *);
extern int prwritectl(vnode_t *, struct uio *, cred_t *);
extern int prlock(prnode_t *, int);
extern void prunmark(proc_t *);
@@ -375,6 +378,7 @@ extern int clear_watched_area(proc_t *, struct watched_area *);
extern void pr_free_watchpoints(proc_t *);
extern proc_t *pr_cancel_watch(prnode_t *);
extern struct seg *break_seg(proc_t *);
+extern void prgethold(kthread_t *, sigset_t *);
/*
* Machine-dependent routines (defined in prmachdep.c).
diff --git a/usr/src/uts/common/fs/proc/prioctl.c b/usr/src/uts/common/fs/proc/prioctl.c
index 8202e49df0..08c5f6ffc0 100644
--- a/usr/src/uts/common/fs/proc/prioctl.c
+++ b/usr/src/uts/common/fs/proc/prioctl.c
@@ -22,7 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -930,8 +930,7 @@ startover:
}
case PIOCGHOLD: /* get signal-hold mask */
- schedctl_finish_sigblock(t);
- sigktou(&t->t_hold, &un.holdmask);
+ prgethold(t, &un.holdmask);
prunlock(pnp);
if (copyout(&un.holdmask, cmaddr, sizeof (un.holdmask)))
error = EFAULT;
@@ -944,7 +943,7 @@ startover:
case PIOCNMAP: /* get number of memory mappings */
{
- int n;
+ uint_t n;
struct as *as = p->p_as;
if ((p->p_flag & SSYS) || as == &kas)
@@ -957,7 +956,7 @@ startover:
mutex_enter(&p->p_lock);
}
prunlock(pnp);
- if (copyout(&n, cmaddr, sizeof (int)))
+ if (copyout(&n, cmaddr, sizeof (uint_t)))
error = EFAULT;
break;
}
@@ -1395,8 +1394,7 @@ oprgetstatus32(kthread_t *t, prstatus32_t *sp, zone_t *zp)
sp->pr_cursig = lwp->lwp_cursig;
prassignset(&sp->pr_sigpend, &p->p_sig);
prassignset(&sp->pr_lwppend, &t->t_sig);
- schedctl_finish_sigblock(t);
- prassignset(&sp->pr_sighold, &t->t_hold);
+ prgethold(t, &sp->pr_sighold);
sp->pr_altstack.ss_sp =
(caddr32_t)(uintptr_t)lwp->lwp_sigaltstack.ss_sp;
sp->pr_altstack.ss_size = (size32_t)lwp->lwp_sigaltstack.ss_size;
@@ -1673,14 +1671,8 @@ oprgetpsinfo32(proc_t *p, prpsinfo32_t *psp, kthread_t *tp)
/*ARGSUSED*/
static int
-prioctl32(
- struct vnode *vp,
- int cmd,
- intptr_t arg,
- int flag,
- cred_t *cr,
- int *rvalp,
- caller_context_t *ct)
+prioctl32(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
+ int *rvalp, caller_context_t *ct)
{
int nsig = PROC_IS_BRANDED(curproc)? BROP(curproc)->b_nsig : NSIG;
caddr_t cmaddr = (caddr_t)arg;
@@ -2557,8 +2549,7 @@ startover:
}
case PIOCGHOLD: /* get signal-hold mask */
- schedctl_finish_sigblock(t);
- sigktou(&t->t_hold, &un32.holdmask);
+ prgethold(t, &un32.holdmask);
prunlock(pnp);
if (copyout(&un32.holdmask, cmaddr, sizeof (un32.holdmask)))
error = EFAULT;
@@ -2571,7 +2562,7 @@ startover:
case PIOCNMAP: /* get number of memory mappings */
{
- int n;
+ uint_t n;
struct as *as = p->p_as;
if ((p->p_flag & SSYS) || as == &kas)
@@ -2584,7 +2575,7 @@ startover:
mutex_enter(&p->p_lock);
}
prunlock(pnp);
- if (copyout(&n, cmaddr, sizeof (int)))
+ if (copyout(&n, cmaddr, sizeof (uint_t)))
error = EFAULT;
break;
}
@@ -3235,8 +3226,7 @@ oprgetstatus(kthread_t *t, prstatus_t *sp, zone_t *zp)
sp->pr_cursig = lwp->lwp_cursig;
prassignset(&sp->pr_sigpend, &p->p_sig);
prassignset(&sp->pr_lwppend, &t->t_sig);
- schedctl_finish_sigblock(t);
- prassignset(&sp->pr_sighold, &t->t_hold);
+ prgethold(t, &sp->pr_sighold);
sp->pr_altstack = lwp->lwp_sigaltstack;
prgetaction(p, up, lwp->lwp_cursig, &sp->pr_action);
sp->pr_pid = p->p_pid;
diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c
index a2ab06d769..3b4a7f36d0 100644
--- a/usr/src/uts/common/fs/proc/prsubr.c
+++ b/usr/src/uts/common/fs/proc/prsubr.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -148,6 +148,11 @@ prchoose(proc_t *p)
continue;
}
+ /* If this is a process kernel thread, ignore it. */
+ if ((t->t_proc_flag & TP_KTHREAD) != 0) {
+ continue;
+ }
+
thread_lock(t); /* make sure thread is in good state */
switch (t->t_state) {
default:
@@ -201,6 +206,7 @@ prchoose(proc_t *p)
case PR_SYSEXIT:
case PR_SIGNALLED:
case PR_FAULTED:
+ case PR_BRAND:
/*
* Make an lwp calling exit() be the
* last lwp seen in the process.
@@ -534,6 +540,12 @@ prexecend(void)
pcp->prc_tslot = tslot;
}
}
+
+ /*
+ * There may be threads waiting for the flag change blocked behind the
+ * pr_pid_cv as well.
+ */
+ cv_signal(&pr_pid_cv[p->p_slot]);
}
/*
@@ -919,6 +931,29 @@ prgetstatus(proc_t *p, pstatus_t *sp, zone_t *zp)
sp->pr_flags = sp->pr_lwp.pr_flags;
}
+/*
+ * Query mask of held signals for a given thread.
+ *
+ * This makes use of schedctl_sigblock() to query if userspace has requested
+ * that all maskable signals be held. While it would be tempting to call
+ * schedctl_finish_sigblock() and apply that update to t->t_hold, it cannot be
+ * done safely without the risk of racing with the thread under consideration.
+ */
+void
+prgethold(kthread_t *t, sigset_t *sp)
+{
+ k_sigset_t set;
+
+ if (schedctl_sigblock(t)) {
+ set.__sigbits[0] = FILLSET0 & ~CANTMASK0;
+ set.__sigbits[1] = FILLSET1 & ~CANTMASK1;
+ set.__sigbits[2] = FILLSET2 & ~CANTMASK2;
+ } else {
+ set = t->t_hold;
+ }
+ sigktou(&set, sp);
+}
+
#ifdef _SYSCALL32_IMPL
void
prgetlwpstatus32(kthread_t *t, lwpstatus32_t *sp, zone_t *zp)
@@ -980,8 +1015,7 @@ prgetlwpstatus32(kthread_t *t, lwpstatus32_t *sp, zone_t *zp)
sp->pr_lwpid = t->t_tid;
sp->pr_cursig = lwp->lwp_cursig;
prassignset(&sp->pr_lwppend, &t->t_sig);
- schedctl_finish_sigblock(t);
- prassignset(&sp->pr_lwphold, &t->t_hold);
+ prgethold(t, &sp->pr_lwphold);
if (t->t_whystop == PR_FAULTED) {
siginfo_kto32(&lwp->lwp_siginfo, &sp->pr_info);
if (t->t_whatstop == FLTPAGE)
@@ -1212,8 +1246,7 @@ prgetlwpstatus(kthread_t *t, lwpstatus_t *sp, zone_t *zp)
sp->pr_lwpid = t->t_tid;
sp->pr_cursig = lwp->lwp_cursig;
prassignset(&sp->pr_lwppend, &t->t_sig);
- schedctl_finish_sigblock(t);
- prassignset(&sp->pr_lwphold, &t->t_hold);
+ prgethold(t, &sp->pr_lwphold);
if (t->t_whystop == PR_FAULTED)
bcopy(&lwp->lwp_siginfo,
&sp->pr_info, sizeof (k_siginfo_t));
@@ -1370,10 +1403,10 @@ prgetaction32(proc_t *p, user_t *up, uint_t sig, struct sigaction32 *sp)
/*
* Count the number of segments in this process's address space.
*/
-int
+uint_t
prnsegs(struct as *as, int reserved)
{
- int n = 0;
+ uint_t n = 0;
struct seg *seg;
ASSERT(as != &kas && AS_WRITE_HELD(as));
@@ -1390,8 +1423,21 @@ prnsegs(struct as *as, int reserved)
for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
(void) pr_getprot(seg, reserved, &tmp,
&saddr, &naddr, eaddr);
- if (saddr != naddr)
+ if (saddr != naddr) {
n++;
+ /*
+ * prnsegs() was formerly designated to return
+ * an 'int' despite having no ability or use
+ * for negative results. As part of changing
+ * it to 'uint_t', keep the old effective limit
+ * of INT_MAX in place.
+ */
+ if (n == INT_MAX) {
+ pr_getprot_done(&tmp);
+ ASSERT(tmp == NULL);
+ return (n);
+ }
+ }
}
ASSERT(tmp == NULL);
@@ -2591,7 +2637,6 @@ prgetlwpsinfo(kthread_t *t, lwpsinfo_t *psp)
void
prgetlwpsinfo32(kthread_t *t, lwpsinfo32_t *psp)
{
- proc_t *p = ttoproc(t);
klwp_t *lwp = ttolwp(t);
sobj_ops_t *sobj;
char c, state;
@@ -2599,7 +2644,7 @@ prgetlwpsinfo32(kthread_t *t, lwpsinfo32_t *psp)
int retval, niceval;
hrtime_t hrutime, hrstime;
- ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
bzero(psp, sizeof (*psp));
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index 2cf007c42c..657cebf8c2 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -98,6 +98,11 @@ struct prdirect {
#define PRSDSIZE (sizeof (struct prdirect))
/*
+ * Maximum length of the /proc/$$/argv file:
+ */
+int prmaxargvlen = 4096;
+
+/*
* Directory characteristics.
*/
typedef struct prdirent {
@@ -170,6 +175,8 @@ static prdirent_t piddir[] = {
{ PR_LDT, 28 * sizeof (prdirent_t), sizeof (prdirent_t),
"ldt" },
#endif
+ { PR_ARGV, 28 * sizeof (prdirent_t), sizeof (prdirent_t),
+ "argv" },
};
#define NPIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]) - 2)
@@ -588,6 +595,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(),
#if defined(__x86)
pr_read_ldt(),
#endif
+ pr_read_argv(),
pr_read_usage(), pr_read_lusage(), pr_read_pagedata(),
pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(),
pr_read_lwpusage(), pr_read_lwpname(),
@@ -617,6 +625,7 @@ static int (*pr_read_function[PR_NFILES])() = {
#if defined(__x86)
pr_read_ldt, /* /proc/<pid>/ldt */
#endif
+ pr_read_argv, /* /proc/<pid>/argv */
pr_read_usage, /* /proc/<pid>/usage */
pr_read_lusage, /* /proc/<pid>/lusage */
pr_read_pagedata, /* /proc/<pid>/pagedata */
@@ -681,6 +690,41 @@ pr_uioread(void *base, long count, uio_t *uiop)
}
static int
+pr_read_argv(prnode_t *pnp, uio_t *uiop)
+{
+ char *args;
+ int error;
+ size_t asz = prmaxargvlen, sz;
+
+ /*
+ * Allocate a scratch buffer for collection of the process arguments.
+ */
+ args = kmem_alloc(asz, KM_SLEEP);
+
+ ASSERT(pnp->pr_type == PR_ARGV);
+
+ if ((error = prlock(pnp, ZNO)) != 0) {
+ kmem_free(args, asz);
+ return (error);
+ }
+
+ if ((error = prreadargv(pnp->pr_common->prc_proc, args, asz,
+ &sz)) != 0) {
+ prunlock(pnp);
+ kmem_free(args, asz);
+ return (error);
+ }
+
+ prunlock(pnp);
+
+ error = pr_uioread(args, sz, uiop);
+
+ kmem_free(args, asz);
+
+ return (error);
+}
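
From userland the new file reads back as the argument strings concatenated
with their NUL terminators. A hedged sketch of a consumer (hypothetical
program, error handling kept minimal):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		char buf[4096];
		char *p;
		ssize_t n;
		int fd;

		if ((fd = open("/proc/self/argv", O_RDONLY)) < 0)
			return (1);
		if ((n = read(fd, buf, sizeof (buf))) <= 0)
			return (1);
		/* Each argument is NUL-terminated; walk them in order. */
		for (p = buf; p < buf + n; p += strlen(p) + 1)
			(void) printf("%s\n", p);
		(void) close(fd);
		return (0);
	}
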
+
+static int
pr_read_as(prnode_t *pnp, uio_t *uiop)
{
int error;
@@ -1827,6 +1871,7 @@ static int (*pr_read_function_32[PR_NFILES])() = {
#if defined(__x86)
pr_read_ldt, /* /proc/<pid>/ldt */
#endif
+ pr_read_argv, /* /proc/<pid>/argv */
pr_read_usage_32, /* /proc/<pid>/usage */
pr_read_lusage_32, /* /proc/<pid>/lusage */
pr_read_pagedata_32, /* /proc/<pid>/pagedata */
@@ -2753,6 +2798,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
#endif
}
+/*
+ * We make pr_write_psinfo_fname() somewhat simpler by asserting at compile
+ * time that PRFNSZ has the same definition as MAXCOMLEN.
+ */
+#if PRFNSZ != MAXCOMLEN
+#error PRFNSZ/MAXCOMLEN mismatch
+#endif
+
+static int
+pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop)
+{
+ char fname[PRFNSZ];
+ int offset = offsetof(psinfo_t, pr_fname), error;
+
+#ifdef _SYSCALL32_IMPL
+ if (curproc->p_model != DATAMODEL_LP64)
+ offset = offsetof(psinfo32_t, pr_fname);
+#endif
+
+ /*
+ * If this isn't a write to pr_fname (or if the size doesn't match
+ * PRFNSZ) return.
+ */
+ if (uiop->uio_offset != offset || uiop->uio_resid != PRFNSZ)
+ return (0);
+
+ if ((error = uiomove(fname, PRFNSZ, UIO_WRITE, uiop)) != 0)
+ return (error);
+
+ fname[PRFNSZ - 1] = '\0';
+
+ if ((error = prlock(pnp, ZNO)) != 0)
+ return (error);
+
+ bcopy(fname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ);
+
+ prunlock(pnp);
+
+ return (0);
+}
+
+/*
+ * We make pr_write_psinfo_psargs() somewhat simpler by asserting at compile
+ * time that PRARGSZ has the same definition as PSARGSZ.
+ */
+#if PRARGSZ != PSARGSZ
+#error PRARGSZ/PSARGSZ mismatch
+#endif
+
+static int
+pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop)
+{
+ char psargs[PRARGSZ];
+ int offset = offsetof(psinfo_t, pr_psargs), error;
+
+#ifdef _SYSCALL32_IMPL
+ if (curproc->p_model != DATAMODEL_LP64)
+ offset = offsetof(psinfo32_t, pr_psargs);
+#endif
+
+ /*
+ * If this isn't a write to pr_psargs (or if the size doesn't match
+ * PRARGSZ) return.
+ */
+ if (uiop->uio_offset != offset || uiop->uio_resid != PRARGSZ)
+ return (0);
+
+ if ((error = uiomove(psargs, PRARGSZ, UIO_WRITE, uiop)) != 0)
+ return (error);
+
+ psargs[PRARGSZ - 1] = '\0';
+
+ if ((error = prlock(pnp, ZNO)) != 0)
+ return (error);
+
+ bcopy(psargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ);
+
+ prunlock(pnp);
+
+ return (0);
+}
+
+int
+pr_write_psinfo(prnode_t *pnp, uio_t *uiop)
+{
+ int error;
+
+ if ((error = pr_write_psinfo_fname(pnp, uiop)) != 0)
+ return (error);
+
+ if ((error = pr_write_psinfo_psargs(pnp, uiop)) != 0)
+ return (error);
+
+ return (0);
+}
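
A write only takes effect when it lands exactly on pr_fname or pr_psargs with
exactly the expected length, so a consumer uses pwrite(2) at the matching
offset. A sketch, assuming a 64-bit caller writing its own psinfo
(hypothetical code):

	#include <fcntl.h>
	#include <stddef.h>
	#include <string.h>
	#include <unistd.h>
	#include <procfs.h>

	/* Sketch: set this process's pr_fname via /proc/self/psinfo. */
	int
	set_fname(const char *name)
	{
		char buf[PRFNSZ];
		int fd = open("/proc/self/psinfo", O_WRONLY);

		if (fd < 0)
			return (-1);
		(void) strlcpy(buf, name, sizeof (buf));
		/* Must be exactly PRFNSZ bytes at the pr_fname offset. */
		if (pwrite(fd, buf, sizeof (buf),
		    offsetof(psinfo_t, pr_fname)) != sizeof (buf)) {
			(void) close(fd);
			return (-1);
		}
		return (close(fd));
	}
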
+
/* Note we intentionally don't handle partial writes/updates. */
static int
pr_write_lwpname(prnode_t *pnp, uio_t *uiop)
@@ -2879,6 +3021,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
uiop->uio_resid = resid;
return (error);
+ case PR_PSINFO:
+ return (pr_write_psinfo(pnp, uiop));
+
case PR_LWPNAME:
return (pr_write_lwpname(pnp, uiop));
@@ -3168,6 +3313,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
case PR_AUXV:
vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t);
break;
+ case PR_ARGV:
+ if ((p->p_flag & SSYS) || p->p_as == &kas) {
+ vap->va_size = PSARGSZ;
+ } else {
+ vap->va_size = prmaxargvlen;
+ }
+ break;
#if defined(__x86)
case PR_LDT:
mutex_exit(&p->p_lock);
@@ -3344,6 +3496,7 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
case PR_USAGE:
case PR_LUSAGE:
case PR_LWPUSAGE:
+ case PR_ARGV:
p = pr_p_lock(pnp);
mutex_exit(&pr_pidlock);
if (p == NULL)
@@ -3429,6 +3582,7 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = {
#if defined(__x86)
pr_lookup_notdir, /* /proc/<pid>/ldt */
#endif
+ pr_lookup_notdir, /* /proc/<pid>/argv */
pr_lookup_notdir, /* /proc/<pid>/usage */
pr_lookup_notdir, /* /proc/<pid>/lusage */
pr_lookup_notdir, /* /proc/<pid>/pagedata */
@@ -4706,16 +4860,17 @@ prgetnode(vnode_t *dp, prnodetype_t type)
pnp->pr_mode = 0600; /* read-write by owner only */
break;
+ case PR_PSINFO:
case PR_LWPNAME:
pnp->pr_mode = 0644; /* readable by all + owner can write */
break;
- case PR_PSINFO:
case PR_LPSINFO:
case PR_LWPSINFO:
case PR_USAGE:
case PR_LUSAGE:
case PR_LWPUSAGE:
+ case PR_ARGV:
pnp->pr_mode = 0444; /* read-only by all */
break;
@@ -4821,6 +4976,7 @@ static int (*pr_readdir_function[PR_NFILES])() = {
#if defined(__x86)
pr_readdir_notdir, /* /proc/<pid>/ldt */
#endif
+ pr_readdir_notdir, /* /proc/<pid>/argv */
pr_readdir_notdir, /* /proc/<pid>/usage */
pr_readdir_notdir, /* /proc/<pid>/lusage */
pr_readdir_notdir, /* /proc/<pid>/pagedata */
@@ -4972,6 +5128,7 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp)
case PR_PROCDIR:
case PR_PSINFO:
case PR_USAGE:
+ case PR_ARGV:
break;
default:
continue;
diff --git a/usr/src/uts/common/fs/smbsrv/smb_kshare.c b/usr/src/uts/common/fs/smbsrv/smb_kshare.c
index 126eb9f82e..62d2c080b6 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_kshare.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_kshare.c
@@ -362,6 +362,7 @@ smb_kshare_g_fini(void)
kmem_cache_destroy(smb_kshare_cache_vfs);
}
+
/*
* A list of shares in nvlist format can be sent down
* from userspace through the IOCTL interface. The nvlist
diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c
index cf6082e477..1c0010b2c2 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_server.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_server.c
@@ -847,6 +847,22 @@ smb_server_enum(smb_ioc_svcenum_t *ioc)
smb_svcenum_t *svcenum = &ioc->svcenum;
smb_server_t *sv;
int rc;
+ uint32_t buflen_adjusted;
+
+ /*
+ * Reality check that the buffer-length inside the enum doesn't
+ * overrun the ioctl's total length.
+ *
+ * NOTE: Assume se_buf is at the end of smb_svcenum_t.
+ */
+ buflen_adjusted = svcenum->se_buflen +
+ offsetof(smb_svcenum_t, se_buf) + sizeof (ioc->hdr);
+ if (buflen_adjusted < svcenum->se_buflen || /* Overflow check 1, */
+ buflen_adjusted < offsetof(smb_svcenum_t, se_buf) || /* check 2, */
+ buflen_adjusted < sizeof (ioc->hdr) || /* check 3. */
+ buflen_adjusted > ioc->hdr.len) {
+ return (EINVAL);
+ }
/*
* Reality check that the buffer-length inside the enum
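
The partial comparisons above rely on the fact that when an unsigned addition
wraps, the result is smaller than either operand. A worked instance of the
principle (sketch):

	uint32_t a = 0xfffffff0, b = 0x20;
	uint32_t sum = a + b;		/* wraps to 0x10 */

	if (sum < a || sum < b) {
		/* Overflow detected: 0x10 < 0xfffffff0. */
	}
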
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c
index 87e29b21ae..e7d69f9896 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
* Copyright 2017 Sebastian Wiedenroth
*/
@@ -504,6 +505,9 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags)
cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
+ so->so_krecv_cb = NULL;
+ so->so_krecv_arg = NULL;
+
return (0);
}
@@ -657,6 +661,10 @@ sonode_fini(struct sonode *so)
if (so->so_filter_top != NULL)
sof_sonode_cleanup(so);
+ /* Clean up any remnants of krecv callbacks */
+ so->so_krecv_cb = NULL;
+ so->so_krecv_arg = NULL;
+
ASSERT(list_is_empty(&so->so_acceptq_list));
ASSERT(list_is_empty(&so->so_acceptq_defer));
ASSERT(!list_link_active(&so->so_acceptq_node));
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index e5bc6dc845..9b8186a8a0 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -128,7 +128,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
{
int error;
- SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr));
+ SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr));
ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD);
@@ -305,7 +305,7 @@ so_connect(struct sonode *so, struct sockaddr *name,
* This can happen if a non blocking operation caused an error.
*/
- if (so->so_error != 0) {
+ if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) {
mutex_enter(&so->so_lock);
error = sogeterr(so, B_TRUE);
mutex_exit(&so->so_lock);
@@ -404,7 +404,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
break;
}
- if (so->so_error != 0) {
+ if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) {
mutex_enter(&so->so_lock);
error = sogeterr(so, B_TRUE);
mutex_exit(&so->so_lock);
@@ -513,7 +513,7 @@ so_sendmblk_impl(struct sonode *so, struct nmsghdr *msg, int fflag,
error = EPIPE;
break;
}
- if (so->so_error != 0) {
+ if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) {
mutex_enter(&so->so_lock);
error = sogeterr(so, B_TRUE);
mutex_exit(&so->so_lock);
@@ -586,11 +586,6 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
- if ((so->so_mode & SM_SENDFILESUPP) == 0) {
- SO_UNBLOCK_FALLBACK(so);
- return (EOPNOTSUPP);
- }
-
error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top,
B_FALSE);
@@ -653,7 +648,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr,
{
int error;
- SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
+ SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
if (so->so_filter_active == 0 ||
(error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0)
@@ -702,7 +697,7 @@ so_getsockopt(struct sonode *so, int level, int option_name,
if (level == SOL_FILTER)
return (sof_getsockopt(so, option_name, optval, optlenp, cr));
- SO_BLOCK_FALLBACK(so,
+ SO_BLOCK_FALLBACK_SAFE(so,
SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
if ((so->so_filter_active == 0 ||
@@ -791,7 +786,7 @@ so_setsockopt(struct sonode *so, int level, int option_name,
if (level == SOL_FILTER)
return (sof_setsockopt(so, option_name, optval, optlen, cr));
- SO_BLOCK_FALLBACK(so,
+ SO_BLOCK_FALLBACK_SAFE(so,
SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
/* X/Open requires this check */
@@ -876,7 +871,7 @@ so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
* If there is a pending error, return error
* This can happen if a non blocking operation caused an error.
*/
- if (so->so_error != 0) {
+ if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) {
mutex_enter(&so->so_lock);
error = sogeterr(so, B_TRUE);
mutex_exit(&so->so_lock);
@@ -1329,6 +1324,26 @@ so_queue_msg_impl(struct sonode *so, mblk_t *mp,
}
}
+ mutex_enter(&so->so_lock);
+ if (so->so_krecv_cb != NULL) {
+ boolean_t cont;
+ so_krecv_f func = so->so_krecv_cb;
+ void *arg = so->so_krecv_arg;
+
+ mutex_exit(&so->so_lock);
+ cont = func(so, mp, msg_size, flags & MSG_OOB, arg);
+ mutex_enter(&so->so_lock);
+ if (cont == B_TRUE) {
+ space_left = so->so_rcvbuf;
+ } else {
+ so->so_rcv_queued = so->so_rcvlowat;
+ *errorp = ENOSPC;
+ space_left = -1;
+ }
+ goto done_unlock;
+ }
+ mutex_exit(&so->so_lock);
+
if (flags & MSG_OOB) {
so_queue_oob(so, mp, msg_size);
mutex_enter(&so->so_lock);
@@ -1607,6 +1622,13 @@ so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
return (ENOTCONN);
}
+ mutex_enter(&so->so_lock);
+ if (so->so_krecv_cb != NULL) {
+ mutex_exit(&so->so_lock);
+ return (EOPNOTSUPP);
+ }
+ mutex_exit(&so->so_lock);
+
if (msg->msg_flags & MSG_PEEK)
msg->msg_flags &= ~MSG_WAITALL;
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
index 957c8f93b4..df159a122c 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
@@ -24,6 +24,7 @@
*/
/*
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -670,10 +671,15 @@ so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
int more = 0;
int error;
ssize_t oobmark;
+ ssize_t copied = 0;
sodirect_t *sodp = so->so_direct;
+ xuio_t *xuio = NULL;
partial_read = B_FALSE;
*mctlp = NULL;
+ if ((uiop->uio_extflg & UIO_XUIO) != 0) {
+ xuio = (xuio_t *)uiop;
+ }
again:
mutex_enter(&so->so_lock);
again1:
@@ -784,8 +790,6 @@ again1:
* enabled socket, uio_resid can be 0.
*/
if (uiop->uio_resid >= 0) {
- ssize_t copied = 0;
-
if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
mutex_enter(&so->so_lock);
ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
@@ -843,6 +847,18 @@ again1:
}
if (mp != NULL) { /* more data blocks in msg */
more |= MOREDATA;
+
+ /*
+ * If requested, tally up remaining data along with the
+ * amount already copied.
+ */
+ if (xuio != NULL &&
+ xuio->xu_type == UIOTYPE_PEEKSIZE) {
+ xuio->xu_ext.xu_ps.xu_ps_set = B_TRUE;
+ xuio->xu_ext.xu_ps.xu_ps_size =
+ copied + msgdsize(mp);
+ }
+
if ((flags & (MSG_PEEK|MSG_TRUNC))) {
if (flags & MSG_PEEK) {
freemsg(mp);
@@ -2276,9 +2292,9 @@ so_tpi_fallback(struct sonode *so, struct cred *cr)
fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
/*
- * Cannot fallback if the socket has active filters
+ * Cannot fallback if the socket has active filters or a krecv callback.
*/
- if (so->so_filter_active > 0)
+ if (so->so_filter_active > 0 || so->so_krecv_cb != NULL)
return (EINVAL);
switch (so->so_family) {
@@ -2456,3 +2472,53 @@ out:
return (error);
}
+
+int
+so_krecv_set(sonode_t *so, so_krecv_f cb, void *arg)
+{
+ int ret;
+
+ if (cb == NULL && arg != NULL)
+ return (EINVAL);
+
+ SO_BLOCK_FALLBACK(so, so_krecv_set(so, cb, arg));
+
+ mutex_enter(&so->so_lock);
+ if (so->so_state & SS_FALLBACK_COMP) {
+ mutex_exit(&so->so_lock);
+ SO_UNBLOCK_FALLBACK(so);
+ return (ENOTSUP);
+ }
+
+ ret = so_lock_read(so, 0);
+ VERIFY(ret == 0);
+ /*
+ * Other consumers may actually care about getting extant data delivered
+ * to them; when they come along, they should figure out the best API
+ * for that.
+ */
+ so_rcv_flush(so);
+
+ so->so_krecv_cb = cb;
+ so->so_krecv_arg = arg;
+
+ so_unlock_read(so);
+ mutex_exit(&so->so_lock);
+ SO_UNBLOCK_FALLBACK(so);
+
+ return (0);
+}
+
+void
+so_krecv_unblock(sonode_t *so)
+{
+ mutex_enter(&so->so_lock);
+ VERIFY(so->so_krecv_cb != NULL);
+
+ so->so_rcv_queued = 0;
+ (void) so_check_flow_control(so);
+ /*
+ * so_check_flow_control() always drops so->so_lock, so we won't
+ * need to drop it ourselves.
+ */
+}
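
A consumer registers a callback matching the dispatch in so_queue_msg_impl():
it is handed the sonode, the message, its size, an OOB indicator, and the
registered argument, and returns B_TRUE to keep data flowing or B_FALSE to
assert flow control until so_krecv_unblock() is called. A hedged sketch
(hypothetical consumer; the callback is assumed to consume the mblk chain):

	/* Sketch: count bytes delivered to a kernel socket consumer. */
	static boolean_t
	my_krecv_cb(sonode_t *so, mblk_t *mp, size_t len, int oob,
	    void *arg)
	{
		uint64_t *bytes = arg;

		atomic_add_64(bytes, len);
		freemsg(mp);		/* we own the chain once called */
		return (B_TRUE);	/* keep receiving */
	}

	/* Registration, e.g. at consumer setup time: */
	/* (void) so_krecv_set(so, my_krecv_cb, &my_byte_count); */
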
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c
index 971523945e..7dca6ae6fc 100644
--- a/usr/src/uts/common/fs/sockfs/sockfilter.c
+++ b/usr/src/uts/common/fs/sockfs/sockfilter.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/systm.h>
@@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name,
/* Module loaded OK, so there must be an ops vector */
ASSERT(ent->sofe_mod != NULL);
+
+ /*
+ * Check again to confirm that attach is still OK: reject it if the
+ * module is not SOF_ATT_SAFE and an unsafe operation has already
+ * taken place.
+ */
+ if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 &&
+ so->so_state & SS_FILOP_UNSF) {
+ sof_instance_destroy(inst);
+ return (EINVAL);
+ }
+
inst->sofi_ops = &ent->sofe_mod->sofm_ops;
SOF_STAT_ADD(inst, tot_active_attach, 1);
@@ -1444,7 +1457,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
* sof_register(version, name, ops, flags)
*
* Register a socket filter identified by name `name' and which should use
- * the ops vector `ops' for event notification. `flags' should be set to 0.
+ * the ops vector `ops' for event notification. `flags' should be set to 0
+ * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An
+ * unsafe filter is one that cannot be attached after any socket operation
+ * has occurred. This is the legacy default. A "safe" filter can be attached
+ * even after some basic initial socket operations have taken place; this set
+ * is currently bind, getsockname, getsockopt and setsockopt. The constraints
+ * on when a "safe" filter can be attached are thus more relaxed, and more
+ * flexible.
* On success 0 is returned, otherwise an errno is returned.
*/
int
@@ -1452,14 +1471,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags)
{
sof_module_t *mod;
- _NOTE(ARGUNUSED(flags));
-
if (version != SOF_VERSION)
return (EINVAL);
mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP);
mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
(void) strcpy(mod->sofm_name, name);
+ mod->sofm_flags = flags;
mod->sofm_ops = *ops;
mutex_enter(&sof_module_lock);
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
index 7f7aece1f1..cf2ad8b20d 100644
--- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
+++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SOCKFS_SOCKFILTER_H
@@ -51,6 +52,7 @@ typedef struct sof_kstat sof_kstat_t;
struct sof_module {
char *sofm_name;
+ int sofm_flags;
sof_ops_t sofm_ops;
uint_t sofm_refcnt;
list_node_t sofm_node;
diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c
index ed3c5967e1..7a7651edb5 100644
--- a/usr/src/uts/common/fs/sockfs/socksubr.c
+++ b/usr/src/uts/common/fs/sockfs/socksubr.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
@@ -437,8 +438,10 @@ sogetoff(mblk_t *mp, t_uscalar_t offset,
*
* The underlying filesystem VSOCK vnode has a v_stream pointer that
* references the actual stream head (hence indirectly the actual sonode).
+ *
+ * This function is non-static so it can be used by brand emulation.
*/
-static int
+int
so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess,
vnode_t **vpp)
{
@@ -1883,7 +1886,7 @@ ssize_t
soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
{
struct uio auio;
- struct iovec aiov[MSG_MAXIOVLEN];
+ struct iovec aiov[1];
register vnode_t *vp;
int ioflag, rwflag;
ssize_t cnt;
diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c
index 4cbd079539..e0b6b5de43 100644
--- a/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c
@@ -21,6 +21,8 @@
/*
* Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
@@ -54,6 +56,7 @@
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>
#include <sys/policy.h>
+#include <sys/limits.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
@@ -86,12 +89,6 @@ extern void nl7c_init(void);
extern int sockfs_defer_nl7c_init;
/*
- * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
- * as there isn't a formal definition of IOV_MAX ???
- */
-#define MSG_MAXIOVLEN 16
-
-/*
* Kernel component of socket creation.
*
* The socket library determines which version number to use.
@@ -1021,9 +1018,10 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
STRUCT_HANDLE(nmsghdr, umsgptr);
struct nmsghdr lmsg;
struct uio auio;
- struct iovec aiov[MSG_MAXIOVLEN];
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ ssize_t iovsize = 0;
int iovcnt;
- ssize_t len;
+ ssize_t len, rval;
int i;
int *flagsp;
model_t model;
@@ -1066,22 +1064,37 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
iovcnt = lmsg.msg_iovlen;
- if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
+ if (iovcnt <= 0 || iovcnt > IOV_MAX) {
return (set_errno(EMSGSIZE));
}
+ if (iovcnt > IOV_MAX_STACK) {
+ iovsize = iovcnt * sizeof (struct iovec);
+ aiov = kmem_alloc(iovsize, KM_SLEEP);
+ }
+
#ifdef _SYSCALL32_IMPL
/*
* 32-bit callers need to have their iovec expanded, while ensuring
* that they can't move more than 2Gbytes of data in a single call.
*/
if (model == DATAMODEL_ILP32) {
- struct iovec32 aiov32[MSG_MAXIOVLEN];
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ ssize_t iov32size;
ssize32_t count32;
- if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
- iovcnt * sizeof (struct iovec32)))
+ iov32size = iovcnt * sizeof (struct iovec32);
+ if (iovsize != 0)
+ aiov32 = kmem_alloc(iov32size, KM_SLEEP);
+
+ if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
return (set_errno(EFAULT));
+ }
count32 = 0;
for (i = 0; i < iovcnt; i++) {
@@ -1089,15 +1102,28 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
iovlen32 = aiov32[i].iov_len;
count32 += iovlen32;
- if (iovlen32 < 0 || count32 < 0)
+ if (iovlen32 < 0 || count32 < 0) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
return (set_errno(EINVAL));
+ }
+
aiov[i].iov_len = iovlen32;
aiov[i].iov_base =
(caddr_t)(uintptr_t)aiov32[i].iov_base;
}
+
+ if (iovsize != 0)
+ kmem_free(aiov32, iov32size);
} else
#endif /* _SYSCALL32_IMPL */
if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EFAULT));
}
len = 0;
@@ -1105,6 +1131,9 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
ssize_t iovlen = aiov[i].iov_len;
len += iovlen;
if (iovlen < 0 || len < 0) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EINVAL));
}
}
@@ -1119,12 +1148,20 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
(do_useracc == 0 ||
useracc(lmsg.msg_control, lmsg.msg_controllen,
B_WRITE) != 0)) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EFAULT));
}
- return (recvit(sock, &lmsg, &auio, flags,
+ rval = recvit(sock, &lmsg, &auio, flags,
STRUCT_FADDR(umsgptr, msg_namelen),
- STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
+ STRUCT_FADDR(umsgptr, msg_controllen), flagsp);
+
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
+ return (rval);
}
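The allocation pattern introduced here recurs verbatim in sendmsg() below: a
fixed on-stack array handles up to IOV_MAX_STACK vectors, and the heap is used
only beyond that, with iovsize doubling as the "this was heap-allocated" flag
that every exit path must check. A condensed sketch of the idiom, with the
socket-specific work elided:

    struct iovec buf[IOV_MAX_STACK], *aiov = buf;
    ssize_t iovsize = 0;

    if (iovcnt <= 0 || iovcnt > IOV_MAX)
    	return (set_errno(EMSGSIZE));
    if (iovcnt > IOV_MAX_STACK) {
    	iovsize = iovcnt * sizeof (struct iovec);
    	aiov = kmem_alloc(iovsize, KM_SLEEP);	/* heap fallback */
    }
    /* ... copyin() the vectors, validate the lengths, do the I/O ... */
    if (iovsize != 0)
    	kmem_free(aiov, iovsize);	/* required on every exit path */
    return (rval);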
/*
@@ -1262,9 +1299,10 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
struct nmsghdr lmsg;
STRUCT_DECL(nmsghdr, u_lmsg);
struct uio auio;
- struct iovec aiov[MSG_MAXIOVLEN];
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ ssize_t iovsize = 0;
int iovcnt;
- ssize_t len;
+ ssize_t len, rval;
int i;
model_t model;
@@ -1307,7 +1345,7 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
iovcnt = lmsg.msg_iovlen;
- if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
+ if (iovcnt <= 0 || iovcnt > IOV_MAX) {
/*
* Unless this is XPG 4.2 we allow iovcnt == 0 to
* be compatible with SunOS 4.X and 4.4BSD.
@@ -1316,19 +1354,34 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
return (set_errno(EMSGSIZE));
}
+ if (iovcnt > IOV_MAX_STACK) {
+ iovsize = iovcnt * sizeof (struct iovec);
+ aiov = kmem_alloc(iovsize, KM_SLEEP);
+ }
+
#ifdef _SYSCALL32_IMPL
/*
* 32-bit callers need to have their iovec expanded, while ensuring
* that they can't move more than 2Gbytes of data in a single call.
*/
if (model == DATAMODEL_ILP32) {
- struct iovec32 aiov32[MSG_MAXIOVLEN];
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ ssize_t iov32size;
ssize32_t count32;
+ iov32size = iovcnt * sizeof (struct iovec32);
+ if (iovsize != 0)
+ aiov32 = kmem_alloc(iov32size, KM_SLEEP);
+
if (iovcnt != 0 &&
- copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
- iovcnt * sizeof (struct iovec32)))
+ copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
return (set_errno(EFAULT));
+ }
count32 = 0;
for (i = 0; i < iovcnt; i++) {
@@ -1336,17 +1389,30 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
iovlen32 = aiov32[i].iov_len;
count32 += iovlen32;
- if (iovlen32 < 0 || count32 < 0)
+ if (iovlen32 < 0 || count32 < 0) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
return (set_errno(EINVAL));
+ }
+
aiov[i].iov_len = iovlen32;
aiov[i].iov_base =
(caddr_t)(uintptr_t)aiov32[i].iov_base;
}
+
+ if (iovsize != 0)
+ kmem_free(aiov32, iov32size);
} else
#endif /* _SYSCALL32_IMPL */
if (iovcnt != 0 &&
copyin(lmsg.msg_iov, aiov,
(unsigned)iovcnt * sizeof (struct iovec))) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EFAULT));
}
len = 0;
@@ -1354,6 +1420,9 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
ssize_t iovlen = aiov[i].iov_len;
len += iovlen;
if (iovlen < 0 || len < 0) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EINVAL));
}
}
@@ -1364,7 +1433,12 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
auio.uio_segflg = UIO_USERSPACE;
auio.uio_limit = 0;
- return (sendit(sock, &lmsg, &auio, flags));
+ rval = sendit(sock, &lmsg, &auio, flags);
+
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
+ return (rval);
}
ssize_t
diff --git a/usr/src/uts/common/fs/sockfs/socktpi_impl.h b/usr/src/uts/common/fs/sockfs/socktpi_impl.h
index 6a515be122..24acb81a0a 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi_impl.h
+++ b/usr/src/uts/common/fs/sockfs/socktpi_impl.h
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SOCKFS_SOCKTPI_IMPL_H
@@ -56,6 +57,8 @@ extern int sogetrderr(vnode_t *, int, int *);
extern int sogetwrerr(vnode_t *, int, int *);
extern int so_addr_verify(struct sonode *, const struct sockaddr *,
socklen_t);
+extern int so_ux_lookup(struct sonode *, struct sockaddr_un *, int,
+ vnode_t **);
extern int so_ux_addr_xlate(struct sonode *, struct sockaddr *,
socklen_t, int, void **, socklen_t *);
extern void so_unix_close(struct sonode *);
diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c
index 74c4302da9..a4d983665b 100644
--- a/usr/src/uts/common/fs/swapfs/swap_subr.c
+++ b/usr/src/uts/common/fs/swapfs/swap_subr.c
@@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs)
* memory that can be used as swap space should do so by
* setting swapfs_desfree at boot time, not swapfs_minfree.
* However, swapfs_minfree is tunable by install as a
- * workaround for bugid 1147463.
+ * workaround for bugid 1147463. Note that swapfs_minfree is set
+ * to 1/8th of physical memory, but is clamped at 256 MB.
*/
- new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3);
+ new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3),
+ btopr(256 * 1024 * 1024));
}
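A worked example of the new clamp (memory size illustrative): on a 16 GB
system, pgs >> 3 corresponds to 2 GB, so the old code reserved 2 GB while the
new code yields MIN(MAX(2 MB, 2 GB), 256 MB) = 256 MB. A standalone sketch of
the same computation, in bytes rather than pages:

    /* Sketch of the swapfs_minfree clamp, in bytes instead of pages. */
    #define	DEMO_MAX(a, b)	((a) > (b) ? (a) : (b))
    #define	DEMO_MIN(a, b)	((a) < (b) ? (a) : (b))

    size_t
    minfree_bytes(size_t physmem_bytes)
    {
    	size_t lo = 2UL * 1024 * 1024;		/* 2 MB minimum */
    	size_t cap = 256UL * 1024 * 1024;	/* 256 MB ceiling */

    	return (DEMO_MIN(DEMO_MAX(lo, physmem_bytes >> 3), cap));
    }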
/*
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
index f6621c8097..1a620642cc 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
@@ -21,10 +21,9 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
@@ -383,20 +382,7 @@ tdirenter(
/*
* Unmake the inode we just made.
*/
- rw_enter(&tp->tn_rwlock, RW_WRITER);
- if ((tp->tn_type) == VDIR) {
- ASSERT(tdp == NULL);
- /*
- * cleanup allocs made by tdirinit()
- */
- tdirtrunc(tp);
- }
- mutex_enter(&tp->tn_tlock);
- tp->tn_nlink = 0;
- mutex_exit(&tp->tn_tlock);
- gethrestime(&tp->tn_ctime);
- rw_exit(&tp->tn_rwlock);
- tmpnode_rele(tp);
+ tmpnode_cleanup(tp);
tp = NULL;
}
} else if (tpp) {
@@ -431,6 +417,7 @@ tdirdelete(
enum dr_op op,
struct cred *cred)
{
+ struct tmount *tm;
struct tdirent *tpdp;
int error;
size_t namelen;
@@ -516,7 +503,8 @@ tdirdelete(
*/
namelen = strlen(tpdp->td_name) + 1;
- tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
+ tm = TNTOTM(dir);
+ tmp_kmem_free(tm, tpdp, sizeof (struct tdirent) + namelen);
dir->tn_size -= (sizeof (struct tdirent) + namelen);
dir->tn_dirents--;
@@ -538,19 +526,27 @@ tdirdelete(
* tdirinit is used internally to initialize a directory (dir)
* with '.' and '..' entries without checking permissions and locking
*/
-void
+int
tdirinit(
struct tmpnode *parent, /* parent of directory to initialize */
struct tmpnode *dir) /* the new directory */
{
+ struct tmount *tm;
struct tdirent *dot, *dotdot;
timestruc_t now;
ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
ASSERT(dir->tn_type == VDIR);
- dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
- dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
+ tm = TNTOTM(parent);
+ dot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 2, KM_SLEEP);
+ if (dot == NULL)
+ return (ENOSPC);
+ dotdot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 3, KM_SLEEP);
+ if (dotdot == NULL) {
+ tmp_kmem_free(tm, dot, sizeof (struct tdirent) + 2);
+ return (ENOSPC);
+ }
/*
* Initialize the entries
@@ -601,6 +597,8 @@ tdirinit(
dir->tn_size = 2 * sizeof (struct tdirent) + 5; /* dot and dotdot */
dir->tn_dirents = 2;
dir->tn_nlink = 2;
+
+ return (0);
}
@@ -612,6 +610,7 @@ tdirtrunc(struct tmpnode *dir)
{
struct tdirent *tdp;
struct tmpnode *tp;
+ struct tmount *tm;
size_t namelen;
timestruc_t now;
int isvattrdir, isdotdot, skip_decr;
@@ -619,6 +618,8 @@ tdirtrunc(struct tmpnode *dir)
ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
ASSERT(dir->tn_type == VDIR);
+ tm = TNTOTM(dir);
+
isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 1 : 0;
for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) {
ASSERT(tdp->td_next != tdp);
@@ -650,7 +651,7 @@ tdirtrunc(struct tmpnode *dir)
tmpfs_hash_out(tdp);
- tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
+ tmp_kmem_free(tm, tdp, sizeof (struct tdirent) + namelen);
dir->tn_size -= (sizeof (struct tdirent) + namelen);
dir->tn_dirents--;
}
@@ -903,6 +904,7 @@ tdiraddentry(
enum de_op op,
struct tmpnode *fromtp)
{
+ struct tmount *tm;
struct tdirent *tdp, *tpdp;
size_t namelen, alloc_size;
timestruc_t now;
@@ -923,9 +925,10 @@ tdiraddentry(
/*
* Allocate and initialize directory entry
*/
+ tm = TNTOTM(dir);
namelen = strlen(name) + 1;
alloc_size = namelen + sizeof (struct tdirent);
- tdp = tmp_memalloc(alloc_size, 0);
+ tdp = tmp_kmem_zalloc(tm, alloc_size, KM_NOSLEEP | KM_NORMALPRI);
if (tdp == NULL)
return (ENOSPC);
@@ -1025,7 +1028,10 @@ tdirmaketnode(
((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
return (EOVERFLOW);
type = va->va_type;
- tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
+ tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP);
+ if (tp == NULL) {
+ return (ENOSPC);
+ }
tmpnode_init(tm, tp, va, cred);
/* setup normal file/dir's extended attribute directory */
@@ -1087,8 +1093,13 @@ tdirmaketnode(
if (va->va_mask & AT_MTIME)
tp->tn_mtime = va->va_mtime;
- if (op == DE_MKDIR)
- tdirinit(dir, tp);
+ if (op == DE_MKDIR) {
+ int ret;
+ if ((ret = tdirinit(dir, tp)) != 0) {
+ tmpnode_cleanup(tp);
+ return (ret);
+ }
+ }
*newnode = tp;
return (0);
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c
index 8723631555..0c48c03a75 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -43,6 +43,7 @@
#include <sys/fs/tmpnode.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
+#include <vm/anon.h>
#define KILOBYTE 1024
#define MEGABYTE (1024 * KILOBYTE)
@@ -54,6 +55,80 @@
extern pgcnt_t swapfs_minfree;
+void *
+tmp_kmem_zalloc(struct tmount *tm, size_t size, int flag)
+{
+ void *buf;
+ zone_t *zone;
+ size_t pages;
+
+ mutex_enter(&tm->tm_contents);
+ zone = tm->tm_vfsp->vfs_zone;
+ if (tm->tm_anonmem + size > tm->tm_anonmax ||
+ tm->tm_anonmem + size < tm->tm_anonmem ||
+ size + ptob(tmpfs_minfree) <= size ||
+ !anon_checkspace(size + ptob(tmpfs_minfree), zone)) {
+ mutex_exit(&tm->tm_contents);
+ return (NULL);
+ }
+
+ /*
+ * Only make anonymous memory reservations when a page boundary is
+ * crossed. This is necessary since the anon_resv functions round up
+ * to PAGESIZE internally.
+ */
+ pages = btopr(tm->tm_allocmem + size);
+ pages -= btopr(tm->tm_allocmem);
+ if (pages > 0 && anon_try_resv_zone(ptob(pages), zone) == 0) {
+ mutex_exit(&tm->tm_contents);
+ return (NULL);
+ }
+
+ tm->tm_allocmem += size;
+ tm->tm_anonmem += size;
+ mutex_exit(&tm->tm_contents);
+
+ buf = kmem_zalloc(size, flag);
+ if (buf == NULL) {
+ mutex_enter(&tm->tm_contents);
+ ASSERT(tm->tm_anonmem > tm->tm_anonmem - size);
+ tm->tm_anonmem -= size;
+ if (pages > 0) {
+ /*
+ * Re-chasing the zone pointer is necessary since a
+ * forced umount could have been performed while the
+ * tm_contents lock was dropped during allocation.
+ */
+ anon_unresv_zone(ptob(pages), tm->tm_vfsp->vfs_zone);
+ }
+ mutex_exit(&tm->tm_contents);
+ }
+
+ return (buf);
+}
+
+void
+tmp_kmem_free(struct tmount *tm, void *buf, size_t size)
+{
+ size_t pages;
+
+ kmem_free(buf, size);
+ mutex_enter(&tm->tm_contents);
+ ASSERT(tm->tm_anonmem > tm->tm_anonmem - size);
+ tm->tm_anonmem -= size;
+ pages = btopr(tm->tm_allocmem);
+ tm->tm_allocmem -= size;
+ pages -= btopr(tm->tm_allocmem);
+ /*
+ * Like the tmp_kmem_zalloc case, only unreserve anonymous memory when
+ * a page boundary has been crossed.
+ */
+ if (pages > 0) {
+ anon_unresv_zone(ptob(pages), tm->tm_vfsp->vfs_zone);
+ }
+ mutex_exit(&tm->tm_contents);
+}
+
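The page-boundary accounting above is easiest to see with numbers (sizes
illustrative): if tm_allocmem is 1000 and a 5000-byte allocation arrives,
btopr(6000) - btopr(1000) = 2 - 1 = 1, so only one additional page is
reserved even though the raw size exceeds a page; a later allocation that
stays within the current page reserves nothing. A userland sketch of the
delta computation, assuming 4 KB pages:

    #include <stdio.h>

    #define	DEMO_PAGESIZE	4096UL
    /* bytes-to-pages, rounding up -- mirrors the kernel's btopr(). */
    #define	demo_btopr(x)	(((x) + DEMO_PAGESIZE - 1) / DEMO_PAGESIZE)

    int
    main(void)
    {
    	unsigned long allocmem = 1000, size = 5000;
    	unsigned long pages =
    	    demo_btopr(allocmem + size) - demo_btopr(allocmem);

    	/* Prints 1: only one new page boundary is crossed. */
    	printf("pages to reserve: %lu\n", pages);
    	return (0);
    }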
int
tmp_taccess(void *vtp, int mode, struct cred *cred)
{
@@ -99,42 +174,8 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry,
}
/*
- * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded
- * or the 'musthave' flag is set. 'musthave' allocations should
- * always be subordinate to normal allocations so that tmpfs_maxkmem
- * can't be exceeded by more than a few KB. Example: when creating
- * a new directory, the tmpnode is a normal allocation; if that
- * succeeds, the dirents for "." and ".." are 'musthave' allocations.
- */
-void *
-tmp_memalloc(size_t size, int musthave)
-{
- static time_t last_warning;
- time_t now;
-
- if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem ||
- musthave)
- return (kmem_zalloc(size, KM_SLEEP));
-
- atomic_add_long(&tmp_kmemspace, -size);
- now = gethrestime_sec();
- if (last_warning != now) {
- last_warning = now;
- cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit");
- }
- return (NULL);
-}
-
-void
-tmp_memfree(void *cp, size_t size)
-{
- kmem_free(cp, size);
- atomic_add_long(&tmp_kmemspace, -size);
-}
-
-/*
- * Convert a string containing a number (number of bytes) to a pgcnt_t,
- * containing the corresponding number of pages. On 32-bit kernels, the
+ * Convert a string containing a number (number of bytes) to a size_t,
+ * containing the corresponding number of bytes. On 32-bit kernels, the
* maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value
* returned in 'maxpg' is at most ULONG_MAX.
*
@@ -152,7 +193,7 @@ tmp_memfree(void *cp, size_t size)
* error.
*/
int
-tmp_convnum(char *str, pgcnt_t *maxpg)
+tmp_convnum(char *str, size_t *maxbytes)
{
u_longlong_t num = 0;
#ifdef _LP64
@@ -160,6 +201,7 @@ tmp_convnum(char *str, pgcnt_t *maxpg)
#else
u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX;
#endif
+ size_t pages;
char *c;
const struct convchar {
char *cc_char;
@@ -250,13 +292,21 @@ valid_char:
done:
/*
- * Since btopr() rounds up to page granularity, this round-up can
- * cause an overflow only if 'num' is between (max_bytes - PAGESIZE)
- * and (max_bytes). In this case the resulting number is zero, which
- * is what we check for below.
+ * We've been given a size in bytes, but we want to guarantee at least
+ * one page's worth no matter what, which is why we use btopr to round
+ * up. That rounding can overflow only if 'num' is between
+ * (max_bytes - PAGESIZE) and max_bytes, in which case the resulting
+ * page count is zero -- which is what we check for below. Since we
+ * require at least one page, a page count of zero is an error
+ * regardless of its cause.
*/
- if ((*maxpg = (pgcnt_t)btopr(num)) == 0 && num != 0)
+ pages = btopr(num);
+ if (pages == 0) {
return (EINVAL);
+ }
+
+ *maxbytes = ptob(pages);
+
return (0);
}
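To make the rounding concrete (values illustrative): on a 4 KB-page system, a
mount argument of 102400 bytes yields pages = btopr(102400) = 25 and
*maxbytes = ptob(25) = 102400, while an argument of 1 byte rounds up to a
full page, giving *maxbytes = 4096. A value within one page of max_bytes
wraps btopr through zero and now returns EINVAL, as does an explicit size of
zero.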
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
index 51e57b2611..13ea356924 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -64,21 +65,35 @@ tmp_resv(
int pagecreate) /* call anon_resv if set */
{
pgcnt_t pages = btopr(delta);
+ size_t pbytes = ptob(pages);
zone_t *zone;
ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
ASSERT(tp->tn_type == VREG);
+
/*
- * pagecreate is set only if we actually need to call anon_resv
- * to reserve an additional page of anonymous memory.
- * Since anon_resv always reserves a page at a time,
- * it should only get called when we know we're growing the
- * file into a new page or filling a hole.
+ * pagecreate is set only if we actually need to call anon_resv to
+ * reserve an additional page of anonymous memory. Since anon_resv
+ * always reserves a page at a time, it should only get called when we
+ * know we're growing the file into a new page or filling a hole. This
+ * is why we transform delta into a number of pages. However, because we
+ * track bytes and not pages, we convert that back to a number of bytes
+ * that we allocate against.
*
- * Deny if trying to reserve more than tmpfs can allocate
+ * Deny the reservation if it exceeds what tmpfs may allocate, if the
+ * accounting would overflow, or if rounding the delta up to pages
+ * overflowed. The overflow arises because btopr rounds up by first
+ * adding a page's worth: a delta within one page of SIZE_MAX wraps
+ * around and is then shifted back to a page count of zero. Hence the
+ * following check.
*/
+ if (pages == 0 && delta != 0)
+ return (1);
+
zone = tm->tm_vfsp->vfs_zone;
- if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) ||
+ if (pagecreate && ((tm->tm_anonmem + pbytes > tm->tm_anonmax) ||
+ (tm->tm_anonmem + pbytes < tm->tm_anonmem) ||
+ (ptob(pages + tmpfs_minfree) <= pbytes) ||
(!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) ||
(anon_try_resv_zone(delta, zone) == 0))) {
return (1);
@@ -89,7 +104,7 @@ tmp_resv(
*/
if (pagecreate) {
mutex_enter(&tm->tm_contents);
- tm->tm_anonmem += pages;
+ tm->tm_anonmem += pbytes;
mutex_exit(&tm->tm_contents);
TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu",
@@ -110,13 +125,27 @@ tmp_unresv(
struct tmpnode *tp,
size_t delta)
{
+ size_t pages, pbytes;
+
ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
ASSERT(tp->tn_type == VREG);
+ /*
+ * If this is true, we have a grievous overflow bug and our size
+ * accounting has been corrupted: an amount to truncate this close to
+ * SIZE_MAX would imply that all of memory was used for this one file.
+ * No matter how small the kernel, it always needs at least one page.
+ */
+ pages = btopr(delta);
+ if (pages == 0 && delta != 0)
+ panic("tmpfs unsigned overflow detected");
+ pbytes = ptob(pages);
+
anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone);
mutex_enter(&tm->tm_contents);
- tm->tm_anonmem -= btopr(delta);
+ ASSERT(tm->tm_anonmem > tm->tm_anonmem - pbytes);
+ tm->tm_anonmem -= pbytes;
mutex_exit(&tm->tm_contents);
TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu", tp, delta);
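The wrap that both the tmp_resv check and the tmp_unresv panic guard against
is concrete arithmetic: with 4 KB pages, a delta of SIZE_MAX - 100 has btopr
first add PAGESIZE - 1 = 4095, which wraps modulo 2^64 to 3994; the
subsequent shift then yields pages == 0 even though delta != 0, which is
exactly the condition both functions test.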
@@ -154,6 +183,26 @@ tmpnode_growmap(struct tmpnode *tp, ulong_t newsize)
}
/*
+ * This is used to clean up a tmpnode that hasn't made it out the door. In other
+ * words, we allocated it and did a tmpnode_init; however, before it could get
+ * fully inserted into a directory, bad things happened and it failed.
+ */
+void
+tmpnode_cleanup(struct tmpnode *tp)
+{
+ rw_enter(&tp->tn_rwlock, RW_WRITER);
+ if ((tp->tn_type) == VDIR) {
+ tdirtrunc(tp);
+ }
+ mutex_enter(&tp->tn_tlock);
+ tp->tn_nlink = 0;
+ mutex_exit(&tp->tn_tlock);
+ gethrestime(&tp->tn_ctime);
+ rw_exit(&tp->tn_rwlock);
+ tmpnode_rele(tp);
+}
+
+/*
* Initialize a tmpnode and add it to file list under mount point.
*/
void
@@ -232,7 +281,6 @@ tmpnode_trunc(
{
size_t oldsize = tp->tn_size;
size_t delta;
- struct vnode *vp = TNTOV(tp);
timestruc_t now;
int error = 0;
@@ -316,7 +364,7 @@ tmpnode_trunc(
/* Delete anon array for tmpnode */
ASSERT(tp->tn_nblocks == 0);
ASSERT(anon_get_ptr(tp->tn_anon, 0) == NULL);
- ASSERT(!vn_has_cached_data(vp));
+ ASSERT(!vn_has_cached_data(TNTOV(tp)));
anon_release(tp->tn_anon, tp->tn_asize);
tp->tn_anon = NULL;
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
index a7cf62cb99..c52a6f7c77 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -56,6 +56,15 @@
static int tmpfsfstype;
/*
+ * tmpfs_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. With forced umount support, the
+ * filesystem module must not be allowed to go away before the last
+ * VFS_FREEVFS() call has been made. Since this is just an atomic counter,
+ * there's no need for locking.
+ */
+static uint32_t tmpfs_mountcount;
+
+/*
* tmpfs vfs operations.
*/
static int tmpfsinit(int, char *);
@@ -65,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *);
static int tmp_root(struct vfs *, struct vnode **);
static int tmp_statvfs(struct vfs *, struct statvfs64 *);
static int tmp_vget(struct vfs *, struct vnode **, struct fid *);
+static void tmp_freevfs(vfs_t *vfsp);
/*
* Loadable module wrapper
@@ -123,6 +133,14 @@ _fini()
{
int error;
+ /*
+ * If a forcibly unmounted instance is still hanging around, we cannot
+ * allow the module to be unloaded because that would cause panics once
+ * the VFS framework decides it's time to call into VFS_FREEVFS().
+ */
+ if (tmpfs_mountcount)
+ return (EBUSY);
+
error = mod_remove(&modlinkage);
if (error)
return (error);
@@ -141,14 +159,6 @@ _info(struct modinfo *modinfop)
}
/*
- * The following are patchable variables limiting the amount of system
- * resources tmpfs can use.
- *
- * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory
- * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries)
- * It is not determined by setting a hard limit but rather as a percentage of
- * physical memory which is determined when tmpfs is first used in the system.
- *
* tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
* the rest of the system. In other words, if the amount of free swap space
* in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
@@ -157,9 +167,7 @@ _info(struct modinfo *modinfop)
* There is also a per mount limit on the amount of swap space
* (tmount.tm_anonmax) settable via a mount option.
*/
-size_t tmpfs_maxkmem = 0;
size_t tmpfs_minfree = 0;
-size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */
static major_t tmpfs_major;
static minor_t tmpfs_minor;
@@ -178,6 +186,7 @@ tmpfsinit(int fstype, char *name)
VFSNAME_ROOT, { .vfs_root = tmp_root },
VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs },
VFSNAME_VGET, { .vfs_vget = tmp_vget },
+ VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs },
NULL, NULL
};
int error;
@@ -212,18 +221,12 @@ tmpfsinit(int fstype, char *name)
tmpfs_minfree = btopr(TMPMINFREE);
}
- /*
- * The maximum amount of space tmpfs can allocate is
- * TMPMAXPROCKMEM percent of kernel memory
- */
- if (tmpfs_maxkmem == 0)
- tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM);
-
if ((tmpfs_major = getudev()) == (major_t)-1) {
cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number.");
tmpfs_major = 0;
}
mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
+ tmpfs_mountcount = 0;
return (0);
}
@@ -234,7 +237,7 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
struct tmpnode *tp;
struct pathname dpn;
int error;
- pgcnt_t anonmax;
+ size_t anonmax;
struct vattr rattr;
int got_attrs;
boolean_t mode_arg = B_FALSE;
@@ -278,7 +281,18 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
if ((error = tmp_convnum(argstr, &anonmax)) != 0)
goto out;
} else {
- anonmax = ULONG_MAX;
+ anonmax = SIZE_MAX;
+ }
+
+ /*
+ * The "mode" mount argument allows the operator to override the
+ * permissions of the root of the tmpfs mount.
+ */
+ if (vfs_optionisset(vfsp, "mode", &argstr)) {
+ if ((error = tmp_convmode(argstr, &root_mode)) != 0) {
+ goto out;
+ }
+ mode_arg = B_TRUE;
}
/*
@@ -311,7 +325,8 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
goto out;
}
- if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) {
+ if ((tm = kmem_zalloc(sizeof (struct tmount),
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
pn_free(&dpn);
error = ENOMEM;
goto out;
@@ -343,17 +358,37 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
vfsp->vfs_bsize = PAGESIZE;
vfsp->vfs_flag |= VFS_NOTRUNC;
vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype);
- tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE);
+ tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
(void) strcpy(tm->tm_mntpath, dpn.pn_path);
/*
+ * Preemptively set vfs_zone before any of the tmp_kmem_* functions are
+ * called. That field is not populated until after a successful
+ * VFS_MOUNT when domount() sets vfsp metadata via vfs_add(). An
+ * accurate value is required for proper swap usage accounting.
+ */
+ ASSERT0(uap->flags & MS_REMOUNT);
+ ASSERT(vfsp->vfs_zone == NULL);
+ vfsp->vfs_zone = curproc->p_zone;
+
+ /*
* allocate and initialize root tmpnode structure
*/
bzero(&rattr, sizeof (struct vattr));
rattr.va_mode = (mode_t)(S_IFDIR | root_mode);
rattr.va_type = VDIR;
rattr.va_rdev = 0;
- tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
+ tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP);
+ if (tp == NULL) {
+ kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
+ mutex_destroy(&tm->tm_contents);
+ mutex_destroy(&tm->tm_renamelck);
+ kmem_free(tm, sizeof (struct tmount));
+
+ pn_free(&dpn);
+ error = ENOMEM;
+ goto out;
+ }
tmpnode_init(tm, tp, &rattr, cr);
/*
@@ -392,12 +427,34 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
tp->tn_nlink = 0;
tm->tm_rootnode = tp;
- tdirinit(tp, tp);
+ if (tdirinit(tp, tp) != 0) {
+ /*
+ * While we would normally let our VOP_INACTIVE function take
+ * care of cleaning up here, we're in a bit of a delicate
+ * situation, so we do so manually. While it's tempting to try
+ * to rely upon tmp_freevfs() and others, it's probably safer
+ * for the time being to do this manually at the cost of duplication.
+ */
+ vn_invalid(TNTOV(tp));
+ rw_destroy(&tp->tn_rwlock);
+ mutex_destroy(&tp->tn_tlock);
+ vn_free(TNTOV(tp));
+ tmp_kmem_free(tm, tp, sizeof (struct tmpnode));
+
+ kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
+ mutex_destroy(&tm->tm_contents);
+ mutex_destroy(&tm->tm_renamelck);
+ kmem_free(tm, sizeof (struct tmount));
+ pn_free(&dpn);
+ error = ENOMEM;
+ goto out;
+ }
rw_exit(&tp->tn_rwlock);
pn_free(&dpn);
error = 0;
+ atomic_inc_32(&tmpfs_mountcount);
out:
if (error == 0)
@@ -413,36 +470,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
struct tmpnode *tnp, *cancel;
struct vnode *vp;
int error;
+ uint_t cnt;
+ int i;
if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
return (error);
- /*
- * forced unmount is not supported by this file system
- * and thus, ENOTSUP, is being returned.
- */
- if (flag & MS_FORCE)
- return (ENOTSUP);
-
mutex_enter(&tm->tm_contents);
/*
- * If there are no open files, only the root node should have
- * a reference count.
+ * In the normal unmount case (non-forced unmount), if there are no
+ * open files, only the root node should have a reference count.
+ *
* With tm_contents held, nothing can be added or removed.
* There may be some dirty pages. To prevent fsflush from
* disrupting the unmount, put a hold on each node while scanning.
* If we find a previously referenced node, undo the holds we have
* placed and fail EBUSY.
+ *
+ * However, in the case of a forced umount, things are a bit different.
+ * An additional VFS_HOLD is added for each outstanding VN_HOLD to
+ * ensure that the file system is not cleaned up (tmp_freevfs) until
+ * the last vfs hold is dropped. This happens in tmp_inactive as the
+ * vnodes are released. Also, we can't add an additional VN_HOLD in
+ * this case since that would prevent tmp_inactive from ever being
+ * called. Finally, we do need to drop the zone ref now (zone_rele_ref)
+ * so that the zone is not blocked waiting for the final file system
+ * cleanup.
*/
tnp = tm->tm_rootnode;
- if (TNTOV(tnp)->v_count > 1) {
+
+ vp = TNTOV(tnp);
+ mutex_enter(&vp->v_lock);
+ cnt = vp->v_count;
+ if (flag & MS_FORCE) {
+ vfsp->vfs_flag |= VFS_UNMOUNTED;
+ /* Extra hold which we rele below when we drop the zone ref */
+ VFS_HOLD(vfsp);
+
+ for (i = 1; i < cnt; i++)
+ VFS_HOLD(vfsp);
+
+ /* drop the mutex now because no one can find this mount */
+ mutex_exit(&tm->tm_contents);
+ } else if (cnt > 1) {
+ mutex_exit(&vp->v_lock);
mutex_exit(&tm->tm_contents);
return (EBUSY);
}
+ mutex_exit(&vp->v_lock);
+ /*
+ * Check for open files. An open file causes everything to unwind
+ * unless this is a forced umount.
+ */
for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) {
- if ((vp = TNTOV(tnp))->v_count > 0) {
+ vp = TNTOV(tnp);
+ mutex_enter(&vp->v_lock);
+ cnt = vp->v_count;
+ if (flag & MS_FORCE) {
+ for (i = 0; i < cnt; i++)
+ VFS_HOLD(vfsp);
+
+ /*
+ * In the case of a forced umount don't add an
+ * additional VN_HOLD on the already held vnodes, like
+ * we do in the non-forced unmount case. If the
+ * cnt > 0, then the vnode already has at least one
+ * hold and we need tmp_inactive to get called when the
+ * last pre-existing hold on the node is released so
+ * that we can VFS_RELE the VFS holds we just added.
+ */
+ if (cnt == 0) {
+ /* directly add VN_HOLD since have the lock */
+ vp->v_count++;
+ }
+
+ mutex_exit(&vp->v_lock);
+
+ /*
+ * If the tmpnode has any pages associated with it
+ * (i.e. if it's a normal file with non-zero size), the
+ * tmpnode could still be discovered by pageout or
+ * fsflush via the page vnode pointers. To prevent this
+ * from interfering with tmp_freevfs, truncate the
+ * tmpnode now.
+ */
+ if (tnp->tn_size != 0 && tnp->tn_type == VREG) {
+ rw_enter(&tnp->tn_rwlock, RW_WRITER);
+ rw_enter(&tnp->tn_contents, RW_WRITER);
+
+ (void) tmpnode_trunc(tm, tnp, 0);
+
+ rw_exit(&tnp->tn_contents);
+ rw_exit(&tnp->tn_rwlock);
+
+ ASSERT(tnp->tn_size == 0);
+ ASSERT(tnp->tn_nblocks == 0);
+ }
+ } else if (cnt > 0) {
+ /* An open file; unwind the holds we've been adding. */
+ mutex_exit(&vp->v_lock);
cancel = tm->tm_rootnode->tn_forw;
while (cancel != tnp) {
vp = TNTOV(cancel);
@@ -452,14 +580,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
}
mutex_exit(&tm->tm_contents);
return (EBUSY);
+ } else {
+ /* directly add a VN_HOLD since we have the lock */
+ vp->v_count++;
+ mutex_exit(&vp->v_lock);
}
- VN_HOLD(vp);
}
- /*
- * We can drop the mutex now because no one can find this mount
- */
- mutex_exit(&tm->tm_contents);
+ if (flag & MS_FORCE) {
+ /*
+ * Drop the zone ref now since we don't know how long it will
+ * be until the final vfs_rele is called by tmp_inactive.
+ */
+ if (vfsp->vfs_zone) {
+ zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
+ ZONE_REF_VFS);
+ vfsp->vfs_zone = NULL;
+ }
+ /* We can now drop the extra hold we added above. */
+ VFS_RELE(vfsp);
+ } else {
+ /*
+ * For the non-forced case, we can drop the mutex now because
+ * no one can find this mount anymore
+ */
+ vfsp->vfs_flag |= VFS_UNMOUNTED;
+ mutex_exit(&tm->tm_contents);
+ }
+
+ return (0);
+}
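To trace the forced-unmount bookkeeping (counts illustrative): a root vnode
with v_count == 3 picks up two VFS_HOLDs from the "i = 1" loop, on top of the
transient hold released once the zone reference is dropped; a non-root vnode
with v_count == 2 picks up two VFS_HOLDs and, since it is already held, no
new VN_HOLD; an idle vnode with v_count == 0 picks up no VFS_HOLD but one
VN_HOLD, ensuring tmp_inactive eventually runs to tear it down. The VFS_RELE
calls in tmp_inactive then drain these holds until VFS_FREEVFS() fires.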
+
+/*
+ * Implementation of VFS_FREEVFS() to support forced umounts. This is called by
+ * the vfs framework after umount and the last VFS_RELE, to trigger the release
+ * of any resources still associated with the given vfs_t. We only add
+ * additional VFS_HOLDs during the forced umount case, so this is normally
+ * called immediately after tmp_umount.
+ */
+void
+tmp_freevfs(vfs_t *vfsp)
+{
+ struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
+ struct tmpnode *tnp;
+ struct vnode *vp;
/*
* Free all kmemalloc'd and anonalloc'd memory associated with
@@ -469,6 +633,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
* tmpnode_free which assumes that the directory entry has been
* removed before the file.
*/
+
+ /*
+ * Now that we are tearing ourselves down, we need to remove the
+ * UNMOUNTED flag. If we don't, the VN_RELE calls made as we remove
+ * files from the system will drive the hold counts negative. Doing
+ * this seems a bit cleaner than setting a separate teardown flag on
+ * the tmount.
+ */
+ vfsp->vfs_flag &= ~VFS_UNMOUNTED;
+
/*
* Remove all directory entries
*/
@@ -535,15 +709,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
ASSERT(tm->tm_mntpath);
- tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
+ kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
ASSERT(tm->tm_anonmem == 0);
mutex_destroy(&tm->tm_contents);
mutex_destroy(&tm->tm_renamelck);
- tmp_memfree(tm, sizeof (struct tmount));
+ kmem_free(tm, sizeof (struct tmount));
- return (0);
+ /* Allow _fini() to succeed now */
+ atomic_dec_32(&tmpfs_mountcount);
}
/*
@@ -605,18 +780,19 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
* If tm_anonmax for this mount is less than the available swap space
* (minus the amount tmpfs can't use), use that instead
*/
- if (blocks > tmpfs_minfree)
+ if (blocks > tmpfs_minfree && tm->tm_anonmax > tm->tm_anonmem) {
sbp->f_bfree = MIN(blocks - tmpfs_minfree,
- tm->tm_anonmax - tm->tm_anonmem);
- else
+ btop(tm->tm_anonmax) - btopr(tm->tm_anonmem));
+ } else {
sbp->f_bfree = 0;
+ }
sbp->f_bavail = sbp->f_bfree;
/*
* Total number of blocks is what's available plus what's been used
*/
- sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem);
+ sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + btopr(tm->tm_anonmem));
if (eff_zid != GLOBAL_ZONEUNIQID &&
zp->zone_max_swap_ctl != UINT64_MAX) {
@@ -646,13 +822,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
* available to tmpfs. This is fairly inaccurate since it doesn't
* take into account the names stored in the directory entries.
*/
- if (tmpfs_maxkmem > tmp_kmemspace)
- sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) /
- (sizeof (struct tmpnode) + sizeof (struct tdirent));
- else
- sbp->f_ffree = 0;
-
- sbp->f_files = tmpfs_maxkmem /
+ sbp->f_ffree = sbp->f_files = ptob(availrmem) /
(sizeof (struct tmpnode) + sizeof (struct tdirent));
sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
(void) cmpldev(&d32, vfsp->vfs_dev);
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
index a09f206d88..a356f22750 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2016 RackTop Systems.
* Copyright (c) 2017 by Delphix. All rights reserved.
@@ -586,6 +586,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
struct tmount *tm = (struct tmount *)VTOTM(vp);
int error;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
/*
* We don't currently support reading non-regular files
*/
@@ -615,6 +619,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
struct tmount *tm = (struct tmount *)VTOTM(vp);
int error;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
/*
* We don't currently support writing to non-regular files
*/
@@ -788,8 +796,13 @@ tmp_setattr(
rw_exit(&tp->tn_contents);
rw_exit(&tp->tn_rwlock);
- if (error == 0 && vap->va_size == 0)
- vnevent_truncate(vp, ct);
+ if (error == 0) {
+ if (vap->va_size == 0) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
goto out1;
}
@@ -835,6 +848,9 @@ tmp_lookup(
struct tmpnode *ntp = NULL;
int error;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
/* allow cd into @ dir */
if (flags & LOOKUP_XATTR) {
@@ -853,6 +869,8 @@ tmp_lookup(
rw_enter(&tp->tn_rwlock, RW_WRITER);
if (tp->tn_xattrdp == NULL) {
+ int err;
+
if (!(flags & CREATE_XATTR_DIR)) {
rw_exit(&tp->tn_rwlock);
return (ENOENT);
@@ -873,9 +891,13 @@ tmp_lookup(
return (error);
}
- xdp = tmp_memalloc(sizeof (struct tmpnode),
- TMP_MUSTHAVE);
tm = VTOTM(dvp);
+ xdp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode),
+ KM_SLEEP);
+ if (xdp == NULL) {
+ rw_exit(&tp->tn_rwlock);
+ return (ENOSPC);
+ }
tmpnode_init(tm, xdp, &tp->tn_attr, NULL);
/*
* Fix-up fields unique to attribute directories.
@@ -893,7 +915,16 @@ tmp_lookup(
}
xdp->tn_vnode->v_type = VDIR;
xdp->tn_vnode->v_flag |= V_XATTRDIR;
- tdirinit(tp, xdp);
+ if ((err = tdirinit(tp, xdp)) != 0) {
+ rw_exit(&tp->tn_rwlock);
+ /*
+ * This never got properly initialized so we can
+ * just clean it up.
+ */
+ xdp->tn_vnode->v_flag &= ~V_XATTRDIR;
+ tmpnode_cleanup(xdp);
+ return (err);
+ }
tp->tn_xattrdp = xdp;
} else {
VN_HOLD(tp->tn_xattrdp->tn_vnode);
@@ -1302,10 +1333,8 @@ tmp_rename(
vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct);
/*
* vnevent_rename_dest is called in tdirenter().
- * Notify the target dir if not same as source dir.
*/
- if (ndvp != odvp)
- vnevent_rename_dest_dir(ndvp, ct);
+ vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct);
}
done:
@@ -1474,6 +1503,10 @@ tmp_readdir(
int reclen;
caddr_t outbuf;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
if (uiop->uio_loffset >= MAXOFF_T) {
if (eofp)
*eofp = 1;
@@ -1607,12 +1640,12 @@ tmp_symlink(
rw_exit(&parent->tn_rwlock);
if (error) {
- if (self)
+ if (self != NULL)
tmpnode_rele(self);
return (error);
}
len = strlen(tnm) + 1;
- cp = tmp_memalloc(len, 0);
+ cp = tmp_kmem_zalloc(tm, len, KM_NOSLEEP | KM_NORMALPRI);
if (cp == NULL) {
tmpnode_rele(self);
return (ENOSPC);
@@ -1677,10 +1710,27 @@ top:
* there's little to do -- just drop our hold.
*/
if (vp->v_count > 1 || tp->tn_nlink != 0) {
- VN_RELE_LOCKED(vp);
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) {
+ /*
+ * Since the file system was forcibly unmounted, we can
+ * have a case (v_count == 1, tn_nlink != 0) where this
+ * file was open so we didn't add an extra hold on the
+ * file in tmp_unmount. We are counting on the
+ * interaction of the hold made in tmp_unmount and
+ * released in tmp_freevfs so we need to be sure we
+ * don't decrement in this case.
+ */
+ if (vp->v_count > 1)
+ VN_RELE_LOCKED(vp);
+ } else {
+ VN_RELE_LOCKED(vp);
+ }
mutex_exit(&vp->v_lock);
mutex_exit(&tp->tn_tlock);
rw_exit(&tp->tn_rwlock);
+ /* If the filesystem was umounted by force, rele the vfs ref */
+ if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED)
+ VFS_RELE(tm->tm_vfsp);
return;
}
@@ -1705,7 +1755,7 @@ top:
goto top;
}
if (tp->tn_type == VLNK)
- tmp_memfree(tp->tn_symlink, tp->tn_size + 1);
+ tmp_kmem_free(tm, tp->tn_symlink, tp->tn_size + 1);
}
/*
@@ -1739,7 +1789,11 @@ top:
rw_destroy(&tp->tn_rwlock);
mutex_destroy(&tp->tn_tlock);
vn_free(TNTOV(tp));
- tmp_memfree(tp, sizeof (struct tmpnode));
+ tmp_kmem_free(tm, tp, sizeof (struct tmpnode));
+
+ /* If the filesystem was umounted by force, rele the vfs ref */
+ if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED)
+ VFS_RELE(tm->tm_vfsp);
}
/* ARGSUSED2 */
@@ -1861,6 +1915,10 @@ tmp_getapage(
struct vnode *pvp;
u_offset_t poff;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
if (protp != NULL)
*protp = PROT_ALL;
again:
@@ -2082,6 +2140,10 @@ tmp_putapage(
u_offset_t offset;
u_offset_t tmpoff;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
ASSERT(PAGE_LOCKED(pp));
/* Kluster in tmp_klustsize chunks */
@@ -2342,8 +2404,13 @@ tmp_space(
return (EFBIG);
error = tmp_freesp(vp, bfp, flag);
- if (error == 0 && bfp->l_start == 0)
- vnevent_truncate(vp, ct);
+ if (error == 0) {
+ if (bfp->l_start == 0) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
}
return (error);
}
diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c
index c1e2c74a87..def046a0bf 100644
--- a/usr/src/uts/common/fs/udfs/udf_dir.c
+++ b/usr/src/uts/common/fs/udfs/udf_dir.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -562,9 +563,8 @@ out:
namep, ctp);
}
- if (sdp != tdp) {
- vnevent_rename_dest_dir(ITOV(tdp), ctp);
- }
+ vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip),
+ namep, ctp);
}
/*
diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c
index 054056c63a..51ce9b28af 100644
--- a/usr/src/uts/common/fs/udfs/udf_vnops.c
+++ b/usr/src/uts/common/fs/udfs/udf_vnops.c
@@ -569,8 +569,11 @@ udf_setattr(
goto update_inode;
}
- if (vap->va_size == 0)
+ if (vap->va_size == 0) {
vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
}
/*
* Change file access or modified times.
@@ -1649,8 +1652,13 @@ udf_space(
} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
error = ud_freesp(vp, bfp, flag, cr);
- if (error == 0 && bfp->l_start == 0)
- vnevent_truncate(vp, ct);
+ if (error == 0) {
+ if (bfp->l_start == 0) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
}
return (error);
diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c
index 79ff1b7071..370c982f08 100644
--- a/usr/src/uts/common/fs/ufs/ufs_vnops.c
+++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c
@@ -2084,8 +2084,13 @@ again:
goto update_inode;
}
- if (error == 0 && vap->va_size)
- vnevent_truncate(vp, ct);
+ if (error == 0) {
+ if (vap->va_size) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
}
if (ulp) {
@@ -3610,12 +3615,7 @@ retry_firstlock:
if (error == 0) {
vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
- /*
- * Notify the target directory of the rename event
- * if source and target directories are not the same.
- */
- if (sdvp != tdvp)
- vnevent_rename_dest_dir(tdvp, ct);
+ vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);
}
errout:
@@ -4350,8 +4350,13 @@ ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
return (error);
error = ufs_freesp(vp, bfp, flag, cr);
- if (error == 0 && bfp->l_start == 0)
- vnevent_truncate(vp, ct);
+ if (error == 0) {
+ if (bfp->l_start == 0) {
+ vnevent_truncate(vp, ct);
+ } else {
+ vnevent_resize(vp, ct);
+ }
+ }
} else if (cmd == F_ALLOCSP) {
error = ufs_lockfs_begin(ufsvfsp, &ulp,
ULOCKFS_FALLOCATE_MASK);
@@ -5630,10 +5635,10 @@ ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp,
struct ufsvfs *ufsvfsp;
/*
- * Regular files reject edge-triggered pollers.
+ * Regular files reject epollers (and edge-triggered pollers).
* See the comment in fs_poll() for a more detailed explanation.
*/
- if (ev & POLLET) {
+ if (fs_reject_epoll() || (ev & POLLET) != 0) {
return (EPERM);
}
diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c
index 1bee02bfe6..77bc7817a8 100644
--- a/usr/src/uts/common/fs/vfs.c
+++ b/usr/src/uts/common/fs/vfs.c
@@ -857,9 +857,11 @@ vfs_mountroot(void)
for (p = practive; p != NULL; p = p->p_next) {
ASSERT(p == &p0 || p->p_parent == &p0);
+ mutex_enter(&p->p_lock);
PTOU(p)->u_cdir = rootdir;
VN_HOLD(PTOU(p)->u_cdir);
PTOU(p)->u_rdir = NULL;
+ mutex_exit(&p->p_lock);
}
mutex_exit(&pidlock);
@@ -3885,6 +3887,8 @@ vfs_to_modname(const char *vfstype)
vfstype = "fdfs";
} else if (strncmp(vfstype, "nfs", 3) == 0) {
vfstype = "nfs";
+ } else if (strcmp(vfstype, "lxproc") == 0) {
+ vfstype = "lxprocfs";
}
return (vfstype);
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index 6e8f65cacb..6d6c4af5ca 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -207,6 +207,11 @@ static void (**vsd_destructor)(void *);
cr = crgetmapped(cr); \
}
+#define VOP_LATENCY_10MS 10000000
+#define VOP_LATENCY_100MS 100000000
+#define VOP_LATENCY_1S 1000000000
+#define VOP_LATENCY_10S 10000000000
+
/*
* Convert stat(2) formats to vnode types and vice versa. (Knows about
* numerical order of S_IFMT and vnode types.)
@@ -2543,6 +2548,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
if (vp == NULL || vp->v_femhead == NULL) {
return;
}
+ (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
}
@@ -2557,12 +2563,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
}
void
-vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
+vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
+ caller_context_t *ct)
{
if (vp == NULL || vp->v_femhead == NULL) {
return;
}
- (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
+ (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
}
void
@@ -2649,6 +2656,15 @@ vnevent_truncate(vnode_t *vp, caller_context_t *ct)
(void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
}
+void
+vnevent_resize(vnode_t *vp, caller_context_t *ct)
+{
+ if (vp == NULL || vp->v_femhead == NULL) {
+ return;
+ }
+ (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
+}
+
/*
* Vnode accessors.
*/
@@ -3424,14 +3440,58 @@ fop_read(
cred_t *cr,
caller_context_t *ct)
{
- int err;
ssize_t resid_start = uiop->uio_resid;
+ zone_t *zonep = curzone;
+ zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
+
+ hrtime_t start = 0, lat;
+ ssize_t len;
+ int err;
+
+ if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
+ vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
+ start = gethrtime();
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_runq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ }
VOPXID_MAP_CR(vp, cr);
err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
- VOPSTATS_UPDATE_IO(vp, read,
- read_bytes, (resid_start - uiop->uio_resid));
+ len = resid_start - uiop->uio_resid;
+
+ VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
+
+ if (start != 0) {
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.reads++;
+ zonep->zone_vfs_rwstats.nread += len;
+ kstat_runq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
+ if (lat >= VOP_LATENCY_10MS) {
+ if (lat < VOP_LATENCY_100MS)
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+ }
+
return (err);
}
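Note that the latency bucket updates are cumulative rather than exclusive: a
150 ms read increments both zv_10ms_ops and zv_100ms_ops, so each kstat reads
as "operations at least this slow". The same cascade (repeated in fop_write
below) can be expressed as independent threshold tests; a behavior-equivalent
sketch:

    /* Equivalent cumulative-bucket update; lat is in nanoseconds. */
    if (lat >= VOP_LATENCY_10MS)
    	atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
    if (lat >= VOP_LATENCY_100MS)
    	atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
    if (lat >= VOP_LATENCY_1S)
    	atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
    if (lat >= VOP_LATENCY_10S)
    	atomic_inc_64(&zvp->zv_10s_ops.value.ui64);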
@@ -3443,14 +3503,63 @@ fop_write(
cred_t *cr,
caller_context_t *ct)
{
- int err;
ssize_t resid_start = uiop->uio_resid;
+ zone_t *zonep = curzone;
+ zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
+
+ hrtime_t start = 0, lat;
+ ssize_t len;
+ int err;
+
+ /*
+ * For the purposes of VFS kstat consumers, the "waitq" calculation is
+ * repurposed as the active queue for VFS write operations. There's no
+ * actual wait queue for VFS operations.
+ */
+ if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
+ vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
+ start = gethrtime();
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_waitq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ }
VOPXID_MAP_CR(vp, cr);
err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
- VOPSTATS_UPDATE_IO(vp, write,
- write_bytes, (resid_start - uiop->uio_resid));
+ len = resid_start - uiop->uio_resid;
+
+ VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
+
+ if (start != 0) {
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.writes++;
+ zonep->zone_vfs_rwstats.nwritten += len;
+ kstat_waitq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
+ if (lat >= VOP_LATENCY_10MS) {
+ if (lat < VOP_LATENCY_100MS)
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+ }
+
return (err);
}
diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c
index 0ab3513718..0dc61e4907 100644
--- a/usr/src/uts/common/fs/zfs/abd.c
+++ b/usr/src/uts/common/fs/zfs/abd.c
@@ -146,7 +146,10 @@ boolean_t zfs_abd_scatter_enabled = B_TRUE;
* it at runtime would cause ABD iteration to work incorrectly for ABDs which
* were allocated with the old size, so a safeguard has been put in place which
* will cause the machine to panic if you change it and try to access the data
- * within a scattered ABD.
+ * within a scattered ABD. Note that tuning this value to be smaller than the
+ * page size can induce heavy fragmentation in the slab layer, which may itself
+ * result in more memory waste than is saved by the smaller chunk size -- and
+ * will induce more computational work in the slab layer. Tune with caution!
*/
size_t zfs_abd_chunk_size = 4096;
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 844abbcd5d..1175faf65d 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -263,6 +263,7 @@
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
+#include <sys/zfs_zone.h>
#include <sys/zio_checksum.h>
#include <sys/multilist.h>
#include <sys/abd.h>
@@ -324,7 +325,7 @@ int arc_grow_retry = 60;
int arc_kmem_cache_reap_retry_ms = 1000;
/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
-int zfs_arc_overflow_shift = 8;
+int zfs_arc_overflow_shift = 3;
/* shift of arc_c for calculating both min and max arc_p */
int arc_p_min_shift = 4;
@@ -5342,6 +5343,14 @@ top:
rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
arc_read_done, hdr, priority, zio_flags, zb);
+ /*
+ * At this point, this read I/O has already missed in the ARC
+ * and will be going through to the disk. The I/O throttle
+ * should delay this I/O if this zone is using more than its I/O
+ * priority allows.
+ */
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_READ);
+
if (*arc_flags & ARC_FLAG_WAIT)
return (zio_wait(rzio));
@@ -6297,6 +6306,10 @@ arc_init(void)
if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
arc_c_min = arc_meta_limit / 2;
+ /* On larger-memory machines, we clamp the minimum at 1GB */
+ if (zfs_arc_min == 0)
+ arc_c_min = MIN(arc_c_min, (1 << 30));
+
if (zfs_arc_meta_min > 0) {
arc_meta_min = zfs_arc_meta_min;
} else {
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 7421ea291b..979bb8848e 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -1008,8 +1008,17 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
if (bonuslen < max_bonuslen)
bzero(db->db.db_data, max_bonuslen);
- if (bonuslen)
- bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ if (bonuslen) {
+ /*
+ * Absent byzantine on-disk corruption, we fully expect
+ * our bonuslen to be no more than max_bonuslen --
+ * but we nonetheless explicitly clamp it on the bcopy()
+ * to prevent any on-disk corruption from becoming
+ * rampant in-kernel corruption.
+ */
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
+ MIN(bonuslen, max_bonuslen));
+ }
DB_DNODE_EXIT(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 95ca9f76aa..966d155a9c 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -2178,7 +2178,6 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
ZCHECKSUM_FLAG_DEDUP))
dedup_verify = B_TRUE;
}
-
/*
* Enable nopwrite if we have secure enough checksum
* algorithm (see comment in zio_nop_write) and
diff --git a/usr/src/uts/common/fs/zfs/dmu_recv.c b/usr/src/uts/common/fs/zfs/dmu_recv.c
index 542bb42f3f..bee41bd95e 100644
--- a/usr/src/uts/common/fs/zfs/dmu_recv.c
+++ b/usr/src/uts/common/fs/zfs/dmu_recv.c
@@ -1526,8 +1526,12 @@ receive_read_record(struct receive_arg *ra)
{
struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
- void *buf = kmem_zalloc(size, KM_SLEEP);
+ void *buf = NULL;
dmu_object_info_t doi;
+
+ if (size > 0)
+ buf = kmem_zalloc(size, KM_SLEEP);
+
err = receive_read_payload_and_next_header(ra, size, buf);
if (err != 0) {
kmem_free(buf, size);
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 6d65086079..d42a7c66de 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -22,7 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright 2016 RackTop Systems.
* Copyright (c) 2014 Integros [integros.com]
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 53d5765bcb..6cb39d61a5 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -39,11 +39,11 @@
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/varargs.h>
+#include <sys/zfs_zone.h>
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
uint64_t arg1, uint64_t arg2);
-
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
@@ -213,6 +213,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
if (len == 0)
return;
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE);
+
(void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 298516f8a4..35e76e273e 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -42,6 +42,7 @@
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
+#include <sys/zfs_zone.h>
#include <sys/zfeature.h>
#include <sys/policy.h>
#include <sys/zfs_znode.h>
@@ -1398,7 +1399,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
* locks are held.
*/
txg_delay(dd->dd_pool, tx->tx_txg,
- MSEC2NSEC(10), MSEC2NSEC(10));
+ zfs_zone_txg_delay(), MSEC2NSEC(10));
err = SET_ERROR(ERESTART);
}
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 54c88b1e3c..ce77b8c611 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -44,6 +44,7 @@
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
+#include <sys/zfs_zone.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/bptree.h>
@@ -865,7 +866,7 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
}
ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
- ASSERT3U(dp->dp_dirty_total, >=, space);
+ VERIFY3U(dp->dp_dirty_total, >=, space);
dsl_pool_dirty_delta(dp, -space);
mutex_exit(&dp->dp_lock);
}
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 0044f37964..bfac5ddf1c 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -23,6 +23,7 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
*/
@@ -61,6 +62,11 @@ int zfs_metaslab_sm_blksz = (1 << 12);
int zfs_condense_pct = 200;
/*
+ * Never condense any space map. This is for debugging/recovery only.
+ */
+int zfs_condense_never = 0;
+
+/*
* Condensing a metaslab is not guaranteed to actually reduce the amount of
* space used on disk. In particular, a space map uses data in increments of
* MAX(1 << ashift, space_map_blksize), so a metaslab might use the
@@ -152,6 +158,18 @@ int metaslab_load_pct = 50;
int metaslab_unload_delay = TXG_SIZE * 2;
/*
+ * Tunables used to reduce metaslab load/unload thrashing when the selection
+ * algorithm is allocating across metaslabs very evenly. In addition to
+ * tracking when the slab was used for allocation (ms_selected_txg), we also
+ * track when it was loaded (ms_loaded_txg). If the slab would be unloaded,
+ * but the load txg is within the window of
+ * metaslab_unload_delay + metaslab_load_window
+ * then we ramp up metaslab_unload_delay instead of unloading the metaslab.
+ */
+int metaslab_load_window = 10;
+int metaslab_unload_delay_max = 256;
+
+/*
* Max number of metaslabs per group to preload.
*/
int metaslab_preload_limit = SPA_DVAS_PER_BP;
@@ -713,6 +731,7 @@ metaslab_group_activate(metaslab_group_t *mg)
{
metaslab_class_t *mc = mg->mg_class;
metaslab_group_t *mgprev, *mgnext;
+ char kstat_name[KSTAT_STRLEN];
ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
@@ -737,6 +756,33 @@ metaslab_group_activate(metaslab_group_t *mg)
mgprev->mg_next = mg;
mgnext->mg_prev = mg;
}
+
+ /* Create a kstat to monitor the loading and unloading of metaslabs. */
+ (void) snprintf(kstat_name, sizeof (kstat_name), "%llx",
+ (unsigned long long) mg->mg_vd->vdev_guid);
+
+ mutex_init(&mg->mg_kstat_lock, NULL, MUTEX_DEFAULT, NULL);
+ if ((mg->mg_kstat = kstat_create("zfs_metaslab_group", 0,
+ kstat_name, "misc", KSTAT_TYPE_NAMED,
+ sizeof (metaslab_group_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL)) != NULL) {
+
+ metaslab_group_kstat_t *mg_kstat = kmem_zalloc(
+ sizeof (metaslab_group_kstat_t), KM_SLEEP);
+ kstat_named_init(&mg_kstat->mg_loads, "loads",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&mg_kstat->mg_unloads, "unloads",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&mg_kstat->mg_spa_name, "spa_name",
+ KSTAT_DATA_STRING);
+ kstat_named_setstr(&mg_kstat->mg_spa_name,
+ mg->mg_vd->vdev_spa->spa_name);
+
+ mg->mg_kstat->ks_data = mg_kstat;
+ mg->mg_kstat->ks_lock = &mg->mg_kstat_lock;
+ kstat_install(mg->mg_kstat);
+ }
+
mc->mc_rotor = mg;
}
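
These counters can presumably be read with kstat(1M) once the module is loaded, e.g. "kstat -m zfs_metaslab_group", which should show one instance per top-level vdev GUID with its loads, unloads, and spa_name fields (the module name follows from the kstat_create() call above; the exact output layout is an assumption).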
@@ -813,6 +859,14 @@ metaslab_group_passivate(metaslab_group_t *mg)
mg->mg_prev = NULL;
mg->mg_next = NULL;
+
+ if (mg->mg_kstat != NULL) {
+ metaslab_group_kstat_t *data = mg->mg_kstat->ks_data;
+
+ kstat_delete(mg->mg_kstat);
+ kmem_free(data, sizeof (metaslab_group_kstat_t));
+ }
+ mutex_destroy(&mg->mg_kstat_lock);
}
boolean_t
@@ -1773,8 +1827,9 @@ metaslab_load_impl(metaslab_t *msp)
}
int
-metaslab_load(metaslab_t *msp)
+metaslab_load(metaslab_t *msp, uint64_t txg)
{
+ kstat_t *ksp;
ASSERT(MUTEX_HELD(&msp->ms_lock));
/*
@@ -1787,9 +1842,16 @@ metaslab_load(metaslab_t *msp)
VERIFY(!msp->ms_loading);
ASSERT(!msp->ms_condensing);
+ ksp = msp->ms_group->mg_kstat;
+ if (ksp != NULL) {
+ metaslab_group_kstat_t *mg_ksp = ksp->ks_data;
+ atomic_inc_64(&mg_ksp->mg_loads.value.ui64);
+ }
+
msp->ms_loading = B_TRUE;
int error = metaslab_load_impl(msp);
msp->ms_loading = B_FALSE;
+ msp->ms_loaded_txg = txg;
cv_broadcast(&msp->ms_load_cv);
return (error);
@@ -1804,6 +1866,7 @@ metaslab_unload(metaslab_t *msp)
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
+ msp->ms_loaded_txg = 0;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
@@ -1918,7 +1981,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
*/
if (metaslab_debug_load && ms->ms_sm != NULL) {
mutex_enter(&ms->ms_lock);
- VERIFY0(metaslab_load(ms));
+ VERIFY0(metaslab_load(ms, txg));
mutex_exit(&ms->ms_lock);
}
@@ -2432,12 +2495,13 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
}
static int
-metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
+metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight,
+ uint64_t txg)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = metaslab_load(msp);
+ int error = metaslab_load(msp, txg);
if (error != 0) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
@@ -2552,7 +2616,7 @@ metaslab_preload(void *arg)
ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
mutex_enter(&msp->ms_lock);
- (void) metaslab_load(msp);
+ (void) metaslab_load(msp, spa_syncing_txg(spa));
msp->ms_selected_txg = spa_syncing_txg(spa);
mutex_exit(&msp->ms_lock);
}
@@ -2625,6 +2689,9 @@ metaslab_should_condense(metaslab_t *msp)
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
+ if (zfs_condense_never != 0)
+ return (B_FALSE);
+
/*
* Allocations and frees in early passes are generally more space
* efficient (in terms of blocks described in space map entries)
@@ -3087,22 +3154,35 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
/*
* If the metaslab is loaded and we've not tried to load or allocate
- * from it in 'metaslab_unload_delay' txgs, then unload it.
+ * from it in 'metaslab_unload_delay' txgs, then we normally unload it.
+ * However, to prevent thrashing, if the metaslab was recently loaded,
+ * then instead of unloading it, we increase the unload delay (only up
+ * to the maximum).
*/
if (msp->ms_loaded &&
msp->ms_initializing == 0 &&
msp->ms_selected_txg + metaslab_unload_delay < txg) {
- for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
- VERIFY0(range_tree_space(
- msp->ms_allocating[(txg + t) & TXG_MASK]));
- }
- if (msp->ms_allocator != -1) {
- metaslab_passivate(msp, msp->ms_weight &
- ~METASLAB_ACTIVE_MASK);
- }
+ if (msp->ms_loaded_txg != 0 && msp->ms_loaded_txg +
+ metaslab_unload_delay + metaslab_load_window >= txg) {
+ if (metaslab_unload_delay + metaslab_load_window <=
+ metaslab_unload_delay_max) {
+ metaslab_unload_delay += metaslab_load_window;
+ }
+ DTRACE_PROBE1(zfs__metaslab__delay__unload,
+ metaslab_t *, msp);
+ } else {
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ VERIFY0(range_tree_space(
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
+ }
+ if (msp->ms_allocator != -1) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
- if (!metaslab_debug_unload)
- metaslab_unload(msp);
+ if (!metaslab_debug_unload)
+ metaslab_unload(msp);
+ }
}
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
@@ -3362,8 +3442,6 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
- /* Track the last successful allocation */
- msp->ms_alloc_txg = txg;
metaslab_verify_space(msp, txg);
}
@@ -3545,7 +3623,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
continue;
}
- if (metaslab_activate(msp, allocator, activation_weight) != 0) {
+ if (metaslab_activate(msp, allocator, activation_weight,
+ txg) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
@@ -4252,7 +4331,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
mutex_enter(&msp->ms_lock);
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
- error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
+ error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM, txg);
/*
* No need to fail in that case; someone else has activated the
* metaslab, but that doesn't preclude us from using it.
diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c
index d3c0a3e8ef..1d7b72d72c 100644
--- a/usr/src/uts/common/fs/zfs/sa.c
+++ b/usr/src/uts/common/fs/zfs/sa.c
@@ -24,6 +24,7 @@
* Portions Copyright 2011 iXsystems, Inc
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@@ -400,15 +401,18 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
{
sa_os_t *sa = os->os_sa;
sa_lot_t *tb, *findtb;
- int i;
+ int i, size;
avl_index_t loc;
ASSERT(MUTEX_HELD(&sa->sa_lock));
tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
tb->lot_attr_count = attr_count;
- tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
- KM_SLEEP);
- bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+
+ if ((size = sizeof (sa_attr_type_t) * attr_count) != 0) {
+ tb->lot_attrs = kmem_alloc(size, KM_SLEEP);
+ bcopy(attrs, tb->lot_attrs, size);
+ }
+
tb->lot_num = lot_num;
tb->lot_hash = hash;
tb->lot_instance = 0;
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 403ace2d9d..0795e2c69b 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -27,7 +27,7 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2017 Datto Inc.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
@@ -227,6 +227,13 @@ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
uint64_t zfs_max_missing_tvds_scan = 0;
/*
+ * Interval in seconds at which to poll spare vdevs for health.
+ * Setting this to zero disables spare polling.
+ * Set to three hours by default.
+ */
+uint_t spa_spare_poll_interval_seconds = 60 * 60 * 3;
+
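+/*
+ * Note: like other ZFS tunables, this can presumably be overridden at
+ * boot via /etc/system, e.g. "set zfs:spa_spare_poll_interval_seconds = 0"
+ * to disable spare polling entirely (the "zfs:" module prefix is an
+ * assumption based on this file living in the zfs module), or patched
+ * at runtime with mdb -kw.
+ */
+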
+/*
* Debugging aid that pauses spa_sync() towards the end.
*/
boolean_t zfs_pause_spa_sync = B_FALSE;
@@ -1854,6 +1861,12 @@ spa_check_for_missing_logs(spa_t *spa)
if (idx > 0) {
spa_load_failed(spa, "some log devices are missing");
vdev_dbgmsg_print_tree(rvd, 2);
+
+ /* Save the timestamp of the last completed txg. */
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_TIME,
+ spa->spa_last_ubsync_txg_ts) == 0);
+
return (SET_ERROR(ENXIO));
}
} else {
@@ -1862,10 +1875,21 @@ spa_check_for_missing_logs(spa_t *spa)
if (tvd->vdev_islog &&
tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+ nvlist_t *rewind_info = fnvlist_alloc();
+
spa_set_log_state(spa, SPA_LOG_CLEAR);
spa_load_note(spa, "some log devices are "
"missing, ZIL is dropped.");
vdev_dbgmsg_print_tree(rvd, 2);
+
+ VERIFY(nvlist_add_uint64(rewind_info,
+ ZPOOL_CONFIG_LOAD_TIME,
+ spa->spa_uberblock.ub_timestamp) == 0);
+
+ VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_REWIND_INFO,
+ rewind_info) == 0);
+
break;
}
}
@@ -7150,6 +7174,8 @@ spa_async_thread(void *arg)
if (tasks & SPA_ASYNC_PROBE) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
+ for (int i = 0; i < spa->spa_spares.sav_count; i++)
+ spa_async_probe(spa, spa->spa_spares.sav_vdevs[i]);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
@@ -8156,6 +8182,14 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_handle_ignored_writes(spa);
+ /* Mark unused spares as needing a health check. */
+ if (spa_spare_poll_interval_seconds != 0 &&
+ NSEC2SEC(gethrtime() - spa->spa_spares_last_polled) >
+ spa_spare_poll_interval_seconds) {
+ spa_spare_poll(spa);
+ spa->spa_spares_last_polled = gethrtime();
+ }
+
/*
* If any async tasks have been requested, kick them off.
*/
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 9a80f89a8a..f951f1cc97 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -26,6 +26,7 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2017 Datto Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
*/
@@ -1023,6 +1024,41 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
* be completely consistent with respect to other vdev configuration changes.
*/
+/*
+ * Poll the spare vdevs to make sure they are not faulty.
+ *
+ * The probe operation will raise an ENXIO error and create an FM ereport if the
+ * probe fails.
+ */
+void
+spa_spare_poll(spa_t *spa)
+{
+ boolean_t async_request = B_FALSE;
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ for (int i = 0; i < spa->spa_spares.sav_count; i++) {
+ spa_aux_t search, *found;
+ vdev_t *vd = spa->spa_spares.sav_vdevs[i];
+
+ search.aux_guid = vd->vdev_guid;
+
+ mutex_enter(&spa_spare_lock);
+ found = avl_find(&spa_spare_avl, &search, NULL);
+ /* This spare is in use by a pool. */
+ if (found != NULL && found->aux_pool != NULL) {
+ mutex_exit(&spa_spare_lock);
+ continue;
+ }
+ mutex_exit(&spa_spare_lock);
+
+ vd->vdev_probe_wanted = B_TRUE;
+ async_request = B_TRUE;
+ }
+ if (async_request)
+ spa_async_request(spa, SPA_ASYNC_PROBE);
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+}
+
static int
spa_spare_compare(const void *a, const void *b)
{
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index d26b095d14..567ab411e4 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -49,7 +49,7 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
metaslab_t **);
void metaslab_fini(metaslab_t *);
-int metaslab_load(metaslab_t *);
+int metaslab_load(metaslab_t *, uint64_t);
void metaslab_unload(metaslab_t *);
uint64_t metaslab_allocated_space(metaslab_t *);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
index f8d36f38f7..fe93fdc0d1 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -276,8 +276,17 @@ struct metaslab_group {
boolean_t mg_initialize_updating;
kmutex_t mg_ms_initialize_lock;
kcondvar_t mg_ms_initialize_cv;
+
+ kstat_t *mg_kstat;
+ kmutex_t mg_kstat_lock;
};
+typedef struct metaslab_group_kstat {
+ kstat_named_t mg_loads;
+ kstat_named_t mg_unloads;
+ kstat_named_t mg_spa_name;
+} metaslab_group_kstat_t;
+
/*
* This value defines the number of elements in the ms_lbas array. The value
* of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
@@ -461,8 +470,8 @@ struct metaslab {
* stay cached.
*/
uint64_t ms_selected_txg;
+ uint64_t ms_loaded_txg; /* track when metaslab was loaded */
- uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
uint64_t ms_max_size; /* maximum allocatable size */
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 4ff552447e..82a1514598 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -25,7 +25,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
*/
@@ -679,6 +679,9 @@ extern void spa_spare_remove(vdev_t *vd);
extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
extern void spa_spare_activate(vdev_t *vd);
+/* spare polling */
+extern void spa_spare_poll(spa_t *spa);
+
/* L2ARC state (which is global across all pools) */
extern void spa_l2cache_add(vdev_t *vd);
extern void spa_l2cache_remove(vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index dcb6cc9f19..539ed4b43e 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -25,6 +25,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
*/
@@ -252,6 +253,7 @@ struct spa {
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
+ hrtime_t spa_spares_last_polled; /* time spares last polled */
nvlist_t *spa_label_features; /* Features for reading MOS */
uint64_t spa_config_object; /* MOS object for pool config */
uint64_t spa_config_generation; /* config generation number */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 6ddbe55a0c..7ef03e0483 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -143,6 +143,7 @@ struct vdev_queue {
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
uint64_t vq_last_offset;
+ zoneid_t vq_last_zone_id;
hrtime_t vq_io_complete_ts; /* time last i/o completed */
kmutex_t vq_lock;
};
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
new file mode 100644
index 0000000000..f1431b3f55
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2015, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_ZONE_H
+#define _SYS_FS_ZFS_ZONE_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ ZFS_ZONE_IOP_READ = 0,
+ ZFS_ZONE_IOP_WRITE,
+ ZFS_ZONE_IOP_LOGICAL_WRITE,
+} zfs_zone_iop_type_t;
+
+extern void zfs_zone_io_throttle(zfs_zone_iop_type_t);
+
+extern void zfs_zone_zio_init(zio_t *);
+extern void zfs_zone_zio_start(zio_t *);
+extern void zfs_zone_zio_done(zio_t *);
+extern void zfs_zone_zio_dequeue(zio_t *);
+extern void zfs_zone_zio_enqueue(zio_t *);
+extern void zfs_zone_report_txg_sync(void *);
+extern hrtime_t zfs_zone_txg_delay(void);
+#ifdef _KERNEL
+extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t,
+ avl_tree_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZONE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 517764f1ce..fa45fd8385 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -24,7 +24,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2019, Joyent, Inc. All rights reserved.
* Copyright 2016 Toomas Soome <tsoome@me.com>
*/
@@ -378,8 +378,14 @@ typedef int zio_pipe_stage_t(zio_t *zio);
* the reexecute flags are protected by io_lock, modifiable by children,
* and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
*/
-#define ZIO_REEXECUTE_NOW 0x01
-#define ZIO_REEXECUTE_SUSPEND 0x02
+#define ZIO_REEXECUTE_NOW 0x01
+#define ZIO_REEXECUTE_SUSPEND 0x02
+#define ZIO_REEXECUTE_NO_SUSPEND 0x04
+
+#define ZIO_SHOULD_REEXECUTE(x) \
+ ((x)->io_reexecute & ZIO_REEXECUTE_NOW || \
+ ((x)->io_reexecute & ZIO_REEXECUTE_SUSPEND && \
+ (((x)->io_reexecute & ZIO_REEXECUTE_NO_SUSPEND) == 0)))
typedef struct zio_alloc_list {
list_t zal_list;
@@ -440,6 +446,7 @@ struct zio {
hrtime_t io_timestamp;
hrtime_t io_queued_timestamp;
hrtime_t io_target_timestamp;
+ hrtime_t io_dispatched; /* time I/O was dispatched to disk */
avl_node_t io_queue_node;
avl_node_t io_offset_node;
avl_node_t io_alloc_node;
@@ -472,6 +479,7 @@ struct zio {
zio_cksum_report_t *io_cksum_report;
uint64_t io_ena;
+ zoneid_t io_zoneid; /* zone which originated this I/O */
/* Taskq dispatching state */
taskq_ent_t io_tqent;
};
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index cb3d6a51cb..c97cfdb82c 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -32,6 +32,7 @@
#include <sys/dsl_scan.h>
#include <sys/zil.h>
#include <sys/callb.h>
+#include <sys/zfs_zone.h>
/*
* ZFS Transaction Groups
@@ -535,6 +536,8 @@ txg_sync_thread(void *arg)
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
+ zfs_zone_report_txg_sync(dp);
+
start = ddi_get_lbolt();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index 2d431373ce..af653c2f28 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -26,6 +26,7 @@
*/
#include <sys/zfs_context.h>
+#include <sys/zfs_zone.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
@@ -52,6 +53,11 @@ extern ldi_ident_t zfs_li;
static void vdev_disk_close(vdev_t *);
+typedef struct vdev_disk_buf {
+ buf_t vdb_buf;
+ zio_t *vdb_io;
+} vdev_disk_buf_t;
+
typedef struct vdev_disk_ldi_cb {
list_node_t lcb_next;
ldi_callback_id_t lcb_id;
@@ -150,6 +156,8 @@ vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
int ldi_result, void *arg, void *ev_data)
{
vdev_t *vd = (vdev_t *)arg;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_disk_ldi_cb_t *lcb;
/*
* Ignore events other than offline.
@@ -613,6 +621,7 @@ static void
vdev_disk_close(vdev_t *vd)
{
vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_disk_ldi_cb_t *lcb;
if (vd->vdev_reopening || dvd == NULL)
return;
@@ -847,6 +856,8 @@ vdev_disk_io_start(zio_t *zio)
bp->b_bufsize = zio->io_size;
bp->b_iodone = vdev_disk_io_intr;
+ zfs_zone_zio_start(zio);
+
/* ldi_strategy() will return non-zero only on programming errors */
VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
}
@@ -856,6 +867,8 @@ vdev_disk_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
+ zfs_zone_zio_done(zio);
+
/*
* If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
* the device has been removed. If this is the case, then we trigger an
diff --git a/usr/src/uts/common/fs/zfs/vdev_initialize.c b/usr/src/uts/common/fs/zfs/vdev_initialize.c
index e1aa4e9523..9b103811d4 100644
--- a/usr/src/uts/common/fs/zfs/vdev_initialize.c
+++ b/usr/src/uts/common/fs/zfs/vdev_initialize.c
@@ -474,7 +474,7 @@ vdev_initialize_calculate_progress(vdev_t *vd)
* metaslab. Load it and walk the free tree for more accurate
* progress estimation.
*/
- VERIFY0(metaslab_load(msp));
+ VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa)));
for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
@@ -605,7 +605,7 @@ vdev_initialize_thread(void *arg)
vdev_initialize_ms_mark(msp);
mutex_enter(&msp->ms_lock);
- VERIFY0(metaslab_load(msp));
+ VERIFY0(metaslab_load(msp, spa_syncing_txg(spa)));
range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
vd);
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index dff83e3108..74860c5c0a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
*/
/*
@@ -34,6 +35,7 @@
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/dsl_pool.h>
+#include <sys/zfs_zone.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
@@ -144,7 +146,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10;
uint32_t zfs_vdev_sync_write_max_active = 10;
uint32_t zfs_vdev_async_read_min_active = 1;
uint32_t zfs_vdev_async_read_max_active = 3;
-uint32_t zfs_vdev_async_write_min_active = 1;
+uint32_t zfs_vdev_async_write_min_active = 3;
uint32_t zfs_vdev_async_write_max_active = 10;
uint32_t zfs_vdev_scrub_min_active = 1;
uint32_t zfs_vdev_scrub_max_active = 2;
@@ -260,6 +262,8 @@ vdev_queue_init(vdev_t *vd)
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
+ vq->vq_last_zone_id = 0;
+
for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
int (*compfn) (const void *, const void *);
@@ -298,6 +302,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
spa_t *spa = zio->io_spa;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ zfs_zone_zio_enqueue(zio);
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
@@ -314,6 +319,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
spa_t *spa = zio->io_spa;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ zfs_zone_zio_dequeue(zio);
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
@@ -693,7 +699,11 @@ again:
search.io_timestamp = 0;
search.io_offset = vq->vq_last_offset + 1;
VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
+#ifdef _KERNEL
+ zio = zfs_zone_schedule(vq, p, idx, tree);
+#else
zio = avl_nearest(tree, idx, AVL_AFTER);
+#endif
if (zio == NULL)
zio = avl_first(tree);
ASSERT3U(zio->io_priority, ==, p);
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
index ad78295a54..8c4c05a1d1 100644
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2015, Joyent, Inc.
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 712abee22f..fcdc8bcbc7 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -25,7 +25,7 @@
* Portions Copyright 2011 Martin Matuska
* Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, 2019 Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
@@ -634,9 +634,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
* Check permissions for special properties.
*/
switch (prop) {
+ case ZFS_PROP_DEDUP:
case ZFS_PROP_ZONED:
/*
- * Disallow setting of 'zoned' from within a local zone.
+ * Disallow setting these properties from within a local zone.
*/
if (!INGLOBALZONE(curproc))
return (SET_ERROR(EPERM));
@@ -966,6 +967,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
int error;
+ if (secpolicy_fs_import(cr) != 0)
+ return (set_errno(EPERM));
+
if ((error = zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
return (error);
@@ -2088,7 +2092,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
}
static int
-zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os,
+ boolean_t cachedpropsonly)
{
int error = 0;
nvlist_t *nv;
@@ -2106,7 +2111,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
* XXX reading with out owning
*/
if (!zc->zc_objset_stats.dds_inconsistent &&
- dmu_objset_type(os) == DMU_OST_ZVOL) {
+ dmu_objset_type(os) == DMU_OST_ZVOL &&
+ !cachedpropsonly) {
error = zvol_get_stats(os, nv);
if (error == EIO)
return (error);
@@ -2133,11 +2139,24 @@ static int
zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
objset_t *os;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
+ return (error);
+
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error == 0) {
- error = zfs_ioc_objset_stats_impl(zc, os);
+ error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly);
dmu_objset_rele(os, FTAG);
}
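
For illustration, a user-land consumer could ask for cached-properties-only stats by packing the boolean into the legacy ioctl's source nvlist. A minimal sketch, assuming the standard libnvpair API; the dataset name is a placeholder, error handling is elided, and zfs_fd is assumed to be an open descriptor on /dev/zfs:

    #include <libnvpair.h>
    #include <sys/zfs_ioctl.h>

    nvlist_t *nvl;
    char *packed = NULL;
    size_t size = 0;
    zfs_cmd_t zc = { 0 };

    /* Build and pack { "cachedpropsonly": B_TRUE }. */
    VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0));
    VERIFY0(nvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE));
    VERIFY0(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, 0));

    (void) strlcpy(zc.zc_name, "tank/fs", sizeof (zc.zc_name));
    zc.zc_nvlist_src = (uintptr_t)packed;
    zc.zc_nvlist_src_size = size;

    error = ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc);
    nvlist_free(nvl);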
@@ -2332,8 +2351,21 @@ static int
zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
{
objset_t *os;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
+ return (error);
+
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error != 0) {
return (error == ENOENT ? ESRCH : error);
@@ -2363,8 +2395,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
objset_t *ossnap;
error = dmu_objset_from_ds(ds, &ossnap);
- if (error == 0)
- error = zfs_ioc_objset_stats_impl(zc, ossnap);
+ if (error == 0) {
+ error = zfs_ioc_objset_stats_impl(zc,
+ ossnap, cachedpropsonly);
+ }
dsl_dataset_rele(ds, FTAG);
}
} else if (error == ENOENT) {
@@ -3049,6 +3083,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
uint64_t u8 = ZFS_PROP_UNDEFINED;
+ int error;
ASSERT(zplprops != NULL);
@@ -3095,8 +3130,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
- if (norm == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
+ if (norm == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
@@ -3105,13 +3141,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
*/
if (norm)
u8 = 1;
- if (u8 == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
+ if (u8 == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
- if (sense == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
+ if (sense == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
@@ -5849,7 +5887,8 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
static void
-zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+zfs_ioctl_register_legacy(const char *name, zfs_ioc_t ioc,
+ zfs_ioc_legacy_func_t *func,
zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
{
@@ -5860,6 +5899,7 @@ zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
ASSERT3P(vec->zvec_legacy_func, ==, NULL);
ASSERT3P(vec->zvec_func, ==, NULL);
+ vec->zvec_name = name;
vec->zvec_legacy_func = func;
vec->zvec_secpolicy = secpolicy;
vec->zvec_namecheck = namecheck;
@@ -5901,7 +5941,7 @@ zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
zfs_ioc_poolcheck_t pool_check)
{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy,
POOL_NAME, log_history, pool_check);
}
@@ -5909,14 +5949,15 @@ static void
zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy,
DATASET_NAME, B_FALSE, pool_check);
}
static void
-zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
+zfs_ioctl_register_pool_modify(const char *name, zfs_ioc_t ioc,
+ zfs_ioc_legacy_func_t *func)
{
- zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
+ zfs_ioctl_register_legacy(name, ioc, func, zfs_secpolicy_config,
POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
}
@@ -5924,7 +5965,7 @@ static void
zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
zfs_secpolicy_func_t *secpolicy)
{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy,
NO_NAME, B_FALSE, POOL_CHECK_NONE);
}
@@ -5932,7 +5973,7 @@ static void
zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy,
DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED);
}
@@ -5944,10 +5985,10 @@ zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
}
static void
-zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
- zfs_secpolicy_func_t *secpolicy)
+zfs_ioctl_register_dataset_modify(const char *name, zfs_ioc_t ioc,
+ zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
{
- zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ zfs_ioctl_register_legacy(name, ioc, func, secpolicy,
DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
}
@@ -6042,34 +6083,35 @@ zfs_ioctl_init(void)
/* IOCTLS that use the legacy function signature */
- zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
- zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
+ zfs_ioctl_register_legacy("pool_freeze", ZFS_IOC_POOL_FREEZE,
+ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
+ POOL_CHECK_READONLY);
zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
- zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
+ zfs_ioctl_register_pool_modify("pool_scan", ZFS_IOC_POOL_SCAN,
zfs_ioc_pool_scan);
- zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
+ zfs_ioctl_register_pool_modify("pool_upgrade", ZFS_IOC_POOL_UPGRADE,
zfs_ioc_pool_upgrade);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
+ zfs_ioctl_register_pool_modify("vdev_add", ZFS_IOC_VDEV_ADD,
zfs_ioc_vdev_add);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
+ zfs_ioctl_register_pool_modify("vdev_remove", ZFS_IOC_VDEV_REMOVE,
zfs_ioc_vdev_remove);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
+ zfs_ioctl_register_pool_modify("vdev_set_state", ZFS_IOC_VDEV_SET_STATE,
zfs_ioc_vdev_set_state);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
+ zfs_ioctl_register_pool_modify("vdev_attach", ZFS_IOC_VDEV_ATTACH,
zfs_ioc_vdev_attach);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
+ zfs_ioctl_register_pool_modify("vdev_detach", ZFS_IOC_VDEV_DETACH,
zfs_ioc_vdev_detach);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
+ zfs_ioctl_register_pool_modify("vdev_setpath", ZFS_IOC_VDEV_SETPATH,
zfs_ioc_vdev_setpath);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
+ zfs_ioctl_register_pool_modify("vdev_setfru", ZFS_IOC_VDEV_SETFRU,
zfs_ioc_vdev_setfru);
- zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
+ zfs_ioctl_register_pool_modify("pool_set_props", ZFS_IOC_POOL_SET_PROPS,
zfs_ioc_pool_set_props);
- zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
+ zfs_ioctl_register_pool_modify("vdev_split", ZFS_IOC_VDEV_SPLIT,
zfs_ioc_vdev_split);
- zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
+ zfs_ioctl_register_pool_modify("pool_reguid", ZFS_IOC_POOL_REGUID,
zfs_ioc_pool_reguid);
zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
@@ -6147,20 +6189,20 @@ zfs_ioctl_init(void)
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
zfs_ioc_send, zfs_secpolicy_send);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
- zfs_secpolicy_none);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
- zfs_secpolicy_destroy);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
- zfs_secpolicy_rename);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
+ zfs_ioctl_register_dataset_modify("set_prop", ZFS_IOC_SET_PROP,
+ zfs_ioc_set_prop, zfs_secpolicy_none);
+ zfs_ioctl_register_dataset_modify("destroy", ZFS_IOC_DESTROY,
+ zfs_ioc_destroy, zfs_secpolicy_destroy);
+ zfs_ioctl_register_dataset_modify("rename", ZFS_IOC_RENAME,
+ zfs_ioc_rename, zfs_secpolicy_rename);
+ zfs_ioctl_register_dataset_modify("recv", ZFS_IOC_RECV, zfs_ioc_recv,
zfs_secpolicy_recv);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
- zfs_secpolicy_promote);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
+ zfs_ioctl_register_dataset_modify("promote", ZFS_IOC_PROMOTE,
+ zfs_ioc_promote, zfs_secpolicy_promote);
+ zfs_ioctl_register_dataset_modify("inherit_prop", ZFS_IOC_INHERIT_PROP,
zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
- zfs_secpolicy_set_fsacl);
+ zfs_ioctl_register_dataset_modify("set_fsacl", ZFS_IOC_SET_FSACL,
+ zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
zfs_secpolicy_share, POOL_CHECK_NONE);
@@ -6443,7 +6485,32 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
nvlist_free(outnvl);
} else {
+ spa_t *spa;
+ uint64_t orig_cookie = zc->zc_cookie;
+
error = vec->zvec_legacy_func(zc);
+
+ if (error == 0 && vec->zvec_allow_log &&
+ vec->zvec_name != NULL &&
+ spa_open(zc->zc_name, &spa, FTAG) == 0) {
+ nvlist_t *lognv = NULL;
+ char *msg;
+ uint_t len = strlen(vec->zvec_name) +
+ strlen(zc->zc_name) + 128;
+
+ msg = kmem_alloc(len, KM_SLEEP);
+
+ lognv = fnvlist_alloc();
+ (void) snprintf(msg, len,
+ "%s pool: %s cookie: %lu guid: %lx", vec->zvec_name,
+ zc->zc_name, orig_cookie, zc->zc_guid);
+ fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, msg);
+
+ (void) spa_history_log_nvl(spa, lognv);
+ spa_close(spa, FTAG);
+ fnvlist_free(lognv);
+ kmem_free(msg, len);
+ }
}
out:
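
Because these legacy ioctls now log through spa_history_log_nvl() with ZPOOL_HIST_IOCTL, successful pool-modifying operations should be visible among the internal events printed by "zpool history -i <pool>", as messages of the form "vdev_add pool: tank cookie: 0 guid: 0" (how zpool(1M) renders internal entries is an assumption).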
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index f7beea4cc9..a5912b19ab 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
@@ -1912,6 +1913,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
if (zfsvfs->z_ctldir != NULL)
zfsctl_destroy(zfsvfs);
+ /*
+ * If we're doing a forced unmount on a dataset which still has
+ * references and is in a zone, then we need to cleanup the zone
+ * reference at this point or else the zone will never be able to
+ * shutdown.
+ */
+ if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) {
+ zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS);
+ vfsp->vfs_zone = NULL;
+ }
+
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index a68fc3dd34..96e03d9291 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -687,6 +687,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
limit = MAXOFFSET_T;
+ /*
+ * Pre-fault the pages to ensure slow (eg NFS) pages
+ * don't hold up txg.
+ * Skip this if uio contains loaned arc_buf.
+ */
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+ xuio = (xuio_t *)uio;
+ else
+ uio_prefaultpages(n, uio);
+
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
@@ -741,17 +752,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
}
/*
- * Pre-fault the pages to ensure slow (eg NFS) pages
- * don't hold up txg.
- * Skip this if uio contains loaned arc_buf.
- */
- if ((uio->uio_extflg == UIO_XUIO) &&
- (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
- xuio = (xuio_t *)uio;
- else
- uio_prefaultpages(MIN(n, max_blksz), uio);
-
- /*
* If in append mode, set the io offset pointer to eof.
*/
locked_range_t *lr;
@@ -996,9 +996,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
break;
ASSERT(tx_bytes == nbytes);
n -= nbytes;
-
- if (!xuio && n > 0)
- uio_prefaultpages(MIN(n, max_blksz), uio);
}
rangelock_exit(lr);
@@ -2854,8 +2851,11 @@ top:
return (err);
}
- if (vap->va_size == 0)
+ if (vap->va_size == 0) {
vnevent_truncate(ZTOV(zp), ct);
+ } else {
+ vnevent_resize(ZTOV(zp), ct);
+ }
}
if (mask & (AT_ATIME|AT_MTIME) ||
@@ -3783,9 +3783,7 @@ top:
if (error == 0) {
vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
- /* notify the target dir if it is not the same as source dir */
- if (tdvp != sdvp)
- vnevent_rename_dest_dir(tdvp, ct);
+ vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
}
out:
if (zl != NULL)
@@ -4819,10 +4817,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
- if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
- vn_has_cached_data(vp))
- (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
return (0);
}
@@ -4888,8 +4882,13 @@ zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
error = zfs_freesp(zp, off, len, flag, TRUE);
- if (error == 0 && off == 0 && len == 0)
- vnevent_truncate(ZTOV(zp), ct);
+ if (error == 0 && len == 0) {
+ if (off == 0) {
+ vnevent_truncate(ZTOV(zp), ct);
+ } else {
+ vnevent_resize(ZTOV(zp), ct);
+ }
+ }
ZFS_EXIT(zfsvfs);
return (error);
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
new file mode 100644
index 0000000000..f151595095
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -0,0 +1,1419 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
+ * ZFS I/O resources for each zone.
+ *
+ * I/O contention can be a major pain point on a multi-tenant system. A single
+ * zone can issue a stream of I/O operations, usually synchronous writes, which
+ * disrupt I/O performance for all other zones. This problem is further
+ * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
+ * a set of blocks which are atomically synced to disk. The process of
+ * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
+ * out any pending read operations.
+ *
+ * There are two facets to this capability; the throttle and the scheduler.
+ *
+ * Throttle
+ *
+ * The requirements on the throttle are:
+ *
+ * 1) Ensure consistent and predictable I/O latency across all zones.
+ * 2) Sequential and random workloads have very different characteristics,
+ * so it is a non-starter to track IOPS or throughput.
+ * 3) A zone should be able to use the full disk bandwidth if no other zone
+ * is actively using the disk.
+ *
+ * The throttle has two components: one to track and account for each zone's
+ * I/O requests, and another to throttle each zone's operations when it
+ * exceeds its fair share of disk I/O. When the throttle detects that a zone is
+ * consuming more than is appropriate, each read or write system call is
+ * delayed by up to 100 microseconds, which we've found is sufficient to allow
+ * other zones to interleave I/O requests during those delays.
+ *
+ * Note: The throttle will delay each logical I/O (as opposed to the physical
+ * I/O which will likely be issued asynchronously), so it may be easier to
+ * think of the I/O throttle delaying each read/write syscall instead of the
+ * actual I/O operation. For each zone, the throttle tracks an ongoing average
+ * of read and write operations performed to determine the overall I/O
+ * utilization for each zone.
+ *
+ * The throttle calculates an I/O utilization metric for each zone using the
+ * following formula:
+ *
+ * (# of read syscalls) x (Average read latency) +
+ * (# of write syscalls) x (Average write latency)
+ *
+ * Once each zone has its utilization metric, the I/O throttle will compare I/O
+ * utilization across all zones, and if a zone has a higher-than-average I/O
+ * utilization, system calls from that zone are throttled. That is, if one
+ * zone has a much higher utilization, that zone's delay is increased by 5
+ * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is
+ * already throttled and has a lower utilization than average, its delay will
+ * be lowered by 5 microseconds.
+ *
+ * The throttle calculation is driven by IO activity, but since IO does not
+ * happen at fixed intervals, timestamps are used to track when the last update
+ * was made and to drive recalculation.
+ *
+ * The throttle recalculates each zone's I/O usage and throttle delay (if any)
+ * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as
+ * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
+ *
+ * Scheduler
+ *
+ * The I/O scheduler manages the vdev queues -- the queues of pending I/Os to
+ * issue to the disks. It only makes scheduling decisions for the two
+ * synchronous I/O queues (read & write).
+ *
+ * The scheduler maintains how many I/Os in the queue are from each zone, and
+ * if one zone has a disproportionately large number of I/Os in the queue, the
+ * scheduler will allow certain I/Os from the underutilized zones to be "bumped"
+ * and pulled from the middle of the queue. This bump allows zones with a small
+ * number of I/Os (so small they may not even be taken into account by the
+ * throttle) to complete quickly instead of waiting behind dozens of I/Os from
+ * other zones.
+ */
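+
+/*
+ * To make the throttle formula above concrete, each adjustment pass over
+ * the zones reduces to roughly the following sketch (the names here are
+ * illustrative only, not the actual structures used later in this file):
+ *
+ *	util = nreads * avg_read_lat + nwrites * avg_write_lat;
+ *	if (util > fair_share && disk_is_busy)
+ *		delay = MIN(delay + zfs_zone_delay_step,
+ *		    zfs_zone_delay_ceiling);
+ *	else if (delay > 0 && util < fair_share)
+ *		delay = MAX(delay - zfs_zone_delay_step, 0);
+ *
+ * Each subsequent read/write syscall from a throttled zone then sleeps
+ * for "delay" microseconds before its I/O is issued.
+ */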
+
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
+
+#ifndef _KERNEL
+
+/*
+ * Stubs for when compiling for user-land.
+ */
+
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+}
+
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+}
+
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+}
+
+hrtime_t
+zfs_zone_txg_delay(void)
+{
+ return (MSEC2NSEC(10));
+}
+
+#else
+
+/*
+ * The real code.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/zio.h>
+#include <sys/zone.h>
+#include <sys/avl.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+
+/*
+ * The zone throttle delays read and write operations from certain zones based
+ * on each zone's IO utilization. Once a cycle (defined by zfs_zone_cycle_time
+ * below), the delays for each zone are recalculated based on the utilization
+ * over the previous window.
+ */
+boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
+uint8_t zfs_zone_delay_step = 5;	/* usec amount to change delay */
+uint8_t zfs_zone_delay_ceiling = 100; /* usec delay max */
+
+boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
+
+/*
+ * For certain workloads, one zone may be issuing primarily sequential I/O and
+ * another primarily random I/O. The sequential I/O will complete much more
+ * quickly than the random I/O, driving the average system latency for those
+ * operations way down. As a result, the random I/O may be throttled back, even
+ * though the sequential I/O should be throttled to allow the random I/O more
+ * access to the disk.
+ *
+ * This tunable limits the discrepancy between the read and write system
+ * latency. If one becomes excessively high, this tunable prevents the I/O
+ * throttler from exacerbating the imbalance.
+ */
+uint_t zfs_zone_rw_lat_limit = 10;
+
+/*
+ * The I/O throttle will only start delaying zones when it detects disk
+ * utilization has reached a certain level. This tunable controls the
+ * threshold at which the throttle will start delaying zones. When the number
+ * of vdevs is small, the calculation should correspond closely with the %b
+ * column from iostat -- but as the number of vdevs becomes large, it will
+ * correlate less and less to any single device (therefore making it a poor
+ * approximation for the actual I/O utilization on such systems). We
+ * therefore use our derived utilization conservatively: we know that low
+ * derived utilization does indeed correlate to low I/O use -- but that a high
+ * rate of derived utilization does not necessarily alone denote saturation;
+ * where we see a high rate of utilization, we also look for laggard I/Os to
+ * attempt to detect saturation.
+ */
+uint_t zfs_zone_util_threshold = 80;
+uint_t zfs_zone_underutil_threshold = 60;
+
+/*
+ * There are three important tunables here: zfs_zone_laggard_threshold denotes
+ * the threshold at which an I/O is considered to be of notably high latency;
+ * zfs_zone_laggard_recent denotes the number of microseconds before the
+ * current time after which the last laggard is considered to be sufficiently
+ * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
+ * the microseconds before the current time before which the last laggard is
+ * considered to be sufficiently old to merit decreasing the throttle. The
+ * most important tunable of these three is the zfs_zone_laggard_threshold: in
+ * modeling data from a large public cloud, this tunable was found to have a
+ * much greater effect on the throttle than the two time-based thresholds.
+ * This must be set high enough to not result in spurious throttling, but not
+ * so high as to allow pathological I/O to persist in the system.
+ */
+uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */
+uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */
+uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */
+
+/*
+ * Throughout this subsystem, our timestamps are in microseconds. Our system
+ * average cycle is one second or 1 million microseconds. Our zone counter
+ * update cycle is two seconds or 2 million microseconds. We use a longer
+ * duration for that cycle because some ops can see a little over two seconds of
+ * latency when they are being starved by another zone.
+ */
+uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */
+uint_t zfs_zone_cycle_time = 2000000; /* 2 s */
+
+/*
+ * How often the I/O throttle will reevaluate each zone's utilization, in
+ * microseconds. Default is 1/4 sec.
+ */
+uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
+
+typedef struct {
+ hrtime_t cycle_start;
+ hrtime_t cycle_lat;
+ hrtime_t sys_avg_lat;
+ uint_t cycle_cnt;
+} sys_lat_cycle_t;
+
+typedef struct {
+ hrtime_t zi_now;
+ uint_t zi_avgrlat;
+ uint_t zi_avgwlat;
+ uint64_t zi_totpri;
+ uint64_t zi_totutil;
+ int zi_active;
+ uint_t zi_diskutil;
+ boolean_t zi_underutil;
+ boolean_t zi_overutil;
+} zoneio_stats_t;
+
+static sys_lat_cycle_t rd_lat;
+static sys_lat_cycle_t wr_lat;
+
+/*
+ * Some basic disk stats to determine disk utilization. The utilization info
+ * for all disks on the system is aggregated into these values.
+ *
+ * Overall disk utilization for the current cycle is calculated as:
+ *
+ * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
+ * ----------------------------------------------
+ * ((now - zfs_zone_last_checked) * 1000);
+ */
+kmutex_t zfs_disk_lock; /* protects the following: */
+uint_t zfs_disk_rcnt; /* Number of outstanding IOs */
+hrtime_t zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
+hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */
+
+hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
+/* time that we last updated per-zone throttle info */
+kmutex_t zfs_last_check_lock; /* protects zfs_zone_last_checked */
+hrtime_t zfs_zone_last_checked = 0;
+hrtime_t zfs_disk_last_laggard = 0;
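+
+/*
+ * A note on units in the formula above: zfs_disk_rtime is an hrtime_t
+ * accumulated from gethrtime() and so appears to be in nanoseconds,
+ * while zfs_zone_last_checked is kept in microseconds (timestamps in
+ * this subsystem are in usecs; see the comment above
+ * zfs_zone_sys_avg_cycle). Multiplying the elapsed microseconds by 1000
+ * puts both terms in nanoseconds, so the quotient is a busy percentage
+ * comparable to the %b column of iostat(1M), e.g.:
+ *
+ *	pct = ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
+ *	    ((GET_USEC_TIME - zfs_zone_last_checked) * 1000);
+ */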
+
+/*
+ * Data used to keep track of how often txg sync is running.
+ */
+extern int zfs_txg_timeout;
+static uint_t txg_last_check;
+static uint_t txg_cnt;
+static uint_t txg_sync_rate;
+
+boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
+/*
+ * Threshold for when zio scheduling should kick in.
+ *
+ * This threshold is based on the zfs_vdev_sync_read_max_active value for the
+ * number of I/Os that can be pending on a device. If there are more than the
+ * max_active ops already queued up, beyond those already issued to the vdev,
+ * then use zone-based scheduling to get the next synchronous zio.
+ */
+uint32_t zfs_zone_schedule_thresh = 10;
+
+/*
+ * On each pass of the scheduler we increment the zone's weight (up to this
+ * maximum). The weight is used by the scheduler to prevent starvation so
+ * that zones which haven't been able to do any IO over many iterations
+ * will max out their weight to this value.
+ */
+#define SCHED_WEIGHT_MAX 20
+
+/*
+ * Tunables for delay throttling when TXG sync is occurring.
+ *
+ * If the zone is performing a write and we're doing above normal TXG syncing,
+ * then throttle for longer than normal. The zone's wait time is multiplied
+ * by the scale (zfs_zone_txg_throttle_scale).
+ */
+int zfs_zone_txg_throttle_scale = 2;
+hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
+
+typedef struct {
+ int zq_qdepth;
+ zio_priority_t zq_queue;
+ int zq_priority;
+ int zq_wt;
+ zoneid_t zq_zoneid;
+} zone_q_bump_t;
+
+/*
+ * This uses gethrtime() but returns a value in usecs.
+ */
+#define GET_USEC_TIME (gethrtime() / 1000)
+#define	NANO_TO_MICRO(x)	((x) / (NANOSEC / MICROSEC))
+
+/*
+ * Keep track of the zone's ZFS IOPs.
+ *
+ * See the comment on the zfs_zone_io_throttle function for which/how IOPs are
+ * accounted for.
+ *
+ * If the number of ops is >1 then we can just use that value. However,
+ * if the number of ops is <2 then we might have a zone which is trying to do
+ * IO but is not able to get any ops through the system. We don't want to lose
+ * track of this zone so we factor in its decayed count into the current count.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last update
+ * was made. If it was more than one cycle ago, then we need to decay the
+ * historical count by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return a time delta indicating how far into the current cycle we are or 0
+ * if the last IO was more than a cycle ago.
+ */
+static hrtime_t
+compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+ * Check if it's time to recompute a new zone count.
+ * If we're still collecting data for the current cycle, return the
+ * time delta into the cycle.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_cycle_time)
+ return (delta);
+
+ /* A previous cycle is past, compute the new zone count. */
+
+ /*
+ * Figure out how many generations we have to decay the historical
+ * count, since multiple cycles may have elapsed since our last IO.
+ * We depend on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_cycle_time);
+
+ /* If more than 5 cycles have passed since the last IO, reset the count. */
+ if (gen_cnt > 5) {
+ cp->zone_avg_cnt = 0;
+ } else {
+ /* Update the count. */
+ int i;
+
+ /*
+ * If the zone did more than 1 IO, just use its current count
+ * as the historical value, otherwise decay the historical
+ * count and factor that into the new historical count. We
+ * pick a threshold > 1 so that we don't lose track of IO due
+ * to int rounding.
+ */
+ if (cp->cycle_cnt > 1)
+ cp->zone_avg_cnt = cp->cycle_cnt;
+ else
+ cp->zone_avg_cnt = cp->cycle_cnt +
+ (cp->zone_avg_cnt / 2);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+
+ return (0);
+}
+
+/*
+ * Add IO op data to the zone.
+ */
+static void
+add_zone_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+ zone_zfs_io_t *iop;
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return;
+ }
+
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_historical_zone_cnt(unow, &iop->zpers_rd_ops);
+ iop->zpers_rd_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_historical_zone_cnt(unow, &iop->zpers_wr_ops);
+ iop->zpers_wr_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_LOGICAL_WRITE:
+ (void) compute_historical_zone_cnt(unow, &iop->zpers_lwr_ops);
+ iop->zpers_lwr_ops.cycle_cnt++;
+ break;
+ }
+ mutex_exit(&zpd->zpers_zfs_lock);
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * We want to have the recent activity heavily weighted, but if the
+ * activity decreases or stops, then the average should quickly decay
+ * down to the new value.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the average by the proper number of additional cycles in which no IO was
+ * performed.
+ *
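+ * For example (hypothetical numbers): with sys_avg_lat == 900us,
+ * cycle_lat == 3000us and cycle_cnt == 2, the new average is
+ * (900 + 3000) / (1 + 2) = 1300us; two further idle generations would
+ * halve that twice, down to 325us.
+ *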
+ * Return true if we actually computed a new system average.
+ * If we're still within an active cycle there is nothing to do, return false.
+ */
+static boolean_t
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+ * Check if it's time to recompute a new average.
+ * If we're still collecting data for the current cycle, return false.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_sys_avg_cycle)
+ return (B_FALSE);
+
+ /* A previous cycle is past, compute a new system average. */
+
+ /*
+ * Figure out how many generations we have to decay, since multiple
+ * cycles may have elapsed since our last IO.
+ * We count on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
+
+ /* If more than 5 cycles have passed since the last IO, reset the average. */
+ if (gen_cnt > 5) {
+ cp->sys_avg_lat = 0;
+ } else {
+ /* Update the average. */
+ int i;
+
+ cp->sys_avg_lat =
+ (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->sys_avg_lat = cp->sys_avg_lat / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+ cp->cycle_lat = 0;
+
+ return (B_TRUE);
+}
+
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_new_sys_avg(unow, &rd_lat);
+ atomic_inc_uint(&rd_lat.cycle_cnt);
+ atomic_add_64((uint64_t *)&rd_lat.cycle_lat, (int64_t)lat);
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_new_sys_avg(unow, &wr_lat);
+ atomic_inc_uint(&wr_lat.cycle_cnt);
+ atomic_add_64((uint64_t *)&wr_lat.cycle_lat, (int64_t)lat);
+ break;
+ }
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ uint_t cnt;
+
+ if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
+ /*
+ * No activity in the current cycle, we already have the
+ * historical data so we'll use that.
+ */
+ cnt = cp->zone_avg_cnt;
+ } else {
+ /*
+ * If we're less than halfway through the cycle then use
+ * the current count plus half the historical count, otherwise
+ * just use the current count.
+ */
+ if (delta < (zfs_zone_cycle_time / 2))
+ cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
+ else
+ cnt = cp->cycle_cnt;
+ }
+
+ return (cnt);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static uint_t
+calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ if (compute_new_sys_avg(unow, cp)) {
+ /*
+ * No activity in the current cycle, we already have the
+ * historical data so we'll use that.
+ */
+ return (cp->sys_avg_lat);
+ } else {
+ /*
+ * We're within a cycle; weight the current activity higher
+ * compared to the historical data and use that.
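+ *
+ * For example (hypothetical numbers): sys_avg_lat == 1000us with a
+ * single 200us op so far in this cycle yields
+ * (1000 + 200 * 8) / (1 + 1 * 8) = 288us.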
+ */
+ DTRACE_PROBE3(zfs__zone__calc__wt__avg,
+ uintptr_t, cp->sys_avg_lat,
+ uintptr_t, cp->cycle_lat,
+ uintptr_t, cp->cycle_cnt);
+
+ return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
+ (1 + (cp->cycle_cnt * 8)));
+ }
+}
+
+/*
+ * Account for the current IOP on the zone and for the system as a whole.
+ * The latency parameter is in usecs.
+ */
+static void
+add_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op,
+ hrtime_t lat)
+{
+ /* Add op to zone */
+ add_zone_iop(zpd, unow, op);
+
+ /* Track system latency */
+ if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
+ add_sys_iop(unow, op, lat);
+}
+
+/*
+ * Calculate and return the total number of read ops, write ops and logical
+ * write ops for the given zone. If the zone has issued operations of any type
+ * return a non-zero value, otherwise return 0.
+ */
+static int
+get_zone_io_cnt(hrtime_t unow, zone_zfs_io_t *zpd, uint_t *rops, uint_t *wops,
+ uint_t *lwops)
+{
+ ASSERT3P(zpd, !=, NULL);
+
+ *rops = calc_zone_cnt(unow, &zpd->zpers_rd_ops);
+ *wops = calc_zone_cnt(unow, &zpd->zpers_wr_ops);
+ *lwops = calc_zone_cnt(unow, &zpd->zpers_lwr_ops);
+
+ DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zpd,
+ uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
+
+ return (*rops | *wops | *lwops);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static void
+get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
+{
+ *rlat = calc_avg_lat(unow, &rd_lat);
+ *wlat = calc_avg_lat(unow, &wr_lat);
+
+ /*
+ * In an attempt to improve the accuracy of the throttling algorithm,
+ * assume that IO operations can't have zero latency. Instead, assume
+ * a reasonable lower bound for each operation type. If the actual
+ * observed latencies are non-zero, use those latency values instead.
+ */
+ if (*rlat == 0)
+ *rlat = 1000;
+ if (*wlat == 0)
+ *wlat = 1000;
+
+ DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
+ uintptr_t, *wlat);
+}
+
+/*
+ * Find disk utilization for each zone and average utilization for all active
+ * zones.
+ */
+static int
+zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint_t rops, wops, lwops;
+ zone_persist_t *zpd = &zone_pdata[zonep->zone_id];
+ zone_zfs_io_t *iop = zpd->zpers_zfsp;
+
+ ASSERT3P(iop, !=, NULL);
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ if (zonep->zone_id == GLOBAL_ZONEID ||
+ get_zone_io_cnt(sp->zi_now, iop, &rops, &wops, &lwops) == 0) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return (0);
+ }
+
+ iop->zpers_io_util = (rops * sp->zi_avgrlat) + (wops * sp->zi_avgwlat) +
+ (lwops * sp->zi_avgwlat);
+ sp->zi_totutil += iop->zpers_io_util;
+
+ if (iop->zpers_io_util > 0) {
+ sp->zi_active++;
+ sp->zi_totpri += iop->zpers_zfs_io_pri;
+ }
+
+ /*
+ * sdt:::zfs-zone-utilization
+ *
+ * arg0: zone ID
+ * arg1: read operations observed during time window
+ * arg2: physical write operations observed during time window
+ * arg3: logical write ops observed during time window
+ * arg4: calculated utilization given read and write ops
+ * arg5: I/O priority assigned to this zone
+ */
+ DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
+ uint_t, rops, uint_t, wops, uint_t, lwops,
+ uint64_t, iop->zpers_io_util, uint16_t, iop->zpers_zfs_io_pri);
+
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ return (0);
+}
+
+static void
+zfs_zone_delay_inc(zone_zfs_io_t *zpd)
+{
+ ASSERT3P(zpd, !=, NULL);
+
+ if (zpd->zpers_io_delay < zfs_zone_delay_ceiling)
+ zpd->zpers_io_delay += zfs_zone_delay_step;
+}
+
+static void
+zfs_zone_delay_dec(zone_zfs_io_t *zpd)
+{
+ ASSERT3P(zpd, !=, NULL);
+
+ if (zpd->zpers_io_delay > 0)
+ zpd->zpers_io_delay -= zfs_zone_delay_step;
+}
+
+/*
+ * For all zones "far enough" away from the average utilization, increase that
+ * zone's delay. Otherwise, reduce its delay.
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+ zone_persist_t *zpd = &zone_pdata[zonep->zone_id];
+ zone_zfs_io_t *iop = zpd->zpers_zfsp;
+ zoneio_stats_t *sp = arg;
+ uint8_t delay;
+ uint_t fairutil = 0;
+
+ ASSERT3P(iop, !=, NULL);
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ delay = iop->zpers_io_delay;
+ iop->zpers_io_util_above_avg = 0;
+
+ /*
+ * Given the calculated total utilization for all zones, calculate the
+ * fair share of I/O for this zone.
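+ *
+ * For example (hypothetical numbers): with three active zones of
+ * priorities 100, 100 and 200 and a total utilization of 8000, the
+ * priority-200 zone's fair share is (8000 * 200) / 400 = 4000.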
+ */
+ if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
+ fairutil = (sp->zi_totutil * iop->zpers_zfs_io_pri) /
+ sp->zi_totpri;
+ } else if (sp->zi_active > 0) {
+ fairutil = sp->zi_totutil / sp->zi_active;
+ }
+
+ /*
+ * Adjust the zone's delay. If the delay becomes too high,
+ * zfs_zone_delay_inc avoids increasing it beyond the ceiling value.
+ */
+ if (iop->zpers_io_util > fairutil && sp->zi_overutil) {
+ iop->zpers_io_util_above_avg = 1;
+
+ if (sp->zi_active > 1)
+ zfs_zone_delay_inc(iop);
+ } else if (iop->zpers_io_util < fairutil || sp->zi_underutil ||
+ sp->zi_active <= 1) {
+ zfs_zone_delay_dec(iop);
+ }
+
+ /*
+ * sdt:::zfs-zone-throttle
+ *
+ * arg0: zone ID
+ * arg1: old delay for this zone
+ * arg2: new delay for this zone
+ * arg3: calculated fair I/O utilization
+ * arg4: actual I/O utilization
+ */
+ DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
+ uintptr_t, delay, uintptr_t, iop->zpers_io_delay,
+ uintptr_t, fairutil, uintptr_t, iop->zpers_io_util);
+
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ return (0);
+}
+
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
+ */
+static void
+zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
+{
+ zoneio_stats_t stats;
+ hrtime_t laggard_udelta = 0;
+
+ (void) bzero(&stats, sizeof (stats));
+
+ stats.zi_now = unow;
+ get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
+
+ if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
+ stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
+ else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
+ stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
+
+ if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
+ return;
+
+ /*
+ * Calculate disk utilization for the most recent period.
+ */
+ if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
+ stats.zi_diskutil = 0;
+ } else {
+ stats.zi_diskutil =
+ ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
+ ((unow - last_checked) * 1000);
+ }
+ zfs_disk_last_rtime = zfs_disk_rtime;
+
+ if (unow > zfs_disk_last_laggard)
+ laggard_udelta = unow - zfs_disk_last_laggard;
+
+ /*
+ * To minimize porpoising, we have three separate states for our
+ * assessment of I/O performance: overutilized, underutilized, and
+ * neither overutilized nor underutilized. We will increment the
+ * throttle if a zone is using more than its fair share _and_ I/O
+ * is overutilized; we will decrement the throttle if a zone is using
+ * less than its fair share _or_ I/O is underutilized.
+ */
+ stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
+ laggard_udelta > zfs_zone_laggard_ancient;
+
+ stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
+ laggard_udelta < zfs_zone_laggard_recent;
+
+ /*
+ * sdt:::zfs-zone-stats
+ *
+ * Statistics observed over the last period:
+ *
+ * arg0: average system read latency
+ * arg1: average system write latency
+ * arg2: number of active zones
+ * arg3: total I/O 'utilization' for all zones
+ * arg4: total I/O priority of all active zones
+ * arg5: calculated disk utilization
+ */
+ DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
+ uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
+ uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
+ uintptr_t, stats.zi_diskutil);
+
+ (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
+}
+
+/*
+ * Callback used to calculate a zone's IO schedule priority.
+ *
+ * We scan the zones looking for ones with ops in the queue. Out of those,
+ * we pick the one that calculates to the highest schedule priority.
+ */
+static int
+get_sched_pri_cb(zone_t *zonep, void *arg)
+{
+ int pri;
+ uint_t cnt;
+ zone_q_bump_t *qbp = arg;
+ zio_priority_t p = qbp->zq_queue;
+ zone_persist_t *zpd = &zone_pdata[zonep->zone_id];
+ zone_zfs_io_t *iop;
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return (0);
+ }
+
+ cnt = iop->zpers_zfs_queued[p];
+ if (cnt == 0) {
+ iop->zpers_zfs_weight = 0;
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return (0);
+ }
+
+ /*
+ * On each pass, increment the zone's weight. We use this as input
+ * to the calculation to prevent starvation. The value is reset
+ * each time we issue an IO for this zone so zones which haven't
+ * done any IO over several iterations will see their weight max
+ * out.
+ */
+ if (iop->zpers_zfs_weight < SCHED_WEIGHT_MAX)
+ iop->zpers_zfs_weight++;
+
+ /*
+ * This zone's IO priority is the inverse of the number of IOs
+ * the zone has enqueued * zone's configured priority * weight.
+ * The queue depth has already been scaled by 10 to avoid problems
+ * with int rounding.
+ *
+ * This means that zones with fewer IOs in the queue will get
+ * preference unless other zones' assigned priorities pull them
+ * ahead. The weight is factored in to help ensure that zones
+ * which haven't done IO in a while aren't getting starved.
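+ *
+ * For example (hypothetical numbers): with a scaled queue depth of 200
+ * (20 IOs * 10), a zone with 2 queued ops, a configured priority of 100
+ * and a weight of 3 computes pri = (200 / 2) * 100 * 3 = 30000.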
+ */
+ pri = (qbp->zq_qdepth / cnt) *
+ iop->zpers_zfs_io_pri * iop->zpers_zfs_weight;
+
+ /*
+ * If this zone has a higher priority than what we found so far,
+ * it becomes the new leading contender.
+ */
+ if (pri > qbp->zq_priority) {
+ qbp->zq_zoneid = zonep->zone_id;
+ qbp->zq_priority = pri;
+ qbp->zq_wt = iop->zpers_zfs_weight;
+ }
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return (0);
+}
+
+/*
+ * See if we need to bump a zone's zio to the head of the queue. This is only
+ * done on the two synchronous I/O queues (see the block comment on the
+ * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
+ * queue depth from our caller.
+ *
+ * For single-threaded synchronous processes a zone cannot get more than
+ * 1 op into the queue at a time unless the zone is running multiple processes
+ * in parallel. This can cause an imbalance in performance if there are zones
+ * with many parallel processes (and ops in the queue) vs. other zones which
+ * are doing simple single-threaded processes, such as interactive tasks in the
+ * shell. These zones can get backed up behind a deep queue and their IO
+ * performance will appear to be very poor as a result, making the zone
+ * feel unresponsive for interactive work.
+ *
+ * The scheduling algorithm kicks in once we start to get a deeper queue.
+ * Once that occurs, we look at all of the zones to see which one calculates
+ * to the highest priority. We bump that zone's first zio to the head of the
+ * queue.
+ *
+ * We use a counter on the zone so that we can quickly find how many ops each
+ * zone has in the queue without having to search the entire queue itself.
+ * This scales better since the number of zones is expected to be on the
+ * order of 10-100 whereas the queue depth can be in the range of 50-2000.
+ * In addition, since the zio's in the queue only have the zoneid, we would
+ * have to look up the zone for each zio enqueued and that means the overhead
+ * for scanning the queue each time would be much higher.
+ *
+ * In all cases, we fall back to simply pulling the next op off the queue
+ * if something should go wrong.
+ */
+static zio_t *
+get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p,
+ avl_tree_t *tree)
+{
+ zone_q_bump_t qbump;
+ zio_t *zp = NULL, *zphead;
+ int cnt = 0;
+
+ /* To avoid problems with int rounding, scale the queue depth by 10 */
+ qbump.zq_qdepth = qdepth * 10;
+ qbump.zq_priority = 0;
+ qbump.zq_zoneid = 0;
+ qbump.zq_queue = p;
+ (void) zone_walk(get_sched_pri_cb, &qbump);
+
+ zphead = avl_first(tree);
+
+ /*
+ * If the scheduler picked a zone, find that zone's first zio in the
+ * queue; otherwise fall through and use the queue head below.
+ */
+ if (qbump.zq_zoneid != 0) {
+ for (zp = avl_first(tree); zp != NULL;
+ zp = avl_walk(tree, zp, AVL_AFTER)) {
+ if (zp->io_zoneid == qbump.zq_zoneid)
+ break;
+ cnt++;
+ }
+ }
+
+ if (zp == NULL) {
+ zp = zphead;
+ } else if (zp != zphead) {
+ /*
+ * Only fire the probe if we actually picked a different zio
+ * than the one already at the head of the queue.
+ */
+ DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid,
+ uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt);
+ }
+
+ return (zp);
+}
+
+/*
+ * Add our zone ID to the zio so we can keep track of which zones are doing
+ * what, even when the current thread processing the zio is not associated
+ * with the zone (e.g. the kernel taskq which pushes out TX groups).
+ */
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+ zone_t *zonep = curzone;
+
+ zp->io_zoneid = zonep->zone_id;
+}
+
+/*
+ * Track and throttle IO operations per zone. Called from:
+ * - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes
+ * go through this path)
+ * - arc_read for read ops that miss the ARC (both dataset and zvol)
+ * For each operation, increment that zone's counter based on the type of
+ * operation, then delay the operation, if necessary.
+ *
+ * There are three basic ways that we can see write ops:
+ * 1) An application does write syscalls. Those ops go into a TXG which
+ * we'll count here. Sometime later a kernel taskq thread (we'll see the
+ * vdev IO as zone 0) will perform some number of physical writes to commit
+ * the TXG to disk. Those writes are not associated with the zone which
+ * made the write syscalls and the number of operations is not correlated
+ * between the taskq and the zone. We only see logical writes in this
+ * function; we see the physical writes in the zfs_zone_zio_start and
+ * zfs_zone_zio_done functions.
+ * 2) An application opens a file with O_SYNC. Each write will result in
+ * an operation which we'll see here plus a low-level vdev write from
+ * that zone.
+ * 3) An application does write syscalls followed by an fsync(). We'll
+ * count the writes going into a TXG here. We'll also see some number
+ * (usually much smaller, maybe only 1) of low-level vdev writes from this
+ * zone when the fsync is performed, plus some other low-level vdev writes
+ * from the taskq in zone 0 (are these metadata writes?).
+ *
+ * 4) In addition to the above, there are misc. system-level writes, such as
+ * writing out dirty pages to swap, or sync(2) calls, which will be handled
+ * by the global zone and which we count but don't generally worry about.
+ *
+ * Because of the above, we can see writes twice; first because this function
+ * is always called by a zone thread for logical writes, and then we also
+ * count the physical writes that are performed at a low level via
+ * zfs_zone_zio_start. Without this, it can look like a non-global zone never
+ * writes (case 1). Depending on when the TXG is synced, the counts may be in
+ * the same sample bucket or in a different one.
+ *
+ * Tracking read operations is simpler due to their synchronous semantics. The
+ * zfs_read function -- called as a result of a read(2) syscall -- will always
+ * retrieve the data to be read through arc_read and we only come into this
+ * function when we have an arc miss.
+ */
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+ zoneid_t zid = curzone->zone_id;
+ zone_persist_t *zpd = &zone_pdata[zid];
+ zone_zfs_io_t *iop;
+ hrtime_t unow;
+ uint16_t wait;
+
+ unow = GET_USEC_TIME;
+
+ /*
+ * Only bump the counter for logical writes here. The counters for
+ * tracking physical IO operations are handled in zfs_zone_zio_done.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
+ add_iop(zpd, unow, type, 0);
+ }
+
+ if (!zfs_zone_delay_enable)
+ return;
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return;
+ }
+
+ /*
+ * If the zone's I/O priority is set to zero, don't throttle that zone's
+ * operations at all.
+ */
+ if (iop->zpers_zfs_io_pri == 0) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return;
+ }
+
+ /* Handle periodically updating the per-zone I/O parameters */
+ if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) {
+ hrtime_t last_checked;
+ boolean_t do_update = B_FALSE;
+
+ /* Recheck under mutex */
+ mutex_enter(&zfs_last_check_lock);
+ last_checked = zfs_zone_last_checked;
+ if ((unow - last_checked) > zfs_zone_adjust_time) {
+ zfs_zone_last_checked = unow;
+ do_update = B_TRUE;
+ }
+ mutex_exit(&zfs_last_check_lock);
+
+ if (do_update) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ zfs_zone_wait_adjust(unow, last_checked);
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return;
+ }
+ }
+ }
+
+ wait = iop->zpers_io_delay;
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ if (wait > 0) {
+ /*
+ * If this is a write and we're doing above normal TXG
+ * syncing, then throttle for longer than normal.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
+ (txg_cnt > 1 || txg_sync_rate > 1))
+ wait *= zfs_zone_txg_throttle_scale;
+
+ /*
+ * sdt:::zfs-zone-wait
+ *
+ * arg0: zone ID
+ * arg1: type of IO operation
+ * arg2: time to delay (in us)
+ */
+ DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zid,
+ uintptr_t, type, uintptr_t, wait);
+
+ drv_usecwait(wait);
+
+ if (curzone->zone_vfs_stats != NULL) {
+ atomic_inc_64(&curzone->zone_vfs_stats->
+ zv_delay_cnt.value.ui64);
+ atomic_add_64(&curzone->zone_vfs_stats->
+ zv_delay_time.value.ui64, wait);
+ }
+ }
+}
+
+/*
+ * XXX Ignore the pool pointer parameter for now.
+ *
+ * Keep track to see if the TXG sync rate is running above the expected rate.
+ * If so, this implies that we are filling TXG's at a high rate due to a heavy
+ * write workload. We use this as input into the zone throttle.
+ *
+ * This function is called every 5 seconds (zfs_txg_timeout) under a normal
+ * write load. In this case, the sync rate is going to be 1. When there
+ * is a heavy write load, TXG's fill up fast and the sync thread will write
+ * the TXG more frequently (perhaps once a second). In this case the rate
+ * will be > 1. The sync rate is a lagging indicator since it can be up
+ * to 5 seconds old. We use the txg_cnt to keep track of the rate in the
+ * current 5 second interval and txg_sync_rate to keep track of the previous
+ * 5 second interval. In that way we don't have a period (1 or more seconds)
+ * where the txg_cnt == 0 and we cut back on throttling even though the rate
+ * is still high.
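+ *
+ * For example (hypothetical numbers): 10 TXG syncs within a 5 second
+ * interval yield txg_sync_rate = 10 / 2 = 5, which keeps the write
+ * throttle scaled up through the following interval.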
+ */
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+ uint_t now;
+
+ txg_cnt++;
+ now = (uint_t)(gethrtime() / NANOSEC);
+ if ((now - txg_last_check) >= zfs_txg_timeout) {
+ txg_sync_rate = txg_cnt / 2;
+ txg_cnt = 0;
+ txg_last_check = now;
+ }
+}
+
+hrtime_t
+zfs_zone_txg_delay()
+{
+ zone_persist_t *zpd = &zone_pdata[curzone->zone_id];
+ zone_zfs_io_t *iop;
+ uint8_t above;
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return (0);
+ }
+
+ above = iop->zpers_io_util_above_avg;
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ if (above) {
+ return (zfs_zone_txg_delay_nsec);
+ }
+
+ return (MSEC2NSEC(10));
+}
+
+/*
+ * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
+ * and is issued.
+ * Keep track of start time for latency calculation in zfs_zone_zio_done.
+ */
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+ zone_persist_t *zpd = &zone_pdata[zp->io_zoneid];
+ zone_zfs_io_t *iop;
+
+ /*
+ * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
+ * an actual I/O operation. Ignore them for the purposes of throttling
+ * and scheduling.
+ */
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop != NULL) {
+ if (zp->io_type == ZIO_TYPE_READ)
+ kstat_runq_enter(&iop->zpers_zfs_rwstats);
+ iop->zpers_zfs_weight = 0;
+ }
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zp->io_dispatched = gethrtime();
+
+ if (zfs_disk_rcnt++ != 0)
+ zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = zp->io_dispatched;
+ mutex_exit(&zfs_disk_lock);
+}
+
+/*
+ * Called from vdev_disk_io_done when an IO completes.
+ * Increment our counter for zone ops.
+ * Calculate the IO latency avg. for this zone.
+ */
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+ zone_persist_t *zpd;
+ zone_zfs_io_t *iop;
+ hrtime_t now, unow, udelta;
+
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ if (zp->io_dispatched == 0)
+ return;
+
+ zpd = &zone_pdata[zp->io_zoneid];
+
+ now = gethrtime();
+ unow = NANO_TO_MICRO(now);
+ udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop != NULL) {
+ /*
+ * To calculate the wsvc_t average, keep a cumulative sum of
+ * all the wait time before each I/O was dispatched. Since most
+ * writes are asynchronous, only track the wait time for
+ * read I/Os.
+ */
+ if (zp->io_type == ZIO_TYPE_READ) {
+ iop->zpers_zfs_rwstats.reads++;
+ iop->zpers_zfs_rwstats.nread += zp->io_size;
+ iop->zpers_zfs_rd_waittime +=
+ zp->io_dispatched - zp->io_timestamp;
+ kstat_runq_exit(&iop->zpers_zfs_rwstats);
+ } else {
+ iop->zpers_zfs_rwstats.writes++;
+ iop->zpers_zfs_rwstats.nwritten += zp->io_size;
+ }
+ }
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zfs_disk_rcnt--;
+ zfs_disk_rtime += (now - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = now;
+
+ if (udelta > zfs_zone_laggard_threshold)
+ zfs_disk_last_laggard = unow;
+
+ mutex_exit(&zfs_disk_lock);
+
+ if (zfs_zone_delay_enable) {
+ add_iop(zpd, unow, zp->io_type == ZIO_TYPE_READ ?
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
+ }
+
+ /*
+ * sdt:::zfs-zone-latency
+ *
+ * arg0: zone ID
+ * arg1: type of I/O operation
+ * arg2: I/O latency (in us)
+ */
+ DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid,
+ uintptr_t, zp->io_type, uintptr_t, udelta);
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+ zio_priority_t p;
+ zone_persist_t *zpd = &zone_pdata[zp->io_zoneid];
+ zone_zfs_io_t *iop;
+
+ p = zp->io_priority;
+ if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
+ return;
+
+ /* We depend on p being defined as either 0 or 1 */
+ ASSERT(p < 2);
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop != NULL) {
+ ASSERT(iop->zpers_zfs_queued[p] > 0);
+ if (iop->zpers_zfs_queued[p] == 0) {
+ cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
+ } else {
+ iop->zpers_zfs_queued[p]--;
+ }
+ }
+ mutex_exit(&zpd->zpers_zfs_lock);
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+ zio_priority_t p;
+ zone_persist_t *zpd = &zone_pdata[zp->io_zoneid];
+ zone_zfs_io_t *iop;
+
+ p = zp->io_priority;
+ if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
+ return;
+
+ /* We depend on p being defined as either 0 or 1 */
+ ASSERT(p < 2);
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop != NULL) {
+ iop->zpers_zfs_queued[p]++;
+ }
+ mutex_exit(&zpd->zpers_zfs_lock);
+}
+
+/*
+ * Called from vdev_queue_io_to_issue. That function is where zios are listed
+ * in FIFO order on one of the sync queues, then pulled off (by
+ * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling
+ * here to find a zone's zio deeper in the sync queue and issue that instead
+ * of simply doing FIFO.
+ *
+ * We only do zone-based zio scheduling for the two synchronous I/O queues
+ * (read & write). These queues are normally serviced in FIFO order but we
+ * may decide to move a zone's zio to the head of the line. A typical I/O
+ * load will be mostly synchronous reads and some asynchronous writes (which
+ * are scheduled differently due to transaction groups). There will also be
+ * some synchronous writes for those apps which want to ensure their data is on
+ * disk. We want to make sure that a zone with a single-threaded app (e.g. the
+ * shell) that is doing synchronous I/O (typically reads) isn't penalized by
+ * other zones which are doing lots of synchronous I/O because they have many
+ * running threads.
+ *
+ * The vq->vq_lock mutex is held when we're executing this function so we
+ * can safely access the "last zone" variable on the queue.
+ */
+zio_t *
+zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx,
+ avl_tree_t *tree)
+{
+ vdev_queue_class_t *vqc = &vq->vq_class[p];
+ uint_t cnt;
+ zoneid_t last_zone;
+ zio_t *zio;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ /* Don't change the order on the LBA ordered queues. */
+ if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
+ return (avl_nearest(tree, idx, AVL_AFTER));
+
+ /* We depend on p being defined as either 0 or 1 */
+ ASSERT(p < 2);
+
+ cnt = avl_numnodes(tree);
+ last_zone = vq->vq_last_zone_id;
+
+ /*
+ * If there are only a few zios in the queue then just issue the head.
+ * If there are more than a few zios already queued up, then use
+ * scheduling to get the next zio.
+ */
+ if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ else
+ zio = get_next_zio(vqc, cnt, p, tree);
+
+ vq->vq_last_zone_id = zio->io_zoneid;
+
+ /*
+ * Probe with 4 args; the number of IOs in the queue, the zone that
+ * was last scheduled off this queue, the zone that was associated
+ * with the next IO that is scheduled, and which queue (priority).
+ */
+ DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone,
+ uint_t, zio->io_zoneid, uint_t, p);
+
+ return (zio);
+}
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 546e4f3d1e..547ebac383 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2019, Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@@ -3064,13 +3065,20 @@ zil_close(zilog_t *zilog)
txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
mutex_exit(&zilog->zl_lock);
- /*
- * We need to use txg_wait_synced() to wait long enough for the
- * ZIL to be clean, and to wait for all pending lwbs to be
- * written out.
- */
- if (txg != 0)
+ if (zilog_is_dirty(zilog)) {
+ /*
+ * If we're dirty, always wait for the current transaction --
+ * our lwb_max_txg may be in the past.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ } else if (txg != 0) {
+ /*
+ * We need to use txg_wait_synced() to wait long enough for the
+ * ZIL to be clean, and to wait for all pending lwbs to be
+ * written out.
+ */
txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
if (zilog_is_dirty(zilog))
zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 619dad47f3..8aaa2e19a2 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -22,7 +22,9 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
*/
@@ -41,6 +43,7 @@
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
+#include <sys/zfs_zone.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
#include <sys/cityhash.h>
@@ -621,6 +624,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
+ zio->io_zoneid = pio->io_zoneid;
if (zio->io_metaslab_class == NULL)
zio->io_metaslab_class = pio->io_metaslab_class;
if (zio->io_logical == NULL)
@@ -628,6 +632,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
if (zio->io_child_type == ZIO_CHILD_GANG)
zio->io_gang_leader = pio->io_gang_leader;
zio_add_child(pio, zio);
+ } else {
+ zfs_zone_zio_init(zio);
}
return (zio);
@@ -3827,6 +3833,24 @@ zio_done(zio_t *zio)
}
}
+ /*
+ * When we have an error on a slog vdev, we must ensure that the
+ * zio is not suspended. Suspending the zio will cause dataset deletion
+ * or an attempt to remove the slog to hang. In both cases, the code
+ * might be trying to clean up the zil blocks on the slog, but because
+ * the slog is dead, the suspended zio causes this to hang indefinitely.
+ * The system properly switches over to using zils on regular storage
+ * when the slog dies.
+ *
+ * This is a reasonable point in the stack to detect that the vdev is
+ * a slog. The 'no_suspend' flag will propagate up to the logical zio
+ * via zio_notify_parent.
+ */
+ if (zio->io_error && vd != NULL && vd->vdev_islog &&
+ !vdev_accessible(vd, zio)) {
+ zio->io_reexecute |= ZIO_REEXECUTE_NO_SUSPEND;
+ }
+
if (zio->io_error && zio == lio) {
/*
* Determine whether zio should be reexecuted. This will
@@ -3871,7 +3895,7 @@ zio_done(zio_t *zio)
*/
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
- if ((zio->io_error || zio->io_reexecute) &&
+ if ((zio->io_error || ZIO_SHOULD_REEXECUTE(zio)) &&
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
!(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
zio_dva_unallocate(zio, zio->io_gang_tree, bp);
@@ -3885,7 +3909,7 @@ zio_done(zio_t *zio)
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
zio->io_reexecute = 0;
- if (zio->io_reexecute) {
+ if (ZIO_SHOULD_REEXECUTE(zio)) {
/*
* This is a logical I/O that wants to reexecute.
*
@@ -3956,7 +3980,7 @@ zio_done(zio_t *zio)
}
ASSERT(zio->io_child_count == 0);
- ASSERT(zio->io_reexecute == 0);
+ ASSERT(!ZIO_SHOULD_REEXECUTE(zio));
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
/*
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 10ea804f8d..33bac61d21 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -25,7 +25,6 @@
*
* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2019, Joyent, Inc.
*/
@@ -85,11 +84,13 @@
#include <sys/zvol.h>
#include <sys/dumphdr.h>
#include <sys/zil_impl.h>
+#include <sys/sdt.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/zil_impl.h>
+#include <sys/ht.h>
#include <sys/dkioc_free_util.h>
#include <sys/zfs_rlock.h>
@@ -142,6 +143,11 @@ typedef struct zvol_state {
#define ZVOL_EXCL 0x4
#define ZVOL_WCE 0x8
+#define VOP_LATENCY_10MS 10000000
+#define VOP_LATENCY_100MS 100000000
+#define VOP_LATENCY_1S 1000000000
+#define VOP_LATENCY_10S 10000000000
+
/*
* zvol maximum transfer in one DMU tx.
*/
@@ -1272,6 +1278,8 @@ zvol_strategy(buf_t *bp)
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
!doread && !is_dumpified;
+ ht_begin_unsafe();
+
/*
* There must be no buffer changes when doing a dmu_sync() because
* we can't change the data whilst calculating the checksum.
@@ -1319,6 +1327,8 @@ zvol_strategy(buf_t *bp)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
biodone(bp);
+ ht_end_unsafe();
+
return (0);
}
@@ -1380,6 +1390,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
zvol_state_t *zv;
uint64_t volsize;
int error = 0;
+ zone_t *zonep = curzone;
+ uint64_t tot_bytes;
+ hrtime_t start, lat;
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
if (zv == NULL)
@@ -1396,6 +1409,16 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ ht_begin_unsafe();
+
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0);
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_runq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ start = gethrtime();
+ tot_bytes = 0;
+
locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
uio->uio_loffset, uio->uio_resid, RL_READER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1405,6 +1428,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
if (bytes > volsize - uio->uio_loffset)
bytes = volsize - uio->uio_loffset;
+ tot_bytes += bytes;
error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
if (error) {
/* convert checksum errors into IO errors */
@@ -1415,6 +1439,40 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
}
rangelock_exit(lr);
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.reads++;
+ zonep->zone_vfs_rwstats.nread += tot_bytes;
+ kstat_runq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
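+ /*
+ * Bucket the op latency. The buckets are cumulative: an op that took,
+ * say, 2 seconds increments the 10ms, 100ms and 1s counters, so each
+ * kstat counts the ops that took at least that long.
+ */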
+ if (lat >= VOP_LATENCY_10MS) {
+ zone_vfs_kstat_t *zvp;
+
+ zvp = zonep->zone_vfs_stats;
+ if (lat < VOP_LATENCY_100MS) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int,
+ error);
+
+ ht_end_unsafe();
+
return (error);
}
@@ -1427,6 +1485,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
uint64_t volsize;
int error = 0;
boolean_t sync;
+ zone_t *zonep = curzone;
+ uint64_t tot_bytes;
+ hrtime_t start, lat;
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
if (zv == NULL)
@@ -1443,6 +1504,21 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ ht_begin_unsafe();
+
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1);
+
+ /*
+ * For the purposes of VFS kstat consumers, the "waitq" calculation is
+ * repurposed as the active queue for zvol write operations. There's no
+ * actual wait queue for zvol operations.
+ */
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_waitq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ start = gethrtime();
+ tot_bytes = 0;
+
sync = !(zv->zv_flags & ZVOL_WCE) ||
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
@@ -1456,6 +1532,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
if (bytes > volsize - off) /* don't write past the end */
bytes = volsize - off;
+ tot_bytes += bytes;
dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
@@ -1474,6 +1551,41 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int,
+ error);
+
+ ht_end_unsafe();
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.writes++;
+ zonep->zone_vfs_rwstats.nwritten += tot_bytes;
+ kstat_waitq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
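+ /*
+ * As in zvol_read, bucket the op latency cumulatively: an op
+ * increments every bucket whose threshold it met or exceeded.
+ */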
+ if (lat >= VOP_LATENCY_10MS) {
+ zone_vfs_kstat_t *zvp;
+
+ zvp = zonep->zone_vfs_stats;
+ if (lat < VOP_LATENCY_100MS) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+
return (error);
}
@@ -1714,11 +1826,17 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
case DKIOCFLUSHWRITECACHE:
dkc = (struct dk_callback *)arg;
mutex_exit(&zfsdev_state_lock);
+
+ ht_begin_unsafe();
+
zil_commit(zv->zv_zilog, ZVOL_OBJ);
if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
(*dkc->dkc_callback)(dkc->dkc_cookie, error);
error = 0;
}
+
+ ht_end_unsafe();
+
return (error);
case DKIOCGETWCE:
@@ -1743,7 +1861,9 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
} else {
zv->zv_flags &= ~ZVOL_WCE;
mutex_exit(&zfsdev_state_lock);
+ ht_begin_unsafe();
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ ht_end_unsafe();
}
return (0);
}
@@ -1796,6 +1916,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
mutex_exit(&zfsdev_state_lock);
+ ht_begin_unsafe();
+
for (int i = 0; i < dfl->dfl_num_exts; i++) {
uint64_t start = dfl->dfl_exts[i].dfle_start,
length = dfl->dfl_exts[i].dfle_length,
@@ -1851,6 +1973,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
if (!(flag & FKIOCTL))
dfl_free(dfl);
+ ht_end_unsafe();
+
return (error);
}
diff --git a/usr/src/uts/common/inet/bpf.h b/usr/src/uts/common/inet/bpf.h
new file mode 100644
index 0000000000..e3eac799e5
--- /dev/null
+++ b/usr/src/uts/common/inet/bpf.h
@@ -0,0 +1,49 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _INET_BPF_H
+#define _INET_BPF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+
+/*
+ * Clone bpf_insn definition so that consumers don't need net/bpf.h to reason
+ * about struct sizing.
+ */
+typedef struct ip_bpf_insn {
+ uint16_t code;
+ uint8_t jt;
+ uint8_t jf;
+ uint32_t k;
+} ip_bpf_insn_t;
+
+extern uint32_t ip_bpf_filter(ip_bpf_insn_t *, uchar_t *, uint_t, uint_t);
+extern boolean_t ip_bpf_validate(ip_bpf_insn_t *, uint_t);
+
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_BPF_H */
diff --git a/usr/src/uts/common/io/bpf/bpf_filter.c b/usr/src/uts/common/inet/bpf_filter.c
index db5b224a5e..5a9ba38da6 100644
--- a/usr/src/uts/common/io/bpf/bpf_filter.c
+++ b/usr/src/uts/common/inet/bpf_filter.c
@@ -38,6 +38,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/param.h>
@@ -45,11 +46,12 @@
#include <sys/stream.h>
#include <sys/byteorder.h>
#include <sys/sdt.h>
+#include <inet/bpf.h>
+#include <net/bpf.h>
#define EXTRACT_SHORT(p) BE_IN16(p)
#define EXTRACT_LONG(p) BE_IN32(p)
-#ifdef _KERNEL
#define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr)
#define mtod(_a, _t) ((_t)((_a)->b_rptr))
#define MINDEX(len, m, k) \
@@ -123,11 +125,7 @@ m_xhalf(mblk_t *m, uint32_t k, int *err)
*err = 0;
return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]);
}
-#else /* _KERNEL */
-#include <stdlib.h>
-#endif /* !_KERNEL */
-#include <net/bpf.h>
/*
* Execute the filter program starting at pc on the packet p
@@ -137,8 +135,8 @@ m_xhalf(mblk_t *m, uint32_t k, int *err)
* packet is only in one mblk_t.
* When buflen is 0, p is an mblk_t pointer.
*/
-uint_t
-bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
+uint32_t
+ip_bpf_filter(ip_bpf_insn_t *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
{
uint32_t A, X, k;
uint32_t mem[BPF_MEMWORDS];
@@ -147,7 +145,7 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
/*
* No filter means accept all.
*/
- return ((uint_t)-1);
+ return ((uint32_t)-1);
A = 0;
X = 0;
--pc;
@@ -165,10 +163,10 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
abort();
#endif
case BPF_RET|BPF_K:
- return ((uint_t)pc->k);
+ return (pc->k);
case BPF_RET|BPF_A:
- return ((uint_t)A);
+ return (A);
case BPF_LD|BPF_W|BPF_ABS:
k = pc->k;
@@ -456,7 +454,6 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
/* NOTREACHED */
}
-#ifdef _KERNEL
/*
* Return true if the 'fcode' is a valid filter program.
* The constraints are that each jump be forward and to a valid
@@ -468,14 +465,14 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
* The kernel needs to be able to verify an application's filter code.
* Otherwise, a bogus program could easily crash the system.
*/
-int
-bpf_validate(struct bpf_insn *f, int len)
+boolean_t
+ip_bpf_validate(ip_bpf_insn_t *f, uint_t len)
{
uint_t i, from;
- struct bpf_insn *p;
+ ip_bpf_insn_t *p;
if (len < 1 || len > BPF_MAXINSNS)
- return (0);
+ return (B_FALSE);
for (i = 0; i < len; ++i) {
p = &f[i];
@@ -489,7 +486,7 @@ bpf_validate(struct bpf_insn *f, int len)
switch (BPF_MODE(p->code)) {
case BPF_MEM:
if (p->k >= BPF_MEMWORDS)
- return (0);
+ return (B_FALSE);
break;
case BPF_ABS:
case BPF_IND:
@@ -498,13 +495,13 @@ bpf_validate(struct bpf_insn *f, int len)
case BPF_LEN:
break;
default:
- return (0);
+ return (B_FALSE);
}
break;
case BPF_ST:
case BPF_STX:
if (p->k >= BPF_MEMWORDS)
- return (0);
+ return (B_FALSE);
break;
case BPF_ALU:
switch (BPF_OP(p->code)) {
@@ -522,10 +519,10 @@ bpf_validate(struct bpf_insn *f, int len)
* Check for constant division by 0.
*/
if (BPF_RVAL(p->code) == BPF_K && p->k == 0)
- return (0);
+ return (B_FALSE);
break;
default:
- return (0);
+ return (B_FALSE);
}
break;
case BPF_JMP:
@@ -549,17 +546,17 @@ bpf_validate(struct bpf_insn *f, int len)
switch (BPF_OP(p->code)) {
case BPF_JA:
if (from + p->k < from || from + p->k >= len)
- return (0);
+ return (B_FALSE);
break;
case BPF_JEQ:
case BPF_JGT:
case BPF_JGE:
case BPF_JSET:
if (from + p->jt >= len || from + p->jf >= len)
- return (0);
+ return (B_FALSE);
break;
default:
- return (0);
+ return (B_FALSE);
}
break;
case BPF_RET:
@@ -567,10 +564,9 @@ bpf_validate(struct bpf_insn *f, int len)
case BPF_MISC:
break;
default:
- return (0);
+ return (B_FALSE);
}
}
return (BPF_CLASS(f[len - 1].code) == BPF_RET);
}
-#endif
diff --git a/usr/src/uts/common/inet/inet_hash.h b/usr/src/uts/common/inet/inet_hash.h
new file mode 100644
index 0000000000..a790a797d1
--- /dev/null
+++ b/usr/src/uts/common/inet/inet_hash.h
@@ -0,0 +1,37 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _INET_INET_HASH_H
+#define _INET_INET_HASH_H
+
+/*
+ * Common packet hashing routines shared across MAC, UDP, and others.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INET_PKT_HASH_L2 0x01
+#define INET_PKT_HASH_L3 0x02
+#define INET_PKT_HASH_L4 0x04
+
+extern uint64_t inet_pkt_hash(uint_t, mblk_t *, uint8_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_INET_HASH_H */
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index f67ade9060..e9a3fcdeeb 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -1415,6 +1415,7 @@ typedef union ill_g_head_u {
#define ILL_CAPAB_DLD 0x20 /* DLD capabilities */
#define ILL_CAPAB_DLD_POLL 0x40 /* Polling */
#define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */
+#define ILL_CAPAB_DLD_IPCHECK 0x100 /* Check if IPs are permitted */
/*
* Per-ill Hardware Checksumming capbilities.
@@ -1729,6 +1730,8 @@ typedef struct ill_s {
* Capabilities related fields.
*/
uint_t ill_dlpi_capab_state; /* State of capability query, IDCS_* */
+ kcondvar_t ill_dlpi_capab_cv; /* CV for broadcasting state changes */
+ kmutex_t ill_dlpi_capab_lock; /* Lock for accessing above Cond Var */
uint_t ill_capab_pending_cnt;
uint64_t ill_capabilities; /* Enabled capabilities, ILL_CAPAB_* */
ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */
@@ -1770,6 +1773,10 @@ typedef struct ill_s {
* Used to save errors that occur during plumbing
*/
uint_t ill_ifname_pending_err;
+ /*
+ * Used to save errors that occur during binding
+ */
+ uint_t ill_dl_bind_err;
avl_node_t ill_avl_byppa; /* avl node based on ppa */
uint_t ill_mcast_nces; /* Number of NCEs that are multicast. */
list_t ill_nce; /* pointer to nce_s list */
@@ -1936,6 +1943,7 @@ typedef struct ill_s {
* ill_nd_lla_len ipsq + down ill only when ill is up
* ill_phys_addr_pend ipsq + down ill only when ill is up
* ill_ifname_pending_err ipsq ipsq
+ * ill_dl_bind_err ipsq ipsq
* ill_avl_byppa ipsq, ill_g_lock write once
*
* ill_fastpath_list ill_lock ill_lock
@@ -3578,6 +3586,8 @@ typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t);
typedef void *(*ip_dld_callb_t)(void *,
ip_flow_enable_t, void *);
typedef boolean_t (*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t);
+typedef boolean_t (*ip_mac_ipcheck_t)(void *, boolean_t,
+ in6_addr_t *);
typedef int (*ip_capab_func_t)(void *, uint_t,
void *, uint_t);
@@ -3630,6 +3640,12 @@ typedef struct ill_dld_direct_s { /* DLD provided driver Tx */
void *idd_tx_fctl_dh; /* mac_client_handle */
} ill_dld_direct_t;
+/* IP - DLD direct function call to check if an IP is allowed */
+typedef struct ill_dld_ipcheck_s {
+ ip_mac_ipcheck_t idi_allowed_df;
+ void *idi_allowed_dh;
+} ill_dld_ipcheck_t;
+
/* IP - DLD polling capability */
typedef struct ill_dld_poll_s {
ill_rx_ring_t idp_ring_tbl[ILL_MAX_RINGS];
@@ -3641,6 +3657,7 @@ struct ill_dld_capab_s {
void *idc_capab_dh; /* dld_str_t *dsp */
ill_dld_direct_t idc_direct;
ill_dld_poll_t idc_poll;
+ ill_dld_ipcheck_t idc_ipcheck;
};
/*
diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c
index bcbc1c4949..b4bff4d7b4 100644
--- a/usr/src/uts/common/inet/ip/conn_opt.c
+++ b/usr/src/uts/common/inet/ip/conn_opt.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -619,6 +620,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name,
case SO_REUSEADDR:
*i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0;
break; /* goto sizeof (int) option return */
+ case SO_REUSEPORT:
+ *i1 = connp->conn_reuseport;
+ break; /* goto sizeof (int) option return */
case SO_TYPE:
*i1 = connp->conn_so_type;
break; /* goto sizeof (int) option return */
@@ -1186,8 +1190,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
int error;
- if (connp->conn_family != AF_INET)
+ if (connp->conn_family == AF_INET6 &&
+ connp->conn_ipversion == IPV4_VERSION) {
+ /*
+ * Allow certain IPv4 options to be set on an AF_INET6 socket
+ * if the connection is still IPv4.
+ */
+ switch (name) {
+ case IP_TOS:
+ case T_IP_TOS:
+ case IP_TTL:
+ case IP_DONTFRAG:
+ break;
+ default:
+ return (EINVAL);
+ }
+ } else if (connp->conn_family != AF_INET) {
return (EINVAL);
+ }
switch (name) {
case IP_TTL:
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 36eb88d743..b1a77ae0cc 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -81,6 +81,7 @@
#include <sys/tsol/tnet.h>
#include <inet/rawip_impl.h>
+#include <net/bpf.h>
#include <sys/disp.h>
@@ -1013,6 +1014,12 @@ icmp_close_free(conn_t *connp)
icmp->icmp_filter = NULL;
}
+ if (icmp->icmp_bpf_len != 0) {
+ kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
+ icmp->icmp_bpf_len = 0;
+ icmp->icmp_bpf_prog = NULL;
+ }
+
/*
* Clear any fields which the kmem_cache constructor clears.
* Only icmp_connp needs to be preserved.
@@ -1966,6 +1973,104 @@ icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
return (err);
}
+static int
+icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp)
+{
+ struct bpf_program prog;
+ ip_bpf_insn_t *insns = NULL;
+ unsigned int size;
+
+#ifdef _LP64
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ struct bpf_program32 *prog32;
+
+ if (inlen != sizeof (struct bpf_program32)) {
+ return (EINVAL);
+ }
+ prog32 = (struct bpf_program32 *)invalp;
+ prog.bf_len = prog32->bf_len;
+ prog.bf_insns = (void *)(uint64_t)prog32->bf_insns;
+ } else
+#endif
+ if (inlen == sizeof (struct bpf_program)) {
+ bcopy(invalp, &prog, sizeof (prog));
+ } else {
+ return (EINVAL);
+ }
+
+ if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) {
+ return (EINVAL);
+ }
+ size = prog.bf_len * sizeof (struct bpf_insn);
+ insns = kmem_alloc(size, KM_SLEEP);
+ if (copyin(prog.bf_insns, insns, size) != 0) {
+ kmem_free(insns, size);
+ return (EFAULT);
+ }
+ if (!ip_bpf_validate(insns, prog.bf_len)) {
+ kmem_free(insns, size);
+ return (EINVAL);
+ }
+
+ rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
+ if (icmp->icmp_bpf_len != 0) {
+ ASSERT(icmp->icmp_bpf_prog != NULL);
+
+ kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
+ }
+ icmp->icmp_bpf_len = size;
+ icmp->icmp_bpf_prog = insns;
+ rw_exit(&icmp->icmp_bpf_lock);
+ return (0);
+}
+
+static int
+icmp_detach_filter(icmp_t *icmp)
+{
+ int error;
+
+ rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
+ if (icmp->icmp_bpf_len == 0) {
+ ASSERT(icmp->icmp_bpf_prog == NULL);
+ error = ENOENT;
+ } else {
+ kmem_free(icmp->icmp_bpf_prog,
+ icmp->icmp_bpf_len);
+ icmp->icmp_bpf_len = 0;
+ icmp->icmp_bpf_prog = NULL;
+ error = 0;
+ }
+ rw_exit(&icmp->icmp_bpf_lock);
+ return (error);
+}
+
+static boolean_t
+icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira)
+{
+ boolean_t res;
+ uchar_t *buf = mp->b_rptr;
+ uint_t wirelen, len = MBLKL(mp);
+
+ rw_enter(&icmp->icmp_bpf_lock, RW_READER);
+ if (icmp->icmp_bpf_len == 0) {
+ rw_exit(&icmp->icmp_bpf_lock);
+ return (B_FALSE);
+ }
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)buf;
+
+ wirelen = ntohs(ipha->ipha_length);
+ } else {
+ ip6_t *ip6h = (ip6_t *)buf;
+
+ wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ }
+ res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len);
+ rw_exit(&icmp->icmp_bpf_lock);
+
+ return (res);
+}
+
/*
* This routine sets socket options.
*/
@@ -2055,6 +2160,10 @@ icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
return (ENOBUFS);
}
break;
+ case SO_ATTACH_FILTER:
+ return (icmp_attach_filter(icmp, inlen, invalp));
+ case SO_DETACH_FILTER:
+ return (icmp_detach_filter(icmp));
}
break;
@@ -2600,6 +2709,14 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
/* Initialize regardless of IP version */
ipps.ipp_fields = 0;
+ /* Apply socket filter, if needed */
+ if (icmp->icmp_bpf_len != 0) {
+ if (icmp_eval_filter(icmp, mp, ira)) {
+ freemsg(mp);
+ return;
+ }
+ }
+
if (ira->ira_flags & IRAF_IS_IPV4) {
ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
ASSERT(MBLKL(mp) >= sizeof (ipha_t));
diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c
index ff0310de0c..d65d3164d3 100644
--- a/usr/src/uts/common/inet/ip/icmp_opt_data.c
+++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -41,6 +42,7 @@
#include <netinet/ip_mroute.h>
#include <inet/optcom.h>
#include <inet/rawip_impl.h>
+#include <net/bpf.h>
/*
* Table of all known options handled on a ICMP protocol stack.
@@ -86,6 +88,10 @@ opdes_t icmp_opt_arr[] = {
0 },
{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_ATTACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0,
+ sizeof (struct bpf_program), 0 },
+{ SO_DETACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, 0, 0 },
+
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
(OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index 46272b2b22..5c256729dc 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -4123,6 +4123,8 @@ ip_modclose(ill_t *ill)
rw_destroy(&ill->ill_mcast_lock);
mutex_destroy(&ill->ill_mcast_serializer);
list_destroy(&ill->ill_nce);
+ cv_destroy(&ill->ill_dlpi_capab_cv);
+ mutex_destroy(&ill->ill_dlpi_capab_lock);
/*
* Now we are done with the module close pieces that
@@ -8197,7 +8199,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
conn_t *connp = NULL;
t_uscalar_t paddrreq;
mblk_t *mp_hw;
- boolean_t success;
boolean_t ioctl_aborted = B_FALSE;
boolean_t log = B_TRUE;
@@ -8297,7 +8298,8 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
mutex_exit(&ill->ill_lock);
/*
- * Something went wrong with the bind. We presumably
+ * Something went wrong with the bind. If this was the
+ * result of a DL_NOTE_REPLUMB, then we presumably
* have an IOCTL hanging out waiting for completion.
* Find it, take down the interface that was coming
* up, and complete the IOCTL with the error noted.
@@ -8314,6 +8316,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
(void) ipif_down(ipif, NULL, NULL);
/* error is set below the switch */
+ } else {
+ /*
+ * There's no pending IOCTL, so the bind was
+ * most likely started by ill_dl_up(). We save
+ * the error and let it take care of responding
+ * to the IOCTL.
+ */
+ ill->ill_dl_bind_err = dlea->dl_unix_errno ?
+ dlea->dl_unix_errno : ENXIO;
}
break;
case DL_ENABMULTI_REQ:
@@ -8437,55 +8448,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill);
ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0);
- /*
- * Now bring up the resolver; when that is complete, we'll
- * create IREs. Note that we intentionally mirror what
- * ipif_up() would have done, because we got here by way of
- * ill_dl_up(), which stopped ipif_up()'s processing.
- */
- if (ill->ill_isv6) {
- /*
- * v6 interfaces.
- * Unlike ARP which has to do another bind
- * and attach, once we get here we are
- * done with NDP
- */
- (void) ipif_resolver_up(ipif, Res_act_initial);
- if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
- err = ipif_up_done_v6(ipif);
- } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
- /*
- * ARP and other v4 external resolvers.
- * Leave the pending mblk intact so that
- * the ioctl completes in ip_rput().
- */
- if (connp != NULL)
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0);
- mutex_exit(&ill->ill_lock);
- if (connp != NULL)
- mutex_exit(&connp->conn_lock);
- if (success) {
- err = ipif_resolver_up(ipif, Res_act_initial);
- if (err == EINPROGRESS) {
- freemsg(mp);
- return;
- }
- mp1 = ipsq_pending_mp_get(ipsq, &connp);
- } else {
- /* The conn has started closing */
- err = EINTR;
- }
- } else {
- /*
- * This one is complete. Reply to pending ioctl.
- */
- (void) ipif_resolver_up(ipif, Res_act_initial);
- err = ipif_up_done(ipif);
- }
-
- if ((err == 0) && (ill->ill_up_ipifs)) {
+ if (ill->ill_up_ipifs) {
err = ill_up_ipifs(ill, q, mp1);
if (err == EINPROGRESS) {
freemsg(mp);
@@ -8493,25 +8456,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
}
- /*
- * If we have a moved ipif to bring up, and everything has
- * succeeded to this point, bring it up on the IPMP ill.
- * Otherwise, leave it down -- the admin can try to bring it
- * up by hand if need be.
- */
- if (ill->ill_move_ipif != NULL) {
- if (err != 0) {
- ill->ill_move_ipif = NULL;
- } else {
- ipif = ill->ill_move_ipif;
- ill->ill_move_ipif = NULL;
- err = ipif_up(ipif, q, mp1);
- if (err == EINPROGRESS) {
- freemsg(mp);
- return;
- }
- }
- }
break;
case DL_NOTIFY_IND: {
@@ -9635,12 +9579,18 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level, boolean_t legacy_req)
if ((mpctl = udp_snmp_get(q, mpctl, legacy_req)) == NULL) {
return (1);
}
+ if (level == MIB2_UDP) {
+ goto done;
+ }
}
if (level != MIB2_UDP) {
if ((mpctl = tcp_snmp_get(q, mpctl, legacy_req)) == NULL) {
return (1);
}
+ if (level == MIB2_TCP) {
+ goto done;
+ }
}
if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl,
@@ -9717,6 +9667,7 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level, boolean_t legacy_req)
if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) {
return (1);
}
+done:
freemsg(mpctl);
return (1);
}
@@ -12573,6 +12524,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
ip_ioctl_cmd_t *ipip = arg;
ip_extract_func_t *extract_funcp;
+ ill_t *ill;
cmd_info_t ci;
int err;
boolean_t entered_ipsq = B_FALSE;
@@ -12693,6 +12645,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
/*
+ * We need to cache the ill_t that we're going to use as the argument
+ * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be
+ * blown away by calling ipi_func.
+ */
+ ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill;
+
+ /*
* A return value of EINPROGRESS means the ioctl is
* either queued and waiting for some reason or has
* already completed.
@@ -12700,9 +12659,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
- int, ipip->ipi_cmd,
- ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
- ipif_t *, ci.ci_ipif);
+ int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif);
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
if (entered_ipsq)
diff --git a/usr/src/uts/common/inet/ip/ip6_input.c b/usr/src/uts/common/inet/ip/ip6_input.c
index c7c241f944..96cc281da5 100644
--- a/usr/src/uts/common/inet/ip/ip6_input.c
+++ b/usr/src/uts/common/inet/ip/ip6_input.c
@@ -23,6 +23,7 @@
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -143,11 +144,9 @@ static void ip_input_multicast_v6(ire_t *, mblk_t *, ip6_t *,
* The ill will always be valid if this function is called directly from
* the driver.
*
- * If ip_input_v6() is called from GLDv3:
- *
- * - This must be a non-VLAN IP stream.
- * - 'mp' is either an untagged or a special priority-tagged packet.
- * - Any VLAN tag that was in the MAC header has been stripped.
+ * If this chain is part of a VLAN stream, then the VLAN tag is
+ * stripped from the MAC header before the packet is delivered to
+ * this function.
*
* If the IP header in packet is not 32-bit aligned, every message in the
* chain will be aligned before further operations. This is required on SPARC
@@ -1892,6 +1891,16 @@ ip_input_cksum_v6(iaflags_t iraflags, mblk_t *mp, ip6_t *ip6h,
return (B_TRUE);
}
+ hck_flags = DB_CKSUMFLAGS(mp);
+
+ if (hck_flags & HW_LOCAL_MAC) {
+ /*
+ * The packet is from a same-machine sender in which
+ * case we assume data integrity.
+ */
+ return (B_TRUE);
+ }
+
/*
* Revert to software checksum calculation if the interface
* isn't capable of checksum offload.
@@ -1908,9 +1917,6 @@ ip_input_cksum_v6(iaflags_t iraflags, mblk_t *mp, ip6_t *ip6h,
* We apply this for all ULP protocols. Does the HW know to
* not set the flags for SCTP and other protocols.
*/
-
- hck_flags = DB_CKSUMFLAGS(mp);
-
if (hck_flags & HCK_FULLCKSUM_OK) {
/*
* Hardware has already verified the checksum.
diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c
index b023a2fe6a..dc074454e3 100644
--- a/usr/src/uts/common/inet/ip/ip6_output.c
+++ b/usr/src/uts/common/inet/ip/ip6_output.c
@@ -23,6 +23,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -866,8 +867,16 @@ ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h,
ixa->ixa_raw_cksum_offset);
cksum = htons(protocol);
} else if (protocol == IPPROTO_ICMPV6) {
- cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
- cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */
+ /*
+ * Currently we assume no HW support for ICMP checksum calc.
+ *
+ * When HW support is advertised for ICMP, we'll want the
+ * following to be set:
+ * cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
+ * cksum = IP_ICMPV6_CSUM_COMP; Pseudo-header cksum
+ */
+
+ return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
} else {
ip_hdr_cksum:
/* No IP header checksum for IPv6 */
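
Background for the software fallback above: the ICMPv6 checksum is computed over an IPv6 pseudo-header (source, destination, 32-bit upper-layer length, next header = 58) followed by the message itself, per RFC 8200. ip_output_sw_cksum_v6() does this over mblk chains; a flat-buffer sketch of the same arithmetic (helper names here are illustrative, not from the source):

    #include <stdint.h>
    #include <stddef.h>
    #include <netinet/in.h>

    /* One's-complement accumulate; assumes 16-bit aligned input. */
    static uint32_t
    csum_add(uint32_t sum, const void *data, size_t len)
    {
        const uint16_t *p = data;

        while (len > 1) {
            sum += *p++;
            len -= 2;
        }
        if (len == 1)   /* trailing byte, padded with zero */
            sum += htons((uint16_t)(*(const uint8_t *)p << 8));
        return (sum);
    }

    /*
     * ICMPv6 checksum: pseudo-header (src, dst, upper-layer length,
     * next header = 58) plus the message, folded to 16 bits. The
     * checksum field inside 'msg' must be zero when this runs.
     */
    static uint16_t
    icmp6_cksum(const struct in6_addr *src, const struct in6_addr *dst,
        const void *msg, uint32_t len)
    {
        uint32_t plen = htonl(len);
        uint32_t nxt = htonl(IPPROTO_ICMPV6);
        uint32_t sum = 0;

        sum = csum_add(sum, src, sizeof (*src));
        sum = csum_add(sum, dst, sizeof (*dst));
        sum = csum_add(sum, &plen, sizeof (plen));
        sum = csum_add(sum, &nxt, sizeof (nxt));
        sum = csum_add(sum, msg, len);
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return ((uint16_t)~sum);
    }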
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index 31789fb8de..11a9024053 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -174,7 +174,7 @@ static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen,
static int ill_alloc_ppa(ill_if_t *, ill_t *);
static void ill_delete_interface_type(ill_if_t *);
-static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
+static int ill_dl_up(ill_t *ill, ipif_t *ipif);
static void ill_dl_down(ill_t *ill);
static void ill_down(ill_t *ill);
static void ill_down_ipifs(ill_t *, boolean_t);
@@ -1380,6 +1380,35 @@ ill_capability_probe(ill_t *ill)
ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
+static boolean_t
+ill_capability_wait(ill_t *ill)
+{
+ /*
+ * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can
+ * only be set by someone who is the writer. Since we
+ * drop-and-reacquire the squeue in this loop, we need to check for
+ * ILL_CONDEMNED, which if set means nothing can signal our capability
+ * condition variable.
+ */
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ while (ill->ill_capab_pending_cnt != 0 &&
+ (ill->ill_state_flags & ILL_CONDEMNED) == 0) {
+ mutex_enter(&ill->ill_dlpi_capab_lock);
+ ipsq_exit(ill->ill_phyint->phyint_ipsq);
+ cv_wait(&ill->ill_dlpi_capab_cv, &ill->ill_dlpi_capab_lock);
+ mutex_exit(&ill->ill_dlpi_capab_lock);
+ /*
+ * If ipsq_enter() fails, someone set ILL_CONDEMNED
+ * while we dropped the squeue. Indicate such to the caller.
+ */
+ if (!ipsq_enter(ill, B_FALSE, CUR_OP))
+ return (B_FALSE);
+ }
+
+ return ((ill->ill_state_flags & ILL_CONDEMNED) == 0);
+}
+
void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
@@ -1390,6 +1419,8 @@ ill_capability_reset(ill_t *ill, boolean_t reneg)
ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;
+ ASSERT(ill->ill_capab_reset_mp != NULL);
+
ill_capability_send(ill, ill->ill_capab_reset_mp);
ill->ill_capab_reset_mp = NULL;
/*
@@ -2108,6 +2139,49 @@ ill_capability_lso_enable(ill_t *ill)
}
}
+/*
+ * Check whether or not mac will prevent us from sending with a given IP
+ * address. This requires having the IPCHECK capability, which we should
+ * always be able to successfully negotiate, but if it's somehow missing
+ * then we just permit the caller to use the address, since mac does the
+ * actual enforcement and ip is just performing a courtesy check to help
+ * prevent users from unwittingly setting and attempting to use blocked
+ * addresses.
+ */
+static boolean_t
+ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr)
+{
+ if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0)
+ return (B_TRUE);
+
+ ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck;
+ ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df;
+ return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr));
+}
+
+static void
+ill_capability_ipcheck_enable(ill_t *ill)
+{
+ ill_dld_capab_t *idc = ill->ill_dld_capab;
+ ill_dld_ipcheck_t *idi = &idc->idc_ipcheck;
+ dld_capab_ipcheck_t spoof;
+ int rc;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ bzero(&spoof, sizeof (spoof));
+ if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
+ &spoof, DLD_ENABLE)) == 0) {
+ idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df;
+ idi->idi_allowed_dh = spoof.ipc_allowed_dh;
+ ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK;
+ } else {
+ cmn_err(CE_WARN, "warning: could not enable IPCHECK "
+ "capability, rc = %d\n", rc);
+ DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc);
+ }
+}
+
static void
ill_capability_dld_enable(ill_t *ill)
{
@@ -2115,15 +2189,15 @@ ill_capability_dld_enable(ill_t *ill)
ASSERT(IAM_WRITER_ILL(ill));
- if (ill->ill_isv6)
- return;
-
ill_mac_perim_enter(ill, &mph);
if (!ill->ill_isv6) {
ill_capability_direct_enable(ill);
ill_capability_poll_enable(ill);
ill_capability_lso_enable(ill);
}
+
+ ill_capability_ipcheck_enable(ill);
+
ill->ill_capabilities |= ILL_CAPAB_DLD;
ill_mac_perim_exit(ill, mph);
}
@@ -2188,6 +2262,15 @@ ill_capability_dld_disable(ill_t *ill)
NULL, DLD_DISABLE);
}
+ if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) {
+ ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL);
+ ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL);
+
+ ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK;
+ (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
+ NULL, DLD_DISABLE);
+ }
+
ill->ill_capabilities &= ~ILL_CAPAB_DLD;
ill_mac_perim_exit(ill, mph);
}
@@ -3430,6 +3513,9 @@ ill_init_common(ill_t *ill, queue_t *q, boolean_t isv6, boolean_t is_loopback,
ill->ill_max_buf = ND_MAX_Q;
ill->ill_refcnt = 0;
+ cv_init(&ill->ill_dlpi_capab_cv, NULL, NULL, NULL);
+ mutex_init(&ill->ill_dlpi_capab_lock, NULL, MUTEX_DEFAULT, NULL);
+
return (0);
}
@@ -9677,7 +9763,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
in6_addr_t v6addr;
boolean_t need_up = B_FALSE;
ill_t *ill;
- int i;
ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
@@ -9752,20 +9837,9 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
}
- /*
- * verify that the address being configured is permitted by the
- * ill_allowed_ips[] for the interface.
- */
- if (ill->ill_allowed_ips_cnt > 0) {
- for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
- if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i],
- &v6addr))
- break;
- }
- if (i == ill->ill_allowed_ips_cnt) {
- pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr);
- return (EPERM);
- }
+ /* verify that the address being configured is permitted by mac */
+ if (!ill_ipcheck_addr(ill, &v6addr)) {
+ return (EPERM);
}
/*
* Even if there is no change we redo things just to rerun
@@ -12705,6 +12779,12 @@ ill_dl_down(ill_t *ill)
}
ill->ill_unbind_mp = NULL;
+
+ mutex_enter(&ill->ill_lock);
+ ill->ill_dl_up = 0;
+ ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
+ mutex_exit(&ill->ill_lock);
+
if (mp != NULL) {
ip1dbg(("ill_dl_down: %s (%u) for %s\n",
dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
@@ -12727,11 +12807,13 @@ ill_dl_down(ill_t *ill)
ill_capability_dld_disable(ill);
ill_capability_reset(ill, B_FALSE);
ill_dlpi_send(ill, mp);
+
+ /*
+ * Wait for the capability reset to finish.
+ * In this case, it doesn't matter WHY or HOW it finished.
+ */
+ (void) ill_capability_wait(ill);
}
- mutex_enter(&ill->ill_lock);
- ill->ill_dl_up = 0;
- ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
- mutex_exit(&ill->ill_lock);
}
void
@@ -12860,6 +12942,10 @@ ill_capability_done(ill_t *ill)
if (ill->ill_capab_pending_cnt == 0 &&
ill->ill_dlpi_capab_state == IDCS_OK)
ill_capability_reset_alloc(ill);
+
+ mutex_enter(&ill->ill_dlpi_capab_lock);
+ cv_broadcast(&ill->ill_dlpi_capab_cv);
+ mutex_exit(&ill->ill_dlpi_capab_lock);
}
/*
@@ -14481,7 +14567,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
* address/netmask etc cause a down/up dance, but
* does not cause an unbind (DL_UNBIND) with the driver
*/
- return (ill_dl_up(ill, ipif, mp, q));
+ if ((err = ill_dl_up(ill, ipif)) != 0) {
+ return (err);
+ }
+ }
+
+ /* Reject bringing up interfaces with unusable IP addresses */
+ if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) {
+ return (EPERM);
}
/*
@@ -14594,24 +14687,22 @@ ill_delete_ires(ill_t *ill)
/*
* Perform a bind for the physical device.
- * When the routine returns EINPROGRESS then mp has been consumed and
- * the ioctl will be acked from ip_rput_dlpi.
- * Allocate an unbind message and save it until ipif_down.
+ *
+ * When the routine returns successfully, DLPI has been bound and
+ * capabilities negotiated. An unbind message will have been allocated
+ * for later use in ipif_down.
*/
static int
-ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
+ill_dl_up(ill_t *ill, ipif_t *ipif)
{
mblk_t *bind_mp = NULL;
mblk_t *unbind_mp = NULL;
- conn_t *connp;
- boolean_t success;
int err;
DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);
ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(mp != NULL);
/*
* Make sure we have an IRE_MULTICAST in case we immediately
@@ -14646,19 +14737,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
if (unbind_mp == NULL)
goto bad;
}
- /*
- * Record state needed to complete this operation when the
- * DL_BIND_ACK shows up. Also remember the pre-allocated mblks.
- */
- connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
- ASSERT(connp != NULL || !CONN_Q(q));
- GRAB_CONN_LOCK(q);
- mutex_enter(&ipif->ipif_ill->ill_lock);
- success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
- mutex_exit(&ipif->ipif_ill->ill_lock);
- RELEASE_CONN_LOCK(q);
- if (!success)
- goto bad;
/*
* Save the unbind message for ill_dl_down(); it will be consumed when
@@ -14670,6 +14748,18 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
ill_dlpi_send(ill, bind_mp);
/* Send down link-layer capabilities probe if not already done. */
ill_capability_probe(ill);
+ /*
+ * Wait for DLPI to be bound and the capability probe to finish.
+ * The call drops and reacquires the squeue. If re-entry fails
+ * because ILL_CONDEMNED was set, bail.
+ */
+ if (!ill_capability_wait(ill))
+ return (ENXIO);
+
+ /* DLPI failed to bind. Return the saved error */
+ if (!ill->ill_dl_up) {
+ return (ill->ill_dl_bind_err);
+ }
/*
* Sysid used to rely on the fact that netboots set domainname
@@ -14687,11 +14777,7 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
cmn_err(CE_WARN, "no cached dhcp response");
}
- /*
- * This operation will complete in ip_rput_dlpi with either
- * a DL_BIND_ACK or DL_ERROR_ACK.
- */
- return (EINPROGRESS);
+ return (0);
bad:
ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c
index 6aa70b014a..e2e7dca22c 100644
--- a/usr/src/uts/common/inet/ip/ip_input.c
+++ b/usr/src/uts/common/inet/ip/ip_input.c
@@ -23,6 +23,7 @@
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -56,6 +57,7 @@
#include <sys/vtrace.h>
#include <sys/isa_defs.h>
#include <sys/mac.h>
+#include <sys/mac_client.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
@@ -146,11 +148,9 @@ static void ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *,
* The ill will always be valid if this function is called directly from
* the driver.
*
- * If ip_input() is called from GLDv3:
- *
- * - This must be a non-VLAN IP stream.
- * - 'mp' is either an untagged or a special priority-tagged packet.
- * - Any VLAN tag that was in the MAC header has been stripped.
+ * If this chain is part of a VLAN stream, then the VLAN tag is
+ * stripped from the MAC header before the packet is delivered to
+ * this function.
*
* If the IP header in packet is not 32-bit aligned, every message in the
* chain will be aligned before further operations. This is required on SPARC
@@ -660,11 +660,13 @@ ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
}
/*
- * If there is a good HW IP header checksum we clear the need
+ * If the packet originated from a same-machine sender or
+ * there is a good HW IP header checksum, we clear the need
* look at the IP header checksum.
*/
- if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
- ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
+ if ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) ||
+ ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
+ ILL_HCKSUM_CAPABLE(ill) && dohwcksum)) {
/* Header checksum was ok. Clear the flag */
DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
@@ -2241,6 +2243,17 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
/* No ULP checksum to verify. */
return (B_TRUE);
}
+
+ hck_flags = DB_CKSUMFLAGS(mp);
+
+ if (hck_flags & HW_LOCAL_MAC) {
+ /*
+ * The packet is from a same-machine sender in which
+ * case we assume data integrity.
+ */
+ return (B_TRUE);
+ }
+
/*
* Revert to software checksum calculation if the interface
* isn't capable of checksum offload.
@@ -2257,9 +2270,6 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
* We apply this for all ULP protocols. Does the HW know to
* not set the flags for SCTP and other protocols.
*/
-
- hck_flags = DB_CKSUMFLAGS(mp);
-
if (hck_flags & HCK_FULLCKSUM_OK) {
/*
* Hardware has already verified the checksum.
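
Both receive paths now make the same four-way decision; hoisting the HW_LOCAL_MAC test means locally-originated traffic never burns cycles on verification. Summarized as a sketch (sw_cksum_verify() is a hypothetical stand-in for the software path):

    static boolean_t
    ulp_cksum_ok(mblk_t *mp, ill_t *ill, boolean_t dohwcksum)
    {
        uint16_t hck_flags = DB_CKSUMFLAGS(mp);

        if (hck_flags & HW_LOCAL_MAC)
            return (B_TRUE);    /* same-machine sender: trust it */
        if (!ILL_HCKSUM_CAPABLE(ill) || !dohwcksum)
            return (sw_cksum_verify(mp));   /* no usable offload */
        if (hck_flags & HCK_FULLCKSUM_OK)
            return (B_TRUE);    /* hardware already verified */
        return (sw_cksum_verify(mp));   /* partial/none: verify in sw */
    }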
diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c
index ea69412933..169859707e 100644
--- a/usr/src/uts/common/inet/ip/ip_output.c
+++ b/usr/src/uts/common/inet/ip/ip_output.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -1737,6 +1738,13 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
#endif
sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
goto ip_hdr_cksum;
+ } else if (protocol == IPPROTO_ICMP) {
+ /*
+ * Note that we always calculate a SW checksum for ICMP. In the
+ * future, if HW support for ICMP is advertised, we can change
+ * this.
+ */
+ return (ip_output_sw_cksum_v4(mp, ipha, ixa));
} else {
ip_hdr_cksum:
/* Calculate IPv4 header checksum */
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index ee7c7b0f1d..b6565d9c1f 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
/*
@@ -101,10 +102,6 @@
*
* ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
* /dev/ip.
- *
- * ip_squeue_worker_wait: global value for the sq_wait field for all squeues *
- * created. This is the time squeue code waits before waking up the worker
- * thread after queuing a request.
*/
#include <sys/types.h>
@@ -142,13 +139,6 @@ kmutex_t sqset_lock;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
-/*
- * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
- * created. This is the time squeue code waits before waking up the worker
- * thread after queuing a request.
- */
-uint_t ip_squeue_worker_wait = 10;
-
static squeue_t *ip_squeue_create(pri_t);
static squeue_set_t *ip_squeue_set_create(processorid_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
@@ -163,7 +153,7 @@ ip_squeue_create(pri_t pri)
{
squeue_t *sqp;
- sqp = squeue_create(ip_squeue_worker_wait, pri);
+ sqp = squeue_create(pri, B_TRUE);
ASSERT(sqp != NULL);
if (ip_squeue_create_callback != NULL)
ip_squeue_create_callback(sqp);
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index bc2173ff24..a59027801f 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/*
@@ -868,67 +869,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
mutex_exit(&(connfp)->connf_lock); \
}
-#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \
- conn_t *pconnp = NULL, *nconnp; \
- IPCL_HASH_REMOVE((connp)); \
- mutex_enter(&(connfp)->connf_lock); \
- nconnp = (connfp)->connf_head; \
- while (nconnp != NULL && \
- !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \
- pconnp = nconnp; \
- nconnp = nconnp->conn_next; \
- } \
- if (pconnp != NULL) { \
- pconnp->conn_next = (connp); \
- (connp)->conn_prev = pconnp; \
- } else { \
- (connfp)->connf_head = (connp); \
- } \
- if (nconnp != NULL) { \
- (connp)->conn_next = nconnp; \
- nconnp->conn_prev = (connp); \
- } \
- (connp)->conn_fanout = (connfp); \
- (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
- IPCL_BOUND; \
- CONN_INC_REF(connp); \
- mutex_exit(&(connfp)->connf_lock); \
-}
+/*
+ * When inserting bound or wildcard entries into the hash, ordering rules are
+ * used to facilitate timely and correct lookups. The order is as follows:
+ * 1. Entries bound to a specific address
+ * 2. Entries bound to INADDR_ANY
+ * 3. Entries bound to ADDR_UNSPECIFIED
+ * Entries in a category which share conn_lport (such as those using
+ * SO_REUSEPORT) will be ordered such that the newest inserted is first.
+ */
-#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \
- conn_t **list, *prev, *next; \
- boolean_t isv4mapped = \
- IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \
- IPCL_HASH_REMOVE((connp)); \
- mutex_enter(&(connfp)->connf_lock); \
- list = &(connfp)->connf_head; \
- prev = NULL; \
- while ((next = *list) != NULL) { \
- if (isv4mapped && \
- IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \
- connp->conn_zoneid == next->conn_zoneid) { \
- (connp)->conn_next = next; \
- if (prev != NULL) \
- prev = next->conn_prev; \
- next->conn_prev = (connp); \
- break; \
- } \
- list = &next->conn_next; \
- prev = next; \
- } \
- (connp)->conn_prev = prev; \
- *list = (connp); \
- (connp)->conn_fanout = (connfp); \
- (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
- IPCL_BOUND; \
- CONN_INC_REF((connp)); \
- mutex_exit(&(connfp)->connf_lock); \
+void
+ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp)
+{
+ conn_t *pconnp, *nconnp;
+
+ IPCL_HASH_REMOVE(connp);
+ mutex_enter(&connfp->connf_lock);
+ nconnp = connfp->connf_head;
+ pconnp = NULL;
+ while (nconnp != NULL) {
+ /*
+ * Walk through entries associated with the fanout until one is
+ * found which fulfills any of these conditions:
+ * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED
+ * 2. Listen port the same as connp
+ */
+ if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) ||
+ connp->conn_lport == nconnp->conn_lport)
+ break;
+ pconnp = nconnp;
+ nconnp = nconnp->conn_next;
+ }
+ if (pconnp != NULL) {
+ pconnp->conn_next = connp;
+ connp->conn_prev = pconnp;
+ } else {
+ connfp->connf_head = connp;
+ }
+ if (nconnp != NULL) {
+ connp->conn_next = nconnp;
+ nconnp->conn_prev = connp;
+ }
+ connp->conn_fanout = connfp;
+ connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
}
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ conn_t **list, *prev, *next;
+ conn_t *pconnp = NULL, *nconnp;
+ boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6);
+
+ IPCL_HASH_REMOVE(connp);
+ mutex_enter(&connfp->connf_lock);
+ nconnp = connfp->connf_head;
+ pconnp = NULL;
+ while (nconnp != NULL) {
+ if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) &&
+ isv4mapped && connp->conn_lport == nconnp->conn_lport)
+ break;
+ if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) &&
+ (isv4mapped ||
+ connp->conn_lport == nconnp->conn_lport))
+ break;
+
+ pconnp = nconnp;
+ nconnp = nconnp->conn_next;
+ }
+ if (pconnp != NULL) {
+ pconnp->conn_next = connp;
+ connp->conn_prev = pconnp;
+ } else {
+ connfp->connf_head = connp;
+ }
+ if (nconnp != NULL) {
+ connp->conn_next = nconnp;
+ nconnp->conn_prev = connp;
+ }
+ connp->conn_fanout = connfp;
+ connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
}
/*
@@ -1034,9 +1059,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
} else {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
}
} else {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
@@ -1205,9 +1230,9 @@ ipcl_bind_insert_v4(conn_t *connp)
if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (protocol == IPPROTO_RSVP)
ill_set_inputfn_all(ipst);
@@ -1219,9 +1244,9 @@ ipcl_bind_insert_v4(conn_t *connp)
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (cl_inet_listen != NULL) {
ASSERT(connp->conn_ipversion == IPV4_VERSION);
@@ -1271,9 +1296,9 @@ ipcl_bind_insert_v6(conn_t *connp)
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
@@ -1283,9 +1308,9 @@ ipcl_bind_insert_v6(conn_t *connp)
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (cl_inet_listen != NULL) {
sa_family_t addr_family;
@@ -1416,9 +1441,9 @@ ipcl_conn_insert_v4(conn_t *connp)
if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
}
@@ -1504,9 +1529,9 @@ ipcl_conn_insert_v6(conn_t *connp)
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
}
@@ -2092,6 +2117,7 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
connp->conn_flags = IPCL_RAWIPCONN;
connp->conn_proto = IPPROTO_ICMP;
icmp->icmp_connp = connp;
+ rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL);
rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
if (connp->conn_ixa == NULL)
@@ -2116,6 +2142,7 @@ rawip_conn_destructor(void *buf, void *cdrarg)
mutex_destroy(&connp->conn_lock);
cv_destroy(&connp->conn_cv);
rw_destroy(&connp->conn_ilg_lock);
+ rw_destroy(&icmp->icmp_bpf_lock);
/* Can be NULL if constructor failed */
if (connp->conn_ixa != NULL) {
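
The motivation for newest-first ordering among same-port entries is SO_REUSEPORT (note the new conn_reuseport bit in ipclassifier.h below). From userland the option behaves roughly as in this sketch, assuming SO_REUSEPORT is exposed via <sys/socket.h>:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <string.h>

    /*
     * Bind a UDP socket with SO_REUSEPORT set; calling this twice
     * with the same port yields two sockets sharing that port, and
     * classification walks the fanout in the order built above.
     */
    static int
    bind_reuseport(in_port_t port)
    {
        struct sockaddr_in sin;
        int on = 1;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd == -1)
            return (-1);
        if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on,
            sizeof (on)) != 0)
            return (-1);
        memset(&sin, 0, sizeof (sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(INADDR_ANY);
        sin.sin_port = htons(port);
        if (bind(fd, (struct sockaddr *)&sin, sizeof (sin)) != 0)
            return (-1);
        return (fd);
    }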
diff --git a/usr/src/uts/common/inet/ip/sadb.c b/usr/src/uts/common/inet/ip/sadb.c
index 40d5078526..44ebb21db3 100644
--- a/usr/src/uts/common/inet/ip/sadb.c
+++ b/usr/src/uts/common/inet/ip/sadb.c
@@ -3767,7 +3767,8 @@ sadb_expire_assoc(queue_t *pfkey_q, ipsa_t *assoc)
}
alloclen = sizeof (*samsg) + sizeof (*current) + sizeof (*expire) +
- 2 * sizeof (sadb_address_t) + sizeof (*saext);
+ 2 * sizeof (sadb_address_t) + sizeof (*saext) +
+ sizeof (sadb_x_kmc_t);
af = assoc->ipsa_addrfam;
switch (af) {
@@ -3896,6 +3897,10 @@ sadb_expire_assoc(queue_t *pfkey_q, ipsa_t *assoc)
ASSERT(mp->b_wptr != NULL);
}
+ mp->b_wptr = sadb_make_kmc_ext(mp->b_wptr, end, assoc->ipsa_kmp,
+ assoc->ipsa_kmc);
+ ASSERT(mp->b_wptr != NULL);
+
/* Can just putnext, we're ready to go! */
putnext(pfkey_q, mp1);
}
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index 2b37528eb9..fc90e6f217 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _INET_IP_IMPL_H
@@ -159,9 +160,27 @@ extern "C" {
#define ILL_DIRECT_CAPABLE(ill) \
(((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0)
-/* This macro is used by the mac layer */
+/*
+ * Determine if a mblk needs to take the "slow path", aka OTH
+ * softring. There are multiple reasons why a mblk might take the slow
+ * path.
+ *
+ * o The mblk is not a data message.
+ *
+ * o There is more than one outstanding reference to the mblk and it
+ * does not originate from a local MAC client. If the mblk does
+ * originate from a local MAC then allow it to pass through with
+ * more than one reference and leave the copying up to the consumer.
+ *
+ * o The IP header is not aligned (we assume alignment in the checksum
+ * routine).
+ *
+ * o The mblk doesn't contain enough data to populate a simple IP header.
+ */
#define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \
- (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \
+ (DB_TYPE(mp) != M_DATA || \
+ (DB_REF(mp) != 1 && ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) == 0)) || \
+ !OK_32PTR(ipha) || \
(((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr))
/*
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index f6466434f6..c3139d9288 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _INET_IPCLASSIFIER_H
@@ -293,7 +294,8 @@ struct conn_s {
conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */
conn_mcbc_bind : 1, /* Bound to multi/broadcast */
- conn_pad_to_bit_31 : 12;
+ conn_reuseport : 1, /* SO_REUSEPORT state */
+ conn_pad_to_bit_31 : 11;
boolean_t conn_blocked; /* conn is flow-controlled */
diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c
index d1c5dfdb9b..25e0b699c5 100644
--- a/usr/src/uts/common/inet/ipd/ipd.c
+++ b/usr/src/uts/common/inet/ipd/ipd.c
@@ -9,7 +9,7 @@
* http://www.illumos.org/license/CDDL.
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc. All rights reserved.
*/
/*
@@ -222,7 +222,7 @@ typedef struct ipd_netstack {
net_handle_t ipdn_v6hdl; /* IPv4 net handle */
int ipdn_hooked; /* are hooks registered */
hook_t *ipdn_v4in; /* IPv4 traffic in hook */
- hook_t *ipdn_v4out; /* IPv4 traffice out hook */
+ hook_t *ipdn_v4out; /* IPv4 traffic out hook */
hook_t *ipdn_v6in; /* IPv6 traffic in hook */
hook_t *ipdn_v6out; /* IPv6 traffic out hook */
int ipdn_enabled; /* which perturbs are on */
@@ -613,7 +613,7 @@ ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay)
/*
* If ipd_check_hooks_failed, that must mean that we failed to set up
* the hooks, so we are going to effectively zero out and fail the
- * request to enable corruption.
+ * request to enable packet delays.
*/
if (rval != 0)
ins->ipdn_delay = 0;
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index f958ca2261..4cb67a2dab 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -5,7 +5,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#if !defined(lint)
@@ -22,11 +22,13 @@ static const char rcsid[] = "@(#)$Id: ip_fil_solaris.c,v 2.62.2.19 2005/07/13 21
#include <sys/filio.h>
#include <sys/systm.h>
#include <sys/strsubr.h>
+#include <sys/strsun.h>
#include <sys/cred.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
+#include <sys/mac_provider.h>
#include <sys/mkdev.h>
#include <sys/protosw.h>
#include <sys/socket.h>
@@ -83,9 +85,27 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t,
static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t,
void *));
static int ipf_hook6 __P((hook_data_t, int, int, void *));
+static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t,
+ void *));
+
+static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *));
+static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t,
+ void *));
+
extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *));
extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *));
+static int ipf_hook_protocol_notify __P((hook_notify_cmd_t, void *,
+ const char *, const char *, const char *));
+static int ipf_hook_instance_notify __P((hook_notify_cmd_t, void *,
+ const char *, const char *, const char *));
+
#if SOLARIS2 < 10
#if SOLARIS2 >= 7
u_int *ip_ttl_ptr = NULL;
@@ -152,6 +172,22 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz";
char *hook6_loop_out = "ipfilter_hook6_loop_out";
char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz";
+/* vnd IPv4/v6 hook names */
+char *hook4_vnd_in = "ipfilter_hookvndl3v4_in";
+char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz";
+char *hook6_vnd_in = "ipfilter_hookvndl3v6_in";
+char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz";
+char *hook4_vnd_out = "ipfilter_hookvndl3v4_out";
+char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz";
+char *hook6_vnd_out = "ipfilter_hookvndl3v6_out";
+char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz";
+
+/* viona hook names */
+char *hook_viona_in = "ipfilter_hookviona_in";
+char *hook_viona_in_gz = "ipfilter_hookviona_in_gz";
+char *hook_viona_out = "ipfilter_hookviona_out";
+char *hook_viona_out_gz = "ipfilter_hookviona_out_gz";
+
/* ------------------------------------------------------------------------ */
/* Function: ipldetach */
/* Returns: int - 0 == success, else error. */
@@ -248,8 +284,65 @@ ipf_stack_t *ifs;
ifs->ifs_ipf_ipv4 = NULL;
}
+ /*
+ * Remove VND hooks
+ */
+ if (ifs->ifs_ipf_vndl3v4 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in);
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v4 = NULL;
+ }
+
+ if (ifs->ifs_ipf_vndl3v6 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in);
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v6 = NULL;
+ }
+
+ /*
+ * Remove notification of viona hooks
+ */
+ net_instance_notify_unregister(ifs->ifs_netid,
+ ipf_hook_instance_notify);
+
#undef UNDO_HOOK
+ /*
+ * Normally, viona will unregister itself before ipldetach() is called,
+ * so these will be no-ops, but out of caution, we try to make sure
+ * we've removed any of our references.
+ */
+ (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL,
+ NH_PHYSICAL_IN);
+ (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL,
+ NH_PHYSICAL_OUT);
+
+ {
+ char netidstr[12]; /* Large enough for INT_MAX + NUL */
+ (void) snprintf(netidstr, sizeof (netidstr), "%d",
+ ifs->ifs_netid);
+
+ /*
+ * The notify callbacks expect the netid value passed as a
+ * string in the third argument. To prevent confusion if
+ * traced, we pass the same value the nethook framework would
+ * pass, even though the callback does not currently use the
+ * value.
+ */
+ (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr,
+ NULL, Hn_VIONA);
+ }
+
#ifdef IPFDEBUG
cmn_err(CE_CONT, "ipldetach()\n");
#endif
@@ -445,6 +538,64 @@ ipf_stack_t *ifs;
}
/*
+ * Add VND INET hooks
+ */
+ ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET);
+ if (ifs->ifs_ipf_vndl3v4 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in,
+ hook4_vnd_in, hook4_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out,
+ hook4_vnd_out, hook4_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_out)
+ goto hookup_failed;
+
+
+ /*
+ * VND INET6 hooks
+ */
+ ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6);
+ if (ifs->ifs_ipf_vndl3v6 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in,
+ hook6_vnd_in, hook6_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out,
+ hook6_vnd_out, hook6_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_out)
+ goto hookup_failed;
+
+ /*
+ * VIONA INET hooks. While the nethook framework allows us to register
+ * hooks for events that haven't been registered yet, we instead
+ * register and unregister our hooks in response to notifications
+ * about the viona hooks from the nethook framework. This prevents
+ * problems when the viona module gets unloaded while the ipf module
+ * does not. If we do not unregister our hooks after the viona module
+ * is unloaded, the viona module cannot later re-register them if it
+ * gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded
+ * even on DEBUG kernels, they do not experience this issue.
+ */
+ if (net_instance_notify_register(id, ipf_hook_instance_notify,
+ ifs) != 0)
+ goto hookup_failed;
+
+ /*
* Reacquire ipf_global, now it is safe.
*/
WRITE_ENTER(&ifs->ifs_ipf_global);
@@ -507,6 +658,155 @@ hookup_failed:
return -1;
}
+/* ------------------------------------------------------------------------ */
+/*
+ * Called whenever a nethook protocol is registered or unregistered. Currently
+ * only used to add or remove the hooks for viona.
+ *
+ * While the function signature requires returning int, nothing
+ * in usr/src/uts/common/io/hook.c that invokes the callbacks
+ * captures the return value (nor is there currently any documentation
+ * on what return values should be). For now at least, we'll return 0
+ * on success (or 'not applicable') or an error value. Even if the
+ * nethook framework doesn't use the return value, it can be observed via
+ * dtrace if needed.
+ */
+static int
+ipf_hook_protocol_notify(hook_notify_cmd_t command, void *arg,
+ const char *name, const char *dummy __unused, const char *he_name)
+{
+ ipf_stack_t *ifs = arg;
+ hook_t **hookpp;
+ char *hook_name, *hint_name;
+ hook_func_t hookfn;
+ boolean_t *hookedp;
+ hook_hint_t hint;
+ boolean_t out;
+ int ret = 0;
+
+ const boolean_t gz = ifs->ifs_gz_controlled;
+
+ /* We currently only care about viona hooks notifications */
+ if (strcmp(name, Hn_VIONA) != 0)
+ return (0);
+
+ if (strcmp(he_name, NH_PHYSICAL_IN) == 0) {
+ out = B_FALSE;
+ } else if (strcmp(he_name, NH_PHYSICAL_OUT) == 0) {
+ out = B_TRUE;
+ } else {
+ /*
+ * If more hook events are added to viona, the corresponding
+ * handling must be added here (even if it is just to ignore
+ * them); otherwise the firewall may not behave as intended.
+ */
+ cmn_err(CE_PANIC, "%s: unhandled hook event %s", __func__,
+ he_name);
+
+ return (0);
+ }
+
+ if (out) {
+ hookpp = &ifs->ifs_ipfhookviona_out;
+ hookfn = ipf_hookviona_out;
+ hookedp = &ifs->ifs_hookviona_physical_out;
+ name = gz ? hook_viona_out_gz : hook_viona_out;
+ hint = gz ? HH_AFTER : HH_BEFORE;
+ hint_name = gz ? hook_viona_out : hook_viona_out_gz;
+ } else {
+ hookpp = &ifs->ifs_ipfhookviona_in;
+ hookfn = ipf_hookviona_in;
+ hookedp = &ifs->ifs_hookviona_physical_in;
+ name = gz ? hook_viona_in_gz : hook_viona_in;
+ hint = gz ? HH_BEFORE : HH_AFTER;
+ hint_name = gz ? hook_viona_in : hook_viona_in_gz;
+ }
+
+ switch (command) {
+ default:
+ case HN_NONE:
+ break;
+ case HN_REGISTER:
+ HOOK_INIT(*hookpp, hookfn, (char *)name, ifs);
+ (*hookpp)->h_hint = hint;
+ (*hookpp)->h_hintvalue = (uintptr_t)hint_name;
+ ret = net_hook_register(ifs->ifs_ipf_viona,
+ (char *)he_name, *hookpp);
+ if (ret != 0) {
+ cmn_err(CE_NOTE, "%s: could not register hook "
+ "(hook family=%s hook=%s) err=%d", __func__,
+ name, he_name, ret);
+ *hookedp = B_FALSE;
+ return (ret);
+ }
+ *hookedp = B_TRUE;
+ break;
+ case HN_UNREGISTER:
+ if (ifs->ifs_ipf_viona == NULL)
+ break;
+
+ ret = *hookedp ? net_hook_unregister(ifs->ifs_ipf_viona,
+ (char *)he_name, *hookpp) : 0;
+ if ((ret == 0 || ret == ENXIO)) {
+ if (*hookpp != NULL) {
+ hook_free(*hookpp);
+ *hookpp = NULL;
+ }
+ *hookedp = B_FALSE;
+ }
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * Called whenever a new nethook instance is created. Currently only used
+ * with the Hn_VIONA nethooks. Similar to ipf_hook_protocol_notify, the out
+ * function signature must return an int, though the result is never used.
+ * We elect to return 0 on success (or not applicable) or a non-zero value
+ * on error.
+ */
+static int
+ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg,
+ const char *netid, const char *dummy __unused, const char *instance)
+{
+ ipf_stack_t *ifs = arg;
+ int ret = 0;
+
+ /* We currently only care about viona hooks */
+ if (strcmp(instance, Hn_VIONA) != 0)
+ return (0);
+
+ switch (command) {
+ case HN_NONE:
+ default:
+ return (0);
+ case HN_REGISTER:
+ ifs->ifs_ipf_viona = net_protocol_lookup(ifs->ifs_netid,
+ NHF_VIONA);
+
+ if (ifs->ifs_ipf_viona == NULL)
+ return (EPROTONOSUPPORT);
+
+ ret = net_protocol_notify_register(ifs->ifs_ipf_viona,
+ ipf_hook_protocol_notify, ifs);
+ VERIFY(ret == 0 || ret == ESHUTDOWN);
+ break;
+ case HN_UNREGISTER:
+ if (ifs->ifs_ipf_viona == NULL)
+ break;
+ VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona,
+ ipf_hook_protocol_notify));
+ VERIFY0(net_protocol_release(ifs->ifs_ipf_viona));
+ ifs->ifs_ipf_viona = NULL;
+ break;
+ }
+
+ return (ret);
+}
+
static int fr_setipfloopback(set, ifs)
int set;
ipf_stack_t *ifs;
@@ -1011,7 +1311,6 @@ cred_t *cp;
return ENXIO;
unit = isp->ipfs_minor;
-
/*
* ipf_find_stack returns with a read lock on ifs_ipf_global
*/
@@ -1715,8 +2014,7 @@ int len;
* Need to preserve checksum information by copying them
* to newmp which heads the pulluped message.
*/
- hcksum_retrieve(m, NULL, NULL, &start, &stuff, &end,
- &value, &flags);
+ mac_hcksum_get(m, &start, &stuff, &end, &value, &flags);
if (pullupmsg(m, len + ipoff + inc) == 0) {
ATOMIC_INCL(ifs->ifs_frstats[out].fr_pull[1]);
@@ -1729,8 +2027,7 @@ int len;
return NULL;
}
- (void) hcksum_assoc(m, NULL, NULL, start, stuff, end,
- value, flags, 0);
+ mac_hcksum_set(m, start, stuff, end, value, flags);
m->b_prev = m2;
m->b_rptr += inc;
@@ -1856,8 +2153,12 @@ frdest_t *fdp;
return (-1);
}
- /* Check the src here, fin_ifp is the src interface. */
- if (!fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p))
+ /*
+ * If we're forwarding (vs. injecting), check the src here, fin_ifp is
+ * the src interface.
+ */
+ if (fdp != NULL &&
+ !fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p))
return (-1);
inj = net_inject_alloc(NETINFO_VERSION);
@@ -1924,8 +2225,8 @@ frdest_t *fdp;
inj->ni_physical = net_routeto(net_data_p, sinp, NULL);
}
- /* we're checking the destinatation here */
- if (!fr_forwarding_enabled(inj->ni_physical, net_data_p))
+ /* If we're forwarding (vs. injecting), check the destination here. */
+ if (fdp != NULL && !fr_forwarding_enabled(inj->ni_physical, net_data_p))
goto bad_fastroute;
/*
@@ -2045,6 +2346,160 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg)
}
/* ------------------------------------------------------------------------ */
+/* Function: ipf_hookvndl3_in */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: event(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The vnd hooks are private hooks to ON. They represent a layer 2 */
+/* datapath generally used to implement virtual machines. The driver sends */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the */
+/* traditional packet hook flags. */
+/* */
+/* They end up calling the appropriate traditional ip hooks. */
+/* ------------------------------------------------------------------------ */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_in(token, info, arg);
+}
+
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_in(token, info, arg);
+}
+
+/*ARGSUSED*/
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_out(token, info, arg);
+}
+
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_out(token, info, arg);
+}
+
+/* Static constants used by ipf_hook_ether */
+static uint8_t ipf_eth_bcast_addr[ETHERADDRL] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+static uint8_t ipf_eth_ipv4_mcast[3] = { 0x01, 0x00, 0x5E };
+static uint8_t ipf_eth_ipv6_mcast[2] = { 0x33, 0x33 };
+
+/* ------------------------------------------------------------------------ */
+/* Function: ipf_hook_ether */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: token(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The ipf_hook_ether hook is currently private to illumos. It represents */
+/* a layer 2 datapath generally used by virtual machines. Currently the */
+/* hook is only used by the viona driver to pass along L2 frames for */
+/* inspection. It requires that the L2 ethernet header is contained within */
+/* a single dblk_t (however layers above the L2 header have no restrictions */
+/* in ipf). ipf does not currently support filtering on L2 fields (e.g. */
+/* filtering on a MAC address or ethertype), however virtual machines do */
+/* not have native IP stack instances where ipf traditionally hooks in. */
+/* Instead this entry point is used to determine if the packet is unicast, */
+/* broadcast, or multicast. The IPv4 or IPv6 packet is then passed to the */
+/* traditional ip hooks for filtering. Non IPv4 or non IPv6 packets are */
+/* not subject to examination. */
+/* ------------------------------------------------------------------------ */
+int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg,
+ boolean_t out)
+{
+ struct ether_header *ethp;
+ hook_pkt_event_t *hpe = (hook_pkt_event_t *)info;
+ mblk_t *mp;
+ size_t offset, len;
+ uint16_t etype;
+ boolean_t v6;
+
+ /*
+ * viona will only pass us mblks with the L2 header contained in a
+ * single data block.
+ */
+ mp = *hpe->hpe_mp;
+ len = MBLKL(mp);
+
+ VERIFY3S(len, >=, sizeof (struct ether_header));
+
+ ethp = (struct ether_header *)mp->b_rptr;
+ if ((etype = ntohs(ethp->ether_type)) == ETHERTYPE_VLAN) {
+ struct ether_vlan_header *evh =
+ (struct ether_vlan_header *)ethp;
+
+ VERIFY3S(len, >=, sizeof (struct ether_vlan_header));
+
+ etype = ntohs(evh->ether_type);
+ offset = sizeof (*evh);
+ } else {
+ offset = sizeof (*ethp);
+ }
+
+ /*
+ * ipf only supports filtering IPv4 and IPv6. Ignore other types.
+ */
+ if (etype == ETHERTYPE_IP)
+ v6 = B_FALSE;
+ else if (etype == ETHERTYPE_IPV6)
+ v6 = B_TRUE;
+ else
+ return (0);
+
+ if (bcmp(ipf_eth_bcast_addr, ethp, ETHERADDRL) == 0)
+ hpe->hpe_flags |= HPE_BROADCAST;
+ else if (bcmp(ipf_eth_ipv4_mcast, ethp,
+ sizeof (ipf_eth_ipv4_mcast)) == 0)
+ hpe->hpe_flags |= HPE_MULTICAST;
+ else if (bcmp(ipf_eth_ipv6_mcast, ethp,
+ sizeof (ipf_eth_ipv6_mcast)) == 0)
+ hpe->hpe_flags |= HPE_MULTICAST;
+
+ /* Find the start of the IPv4 or IPv6 header */
+ for (; offset >= len; len = MBLKL(mp)) {
+ offset -= len;
+ mp = mp->b_cont;
+ if (mp == NULL) {
+ freemsg(*hpe->hpe_mp);
+ *hpe->hpe_mp = NULL;
+ return (-1);
+ }
+ }
+ hpe->hpe_mb = mp;
+ hpe->hpe_hdr = mp->b_rptr + offset;
+
+ return (v6 ? ipf_hook6(info, out, 0, arg) :
+ ipf_hook(info, out, 0, arg));
+}
+
+/* ------------------------------------------------------------------------ */
+/* Function: ipf_hookviona_{in,out} */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: event(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The viona hooks are private hooks to illumos. They represent a layer 2 */
+/* datapath generally used to implement virtual machines. The driver sends */
+/* along L2 packets. */
+/* */
+/* They end up calling the appropriate traditional ip hooks. */
+/* ------------------------------------------------------------------------ */
+int
+ipf_hookviona_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return (ipf_hook_ether(token, info, arg, B_FALSE));
+}
+
+int
+ipf_hookviona_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return (ipf_hook_ether(token, info, arg, B_TRUE));
+}
+
+/* ------------------------------------------------------------------------ */
/* Function: ipf_hook4_loop_in */
/* Returns: int - 0 == packet ok, else problem, free packet if not done */
/* Parameters: event(I) - pointer to event */
@@ -2387,7 +2842,7 @@ fr_info_t *fin;
#ifdef USE_INET6
struct in6_addr tmp_src6;
#endif
-
+
ASSERT(fin->fin_p == IPPROTO_TCP);
/*
@@ -2429,7 +2884,7 @@ fr_info_t *fin;
#endif
if (tcp != NULL) {
- /*
+ /*
* Adjust TCP header:
* swap ports,
* set flags,
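
ipf_hook_ether() above locates the L3 header by stepping over the Ethernet header and, when the ethertype is ETHERTYPE_VLAN, one 802.1Q tag. The parse in isolation (a sketch; the caller must guarantee, as viona does, that the L2 header lies in one contiguous buffer):

    #include <sys/types.h>
    #include <sys/ethernet.h>

    /*
     * Return the offset of the L3 header within an Ethernet frame
     * and report the effective ethertype, stepping over one 802.1Q
     * VLAN tag if present.
     */
    static size_t
    l3_offset(const uchar_t *frame, uint16_t *etypep)
    {
        const struct ether_header *ethp =
            (const struct ether_header *)frame;
        uint16_t etype = ntohs(ethp->ether_type);

        if (etype == ETHERTYPE_VLAN) {
            const struct ether_vlan_header *evh =
                (const struct ether_vlan_header *)frame;

            *etypep = ntohs(evh->ether_type);
            return (sizeof (*evh));
        }
        *etypep = etype;
        return (sizeof (*ethp));
    }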
diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf
index 6b36f9fdbf..f49e024a72 100644
--- a/usr/src/uts/common/inet/ipf/ipf.conf
+++ b/usr/src/uts/common/inet/ipf/ipf.conf
@@ -1,3 +1,8 @@
#
#
name="ipf" parent="pseudo" instance=0;
+
+# Increase the state table limits. fr_statemax should be ~70% of fr_statesize,
+# and both should be prime numbers.
+fr_statesize=151007;
+fr_statemax=113279;
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
index a239f1c1ca..5c156e9c44 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
@@ -6,7 +6,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc. All rights reserved.
*/
#ifndef __IPF_STACK_H__
@@ -87,8 +87,8 @@ struct ipf_stack {
#endif
int ifs_ipf_locks_done;
- ipftoken_t *ifs_ipftokenhead;
- ipftoken_t **ifs_ipftokentail;
+ ipftoken_t *ifs_ipftokenhead;
+ ipftoken_t **ifs_ipftokentail;
ipfmutex_t ifs_ipl_mutex;
ipfmutex_t ifs_ipf_authmx;
@@ -126,6 +126,14 @@ struct ipf_stack {
hook_t *ifs_ipfhook6_loop_out;
hook_t *ifs_ipfhook6_nicevents;
+ hook_t *ifs_ipfhookvndl3v4_in;
+ hook_t *ifs_ipfhookvndl3v6_in;
+ hook_t *ifs_ipfhookvndl3v4_out;
+ hook_t *ifs_ipfhookvndl3v6_out;
+
+ hook_t *ifs_ipfhookviona_in;
+ hook_t *ifs_ipfhookviona_out;
+
/* flags to indicate whether hooks are registered. */
boolean_t ifs_hook4_physical_in;
boolean_t ifs_hook4_physical_out;
@@ -137,10 +145,19 @@ struct ipf_stack {
boolean_t ifs_hook6_nic_events;
boolean_t ifs_hook6_loopback_in;
boolean_t ifs_hook6_loopback_out;
+ boolean_t ifs_hookvndl3v4_physical_in;
+ boolean_t ifs_hookvndl3v6_physical_in;
+ boolean_t ifs_hookvndl3v4_physical_out;
+ boolean_t ifs_hookvndl3v6_physical_out;
+ boolean_t ifs_hookviona_physical_in;
+ boolean_t ifs_hookviona_physical_out;
int ifs_ipf_loopback;
net_handle_t ifs_ipf_ipv4;
net_handle_t ifs_ipf_ipv6;
+ net_handle_t ifs_ipf_vndl3v4;
+ net_handle_t ifs_ipf_vndl3v6;
+ net_handle_t ifs_ipf_viona;
/* ip_auth.c */
int ifs_fr_authsize;
@@ -167,8 +184,8 @@ struct ipf_stack {
ipfr_t **ifs_ipfr_nattail;
ipfr_t **ifs_ipfr_nattab;
- ipfr_t *ifs_ipfr_ipidlist;
- ipfr_t **ifs_ipfr_ipidtail;
+ ipfr_t *ifs_ipfr_ipidlist;
+ ipfr_t **ifs_ipfr_ipidtail;
ipfr_t **ifs_ipfr_ipidtab;
ipfrstat_t ifs_ipfr_stats;
diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c
index c541f4dddc..5d56debc31 100644
--- a/usr/src/uts/common/inet/ipf/solaris.c
+++ b/usr/src/uts/common/inet/ipf/solaris.c
@@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg)
/*
* Destroy things for ipf for one stack.
*/
-/* ARGSUSED */
static void
ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs)
{
diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h
index f6b6b996a8..847ad1c560 100644
--- a/usr/src/uts/common/inet/mib2.h
+++ b/usr/src/uts/common/inet/mib2.h
@@ -20,7 +20,10 @@
*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-/* Copyright (c) 1990 Mentat Inc. */
+/*
+ * Copyright (c) 1990 Mentat Inc.
+ * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ */
#ifndef _INET_MIB2_H
#define _INET_MIB2_H
@@ -1354,25 +1357,46 @@ typedef struct mib2_tcpConnEntry {
/* remote port for this connection { tcpConnEntry 5 } */
int tcpConnRemPort; /* In host byte order */
struct tcpConnEntryInfo_s {
- /* seq # of next segment to send */
+ Counter64 ce_in_data_inorder_bytes;
+ Counter64 ce_in_data_inorder_segs;
+ Counter64 ce_in_data_unorder_bytes;
+ Counter64 ce_in_data_unorder_segs;
+ Counter64 ce_in_zwnd_probes;
+
+ Counter64 ce_out_data_bytes;
+ Counter64 ce_out_data_segs;
+ Counter64 ce_out_retrans_bytes;
+ Counter64 ce_out_retrans_segs;
+ Counter64 ce_out_zwnd_probes;
+ Counter64 ce_rtt_sum;
+
+ /* seq # of next segment to send */
Gauge ce_snxt;
	/* seq # of last segment unacknowledged */
Gauge ce_suna;
- /* currect send window size */
+ /* current send window size */
Gauge ce_swnd;
+ /* current congestion window size */
+ Gauge ce_cwnd;
/* seq # of next expected segment */
Gauge ce_rnxt;
/* seq # of last ack'd segment */
Gauge ce_rack;
- /* currenct receive window size */
+ /* # of unsent bytes in the xmit queue */
+ Gauge ce_unsent;
+ /* current receive window size */
Gauge ce_rwnd;
- /* current rto (retransmit timeout) */
+ /* round-trip time smoothed average (us) */
+ Gauge ce_rtt_sa;
+ /* current rto (retransmit timeout) */
Gauge ce_rto;
- /* current max segment size */
+ /* round-trip time count */
+ Gauge ce_rtt_cnt;
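+		/*
+		 * Note: a mean round-trip time for this connection can be
+		 * derived as ce_rtt_sum / ce_rtt_cnt when ce_rtt_cnt is
+		 * non-zero; the units are presumed to match ce_rtt_sa.
+		 */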
+ /* current max segment size */
Gauge ce_mss;
/* actual internal state */
int ce_state;
- } tcpConnEntryInfo;
+ } tcpConnEntryInfo;
/* pid of the processes that created this connection */
uint32_t tcpConnCreationProcess;
@@ -1408,26 +1432,7 @@ typedef struct mib2_tcp6ConnEntry {
DeviceIndex tcp6ConnIfIndex;
/* state of tcp6 connection { ipv6TcpConnEntry 6 } RW */
int tcp6ConnState;
- struct tcp6ConnEntryInfo_s {
- /* seq # of next segment to send */
- Gauge ce_snxt;
- /* seq # of of last segment unacknowledged */
- Gauge ce_suna;
- /* currect send window size */
- Gauge ce_swnd;
- /* seq # of next expected segment */
- Gauge ce_rnxt;
- /* seq # of last ack'd segment */
- Gauge ce_rack;
- /* currenct receive window size */
- Gauge ce_rwnd;
- /* current rto (retransmit timeout) */
- Gauge ce_rto;
- /* current max segment size */
- Gauge ce_mss;
- /* actual internal state */
- int ce_state;
- } tcp6ConnEntryInfo;
+ struct tcpConnEntryInfo_s tcp6ConnEntryInfo;
/* pid of the processes that created this connection */
uint32_t tcp6ConnCreationProcess;
diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h
index 6fb72d1d08..ddb482db78 100644
--- a/usr/src/uts/common/inet/rawip_impl.h
+++ b/usr/src/uts/common/inet/rawip_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -43,6 +44,7 @@ extern "C" {
#include <inet/ip.h>
#include <inet/optcom.h>
#include <inet/tunables.h>
+#include <inet/bpf.h>
/*
* ICMP stack instances
@@ -84,6 +86,10 @@ typedef struct icmp_s {
mblk_t *icmp_fallback_queue_head;
mblk_t *icmp_fallback_queue_tail;
struct sockaddr_storage icmp_delayed_addr;
+
+ krwlock_t icmp_bpf_lock; /* protects icmp_bpf */
+ ip_bpf_insn_t *icmp_bpf_prog; /* SO_ATTACH_FILTER bpf */
+ uint_t icmp_bpf_len;
} icmp_t;
/*
diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c
new file mode 100644
index 0000000000..6e1171de46
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/datafilt.c
@@ -0,0 +1,116 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * This file implements a socket filter used to defer TCP connections.
+ * To defer a connection means to delay the return of accept(3SOCKET)
+ * until at least one byte is ready to be read(2). This filter may be
+ * applied automatically or programmatically through the use of
+ * soconfig(1M) and setsockopt(3SOCKET).
+ */
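+
+/*
+ * A sketch of a programmatic attach from userland (error handling omitted;
+ * "datafilt" is the filter name registered by this module):
+ *
+ *	int s = socket(AF_INET, SOCK_STREAM, 0);
+ *	(void) setsockopt(s, SOL_FILTER, FIL_ATTACH, "datafilt",
+ *	    sizeof ("datafilt"));
+ */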
+
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/socketvar.h>
+#include <sys/sockfilter.h>
+#include <sys/note.h>
+#include <sys/taskq.h>
+
+#define DATAFILT_MODULE "datafilt"
+
+static struct modlmisc dataf_modlmisc = {
+ &mod_miscops,
+ "Kernel data-ready socket filter"
+};
+
+static struct modlinkage dataf_modlinkage = {
+ MODREV_1,
+ &dataf_modlmisc,
+ NULL
+};
+
+static sof_rval_t
+dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph,
+ void *parg, struct sockaddr *laddr, socklen_t laddrlen,
+ struct sockaddr *faddr, socklen_t faddrlen, void **cookiep)
+{
+ _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen,
+ cookiep));
+ return (SOF_RVAL_DEFER);
+}
+
+static void
+dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr)
+{
+ _NOTE(ARGUNUSED(handle, cookie, cr));
+}
+
+static mblk_t *
+dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags,
+ size_t *lenp)
+{
+ _NOTE(ARGUNUSED(cookie, flags, lenp));
+
+ if (mp != NULL && MBLKL(mp) > 0) {
+ sof_newconn_ready(handle);
+ sof_bypass(handle);
+ }
+
+ return (mp);
+}
+
+static sof_ops_t dataf_ops = {
+ .sofop_attach_passive = dataf_attach_passive_cb,
+ .sofop_detach = dataf_detach_cb,
+ .sofop_data_in = dataf_data_in_cb
+};
+
+int
+_init(void)
+{
+ int err;
+
+ /*
+ * This module is safe to attach even after some preliminary socket
+ * setup calls have taken place. See the comment for SOF_ATT_SAFE.
+ */
+ err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops,
+ SOF_ATT_SAFE);
+ if (err != 0)
+ return (err);
+ if ((err = mod_install(&dataf_modlinkage)) != 0)
+ (void) sof_unregister(DATAFILT_MODULE);
+
+ return (err);
+}
+
+int
+_fini(void)
+{
+ int err;
+
+ if ((err = sof_unregister(DATAFILT_MODULE)) != 0)
+ return (err);
+
+ return (mod_remove(&dataf_modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&dataf_modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c
index 586d7f06f8..76191e93b8 100644
--- a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c
+++ b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -51,6 +51,7 @@
#include <sys/mac_client.h>
#include <sys/mac_provider.h>
#include <sys/mac_client_priv.h>
+#include <inet/bpf.h>
#include <netpacket/packet.h>
@@ -448,7 +449,7 @@ pfp_packet(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t flag)
buffer = (uchar_t *)mp;
}
rw_enter(&ps->ps_bpflock, RW_READER);
- if (bpf_filter(ps->ps_bpf.bf_insns, buffer,
+ if (ip_bpf_filter((ip_bpf_insn_t *)ps->ps_bpf.bf_insns, buffer,
hdr.mhi_pktsize, buflen) == 0) {
rw_exit(&ps->ps_bpflock);
ps->ps_stats.tp_drops++;
@@ -1336,7 +1337,7 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
const void *optval, socklen_t optlen)
{
struct bpf_program prog;
- struct bpf_insn *fcode;
+ ip_bpf_insn_t *fcode;
struct pfpsock *ps;
struct sock_proto_props sopp;
int error = 0;
@@ -1370,10 +1371,10 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
return (EFAULT);
}
- if (bpf_validate(fcode, (int)prog.bf_len)) {
+ if (ip_bpf_validate(fcode, prog.bf_len)) {
rw_enter(&ps->ps_bpflock, RW_WRITER);
pfp_release_bpf(ps);
- ps->ps_bpf.bf_insns = fcode;
+ ps->ps_bpf.bf_insns = (struct bpf_insn *)fcode;
ps->ps_bpf.bf_len = size;
rw_exit(&ps->ps_bpflock);
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 2e08dc359b..a1c0dbe697 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -23,7 +23,7 @@
*/
/*
- * Copyright 2012 Joyent, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
/*
@@ -61,6 +61,10 @@
* connection are processed on that squeue. The connection ("conn") to
* squeue mapping is stored in "conn_t" member "conn_sqp".
*
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is
+ * false and it will not have an associated conn_t, which means many aspects of
+ * the system, such as polling and switching squeues, will not be used.
+ *
* Since the processing of the connection cuts across multiple layers
 * but still allows packets for different connections to be processed on
* other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
@@ -132,21 +136,20 @@
#include <sys/squeue_impl.h>
-static void squeue_fire(void *);
static void squeue_drain(squeue_t *, uint_t, hrtime_t);
static void squeue_worker(squeue_t *sqp);
static void squeue_polling_thread(squeue_t *sqp);
+static void squeue_worker_wakeup(squeue_t *sqp);
+static void squeue_try_drain_one(squeue_t *, conn_t *);
kmem_cache_t *squeue_cache;
#define SQUEUE_MSEC_TO_NSEC 1000000
int squeue_drain_ms = 20;
-int squeue_workerwait_ms = 0;
/* The values above converted to ticks or nano seconds */
-static int squeue_drain_ns = 0;
-static int squeue_workerwait_tick = 0;
+static uint_t squeue_drain_ns = 0;
uintptr_t squeue_drain_stack_needed = 10240;
uint_t squeue_drain_stack_toodeep;
@@ -239,19 +242,16 @@ squeue_init(void)
sizeof (squeue_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
squeue_drain_ns = squeue_drain_ms * SQUEUE_MSEC_TO_NSEC;
- squeue_workerwait_tick = MSEC_TO_TICK_ROUNDUP(squeue_workerwait_ms);
}
-/* ARGSUSED */
squeue_t *
-squeue_create(clock_t wait, pri_t pri)
+squeue_create(pri_t pri, boolean_t isip)
{
squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
bzero(sqp, sizeof (squeue_t));
sqp->sq_bind = PBIND_NONE;
sqp->sq_priority = pri;
- sqp->sq_wait = MSEC_TO_TICK(wait);
sqp->sq_worker = thread_create(NULL, 0, squeue_worker,
sqp, 0, &p0, TS_RUN, pri);
@@ -260,11 +260,36 @@ squeue_create(clock_t wait, pri_t pri)
sqp->sq_enter = squeue_enter;
sqp->sq_drain = squeue_drain;
+ sqp->sq_isip = isip;
return (sqp);
}
/*
+ * We need to kill the threads and then clean up. We should VERIFY that
+ * polling is disabled so we don't have to worry about disassociating from
+ * MAC/IP/etc.
+ */
+void
+squeue_destroy(squeue_t *sqp)
+{
+ kt_did_t worker, poll;
+ mutex_enter(&sqp->sq_lock);
+ VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+ SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT)));
+ worker = sqp->sq_worker->t_did;
+ poll = sqp->sq_poll_thr->t_did;
+ sqp->sq_state |= SQS_EXIT;
+ cv_signal(&sqp->sq_poll_cv);
+ cv_signal(&sqp->sq_worker_cv);
+ mutex_exit(&sqp->sq_lock);
+
+ thread_join(poll);
+ thread_join(worker);
+ kmem_cache_free(squeue_cache, sqp);
+}
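+
+/*
+ * Example lifecycle for a non-IP squeue consumer (a sketch; no particular
+ * consumer is implied):
+ *
+ *	squeue_t *sqp = squeue_create(minclsyspri, B_FALSE);
+ *	...
+ *	squeue_destroy(sqp);
+ */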
+
+/*
* Bind squeue worker thread to the specified CPU, given by CPU id.
* If the CPU id value is -1, bind the worker thread to the value
* specified in sq_bind field. If a thread is already bound to a
@@ -309,97 +334,6 @@ squeue_unbind(squeue_t *sqp)
mutex_exit(&sqp->sq_lock);
}
-void
-squeue_worker_wakeup(squeue_t *sqp)
-{
- timeout_id_t tid = (sqp)->sq_tid;
-
- ASSERT(MUTEX_HELD(&(sqp)->sq_lock));
-
- if (sqp->sq_wait == 0) {
- ASSERT(tid == 0);
- ASSERT(!(sqp->sq_state & SQS_TMO_PROG));
- sqp->sq_awaken = ddi_get_lbolt();
- cv_signal(&sqp->sq_worker_cv);
- mutex_exit(&sqp->sq_lock);
- return;
- }
-
- /*
- * Queue isn't being processed, so take
- * any post enqueue actions needed before leaving.
- */
- if (tid != 0) {
- /*
- * Waiting for an enter() to process mblk(s).
- */
- clock_t now = ddi_get_lbolt();
- clock_t waited = now - sqp->sq_awaken;
-
- if (TICK_TO_MSEC(waited) >= sqp->sq_wait) {
- /*
- * Times up and have a worker thread
- * waiting for work, so schedule it.
- */
- sqp->sq_tid = 0;
- sqp->sq_awaken = now;
- cv_signal(&sqp->sq_worker_cv);
- mutex_exit(&sqp->sq_lock);
- (void) untimeout(tid);
- return;
- }
- mutex_exit(&sqp->sq_lock);
- return;
- } else if (sqp->sq_state & SQS_TMO_PROG) {
- mutex_exit(&sqp->sq_lock);
- return;
- } else {
- clock_t wait = sqp->sq_wait;
- /*
- * Wait up to sqp->sq_wait ms for an
- * enter() to process this queue. We
- * don't want to contend on timeout locks
- * with sq_lock held for performance reasons,
- * so drop the sq_lock before calling timeout
- * but we need to check if timeout is required
- * after re acquiring the sq_lock. Once
- * the sq_lock is dropped, someone else could
- * have processed the packet or the timeout could
- * have already fired.
- */
- sqp->sq_state |= SQS_TMO_PROG;
- mutex_exit(&sqp->sq_lock);
- tid = timeout(squeue_fire, sqp, wait);
- mutex_enter(&sqp->sq_lock);
- /* Check again if we still need the timeout */
- if (((sqp->sq_state & (SQS_PROC|SQS_TMO_PROG)) ==
- SQS_TMO_PROG) && (sqp->sq_tid == 0) &&
- (sqp->sq_first != NULL)) {
- sqp->sq_state &= ~SQS_TMO_PROG;
- sqp->sq_tid = tid;
- mutex_exit(&sqp->sq_lock);
- return;
- } else {
- if (sqp->sq_state & SQS_TMO_PROG) {
- sqp->sq_state &= ~SQS_TMO_PROG;
- mutex_exit(&sqp->sq_lock);
- (void) untimeout(tid);
- } else {
- /*
- * The timer fired before we could
- * reacquire the sq_lock. squeue_fire
- * removes the SQS_TMO_PROG flag
- * and we don't need to do anything
- * else.
- */
- mutex_exit(&sqp->sq_lock);
- }
- }
- }
-
- ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
-}
-
/*
* squeue_enter() - enter squeue sqp with mblk mp (which can be
* a chain), while tail points to the end and cnt in number of
@@ -475,18 +409,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
SQUEUE_DBG_CLEAR(sqp);
- CONN_DEC_REF(connp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -497,23 +434,28 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
sqp->sq_run = NULL;
if (sqp->sq_first == NULL ||
process_flag == SQ_NODRAIN) {
- if (sqp->sq_first != NULL) {
- squeue_worker_wakeup(sqp);
- return;
+ /*
+ * Even if SQ_NODRAIN was specified, it may
+ * still be best to process a single queued
+ * item if it matches the active connection.
+ */
+ if (sqp->sq_first != NULL && sqp->sq_isip) {
+ squeue_try_drain_one(sqp, connp);
}
+
/*
- * We processed inline our packet and nothing
- * new has arrived. We are done. In case any
- * control actions are pending, wake up the
- * worker.
+ * If work or control actions are pending, wake
+ * up the worker thread.
*/
- if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
- cv_signal(&sqp->sq_worker_cv);
+ if (sqp->sq_first != NULL ||
+ sqp->sq_state & SQS_WORKER_THR_CONTROL) {
+ squeue_worker_wakeup(sqp);
+ }
mutex_exit(&sqp->sq_lock);
return;
}
} else {
- if (ira != NULL) {
+ if (sqp->sq_isip == B_TRUE && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -565,10 +507,9 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* up the worker.
*/
sqp->sq_run = NULL;
- if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
- cv_signal(&sqp->sq_worker_cv);
- mutex_exit(&sqp->sq_lock);
- return;
+ if (sqp->sq_state & SQS_WORKER_THR_CONTROL) {
+ squeue_worker_wakeup(sqp);
+ }
} else {
/*
* We let a thread processing a squeue reenter only
@@ -587,7 +528,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
if (!(sqp->sq_state & SQS_REENTER) &&
(process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
(sqp->sq_run == curthread) && (cnt == 1) &&
- (connp->conn_on_sqp == B_FALSE)) {
+ (sqp->sq_isip == B_FALSE ||
+ connp->conn_on_sqp == B_FALSE)) {
sqp->sq_state |= SQS_REENTER;
mutex_exit(&sqp->sq_lock);
@@ -602,15 +544,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
+ SQUEUE_DBG_SET(sqp, mp, proc, connp,
+ tag);
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -631,7 +579,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
#ifdef DEBUG
mp->b_tag = tag;
#endif
- if (ira != NULL) {
+ if (sqp->sq_isip && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -657,54 +605,33 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
tail = mp = attrmp;
}
ENQUEUE_CHAIN(sqp, mp, tail, cnt);
- if (!(sqp->sq_state & SQS_PROC)) {
- squeue_worker_wakeup(sqp);
- return;
- }
/*
- * In case any control actions are pending, wake
- * up the worker.
+ * If the worker isn't running or control actions are pending,
+		 * wake it up now.
*/
- if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
- cv_signal(&sqp->sq_worker_cv);
- mutex_exit(&sqp->sq_lock);
- return;
+ if ((sqp->sq_state & SQS_PROC) == 0 ||
+ (sqp->sq_state & SQS_WORKER_THR_CONTROL) != 0) {
+ squeue_worker_wakeup(sqp);
+ }
}
+ mutex_exit(&sqp->sq_lock);
}
/*
* PRIVATE FUNCTIONS
*/
+
+/*
+ * Wake up worker thread for squeue to process queued work.
+ */
static void
-squeue_fire(void *arg)
+squeue_worker_wakeup(squeue_t *sqp)
{
- squeue_t *sqp = arg;
- uint_t state;
-
- mutex_enter(&sqp->sq_lock);
-
- state = sqp->sq_state;
- if (sqp->sq_tid == 0 && !(state & SQS_TMO_PROG)) {
- mutex_exit(&sqp->sq_lock);
- return;
- }
-
- sqp->sq_tid = 0;
- /*
- * The timeout fired before we got a chance to set it.
- * Process it anyway but remove the SQS_TMO_PROG so that
- * the guy trying to set the timeout knows that it has
- * already been processed.
- */
- if (state & SQS_TMO_PROG)
- sqp->sq_state &= ~SQS_TMO_PROG;
+ ASSERT(MUTEX_HELD(&(sqp)->sq_lock));
- if (!(state & SQS_PROC)) {
- sqp->sq_awaken = ddi_get_lbolt();
- cv_signal(&sqp->sq_worker_cv);
- }
- mutex_exit(&sqp->sq_lock);
+ cv_signal(&sqp->sq_worker_cv);
+ sqp->sq_awoken = gethrtime();
}
static void
@@ -714,10 +641,8 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire)
mblk_t *head;
sqproc_t proc;
conn_t *connp;
- timeout_id_t tid;
ill_rx_ring_t *sq_rx_ring = sqp->sq_rx_ring;
hrtime_t now;
- boolean_t did_wakeup = B_FALSE;
boolean_t sq_poll_capable;
ip_recv_attr_t *ira, iras;
@@ -729,8 +654,7 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire)
if (proc_type != SQS_WORKER && STACK_BIAS + (uintptr_t)getfp() -
(uintptr_t)curthread->t_stkbase < squeue_drain_stack_needed) {
ASSERT(mutex_owned(&sqp->sq_lock));
- sqp->sq_awaken = ddi_get_lbolt();
- cv_signal(&sqp->sq_worker_cv);
+ squeue_worker_wakeup(sqp);
squeue_drain_stack_toodeep++;
return;
}
@@ -746,9 +670,6 @@ again:
sqp->sq_last = NULL;
sqp->sq_count = 0;
- if ((tid = sqp->sq_tid) != 0)
- sqp->sq_tid = 0;
-
sqp->sq_state |= SQS_PROC | proc_type;
/*
@@ -765,9 +686,6 @@ again:
SQS_POLLING_ON(sqp, sq_poll_capable, sq_rx_ring);
mutex_exit(&sqp->sq_lock);
- if (tid != 0)
- (void) untimeout(tid);
-
while ((mp = head) != NULL) {
head = mp->b_next;
@@ -779,7 +697,7 @@ again:
mp->b_prev = NULL;
/* Is there an ip_recv_attr_t to handle? */
- if (ip_recv_attr_is_mblk(mp)) {
+ if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) {
mblk_t *attrmp = mp;
ASSERT(attrmp->b_cont != NULL);
@@ -804,20 +722,25 @@ again:
/*
- * Handle squeue switching. More details in the
- * block comment at the top of the file
+ * Handle squeue switching. More details in the block comment at
+		 * the top of the file. Non-IP squeues cannot switch, as there
+ * is no conn_t.
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
mp->b_tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira,
SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -864,11 +787,9 @@ again:
if (proc_type == SQS_WORKER)
SQS_POLL_RING(sqp);
goto again;
- } else {
- did_wakeup = B_TRUE;
- sqp->sq_awaken = ddi_get_lbolt();
- cv_signal(&sqp->sq_worker_cv);
}
+
+ squeue_worker_wakeup(sqp);
}
/*
@@ -927,17 +848,14 @@ again:
SQS_POLL_QUIESCE_DONE)));
SQS_POLLING_OFF(sqp, sq_poll_capable, sq_rx_ring);
sqp->sq_state &= ~(SQS_PROC | proc_type);
- if (!did_wakeup && sqp->sq_first != NULL) {
- squeue_worker_wakeup(sqp);
- mutex_enter(&sqp->sq_lock);
- }
/*
* If we are not the worker and there is a pending quiesce
* event, wake up the worker
*/
if ((proc_type != SQS_WORKER) &&
- (sqp->sq_state & SQS_WORKER_THR_CONTROL))
- cv_signal(&sqp->sq_worker_cv);
+ (sqp->sq_state & SQS_WORKER_THR_CONTROL)) {
+ squeue_worker_wakeup(sqp);
+ }
}
}
@@ -1051,6 +969,11 @@ squeue_polling_thread(squeue_t *sqp)
cv_wait(async, lock);
CALLB_CPR_SAFE_END(&cprinfo, lock);
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL |
SQS_POLL_THR_QUIESCED);
if (ctl_state != 0) {
@@ -1076,6 +999,9 @@ squeue_polling_thread(squeue_t *sqp)
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
+		/* Only IP-related squeues should reach this point */
+ VERIFY(sqp->sq_isip == B_TRUE);
+
poll_again:
sq_rx_ring = sqp->sq_rx_ring;
sq_get_pkts = sq_rx_ring->rr_rx;
@@ -1137,7 +1063,6 @@ poll_again:
*/
}
- sqp->sq_awaken = ddi_get_lbolt();
/*
* Put the SQS_PROC_HELD on so the worker
* thread can distinguish where its called from. We
@@ -1153,7 +1078,7 @@ poll_again:
*/
sqp->sq_state |= SQS_PROC_HELD;
sqp->sq_state &= ~SQS_GET_PKTS;
- cv_signal(&sqp->sq_worker_cv);
+ squeue_worker_wakeup(sqp);
} else if (sqp->sq_first == NULL &&
!(sqp->sq_state & SQS_WORKER)) {
/*
@@ -1173,8 +1098,9 @@ poll_again:
* wake up the worker, since it is currently
* not running.
*/
- if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
- cv_signal(&sqp->sq_worker_cv);
+ if (sqp->sq_state & SQS_WORKER_THR_CONTROL) {
+ squeue_worker_wakeup(sqp);
+ }
} else {
/*
* Worker thread is already running. We don't need
@@ -1205,6 +1131,7 @@ squeue_worker_thr_control(squeue_t *sqp)
ill_rx_ring_t *rx_ring;
ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ VERIFY(sqp->sq_isip == B_TRUE);
if (sqp->sq_state & SQS_POLL_RESTART) {
/* Restart implies a previous quiesce. */
@@ -1316,6 +1243,11 @@ squeue_worker(squeue_t *sqp)
for (;;) {
for (;;) {
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
/*
* If the poll thread has handed control to us
* we need to break out of the wait.
@@ -1412,6 +1344,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp)
again:
sqp = connp->conn_sqp;
+ VERIFY(sqp->sq_isip == B_TRUE);
mutex_enter(&sqp->sq_lock);
if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) {
@@ -1483,36 +1416,109 @@ again:
}
}
-void
-squeue_synch_exit(conn_t *connp)
+/*
+ * If possible, attempt to immediately process a single queued request, should
+ * it match the supplied conn_t reference. This is primarily intended to elide
+ * squeue worker thread wake-ups during local TCP connect() or close()
+ * operations where the response is placed on the squeue during processing.
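+ *
+ * Within this file it is reached from squeue_enter(), when SQ_NODRAIN
+ * processing leaves a matching request queued, and from squeue_synch_exit()
+ * when the caller passes SQ_PROCESS.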
+ */
+static void
+squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn)
{
- squeue_t *sqp = connp->conn_sqp;
+ mblk_t *next, *mp = sqp->sq_first;
+ conn_t *connp;
+ sqproc_t proc = (sqproc_t)mp->b_queue;
+ ip_recv_attr_t iras, *ira = NULL;
- mutex_enter(&sqp->sq_lock);
- if (sqp->sq_run == curthread) {
- ASSERT(sqp->sq_state & SQS_PROC);
+ ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ ASSERT((sqp->sq_state & SQS_PROC) == 0);
+ ASSERT(sqp->sq_run == NULL);
+ ASSERT(sqp->sq_isip);
+ VERIFY(mp != NULL);
- sqp->sq_state &= ~SQS_PROC;
- sqp->sq_run = NULL;
- connp->conn_on_sqp = B_FALSE;
+ /*
+ * There is no guarantee that compare_conn references a valid object at
+	 * this time, so under no circumstance may it be dereferenced unless it
+ * matches the squeue entry.
+ */
+ connp = (conn_t *)mp->b_prev;
+ if (connp != compare_conn) {
+ return;
+ }
- if (sqp->sq_first == NULL) {
- mutex_exit(&sqp->sq_lock);
- } else {
- /*
- * If this was a normal thread, then it would
- * (most likely) continue processing the pending
- * requests. Since the just completed operation
- * was executed synchronously, the thread should
- * not be delayed. To compensate, wake up the
- * worker thread right away when there are outstanding
- * requests.
- */
- sqp->sq_awaken = ddi_get_lbolt();
- cv_signal(&sqp->sq_worker_cv);
- mutex_exit(&sqp->sq_lock);
- }
+ next = mp->b_next;
+ proc = (sqproc_t)mp->b_queue;
+
+ ASSERT(proc != NULL);
+ ASSERT(sqp->sq_count > 0);
+
+ /* Dequeue item from squeue */
+ if (next == NULL) {
+ sqp->sq_first = NULL;
+ sqp->sq_last = NULL;
} else {
+ sqp->sq_first = next;
+ }
+ sqp->sq_count--;
+
+ sqp->sq_state |= SQS_PROC;
+ sqp->sq_run = curthread;
+ mutex_exit(&sqp->sq_lock);
+
+ /* Prep mblk_t and retrieve ira if needed */
+ mp->b_prev = NULL;
+ mp->b_queue = NULL;
+ mp->b_next = NULL;
+ if (ip_recv_attr_is_mblk(mp)) {
+ mblk_t *attrmp = mp;
+
+ ASSERT(attrmp->b_cont != NULL);
+
+ mp = attrmp->b_cont;
+ attrmp->b_cont = NULL;
+
+ ASSERT(mp->b_queue == NULL);
+ ASSERT(mp->b_prev == NULL);
+
+ if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
+ /* ill_t or ip_stack_t disappeared */
+ ip_drop_input("ip_recv_attr_from_mblk", mp, NULL);
+ ira_cleanup(&iras, B_TRUE);
+ CONN_DEC_REF(connp);
+ goto done;
+ }
+ ira = &iras;
+ }
+
+ SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag);
+ connp->conn_on_sqp = B_TRUE;
+ DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp,
+ conn_t *, connp);
+ (*proc)(connp, mp, sqp, ira);
+ DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp);
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ SQUEUE_DBG_CLEAR(sqp);
+
+ if (ira != NULL)
+ ira_cleanup(ira, B_TRUE);
+
+done:
+ mutex_enter(&sqp->sq_lock);
+ sqp->sq_state &= ~(SQS_PROC);
+ sqp->sq_run = NULL;
+}
+
+void
+squeue_synch_exit(conn_t *connp, int flag)
+{
+ squeue_t *sqp = connp->conn_sqp;
+
+ VERIFY(sqp->sq_isip == B_TRUE);
+ ASSERT(flag == SQ_NODRAIN || flag == SQ_PROCESS);
+
+ mutex_enter(&sqp->sq_lock);
+ if (sqp->sq_run != curthread) {
/*
* The caller doesn't own the squeue, clear the SQS_PAUSE flag,
* and wake up the squeue owner, such that owner can continue
@@ -1524,5 +1530,23 @@ squeue_synch_exit(conn_t *connp)
/* There should be only one thread blocking on sq_synch_cv. */
cv_signal(&sqp->sq_synch_cv);
mutex_exit(&sqp->sq_lock);
+ return;
}
+
+ ASSERT(sqp->sq_state & SQS_PROC);
+
+ sqp->sq_state &= ~SQS_PROC;
+ sqp->sq_run = NULL;
+ connp->conn_on_sqp = B_FALSE;
+
+ /* If the caller opted in, attempt to process the head squeue item. */
+ if (flag == SQ_PROCESS && sqp->sq_first != NULL) {
+ squeue_try_drain_one(sqp, connp);
+ }
+
+ /* Wake up the worker if further requests are pending. */
+ if (sqp->sq_first != NULL) {
+ squeue_worker_wakeup(sqp);
+ }
+ mutex_exit(&sqp->sq_lock);
}
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index b2b9973291..68404716b9 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -20,9 +20,9 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -134,6 +134,7 @@ typedef struct tcphdra_s {
struct conn_s;
struct tcp_listen_cnt_s;
+struct tcp_rg_s;
/*
* Control structure for each open TCP stream,
@@ -177,16 +178,11 @@ typedef struct tcp_s {
mblk_t *tcp_xmit_tail; /* Last data sent */
uint32_t tcp_unsent; /* # of bytes in hand that are unsent */
uint32_t tcp_xmit_tail_unsent; /* # of unsent bytes in xmit_tail */
-
uint32_t tcp_suna; /* Sender unacknowledged */
uint32_t tcp_rexmit_nxt; /* Next rexmit seq num */
uint32_t tcp_rexmit_max; /* Max retran seq num */
uint32_t tcp_cwnd; /* Congestion window */
int32_t tcp_cwnd_cnt; /* cwnd cnt in congestion avoidance */
-
- uint32_t tcp_ibsegs; /* Inbound segments on this stream */
- uint32_t tcp_obsegs; /* Outbound segments on this stream */
-
uint32_t tcp_naglim; /* Tunable nagle limit */
uint32_t tcp_valid_bits;
#define TCP_ISS_VALID 0x1 /* Is the tcp_iss seq num active? */
@@ -194,8 +190,6 @@ typedef struct tcp_s {
#define TCP_URG_VALID 0x4 /* Is the tcp_urg seq num active? */
#define TCP_OFO_FIN_VALID 0x8 /* Has TCP received an out of order FIN? */
-
-
timeout_id_t tcp_timer_tid; /* Control block for timer service */
uchar_t tcp_timer_backoff; /* Backoff shift count. */
int64_t tcp_last_recv_time; /* Last time we receive a segment. */
@@ -282,9 +276,11 @@ typedef struct tcp_s {
uint32_t tcp_cwnd_max;
uint32_t tcp_csuna; /* Clear (no rexmits in window) suna */
- clock_t tcp_rtt_sa; /* Round trip smoothed average */
- clock_t tcp_rtt_sd; /* Round trip smoothed deviation */
- clock_t tcp_rtt_update; /* Round trip update(s) */
+ hrtime_t tcp_rtt_sum; /* Round trip sum */
+ uint32_t tcp_rtt_cnt; /* Round trip count (non_dup ACKs) */
+ hrtime_t tcp_rtt_sa; /* Round trip smoothed average */
+ hrtime_t tcp_rtt_sd; /* Round trip smoothed deviation */
+ uint32_t tcp_rtt_update; /* Round trip update(s) */
clock_t tcp_ms_we_have_waited; /* Total retrans time */
uint32_t tcp_swl1; /* These help us avoid using stale */
@@ -404,6 +400,13 @@ typedef struct tcp_s {
struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */
struct tcp_s **tcp_ptpbhn;
+ /*
+	 * Group of tcp_t entries bound to the same address and port via
+ * SO_REUSEPORT. The pointer itself is protected by tf_lock in the
+ * containing tcps_bind_fanout slot.
+ */
+ struct tcp_rg_s *tcp_rg_bind;
+
uint_t tcp_maxpsz_multiplier;
uint32_t tcp_lso_max; /* maximum LSO payload */
@@ -493,6 +496,8 @@ typedef struct tcp_s {
/* FIN-WAIT-2 flush timeout */
uint32_t tcp_fin_wait_2_flush_interval;
+ tcp_conn_stats_t tcp_cs;
+
#ifdef DEBUG
pc_t tcmp_stk[15];
#endif
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index d340aff2a5..ba66be0b2b 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -21,9 +21,9 @@
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013,2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -266,8 +266,6 @@ typedef struct tcpt_s {
/*
* Functions called directly via squeue having a prototype of edesc_t.
*/
-void tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
- ip_recv_attr_t *ira);
void tcp_input_data(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *ira);
static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
@@ -640,15 +638,9 @@ tcp_set_destination(tcp_t *tcp)
tcp->tcp_localnet = uinfo.iulp_localnet;
if (uinfo.iulp_rtt != 0) {
- clock_t rto;
-
- tcp->tcp_rtt_sa = uinfo.iulp_rtt;
- tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra +
- (tcp->tcp_rtt_sa >> 5);
-
- TCP_SET_RTO(tcp, rto);
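+		/*
+		 * tcp_calculate_rto() presumably centralizes the classic
+		 * (sa >> 3) + sd + tcps_rexmit_interval_extra computation
+		 * that was open-coded here, now on nanosecond-resolution
+		 * values.
+		 */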
+ tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt);
+ tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd);
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);
}
if (uinfo.iulp_ssthresh != 0)
tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
@@ -967,8 +959,7 @@ void
tcp_stop_lingering(tcp_t *tcp)
{
clock_t delta = 0;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- conn_t *connp = tcp->tcp_connp;
+ conn_t *connp = tcp->tcp_connp;
tcp->tcp_linger_tid = 0;
if (tcp->tcp_state > TCPS_LISTEN) {
@@ -996,7 +987,7 @@ tcp_stop_lingering(tcp_t *tcp)
if (tcp->tcp_state == TCPS_TIME_WAIT) {
tcp_time_wait_append(tcp);
- TCP_DBGSTAT(tcps, tcp_detach_time_wait);
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_detach_time_wait);
goto finish;
}
@@ -1239,11 +1230,6 @@ tcp_closei_local(tcp_t *tcp)
if (!TCP_IS_SOCKET(tcp))
tcp_acceptor_hash_remove(tcp);
- TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
- tcp->tcp_ibsegs = 0;
- TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
- tcp->tcp_obsegs = 0;
-
/*
* This can be called via tcp_time_wait_processing() if TCP gets a
* SYN with sequence number outside the TIME-WAIT connection's
@@ -1423,6 +1409,21 @@ tcp_free(tcp_t *tcp)
tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
/*
+ * Destroy any association with SO_REUSEPORT group.
+ */
+ if (tcp->tcp_rg_bind != NULL) {
+ /*
+ * This is only necessary for connections which enabled
+ * SO_REUSEPORT but were never bound. Such connections should
+		 * be the one and only member of the tcp_rg_t to which they
+ * have been associated.
+ */
+ VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ tcp->tcp_rg_bind = NULL;
+ }
+
+ /*
* If this is a non-STREAM socket still holding on to an upper
* handle, release it. As a result of fallback we might also see
* STREAMS based conns with upper handles, in which case there is
@@ -1912,15 +1913,6 @@ tcp_reinit(tcp_t *tcp)
/* Cancel outstanding timers */
tcp_timers_stop(tcp);
- /*
- * Reset everything in the state vector, after updating global
- * MIB data from instance counters.
- */
- TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
- tcp->tcp_ibsegs = 0;
- TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
- tcp->tcp_obsegs = 0;
-
tcp_close_mpp(&tcp->tcp_xmit_head);
if (tcp->tcp_snd_zcopy_aware)
tcp_zcopy_notify(tcp);
@@ -2092,9 +2084,6 @@ tcp_reinit_values(tcp_t *tcp)
tcp->tcp_swnd = 0;
DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */
- ASSERT(tcp->tcp_ibsegs == 0);
- ASSERT(tcp->tcp_obsegs == 0);
-
if (connp->conn_ht_iphc != NULL) {
kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
connp->conn_ht_iphc = NULL;
@@ -2186,6 +2175,8 @@ tcp_reinit_values(tcp_t *tcp)
DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */
DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */
tcp->tcp_rtt_update = 0;
+ tcp->tcp_rtt_sum = 0;
+ tcp->tcp_rtt_cnt = 0;
DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
@@ -2334,7 +2325,6 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
conn_t *connp = tcp->tcp_connp;
- clock_t rto;
ASSERT((connp->conn_family == AF_INET &&
connp->conn_ipversion == IPV4_VERSION) ||
@@ -2403,12 +2393,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
* during first few transmissions of a connection as seen in slow
* links.
*/
- tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
- tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
- tcps->tcps_conn_grace_period;
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
+ tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
+ tcps->tcps_conn_grace_period);
tcp->tcp_timer_backoff = 0;
tcp->tcp_ms_we_have_waited = 0;
@@ -2455,8 +2443,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
* Path MTU might have changed by either increase or decrease, so need to
* adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
* or negative MSS, since tcp_mss_set() will do it.
+ *
+ * Returns B_TRUE when the connection PMTU changes, otherwise B_FALSE.
*/
-void
+boolean_t
tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
{
uint32_t pmtu;
@@ -2466,10 +2456,10 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
iaflags_t ixaflags;
if (tcp->tcp_tcps->tcps_ignore_path_mtu)
- return;
+ return (B_FALSE);
if (tcp->tcp_state < TCPS_ESTABLISHED)
- return;
+ return (B_FALSE);
/*
* Always call ip_get_pmtu() to make sure that IP has updated
@@ -2489,13 +2479,13 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
* Nothing to change, so just return.
*/
if (mss == tcp->tcp_mss)
- return;
+ return (B_FALSE);
/*
* Currently, for ICMP errors, only PMTU decrease is handled.
*/
if (mss > tcp->tcp_mss && decrease_only)
- return;
+ return (B_FALSE);
DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss);
@@ -2530,6 +2520,7 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
}
ixa->ixa_flags = ixaflags;
+ return (B_TRUE);
}
int
@@ -3400,7 +3391,7 @@ tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
tcp_update_lso(tcp, connp->conn_ixa);
break;
case IXAN_PMTU:
- tcp_update_pmtu(tcp, B_FALSE);
+ (void) tcp_update_pmtu(tcp, B_FALSE);
break;
case IXAN_ZCOPY:
tcp_update_zcopy(tcp);
@@ -3731,7 +3722,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
{
tcp_stack_t *tcps;
int i;
- int error = 0;
major_t major;
size_t arrsz;
@@ -3795,8 +3785,7 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
tcps->tcps_mibkp = tcp_kstat_init(stackid);
major = mod_name_to_major(INET_NAME);
- error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident);
- ASSERT(error == 0);
+ VERIFY0(ldi_ident_from_major(major, &tcps->tcps_ldi_ident));
tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
ASSERT(tcps->tcps_ixa_cleanup_mp != NULL);
cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL);
diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c
index 72093af2f2..ec2a5d4e29 100644
--- a/usr/src/uts/common/inet/tcp/tcp_bind.c
+++ b/usr/src/uts/common/inet/tcp/tcp_bind.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -56,6 +57,7 @@ static uint32_t tcp_random_anon_port = 1;
static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
cred_t *cr);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
+static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
/*
* Hash list insertion routine for tcp_t structures. Each hash bucket
@@ -173,6 +175,16 @@ tcp_bind_hash_remove(tcp_t *tcp)
ASSERT(lockp != NULL);
mutex_enter(lockp);
+
+ /* destroy any association with SO_REUSEPORT group */
+ if (tcp->tcp_rg_bind != NULL) {
+ if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
+ /* Last one out turns off the lights */
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ }
+ tcp->tcp_rg_bind = NULL;
+ }
+
if (tcp->tcp_ptpbhn) {
tcpnext = tcp->tcp_bind_hash_port;
if (tcpnext != NULL) {
@@ -637,13 +649,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
}
/*
- * If the "bind_to_req_port_only" parameter is set, if the requested port
- * number is available, return it, If not return 0
+ * If the "bind_to_req_port_only" parameter is set and the requested port
+ * number is available, return it (else return 0).
*
- * If "bind_to_req_port_only" parameter is not set and
- * If the requested port number is available, return it. If not, return
- * the first anonymous port we happen across. If no anonymous ports are
- * available, return 0. addr is the requested local address, if any.
+ * If "bind_to_req_port_only" parameter is not set and the requested port
+ * number is available, return it. If not, return the first anonymous port we
+ * happen across. If no anonymous ports are available, return 0.
*
* In either case, when succeeding update the tcp_t to record the port number
* and insert it in the bind hash table.
@@ -663,6 +674,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
int loopmax;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ boolean_t reuseport = connp->conn_reuseport;
/*
* Lookup for free addresses is done in a loop and "loopmax"
@@ -699,6 +711,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
tf_t *tbf;
tcp_t *ltcp;
conn_t *lconnp;
+ boolean_t attempt_reuse = B_FALSE;
lport = htons(port);
@@ -725,6 +738,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
boolean_t not_socket;
boolean_t exclbind;
+ boolean_t addrmatch;
lconnp = ltcp->tcp_connp;
@@ -830,22 +844,35 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
&lconnp->conn_faddr_v6)))
continue;
+ addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
+ &lconnp->conn_bound_addr_v6);
+
+ if (addrmatch && reuseport && bind_to_req_port_only &&
+ (ltcp->tcp_state == TCPS_BOUND ||
+ ltcp->tcp_state == TCPS_LISTEN)) {
+ /*
+ * This entry is bound to the exact same
+ * address and port. If SO_REUSEPORT is set on
+ * the calling socket, attempt to reuse this
+ * binding if it too had SO_REUSEPORT enabled
+ * when it was bound.
+ */
+ attempt_reuse = (ltcp->tcp_rg_bind != NULL);
+ break;
+ }
+
if (!reuseaddr) {
/*
- * No socket option SO_REUSEADDR.
- * If existing port is bound to
- * a non-wildcard IP address
- * and the requesting stream is
- * bound to a distinct
- * different IP addresses
- * (non-wildcard, also), keep
- * going.
+ * No socket option SO_REUSEADDR. If an
+ * existing port is bound to a non-wildcard IP
+ * address and the requesting stream is bound
+ * to a distinct different IP address
+ * (non-wildcard, also), keep going.
*/
if (!V6_OR_V4_INADDR_ANY(*laddr) &&
!V6_OR_V4_INADDR_ANY(
lconnp->conn_bound_addr_v6) &&
- !IN6_ARE_ADDR_EQUAL(laddr,
- &lconnp->conn_bound_addr_v6))
+ !addrmatch)
continue;
if (ltcp->tcp_state >= TCPS_BOUND) {
/*
@@ -860,27 +887,49 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* socket option SO_REUSEADDR is set on the
* binding tcp_t.
*
- * If two streams are bound to
- * same IP address or both addr
- * and bound source are wildcards
- * (INADDR_ANY), we want to stop
- * searching.
- * We have found a match of IP source
- * address and source port, which is
- * refused regardless of the
- * SO_REUSEADDR setting, so we break.
+ * If two streams are bound to the same IP
+ * address or both addr and bound source are
+ * wildcards (INADDR_ANY), we want to stop
+ * searching. We have found a match of IP
+ * source address and source port, which is
+ * refused regardless of the SO_REUSEADDR
+ * setting, so we break.
*/
- if (IN6_ARE_ADDR_EQUAL(laddr,
- &lconnp->conn_bound_addr_v6) &&
+ if (addrmatch &&
(ltcp->tcp_state == TCPS_LISTEN ||
ltcp->tcp_state == TCPS_BOUND))
break;
}
}
- if (ltcp != NULL) {
+ if (ltcp != NULL && !attempt_reuse) {
/* The port number is busy */
mutex_exit(&tbf->tf_lock);
} else {
+ if (attempt_reuse) {
+ int err;
+ struct tcp_rg_s *rg;
+
+ ASSERT(ltcp != NULL);
+ ASSERT(ltcp->tcp_rg_bind != NULL);
+ ASSERT(tcp->tcp_rg_bind != NULL);
+ ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
+
+ err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
+ if (err != 0) {
+ mutex_exit(&tbf->tf_lock);
+ return (0);
+ }
+ /*
+ * Now that the newly-binding socket has joined
+ * the existing reuseport group on ltcp, it
+ * should clean up its own (empty) group.
+ */
+ rg = tcp->tcp_rg_bind;
+ tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
+ VERIFY(tcp_rg_remove(rg, tcp));
+ tcp_rg_destroy(rg);
+ }
+
/*
* This port is ours. Insert in fanout and mark as
* bound to prevent others from getting the port
@@ -945,3 +994,125 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
} while (++count < loopmax);
return (0);
}
+
+/* Max number of members in TCP SO_REUSEPORT group */
+#define TCP_RG_SIZE_MAX 64
+/* Step size when expanding members array */
+#define TCP_RG_SIZE_STEP 2
+
+tcp_rg_t *
+tcp_rg_init(tcp_t *tcp)
+{
+ tcp_rg_t *rg;
+ rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
+ if (rg == NULL)
+ return (NULL);
+ rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *),
+ KM_NOSLEEP|KM_NORMALPRI);
+ if (rg->tcprg_members == NULL) {
+ kmem_free(rg, sizeof (tcp_rg_t));
+ return (NULL);
+ }
+
+ mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
+ rg->tcprg_size = 2;
+ rg->tcprg_count = 1;
+ rg->tcprg_active = 1;
+ rg->tcprg_members[0] = tcp;
+ return (rg);
+}
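+
+/*
+ * A new group starts with room for two members (the initial binder plus
+ * one more, presumably the common pairing case); tcp_rg_insert() grows it
+ * by TCP_RG_SIZE_STEP entries at a time, up to TCP_RG_SIZE_MAX.
+ */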
+
+void
+tcp_rg_destroy(tcp_rg_t *rg)
+{
+ mutex_enter(&rg->tcprg_lock);
+ ASSERT(rg->tcprg_count == 0);
+ ASSERT(rg->tcprg_active == 0);
+ kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
+ mutex_destroy(&rg->tcprg_lock);
+ kmem_free(rg, sizeof (struct tcp_rg_s));
+}
+
+static int
+tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
+{
+ mutex_enter(&rg->tcprg_lock);
+
+ VERIFY(rg->tcprg_size > 0);
+ VERIFY(rg->tcprg_count <= rg->tcprg_size);
+ if (rg->tcprg_count != 0) {
+ cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
+ cred_t *newcred = tcp->tcp_connp->conn_cred;
+
+ if (crgetuid(oldcred) != crgetuid(newcred) ||
+ crgetzoneid(oldcred) != crgetzoneid(newcred)) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EPERM);
+ }
+ }
+
+ if (rg->tcprg_count == rg->tcprg_size) {
+ unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
+ unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
+ tcp_t **newmembers;
+
+ if (newsize > TCP_RG_SIZE_MAX) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EINVAL);
+ }
+ newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
+ KM_NOSLEEP|KM_NORMALPRI);
+ if (newmembers == NULL) {
+ mutex_exit(&rg->tcprg_lock);
+ return (ENOMEM);
+ }
+ bcopy(rg->tcprg_members, newmembers, oldalloc);
+ kmem_free(rg->tcprg_members, oldalloc);
+ rg->tcprg_members = newmembers;
+ rg->tcprg_size = newsize;
+ }
+
+ rg->tcprg_members[rg->tcprg_count] = tcp;
+ rg->tcprg_count++;
+ rg->tcprg_active++;
+
+ mutex_exit(&rg->tcprg_lock);
+ return (0);
+}
+
+boolean_t
+tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
+{
+ int i;
+ boolean_t is_empty;
+
+ mutex_enter(&rg->tcprg_lock);
+ for (i = 0; i < rg->tcprg_count; i++) {
+ if (rg->tcprg_members[i] == tcp)
+ break;
+ }
+ /* The item should be present */
+ ASSERT(i < rg->tcprg_count);
+ /* Move the last member into this position */
+ rg->tcprg_count--;
+ rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
+ rg->tcprg_members[rg->tcprg_count] = NULL;
+ if (tcp->tcp_connp->conn_reuseport != 0)
+ rg->tcprg_active--;
+ is_empty = (rg->tcprg_count == 0);
+ mutex_exit(&rg->tcprg_lock);
+ return (is_empty);
+}
+
+void
+tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
+{
+ mutex_enter(&rg->tcprg_lock);
+ if (is_active) {
+ rg->tcprg_active++;
+ } else {
+ rg->tcprg_active--;
+ }
+ mutex_exit(&rg->tcprg_lock);
+}
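+
+/*
+ * Userland view of the above (a sketch): each socket that is to share a
+ * port sets SO_REUSEPORT before bind(3SOCKET):
+ *
+ *	int on = 1;
+ *	(void) setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on));
+ *	(void) bind(s, (struct sockaddr *)&addr, sizeof (addr));
+ */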
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
index 6acc02d769..e73c34de34 100644
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -645,14 +646,16 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, send_size);
+ tcp->tcp_cs.tcp_out_data_bytes += send_size;
+ tcp->tcp_cs.tcp_out_data_segs++;
TCPS_BUMP_MIB(tcps, tcpHCInSegs);
TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, send_size);
-
- BUMP_LOCAL(tcp->tcp_obsegs);
- BUMP_LOCAL(peer_tcp->tcp_ibsegs);
+ peer_tcp->tcp_cs.tcp_in_data_inorder_bytes += send_size;
+ peer_tcp->tcp_cs.tcp_in_data_inorder_segs++;
DTRACE_TCP5(send, void, NULL, ip_xmit_attr_t *, connp->conn_ixa,
__dtrace_tcp_void_ip_t *, NULL, tcp_t *, tcp,
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index e917f7c774..11b40e7280 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -22,8 +22,8 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2016 Joyent, Inc.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
/* This file contains all TCP input processing functions. */
@@ -166,7 +166,7 @@ static void tcp_process_options(tcp_t *, tcpha_t *);
static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
-static void tcp_set_rto(tcp_t *, time_t);
+static void tcp_set_rto(tcp_t *, hrtime_t);
static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
/*
@@ -559,7 +559,7 @@ tcp_process_options(tcp_t *tcp, tcpha_t *tcpha)
static mblk_t *
tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
{
- uint32_t end;
+ uint32_t end, bytes;
mblk_t *mp1;
mblk_t *mp2;
mblk_t *next_mp;
@@ -578,26 +578,26 @@ tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
freeb(mp);
continue;
}
+ bytes = end - start;
mp->b_cont = NULL;
TCP_REASS_SET_SEQ(mp, start);
TCP_REASS_SET_END(mp, end);
mp1 = tcp->tcp_reass_tail;
- if (!mp1) {
- tcp->tcp_reass_tail = mp;
- tcp->tcp_reass_head = mp;
- TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
- TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
- end - start);
- continue;
- }
- /* New stuff completely beyond tail? */
- if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
- /* Link it on end. */
- mp1->b_cont = mp;
+ if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
+ if (mp1 != NULL) {
+ /*
+ * New stuff is beyond the tail; link it on the
+ * end.
+ */
+ mp1->b_cont = mp;
+ } else {
+ tcp->tcp_reass_head = mp;
+ }
tcp->tcp_reass_tail = mp;
TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
- TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
- end - start);
+ TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
+ tcp->tcp_cs.tcp_in_data_unorder_segs++;
+ tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
continue;
}
mp1 = tcp->tcp_reass_head;
@@ -2414,7 +2414,7 @@ tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
flags = (unsigned int)tcpha->tha_flags & 0xFF;
- BUMP_LOCAL(tcp->tcp_ibsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCInSegs);
DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
if ((flags & TH_URG) && sqp != NULL) {
@@ -2659,7 +2659,7 @@ tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
tcp->tcp_ack_tid = 0;
}
tcp_send_data(tcp, ack_mp);
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
if (!IPCL_IS_NONSTR(connp)) {
@@ -3048,6 +3048,7 @@ try_again:;
if (tcp->tcp_rwnd == 0) {
TCPS_BUMP_MIB(tcps, tcpInWinProbe);
+ tcp->tcp_cs.tcp_in_zwnd_probes++;
} else {
TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
@@ -3297,6 +3298,9 @@ ok:;
} else if (seg_len > 0) {
TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
+ tcp->tcp_cs.tcp_in_data_inorder_segs++;
+ tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
+
/*
* If an out of order FIN was received before, and the seq
* num and len of the new segment match that of the FIN,
@@ -3362,7 +3366,7 @@ ok:;
* and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
* byte was at seg_seq - 1, in which case we ignore the urgent flag.
*/
- if (flags & TH_URG && urp >= 0) {
+ if ((flags & TH_URG) && urp >= 0) {
if (!tcp->tcp_urp_last_valid ||
SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
/*
@@ -4146,7 +4150,7 @@ process_ack:
}
mp = tcp_ack_mp(tcp);
if (mp != NULL) {
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
tcp_send_data(tcp, mp);
}
@@ -4304,36 +4308,29 @@ process_ack:
SEQ_GT(seg_ack, tcp->tcp_urg))
tcp->tcp_valid_bits &= ~TCP_URG_VALID;
- /* Can we update the RTT estimates? */
- if (tcp->tcp_snd_ts_ok) {
- /* Ignore zero timestamp echo-reply. */
- if (tcpopt.tcp_opt_ts_ecr != 0) {
- tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
- (int32_t)tcpopt.tcp_opt_ts_ecr);
- }
-
- /* If needed, restart the timer. */
- if (tcp->tcp_set_timer == 1) {
- TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- tcp->tcp_set_timer = 0;
- }
- /*
- * Update tcp_csuna in case the other side stops sending
- * us timestamps.
- */
- tcp->tcp_csuna = tcp->tcp_snxt;
- } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
+ /*
+ * Update the RTT estimates. Note that we don't use the TCP
+ * timestamp option to calculate RTT even if one is present. This is
+ * because the timestamp option's resolution (CPU tick) is
+ * too coarse to measure modern datacenter networks' microsecond
+ * latencies. The timestamp field's resolution is limited by its
+ * 4-byte width (see RFC1323), and since we always store a
+ * high-resolution nanosecond-precision timestamp along with the data,
+ * there is no point in ever using the timestamp option.
+ */
+ if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
/*
* An ACK sequence we haven't seen before, so get the RTT
* and update the RTO. But first check if the timestamp is
* valid to use.
*/
if ((mp1->b_next != NULL) &&
- SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
- tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
- (int32_t)(intptr_t)mp1->b_prev);
- else
+ SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
+ tcp_set_rto(tcp, gethrtime() -
+ (hrtime_t)(intptr_t)mp1->b_prev);
+ } else {
TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
+ }
		/* Remember the last sequence to be ACKed */
tcp->tcp_csuna = seg_ack;
@@ -4362,7 +4359,7 @@ process_ack:
if (SEQ_GT(seg_ack,
(uint32_t)(uintptr_t)(mp1->b_next))) {
mp1->b_prev =
- (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
+ (mblk_t *)(intptr_t)gethrtime();
mp1->b_next = NULL;
}
break;
@@ -4839,11 +4836,13 @@ xmit_check:
if (mp1 != NULL) {
tcp->tcp_xmit_head->b_prev =
- (mblk_t *)LBOLT_FASTPATH;
+ (mblk_t *)(intptr_t)gethrtime();
tcp->tcp_csuna = tcp->tcp_snxt;
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
snd_size);
+ tcp->tcp_cs.tcp_out_retrans_segs++;
+ tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
tcp_send_data(tcp, mp1);
}
}
@@ -4873,9 +4872,10 @@ xmit_check:
* timer is used to avoid a timeout before the
* limited transmitted segment's ACK gets back.
*/
- if (tcp->tcp_xmit_head != NULL)
+ if (tcp->tcp_xmit_head != NULL) {
tcp->tcp_xmit_head->b_prev =
- (mblk_t *)LBOLT_FASTPATH;
+ (mblk_t *)(intptr_t)gethrtime();
+ }
}
/* Anything more to do? */
@@ -4918,7 +4918,7 @@ ack_check:
if (mp1 != NULL) {
tcp_send_data(tcp, mp1);
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
}
if (tcp->tcp_ack_tid != 0) {
@@ -5211,38 +5211,53 @@ tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
return (mp);
}
-/* The minimum of smoothed mean deviation in RTO calculation. */
-#define TCP_SD_MIN 400
+/* The minimum of smoothed mean deviation in RTO calculation (nsec). */
+#define TCP_SD_MIN 400000000
/*
- * Set RTO for this connection. The formula is from Jacobson and Karels'
- * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
- * are the same as those in Appendix A.2 of that paper.
+ * Set RTO for this connection based on a new round-trip time measurement.
+ * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
+ * in SIGCOMM '88. The variable names are the same as those in Appendix A.2
+ * of that paper.
*
* m = new measurement
* sa = smoothed RTT average (8 * average estimates).
* sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
*/
static void
-tcp_set_rto(tcp_t *tcp, clock_t rtt)
+tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
{
- long m = TICK_TO_MSEC(rtt);
- clock_t sa = tcp->tcp_rtt_sa;
- clock_t sv = tcp->tcp_rtt_sd;
- clock_t rto;
- tcp_stack_t *tcps = tcp->tcp_tcps;
+ hrtime_t m = rtt;
+ hrtime_t sa = tcp->tcp_rtt_sa;
+ hrtime_t sv = tcp->tcp_rtt_sd;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
TCPS_BUMP_MIB(tcps, tcpRttUpdate);
tcp->tcp_rtt_update++;
+ tcp->tcp_rtt_sum += m;
+ tcp->tcp_rtt_cnt++;
/* A non-zero tcp_rtt_sa means this is not the first sample. */
if (sa != 0) {
/*
- * Update average estimator:
- * new rtt = 7/8 old rtt + 1/8 Error
+ * Update average estimator (see section 2.3 of RFC6298):
+ * SRTT = 7/8 SRTT + 1/8 rtt
+ *
+ * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
+ * tcp_rtt_sa = 7 * SRTT + rtt
+ * tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
+ * tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
+ *
+ * (rtt - tcp_rtt_sa / 8) is simply the difference
+ * between the new rtt measurement and the existing smoothed
+ * RTT average. This is referred to as "Error" in subsequent
+ * calculations.
*/
- /* m is now Error in estimate. */
+ /* m is now Error. */
m -= sa >> 3;
if ((sa += m) <= 0) {
/*
@@ -5255,7 +5270,13 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt)
/*
* Update deviation estimator:
- * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
+ * mdev = 3/4 mdev + 1/4 abs(Error)
+ *
+ * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
+ * tcp_rtt_sd = 3 * mdev + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error)
*/
if (m < 0)
m = -m;
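Folding the two derivations above into one place, a self-contained
sketch of the scaled fixed-point update (hypothetical function name;
sa holds 8 * SRTT and sv holds 4 * mdev as in the surrounding code,
and the underflow guards of the real code are elided):

    /* Fold one RTT sample m (nanoseconds) into the scaled estimators. */
    static void
    rtt_sample(hrtime_t m, hrtime_t *sa, hrtime_t *sv)
    {
            m -= *sa >> 3;          /* m is now Error = rtt - SRTT */
            *sa += m;               /* SRTT = 7/8 SRTT + 1/8 rtt */
            if (m < 0)
                    m = -m;         /* abs(Error) */
            m -= *sv >> 2;
            *sv += m;               /* mdev = 3/4 mdev + 1/4 abs(Error) */
    }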
@@ -5275,33 +5296,21 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt)
}
if (sv < TCP_SD_MIN) {
/*
- * We do not know that if sa captures the delay ACK
- * effect as in a long train of segments, a receiver
- * does not delay its ACKs. So set the minimum of sv
- * to be TCP_SD_MIN, which is default to 400 ms, twice
- * of BSD DATO. That means the minimum of mean
+ * Since a receiver doesn't delay its ACKs during a long run of
+ * segments, sa may not have captured the effect of delayed ACK
+ * timeouts on the RTT. To make sure we always account for the
+ * possible delay (and avoid unnecessary retransmission),
+ * TCP_SD_MIN is set to 400 ms, twice the delayed ACK timeout of
+ * 200 ms on older SunOS/BSD systems and modern Windows systems
+ * (as of 2019). This means that the minimum possible mean
* deviation is 100 ms.
- *
*/
sv = TCP_SD_MIN;
}
tcp->tcp_rtt_sa = sa;
tcp->tcp_rtt_sd = sv;
- /*
- * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
- *
- * Add tcp_rexmit_interval extra in case of extreme environment
- * where the algorithm fails to work. The default value of
- * tcp_rexmit_interval_extra should be 0.
- *
- * As we use a finer grained clock than BSD and update
- * RTO for every ACKs, add in another .25 of RTT to the
- * deviation of RTO to accomodate burstiness of 1/4 of
- * window size.
- */
- rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);
/* Now, we can reset tcp_timer_backoff to use the new RTO... */
tcp->tcp_timer_backoff = 0;
@@ -5563,10 +5572,12 @@ noticmpv4:
switch (icmph->icmph_code) {
case ICMP_FRAGMENTATION_NEEDED:
/*
- * Update Path MTU, then try to send something out.
+ * Attempt to update path MTU and, if the MSS of the
+ * connection is altered, retransmit outstanding data.
*/
- tcp_update_pmtu(tcp, B_TRUE);
- tcp_rexmit_after_error(tcp);
+ if (tcp_update_pmtu(tcp, B_TRUE)) {
+ tcp_rexmit_after_error(tcp);
+ }
break;
case ICMP_PORT_UNREACHABLE:
case ICMP_PROTOCOL_UNREACHABLE:
@@ -5609,7 +5620,7 @@ noticmpv4:
break;
}
break;
- case ICMP_SOURCE_QUENCH: {
+ case ICMP_SOURCE_QUENCH:
/*
* use a global boolean to control
* whether TCP should respond to ICMP_SOURCE_QUENCH.
@@ -5630,7 +5641,6 @@ noticmpv4:
}
break;
}
- }
freemsg(mp);
}
@@ -5683,10 +5693,12 @@ noticmpv6:
switch (icmp6->icmp6_type) {
case ICMP6_PACKET_TOO_BIG:
/*
- * Update Path MTU, then try to send something out.
+ * Attempt to update path MTU and, if the MSS of the connection
+ * is altered, retransmit outstanding data.
*/
- tcp_update_pmtu(tcp, B_TRUE);
- tcp_rexmit_after_error(tcp);
+ if (tcp_update_pmtu(tcp, B_TRUE)) {
+ tcp_rexmit_after_error(tcp);
+ }
break;
case ICMP6_DST_UNREACH:
switch (icmp6->icmp6_code) {
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index 40148b416a..4774412992 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -21,6 +21,8 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -62,7 +64,8 @@ opdes_t tcp_opt_arr[] = {
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
@@ -484,6 +487,104 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
}
/*
+ * Set a TCP connection's participation in SO_REUSEPORT. This operation is
+ * performed under the protection of the squeue via tcp_setsockopt.
+ * The manipulation of tcp_rg_bind, as part of this operation, is subject to
+ * these constraints:
+ * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport
+ * under the protection of the squeue.
+ * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be
+ * altered until such time as tcp_free() cleans up the connection.
+ * 3. A connection undergoing bind, which matches to a connection participating
+ * in port-reuse, will switch its tcp_rg_bind pointer when it joins the
+ * group of an existing connection in tcp_bindi().
+ */
+static int
+tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
+{
+ tcp_t *tcp = connp->conn_tcp;
+ struct tcp_rg_s *rg;
+
+ if (!IPCL_IS_NONSTR(connp)) {
+ if (do_enable) {
+ /*
+ * SO_REUSEPORT cannot be enabled on sockets which have
+ * fallen back to the STREAMS API.
+ */
+ return (EINVAL);
+ } else {
+ /*
+ * A connection with SO_REUSEPORT enabled should be
+ * prevented from falling back to STREAMS mode via
+ * logic in tcp_fallback. It is legal, however, for
+ * fallen-back connections to affirm the disabled state
+ * of SO_REUSEPORT.
+ */
+ ASSERT(connp->conn_reuseport == 0);
+ return (0);
+ }
+ }
+ if (tcp->tcp_state <= TCPS_CLOSED) {
+ return (EINVAL);
+ }
+ if (connp->conn_reuseport == 0 && do_enable) {
+ /* disabled -> enabled */
+ if (tcp->tcp_rg_bind != NULL) {
+ tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+ } else {
+ /*
+ * Connection state is not a concern when initially
+ * populating tcp_rg_bind. Setting it to non-NULL on a
+ * bound or listening connection would only mean that
+ * new reused-port binds become a possibility.
+ */
+ if ((rg = tcp_rg_init(tcp)) == NULL) {
+ return (ENOMEM);
+ }
+ tcp->tcp_rg_bind = rg;
+ }
+ connp->conn_reuseport = 1;
+ } else if (connp->conn_reuseport != 0 && !do_enable) {
+ /* enabled -> disabled */
+ ASSERT(tcp->tcp_rg_bind != NULL);
+ if (tcp->tcp_state == TCPS_IDLE) {
+ /*
+ * If the connection has not been bound yet, discard
+ * the reuse group state. Since disabling SO_REUSEPORT
+ * on a bound socket will _not_ prevent others from
+ * reusing the port, the presence of tcp_rg_bind is
+ * used to determine reuse availability, not
+ * conn_reuseport.
+ *
+ * This allows proper behavior for examples such as:
+ *
+ * setsockopt(fd1, ... SO_REUSEPORT, &on_val...);
+ * bind(fd1, &myaddr, ...);
+ * setsockopt(fd1, ... SO_REUSEPORT, &off_val...);
+ *
+ * setsockopt(fd2, ... SO_REUSEPORT, &on_val...);
+ * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED
+ *
+ */
+ rg = tcp->tcp_rg_bind;
+ tcp->tcp_rg_bind = NULL;
+ VERIFY(tcp_rg_remove(rg, tcp));
+ tcp_rg_destroy(rg);
+ } else {
+ /*
+ * If a connection has been bound, it's no longer safe
+ * to manipulate tcp_rg_bind until connection clean-up
+ * during tcp_free. Just mark the member status of the
+ * connection as inactive.
+ */
+ tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+ }
+ connp->conn_reuseport = 0;
+ }
+ return (0);
+}
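For reference, the userland sequence that the bind example in the
comment above describes would look roughly like this (a sketch with
error handling elided; myaddr is assumed to be a populated
sockaddr_in):

    int on = 1, off = 0;
    int fd1 = socket(AF_INET, SOCK_STREAM, 0);
    int fd2 = socket(AF_INET, SOCK_STREAM, 0);

    (void) setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on));
    (void) bind(fd1, (struct sockaddr *)&myaddr, sizeof (myaddr));
    /* Disabling after bind leaves fd1's reuse group in place... */
    (void) setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &off, sizeof (off));

    /* ...so a second SO_REUSEPORT bind to the same address succeeds. */
    (void) setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on));
    (void) bind(fd2, (struct sockaddr *)&myaddr, sizeof (myaddr));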
+
+/*
* We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
* Parameters are assumed to be verified by the caller.
*/
@@ -653,6 +754,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
*outlenp = inlen;
return (0);
+ case SO_REUSEPORT:
+ if (!checkonly) {
+ return (tcp_set_reuseport(connp, *i1 != 0));
+ }
+ return (0);
}
break;
case IPPROTO_TCP:
@@ -869,9 +975,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
tcp->tcp_cork = onoff;
}
break;
- case TCP_RTO_INITIAL: {
- clock_t rto;
-
+ case TCP_RTO_INITIAL:
if (checkonly || val == 0)
break;
@@ -901,15 +1005,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
if (tcp->tcp_state >= TCPS_SYN_SENT)
break;
- tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
- tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra +
- (tcp->tcp_rtt_sa >> 5) +
- tcps->tcps_conn_grace_period;
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
+ tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
+ tcps->tcps_conn_grace_period);
break;
- }
case TCP_RTO_MIN:
if (checkonly || val == 0)
break;
@@ -976,10 +1076,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
break;
case IPPROTO_IP:
- if (connp->conn_family != AF_INET) {
- *outlenp = 0;
- return (EINVAL);
- }
switch (name) {
case IP_SEC_OPT:
/*
diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c
index 60840a3d54..f54ab3fb33 100644
--- a/usr/src/uts/common/inet/tcp/tcp_output.c
+++ b/usr/src/uts/common/inet/tcp/tcp_output.c
@@ -21,7 +21,8 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/* This file contains all TCP output processing functions. */
@@ -58,12 +59,12 @@ static void tcp_wput_flush(tcp_t *, mblk_t *);
static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
static int tcp_xmit_end(tcp_t *);
static int tcp_send(tcp_t *, const int, const int, const int,
- const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
+ const int, int *, uint32_t *, int *, mblk_t **, mblk_t *);
static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
-static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
+static void tcp_fill_header(tcp_t *, uchar_t *, int);
/*
* Functions called directly via squeue having a prototype of edesc_t.
@@ -454,7 +455,7 @@ data_null:
}
}
- local_time = (mblk_t *)now;
+ local_time = (mblk_t *)(intptr_t)gethrtime();
/*
* "Our" Nagle Algorithm. This is not the same as in the old
@@ -1183,12 +1184,13 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
snxt = tcp->tcp_snxt;
/*
- * Check to see if this connection has been idled for some
- * time and no ACK is expected. If it is, we need to slow
- * start again to get back the connection's "self-clock" as
- * described in VJ's paper.
+ * Check to see if this connection has been idle for some time and no
+ * ACK is expected. If so, then the congestion window size is no longer
+ * meaningfully tied to current network conditions.
*
- * Reinitialize tcp_cwnd after idle.
+ * We reinitialize tcp_cwnd, and slow start again to get back the
+ * connection's "self-clock" as described in Van Jacobson's 1988 paper
+ * "Congestion avoidance and control".
*/
now = LBOLT_FASTPATH;
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
@@ -1256,7 +1258,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
if ((mp1 = dupb(mp)) == 0)
goto no_memory;
- mp->b_prev = (mblk_t *)(uintptr_t)now;
+ mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
mp->b_next = (mblk_t *)(uintptr_t)snxt;
/* adjust tcp header information */
@@ -1271,7 +1273,9 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
+ tcp->tcp_cs.tcp_out_data_segs++;
+ tcp->tcp_cs.tcp_out_data_bytes += len;
/* Update the latest receive window size in TCP header. */
tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
@@ -1311,12 +1315,10 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
/* Fill in the timestamp option. */
if (tcp->tcp_snd_ts_ok) {
- uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
-
- U32_TO_BE32(llbolt,
- (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
+ U32_TO_BE32(now,
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
U32_TO_BE32(tcp->tcp_ts_recent,
- (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
} else {
ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
}
@@ -1771,7 +1773,7 @@ tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
static int
tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
const int tcp_hdr_len, const int num_sack_blk, int *usable,
- uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
+ uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
{
int num_lso_seg = 1;
uint_t lso_usable;
@@ -1960,16 +1962,21 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
}
*snxt += len;
*tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
+ tcp->tcp_cs.tcp_out_data_segs++;
+ tcp->tcp_cs.tcp_out_data_bytes += len;
tcp_send_data(tcp, mp);
continue;
}
*snxt += len; /* Adjust later if we don't send all of len */
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
+ tcp->tcp_cs.tcp_out_data_segs++;
+ tcp->tcp_cs.tcp_out_data_bytes += len;
if (*tail_unsent) {
/* Are the bytes above us in flight? */
@@ -2066,7 +2073,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
* Fill in the header using the template header, and add
* options such as time-stamp, ECN and/or SACK, as needed.
*/
- tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
+ tcp_fill_header(tcp, rptr, num_sack_blk);
mp->b_rptr = rptr;
@@ -2145,6 +2152,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
*snxt += spill;
tcp->tcp_last_sent_len += spill;
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
+ tcp->tcp_cs.tcp_out_data_bytes += spill;
/*
* Adjust the checksum
*/
@@ -2193,7 +2201,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
*/
ixa->ixa_fragsize = ixa->ixa_pmtu;
ixa->ixa_extra_ident = 0;
- tcp->tcp_obsegs += num_lso_seg;
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCP_STAT(tcps, tcp_lso_times);
TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
} else {
@@ -2204,7 +2212,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
*/
lso_info_cleanup(mp);
tcp_send_data(tcp, mp);
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
}
}
@@ -2284,8 +2292,8 @@ tcp_xmit_end(tcp_t *tcp)
* So don't do any update.
*/
bzero(&uinfo, sizeof (uinfo));
- uinfo.iulp_rtt = tcp->tcp_rtt_sa;
- uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
+ uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
+ uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
/*
* Note that uinfo is kept for conn_faddr in the DCE. Could update even
@@ -2420,7 +2428,7 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
tcp->tcp_rack_cnt = 0;
TCPS_BUMP_MIB(tcps, tcpOutAck);
}
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
tcpha->tha_seq = htonl(seq);
tcpha->tha_ack = htonl(ack);
/*
@@ -3389,11 +3397,13 @@ tcp_sack_rexmit(tcp_t *tcp, uint_t *flags)
/*
* Update the send timestamp to avoid false retransmission.
*/
- snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
+ tcp->tcp_cs.tcp_out_retrans_segs++;
+ tcp->tcp_cs.tcp_out_retrans_bytes += seg_len;
/*
* Update tcp_rexmit_max to extend this SACK recovery phase.
* This happens when new data sent during fast recovery is
@@ -3461,9 +3471,11 @@ tcp_ss_rexmit(tcp_t *tcp)
* Update the send timestamp to avoid false
* retransmission.
*/
- old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
+ tcp->tcp_cs.tcp_out_retrans_segs++;
+ tcp->tcp_cs.tcp_out_retrans_bytes += cnt;
tcp->tcp_rexmit_nxt = snxt;
}
@@ -3621,7 +3633,7 @@ tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
* ECN and/or SACK.
*/
static void
-tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
+tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
{
tcpha_t *tcp_tmpl, *tcpha;
uint32_t *dst, *src;
@@ -3643,7 +3655,7 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
/* Fill time-stamp option if needed */
if (tcp->tcp_snd_ts_ok) {
- U32_TO_BE32((uint32_t)now,
+ U32_TO_BE32(LBOLT_FASTPATH,
(char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
U32_TO_BE32(tcp->tcp_ts_recent,
(char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c
index a431bf63d1..2de76ea060 100644
--- a/usr/src/uts/common/inet/tcp/tcp_socket.c
+++ b/usr/src/uts/common/inet/tcp/tcp_socket.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
/* This file contains all TCP kernel socket related functions. */
@@ -221,7 +222,7 @@ tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
}
- squeue_synch_exit(connp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
if (error < 0) {
if (error == -TOUTSTATE)
@@ -268,7 +269,7 @@ tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
else
error = proto_tlitosyserr(-error);
}
- squeue_synch_exit(connp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
return (error);
}
@@ -332,7 +333,13 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
connp->conn_upper_handle, &sopp);
}
done:
- squeue_synch_exit(connp);
+ /*
+ * Indicate (via SQ_PROCESS) that it is acceptable for the squeue to
+ * attempt to drain a pending request relevant to this connection when
+ * exiting the synchronous context. This can improve the performance
+ * and efficiency of TCP connect(2) operations to localhost.
+ */
+ squeue_synch_exit(connp, SQ_PROCESS);
return ((error == 0) ? EINPROGRESS : error);
}
@@ -401,7 +408,7 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
}
len = tcp_opt_get(connp, level, option_name, optvalp_buf);
- squeue_synch_exit(connp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
if (len == -1) {
kmem_free(optvalp_buf, max_optbuf_len);
@@ -462,14 +469,14 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
if (error < 0) {
error = proto_tlitosyserr(-error);
}
- squeue_synch_exit(connp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
return (error);
}
error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
NULL, cr);
- squeue_synch_exit(connp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
ASSERT(error >= 0);
@@ -645,7 +652,7 @@ tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
}
}
- squeue_synch_exit(connp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
}
/* ARGSUSED */
@@ -1022,6 +1029,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
}
/*
+ * Do not allow fallback on connections making use of SO_REUSEPORT.
+ */
+ if (tcp->tcp_rg_bind != NULL) {
+ freeb(stropt_mp);
+ freeb(ordrel_mp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
+ return (EINVAL);
+ }
+
+ /*
* Both endpoints must be of the same type (either STREAMS or
* non-STREAMS) for fusion to be enabled. So if we are fused,
* we have to unfuse.
@@ -1051,7 +1068,7 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
* There should be atleast two ref's (IP + TCP)
*/
ASSERT(connp->conn_ref >= 2);
- squeue_synch_exit(connp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
return (0);
}
diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c
index e6b13fe6c9..dbf320d09d 100644
--- a/usr/src/uts/common/inet/tcp/tcp_stats.c
+++ b/usr/src/uts/common/inet/tcp/tcp_stats.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -86,6 +87,50 @@ tcp_snmp_state(tcp_t *tcp)
}
}
+static void
+tcp_set_conninfo(tcp_t *tcp, struct tcpConnEntryInfo_s *tcei, boolean_t ispriv)
+{
+ /* Don't want just anybody seeing these... */
+ if (ispriv) {
+ tcei->ce_snxt = tcp->tcp_snxt;
+ tcei->ce_suna = tcp->tcp_suna;
+ tcei->ce_rnxt = tcp->tcp_rnxt;
+ tcei->ce_rack = tcp->tcp_rack;
+ } else {
+ /*
+ * Netstat, unfortunately, uses this to get send/receive queue
+ * sizes. How to fix? Why not compute the difference only?
+ */
+ tcei->ce_snxt = tcp->tcp_snxt - tcp->tcp_suna;
+ tcei->ce_suna = 0;
+ tcei->ce_rnxt = tcp->tcp_rnxt - tcp->tcp_rack;
+ tcei->ce_rack = 0;
+ }
+
+ tcei->ce_in_data_inorder_bytes = tcp->tcp_cs.tcp_in_data_inorder_bytes;
+ tcei->ce_in_data_inorder_segs = tcp->tcp_cs.tcp_in_data_inorder_segs;
+ tcei->ce_in_data_unorder_bytes = tcp->tcp_cs.tcp_in_data_unorder_bytes;
+ tcei->ce_in_data_unorder_segs = tcp->tcp_cs.tcp_in_data_unorder_segs;
+ tcei->ce_in_zwnd_probes = tcp->tcp_cs.tcp_in_zwnd_probes;
+
+ tcei->ce_out_data_bytes = tcp->tcp_cs.tcp_out_data_bytes;
+ tcei->ce_out_data_segs = tcp->tcp_cs.tcp_out_data_segs;
+ tcei->ce_out_retrans_bytes = tcp->tcp_cs.tcp_out_retrans_bytes;
+ tcei->ce_out_retrans_segs = tcp->tcp_cs.tcp_out_retrans_segs;
+ tcei->ce_out_zwnd_probes = tcp->tcp_cs.tcp_out_zwnd_probes;
+
+ tcei->ce_unsent = tcp->tcp_unsent;
+ tcei->ce_swnd = tcp->tcp_swnd;
+ tcei->ce_cwnd = tcp->tcp_cwnd;
+ tcei->ce_rwnd = tcp->tcp_rwnd;
+ tcei->ce_rto = tcp->tcp_rto;
+ tcei->ce_mss = tcp->tcp_mss;
+ tcei->ce_state = tcp->tcp_state;
+ tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3);
+ tcei->ce_rtt_sum = NSEC2USEC(tcp->tcp_rtt_sum);
+ tcei->ce_rtt_cnt = tcp->tcp_rtt_cnt;
+}
+
/*
* Return SNMP stuff in buffer in mpdata.
*/
@@ -183,11 +228,6 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl, boolean_t legacy_req)
continue; /* not in this zone */
tcp = connp->conn_tcp;
- TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
- tcp->tcp_ibsegs = 0;
- TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
- tcp->tcp_obsegs = 0;
-
tce6.tcp6ConnState = tce.tcpConnState =
tcp_snmp_state(tcp);
if (tce.tcpConnState == MIB2_TCP_established ||
@@ -243,35 +283,9 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl, boolean_t legacy_req)
} else {
tce6.tcp6ConnIfIndex = connp->conn_bound_if;
}
- /* Don't want just anybody seeing these... */
- if (ispriv) {
- tce6.tcp6ConnEntryInfo.ce_snxt =
- tcp->tcp_snxt;
- tce6.tcp6ConnEntryInfo.ce_suna =
- tcp->tcp_suna;
- tce6.tcp6ConnEntryInfo.ce_rnxt =
- tcp->tcp_rnxt;
- tce6.tcp6ConnEntryInfo.ce_rack =
- tcp->tcp_rack;
- } else {
- /*
- * Netstat, unfortunately, uses this to
- * get send/receive queue sizes. How to fix?
- * Why not compute the difference only?
- */
- tce6.tcp6ConnEntryInfo.ce_snxt =
- tcp->tcp_snxt - tcp->tcp_suna;
- tce6.tcp6ConnEntryInfo.ce_suna = 0;
- tce6.tcp6ConnEntryInfo.ce_rnxt =
- tcp->tcp_rnxt - tcp->tcp_rack;
- tce6.tcp6ConnEntryInfo.ce_rack = 0;
- }
- tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd;
- tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd;
- tce6.tcp6ConnEntryInfo.ce_rto = tcp->tcp_rto;
- tce6.tcp6ConnEntryInfo.ce_mss = tcp->tcp_mss;
- tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state;
+ tcp_set_conninfo(tcp, &tce6.tcp6ConnEntryInfo,
+ ispriv);
tce6.tcp6ConnCreationProcess =
(connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS :
@@ -307,37 +321,9 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl, boolean_t legacy_req)
}
tce.tcpConnLocalPort = ntohs(connp->conn_lport);
tce.tcpConnRemPort = ntohs(connp->conn_fport);
- /* Don't want just anybody seeing these... */
- if (ispriv) {
- tce.tcpConnEntryInfo.ce_snxt =
- tcp->tcp_snxt;
- tce.tcpConnEntryInfo.ce_suna =
- tcp->tcp_suna;
- tce.tcpConnEntryInfo.ce_rnxt =
- tcp->tcp_rnxt;
- tce.tcpConnEntryInfo.ce_rack =
- tcp->tcp_rack;
- } else {
- /*
- * Netstat, unfortunately, uses this to
- * get send/receive queue sizes. How
- * to fix?
- * Why not compute the difference only?
- */
- tce.tcpConnEntryInfo.ce_snxt =
- tcp->tcp_snxt - tcp->tcp_suna;
- tce.tcpConnEntryInfo.ce_suna = 0;
- tce.tcpConnEntryInfo.ce_rnxt =
- tcp->tcp_rnxt - tcp->tcp_rack;
- tce.tcpConnEntryInfo.ce_rack = 0;
- }
- tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd;
- tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd;
- tce.tcpConnEntryInfo.ce_rto = tcp->tcp_rto;
- tce.tcpConnEntryInfo.ce_mss = tcp->tcp_mss;
- tce.tcpConnEntryInfo.ce_state =
- tcp->tcp_state;
+ tcp_set_conninfo(tcp, &tce.tcpConnEntryInfo,
+ ispriv);
tce.tcpConnCreationProcess =
(connp->conn_cpid < 0) ?
diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
index 72997de24a..caf7aeda50 100644
--- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c
+++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
@@ -608,7 +608,7 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
- BUMP_LOCAL(tcp->tcp_ibsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCInSegs);
DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
flags = (unsigned int)tcpha->tha_flags & 0xFF;
@@ -794,6 +794,8 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
TCPS_BUMP_MIB(tcps, tcpInClosed);
TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
+ tcp->tcp_cs.tcp_in_data_inorder_segs++;
+ tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
}
if (flags & TH_RST) {
(void) tcp_clean_death(tcp, 0);
diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c
index e3dba42c9b..81cf5c57a5 100644
--- a/usr/src/uts/common/inet/tcp/tcp_timers.c
+++ b/usr/src/uts/common/inet/tcp/tcp_timers.c
@@ -23,7 +23,7 @@
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2011 Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -594,7 +594,7 @@ tcp_ack_timer(void *arg)
mp = tcp_ack_mp(tcp);
if (mp != NULL) {
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
tcp_send_data(tcp, mp);
@@ -751,15 +751,14 @@ tcp_timer(void *arg)
case TCPS_LAST_ACK:
/* If we have data to rexmit */
if (tcp->tcp_suna != tcp->tcp_snxt) {
- clock_t time_to_wait;
+ clock_t time_to_wait;
TCPS_BUMP_MIB(tcps, tcpTimRetrans);
if (!tcp->tcp_xmit_head)
break;
- time_to_wait = ddi_get_lbolt() -
- (clock_t)tcp->tcp_xmit_head->b_prev;
- time_to_wait = tcp->tcp_rto -
- TICK_TO_MSEC(time_to_wait);
+ time_to_wait = NSEC2MSEC(gethrtime() -
+ (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
+ time_to_wait = tcp->tcp_rto - time_to_wait;
/*
* If the timer fires too early, 1 clock tick earlier,
* restart the timer.
@@ -854,6 +853,7 @@ tcp_timer(void *arg)
tcp->tcp_swnd++;
tcp->tcp_zero_win_probe = B_TRUE;
TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
+ tcp->tcp_cs.tcp_out_zwnd_probes++;
} else {
/*
* Handle timeout from sender SWS avoidance.
@@ -1012,8 +1012,8 @@ tcp_timer(void *arg)
* window probe.
*/
if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
- tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
- (tcp->tcp_rtt_sa >> 5);
+ tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
+ (tcp->tcp_rtt_sa >> 5);
tcp->tcp_rtt_sa = 0;
tcp_ip_notify(tcp);
tcp->tcp_rtt_update = 0;
@@ -1022,24 +1022,14 @@ tcp_timer(void *arg)
timer_rexmit:
tcp->tcp_timer_backoff++;
- if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
- tcp->tcp_rto_min) {
- /*
- * This means the original RTO is tcp_rexmit_interval_min.
- * So we will use tcp_rexmit_interval_min as the RTO value
- * and do the backoff.
- */
- ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
- } else {
- ms <<= tcp->tcp_timer_backoff;
- }
+ /*
+ * Calculate the backed off retransmission timeout. If the shift brings
+ * us back over the max, then we repin the value, and decrement the
+ * backoff to avoid overflow.
+ */
+ ms = tcp_calculate_rto(tcp, tcps, 0) << tcp->tcp_timer_backoff;
if (ms > tcp->tcp_rto_max) {
ms = tcp->tcp_rto_max;
- /*
- * ms is at max, decrement tcp_timer_backoff to avoid
- * overflow.
- */
tcp->tcp_timer_backoff--;
}
tcp->tcp_ms_we_have_waited += ms;
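Distilled, the backoff logic above reduces to the following (a sketch
with hypothetical names, not the kernel code itself):

    static clock_t
    backed_off_rto(clock_t base_rto, clock_t rto_max, int *backoff)
    {
            clock_t ms;

            (*backoff)++;
            ms = base_rto << *backoff;
            if (ms > rto_max) {
                    ms = rto_max;   /* pin at the maximum... */
                    (*backoff)--;   /* ...and keep the shift bounded */
            }
            return (ms);
    }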
@@ -1059,8 +1049,9 @@ timer_rexmit:
if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
mss = tcp->tcp_swnd;
- if ((mp = tcp->tcp_xmit_head) != NULL)
- mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ if ((mp = tcp->tcp_xmit_head) != NULL) {
+ mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+ }
mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
B_TRUE);
@@ -1091,6 +1082,8 @@ timer_rexmit:
tcp->tcp_csuna = tcp->tcp_snxt;
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
+ tcp->tcp_cs.tcp_out_retrans_segs++;
+ tcp->tcp_cs.tcp_out_retrans_bytes += mss;
tcp_send_data(tcp, mp);
}
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 4ef1886bae..d2e24a71fb 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -20,9 +20,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#ifndef _INET_TCP_IMPL_H
@@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls;
* by setting it to 0.
*/
#define TCP_XMIT_LOWATER 4096
-#define TCP_XMIT_HIWATER 49152
+#define TCP_XMIT_HIWATER 128000
#define TCP_RECV_LOWATER 2048
-#define TCP_RECV_HIWATER 128000
+#define TCP_RECV_HIWATER 1048576
/*
* Bind hash list size and has function. It has to be a power of 2 for
@@ -300,17 +300,6 @@ typedef struct tcp_squeue_priv_s {
}
/*
- * Set tcp_rto with boundary checking.
- */
-#define TCP_SET_RTO(tcp, rto) \
- if ((rto) < (tcp)->tcp_rto_min) \
- (tcp)->tcp_rto = (tcp)->tcp_rto_min; \
- else if ((rto) > (tcp)->tcp_rto_max) \
- (tcp)->tcp_rto = (tcp)->tcp_rto_max; \
- else \
- (tcp)->tcp_rto = (rto);
-
-/*
* TCP options struct returned from tcp_parse_options.
*/
typedef struct tcp_opt_s {
@@ -406,6 +395,22 @@ typedef struct tcp_listen_cnt_s {
uint32_t tlc_drop;
} tcp_listen_cnt_t;
+/*
+ * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT.
+ * - tcprg_lock: Protects the other fields
+ * - tcprg_size: Allocated size (in entries) of tcprg_members array
+ * - tcprg_count: Count of occupied tcprg_members slots
+ * - tcprg_active: Count of members which still have SO_REUSEPORT set
+ * - tcprg_members: Connections associated with address/port group
+ */
+typedef struct tcp_rg_s {
+ kmutex_t tcprg_lock;
+ unsigned int tcprg_size;
+ unsigned int tcprg_count;
+ unsigned int tcprg_active;
+ tcp_t **tcprg_members;
+} tcp_rg_t;
+
#define TCP_TLC_REPORT_INTERVAL (30 * MINUTES)
#define TCP_DECR_LISTEN_CNT(tcp) \
@@ -574,6 +579,61 @@ extern uint32_t tcp_early_abort;
#define tcps_reass_timeout tcps_propinfo_tbl[59].prop_cur_uval
#define tcps_iss_incr tcps_propinfo_tbl[65].prop_cur_uval
+
+/*
+ * As defined in RFC 6298, the RTO is the average estimates (SRTT) plus a
+ * multiple of the deviation estimates (K * RTTVAR):
+ *
+ * RTO = SRTT + max(G, K * RTTVAR)
+ *
+ * K is defined in the RFC as 4, and G is the clock granularity. We constrain
+ * the minimum mean deviation to TCP_SD_MIN when processing new RTTs, so this
+ * becomes:
+ *
+ * RTO = SRTT + 4 * RTTVAR
+ *
+ * In practice, however, we make several additions to it. As we use a finer
+ * grained clock than BSD and update RTO for every ACK, we add in another 1/4 of
+ * RTT to the deviation of RTO to accommodate burstiness of 1/4 of window size:
+ *
+ * RTO = SRTT + (SRTT / 4) + 4 * RTTVAR
+ *
+ * Since tcp_rtt_sa is 8 times the SRTT, and tcp_rtt_sd is 4 times the RTTVAR,
+ * this becomes:
+ *
+ * RTO = (tcp_rtt_sa / 8) + ((tcp_rtt_sa / 8) / 4) + tcp_rtt_sd
+ * RTO = (tcp_rtt_sa / 2^3) + (tcp_rtt_sa / 2^5) + tcp_rtt_sd
+ * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd
+ *
+ * The "tcp_rexmit_interval_extra" and "tcp_conn_grace_period" tunables are
+ * used to help account for extreme environments where the algorithm fails to
+ * work; by default they should be 0. (The latter tunable is only used for
+ * calculating the initial RTO, and so is optionally passed in as "extra".) We
+ * add them here:
+ *
+ * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd +
+ * tcps_rexmit_interval_extra + tcps_conn_grace_period
+ *
+ * We then pin the RTO within our configured boundaries (sections 2.4 and 2.5
+ * of RFC 6298).
+ */
+static __GNU_INLINE clock_t
+tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra)
+{
+ clock_t rto;
+
+ rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) +
+ tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra + extra;
+
+ if (rto < tcp->tcp_rto_min) {
+ rto = tcp->tcp_rto_min;
+ } else if (rto > tcp->tcp_rto_max) {
+ rto = tcp->tcp_rto_max;
+ }
+
+ return (rto);
+}
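As a worked example (hypothetical numbers): with SRTT = 100 ms the
code keeps tcp_rtt_sa = 8 * SRTT = 800 ms (stored in nanoseconds), and
with the mean deviation pinned at its TCP_SD_MIN floor, tcp_rtt_sd =
400 ms. The computation then yields (800 / 8) + (800 / 32) + 400 =
100 + 25 + 400 = 525 ms, plus the tunables, before being clamped to
[tcp_rto_min, tcp_rto_max].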
+
extern struct qinit tcp_rinitv4, tcp_rinitv6;
extern boolean_t do_tcp_fusion;
@@ -632,7 +692,7 @@ extern int tcp_rwnd_set(tcp_t *, uint32_t);
extern int tcp_set_destination(tcp_t *);
extern void tcp_set_ws_value(tcp_t *);
extern void tcp_stop_lingering(tcp_t *);
-extern void tcp_update_pmtu(tcp_t *, boolean_t);
+extern boolean_t tcp_update_pmtu(tcp_t *, boolean_t);
extern mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
extern boolean_t tcp_zcopy_check(tcp_t *);
extern void tcp_zcopy_notify(tcp_t *);
@@ -649,6 +709,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *,
int, boolean_t, boolean_t, boolean_t);
extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *,
boolean_t);
+extern tcp_rg_t *tcp_rg_init(tcp_t *);
+extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *);
+extern void tcp_rg_destroy(tcp_rg_t *);
+extern void tcp_rg_setactive(tcp_rg_t *, boolean_t);
/*
* Fusion related functions in tcp_fusion.c.
diff --git a/usr/src/uts/common/inet/tcp_stats.h b/usr/src/uts/common/inet/tcp_stats.h
index 487d0d3414..704102e9d6 100644
--- a/usr/src/uts/common/inet/tcp_stats.h
+++ b/usr/src/uts/common/inet/tcp_stats.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
#ifndef _INET_TCP_STATS_H
@@ -205,6 +206,26 @@ typedef struct {
tcp_stat_counter_t tcp_sc_stats;
} tcp_stats_cpu_t;
+/*
+ * Per-connection statistics. Some of these are also kept globally in the
+ * per-cpu tcp_sc_mib entry (see tcp_stats_cpu_t above). We need not maintain
+ * per-cpu versions of these stats since a connection is typically processed
+ * on the same CPU.
+ */
+typedef struct tcp_conn_stats {
+ uint64_t tcp_in_data_inorder_bytes;
+ uint64_t tcp_in_data_inorder_segs;
+ uint64_t tcp_in_data_unorder_bytes;
+ uint64_t tcp_in_data_unorder_segs;
+ uint64_t tcp_in_zwnd_probes;
+
+ uint64_t tcp_out_data_bytes;
+ uint64_t tcp_out_data_segs;
+ uint64_t tcp_out_retrans_bytes;
+ uint64_t tcp_out_retrans_segs;
+ uint64_t tcp_out_zwnd_probes;
+} tcp_conn_stats_t;
+
#define TCPS_BUMP_MIB(tcps, x) \
BUMP_MIB(&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_mib, x)
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index d233ea14de..165adcb852 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -22,6 +22,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2018, Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -76,7 +77,8 @@
#include <inet/ipclassifier.h>
#include <sys/squeue_impl.h>
#include <inet/ipnet.h>
-#include <sys/ethernet.h>
+#include <sys/vxlan.h>
+#include <inet/inet_hash.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
@@ -346,6 +348,85 @@ void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
typedef union T_primitives *t_primp_t;
/*
+ * Various protocols that encapsulate UDP have no real use for the source port.
+ * Instead, they want to vary the source port to provide better equal-cost
+ * multipathing and other systems that use fanout. Consider something like
+ * VXLAN. If you're actually sending multiple different streams to a single
+ * host, if you don't vary the source port, then the tuple of ( SRC IP, DST IP,
+ * SRC Port, DST Port) will always be the same.
+ *
+ * Here, we return a port to hash this to, if we know how to hash it. If for
+ * some reason we can't perform an L4 hash, then we just return the default
+ * value, usually the default port. After we determine the hash we transform it
+ * so that it's in the range of [ min, max ].
+ *
+ * We'd like to avoid a pull up for the sake of performing the hash. If the
+ * first mblk_t doesn't have the full protocol header, then we just send it to
+ * the default. If for some reason we have an encapsulated packet that has its
+ * protocol header in different parts of an mblk_t, then we'll go with the
+ * default port. This means that if a driver isn't consistent about how it
+ * generates the frames for a given flow, it will not always be consistently
+ * hashed. That should be an uncommon event.
+ */
+uint16_t
+udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max,
+ uint16_t def)
+{
+ size_t szused = 0;
+ ip6_t *ip6h;
+ ipha_t *ipha;
+ uint16_t sap;
+ uint64_t hash;
+ uint32_t mod;
+
+ ASSERT(min <= max);
+
+ if (type != UDP_HASH_VXLAN)
+ return (def);
+
+ if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)))
+ return (def);
+
+ /*
+ * The following logic is VXLAN specific to get at the header, if we
+ * have formats, eg. GENEVE, then we should ignore this.
+ *
+ * The kernel overlay device often puts a first mblk_t for the data
+ * which is just the encap. If so, then we're going to use that and try
+ * to avoid a pull up.
+ */
+ if (MBLKL(mp) == VXLAN_HDR_LEN) {
+ if (mp->b_cont == NULL)
+ return (def);
+ mp = mp->b_cont;
+ } else if (MBLKL(mp) < VXLAN_HDR_LEN) {
+ return (def);
+ } else {
+ szused = VXLAN_HDR_LEN;
+ }
+
+ /* Can we hold a MAC header? */
+ if (MBLKL(mp) + szused < sizeof (struct ether_header))
+ return (def);
+
+ /*
+ * We need to lie about the starting offset into the message block for
+ * convenience. Undo it at the end. We know that inet_pkt_hash() won't
+ * modify the mblk_t.
+ */
+ mp->b_rptr += szused;
+ hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 |
+ INET_PKT_HASH_L3 | INET_PKT_HASH_L4);
+ mp->b_rptr -= szused;
+
+ if (hash == 0)
+ return (def);
+
+ mod = max - min + 1;
+ return ((hash % mod) + min);
+}
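A consumer such as a VXLAN overlay device would opt in roughly as
follows (a sketch; the option is OP_CONFIG, so the caller needs the
privilege checked by secpolicy_ip_config(), and UDP_HASH_VXLAN is the
only hash type understood here):

    int hash = UDP_HASH_VXLAN;

    /* Vary the encap source port for better ECMP/fanout spread. */
    if (setsockopt(fd, IPPROTO_UDP, UDP_SRCPORT_HASH,
        &hash, sizeof (hash)) != 0)
            perror("UDP_SRCPORT_HASH");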
+
+/*
* Return the next anonymous port in the privileged port range for
* bind checking.
*
@@ -1584,6 +1665,16 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name,
*i1 = udp->udp_rcvhdr ? 1 : 0;
mutex_exit(&connp->conn_lock);
return (sizeof (int));
+ case UDP_SRCPORT_HASH:
+ mutex_enter(&connp->conn_lock);
+ *i1 = udp->udp_vxlanhash;
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (int));
+ case UDP_SND_TO_CONNECTED:
+ mutex_enter(&connp->conn_lock);
+ *i1 = udp->udp_snd_to_conn ? 1 : 0;
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (int));
}
}
mutex_enter(&connp->conn_lock);
@@ -1719,6 +1810,31 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
udp->udp_rcvhdr = onoff;
mutex_exit(&connp->conn_lock);
return (0);
+ case UDP_SRCPORT_HASH:
+ /*
+ * This should have already been verified, but double
+ * check.
+ */
+ if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
+ return (error);
+ }
+
+ /* First see if the val is something we understand */
+ if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN)
+ return (EINVAL);
+
+ if (!checkonly) {
+ mutex_enter(&connp->conn_lock);
+ udp->udp_vxlanhash = *i1;
+ mutex_exit(&connp->conn_lock);
+ }
+ /* Fully handled this option. */
+ return (0);
+ case UDP_SND_TO_CONNECTED:
+ mutex_enter(&connp->conn_lock);
+ udp->udp_snd_to_conn = onoff;
+ mutex_exit(&connp->conn_lock);
+ return (0);
}
break;
}
@@ -2002,13 +2118,25 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
uint32_t cksum;
udp_t *udp = connp->conn_udp;
boolean_t insert_spi = udp->udp_nat_t_endpoint;
+ boolean_t hash_srcport = udp->udp_vxlanhash;
uint_t ulp_hdr_len;
+ uint16_t srcport;
data_len = msgdsize(data_mp);
ulp_hdr_len = UDPH_SIZE;
if (insert_spi)
ulp_hdr_len += sizeof (uint32_t);
+ /*
+ * If we have source port hashing going on, determine the hash before
+ * we modify the mblk_t.
+ */
+ if (hash_srcport == B_TRUE) {
+ srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN,
+ IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
+ ntohs(connp->conn_lport));
+ }
+
mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo,
ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp);
if (mp == NULL) {
@@ -2020,7 +2148,11 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length);
- udpha->uha_src_port = connp->conn_lport;
+ if (hash_srcport == B_TRUE) {
+ udpha->uha_src_port = htons(srcport);
+ } else {
+ udpha->uha_src_port = connp->conn_lport;
+ }
udpha->uha_dst_port = dstport;
udpha->uha_checksum = 0;
udpha->uha_length = htons(data_len);
@@ -3195,6 +3327,7 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
udp_t *udp = connp->conn_udp;
udp_stack_t *us = udp->udp_us;
boolean_t insert_spi = udp->udp_nat_t_endpoint;
+ boolean_t hash_srcport = udp->udp_vxlanhash;
uint_t pktlen;
uint_t alloclen;
uint_t copylen;
@@ -3203,10 +3336,21 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
udpha_t *udpha;
uint32_t cksum;
ip_pkt_t *ipp;
+ uint16_t srcport;
ASSERT(MUTEX_HELD(&connp->conn_lock));
/*
+ * If we have source port hashing going on, determine the hash before
+ * we modify the mblk_t.
+ */
+ if (hash_srcport == B_TRUE) {
+ srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN,
+ IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
+ ntohs(connp->conn_lport));
+ }
+
+ /*
* Copy the header template and leave space for an SPI
*/
copylen = connp->conn_ht_iphc_len;
@@ -3304,6 +3448,9 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
*((uint32_t *)(udpha + 1)) = 0;
udpha->uha_dst_port = dstport;
+ if (hash_srcport == B_TRUE)
+ udpha->uha_src_port = htons(srcport);
+
return (mp);
}
@@ -5952,10 +6099,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
else
return (error);
}
- if (udp->udp_state == TS_DATA_XFER) {
+
+ /*
+ * Check if we're allowed to send to a connection on which we've
+ * already called 'connect'. The posix spec. allows both behaviors but
+ * historically we've returned an error if already connected. The
+ * client can allow this via a sockopt.
+ */
+ if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) {
UDPS_BUMP_MIB(us, udpOutErrors);
return (EISCONN);
}
+
error = proto_verify_ip_addr(connp->conn_family,
(struct sockaddr *)msg->msg_name, msg->msg_namelen);
if (error != 0) {
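The opt-in path looks roughly like this (a sketch; without the option
the sendto() below would fail with EISCONN, and since the option is
OP_CONFIG, setting it requires configuration privilege):

    int on = 1;

    (void) connect(fd, (struct sockaddr *)&peer, sizeof (peer));
    (void) setsockopt(fd, IPPROTO_UDP, UDP_SND_TO_CONNECTED,
        &on, sizeof (on));

    /* An explicit destination is now allowed despite the connect(). */
    (void) sendto(fd, buf, len, 0,
        (struct sockaddr *)&other, sizeof (other));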
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index c279bb4a21..847e2cdde6 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -292,6 +293,9 @@ opdes_t udp_opt_arr[] = {
},
{ UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int),
0 },
+{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 },
+{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
+ 0 }
};
/*
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 1e5204bb15..ef11973707 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _UDP_IMPL_H
@@ -178,8 +179,12 @@ typedef struct udp_s {
udp_issocket : 1, /* socket mode; sockfs is on top */
udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */
udp_rcvhdr : 1, /* UDP_RCVHDR option */
+ udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */
+ /* Because there's only VXLAN, cheat */
+ /* and only use a single bit */
+ udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */
- udp_pad_to_bit_31 : 29;
+ udp_pad_to_bit_31 : 27;
/* Following 2 fields protected by the uf_lock */
struct udp_s *udp_bind_hash; /* Bind hash chain */
diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c
index 7e930c89e8..c5d6f09b0c 100644
--- a/usr/src/uts/common/io/aggr/aggr_grp.c
+++ b/usr/src/uts/common/io/aggr/aggr_grp.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -32,39 +32,69 @@
* module. The hash key is the linkid associated with the link
* aggregation group.
*
- * A set of MAC ports are associated with each association group.
+ * Each aggregation contains a set of ports. The port is represented
+ * by the aggr_port_t structure. A port consists of a single MAC
+ * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
+ * MAC. This client is used by the aggr to send and receive LACP
+ * traffic. Each port client takes on the same MAC unicast address --
+ * the address of the aggregation itself (taken from the first port by
+ * default).
*
- * Aggr pseudo TX rings
- * --------------------
- * The underlying ports (NICs) in an aggregation can have TX rings. To
- * enhance aggr's performance, these TX rings are made available to the
- * aggr layer as pseudo TX rings. The concept of pseudo rings are not new.
- * They are already present and implemented on the RX side. It is called
- * as pseudo RX rings. The same concept is extended to the TX side where
- * each TX ring of an underlying port is reflected in aggr as a pseudo
- * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
- * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
- * TX ring is given to the aggregation layer.
+ * The MAC client that hangs off each aggr port is not your typical
+ * MAC client. Not only does it have exclusive control of the MAC, but
+ * it also has no Tx or Rx SRSes. An SRS is designed to queue and
+ * fanout traffic among L4 protocols; but the aggr is an intermediary,
+ * not a consumer. Instead of using SRSes, the aggr puts the
+ * underlying hardware rings into passthru mode and ships packets up
+ * via a direct call to aggr_recv_cb(). This allows aggr to enforce
+ * LACP while passing all other traffic up to clients of the aggr.
+ *
+ * Pseudo Rx Groups and Rings
+ * --------------------------
+ *
+ * It is imperative for client performance that the aggr provide as
+ * many MAC groups as possible. In order to use the underlying HW
+ * resources, aggr creates pseudo groups to aggregate the underlying
+ * HW groups. Every HW group gets mapped to a pseudo group; and every
+ * HW ring in that group gets mapped to a pseudo ring. The pseudo
+ * group at index 0 combines all the HW groups at index 0 from each
+ * port, etc. The aggr's MAC then creates normal MAC groups and rings
+ * out of these pseudo groups and rings to present to the aggr's
+ * clients. To the clients, the aggr's groups and rings are absolutely
+ * no different than a NIC's groups or rings.
+ *
+ * Pseudo Tx Rings
+ * ---------------
+ *
+ * The underlying ports (NICs) in an aggregation can have Tx rings. To
+ * enhance aggr's performance, these Tx rings are made available to
+ * the aggr layer as pseudo Tx rings. The concept of pseudo rings are
+ * not new. They are already present and implemented on the Rx side.
+ * The same concept is extended to the Tx side where each Tx ring of
+ * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
+ * each pseudo Tx ring will map to a specific hardware Tx ring. Even
+ * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
+ * is given to the aggregation layer.
*
* With this change, the outgoing stack depth looks much better:
*
* mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
* mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx()
*
- * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
+ * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
* SRS_TX_AGGR and SRS_TX_BW_AGGR.
*
* In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
- * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
+ * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
* ring belonging to a port on which the packet has to be sent.
* aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
- * policy and then uses the fanout_hint passed to it to pick a TX ring from
+ * policy and then uses the fanout_hint passed to it to pick a Tx ring from
* the selected port.
*
* In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
* bandwidth limit is applied first on the outgoing packet and the packets
* allowed to go out would call mac_tx_aggr_mode() to send the packet on a
- * particular TX ring.
+ * particular Tx ring.
*/
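Conceptually, the folding of HW groups into pseudo groups can be
sketched as follows (hypothetical types and names; the real work is
done by aggr_add_pseudo_rx_group() below):

    /* Pseudo group g aggregates HW group g from every port. */
    for (uint_t p = 0; p < nports; p++) {
            for (uint_t g = 0; g < port[p].nhwgroups; g++) {
                    pseudo_grp_t *pg = &aggr->pseudo_groups[g];

                    for (uint_t r = 0; r < port[p].hwgroup[g].nrings; r++)
                            add_pseudo_ring(pg, &port[p].hwgroup[g].ring[r]);
            }
    }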
#include <sys/types.h>
@@ -121,9 +151,12 @@ static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
-static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
+static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
+static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
+static int aggr_addvlan(mac_group_driver_t, uint16_t);
+static int aggr_remvlan(mac_group_driver_t, uint16_t);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
const int, mac_ring_info_t *, mac_ring_handle_t);
@@ -324,6 +357,7 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
return (B_FALSE);
}
+ mutex_enter(&grp->lg_stat_lock);
if (grp->lg_ifspeed == 0) {
/*
* The group inherits the speed of the first link being
@@ -337,8 +371,10 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
* the group link speed, as per 802.3ad. Since it is
* not, the attach is cancelled.
*/
+ mutex_exit(&grp->lg_stat_lock);
return (B_FALSE);
}
+ mutex_exit(&grp->lg_stat_lock);
grp->lg_nattached_ports++;
@@ -347,7 +383,9 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
*/
if (grp->lg_link_state != LINK_STATE_UP) {
grp->lg_link_state = LINK_STATE_UP;
+ mutex_enter(&grp->lg_stat_lock);
grp->lg_link_duplex = LINK_DUPLEX_FULL;
+ mutex_exit(&grp->lg_stat_lock);
link_state_changed = B_TRUE;
}
@@ -359,9 +397,13 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
aggr_grp_multicst_port(port, B_TRUE);
/*
- * Set port's receive callback
+ * The port client doesn't have an Rx SRS; instead of calling
+ * mac_rx_set() we set the client's flow callback directly.
+ * This datapath is used only when the port's driver doesn't
+ * support MAC_CAPAB_RINGS. Drivers with ring support will
+ * deliver traffic to the aggr via ring passthru.
*/
- mac_rx_set(port->lp_mch, aggr_recv_cb, port);
+ mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
/*
* If LACP is OFF, the port can be used to send data as soon
@@ -391,7 +433,7 @@ aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
return (B_FALSE);
- mac_rx_clear(port->lp_mch);
+ mac_client_clear_flow_cb(port->lp_mch);
aggr_grp_multicst_port(port, B_FALSE);
@@ -405,9 +447,11 @@ aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
grp->lg_nattached_ports--;
if (grp->lg_nattached_ports == 0) {
/* the last attached MAC port of the group is being detached */
- grp->lg_ifspeed = 0;
grp->lg_link_state = LINK_STATE_DOWN;
+ mutex_enter(&grp->lg_stat_lock);
+ grp->lg_ifspeed = 0;
grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
+ mutex_exit(&grp->lg_stat_lock);
link_state_changed = B_TRUE;
}
@@ -528,26 +572,27 @@ aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
zoneid_t port_zoneid = ALL_ZONES;
int err;
- /* The port must be int the same zone as the aggregation. */
+ /* The port must be in the same zone as the aggregation. */
if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
port_zoneid = GLOBAL_ZONEID;
if (grp->lg_zoneid != port_zoneid)
return (EBUSY);
/*
- * lg_mh could be NULL when the function is called during the creation
- * of the aggregation.
+ * If we are creating the aggr, then there is no MAC handle
+ * and thus no perimeter to hold. If we are adding a port to
+ * an existing aggr, then the perimeter of the aggr's MAC must
+ * be held.
*/
ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
- /* create new port */
err = aggr_port_create(grp, port_linkid, force, &port);
if (err != 0)
return (err);
mac_perim_enter_by_mh(port->lp_mh, &mph);
- /* add port to list of group constituent ports */
+ /* Add the new port to the end of the list. */
cport = &grp->lg_ports;
while (*cport != NULL)
cport = &((*cport)->lp_next);
@@ -629,6 +674,7 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port,
ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
ring->arr_hw_rh = hw_rh;
ring->arr_port = port;
+ ring->arr_grp = rx_grp;
rx_grp->arg_ring_cnt++;
/*
@@ -639,10 +685,15 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port,
ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
ring->arr_hw_rh = NULL;
ring->arr_port = NULL;
+ ring->arr_grp = NULL;
rx_grp->arg_ring_cnt--;
} else {
- mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
- mac_find_ring(rx_grp->arg_gh, j));
+ /*
+ * This must run after the MAC is registered.
+ */
+ ASSERT3P(ring->arr_rh, !=, NULL);
+ mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
+ (void *)port, (mac_resource_handle_t)ring);
}
return (err);
}
@@ -653,11 +704,9 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port,
static void
aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
{
- aggr_pseudo_rx_ring_t *ring;
- int j;
+ for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
+ aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
- for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
- ring = rx_grp->arg_rings + j;
if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
ring->arr_hw_rh != hw_rh) {
continue;
@@ -668,134 +717,140 @@ aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
ring->arr_hw_rh = NULL;
ring->arr_port = NULL;
+ ring->arr_grp = NULL;
rx_grp->arg_ring_cnt--;
- mac_hwring_teardown(hw_rh);
+ mac_hwring_clear_passthru(hw_rh);
break;
}
}
/*
- * This function is called to create pseudo rings over the hardware rings of
- * the underlying device. Note that there is a 1:1 mapping between the pseudo
- * RX rings of the aggr and the hardware rings of the underlying port.
+ * Create pseudo rings over the HW rings of the port.
+ *
+ * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
+ *
+ * o Program existing unicast filters on the pseudo group into the HW group.
+ *
+ * o Program existing VLAN filters on the pseudo group into the HW group.
*/
static int
aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
{
- aggr_grp_t *grp = port->lp_grp;
mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
aggr_unicst_addr_t *addr, *a;
mac_perim_handle_t pmph;
- int hw_rh_cnt, i = 0, j;
+ aggr_vlan_t *avp;
+ uint_t hw_rh_cnt, i;
int err = 0;
+ uint_t g_idx = rx_grp->arg_index;
- ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
+ ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
mac_perim_enter_by_mh(port->lp_mh, &pmph);
/*
- * This function must be called after the aggr registers its mac
- * and its RX group has been initialized.
+ * This function must be called after the aggr registers its
+ * MAC and its Rx groups have been initialized.
*/
ASSERT(rx_grp->arg_gh != NULL);
/*
- * Get the list the the underlying HW rings.
+ * Get the list of the underlying HW rings.
*/
- hw_rh_cnt = mac_hwrings_get(port->lp_mch,
- &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
-
- if (port->lp_hwgh != NULL) {
- /*
- * Quiesce the HW ring and the mac srs on the ring. Note
- * that the HW ring will be restarted when the pseudo ring
- * is started. At that time all the packets will be
- * directly passed up to the pseudo RX ring and handled
- * by mac srs created over the pseudo RX ring.
- */
- mac_rx_client_quiesce(port->lp_mch);
- mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
- }
+ hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
+ &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
/*
- * Add all the unicast addresses to the newly added port.
+ * Add existing VLAN and unicast address filters to the port.
*/
+ for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
+ avp = list_next(&rx_grp->arg_vlans, avp)) {
+ if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
+ goto err;
+ }
+
for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
- if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
- break;
+ if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
+ goto err;
}
- for (i = 0; err == 0 && i < hw_rh_cnt; i++)
+ for (i = 0; i < hw_rh_cnt; i++) {
err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
+ if (err != 0)
+ goto err;
+ }
- if (err != 0) {
- for (j = 0; j < i; j++)
- aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
+ mac_perim_exit(pmph);
+ return (0);
+
+err:
+ ASSERT(err != 0);
+
+ for (uint_t j = 0; j < i; j++)
+ aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
+
+ for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
+ aggr_port_remmac(port, g_idx, a->aua_addr);
- for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
- aggr_port_remmac(port, a->aua_addr);
+ if (avp != NULL)
+ avp = list_prev(&rx_grp->arg_vlans, avp);
- if (port->lp_hwgh != NULL) {
- mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
- mac_rx_client_restart(port->lp_mch);
- port->lp_hwgh = NULL;
+ for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
+ int err2;
+
+ if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
+ cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
+ ": errno %d.", avp->av_vid,
+ mac_client_name(port->lp_mch), err2);
}
- } else {
- port->lp_rx_grp_added = B_TRUE;
}
-done:
+
+ port->lp_hwghs[g_idx] = NULL;
mac_perim_exit(pmph);
return (err);
}
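/*
 * Editorial sketch (not part of this patch): the error handling above
 * follows the common unwind idiom -- each partially applied step is
 * reverted in reverse order under a single err label.  A minimal,
 * self-contained shape of it, with hypothetical step/undo helpers:
 */
extern int step_a(void), step_b(void), step_c(void);	/* hypothetical */
extern void undo_a(void), undo_b(void);			/* hypothetical */

static int
setup_sketch(void)
{
	int err;

	if ((err = step_a()) != 0)
		return (err);
	if ((err = step_b()) != 0)
		goto unwind_a;
	if ((err = step_c()) != 0)
		goto unwind_b;
	return (0);

unwind_b:
	undo_b();
unwind_a:
	undo_a();
	return (err);
}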
/*
- * This function is called by aggr to remove pseudo RX rings over the
- * HW rings of the underlying port.
+ * Destroy the pseudo rings mapping to this port and remove all VLAN
+ * and unicast filters from this port. Even if there are no underlying
+ * HW rings we must still remove the unicast filters to take the port
+ * out of promisc mode.
*/
static void
aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
{
- aggr_grp_t *grp = port->lp_grp;
mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
aggr_unicst_addr_t *addr;
- mac_group_handle_t hwgh;
mac_perim_handle_t pmph;
- int hw_rh_cnt, i;
+ uint_t hw_rh_cnt;
+ uint_t g_idx = rx_grp->arg_index;
- ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
+ ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
+ ASSERT3P(rx_grp->arg_gh, !=, NULL);
mac_perim_enter_by_mh(port->lp_mh, &pmph);
- if (!port->lp_rx_grp_added)
- goto done;
-
- ASSERT(rx_grp->arg_gh != NULL);
- hw_rh_cnt = mac_hwrings_get(port->lp_mch,
- &hwgh, hw_rh, MAC_RING_TYPE_RX);
+ hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
+ MAC_RING_TYPE_RX);
- /*
- * If hw_rh_cnt is 0, it means that the underlying port does not
- * support RX rings. Directly return in this case.
- */
- for (i = 0; i < hw_rh_cnt; i++)
+ for (uint_t i = 0; i < hw_rh_cnt; i++)
aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
- aggr_port_remmac(port, addr->aua_addr);
+ aggr_port_remmac(port, g_idx, addr->aua_addr);
- if (port->lp_hwgh != NULL) {
- port->lp_hwgh = NULL;
+ for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
+ avp = list_next(&rx_grp->arg_vlans, avp)) {
+ int err;
- /*
- * First clear the permanent-quiesced flag of the RX srs then
- * restart the HW ring and the mac srs on the ring. Note that
- * the HW ring and associated SRS will soon been removed when
- * the port is removed from the aggr.
- */
- mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
- mac_rx_client_restart(port->lp_mch);
+ if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
+ cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
+ ": errno %d.", avp->av_vid,
+ mac_client_name(port->lp_mch), err);
+ }
}
- port->lp_rx_grp_added = B_FALSE;
-done:
+ port->lp_hwghs[g_idx] = NULL;
mac_perim_exit(pmph);
}
@@ -899,8 +954,8 @@ aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
/*
* Get the list of the underlying HW rings.
*/
- hw_rh_cnt = mac_hwrings_get(port->lp_mch,
- NULL, hw_rh, MAC_RING_TYPE_TX);
+ hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
+ MAC_RING_TYPE_TX);
/*
* Even if the underlying NIC does not have TX rings, we
@@ -1006,21 +1061,45 @@ aggr_pseudo_enable_intr(mac_intr_handle_t ih)
}
/*
- * Here we need to start the pseudo-ring. As MAC already ensures that the
- * underlying device is set up, all we need to do is save the ring generation.
- *
- * Note, we don't end up wanting to use the underlying mac_hwring_start/stop
- * functions here as those don't actually stop and start the ring, they just
- * quiesce the ring. Regardless of whether the aggr is logically up or not, we
- * want to make sure that we can receive traffic for LACP.
+ * Start the pseudo ring. Since the pseudo ring is just an abstraction
+ * over an actual HW ring, the real task is to start the underlying HW
+ * ring.
*/
static int
-aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
+aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
{
+ int err;
aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
+ err = mac_hwring_start(rr_ring->arr_hw_rh);
+
+ if (err != 0)
+ return (err);
+
rr_ring->arr_gen = mr_gen;
- return (0);
+ return (err);
+}
+
+/*
+ * Stop the pseudo ring. Since the pseudo ring is just an abstraction
+ * over an actual HW ring, the real task is to stop the underlying HW
+ * ring.
+ */
+static void
+aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
+
+ /*
+ * The rings underlying the default group must stay up to
+ * continue receiving LACP traffic. We would normally never
+ * stop the default Rx rings because of the primary MAC
+ * client; but aggr's primary MAC client doesn't call
+ * mac_unicast_add() and thus mi_active is 0 when the last
+ * non-primary client is deleted.
+ */
+ if (rr_ring->arr_grp->arg_index != 0)
+ mac_hwring_stop(rr_ring->arr_hw_rh);
}
/*
@@ -1030,13 +1109,15 @@ int
aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
laioc_port_t *ports)
{
- int rc, i, nadded = 0;
+ int rc;
+ uint_t port_added = 0;
+ uint_t grp_added;
aggr_grp_t *grp = NULL;
aggr_port_t *port;
boolean_t link_state_changed = B_FALSE;
mac_perim_handle_t mph, pmph;
- /* get group corresponding to linkid */
+ /* Get the aggr corresponding to linkid. */
rw_enter(&aggr_grp_lock, RW_READER);
if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
(mod_hash_val_t *)&grp) != 0) {
@@ -1046,20 +1127,22 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
AGGR_GRP_REFHOLD(grp);
/*
- * Hold the perimeter so that the aggregation won't be destroyed.
+ * Hold the perimeter so that the aggregation can't be destroyed.
*/
mac_perim_enter_by_mh(grp->lg_mh, &mph);
rw_exit(&aggr_grp_lock);
- /* add the specified ports to group */
- for (i = 0; i < nports; i++) {
- /* add port to group */
+ /* Add the specified ports to the aggr. */
+ for (uint_t i = 0; i < nports; i++) {
+ grp_added = 0;
+
if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
force, &port)) != 0) {
goto bail;
}
+
ASSERT(port != NULL);
- nadded++;
+ port_added++;
/* check capabilities */
if (!aggr_grp_capab_check(grp, port) ||
@@ -1076,9 +1159,16 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
if (rc != 0)
goto bail;
- rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
- if (rc != 0)
- goto bail;
+
+ for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
+ rc = aggr_add_pseudo_rx_group(port,
+ &grp->lg_rx_groups[j]);
+
+ if (rc != 0)
+ goto bail;
+
+ grp_added++;
+ }
mac_perim_enter_by_mh(port->lp_mh, &pmph);
@@ -1096,7 +1186,7 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
/*
* Turn on the promiscuous mode over the port when it
* is requested to be turned on to receive the
- * non-primary address over a port, or the promiscous
+ * non-primary address over a port, or the promiscuous
* mode is enabled over the aggr.
*/
if (grp->lg_promisc || port->lp_prom_addr != NULL) {
@@ -1131,17 +1221,33 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
bail:
if (rc != 0) {
/* stop and remove ports that have been added */
- for (i = 0; i < nadded; i++) {
+ for (uint_t i = 0; i < port_added; i++) {
+ uint_t grp_remove;
+
port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
ASSERT(port != NULL);
+
if (grp->lg_started) {
mac_perim_enter_by_mh(port->lp_mh, &pmph);
(void) aggr_port_promisc(port, B_FALSE);
aggr_port_stop(port);
mac_perim_exit(pmph);
}
+
aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
- aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
+
+ /*
+ * Only the last port could have a partial set
+ * of groups added.
+ */
+ grp_remove = (i + 1 == port_added) ? grp_added :
+ grp->lg_rx_group_count;
+
+ for (uint_t j = 0; j < grp_remove; j++) {
+ aggr_rem_pseudo_rx_group(port,
+ &grp->lg_rx_groups[j]);
+ }
+
(void) aggr_grp_rem_port(grp, port, NULL, NULL);
}
}
@@ -1303,7 +1409,8 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
MAX_RINGS_PER_GROUP), KM_SLEEP);
grp->lg_tx_blocked_cnt = 0;
- bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
+ bzero(&grp->lg_rx_groups,
+ sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
aggr_lacp_init_grp(grp);
@@ -1323,11 +1430,48 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
grp->lg_key = key;
for (i = 0; i < nports; i++) {
- err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL);
+ err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
if (err != 0)
goto bail;
}
+ grp->lg_rx_group_count = 1;
+
+ for (i = 0, port = grp->lg_ports; port != NULL;
+ i++, port = port->lp_next) {
+ uint_t num_rgroups;
+
+ mac_perim_enter_by_mh(port->lp_mh, &mph);
+ num_rgroups = mac_get_num_rx_groups(port->lp_mh);
+ mac_perim_exit(mph);
+
+ /*
+ * Utilize all the groups in a port. If some ports
+ * have less groups than others, then traffic destined
+ * for the same unicast address may be HW classified
+ * on some ports but SW classified by aggr when
+ * arriving on other ports.
+ */
+ grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
+ num_rgroups);
+ }
+
+ /*
+ * There could be cases where the hardware provides more
+ * groups than aggr can support. Make sure we never go above
+ * the max aggr can support.
+ */
+ grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
+ MAX_GROUPS_PER_PORT);
+
+ ASSERT3U(grp->lg_rx_group_count, >, 0);
+ for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
+ grp->lg_rx_groups[i].arg_index = i;
+ grp->lg_rx_groups[i].arg_untagged = 0;
+ list_create(&(grp->lg_rx_groups[i].arg_vlans),
+ sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
+ }
+
/*
* If no explicit MAC address was specified by the administrator,
* set it to the MAC address of the first port.
@@ -1345,7 +1489,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
grp->lg_mac_addr_port = grp->lg_ports;
}
- /* set the initial group capabilities */
+ /* Set the initial group capabilities. */
aggr_grp_capab_set(grp);
if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
@@ -1380,14 +1524,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
* Update the MAC address of the constituent ports.
* None of the port is attached at this time, the link state of the
* aggregation will not change.
+ *
+ * All ports take on the primary MAC address of the aggr
+ * (lg_aggr). At this point, none of the ports are attached;
+ * thus the link state of the aggregation will not change.
*/
link_state_changed = aggr_grp_update_ports_mac(grp);
ASSERT(!link_state_changed);
- /* update outbound load balancing policy */
+ /* Update outbound load balancing policy. */
aggr_send_update_policy(grp, policy);
- /* set LACP mode */
+ /* Set LACP mode. */
aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
/*
@@ -1395,12 +1543,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
*/
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
/*
- * Create the pseudo ring for each HW ring of the underlying
- * port. Note that this is done after the aggr registers the
- * mac.
+ * Create the pseudo ring for each HW ring of the
+ * underlying port. Note that this is done after the
+ * aggr registers its MAC.
*/
- VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
- VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
+ VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group),
+ ==, 0);
+
+ for (i = 0; i < grp->lg_rx_group_count; i++) {
+ VERIFY3S(aggr_add_pseudo_rx_group(port,
+ &grp->lg_rx_groups[i]), ==, 0);
+ }
+
if (aggr_port_notify_link(grp, port))
link_state_changed = B_TRUE;
@@ -1545,7 +1699,9 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
continue;
val = aggr_port_stat(port, stat);
val -= port->lp_stat[i];
+ mutex_enter(&grp->lg_stat_lock);
grp->lg_stat[i] += val;
+ mutex_exit(&grp->lg_stat_lock);
}
for (i = 0; i < ETHER_NSTAT; i++) {
stat = i + MACTYPE_STAT_MIN;
@@ -1553,7 +1709,9 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
continue;
val = aggr_port_stat(port, stat);
val -= port->lp_ether_stat[i];
+ mutex_enter(&grp->lg_stat_lock);
grp->lg_ether_stat[i] += val;
+ mutex_exit(&grp->lg_stat_lock);
}
grp->lg_nports--;
@@ -1678,7 +1836,8 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
* aggr_find_tx_ring() will not return any rings
* belonging to it.
*/
- aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
+ for (i = 0; i < grp->lg_rx_group_count; i++)
+ aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
/* remove port from group */
rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
@@ -1783,7 +1942,8 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
(void) aggr_grp_detach_port(grp, port);
mac_perim_exit(pmph);
aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
- aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
+ for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
+ aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
aggr_port_delete(port);
port = cport;
}
@@ -1802,6 +1962,10 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
VERIFY(mac_unregister(grp->lg_mh) == 0);
grp->lg_mh = NULL;
+ for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
+ list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
+ }
+
AGGR_GRP_REFRELE(grp);
return (0);
}
@@ -1884,6 +2048,8 @@ aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
aggr_port_t *port;
uint_t stat_index;
+ ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
+
/* We only aggregate counter statistics. */
if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
@@ -1952,10 +2118,9 @@ static int
aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
{
aggr_grp_t *grp = arg;
- mac_perim_handle_t mph;
int rval = 0;
- mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ mutex_enter(&grp->lg_stat_lock);
switch (stat) {
case MAC_STAT_IFSPEED:
@@ -1975,7 +2140,7 @@ aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
rval = aggr_grp_stat(grp, stat, val);
}
- mac_perim_exit(mph);
+ mutex_exit(&grp->lg_stat_lock);
return (rval);
}
@@ -2165,17 +2330,15 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
return (!grp->lg_zcopy);
case MAC_CAPAB_RINGS: {
mac_capab_rings_t *cap_rings = cap_data;
+ uint_t ring_cnt = 0;
+
+ for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
+ ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
- cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
-
- /*
- * An aggregation advertises only one (pseudo) RX
- * group, which virtualizes the main/primary group of
- * the underlying devices.
- */
- cap_rings->mr_gnum = 1;
+ cap_rings->mr_rnum = ring_cnt;
+ cap_rings->mr_gnum = grp->lg_rx_group_count;
cap_rings->mr_gaddring = NULL;
cap_rings->mr_gremring = NULL;
} else {
@@ -2207,19 +2370,17 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
}
/*
- * Callback funtion for MAC layer to register groups.
+ * Callback function for MAC layer to register groups.
*/
static void
aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
mac_group_info_t *infop, mac_group_handle_t gh)
{
aggr_grp_t *grp = arg;
- aggr_pseudo_rx_group_t *rx_group;
- aggr_pseudo_tx_group_t *tx_group;
- ASSERT(index == 0);
if (rtype == MAC_RING_TYPE_RX) {
- rx_group = &grp->lg_rx_group;
+ aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
+
rx_group->arg_gh = gh;
rx_group->arg_grp = grp;
@@ -2229,8 +2390,18 @@ aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
infop->mgi_addmac = aggr_addmac;
infop->mgi_remmac = aggr_remmac;
infop->mgi_count = rx_group->arg_ring_cnt;
+
+ /*
+ * Always set the HW VLAN callbacks. They are smart
+ * enough to know when a port has HW VLAN filters to
+ * program and when it doesn't.
+ */
+ infop->mgi_addvlan = aggr_addvlan;
+ infop->mgi_remvlan = aggr_remvlan;
} else {
- tx_group = &grp->lg_tx_group;
+ aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
+
+ ASSERT3S(index, ==, 0);
tx_group->atg_gh = gh;
}
}
@@ -2246,13 +2417,13 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
switch (rtype) {
case MAC_RING_TYPE_RX: {
- aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group;
+ aggr_pseudo_rx_group_t *rx_group;
aggr_pseudo_rx_ring_t *rx_ring;
mac_intr_t aggr_mac_intr;
- ASSERT(rg_index == 0);
-
- ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
+ rx_group = &grp->lg_rx_groups[rg_index];
+ ASSERT3S(index, >=, 0);
+ ASSERT3S(index, <, rx_group->arg_ring_cnt);
rx_ring = rx_group->arg_rings + index;
rx_ring->arr_rh = rh;
@@ -2266,8 +2437,8 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
aggr_mac_intr.mi_ddi_handle = NULL;
infop->mri_driver = (mac_ring_driver_t)rx_ring;
- infop->mri_start = aggr_pseudo_start_ring;
- infop->mri_stop = NULL;
+ infop->mri_start = aggr_pseudo_start_rx_ring;
+ infop->mri_stop = aggr_pseudo_stop_rx_ring;
infop->mri_intr = aggr_mac_intr;
infop->mri_poll = aggr_rx_poll;
@@ -2354,6 +2525,7 @@ aggr_addmac(void *arg, const uint8_t *mac_addr)
aggr_port_t *port, *p;
mac_perim_handle_t mph;
int err = 0;
+ uint_t idx = rx_group->arg_index;
mac_perim_enter_by_mh(grp->lg_mh, &mph);
@@ -2380,12 +2552,12 @@ aggr_addmac(void *arg, const uint8_t *mac_addr)
*pprev = addr;
for (port = grp->lg_ports; port != NULL; port = port->lp_next)
- if ((err = aggr_port_addmac(port, mac_addr)) != 0)
+ if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
break;
if (err != 0) {
for (p = grp->lg_ports; p != port; p = p->lp_next)
- aggr_port_remmac(p, mac_addr);
+ aggr_port_remmac(p, idx, mac_addr);
*pprev = NULL;
kmem_free(addr, sizeof (aggr_unicst_addr_t));
@@ -2430,7 +2602,7 @@ aggr_remmac(void *arg, const uint8_t *mac_addr)
}
for (port = grp->lg_ports; port != NULL; port = port->lp_next)
- aggr_port_remmac(port, mac_addr);
+ aggr_port_remmac(port, rx_group->arg_index, mac_addr);
*pprev = addr->aua_next;
kmem_free(addr, sizeof (aggr_unicst_addr_t));
@@ -2440,6 +2612,188 @@ aggr_remmac(void *arg, const uint8_t *mac_addr)
}
/*
+ * Search for VID in the Rx group's list and return a pointer if
+ * found. Otherwise return NULL.
+ */
+static aggr_vlan_t *
+aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
+{
+ ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
+ for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
+ avp = list_next(&rx_group->arg_vlans, avp)) {
+ if (avp->av_vid == vid)
+ return (avp);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Accept traffic on the specified VID.
+ *
+ * Persist VLAN state in the aggr so that ports added later will
+ * receive the correct filters. In the future it would be nice to
+ * allow aggr to iterate its clients instead of duplicating state.
+ */
+static int
+aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
+{
+ aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
+ aggr_grp_t *aggr = rx_group->arg_grp;
+ aggr_port_t *port, *p;
+ mac_perim_handle_t mph;
+ int err = 0;
+ aggr_vlan_t *avp = NULL;
+ uint_t idx = rx_group->arg_index;
+
+ mac_perim_enter_by_mh(aggr->lg_mh, &mph);
+
+ if (vid == MAC_VLAN_UNTAGGED) {
+ /*
+ * Aggr is both a MAC provider and MAC client. As a
+ * MAC provider it is passed MAC_VLAN_UNTAGGED by its
+ * client. As a client itself, it should pass
+ * VLAN_ID_NONE to its ports.
+ */
+ vid = VLAN_ID_NONE;
+ rx_group->arg_untagged++;
+ goto update_ports;
+ }
+
+ avp = aggr_find_vlan(rx_group, vid);
+
+ if (avp != NULL) {
+ avp->av_refs++;
+ mac_perim_exit(mph);
+ return (0);
+ }
+
+ avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
+ avp->av_vid = vid;
+ avp->av_refs = 1;
+
+update_ports:
+ for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
+ if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
+ break;
+
+ if (err != 0) {
+ /*
+ * If any of these calls fail then we are in a
+ * situation where the ports have different HW state.
+ * There's no reasonable action the MAC client can
+ * take in this scenario to rectify the situation.
+ */
+ for (p = aggr->lg_ports; p != port; p = p->lp_next) {
+ int err2;
+
+ if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
+ cmn_err(CE_WARN, "Failed to remove VLAN %u"
+ " from port %s: errno %d.", vid,
+ mac_client_name(p->lp_mch), err2);
+ }
+
+ }
+
+ if (vid == VLAN_ID_NONE)
+ rx_group->arg_untagged--;
+
+ if (avp != NULL) {
+ kmem_free(avp, sizeof (aggr_vlan_t));
+ avp = NULL;
+ }
+ }
+
+ if (avp != NULL)
+ list_insert_tail(&rx_group->arg_vlans, avp);
+
+done:
+ mac_perim_exit(mph);
+ return (err);
+}
+
+/*
+ * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
+ */
+static int
+aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
+{
+ aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
+ aggr_grp_t *aggr = rx_group->arg_grp;
+ aggr_port_t *port, *p;
+ mac_perim_handle_t mph;
+ int err = 0;
+ aggr_vlan_t *avp = NULL;
+ uint_t idx = rx_group->arg_index;
+
+ mac_perim_enter_by_mh(aggr->lg_mh, &mph);
+
+ /*
+ * See the comment in aggr_addvlan().
+ */
+ if (vid == MAC_VLAN_UNTAGGED) {
+ vid = VLAN_ID_NONE;
+ rx_group->arg_untagged--;
+
+ if (rx_group->arg_untagged > 0)
+ goto done;
+
+ goto update_ports;
+ }
+
+ avp = aggr_find_vlan(rx_group, vid);
+
+ if (avp == NULL) {
+ err = ENOENT;
+ goto done;
+ }
+
+ avp->av_refs--;
+
+ if (avp->av_refs > 0)
+ goto done;
+
+update_ports:
+ for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
+ if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
+ break;
+
+ /*
+ * See the comment in aggr_addvlan(): if the rollback fails,
+ * the ports are left with inconsistent HW state and there is
+ * no reasonable way to rectify it beyond warning.
+ */
+ if (err != 0) {
+ for (p = aggr->lg_ports; p != port; p = p->lp_next) {
+ int err2;
+
+ if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
+ cmn_err(CE_WARN, "Failed to add VLAN %u"
+ " to port %s: errno %d.", vid,
+ mac_client_name(p->lp_mch), err2);
+ }
+ }
+
+ if (avp != NULL)
+ avp->av_refs++;
+
+ if (vid == VLAN_ID_NONE)
+ rx_group->arg_untagged++;
+
+ goto done;
+ }
+
+ if (err == 0 && avp != NULL) {
+ VERIFY3U(avp->av_refs, ==, 0);
+ list_remove(&rx_group->arg_vlans, avp);
+ kmem_free(avp, sizeof (aggr_vlan_t));
+ }
+
+done:
+ mac_perim_exit(mph);
+ return (err);
+}
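
/*
 * Editorial sketch (not part of this patch) of the reference-counting
 * contract implemented by aggr_addvlan()/aggr_remvlan(): a VID is
 * programmed into the port HW only on its first add, and cleared only
 * when its last subscriber is gone.  program_hw()/clear_hw() are
 * hypothetical stand-ins for the aggr_port_{add,rem}vlan() calls.
 */
typedef struct vlan_ref {
	uint16_t	vr_vid;
	uint_t		vr_refs;
} vlan_ref_t;

extern int program_hw(uint16_t);	/* hypothetical */
extern void clear_hw(uint16_t);		/* hypothetical */

static int
vlan_ref_add(vlan_ref_t *vrp)
{
	if (vrp->vr_refs++ == 0)
		return (program_hw(vrp->vr_vid));	/* 0 -> 1 */
	return (0);
}

static void
vlan_ref_rem(vlan_ref_t *vrp)
{
	if (--vrp->vr_refs == 0)
		clear_hw(vrp->vr_vid);			/* 1 -> 0 */
}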
+
+/*
* Add or remove the multicast addresses that are defined for the group
* to or from the specified port.
*
diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c
index 00545d2c03..c8dbe00336 100644
--- a/usr/src/uts/common/io/aggr/aggr_port.c
+++ b/usr/src/uts/common/io/aggr/aggr_port.c
@@ -21,6 +21,8 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -69,10 +71,10 @@ aggr_port_destructor(void *buf, void *arg)
{
aggr_port_t *port = buf;
- ASSERT(port->lp_mnh == NULL);
- ASSERT(port->lp_mphp == NULL);
- ASSERT(!port->lp_rx_grp_added && !port->lp_tx_grp_added);
- ASSERT(port->lp_hwgh == NULL);
+ ASSERT3P(port->lp_mnh, ==, NULL);
+ ASSERT(!port->lp_tx_grp_added);
+ for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++)
+ ASSERT3P(port->lp_hwghs[i], ==, NULL);
}
void
@@ -126,7 +128,6 @@ aggr_port_init_callbacks(aggr_port_t *port)
aggr_grp_port_hold(port);
}
-/* ARGSUSED */
int
aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force,
aggr_port_t **pp)
@@ -195,9 +196,9 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force,
}
/*
- * As the underlying mac's current margin size is used to determine
+ * As the underlying MAC's current margin size is used to determine
* the margin size of the aggregation itself, request the underlying
- * mac not to change to a smaller size.
+ * MAC not to change to a smaller size.
*/
if ((err = mac_margin_add(mh, &margin, B_TRUE)) != 0) {
id_free(aggr_portids, portid);
@@ -206,7 +207,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force,
if ((err = mac_unicast_add(mch, NULL, MAC_UNICAST_PRIMARY |
MAC_UNICAST_DISABLE_TX_VID_CHECK, &mah, 0, &diag)) != 0) {
- VERIFY(mac_margin_remove(mh, margin) == 0);
+ VERIFY3S(mac_margin_remove(mh, margin), ==, 0);
id_free(aggr_portids, portid);
goto fail;
}
@@ -261,6 +262,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force,
fail:
if (mch != NULL)
mac_client_close(mch, MAC_CLOSE_FLAGS_EXCLUSIVE);
+
mac_close(mh);
return (err);
}
@@ -270,13 +272,11 @@ aggr_port_delete(aggr_port_t *port)
{
aggr_lacp_port_t *pl = &port->lp_lacp;
- ASSERT(port->lp_mphp == NULL);
ASSERT(!port->lp_promisc_on);
-
port->lp_closing = B_TRUE;
+ VERIFY0(mac_margin_remove(port->lp_mh, port->lp_margin));
+ mac_client_clear_flow_cb(port->lp_mch);
- VERIFY(mac_margin_remove(port->lp_mh, port->lp_margin) == 0);
- mac_rx_clear(port->lp_mch);
/*
* If the notification callback is already in process and waiting for
* the aggr grp's mac perimeter, don't wait (otherwise there would be
@@ -307,8 +307,10 @@ aggr_port_delete(aggr_port_t *port)
* port's MAC_NOTE_UNICST notify callback function being called.
*/
(void) mac_unicast_primary_set(port->lp_mh, port->lp_addr);
+
if (port->lp_mah != NULL)
(void) mac_unicast_remove(port->lp_mch, port->lp_mah);
+
mac_client_close(port->lp_mch, MAC_CLOSE_FLAGS_EXCLUSIVE);
mac_close(port->lp_mh);
AGGR_PORT_REFRELE(port);
@@ -373,10 +375,14 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port)
/* link speed changes? */
ifspeed = aggr_port_stat(port, MAC_STAT_IFSPEED);
if (port->lp_ifspeed != ifspeed) {
+ mutex_enter(&grp->lg_stat_lock);
+
if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
do_detach |= (ifspeed != grp->lg_ifspeed);
else
do_attach |= (ifspeed == grp->lg_ifspeed);
+
+ mutex_exit(&grp->lg_stat_lock);
}
port->lp_ifspeed = ifspeed;
@@ -515,6 +521,10 @@ aggr_port_stop(aggr_port_t *port)
port->lp_started = B_FALSE;
}
+/*
+ * Set the promisc mode of the port. If the port is already in the
+ * requested mode then do nothing.
+ */
int
aggr_port_promisc(aggr_port_t *port, boolean_t on)
{
@@ -523,27 +533,14 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on)
ASSERT(MAC_PERIM_HELD(port->lp_mh));
if (on == port->lp_promisc_on)
- /* already in desired promiscous mode */
return (0);
- if (on) {
- mac_rx_clear(port->lp_mch);
- rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL,
- aggr_recv_cb, port, &port->lp_mphp,
- MAC_PROMISC_FLAGS_NO_TX_LOOP);
- if (rc != 0) {
- mac_rx_set(port->lp_mch, aggr_recv_cb, port);
- return (rc);
- }
- } else {
- mac_promisc_remove(port->lp_mphp);
- port->lp_mphp = NULL;
- mac_rx_set(port->lp_mch, aggr_recv_cb, port);
- }
+ rc = mac_set_promisc(port->lp_mh, on);
- port->lp_promisc_on = on;
+ if (rc == 0)
+ port->lp_promisc_on = on;
- return (0);
+ return (rc);
}
/*
@@ -583,35 +580,45 @@ aggr_port_stat(aggr_port_t *port, uint_t stat)
}
/*
- * Add a non-primary unicast address to the underlying port. If the port
- * supports HW Rx group, try to add the address into the HW Rx group of
- * the port first. If that fails, or if the port does not support HW Rx
- * group, enable the port's promiscous mode.
+ * Add a non-primary unicast address to the underlying port. If the
+ * port supports HW Rx groups, then try to add the address filter to
+ * the HW group first. If that fails, or if the port does not support
+ * RINGS capab, then enable the port's promiscuous mode.
*/
int
-aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr)
+aggr_port_addmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr)
{
aggr_unicst_addr_t *addr, **pprev;
mac_perim_handle_t pmph;
int err;
ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
+ ASSERT3U(idx, <, MAX_GROUPS_PER_PORT);
mac_perim_enter_by_mh(port->lp_mh, &pmph);
/*
- * If the underlying port support HW Rx group, add the mac to its
- * RX group directly.
+ * If the port doesn't have a HW group to back the aggr's
+ * pseudo group, then try using the port's default group and
+ * let the aggr SW classify its traffic. This scenario happens
+ * when mixing ports with a different number of HW groups.
*/
- if ((port->lp_hwgh != NULL) &&
- ((mac_hwgroup_addmac(port->lp_hwgh, mac_addr)) == 0)) {
+ if (port->lp_hwghs[idx] == NULL)
+ idx = 0;
+
+ /*
+ * If there is an underlying HW Rx group, then try adding this
+ * unicast address to it.
+ */
+ if ((port->lp_hwghs[idx] != NULL) &&
+ ((mac_hwgroup_addmac(port->lp_hwghs[idx], mac_addr)) == 0)) {
mac_perim_exit(pmph);
return (0);
}
/*
- * If that fails, or if the port does not support HW Rx group, enable
- * the port's promiscous mode. (Note that we turn on the promiscous
- * mode only if the port is already started.
+ * If the port doesn't have HW groups, or we failed to add the
+ * HW filter, then enable the port's promiscuous mode. We
+ * enable promiscuous mode only if the port is already started.
*/
if (port->lp_started &&
((err = aggr_port_promisc(port, B_TRUE)) != 0)) {
@@ -643,13 +650,14 @@ aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr)
* promiscuous mode.
*/
void
-aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr)
+aggr_port_remmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr)
{
aggr_grp_t *grp = port->lp_grp;
aggr_unicst_addr_t *addr, **pprev;
mac_perim_handle_t pmph;
ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ ASSERT3U(idx, <, MAX_GROUPS_PER_PORT);
mac_perim_enter_by_mh(port->lp_mh, &pmph);
/*
@@ -662,6 +670,7 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr)
break;
pprev = &addr->aua_next;
}
+
if (addr != NULL) {
/*
* This unicast address put the port into the promiscuous mode,
@@ -674,8 +683,65 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr)
if (port->lp_prom_addr == NULL && !grp->lg_promisc)
(void) aggr_port_promisc(port, B_FALSE);
} else {
- ASSERT(port->lp_hwgh != NULL);
- (void) mac_hwgroup_remmac(port->lp_hwgh, mac_addr);
+ /* See comment in aggr_port_addmac(). */
+ if (port->lp_hwghs[idx] == NULL)
+ idx = 0;
+
+ ASSERT3P(port->lp_hwghs[idx], !=, NULL);
+ (void) mac_hwgroup_remmac(port->lp_hwghs[idx], mac_addr);
}
+
mac_perim_exit(pmph);
}
+
+int
+aggr_port_addvlan(aggr_port_t *port, uint_t idx, uint16_t vid)
+{
+ mac_perim_handle_t pmph;
+ int err;
+
+ ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
+ ASSERT3U(idx, <, MAX_GROUPS_PER_PORT);
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /* See comment in aggr_port_addmac(). */
+ if (port->lp_hwghs[idx] == NULL)
+ idx = 0;
+
+ /*
+ * Add the VLAN filter to the HW group if the port has a HW
+ * group. If the port doesn't have a HW group, then it will
+ * implicitly allow tagged traffic to pass and there is
+ * nothing to do.
+ */
+ if (port->lp_hwghs[idx] == NULL)
+ err = 0;
+ else
+ err = mac_hwgroup_addvlan(port->lp_hwghs[idx], vid);
+
+ mac_perim_exit(pmph);
+ return (err);
+}
+
+int
+aggr_port_remvlan(aggr_port_t *port, uint_t idx, uint16_t vid)
+{
+ mac_perim_handle_t pmph;
+ int err;
+
+ ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
+ ASSERT3U(idx, <, MAX_GROUPS_PER_PORT);
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /* See comment in aggr_port_addmac(). */
+ if (port->lp_hwghs[idx] == NULL)
+ idx = 0;
+
+ if (port->lp_hwghs[idx] == NULL)
+ err = 0;
+ else
+ err = mac_hwgroup_remvlan(port->lp_hwghs[idx], vid);
+
+ mac_perim_exit(pmph);
+ return (err);
+}
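
/*
 * Editorial sketch (not part of this patch): the group-selection step
 * shared by the four filter entry points above.  The first test falls
 * back to the port's default group when the pseudo-group index has no
 * HW backing; the second handles ports with no HW groups at all, for
 * which tagged traffic already passes and there is nothing to program.
 * hwgh_op() is a hypothetical stand-in for the
 * mac_hwgroup_{add,rem}{mac,vlan}() call.
 */
	if (port->lp_hwghs[idx] == NULL)
		idx = 0;		/* fall back to the default group */

	if (port->lp_hwghs[idx] == NULL)
		err = 0;		/* no HW groups: nothing to do */
	else
		err = hwgh_op(port->lp_hwghs[idx], arg);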
diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c
index 2bdb7872e3..b6b3e6de1f 100644
--- a/usr/src/uts/common/io/aggr/aggr_recv.c
+++ b/usr/src/uts/common/io/aggr/aggr_recv.c
@@ -21,6 +21,8 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -55,7 +57,7 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp)
{
aggr_grp_t *grp = port->lp_grp;
- /* in promiscuous mode, send copy of packet up */
+ /* In promiscuous mode, pass copy of packet up. */
if (grp->lg_promisc) {
mblk_t *nmp = copymsg(mp);
@@ -68,11 +70,11 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp)
/*
* Callback function invoked by MAC service module when packets are
- * made available by a MAC port.
+ * made available by a MAC port, whether or not the port is in
+ * promiscuous mode.
*/
/* ARGSUSED */
-void
-aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+static void
+aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
boolean_t loopback)
{
aggr_port_t *port = (aggr_port_t *)arg;
@@ -161,3 +163,10 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
}
}
}
+
+void
+aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
+{
+ aggr_recv_path_cb(arg, mrh, mp, loopback);
+}
diff --git a/usr/src/uts/common/io/bpf/bpf_wrap.c b/usr/src/uts/common/io/bpf/bpf_wrap.c
new file mode 100644
index 0000000000..6cbde58a20
--- /dev/null
+++ b/usr/src/uts/common/io/bpf/bpf_wrap.c
@@ -0,0 +1,35 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <net/bpf.h>
+#include <inet/bpf.h>
+
+/*
+ * With BPF filter validation and evaluation moved into the 'ip' module, these
+ * wrapper functions are provided to expose the original interface.
+ */
+
+uint_t
+bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
+{
+ return ((uint_t)ip_bpf_filter((ip_bpf_insn_t *)pc, p, wirelen, buflen));
+}
+
+int
+bpf_validate(struct bpf_insn *f, int len)
+{
+ return ((int)ip_bpf_validate((ip_bpf_insn_t *)f, (uint_t)len));
+}
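
/*
 * Editorial sketch (not part of this patch) of how a consumer might
 * drive these wrappers.  The one-instruction program below is the
 * classic "accept the whole packet" filter; per the BSD contract a
 * nonzero return from bpf_validate() means the program is valid, and
 * bpf_filter() returns the number of bytes to capture (0 = no match).
 */
static struct bpf_insn accept_all[] = {
	BPF_STMT(BPF_RET | BPF_K, (uint_t)-1),
};

static uint_t
match_len(uchar_t *pkt, uint_t wirelen, uint_t buflen)
{
	if (bpf_validate(accept_all, 1) == 0)
		return (0);		/* invalid program */

	return (bpf_filter(accept_all, pkt, wirelen, buflen));
}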
diff --git a/usr/src/uts/common/io/bridge.c b/usr/src/uts/common/io/bridge.c
index bc54527515..375d166972 100644
--- a/usr/src/uts/common/io/bridge.c
+++ b/usr/src/uts/common/io/bridge.c
@@ -23,6 +23,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -41,6 +42,7 @@
#include <sys/modctl.h>
#include <sys/note.h>
#include <sys/param.h>
+#include <sys/pattr.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/stat.h>
@@ -1693,7 +1695,8 @@ bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick,
* The passed-in tci is the "impossible" value 0xFFFF when no tag is present.
*/
static mblk_t *
-reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
+reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid,
+ boolean_t keep_flags)
{
boolean_t source_has_tag = (tci != 0xFFFF);
mblk_t *mpcopy;
@@ -1705,8 +1708,13 @@ reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
if (mp == NULL)
return (mp);
- /* No forwarded packet can have hardware checksum enabled */
- DB_CKSUMFLAGS(mp) = 0;
+ /*
+ * A forwarded packet cannot have HW offloads enabled unless
+ * the destination is known to be local to the host and HW
+ * offloads haven't been emulated.
+ */
+ if (!keep_flags)
+ DB_CKSUMFLAGS(mp) = 0;
/* Get the no-modification cases out of the way first */
if (!source_has_tag && vlanid == pvid) /* 1a */
@@ -1907,17 +1915,46 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
blp->bl_trillthreads++;
mutex_exit(&blp->bl_trilllock);
update_header(mp, hdr_info, B_FALSE);
- if (is_xmit)
- mp = mac_fix_cksum(mp);
- /* all trill data frames have Inner.VLAN */
- mp = reform_vlan_header(mp, vlanid, tci, 0);
- if (mp == NULL) {
- KIINCR(bki_drops);
- fwd_unref(bfp);
- return (NULL);
+
+ if (is_xmit) {
+ mac_hw_emul(&mp, NULL, NULL,
+ MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
+
+ if (mp == NULL) {
+ KIINCR(bki_drops);
+ goto done;
+ }
}
- trill_encap_fn(tdp, blp, hdr_info, mp,
- bfp->bf_trill_nick);
+
+ while (mp != NULL) {
+ mblk_t *next = mp->b_next;
+
+ mp->b_next = NULL;
+
+ /*
+ * All trill data frames have
+ * Inner.VLAN.
+ */
+ mp = reform_vlan_header(mp, vlanid, tci,
+ 0, B_FALSE);
+
+ if (mp == NULL) {
+ /*
+ * Make sure to free
+ * any remaining
+ * segments.
+ */
+ freemsgchain(next);
+ KIINCR(bki_drops);
+ goto done;
+ }
+
+ trill_encap_fn(tdp, blp, hdr_info, mp,
+ bfp->bf_trill_nick);
+ mp = next;
+ }
+
+done:
mutex_enter(&blp->bl_trilllock);
if (--blp->bl_trillthreads == 0 &&
blp->bl_trilldata == NULL)
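/*
 * Editorial sketch (not part of this patch): the chain-walking pattern
 * used above.  mac_hw_emul() may segment an LSO message into a chain
 * linked through b_next, and each segment must be detached before it
 * is handed to a consumer that expects a single message.  consume() is
 * a hypothetical stand-in for reform_vlan_header() plus transmit.
 */
extern void consume(mblk_t *);		/* hypothetical */

static void
walk_chain_sketch(mblk_t *mp)
{
	mblk_t *next;

	for (; mp != NULL; mp = next) {
		next = mp->b_next;
		mp->b_next = NULL;	/* detach before handing off */
		consume(mp);
	}
}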
@@ -1959,31 +1996,68 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
mpsend = copymsg(mp);
}
- if (!from_trill && is_xmit)
- mpsend = mac_fix_cksum(mpsend);
+ /*
+ * If the destination is not local to the host
+ * then we need to emulate HW offloads because
+ * we can't guarantee the forwarding
+ * destination provides them.
+ */
+ if (!from_trill && is_xmit &&
+ !(bfp->bf_flags & BFF_LOCALADDR)) {
+ mac_hw_emul(&mpsend, NULL, NULL,
+ MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
- mpsend = reform_vlan_header(mpsend, vlanid, tci,
- blpsend->bl_pvid);
- if (mpsend == NULL) {
- KIINCR(bki_drops);
- continue;
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ continue;
+ }
+ }
+
+ /*
+ * The HW emulation above may have segmented
+ * an LSO mblk.
+ */
+ while ((mpsend != NULL) &&
+ !(bfp->bf_flags & BFF_LOCALADDR)) {
+ mblk_t *next = mpsend->b_next;
+
+ mpsend->b_next = NULL;
+ mpsend = reform_vlan_header(mpsend, vlanid, tci,
+ blpsend->bl_pvid, B_FALSE);
+
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ mpsend = next;
+ continue;
+ }
+
+ KIINCR(bki_forwards);
+ KLPINCR(blpsend, bkl_xmit);
+ MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
+ mpsend);
+ freemsg(mpsend);
+ mpsend = next;
}
- KIINCR(bki_forwards);
/*
* No need to bump up the link reference count, as
* the forwarding entry itself holds a reference to
* the link.
*/
if (bfp->bf_flags & BFF_LOCALADDR) {
+ mpsend = reform_vlan_header(mpsend, vlanid, tci,
+ blpsend->bl_pvid, B_TRUE);
+
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ continue;
+ }
+
+ KIINCR(bki_forwards);
mac_rx_common(blpsend->bl_mh, NULL, mpsend);
- } else {
- KLPINCR(blpsend, bkl_xmit);
- MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
- mpsend);
- freemsg(mpsend);
}
}
+
/*
* Handle a special case: if we're transmitting to the original
* link, then check whether the localaddr flag is set. If it
@@ -2019,7 +2093,7 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
* Inner.VLAN
*/
mpsend = reform_vlan_header(mpsend,
- vlanid, tci, 0);
+ vlanid, tci, 0, B_FALSE);
if (mpsend == NULL) {
KIINCR(bki_drops);
} else {
@@ -2070,25 +2144,57 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
mpsend = copymsg(mp);
}
- if (!from_trill && is_xmit)
- mpsend = mac_fix_cksum(mpsend);
+ /*
+ * In this case, send to all links connected
+ * to the bridge. Some of these destinations
+ * may not provide HW offload -- so just
+ * emulate it here.
+ */
+ if (!from_trill && is_xmit) {
+ mac_hw_emul(&mpsend, NULL, NULL,
+ MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
- mpsend = reform_vlan_header(mpsend, vlanid, tci,
- blpsend->bl_pvid);
- if (mpsend == NULL) {
- KIINCR(bki_drops);
- continue;
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ continue;
+ }
+ }
+
+ /*
+ * The HW emulation above may have segmented
+ * an LSO mblk.
+ */
+ while (mpsend != NULL) {
+ mblk_t *next = mpsend->b_next;
+
+ mpsend->b_next = NULL;
+ mpsend = reform_vlan_header(mpsend, vlanid, tci,
+ blpsend->bl_pvid, B_FALSE);
+
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ mpsend = next;
+ continue;
+ }
+
+ if (hdr_info->mhi_dsttype ==
+ MAC_ADDRTYPE_UNICAST)
+ KIINCR(bki_unknown);
+ else
+ KIINCR(bki_mbcast);
+
+ KLPINCR(blpsend, bkl_xmit);
+ if ((mpcopy = copymsg(mpsend)) != NULL) {
+ mac_rx_common(blpsend->bl_mh, NULL,
+ mpcopy);
+ }
+
+ MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
+ mpsend);
+ freemsg(mpsend);
+ mpsend = next;
}
- if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST)
- KIINCR(bki_unknown);
- else
- KIINCR(bki_mbcast);
- KLPINCR(blpsend, bkl_xmit);
- if ((mpcopy = copymsg(mpsend)) != NULL)
- mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
- MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend);
- freemsg(mpsend);
link_unref(blpsend);
}
}
diff --git a/usr/src/uts/common/io/chxge/ch.c b/usr/src/uts/common/io/chxge/ch.c
index e7ea942405..46920a1ea2 100644
--- a/usr/src/uts/common/io/chxge/ch.c
+++ b/usr/src/uts/common/io/chxge/ch.c
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -59,6 +60,7 @@
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
+#include <sys/mac_provider.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <inet/common.h>
@@ -1377,8 +1379,7 @@ ch_send_up(ch_t *chp, mblk_t *mp, uint32_t cksum, int flg)
* set in /etc/system (see sge.c).
*/
if (flg)
- (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, cksum,
- HCK_FULLCKSUM, 0);
+ mac_hcksum_set(mp, 0, 0, 0, cksum, HCK_FULLCKSUM);
gld_recv(chp->ch_macp, mp);
} else {
freemsg(mp);
@@ -1693,8 +1694,7 @@ ch_send(gld_mac_info_t *macinfo, mblk_t *mp)
msg_flg = 0;
if (chp->ch_config.cksum_enabled) {
if (is_T2(chp)) {
- hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL,
- NULL, &msg_flg);
+ mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &msg_flg);
flg = (msg_flg & HCK_FULLCKSUM)?
CH_NO_CPL: CH_NO_HWCKSUM|CH_NO_CPL;
} else
diff --git a/usr/src/uts/common/io/cons.c b/usr/src/uts/common/io/cons.c
index 507f918d8f..8635023fe3 100644
--- a/usr/src/uts/common/io/cons.c
+++ b/usr/src/uts/common/io/cons.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
*/
/*
@@ -53,6 +54,7 @@
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
+#include <sys/limits.h>
#include <sys/console.h>
#include <sys/consdev.h>
@@ -414,14 +416,24 @@ cnwrite(dev_t dev, struct uio *uio, struct cred *cred)
*/
if (vsconsvp != NULL && vsconsvp->v_stream != NULL) {
struiod_t uiod;
+ struct iovec buf[IOV_MAX_STACK];
+ int iovlen = 0;
+
+ if (uio->uio_iovcnt > IOV_MAX_STACK) {
+ iovlen = uio->uio_iovcnt * sizeof (iovec_t);
+ uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP);
+ } else {
+ uiod.d_iov = buf;
+ }
/*
* strwrite modifies uio so need to make copy.
*/
- (void) uiodup(uio, &uiod.d_uio, uiod.d_iov,
- sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+ (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, uio->uio_iovcnt);
(void) strwrite(vsconsvp, &uiod.d_uio, cred);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
}
if (rconsvp->v_stream != NULL)
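/*
 * Editorial sketch (not part of this patch) of the stack-or-heap
 * pattern introduced above: a fixed on-stack array serves the common
 * case, and the heap is used -- with the allocated size remembered for
 * the matching kmem_free() -- only when uio_iovcnt exceeds
 * IOV_MAX_STACK.
 */
	struct iovec buf[IOV_MAX_STACK];
	struct iovec *iovp = buf;
	int iovlen = 0;

	if (uio->uio_iovcnt > IOV_MAX_STACK) {
		iovlen = uio->uio_iovcnt * sizeof (iovec_t);
		iovp = kmem_alloc(iovlen, KM_SLEEP);
	}

	/* ... use iovp for the duplicated uio ... */

	if (iovlen != 0)
		kmem_free(iovp, iovlen);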
diff --git a/usr/src/uts/common/io/cpqary3/cpqary3.c b/usr/src/uts/common/io/cpqary3/cpqary3.c
index 622f0dcf68..f67d77b3d2 100644
--- a/usr/src/uts/common/io/cpqary3/cpqary3.c
+++ b/usr/src/uts/common/io/cpqary3/cpqary3.c
@@ -41,7 +41,7 @@ extern cpqary3_driver_info_t gdriver_info;
* Global Variables Definitions
*/
-static char cpqary3_brief[] = "HP Smart Array Driver";
+static char cpqary3_brief[] = "HP Smart Array (Legacy)";
void *cpqary3_state;
/* HPQaculi Changes */
diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c
index 6f9bf93226..7368c9b43d 100644
--- a/usr/src/uts/common/io/devpoll.c
+++ b/usr/src/uts/common/io/devpoll.c
@@ -25,7 +25,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -245,30 +245,20 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
* stale entries!
*/
static int
-dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
- pollcache_t *pcp, nfds_t nfds, int *fdcntp)
+dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds,
+ int *fdcntp)
{
- int start, ostart, end;
- int fdcnt, fd;
- boolean_t done;
- file_t *fp;
- short revent;
- boolean_t no_wrap;
- pollhead_t *php;
- polldat_t *pdp;
+ int start, ostart, end, fdcnt, error = 0;
+ boolean_t done, no_wrap;
pollfd_t *pfdp;
epoll_event_t *epoll;
- int error = 0;
- short mask = POLLRDHUP | POLLWRBAND;
- boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
+ const short mask = POLLRDHUP | POLLWRBAND;
+ const boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
ASSERT(MUTEX_HELD(&pcp->pc_lock));
if (pcp->pc_bitmap == NULL) {
- /*
- * No Need to search because no poll fd
- * has been cached.
- */
- return (error);
+ /* No need to search because no poll fd has been cached. */
+ return (0);
}
if (is_epoll) {
@@ -281,7 +271,6 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
retry:
start = ostart = pcp->pc_mapstart;
end = pcp->pc_mapend;
- php = NULL;
if (start == 0) {
/*
@@ -294,8 +283,11 @@ retry:
done = B_FALSE;
fdcnt = 0;
while ((fdcnt < nfds) && !done) {
- php = NULL;
- revent = 0;
+ pollhead_t *php = NULL;
+ short revent = 0;
+ uf_entry_gen_t gen;
+ int fd;
+
/*
* Examine the bit map in a circular fashion
* to avoid starvation. Always resume from
@@ -305,6 +297,9 @@ retry:
fd = bt_getlowbit(pcp->pc_bitmap, start, end);
ASSERT(fd <= end);
if (fd >= 0) {
+ file_t *fp;
+ polldat_t *pdp;
+
if (fd == end) {
if (no_wrap) {
done = B_TRUE;
@@ -328,28 +323,14 @@ repoll:
*/
continue;
}
- if ((fp = getf(fd)) == NULL) {
- /*
- * The fd has been closed, but user has not
- * done a POLLREMOVE on this fd yet. Instead
- * of cleaning it here implicitly, we return
- * POLLNVAL. This is consistent with poll(2)
- * polling a closed fd. Hope this will remind
- * user to do a POLLREMOVE.
- */
- if (!is_epoll && pfdp != NULL) {
- pfdp[fdcnt].fd = fd;
- pfdp[fdcnt].revents = POLLNVAL;
- fdcnt++;
- continue;
- }
-
- /*
- * In the epoll compatibility case, we actually
- * perform the implicit removal to remain
- * closer to the epoll semantics.
- */
+ if ((fp = getf_gen(fd, &gen)) == NULL) {
if (is_epoll) {
+ /*
+ * In the epoll compatibility case, we
+ * actually perform the implicit
+ * removal to remain closer to the
+ * epoll semantics.
+ */
pdp->pd_fp = NULL;
pdp->pd_events = 0;
@@ -360,30 +341,36 @@ repoll:
}
BT_CLEAR(pcp->pc_bitmap, fd);
- continue;
+ } else if (pfdp != NULL) {
+ /*
+ * The fd has been closed, but the user
+ * has not done a POLLREMOVE on this fd
+ * yet. Instead of cleaning it up
+ * implicitly, we return POLLNVAL. This
+ * is consistent with poll(2) polling a
+ * closed fd, and will hopefully remind
+ * the user to do a POLLREMOVE.
+ */
+ pfdp[fdcnt].fd = fd;
+ pfdp[fdcnt].revents = POLLNVAL;
+ fdcnt++;
}
+ continue;
}
- if (fp != pdp->pd_fp) {
+ /*
+ * Detect a change to the resource underlying a cached
+ * file descriptor. While the fd generation comparison
+ * will catch nearly all cases, the file_t comparison
+ * is maintained as a failsafe as well.
+ */
+ if (gen != pdp->pd_gen || fp != pdp->pd_fp) {
/*
* The user is polling on a cached fd which was
* closed and then reused. Unfortunately there
* is no good way to communicate this fact to
* the consumer.
*
- * If the file struct is also reused, we may
- * not be able to detect the fd reuse at all.
- * As long as this does not cause system
- * failure and/or memory leaks, we will play
- * along. The man page states that if the user
- * does not clean up closed fds, polling
- * results will be indeterministic.
- *
- * XXX: perhaps log the detection of fd reuse?
- */
- pdp->pd_fp = fp;
-
- /*
* When this situation has been detected, it's
* likely that any existing pollhead is
* ill-suited to perform proper wake-ups.
@@ -396,7 +383,42 @@ repoll:
pollhead_delete(pdp->pd_php, pdp);
pdp->pd_php = NULL;
}
+
+ /*
+ * Since epoll is expected to act on the
+ * underlying 'struct file' (in Linux terms,
+ * our vnode_t would be a closer analog) rather
+ * than the fd itself, an implicit remove
+ * is necessary under these circumstances to
+ * suppress any results (or errors) from the
+ * new resource occupying the fd.
+ */
+ if (is_epoll) {
+ pdp->pd_fp = NULL;
+ pdp->pd_events = 0;
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ releasef(fd);
+ continue;
+ } else {
+ /*
+ * Regular /dev/poll is unbothered
+ * about the fd reassignment.
+ */
+ pdp->pd_fp = fp;
+ pdp->pd_gen = gen;
+ }
}
+
+ /*
+ * Skip entries marked with the sentinel value
+ * indicating that they have already fired under
+ * oneshot conditions.
+ */
+ if (pdp->pd_events == POLLONESHOT) {
+ releasef(fd);
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ continue;
+ }
+
/*
* XXX - pollrelock() logic needs to know which
* which pollcache lock to grab. It'd be a
@@ -537,18 +559,19 @@ repoll:
/* Handle special polling modes. */
if (pdp->pd_events & POLLONESHOT) {
/*
- * If POLLONESHOT is set, perform the
- * implicit POLLREMOVE.
+ * Entries operating under POLLONESHOT
+ * will be marked with a sentinel value
+ * to indicate that they have "fired"
+ * when emitting an event. This will
+ * disable them from polling until a
+ * later add/modify event rearms them.
*/
- pdp->pd_fp = NULL;
- pdp->pd_events = 0;
-
+ pdp->pd_events = POLLONESHOT;
if (pdp->pd_php != NULL) {
pollhead_delete(pdp->pd_php,
pdp);
pdp->pd_php = NULL;
}
-
BT_CLEAR(pcp->pc_bitmap, fd);
} else if (pdp->pd_events & POLLET) {
/*
@@ -700,14 +723,10 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
pollfd_t *pollfdp, *pfdp;
dvpoll_epollfd_t *epfdp;
uintptr_t limit;
- int error, size;
- ssize_t uiosize;
- size_t copysize;
+ int error;
+ uint_t size;
+ size_t copysize, uiosize;
nfds_t pollfdnum;
- struct pollhead *php = NULL;
- polldat_t *pdp;
- int fd;
- file_t *fp;
boolean_t is_epoll, fds_added = B_FALSE;
minor = getminor(dev);
@@ -732,10 +751,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
pcp->pc_pid = curproc->p_pid;
}
- uiosize = uiop->uio_resid;
+ if (uiop->uio_resid < 0) {
+ /* No one else is this careful, but maybe they should be. */
+ return (EINVAL);
+ }
+
+ uiosize = (size_t)uiop->uio_resid;
pollfdnum = uiosize / size;
/*
+ * For epoll-enabled handles, restrict the allowed write size to 2.
+ * This corresponds to an epoll_ctl(3C) call performing an EPOLL_CTL_MOD
+ * operation which is expanded into two operations (DEL and ADD).
+ *
+ * All other operations performed through epoll_ctl(3C) will consist of
+ * a single entry.
+ */
+ if (is_epoll && pollfdnum > 2) {
+ return (EINVAL);
+ }
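+
/*
 * Editorial sketch (not part of this patch): the two-entry write, as
 * seen from userland, when epoll_ctl(3C) expands an EPOLL_CTL_MOD into
 * a DEL plus an ADD.  Field usage follows dvpoll_epollfd_t; dpfd, fd,
 * new_events, and data are assumed to be supplied by the caller.
 */
	dvpoll_epollfd_t ops[2];

	ops[0].dpep_pollfd.fd = fd;
	ops[0].dpep_pollfd.events = POLLREMOVE;	/* the implicit DEL */
	ops[1].dpep_pollfd.fd = fd;
	ops[1].dpep_pollfd.events = new_events;	/* the ADD */
	ops[1].dpep_data = data;

	if (write(dpfd, ops, sizeof (ops)) != sizeof (ops))
		return (-1);	/* error handling elided */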
+
+ /*
* We want to make sure that pollfdnum isn't large enough to DoS us,
* but we also don't want to grab p_lock unnecessarily -- so we
* perform the full check against our resource limits if and only if
@@ -794,6 +830,21 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
ASSERT(dpep->dpe_refcnt != 0);
+ /*
+ * The epoll API does not allow EINTR as a result when making
+ * modifications to the set of polled fds. Given that write
+ * activity is relatively quick and the size of accepted writes
+ * is limited above to two entries, a signal-ignorant wait is
+ * used here to avoid the EINTR.
+ */
+ if (is_epoll) {
+ cv_wait(&dpep->dpe_cv, &dpep->dpe_lock);
+ continue;
+ }
+
+ /*
+ * Non-epoll writers to /dev/poll handles can tolerate EINTR.
+ */
if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
dpep->dpe_writerwait--;
mutex_exit(&dpep->dpe_lock);
@@ -828,7 +879,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
}
for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
- fd = pfdp->fd;
+ int fd = pfdp->fd;
+ polldat_t *pdp;
+
if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
/*
* epoll semantics demand that we return EBADF if our
@@ -844,76 +897,60 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
pdp = pcache_lookup_fd(pcp, fd);
if (pfdp->events != POLLREMOVE) {
+ uf_entry_gen_t gen;
+ file_t *fp = NULL;
+ struct pollhead *php = NULL;
- fp = NULL;
-
- if (pdp == NULL) {
- /*
- * If we're in epoll compatibility mode, check
- * that the fd is valid before allocating
- * anything for it; epoll semantics demand that
- * we return EBADF if our specified fd is
- * invalid.
- */
- if (is_epoll) {
- if ((fp = getf(fd)) == NULL) {
- error = EBADF;
- break;
- }
+ /*
+ * If we're in epoll compatibility mode, check that the
+ * fd is valid before allocating anything for it; epoll
+ * semantics demand that we return EBADF if our
+ * specified fd is invalid.
+ */
+ if (is_epoll) {
+ if ((fp = getf_gen(fd, &gen)) == NULL) {
+ error = EBADF;
+ break;
}
-
+ }
+ if (pdp == NULL) {
pdp = pcache_alloc_fd(0);
pdp->pd_fd = fd;
pdp->pd_pcache = pcp;
pcache_insert_fd(pcp, pdp, pollfdnum);
- } else {
+ }
+
+ if (is_epoll) {
/*
- * epoll semantics demand that we error out if
- * a file descriptor is added twice, which we
- * check (imperfectly) by checking if we both
- * have the file descriptor cached and the
- * file pointer that corresponds to the file
- * descriptor matches our cached value. If
- * there is a pointer mismatch, the file
- * descriptor was closed without being removed.
- * The converse is clearly not true, however,
- * so to narrow the window by which a spurious
- * EEXIST may be returned, we also check if
- * this fp has been added to an epoll control
- * descriptor in the past; if it hasn't, we
- * know that this is due to fp reuse -- it's
- * not a true EEXIST case. (By performing this
- * additional check, we limit the window of
- * spurious EEXIST to situations where a single
- * file descriptor is being used across two or
- * more epoll control descriptors -- and even
- * then, the file descriptor must be closed and
- * reused in a relatively tight time span.)
+ * If the fd is already a member of the epoll
+ * set, error emission is needed only when the
+ * fd assignment generation matches the one
+ * recorded in the polldat_t. Absence of such
+ * a generation match indicates that a new
+ * resource has been assigned at that fd.
+ *
+ * Caveat: It is possible to force a generation
+ * update while keeping the same backing
+ * resource. This is possible via dup2, but
+ * does not represent real-world use cases,
+ * making the lack of error acceptable.
*/
- if (is_epoll) {
- if (pdp->pd_fp != NULL &&
- (fp = getf(fd)) != NULL &&
- fp == pdp->pd_fp &&
- (fp->f_flag2 & FEPOLLED)) {
- error = EEXIST;
- releasef(fd);
- break;
- }
-
- /*
- * We have decided that the cached
- * information was stale: it either
- * didn't match, or the fp had never
- * actually been epoll()'d on before.
- * We need to now clear our pd_events
- * to assure that we don't mistakenly
- * operate on cached event disposition.
- */
- pdp->pd_events = 0;
+ if (pdp->pd_fp != NULL && pdp->pd_gen == gen) {
+ error = EEXIST;
+ releasef(fd);
+ break;
}
- }
- if (is_epoll) {
+ /*
+ * We have decided that the cached information
+ * was stale. Reset pd_events to assure that
+ * we don't mistakenly operate on cached event
+ * disposition. This configures the implicit
+ * subscription to HUP and ERR events which
+ * epoll features.
+ */
+ pdp->pd_events = POLLERR|POLLHUP;
+
epfdp = (dvpoll_epollfd_t *)pfdp;
pdp->pd_epolldata = epfdp->dpep_data;
}
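The generation check above guards against fd reuse, which is easy to provoke from userspace. A small self-contained demonstration (hedged: relies only on the usual lowest-available fd allocation):

	#include <fcntl.h>
	#include <unistd.h>
	#include <assert.h>

	int
	main(void)
	{
		int a = open("/dev/null", O_RDONLY);
		int b;

		(void) close(a);
		b = open("/dev/zero", O_RDONLY);

		/*
		 * The same fd number now names a different resource.
		 * A cached file pointer comparison alone cannot
		 * reliably detect this; the per-slot generation
		 * count can.
		 */
		assert(a == b);
		return (0);
	}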
@@ -928,39 +965,36 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
if (fd > pcp->pc_mapend) {
pcp->pc_mapend = fd;
}
- if (fp == NULL && (fp = getf(fd)) == NULL) {
- /*
- * The fd is not valid. Since we can't pass
- * this error back in the write() call, set
- * the bit in bitmap to force DP_POLL ioctl
- * to examine it.
- */
- BT_SET(pcp->pc_bitmap, fd);
- pdp->pd_events |= pfdp->events;
- continue;
- }
- /*
- * To (greatly) reduce EEXIST false positives, we
- * denote that this fp has been epoll()'d. We do this
- * regardless of epoll compatibility mode, as the flag
- * is harmless if not in epoll compatibility mode.
- */
- fp->f_flag2 |= FEPOLLED;
+ if (!is_epoll) {
+ ASSERT(fp == NULL);
- /*
- * Don't do VOP_POLL for an already cached fd with
- * same poll events.
- */
- if ((pdp->pd_events == pfdp->events) &&
- (pdp->pd_fp == fp)) {
+ if ((fp = getf_gen(fd, &gen)) == NULL) {
+ /*
+ * The fd is not valid. Since we can't
+ * pass this error back in the write()
+ * call, set the bit in bitmap to force
+ * DP_POLL ioctl to examine it.
+ */
+ BT_SET(pcp->pc_bitmap, fd);
+ pdp->pd_events |= pfdp->events;
+ continue;
+ }
/*
- * the events are already cached
+ * Don't do VOP_POLL for an already cached fd
+ * with same poll events.
*/
- releasef(fd);
- continue;
+ if ((pdp->pd_events == pfdp->events) &&
+ (pdp->pd_fp == fp)) {
+ /*
+ * the events are already cached
+ */
+ releasef(fd);
+ continue;
+ }
}
+
/*
* do VOP_POLL and cache this poll fd.
*/
@@ -992,11 +1026,11 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
* wake-ups.
*
* Drivers which never emit a pollhead will simply
- * disobey the exectation of edge-triggered behavior.
+ * disobey the expectation of edge-triggered behavior.
* This includes recursive epoll which, even on Linux,
* yields its events in a level-triggered fashion only.
*/
- if ((pdp->pd_events & POLLET) && error == 0 &&
+ if ((pfdp->events & POLLET) != 0 && error == 0 &&
php == NULL) {
short levent = 0;
@@ -1018,6 +1052,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
break;
}
pdp->pd_fp = fp;
+ pdp->pd_gen = gen;
pdp->pd_events |= pfdp->events;
if (php != NULL) {
if (pdp->pd_php == NULL) {
@@ -1143,8 +1178,13 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
* to turn it off for a particular open.
*/
dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
- mutex_exit(&dpep->dpe_lock);
+ /* Record the epoll-enabled nature in the pollcache too */
+ mutex_enter(&pcp->pc_lock);
+ pcp->pc_flag |= PC_EPOLL;
+ mutex_exit(&pcp->pc_lock);
+
+ mutex_exit(&dpep->dpe_lock);
return (0);
}
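For context, a consumer opts a /dev/poll handle into this epoll compatibility mode via ioctl before caching any fds. A hedged fragment (assuming DP_EPOLLCOMPAT is the ioctl command that sets DP_ISEPOLLCOMPAT above; headers and cleanup elided):

	int dpfd;

	if ((dpfd = open("/dev/poll", O_RDWR)) < 0)
		err(1, "open /dev/poll");

	/* Must be done before any fds are cached on this handle. */
	if (ioctl(dpfd, DP_EPOLLCOMPAT, 0) != 0)
		err(1, "DP_EPOLLCOMPAT");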
diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c
index cfe0f78415..00b5f0e3de 100644
--- a/usr/src/uts/common/io/dld/dld_drv.c
+++ b/usr/src/uts/common/io/dld/dld_drv.c
@@ -347,8 +347,8 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
if ((err = dls_devnet_hold_tmp(diap->dia_linkid, &dlh)) != 0)
return (err);
- if ((err = mac_perim_enter_by_macname(
- dls_devnet_mac(dlh), &mph)) != 0) {
+ if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh),
+ &mph)) != 0) {
dls_devnet_rele_tmp(dlh);
return (err);
}
@@ -360,7 +360,6 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
}
mac_sdu_get(dlp->dl_mh, NULL, &diap->dia_max_sdu);
-
dls_link_rele(dlp);
mac_perim_exit(mph);
dls_devnet_rele_tmp(dlh);
@@ -702,7 +701,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set,
err = EACCES;
goto done;
}
- err = dls_devnet_setzid(dlh, dzp->diz_zid);
+ err = dls_devnet_setzid(dlh, dzp->diz_zid,
+ dzp->diz_transient);
} else {
kprop->pr_perm_flags = MAC_PROP_PERM_RW;
(*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh);
@@ -717,8 +717,18 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set,
else
err = drv_ioc_clrap(linkid);
} else {
- if (kprop->pr_valsize == 0)
- return (ENOBUFS);
+ /*
+ * You might think that the earlier call to
+ * mac_prop_check_size() should catch this, but
+ * it can't. The autopush prop uses 0 as a
+ * sentinel value to clear the prop. This
+ * check ensures we don't allow a get with a
+ * valsize of 0.
+ */
+ if (kprop->pr_valsize == 0) {
+ err = ENOBUFS;
+ goto done;
+ }
kprop->pr_perm_flags = MAC_PROP_PERM_RW;
err = drv_ioc_getap(linkid, dlap);
@@ -866,7 +876,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
return (err);
if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2,
- dir->dir_link)) != 0)
+ dir->dir_link, dir->dir_zoneinit)) != 0)
return (err);
if (dir->dir_linkid2 == DATALINK_INVALID_LINKID)
@@ -1321,10 +1331,13 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred,
dls_link_t *dlp = NULL;
dld_ioc_gettran_t *dgt = karg;
- if ((ret = mac_perim_enter_by_linkid(dgt->dgt_linkid, &mph)) != 0)
+ if ((ret = dls_devnet_hold_tmp(dgt->dgt_linkid, &dlh)) != 0)
+ goto done;
+
+ if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0)
goto done;
- if ((ret = dls_devnet_hold_link(dgt->dgt_linkid, &dlh, &dlp)) != 0)
+ if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0)
goto done;
/*
@@ -1343,13 +1356,14 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred,
}
done:
- if (dlh != NULL && dlp != NULL) {
- dls_devnet_rele_link(dlh, dlp);
- }
+ if (dlp != NULL)
+ dls_link_rele(dlp);
- if (mph != NULL) {
+ if (mph != NULL)
mac_perim_exit(mph);
- }
+
+ if (dlh != NULL)
+ dls_devnet_rele_tmp(dlh);
return (ret);
}
@@ -1373,10 +1387,13 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred,
if (dti->dti_nbytes != 256 || dti->dti_off != 0)
return (EINVAL);
- if ((ret = mac_perim_enter_by_linkid(dti->dti_linkid, &mph)) != 0)
+ if ((ret = dls_devnet_hold_tmp(dti->dti_linkid, &dlh)) != 0)
+ goto done;
+
+ if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0)
goto done;
- if ((ret = dls_devnet_hold_link(dti->dti_linkid, &dlh, &dlp)) != 0)
+ if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0)
goto done;
/*
@@ -1396,13 +1413,14 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred,
}
done:
- if (dlh != NULL && dlp != NULL) {
- dls_devnet_rele_link(dlh, dlp);
- }
+ if (dlp != NULL)
+ dls_link_rele(dlp);
- if (mph != NULL) {
+ if (mph != NULL)
mac_perim_exit(mph);
- }
+
+ if (dlh != NULL)
+ dls_devnet_rele_tmp(dlh);
return (ret);
}
@@ -1499,7 +1517,6 @@ done:
return (ret);
}
-
/*
* Note that ioctls that modify links have a NULL di_priv_func(), as
* privileges can only be checked after we know the class of the link being
@@ -1575,7 +1592,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = {
{SIMNET_IOC, "simnet", 0, NULL, 0},
{BRIDGE_IOC, "bridge", 0, NULL, 0},
{IPTUN_IOC, "iptun", 0, NULL, 0},
- {IBPART_IOC, "ibp", -1, NULL, 0}
+ {IBPART_IOC, "ibp", -1, NULL, 0},
+ {OVERLAY_IOC, "overlay", 0, NULL, 0}
};
#define DLDIOC_CNT \
(sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t))
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index cadd2a76d3..1371fa47c0 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -42,7 +42,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req,
proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req,
proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req,
proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req,
- proto_notify_req, proto_passive_req;
+ proto_notify_req, proto_passive_req, proto_exclusive_req;
static void proto_capability_advertise(dld_str_t *, mblk_t *);
static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *);
@@ -122,6 +122,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp)
case DL_PASSIVE_REQ:
proto_passive_req(dsp, mp);
break;
+ case DL_EXCLUSIVE_REQ:
+ proto_exclusive_req(dsp, mp);
+ break;
default:
proto_req(dsp, mp);
break;
@@ -606,6 +609,14 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp)
new_flags |= DLS_PROMISC_PHYS;
break;
+ case DL_PROMISC_RX_ONLY:
+ new_flags |= DLS_PROMISC_RX_ONLY;
+ break;
+
+ case DL_PROMISC_FIXUPS:
+ new_flags |= DLS_PROMISC_FIXUPS;
+ break;
+
default:
dl_err = DL_NOTSUPPORTED;
goto failed2;
@@ -693,6 +704,22 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp)
new_flags &= ~DLS_PROMISC_PHYS;
break;
+ case DL_PROMISC_RX_ONLY:
+ if (!(dsp->ds_promisc & DLS_PROMISC_RX_ONLY)) {
+ dl_err = DL_NOTENAB;
+ goto failed2;
+ }
+ new_flags &= ~DLS_PROMISC_RX_ONLY;
+ break;
+
+ case DL_PROMISC_FIXUPS:
+ if (!(dsp->ds_promisc & DLS_PROMISC_FIXUPS)) {
+ dl_err = DL_NOTENAB;
+ goto failed2;
+ }
+ new_flags &= ~DLS_PROMISC_FIXUPS;
+ break;
+
default:
dl_err = DL_NOTSUPPORTED;
goto failed2;
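From a consumer's perspective these new levels compose with the existing ones. A hedged libdlpi sketch (the link name "net0" is hypothetical, and treating DL_PROMISC_RX_ONLY as an additional level to enable is an assumption based on the handling above):

	#include <libdlpi.h>
	#include <err.h>

	int
	main(void)
	{
		dlpi_handle_t dh;

		if (dlpi_open("net0", &dh, DLPI_RAW) != DLPI_SUCCESS)
			errx(1, "dlpi_open");

		/* Receive all SAPs, but don't loop back our own transmits. */
		if (dlpi_promiscon(dh, DL_PROMISC_SAP) != DLPI_SUCCESS ||
		    dlpi_promiscon(dh, DL_PROMISC_RX_ONLY) != DLPI_SUCCESS)
			errx(1, "dlpi_promiscon");

		dlpi_close(dh);
		return (0);
	}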
@@ -1184,7 +1211,6 @@ proto_unitdata_req(dld_str_t *dsp, mblk_t *mp)
uint16_t sap;
uint_t addr_length;
mblk_t *bp, *payload;
- uint32_t start, stuff, end, value, flags;
t_uscalar_t dl_err;
uint_t max_sdu;
@@ -1253,9 +1279,7 @@ proto_unitdata_req(dld_str_t *dsp, mblk_t *mp)
/*
* Transfer the checksum offload information if it is present.
*/
- hcksum_retrieve(payload, NULL, NULL, &start, &stuff, &end, &value,
- &flags);
- (void) hcksum_assoc(bp, NULL, NULL, start, stuff, end, value, flags, 0);
+ mac_hcksum_clone(payload, bp);
/*
* Link the payload onto the new header.
@@ -1296,7 +1320,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp)
* If we've already become active by issuing an active primitive,
* then it's too late to try to become passive.
*/
- if (dsp->ds_passivestate == DLD_ACTIVE) {
+ if (dsp->ds_passivestate == DLD_ACTIVE ||
+ dsp->ds_passivestate == DLD_EXCLUSIVE) {
dl_err = DL_OUTSTATE;
goto failed;
}
@@ -1350,12 +1375,20 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags)
ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+ if (dsp->ds_sap == ETHERTYPE_IPV6)
+ return (ENOTSUP);
+
switch (flags) {
case DLD_ENABLE:
dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf,
direct->di_rx_ch);
- direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+ if (direct->di_flags & DI_DIRECT_RAW) {
+ direct->di_tx_df =
+ (uintptr_t)str_mdata_raw_fastpath_put;
+ } else {
+ direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+ }
direct->di_tx_dh = dsp;
direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify;
direct->di_tx_cb_dh = dsp->ds_mch;
@@ -1377,24 +1410,22 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags)
}
/*
- * dld_capab_poll_enable()
- *
- * This function is misnamed. All polling and fanouts are run out of the
- * lower mac (in case of VNIC and the only mac in case of NICs). The
- * availability of Rx ring and promiscous mode is all taken care between
- * the soft ring set (mac_srs), the Rx ring, and S/W classifier. Any
- * fanout necessary is done by the soft rings that are part of the
- * mac_srs (by default mac_srs sends the packets up via a TCP and
- * non TCP soft ring).
+ * This function is misnamed. All polling and fanouts are run out of
+ * the lower MAC for VNICs and out of the MAC for NICs. The
+ * availability of Rx rings and promiscous mode is taken care of
+ * between the soft ring set (mac_srs), the Rx ring, and the SW
+ * classifier. Fanout, if necessary, is done by the soft rings that
+ * are part of the SRS. By default the SRS divvies up the packets
+ * based on protocol: TCP, UDP, or Other (OTH).
*
- * The mac_srs (or its associated soft rings) always store the ill_rx_ring
+ * The SRS (or its associated soft rings) always store the ill_rx_ring
* (the cookie returned when they registered with IP during plumb) as their
* 2nd argument which is passed up as mac_resource_handle_t. The upcall
* function and 1st argument is what the caller registered when they
* called mac_rx_classify_flow_add() to register the flow. For VNIC,
* the function is vnic_rx and argument is vnic_t. For regular NIC
* case, it mac_rx_default and mac_handle_t. As explained above, the
- * mac_srs (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t)
+ * SRS (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t)
* from its stored 2nd argument.
*/
static int
@@ -1407,11 +1438,11 @@ dld_capab_poll_enable(dld_str_t *dsp, dld_capab_poll_t *poll)
return (ENOTSUP);
/*
- * Enable client polling if and only if DLS bypass is possible.
- * Special cases like VLANs need DLS processing in the Rx data path.
- * In such a case we can neither allow the client (IP) to directly
- * poll the softring (since DLS processing hasn't been done) nor can
- * we allow DLS bypass.
+ * Enable client polling if and only if DLS bypass is
+ * possible. Some traffic requires DLS processing in the Rx
+ * data path. In such a case we can neither allow the client
+ * (IP) to directly poll the soft ring (since DLS processing
+ * hasn't been done) nor can we allow DLS bypass.
*/
if (!mac_rx_bypass_set(dsp->ds_mch, dsp->ds_rx, dsp->ds_rx_arg))
return (ENOTSUP);
@@ -1456,6 +1487,9 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags)
ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+ if (dsp->ds_sap == ETHERTYPE_IPV6)
+ return (ENOTSUP);
+
switch (flags) {
case DLD_ENABLE:
return (dld_capab_poll_enable(dsp, poll));
@@ -1466,12 +1500,34 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags)
}
static int
+dld_capab_ipcheck(dld_str_t *dsp, void *data, uint_t flags)
+{
+ dld_capab_ipcheck_t *ipc = data;
+
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+
+ switch (flags) {
+ case DLD_ENABLE:
+ ipc->ipc_allowed_df = (uintptr_t)mac_protect_check_addr;
+ ipc->ipc_allowed_dh = dsp->ds_mch;
+ return (0);
+ case DLD_DISABLE:
+ return (0);
+ }
+
+ return (ENOTSUP);
+}
+
+static int
dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags)
{
dld_capab_lso_t *lso = data;
ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+ if (dsp->ds_sap == ETHERTYPE_IPV6)
+ return (ENOTSUP);
+
switch (flags) {
case DLD_ENABLE: {
mac_capab_lso_t mac_lso;
@@ -1517,8 +1573,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags)
* completes. So we limit the check to DLD_ENABLE case.
*/
if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) &&
- (dsp->ds_sap != ETHERTYPE_IP ||
- !check_mod_above(dsp->ds_rq, "ip"))) {
+ (((dsp->ds_sap != ETHERTYPE_IP && dsp->ds_sap != ETHERTYPE_IPV6) ||
+ !check_mod_above(dsp->ds_rq, "ip")) &&
+ !check_mod_above(dsp->ds_rq, "vnd"))) {
return (ENOTSUP);
}
@@ -1539,6 +1596,10 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags)
err = dld_capab_lso(dsp, data, flags);
break;
+ case DLD_CAPAB_IPCHECK:
+ err = dld_capab_ipcheck(dsp, data, flags);
+ break;
+
default:
err = ENOTSUP;
break;
@@ -1600,9 +1661,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
}
/*
- * Direct capability negotiation interface between IP and DLD
+ * Direct capability negotiation interface between IP/VND and DLD. Note
+ * that for vnd we only allow the case where the media type is the
+ * native media type, so we know that no transformations of the mac
+ * header it receives would be required.
*/
- if (dsp->ds_sap == ETHERTYPE_IP && check_mod_above(dsp->ds_rq, "ip")) {
+ if (((dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) &&
+ check_mod_above(dsp->ds_rq, "ip")) ||
+ (check_mod_above(dsp->ds_rq, "vnd") &&
+ dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) {
dld_capable = B_TRUE;
subsize += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_dld_t);
@@ -1721,3 +1788,36 @@ dld_capabilities_disable(dld_str_t *dsp)
if (dsp->ds_polling)
(void) dld_capab_poll_disable(dsp, NULL);
}
+
+static void
+proto_exclusive_req(dld_str_t *dsp, mblk_t *mp)
+{
+ int ret = 0;
+ t_uscalar_t dl_err;
+ mac_perim_handle_t mph;
+
+ if (dsp->ds_passivestate != DLD_UNINITIALIZED) {
+ dl_err = DL_OUTSTATE;
+ goto failed;
+ }
+
+ if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) {
+ dl_err = DL_BADPRIM;
+ goto failed;
+ }
+
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+ ret = dls_exclusive_set(dsp, B_TRUE);
+ mac_perim_exit(mph);
+
+ if (ret != 0) {
+ dl_err = DL_SYSERR;
+ goto failed;
+ }
+
+ dsp->ds_passivestate = DLD_EXCLUSIVE;
+ dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ);
+ return;
+failed:
+ dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)ret);
+}
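A requester reaches this handler with a plain DLPI M_PROTO message. A hedged sketch of issuing the primitive over a DLPI stream fd (the dl_exclusive_req_t type name is an assumption; only DL_EXCLUSIVE_REQ and DL_EXCLUSIVE_REQ_SIZE appear above):

	#include <sys/dlpi.h>
	#include <stropts.h>
	#include <err.h>

	static void
	request_exclusive(int fd)
	{
		dl_exclusive_req_t req = { .dl_primitive = DL_EXCLUSIVE_REQ };
		struct strbuf ctl;

		ctl.maxlen = 0;
		ctl.len = DL_EXCLUSIVE_REQ_SIZE;
		ctl.buf = (caddr_t)&req;

		/* The DL_OK_ACK or DL_ERROR_ACK arrives via getmsg(). */
		if (putmsg(fd, &ctl, NULL, 0) < 0)
			err(1, "DL_EXCLUSIVE_REQ");
	}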
diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c
index 9f89165455..5efbe0576d 100644
--- a/usr/src/uts/common/io/dld/dld_str.c
+++ b/usr/src/uts/common/io/dld/dld_str.c
@@ -857,6 +857,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
return (mp);
}
+static boolean_t
+i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp)
+{
+ mblk_t *mp = *mpp;
+ mblk_t *newmp;
+ uint_t pri, vid, dvid;
+
+ dvid = mac_client_vid(dsp->ds_mch);
+
+ /*
+ * Discard the packet if this is a VLAN stream but the VID in
+ * the packet is not correct.
+ */
+ vid = VLAN_ID(mhip->mhi_tci);
+ if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
+ return (B_FALSE);
+
+ /*
+ * Discard the packet if this packet is a tagged packet
+ * but both pri and VID are 0.
+ */
+ pri = VLAN_PRI(mhip->mhi_tci);
+ if (mhip->mhi_istagged && !mhip->mhi_ispvid && pri == 0 &&
+ vid == VLAN_ID_NONE)
+ return (B_FALSE);
+
+ /*
+ * Update the priority bits to the per-stream priority if
+ * priority is not set in the packet. Update the VID for
+ * packets on a VLAN stream.
+ */
+ pri = (pri == 0) ? dsp->ds_pri : 0;
+ if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
+ if ((newmp = i_dld_ether_header_update_tag(mp, pri,
+ dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
+ return (B_FALSE);
+ }
+ *mpp = newmp;
+ }
+
+ return (B_TRUE);
+}
+
+mac_tx_cookie_t
+str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
+ uint16_t flag)
+{
+ boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
+ mac_header_info_t mhi;
+ mac_tx_cookie_t cookie;
+
+ if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
+ goto discard;
+
+ if (is_ethernet) {
+ if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE)
+ goto discard;
+ }
+
+ if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
+ DLD_SETQFULL(dsp);
+ }
+ return (cookie);
+discard:
+ /* TODO: bump kstat? */
+ freemsg(mp);
+ return (NULL);
+}
+
/*
* M_DATA put (IP fast-path mode)
*/
@@ -905,7 +976,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
mblk_t *bp, *newmp;
size_t size;
mac_header_info_t mhi;
- uint_t pri, vid, dvid;
uint_t max_sdu;
/*
@@ -951,38 +1021,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
goto discard;
if (is_ethernet) {
- dvid = mac_client_vid(dsp->ds_mch);
-
- /*
- * Discard the packet if this is a VLAN stream but the VID in
- * the packet is not correct.
- */
- vid = VLAN_ID(mhi.mhi_tci);
- if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
- goto discard;
-
- /*
- * Discard the packet if this packet is a tagged packet
- * but both pri and VID are 0.
- */
- pri = VLAN_PRI(mhi.mhi_tci);
- if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
- vid == VLAN_ID_NONE)
+ if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE)
goto discard;
-
- /*
- * Update the priority bits to the per-stream priority if
- * priority is not set in the packet. Update the VID for
- * packets on a VLAN stream.
- */
- pri = (pri == 0) ? dsp->ds_pri : 0;
- if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
- if ((newmp = i_dld_ether_header_update_tag(mp, pri,
- dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
- goto discard;
- }
- mp = newmp;
- }
}
if (DLD_TX(dsp, mp, 0, 0) != 0) {
diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c
index d6bc723371..b71d95bd44 100644
--- a/usr/src/uts/common/io/dls/dls.c
+++ b/usr/src/uts/common/io/dls/dls.c
@@ -171,16 +171,16 @@ dls_bind(dld_str_t *dsp, uint32_t sap)
/*
* The MAC layer does the VLAN demultiplexing and will only pass up
* untagged packets to non-promiscuous primary MAC clients. In order to
- * support the binding to the VLAN SAP which is required by DLPI, dls
+ * support binding to the VLAN SAP, which is required by DLPI, DLS
* needs to get a copy of all tagged packets when the client binds to
* the VLAN SAP. We do this by registering a separate promiscuous
- * callback for each dls client binding to that SAP.
+ * callback for each DLS client binding to that SAP.
*
* Note: even though there are two promiscuous handles in dld_str_t,
* ds_mph is for the regular promiscuous mode, ds_vlan_mph is the handle
- * to receive VLAN pkt when promiscuous mode is not on. Only one of
- * them can be non-NULL at the same time, to avoid receiving dup copies
- * of pkts.
+ * to receive VLAN traffic when promiscuous mode is not on. Only one of
+ * them can be non-NULL at the same time, to avoid receiving duplicate
+ * copies of packets.
*/
if (sap == ETHERTYPE_VLAN && dsp->ds_promisc == 0) {
int err;
@@ -250,19 +250,69 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
{
int err = 0;
uint32_t old_flags = dsp->ds_promisc;
+ uint32_t new_type = new_flags &
+ ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS);
mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL;
+ uint16_t mac_flags = 0;
+ boolean_t doremove = B_FALSE;
ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
ASSERT(!(new_flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI |
- DLS_PROMISC_PHYS)));
+ DLS_PROMISC_PHYS | DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)));
+
+ /*
+ * If only the non-data receive flags (DLS_PROMISC_RX_ONLY and
+ * DLS_PROMISC_FIXUPS) are set, or only they are changing, then there
+ * is nothing to do beyond updating the flags here: these bits alter
+ * what we do with promiscuous packets, not which packets we receive,
+ * so there is no need to talk to MAC.
+ */
+ if ((old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 &&
+ (new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0) {
+ dsp->ds_promisc = new_flags;
+ return (0);
+ }
/*
* If the user has only requested DLS_PROMISC_MULTI then we need to make
* sure that they don't see all packets.
*/
- if (new_flags == DLS_PROMISC_MULTI)
+ if (new_type == DLS_PROMISC_MULTI)
mptype = MAC_CLIENT_PROMISC_MULTI;
+ /*
+ * Look at new flags and figure out the correct mac promisc flags.
+ * If we've only requested DLS_PROMISC_SAP and not _MULTI or _PHYS,
+ * don't turn on physical promisc mode.
+ */
+ if (new_flags & DLS_PROMISC_RX_ONLY)
+ mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP;
+ if (new_flags & DLS_PROMISC_FIXUPS)
+ mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS;
+ if (new_type == DLS_PROMISC_SAP)
+ mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS;
+
+ /*
+ * If we're being asked to transition to a state where the only
+ * enabled DLS flags would be those that change what we do with
+ * promiscuous packets (DLS_PROMISC_RX_ONLY and DLS_PROMISC_FIXUPS)
+ * rather than which packets we receive, then we need to remove the
+ * MAC layer promiscuous handler.
+ */
+ if ((new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 &&
+ (old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) != 0 &&
+ new_flags != 0) {
+ doremove = B_TRUE;
+ }
+
+ /*
+ * There are three cases we care about here with respect to MAC: going
+ * from nothing to something, from something to nothing, and from
+ * something to something else that changes how we receive packets
+ * from MAC. In the last case, as long as the flags are not equal, we
+ * must assume something has changed and act on it.
+ */
if (dsp->ds_promisc == 0 && new_flags != 0) {
/*
* If only DLS_PROMISC_SAP, we don't turn on the
@@ -270,9 +320,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
*/
dsp->ds_promisc = new_flags;
err = mac_promisc_add(dsp->ds_mch, mptype,
- dls_rx_promisc, dsp, &dsp->ds_mph,
- (new_flags != DLS_PROMISC_SAP) ? 0 :
- MAC_PROMISC_FLAGS_NO_PHYS);
+ dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags);
if (err != 0) {
dsp->ds_promisc = old_flags;
return (err);
@@ -283,7 +331,8 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
mac_promisc_remove(dsp->ds_vlan_mph);
dsp->ds_vlan_mph = NULL;
}
- } else if (dsp->ds_promisc != 0 && new_flags == 0) {
+ } else if (dsp->ds_promisc != 0 &&
+ (new_flags == 0 || doremove == B_TRUE)) {
ASSERT(dsp->ds_mph != NULL);
mac_promisc_remove(dsp->ds_mph);
@@ -298,19 +347,13 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp,
&dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS);
}
- } else if (dsp->ds_promisc == DLS_PROMISC_SAP && new_flags != 0 &&
- new_flags != dsp->ds_promisc) {
- /*
- * If the old flag is PROMISC_SAP, but the current flag has
- * changed to some new non-zero value, we need to turn the
- * physical promiscuous mode.
- */
+ } else if (new_flags != 0 && new_flags != old_flags) {
ASSERT(dsp->ds_mph != NULL);
mac_promisc_remove(dsp->ds_mph);
/* Honors both after-remove and before-add semantics! */
dsp->ds_promisc = new_flags;
err = mac_promisc_add(dsp->ds_mch, mptype,
- dls_rx_promisc, dsp, &dsp->ds_mph, 0);
+ dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags);
if (err != 0)
dsp->ds_promisc = old_flags;
} else {
@@ -631,6 +674,22 @@ boolean_t
dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx,
void **ds_rx_arg, boolean_t loopback)
{
+ if (dsp->ds_promisc == 0) {
+ /*
+ * If there are active walkers of the mi_promisc_list when
+ * promiscuousness is disabled, ds_promisc will be cleared,
+ * but the DLS will remain on the mi_promisc_list until the
+ * walk is completed. If we do not recognize this case here,
+ * we won't properly execute the ds_promisc case in the common
+ * accept routine -- and we will potentially accept a packet
+ * that has originated with this DLS (which in turn can
+ * induce recursion and death by stack overflow). If
+ * ds_promisc is zero, we know that we are in this window --
+ * and we refuse to accept the packet.
+ */
+ return (B_FALSE);
+ }
+
return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE,
loopback));
}
@@ -652,8 +711,8 @@ dls_mac_active_set(dls_link_t *dlp)
/* request the primary MAC address */
if ((err = mac_unicast_add(dlp->dl_mch, NULL,
MAC_UNICAST_PRIMARY | MAC_UNICAST_TAG_DISABLE |
- MAC_UNICAST_DISABLE_TX_VID_CHECK, &dlp->dl_mah, 0,
- &diag)) != 0) {
+ MAC_UNICAST_DISABLE_TX_VID_CHECK, &dlp->dl_mah,
+ VLAN_ID_NONE, &diag)) != 0) {
return (err);
}
@@ -661,7 +720,10 @@ dls_mac_active_set(dls_link_t *dlp)
* Set the function to start receiving packets.
*/
mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp);
+ } else if (dlp->dl_exclusive == B_TRUE) {
+ return (EBUSY);
}
+
dlp->dl_nactive++;
return (0);
}
@@ -687,7 +749,11 @@ dls_active_set(dld_str_t *dsp)
if (dsp->ds_passivestate == DLD_PASSIVE)
return (0);
- /* If we're already active, then there's nothing more to do. */
+ if (dsp->ds_dlp->dl_exclusive == B_TRUE &&
+ dsp->ds_passivestate != DLD_EXCLUSIVE)
+ return (EBUSY);
+
+ /* If we're already active, we need to check the link's exclusivity */
if ((dsp->ds_nactive == 0) &&
((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) {
/* except for ENXIO all other errors are mapped to EBUSY */
@@ -696,7 +762,8 @@ dls_active_set(dld_str_t *dsp)
return (err);
}
- dsp->ds_passivestate = DLD_ACTIVE;
+ dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ?
+ DLD_EXCLUSIVE : DLD_ACTIVE;
dsp->ds_nactive++;
return (0);
}
@@ -727,7 +794,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all)
if (dsp->ds_nactive != 0)
return;
- ASSERT(dsp->ds_passivestate == DLD_ACTIVE);
+ ASSERT(dsp->ds_passivestate == DLD_ACTIVE ||
+ dsp->ds_passivestate == DLD_EXCLUSIVE);
dls_mac_active_clear(dsp->ds_dlp);
+ /*
+ * We verify below to ensure that no other part of DLS has mucked with
+ * our exclusive state.
+ */
+ if (dsp->ds_passivestate == DLD_EXCLUSIVE)
+ VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0);
dsp->ds_passivestate = DLD_UNINITIALIZED;
}
+
+int
+dls_exclusive_set(dld_str_t *dsp, boolean_t enable)
+{
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+
+ if (enable == B_FALSE) {
+ dsp->ds_dlp->dl_exclusive = B_FALSE;
+ return (0);
+ }
+
+ if (dsp->ds_dlp->dl_nactive != 0)
+ return (EBUSY);
+
+ dsp->ds_dlp->dl_exclusive = B_TRUE;
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c
index 6c8ffcb0a9..c792251052 100644
--- a/usr/src/uts/common/io/dls/dls_link.c
+++ b/usr/src/uts/common/io/dls/dls_link.c
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -30,11 +30,15 @@
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
+#include <sys/pattr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <sys/dld_impl.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
+#include <sys/sysevent.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/datalink.h>
static kmem_cache_t *i_dls_link_cachep;
mod_hash_t *i_dls_link_hash;
@@ -159,6 +163,18 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip,
uint16_t cvid, cpri;
int err;
+ /*
+ * If this message is from a same-machine sender, then
+ * there may be HW checksum offloads to emulate.
+ */
+ if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
+ mblk_t *tmpnext = mp->b_next;
+
+ mp->b_next = NULL;
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ mp->b_next = tmpnext;
+ }
+
DLS_PREPARE_PKT(dlp->dl_mh, mp, &cmhi, err);
if (err != 0)
break;
@@ -353,6 +369,22 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
int err, rval;
/*
+ * The mac_hw_emul() function, by design, doesn't predicate on
+ * HW_LOCAL_MAC. But since we are in Rx context we know that
+ * any LSO packet must also be from a same-machine sender. We
+ * take advantage of that and forgo writing a manual loop to
+ * predicate on HW_LOCAL_MAC.
+ *
+ * But for checksum emulation we need to predicate on
+ * HW_LOCAL_MAC to avoid calling mac_hw_emul() on packets that
+ * don't need it (thanks to the fact that HCK_IPV4_HDRCKSUM
+ * and HCK_IPV4_HDRCKSUM_OK use the same value). Therefore we
+ * do the checksum emulation in the second loop and in
+ * subchain matching.
+ */
+ mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL);
+
+ /*
* Walk the packet chain.
*/
for (; mp != NULL; mp = nextp) {
@@ -361,6 +393,18 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
*/
accepted = B_FALSE;
+ /*
+ * If this message is from a same-machine sender, then
+ * there may be HW checksum offloads to emulate.
+ */
+ if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
+ mblk_t *tmpnext = mp->b_next;
+
+ mp->b_next = NULL;
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ mp->b_next = tmpnext;
+ }
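The detach/emulate/reattach pattern above appears both here and in the subchain walk earlier in this file. A hedged refactoring sketch of the shared step (hypothetical helper name, same logic as the inline code):

	/*
	 * Emulate HW checksum offloads for one message that sits
	 * inside a larger b_next chain. mac_hw_emul() walks b_next
	 * itself, so the message is detached for the call and
	 * spliced back afterward.
	 */
	static void
	dls_cksum_emul_one(mblk_t **mpp)
	{
		mblk_t *mp = *mpp;

		if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
			mblk_t *tmpnext = mp->b_next;

			mp->b_next = NULL;
			mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
			mp->b_next = tmpnext;
			*mpp = mp;
		}
	}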
+
DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
if (err != 0) {
atomic_inc_32(&(dlp->dl_unknowns));
@@ -379,7 +423,16 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
vid = VLAN_ID(mhi.mhi_tci);
+ /*
+ * This condition is true only when a sun4v vsw client
+ * is on the scene; as it is the only type of client
+ * that multiplexes VLANs on a single client instance.
+ * All other types of clients have one VLAN per client
+ * instance. In that case, MAC strips the VLAN tag
+ * before delivering it to DLS (see mac_rx_deliver()).
+ */
if (mhi.mhi_istagged) {
/*
* If it is tagged traffic, send it upstream to
* all dld_str_t which are attached to the physical
@@ -554,7 +607,13 @@ dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
dls_head_t *dhp;
mod_hash_key_t key;
+ /*
+ * We expect to deal with only a single packet.
+ */
+ ASSERT3P(mp->b_next, ==, NULL);
+
DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
if (err != 0)
goto drop;
@@ -580,6 +639,67 @@ drop:
freemsg(mp);
}
+/*
+ * We'd like to notify via sysevents that a link state change has occurred.
+ * There are a couple of challenges associated with this. The first is that if
+ * the link is flapping a lot, we may not see an accurate state when we launch
+ * the notification: we're told that it changed, not what it changed to.
+ *
+ * The next problem is that all of the information that a user has associated
+ * with this device is the exact opposite of what we have on the dls_link_t. We
+ * have the name of the mac device, which has no bearing on what users see.
+ * Likewise, we don't have the datalink id either. So we're going to have to get
+ * this from dls.
+ *
+ * This is all further complicated by the fact that this could be going on in
+ * another thread at the same time as someone is tearing down the dls_link_t
+ * that we're associated with. We need to be careful not to grab the mac
+ * perimeter, otherwise we stand a good chance of deadlock.
+ */
+static void
+dls_link_notify(void *arg, mac_notify_type_t type)
+{
+ dls_link_t *dlp = arg;
+ dls_dl_handle_t dhp;
+ nvlist_t *nvp;
+ sysevent_t *event;
+ sysevent_id_t eid;
+
+ if (type != MAC_NOTE_LINK && type != MAC_NOTE_LOWLINK)
+ return;
+
+ /*
+ * If we can't find a devnet handle for this link, then there is no user
+ * knowable device for this at the moment and there's nothing we can
+ * really share with them that will make sense.
+ */
+ if (dls_devnet_hold_tmp_by_link(dlp, &dhp) != 0)
+ return;
+
+ /*
+ * Because we're attaching this nvlist_t to the sysevent, it'll get
+ * cleaned up when we call sysevent_free.
+ */
+ VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_int32(nvp, DATALINK_EV_LINK_ID,
+ dls_devnet_linkid(dhp)) == 0);
+ VERIFY(nvlist_add_string(nvp, DATALINK_EV_LINK_NAME,
+ dls_devnet_link(dhp)) == 0);
+ VERIFY(nvlist_add_int32(nvp, DATALINK_EV_ZONE_ID,
+ dls_devnet_getzid(dhp)) == 0);
+
+ dls_devnet_rele_tmp(dhp);
+
+ event = sysevent_alloc(EC_DATALINK, ESC_DATALINK_LINK_STATE,
+ ILLUMOS_KERN_PUB"dls", SE_SLEEP);
+ VERIFY(event != NULL);
+ (void) sysevent_attach_attributes(event, (sysevent_attr_list_t *)nvp);
+
+ (void) log_sysevent(event, SE_SLEEP, &eid);
+ sysevent_free(event);
+}
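On the receiving side, a privileged consumer can subscribe to EC_DATALINK and unpack these attributes. A hedged handler sketch (assuming libsysevent's sysevent_get_attr_list(), with subscription setup elided):

	#include <libsysevent.h>
	#include <libnvpair.h>

	static void
	datalink_state_handler(sysevent_t *ev)
	{
		nvlist_t *attrs;
		int32_t linkid, zid;
		char *name;

		if (sysevent_get_attr_list(ev, &attrs) != 0)
			return;

		if (nvlist_lookup_int32(attrs, DATALINK_EV_LINK_ID,
		    &linkid) == 0 &&
		    nvlist_lookup_string(attrs, DATALINK_EV_LINK_NAME,
		    &name) == 0 &&
		    nvlist_lookup_int32(attrs, DATALINK_EV_ZONE_ID,
		    &zid) == 0) {
			/*
			 * Re-query the current link state here; the
			 * event only says that it changed.
			 */
		}

		nvlist_free(attrs);
	}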
+
static void
i_dls_link_destroy(dls_link_t *dlp)
{
@@ -590,6 +710,9 @@ i_dls_link_destroy(dls_link_t *dlp)
/*
* Free the structure back to the cache.
*/
+ if (dlp->dl_mnh != NULL)
+ mac_notify_remove(dlp->dl_mnh, B_TRUE);
+
if (dlp->dl_mch != NULL)
mac_client_close(dlp->dl_mch, 0);
@@ -601,8 +724,10 @@ i_dls_link_destroy(dls_link_t *dlp)
dlp->dl_mh = NULL;
dlp->dl_mch = NULL;
dlp->dl_mip = NULL;
+ dlp->dl_mnh = NULL;
dlp->dl_unknowns = 0;
dlp->dl_nonip_cnt = 0;
+ dlp->dl_exclusive = B_FALSE;
kmem_cache_free(i_dls_link_cachep, dlp);
}
@@ -641,6 +766,8 @@ i_dls_link_create(const char *name, dls_link_t **dlpp)
if (err != 0)
goto bail;
+ dlp->dl_mnh = mac_notify_add(dlp->dl_mh, dls_link_notify, dlp);
+
DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *,
dlp->dl_mch);
diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c
index 05620698ca..f813acaac6 100644
--- a/usr/src/uts/common/io/dls/dls_mgmt.c
+++ b/usr/src/uts/common/io/dls/dls_mgmt.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2017 Joyent, Inc.
*/
/*
* Copyright (c) 2016 by Delphix. All rights reserved.
@@ -85,6 +86,14 @@ static door_handle_t dls_mgmt_dh = NULL;
/* dls_devnet_t dd_flags */
#define DD_CONDEMNED 0x1
#define DD_IMPLICIT_IPTUN 0x2 /* Implicitly-created ip*.*tun* tunnel */
+#define DD_INITIALIZING 0x4
+
+/*
+ * If the link is marked as initializing or condemned then it should
+ * not be visible outside of the DLS framework.
+ */
+#define DD_NOT_VISIBLE(flags) ( \
+ (flags & (DD_CONDEMNED | DD_INITIALIZING)) != 0)
/*
* This structure is used to keep the <linkid, macname> mapping.
@@ -108,13 +117,14 @@ typedef struct dls_devnet_s {
zoneid_t dd_zid; /* current zone */
boolean_t dd_prop_loaded;
taskqid_t dd_prop_taskid;
+ boolean_t dd_transient; /* link goes away when zone does */
} dls_devnet_t;
static int i_dls_devnet_create_iptun(const char *, const char *,
datalink_id_t *);
static int i_dls_devnet_destroy_iptun(datalink_id_t);
-static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t);
-static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t);
+static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t);
+static int dls_devnet_unset(mac_handle_t, datalink_id_t *, boolean_t);
/*ARGSUSED*/
static int
@@ -134,9 +144,9 @@ i_dls_devnet_destructor(void *buf, void *arg)
{
dls_devnet_t *ddp = buf;
- ASSERT(ddp->dd_ksp == NULL);
- ASSERT(ddp->dd_ref == 0);
- ASSERT(ddp->dd_tref == 0);
+ VERIFY(ddp->dd_ksp == NULL);
+ VERIFY(ddp->dd_ref == 0);
+ VERIFY(ddp->dd_tref == 0);
mutex_destroy(&ddp->dd_mutex);
cv_destroy(&ddp->dd_cv);
}
@@ -148,7 +158,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg)
dls_devnet_t *ddp;
if (dls_devnet_hold_tmp(linkid, &ddp) == 0) {
- (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID);
+ /*
+ * Don't bother moving transient links back to the global zone
+ * since we will simply delete them in dls_devnet_unset.
+ */
+ if (!ddp->dd_transient)
+ (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE);
dls_devnet_rele_tmp(ddp);
}
return (0);
@@ -529,6 +544,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid)
getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID;
(void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN);
+ getlinkid.ld_zoneid = getzoneid();
if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval,
sizeof (retval))) == 0) {
@@ -537,6 +553,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid)
return (err);
}
+int
+dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid,
+ zoneid_t zid)
+{
+ dlmgmt_door_getlinkid_t getlinkid;
+ dlmgmt_getlinkid_retval_t retval;
+ int err;
+
+ ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid());
+ getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID;
+ (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN);
+ getlinkid.ld_zoneid = zid;
+
+ if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval,
+ sizeof (retval))) == 0) {
+ *linkid = retval.lr_linkid;
+ }
+ return (err);
+}
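A hedged caller-side sketch of the zone-scoped lookup (the link name and zone id are hypothetical; note the constraint asserted above that non-global callers may only pass their own zone id):

	datalink_id_t linkid;
	zoneid_t zid = 5;	/* hypothetical delegated zone */

	/* Resolve "net0" as it is named inside zone 5. */
	if (dls_mgmt_get_linkid_in_zone("net0", &linkid, zid) != 0)
		return (ENOENT);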
+
datalink_id_t
dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class,
datalink_media_t dmedia, uint32_t flags)
@@ -736,13 +773,24 @@ dls_devnet_stat_update(kstat_t *ksp, int rw)
* Create the "link" kstats.
*/
static void
-dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid)
+dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid)
{
kstat_t *ksp;
+ char *nm;
+ char kname[MAXLINKNAMELEN];
+
+ if (zoneid != newzoneid) {
+ ASSERT(zoneid == GLOBAL_ZONEID);
+ (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid,
+ ddp->dd_linkname);
+ nm = kname;
+ } else {
+ nm = ddp->dd_linkname;
+ }
- if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid,
+ if (dls_stat_create("link", 0, nm, zoneid,
dls_devnet_stat_update, (void *)(uintptr_t)ddp->dd_linkid,
- &ksp) == 0) {
+ &ksp, newzoneid) == 0) {
ASSERT(ksp != NULL);
if (zoneid == ddp->dd_owner_zid) {
ASSERT(ddp->dd_ksp == NULL);
@@ -762,12 +810,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid)
{
if (zoneid == ddp->dd_owner_zid) {
if (ddp->dd_ksp != NULL) {
- kstat_delete(ddp->dd_ksp);
+ dls_stat_delete(ddp->dd_ksp);
ddp->dd_ksp = NULL;
}
} else {
if (ddp->dd_zone_ksp != NULL) {
- kstat_delete(ddp->dd_zone_ksp);
+ dls_stat_delete(ddp->dd_zone_ksp);
ddp->dd_zone_ksp = NULL;
}
}
@@ -778,24 +826,38 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid)
* and create the new set using the new name.
*/
static void
-dls_devnet_stat_rename(dls_devnet_t *ddp)
+dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit)
{
if (ddp->dd_ksp != NULL) {
- kstat_delete(ddp->dd_ksp);
+ dls_stat_delete(ddp->dd_ksp);
ddp->dd_ksp = NULL;
}
- /* We can't rename a link while it's assigned to a non-global zone. */
+ if (zoneinit && ddp->dd_zone_ksp != NULL) {
+ dls_stat_delete(ddp->dd_zone_ksp);
+ ddp->dd_zone_ksp = NULL;
+ }
+ /*
+ * We can't rename a link while it's assigned to a non-global zone
+ * unless we're first initializing the zone while readying it.
+ */
ASSERT(ddp->dd_zone_ksp == NULL);
- dls_devnet_stat_create(ddp, ddp->dd_owner_zid);
+ dls_devnet_stat_create(ddp, ddp->dd_owner_zid,
+ (zoneinit ? ddp->dd_zid : ddp->dd_owner_zid));
+ if (zoneinit)
+ dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid);
}
/*
- * Associate a linkid with a given link (identified by macname)
+ * Associate the linkid with the link identified by macname. If this
+ * is called on behalf of a physical link then linkid may be
+ * DATALINK_INVALID_LINKID. Otherwise, if called on behalf of a
+ * virtual link, linkid must have a value.
*/
static int
-dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid,
+dls_devnet_set(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid,
dls_devnet_t **ddpp)
{
+ const char *macname = mac_name(mh);
dls_devnet_t *ddp = NULL;
datalink_class_t class;
int err;
@@ -828,17 +890,41 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid,
}
/*
- * This might be a physical link that has already
- * been created, but which does not have a linkid
- * because dlmgmtd was not running when it was created.
+ * If we arrive here we know we are attempting to set
+ * the linkid on a physical link. A virtual link
+ * should never arrive here because it should never
+ * call this function without a linkid. Virtual links
+ * are created through dlgmtmd and thus we know
+ * dlmgmtd is alive to assign it a linkid (search for
+ * uses of dladm_create_datalink_id() to prove this to
+ * yourself); we don't have the same guarantee for a
+ * physical link which may perform an upcall for a
+ * linkid while dlmgmtd is down but will continue
+ * creating a devnet without the linkid (see
+ * softmac_create_datalink() to see how physical link
+ * creation works). That is why there is no entry in
+ * the id hash but there is one in the macname hash --
+ * softmac couldn't acquire a linkid the first time it
+ * called this function.
+ *
+ * Because of the check above, we also know that
+ * ddp->dd_linkid is not set. Following this, the link
+ * must still be in the DD_INITIALIZING state because
+ * that flag is removed IFF dd_linkid is set. This is
+ * why we can ASSERT the DD_INITIALIZING flag below if
+ * the call to i_dls_devnet_setzid() fails.
*/
if (linkid == DATALINK_INVALID_LINKID ||
class != DATALINK_CLASS_PHYS) {
err = EINVAL;
goto done;
}
+
+ ASSERT(ddp->dd_flags & DD_INITIALIZING);
+
} else {
ddp = kmem_cache_alloc(i_dls_devnet_cachep, KM_SLEEP);
+ ddp->dd_flags = DD_INITIALIZING;
ddp->dd_tref = 0;
ddp->dd_ref++;
ddp->dd_owner_zid = zoneid;
@@ -875,8 +961,19 @@ done:
rw_exit(&i_dls_devnet_lock);
if (err == 0) {
if (zoneid != GLOBAL_ZONEID &&
- (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0)
- (void) dls_devnet_unset(macname, &linkid, B_TRUE);
+ (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE,
+ B_FALSE)) != 0) {
+ /*
+ * At this point the link is marked as
+ * DD_INITIALIZING -- there can be no
+ * outstanding temp refs and therefore no need
+ * to wait for them.
+ */
+ ASSERT(ddp->dd_flags & DD_INITIALIZING);
+ (void) dls_devnet_unset(mh, &linkid, B_FALSE);
+ return (err);
+ }
+
/*
* The kstat subsystem holds its own locks (rather perimeter)
* before calling the ks_update (dls_devnet_stat_update) entry
@@ -884,20 +981,35 @@ done:
* lock hierarchy is kstat locks -> i_dls_devnet_lock.
*/
if (stat_create)
- dls_devnet_stat_create(ddp, zoneid);
+ dls_devnet_stat_create(ddp, zoneid, zoneid);
if (ddpp != NULL)
*ddpp = ddp;
+
+ mutex_enter(&ddp->dd_mutex);
+ if (linkid != DATALINK_INVALID_LINKID &&
+ !ddp->dd_prop_loaded && ddp->dd_prop_taskid == NULL) {
+ ddp->dd_prop_taskid = taskq_dispatch(system_taskq,
+ dls_devnet_prop_task, ddp, TQ_SLEEP);
+ }
+ mutex_exit(&ddp->dd_mutex);
+
}
return (err);
}
/*
- * Disassociate a linkid with a given link (identified by macname)
- * This waits until temporary references to the dls_devnet_t are gone.
+ * Disassociate the linkid from the link identified by macname. If
+ * wait is B_TRUE, wait until all temporary refs are released and the
+ * prop task is finished.
+ *
+ * If waiting then you SHOULD NOT call this from inside the MAC perim
+ * as deadlock will ensue. Otherwise, this function is safe to call
+ * from inside or outside the MAC perim.
*/
static int
-dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
+dls_devnet_unset(mac_handle_t mh, datalink_id_t *id, boolean_t wait)
{
+ const char *macname = mac_name(mh);
dls_devnet_t *ddp;
int err;
mod_hash_val_t val;
@@ -918,21 +1030,62 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
* deadlock. Return EBUSY if the asynchronous thread started for
* property loading as part of the post attach hasn't yet completed.
*/
- ASSERT(ddp->dd_ref != 0);
+ VERIFY(ddp->dd_ref != 0);
if ((ddp->dd_ref != 1) || (!wait &&
(ddp->dd_tref != 0 || ddp->dd_prop_taskid != 0))) {
- mutex_exit(&ddp->dd_mutex);
- rw_exit(&i_dls_devnet_lock);
- return (EBUSY);
+ int zstatus = 0;
+
+ /*
+ * There are a couple of alternatives that might be going on
+ * here; a) the zone is shutting down and it has a transient
+ * link assigned, in which case we want to clean it up instead
+ * of moving it back to the global zone, or b) it's possible
+ * that we're trying to clean up an orphaned vnic that was
+ * delegated to a zone and which wasn't cleaned up properly
+ * when the zone went away. Check for either of these cases
+ * before we simply return EBUSY.
+ *
+ * zstatus indicates which situation we are dealing with:
+ * 0 - means return EBUSY
+ * 1 - means case (a), clean up the transient link
+ * -1 - means case (b), orphaned VNIC
+ */
+ if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) {
+ zone_t *zp;
+
+ if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) {
+ zstatus = -1;
+ } else {
+ if (ddp->dd_transient) {
+ zone_status_t s = zone_status_get(zp);
+
+ if (s >= ZONE_IS_SHUTTING_DOWN)
+ zstatus = 1;
+ }
+ zone_rele(zp);
+ }
+ }
+
+ if (zstatus == 0) {
+ mutex_exit(&ddp->dd_mutex);
+ rw_exit(&i_dls_devnet_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * We want to delete the link, so reset dd_ref to 1.
+ */
+ if (zstatus == -1)
+ /* Log a warning, but continue in this case */
+ cmn_err(CE_WARN, "clear orphaned datalink: %s\n",
+ ddp->dd_linkname);
+ ddp->dd_ref = 1;
}
ddp->dd_flags |= DD_CONDEMNED;
ddp->dd_ref--;
*id = ddp->dd_linkid;
- if (ddp->dd_zid != GLOBAL_ZONEID)
- (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE);
-
/*
* Remove this dls_devnet_t from the hash table.
*/
@@ -947,18 +1100,40 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
}
rw_exit(&i_dls_devnet_lock);
+ /*
+ * It is important to call i_dls_devnet_setzid() WITHOUT the
+ * i_dls_devnet_lock held. The setzid call grabs the MAC
+ * perim; thus causing DLS -> MAC lock ordering if performed
+ * with the i_dls_devnet_lock held. This forces consumers to
+ * grab the MAC perim before calling dls_devnet_unset() (the
+ * locking rules state MAC -> DLS order). By performing the
+ * setzid outside of the i_dls_devnet_lock consumers can
+ * safely call dls_devnet_unset() outside the MAC perim.
+ */
+ if (ddp->dd_zid != GLOBAL_ZONEID) {
+ dls_devnet_stat_destroy(ddp, ddp->dd_zid);
+ (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE,
+ B_FALSE);
+ }
+
if (wait) {
/*
* Wait until all temporary references are released.
+ * The holders of the tref need the MAC perim to
+ * perform their work and release the tref. To avoid
+ * deadlock, assert that the perim is never held here.
*/
+ ASSERT0(MAC_PERIM_HELD(mh));
while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != 0))
cv_wait(&ddp->dd_cv, &ddp->dd_mutex);
} else {
- ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL);
+ VERIFY(ddp->dd_tref == 0);
+ VERIFY(ddp->dd_prop_taskid == NULL);
}
- if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID) {
dls_devnet_stat_destroy(ddp, ddp->dd_owner_zid);
+ }
ddp->dd_prop_loaded = B_FALSE;
ddp->dd_linkid = DATALINK_INVALID_LINKID;
@@ -969,6 +1144,39 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
return (0);
}
+/*
+ * This is a private hold routine used when we already have the dls_link_t, thus
+ * we know that it cannot go away.
+ */
+int
+dls_devnet_hold_tmp_by_link(dls_link_t *dlp, dls_dl_handle_t *ddhp)
+{
+ int err;
+ dls_devnet_t *ddp = NULL;
+
+ rw_enter(&i_dls_devnet_lock, RW_WRITER);
+ if ((err = mod_hash_find(i_dls_devnet_hash,
+ (mod_hash_key_t)dlp->dl_name, (mod_hash_val_t *)&ddp)) != 0) {
+ ASSERT(err == MH_ERR_NOTFOUND);
+ rw_exit(&i_dls_devnet_lock);
+ return (ENOENT);
+ }
+
+ mutex_enter(&ddp->dd_mutex);
+ VERIFY(ddp->dd_ref > 0);
+ if (DD_NOT_VISIBLE(ddp->dd_flags)) {
+ mutex_exit(&ddp->dd_mutex);
+ rw_exit(&i_dls_devnet_lock);
+ return (ENOENT);
+ }
+ ddp->dd_tref++;
+ mutex_exit(&ddp->dd_mutex);
+ rw_exit(&i_dls_devnet_lock);
+
+ *ddhp = ddp;
+ return (0);
+}
+
static int
dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp,
boolean_t tmp_hold)
@@ -985,8 +1193,8 @@ dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp,
}
mutex_enter(&ddp->dd_mutex);
- ASSERT(ddp->dd_ref > 0);
- if (ddp->dd_flags & DD_CONDEMNED) {
+ VERIFY(ddp->dd_ref > 0);
+ if (DD_NOT_VISIBLE(ddp->dd_flags)) {
mutex_exit(&ddp->dd_mutex);
rw_exit(&i_dls_devnet_lock);
return (ENOENT);
@@ -1053,8 +1261,8 @@ dls_devnet_hold_by_dev(dev_t dev, dls_dl_handle_t *ddhp)
return (ENOENT);
}
mutex_enter(&ddp->dd_mutex);
- ASSERT(ddp->dd_ref > 0);
- if (ddp->dd_flags & DD_CONDEMNED) {
+ VERIFY(ddp->dd_ref > 0);
+ if (DD_NOT_VISIBLE(ddp->dd_flags)) {
mutex_exit(&ddp->dd_mutex);
rw_exit(&i_dls_devnet_lock);
return (ENOENT);
@@ -1071,7 +1279,7 @@ void
dls_devnet_rele(dls_devnet_t *ddp)
{
mutex_enter(&ddp->dd_mutex);
- ASSERT(ddp->dd_ref > 1);
+ VERIFY(ddp->dd_ref > 1);
ddp->dd_ref--;
if ((ddp->dd_flags & DD_IMPLICIT_IPTUN) && ddp->dd_ref == 1) {
mutex_exit(&ddp->dd_mutex);
@@ -1083,7 +1291,7 @@ dls_devnet_rele(dls_devnet_t *ddp)
}
static int
-dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
+dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid)
{
char drv[MAXLINKNAMELEN];
uint_t ppa;
@@ -1093,7 +1301,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
dls_dev_handle_t ddh;
int err;
- if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0)
+ if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0)
return (dls_devnet_hold(linkid, ddpp));
/*
@@ -1236,9 +1444,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp)
*
* This case does not change the <link name, linkid> mapping, so the link's
* kstats need to be updated with using name associated the given id2.
+ *
+ * The zoneinit parameter is used to allow us to create a VNIC in the global
+ * zone which is assigned to a non-global zone. Since there is a race condition
+ * in the create process if two VNICs have the same name, we need to
+ * rename the VNIC after it has been assigned to the zone.
*/
int
-dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
+dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link,
+ boolean_t zoneinit)
{
dls_dev_handle_t ddh = NULL;
int err = 0;
@@ -1283,10 +1497,12 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
}
mutex_enter(&ddp->dd_mutex);
- if (ddp->dd_ref > 1) {
- mutex_exit(&ddp->dd_mutex);
- err = EBUSY;
- goto done;
+ if (!zoneinit) {
+ if (ddp->dd_ref > 1) {
+ mutex_exit(&ddp->dd_mutex);
+ err = EBUSY;
+ goto done;
+ }
}
mutex_exit(&ddp->dd_mutex);
@@ -1297,7 +1513,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
/* rename mac client name and its flow if exists */
if ((err = mac_open(ddp->dd_mac, &mh)) != 0)
goto done;
- (void) mac_rename_primary(mh, link);
+ if (zoneinit) {
+ char tname[MAXLINKNAMELEN];
+
+ (void) snprintf(tname, sizeof (tname), "z%d_%s",
+ ddp->dd_zid, link);
+ (void) mac_rename_primary(mh, tname);
+ } else {
+ (void) mac_rename_primary(mh, link);
+ }
mac_close(mh);
goto done;
}
@@ -1364,7 +1588,7 @@ done:
rw_exit(&i_dls_devnet_lock);
if (err == 0)
- dls_devnet_stat_rename(ddp);
+ dls_devnet_stat_rename(ddp, zoneinit);
if (mph != NULL)
mac_perim_exit(mph);
@@ -1373,7 +1597,8 @@ done:
}
static int
-i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop)
+i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop,
+ boolean_t transient)
{
int err;
mac_perim_handle_t mph;
@@ -1402,10 +1627,18 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop)
sizeof (retval));
if (err != 0)
goto done;
+
+ /*
+ * We set upcall_done only if the upcall is
+ * successful. This way, if dls_link_setzid() fails,
+ * we know another upcall must be done to reset the
+ * dlmgmtd state.
+ */
upcall_done = B_TRUE;
}
if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) {
ddp->dd_zid = new_zoneid;
+ ddp->dd_transient = transient;
devnet_need_rebuild = B_TRUE;
}
@@ -1420,7 +1653,7 @@ done:
}
int
-dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
+dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient)
{
dls_devnet_t *ddp;
int err;
@@ -1442,7 +1675,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
refheld = B_TRUE;
}
- if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) {
+ if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) {
if (refheld)
dls_devnet_rele(ddp);
return (err);
@@ -1459,7 +1692,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
if (old_zid != GLOBAL_ZONEID)
dls_devnet_stat_destroy(ddh, old_zid);
if (new_zid != GLOBAL_ZONEID)
- dls_devnet_stat_create(ddh, new_zid);
+ dls_devnet_stat_create(ddh, new_zid, new_zid);
return (0);
}
@@ -1497,15 +1730,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid)
* Access a vanity naming node.
*/
int
-dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
+dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp,
+ zoneid_t zid)
{
dls_devnet_t *ddp;
dls_link_t *dlp;
- zoneid_t zid = getzoneid();
+ zoneid_t czid = getzoneid();
int err;
mac_perim_handle_t mph;
- if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0)
+ if (czid != GLOBAL_ZONEID && czid != zid)
+ return (ENOENT);
+
+ if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0)
return (err);
dls_devnet_prop_task_wait(ddp);
@@ -1538,6 +1775,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
return (0);
}
+int
+dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
+{
+ return (dls_devnet_open_in_zone(link, dhp, devp, getzoneid()));
+}
+
/*
* Close access to a vanity naming node.
*/
@@ -1594,13 +1837,32 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid)
* we need to use the linkid to get the user name for the link
* when we create the MAC client.
*/
- if ((err = dls_devnet_set(mac_name(mh), linkid, zoneid, &ddp)) == 0) {
+ if ((err = dls_devnet_set(mh, linkid, zoneid, &ddp)) == 0) {
if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) {
mac_perim_exit(mph);
- (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE);
+ (void) dls_devnet_unset(mh, &linkid, B_FALSE);
return (err);
}
+
+ /*
+ * If dd_linkid is set then the link was successfully
+ * initialized. In this case we can remove the
+ * initializing flag and make the link visible to the
+ * rest of the system.
+ *
+ * If not set then we were called by softmac and it
+ * was unable to obtain a linkid for the physical link
+ * because dlmgmtd is down. In that case softmac will
+ * eventually obtain a linkid and call
+ * dls_devnet_recreate() to complete initialization.
+ */
+ mutex_enter(&ddp->dd_mutex);
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+ ddp->dd_flags &= ~DD_INITIALIZING;
+ mutex_exit(&ddp->dd_mutex);
}
+
mac_perim_exit(mph);
return (err);
}
@@ -1614,8 +1876,19 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid)
int
dls_devnet_recreate(mac_handle_t mh, datalink_id_t linkid)
{
- ASSERT(linkid != DATALINK_INVALID_LINKID);
- return (dls_devnet_set(mac_name(mh), linkid, GLOBAL_ZONEID, NULL));
+ dls_devnet_t *ddp;
+ int err;
+
+ VERIFY(linkid != DATALINK_INVALID_LINKID);
+ if ((err = dls_devnet_set(mh, linkid, GLOBAL_ZONEID, &ddp)) == 0) {
+ mutex_enter(&ddp->dd_mutex);
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+ ddp->dd_flags &= ~DD_INITIALIZING;
+ mutex_exit(&ddp->dd_mutex);
+ }
+
+ return (err);
}
int
@@ -1625,15 +1898,52 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait)
mac_perim_handle_t mph;
*idp = DATALINK_INVALID_LINKID;
- err = dls_devnet_unset(mac_name(mh), idp, wait);
- if (err != 0 && err != ENOENT)
+ err = dls_devnet_unset(mh, idp, wait);
+
+ /*
+ * We continue on in the face of ENOENT because the devnet
+ * unset and DLS link release are not atomic and we may have a
+ * scenario where there is no entry in i_dls_devnet_hash for
+ * the MAC name but there is an entry in i_dls_link_hash. For
+ * example, if the following occurred:
+ *
+ * 1. dls_devnet_unset() returns success, and
+ *
+ * 2. dls_link_rele_by_name() fails with ENOTEMPTY because
+ * flows still exist, and
+ *
+ * 3. dls_devnet_set() fails to set the zone id and calls
+ * dls_devnet_unset() -- leaving an entry in
+ * i_dls_link_hash but no corresponding entry in
+ * i_dls_devnet_hash.
+ *
+ * Even if #3 wasn't true the dls_devnet_set() may fail for
+ * different reasons in the future; the point is that it _can_
+ * fail as part of its contract. We can't rely on it working
+ * so we must assume that these two pieces of state (devnet
+ * and link hashes), which should always be in sync, can get
+ * out of sync and thus even if we get ENOENT from the devnet
+ * hash we should still try to delete from the link hash just
+ * in case.
+ *
+ * We could prevent the ENOTEMPTY from dls_link_rele_by_name()
+ * by calling mac_disable() before calling
+ * dls_devnet_destroy() but that's not currently possible due
+ * to a long-standing bug. OpenSolaris 6791335: The semantics
+ * of mac_disable() were modified by Crossbow such that
+ * dls_devnet_destroy() needs to be called before
+ * mac_disable() can succeed. This is because of the implicit
+ * reference that dls has on the mac_impl_t.
+ */
+ if (err != 0 && err != ENOENT) {
return (err);
+ }
mac_perim_enter_by_mh(mh, &mph);
err = dls_link_rele_by_name(mac_name(mh));
- mac_perim_exit(mph);
-
if (err != 0) {
+ dls_devnet_t *ddp;
+
/*
* XXX It is a general GLDv3 bug that dls_devnet_set() has to
* be called to re-set the link when destroy fails. The
@@ -1641,9 +1951,22 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait)
* called from kernel context or from a zone other than that
* which initially created the link.
*/
- (void) dls_devnet_set(mac_name(mh), *idp, crgetzoneid(CRED()),
- NULL);
+ (void) dls_devnet_set(mh, *idp, crgetzoneid(CRED()), &ddp);
+
+ /*
+ * You might think dd_linkid should always be set
+ * here, but in the case where dls_devnet_unset()
+ * returns ENOENT it will be DATALINK_INVALID_LINKID.
+ * Stay consistent with the rest of DLS and only
+ * remove the initializing flag if linkid is set.
+ */
+ mutex_enter(&ddp->dd_mutex);
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+ ddp->dd_flags &= ~DD_INITIALIZING;
+ mutex_exit(&ddp->dd_mutex);
}
+
+ mac_perim_exit(mph);
return (err);
}
@@ -1717,6 +2040,12 @@ i_dls_devnet_destroy_iptun(datalink_id_t linkid)
}
const char *
+dls_devnet_link(dls_dl_handle_t ddh)
+{
+ return (ddh->dd_linkname);
+}
+
+const char *
dls_devnet_mac(dls_dl_handle_t ddh)
{
return (ddh->dd_mac);
diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c
index 51e4be7260..82dceff278 100644
--- a/usr/src/uts/common/io/dls/dls_stat.c
+++ b/usr/src/uts/common/io/dls/dls_stat.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
/*
@@ -30,30 +31,33 @@
#include <sys/dld_impl.h>
#include <sys/mac_ether.h>
-static mac_stat_info_t i_dls_si[] = {
- { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32,
- (uint64_t)LINK_STATE_UNKNOWN}
-};
-
-#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0]))
+/*
+ * structure for link kstats
+ */
+typedef struct {
+ kstat_named_t dk_ifspeed;
+ kstat_named_t dk_multircv;
+ kstat_named_t dk_brdcstrcv;
+ kstat_named_t dk_multixmt;
+ kstat_named_t dk_brdcstxmt;
+ kstat_named_t dk_norcvbuf;
+ kstat_named_t dk_ierrors;
+ kstat_named_t dk_noxmtbuf;
+ kstat_named_t dk_oerrors;
+ kstat_named_t dk_collisions;
+ kstat_named_t dk_rbytes;
+ kstat_named_t dk_ipackets;
+ kstat_named_t dk_obytes;
+ kstat_named_t dk_opackets;
+ kstat_named_t dk_rbytes64;
+ kstat_named_t dk_ipackets64;
+ kstat_named_t dk_obytes64;
+ kstat_named_t dk_opackets64;
+ kstat_named_t dk_link_state;
+ kstat_named_t dk_link_duplex;
+ kstat_named_t dk_unknowns;
+ kstat_named_t dk_zonename;
+} dls_kstat_t;
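+
+/*
+ * Note: dls_stat_create() below sizes the kstat as
+ * sizeof (dls_kstat_t) / sizeof (kstat_named_t) named entries, so this
+ * structure must contain only kstat_named_t members.
+ */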
/*
* Exported functions.
@@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = {
int
dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw)
{
- kstat_named_t *knp;
- uint_t i;
- uint64_t val;
+ dls_kstat_t *dkp = ksp->ks_data;
if (rw != KSTAT_READ)
return (EACCES);
- knp = (kstat_named_t *)ksp->ks_data;
- for (i = 0; i < STAT_INFO_COUNT; i++) {
- val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat);
-
- switch (i_dls_si[i].msi_type) {
- case KSTAT_DATA_UINT64:
- knp->value.ui64 = val;
- break;
- case KSTAT_DATA_UINT32:
- knp->value.ui32 = (uint32_t)val;
- break;
- default:
- ASSERT(B_FALSE);
- }
-
- knp++;
- }
+ dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED);
+ dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_MULTIRCV);
+ dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_BRDCSTRCV);
+ dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_MULTIXMT);
+ dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_BRDCSTXMT);
+ dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_NORCVBUF);
+ dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS);
+ dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_NOXMTBUF);
+ dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS);
+ dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_COLLISIONS);
+ dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES);
+ dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_IPACKETS);
+ dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES);
+ dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_OPACKETS);
+ dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES);
+ dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_IPACKETS);
+ dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES);
+ dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_OPACKETS);
+ dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_LINK_STATE);
/*
* Ethernet specific kstat "link_duplex"
*/
if (dlp->dl_mip->mi_nativemedia != DL_ETHER) {
- knp->value.ui32 = LINK_DUPLEX_UNKNOWN;
+ dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN;
} else {
- val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX);
- knp->value.ui32 = (uint32_t)val;
+ dkp->dk_link_duplex.value.ui32 =
+ (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX);
}
- knp++;
- knp->value.ui32 = dlp->dl_unknowns;
+
+ dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns;
return (0);
}
@@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw)
int
dls_stat_create(const char *module, int instance, const char *name,
zoneid_t zoneid, int (*update)(struct kstat *, int), void *private,
- kstat_t **kspp)
+ kstat_t **kspp, zoneid_t newzoneid)
{
kstat_t *ksp;
- kstat_named_t *knp;
- uint_t i;
+ zone_t *zone;
+ dls_kstat_t *dkp;
if ((ksp = kstat_create_zone(module, instance, name, "net",
- KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) {
+ KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) {
return (EINVAL);
}
ksp->ks_update = update;
ksp->ks_private = private;
+ dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP);
+ if ((zone = zone_find_by_id(newzoneid)) != NULL) {
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ }
- knp = (kstat_named_t *)ksp->ks_data;
- for (i = 0; i < STAT_INFO_COUNT; i++) {
- kstat_named_init(knp, i_dls_si[i].msi_name,
- i_dls_si[i].msi_type);
- knp++;
+ kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_link_duplex, "link_duplex",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING);
+
+ if (zone != NULL) {
+ kstat_named_setstr(&dkp->dk_zonename, zone->zone_name);
+ zone_rele(zone);
}
- kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32);
- kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32);
kstat_install(ksp);
*kspp = ksp;
return (0);
}
+
+void
+dls_stat_delete(kstat_t *ksp)
+{
+ void *data;
+ if (ksp != NULL) {
+ data = ksp->ks_data;
+ kstat_delete(ksp);
+ kmem_free(data, sizeof (dls_kstat_t));
+ }
+}
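For context (an illustration, not part of the change itself), a minimal
user-land sketch of reading the new "zonename" string stat through libkstat;
the "link:0:net0" kstat triple is a hypothetical placeholder, and the program
assumes compilation with -lkstat:

#include <stdio.h>
#include <kstat.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);
	/* hypothetical module:instance:name triple for a link kstat */
	if ((ksp = kstat_lookup(kc, "link", 0, "net0")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (1);
	}
	/* "zonename" is the KSTAT_DATA_STRING installed by dls_stat_create() */
	if ((kn = kstat_data_lookup(ksp, "zonename")) != NULL)
		(void) printf("zonename: %s\n", KSTAT_NAMED_STR_PTR(kn));
	(void) kstat_close(kc);
	return (0);
}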
diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE
new file mode 100644
index 0000000000..00aefb6f51
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE
@@ -0,0 +1,32 @@
+/*
+ * MegaRAID device driver for SAS2.0 controllers
+ * Copyright (c) 2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip
new file mode 100644
index 0000000000..ac6d2d1b15
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+DR_SAS DRIVER
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c
new file mode 100644
index 0000000000..02354c9b16
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.c
@@ -0,0 +1,5510 @@
+/*
+ * dr_sas.c: source for dr_sas driver
+ *
+ * MegaRAID device driver for SAS2.0 controllers
+ * Copyright (c) 2008-2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Version:
+ * Author:
+ * Arun Chandrashekhar
+ * Manju R
+ * Rajesh Prabhakaran
+ * Seokmann Ju
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/cred.h>
+#include <sys/modctl.h>
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/stat.h>
+#include <sys/mkdev.h>
+#include <sys/pci.h>
+#include <sys/scsi/scsi.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/atomic.h>
+#include <sys/signal.h>
+#include <sys/fs/dv_node.h> /* devfs_clean */
+
+#include "dr_sas.h"
+
+/*
+ * FMA header files
+ */
+#include <sys/ddifm.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/fm/io/ddi.h>
+
+/*
+ * Local static data
+ */
+static void *drsas_state = NULL;
+static int debug_level_g = CL_NONE;
+
+#pragma weak scsi_hba_open
+#pragma weak scsi_hba_close
+#pragma weak scsi_hba_ioctl
+
+static ddi_dma_attr_t drsas_generic_dma_attr = {
+ DMA_ATTR_V0, /* dma_attr_version */
+ 0, /* low DMA address range */
+ 0xFFFFFFFFU, /* high DMA address range */
+ 0xFFFFFFFFU, /* DMA counter register */
+ 8, /* DMA address alignment */
+ 0x07, /* DMA burstsizes */
+ 1, /* min DMA size */
+ 0xFFFFFFFFU, /* max DMA size */
+ 0xFFFFFFFFU, /* segment boundary */
+ DRSAS_MAX_SGE_CNT, /* dma_attr_sglen */
+ 512, /* granularity of device */
+ 0 /* bus specific DMA flags */
+};
+
+int32_t drsas_max_cap_maxxfer = 0x1000000;
+
+/*
+ * cb_ops contains base level routines
+ */
+static struct cb_ops drsas_cb_ops = {
+ drsas_open, /* open */
+ drsas_close, /* close */
+ nodev, /* strategy */
+ nodev, /* print */
+ nodev, /* dump */
+ nodev, /* read */
+ nodev, /* write */
+ drsas_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ nodev, /* cb_prop_op */
+ 0, /* streamtab */
+ D_NEW | D_HOTPLUG, /* cb_flag */
+ CB_REV, /* cb_rev */
+ nodev, /* cb_aread */
+ nodev /* cb_awrite */
+};
+
+/*
+ * dev_ops contains configuration routines
+ */
+static struct dev_ops drsas_ops = {
+ DEVO_REV, /* rev, */
+ 0, /* refcnt */
+ drsas_getinfo, /* getinfo */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ drsas_attach, /* attach */
+ drsas_detach, /* detach */
+ drsas_reset, /* reset */
+ &drsas_cb_ops, /* char/block ops */
+ NULL, /* bus ops */
+ NULL, /* power */
+ ddi_quiesce_not_supported, /* quiesce */
+};
+
+char _depends_on[] = "misc/scsi";
+
+static struct modldrv modldrv = {
+ &mod_driverops, /* module type - driver */
+ DRSAS_VERSION,
+ &drsas_ops, /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, /* ml_rev - must be MODREV_1 */
+ &modldrv, /* ml_linkage */
+ NULL /* end of driver linkage */
+};
+
+static struct ddi_device_acc_attr endian_attr = {
+ DDI_DEVICE_ATTR_V0,
+ DDI_STRUCTURE_LE_ACC,
+ DDI_STRICTORDER_ACC
+};
+
+
+/*
+ * ************************************************************************** *
+ * *
+ * common entry points - for loadable kernel modules *
+ * *
+ * ************************************************************************** *
+ */
+
+int
+_init(void)
+{
+ int ret;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ ret = ddi_soft_state_init(&drsas_state,
+ sizeof (struct drsas_instance), 0);
+
+ if (ret != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state"));
+ return (ret);
+ }
+
+ if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba"));
+ ddi_soft_state_fini(&drsas_state);
+ return (ret);
+ }
+
+ ret = mod_install(&modlinkage);
+
+ if (ret != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed"));
+ scsi_hba_fini(&modlinkage);
+ ddi_soft_state_fini(&drsas_state);
+ }
+
+ return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int ret;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS)
+ return (ret);
+
+ scsi_hba_fini(&modlinkage);
+
+ ddi_soft_state_fini(&drsas_state);
+
+ return (ret);
+}
+
+
+/*
+ * ************************************************************************** *
+ * *
+ * common entry points - for autoconfiguration *
+ * *
+ * ************************************************************************** *
+ */
+
+static int
+drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int instance_no;
+ int nregs;
+ uint8_t added_isr_f = 0;
+ uint8_t added_soft_isr_f = 0;
+ uint8_t create_devctl_node_f = 0;
+ uint8_t create_scsi_node_f = 0;
+ uint8_t create_ioc_node_f = 0;
+ uint8_t tran_alloc_f = 0;
+ uint8_t irq;
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint16_t subsysvid;
+ uint16_t subsysid;
+ uint16_t command;
+ off_t reglength = 0;
+ int intr_types = 0;
+ char *data;
+ int msi_enable = 0;
+
+ scsi_hba_tran_t *tran;
+ ddi_dma_attr_t tran_dma_attr;
+ struct drsas_instance *instance;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* CONSTCOND */
+ ASSERT(NO_COMPETING_THREADS);
+
+ instance_no = ddi_get_instance(dip);
+
+ /*
+ * check to see whether this device is in a DMA-capable slot.
+ */
+ if (ddi_slaveonly(dip) == DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: Device in slave-only slot, unused",
+ instance_no));
+ return (DDI_FAILURE);
+ }
+
+ switch (cmd) {
+ case DDI_ATTACH:
+ con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH"));
+ /* allocate the soft state for the instance */
+ if (ddi_soft_state_zalloc(drsas_state, instance_no)
+ != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: Failed to allocate soft state",
+ instance_no));
+
+ return (DDI_FAILURE);
+ }
+
+ instance = (struct drsas_instance *)ddi_get_soft_state
+ (drsas_state, instance_no);
+
+ if (instance == NULL) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: Bad soft state", instance_no));
+
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ return (DDI_FAILURE);
+ }
+
+ bzero((caddr_t)instance,
+ sizeof (struct drsas_instance));
+
+ instance->func_ptr = kmem_zalloc(
+ sizeof (struct drsas_func_ptr), KM_SLEEP);
+ ASSERT(instance->func_ptr);
+
+ /* Setup the PCI configuration space handles */
+ if (pci_config_setup(dip, &instance->pci_handle) !=
+ DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: pci config setup failed ",
+ instance_no));
+
+ kmem_free(instance->func_ptr,
+ sizeof (struct drsas_func_ptr));
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to get registers."));
+
+ pci_config_teardown(&instance->pci_handle);
+ kmem_free(instance->func_ptr,
+ sizeof (struct drsas_func_ptr));
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ return (DDI_FAILURE);
+ }
+
+ vendor_id = pci_config_get16(instance->pci_handle,
+ PCI_CONF_VENID);
+ device_id = pci_config_get16(instance->pci_handle,
+ PCI_CONF_DEVID);
+
+ subsysvid = pci_config_get16(instance->pci_handle,
+ PCI_CONF_SUBVENID);
+ subsysid = pci_config_get16(instance->pci_handle,
+ PCI_CONF_SUBSYSID);
+
+ pci_config_put16(instance->pci_handle, PCI_CONF_COMM,
+ (pci_config_get16(instance->pci_handle,
+ PCI_CONF_COMM) | PCI_COMM_ME));
+ irq = pci_config_get8(instance->pci_handle,
+ PCI_CONF_ILINE);
+
+ con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: "
+ "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s",
+ instance_no, vendor_id, device_id, subsysvid,
+ subsysid, irq, DRSAS_VERSION));
+
+ /* enable bus-mastering */
+ command = pci_config_get16(instance->pci_handle,
+ PCI_CONF_COMM);
+
+ if (!(command & PCI_COMM_ME)) {
+ command |= PCI_COMM_ME;
+
+ pci_config_put16(instance->pci_handle,
+ PCI_CONF_COMM, command);
+
+ con_log(CL_ANN, (CE_CONT, "dr_sas%d: "
+ "enable bus-mastering", instance_no));
+ } else {
+ con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: "
+ "bus-mastering already set", instance_no));
+ }
+
+ /* initialize function pointers */
+ if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) ||
+ (device_id == PCI_DEVICE_ID_LSI_2108V)) {
+ con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: "
+ "2108V/DE detected", instance_no));
+ instance->func_ptr->read_fw_status_reg =
+ read_fw_status_reg_ppc;
+ instance->func_ptr->issue_cmd = issue_cmd_ppc;
+ instance->func_ptr->issue_cmd_in_sync_mode =
+ issue_cmd_in_sync_mode_ppc;
+ instance->func_ptr->issue_cmd_in_poll_mode =
+ issue_cmd_in_poll_mode_ppc;
+ instance->func_ptr->enable_intr =
+ enable_intr_ppc;
+ instance->func_ptr->disable_intr =
+ disable_intr_ppc;
+ instance->func_ptr->intr_ack = intr_ack_ppc;
+ } else {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: Invalid device detected"));
+
+ pci_config_teardown(&instance->pci_handle);
+ kmem_free(instance->func_ptr,
+ sizeof (struct drsas_func_ptr));
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ return (DDI_FAILURE);
+ }
+
+ instance->baseaddress = pci_config_get32(
+ instance->pci_handle, PCI_CONF_BASE0);
+ instance->baseaddress &= 0x0fffc;
+
+ instance->dip = dip;
+ instance->vendor_id = vendor_id;
+ instance->device_id = device_id;
+ instance->subsysvid = subsysvid;
+ instance->subsysid = subsysid;
+ instance->instance = instance_no;
+
+ /* Initialize FMA */
+ instance->fm_capabilities = ddi_prop_get_int(
+ DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS,
+ "fm-capable", DDI_FM_EREPORT_CAPABLE |
+ DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE
+ | DDI_FM_ERRCB_CAPABLE);
+
+ drsas_fm_init(instance);
+
+ /* Initialize Interrupts */
+ if ((ddi_dev_regsize(instance->dip,
+ REGISTER_SET_IO_2108, &reglength) != DDI_SUCCESS) ||
+ reglength < MINIMUM_MFI_MEM_SZ) {
+ return (DDI_FAILURE);
+ }
+ if (reglength > DEFAULT_MFI_MEM_SZ) {
+ reglength = DEFAULT_MFI_MEM_SZ;
+ con_log(CL_DLEVEL1, (CE_NOTE,
+ "dr_sas: register length to map is "
+ "0x%lx bytes", reglength));
+ }
+ if (ddi_regs_map_setup(instance->dip,
+ REGISTER_SET_IO_2108, &instance->regmap, 0,
+ reglength, &endian_attr, &instance->regmap_handle)
+ != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: couldn't map control registers"));
+ goto fail_attach;
+ }
+
+ /*
+ * Disable Interrupt Now.
+ * Setup Software interrupt
+ */
+ instance->func_ptr->disable_intr(instance);
+
+ msi_enable = 0;
+ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0,
+ "drsas-enable-msi", &data) == DDI_SUCCESS) {
+ if (strncmp(data, "yes", 3) == 0) {
+ msi_enable = 1;
+ con_log(CL_ANN, (CE_WARN,
+ "msi_enable = %d ENABLED",
+ msi_enable));
+ }
+ ddi_prop_free(data);
+ }
+
+ con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d",
+ msi_enable));
+
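+ /*
+ * For reference, a hypothetical dr_sas.conf entry that enables
+ * MSI/MSI-X via the property lookup above:
+ *
+ * drsas-enable-msi="yes";
+ *
+ * Any value beginning with "yes" enables it, per the strncmp()
+ * check above.
+ */
+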
+ /* Check for all supported interrupt types */
+ if (ddi_intr_get_supported_types(
+ dip, &intr_types) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "ddi_intr_get_supported_types() failed"));
+ goto fail_attach;
+ }
+
+ con_log(CL_DLEVEL1, (CE_NOTE,
+ "ddi_intr_get_supported_types() ret: 0x%x",
+ intr_types));
+
+ /* Initialize and Setup Interrupt handler */
+ if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) {
+ if (drsas_add_intrs(instance,
+ DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "MSIX interrupt query failed"));
+ goto fail_attach;
+ }
+ instance->intr_type = DDI_INTR_TYPE_MSIX;
+ } else if (msi_enable && (intr_types &
+ DDI_INTR_TYPE_MSI)) {
+ if (drsas_add_intrs(instance,
+ DDI_INTR_TYPE_MSI) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "MSI interrupt query failed"));
+ goto fail_attach;
+ }
+ instance->intr_type = DDI_INTR_TYPE_MSI;
+ } else if (intr_types & DDI_INTR_TYPE_FIXED) {
+ msi_enable = 0;
+ if (drsas_add_intrs(instance,
+ DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "FIXED interrupt query failed"));
+ goto fail_attach;
+ }
+ instance->intr_type = DDI_INTR_TYPE_FIXED;
+ } else {
+ con_log(CL_ANN, (CE_WARN, "Device cannot "
+ "suppport either FIXED or MSI/X "
+ "interrupts"));
+ goto fail_attach;
+ }
+
+ added_isr_f = 1;
+
+ /* setup the mfi based low level driver */
+ if (init_mfi(instance) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: "
+ "could not initialize the low level driver"));
+
+ goto fail_attach;
+ }
+
+ /* Initialize all Mutex */
+ INIT_LIST_HEAD(&instance->completed_pool_list);
+ mutex_init(&instance->completed_pool_mtx,
+ "completed_pool_mtx", MUTEX_DRIVER,
+ DDI_INTR_PRI(instance->intr_pri));
+
+ mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx",
+ MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri));
+ cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL);
+
+ mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx",
+ MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri));
+
+ /* Register our soft-isr for highlevel interrupts. */
+ instance->isr_level = instance->intr_pri;
+ if (instance->isr_level == HIGH_LEVEL_INTR) {
+ if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH,
+ &instance->soft_intr_id, NULL, NULL,
+ drsas_softintr, (caddr_t)instance) !=
+ DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ " Software ISR did not register"));
+
+ goto fail_attach;
+ }
+
+ added_soft_isr_f = 1;
+ }
+
+ /* Allocate a transport structure */
+ tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP);
+
+ if (tran == NULL) {
+ con_log(CL_ANN, (CE_WARN,
+ "scsi_hba_tran_alloc failed"));
+ goto fail_attach;
+ }
+
+ tran_alloc_f = 1;
+
+ instance->tran = tran;
+
+ tran->tran_hba_private = instance;
+ tran->tran_tgt_init = drsas_tran_tgt_init;
+ tran->tran_tgt_probe = scsi_hba_probe;
+ tran->tran_tgt_free = drsas_tran_tgt_free;
+ tran->tran_init_pkt = drsas_tran_init_pkt;
+ tran->tran_start = drsas_tran_start;
+ tran->tran_abort = drsas_tran_abort;
+ tran->tran_reset = drsas_tran_reset;
+ tran->tran_getcap = drsas_tran_getcap;
+ tran->tran_setcap = drsas_tran_setcap;
+ tran->tran_destroy_pkt = drsas_tran_destroy_pkt;
+ tran->tran_dmafree = drsas_tran_dmafree;
+ tran->tran_sync_pkt = drsas_tran_sync_pkt;
+ tran->tran_bus_config = drsas_tran_bus_config;
+
+ tran_dma_attr = drsas_generic_dma_attr;
+ tran_dma_attr.dma_attr_sgllen = instance->max_num_sge;
+
+ /* Attach this instance of the hba */
+ if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0)
+ != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "scsi_hba_attach failed"));
+
+ goto fail_attach;
+ }
+
+ /* create devctl node for cfgadm command */
+ if (ddi_create_minor_node(dip, "devctl",
+ S_IFCHR, INST2DEVCTL(instance_no),
+ DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to create devctl node."));
+
+ goto fail_attach;
+ }
+
+ create_devctl_node_f = 1;
+
+ /* create scsi node for cfgadm command */
+ if (ddi_create_minor_node(dip, "scsi", S_IFCHR,
+ INST2SCSI(instance_no),
+ DDI_NT_SCSI_ATTACHMENT_POINT, 0) ==
+ DDI_FAILURE) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to create scsi node."));
+
+ goto fail_attach;
+ }
+
+ create_scsi_node_f = 1;
+
+ (void) sprintf(instance->iocnode, "%d:lsirdctl",
+ instance_no);
+
+ /*
+ * Create a node for applications
+ * to issue ioctls to the driver.
+ */
+ if (ddi_create_minor_node(dip, instance->iocnode,
+ S_IFCHR, INST2LSIRDCTL(instance_no),
+ DDI_PSEUDO, 0) == DDI_FAILURE) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to create ioctl node."));
+
+ goto fail_attach;
+ }
+
+ create_ioc_node_f = 1;
+
+ /* Create a taskq to handle dr events */
+ if ((instance->taskq = ddi_taskq_create(dip,
+ "drsas_dr_taskq", 1,
+ TASKQ_DEFAULTPRI, 0)) == NULL) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to create taskq "));
+ instance->taskq = NULL;
+ goto fail_attach;
+ }
+
+ /* enable interrupt */
+ instance->func_ptr->enable_intr(instance);
+
+ /* initiate AEN */
+ if (start_mfi_aen(instance)) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to initiate AEN."));
+ goto fail_initiate_aen;
+ }
+
+ con_log(CL_DLEVEL1, (CE_NOTE,
+ "AEN started for instance %d.", instance_no));
+
+ /* Finally! We are on the air. */
+ ddi_report_dev(dip);
+
+ if (drsas_check_acc_handle(instance->regmap_handle) !=
+ DDI_SUCCESS) {
+ goto fail_attach;
+ }
+ if (drsas_check_acc_handle(instance->pci_handle) !=
+ DDI_SUCCESS) {
+ goto fail_attach;
+ }
+ instance->dr_ld_list =
+ kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld),
+ KM_SLEEP);
+ break;
+ case DDI_PM_RESUME:
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: DDI_PM_RESUME"));
+ break;
+ case DDI_RESUME:
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: DDI_RESUME"));
+ break;
+ default:
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: invalid attach cmd=%x", cmd));
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+
+fail_initiate_aen:
+fail_attach:
+ if (create_devctl_node_f) {
+ ddi_remove_minor_node(dip, "devctl");
+ }
+
+ if (create_scsi_node_f) {
+ ddi_remove_minor_node(dip, "scsi");
+ }
+
+ if (create_ioc_node_f) {
+ ddi_remove_minor_node(dip, instance->iocnode);
+ }
+
+ if (tran_alloc_f) {
+ scsi_hba_tran_free(tran);
+ }
+
+
+ if (added_soft_isr_f) {
+ ddi_remove_softintr(instance->soft_intr_id);
+ }
+
+ if (added_isr_f) {
+ drsas_rem_intrs(instance);
+ }
+
+ if (instance && instance->taskq) {
+ ddi_taskq_destroy(instance->taskq);
+ }
+
+ drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+
+ drsas_fm_fini(instance);
+
+ pci_config_teardown(&instance->pci_handle);
+
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: return failure from drsas_attach"));
+
+ return (DDI_FAILURE);
+}
+
+/*ARGSUSED*/
+static int
+drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
+{
+ int rval;
+ int drsas_minor = getminor((dev_t)arg);
+
+ struct drsas_instance *instance;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ instance = (struct drsas_instance *)
+ ddi_get_soft_state(drsas_state,
+ MINOR2INST(drsas_minor));
+
+ if (instance == NULL) {
+ *resultp = NULL;
+ rval = DDI_FAILURE;
+ } else {
+ *resultp = instance->dip;
+ rval = DDI_SUCCESS;
+ }
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ /* return the instance number for this minor node */
+ *resultp = (void *)(intptr_t)MINOR2INST(drsas_minor);
+ rval = DDI_SUCCESS;
+ break;
+ default:
+ *resultp = NULL;
+ rval = DDI_FAILURE;
+ }
+
+ return (rval);
+}
+
+static int
+drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ int instance_no;
+
+ struct drsas_instance *instance;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* CONSTCOND */
+ ASSERT(NO_COMPETING_THREADS);
+
+ instance_no = ddi_get_instance(dip);
+
+ instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state,
+ instance_no);
+
+ if (!instance) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas:%d could not get instance in detach",
+ instance_no));
+
+ return (DDI_FAILURE);
+ }
+
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x",
+ instance_no, instance->vendor_id, instance->device_id,
+ instance->subsysvid, instance->subsysid));
+
+ switch (cmd) {
+ case DDI_DETACH:
+ con_log(CL_ANN, (CE_NOTE,
+ "drsas_detach: DDI_DETACH"));
+
+ if (scsi_hba_detach(dip) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas:%d failed to detach",
+ instance_no));
+
+ return (DDI_FAILURE);
+ }
+
+ scsi_hba_tran_free(instance->tran);
+
+ flush_cache(instance);
+
+ if (abort_aen_cmd(instance, instance->aen_cmd)) {
+ con_log(CL_ANN, (CE_WARN, "drsas_detach: "
+ "failed to abort prevous AEN command"));
+
+ return (DDI_FAILURE);
+ }
+
+ instance->func_ptr->disable_intr(instance);
+
+ if (instance->isr_level == HIGH_LEVEL_INTR) {
+ ddi_remove_softintr(instance->soft_intr_id);
+ }
+
+ drsas_rem_intrs(instance);
+
+ if (instance->taskq) {
+ ddi_taskq_destroy(instance->taskq);
+ }
+ kmem_free(instance->dr_ld_list, MRDRV_MAX_LD
+ * sizeof (struct drsas_ld));
+ free_space_for_mfi(instance);
+
+ drsas_fm_fini(instance);
+
+ pci_config_teardown(&instance->pci_handle);
+
+ kmem_free(instance->func_ptr,
+ sizeof (struct drsas_func_ptr));
+
+ ddi_soft_state_free(drsas_state, instance_no);
+ break;
+ case DDI_PM_SUSPEND:
+ con_log(CL_ANN, (CE_NOTE,
+ "drsas_detach: DDI_PM_SUSPEND"));
+
+ break;
+ case DDI_SUSPEND:
+ con_log(CL_ANN, (CE_NOTE,
+ "drsas_detach: DDI_SUSPEND"));
+
+ break;
+ default:
+ con_log(CL_ANN, (CE_WARN,
+ "invalid detach command:0x%x", cmd));
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * ************************************************************************** *
+ * *
+ * common entry points - for character driver types *
+ * *
+ * ************************************************************************** *
+ */
+static int
+drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp)
+{
+ int rval = 0;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* Check root permissions */
+ if (drv_priv(credp) != 0) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: Non-root ioctl access denied!"));
+ return (EPERM);
+ }
+
+ /* Verify we are being opened as a character device */
+ if (otyp != OTYP_CHR) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: ioctl node must be a char node"));
+ return (EINVAL);
+ }
+
+ if (ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev)))
+ == NULL) {
+ return (ENXIO);
+ }
+
+ if (scsi_hba_open) {
+ rval = scsi_hba_open(dev, openflags, otyp, credp);
+ }
+
+ return (rval);
+}
+
+static int
+drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp)
+{
+ int rval = 0;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* no need for locks! */
+
+ if (scsi_hba_close) {
+ rval = scsi_hba_close(dev, openflags, otyp, credp);
+ }
+
+ return (rval);
+}
+
+static int
+drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rvalp)
+{
+ int rval = 0;
+
+ struct drsas_instance *instance;
+ struct drsas_ioctl *ioctl;
+ struct drsas_aen aen;
+ int i;
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev)));
+
+ if (instance == NULL) {
+ /* invalid minor number */
+ con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found."));
+ return (ENXIO);
+ }
+
+ ioctl = (struct drsas_ioctl *)kmem_zalloc(sizeof (struct drsas_ioctl),
+ KM_SLEEP);
+ ASSERT(ioctl);
+
+ switch ((uint_t)cmd) {
+ case DRSAS_IOCTL_FIRMWARE:
+ for (i = 0; i < sizeof (struct drsas_ioctl); i++) {
+ if (ddi_copyin((uint8_t *)arg+i,
+ (uint8_t *)ioctl+i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "drsas_ioctl "
+ "ERROR IOCTL copyin"));
+ kmem_free(ioctl,
+ sizeof (struct drsas_ioctl));
+ return (EFAULT);
+ }
+ }
+ if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) {
+ rval = handle_drv_ioctl(instance, ioctl, mode);
+ } else {
+ rval = handle_mfi_ioctl(instance, ioctl, mode);
+ }
+ for (i = 0; i < sizeof (struct drsas_ioctl) - 1; i++) {
+ if (ddi_copyout((uint8_t *)ioctl+i,
+ (uint8_t *)arg+i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "drsas_ioctl: ddi_copyout "
+ "failed"));
+ rval = 1;
+ break;
+ }
+ }
+
+ break;
+ case DRSAS_IOCTL_AEN:
+ for (i = 0; i < sizeof (struct drsas_aen); i++) {
+ if (ddi_copyin((uint8_t *)arg+i,
+ (uint8_t *)&aen+i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "drsas_ioctl: "
+ "ERROR AEN copyin"));
+ kmem_free(ioctl,
+ sizeof (struct drsas_ioctl));
+ return (EFAULT);
+ }
+ }
+
+ rval = handle_mfi_aen(instance, &aen);
+ for (i = 0; i < sizeof (struct drsas_aen); i++) {
+ if (ddi_copyout((uint8_t *)&aen + i,
+ (uint8_t *)arg + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "drsas_ioctl: "
+ "ddi_copyout failed"));
+ rval = 1;
+ break;
+ }
+ }
+
+ break;
+ default:
+ rval = scsi_hba_ioctl(dev, cmd, arg,
+ mode, credp, rvalp);
+
+ con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: "
+ "scsi_hba_ioctl called, ret = %x.", rval));
+ }
+
+ kmem_free(ioctl, sizeof (struct drsas_ioctl));
+ return (rval);
+}
+
+/*
+ * ************************************************************************** *
+ * *
+ * common entry points - for block driver types *
+ * *
+ * ************************************************************************** *
+ */
+/*ARGSUSED*/
+static int
+drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd)
+{
+ int instance_no;
+
+ struct drsas_instance *instance;
+
+ instance_no = ddi_get_instance(dip);
+ instance = (struct drsas_instance *)ddi_get_soft_state
+ (drsas_state, instance_no);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if (!instance) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter "
+ "in reset", instance_no));
+ return (DDI_FAILURE);
+ }
+
+ instance->func_ptr->disable_intr(instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d",
+ instance_no));
+
+ flush_cache(instance);
+
+ return (DDI_SUCCESS);
+}
+
+
+/*
+ * ************************************************************************** *
+ * *
+ * entry points (SCSI HBA) *
+ * *
+ * ************************************************************************** *
+ */
+/*ARGSUSED*/
+static int
+drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *tran, struct scsi_device *sd)
+{
+ struct drsas_instance *instance;
+ uint16_t tgt = sd->sd_address.a_target;
+ uint8_t lun = sd->sd_address.a_lun;
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d",
+ tgt, lun));
+
+ instance = ADDR2MR(&sd->sd_address);
+
+ if (ndi_dev_is_persistent_node(tgt_dip) == 0) {
+ (void) ndi_merge_node(tgt_dip, drsas_name_node);
+ ddi_set_name_addr(tgt_dip, NULL);
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in "
+ "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d",
+ tgt, lun));
+ return (DDI_FAILURE);
+ }
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init dev_dip %p tgt_dip %p",
+ (void *)instance->dr_ld_list[tgt].dip, (void *)tgt_dip));
+
+ if (tgt < MRDRV_MAX_LD && lun == 0) {
+ if (instance->dr_ld_list[tgt].dip == NULL &&
+ strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) {
+ instance->dr_ld_list[tgt].dip = tgt_dip;
+ instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN;
+ }
+ }
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static void
+drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ struct drsas_instance *instance;
+ int tgt = sd->sd_address.a_target;
+ int lun = sd->sd_address.a_lun;
+
+ instance = ADDR2MR(&sd->sd_address);
+
+ con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun));
+
+ if (tgt < MRDRV_MAX_LD && lun == 0) {
+ if (instance->dr_ld_list[tgt].dip == tgt_dip) {
+ instance->dr_ld_list[tgt].dip = NULL;
+ }
+ }
+}
+
+static dev_info_t *
+drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun)
+{
+ dev_info_t *child = NULL;
+ char addr[SCSI_MAXNAMELEN];
+ char tmp[MAXNAMELEN];
+
+ (void) sprintf(addr, "%x,%x", tgt, lun);
+ for (child = ddi_get_child(instance->dip); child;
+ child = ddi_get_next_sibling(child)) {
+
+ if (drsas_name_node(child, tmp, MAXNAMELEN) !=
+ DDI_SUCCESS) {
+ continue;
+ }
+
+ if (strcmp(addr, tmp) == 0) {
+ break;
+ }
+ }
+ con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p",
+ (void *)child));
+ return (child);
+}
+
+static int
+drsas_name_node(dev_info_t *dip, char *name, int len)
+{
+ int tgt, lun;
+
+ tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "target", -1);
+ con_log(CL_ANN1, (CE_NOTE,
+ "drsas_name_node: dip %p tgt %d", (void *)dip, tgt));
+ if (tgt == -1) {
+ return (DDI_FAILURE);
+ }
+ lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+ "lun", -1);
+ con_log(CL_ANN1,
+ (CE_NOTE, "drsas_name_node: tgt %d lun %d", tgt, lun));
+ if (lun == -1) {
+ return (DDI_FAILURE);
+ }
+ (void) snprintf(name, len, "%x,%x", tgt, lun);
+ return (DDI_SUCCESS);
+}
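+
+/*
+ * Example: a child node with target=2 and lun=0 yields the unit
+ * address "2,0", which drsas_find_child() above compares against.
+ */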
+
+static struct scsi_pkt *
+drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt,
+ struct buf *bp, int cmdlen, int statuslen, int tgtlen,
+ int flags, int (*callback)(), caddr_t arg)
+{
+ struct scsa_cmd *acmd;
+ struct drsas_instance *instance;
+ struct scsi_pkt *new_pkt;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ instance = ADDR2MR(ap);
+
+ /* step #1 : pkt allocation */
+ if (pkt == NULL) {
+ pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen,
+ tgtlen, sizeof (struct scsa_cmd), callback, arg);
+ if (pkt == NULL) {
+ return (NULL);
+ }
+
+ acmd = PKT2CMD(pkt);
+
+ /*
+ * Initialize the new pkt - we redundantly initialize
+ * all the fields for illustrative purposes.
+ */
+ acmd->cmd_pkt = pkt;
+ acmd->cmd_flags = 0;
+ acmd->cmd_scblen = statuslen;
+ acmd->cmd_cdblen = cmdlen;
+ acmd->cmd_dmahandle = NULL;
+ acmd->cmd_ncookies = 0;
+ acmd->cmd_cookie = 0;
+ acmd->cmd_cookiecnt = 0;
+ acmd->cmd_nwin = 0;
+
+ pkt->pkt_address = *ap;
+ pkt->pkt_comp = (void (*)())NULL;
+ pkt->pkt_flags = 0;
+ pkt->pkt_time = 0;
+ pkt->pkt_resid = 0;
+ pkt->pkt_state = 0;
+ pkt->pkt_statistics = 0;
+ pkt->pkt_reason = 0;
+ new_pkt = pkt;
+ } else {
+ acmd = PKT2CMD(pkt);
+ new_pkt = NULL;
+ }
+
+ /* step #2 : dma allocation/move */
+ if (bp && bp->b_bcount != 0) {
+ if (acmd->cmd_dmahandle == NULL) {
+ if (drsas_dma_alloc(instance, pkt, bp, flags,
+ callback) == DDI_FAILURE) {
+ if (new_pkt) {
+ scsi_hba_pkt_free(ap, new_pkt);
+ }
+ return ((struct scsi_pkt *)NULL);
+ }
+ } else {
+ if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) {
+ return ((struct scsi_pkt *)NULL);
+ }
+ }
+ }
+
+ return (pkt);
+}
+
+static int
+drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt)
+{
+ uchar_t cmd_done = 0;
+
+ struct drsas_instance *instance = ADDR2MR(ap);
+ struct drsas_cmd *cmd;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x",
+ __func__, __LINE__, pkt->pkt_cdbp[0]));
+
+ pkt->pkt_reason = CMD_CMPLT;
+ *pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */
+
+ cmd = build_cmd(instance, ap, pkt, &cmd_done);
+
+ /*
+ * Check whether the command was already completed by the build_cmd()
+ * routine. In that case cmd_done is set, no command is returned, and
+ * the appropriate reason is provided in the pkt_reason field.
+ */
+ if (cmd_done) {
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET
+ | STATE_SENT_CMD;
+ if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) {
+ (*pkt->pkt_comp)(pkt);
+ }
+
+ return (TRAN_ACCEPT);
+ }
+
+ if (cmd == NULL) {
+ return (TRAN_BUSY);
+ }
+
+ if ((pkt->pkt_flags & FLAG_NOINTR) == 0) {
+ if (instance->fw_outstanding > instance->max_fw_cmds) {
+ con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy"));
+ return_mfi_pkt(instance, cmd);
+ return (TRAN_BUSY);
+ }
+
+ /* Synchronize the Cmd frame for the controller */
+ (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0,
+ DDI_DMA_SYNC_FORDEV);
+
+ instance->func_ptr->issue_cmd(cmd, instance);
+
+ } else {
+ struct drsas_header *hdr = &cmd->frame->hdr;
+
+ cmd->sync_cmd = DRSAS_TRUE;
+
+ instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd);
+
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_statistics = 0;
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+
+ switch (ddi_get8(cmd->frame_dma_obj.acc_handle,
+ &hdr->cmd_status)) {
+ case MFI_STAT_OK:
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ break;
+
+ case MFI_STAT_SCSI_DONE_WITH_ERROR:
+
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_statistics = 0;
+
+ ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1;
+ break;
+
+ case MFI_STAT_DEVICE_NOT_FOUND:
+ pkt->pkt_reason = CMD_DEV_GONE;
+ pkt->pkt_statistics = STAT_DISCON;
+ break;
+
+ default:
+ ((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1;
+ }
+
+ return_mfi_pkt(instance, cmd);
+ (void) drsas_common_check(instance, cmd);
+
+ if (pkt->pkt_comp) {
+ (*pkt->pkt_comp)(pkt);
+ }
+
+ }
+
+ return (TRAN_ACCEPT);
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* abort command not supported by H/W */
+
+ return (DDI_FAILURE);
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_reset(struct scsi_address *ap, int level)
+{
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* reset command not supported by H/W */
+
+ return (DDI_FAILURE);
+
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom)
+{
+ int rval = 0;
+
+ struct drsas_instance *instance = ADDR2MR(ap);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* we do allow inquiring about capabilities for other targets */
+ if (cap == NULL) {
+ return (-1);
+ }
+
+ switch (scsi_hba_lookup_capstr(cap)) {
+ case SCSI_CAP_DMA_MAX:
+ /* Limit to 16MB max transfer */
+ rval = drsas_max_cap_maxxfer;
+ break;
+ case SCSI_CAP_MSG_OUT:
+ rval = 1;
+ break;
+ case SCSI_CAP_DISCONNECT:
+ rval = 0;
+ break;
+ case SCSI_CAP_SYNCHRONOUS:
+ rval = 0;
+ break;
+ case SCSI_CAP_WIDE_XFER:
+ rval = 1;
+ break;
+ case SCSI_CAP_TAGGED_QING:
+ rval = 1;
+ break;
+ case SCSI_CAP_UNTAGGED_QING:
+ rval = 1;
+ break;
+ case SCSI_CAP_PARITY:
+ rval = 1;
+ break;
+ case SCSI_CAP_INITIATOR_ID:
+ rval = instance->init_id;
+ break;
+ case SCSI_CAP_ARQ:
+ rval = 1;
+ break;
+ case SCSI_CAP_LINKED_CMDS:
+ rval = 0;
+ break;
+ case SCSI_CAP_RESET_NOTIFICATION:
+ rval = 1;
+ break;
+ case SCSI_CAP_GEOMETRY:
+ rval = -1;
+
+ break;
+ default:
+ con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x",
+ scsi_hba_lookup_capstr(cap)));
+ rval = -1;
+ break;
+ }
+
+ return (rval);
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom)
+{
+ int rval = 1;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* We don't allow setting capabilities for other targets */
+ if (cap == NULL || whom == 0) {
+ return (-1);
+ }
+
+ switch (scsi_hba_lookup_capstr(cap)) {
+ case SCSI_CAP_DMA_MAX:
+ case SCSI_CAP_MSG_OUT:
+ case SCSI_CAP_PARITY:
+ case SCSI_CAP_LINKED_CMDS:
+ case SCSI_CAP_RESET_NOTIFICATION:
+ case SCSI_CAP_DISCONNECT:
+ case SCSI_CAP_SYNCHRONOUS:
+ case SCSI_CAP_UNTAGGED_QING:
+ case SCSI_CAP_WIDE_XFER:
+ case SCSI_CAP_INITIATOR_ID:
+ case SCSI_CAP_ARQ:
+ /*
+ * None of these are settable via
+ * the capability interface.
+ */
+ break;
+ case SCSI_CAP_TAGGED_QING:
+ rval = 1;
+ break;
+ case SCSI_CAP_SECTOR_SIZE:
+ rval = 1;
+ break;
+
+ case SCSI_CAP_TOTAL_SECTORS:
+ rval = 1;
+ break;
+ default:
+ rval = -1;
+ break;
+ }
+
+ return (rval);
+}
+
+static void
+drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+ struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if (acmd->cmd_flags & CFLAG_DMAVALID) {
+ acmd->cmd_flags &= ~CFLAG_DMAVALID;
+
+ (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle);
+
+ ddi_dma_free_handle(&acmd->cmd_dmahandle);
+
+ acmd->cmd_dmahandle = NULL;
+ }
+
+ /* free the pkt */
+ scsi_hba_pkt_free(ap, pkt);
+}
+
+/*ARGSUSED*/
+static void
+drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+ register struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if (acmd->cmd_flags & CFLAG_DMAVALID) {
+ acmd->cmd_flags &= ~CFLAG_DMAVALID;
+
+ (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle);
+
+ ddi_dma_free_handle(&acmd->cmd_dmahandle);
+
+ acmd->cmd_dmahandle = NULL;
+ }
+}
+
+/*ARGSUSED*/
+static void
+drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+ register struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if (acmd->cmd_flags & CFLAG_DMAVALID) {
+ (void) ddi_dma_sync(acmd->cmd_dmahandle, acmd->cmd_dma_offset,
+ acmd->cmd_dma_len, (acmd->cmd_flags & CFLAG_DMASEND) ?
+ DDI_DMA_SYNC_FORDEV : DDI_DMA_SYNC_FORCPU);
+ }
+}
+
+/*
+ * drsas_isr(caddr_t)
+ *
+ * The Interrupt Service Routine
+ *
+ * Collect status for all completed commands and do callback
+ *
+ */
+static uint_t
+drsas_isr(struct drsas_instance *instance)
+{
+ int need_softintr;
+ uint32_t producer;
+ uint32_t consumer;
+ uint32_t context;
+
+ struct drsas_cmd *cmd;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ ASSERT(instance);
+ if ((instance->intr_type == DDI_INTR_TYPE_FIXED) &&
+ !instance->func_ptr->intr_ack(instance)) {
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle,
+ 0, 0, DDI_DMA_SYNC_FORCPU);
+
+ if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle)
+ != DDI_SUCCESS) {
+ drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle,
+ instance->producer);
+ consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle,
+ instance->consumer);
+
+ con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ",
+ producer, consumer));
+ if (producer == consumer) {
+ con_log(CL_ANN1, (CE_WARN, "producer = consumer case"));
+ return (DDI_INTR_UNCLAIMED);
+ }
+ mutex_enter(&instance->completed_pool_mtx);
+
+ while (consumer != producer) {
+ context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle,
+ &instance->reply_queue[consumer]);
+ cmd = instance->cmd_list[context];
+ mlist_add_tail(&cmd->list, &instance->completed_pool_list);
+
+ consumer++;
+ if (consumer == (instance->max_fw_cmds + 1)) {
+ consumer = 0;
+ }
+ }
+
+ mutex_exit(&instance->completed_pool_mtx);
+
+ ddi_put32(instance->mfi_internal_dma_obj.acc_handle,
+ instance->consumer, consumer);
+ (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle,
+ 0, 0, DDI_DMA_SYNC_FORDEV);
+
+ if (instance->softint_running) {
+ need_softintr = 0;
+ } else {
+ need_softintr = 1;
+ }
+
+ if (instance->isr_level == HIGH_LEVEL_INTR) {
+ if (need_softintr) {
+ ddi_trigger_softintr(instance->soft_intr_id);
+ }
+ } else {
+ /*
+ * Not a high-level interrupt, therefore call the soft level
+ * interrupt explicitly
+ */
+ (void) drsas_softintr(instance);
+ }
+
+ return (DDI_INTR_CLAIMED);
+}
+
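+/*
+ * A sketch of the reply-queue walk above: the ring has max_fw_cmds + 1
+ * slots, so the consumer index wraps to 0 after slot max_fw_cmds. With
+ * max_fw_cmds == 3, a walk from consumer 2 to producer 1 visits slots
+ * 2, 3 and 0 before stopping.
+ */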
+
+/*
+ * ************************************************************************** *
+ * *
+ * libraries *
+ * *
+ * ************************************************************************** *
+ */
+/*
+ * get_mfi_pkt : Get a command from the free pool
+ * After successful allocation, the caller of this routine
+ * must clear the frame buffer (memset to zero) before
+ * using the packet further.
+ *
+ * ***** Note *****
+ * After clearing the frame buffer, the context id of the
+ * frame buffer SHOULD be restored.
+ */
+static struct drsas_cmd *
+get_mfi_pkt(struct drsas_instance *instance)
+{
+ mlist_t *head = &instance->cmd_pool_list;
+ struct drsas_cmd *cmd = NULL;
+
+ mutex_enter(&instance->cmd_pool_mtx);
+ ASSERT(mutex_owned(&instance->cmd_pool_mtx));
+
+ if (!mlist_empty(head)) {
+ cmd = mlist_entry(head->next, struct drsas_cmd, list);
+ mlist_del_init(head->next);
+ }
+ if (cmd != NULL)
+ cmd->pkt = NULL;
+ mutex_exit(&instance->cmd_pool_mtx);
+
+ return (cmd);
+}
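+
+/*
+ * A hypothetical caller sketch for the protocol above: clear the frame
+ * after allocation, then restore the context id that the clear wiped:
+ *
+ *	struct drsas_cmd *cmd = get_mfi_pkt(instance);
+ *	if (cmd != NULL) {
+ *		bzero(cmd->frame, sizeof (union drsas_frame));
+ *		ddi_put32(cmd->frame_dma_obj.acc_handle,
+ *		    &cmd->frame->io.context, cmd->index);
+ *	}
+ */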
+
+/*
+ * return_mfi_pkt : Return a cmd to free command pool
+ */
+static void
+return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd)
+{
+ mutex_enter(&instance->cmd_pool_mtx);
+ ASSERT(mutex_owned(&instance->cmd_pool_mtx));
+
+ mlist_add(&cmd->list, &instance->cmd_pool_list);
+
+ mutex_exit(&instance->cmd_pool_mtx);
+}
+
+/*
+ * destroy_mfi_frame_pool
+ */
+static void
+destroy_mfi_frame_pool(struct drsas_instance *instance)
+{
+ int i;
+ uint32_t max_cmd = instance->max_fw_cmds;
+
+ struct drsas_cmd *cmd;
+
+ /* return all frames to pool */
+ for (i = 0; i < max_cmd+1; i++) {
+
+ cmd = instance->cmd_list[i];
+
+ if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED)
+ (void) drsas_free_dma_obj(instance, cmd->frame_dma_obj);
+
+ cmd->frame_dma_obj_status = DMA_OBJ_FREED;
+ }
+}
+
+/*
+ * create_mfi_frame_pool
+ */
+static int
+create_mfi_frame_pool(struct drsas_instance *instance)
+{
+ int i = 0;
+ int cookie_cnt;
+ uint16_t max_cmd;
+ uint16_t sge_sz;
+ uint32_t sgl_sz;
+ uint32_t tot_frame_size;
+
+ struct drsas_cmd *cmd;
+
+ max_cmd = instance->max_fw_cmds;
+
+ sge_sz = sizeof (struct drsas_sge64);
+
+	/* calculate the number of 64-byte frames required for the SGL */
+ sgl_sz = sge_sz * instance->max_num_sge;
+ tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH;
+
+ con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: "
+ "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size));
+
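+	/*
+	 * Each command gets one contiguous DMA allocation, laid out as
+	 * [MFI frame | SGL space | sense buffer], with the sense buffer
+	 * carved from the tail (SENSE_LENGTH bytes) of the allocation.
+	 */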
+ while (i < max_cmd+1) {
+ cmd = instance->cmd_list[i];
+
+ cmd->frame_dma_obj.size = tot_frame_size;
+ cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr;
+ cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ cmd->frame_dma_obj.dma_attr.dma_attr_align = 64;
+
+
+ cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC);
+
+ if (cookie_cnt == -1 || cookie_cnt > 1) {
+ con_log(CL_ANN, (CE_WARN,
+ "create_mfi_frame_pool: could not alloc."));
+ return (DDI_FAILURE);
+ }
+
+ bzero(cmd->frame_dma_obj.buffer, tot_frame_size);
+
+ cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED;
+ cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer;
+ cmd->frame_phys_addr =
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address;
+
+ cmd->sense = (uint8_t *)(((unsigned long)
+ cmd->frame_dma_obj.buffer) +
+ tot_frame_size - SENSE_LENGTH);
+ cmd->sense_phys_addr =
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address +
+ tot_frame_size - SENSE_LENGTH;
+
+		if (!cmd->frame || !cmd->sense) {
+			con_log(CL_ANN, (CE_NOTE,
+			    "dr_sas: frame buffer allocation failed"));
+
+			return (DDI_FAILURE);
+		}
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &cmd->frame->io.context, cmd->index);
+ i++;
+
+ con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x",
+ cmd->index, cmd->frame_phys_addr));
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * free_additional_dma_buffer
+ */
+static void
+free_additional_dma_buffer(struct drsas_instance *instance)
+{
+ if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) {
+ (void) drsas_free_dma_obj(instance,
+ instance->mfi_internal_dma_obj);
+ instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED;
+ }
+
+ if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) {
+ (void) drsas_free_dma_obj(instance,
+ instance->mfi_evt_detail_obj);
+ instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED;
+ }
+}
+
+/*
+ * alloc_additional_dma_buffer
+ */
+static int
+alloc_additional_dma_buffer(struct drsas_instance *instance)
+{
+ uint32_t reply_q_sz;
+ uint32_t internal_buf_size = PAGESIZE*2;
+
+	/* reply queue entries (max cmds plus 1), plus producer and consumer */
+ reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2);
+
+ instance->mfi_internal_dma_obj.size = internal_buf_size;
+ instance->mfi_internal_dma_obj.dma_attr = drsas_generic_dma_attr;
+ instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max =
+ 0xFFFFFFFFU;
+ instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1;
+
+ if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: could not alloc reply queue"));
+ return (DDI_FAILURE);
+ }
+
+ bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size);
+
+ instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED;
+
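+	/*
+	 * Carve up the single internal DMA allocation:
+	 *   offset 0              - producer index (uint32_t)
+	 *   offset 4              - consumer index (uint32_t)
+	 *   offset 8              - reply queue entries
+	 *   offset reply_q_sz + 8 - internal scratch buffer for DCMD data
+	 */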
+ instance->producer = (uint32_t *)((unsigned long)
+ instance->mfi_internal_dma_obj.buffer);
+ instance->consumer = (uint32_t *)((unsigned long)
+ instance->mfi_internal_dma_obj.buffer + 4);
+ instance->reply_queue = (uint32_t *)((unsigned long)
+ instance->mfi_internal_dma_obj.buffer + 8);
+ instance->internal_buf = (caddr_t)(((unsigned long)
+ instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8);
+ instance->internal_buf_dmac_add =
+ instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address +
+ (reply_q_sz + 8);
+ instance->internal_buf_size = internal_buf_size -
+ (reply_q_sz + 8);
+
+ /* allocate evt_detail */
+ instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail);
+ instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr;
+ instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1;
+ instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1;
+
+ if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ bzero(instance->mfi_evt_detail_obj.buffer,
+ sizeof (struct drsas_evt_detail));
+
+ instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED;
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * free_space_for_mfi
+ */
+static void
+free_space_for_mfi(struct drsas_instance *instance)
+{
+ int i;
+ uint32_t max_cmd = instance->max_fw_cmds;
+
+ /* already freed */
+ if (instance->cmd_list == NULL) {
+ return;
+ }
+
+ free_additional_dma_buffer(instance);
+
+ /* first free the MFI frame pool */
+ destroy_mfi_frame_pool(instance);
+
+ /* free all the commands in the cmd_list */
+ for (i = 0; i < instance->max_fw_cmds+1; i++) {
+ kmem_free(instance->cmd_list[i],
+ sizeof (struct drsas_cmd));
+
+ instance->cmd_list[i] = NULL;
+ }
+
+ /* free the cmd_list buffer itself */
+ kmem_free(instance->cmd_list,
+ sizeof (struct drsas_cmd *) * (max_cmd+1));
+
+ instance->cmd_list = NULL;
+
+ INIT_LIST_HEAD(&instance->cmd_pool_list);
+}
+
+/*
+ * alloc_space_for_mfi
+ */
+static int
+alloc_space_for_mfi(struct drsas_instance *instance)
+{
+ int i;
+ uint32_t max_cmd;
+ size_t sz;
+
+ struct drsas_cmd *cmd;
+
+ max_cmd = instance->max_fw_cmds;
+
+ /* reserve 1 more slot for flush_cache */
+ sz = sizeof (struct drsas_cmd *) * (max_cmd+1);
+
+ /*
+ * instance->cmd_list is an array of struct drsas_cmd pointers.
+ * Allocate the dynamic array first and then allocate individual
+ * commands.
+ */
+ instance->cmd_list = kmem_zalloc(sz, KM_SLEEP);
+ ASSERT(instance->cmd_list);
+
+ for (i = 0; i < max_cmd+1; i++) {
+ instance->cmd_list[i] = kmem_zalloc(sizeof (struct drsas_cmd),
+ KM_SLEEP);
+ ASSERT(instance->cmd_list[i]);
+ }
+
+ INIT_LIST_HEAD(&instance->cmd_pool_list);
+
+ /* add all the commands to command pool (instance->cmd_pool) */
+ for (i = 0; i < max_cmd; i++) {
+ cmd = instance->cmd_list[i];
+ cmd->index = i;
+
+ mlist_add_tail(&cmd->list, &instance->cmd_pool_list);
+ }
+
+	/* the extra slot reserved for flush_cache is not added to the pool */
+ cmd = instance->cmd_list[max_cmd];
+ cmd->index = i;
+
+ /* create a frame pool and assign one frame to each cmd */
+ if (create_mfi_frame_pool(instance)) {
+ con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool"));
+ return (DDI_FAILURE);
+ }
+
+	/* allocate the reply queue and other internal DMA buffers */
+	if (alloc_additional_dma_buffer(instance)) {
+		con_log(CL_ANN, (CE_NOTE,
+		    "error allocating additional DMA buffers"));
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * get_ctrl_info
+ */
+static int
+get_ctrl_info(struct drsas_instance *instance,
+ struct drsas_ctrl_info *ctrl_info)
+{
+ int ret = 0;
+
+ struct drsas_cmd *cmd;
+ struct drsas_dcmd_frame *dcmd;
+ struct drsas_ctrl_info *ci;
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd) {
+ con_log(CL_ANN, (CE_WARN,
+ "Failed to get a cmd for ctrl info"));
+ return (DDI_FAILURE);
+ }
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ dcmd = &cmd->frame->dcmd;
+
+ ci = (struct drsas_ctrl_info *)instance->internal_buf;
+
+ if (!ci) {
+ con_log(CL_ANN, (CE_WARN,
+ "Failed to alloc mem for ctrl info"));
+ return_mfi_pkt(instance, cmd);
+ return (DDI_FAILURE);
+ }
+
+ (void) memset(ci, 0, sizeof (struct drsas_ctrl_info));
+
+ (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status,
+ MFI_CMD_STATUS_POLL_MODE);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+ MFI_FRAME_DIR_READ);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+ sizeof (struct drsas_ctrl_info));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+ DR_DCMD_CTRL_GET_INFO);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+ instance->internal_buf_dmac_add);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+ sizeof (struct drsas_ctrl_info));
+
+ cmd->frame_count = 1;
+
+ if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) {
+ ret = 0;
+ ddi_rep_get8(cmd->frame_dma_obj.acc_handle,
+ (uint8_t *)ctrl_info, (uint8_t *)ci,
+ sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR);
+ } else {
+ con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed"));
+ ret = -1;
+ }
+
+ return_mfi_pkt(instance, cmd);
+ if (drsas_common_check(instance, cmd) != DDI_SUCCESS) {
+ ret = -1;
+ }
+
+ return (ret);
+}
+
+/*
+ * abort_aen_cmd
+ */
+static int
+abort_aen_cmd(struct drsas_instance *instance,
+ struct drsas_cmd *cmd_to_abort)
+{
+ int ret = 0;
+
+ struct drsas_cmd *cmd;
+ struct drsas_abort_frame *abort_fr;
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd) {
+ con_log(CL_ANN, (CE_WARN,
+		    "Failed to get a cmd to abort the AEN"));
+ return (DDI_FAILURE);
+ }
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ abort_fr = &cmd->frame->abort;
+
+ /* prepare and issue the abort frame */
+ ddi_put8(cmd->frame_dma_obj.acc_handle,
+ &abort_fr->cmd, MFI_CMD_OP_ABORT);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status,
+ MFI_CMD_STATUS_SYNC_MODE);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context,
+ cmd_to_abort->index);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &abort_fr->abort_mfi_phys_addr_hi, 0);
+
+ instance->aen_cmd->abort_aen = 1;
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN,
+ "abort_aen_cmd: issue_cmd_in_sync_mode failed"));
+ ret = -1;
+ } else {
+ ret = 0;
+ }
+
+	instance->aen_cmd = NULL;
+
+ return_mfi_pkt(instance, cmd);
+ (void) drsas_common_check(instance, cmd);
+
+ return (ret);
+}
+
+/*
+ * init_mfi
+ */
+static int
+init_mfi(struct drsas_instance *instance)
+{
+ struct drsas_cmd *cmd;
+ struct drsas_ctrl_info ctrl_info;
+ struct drsas_init_frame *init_frame;
+ struct drsas_init_queue_info *initq_info;
+
+ /* we expect the FW state to be READY */
+ if (mfi_state_transition_to_ready(instance)) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready"));
+ goto fail_ready_state;
+ }
+
+ /* get various operational parameters from status register */
+ instance->max_num_sge =
+ (instance->func_ptr->read_fw_status_reg(instance) &
+ 0xFF0000) >> 0x10;
+ /*
+ * Reduce the max supported cmds by 1. This is to ensure that the
+ * reply_q_sz (1 more than the max cmd that driver may send)
+ * does not exceed max cmds that the FW can support
+ */
+ instance->max_fw_cmds =
+ instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF;
+ instance->max_fw_cmds = instance->max_fw_cmds - 1;
+
+ instance->max_num_sge =
+ (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ?
+ DRSAS_MAX_SGE_CNT : instance->max_num_sge;
+
+ /* create a pool of commands */
+ if (alloc_space_for_mfi(instance) != DDI_SUCCESS)
+ goto fail_alloc_fw_space;
+
+ /*
+	 * Prepare an init frame. Note that the init frame points to the
+	 * queue info structure. Each frame has its SGL allocated after the
+	 * first 64 bytes. Since this frame needs no SGL, we use the SGL
+	 * space for the queue info structure.
+ */
+	cmd = get_mfi_pkt(instance);
+	if (!cmd) {
+		con_log(CL_ANN, (CE_WARN, "init_mfi: failed to get a cmd"));
+		goto fail_alloc_fw_space;
+	}
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ init_frame = (struct drsas_init_frame *)cmd->frame;
+ initq_info = (struct drsas_init_queue_info *)
+ ((unsigned long)init_frame + 64);
+
+ (void) memset(init_frame, 0, MRMFI_FRAME_SIZE);
+ (void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info));
+
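+	/*
+	 * Point the firmware at the internal DMA buffer: the producer index
+	 * lives at offset 0, the consumer index at offset 4 and the reply
+	 * queue at offset 8 (matching the layout set up in
+	 * alloc_additional_dma_buffer()).
+	 */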
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->reply_queue_entries, instance->max_fw_cmds + 1);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->producer_index_phys_addr_hi, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->producer_index_phys_addr_lo,
+ instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->consumer_index_phys_addr_hi, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->consumer_index_phys_addr_lo,
+ instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->reply_queue_start_phys_addr_hi, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->reply_queue_start_phys_addr_lo,
+ instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8);
+
+ ddi_put8(cmd->frame_dma_obj.acc_handle,
+ &init_frame->cmd, MFI_CMD_OP_INIT);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status,
+ MFI_CMD_STATUS_POLL_MODE);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &init_frame->queue_info_new_phys_addr_lo,
+ cmd->frame_phys_addr + 64);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &init_frame->queue_info_new_phys_addr_hi, 0);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len,
+ sizeof (struct drsas_init_queue_info));
+
+ cmd->frame_count = 1;
+
+ /* issue the init frame in polled mode */
+ if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN, "failed to init firmware"));
+ goto fail_fw_init;
+ }
+
+ return_mfi_pkt(instance, cmd);
+ if (drsas_common_check(instance, cmd) != DDI_SUCCESS) {
+ goto fail_fw_init;
+ }
+
+ /* gather misc FW related information */
+ if (!get_ctrl_info(instance, &ctrl_info)) {
+ instance->max_sectors_per_req = ctrl_info.max_request_size;
+ con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d",
+ ctrl_info.product_name, ctrl_info.ld_present_count));
+ } else {
+ instance->max_sectors_per_req = instance->max_num_sge *
+ PAGESIZE / 512;
+ }
+
+ if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) {
+ goto fail_fw_init;
+ }
+
+ return (DDI_SUCCESS);
+
+fail_fw_init:
+fail_alloc_fw_space:
+
+ free_space_for_mfi(instance);
+
+fail_ready_state:
+ ddi_regs_map_free(&instance->regmap_handle);
+
+fail_mfi_reg_setup:
+ return (DDI_FAILURE);
+}
+
+/*
+ * mfi_state_transition_to_ready : Move the FW to READY state
+ *
+ * @instance : adapter soft state
+ */
+static int
+mfi_state_transition_to_ready(struct drsas_instance *instance)
+{
+ int i;
+ uint8_t max_wait;
+ uint32_t fw_ctrl;
+ uint32_t fw_state;
+ uint32_t cur_state;
+
+ fw_state =
+ instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK;
+ con_log(CL_ANN1, (CE_NOTE,
+ "mfi_state_transition_to_ready:FW state = 0x%x", fw_state));
+
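+	/*
+	 * Walk the firmware state machine: for each transient state, write
+	 * the doorbell sequence the firmware expects, then poll (in 1 ms
+	 * steps, up to max_wait seconds) for the state to change before
+	 * re-evaluating.
+	 */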
+ while (fw_state != MFI_STATE_READY) {
+ con_log(CL_ANN, (CE_NOTE,
+ "mfi_state_transition_to_ready:FW state%x", fw_state));
+
+ switch (fw_state) {
+ case MFI_STATE_FAULT:
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: FW in FAULT state!!"));
+
+ return (ENODEV);
+ case MFI_STATE_WAIT_HANDSHAKE:
+ /* set the CLR bit in IMR0 */
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: FW waiting for HANDSHAKE"));
+ /*
+ * PCI_Hot Plug: MFI F/W requires
+ * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG)
+ * to be set
+ */
+ /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */
+ WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE |
+ MFI_INIT_HOTPLUG, instance);
+
+ max_wait = 2;
+ cur_state = MFI_STATE_WAIT_HANDSHAKE;
+ break;
+ case MFI_STATE_BOOT_MESSAGE_PENDING:
+ /* set the CLR bit in IMR0 */
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: FW state boot message pending"));
+ /*
+ * PCI_Hot Plug: MFI F/W requires
+ * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG)
+ * to be set
+ */
+ WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance);
+
+ max_wait = 10;
+ cur_state = MFI_STATE_BOOT_MESSAGE_PENDING;
+ break;
+ case MFI_STATE_OPERATIONAL:
+ /* bring it to READY state; assuming max wait 2 secs */
+ instance->func_ptr->disable_intr(instance);
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: FW in OPERATIONAL state"));
+ /*
+ * PCI_Hot Plug: MFI F/W requires
+ * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT)
+ * to be set
+ */
+ /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */
+ WR_IB_DOORBELL(MFI_RESET_FLAGS, instance);
+
+ max_wait = 10;
+ cur_state = MFI_STATE_OPERATIONAL;
+ break;
+ case MFI_STATE_UNDEFINED:
+ /* this state should not last for more than 2 seconds */
+ con_log(CL_ANN, (CE_NOTE, "FW state undefined"));
+
+ max_wait = 2;
+ cur_state = MFI_STATE_UNDEFINED;
+ break;
+ case MFI_STATE_BB_INIT:
+ max_wait = 2;
+ cur_state = MFI_STATE_BB_INIT;
+ break;
+ case MFI_STATE_FW_INIT:
+ max_wait = 2;
+ cur_state = MFI_STATE_FW_INIT;
+ break;
+ case MFI_STATE_DEVICE_SCAN:
+ max_wait = 10;
+ cur_state = MFI_STATE_DEVICE_SCAN;
+ break;
+ default:
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: Unknown state 0x%x", fw_state));
+ return (ENODEV);
+ }
+
+ /* the cur_state should not last for more than max_wait secs */
+ for (i = 0; i < (max_wait * MILLISEC); i++) {
+ /* fw_state = RD_OB_MSG_0(instance) & MFI_STATE_MASK; */
+ fw_state =
+ instance->func_ptr->read_fw_status_reg(instance) &
+ MFI_STATE_MASK;
+
+ if (fw_state == cur_state) {
+ delay(1 * drv_usectohz(MILLISEC));
+ } else {
+ break;
+ }
+ }
+
+ /* return error if fw_state hasn't changed after max_wait */
+ if (fw_state == cur_state) {
+ con_log(CL_ANN, (CE_NOTE,
+ "FW state hasn't changed in %d secs", max_wait));
+ return (ENODEV);
+ }
+	}
+
+ fw_ctrl = RD_IB_DOORBELL(instance);
+
+ con_log(CL_ANN1, (CE_NOTE,
+ "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl));
+
+ /*
+ * Write 0xF to the doorbell register to do the following.
+ * - Abort all outstanding commands (bit 0).
+ * - Transition from OPERATIONAL to READY state (bit 1).
+	 * - Discard (possible) low MFA posted in 64-bit mode (bit 2).
+	 * - Release the FW to continue running, i.e. the BIOS handshake
+	 *   (bit 3).
+ */
+ WR_IB_DOORBELL(0xF, instance);
+
+ if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) {
+ return (ENODEV);
+ }
+ return (DDI_SUCCESS);
+}
+
+/*
+ * get_seq_num
+ */
+static int
+get_seq_num(struct drsas_instance *instance,
+ struct drsas_evt_log_info *eli)
+{
+ int ret = DDI_SUCCESS;
+
+ dma_obj_t dcmd_dma_obj;
+ struct drsas_cmd *cmd;
+ struct drsas_dcmd_frame *dcmd;
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd) {
+ cmn_err(CE_WARN, "dr_sas: failed to get a cmd");
+ return (ENOMEM);
+ }
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ dcmd = &cmd->frame->dcmd;
+
+ /* allocate the data transfer buffer */
+ dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info);
+ dcmd_dma_obj.dma_attr = drsas_generic_dma_attr;
+ dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ dcmd_dma_obj.dma_attr.dma_attr_align = 1;
+
+ if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN,
+		    "get_seq_num: could not allocate data transfer buffer."));
+		return_mfi_pkt(instance, cmd);
+		return (DDI_FAILURE);
+ }
+
+ (void) memset(dcmd_dma_obj.buffer, 0,
+ sizeof (struct drsas_evt_log_info));
+
+ (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+ MFI_FRAME_DIR_READ);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+ sizeof (struct drsas_evt_log_info));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+ DR_DCMD_CTRL_EVENT_GET_INFO);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+ sizeof (struct drsas_evt_log_info));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+ dcmd_dma_obj.dma_cookie[0].dmac_address);
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ cmn_err(CE_WARN, "get_seq_num: "
+		    "failed to issue DR_DCMD_CTRL_EVENT_GET_INFO");
+ ret = DDI_FAILURE;
+ } else {
+		/* copy the data back into the caller's buffer */
+ ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)eli,
+ (uint8_t *)dcmd_dma_obj.buffer,
+ sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR);
+ ret = DDI_SUCCESS;
+ }
+
+ if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS)
+ ret = DDI_FAILURE;
+
+ return_mfi_pkt(instance, cmd);
+ if (drsas_common_check(instance, cmd) != DDI_SUCCESS) {
+ ret = DDI_FAILURE;
+ }
+ return (ret);
+}
+
+/*
+ * start_mfi_aen
+ */
+static int
+start_mfi_aen(struct drsas_instance *instance)
+{
+ int ret = 0;
+
+ struct drsas_evt_log_info eli;
+ union drsas_evt_class_locale class_locale;
+
+ /* get the latest sequence number from FW */
+ (void) memset(&eli, 0, sizeof (struct drsas_evt_log_info));
+
+ if (get_seq_num(instance, &eli)) {
+ cmn_err(CE_WARN, "start_mfi_aen: failed to get seq num");
+ return (-1);
+ }
+
+ /* register AEN with FW for latest sequence number plus 1 */
+ class_locale.members.reserved = 0;
+ class_locale.members.locale = DR_EVT_LOCALE_ALL;
+ class_locale.members.class = DR_EVT_CLASS_INFO;
+ ret = register_mfi_aen(instance, eli.newest_seq_num + 1,
+ class_locale.word);
+
+ if (ret) {
+ cmn_err(CE_WARN, "start_mfi_aen: aen registration failed");
+ return (-1);
+ }
+
+ return (ret);
+}
+
+/*
+ * flush_cache
+ */
+static void
+flush_cache(struct drsas_instance *instance)
+{
+ struct drsas_cmd *cmd = NULL;
+ struct drsas_dcmd_frame *dcmd;
+ uint32_t max_cmd = instance->max_fw_cmds;
+
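+	/* use the dedicated slot reserved outside the command pool */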
+ cmd = instance->cmd_list[max_cmd];
+
+ if (cmd == NULL)
+ return;
+
+ dcmd = &cmd->frame->dcmd;
+
+ (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 0);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+ MFI_FRAME_DIR_NONE);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+ DR_DCMD_CTRL_CACHE_FLUSH);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.b[0],
+ DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE);
+
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) {
+ con_log(CL_ANN1, (CE_WARN,
+ "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH"));
+ }
+ con_log(CL_DLEVEL1, (CE_NOTE, "done"));
+}
+
+/*
+ * service_mfi_aen - Completes an AEN command
+ * @instance: Adapter soft state
+ * @cmd: Command to be completed
+ *
+ */
+static void
+service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd)
+{
+ uint32_t seq_num;
+ struct drsas_evt_detail *evt_detail =
+ (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer;
+ int rval = 0;
+ int tgt = 0;
+ ddi_acc_handle_t acc_handle;
+
+ acc_handle = cmd->frame_dma_obj.acc_handle;
+
+ cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status);
+
+ if (cmd->cmd_status == ENODATA) {
+ cmd->cmd_status = 0;
+ }
+
+ /*
+	 * Log the MFI AEN event to the sysevent queue so that
+	 * applications will be notified.
+ */
+ if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS",
+ NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) {
+ int instance_no = ddi_get_instance(instance->dip);
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: Failed to log AEN event", instance_no));
+ }
+ /*
+	 * Check for any LD devices that have changed state,
+	 * i.e. gone online or offline.
+ */
+ con_log(CL_ANN1, (CE_NOTE,
+ "AEN: code = %x class = %x locale = %x args = %x",
+ ddi_get32(acc_handle, &evt_detail->code),
+ evt_detail->cl.members.class,
+ ddi_get16(acc_handle, &evt_detail->cl.members.locale),
+ ddi_get8(acc_handle, &evt_detail->arg_type)));
+
+ switch (ddi_get32(acc_handle, &evt_detail->code)) {
+ case DR_EVT_CFG_CLEARED: {
+ for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) {
+ if (instance->dr_ld_list[tgt].dip != NULL) {
+ rval = drsas_service_evt(instance, tgt, 0,
+ DRSAS_EVT_UNCONFIG_TGT, NULL);
+ con_log(CL_ANN1, (CE_WARN,
+ "dr_sas: CFG CLEARED AEN rval = %d "
+ "tgt id = %d", rval, tgt));
+ }
+ }
+ break;
+ }
+
+ case DR_EVT_LD_DELETED: {
+ rval = drsas_service_evt(instance,
+ ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0,
+ DRSAS_EVT_UNCONFIG_TGT, NULL);
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d "
+ "tgt id = %d index = %d", rval,
+ ddi_get16(acc_handle, &evt_detail->args.ld.target_id),
+ ddi_get8(acc_handle, &evt_detail->args.ld.ld_index)));
+ break;
+ } /* End of DR_EVT_LD_DELETED */
+
+ case DR_EVT_LD_CREATED: {
+ rval = drsas_service_evt(instance,
+ ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0,
+ DRSAS_EVT_CONFIG_TGT, NULL);
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d "
+ "tgt id = %d index = %d", rval,
+ ddi_get16(acc_handle, &evt_detail->args.ld.target_id),
+ ddi_get8(acc_handle, &evt_detail->args.ld.ld_index)));
+ break;
+ } /* End of DR_EVT_LD_CREATED */
+ } /* End of Main Switch */
+
+ /* get copy of seq_num and class/locale for re-registration */
+ seq_num = ddi_get32(acc_handle, &evt_detail->seq_num);
+ seq_num++;
+ (void) memset(instance->mfi_evt_detail_obj.buffer, 0,
+ sizeof (struct drsas_evt_detail));
+
+ ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0);
+ ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num);
+
+ instance->aen_seq_num = seq_num;
+
+ cmd->frame_count = 1;
+
+ /* Issue the aen registration frame */
+ instance->func_ptr->issue_cmd(cmd, instance);
+}
+
+/*
+ * complete_cmd_in_sync_mode - Completes an internal command
+ * @instance: Adapter soft state
+ * @cmd: Command to be completed
+ *
+ * The issue_cmd_in_sync_mode() function waits for a command to complete
+ * after it issues a command. This function wakes up that waiting routine by
+ * calling wake_up() on the wait queue.
+ */
+static void
+complete_cmd_in_sync_mode(struct drsas_instance *instance,
+ struct drsas_cmd *cmd)
+{
+ cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle,
+ &cmd->frame->io.cmd_status);
+
+ cmd->sync_cmd = DRSAS_FALSE;
+
+ if (cmd->cmd_status == ENODATA) {
+ cmd->cmd_status = 0;
+ }
+
+ cv_broadcast(&instance->int_cmd_cv);
+}
+
+/*
+ * drsas_softintr - The Software ISR
+ * @param instance : HBA soft state
+ *
+ * Called directly from the hardware ISR when it is not running at a high
+ * interrupt level; otherwise triggered as a soft interrupt.
+ */
+static uint_t
+drsas_softintr(struct drsas_instance *instance)
+{
+ struct scsi_pkt *pkt;
+ struct scsa_cmd *acmd;
+ struct drsas_cmd *cmd;
+ struct mlist_head *pos, *next;
+ mlist_t process_list;
+ struct drsas_header *hdr;
+ struct scsi_arq_status *arqstat;
+
+ con_log(CL_ANN1, (CE_CONT, "drsas_softintr called"));
+
+ ASSERT(instance);
+ mutex_enter(&instance->completed_pool_mtx);
+
+ if (mlist_empty(&instance->completed_pool_list)) {
+ mutex_exit(&instance->completed_pool_mtx);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ instance->softint_running = 1;
+
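+	/*
+	 * Splice the completed commands onto a private list while holding
+	 * the mutex, so the hardware ISR can keep queueing completions
+	 * while we process this batch without holding completed_pool_mtx.
+	 */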
+ INIT_LIST_HEAD(&process_list);
+ mlist_splice(&instance->completed_pool_list, &process_list);
+ INIT_LIST_HEAD(&instance->completed_pool_list);
+
+ mutex_exit(&instance->completed_pool_mtx);
+
+ /* perform all callbacks first, before releasing the SCBs */
+ mlist_for_each_safe(pos, next, &process_list) {
+ cmd = mlist_entry(pos, struct drsas_cmd, list);
+
+		/* synchronize the cmd frame so the CPU sees FW updates */
+ (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle,
+ 0, 0, DDI_DMA_SYNC_FORCPU);
+
+ if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) !=
+ DDI_SUCCESS) {
+ drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+			ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+			instance->softint_running = 0;
+			return (DDI_INTR_UNCLAIMED);
+ }
+
+ hdr = &cmd->frame->hdr;
+
+ /* remove the internal command from the process list */
+ mlist_del_init(&cmd->list);
+
+ switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) {
+ case MFI_CMD_OP_PD_SCSI:
+ case MFI_CMD_OP_LD_SCSI:
+ case MFI_CMD_OP_LD_READ:
+ case MFI_CMD_OP_LD_WRITE:
+ /*
+ * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI
+ * could have been issued either through an
+ * IO path or an IOCTL path. If it was via IOCTL,
+ * we will send it to internal completion.
+ */
+ if (cmd->sync_cmd == DRSAS_TRUE) {
+ complete_cmd_in_sync_mode(instance, cmd);
+ break;
+ }
+
+ /* regular commands */
+ acmd = cmd->cmd;
+ pkt = CMD2PKT(acmd);
+
+ if (acmd->cmd_flags & CFLAG_DMAVALID) {
+ if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+ (void) ddi_dma_sync(acmd->cmd_dmahandle,
+ acmd->cmd_dma_offset,
+ acmd->cmd_dma_len,
+ DDI_DMA_SYNC_FORCPU);
+ }
+ }
+
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_statistics = 0;
+ pkt->pkt_state = STATE_GOT_BUS
+ | STATE_GOT_TARGET | STATE_SENT_CMD
+ | STATE_XFERRED_DATA | STATE_GOT_STATUS;
+
+ con_log(CL_ANN1, (CE_CONT,
+ "CDB[0] = %x completed for %s: size %lx context %x",
+ pkt->pkt_cdbp[0], ((acmd->islogical) ? "LD" : "PD"),
+ acmd->cmd_dmacount, hdr->context));
+
+ if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) {
+ struct scsi_inquiry *inq;
+
+ if (acmd->cmd_dmacount != 0) {
+ bp_mapin(acmd->cmd_buf);
+ inq = (struct scsi_inquiry *)
+ acmd->cmd_buf->b_un.b_addr;
+
+ /* don't expose physical drives to OS */
+ if (acmd->islogical &&
+ (hdr->cmd_status == MFI_STAT_OK)) {
+ display_scsi_inquiry(
+ (caddr_t)inq);
+ } else if ((hdr->cmd_status ==
+ MFI_STAT_OK) && inq->inq_dtype ==
+ DTYPE_DIRECT) {
+
+ display_scsi_inquiry(
+ (caddr_t)inq);
+
+ /* for physical disk */
+ hdr->cmd_status =
+ MFI_STAT_DEVICE_NOT_FOUND;
+ }
+ }
+ }
+
+ switch (hdr->cmd_status) {
+ case MFI_STAT_OK:
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ break;
+ case MFI_STAT_LD_CC_IN_PROGRESS:
+ case MFI_STAT_LD_RECON_IN_PROGRESS:
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ break;
+ case MFI_STAT_LD_INIT_IN_PROGRESS:
+ con_log(CL_ANN,
+ (CE_WARN, "Initialization in Progress"));
+ pkt->pkt_reason = CMD_TRAN_ERR;
+
+ break;
+ case MFI_STAT_SCSI_DONE_WITH_ERROR:
+ con_log(CL_ANN1, (CE_CONT, "scsi_done error"));
+
+ pkt->pkt_reason = CMD_CMPLT;
+ ((struct scsi_status *)
+ pkt->pkt_scbp)->sts_chk = 1;
+
+ if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) {
+
+ con_log(CL_ANN,
+ (CE_WARN, "TEST_UNIT_READY fail"));
+
+ } else {
+ pkt->pkt_state |= STATE_ARQ_DONE;
+ arqstat = (void *)(pkt->pkt_scbp);
+ arqstat->sts_rqpkt_reason = CMD_CMPLT;
+ arqstat->sts_rqpkt_resid = 0;
+ arqstat->sts_rqpkt_state |=
+ STATE_GOT_BUS | STATE_GOT_TARGET
+ | STATE_SENT_CMD
+ | STATE_XFERRED_DATA;
+ *(uint8_t *)&arqstat->sts_rqpkt_status =
+ STATUS_GOOD;
+ ddi_rep_get8(
+ cmd->frame_dma_obj.acc_handle,
+ (uint8_t *)
+ &(arqstat->sts_sensedata),
+ cmd->sense,
+ acmd->cmd_scblen -
+ offsetof(struct scsi_arq_status,
+ sts_sensedata), DDI_DEV_AUTOINCR);
+ }
+ break;
+ case MFI_STAT_LD_OFFLINE:
+ case MFI_STAT_DEVICE_NOT_FOUND:
+ con_log(CL_ANN1, (CE_CONT,
+ "device not found error"));
+ pkt->pkt_reason = CMD_DEV_GONE;
+ pkt->pkt_statistics = STAT_DISCON;
+ break;
+ case MFI_STAT_LD_LBA_OUT_OF_RANGE:
+ pkt->pkt_state |= STATE_ARQ_DONE;
+ pkt->pkt_reason = CMD_CMPLT;
+ ((struct scsi_status *)
+ pkt->pkt_scbp)->sts_chk = 1;
+
+ arqstat = (void *)(pkt->pkt_scbp);
+ arqstat->sts_rqpkt_reason = CMD_CMPLT;
+ arqstat->sts_rqpkt_resid = 0;
+ arqstat->sts_rqpkt_state |= STATE_GOT_BUS
+ | STATE_GOT_TARGET | STATE_SENT_CMD
+ | STATE_XFERRED_DATA;
+ *(uint8_t *)&arqstat->sts_rqpkt_status =
+ STATUS_GOOD;
+
+ arqstat->sts_sensedata.es_valid = 1;
+ arqstat->sts_sensedata.es_key =
+ KEY_ILLEGAL_REQUEST;
+ arqstat->sts_sensedata.es_class =
+ CLASS_EXTENDED_SENSE;
+
+ /*
+ * LOGICAL BLOCK ADDRESS OUT OF RANGE:
+ * ASC: 0x21h; ASCQ: 0x00h;
+ */
+ arqstat->sts_sensedata.es_add_code = 0x21;
+ arqstat->sts_sensedata.es_qual_code = 0x00;
+
+ break;
+
+ default:
+ con_log(CL_ANN, (CE_CONT, "Unknown status!"));
+ pkt->pkt_reason = CMD_TRAN_ERR;
+
+ break;
+ }
+
+ atomic_add_16(&instance->fw_outstanding, (-1));
+
+ return_mfi_pkt(instance, cmd);
+
+ (void) drsas_common_check(instance, cmd);
+
+ if (acmd->cmd_dmahandle) {
+ if (drsas_check_dma_handle(
+ acmd->cmd_dmahandle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip,
+ DDI_SERVICE_UNAFFECTED);
+ pkt->pkt_reason = CMD_TRAN_ERR;
+ pkt->pkt_statistics = 0;
+ }
+ }
+
+ /* Call the callback routine */
+ if (((pkt->pkt_flags & FLAG_NOINTR) == 0) &&
+ pkt->pkt_comp) {
+ (*pkt->pkt_comp)(pkt);
+ }
+
+ break;
+ case MFI_CMD_OP_SMP:
+ case MFI_CMD_OP_STP:
+ complete_cmd_in_sync_mode(instance, cmd);
+ break;
+ case MFI_CMD_OP_DCMD:
+ /* see if got an event notification */
+ if (ddi_get32(cmd->frame_dma_obj.acc_handle,
+ &cmd->frame->dcmd.opcode) ==
+ DR_DCMD_CTRL_EVENT_WAIT) {
+ if ((instance->aen_cmd == cmd) &&
+ (instance->aen_cmd->abort_aen)) {
+ con_log(CL_ANN, (CE_WARN,
+ "drsas_softintr: "
+ "aborted_aen returned"));
+ } else {
+ atomic_add_16(&instance->fw_outstanding,
+ (-1));
+ service_mfi_aen(instance, cmd);
+ }
+ } else {
+ complete_cmd_in_sync_mode(instance, cmd);
+ }
+
+ break;
+ case MFI_CMD_OP_ABORT:
+ con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete"));
+ /*
+ * MFI_CMD_OP_ABORT successfully completed
+ * in the synchronous mode
+ */
+ complete_cmd_in_sync_mode(instance, cmd);
+ break;
+ default:
+ drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+
+ if (cmd->pkt != NULL) {
+ pkt = cmd->pkt;
+ if (((pkt->pkt_flags & FLAG_NOINTR) == 0) &&
+ pkt->pkt_comp) {
+ (*pkt->pkt_comp)(pkt);
+ }
+ }
+ con_log(CL_ANN, (CE_WARN, "Cmd type unknown !"));
+ break;
+ }
+ }
+
+ instance->softint_running = 0;
+
+ return (DDI_INTR_CLAIMED);
+}
+
+/*
+ * drsas_alloc_dma_obj
+ *
+ * Allocate the memory and other resources for a DMA object.
+ */
+static int
+drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj,
+ uchar_t endian_flags)
+{
+ int i;
+ size_t alen = 0;
+ uint_t cookie_cnt;
+ struct ddi_device_acc_attr tmp_endian_attr;
+
+ tmp_endian_attr = endian_attr;
+ tmp_endian_attr.devacc_attr_endian_flags = endian_flags;
+
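+	/*
+	 * Standard three-step DDI DMA allocation: allocate a DMA handle,
+	 * allocate memory for it, then bind the memory to the handle to
+	 * obtain the device-visible cookie(s).
+	 */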
+ i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr,
+ DDI_DMA_SLEEP, NULL, &obj->dma_handle);
+ if (i != DDI_SUCCESS) {
+
+ switch (i) {
+ case DDI_DMA_BADATTR :
+ con_log(CL_ANN, (CE_WARN,
+ "Failed ddi_dma_alloc_handle- Bad attribute"));
+ break;
+ case DDI_DMA_NORESOURCES :
+ con_log(CL_ANN, (CE_WARN,
+ "Failed ddi_dma_alloc_handle- No Resources"));
+ break;
+ default :
+ con_log(CL_ANN, (CE_WARN,
+ "Failed ddi_dma_alloc_handle: "
+ "unknown status %d", i));
+ break;
+ }
+
+ return (-1);
+ }
+
+ if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr,
+ DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
+ &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) ||
+ alen < obj->size) {
+
+ ddi_dma_free_handle(&obj->dma_handle);
+
+ con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc"));
+
+ return (-1);
+ }
+
+ if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer,
+ obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP,
+ NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) {
+
+ ddi_dma_mem_free(&obj->acc_handle);
+ ddi_dma_free_handle(&obj->dma_handle);
+
+ con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle"));
+
+ return (-1);
+ }
+
+ if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+ return (-1);
+ }
+
+ if (drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+ return (-1);
+ }
+
+ return (cookie_cnt);
+}
+
+/*
+ * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t)
+ *
+ * De-allocate the memory and other resources for a DMA object, which must
+ * have been allocated by a previous call to drsas_alloc_dma_obj()
+ */
+static int
+drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj)
+{
+
+ if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ return (DDI_FAILURE);
+ }
+
+ if (drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ return (DDI_FAILURE);
+ }
+
+ (void) ddi_dma_unbind_handle(obj.dma_handle);
+ ddi_dma_mem_free(&obj.acc_handle);
+ ddi_dma_free_handle(&obj.dma_handle);
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * drsas_dma_alloc(instance_t *, struct scsi_pkt *, struct buf *,
+ * int, int (*)())
+ *
+ * Allocate dma resources for a new scsi command
+ */
+static int
+drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt,
+ struct buf *bp, int flags, int (*callback)())
+{
+ int dma_flags;
+ int (*cb)(caddr_t);
+ int i;
+
+ ddi_dma_attr_t tmp_dma_attr = drsas_generic_dma_attr;
+ struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ acmd->cmd_buf = bp;
+
+ if (bp->b_flags & B_READ) {
+ acmd->cmd_flags &= ~CFLAG_DMASEND;
+ dma_flags = DDI_DMA_READ;
+ } else {
+ acmd->cmd_flags |= CFLAG_DMASEND;
+ dma_flags = DDI_DMA_WRITE;
+ }
+
+ if (flags & PKT_CONSISTENT) {
+ acmd->cmd_flags |= CFLAG_CONSISTENT;
+ dma_flags |= DDI_DMA_CONSISTENT;
+ }
+
+ if (flags & PKT_DMA_PARTIAL) {
+ dma_flags |= DDI_DMA_PARTIAL;
+ }
+
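+	/*
+	 * With PKT_DMA_PARTIAL the bind below may return
+	 * DDI_DMA_PARTIAL_MAP, in which case the transfer is split into
+	 * windows that drsas_dma_move() walks one at a time.
+	 */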
+ dma_flags |= DDI_DMA_REDZONE;
+
+ cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
+
+ tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge;
+ tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull;
+
+ if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr,
+ cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) {
+ switch (i) {
+ case DDI_DMA_BADATTR:
+ bioerror(bp, EFAULT);
+ return (DDI_FAILURE);
+
+ case DDI_DMA_NORESOURCES:
+ bioerror(bp, 0);
+ return (DDI_FAILURE);
+
+ default:
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: "
+ "impossible result (0x%x)", i));
+ bioerror(bp, EFAULT);
+ return (DDI_FAILURE);
+ }
+ }
+
+ i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags,
+ cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies);
+
+ switch (i) {
+ case DDI_DMA_PARTIAL_MAP:
+ if ((dma_flags & DDI_DMA_PARTIAL) == 0) {
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: "
+ "DDI_DMA_PARTIAL_MAP impossible"));
+ goto no_dma_cookies;
+ }
+
+ if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) ==
+ DDI_FAILURE) {
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed"));
+ goto no_dma_cookies;
+ }
+
+ if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin,
+ &acmd->cmd_dma_offset, &acmd->cmd_dma_len,
+ &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) ==
+ DDI_FAILURE) {
+
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed"));
+ goto no_dma_cookies;
+ }
+
+ goto get_dma_cookies;
+ case DDI_DMA_MAPPED:
+ acmd->cmd_nwin = 1;
+ acmd->cmd_dma_len = 0;
+ acmd->cmd_dma_offset = 0;
+
+get_dma_cookies:
+ i = 0;
+ acmd->cmd_dmacount = 0;
+ for (;;) {
+ acmd->cmd_dmacount +=
+ acmd->cmd_dmacookies[i++].dmac_size;
+
+ if (i == instance->max_num_sge ||
+ i == acmd->cmd_ncookies)
+ break;
+
+ ddi_dma_nextcookie(acmd->cmd_dmahandle,
+ &acmd->cmd_dmacookies[i]);
+ }
+
+ acmd->cmd_cookie = i;
+ acmd->cmd_cookiecnt = i;
+
+ acmd->cmd_flags |= CFLAG_DMAVALID;
+
+ if (bp->b_bcount >= acmd->cmd_dmacount) {
+ pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount;
+ } else {
+ pkt->pkt_resid = 0;
+ }
+
+ return (DDI_SUCCESS);
+ case DDI_DMA_NORESOURCES:
+ bioerror(bp, 0);
+ break;
+ case DDI_DMA_NOMAPPING:
+ bioerror(bp, EFAULT);
+ break;
+ case DDI_DMA_TOOBIG:
+ bioerror(bp, EINVAL);
+ break;
+ case DDI_DMA_INUSE:
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:"
+ " DDI_DMA_INUSE impossible"));
+ break;
+ default:
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: "
+ "impossible result (0x%x)", i));
+ break;
+ }
+
+no_dma_cookies:
+ ddi_dma_free_handle(&acmd->cmd_dmahandle);
+ acmd->cmd_dmahandle = NULL;
+ acmd->cmd_flags &= ~CFLAG_DMAVALID;
+ return (DDI_FAILURE);
+}
+
+/*
+ * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *)
+ *
+ * move dma resources to next dma window
+ *
+ */
+static int
+drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt,
+ struct buf *bp)
+{
+ int i = 0;
+
+ struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ /*
+ * If there are no more cookies remaining in this window,
+ * must move to the next window first.
+ */
+ if (acmd->cmd_cookie == acmd->cmd_ncookies) {
+ if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) {
+ return (DDI_SUCCESS);
+ }
+
+ /* at last window, cannot move */
+ if (++acmd->cmd_curwin >= acmd->cmd_nwin) {
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin,
+ &acmd->cmd_dma_offset, &acmd->cmd_dma_len,
+ &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) ==
+ DDI_FAILURE) {
+ return (DDI_FAILURE);
+ }
+
+ acmd->cmd_cookie = 0;
+ } else {
+ /* still more cookies in this window - get the next one */
+ ddi_dma_nextcookie(acmd->cmd_dmahandle,
+ &acmd->cmd_dmacookies[0]);
+ }
+
+ /* get remaining cookies in this window, up to our maximum */
+ for (;;) {
+ acmd->cmd_dmacount += acmd->cmd_dmacookies[i++].dmac_size;
+ acmd->cmd_cookie++;
+
+ if (i == instance->max_num_sge ||
+ acmd->cmd_cookie == acmd->cmd_ncookies) {
+ break;
+ }
+
+ ddi_dma_nextcookie(acmd->cmd_dmahandle,
+ &acmd->cmd_dmacookies[i]);
+ }
+
+ acmd->cmd_cookiecnt = i;
+
+ if (bp->b_bcount >= acmd->cmd_dmacount) {
+ pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount;
+ } else {
+ pkt->pkt_resid = 0;
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * build_cmd
+ */
+static struct drsas_cmd *
+build_cmd(struct drsas_instance *instance, struct scsi_address *ap,
+ struct scsi_pkt *pkt, uchar_t *cmd_done)
+{
+ uint16_t flags = 0;
+ uint32_t i;
+ uint32_t context __unused;
+ uint32_t sge_bytes;
+ ddi_acc_handle_t acc_handle;
+ struct drsas_cmd *cmd;
+ struct drsas_sge64 *mfi_sgl;
+ struct scsa_cmd *acmd = PKT2CMD(pkt);
+ struct drsas_pthru_frame *pthru;
+ struct drsas_io_frame *ldio;
+
+	/* find out if this is a logical or physical drive command */
+ acmd->islogical = MRDRV_IS_LOGICAL(ap);
+ acmd->device_id = MAP_DEVICE_ID(instance, ap);
+ *cmd_done = 0;
+
+ /* get the command packet */
+ if (!(cmd = get_mfi_pkt(instance))) {
+ return (NULL);
+ }
+
+ acc_handle = cmd->frame_dma_obj.acc_handle;
+
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index);
+
+ cmd->pkt = pkt;
+ cmd->cmd = acmd;
+
+	/* determine the command direction */
+ if (acmd->cmd_flags & CFLAG_DMASEND) {
+ flags = MFI_FRAME_DIR_WRITE;
+
+ if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+ (void) ddi_dma_sync(acmd->cmd_dmahandle,
+ acmd->cmd_dma_offset, acmd->cmd_dma_len,
+ DDI_DMA_SYNC_FORDEV);
+ }
+ } else if (acmd->cmd_flags & ~CFLAG_DMASEND) {
+ flags = MFI_FRAME_DIR_READ;
+
+ if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+ (void) ddi_dma_sync(acmd->cmd_dmahandle,
+ acmd->cmd_dma_offset, acmd->cmd_dma_len,
+ DDI_DMA_SYNC_FORCPU);
+ }
+ } else {
+ flags = MFI_FRAME_DIR_NONE;
+ }
+
+ flags |= MFI_FRAME_SGL64;
+
+ switch (pkt->pkt_cdbp[0]) {
+
+ /*
+ * case SCMD_SYNCHRONIZE_CACHE:
+ * flush_cache(instance);
+ * return_mfi_pkt(instance, cmd);
+ * *cmd_done = 1;
+ *
+ * return (NULL);
+ */
+
+ case SCMD_READ:
+ case SCMD_WRITE:
+ case SCMD_READ_G1:
+ case SCMD_WRITE_G1:
+ if (acmd->islogical) {
+ ldio = (struct drsas_io_frame *)cmd->frame;
+
+ /*
+			 * prepare the Logical IO frame:
+			 * the 2nd bit is zero for all read cmds
+ */
+ ddi_put8(acc_handle, &ldio->cmd,
+ (pkt->pkt_cdbp[0] & 0x02) ? MFI_CMD_OP_LD_WRITE
+ : MFI_CMD_OP_LD_READ);
+ ddi_put8(acc_handle, &ldio->cmd_status, 0x0);
+ ddi_put8(acc_handle, &ldio->scsi_status, 0x0);
+ ddi_put8(acc_handle, &ldio->target_id, acmd->device_id);
+ ddi_put16(acc_handle, &ldio->timeout, 0);
+ ddi_put8(acc_handle, &ldio->reserved_0, 0);
+ ddi_put16(acc_handle, &ldio->pad_0, 0);
+ ddi_put16(acc_handle, &ldio->flags, flags);
+
+ /* Initialize sense Information */
+ bzero(cmd->sense, SENSE_LENGTH);
+ ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH);
+ ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0);
+ ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo,
+ cmd->sense_phys_addr);
+ ddi_put32(acc_handle, &ldio->start_lba_hi, 0);
+ ddi_put8(acc_handle, &ldio->access_byte,
+ (acmd->cmd_cdblen != 6) ? pkt->pkt_cdbp[1] : 0);
+ ddi_put8(acc_handle, &ldio->sge_count,
+ acmd->cmd_cookiecnt);
+ mfi_sgl = (struct drsas_sge64 *)&ldio->sgl;
+
+ context = ddi_get32(acc_handle, &ldio->context);
+
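+			/*
+			 * Decode the big-endian LBA and transfer-length
+			 * fields from the CDB; the field widths and offsets
+			 * depend on the CDB group (CDB_GROUP0 through
+			 * CDB_GROUP3).
+			 */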
+ if (acmd->cmd_cdblen == CDB_GROUP0) {
+ ddi_put32(acc_handle, &ldio->lba_count, (
+ (uint16_t)(pkt->pkt_cdbp[4])));
+
+ ddi_put32(acc_handle, &ldio->start_lba_lo, (
+ ((uint32_t)(pkt->pkt_cdbp[3])) |
+ ((uint32_t)(pkt->pkt_cdbp[2]) << 8) |
+ ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F)
+ << 16)));
+ } else if (acmd->cmd_cdblen == CDB_GROUP1) {
+ ddi_put32(acc_handle, &ldio->lba_count, (
+ ((uint16_t)(pkt->pkt_cdbp[8])) |
+ ((uint16_t)(pkt->pkt_cdbp[7]) << 8)));
+
+ ddi_put32(acc_handle, &ldio->start_lba_lo, (
+ ((uint32_t)(pkt->pkt_cdbp[5])) |
+ ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+ ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+ ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+ } else if (acmd->cmd_cdblen == CDB_GROUP2) {
+ ddi_put32(acc_handle, &ldio->lba_count, (
+ ((uint16_t)(pkt->pkt_cdbp[9])) |
+ ((uint16_t)(pkt->pkt_cdbp[8]) << 8) |
+ ((uint16_t)(pkt->pkt_cdbp[7]) << 16) |
+ ((uint16_t)(pkt->pkt_cdbp[6]) << 24)));
+
+ ddi_put32(acc_handle, &ldio->start_lba_lo, (
+ ((uint32_t)(pkt->pkt_cdbp[5])) |
+ ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+ ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+ ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+ } else if (acmd->cmd_cdblen == CDB_GROUP3) {
+ ddi_put32(acc_handle, &ldio->lba_count, (
+ ((uint16_t)(pkt->pkt_cdbp[13])) |
+ ((uint16_t)(pkt->pkt_cdbp[12]) << 8) |
+ ((uint16_t)(pkt->pkt_cdbp[11]) << 16) |
+ ((uint16_t)(pkt->pkt_cdbp[10]) << 24)));
+
+ ddi_put32(acc_handle, &ldio->start_lba_lo, (
+ ((uint32_t)(pkt->pkt_cdbp[9])) |
+ ((uint32_t)(pkt->pkt_cdbp[8]) << 8) |
+ ((uint32_t)(pkt->pkt_cdbp[7]) << 16) |
+ ((uint32_t)(pkt->pkt_cdbp[6]) << 24)));
+
+				/* bytes 2-5 hold the upper 32 bits of LBA */
+				ddi_put32(acc_handle, &ldio->start_lba_hi, (
+ ((uint32_t)(pkt->pkt_cdbp[5])) |
+ ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+ ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+ ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+ }
+
+ break;
+ }
+ /* fall through */
+ default:
+
+ switch (pkt->pkt_cdbp[0]) {
+ case SCMD_MODE_SENSE:
+ case SCMD_MODE_SENSE_G1: {
+ union scsi_cdb *cdbp;
+ uint16_t page_code;
+
+ cdbp = (void *)pkt->pkt_cdbp;
+ page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0];
+ switch (page_code) {
+ case 0x3:
+ case 0x4:
+ (void) drsas_mode_sense_build(pkt);
+ return_mfi_pkt(instance, cmd);
+ *cmd_done = 1;
+ return (NULL);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ pthru = (struct drsas_pthru_frame *)cmd->frame;
+
+ /* prepare the DCDB frame */
+ ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ?
+ MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI);
+ ddi_put8(acc_handle, &pthru->cmd_status, 0x0);
+ ddi_put8(acc_handle, &pthru->scsi_status, 0x0);
+ ddi_put8(acc_handle, &pthru->target_id, acmd->device_id);
+ ddi_put8(acc_handle, &pthru->lun, 0);
+ ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen);
+ ddi_put16(acc_handle, &pthru->timeout, 0);
+ ddi_put16(acc_handle, &pthru->flags, flags);
+ ddi_put32(acc_handle, &pthru->data_xfer_len,
+ acmd->cmd_dmacount);
+ ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt);
+ mfi_sgl = (struct drsas_sge64 *)&pthru->sgl;
+
+ bzero(cmd->sense, SENSE_LENGTH);
+ ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH);
+ ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0);
+ ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo,
+ cmd->sense_phys_addr);
+
+ context = ddi_get32(acc_handle, &pthru->context);
+ ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp,
+ (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR);
+
+ break;
+ }
+ /* prepare the scatter-gather list for the firmware */
+ for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) {
+ ddi_put64(acc_handle, &mfi_sgl->phys_addr,
+ acmd->cmd_dmacookies[i].dmac_laddress);
+ ddi_put32(acc_handle, &mfi_sgl->length,
+ acmd->cmd_dmacookies[i].dmac_size);
+ }
+
+ sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt;
+
+ cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) +
+ ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1;
+
+ if (cmd->frame_count >= 8) {
+ cmd->frame_count = 8;
+ }
+
+ return (cmd);
+}
+
+/*
+ * issue_mfi_pthru
+ */
+static int
+issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ struct drsas_cmd *cmd, int mode)
+{
+ void *ubuf;
+ uint32_t kphys_addr = 0;
+ uint32_t xferlen = 0;
+ uint_t model;
+ ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle;
+ dma_obj_t pthru_dma_obj;
+ struct drsas_pthru_frame *kpthru;
+ struct drsas_pthru_frame *pthru;
+ int i;
+ pthru = &cmd->frame->pthru;
+ kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
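+	/*
+	 * The ioctl frame may come from a 32-bit or a 64-bit process, so
+	 * pick the matching SGE layout to locate the user buffer and the
+	 * transfer length.
+	 */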
+ if (model == DDI_MODEL_ILP32) {
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_ILP32"));
+
+ xferlen = kpthru->sgl.sge32[0].length;
+
+ ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr;
+ } else {
+#ifdef _ILP32
+	con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_ILP32"));
+ xferlen = kpthru->sgl.sge32[0].length;
+ ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr;
+#else
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64"));
+ xferlen = kpthru->sgl.sge64[0].length;
+ ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr;
+#endif
+ }
+
+ if (xferlen) {
+		/* the IOCTL requires DMA; allocate the data transfer buffer */
+ pthru_dma_obj.size = xferlen;
+ pthru_dma_obj.dma_attr = drsas_generic_dma_attr;
+ pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ pthru_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ pthru_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &pthru_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ if (kpthru->flags & MFI_FRAME_DIR_WRITE) {
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyin((uint8_t *)ubuf+i,
+ (uint8_t *)pthru_dma_obj.buffer+i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_pthru : "
+					    "copy from user space failed"));
+					(void) drsas_free_dma_obj(instance,
+					    pthru_dma_obj);
+					return (DDI_FAILURE);
+ }
+ }
+ }
+
+ kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address;
+ }
+
+ ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd);
+ ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len);
+ ddi_put8(acc_handle, &pthru->cmd_status, 0);
+ ddi_put8(acc_handle, &pthru->scsi_status, 0);
+ ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id);
+ ddi_put8(acc_handle, &pthru->lun, kpthru->lun);
+ ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len);
+ ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count);
+ ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout);
+ ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len);
+
+ ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0);
+ /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */
+ ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0);
+
+ ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb,
+ pthru->cdb_len, DDI_DEV_AUTOINCR);
+
+ ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64);
+ ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen);
+ ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr);
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_pthru: fw_ioctl failed"));
+ } else {
+ if (xferlen && kpthru->flags & MFI_FRAME_DIR_READ) {
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)pthru_dma_obj.buffer+i,
+ (uint8_t *)ubuf+i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_pthru : "
+					    "copy to user space failed"));
+					(void) drsas_free_dma_obj(instance,
+					    pthru_dma_obj);
+					return (DDI_FAILURE);
+ }
+ }
+ }
+ }
+
+ kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status);
+ kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status);
+
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, "
+ "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status));
+
+ if (xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * issue_mfi_dcmd
+ */
+static int
+issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ struct drsas_cmd *cmd, int mode)
+{
+ void *ubuf;
+ uint32_t kphys_addr = 0;
+ uint32_t xferlen = 0;
+ uint32_t model;
+ dma_obj_t dcmd_dma_obj;
+ struct drsas_dcmd_frame *kdcmd;
+ struct drsas_dcmd_frame *dcmd;
+ ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle;
+ int i;
+ dcmd = &cmd->frame->dcmd;
+ kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32"));
+
+ xferlen = kdcmd->sgl.sge32[0].length;
+
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+ } else {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32"));
+ xferlen = kdcmd->sgl.sge32[0].length;
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+#else
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64"));
+ xferlen = kdcmd->sgl.sge64[0].length;
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr;
+#endif
+ }
+ if (xferlen) {
+		/* the IOCTL requires DMA; allocate the data transfer buffer */
+ dcmd_dma_obj.size = xferlen;
+ dcmd_dma_obj.dma_attr = drsas_generic_dma_attr;
+ dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ dcmd_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ if (kdcmd->flags & MFI_FRAME_DIR_WRITE) {
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyin((uint8_t *)ubuf + i,
+ (uint8_t *)dcmd_dma_obj.buffer + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_dcmd : "
+					    "copy from user space failed"));
+					(void) drsas_free_dma_obj(instance,
+					    dcmd_dma_obj);
+					return (DDI_FAILURE);
+ }
+ }
+ }
+
+ kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address;
+ }
+
+ ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd);
+ ddi_put8(acc_handle, &dcmd->cmd_status, 0);
+ ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count);
+ ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout);
+ ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len);
+ ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode);
+
+ ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b,
+ (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR);
+
+ ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64);
+ ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen);
+ ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr);
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed"));
+ } else {
+ if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) {
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)dcmd_dma_obj.buffer + i,
+ (uint8_t *)ubuf + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_dcmd : "
+ "copy to user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+ }
+
+ kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status);
+
+ if (xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * issue_mfi_smp
+ */
+static int
+issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ struct drsas_cmd *cmd, int mode)
+{
+ void *request_ubuf;
+ void *response_ubuf;
+ uint32_t request_xferlen = 0;
+ uint32_t response_xferlen = 0;
+ uint_t model;
+ dma_obj_t request_dma_obj;
+ dma_obj_t response_dma_obj;
+ ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle;
+ struct drsas_smp_frame *ksmp;
+ struct drsas_smp_frame *smp;
+ struct drsas_sge32 *sge32;
+#ifndef _ILP32
+ struct drsas_sge64 *sge64;
+#endif
+ int i;
+ uint64_t tmp_sas_addr;
+
+ smp = &cmd->frame->smp;
+ ksmp = (struct drsas_smp_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32"));
+
+ sge32 = &ksmp->sgl[0].sge32[0];
+ response_xferlen = sge32[0].length;
+ request_xferlen = sge32[1].length;
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: "
+ "response_xferlen = %x, request_xferlen = %x",
+ response_xferlen, request_xferlen));
+
+ response_ubuf = (void *)(ulong_t)sge32[0].phys_addr;
+ request_ubuf = (void *)(ulong_t)sge32[1].phys_addr;
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: "
+ "response_ubuf = %p, request_ubuf = %p",
+ response_ubuf, request_ubuf));
+ } else {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32"));
+
+ sge32 = &ksmp->sgl[0].sge32[0];
+ response_xferlen = sge32[0].length;
+ request_xferlen = sge32[1].length;
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: "
+ "response_xferlen = %x, request_xferlen = %x",
+ response_xferlen, request_xferlen));
+
+ response_ubuf = (void *)(ulong_t)sge32[0].phys_addr;
+ request_ubuf = (void *)(ulong_t)sge32[1].phys_addr;
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: "
+ "response_ubuf = %p, request_ubuf = %p",
+ response_ubuf, request_ubuf));
+#else
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64"));
+
+ sge64 = &ksmp->sgl[0].sge64[0];
+ response_xferlen = sge64[0].length;
+ request_xferlen = sge64[1].length;
+
+ response_ubuf = (void *)(ulong_t)sge64[0].phys_addr;
+ request_ubuf = (void *)(ulong_t)sge64[1].phys_addr;
+#endif
+ }
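+	/*
+	 * In this ioctl path, sge[0] of the SMP frame describes the
+	 * response buffer and sge[1] the request buffer, which is why the
+	 * lengths and user addresses are picked up in that order above
+	 * and written back in the same order below.
+	 */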
+ if (request_xferlen) {
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ request_dma_obj.size = request_xferlen;
+ request_dma_obj.dma_attr = drsas_generic_dma_attr;
+ request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ request_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ request_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &request_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ for (i = 0; i < request_xferlen; i++) {
+ if (ddi_copyin((uint8_t *)request_ubuf + i,
+ (uint8_t *)request_dma_obj.buffer + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ if (response_xferlen) {
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ response_dma_obj.size = response_xferlen;
+ response_dma_obj.dma_attr = drsas_generic_dma_attr;
+ response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ response_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ response_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &response_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ for (i = 0; i < response_xferlen; i++) {
+ if (ddi_copyin((uint8_t *)response_ubuf + i,
+ (uint8_t *)response_dma_obj.buffer + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ ddi_put8(acc_handle, &smp->cmd, ksmp->cmd);
+ ddi_put8(acc_handle, &smp->cmd_status, 0);
+ ddi_put8(acc_handle, &smp->connection_status, 0);
+ ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count);
+ /* smp->context = ksmp->context; */
+ ddi_put16(acc_handle, &smp->timeout, ksmp->timeout);
+ ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len);
+
+ bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr,
+ sizeof (uint64_t));
+ ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr);
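+	/*
+	 * The SAS address is staged through an aligned local
+	 * (tmp_sas_addr) instead of being read in place, presumably
+	 * because ksmp points into the packed ioctl frame and need not be
+	 * 8-byte aligned for a direct 64-bit load.
+	 */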
+
+ ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64);
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+		con_log(CL_ANN1, (CE_NOTE,
+		    "issue_mfi_smp: DDI_MODEL_ILP32"));
+
+ sge32 = &smp->sgl[0].sge32[0];
+ ddi_put32(acc_handle, &sge32[0].length, response_xferlen);
+ ddi_put32(acc_handle, &sge32[0].phys_addr,
+ response_dma_obj.dma_cookie[0].dmac_address);
+ ddi_put32(acc_handle, &sge32[1].length, request_xferlen);
+ ddi_put32(acc_handle, &sge32[1].phys_addr,
+ request_dma_obj.dma_cookie[0].dmac_address);
+ } else {
+#ifdef _ILP32
+		con_log(CL_ANN1, (CE_NOTE,
+		    "issue_mfi_smp: DDI_MODEL_ILP32"));
+ sge32 = &smp->sgl[0].sge32[0];
+ ddi_put32(acc_handle, &sge32[0].length, response_xferlen);
+ ddi_put32(acc_handle, &sge32[0].phys_addr,
+ response_dma_obj.dma_cookie[0].dmac_address);
+ ddi_put32(acc_handle, &sge32[1].length, request_xferlen);
+ ddi_put32(acc_handle, &sge32[1].phys_addr,
+ request_dma_obj.dma_cookie[0].dmac_address);
+#else
+ con_log(CL_ANN1, (CE_NOTE,
+ "issue_mfi_smp: DDI_MODEL_LP64"));
+ sge64 = &smp->sgl[0].sge64[0];
+ ddi_put32(acc_handle, &sge64[0].length, response_xferlen);
+ ddi_put64(acc_handle, &sge64[0].phys_addr,
+ response_dma_obj.dma_cookie[0].dmac_address);
+ ddi_put32(acc_handle, &sge64[1].length, request_xferlen);
+ ddi_put64(acc_handle, &sge64[1].phys_addr,
+ request_dma_obj.dma_cookie[0].dmac_address);
+#endif
+ }
+	con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : "
+	    "smp->response_xferlen = %d, smp->request_xferlen = %d "
+	    "smp->data_xfer_len = %d", response_xferlen, request_xferlen,
+	    ddi_get32(acc_handle, &smp->data_xfer_len)));
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_smp: fw_ioctl failed"));
+ } else {
+ con_log(CL_ANN1, (CE_NOTE,
+ "issue_mfi_smp: copy to user space"));
+
+ if (request_xferlen) {
+ for (i = 0; i < request_xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)request_dma_obj.buffer +
+ i, (uint8_t *)request_ubuf + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_smp : copy to user space"
+ " failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ if (response_xferlen) {
+ for (i = 0; i < response_xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)response_dma_obj.buffer
+ + i, (uint8_t *)response_ubuf
+ + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_smp : copy to "
+ "user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+ }
+
+ ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status);
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d",
+ ddi_get8(acc_handle, &smp->cmd_status)));
+
+ if (request_xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, request_dma_obj) !=
+ DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ if (response_xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, response_dma_obj) !=
+ DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * issue_mfi_stp
+ */
+static int
+issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ struct drsas_cmd *cmd, int mode)
+{
+ void *fis_ubuf;
+ void *data_ubuf;
+ uint32_t fis_xferlen = 0;
+ uint32_t data_xferlen = 0;
+ uint_t model;
+ dma_obj_t fis_dma_obj;
+ dma_obj_t data_dma_obj;
+ struct drsas_stp_frame *kstp;
+ struct drsas_stp_frame *stp;
+ ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle;
+ int i;
+
+ stp = &cmd->frame->stp;
+ kstp = (struct drsas_stp_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32"));
+
+ fis_xferlen = kstp->sgl.sge32[0].length;
+ data_xferlen = kstp->sgl.sge32[1].length;
+
+ fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr;
+ data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr;
+	} else {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32"));
+
+ fis_xferlen = kstp->sgl.sge32[0].length;
+ data_xferlen = kstp->sgl.sge32[1].length;
+
+ fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr;
+ data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr;
+#else
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64"));
+
+ fis_xferlen = kstp->sgl.sge64[0].length;
+ data_xferlen = kstp->sgl.sge64[1].length;
+
+ fis_ubuf = (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr;
+ data_ubuf = (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr;
+#endif
+ }
+
+ if (fis_xferlen) {
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: "
+ "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen));
+
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ fis_dma_obj.size = fis_xferlen;
+ fis_dma_obj.dma_attr = drsas_generic_dma_attr;
+ fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ fis_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ fis_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &fis_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ for (i = 0; i < fis_xferlen; i++) {
+ if (ddi_copyin((uint8_t *)fis_ubuf + i,
+ (uint8_t *)fis_dma_obj.buffer + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ if (data_xferlen) {
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p "
+ "data_xferlen = %x", data_ubuf, data_xferlen));
+
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ data_dma_obj.size = data_xferlen;
+ data_dma_obj.dma_attr = drsas_generic_dma_attr;
+ data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ data_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ data_dma_obj.dma_attr.dma_attr_align = 1;
+
+		/* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &data_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ for (i = 0; i < data_xferlen; i++) {
+ if (ddi_copyin((uint8_t *)data_ubuf + i,
+ (uint8_t *)data_dma_obj.buffer + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ ddi_put8(acc_handle, &stp->cmd, kstp->cmd);
+ ddi_put8(acc_handle, &stp->cmd_status, 0);
+ ddi_put8(acc_handle, &stp->connection_status, 0);
+ ddi_put8(acc_handle, &stp->target_id, kstp->target_id);
+ ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count);
+
+ ddi_put16(acc_handle, &stp->timeout, kstp->timeout);
+ ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len);
+
+ ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10,
+ DDI_DEV_AUTOINCR);
+
+ ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64);
+ ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags);
+ ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen);
+ ddi_put32(acc_handle, &stp->sgl.sge32[0].phys_addr,
+ fis_dma_obj.dma_cookie[0].dmac_address);
+ ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen);
+ ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr,
+ data_dma_obj.dma_cookie[0].dmac_address);
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed"));
+	} else {
+		if (fis_xferlen) {
+			for (i = 0; i < fis_xferlen; i++) {
+				if (ddi_copyout(
+				    (uint8_t *)fis_dma_obj.buffer + i,
+				    (uint8_t *)fis_ubuf + i, 1, mode)) {
+					con_log(CL_ANN, (CE_WARN,
+					    "issue_mfi_stp : copy to "
+					    "user space failed"));
+					return (DDI_FAILURE);
+				}
+			}
+		}
+
+		if (data_xferlen) {
+			for (i = 0; i < data_xferlen; i++) {
+				if (ddi_copyout(
+				    (uint8_t *)data_dma_obj.buffer + i,
+				    (uint8_t *)data_ubuf + i, 1, mode)) {
+					con_log(CL_ANN, (CE_WARN,
+					    "issue_mfi_stp : copy to"
+					    " user space failed"));
+					return (DDI_FAILURE);
+				}
+			}
+		}
+	}
+
+ kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status);
+
+ if (fis_xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ if (data_xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * fill_up_drv_ver
+ */
+static void
+fill_up_drv_ver(struct drsas_drv_ver *dv)
+{
+ (void) memset(dv, 0, sizeof (struct drsas_drv_ver));
+
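+	/*
+	 * The memcpy() calls below deliberately omit the NUL terminator;
+	 * the memset() above has already zeroed every fixed-width field,
+	 * so each string remains terminated as long as it is shorter than
+	 * its destination field.
+	 */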
+ (void) memcpy(dv->signature, "$LSI LOGIC$", strlen("$LSI LOGIC$"));
+ (void) memcpy(dv->os_name, "Solaris", strlen("Solaris"));
+ (void) memcpy(dv->drv_name, "dr_sas", strlen("dr_sas"));
+ (void) memcpy(dv->drv_ver, DRSAS_VERSION, strlen(DRSAS_VERSION));
+ (void) memcpy(dv->drv_rel_date, DRSAS_RELDATE,
+ strlen(DRSAS_RELDATE));
+}
+
+/*
+ * handle_drv_ioctl
+ */
+static int
+handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ int mode)
+{
+ int i;
+ int rval = DDI_SUCCESS;
+ int *props = NULL;
+ void *ubuf;
+
+ uint8_t *pci_conf_buf;
+ uint32_t xferlen;
+ uint32_t num_props;
+ uint_t model;
+ struct drsas_dcmd_frame *kdcmd;
+ struct drsas_drv_ver dv;
+ struct drsas_pci_information pi;
+
+ kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE,
+ "handle_drv_ioctl: DDI_MODEL_ILP32"));
+
+ xferlen = kdcmd->sgl.sge32[0].length;
+
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+ } else {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE,
+ "handle_drv_ioctl: DDI_MODEL_ILP32"));
+ xferlen = kdcmd->sgl.sge32[0].length;
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+#else
+ con_log(CL_ANN1, (CE_NOTE,
+ "handle_drv_ioctl: DDI_MODEL_LP64"));
+ xferlen = kdcmd->sgl.sge64[0].length;
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr;
+#endif
+ }
+ con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: "
+ "dataBuf=%p size=%d bytes", ubuf, xferlen));
+
+ switch (kdcmd->opcode) {
+ case DRSAS_DRIVER_IOCTL_DRIVER_VERSION:
+ con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: "
+ "DRSAS_DRIVER_IOCTL_DRIVER_VERSION"));
+
+ fill_up_drv_ver(&dv);
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyout((uint8_t *)&dv + i, (uint8_t *)ubuf + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+ "DRSAS_DRIVER_IOCTL_DRIVER_VERSION"
+ " : copy to user space failed"));
+ kdcmd->cmd_status = 1;
+ rval = DDI_FAILURE;
+ break;
+ }
+ }
+ if (i == xferlen)
+ kdcmd->cmd_status = 0;
+ break;
+ case DRSAS_DRIVER_IOCTL_PCI_INFORMATION:
+		con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: "
+		    "DRSAS_DRIVER_IOCTL_PCI_INFORMATION"));
+
+ if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip,
+ 0, "reg", &props, &num_props)) {
+ con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+ "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : "
+			    "ddi_prop_lookup_int_array failed"));
+			kdcmd->cmd_status = 1;
+			rval = DDI_FAILURE;
+			break;
+ } else {
+
+ pi.busNumber = (props[0] >> 16) & 0xFF;
+ pi.deviceNumber = (props[0] >> 11) & 0x1f;
+ pi.functionNumber = (props[0] >> 8) & 0x7;
+ ddi_prop_free((void *)props);
+ }
+
+ pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo;
+
+ for (i = 0; i < (sizeof (struct drsas_pci_information) -
+ offsetof(struct drsas_pci_information, pciHeaderInfo));
+ i++) {
+ pci_conf_buf[i] =
+ pci_config_get8(instance->pci_handle, i);
+ }
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyout((uint8_t *)&pi + i, (uint8_t *)ubuf + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+ "DRSAS_DRIVER_IOCTL_PCI_INFORMATION"
+ " : copy to user space failed"));
+ kdcmd->cmd_status = 1;
+ rval = DDI_FAILURE;
+ break;
+ }
+ }
+
+ if (i == xferlen)
+ kdcmd->cmd_status = 0;
+
+ break;
+ default:
+ con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+ "invalid driver specific IOCTL opcode = 0x%x",
+ kdcmd->opcode));
+ kdcmd->cmd_status = 1;
+ rval = DDI_FAILURE;
+ break;
+ }
+
+ return (rval);
+}
+
+/*
+ * handle_mfi_ioctl
+ */
+static int
+handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ int mode)
+{
+ int rval = DDI_SUCCESS;
+
+ struct drsas_header *hdr;
+ struct drsas_cmd *cmd;
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: "
+ "failed to get a cmd packet"));
+ return (DDI_FAILURE);
+ }
+
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
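+	/*
+	 * The context stored in the frame header is echoed back by the
+	 * firmware on completion, so the completion path can use it as an
+	 * index to map the reply back to this drsas_cmd.
+	 */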
+
+ hdr = (struct drsas_header *)&ioctl->frame[0];
+
+ switch (hdr->cmd) {
+ case MFI_CMD_OP_DCMD:
+ rval = issue_mfi_dcmd(instance, ioctl, cmd, mode);
+ break;
+ case MFI_CMD_OP_SMP:
+ rval = issue_mfi_smp(instance, ioctl, cmd, mode);
+ break;
+ case MFI_CMD_OP_STP:
+ rval = issue_mfi_stp(instance, ioctl, cmd, mode);
+ break;
+ case MFI_CMD_OP_LD_SCSI:
+ case MFI_CMD_OP_PD_SCSI:
+ rval = issue_mfi_pthru(instance, ioctl, cmd, mode);
+ break;
+ default:
+ con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: "
+ "invalid mfi ioctl hdr->cmd = %d", hdr->cmd));
+ rval = DDI_FAILURE;
+ break;
+ }
+
+ return_mfi_pkt(instance, cmd);
+ if (drsas_common_check(instance, cmd) != DDI_SUCCESS)
+ rval = DDI_FAILURE;
+ return (rval);
+}
+
+/*
+ * AEN
+ */
+static int
+handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen)
+{
+ int rval = 0;
+
+ rval = register_mfi_aen(instance, instance->aen_seq_num,
+ aen->class_locale_word);
+
+ aen->cmd_status = (uint8_t)rval;
+
+ return (rval);
+}
+
+static int
+register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num,
+ uint32_t class_locale_word)
+{
+ int ret_val;
+
+ struct drsas_cmd *cmd, *aen_cmd;
+ struct drsas_dcmd_frame *dcmd;
+ union drsas_evt_class_locale curr_aen;
+ union drsas_evt_class_locale prev_aen;
+
+	/*
+	 * If there is an AEN pending already (aen_cmd), check if the
+	 * class_locale of that pending AEN is inclusive of the new
+	 * AEN request we currently have. If it is, then we don't have
+	 * to do anything. In other words, whichever events the current
+	 * AEN request is subscribing to have already been subscribed
+	 * to.
+	 *
+	 * If the old_cmd is _not_ inclusive, then we have to abort
+	 * that command, form a class_locale that is a superset of both
+	 * the old and the current one, and re-issue it to the FW.
+	 */
+
+ curr_aen.word = class_locale_word;
+ aen_cmd = instance->aen_cmd;
+ if (aen_cmd) {
+ prev_aen.word = ddi_get32(aen_cmd->frame_dma_obj.acc_handle,
+ &aen_cmd->frame->dcmd.mbox.w[1]);
+
+		/*
+		 * A class whose enum value is smaller is inclusive of all
+		 * higher values. If a PROGRESS (= -1) was previously
+		 * registered, then new registration requests for higher
+		 * classes need not be sent to FW. They are automatically
+		 * included.
+		 *
+		 * Locale numbers don't have such a hierarchy. They are
+		 * bitmap values.
+		 */
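+		/*
+		 * Worked example: with a pending registration of
+		 * {class = PROGRESS (-1), locale = LD} and a new request
+		 * for {class = CRITICAL (2), locale = PD}, the pending
+		 * class already covers CRITICAL but the locales differ,
+		 * so the merged registration re-issued below is
+		 * {class = -1, locale = LD | PD}.
+		 */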
+ if ((prev_aen.members.class <= curr_aen.members.class) &&
+ !((prev_aen.members.locale & curr_aen.members.locale) ^
+ curr_aen.members.locale)) {
+ /*
+ * Previously issued event registration includes
+ * current request. Nothing to do.
+ */
+
+ return (0);
+ } else {
+ curr_aen.members.locale |= prev_aen.members.locale;
+
+ if (prev_aen.members.class < curr_aen.members.class)
+ curr_aen.members.class = prev_aen.members.class;
+
+ ret_val = abort_aen_cmd(instance, aen_cmd);
+
+ if (ret_val) {
+				con_log(CL_ANN, (CE_WARN, "register_mfi_aen: "
+				    "failed to abort previous AEN command"));
+
+ return (ret_val);
+ }
+ }
+ } else {
+ curr_aen.word = class_locale_word;
+ }
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd)
+ return (ENOMEM);
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ dcmd = &cmd->frame->dcmd;
+
+ /* for(i = 0; i < DCMD_MBOX_SZ; i++) dcmd->mbox.b[i] = 0; */
+ (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+ (void) memset(instance->mfi_evt_detail_obj.buffer, 0,
+ sizeof (struct drsas_evt_detail));
+
+ /* Prepare DCMD for aen registration */
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+ MFI_FRAME_DIR_READ);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+ sizeof (struct drsas_evt_detail));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+ DR_DCMD_CTRL_EVENT_WAIT);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], seq_num);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1],
+ curr_aen.word);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+ instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+ sizeof (struct drsas_evt_detail));
+
+ instance->aen_seq_num = seq_num;
+
+ /*
+ * Store reference to the cmd used to register for AEN. When an
+ * application wants us to register for AEN, we have to abort this
+ * cmd and re-register with a new EVENT LOCALE supplied by that app
+ */
+ instance->aen_cmd = cmd;
+
+ cmd->frame_count = 1;
+
+ /* Issue the aen registration frame */
+ /* atomic_add_16 (&instance->fw_outstanding, 1); */
+ instance->func_ptr->issue_cmd(cmd, instance);
+
+ return (0);
+}
+
+static void
+display_scsi_inquiry(caddr_t scsi_inq)
+{
+#define MAX_SCSI_DEVICE_CODE 14
+ int i;
+ char inquiry_buf[256] = {0};
+ int len;
+ const char *const scsi_device_types[] = {
+ "Direct-Access ",
+ "Sequential-Access",
+ "Printer ",
+ "Processor ",
+ "WORM ",
+ "CD-ROM ",
+ "Scanner ",
+ "Optical Device ",
+ "Medium Changer ",
+ "Communications ",
+ "Unknown ",
+ "Unknown ",
+ "Unknown ",
+ "Enclosure ",
+ };
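+	/*
+	 * The table above is indexed by the SCSI peripheral device type,
+	 * i.e. the low five bits of INQUIRY byte 0 (the "& 0x1f" below).
+	 */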
+
+ len = 0;
+
+	len += snprintf(inquiry_buf + len, 256 - len, " Vendor: ");
+	for (i = 8; i < 16; i++) {
+		len += snprintf(inquiry_buf + len, 256 - len, "%c",
+		    scsi_inq[i]);
+	}
+
+	len += snprintf(inquiry_buf + len, 256 - len, " Model: ");
+
+	for (i = 16; i < 32; i++) {
+		len += snprintf(inquiry_buf + len, 256 - len, "%c",
+		    scsi_inq[i]);
+	}
+
+	len += snprintf(inquiry_buf + len, 256 - len, " Rev: ");
+
+	for (i = 32; i < 36; i++) {
+		len += snprintf(inquiry_buf + len, 256 - len, "%c",
+		    scsi_inq[i]);
+	}
+
+	len += snprintf(inquiry_buf + len, 256 - len, "\n");
+
+	i = scsi_inq[0] & 0x1f;
+
+	len += snprintf(inquiry_buf + len, 256 - len, " Type: %s ",
+	    i < MAX_SCSI_DEVICE_CODE ? scsi_device_types[i] :
+	    "Unknown ");
+
+	len += snprintf(inquiry_buf + len, 256 - len,
+	    " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07);
+
+	if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) {
+		len += snprintf(inquiry_buf + len, 256 - len, " CCS\n");
+	} else {
+		len += snprintf(inquiry_buf + len, 256 - len, "\n");
+	}
+
+	con_log(CL_ANN1, (CE_CONT, "%s", inquiry_buf));
+}
+
+static int
+read_fw_status_reg_ppc(struct drsas_instance *instance)
+{
+ return ((int)RD_OB_SCRATCH_PAD_0(instance));
+}
+
+static void
+issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance)
+{
+ atomic_add_16(&instance->fw_outstanding, 1);
+
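+	/*
+	 * MFI frames are 64-byte aligned, which leaves the low bits of
+	 * the frame address free to carry ((frame_count - 1) << 1) | 1;
+	 * the shifted field tells the firmware how many extra frames
+	 * follow the first one (the exact meaning of bit 0 is
+	 * firmware-defined). The same encoding is used by the sync and
+	 * poll mode issue routines below.
+	 */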
+ /* Issue the command to the FW */
+ WR_IB_QPORT((cmd->frame_phys_addr) |
+ (((cmd->frame_count - 1) << 1) | 1), instance);
+}
+
+/*
+ * issue_cmd_in_sync_mode
+ */
+static int
+issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance,
+ struct drsas_cmd *cmd)
+{
+ int i;
+ uint32_t msecs = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC);
+
+ con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called"));
+
+ cmd->cmd_status = ENODATA;
+
+ WR_IB_QPORT((cmd->frame_phys_addr) |
+ (((cmd->frame_count - 1) << 1) | 1), instance);
+
+ mutex_enter(&instance->int_cmd_mtx);
+
+ for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) {
+ cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx);
+ }
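+	/*
+	 * Note: each iteration above corresponds to one cv_wait() wakeup
+	 * (the interrupt path signals int_cmd_cv once cmd_status has been
+	 * updated), so msecs bounds the number of wakeups rather than
+	 * elapsed milliseconds.
+	 */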
+
+ mutex_exit(&instance->int_cmd_mtx);
+
+ con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done"));
+
+	if (i < (msecs - 1)) {
+ return (DDI_SUCCESS);
+ } else {
+ return (DDI_FAILURE);
+ }
+}
+
+/*
+ * issue_cmd_in_poll_mode
+ */
+static int
+issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance,
+ struct drsas_cmd *cmd)
+{
+ int i;
+ uint16_t flags;
+ uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC;
+ struct drsas_header *frame_hdr;
+
+ con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called"));
+
+ frame_hdr = (struct drsas_header *)cmd->frame;
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status,
+ MFI_CMD_STATUS_POLL_MODE);
+ flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags);
+ flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
+
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags);
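+	/*
+	 * With MFI_FRAME_DONT_POST_IN_REPLY_QUEUE set, the firmware does
+	 * not post this command to the reply queue; completion is instead
+	 * detected by polling cmd_status until it changes from the
+	 * MFI_CMD_STATUS_POLL_MODE (0xFF) sentinel written above.
+	 */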
+
+ /* issue the frame using inbound queue port */
+ WR_IB_QPORT((cmd->frame_phys_addr) |
+ (((cmd->frame_count - 1) << 1) | 1), instance);
+
+ /* wait for cmd_status to change from 0xFF */
+ for (i = 0; i < msecs && (
+ ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status)
+ == MFI_CMD_STATUS_POLL_MODE); i++) {
+ drv_usecwait(MILLISEC); /* wait for 1000 usecs */
+ }
+
+ if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status)
+ == MFI_CMD_STATUS_POLL_MODE) {
+ con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: "
+ "cmd polling timed out"));
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+static void
+enable_intr_ppc(struct drsas_instance *instance)
+{
+ uint32_t mask;
+
+ con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called"));
+
+ /* WR_OB_DOORBELL_CLEAR(0xFFFFFFFF, instance); */
+ WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance);
+
+ /* WR_OB_INTR_MASK(~0x80000000, instance); */
+ WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance);
+
+ /* dummy read to force PCI flush */
+ mask = RD_OB_INTR_MASK(instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: "
+ "outbound_intr_mask = 0x%x", mask));
+}
+
+static void
+disable_intr_ppc(struct drsas_instance *instance)
+{
+ uint32_t mask __unused;
+
+ con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called"));
+
+ con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : "
+ "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance)));
+
+ /* WR_OB_INTR_MASK(0xFFFFFFFF, instance); */
+ WR_OB_INTR_MASK(OB_INTR_MASK, instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : "
+ "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance)));
+
+ /* dummy read to force PCI flush */
+ mask = RD_OB_INTR_MASK(instance);
+#ifdef lint
+ mask = mask;
+#endif
+}
+
+static int
+intr_ack_ppc(struct drsas_instance *instance)
+{
+ uint32_t status;
+
+ con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called"));
+
+ /* check if it is our interrupt */
+ status = RD_OB_INTR_STATUS(instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x", status));
+
+ if (!(status & MFI_REPLY_2108_MESSAGE_INTR)) {
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ /* clear the interrupt by writing back the same value */
+ WR_OB_DOORBELL_CLEAR(status, instance);
+
+ /* dummy READ */
+ status = RD_OB_INTR_STATUS(instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared"));
+
+ return (DDI_INTR_CLAIMED);
+}
+
+static int
+drsas_common_check(struct drsas_instance *instance,
+ struct drsas_cmd *cmd)
+{
+ int ret = DDI_SUCCESS;
+
+ if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) !=
+ DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ if (cmd->pkt != NULL) {
+ cmd->pkt->pkt_reason = CMD_TRAN_ERR;
+ cmd->pkt->pkt_statistics = 0;
+ }
+ ret = DDI_FAILURE;
+ }
+ if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle)
+ != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ if (cmd->pkt != NULL) {
+ cmd->pkt->pkt_reason = CMD_TRAN_ERR;
+ cmd->pkt->pkt_statistics = 0;
+ }
+ ret = DDI_FAILURE;
+ }
+ if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) !=
+ DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ if (cmd->pkt != NULL) {
+ cmd->pkt->pkt_reason = CMD_TRAN_ERR;
+ cmd->pkt->pkt_statistics = 0;
+ }
+ ret = DDI_FAILURE;
+ }
+ if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+
+ ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0);
+
+ if (cmd->pkt != NULL) {
+ cmd->pkt->pkt_reason = CMD_TRAN_ERR;
+ cmd->pkt->pkt_statistics = 0;
+ }
+ ret = DDI_FAILURE;
+ }
+
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+drsas_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data)
+{
+ /*
+ * as the driver can always deal with an error in any dma or
+ * access handle, we can just return the fme_status value.
+ */
+ pci_ereport_post(dip, err, NULL);
+ return (err->fme_status);
+}
+
+static void
+drsas_fm_init(struct drsas_instance *instance)
+{
+ /* Need to change iblock to priority for new MSI intr */
+ ddi_iblock_cookie_t fm_ibc;
+
+ /* Only register with IO Fault Services if we have some capability */
+ if (instance->fm_capabilities) {
+ /* Adjust access and dma attributes for FMA */
+ endian_attr.devacc_attr_access = DDI_FLAGERR_ACC;
+ drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR;
+
+ /*
+ * Register capabilities with IO Fault Services.
+ * fm_capabilities will be updated to indicate
+ * capabilities actually supported (not requested.)
+ */
+
+ ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc);
+
+ /*
+ * Initialize pci ereport capabilities if ereport
+ * capable (should always be.)
+ */
+
+ if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) ||
+ DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+ pci_ereport_setup(instance->dip);
+ }
+
+ /*
+ * Register error callback if error callback capable.
+ */
+ if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+ ddi_fm_handler_register(instance->dip,
+ drsas_fm_error_cb, (void*) instance);
+ }
+ } else {
+ endian_attr.devacc_attr_access = DDI_DEFAULT_ACC;
+ drsas_generic_dma_attr.dma_attr_flags = 0;
+ }
+}
+
+static void
+drsas_fm_fini(struct drsas_instance *instance)
+{
+ /* Only unregister FMA capabilities if registered */
+ if (instance->fm_capabilities) {
+ /*
+ * Un-register error callback if error callback capable.
+ */
+ if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+ ddi_fm_handler_unregister(instance->dip);
+ }
+
+ /*
+ * Release any resources allocated by pci_ereport_setup()
+ */
+ if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) ||
+ DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+ pci_ereport_teardown(instance->dip);
+ }
+
+ /* Unregister from IO Fault Services */
+ ddi_fm_fini(instance->dip);
+
+ /* Adjust access and dma attributes for FMA */
+ endian_attr.devacc_attr_access = DDI_DEFAULT_ACC;
+ drsas_generic_dma_attr.dma_attr_flags = 0;
+ }
+}
+
+int
+drsas_check_acc_handle(ddi_acc_handle_t handle)
+{
+ ddi_fm_error_t de;
+
+ if (handle == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION);
+
+ return (de.fme_status);
+}
+
+int
+drsas_check_dma_handle(ddi_dma_handle_t handle)
+{
+ ddi_fm_error_t de;
+
+ if (handle == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION);
+
+ return (de.fme_status);
+}
+
+void
+drsas_fm_ereport(struct drsas_instance *instance, char *detail)
+{
+ uint64_t ena;
+ char buf[FM_MAX_CLASS];
+
+ (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+ if (DDI_FM_EREPORT_CAP(instance->fm_capabilities)) {
+ ddi_fm_ereport_post(instance->dip, buf, ena, DDI_NOSLEEP,
+ FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL);
+ }
+}
+
+static int
+drsas_add_intrs(struct drsas_instance *instance, int intr_type)
+{
+
+ dev_info_t *dip = instance->dip;
+ int avail, actual, count;
+ int i, flag, ret;
+
+ con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x",
+ intr_type));
+
+ /* Get number of interrupts */
+ ret = ddi_intr_get_nintrs(dip, intr_type, &count);
+ if ((ret != DDI_SUCCESS) || (count == 0)) {
+ con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:"
+ "ret %d count %d", ret, count));
+
+ return (DDI_FAILURE);
+ }
+
+ con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count));
+
+ /* Get number of available interrupts */
+ ret = ddi_intr_get_navail(dip, intr_type, &avail);
+ if ((ret != DDI_SUCCESS) || (avail == 0)) {
+ con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:"
+ "ret %d avail %d", ret, avail));
+
+ return (DDI_FAILURE);
+ }
+ con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail));
+
+ /* Only one interrupt routine. So limit the count to 1 */
+ if (count > 1) {
+ count = 1;
+ }
+
+ /*
+ * Allocate an array of interrupt handlers. Currently we support
+ * only one interrupt. The framework can be extended later.
+ */
+ instance->intr_size = count * sizeof (ddi_intr_handle_t);
+ instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP);
+ ASSERT(instance->intr_htable);
+
+ flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type ==
+ DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL;
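+	/*
+	 * DDI_INTR_ALLOC_STRICT makes ddi_intr_alloc() fail unless the
+	 * full requested count can be granted, the usual policy for
+	 * MSI/MSI-X vectors; fixed interrupts tolerate a partial
+	 * allocation (DDI_INTR_ALLOC_NORMAL).
+	 */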
+
+ /* Allocate interrupt */
+ ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0,
+ count, &actual, flag);
+
+ if ((ret != DDI_SUCCESS) || (actual == 0)) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+ "avail = %d", avail));
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+ if (actual < count) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+ "Requested = %d Received = %d", count, actual));
+ }
+ instance->intr_cnt = actual;
+
+ /*
+ * Get the priority of the interrupt allocated.
+ */
+ if ((ret = ddi_intr_get_pri(instance->intr_htable[0],
+ &instance->intr_pri)) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+ "get priority call failed"));
+
+ for (i = 0; i < actual; i++) {
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+
+	/*
+	 * Test for high level interrupts. We don't support them.
+	 */
+ if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+ "High level interrupts not supported."));
+
+ for (i = 0; i < actual; i++) {
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+
+ con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ",
+ instance->intr_pri));
+
+ /* Call ddi_intr_add_handler() */
+ for (i = 0; i < actual; i++) {
+ ret = ddi_intr_add_handler(instance->intr_htable[i],
+ (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance,
+ (caddr_t)(uintptr_t)i);
+
+ if (ret != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:"
+ "failed %d", ret));
+
+ for (i = 0; i < actual; i++) {
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+
+ }
+
+ con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done"));
+
+ if ((ret = ddi_intr_get_cap(instance->intr_htable[0],
+ &instance->intr_cap)) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d",
+ ret));
+
+ /* Free already allocated intr */
+ for (i = 0; i < actual; i++) {
+ (void) ddi_intr_remove_handler(
+ instance->intr_htable[i]);
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+
+ if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) {
+		con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block_enable"));
+
+ (void) ddi_intr_block_enable(instance->intr_htable,
+ instance->intr_cnt);
+ } else {
+ con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable"));
+
+ for (i = 0; i < instance->intr_cnt; i++) {
+ (void) ddi_intr_enable(instance->intr_htable[i]);
+			con_log(CL_ANN, (CE_NOTE, "ddi_intr_enable done "
+			    "for intr %d", i));
+ }
+ }
+
+	return (DDI_SUCCESS);
+}
+
+static void
+drsas_rem_intrs(struct drsas_instance *instance)
+{
+ int i;
+
+ con_log(CL_ANN, (CE_NOTE, "drsas_rem_intrs called"));
+
+ /* Disable all interrupts first */
+ if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) {
+ (void) ddi_intr_block_disable(instance->intr_htable,
+ instance->intr_cnt);
+ } else {
+ for (i = 0; i < instance->intr_cnt; i++) {
+ (void) ddi_intr_disable(instance->intr_htable[i]);
+ }
+ }
+
+ /* Remove all the handlers */
+
+ for (i = 0; i < instance->intr_cnt; i++) {
+ (void) ddi_intr_remove_handler(instance->intr_htable[i]);
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+
+ kmem_free(instance->intr_htable, instance->intr_size);
+}
+
+static int
+drsas_tran_bus_config(dev_info_t *parent, uint_t flags,
+ ddi_bus_config_op_t op, void *arg, dev_info_t **childp)
+{
+ struct drsas_instance *instance;
+ int config;
+ int rval;
+
+ char *ptr = NULL;
+ int tgt, lun;
+
+ con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op));
+
+ if ((instance = ddi_get_soft_state(drsas_state,
+ ddi_get_instance(parent))) == NULL) {
+ return (NDI_FAILURE);
+ }
+
+ /* Hold nexus during bus_config */
+ ndi_devi_enter(parent, &config);
+ switch (op) {
+ case BUS_CONFIG_ONE: {
+
+ /* parse wwid/target name out of name given */
+ if ((ptr = strchr((char *)arg, '@')) == NULL) {
+ rval = NDI_FAILURE;
+ break;
+ }
+ ptr++;
+
+ if (drsas_parse_devname(arg, &tgt, &lun) != 0) {
+ rval = NDI_FAILURE;
+ break;
+ }
+
+ if (lun == 0) {
+ rval = drsas_config_ld(instance, tgt, lun, childp);
+ } else {
+ rval = NDI_FAILURE;
+ }
+
+ break;
+ }
+ case BUS_CONFIG_DRIVER:
+	case BUS_CONFIG_ALL: {
+		(void) drsas_config_all_devices(instance);
+		rval = NDI_SUCCESS;
+		break;
+	}
+	default:
+		rval = NDI_FAILURE;
+		break;
+	}
+
+ if (rval == NDI_SUCCESS) {
+ rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0);
+
+ }
+ ndi_devi_exit(parent, config);
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x",
+ rval));
+ return (rval);
+}
+
+static int
+drsas_config_all_devices(struct drsas_instance *instance)
+{
+ int rval, tgt;
+
+ for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) {
+ (void) drsas_config_ld(instance, tgt, 0, NULL);
+
+ }
+
+ rval = NDI_SUCCESS;
+ return (rval);
+}
+
+static int
+drsas_parse_devname(char *devnm, int *tgt, int *lun)
+{
+ char devbuf[SCSI_MAXNAMELEN];
+ char *addr;
+ char *p, *tp, *lp;
+ long num;
+
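+	/*
+	 * Example: for a hypothetical devnm of "sd@1,0:a", devbuf is
+	 * truncated to "sd", addr points at "1,0", and tgt/lun are parsed
+	 * (base 16) as 0x1 and 0x0 respectively.
+	 */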
+ /* Parse dev name and address */
+ (void) strcpy(devbuf, devnm);
+ addr = "";
+ for (p = devbuf; *p != '\0'; p++) {
+ if (*p == '@') {
+ addr = p + 1;
+ *p = '\0';
+ } else if (*p == ':') {
+ *p = '\0';
+ break;
+ }
+ }
+
+ /* Parse target and lun */
+ for (p = tp = addr, lp = NULL; *p != '\0'; p++) {
+ if (*p == ',') {
+ lp = p + 1;
+ *p = '\0';
+ break;
+ }
+ }
+ if (tgt && tp) {
+ if (ddi_strtol(tp, NULL, 0x10, &num)) {
+			return (DDI_FAILURE);
+ }
+ *tgt = (int)num;
+ }
+ if (lun && lp) {
+ if (ddi_strtol(lp, NULL, 0x10, &num)) {
+ return (DDI_FAILURE);
+ }
+ *lun = (int)num;
+ }
+	return (DDI_SUCCESS);
+}
+
+static int
+drsas_config_ld(struct drsas_instance *instance, uint16_t tgt,
+ uint8_t lun, dev_info_t **ldip)
+{
+ struct scsi_device *sd;
+ dev_info_t *child;
+ int rval;
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d",
+ tgt, lun));
+
+ if ((child = drsas_find_child(instance, tgt, lun)) != NULL) {
+ if (ldip) {
+ *ldip = child;
+ }
+ con_log(CL_ANN1, (CE_NOTE,
+ "drsas_config_ld: Child = %p found t = %d l = %d",
+ (void *)child, tgt, lun));
+ return (NDI_SUCCESS);
+ }
+
+ sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP);
+ sd->sd_address.a_hba_tran = instance->tran;
+ sd->sd_address.a_target = (uint16_t)tgt;
+ sd->sd_address.a_lun = (uint8_t)lun;
+
+ if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS)
+ rval = drsas_config_scsi_device(instance, sd, ldip);
+ else
+ rval = NDI_FAILURE;
+
+ /* sd_unprobe is blank now. Free buffer manually */
+ if (sd->sd_inq) {
+ kmem_free(sd->sd_inq, SUN_INQSIZE);
+ sd->sd_inq = (struct scsi_inquiry *)NULL;
+ }
+
+ kmem_free(sd, sizeof (struct scsi_device));
+ con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d",
+ rval));
+ return (rval);
+}
+
+static int
+drsas_config_scsi_device(struct drsas_instance *instance,
+ struct scsi_device *sd, dev_info_t **dipp)
+{
+ char *nodename = NULL;
+ char **compatible = NULL;
+ int ncompatible = 0;
+ char *childname;
+ dev_info_t *ldip = NULL;
+ int tgt = sd->sd_address.a_target;
+ int lun = sd->sd_address.a_lun;
+ int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK;
+ int rval;
+
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun));
+ scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype,
+ NULL, &nodename, &compatible, &ncompatible);
+
+ if (nodename == NULL) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver "
+ "for t%dL%d", tgt, lun));
+ rval = NDI_FAILURE;
+ goto finish;
+ }
+
+ childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename;
+	con_log(CL_ANN1, (CE_WARN,
+	    "dr_sas: Childname = %s nodename = %s", childname, nodename));
+
+ /* Create a dev node */
+ rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip);
+ con_log(CL_ANN1, (CE_WARN,
+ "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval));
+ if (rval == NDI_SUCCESS) {
+ if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) !=
+ DDI_PROP_SUCCESS) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+ "property for t%dl%d target", tgt, lun));
+ rval = NDI_FAILURE;
+ goto finish;
+ }
+ if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) !=
+ DDI_PROP_SUCCESS) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+ "property for t%dl%d lun", tgt, lun));
+ rval = NDI_FAILURE;
+ goto finish;
+ }
+
+ if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip,
+ "compatible", compatible, ncompatible) !=
+ DDI_PROP_SUCCESS) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+ "property for t%dl%d compatible", tgt, lun));
+ rval = NDI_FAILURE;
+ goto finish;
+ }
+
+ rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH);
+ if (rval != NDI_SUCCESS) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online "
+ "t%dl%d", tgt, lun));
+ ndi_prop_remove_all(ldip);
+ (void) ndi_devi_free(ldip);
+ } else {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :"
+ "0 t%dl%d", tgt, lun));
+ }
+
+ }
+finish:
+ if (dipp) {
+ *dipp = ldip;
+ }
+
+ con_log(CL_DLEVEL1, (CE_WARN,
+ "dr_sas: config_scsi_device rval = %d t%dL%d",
+ rval, tgt, lun));
+ scsi_hba_nodename_compatible_free(nodename, compatible);
+ return (rval);
+}
+
+/*ARGSUSED*/
+static int
+drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event,
+ uint64_t wwn)
+{
+ struct drsas_eventinfo *mrevt = NULL;
+
+ con_log(CL_ANN1, (CE_NOTE,
+ "drsas_service_evt called for t%dl%d event = %d",
+ tgt, lun, event));
+
+ if ((instance->taskq == NULL) || (mrevt =
+ kmem_zalloc(sizeof (struct drsas_eventinfo), KM_NOSLEEP)) == NULL) {
+ return (ENOMEM);
+ }
+
+ mrevt->instance = instance;
+ mrevt->tgt = tgt;
+ mrevt->lun = lun;
+ mrevt->event = event;
+
+ if ((ddi_taskq_dispatch(instance->taskq,
+ (void (*)(void *))drsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) !=
+ DDI_SUCCESS) {
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: Event task failed for t%dl%d event = %d",
+ tgt, lun, event));
+ kmem_free(mrevt, sizeof (struct drsas_eventinfo));
+ return (DDI_FAILURE);
+ }
+ return (DDI_SUCCESS);
+}
+
+static void
+drsas_issue_evt_taskq(struct drsas_eventinfo *mrevt)
+{
+ struct drsas_instance *instance = mrevt->instance;
+ dev_info_t *dip, *pdip;
+ int circ1 = 0;
+ char *devname;
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for"
+ " tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+
+ if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) {
+ dip = instance->dr_ld_list[mrevt->tgt].dip;
+ } else {
+ return;
+ }
+
+ ndi_devi_enter(instance->dip, &circ1);
+ switch (mrevt->event) {
+ case DRSAS_EVT_CONFIG_TGT:
+ if (dip == NULL) {
+
+ if (mrevt->lun == 0) {
+ (void) drsas_config_ld(instance, mrevt->tgt,
+ 0, NULL);
+ }
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: EVT_CONFIG_TGT called:"
+ " for tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+
+ } else {
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: EVT_CONFIG_TGT dip != NULL:"
+ " for tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+ }
+ break;
+ case DRSAS_EVT_UNCONFIG_TGT:
+ if (dip) {
+ if (i_ddi_devi_attached(dip)) {
+
+ pdip = ddi_get_parent(dip);
+
+ devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP);
+ (void) ddi_deviname(dip, devname);
+
+ (void) devfs_clean(pdip, devname + 1,
+ DV_CLEAN_FORCE);
+ kmem_free(devname, MAXNAMELEN + 1);
+ }
+ (void) ndi_devi_offline(dip, NDI_DEVI_REMOVE);
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: EVT_UNCONFIG_TGT called:"
+ " for tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+ } else {
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: EVT_UNCONFIG_TGT dip == NULL:"
+ " for tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+ }
+ break;
+ }
+ kmem_free(mrevt, sizeof (struct drsas_eventinfo));
+ ndi_devi_exit(instance->dip, circ1);
+}
+
+static int
+drsas_mode_sense_build(struct scsi_pkt *pkt)
+{
+ union scsi_cdb *cdbp;
+ uint16_t page_code;
+ struct scsa_cmd *acmd;
+ struct buf *bp;
+ struct mode_header *modehdrp;
+
+ cdbp = (void *)pkt->pkt_cdbp;
+ page_code = cdbp->cdb_un.sg.scsi[0];
+ acmd = PKT2CMD(pkt);
+ bp = acmd->cmd_buf;
+	if (!(bp && bp->b_un.b_addr && bp->b_bcount && acmd->cmd_dmacount)) {
+		con_log(CL_ANN1, (CE_WARN, "Failing MODESENSE Command"));
+		/* ADD pkt statistics as Command failed. */
+		return (0);
+	}
+
+ bp_mapin(bp);
+ bzero(bp->b_un.b_addr, bp->b_bcount);
+
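+	/*
+	 * Layout built below: a struct mode_header, followed by a single
+	 * block descriptor (MODE_BLK_DESC_LENGTH bytes), followed by the
+	 * requested mode page; page3p/page4p are positioned past the
+	 * header and block descriptor accordingly.
+	 */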
+ switch (page_code) {
+ case 0x3: {
+ struct mode_format *page3p = NULL;
+ modehdrp = (struct mode_header *)(bp->b_un.b_addr);
+ modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH;
+
+ page3p = (void *)((caddr_t)modehdrp +
+ MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH);
+ page3p->mode_page.code = 0x3;
+ page3p->mode_page.length =
+ (uchar_t)(sizeof (struct mode_format));
+ page3p->data_bytes_sect = 512;
+ page3p->sect_track = 63;
+ break;
+ }
+ case 0x4: {
+ struct mode_geometry *page4p = NULL;
+ modehdrp = (struct mode_header *)(bp->b_un.b_addr);
+ modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH;
+
+ page4p = (void *)((caddr_t)modehdrp +
+ MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH);
+ page4p->mode_page.code = 0x4;
+ page4p->mode_page.length =
+ (uchar_t)(sizeof (struct mode_geometry));
+ page4p->heads = 255;
+ page4p->rpm = 10000;
+ break;
+ }
+ default:
+ break;
+ }
+	return (0);
+}
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf
new file mode 100644
index 0000000000..3792f43ca4
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf
@@ -0,0 +1,15 @@
+#
+# Copyright (c) 2008-2009, LSI Logic Corporation.
+# All rights reserved.
+#
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# dr_sas.conf for sol 10 (and later) for all supported architectures
+#
+# global definitions
+
# MSI-specific flag. Uncomment the line below and set the flag to "yes"
# to enable MSI.
+#drsas-enable-msi="yes";
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h
new file mode 100644
index 0000000000..8f78658edf
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.h
@@ -0,0 +1,1766 @@
+/*
+ * dr_sas.h: header for dr_sas
+ *
+ * Solaris MegaRAID driver for SAS2.0 controllers
+ * Copyright (c) 2008-2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DR_SAS_H_
+#define _DR_SAS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/scsi/scsi.h>
+#include "dr_sas_list.h"
+
+/*
+ * MegaRAID SAS2.0 Driver meta data
+ */
+#define DRSAS_VERSION "LSIv2.0"
+#define DRSAS_RELDATE "Jan 9, 2009"
+
+#define DRSAS_TRUE 1
+#define DRSAS_FALSE 0
+
+/*
+ * MegaRAID SAS2.0 device id conversion definitions.
+ */
+#define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT)
+
+/*
+ * MegaRAID SAS2.0 supported controllers
+ */
+#define PCI_DEVICE_ID_LSI_2108VDE 0x0078
+#define PCI_DEVICE_ID_LSI_2108V 0x0079
+
+/*
+ * Register Index for 2108 Controllers.
+ */
+#define REGISTER_SET_IO_2108 (2)
+
+#define DRSAS_MAX_SGE_CNT 0x50
+
+#define DRSAS_IOCTL_DRIVER 0x12341234
+#define DRSAS_IOCTL_FIRMWARE 0x12345678
+#define DRSAS_IOCTL_AEN 0x87654321
+
+#define DRSAS_1_SECOND 1000000
+
+/* Dynamic Enumeration Flags */
+#define DRSAS_PD_LUN 1
+#define DRSAS_LD_LUN 0
+#define DRSAS_PD_TGT_MAX 255
+#define DRSAS_GET_PD_MAX(s) ((s)->dr_pd_max)
+#define WWN_STRLEN 17
+
+/*
+ * =====================================
+ * MegaRAID SAS2.0 MFI firmware definitions
+ * =====================================
+ */
+/*
+ * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for
+ * the protocol between the software and the firmware. Commands are issued
+ * using "message frames".
+ */
+
+/*
+ * FW posts its state in upper 4 bits of outbound_msg_0 register
+ */
+#define MFI_STATE_SHIFT 28
+#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT)
+#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT)
+#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT)
+#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT)
+#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT)
+#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT)
+#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT)
+#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT)
+#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT)
+#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT)
+#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT)
+#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT)
+
+#define MRMFI_FRAME_SIZE 64
+
+/*
+ * During FW init, clear pending cmds & reset state using inbound_msg_0
+ *
+ * ABORT : Abort all pending cmds
+ * READY : Move from OPERATIONAL to READY state; discard queue info
+ * MFIMODE : Discard (possible) low MFA posted in 64-bit mode (??)
+ * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver
+ */
+#define MFI_INIT_ABORT 0x00000001
+#define MFI_INIT_READY 0x00000002
+#define MFI_INIT_MFIMODE 0x00000004
+#define MFI_INIT_CLEAR_HANDSHAKE 0x00000008
+#define MFI_INIT_HOTPLUG 0x00000010
+#define MFI_STOP_ADP 0x00000020
+#define	MFI_RESET_FLAGS	(MFI_INIT_READY|MFI_INIT_MFIMODE|MFI_INIT_ABORT)
+
+/*
+ * MFI frame flags
+ */
+#define MFI_FRAME_POST_IN_REPLY_QUEUE 0x0000
+#define MFI_FRAME_DONT_POST_IN_REPLY_QUEUE 0x0001
+#define MFI_FRAME_SGL32 0x0000
+#define MFI_FRAME_SGL64 0x0002
+#define MFI_FRAME_SENSE32 0x0000
+#define MFI_FRAME_SENSE64 0x0004
+#define MFI_FRAME_DIR_NONE 0x0000
+#define MFI_FRAME_DIR_WRITE 0x0008
+#define MFI_FRAME_DIR_READ 0x0010
+#define MFI_FRAME_DIR_BOTH 0x0018
+
+/*
+ * Definition for cmd_status
+ */
+#define MFI_CMD_STATUS_POLL_MODE 0xFF
+#define MFI_CMD_STATUS_SYNC_MODE 0xFF
+
+/*
+ * MFI command opcodes
+ */
+#define MFI_CMD_OP_INIT 0x00
+#define MFI_CMD_OP_LD_READ 0x01
+#define MFI_CMD_OP_LD_WRITE 0x02
+#define MFI_CMD_OP_LD_SCSI 0x03
+#define MFI_CMD_OP_PD_SCSI 0x04
+#define MFI_CMD_OP_DCMD 0x05
+#define MFI_CMD_OP_ABORT 0x06
+#define MFI_CMD_OP_SMP 0x07
+#define MFI_CMD_OP_STP 0x08
+
+#define DR_DCMD_CTRL_GET_INFO 0x01010000
+
+#define DR_DCMD_CTRL_CACHE_FLUSH 0x01101000
+#define DR_FLUSH_CTRL_CACHE 0x01
+#define DR_FLUSH_DISK_CACHE 0x02
+
+#define DR_DCMD_CTRL_SHUTDOWN 0x01050000
+#define DRSAS_ENABLE_DRIVE_SPINDOWN 0x01
+
+#define DR_DCMD_CTRL_EVENT_GET_INFO 0x01040100
+#define DR_DCMD_CTRL_EVENT_GET 0x01040300
+#define DR_DCMD_CTRL_EVENT_WAIT 0x01040500
+#define DR_DCMD_LD_GET_PROPERTIES 0x03030000
+#define DR_DCMD_PD_GET_INFO 0x02020000
+
+/*
+ * Solaris Specific MAX values
+ */
+#define MAX_SGL 24
+/*
+ * MFI command completion codes
+ */
+enum MFI_STAT {
+ MFI_STAT_OK = 0x00,
+ MFI_STAT_INVALID_CMD = 0x01,
+ MFI_STAT_INVALID_DCMD = 0x02,
+ MFI_STAT_INVALID_PARAMETER = 0x03,
+ MFI_STAT_INVALID_SEQUENCE_NUMBER = 0x04,
+ MFI_STAT_ABORT_NOT_POSSIBLE = 0x05,
+ MFI_STAT_APP_HOST_CODE_NOT_FOUND = 0x06,
+ MFI_STAT_APP_IN_USE = 0x07,
+ MFI_STAT_APP_NOT_INITIALIZED = 0x08,
+ MFI_STAT_ARRAY_INDEX_INVALID = 0x09,
+ MFI_STAT_ARRAY_ROW_NOT_EMPTY = 0x0a,
+ MFI_STAT_CONFIG_RESOURCE_CONFLICT = 0x0b,
+ MFI_STAT_DEVICE_NOT_FOUND = 0x0c,
+ MFI_STAT_DRIVE_TOO_SMALL = 0x0d,
+ MFI_STAT_FLASH_ALLOC_FAIL = 0x0e,
+ MFI_STAT_FLASH_BUSY = 0x0f,
+ MFI_STAT_FLASH_ERROR = 0x10,
+ MFI_STAT_FLASH_IMAGE_BAD = 0x11,
+ MFI_STAT_FLASH_IMAGE_INCOMPLETE = 0x12,
+ MFI_STAT_FLASH_NOT_OPEN = 0x13,
+ MFI_STAT_FLASH_NOT_STARTED = 0x14,
+ MFI_STAT_FLUSH_FAILED = 0x15,
+ MFI_STAT_HOST_CODE_NOT_FOUNT = 0x16,
+ MFI_STAT_LD_CC_IN_PROGRESS = 0x17,
+ MFI_STAT_LD_INIT_IN_PROGRESS = 0x18,
+ MFI_STAT_LD_LBA_OUT_OF_RANGE = 0x19,
+ MFI_STAT_LD_MAX_CONFIGURED = 0x1a,
+ MFI_STAT_LD_NOT_OPTIMAL = 0x1b,
+ MFI_STAT_LD_RBLD_IN_PROGRESS = 0x1c,
+ MFI_STAT_LD_RECON_IN_PROGRESS = 0x1d,
+ MFI_STAT_LD_WRONG_RAID_LEVEL = 0x1e,
+ MFI_STAT_MAX_SPARES_EXCEEDED = 0x1f,
+ MFI_STAT_MEMORY_NOT_AVAILABLE = 0x20,
+ MFI_STAT_MFC_HW_ERROR = 0x21,
+ MFI_STAT_NO_HW_PRESENT = 0x22,
+ MFI_STAT_NOT_FOUND = 0x23,
+ MFI_STAT_NOT_IN_ENCL = 0x24,
+ MFI_STAT_PD_CLEAR_IN_PROGRESS = 0x25,
+ MFI_STAT_PD_TYPE_WRONG = 0x26,
+ MFI_STAT_PR_DISABLED = 0x27,
+ MFI_STAT_ROW_INDEX_INVALID = 0x28,
+ MFI_STAT_SAS_CONFIG_INVALID_ACTION = 0x29,
+ MFI_STAT_SAS_CONFIG_INVALID_DATA = 0x2a,
+ MFI_STAT_SAS_CONFIG_INVALID_PAGE = 0x2b,
+ MFI_STAT_SAS_CONFIG_INVALID_TYPE = 0x2c,
+ MFI_STAT_SCSI_DONE_WITH_ERROR = 0x2d,
+ MFI_STAT_SCSI_IO_FAILED = 0x2e,
+ MFI_STAT_SCSI_RESERVATION_CONFLICT = 0x2f,
+ MFI_STAT_SHUTDOWN_FAILED = 0x30,
+ MFI_STAT_TIME_NOT_SET = 0x31,
+ MFI_STAT_WRONG_STATE = 0x32,
+ MFI_STAT_LD_OFFLINE = 0x33,
+ /* UNUSED: 0x34 to 0xfe */
+ MFI_STAT_INVALID_STATUS = 0xFF
+};
+
+enum DR_EVT_CLASS {
+ DR_EVT_CLASS_DEBUG = -2,
+ DR_EVT_CLASS_PROGRESS = -1,
+ DR_EVT_CLASS_INFO = 0,
+ DR_EVT_CLASS_WARNING = 1,
+ DR_EVT_CLASS_CRITICAL = 2,
+ DR_EVT_CLASS_FATAL = 3,
+ DR_EVT_CLASS_DEAD = 4
+};
+
+enum DR_EVT_LOCALE {
+ DR_EVT_LOCALE_LD = 0x0001,
+ DR_EVT_LOCALE_PD = 0x0002,
+ DR_EVT_LOCALE_ENCL = 0x0004,
+ DR_EVT_LOCALE_BBU = 0x0008,
+ DR_EVT_LOCALE_SAS = 0x0010,
+ DR_EVT_LOCALE_CTRL = 0x0020,
+ DR_EVT_LOCALE_CONFIG = 0x0040,
+ DR_EVT_LOCALE_CLUSTER = 0x0080,
+ DR_EVT_LOCALE_ALL = 0xffff
+};
+
+#define DR_EVT_CFG_CLEARED 0x0004
+#define DR_EVT_LD_CREATED 0x008a
+#define DR_EVT_LD_DELETED 0x008b
+#define DR_EVT_PD_REMOVED_EXT 0x00f8
+#define DR_EVT_PD_INSERTED_EXT 0x00f7
+
+enum LD_STATE {
+ LD_OFFLINE = 0,
+ LD_PARTIALLY_DEGRADED = 1,
+ LD_DEGRADED = 2,
+ LD_OPTIMAL = 3,
+ LD_INVALID = 0xFF
+};
+
+enum DRSAS_EVT {
+ DRSAS_EVT_CONFIG_TGT = 0,
+ DRSAS_EVT_UNCONFIG_TGT = 1,
+ DRSAS_EVT_UNCONFIG_SMP = 2
+};
+
+#define DMA_OBJ_ALLOCATED 1
+#define DMA_OBJ_REALLOCATED 2
+#define DMA_OBJ_FREED 3
+
+/*
+ * dma_obj_t - Our DMA object
+ * @param buffer : kernel virtual address
+ * @param size : size of the data to be allocated
+ * @param acc_handle : access handle
+ * @param dma_handle : dma handle
+ * @param dma_cookie : scatter-gather list
+ * @param dma_attr : dma attributes for this buffer
+ *
+ * The caller must initialize the size and dma_attr fields before
+ * allocating the resources.
+ */
+typedef struct {
+ caddr_t buffer;
+ uint32_t size;
+ ddi_acc_handle_t acc_handle;
+ ddi_dma_handle_t dma_handle;
+ ddi_dma_cookie_t dma_cookie[DRSAS_MAX_SGE_CNT];
+ ddi_dma_attr_t dma_attr;
+ uint8_t status;
+ uint8_t reserved[3];
+} dma_obj_t;
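+
+/*
+ * Illustrative sketch: a caller zeroes the object, then fills in size
+ * and dma_attr before handing it to drsas_alloc_dma_obj(); the attribute
+ * template and endian flag shown here are placeholders:
+ *
+ *	dma_obj_t obj;
+ *
+ *	bzero(&obj, sizeof (obj));
+ *	obj.size = DEFAULT_MFI_MEM_SZ;
+ *	obj.dma_attr = drsas_generic_dma_attr;
+ *	if (drsas_alloc_dma_obj(instance, &obj, (uchar_t)endian_flag)
+ *	    != DDI_SUCCESS)
+ *		return (DDI_FAILURE);
+ */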
+
+struct drsas_eventinfo {
+ struct drsas_instance *instance;
+ int tgt;
+ int lun;
+ int event;
+};
+
+struct drsas_ld {
+ dev_info_t *dip;
+ uint8_t lun_type;
+ uint8_t reserved[3];
+};
+
+struct drsas_pd {
+ dev_info_t *dip;
+ uint8_t lun_type;
+ uint8_t dev_id;
+ uint8_t flags;
+ uint8_t reserved;
+};
+
+struct drsas_pd_info {
+ uint16_t deviceId;
+ uint16_t seqNum;
+ uint8_t inquiryData[96];
+ uint8_t vpdPage83[64];
+ uint8_t notSupported;
+ uint8_t scsiDevType;
+ uint8_t a;
+ uint8_t device_speed;
+ uint32_t mediaerrcnt;
+ uint32_t other;
+ uint32_t pred;
+ uint32_t lastpred;
+ uint16_t fwState;
+ uint8_t disabled;
+ uint8_t linkspwwd;
+ uint32_t ddfType;
+ struct {
+ uint8_t count;
+ uint8_t isPathBroken;
+ uint8_t connectorIndex[2];
+ uint8_t reserved[4];
+ uint64_t sasAddr[2];
+ uint8_t reserved2[16];
+ } pathInfo;
+};
+
+typedef struct drsas_instance {
+ uint32_t *producer;
+ uint32_t *consumer;
+
+ uint32_t *reply_queue;
+ dma_obj_t mfi_internal_dma_obj;
+
+ uint8_t init_id;
+ uint8_t reserved[3];
+
+ uint16_t max_num_sge;
+ uint16_t max_fw_cmds;
+ uint32_t max_sectors_per_req;
+
+ struct drsas_cmd **cmd_list;
+
+ mlist_t cmd_pool_list;
+ kmutex_t cmd_pool_mtx;
+
+ mlist_t cmd_pend_list;
+ kmutex_t cmd_pend_mtx;
+
+ dma_obj_t mfi_evt_detail_obj;
+ struct drsas_cmd *aen_cmd;
+
+ uint32_t aen_seq_num;
+ uint32_t aen_class_locale_word;
+
+ scsi_hba_tran_t *tran;
+
+ kcondvar_t int_cmd_cv;
+ kmutex_t int_cmd_mtx;
+
+ kcondvar_t aen_cmd_cv;
+ kmutex_t aen_cmd_mtx;
+
+ kcondvar_t abort_cmd_cv;
+ kmutex_t abort_cmd_mtx;
+
+ dev_info_t *dip;
+ ddi_acc_handle_t pci_handle;
+
+ timeout_id_t timeout_id;
+ uint32_t unique_id;
+ uint16_t fw_outstanding;
+ caddr_t regmap;
+ ddi_acc_handle_t regmap_handle;
+ uint8_t isr_level;
+ ddi_iblock_cookie_t iblock_cookie;
+ ddi_iblock_cookie_t soft_iblock_cookie;
+ ddi_softintr_t soft_intr_id;
+ uint8_t softint_running;
+ kmutex_t completed_pool_mtx;
+ mlist_t completed_pool_list;
+
+ caddr_t internal_buf;
+ uint32_t internal_buf_dmac_add;
+ uint32_t internal_buf_size;
+
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint16_t subsysvid;
+ uint16_t subsysid;
+ int instance;
+ int baseaddress;
+ char iocnode[16];
+
+ int fm_capabilities;
+
+ struct drsas_func_ptr *func_ptr;
+ /* MSI interrupts specific */
+ ddi_intr_handle_t *intr_htable;
+ int intr_type;
+ int intr_cnt;
+ size_t intr_size;
+ uint_t intr_pri;
+ int intr_cap;
+
+ ddi_taskq_t *taskq;
+ struct drsas_ld *dr_ld_list;
+} drsas_t;
+
+struct drsas_func_ptr {
+ int (*read_fw_status_reg)(struct drsas_instance *);
+ void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *);
+ int (*issue_cmd_in_sync_mode)(struct drsas_instance *,
+ struct drsas_cmd *);
+ int (*issue_cmd_in_poll_mode)(struct drsas_instance *,
+ struct drsas_cmd *);
+ void (*enable_intr)(struct drsas_instance *);
+ void (*disable_intr)(struct drsas_instance *);
+ int (*intr_ack)(struct drsas_instance *);
+};
+
+/*
+ * ### Helper routines ###
+ */
+
+/*
+ * con_log() - console log routine
+ * @param level : indicates the severity of the message.
+ * @param fmt : format string
+ *
+ * con_log displays error messages on the console based on the current
+ * debug level. It also attaches the appropriate kernel severity level
+ * to the message.
+ *
+ * Console message debug levels:
+ */
+#define CL_NONE 0 /* No debug information */
+#define CL_ANN 1 /* print unconditionally, announcements */
+#define CL_ANN1 2 /* No o/p */
+#define CL_DLEVEL1 3 /* debug level 1, informative */
+#define CL_DLEVEL2 4 /* debug level 2, verbose */
+#define CL_DLEVEL3 5 /* debug level 3, very verbose */
+
+#ifdef __SUNPRO_C
+#define __func__ ""
+#endif
+
+#define con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; }
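+
+/*
+ * Illustrative usage: the extra parentheses around the cmn_err(9F)
+ * argument list are required because the macro pastes them in verbatim:
+ *
+ *	con_log(CL_ANN, (CE_NOTE, "dr_sas: instance %d attached", inst));
+ */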
+
+/*
+ * ### SCSA definitions ###
+ */
+#define PKT2TGT(pkt) ((pkt)->pkt_address.a_target)
+#define PKT2LUN(pkt) ((pkt)->pkt_address.a_lun)
+#define	PKT2TRAN(pkt)	((pkt)->pkt_address.a_hba_tran)
+#define ADDR2TRAN(ap) ((ap)->a_hba_tran)
+
+#define	TRAN2MR(tran)	((struct drsas_instance *)(tran)->tran_hba_private)
+#define	ADDR2MR(ap)	(TRAN2MR(ADDR2TRAN(ap)))
+
+#define PKT2CMD(pkt) ((struct scsa_cmd *)(pkt)->pkt_ha_private)
+#define CMD2PKT(sp) ((sp)->cmd_pkt)
+#define PKT2REQ(pkt) (&(PKT2CMD(pkt)->request))
+
+#define CMD2ADDR(cmd) (&CMD2PKT(cmd)->pkt_address)
+#define CMD2TRAN(cmd) (CMD2PKT(cmd)->pkt_address.a_hba_tran)
+#define CMD2MR(cmd) (TRAN2MR(CMD2TRAN(cmd)))
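+
+/*
+ * Illustrative usage: a tran entry point recovers the per-instance soft
+ * state from its scsi_address argument, e.g.
+ *
+ *	struct drsas_instance *instance = ADDR2MR(ap);
+ */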
+
+#define CFLAG_DMAVALID 0x0001 /* requires a dma operation */
+#define CFLAG_DMASEND 0x0002 /* Transfer from the device */
+#define CFLAG_CONSISTENT 0x0040 /* consistent data transfer */
+
+/*
+ * ### Data structures for ioctl interface and internal commands ###
+ */
+
+/*
+ * Data direction flags
+ */
+#define UIOC_RD 0x00001
+#define UIOC_WR 0x00002
+
+#define SCP2HOST(scp) (scp)->device->host /* to host */
+#define SCP2HOSTDATA(scp) SCP2HOST(scp)->hostdata /* to soft state */
+#define SCP2CHANNEL(scp) (scp)->device->channel /* to channel */
+#define SCP2TARGET(scp) (scp)->device->id /* to target */
+#define SCP2LUN(scp) (scp)->device->lun /* to LUN */
+
+#define SCSIHOST2ADAP(host) (((caddr_t *)(host->hostdata))[0])
+#define SCP2ADAPTER(scp) \
+ (struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp))
+
+#define	MRDRV_IS_LOGICAL_SCSA(instance, acmd)	\
+	((acmd->device_id < MRDRV_MAX_LD) ? 1 : 0)
+#define	MRDRV_IS_LOGICAL(ap)	\
+	(((ap->a_target < MRDRV_MAX_LD) && (ap->a_lun == 0)) ? 1 : 0)
+#define MAP_DEVICE_ID(instance, ap) \
+ (ap->a_target)
+
+#define HIGH_LEVEL_INTR 1
+#define NORMAL_LEVEL_INTR 0
+
+/*
+ * scsa_cmd - Per-command mr private data
+ * @param cmd_dmahandle : dma handle
+ * @param cmd_dmacookies : current dma cookies
+ * @param cmd_pkt : scsi_pkt reference
+ * @param cmd_dmacount : dma count
+ * @param cmd_cookie : next cookie
+ * @param cmd_ncookies : cookies per window
+ * @param cmd_cookiecnt : cookies per sub-win
+ * @param cmd_nwin : number of dma windows
+ * @param cmd_curwin : current dma window
+ * @param cmd_dma_offset : current window offset
+ * @param cmd_dma_len : current window length
+ * @param cmd_flags : private flags
+ * @param cmd_cdblen : length of cdb
+ * @param cmd_scblen : length of scb
+ * @param cmd_buf : command buffer
+ * @param channel : channel for scsi sub-system
+ * @param target : target for scsi sub-system
+ * @param lun : LUN for scsi sub-system
+ *
+ * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E)
+ * - Pointed to by pkt_ha_private field in scsi_pkt
+ */
+struct scsa_cmd {
+ ddi_dma_handle_t cmd_dmahandle;
+ ddi_dma_cookie_t cmd_dmacookies[DRSAS_MAX_SGE_CNT];
+ struct scsi_pkt *cmd_pkt;
+ ulong_t cmd_dmacount;
+ uint_t cmd_cookie;
+ uint_t cmd_ncookies;
+ uint_t cmd_cookiecnt;
+ uint_t cmd_nwin;
+ uint_t cmd_curwin;
+ off_t cmd_dma_offset;
+ ulong_t cmd_dma_len;
+ ulong_t cmd_flags;
+ uint_t cmd_cdblen;
+ uint_t cmd_scblen;
+ struct buf *cmd_buf;
+ ushort_t device_id;
+ uchar_t islogical;
+ uchar_t lun;
+ struct drsas_device *drsas_dev;
+};
+
+
+struct drsas_cmd {
+ union drsas_frame *frame;
+ uint32_t frame_phys_addr;
+ uint8_t *sense;
+ uint32_t sense_phys_addr;
+ dma_obj_t frame_dma_obj;
+ uint8_t frame_dma_obj_status;
+
+ uint32_t index;
+ uint8_t sync_cmd;
+ uint8_t cmd_status;
+ uint16_t abort_aen;
+ mlist_t list;
+ uint32_t frame_count;
+ struct scsa_cmd *cmd;
+ struct scsi_pkt *pkt;
+};
+
+#define MAX_MGMT_ADAPTERS 1024
+#define IOC_SIGNATURE "MR-SAS"
+
+#define IOC_CMD_FIRMWARE 0x0
+#define DRSAS_DRIVER_IOCTL_COMMON 0xF0010000
+#define DRSAS_DRIVER_IOCTL_DRIVER_VERSION 0xF0010100
+#define DRSAS_DRIVER_IOCTL_PCI_INFORMATION 0xF0010200
+#define DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS 0xF0010300
+
+
+#define DRSAS_MAX_SENSE_LENGTH 32
+
+struct drsas_mgmt_info {
+
+ uint16_t count;
+ struct drsas_instance *instance[MAX_MGMT_ADAPTERS];
+ uint16_t map[MAX_MGMT_ADAPTERS];
+ int max_index;
+};
+
+#pragma pack(1)
+
+/*
+ * SAS controller properties
+ */
+struct drsas_ctrl_prop {
+ uint16_t seq_num;
+ uint16_t pred_fail_poll_interval;
+ uint16_t intr_throttle_count;
+ uint16_t intr_throttle_timeouts;
+
+ uint8_t rebuild_rate;
+ uint8_t patrol_read_rate;
+ uint8_t bgi_rate;
+ uint8_t cc_rate;
+ uint8_t recon_rate;
+
+ uint8_t cache_flush_interval;
+
+ uint8_t spinup_drv_count;
+ uint8_t spinup_delay;
+
+ uint8_t cluster_enable;
+ uint8_t coercion_mode;
+ uint8_t disk_write_cache_disable;
+ uint8_t alarm_enable;
+
+ uint8_t reserved[44];
+};
+
+/*
+ * SAS controller information
+ */
+struct drsas_ctrl_info {
+ /* PCI device information */
+ struct {
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint16_t sub_vendor_id;
+ uint16_t sub_device_id;
+ uint8_t reserved[24];
+ } pci;
+
+ /* Host interface information */
+ struct {
+ uint8_t PCIX : 1;
+ uint8_t PCIE : 1;
+ uint8_t iSCSI : 1;
+ uint8_t SAS_3G : 1;
+ uint8_t reserved_0 : 4;
+ uint8_t reserved_1[6];
+ uint8_t port_count;
+ uint64_t port_addr[8];
+ } host_interface;
+
+ /* Device (backend) interface information */
+ struct {
+ uint8_t SPI : 1;
+ uint8_t SAS_3G : 1;
+ uint8_t SATA_1_5G : 1;
+ uint8_t SATA_3G : 1;
+ uint8_t reserved_0 : 4;
+ uint8_t reserved_1[6];
+ uint8_t port_count;
+ uint64_t port_addr[8];
+ } device_interface;
+
+	/* List of components in flash. All strings are null-terminated */
+ uint32_t image_check_word;
+ uint32_t image_component_count;
+
+ struct {
+ char name[8];
+ char version[32];
+ char build_date[16];
+ char built_time[16];
+ } image_component[8];
+
+ /*
+ * List of flash components that have been flashed on the card, but
+ * are not in use, pending reset of the adapter. This list will be
+	 * empty if a flash operation has not occurred. All strings are null
+ * terminated
+ */
+ uint32_t pending_image_component_count;
+
+ struct {
+ char name[8];
+ char version[32];
+ char build_date[16];
+ char build_time[16];
+ } pending_image_component[8];
+
+ uint8_t max_arms;
+ uint8_t max_spans;
+ uint8_t max_arrays;
+ uint8_t max_lds;
+
+ char product_name[80];
+ char serial_no[32];
+
+ /*
+ * Other physical/controller/operation information. Indicates the
+ * presence of the hardware
+ */
+ struct {
+ uint32_t bbu : 1;
+ uint32_t alarm : 1;
+ uint32_t nvram : 1;
+ uint32_t uart : 1;
+ uint32_t reserved : 28;
+ } hw_present;
+
+ uint32_t current_fw_time;
+
+ /* Maximum data transfer sizes */
+ uint16_t max_concurrent_cmds;
+ uint16_t max_sge_count;
+ uint32_t max_request_size;
+
+ /* Logical and physical device counts */
+ uint16_t ld_present_count;
+ uint16_t ld_degraded_count;
+ uint16_t ld_offline_count;
+
+ uint16_t pd_present_count;
+ uint16_t pd_disk_present_count;
+ uint16_t pd_disk_pred_failure_count;
+ uint16_t pd_disk_failed_count;
+
+ /* Memory size information */
+ uint16_t nvram_size;
+ uint16_t memory_size;
+ uint16_t flash_size;
+
+ /* Error counters */
+ uint16_t mem_correctable_error_count;
+ uint16_t mem_uncorrectable_error_count;
+
+ /* Cluster information */
+ uint8_t cluster_permitted;
+ uint8_t cluster_active;
+ uint8_t reserved_1[2];
+
+ /* Controller capabilities structures */
+ struct {
+ uint32_t raid_level_0 : 1;
+ uint32_t raid_level_1 : 1;
+ uint32_t raid_level_5 : 1;
+ uint32_t raid_level_1E : 1;
+ uint32_t reserved : 28;
+ } raid_levels;
+
+ struct {
+ uint32_t rbld_rate : 1;
+ uint32_t cc_rate : 1;
+ uint32_t bgi_rate : 1;
+ uint32_t recon_rate : 1;
+ uint32_t patrol_rate : 1;
+ uint32_t alarm_control : 1;
+ uint32_t cluster_supported : 1;
+ uint32_t bbu : 1;
+ uint32_t spanning_allowed : 1;
+ uint32_t dedicated_hotspares : 1;
+ uint32_t revertible_hotspares : 1;
+ uint32_t foreign_config_import : 1;
+ uint32_t self_diagnostic : 1;
+ uint32_t reserved : 19;
+ } adapter_operations;
+
+ struct {
+ uint32_t read_policy : 1;
+ uint32_t write_policy : 1;
+ uint32_t io_policy : 1;
+ uint32_t access_policy : 1;
+ uint32_t reserved : 28;
+ } ld_operations;
+
+ struct {
+ uint8_t min;
+ uint8_t max;
+ uint8_t reserved[2];
+ } stripe_size_operations;
+
+ struct {
+ uint32_t force_online : 1;
+ uint32_t force_offline : 1;
+ uint32_t force_rebuild : 1;
+ uint32_t reserved : 29;
+ } pd_operations;
+
+ struct {
+ uint32_t ctrl_supports_sas : 1;
+ uint32_t ctrl_supports_sata : 1;
+ uint32_t allow_mix_in_encl : 1;
+ uint32_t allow_mix_in_ld : 1;
+ uint32_t allow_sata_in_cluster : 1;
+ uint32_t reserved : 27;
+ } pd_mix_support;
+
+ /* Include the controller properties (changeable items) */
+ uint8_t reserved_2[12];
+ struct drsas_ctrl_prop properties;
+
+ uint8_t pad[0x800 - 0x640];
+};
+
+/*
+ * ==================================
+ * MegaRAID SAS2.0 driver definitions
+ * ==================================
+ */
+#define MRDRV_MAX_NUM_CMD 1024
+
+#define MRDRV_MAX_PD_CHANNELS 2
+#define MRDRV_MAX_LD_CHANNELS 2
+#define MRDRV_MAX_CHANNELS (MRDRV_MAX_PD_CHANNELS + \
+ MRDRV_MAX_LD_CHANNELS)
+#define MRDRV_MAX_DEV_PER_CHANNEL 128
+#define MRDRV_DEFAULT_INIT_ID -1
+#define MRDRV_MAX_CMD_PER_LUN 1000
+#define MRDRV_MAX_LUN 1
+#define MRDRV_MAX_LD 64
+
+#define MRDRV_RESET_WAIT_TIME 300
+#define MRDRV_RESET_NOTICE_INTERVAL 5
+
+#define DRSAS_IOCTL_CMD 0
+
+/*
+ * FW can accept both 32 and 64 bit SGLs. We want to allocate 32/64 bit
+ * SGLs based on the size of dma_addr_t
+ */
+#define IS_DMA64 (sizeof (dma_addr_t) == 8)
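+
+/*
+ * Illustrative sketch: frame construction can pick the SGL flavor off
+ * this test, e.g.
+ *
+ *	hdr->flags |= IS_DMA64 ? MFI_FRAME_SGL64 : MFI_FRAME_SGL32;
+ */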
+
+#define IB_MSG_0_OFF 0x10 /* XScale */
+#define OB_MSG_0_OFF 0x18 /* XScale */
+#define IB_DOORBELL_OFF 0x20 /* XScale & ROC */
+#define OB_INTR_STATUS_OFF 0x30 /* XScale & ROC */
+#define OB_INTR_MASK_OFF 0x34 /* XScale & ROC */
+#define IB_QPORT_OFF 0x40 /* XScale & ROC */
+#define OB_DOORBELL_CLEAR_OFF 0xA0 /* ROC */
+#define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */
+#define OB_INTR_MASK 0xFFFFFFFF
+#define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF
+
+/*
+ * All MFI register set macros accept drsas_register_set*
+ */
+#define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v))
+
+#define RD_OB_MSG_0(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF))
+
+#define WR_IB_DOORBELL(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v))
+
+#define RD_IB_DOORBELL(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF))
+
+#define WR_OB_INTR_STATUS(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v))
+
+#define RD_OB_INTR_STATUS(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF))
+
+#define WR_OB_INTR_MASK(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v))
+
+#define RD_OB_INTR_MASK(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF))
+
+#define WR_IB_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v))
+
+#define WR_OB_DOORBELL_CLEAR(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \
+ (v))
+
+#define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF))
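+
+/*
+ * Illustrative sketch (assuming a MFI_STATE_MASK companion to the
+ * MFI_STATE_SHIFT definitions earlier in this header): the firmware
+ * state is polled by masking the outbound scratch pad, e.g.
+ *
+ *	uint32_t fw_state = RD_OB_SCRATCH_PAD_0(instance) & MFI_STATE_MASK;
+ *
+ *	if (fw_state == MFI_STATE_FAULT)
+ *		return (ENODEV);
+ */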
+
+/*
+ * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data
+ * of Outbound Msg Reg 0 indicates the max concurrent cmds supported, the max
+ * SGEs supported per cmd, and whether 64-bit MFAs (M64) are enabled.
+ */
+#define MFI_OB_INTR_STATUS_MASK 0x00000002
+
+/*
+ * The MFI_REPLY_2108_MESSAGE_INTR flag is also used in
+ * enable_intr_ppc. Hence bit 2 (0x4) has been set in this
+ * mask along with bit 1.
+ */
+#define MFI_REPLY_2108_MESSAGE_INTR 0x00000001
+#define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005
+
+#define MFI_POLL_TIMEOUT_SECS 60
+
+#define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1)
+#define MFI_DISABLE_INTR(instance) \
+{ \
+ uint32_t disable = 1; \
+ uint32_t mask = ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\
+ mask &= ~disable; \
+ ddi_put32((instance)->regmap_handle, (uint32_t *) \
+ (uintptr_t)((instance)->regmap + OB_INTR_MASK_OFF), mask); \
+}
+
+/* By default, the firmware programs for 8 Kbytes of memory */
+#define DEFAULT_MFI_MEM_SZ 8192
+#define MINIMUM_MFI_MEM_SZ 4096
+
+/* DCMD Message Frame MAILBOX0-11 */
+#define DCMD_MBOX_SZ 12
+
+
+struct drsas_register_set {
+ uint32_t reserved_0[4];
+
+ uint32_t inbound_msg_0;
+ uint32_t inbound_msg_1;
+ uint32_t outbound_msg_0;
+ uint32_t outbound_msg_1;
+
+ uint32_t inbound_doorbell;
+ uint32_t inbound_intr_status;
+ uint32_t inbound_intr_mask;
+
+ uint32_t outbound_doorbell;
+ uint32_t outbound_intr_status;
+ uint32_t outbound_intr_mask;
+
+ uint32_t reserved_1[2];
+
+ uint32_t inbound_queue_port;
+ uint32_t outbound_queue_port;
+
+ uint32_t reserved_2[22];
+
+ uint32_t outbound_doorbell_clear;
+
+ uint32_t reserved_3[3];
+
+ uint32_t outbound_scratch_pad;
+
+ uint32_t reserved_4[3];
+
+ uint32_t inbound_low_queue_port;
+
+ uint32_t inbound_high_queue_port;
+
+ uint32_t reserved_5;
+ uint32_t index_registers[820];
+};
+
+struct drsas_sge32 {
+ uint32_t phys_addr;
+ uint32_t length;
+};
+
+struct drsas_sge64 {
+ uint64_t phys_addr;
+ uint32_t length;
+};
+
+union drsas_sgl {
+ struct drsas_sge32 sge32[1];
+ struct drsas_sge64 sge64[1];
+};
+
+struct drsas_header {
+ uint8_t cmd;
+ uint8_t sense_len;
+ uint8_t cmd_status;
+ uint8_t scsi_status;
+
+ uint8_t target_id;
+ uint8_t lun;
+ uint8_t cdb_len;
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+ uint32_t data_xferlen;
+};
+
+union drsas_sgl_frame {
+ struct drsas_sge32 sge32[8];
+ struct drsas_sge64 sge64[5];
+};
+
+struct drsas_init_frame {
+ uint8_t cmd;
+ uint8_t reserved_0;
+ uint8_t cmd_status;
+
+ uint8_t reserved_1;
+ uint32_t reserved_2;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t reserved_3;
+ uint32_t data_xfer_len;
+
+ uint32_t queue_info_new_phys_addr_lo;
+ uint32_t queue_info_new_phys_addr_hi;
+ uint32_t queue_info_old_phys_addr_lo;
+ uint32_t queue_info_old_phys_addr_hi;
+
+ uint32_t reserved_4[6];
+};
+
+struct drsas_init_queue_info {
+ uint32_t init_flags;
+ uint32_t reply_queue_entries;
+
+ uint32_t reply_queue_start_phys_addr_lo;
+ uint32_t reply_queue_start_phys_addr_hi;
+ uint32_t producer_index_phys_addr_lo;
+ uint32_t producer_index_phys_addr_hi;
+ uint32_t consumer_index_phys_addr_lo;
+ uint32_t consumer_index_phys_addr_hi;
+};
+
+struct drsas_io_frame {
+ uint8_t cmd;
+ uint8_t sense_len;
+ uint8_t cmd_status;
+ uint8_t scsi_status;
+
+ uint8_t target_id;
+ uint8_t access_byte;
+ uint8_t reserved_0;
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+ uint32_t lba_count;
+
+ uint32_t sense_buf_phys_addr_lo;
+ uint32_t sense_buf_phys_addr_hi;
+
+ uint32_t start_lba_lo;
+ uint32_t start_lba_hi;
+
+ union drsas_sgl sgl;
+};
+
+struct drsas_pthru_frame {
+ uint8_t cmd;
+ uint8_t sense_len;
+ uint8_t cmd_status;
+ uint8_t scsi_status;
+
+ uint8_t target_id;
+ uint8_t lun;
+ uint8_t cdb_len;
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+ uint32_t data_xfer_len;
+
+ uint32_t sense_buf_phys_addr_lo;
+ uint32_t sense_buf_phys_addr_hi;
+
+ uint8_t cdb[16];
+ union drsas_sgl sgl;
+};
+
+struct drsas_dcmd_frame {
+ uint8_t cmd;
+ uint8_t reserved_0;
+ uint8_t cmd_status;
+ uint8_t reserved_1[4];
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+
+ uint32_t data_xfer_len;
+ uint32_t opcode;
+
+ union {
+ uint8_t b[DCMD_MBOX_SZ];
+ uint16_t s[6];
+ uint32_t w[3];
+ } mbox;
+
+ union drsas_sgl sgl;
+};
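+
+/*
+ * Illustrative sketch: a controller-info DCMD fills the frame roughly as
+ * follows (buffer setup and error handling omitted; the physical address
+ * comes from a DMA cookie):
+ *
+ *	struct drsas_dcmd_frame *dcmd = &cmd->frame->dcmd;
+ *
+ *	dcmd->cmd = MFI_CMD_OP_DCMD;
+ *	dcmd->cmd_status = MFI_CMD_STATUS_POLL_MODE;
+ *	dcmd->flags = MFI_FRAME_DIR_READ;
+ *	dcmd->opcode = DR_DCMD_CTRL_GET_INFO;
+ *	dcmd->data_xfer_len = sizeof (struct drsas_ctrl_info);
+ *	dcmd->sge_count = 1;
+ *	dcmd->sgl.sge32[0].phys_addr = ctrl_info_cookie.dmac_address;
+ *	dcmd->sgl.sge32[0].length = sizeof (struct drsas_ctrl_info);
+ */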
+
+struct drsas_abort_frame {
+ uint8_t cmd;
+ uint8_t reserved_0;
+ uint8_t cmd_status;
+
+ uint8_t reserved_1;
+ uint32_t reserved_2;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t reserved_3;
+ uint32_t reserved_4;
+
+ uint32_t abort_context;
+ uint32_t pad_1;
+
+ uint32_t abort_mfi_phys_addr_lo;
+ uint32_t abort_mfi_phys_addr_hi;
+
+ uint32_t reserved_5[6];
+};
+
+struct drsas_smp_frame {
+ uint8_t cmd;
+ uint8_t reserved_1;
+ uint8_t cmd_status;
+ uint8_t connection_status;
+
+ uint8_t reserved_2[3];
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+
+ uint32_t data_xfer_len;
+
+ uint64_t sas_addr;
+
+ union drsas_sgl sgl[2];
+};
+
+struct drsas_stp_frame {
+ uint8_t cmd;
+ uint8_t reserved_1;
+ uint8_t cmd_status;
+ uint8_t connection_status;
+
+ uint8_t target_id;
+ uint8_t reserved_2[2];
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+
+ uint32_t data_xfer_len;
+
+ uint16_t fis[10];
+ uint32_t stp_flags;
+ union drsas_sgl sgl;
+};
+
+union drsas_frame {
+ struct drsas_header hdr;
+ struct drsas_init_frame init;
+ struct drsas_io_frame io;
+ struct drsas_pthru_frame pthru;
+ struct drsas_dcmd_frame dcmd;
+ struct drsas_abort_frame abort;
+ struct drsas_smp_frame smp;
+ struct drsas_stp_frame stp;
+
+ uint8_t raw_bytes[64];
+};
+
+typedef struct drsas_pd_address {
+ uint16_t device_id;
+ uint16_t encl_id;
+
+ union {
+ struct {
+ uint8_t encl_index;
+ uint8_t slot_number;
+ } pd_address;
+ struct {
+ uint8_t encl_position;
+ uint8_t encl_connector_index;
+ } encl_address;
+	} address;
+
+ uint8_t scsi_dev_type;
+
+ union {
+ uint8_t port_bitmap;
+ uint8_t port_numbers;
+ } connected;
+
+ uint64_t sas_addr[2];
+} drsas_pd_address_t;
+
+union drsas_evt_class_locale {
+ struct {
+ uint16_t locale;
+ uint8_t reserved;
+ int8_t class;
+ } members;
+
+ uint32_t word;
+};
+
+struct drsas_evt_log_info {
+ uint32_t newest_seq_num;
+ uint32_t oldest_seq_num;
+ uint32_t clear_seq_num;
+ uint32_t shutdown_seq_num;
+ uint32_t boot_seq_num;
+};
+
+struct drsas_progress {
+ uint16_t progress;
+ uint16_t elapsed_seconds;
+};
+
+struct drsas_evtarg_ld {
+ uint16_t target_id;
+ uint8_t ld_index;
+ uint8_t reserved;
+};
+
+struct drsas_evtarg_pd {
+ uint16_t device_id;
+ uint8_t encl_index;
+ uint8_t slot_number;
+};
+
+struct drsas_evt_detail {
+ uint32_t seq_num;
+ uint32_t time_stamp;
+ uint32_t code;
+ union drsas_evt_class_locale cl;
+ uint8_t arg_type;
+ uint8_t reserved1[15];
+
+ union {
+ struct {
+ struct drsas_evtarg_pd pd;
+ uint8_t cdb_length;
+ uint8_t sense_length;
+ uint8_t reserved[2];
+ uint8_t cdb[16];
+ uint8_t sense[64];
+ } cdbSense;
+
+ struct drsas_evtarg_ld ld;
+
+ struct {
+ struct drsas_evtarg_ld ld;
+ uint64_t count;
+ } ld_count;
+
+ struct {
+ uint64_t lba;
+ struct drsas_evtarg_ld ld;
+ } ld_lba;
+
+ struct {
+ struct drsas_evtarg_ld ld;
+ uint32_t prevOwner;
+ uint32_t newOwner;
+ } ld_owner;
+
+ struct {
+ uint64_t ld_lba;
+ uint64_t pd_lba;
+ struct drsas_evtarg_ld ld;
+ struct drsas_evtarg_pd pd;
+ } ld_lba_pd_lba;
+
+ struct {
+ struct drsas_evtarg_ld ld;
+ struct drsas_progress prog;
+ } ld_prog;
+
+ struct {
+ struct drsas_evtarg_ld ld;
+ uint32_t prev_state;
+ uint32_t new_state;
+ } ld_state;
+
+ struct {
+ uint64_t strip;
+ struct drsas_evtarg_ld ld;
+ } ld_strip;
+
+ struct drsas_evtarg_pd pd;
+
+ struct {
+ struct drsas_evtarg_pd pd;
+ uint32_t err;
+ } pd_err;
+
+ struct {
+ uint64_t lba;
+ struct drsas_evtarg_pd pd;
+ } pd_lba;
+
+ struct {
+ uint64_t lba;
+ struct drsas_evtarg_pd pd;
+ struct drsas_evtarg_ld ld;
+ } pd_lba_ld;
+
+ struct {
+ struct drsas_evtarg_pd pd;
+ struct drsas_progress prog;
+ } pd_prog;
+
+ struct {
+ struct drsas_evtarg_pd pd;
+ uint32_t prevState;
+ uint32_t newState;
+ } pd_state;
+
+ struct {
+ uint16_t vendorId;
+ uint16_t deviceId;
+ uint16_t subVendorId;
+ uint16_t subDeviceId;
+ } pci;
+
+ uint32_t rate;
+ char str[96];
+
+ struct {
+ uint32_t rtc;
+ uint32_t elapsedSeconds;
+ } time;
+
+ struct {
+ uint32_t ecar;
+ uint32_t elog;
+ char str[64];
+ } ecc;
+
+ drsas_pd_address_t pd_addr;
+
+ uint8_t b[96];
+ uint16_t s[48];
+ uint32_t w[24];
+ uint64_t d[12];
+ } args;
+
+ char description[128];
+
+};
+
+/* only 63 are usable by the application */
+#define MAX_LOGICAL_DRIVES 64
+/* only 255 physical devices may be used */
+#define MAX_PHYSICAL_DEVICES 256
+#define MAX_PD_PER_ENCLOSURE 64
+/* maximum disks per array */
+#define MAX_ROW_SIZE 32
+/* maximum spans per logical drive */
+#define MAX_SPAN_DEPTH 8
+/* maximum number of arrays a hot spare may be dedicated to */
+#define MAX_ARRAYS_DEDICATED 16
+/* maximum number of arrays which may exist */
+#define MAX_ARRAYS 128
+/* maximum number of foreign configs that may be managed at once */
+#define MAX_FOREIGN_CONFIGS 8
+/* maximum spares (global and dedicated combined) */
+#define MAX_SPARES_FOR_THE_CONTROLLER MAX_PHYSICAL_DEVICES
+/* maximum possible Target IDs (i.e. 0 to 63) */
+#define MAX_TARGET_ID 63
+/* maximum number of supported enclosures */
+#define MAX_ENCLOSURES 32
+/* maximum number of PHYs per controller */
+#define MAX_PHYS_PER_CONTROLLER 16
+/* maximum number of LDs per array (due to DDF limitations) */
+#define MAX_LDS_PER_ARRAY 16
+
+/*
+ * -----------------------------------------------------------------------------
+ * -----------------------------------------------------------------------------
+ *
+ * Logical Drive commands
+ *
+ * -----------------------------------------------------------------------------
+ * -----------------------------------------------------------------------------
+ */
+#define	DR_DCMD_LD	0x03000000	/* Logical Device (LD) opcodes */
+
+/*
+ * Input: dcmd.opcode - DR_DCMD_LD_GET_LIST
+ * dcmd.mbox - reserved
+ * dcmd.sge IN - ptr to returned DR_LD_LIST structure
+ * Desc: Return the logical drive list structure
+ * Status: No error
+ */
+
+/*
+ * defines the logical drive reference structure
+ */
+typedef union _DR_LD_REF { /* LD reference structure */
+ struct {
+ uint8_t targetId; /* LD target id (0 to MAX_TARGET_ID) */
+		uint8_t reserved; /* reserved to stay in line with DR_PD_REF */
+ uint16_t seqNum; /* Sequence Number */
+ } ld_ref;
+ uint32_t ref; /* shorthand reference to full 32-bits */
+} DR_LD_REF; /* 4 bytes */
+
+/*
+ * defines the logical drive list structure
+ */
+typedef struct _DR_LD_LIST {
+ uint32_t ldCount; /* number of LDs */
+ uint32_t reserved; /* pad to 8-byte boundary */
+ struct {
+ DR_LD_REF ref; /* LD reference */
+ uint8_t state; /* current LD state (DR_LD_STATE) */
+ uint8_t reserved[3]; /* pad to 8-byte boundary */
+ uint64_t size; /* LD size */
+ } ldList[MAX_LOGICAL_DRIVES];
+} DR_LD_LIST;
+
+struct drsas_drv_ver {
+ uint8_t signature[12];
+ uint8_t os_name[16];
+ uint8_t os_ver[12];
+ uint8_t drv_name[20];
+ uint8_t drv_ver[32];
+ uint8_t drv_rel_date[20];
+};
+
+#define PCI_TYPE0_ADDRESSES 6
+#define PCI_TYPE1_ADDRESSES 2
+#define PCI_TYPE2_ADDRESSES 5
+
+struct drsas_pci_common_header {
+ uint16_t vendorID; /* (ro) */
+ uint16_t deviceID; /* (ro) */
+ uint16_t command; /* Device control */
+ uint16_t status;
+ uint8_t revisionID; /* (ro) */
+ uint8_t progIf; /* (ro) */
+ uint8_t subClass; /* (ro) */
+ uint8_t baseClass; /* (ro) */
+ uint8_t cacheLineSize; /* (ro+) */
+ uint8_t latencyTimer; /* (ro+) */
+ uint8_t headerType; /* (ro) */
+ uint8_t bist; /* Built in self test */
+
+ union {
+ struct {
+ uint32_t baseAddresses[PCI_TYPE0_ADDRESSES];
+ uint32_t cis;
+ uint16_t subVendorID;
+ uint16_t subSystemID;
+ uint32_t romBaseAddress;
+ uint8_t capabilitiesPtr;
+ uint8_t reserved1[3];
+ uint32_t reserved2;
+ uint8_t interruptLine;
+ uint8_t interruptPin; /* (ro) */
+ uint8_t minimumGrant; /* (ro) */
+ uint8_t maximumLatency; /* (ro) */
+ } type_0;
+
+ struct {
+ uint32_t baseAddresses[PCI_TYPE1_ADDRESSES];
+ uint8_t primaryBus;
+ uint8_t secondaryBus;
+ uint8_t subordinateBus;
+ uint8_t secondaryLatency;
+ uint8_t ioBase;
+ uint8_t ioLimit;
+ uint16_t secondaryStatus;
+ uint16_t memoryBase;
+ uint16_t memoryLimit;
+ uint16_t prefetchBase;
+ uint16_t prefetchLimit;
+ uint32_t prefetchBaseUpper32;
+ uint32_t prefetchLimitUpper32;
+ uint16_t ioBaseUpper16;
+ uint16_t ioLimitUpper16;
+ uint8_t capabilitiesPtr;
+ uint8_t reserved1[3];
+ uint32_t romBaseAddress;
+ uint8_t interruptLine;
+ uint8_t interruptPin;
+ uint16_t bridgeControl;
+ } type_1;
+
+ struct {
+ uint32_t socketRegistersBaseAddress;
+ uint8_t capabilitiesPtr;
+ uint8_t reserved;
+ uint16_t secondaryStatus;
+ uint8_t primaryBus;
+ uint8_t secondaryBus;
+ uint8_t subordinateBus;
+ uint8_t secondaryLatency;
+ struct {
+ uint32_t base;
+ uint32_t limit;
+ } range[PCI_TYPE2_ADDRESSES-1];
+ uint8_t interruptLine;
+ uint8_t interruptPin;
+ uint16_t bridgeControl;
+ } type_2;
+ } header;
+};
+
+struct drsas_pci_link_capability {
+ union {
+ struct {
+ uint32_t linkSpeed :4;
+ uint32_t linkWidth :6;
+ uint32_t aspmSupport :2;
+ uint32_t losExitLatency :3;
+ uint32_t l1ExitLatency :3;
+ uint32_t rsvdp :6;
+ uint32_t portNumber :8;
+ } bits;
+
+ uint32_t asUlong;
+ } cap;
+
+};
+
+struct drsas_pci_link_status_capability {
+ union {
+ struct {
+ uint16_t linkSpeed :4;
+ uint16_t negotiatedLinkWidth :6;
+ uint16_t linkTrainingError :1;
+ uint16_t linkTraning :1;
+ uint16_t slotClockConfig :1;
+ uint16_t rsvdZ :3;
+ } bits;
+
+ uint16_t asUshort;
+ } stat_cap;
+
+ uint16_t reserved;
+
+};
+
+struct drsas_pci_capabilities {
+ struct drsas_pci_link_capability linkCapability;
+ struct drsas_pci_link_status_capability linkStatusCapability;
+};
+
+struct drsas_pci_information
+{
+ uint32_t busNumber;
+ uint8_t deviceNumber;
+ uint8_t functionNumber;
+ uint8_t interruptVector;
+ uint8_t reserved;
+ struct drsas_pci_common_header pciHeaderInfo;
+ struct drsas_pci_capabilities capability;
+ uint8_t reserved2[32];
+};
+
+struct drsas_ioctl {
+ uint16_t version;
+ uint16_t controller_id;
+ uint8_t signature[8];
+ uint32_t reserved_1;
+ uint32_t control_code;
+ uint32_t reserved_2[2];
+ uint8_t frame[64];
+ union drsas_sgl_frame sgl_frame;
+ uint8_t sense_buff[DRSAS_MAX_SENSE_LENGTH];
+ uint8_t data[1];
+};
+
+struct drsas_aen {
+ uint16_t host_no;
+ uint16_t cmd_status;
+ uint32_t seq_num;
+ uint32_t class_locale_word;
+};
+#pragma pack()
+
+#ifndef DDI_VENDOR_LSI
+#define DDI_VENDOR_LSI "LSI"
+#endif /* DDI_VENDOR_LSI */
+
+static int drsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
+static int drsas_attach(dev_info_t *, ddi_attach_cmd_t);
+static int drsas_reset(dev_info_t *, ddi_reset_cmd_t);
+static int drsas_detach(dev_info_t *, ddi_detach_cmd_t);
+static int drsas_open(dev_t *, int, int, cred_t *);
+static int drsas_close(dev_t, int, int, cred_t *);
+static int drsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+
+static int drsas_tran_tgt_init(dev_info_t *, dev_info_t *,
+ scsi_hba_tran_t *, struct scsi_device *);
+static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register
+ struct scsi_pkt *, struct buf *, int, int, int, int,
+ int (*)(), caddr_t);
+static int drsas_tran_start(struct scsi_address *,
+ register struct scsi_pkt *);
+static int drsas_tran_abort(struct scsi_address *, struct scsi_pkt *);
+static int drsas_tran_reset(struct scsi_address *, int);
+static int drsas_tran_getcap(struct scsi_address *, char *, int);
+static int drsas_tran_setcap(struct scsi_address *, char *, int, int);
+static void drsas_tran_destroy_pkt(struct scsi_address *,
+ struct scsi_pkt *);
+static void drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *);
+static void drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *);
+static uint_t drsas_isr();
+static uint_t drsas_softintr();
+
+static int init_mfi(struct drsas_instance *);
+static int drsas_free_dma_obj(struct drsas_instance *, dma_obj_t);
+static int drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *,
+ uchar_t);
+static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *);
+static void return_mfi_pkt(struct drsas_instance *,
+ struct drsas_cmd *);
+
+static void free_space_for_mfi(struct drsas_instance *);
+static void free_additional_dma_buffer(struct drsas_instance *);
+static int alloc_additional_dma_buffer(struct drsas_instance *);
+static int read_fw_status_reg_ppc(struct drsas_instance *);
+static void issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *);
+static int issue_cmd_in_poll_mode_ppc(struct drsas_instance *,
+ struct drsas_cmd *);
+static int issue_cmd_in_sync_mode_ppc(struct drsas_instance *,
+ struct drsas_cmd *);
+static void enable_intr_ppc(struct drsas_instance *);
+static void disable_intr_ppc(struct drsas_instance *);
+static int intr_ack_ppc(struct drsas_instance *);
+static int mfi_state_transition_to_ready(struct drsas_instance *);
+static void destroy_mfi_frame_pool(struct drsas_instance *);
+static int create_mfi_frame_pool(struct drsas_instance *);
+static int drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *,
+ struct buf *, int, int (*)());
+static int drsas_dma_move(struct drsas_instance *,
+ struct scsi_pkt *, struct buf *);
+static void flush_cache(struct drsas_instance *instance);
+static void display_scsi_inquiry(caddr_t);
+static int start_mfi_aen(struct drsas_instance *instance);
+static int handle_drv_ioctl(struct drsas_instance *instance,
+ struct drsas_ioctl *ioctl, int mode);
+static int handle_mfi_ioctl(struct drsas_instance *instance,
+ struct drsas_ioctl *ioctl, int mode);
+static int handle_mfi_aen(struct drsas_instance *instance,
+ struct drsas_aen *aen);
+static void fill_up_drv_ver(struct drsas_drv_ver *dv);
+static struct drsas_cmd *build_cmd(struct drsas_instance *instance,
+ struct scsi_address *ap, struct scsi_pkt *pkt,
+ uchar_t *cmd_done);
+static int register_mfi_aen(struct drsas_instance *instance,
+ uint32_t seq_num, uint32_t class_locale_word);
+static int issue_mfi_pthru(struct drsas_instance *instance, struct
+ drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int issue_mfi_dcmd(struct drsas_instance *instance, struct
+ drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int issue_mfi_smp(struct drsas_instance *instance, struct
+ drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int issue_mfi_stp(struct drsas_instance *instance, struct
+ drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int abort_aen_cmd(struct drsas_instance *instance,
+ struct drsas_cmd *cmd_to_abort);
+
+static int drsas_common_check(struct drsas_instance *instance,
+ struct drsas_cmd *cmd);
+static void drsas_fm_init(struct drsas_instance *instance);
+static void drsas_fm_fini(struct drsas_instance *instance);
+static int drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *,
+ const void *);
+static void drsas_fm_ereport(struct drsas_instance *instance,
+ char *detail);
+static int drsas_check_dma_handle(ddi_dma_handle_t handle);
+static int drsas_check_acc_handle(ddi_acc_handle_t handle);
+
+static void drsas_rem_intrs(struct drsas_instance *instance);
+static int drsas_add_intrs(struct drsas_instance *instance, int intr_type);
+
+static void drsas_tran_tgt_free(dev_info_t *, dev_info_t *,
+ scsi_hba_tran_t *, struct scsi_device *);
+static int drsas_tran_bus_config(dev_info_t *, uint_t,
+ ddi_bus_config_op_t, void *, dev_info_t **);
+static int drsas_parse_devname(char *, int *, int *);
+static int drsas_config_all_devices(struct drsas_instance *);
+static int drsas_config_scsi_device(struct drsas_instance *,
+ struct scsi_device *, dev_info_t **);
+static int drsas_config_ld(struct drsas_instance *, uint16_t,
+ uint8_t, dev_info_t **);
+static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t,
+ uint8_t);
+static int drsas_name_node(dev_info_t *, char *, int);
+static void drsas_issue_evt_taskq(struct drsas_eventinfo *);
+static int drsas_service_evt(struct drsas_instance *, int, int, int,
+ uint64_t);
+static int drsas_mode_sense_build(struct scsi_pkt *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DR_SAS_H_ */
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h
new file mode 100644
index 0000000000..4154a77796
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h
@@ -0,0 +1,212 @@
+/*
+ * dr_sas_list.h: header for dr_sas
+ *
+ * Solaris MegaRAID driver for SAS2.0 controllers
+ * Copyright (c) 2008-2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DR_SAS_LIST_H_
+#define _DR_SAS_LIST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct mlist_head {
+ struct mlist_head *next, *prev;
+};
+
+typedef struct mlist_head mlist_t;
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+ struct mlist_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) { \
+ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+}
+
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static void __list_add(struct mlist_head *new,
+ struct mlist_head *prev,
+ struct mlist_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+
+/*
+ * mlist_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static void mlist_add(struct mlist_head *new, struct mlist_head *head)
+{
+ __list_add(new, head, head->next);
+}
+
+
+/*
+ * mlist_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head)
+{
+ __list_add(new, head->prev, head);
+}
+
+
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static void __list_del(struct mlist_head *prev,
+ struct mlist_head *next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+
+/*
+ * mlist_del_init - deletes an entry from its list and reinitializes it.
+ * @entry: the element to delete from the list.
+ */
+static void mlist_del_init(struct mlist_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ INIT_LIST_HEAD(entry);
+}
+
+
+/*
+ * mlist_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static int mlist_empty(struct mlist_head *head)
+{
+ return (head->next == head);
+}
+
+
+/*
+ * mlist_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static void mlist_splice(struct mlist_head *list, struct mlist_head *head)
+{
+ struct mlist_head *first = list->next;
+
+ if (first != list) {
+ struct mlist_head *last = list->prev;
+ struct mlist_head *at = head->next;
+
+ first->prev = head;
+ head->next = first;
+
+ last->next = at;
+ at->prev = last;
+ }
+}
+
+
+/*
+ * mlist_entry - get the struct for this entry
+ * @ptr: the &struct mlist_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ */
+#define mlist_entry(ptr, type, member) \
+ ((type *)((size_t)(ptr) - offsetof(type, member)))
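+
+/*
+ * Illustrative usage: given a node embedded in a containing structure,
+ * e.g. the `list' member of struct drsas_cmd in dr_sas.h, the enclosing
+ * object is recovered with:
+ *
+ *	struct drsas_cmd *cmd = mlist_entry(pos, struct drsas_cmd, list);
+ */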
+
+
+/*
+ * mlist_for_each - iterate over a list
+ * @pos: the &struct mlist_head to use as a loop counter.
+ * @head: the head for your list.
+ */
+#define mlist_for_each(pos, head) \
+ for (pos = (head)->next, prefetch(pos->next); pos != (head); \
+ pos = pos->next, prefetch(pos->next))
+
+
+/*
+ * mlist_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos: the &struct mlist_head to use as a loop counter.
+ * @n: another &struct mlist_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define mlist_for_each_safe(pos, n, head) \
+ for (pos = (head)->next, n = pos->next; pos != (head); \
+ pos = n, n = pos->next)
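+
+/*
+ * Illustrative usage: draining a list while deleting each entry is only
+ * safe with the _safe variant, since `pos' is unlinked as we go:
+ *
+ *	struct mlist_head *pos, *n;
+ *
+ *	mlist_for_each_safe(pos, n, &instance->cmd_pool_list) {
+ *		mlist_del_init(pos);
+ *	}
+ */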
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DR_SAS_LIST_H_ */
diff --git a/usr/src/uts/common/io/elxl/elxl.c b/usr/src/uts/common/io/elxl/elxl.c
index 2ffe96aff3..42552225f8 100644
--- a/usr/src/uts/common/io/elxl/elxl.c
+++ b/usr/src/uts/common/io/elxl/elxl.c
@@ -1,6 +1,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -1163,8 +1164,7 @@ elxl_m_tx(void *arg, mblk_t *mp)
cflags = 0;
if ((sc->ex_conf & CONF_90XB) != 0) {
uint32_t pflags;
- hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL,
- &pflags);
+ mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags);
if (pflags & HCK_IPV4_HDRCKSUM) {
cflags |= EX_DPD_IPCKSUM;
}
@@ -1327,7 +1327,7 @@ elxl_recv(elxl_t *sc, ex_desc_t *rxd, uint32_t stat)
if (stat & (EX_UPD_TCPCHECKED | EX_UPD_UDPCHECKED)) {
pflags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
}
- (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, pflags, 0);
+ mac_hcksum_set(mp, 0, 0, 0, 0, pflags);
}
return (mp);
diff --git a/usr/src/uts/common/io/eventfd.c b/usr/src/uts/common/io/eventfd.c
index 32f875917f..efc1f9233f 100644
--- a/usr/src/uts/common/io/eventfd.c
+++ b/usr/src/uts/common/io/eventfd.c
@@ -141,37 +141,39 @@ eventfd_read(dev_t dev, uio_t *uio, cred_t *cr)
* transitions from EVENTFD_VALMAX to a lower value. At all other
* times, it is already considered writable by poll.
*/
- if (oval == EVENTFD_VALMAX) {
+ if (oval >= EVENTFD_VALMAX) {
pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
}
return (err);
}
-/*ARGSUSED*/
static int
-eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
+eventfd_post(eventfd_state_t *state, uint64_t val, boolean_t is_async,
+ boolean_t file_nonblock)
{
- eventfd_state_t *state;
- minor_t minor = getminor(dev);
- uint64_t val, oval;
- int err;
-
- if (uio->uio_resid < sizeof (val))
- return (EINVAL);
-
- if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
- return (err);
-
- if (val > EVENTFD_VALMAX)
- return (EINVAL);
-
- state = ddi_get_soft_state(eventfd_softstate, minor);
+ uint64_t oval;
+ boolean_t overflow = B_FALSE;
mutex_enter(&state->efd_lock);
while (val > EVENTFD_VALMAX - state->efd_value) {
- if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
+
+ /*
+ * When called from (LX) AIO, expectations about overflow and
+ * blocking are different than normal operation. If the
+ * incoming value would cause overflow, it is clamped to reach
+ * the overflow value exactly. This is added to the existing
+ * value without blocking. Any pollers of the eventfd will see
+ * POLLERR asserted when this occurs.
+ */
+ if (is_async) {
+ val = EVENTFD_VALOVERFLOW - state->efd_value;
+ overflow = B_TRUE;
+ break;
+ }
+
+ if (file_nonblock) {
mutex_exit(&state->efd_lock);
return (EAGAIN);
}
@@ -186,7 +188,7 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
}
/*
- * We now know that we can add the value without overflowing.
+ * We now know that we can safely add the value.
*/
state->efd_value = (oval = state->efd_value) + val;
@@ -200,10 +202,13 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
mutex_exit(&state->efd_lock);
/*
- * Notify pollers as well if the eventfd is now readable.
+ * Notify pollers as well if the eventfd has become readable or has
+ * transitioned into overflow.
*/
if (oval == 0) {
pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
+ } else if (overflow && val != 0) {
+ pollwakeup(&state->efd_pollhd, POLLERR);
}
return (0);
@@ -211,6 +216,29 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
/*ARGSUSED*/
static int
+eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
+{
+ eventfd_state_t *state;
+ boolean_t file_nonblock;
+ uint64_t val;
+ int err;
+
+ if (uio->uio_resid < sizeof (val))
+ return (EINVAL);
+
+ if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
+ return (err);
+
+ if (val > EVENTFD_VALMAX)
+ return (EINVAL);
+
+ file_nonblock = (uio->uio_fmode & (FNDELAY|FNONBLOCK)) != 0;
+ state = ddi_get_soft_state(eventfd_softstate, getminor(dev));
+ return (eventfd_post(state, val, B_FALSE, file_nonblock));
+}
+
+/*ARGSUSED*/
+static int
eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
struct pollhead **phpp)
{
@@ -228,6 +256,9 @@ eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
if (state->efd_value < EVENTFD_VALMAX)
revents |= POLLWRNORM | POLLOUT;
+ if (state->efd_value == EVENTFD_VALOVERFLOW)
+ revents |= POLLERR;
+
*reventsp = revents & events;
if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
*phpp = &state->efd_pollhd;
@@ -244,17 +275,28 @@ eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
eventfd_state_t *state;
minor_t minor = getminor(dev);
+ uint64_t *valp;
state = ddi_get_soft_state(eventfd_softstate, minor);
switch (cmd) {
- case EVENTFDIOC_SEMAPHORE: {
+ case EVENTFDIOC_SEMAPHORE:
mutex_enter(&state->efd_lock);
state->efd_semaphore ^= 1;
mutex_exit(&state->efd_lock);
+ return (0);
+ case EVENTFDIOC_POST:
+ /*
+ * This ioctl is expected to be kernel-internal, used only by
+ * the AIO emulation in LX.
+ */
+ if ((md & FKIOCTL) == 0) {
+ break;
+ }
+ valp = (uint64_t *)arg;
+ VERIFY(eventfd_post(state, *valp, B_TRUE, B_FALSE) == 0);
return (0);
- }
default:
break;
diff --git a/usr/src/uts/common/io/fibre-channel/impl/fctl.c b/usr/src/uts/common/io/fibre-channel/impl/fctl.c
index 4c2a39013a..eb2a0c2ec5 100644
--- a/usr/src/uts/common/io/fibre-channel/impl/fctl.c
+++ b/usr/src/uts/common/io/fibre-channel/impl/fctl.c
@@ -24,6 +24,7 @@
*/
/*
* Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
*/
/*
* Fibre channel Transport Library (fctl)
@@ -5500,6 +5501,11 @@ fc_ulp_get_adapter_paths(char *pathList, int count)
maxPorts ++;
}
+ if (maxPorts == 0) {
+ mutex_exit(&fctl_port_lock);
+ return (0);
+ }
+
/* Now allocate a buffer to store all the pointers for comparisons */
portList = kmem_zalloc(sizeof (fc_local_port_t *) * maxPorts, KM_SLEEP);
diff --git a/usr/src/uts/common/io/gld.c b/usr/src/uts/common/io/gld.c
index c6c6b65900..5502ea54af 100644
--- a/usr/src/uts/common/io/gld.c
+++ b/usr/src/uts/common/io/gld.c
@@ -22,6 +22,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -4550,8 +4551,7 @@ gld_unitdata(queue_t *q, mblk_t *mp)
ifp = ((gld_mac_pvt_t *)macinfo->gldm_mac_pvt)->interfacep;
/* grab any checksum information that may be present */
- hcksum_retrieve(mp->b_cont, NULL, NULL, &start, &stuff, &end,
- &value, &flags);
+ mac_hcksum_get(mp->b_cont, &start, &stuff, &end, &value, &flags);
/*
* Prepend a valid header for transmission
@@ -4567,8 +4567,7 @@ gld_unitdata(queue_t *q, mblk_t *mp)
}
/* apply any checksum information to the first block in the chain */
- (void) hcksum_assoc(nmp, NULL, NULL, start, stuff, end, value,
- flags, 0);
+ mac_hcksum_set(nmp, start, stuff, end, value, flags);
GLD_CLEAR_MBLK_VTAG(nmp);
if (gld_start(q, nmp, GLD_WSRV, upri) == GLD_NORESOURCES) {
diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c
new file mode 100644
index 0000000000..03bb799499
--- /dev/null
+++ b/usr/src/uts/common/io/gsqueue/gsqueue.c
@@ -0,0 +1,608 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Serialization queues are a technique used in illumos to provide what's
+ * commonly known as a 'vertical' perimeter. The idea (described a bit in
+ * uts/common/inet/squeue.c) is to provide a means to make sure that message
+ * blocks (mblk_t) are processed in a specific order. Subsystems like ip and vnd
+ * consume these on different policies, ip on a conn_t basis, vnd on a per
+ * device basis, and use this to ensure that only one packet is being processed
+ * at a given time.
+ *
+ * Serialization queues were originally used by ip. As part of that
+ * implementation, many of the details of ip were baked into it. That includes
+ * things like conn_t, ip receive attributes, and the notion of sets. While an
+ * individual serialization queue, or gsqueue_t, is a useful level of
+ * abstraction, it isn't the basis on which most consumers want to manage them.
+ * Instead, we have the notion of a set of serialization queues. These sets are
+ * DR (CPU Dynamic reconfiguration) aware, and allow consumers to have a
+ * gsqueue_t per CPU to fanout on without managing them all itself. In the
+ * original implementation, this existed, but they were heavily tied into the
+ * infrastructure of IP, and its notion of polling on the underlying MAC
+ * devices.
+ *
+ * The result of that past is a new interface to serialization queues and a
+ * similar, but slightly different, abstraction to sets of these
+ * (gsqueue_set_t). When designing this there are two different approaches that
+ * one could consider. The first is that the system has one gsqueue_set_t that
+ * the entire world shares, whether IP or some other consumer. The other is that
+ * every consumer has their own set.
+ *
+ * The trade-offs between these two approaches come down to their pathological
+ * failure modes. There is no guarantee that any two consumers here are
+ * equivalent. In
+ * fact, they very likely have very different latency profiles. If they are
+ * being processed in the same queue, that can lead to very odd behaviors. More
+ * generally, if we have a series of processing functions from one consumer
+ * which are generally short, and another which are generally long, that'll
+ * cause undue latency that's harder to observe. If we instead take the approach
+ * that each consumer should have its own set that it fans out over then we
+ * won't end up with the problem that a given serialization queue will have
+ * multiple latency profiles, but instead we'll see cpu contention for the bound
+ * gsqueue_t worker thread. Keep in mind though, that only the gsqueue_t worker
+ * thread is bound and it is in fact possible for it to be processed by other
+ * threads on other CPUs.
+ *
+ * We've opted to go down the second path, so each consumer has its own
+ * independent set of serialization queues that it is bound over.
+ *
+ * Structure Hierarchies
+ * ---------------------
+ *
+ * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t
+ * encapsulates all the per-CPU gsqueue_t that exist in the form of
+ * gsqueue_cpu_t. The gsqueue_cpu_t has been designed such that it could
+ * accommodate more than one gsqueue_t, but today there is a one to one mapping.
+ *
+ * We maintain two different lists of gsqueue_cpu_t, the active and defunct
+ * sets. The active set is maintained in the array `gs_cpus`. There are NCPU
+ * entries available in `gs_cpus` with the total number of currently active cpus
+ * described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant. When
+ * there is no longer a need for a given binding (see the following section for
+ * more explanation on when this is the case) then we move the entry to the
+ * `gs_defunct` list which is just a list_t of gsqueue_cpu_t.
+ *
+ * In addition, each gsqueue_set_t can have a series of callbacks registered
+ * with it. These are described in the following section. Graphically, a given
+ * gsqueue_set_t looks roughly like the following:
+ *
+ * +---------------+
+ * | gsqueue_set_t |
+ * +---------------+
+ * | | |
+ * | | * . . . gs_cpus
+ * | | |
+ * | | | +-------------------------------------------------+
+ * | | +--->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |...
+ * | | +-------------------------------------------------+
+ * | |
+ * | * . . . gs_defunct
+ * | |
+ * | | +---------------+ +---------------+ +---------------+
+ * | +--->| gsqueue_cpu_t |-->| gsqueue_cpu_t |-->| gsqueue_cpu_t |...
+ * | +---------------+ +---------------+ +---------------+
+ * * . . . gs_cbs
+ * |
+ * | +--------------+ +--------------+ +--------------+
+ * +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |...
+ * +--------------+ +--------------+ +--------------+
+ *
+ * CPU DR, gsqueue_t, and squeue_t
+ * -------------------------------
+ *
+ * Recall that every serialization queue (gsqueue_t or squeue_t) has a worker
+ * thread that may end up doing work. As part of supporting fanout, we have one
+ * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of
+ * this binding, we need to deal with CPU DR changes.
+ *
+ * The gsqueue driver maintains a single CPU DR callback that is used for the
+ * entire sub-system. We break down CPU DR events into three groups: offline
+ * events, online events, and events we can ignore. When an offline event
+ * occurs, we need to go through every gsqueue_set_t, find the gsqueue_cpu_t
+ * that corresponds to that processor id, and unbind all of its gsqueue_t's.
+ * It's rather important that we only unbind the gsqueue_t's and not actually
+ * destroy them. When this happens, they could very easily have data queued
+ * inside of them and it's unreasonable to just throw out everything in them
+ * at this point. The data remains intact and service continues uninterrupted.
+ *
+ * When we receive an online event, we do the opposite. We try to find a
+ * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid
+ * field intact) in the defunct list. If we find one, we remove it from the
+ * defunct list and add it to the active list as well as binding the gsqueue_t
+ * to the CPU in question. If we don't find one, then we create a new one.
+ *
+ * So that consumers can react to these topology changes, we allow a consumer
+ * to register callbacks with the gsqueue_set_t that it is interested in.
+ * These callbacks fire whenever we are handling a topology change. The
+ * callbacks are not designed for taking administrative action directly, but
+ * rather for flagging work to be done asynchronously. It is illegal to make
+ * any calls into the gsqueue system while you are in a callback.
+ *
+ * Locking
+ * -------
+ *
+ * The lock ordering here is fairly straightforward. Due to our use of CPU
+ * binding and the CPU DR callbacks, we have an additional lock to consider:
+ * cpu_lock. Because of that, the following are the rules for locking:
+ *
+ * o If performing binding operations, you must grab cpu_lock. cpu_lock is
+ * also at the top of the order.
+ *
+ * o cpu_lock > gsqueue_lock > gsqueue_t`gs_lock > squeue_t`sq_lock
+ * If you need to take multiple locks, you must take the greatest
+ * (left-most) one first.
+ */
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/stream.h>
+#include <sys/modctl.h>
+#include <sys/cpuvar.h>
+#include <sys/list.h>
+#include <sys/sysmacros.h>
+
+#include <sys/gsqueue.h>
+#include <sys/squeue_impl.h>
+
+typedef struct gsqueue_cb {
+ struct gsqueue_cb *gcb_next;
+ gsqueue_cb_f gcb_func;
+ void *gcb_arg;
+} gsqueue_cb_t;
+
+typedef struct gsqueue_cpu {
+ list_node_t gqc_lnode;
+ squeue_t *gqc_head;
+ processorid_t gqc_cpuid;
+} gsqueue_cpu_t;
+
+struct gsqueue_set {
+ list_node_t gs_next;
+ pri_t gs_wpri;
+ kmutex_t gs_lock;
+ int gs_ncpus;
+ gsqueue_cpu_t **gs_cpus;
+ list_t gs_defunct;
+ gsqueue_cb_t *gs_cbs;
+};
+
+static kmutex_t gsqueue_lock;
+static list_t gsqueue_list;
+static kmem_cache_t *gsqueue_cb_cache;
+static kmem_cache_t *gsqueue_cpu_cache;
+static kmem_cache_t *gsqueue_set_cache;
+
+static gsqueue_cpu_t *
+gsqueue_cpu_create(pri_t wpri, processorid_t cpuid)
+{
+ gsqueue_cpu_t *scp;
+
+ scp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP);
+
+ list_link_init(&scp->gqc_lnode);
+ scp->gqc_cpuid = cpuid;
+ scp->gqc_head = squeue_create(wpri, B_FALSE);
+ scp->gqc_head->sq_state = SQS_DEFAULT;
+ squeue_bind(scp->gqc_head, cpuid);
+
+ return (scp);
+}
+
+static void
+gsqueue_cpu_destroy(gsqueue_cpu_t *scp)
+{
+ squeue_destroy(scp->gqc_head);
+ kmem_cache_free(gsqueue_cpu_cache, scp);
+}
+
+gsqueue_set_t *
+gsqueue_set_create(pri_t wpri)
+{
+ int i;
+ gsqueue_set_t *gssp;
+
+ gssp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP);
+ gssp->gs_wpri = wpri;
+ gssp->gs_ncpus = 0;
+
+ /*
+	 * We're grabbing cpu_lock. Once we let go of it we have to ensure that
+	 * all set up of the gsqueue_set_t is complete, as the set is visible
+	 * to the CPU DR callbacks as soon as it is on the global list.
+ */
+ mutex_enter(&cpu_lock);
+
+ for (i = 0; i < NCPU; i++) {
+ gsqueue_cpu_t *scp;
+ cpu_t *cp = cpu_get(i);
+ if (cp != NULL && CPU_ACTIVE(cp) &&
+ cp->cpu_flags & CPU_EXISTS) {
+ scp = gsqueue_cpu_create(wpri, cp->cpu_id);
+ gssp->gs_cpus[gssp->gs_ncpus] = scp;
+ gssp->gs_ncpus++;
+ }
+ }
+
+ /* Finally we can add it to our global list and be done */
+ mutex_enter(&gsqueue_lock);
+ list_insert_tail(&gsqueue_list, gssp);
+ mutex_exit(&gsqueue_lock);
+ mutex_exit(&cpu_lock);
+
+ return (gssp);
+}
+
+void
+gsqueue_set_destroy(gsqueue_set_t *gssp)
+{
+ int i;
+ gsqueue_cpu_t *scp;
+
+ /*
+ * Go through and unbind all of the squeues while cpu_lock is held and
+ * move them to the defunct list. Once that's done, we don't need to do
+ * anything else with cpu_lock.
+ */
+ mutex_enter(&cpu_lock);
+ mutex_enter(&gsqueue_lock);
+ list_remove(&gsqueue_list, gssp);
+ mutex_exit(&gsqueue_lock);
+
+ mutex_enter(&gssp->gs_lock);
+
+ for (i = 0; i < gssp->gs_ncpus; i++) {
+ scp = gssp->gs_cpus[i];
+ squeue_unbind(scp->gqc_head);
+ list_insert_tail(&gssp->gs_defunct, scp);
+ gssp->gs_cpus[i] = NULL;
+ }
+ gssp->gs_ncpus = 0;
+
+ mutex_exit(&gssp->gs_lock);
+ mutex_exit(&cpu_lock);
+
+ while ((scp = list_remove_head(&gssp->gs_defunct)) != NULL) {
+ gsqueue_cpu_destroy(scp);
+ }
+
+ while (gssp->gs_cbs != NULL) {
+ gsqueue_cb_t *cbp;
+
+ cbp = gssp->gs_cbs;
+ gssp->gs_cbs = cbp->gcb_next;
+ kmem_cache_free(gsqueue_cb_cache, cbp);
+ }
+
+ ASSERT3U(gssp->gs_ncpus, ==, 0);
+ ASSERT3P(list_head(&gssp->gs_defunct), ==, NULL);
+ ASSERT3P(gssp->gs_cbs, ==, NULL);
+ kmem_cache_free(gsqueue_set_cache, gssp);
+}
+
+gsqueue_t *
+gsqueue_set_get(gsqueue_set_t *gssp, uint_t index)
+{
+ squeue_t *sqp;
+ gsqueue_cpu_t *scp;
+
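+	/*
+	 * Fan out over the currently active queues: any consumer-supplied
+	 * index (e.g. a flow hash) selects an active gsqueue_t via modulo,
+	 * so callers need not track how many CPUs are online.
+	 */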
+ mutex_enter(&gssp->gs_lock);
+ scp = gssp->gs_cpus[index % gssp->gs_ncpus];
+ sqp = scp->gqc_head;
+ mutex_exit(&gssp->gs_lock);
+ return ((gsqueue_t *)sqp);
+}
+
+uintptr_t
+gsqueue_set_cb_add(gsqueue_set_t *gssp, gsqueue_cb_f cb, void *arg)
+{
+ gsqueue_cb_t *cbp;
+
+ cbp = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP);
+ cbp->gcb_func = cb;
+ cbp->gcb_arg = arg;
+
+ mutex_enter(&gssp->gs_lock);
+ cbp->gcb_next = gssp->gs_cbs;
+ gssp->gs_cbs = cbp;
+ mutex_exit(&gssp->gs_lock);
+ return ((uintptr_t)cbp);
+}
+
+int
+gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id)
+{
+ gsqueue_cb_t *cbp, *prev;
+ mutex_enter(&gssp->gs_lock);
+ cbp = gssp->gs_cbs;
+ prev = NULL;
+ while (cbp != NULL) {
+ if ((uintptr_t)cbp != id) {
+ prev = cbp;
+ cbp = cbp->gcb_next;
+ continue;
+ }
+
+ if (prev == NULL) {
+ gssp->gs_cbs = cbp->gcb_next;
+ } else {
+ prev->gcb_next = cbp->gcb_next;
+ }
+
+ mutex_exit(&gssp->gs_lock);
+ kmem_cache_free(gsqueue_cb_cache, cbp);
+ return (0);
+ }
+ mutex_exit(&gssp->gs_lock);
+ return (-1);
+}
+
+void
+gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg,
+ int flags, uint8_t tag)
+{
+ squeue_t *sqp = (squeue_t *)gsp;
+
+ ASSERT(mp->b_next == NULL);
+ ASSERT(mp->b_prev == NULL);
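+
+	/*
+	 * Stash the consumer's processing function and argument in the mblk
+	 * itself; b_queue and b_prev are not otherwise needed while the
+	 * message sits on a serialization queue, and the squeue worker
+	 * retrieves them from there when it processes the message.
+	 */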
+ mp->b_queue = (queue_t *)func;
+ mp->b_prev = arg;
+ sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag);
+}
+
+static void
+gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online)
+{
+ gsqueue_cb_t *cbp;
+
+ ASSERT(MUTEX_HELD(&gssp->gs_lock));
+ cbp = gssp->gs_cbs;
+ while (cbp != NULL) {
+ cbp->gcb_func(gssp, (gsqueue_t *)sqp, cbp->gcb_arg, online);
+ cbp = cbp->gcb_next;
+ }
+}
+
+/*
+ * When we online a processor we need to go through and either bind a defunct
+ * squeue or create a new one. We'll try to reuse a gsqueue_cpu_t from the
+ * defunct list that used to be on that processor. If no such gsqueue_cpu_t
+ * exists, then we'll create a new one. We'd rather avoid taking over an
+ * existing defunct one that used to be on another CPU, as it's not
+ * unreasonable to believe that its CPU will come back. More CPUs are offlined
+ * and onlined by the administrator, or by creating CPU sets, than are
+ * actually offlined by FMA.
+ */
+static void
+gsqueue_handle_online(processorid_t id)
+{
+ gsqueue_set_t *gssp;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ mutex_enter(&gsqueue_lock);
+ for (gssp = list_head(&gsqueue_list); gssp != NULL;
+ gssp = list_next(&gsqueue_list, gssp)) {
+ gsqueue_cpu_t *scp;
+
+ mutex_enter(&gssp->gs_lock);
+ for (scp = list_head(&gssp->gs_defunct); scp != NULL;
+ scp = list_next(&gssp->gs_defunct, scp)) {
+ if (scp->gqc_cpuid == id) {
+ list_remove(&gssp->gs_defunct, scp);
+ break;
+ }
+ }
+
+ if (scp == NULL) {
+ scp = gsqueue_cpu_create(gssp->gs_wpri, id);
+ } else {
+ squeue_bind(scp->gqc_head, id);
+ }
+
+ ASSERT(gssp->gs_ncpus < NCPU);
+ gssp->gs_cpus[gssp->gs_ncpus] = scp;
+ gssp->gs_ncpus++;
+ gsqueue_notify(gssp, scp->gqc_head, B_TRUE);
+ mutex_exit(&gssp->gs_lock);
+ }
+ mutex_exit(&gsqueue_lock);
+}
+
+static void
+gsqueue_handle_offline(processorid_t id)
+{
+ gsqueue_set_t *gssp;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ mutex_enter(&gsqueue_lock);
+ for (gssp = list_head(&gsqueue_list); gssp != NULL;
+ gssp = list_next(&gsqueue_list, gssp)) {
+ int i;
+ gsqueue_cpu_t *scp = NULL;
+
+ mutex_enter(&gssp->gs_lock);
+ for (i = 0; i < gssp->gs_ncpus; i++) {
+ if (gssp->gs_cpus[i]->gqc_cpuid == id) {
+ scp = gssp->gs_cpus[i];
+ break;
+ }
+ }
+
+ if (scp != NULL) {
+ squeue_unbind(scp->gqc_head);
+ list_insert_tail(&gssp->gs_defunct, scp);
+ gssp->gs_cpus[i] = gssp->gs_cpus[gssp->gs_ncpus-1];
+ gssp->gs_ncpus--;
+ gsqueue_notify(gssp, scp->gqc_head, B_FALSE);
+ }
+ mutex_exit(&gssp->gs_lock);
+ }
+ mutex_exit(&gsqueue_lock);
+}
+
+/* ARGSUSED */
+static int
+gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused)
+{
+ cpu_t *cp;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ cp = cpu_get(id);
+ switch (what) {
+ case CPU_CONFIG:
+ case CPU_ON:
+ case CPU_INIT:
+ case CPU_CPUPART_IN:
+ if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS)
+ gsqueue_handle_online(cp->cpu_id);
+ break;
+ case CPU_UNCONFIG:
+ case CPU_OFF:
+ case CPU_CPUPART_OUT:
+		gsqueue_handle_offline(id);
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+gsqueue_set_cache_construct(void *buf, void *arg, int kmflags)
+{
+ gsqueue_set_t *gssp = buf;
+
+ gssp->gs_cpus = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags);
+ if (gssp->gs_cpus == NULL)
+ return (-1);
+
+ mutex_init(&gssp->gs_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&gssp->gs_defunct, sizeof (gsqueue_cpu_t),
+ offsetof(gsqueue_cpu_t, gqc_lnode));
+ gssp->gs_ncpus = 0;
+ gssp->gs_cbs = NULL;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+gsqueue_set_cache_destruct(void *buf, void *arg)
+{
+ gsqueue_set_t *gssp = buf;
+
+ kmem_free(gssp->gs_cpus, sizeof (gsqueue_cpu_t *) * NCPU);
+ gssp->gs_cpus = NULL;
+ list_destroy(&gssp->gs_defunct);
+ mutex_destroy(&gssp->gs_lock);
+}
+
+static void
+gsqueue_ddiinit(void)
+{
+ list_create(&gsqueue_list, sizeof (gsqueue_set_t),
+ offsetof(gsqueue_set_t, gs_next));
+ mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL);
+
+ gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache",
+ sizeof (gsqueue_cb_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache",
+ sizeof (gsqueue_cpu_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+	gsqueue_set_cache = kmem_cache_create("gsqueue_set_cache",
+ sizeof (gsqueue_set_t),
+ 0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct,
+ NULL, NULL, NULL, 0);
+
+ mutex_enter(&cpu_lock);
+ register_cpu_setup_func(gsqueue_cpu_setup, NULL);
+ mutex_exit(&cpu_lock);
+}
+
+static int
+gsqueue_ddifini(void)
+{
+ mutex_enter(&gsqueue_lock);
+ if (list_is_empty(&gsqueue_list) == 0) {
+ mutex_exit(&gsqueue_lock);
+ return (EBUSY);
+ }
+ list_destroy(&gsqueue_list);
+ mutex_exit(&gsqueue_lock);
+
+ mutex_enter(&cpu_lock);
+	unregister_cpu_setup_func(gsqueue_cpu_setup, NULL);
+ mutex_exit(&cpu_lock);
+
+ kmem_cache_destroy(gsqueue_set_cache);
+ kmem_cache_destroy(gsqueue_cpu_cache);
+ kmem_cache_destroy(gsqueue_cb_cache);
+
+ mutex_destroy(&gsqueue_lock);
+
+ return (0);
+}
+
+static struct modlmisc gsqueue_modmisc = {
+ &mod_miscops,
+ "gsqueue"
+};
+
+static struct modlinkage gsqueue_modlinkage = {
+ MODREV_1,
+ &gsqueue_modmisc,
+ NULL
+};
+
+int
+_init(void)
+{
+ int ret;
+
+ gsqueue_ddiinit();
+ if ((ret = mod_install(&gsqueue_modlinkage)) != 0) {
+ VERIFY(gsqueue_ddifini() == 0);
+ return (ret);
+ }
+
+ return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&gsqueue_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int ret;
+
+ if ((ret = gsqueue_ddifini()) != 0)
+ return (ret);
+
+ if ((ret = mod_remove(&gsqueue_modlinkage)) != 0)
+ return (ret);
+
+ return (0);
+}
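
To make the consumer contract above concrete, the following is a minimal
sketch of how a driver might use this interface. The "mydrv" names are
hypothetical, and the gsqueue_proc_f/gsqueue_cb_f signatures and the
GSQUEUE_FILL flag are assumptions based on <sys/gsqueue.h>, which is not part
of this diff.

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/debug.h>
#include <sys/gsqueue.h>

static gsqueue_set_t *mydrv_set;
static uintptr_t mydrv_cbid;

/* Runs serialized: no other work for this gsqueue_t runs concurrently. */
/* ARGSUSED */
static void
mydrv_process(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy)
{
	freemsg(mp);
}

/* Topology-change callback: only flag work to do later; no gsqueue calls. */
/* ARGSUSED */
static void
mydrv_topo_cb(gsqueue_set_t *set, gsqueue_t *gsp, void *arg, boolean_t online)
{
}

void
mydrv_init(pri_t wpri)
{
	mydrv_set = gsqueue_set_create(wpri);
	mydrv_cbid = gsqueue_set_cb_add(mydrv_set, mydrv_topo_cb, NULL);
}

void
mydrv_fini(void)
{
	VERIFY0(gsqueue_set_cb_remove(mydrv_set, mydrv_cbid));
	gsqueue_set_destroy(mydrv_set);
}

/* Any consumer-chosen hash fans out over the per-CPU queues via modulo. */
void
mydrv_rx(mblk_t *mp, uint_t hash)
{
	gsqueue_t *gsp = gsqueue_set_get(mydrv_set, hash);

	gsqueue_enter_one(gsp, mp, mydrv_process, NULL, GSQUEUE_FILL, 0);
}

Because each consumer owns its set, mydrv's latency profile never mixes with
another consumer's in a shared queue; the cost is one set of bound worker
threads per consumer, as discussed in the theory statement above.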
diff --git a/usr/src/uts/common/io/hook.c b/usr/src/uts/common/io/hook.c
index c3ebfa0e47..6726f72147 100644
--- a/usr/src/uts/common/io/hook.c
+++ b/usr/src/uts/common/io/hook.c
@@ -1050,7 +1050,7 @@ hook_family_free(hook_family_int_t *hfi, hook_stack_t *hks)
/* Free container */
kmem_free(hfi, sizeof (*hfi));
- if (hks->hks_shutdown == 2)
+ if (hks != NULL && hks->hks_shutdown == 2)
hook_stack_remove(hks);
mutex_exit(&hook_stack_lock);
diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c
index d34057d64f..ccf814be0b 100644
--- a/usr/src/uts/common/io/i40e/i40e_gld.c
+++ b/usr/src/uts/common/io/i40e/i40e_gld.c
@@ -39,7 +39,8 @@ char *i40e_priv_props[] = {
static int
i40e_group_remove_mac(void *arg, const uint8_t *mac_addr)
{
- i40e_t *i40e = arg;
+ i40e_rx_group_t *rxg = arg;
+ i40e_t *i40e = rxg->irg_i40e;
struct i40e_aqc_remove_macvlan_element_data filt;
struct i40e_hw *hw = &i40e->i40e_hw_space;
int ret, i, last;
@@ -107,10 +108,11 @@ done:
static int
i40e_group_add_mac(void *arg, const uint8_t *mac_addr)
{
- i40e_t *i40e = arg;
- struct i40e_hw *hw = &i40e->i40e_hw_space;
- int i, ret;
- i40e_uaddr_t *iua;
+ i40e_rx_group_t *rxg = arg;
+ i40e_t *i40e = rxg->irg_i40e;
+ struct i40e_hw *hw = &i40e->i40e_hw_space;
+ int i, ret;
+ i40e_uaddr_t *iua;
struct i40e_aqc_add_macvlan_element_data filt;
if (I40E_IS_MULTICAST(mac_addr))
@@ -136,16 +138,12 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr)
}
}
- /*
- * Note, the general use of the i40e_vsi_id will have to be refactored
- * when we have proper group support.
- */
bzero(&filt, sizeof (filt));
bcopy(mac_addr, filt.mac_addr, ETHERADDRL);
filt.flags = I40E_AQC_MACVLAN_ADD_PERFECT_MATCH |
I40E_AQC_MACVLAN_ADD_IGNORE_VLAN;
- if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1,
+ if ((ret = i40e_aq_add_macvlan(hw, rxg->irg_vsi_seid, &filt, 1,
NULL)) != I40E_SUCCESS) {
i40e_error(i40e, "failed to add mac address "
"%2x:%2x:%2x:%2x:%2x:%2x to unicast filter: %d",
@@ -157,7 +155,7 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr)
iua = &i40e->i40e_uaddrs[i40e->i40e_resources.ifr_nmacfilt_used];
bcopy(mac_addr, iua->iua_mac, ETHERADDRL);
- iua->iua_vsi = i40e->i40e_vsi_id;
+ iua->iua_vsi = rxg->irg_vsi_seid;
i40e->i40e_resources.ifr_nmacfilt_used++;
ASSERT(i40e->i40e_resources.ifr_nmacfilt_used <=
i40e->i40e_resources.ifr_nmacfilt);
@@ -227,7 +225,7 @@ i40e_m_promisc(void *arg, boolean_t on)
}
- ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id,
+ ret = i40e_aq_set_vsi_unicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e),
on, NULL, B_FALSE);
if (ret != I40E_SUCCESS) {
i40e_error(i40e, "failed to %s unicast promiscuity on "
@@ -246,7 +244,7 @@ i40e_m_promisc(void *arg, boolean_t on)
goto done;
}
- ret = i40e_aq_set_vsi_multicast_promiscuous(hw, i40e->i40e_vsi_id,
+ ret = i40e_aq_set_vsi_multicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e),
on, NULL);
if (ret != I40E_SUCCESS) {
i40e_error(i40e, "failed to %s multicast promiscuity on "
@@ -257,8 +255,8 @@ i40e_m_promisc(void *arg, boolean_t on)
* Try our best to put us back into a state that MAC expects us
* to be in.
*/
- ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id,
- !on, NULL, B_FALSE);
+ ret = i40e_aq_set_vsi_unicast_promiscuous(hw,
+ I40E_DEF_VSI_SEID(i40e), !on, NULL, B_FALSE);
if (ret != I40E_SUCCESS) {
i40e_error(i40e, "failed to %s unicast promiscuity on "
"the default VSI after toggling multicast failed: "
@@ -294,11 +292,11 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address)
if (i40e->i40e_mcast_promisc_count == 0 &&
i40e->i40e_promisc_on == B_FALSE) {
ret = i40e_aq_set_vsi_multicast_promiscuous(hw,
- i40e->i40e_vsi_id, B_TRUE, NULL);
+ I40E_DEF_VSI_SEID(i40e), B_TRUE, NULL);
if (ret != I40E_SUCCESS) {
i40e_error(i40e, "failed to enable multicast "
"promiscuous mode on VSI %d: %d",
- i40e->i40e_vsi_id, ret);
+ I40E_DEF_VSI_SEID(i40e), ret);
return (EIO);
}
}
@@ -312,7 +310,7 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address)
filt.flags = I40E_AQC_MACVLAN_ADD_HASH_MATCH |
I40E_AQC_MACVLAN_ADD_IGNORE_VLAN;
- if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1,
+ if ((ret = i40e_aq_add_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1,
NULL)) != I40E_SUCCESS) {
i40e_error(i40e, "failed to add mac address "
"%2x:%2x:%2x:%2x:%2x:%2x to multicast filter: %d",
@@ -353,8 +351,8 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address)
filt.flags = I40E_AQC_MACVLAN_DEL_HASH_MATCH |
I40E_AQC_MACVLAN_DEL_IGNORE_VLAN;
- if (i40e_aq_remove_macvlan(hw, i40e->i40e_vsi_id,
- &filt, 1, NULL) != I40E_SUCCESS) {
+ if (i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt,
+ 1, NULL) != I40E_SUCCESS) {
i40e_error(i40e, "failed to remove mac address "
"%2x:%2x:%2x:%2x:%2x:%2x from multicast "
"filter: %d",
@@ -381,11 +379,11 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address)
if (i40e->i40e_mcast_promisc_count == 1 &&
i40e->i40e_promisc_on == B_FALSE) {
ret = i40e_aq_set_vsi_multicast_promiscuous(hw,
- i40e->i40e_vsi_id, B_FALSE, NULL);
+ I40E_DEF_VSI_SEID(i40e), B_FALSE, NULL);
if (ret != I40E_SUCCESS) {
i40e_error(i40e, "failed to disable "
"multicast promiscuous mode on VSI %d: %d",
- i40e->i40e_vsi_id, ret);
+ I40E_DEF_VSI_SEID(i40e), ret);
return (EIO);
}
}
@@ -490,7 +488,7 @@ i40e_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
* we're not actually grouping things tx-wise at this time.
*/
ASSERT(group_index == -1);
- ASSERT(ring_index < i40e->i40e_num_trqpairs);
+ ASSERT(ring_index < i40e->i40e_num_trqpairs_per_vsi);
itrq->itrq_mactxring = rh;
infop->mri_driver = (mac_ring_driver_t)itrq;
@@ -516,15 +514,16 @@ i40e_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
{
i40e_t *i40e = arg;
mac_intr_t *mintr = &infop->mri_intr;
- i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[ring_index];
+ uint_t trqpair_index;
+ i40e_trqpair_t *itrq;
- /*
- * We assert the group number and ring index to help sanity check
- * ourselves and mark that we'll need to rework this when we have
- * multiple groups.
- */
- ASSERT3S(group_index, ==, 0);
- ASSERT3S(ring_index, <, i40e->i40e_num_trqpairs);
+ /* This assumes static groups. */
+ ASSERT3S(group_index, >=, 0);
+ ASSERT3S(ring_index, >=, 0);
+ trqpair_index = (group_index * i40e->i40e_num_trqpairs_per_vsi) +
+ ring_index;
+ ASSERT3U(trqpair_index, <, i40e->i40e_num_trqpairs);
+ itrq = &i40e->i40e_trqpairs[trqpair_index];
itrq->itrq_macrxring = rh;
infop->mri_driver = (mac_ring_driver_t)itrq;
@@ -552,24 +551,22 @@ i40e_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index,
mac_group_info_t *infop, mac_group_handle_t gh)
{
i40e_t *i40e = arg;
+ i40e_rx_group_t *rxg;
if (rtype != MAC_RING_TYPE_RX)
return;
- /*
- * Note, this is a simplified view of a group, given that we only have a
- * single group and a single ring at the moment. We'll want to expand
- * upon this as we leverage more hardware functionality.
- */
- i40e->i40e_rx_group_handle = gh;
- infop->mgi_driver = (mac_group_driver_t)i40e;
+ rxg = &i40e->i40e_rx_groups[index];
+ rxg->irg_grp_hdl = gh;
+
+ infop->mgi_driver = (mac_group_driver_t)rxg;
infop->mgi_start = NULL;
infop->mgi_stop = NULL;
infop->mgi_addmac = i40e_group_add_mac;
infop->mgi_remmac = i40e_group_remove_mac;
- ASSERT(i40e->i40e_num_rx_groups == I40E_GROUP_MAX);
- infop->mgi_count = i40e->i40e_num_trqpairs;
+ ASSERT(i40e->i40e_num_rx_groups <= I40E_GROUP_MAX);
+ infop->mgi_count = i40e->i40e_num_trqpairs_per_vsi;
}
static int
@@ -732,20 +729,32 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
break;
}
+ case MAC_CAPAB_LSO: {
+ mac_capab_lso_t *cap_lso = cap_data;
+
+ if (i40e->i40e_tx_lso_enable == B_TRUE) {
+ cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
+ cap_lso->lso_basic_tcp_ipv4.lso_max = I40E_LSO_MAXLEN;
+ } else {
+ return (B_FALSE);
+ }
+ break;
+ }
+
case MAC_CAPAB_RINGS:
cap_rings = cap_data;
cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
switch (cap_rings->mr_type) {
case MAC_RING_TYPE_TX:
/*
- * Note, saying we have no rings, but some number of
- * groups indicates to MAC that it should create
- * psuedo-groups with one for each TX ring. This may not
- * be the long term behavior we want, but it'll work for
- * now.
+ * Note, saying we have no groups, but some
+ * number of rings indicates to MAC that it
+			 * should create pseudo-groups with one for
+ * each TX ring. This may not be the long term
+ * behavior we want, but it'll work for now.
*/
cap_rings->mr_gnum = 0;
- cap_rings->mr_rnum = i40e->i40e_num_trqpairs;
+ cap_rings->mr_rnum = i40e->i40e_num_trqpairs_per_vsi;
cap_rings->mr_rget = i40e_fill_tx_ring;
cap_rings->mr_gget = NULL;
cap_rings->mr_gaddring = NULL;
@@ -754,7 +763,7 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
case MAC_RING_TYPE_RX:
cap_rings->mr_rnum = i40e->i40e_num_trqpairs;
cap_rings->mr_rget = i40e_fill_rx_ring;
- cap_rings->mr_gnum = I40E_GROUP_MAX;
+ cap_rings->mr_gnum = i40e->i40e_num_rx_groups;
cap_rings->mr_gget = i40e_fill_rx_group;
cap_rings->mr_gaddring = NULL;
cap_rings->mr_gremring = NULL;
diff --git a/usr/src/uts/common/io/i40e/i40e_intr.c b/usr/src/uts/common/io/i40e/i40e_intr.c
index 51d1bbac92..170bef7ec6 100644
--- a/usr/src/uts/common/io/i40e/i40e_intr.c
+++ b/usr/src/uts/common/io/i40e/i40e_intr.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2017 Tegile Systems, Inc. All rights reserved.
*/
@@ -229,12 +229,20 @@ i40e_intr_adminq_disable(i40e_t *i40e)
I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg);
}
+/*
+ * The next two functions enable/disable the reception of interrupts
+ * on the given vector. Only vectors 1..N are programmed by these
+ * functions; vector 0 is special and handled by a different register.
+ * We must subtract one from the vector because i40e implicitly adds
+ * one to the vector value. See section 10.2.2.10.13 for more details.
+ */
static void
i40e_intr_io_enable(i40e_t *i40e, int vector)
{
uint32_t reg;
i40e_hw_t *hw = &i40e->i40e_hw_space;
+ ASSERT3S(vector, >, 0);
reg = I40E_PFINT_DYN_CTLN_INTENA_MASK |
I40E_PFINT_DYN_CTLN_CLEARPBA_MASK |
(I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT);
@@ -247,6 +255,7 @@ i40e_intr_io_disable(i40e_t *i40e, int vector)
uint32_t reg;
i40e_hw_t *hw = &i40e->i40e_hw_space;
+ ASSERT3S(vector, >, 0);
reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg);
}
@@ -375,49 +384,109 @@ i40e_intr_chip_fini(i40e_t *i40e)
}
/*
- * Enable all of the queues and set the corresponding LNKLSTN registers. Note
- * that we always enable queues as interrupt sources, even though we don't
- * enable the MSI-X interrupt vectors.
+ * Set the head of the interrupt linked list. The PFINT_LNKLSTN[N]
+ * register actually refers to the 'N + 1' interrupt vector. E.g.,
+ * PFINT_LNKLSTN[0] refers to interrupt vector 1.
+ */
+static void
+i40e_set_lnklstn(i40e_t *i40e, uint_t vector, uint_t queue)
+{
+ uint32_t reg;
+ i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+ reg = (queue << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) |
+ (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT);
+
+ I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(vector), reg);
+ DEBUGOUT2("PFINT_LNKLSTN[%u] = 0x%x", vector, reg);
+}
+
+/*
+ * Set the QINT_RQCTL[queue] register. The next queue is always the Tx
+ * queue associated with this Rx queue. Unlike PFINT_LNKLSTN, the
+ * vector should be the actual vector this queue is on -- i.e., it
+ * should be equal to itrq_rx_intrvec.
+ */
+static void
+i40e_set_rqctl(i40e_t *i40e, uint_t vector, uint_t queue)
+{
+ uint32_t reg;
+ i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+ ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_rx_intrvec);
+
+ reg = (vector << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
+ (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
+ (queue << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
+ (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) |
+ I40E_QINT_RQCTL_CAUSE_ENA_MASK;
+
+ I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg);
+ DEBUGOUT2("QINT_RQCTL[%u] = 0x%x", queue, reg);
+}
+
+/*
+ * Like i40e_set_rqctl(), but for QINT_TQCTL[queue]. The next queue is
+ * either the Rx queue of another TRQP, or EOL.
+ */
+static void
+i40e_set_tqctl(i40e_t *i40e, uint_t vector, uint_t queue, uint_t next_queue)
+{
+ uint32_t reg;
+ i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+ ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_tx_intrvec);
+
+ reg = (vector << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) |
+ (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) |
+ (next_queue << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) |
+ (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) |
+ I40E_QINT_TQCTL_CAUSE_ENA_MASK;
+
+ I40E_WRITE_REG(hw, I40E_QINT_TQCTL(queue), reg);
+ DEBUGOUT2("QINT_TQCTL[%u] = 0x%x", queue, reg);
+}
+
+/*
+ * Program the interrupt linked list. Each vector has a linked list of
+ * queues which act as event sources for that vector. When one of
+ * those sources has an event the associated interrupt vector is
+ * fired. This mapping must match the mapping found in
+ * i40e_map_intrs_to_vectors().
+ *
+ * See section 7.5.3 for more information about the configuration of
+ * the interrupt linked list.
*/
static void
i40e_intr_init_queue_msix(i40e_t *i40e)
{
- i40e_hw_t *hw = &i40e->i40e_hw_space;
- uint32_t reg;
- int i;
+ uint_t intr_count;
/*
- * Map queues to MSI-X interrupts. Queue i is mapped to vector i + 1.
- * Note that we skip the ITR logic for the moment, just to make our
- * lives as explicit and simple as possible.
+ * The 0th vector is for 'Other Interrupts' only (subject to
+ * change in the future).
*/
- for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
- i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i];
+ intr_count = i40e->i40e_intr_count - 1;
- reg = (i << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) |
- (I40E_QUEUE_TYPE_RX <<
- I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT);
- I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(i), reg);
+ for (uint_t vec = 0; vec < intr_count; vec++) {
+ boolean_t head = B_TRUE;
- reg =
- (itrq->itrq_rx_intrvec << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
- (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
- (i << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
- (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) |
- I40E_QINT_RQCTL_CAUSE_ENA_MASK;
+ for (uint_t qidx = vec; qidx < i40e->i40e_num_trqpairs;
+ qidx += intr_count) {
+ uint_t next_qidx = qidx + intr_count;
- I40E_WRITE_REG(hw, I40E_QINT_RQCTL(i), reg);
+			next_qidx = (next_qidx >= i40e->i40e_num_trqpairs) ?
+ I40E_QUEUE_TYPE_EOL : next_qidx;
- reg =
- (itrq->itrq_tx_intrvec << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) |
- (I40E_ITR_INDEX_TX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
- (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) |
- (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) |
- I40E_QINT_TQCTL_CAUSE_ENA_MASK;
+ if (head) {
+ i40e_set_lnklstn(i40e, vec, qidx);
+ head = B_FALSE;
+ }
- I40E_WRITE_REG(hw, I40E_QINT_TQCTL(i), reg);
+ i40e_set_rqctl(i40e, vec + 1, qidx);
+ i40e_set_tqctl(i40e, vec + 1, qidx, next_qidx);
+ }
}
-
}
/*
@@ -604,31 +673,26 @@ i40e_intr_adminq_work(i40e_t *i40e)
}
static void
-i40e_intr_rx_work(i40e_t *i40e, int queue)
+i40e_intr_rx_work(i40e_t *i40e, i40e_trqpair_t *itrq)
{
mblk_t *mp = NULL;
- i40e_trqpair_t *itrq;
-
- ASSERT(queue < i40e->i40e_num_trqpairs);
- itrq = &i40e->i40e_trqpairs[queue];
mutex_enter(&itrq->itrq_rx_lock);
if (!itrq->itrq_intr_poll)
mp = i40e_ring_rx(itrq, I40E_POLL_NULL);
mutex_exit(&itrq->itrq_rx_lock);
- if (mp != NULL) {
- mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp,
- itrq->itrq_rxgen);
- }
+ if (mp == NULL)
+ return;
+
+ mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp,
+ itrq->itrq_rxgen);
}
+/* ARGSUSED */
static void
-i40e_intr_tx_work(i40e_t *i40e, int queue)
+i40e_intr_tx_work(i40e_t *i40e, i40e_trqpair_t *itrq)
{
- i40e_trqpair_t *itrq;
-
- itrq = &i40e->i40e_trqpairs[queue];
i40e_tx_recycle_ring(itrq);
}
@@ -665,11 +729,17 @@ i40e_intr_other_work(i40e_t *i40e)
i40e_intr_adminq_enable(i40e);
}
+/*
+ * Handle an MSI-X interrupt. See section 7.5.1.3 for an overview of
+ * the MSI-X interrupt sequence.
+ */
uint_t
i40e_intr_msix(void *arg1, void *arg2)
{
i40e_t *i40e = (i40e_t *)arg1;
- int vector_idx = (int)(uintptr_t)arg2;
+ uint_t vector_idx = (uint_t)(uintptr_t)arg2;
+
+ ASSERT3U(vector_idx, <, i40e->i40e_intr_count);
/*
* When using MSI-X interrupts, vector 0 is always reserved for the
@@ -681,10 +751,29 @@ i40e_intr_msix(void *arg1, void *arg2)
return (DDI_INTR_CLAIMED);
}
- i40e_intr_rx_work(i40e, vector_idx - 1);
- i40e_intr_tx_work(i40e, vector_idx - 1);
- i40e_intr_io_enable(i40e, vector_idx);
+ ASSERT3U(vector_idx, >, 0);
+ /*
+ * We determine the queue indexes via simple arithmetic (as
+ * opposed to keeping explicit state like a bitmap). While
+	 * convenient, it does mean that i40e_map_intrs_to_vectors(),
+ * i40e_intr_init_queue_msix(), and this function must be
+ * modified as a unit.
+ *
+ * We subtract 1 from the vector to offset the addition we
+ * performed during i40e_map_intrs_to_vectors().
+ */
+ for (uint_t i = vector_idx - 1; i < i40e->i40e_num_trqpairs;
+ i += (i40e->i40e_intr_count - 1)) {
+ i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i];
+
+ ASSERT3U(i, <, i40e->i40e_num_trqpairs);
+ ASSERT3P(itrq, !=, NULL);
+ i40e_intr_rx_work(i40e, itrq);
+ i40e_intr_tx_work(i40e, itrq);
+ }
+
+ i40e_intr_io_enable(i40e, vector_idx);
return (DDI_INTR_CLAIMED);
}
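
Because the same stride arithmetic is now shared by
i40e_map_intrs_to_vectors(), i40e_intr_init_queue_msix(), and
i40e_intr_msix(), it can help to see it in isolation. This is an illustrative,
stand-alone sketch (plain user-land C with hypothetical names, not driver
code) that prints the queue chain each data vector services:

#include <stdio.h>

/*
 * Print the Rx/Tx queue-pair chain serviced by 0-based data vector "vec"
 * (MSI-X vector vec + 1), given "intr_count" data vectors and "ntq" queue
 * pairs, mirroring the stride the driver uses.
 */
static void
print_chain(unsigned int vec, unsigned int intr_count, unsigned int ntq)
{
	printf("MSI-X vector %u:", vec + 1);
	for (unsigned int q = vec; q < ntq; q += intr_count)
		printf(" %u", q);
	printf(" EOL\n");
}

int
main(void)
{
	/* For example: 8 queue pairs spread over 4 data vectors. */
	for (unsigned int vec = 0; vec < 4; vec++)
		print_chain(vec, 4, 8);
	return (0);
}

With these numbers the chains are 0, 4; 1, 5; 2, 6; and 3, 7, which is also
exactly the order i40e_intr_msix() walks when a given vector fires.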
@@ -693,6 +782,7 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared)
{
i40e_hw_t *hw = &i40e->i40e_hw_space;
uint32_t reg;
+ i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[0];
int ret = DDI_INTR_CLAIMED;
if (shared == B_TRUE) {
@@ -722,10 +812,10 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared)
i40e_intr_adminq_work(i40e);
if (reg & I40E_INTR_NOTX_RX_MASK)
- i40e_intr_rx_work(i40e, 0);
+ i40e_intr_rx_work(i40e, itrq);
if (reg & I40E_INTR_NOTX_TX_MASK)
- i40e_intr_tx_work(i40e, 0);
+ i40e_intr_tx_work(i40e, itrq);
done:
i40e_intr_adminq_enable(i40e);
diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c
index 54aef43424..0623aee513 100644
--- a/usr/src/uts/common/io/i40e/i40e_main.c
+++ b/usr/src/uts/common/io/i40e/i40e_main.c
@@ -11,7 +11,7 @@
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2017 Tegile Systems, Inc. All rights reserved.
*/
@@ -188,14 +188,15 @@
* VSI Management
* --------------
*
- * At this time, we currently only support a single MAC group, and thus a single
- * VSI. This VSI is considered the default VSI and should be the only one that
- * exists after a reset. Currently it is stored as the member
- * i40e_t`i40e_vsi_id. While this works for the moment and for an initial
- * driver, it's not sufficient for the longer-term path of the driver. Instead,
- * we'll want to actually have a unique i40e_vsi_t structure which is used
- * everywhere. Note that this means that every place that uses the
- * i40e_t`i40e_vsi_id will need to be refactored.
+ * The PFs share 384 VSIs. The firmware creates one VSI per PF by default.
+ * During chip start we retrieve the SEID of this VSI and assign it as the
+ * default VSI for our VEB (one VEB per PF). We then add additional VSIs to
+ * the VEB up to the determined number of rx groups: i40e_t`i40e_num_rx_groups.
+ * We currently cap this number at I40E_GROUP_MAX to (a) make sure all PFs can
+ * allocate the same number of VSIs, and (b) keep the interrupt multiplexing
+ * under control. In the future, when we improve the interrupt allocation, we
+ * may want to revisit this cap to make better use of the available VSIs. The
+ * VSI allocation and configuration can be found in i40e_chip_start().
*
* ----------------
* Structure Layout
@@ -240,7 +241,7 @@
* | i40e_hw_t --+---> Intel common code structure
* | mac_handle_t --+---> GLDv3 handle to MAC
* | ddi_periodic_t --+---> Link activity timer
- * | int (vsi_id) --+---> VSI ID, main identifier
+ * | i40e_vsi_t * --+---> Array of VSIs
* | i40e_func_rsrc_t --+---> Available hardware resources
* | i40e_switch_rsrc_t * --+---> Switch resource snapshot
* | i40e_sdu --+---> Current MTU
@@ -249,11 +250,10 @@
* | i40e_maddr_t * --+---> Array of assigned multicast MACs
* | i40e_mcast_promisccount --+---> Active multicast state
* | i40e_promisc_on --+---> Current promiscuous mode state
- * | int --+---> Number of transmit/receive pairs
+ * | uint_t --+---> Number of transmit/receive pairs
+ * | i40e_rx_group_t * --+---> Array of Rx groups
* | kstat_t * --+---> PF kstats
- * | kstat_t * --+---> VSI kstats
* | i40e_pf_stats_t --+---> PF kstat backing data
- * | i40e_vsi_stats_t --+---> VSI kstat backing data
* | i40e_trqpair_t * --+---------+
* +---------------------------+ |
* |
@@ -359,8 +359,6 @@
* While bugs have been filed to cover this future work, the following gives an
* overview of expected work:
*
- * o TSO support
- * o Multiple group support
* o DMA binding and breaking up the locking in ring recycling.
* o Enhanced detection of device errors
* o Participation in IRM
@@ -371,7 +369,7 @@
#include "i40e_sw.h"
-static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.1";
+static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.3";
/*
* The i40e_glock primarily protects the lists below and the i40e_device_t
@@ -761,15 +759,16 @@ i40e_fm_ereport(i40e_t *i40e, char *detail)
}
/*
- * Here we're trying to get the ID of the default VSI. In general, when we come
- * through and look at this shortly after attach, we expect there to only be a
- * single element present, which is the default VSI. Importantly, each PF seems
- * to not see any other devices, in part because of the simple switch mode that
- * we're using. If for some reason, we see more artifact, we'll need to revisit
- * what we're doing here.
+ * Here we're trying to set the SEID of the default VSI. In general,
+ * when we come through and look at this shortly after attach, we
+ * expect there to only be a single element present, which is the
+ * default VSI. Importantly, each PF seems to not see any other
+ * devices, in part because of the simple switch mode that we're
+ * using. If for some reason, we see more artifacts, we'll need to
+ * revisit what we're doing here.
*/
-static int
-i40e_get_vsi_id(i40e_t *i40e)
+static boolean_t
+i40e_set_def_vsi_seid(i40e_t *i40e)
{
i40e_hw_t *hw = &i40e->i40e_hw_space;
struct i40e_aqc_get_switch_config_resp *sw_config;
@@ -784,17 +783,43 @@ i40e_get_vsi_id(i40e_t *i40e)
if (rc != I40E_SUCCESS) {
i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d",
rc, hw->aq.asq_last_status);
- return (-1);
+ return (B_FALSE);
}
if (LE_16(sw_config->header.num_reported) != 1) {
i40e_error(i40e, "encountered multiple (%d) switching units "
"during attach, not proceeding",
LE_16(sw_config->header.num_reported));
+ return (B_FALSE);
+ }
+
+ I40E_DEF_VSI_SEID(i40e) = sw_config->element[0].seid;
+ return (B_TRUE);
+}
+
+/*
+ * Get the SEID of the uplink MAC.
+ */
+static int
+i40e_get_mac_seid(i40e_t *i40e)
+{
+ i40e_hw_t *hw = &i40e->i40e_hw_space;
+ struct i40e_aqc_get_switch_config_resp *sw_config;
+ uint8_t aq_buf[I40E_AQ_LARGE_BUF];
+ uint16_t next = 0;
+ int rc;
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf;
+ rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next,
+ NULL);
+ if (rc != I40E_SUCCESS) {
+ i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d",
+ rc, hw->aq.asq_last_status);
return (-1);
}
- return (sw_config->element[0].seid);
+ return (LE_16(sw_config->element[0].uplink_seid));
}
/*
@@ -1098,11 +1123,16 @@ i40e_disable_interrupts(i40e_t *i40e)
static void
i40e_free_trqpairs(i40e_t *i40e)
{
- int i;
i40e_trqpair_t *itrq;
+ if (i40e->i40e_rx_groups != NULL) {
+ kmem_free(i40e->i40e_rx_groups,
+ sizeof (i40e_rx_group_t) * i40e->i40e_num_rx_groups);
+ i40e->i40e_rx_groups = NULL;
+ }
+
if (i40e->i40e_trqpairs != NULL) {
- for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+ for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) {
itrq = &i40e->i40e_trqpairs[i];
mutex_destroy(&itrq->itrq_rx_lock);
mutex_destroy(&itrq->itrq_tx_lock);
@@ -1133,7 +1163,6 @@ i40e_free_trqpairs(i40e_t *i40e)
static boolean_t
i40e_alloc_trqpairs(i40e_t *i40e)
{
- int i;
void *mutexpri = DDI_INTR_PRI(i40e->i40e_intr_pri);
/*
@@ -1146,7 +1175,7 @@ i40e_alloc_trqpairs(i40e_t *i40e)
i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) *
i40e->i40e_num_trqpairs, KM_SLEEP);
- for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+ for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) {
i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i];
itrq->itrq_i40e = i40e;
@@ -1156,6 +1185,16 @@ i40e_alloc_trqpairs(i40e_t *i40e)
itrq->itrq_index = i;
}
+ i40e->i40e_rx_groups = kmem_zalloc(sizeof (i40e_rx_group_t) *
+ i40e->i40e_num_rx_groups, KM_SLEEP);
+
+ for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) {
+ i40e_rx_group_t *rxg = &i40e->i40e_rx_groups[i];
+
+ rxg->irg_index = i;
+ rxg->irg_i40e = i40e;
+ }
+
return (B_TRUE);
}
@@ -1164,16 +1203,19 @@ i40e_alloc_trqpairs(i40e_t *i40e)
/*
* Unless a .conf file already overrode i40e_t structure values, they will
* be 0, and need to be set in conjunction with the now-available HW report.
- *
- * However, at the moment, we cap all of these resources as we only support a
- * single receive ring and a single group.
*/
/* ARGSUSED */
static void
i40e_hw_to_instance(i40e_t *i40e, i40e_hw_t *hw)
{
- if (i40e->i40e_num_trqpairs == 0) {
- i40e->i40e_num_trqpairs = I40E_TRQPAIR_MAX;
+ if (i40e->i40e_num_trqpairs_per_vsi == 0) {
+ if (i40e_is_x722(i40e)) {
+ i40e->i40e_num_trqpairs_per_vsi =
+ I40E_722_MAX_TC_QUEUES;
+ } else {
+ i40e->i40e_num_trqpairs_per_vsi =
+ I40E_710_MAX_TC_QUEUES;
+ }
}
if (i40e->i40e_num_rx_groups == 0) {
@@ -1309,12 +1351,11 @@ i40e_common_code_init(i40e_t *i40e, i40e_hw_t *hw)
}
/*
- * We need to obtain the Virtual Station ID (VSI) before we can
- * perform other operations on the device.
+ * We need to obtain the Default Virtual Station SEID (VSI)
+ * before we can perform other operations on the device.
*/
- i40e->i40e_vsi_id = i40e_get_vsi_id(i40e);
- if (i40e->i40e_vsi_id == -1) {
- i40e_error(i40e, "failed to obtain VSI ID");
+ if (!i40e_set_def_vsi_seid(i40e)) {
+ i40e_error(i40e, "failed to obtain Default VSI SEID");
return (B_FALSE);
}
@@ -1559,6 +1600,9 @@ i40e_init_properties(i40e_t *i40e)
i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable",
B_FALSE, B_TRUE, B_TRUE);
+ i40e->i40e_tx_lso_enable = i40e_get_prop(i40e, "tx_lso_enable",
+ B_FALSE, B_TRUE, B_TRUE);
+
i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable",
B_FALSE, B_TRUE, B_TRUE);
@@ -1728,15 +1772,56 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo)
}
i40e->i40e_intr_type = 0;
+ i40e->i40e_num_rx_groups = I40E_GROUP_MAX;
+ /*
+ * We need to determine the number of queue pairs per traffic
+ * class. We only have one traffic class (TC0), so we'll base
+ * this off the number of interrupts provided. Furthermore,
+ * since we only use one traffic class, the number of queues
+ * per traffic class and per VSI are the same.
+ */
if ((intr_types & DDI_INTR_TYPE_MSIX) &&
- i40e->i40e_intr_force <= I40E_INTR_MSIX) {
- if (i40e_alloc_intr_handles(i40e, devinfo,
- DDI_INTR_TYPE_MSIX)) {
- i40e->i40e_num_trqpairs =
- MIN(i40e->i40e_intr_count - 1, max_trqpairs);
- return (B_TRUE);
- }
+ (i40e->i40e_intr_force <= I40E_INTR_MSIX) &&
+ (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSIX))) {
+ uint32_t n;
+
+ /*
+ * While we want the number of queue pairs to match
+		 * the number of interrupts, we must stay within the
+		 * bounds of the maximum number of queues per traffic
+		 * class. We subtract one from i40e_intr_count to
+		 * account for interrupt zero, which is currently
+ * restricted to admin queue commands and other
+ * interrupt causes.
+ */
+ n = MIN(i40e->i40e_intr_count - 1, max_trqpairs);
+ ASSERT3U(n, >, 0);
+
+ /*
+ * Round up to the nearest power of two to ensure that
+ * the QBASE aligns with the TC size which must be
+ * programmed as a power of two. See the queue mapping
+ * description in section 7.4.9.5.5.1.
+ *
+ * If i40e_intr_count - 1 is not a power of two then
+ * some queue pairs on the same VSI will have to share
+ * an interrupt.
+ *
+ * We may want to revisit this logic in a future where
+ * we have more interrupts and more VSIs. Otherwise,
+ * each VSI will use as many interrupts as possible.
+ * Using more QPs per VSI means better RSS for each
+ * group, but at the same time may require more
+ * sharing of interrupts across VSIs. This may be a
+ * good candidate for a .conf tunable.
+ */
+ n = 0x1 << ddi_fls(n);
+ i40e->i40e_num_trqpairs_per_vsi = n;
+ ASSERT3U(i40e->i40e_num_rx_groups, >, 0);
+ i40e->i40e_num_trqpairs = i40e->i40e_num_trqpairs_per_vsi *
+ i40e->i40e_num_rx_groups;
+ return (B_TRUE);
}
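
As a worked example of the rounding: with seven MSI-X vectors granted there
are six data vectors, so n starts as MIN(6, max_trqpairs) = 6 (assuming
max_trqpairs allows it); ddi_fls(6) is 3, so n becomes 1 << 3 = 8 queue pairs
per VSI, and for each VSI two of the six data vectors end up servicing two of
its queue pairs.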
/*
@@ -1745,6 +1830,7 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo)
* single MSI interrupt.
*/
i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX;
+ i40e->i40e_num_trqpairs_per_vsi = i40e->i40e_num_trqpairs;
i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX;
if ((intr_types & DDI_INTR_TYPE_MSI) &&
@@ -1767,24 +1853,20 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo)
static boolean_t
i40e_map_intrs_to_vectors(i40e_t *i40e)
{
- int i;
-
if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) {
return (B_TRUE);
}
/*
- * Each queue pair is mapped to a single interrupt, so transmit
- * and receive interrupts for a given queue share the same vector.
- * The number of queue pairs is one less than the number of interrupt
- * vectors and is assigned the vector one higher than its index.
- * Vector zero is reserved for the admin queue.
+ * Each queue pair is mapped to a single interrupt, so
+ * transmit and receive interrupts for a given queue share the
+ * same vector. Vector zero is reserved for the admin queue.
*/
- ASSERT(i40e->i40e_intr_count == i40e->i40e_num_trqpairs + 1);
+ for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) {
+ uint_t vector = i % (i40e->i40e_intr_count - 1);
- for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
- i40e->i40e_trqpairs[i].itrq_rx_intrvec = i + 1;
- i40e->i40e_trqpairs[i].itrq_tx_intrvec = i + 1;
+ i40e->i40e_trqpairs[i].itrq_rx_intrvec = vector + 1;
+ i40e->i40e_trqpairs[i].itrq_tx_intrvec = vector + 1;
}
return (B_TRUE);
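
For example, with i40e_intr_count = 5 (four data vectors) and eight queue
pairs, pairs 0 through 7 are assigned vectors 1, 2, 3, 4, 1, 2, 3, 4
respectively; these are the same chains that i40e_intr_init_queue_msix()
programs into the interrupt linked lists.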
@@ -1923,89 +2005,282 @@ i40e_init_macaddrs(i40e_t *i40e, i40e_hw_t *hw)
}
/*
- * Configure the hardware for the Virtual Station Interface (VSI). Currently
- * we only support one, but in the future we could instantiate more than one
- * per attach-point.
+ * Set the properties which have common values across all the VSIs.
+ * Consult the "Add VSI" command section (7.4.9.5.5.1) for a
+ * complete description of these properties.
*/
-static boolean_t
-i40e_config_vsi(i40e_t *i40e, i40e_hw_t *hw)
+static void
+i40e_set_shared_vsi_props(i40e_t *i40e,
+ struct i40e_aqc_vsi_properties_data *info, uint_t vsi_idx)
{
- struct i40e_vsi_context context;
- int err, tc_queues;
+ uint_t tc_queues;
+ uint16_t vsi_qp_base;
- bzero(&context, sizeof (struct i40e_vsi_context));
- context.seid = i40e->i40e_vsi_id;
- context.pf_num = hw->pf_id;
- err = i40e_aq_get_vsi_params(hw, &context, NULL);
- if (err != I40E_SUCCESS) {
- i40e_error(i40e, "get VSI params failed with %d", err);
- return (B_FALSE);
- }
-
- i40e->i40e_vsi_num = context.vsi_number;
+ /*
+ * It's important that we use bitwise-OR here; callers to this
+	 * function may have enabled other sections already.
+ */
+ info->valid_sections |= LE_16(I40E_AQ_VSI_PROP_QUEUE_MAP_VALID |
+ I40E_AQ_VSI_PROP_VLAN_VALID);
/*
- * Set the queue and traffic class bits. Keep it simple for now.
+ * Calculate the starting QP index for this VSI. This base is
+ * relative to the PF queue space; so a value of 0 for PF#1
+ * represents the absolute index PFLAN_QALLOC_FIRSTQ for PF#1.
*/
- context.info.valid_sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID;
- context.info.mapping_flags = I40E_AQ_VSI_QUE_MAP_CONTIG;
- context.info.queue_mapping[0] = I40E_ASSIGN_ALL_QUEUES;
+ vsi_qp_base = vsi_idx * i40e->i40e_num_trqpairs_per_vsi;
+ info->mapping_flags = LE_16(I40E_AQ_VSI_QUE_MAP_CONTIG);
+ info->queue_mapping[0] =
+ LE_16((vsi_qp_base << I40E_AQ_VSI_QUEUE_SHIFT) &
+ I40E_AQ_VSI_QUEUE_MASK);
/*
- * tc_queues determines the size of the traffic class, where the size is
- * 2^^tc_queues to a maximum of 64 for the X710 and 128 for the X722.
+ * tc_queues determines the size of the traffic class, where
+ * the size is 2^^tc_queues to a maximum of 64 for the X710
+ * and 128 for the X722.
*
* Some examples:
- * i40e_num_trqpairs == 1 => tc_queues = 0, 2^^0 = 1.
- * i40e_num_trqpairs == 7 => tc_queues = 3, 2^^3 = 8.
- * i40e_num_trqpairs == 8 => tc_queues = 3, 2^^3 = 8.
- * i40e_num_trqpairs == 9 => tc_queues = 4, 2^^4 = 16.
- * i40e_num_trqpairs == 17 => tc_queues = 5, 2^^5 = 32.
- * i40e_num_trqpairs == 64 => tc_queues = 6, 2^^6 = 64.
+ * i40e_num_trqpairs_per_vsi == 1 => tc_queues = 0, 2^^0 = 1.
+ * i40e_num_trqpairs_per_vsi == 7 => tc_queues = 3, 2^^3 = 8.
+ * i40e_num_trqpairs_per_vsi == 8 => tc_queues = 3, 2^^3 = 8.
+ * i40e_num_trqpairs_per_vsi == 9 => tc_queues = 4, 2^^4 = 16.
+ * i40e_num_trqpairs_per_vsi == 17 => tc_queues = 5, 2^^5 = 32.
+ * i40e_num_trqpairs_per_vsi == 64 => tc_queues = 6, 2^^6 = 64.
*/
- tc_queues = ddi_fls(i40e->i40e_num_trqpairs - 1);
+ tc_queues = ddi_fls(i40e->i40e_num_trqpairs_per_vsi - 1);
- context.info.tc_mapping[0] = ((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) &
- I40E_AQ_VSI_TC_QUE_OFFSET_MASK) |
- ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) &
- I40E_AQ_VSI_TC_QUE_NUMBER_MASK);
+ /*
+ * The TC queue mapping is in relation to the VSI queue space.
+ * Since we are only using one traffic class (TC0) we always
+ * start at queue offset 0.
+ */
+ info->tc_mapping[0] =
+ LE_16(((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) &
+ I40E_AQ_VSI_TC_QUE_OFFSET_MASK) |
+ ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) &
+ I40E_AQ_VSI_TC_QUE_NUMBER_MASK));
- context.info.valid_sections |= I40E_AQ_VSI_PROP_VLAN_VALID;
- context.info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL |
+ /*
+ * I40E_AQ_VSI_PVLAN_MODE_ALL ("VLAN driver insertion mode")
+ *
+ * Allow tagged and untagged packets to be sent to this
+ * VSI from the host.
+ *
+ * I40E_AQ_VSI_PVLAN_EMOD_NOTHING ("VLAN and UP expose mode")
+ *
+ * Leave the tag on the frame and place no VLAN
+ * information in the descriptor. We want this mode
+ * because our MAC layer will take care of the VLAN tag,
+ * if there is one.
+ */
+ info->port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL |
I40E_AQ_VSI_PVLAN_EMOD_NOTHING;
+}
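
To make the queue mapping concrete: with eight queue pairs per VSI, the VSI at
index 2 gets vsi_qp_base = 2 * 8 = 16 and tc_queues = ddi_fls(8 - 1) = 3, so
its TC0 describes 2^3 = 8 queues starting at PF-relative queue 16.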
- context.flags = LE16_TO_CPU(I40E_AQ_VSI_TYPE_PF);
+/*
+ * Delete the VSI at this index, if one exists. We assume there is no
+ * action we can take if this command fails but to log the failure.
+ */
+static void
+i40e_delete_vsi(i40e_t *i40e, uint_t idx)
+{
+ i40e_hw_t *hw = &i40e->i40e_hw_space;
+ uint16_t seid = i40e->i40e_vsis[idx].iv_seid;
- i40e->i40e_vsi_stat_id = LE16_TO_CPU(context.info.stat_counter_idx);
- if (i40e_stat_vsi_init(i40e) == B_FALSE)
- return (B_FALSE);
+ if (seid != 0) {
+ int rc;
- err = i40e_aq_update_vsi_params(hw, &context, NULL);
- if (err != I40E_SUCCESS) {
- i40e_error(i40e, "Update VSI params failed with %d", err);
+ rc = i40e_aq_delete_element(hw, seid, NULL);
+
+ if (rc != I40E_SUCCESS) {
+ i40e_error(i40e, "Failed to delete VSI %d: %d",
+			i40e_error(i40e, "Failed to delete VSI %u: %d: %d",
+			    seid, rc, hw->aq.asq_last_status);
+
+ i40e->i40e_vsis[idx].iv_seid = 0;
+ }
+}
+
+/*
+ * Add a new VSI.
+ */
+static boolean_t
+i40e_add_vsi(i40e_t *i40e, i40e_hw_t *hw, uint_t idx)
+{
+ struct i40e_vsi_context ctx;
+ i40e_rx_group_t *rxg;
+ int rc;
+
+ /*
+ * The default VSI is created by the controller. This function
+	 * creates new, non-default VSIs only.
+ */
+ ASSERT3U(idx, !=, 0);
+
+ bzero(&ctx, sizeof (struct i40e_vsi_context));
+ ctx.uplink_seid = i40e->i40e_veb_seid;
+ ctx.pf_num = hw->pf_id;
+ ctx.flags = I40E_AQ_VSI_TYPE_PF;
+ ctx.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL;
+ i40e_set_shared_vsi_props(i40e, &ctx.info, idx);
+
+ rc = i40e_aq_add_vsi(hw, &ctx, NULL);
+ if (rc != I40E_SUCCESS) {
+ i40e_error(i40e, "i40e_aq_add_vsi() failed %d: %d", rc,
+ hw->aq.asq_last_status);
return (B_FALSE);
}
+ rxg = &i40e->i40e_rx_groups[idx];
+ rxg->irg_vsi_seid = ctx.seid;
+ i40e->i40e_vsis[idx].iv_number = ctx.vsi_number;
+ i40e->i40e_vsis[idx].iv_seid = ctx.seid;
+ i40e->i40e_vsis[idx].iv_stats_id = LE_16(ctx.info.stat_counter_idx);
+
+ if (i40e_stat_vsi_init(i40e, idx) == B_FALSE)
+ return (B_FALSE);
return (B_TRUE);
}
/*
- * Configure the RSS key. For the X710 controller family, this is set on a
- * per-PF basis via registers. For the X722, this is done on a per-VSI basis
- * through the admin queue.
+ * Configure the hardware for the Default Virtual Station Interface (VSI).
*/
static boolean_t
-i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw)
+i40e_config_def_vsi(i40e_t *i40e, i40e_hw_t *hw)
{
- uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1];
+ struct i40e_vsi_context ctx;
+ i40e_rx_group_t *def_rxg;
+ int err;
+ struct i40e_aqc_remove_macvlan_element_data filt;
- (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed));
+ bzero(&ctx, sizeof (struct i40e_vsi_context));
+ ctx.seid = I40E_DEF_VSI_SEID(i40e);
+ ctx.pf_num = hw->pf_id;
+ err = i40e_aq_get_vsi_params(hw, &ctx, NULL);
+ if (err != I40E_SUCCESS) {
+ i40e_error(i40e, "get VSI params failed with %d", err);
+ return (B_FALSE);
+ }
- if (i40e_is_x722(i40e)) {
+ ctx.info.valid_sections = 0;
+ i40e->i40e_vsis[0].iv_number = ctx.vsi_number;
+ i40e->i40e_vsis[0].iv_stats_id = LE_16(ctx.info.stat_counter_idx);
+ if (i40e_stat_vsi_init(i40e, 0) == B_FALSE)
+ return (B_FALSE);
+
+ i40e_set_shared_vsi_props(i40e, &ctx.info, I40E_DEF_VSI_IDX);
+
+ err = i40e_aq_update_vsi_params(hw, &ctx, NULL);
+ if (err != I40E_SUCCESS) {
+ i40e_error(i40e, "Update VSI params failed with %d", err);
+ return (B_FALSE);
+ }
+
+ def_rxg = &i40e->i40e_rx_groups[0];
+ def_rxg->irg_vsi_seid = I40E_DEF_VSI_SEID(i40e);
+
+ /*
+ * We have seen three different behaviors in regards to the
+ * Default VSI and its implicit L2 MAC+VLAN filter.
+ *
+ * 1. It has an implicit filter for the factory MAC address
+ * and this filter counts against 'ifr_nmacfilt_used'.
+ *
+ * 2. It has an implicit filter for the factory MAC address
+ * and this filter DOES NOT count against 'ifr_nmacfilt_used'.
+ *
+ * 3. It DOES NOT have an implicit filter.
+ *
+ * All three of these cases are accounted for below. If we
+ * fail to remove the L2 filter (ENOENT) then we assume there
+ * wasn't one. Otherwise, if we successfully remove the
+ * filter, we make sure to update the 'ifr_nmacfilt_used'
+ * count accordingly.
+ *
+ * We remove this filter to prevent duplicate delivery of
+ * packets destined for the primary MAC address as DLS will
+ * create the same filter on a non-default VSI for the primary
+ * MAC client.
+ *
+ * If you change the following code please test it across as
+ * many X700 series controllers and firmware revisions as you
+ * can.
+ */
+ bzero(&filt, sizeof (filt));
+ bcopy(hw->mac.port_addr, filt.mac_addr, ETHERADDRL);
+ filt.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH;
+ filt.vlan_tag = 0;
+
+ ASSERT3U(i40e->i40e_resources.ifr_nmacfilt_used, <=, 1);
+ i40e_log(i40e, "Num L2 filters: %u",
+ i40e->i40e_resources.ifr_nmacfilt_used);
+
+ err = i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1,
+ NULL);
+ if (err == I40E_SUCCESS) {
+ i40e_log(i40e,
+ "Removed L2 filter from Default VSI with SEID %u",
+ I40E_DEF_VSI_SEID(i40e));
+ } else if (hw->aq.asq_last_status == ENOENT) {
+ i40e_log(i40e,
+ "No L2 filter for Default VSI with SEID %u",
+ I40E_DEF_VSI_SEID(i40e));
+ } else {
+ i40e_error(i40e, "Failed to remove L2 filter from"
+ " Default VSI with SEID %u: %d (%d)",
+ I40E_DEF_VSI_SEID(i40e), err, hw->aq.asq_last_status);
+
+ return (B_FALSE);
+ }
+
+ /*
+ * As mentioned above, the controller created an implicit L2
+ * filter for the primary MAC. We want to remove both the
+ * filter and decrement the filter count. However, not all
+ * controllers count this implicit filter against the total
+ * MAC filter count. So here we are making sure it is either
+ * one or zero. If it is one, then we know it is for the
+ * implicit filter and we should decrement since we just
+ * removed the filter above. If it is zero then we know the
+	 * removed the filter above. If it is zero then we know this
+	 * controller does not count the implicit filter, and it
+ * But if it is neither, then we have never seen a controller
+ * like this before and we should fail to attach.
+ *
+ * It is unfortunate that this code must exist but the
+ * behavior of this implicit L2 filter and its corresponding
+	 * count were discovered through empirical testing. The
+ * programming manuals hint at this filter but do not
+ * explicitly call out the exact behavior.
+ */
+ if (i40e->i40e_resources.ifr_nmacfilt_used == 1) {
+ i40e->i40e_resources.ifr_nmacfilt_used--;
+ } else {
+ if (i40e->i40e_resources.ifr_nmacfilt_used != 0) {
+ i40e_error(i40e, "Unexpected L2 filter count: %u"
+ " (expected 0)",
+ i40e->i40e_resources.ifr_nmacfilt_used);
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static boolean_t
+i40e_config_rss_key_x722(i40e_t *i40e, i40e_hw_t *hw)
+{
+ for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) {
+ uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1];
struct i40e_aqc_get_set_rss_key_data key;
- const char *u8seed = (char *)seed;
+ const char *u8seed;
enum i40e_status_code status;
+ uint16_t vsi_number = i40e->i40e_vsis[i].iv_number;
+
+ (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed));
+ u8seed = (char *)seed;
CTASSERT(sizeof (key) >= (sizeof (key.standard_rss_key) +
sizeof (key.extended_hash_key)));
@@ -2015,14 +2290,35 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw)
bcopy(&u8seed[sizeof (key.standard_rss_key)],
key.extended_hash_key, sizeof (key.extended_hash_key));
- status = i40e_aq_set_rss_key(hw, i40e->i40e_vsi_num, &key);
+ ASSERT3U(vsi_number, !=, 0);
+ status = i40e_aq_set_rss_key(hw, vsi_number, &key);
+
if (status != I40E_SUCCESS) {
- i40e_error(i40e, "failed to set rss key: %d", status);
+ i40e_error(i40e, "failed to set RSS key for VSI %u: %d",
+ vsi_number, status);
return (B_FALSE);
}
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Configure the RSS key. For the X710 controller family, this is set on a
+ * per-PF basis via registers. For the X722, this is done on a per-VSI basis
+ * through the admin queue.
+ */
+static boolean_t
+i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw)
+{
+ if (i40e_is_x722(i40e)) {
+ if (!i40e_config_rss_key_x722(i40e, hw))
+ return (B_FALSE);
} else {
- uint_t i;
- for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++)
+ uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1];
+
+ (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed));
+ for (uint_t i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++)
i40e_write_rx_ctl(hw, I40E_PFQF_HKEY(i), seed[i]);
}
@@ -2034,11 +2330,12 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw)
* family, with the X722 using a known 7-bit width. On the X710 controller, this
* is programmed through its control registers where as on the X722 this is
* configured through the admin queue. Also of note, the X722 allows the LUT to
- * be set on a per-PF or VSI basis. At this time, as we only have a single VSI,
- * we use the PF setting as it is the primary VSI.
+ * be set on a per-PF or VSI basis. At this time we use the PF setting. If we
+ * decide to use the per-VSI LUT in the future, then we will need to modify the
+ * i40e_add_vsi() function to set the RSS LUT bits in the queueing section.
*
* We populate the LUT in a round robin fashion with the rx queue indices from 0
- * to i40e_num_trqpairs - 1.
+ * to i40e_num_trqpairs_per_vsi - 1.
*/
static boolean_t
i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw)
@@ -2068,15 +2365,20 @@ i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw)
lut_mask = (1 << hw->func_caps.rss_table_entry_width) - 1;
}
- for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++)
- ((uint8_t *)hlut)[i] = (i % i40e->i40e_num_trqpairs) & lut_mask;
+ for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) {
+ ((uint8_t *)hlut)[i] =
+ (i % i40e->i40e_num_trqpairs_per_vsi) & lut_mask;
+ }
if (i40e_is_x722(i40e)) {
enum i40e_status_code status;
- status = i40e_aq_set_rss_lut(hw, i40e->i40e_vsi_num, B_TRUE,
- (uint8_t *)hlut, I40E_HLUT_TABLE_SIZE);
+
+ status = i40e_aq_set_rss_lut(hw, 0, B_TRUE, (uint8_t *)hlut,
+ I40E_HLUT_TABLE_SIZE);
+
if (status != I40E_SUCCESS) {
- i40e_error(i40e, "failed to set RSS LUT: %d", status);
+ i40e_error(i40e, "failed to set RSS LUT %d: %d",
+ status, hw->aq.asq_last_status);
goto out;
}
} else {
@@ -2152,6 +2454,7 @@ i40e_chip_start(i40e_t *i40e)
i40e_hw_t *hw = &i40e->i40e_hw_space;
struct i40e_filter_control_settings filter;
int rc;
+ uint8_t err;
if (((hw->aq.fw_maj_ver == 4) && (hw->aq.fw_min_ver < 33)) ||
(hw->aq.fw_maj_ver < 4)) {
@@ -2167,6 +2470,15 @@ i40e_chip_start(i40e_t *i40e)
/* Determine hardware state */
i40e_get_hw_state(i40e, hw);
+ /* For now, we always disable Ethernet Flow Control. */
+ hw->fc.requested_mode = I40E_FC_NONE;
+ rc = i40e_set_fc(hw, &err, B_TRUE);
+ if (rc != I40E_SUCCESS) {
+ i40e_error(i40e, "Setting flow control failed, returned %d"
+ " with error: 0x%x", rc, err);
+ return (B_FALSE);
+ }
+
/* Initialize mac addresses. */
i40e_init_macaddrs(i40e, hw);
@@ -2188,8 +2500,34 @@ i40e_chip_start(i40e_t *i40e)
i40e_intr_chip_init(i40e);
- if (!i40e_config_vsi(i40e, hw))
+ rc = i40e_get_mac_seid(i40e);
+ if (rc == -1) {
+ i40e_error(i40e, "failed to obtain MAC Uplink SEID");
+ return (B_FALSE);
+ }
+ i40e->i40e_mac_seid = (uint16_t)rc;
+
+ /*
+ * Create a VEB in order to support multiple VSIs. Each VSI
+ * functions as a MAC group. This call sets the PF's MAC as
+ * the uplink port and the PF's default VSI as the default
+ * downlink port.
+ */
+ rc = i40e_aq_add_veb(hw, i40e->i40e_mac_seid, I40E_DEF_VSI_SEID(i40e),
+ 0x1, B_TRUE, &i40e->i40e_veb_seid, B_FALSE, NULL);
+ if (rc != I40E_SUCCESS) {
+ i40e_error(i40e, "i40e_aq_add_veb() failed %d: %d", rc,
+ hw->aq.asq_last_status);
return (B_FALSE);
+ }
+
+ if (!i40e_config_def_vsi(i40e, hw))
+ return (B_FALSE);
+
+ for (uint_t i = 1; i < i40e->i40e_num_rx_groups; i++) {
+ if (!i40e_add_vsi(i40e, hw, i))
+ return (B_FALSE);
+ }
if (!i40e_config_rss(i40e, hw))
return (B_FALSE);
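Taken together, the hunks above change i40e_chip_start() from configuring a single VSI to building a small switch. A condensed sketch of the new ordering (error handling elided; illustrative only, not a drop-in replacement):

/* Condensed bring-up order from the hunks above (illustrative). */
rc = i40e_get_mac_seid(i40e);                    /* MAC uplink SEID */
i40e->i40e_mac_seid = (uint16_t)rc;
rc = i40e_aq_add_veb(hw, i40e->i40e_mac_seid,    /* VEB atop the MAC */
    I40E_DEF_VSI_SEID(i40e), 0x1, B_TRUE,
    &i40e->i40e_veb_seid, B_FALSE, NULL);
(void) i40e_config_def_vsi(i40e, hw);            /* default VSI (group 0) */
for (uint_t i = 1; i < i40e->i40e_num_rx_groups; i++)
        (void) i40e_add_vsi(i40e, hw, i);        /* one VSI per RX group */
(void) i40e_config_rss(i40e, hw);                /* RSS key and LUT */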
@@ -2549,7 +2887,7 @@ i40e_setup_tx_hmc(i40e_trqpair_t *itrq)
* assigned to traffic class zero, because we don't actually use them.
*/
bzero(&context, sizeof (struct i40e_vsi_context));
- context.seid = i40e->i40e_vsi_id;
+ context.seid = I40E_DEF_VSI_SEID(i40e);
context.pf_num = hw->pf_id;
err = i40e_aq_get_vsi_params(hw, &context, NULL);
if (err != I40E_SUCCESS) {
@@ -2653,7 +2991,8 @@ i40e_setup_tx_rings(i40e_t *i40e)
void
i40e_stop(i40e_t *i40e, boolean_t free_allocations)
{
- int i;
+ uint_t i;
+ i40e_hw_t *hw = &i40e->i40e_hw_space;
ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
@@ -2689,6 +3028,27 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations)
delay(50 * drv_usectohz(1000));
+ /*
+ * We don't delete the default VSI because it replaces the VEB
+ * after VEB deletion (see the "Delete Element" section).
+ * Furthermore, since the default VSI is provided by the
+ * firmware, we never attempt to delete it.
+ */
+ for (i = 1; i < i40e->i40e_num_rx_groups; i++) {
+ i40e_delete_vsi(i40e, i);
+ }
+
+ if (i40e->i40e_veb_seid != 0) {
+ int rc = i40e_aq_delete_element(hw, i40e->i40e_veb_seid, NULL);
+
+ if (rc != I40E_SUCCESS) {
+ i40e_error(i40e, "Failed to delete VEB %d: %d", rc,
+ hw->aq.asq_last_status);
+ }
+
+ i40e->i40e_veb_seid = 0;
+ }
+
i40e_intr_chip_fini(i40e);
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
@@ -2718,7 +3078,9 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations)
mutex_exit(&i40e->i40e_trqpairs[i].itrq_tx_lock);
}
- i40e_stat_vsi_fini(i40e);
+ for (i = 0; i < i40e->i40e_num_rx_groups; i++) {
+ i40e_stat_vsi_fini(i40e, i);
+ }
i40e->i40e_link_speed = 0;
i40e->i40e_link_duplex = 0;
@@ -2783,7 +3145,8 @@ i40e_start(i40e_t *i40e, boolean_t alloc)
* Enable broadcast traffic; however, do not enable multicast traffic.
* That's handled exclusively through MAC's mc_multicst routines.
*/
- err = i40e_aq_set_vsi_broadcast(hw, i40e->i40e_vsi_id, B_TRUE, NULL);
+ err = i40e_aq_set_vsi_broadcast(hw, I40E_DEF_VSI_SEID(i40e), B_TRUE,
+ NULL);
if (err != I40E_SUCCESS) {
i40e_error(i40e, "failed to set default VSI: %d", err);
rc = B_FALSE;
diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c
index 7a4f0faedd..e40c9f2c53 100644
--- a/usr/src/uts/common/io/i40e/i40e_stats.c
+++ b/usr/src/uts/common/io/i40e/i40e_stats.c
@@ -11,7 +11,7 @@
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include "i40e_sw.h"
@@ -69,12 +69,7 @@
* ---------------------
*
* The hardware keeps statistics at each physical function/MAC (PF) and it keeps
- * statistics on each virtual station interface (VSI). Currently we only use one
- * VSI per PF (see the i40e_main.c theory statement). The hardware has a limited
- * number of statistics units available. While every PF is guaranteed to have a
- * statistics unit, it is possible that we will run out for a given VSI. We'll
- * have to figure out an appropriate strategy here when we end up supporting
- * multiple VSIs.
+ * statistics on each virtual station interface (VSI).
*
* The hardware keeps these statistics as 32-bit and 48-bit counters. We are
* required to read them and then compute the differences between them. The
@@ -100,10 +95,10 @@
* data.
*
* The pf kstats data is stored in the i40e_t`i40e_pf_kstat. It is backed by the
- * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstat is in
- * i40e_t`i40e_vsi_kstat and the data is backed in the i40e_t`i40e_vsi_stat. All
- * of this data is protected by the i40e_stat_lock, which should be taken last,
- * when acquiring locks.
+ * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstats are in
+ * i40e_t`i40e_vsis[idx].iv_kstats and the data is backed in the
+ * i40e_t`i40e_vsis[idx].iv_stats. All of this data is protected by the
+ * i40e_stat_lock, which should be taken last, when acquiring locks.
*/
static void
@@ -169,15 +164,15 @@ i40e_stat_get_uint32(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat,
}
static void
-i40e_stat_vsi_update(i40e_t *i40e, boolean_t init)
+i40e_stat_vsi_update(i40e_t *i40e, uint_t idx, boolean_t init)
{
i40e_vsi_stats_t *ivs;
i40e_vsi_kstats_t *ivk;
- int id = i40e->i40e_vsi_stat_id;
+ uint16_t id = i40e->i40e_vsis[idx].iv_stats_id;
- ASSERT(i40e->i40e_vsi_kstat != NULL);
- ivs = &i40e->i40e_vsi_stat;
- ivk = i40e->i40e_vsi_kstat->ks_data;
+ ASSERT3P(i40e->i40e_vsis[idx].iv_kstats, !=, NULL);
+ ivs = &i40e->i40e_vsis[idx].iv_stats;
+ ivk = i40e->i40e_vsis[idx].iv_kstats->ks_data;
mutex_enter(&i40e->i40e_stat_lock);
@@ -231,39 +226,41 @@ i40e_stat_vsi_kstat_update(kstat_t *ksp, int rw)
return (EACCES);
i40e = ksp->ks_private;
- i40e_stat_vsi_update(i40e, B_FALSE);
+ for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++)
+ i40e_stat_vsi_update(i40e, i, B_FALSE);
+
return (0);
}
void
-i40e_stat_vsi_fini(i40e_t *i40e)
+i40e_stat_vsi_fini(i40e_t *i40e, uint_t idx)
{
- if (i40e->i40e_vsi_kstat != NULL) {
- kstat_delete(i40e->i40e_vsi_kstat);
- i40e->i40e_vsi_kstat = NULL;
+ if (i40e->i40e_vsis[idx].iv_kstats != NULL) {
+ kstat_delete(i40e->i40e_vsis[idx].iv_kstats);
+ i40e->i40e_vsis[idx].iv_kstats = NULL;
}
}
boolean_t
-i40e_stat_vsi_init(i40e_t *i40e)
+i40e_stat_vsi_init(i40e_t *i40e, uint_t idx)
{
kstat_t *ksp;
i40e_vsi_kstats_t *ivk;
char buf[64];
+ uint16_t vsi_id = i40e->i40e_vsis[idx].iv_seid;
- (void) snprintf(buf, sizeof (buf), "vsi_%d", i40e->i40e_vsi_id);
+ (void) snprintf(buf, sizeof (buf), "vsi_%u", vsi_id);
ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip),
buf, "net", KSTAT_TYPE_NAMED,
sizeof (i40e_vsi_kstats_t) / sizeof (kstat_named_t), 0);
if (ksp == NULL) {
- i40e_error(i40e, "Failed to create kstats for VSI %d",
- i40e->i40e_vsi_id);
+ i40e_error(i40e, "Failed to create kstats for VSI %u", vsi_id);
return (B_FALSE);
}
- i40e->i40e_vsi_kstat = ksp;
+ i40e->i40e_vsis[idx].iv_kstats = ksp;
ivk = ksp->ks_data;
ksp->ks_update = i40e_stat_vsi_kstat_update;
ksp->ks_private = i40e;
@@ -291,9 +288,9 @@ i40e_stat_vsi_init(i40e_t *i40e)
kstat_named_init(&ivk->ivk_tx_errors, "tx_errors",
KSTAT_DATA_UINT64);
- bzero(&i40e->i40e_vsi_stat, sizeof (i40e_vsi_stats_t));
- i40e_stat_vsi_update(i40e, B_TRUE);
- kstat_install(i40e->i40e_vsi_kstat);
+ bzero(&i40e->i40e_vsis[idx].iv_stats, sizeof (i40e_vsi_stats_t));
+ i40e_stat_vsi_update(i40e, idx, B_TRUE);
+ kstat_install(i40e->i40e_vsis[idx].iv_kstats);
return (B_TRUE);
}
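As a concrete illustration of the naming above (the SEID value is hypothetical):

/* Hypothetical values showing the kstat name construction. */
char buf[64];
uint16_t vsi_id = 390;                          /* iv_seid, hypothetical */
(void) snprintf(buf, sizeof (buf), "vsi_%u", vsi_id);
/* buf is "vsi_390"; the kstat appears as i40e:<instance>:vsi_390, class "net". */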
@@ -670,7 +667,12 @@ i40e_stat_pf_init(i40e_t *i40e)
void
i40e_stats_fini(i40e_t *i40e)
{
- ASSERT(i40e->i40e_vsi_kstat == NULL);
+#ifdef DEBUG
+ for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) {
+ ASSERT3P(i40e->i40e_vsis[i].iv_kstats, ==, NULL);
+ }
+#endif
+
if (i40e->i40e_pf_kstat != NULL) {
kstat_delete(i40e->i40e_pf_kstat);
i40e->i40e_pf_kstat = NULL;
@@ -1230,6 +1232,12 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq)
kstat_named_init(&tsp->itxs_recycled, "tx_recycled",
KSTAT_DATA_UINT64);
tsp->itxs_recycled.value.ui64 = 0;
+ kstat_named_init(&tsp->itxs_force_copy, "tx_force_copy",
+ KSTAT_DATA_UINT64);
+ tsp->itxs_force_copy.value.ui64 = 0;
+ kstat_named_init(&tsp->itxs_tso_force_copy, "tx_tso_force_copy",
+ KSTAT_DATA_UINT64);
+ tsp->itxs_tso_force_copy.value.ui64 = 0;
kstat_named_init(&tsp->itxs_hck_meoifail, "tx_hck_meoifail",
KSTAT_DATA_UINT64);
@@ -1249,6 +1257,15 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq)
kstat_named_init(&tsp->itxs_hck_badl4, "tx_hck_badl4",
KSTAT_DATA_UINT64);
tsp->itxs_hck_badl4.value.ui64 = 0;
+ kstat_named_init(&tsp->itxs_lso_nohck, "tx_lso_nohck",
+ KSTAT_DATA_UINT64);
+ tsp->itxs_lso_nohck.value.ui64 = 0;
+ kstat_named_init(&tsp->itxs_bind_fails, "tx_bind_fails",
+ KSTAT_DATA_UINT64);
+ tsp->itxs_bind_fails.value.ui64 = 0;
+ kstat_named_init(&tsp->itxs_tx_short, "tx_short",
+ KSTAT_DATA_UINT64);
+ tsp->itxs_tx_short.value.ui64 = 0;
kstat_named_init(&tsp->itxs_err_notcb, "tx_err_notcb",
KSTAT_DATA_UINT64);
tsp->itxs_err_notcb.value.ui64 = 0;
diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h
index 78aced0144..e7b64c2160 100644
--- a/usr/src/uts/common/io/i40e/i40e_sw.h
+++ b/usr/src/uts/common/io/i40e/i40e_sw.h
@@ -11,7 +11,7 @@
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2017 Tegile Systems, Inc. All rights reserved.
*/
@@ -152,9 +152,10 @@ typedef enum i40e_itr_index {
} i40e_itr_index_t;
/*
- * Table 1-5 of the PRM notes that LSO supports up to 256 KB.
+ * The hardware claims to support LSO up to 256 KB, but the 16-bit total
+ * length field of the IP header limits a non-jumbo datagram to 64 KB, so
+ * we cap it there.
*/
-#define I40E_LSO_MAXLEN (256 * 1024)
+#define I40E_LSO_MAXLEN (64 * 1024)
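One way to see where 64 KB comes from (an illustration, not part of the patch): the IP total-length field is 16 bits wide, so the largest datagram a non-jumbo IP header can describe is UINT16_MAX bytes, which is exactly I40E_LSO_MAXLEN - 1:

/* Illustrative compile-time check; not in the driver source. */
CTASSERT(I40E_LSO_MAXLEN - 1 == UINT16_MAX);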
#define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */
#define I40E_DRAIN_RX_WAIT (500 * MILLISEC) /* In us */
@@ -173,13 +174,22 @@ typedef enum i40e_itr_index {
#define I40E_BUF_IPHDR_ALIGNMENT 2
/*
- * The XL710 controller has a limit of eight buffers being allowed to be used
- * for the transmission of a single frame. This is defined in 8.4.1 - Transmit
+ * The XL710 controller has a total of eight buffers available for the
+ * transmission of any single frame. This is defined in 8.4.1 - Transmit
* Packet in System Memory.
*/
#define I40E_TX_MAX_COOKIE 8
/*
+ * An LSO frame can be as large as 64KB, so we allow a DMA bind to span more
+ * cookies than a non-LSO frame. The key is to select a value such that
+ * once the HW has chunked up the LSO frame into MSS-sized segments, no
+ * single segment spans more than 8 cookies (see the comments for
+ * I40E_TX_MAX_COOKIE).
+ */
+#define I40E_TX_LSO_MAX_COOKIE 32
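A rough sanity check on the value 32, assuming 4 KB pages (an assumption; the manuals do not spell this out): a fully bound 64 KB LSO payload touches at most 16 pages, plus one extra cookie for an unaligned start, while a typical 1460-byte MSS segment crosses at most one page boundary and so spans at most 2 cookies, far below the 8-cookie per-segment limit.

/* Illustrative worst-case cookie arithmetic, assuming 4 KB pages. */
#define EX_PAGE_SIZE 4096
#define EX_LSO_BYTES (64 * 1024)
/* 16 full pages + 1 for an unaligned start = 17 cookies, under the cap. */
CTASSERT((EX_LSO_BYTES / EX_PAGE_SIZE) + 1 <= I40E_TX_LSO_MAX_COOKIE);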
+
+/*
* Sizing to determine the amount of available descriptors at which we'll
* consider ourselves blocked. Also, when we have these available, we'll then
* consider ourselves available to transmit to MAC again. Strictly speaking, the
@@ -203,6 +213,12 @@ typedef enum i40e_itr_index {
#define I40E_MAX_TX_DMA_THRESH INT32_MAX
/*
+ * The max size of each individual TX buffer is 16KB - 1 (see Table 8-17).
+ */
+#define I40E_MAX_TX_BUFSZ 0x0000000000003FFFull
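A quick check of the constant (illustrative only): 0x3FFF is exactly 16 KB minus one, matching the table's per-buffer limit.

/* Illustrative: the hex constant equals 16 KB - 1. */
CTASSERT(I40E_MAX_TX_BUFSZ == (16 * 1024) - 1);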
+
+/*
* Resource sizing counts. There are various aspects of hardware where we may
* have some variable number of elements that we need to handle. Such as the
* hardware capabilities and switch capacities. We cannot know a priori how many
@@ -240,21 +256,6 @@ typedef enum i40e_itr_index {
#define I40E_HMC_TX_TPH_DISABLE 0
/*
- * Whenever we establish and create a VSI, we need to assign some number of
- * queues that it's allowed to access from the PF. Because we only have a single
- * VSI per PF at this time, we assign it all the queues.
- *
- * Many of the devices support what's called Data-center Bridging. Which is a
- * feature that we don't have much use of at this time. However, we still need
- * to fill in this information. We follow the guidance of the note in Table 7-80
- * which talks about bytes 62-77. It says that if we don't want to assign
- * anything to traffic classes, we should set the field to zero. Effectively
- * this means that everything in the system is assigned to traffic class zero.
- */
-#define I40E_ASSIGN_ALL_QUEUES 0
-#define I40E_TRAFFIC_CLASS_NO_QUEUES 0
-
-/*
* This defines the error mask that we care about from rx descriptors. Currently
* we're only concerned with the general errors and oversize errors.
*/
@@ -268,12 +269,12 @@ typedef enum i40e_itr_index {
#define I40E_DDI_PROP_LEN 64
/*
- * We currently consolidate some overrides that we use in the code here. These
- * will be gone in the fullness of time, but as we're bringing up the device,
- * this is what we use.
+ * Place an artificial limit on the max number of groups. The X710
+ * series allows up to 384 VSIs to be partitioned across its PFs as the
+ * driver sees fit, but until we support more interrupts this seems
+ * like a good place to start.
*/
-#define I40E_GROUP_MAX 1
-#define I40E_TRQPAIR_MAX 1
+#define I40E_GROUP_MAX 32
#define I40E_GROUP_NOMSIX 1
#define I40E_TRQPAIR_NOMSIX 1
@@ -405,18 +406,29 @@ typedef struct i40e_rx_control_block {
typedef enum {
I40E_TX_NONE,
I40E_TX_COPY,
- I40E_TX_DMA
+ I40E_TX_DMA,
+ I40E_TX_DESC,
} i40e_tx_type_t;
typedef struct i40e_tx_desc i40e_tx_desc_t;
+typedef struct i40e_tx_context_desc i40e_tx_context_desc_t;
typedef union i40e_32byte_rx_desc i40e_rx_desc_t;
+struct i40e_dma_bind_info {
+ caddr_t dbi_paddr;
+ size_t dbi_len;
+};
+
typedef struct i40e_tx_control_block {
struct i40e_tx_control_block *tcb_next;
mblk_t *tcb_mp;
i40e_tx_type_t tcb_type;
ddi_dma_handle_t tcb_dma_handle;
+ ddi_dma_handle_t tcb_lso_dma_handle;
i40e_dma_buffer_t tcb_dma;
+ struct i40e_dma_bind_info *tcb_bind_info;
+ uint_t tcb_bind_ncookies;
+ boolean_t tcb_used_lso;
} i40e_tx_control_block_t;
/*
@@ -517,6 +529,8 @@ typedef struct i40e_txq_stat {
kstat_named_t itxs_packets; /* Packets out on queue */
kstat_named_t itxs_descriptors; /* Descriptors issued */
kstat_named_t itxs_recycled; /* Descriptors reclaimed */
+ kstat_named_t itxs_force_copy; /* non-TSO force copy */
+ kstat_named_t itxs_tso_force_copy; /* TSO force copy */
/*
* Various failure conditions.
*/
@@ -526,6 +540,9 @@ typedef struct i40e_txq_stat {
kstat_named_t itxs_hck_nol4info; /* Missing l4 info */
kstat_named_t itxs_hck_badl3; /* Not IPv4/IPv6 */
kstat_named_t itxs_hck_badl4; /* Bad L4 Payload */
+ kstat_named_t itxs_lso_nohck; /* Missing offloads for LSO */
+ kstat_named_t itxs_bind_fails; /* DMA bind failures */
+ kstat_named_t itxs_tx_short; /* Tx chain too short */
kstat_named_t itxs_err_notcb; /* No tcb's available */
kstat_named_t itxs_err_nodescs; /* No tcb's available */
@@ -761,6 +778,25 @@ typedef struct i40e_func_rsrc {
uint_t ifr_nmcastfilt_used;
} i40e_func_rsrc_t;
+typedef struct i40e_vsi {
+ uint16_t iv_seid;
+ uint16_t iv_number;
+ kstat_t *iv_kstats;
+ i40e_vsi_stats_t iv_stats;
+ uint16_t iv_stats_id;
+} i40e_vsi_t;
+
+/*
+ * While irg_index and irg_grp_hdl aren't used anywhere, they are
+ * still useful for debugging.
+ */
+typedef struct i40e_rx_group {
+ uint32_t irg_index; /* index in i40e_rx_groups[] */
+ uint16_t irg_vsi_seid; /* SEID of VSI for this group */
+ mac_group_handle_t irg_grp_hdl; /* handle to mac_group_t */
+ struct i40e *irg_i40e; /* ref to i40e_t */
+} i40e_rx_group_t;
+
/*
* Main i40e per-instance state.
*/
@@ -789,11 +825,18 @@ typedef struct i40e {
struct i40e_aq_get_phy_abilities_resp i40e_phy;
void *i40e_aqbuf;
+#define I40E_DEF_VSI_IDX 0
+#define I40E_DEF_VSI(i40e) ((i40e)->i40e_vsis[I40E_DEF_VSI_IDX])
+#define I40E_DEF_VSI_SEID(i40e) (I40E_DEF_VSI(i40e).iv_seid)
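These accessors replace the old i40e_vsi_id wherever the default VSI is meant, as in the i40e_setup_tx_hmc() hunk above; a minimal sketch:

/* Sketch: referring to the default VSI through the new VSI array. */
context.seid = I40E_DEF_VSI_SEID(i40e);         /* formerly i40e->i40e_vsi_id */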
+
/*
* Device state, switch information, and resources.
*/
- int i40e_vsi_id;
- uint16_t i40e_vsi_num;
+ i40e_vsi_t i40e_vsis[I40E_GROUP_MAX];
+ uint16_t i40e_mac_seid; /* SEID of physical MAC */
+ uint16_t i40e_veb_seid; /* switch atop MAC (SEID) */
+ uint16_t i40e_vsi_avail; /* VSIs avail to this PF */
+ uint16_t i40e_vsi_used; /* VSIs used by this PF */
struct i40e_device *i40e_device;
i40e_func_rsrc_t i40e_resources;
uint16_t i40e_switch_rsrc_alloc;
@@ -814,12 +857,13 @@ typedef struct i40e {
*/
i40e_trqpair_t *i40e_trqpairs;
boolean_t i40e_mr_enable;
- int i40e_num_trqpairs;
+ uint_t i40e_num_trqpairs; /* total TRQPs (per PF) */
+ uint_t i40e_num_trqpairs_per_vsi; /* TRQPs per VSI */
uint_t i40e_other_itr;
- int i40e_num_rx_groups;
+ i40e_rx_group_t *i40e_rx_groups;
+ uint_t i40e_num_rx_groups;
int i40e_num_rx_descs;
- mac_group_handle_t i40e_rx_group_handle;
uint32_t i40e_rx_ring_size;
uint32_t i40e_rx_buf_size;
boolean_t i40e_rx_hcksum_enable;
@@ -832,6 +876,7 @@ typedef struct i40e {
uint32_t i40e_tx_buf_size;
uint32_t i40e_tx_block_thresh;
boolean_t i40e_tx_hcksum_enable;
+ boolean_t i40e_tx_lso_enable;
uint32_t i40e_tx_dma_min;
uint_t i40e_tx_itr;
@@ -855,6 +900,7 @@ typedef struct i40e {
*/
ddi_dma_attr_t i40e_static_dma_attr;
ddi_dma_attr_t i40e_txbind_dma_attr;
+ ddi_dma_attr_t i40e_txbind_lso_dma_attr;
ddi_device_acc_attr_t i40e_desc_acc_attr;
ddi_device_acc_attr_t i40e_buf_acc_attr;
@@ -872,10 +918,7 @@ typedef struct i40e {
*/
kmutex_t i40e_stat_lock;
kstat_t *i40e_pf_kstat;
- kstat_t *i40e_vsi_kstat;
i40e_pf_stats_t i40e_pf_stat;
- i40e_vsi_stats_t i40e_vsi_stat;
- uint16_t i40e_vsi_stat_id;
/*
* Misc. stats and counters that should maybe one day be kstats.
@@ -975,8 +1018,8 @@ extern void i40e_tx_cleanup_ring(i40e_trqpair_t *);
*/
extern boolean_t i40e_stats_init(i40e_t *);
extern void i40e_stats_fini(i40e_t *);
-extern boolean_t i40e_stat_vsi_init(i40e_t *);
-extern void i40e_stat_vsi_fini(i40e_t *);
+extern boolean_t i40e_stat_vsi_init(i40e_t *, uint_t);
+extern void i40e_stat_vsi_fini(i40e_t *, uint_t);
extern boolean_t i40e_stats_trqpair_init(i40e_trqpair_t *);
extern void i40e_stats_trqpair_fini(i40e_trqpair_t *);
extern int i40e_m_stat(void *, uint_t, uint64_t *);
diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c
index 57620f03fa..caafa3e102 100644
--- a/usr/src/uts/common/io/i40e/i40e_transceiver.c
+++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c
@@ -11,7 +11,7 @@
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include "i40e_sw.h"
@@ -60,19 +60,19 @@
* This size is then rounded up to the nearest 1k chunk, which represents the
* actual amount of memory that we'll allocate for a single frame.
*
- * Note, that for rx, we do something that might be unexpected. We always add
+ * Note that for RX, we do something that might be unexpected. We always add
* an extra two bytes to the frame size that we allocate. We then offset the DMA
* address that we receive a packet into by two bytes. This ensures that the IP
* header will always be 4 byte aligned because the MAC header is either 14 or
* 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
* and MAC's lives easier.
*
- * Both the rx and tx descriptor rings (which are what we use to communicate
+ * Both the RX and TX descriptor rings (which are what we use to communicate
* with hardware) are allocated as a single region of DMA memory which is the
* size of the descriptor (4 bytes and 2 bytes respectively) times the total
- * number of descriptors for an rx and tx ring.
+ * number of descriptors for an RX and TX ring.
*
- * While the rx and tx descriptors are allocated using DMA-based memory, the
+ * While the RX and TX descriptors are allocated using DMA-based memory, the
* control blocks for each of them are allocated using normal kernel memory.
* They aren't special from a DMA perspective. We'll go over the design of both
* receiving and transmitting separately, as they have slightly different
@@ -113,16 +113,16 @@
*
* To try and ensure that the device always has blocks that it can receive data
* into, we maintain two lists of control blocks, a working list and a free
- * list. Each list is sized equal to the number of descriptors in the rx ring.
- * During the GLDv3 mc_start routine, we allocate a number of rx control blocks
+ * list. Each list is sized equal to the number of descriptors in the RX ring.
+ * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
* equal to twice the number of descriptors in the ring and we assign them
* equally to the free list and to the working list. Each control block also has
* DMA memory allocated and associated with which it will be used to receive the
* actual packet data. All of a received frame's data will end up in a single
* DMA buffer.
*
- * During operation, we always maintain the invariant that each rx descriptor
- * has an associated rx control block which lives in the working list. If we
+ * During operation, we always maintain the invariant that each RX descriptor
+ * has an associated RX control block which lives in the working list. If we
* feel that we should loan up DMA memory to MAC in the form of a message block,
* we can only do so if we can maintain this invariant. To do that, we swap in
* one of the buffers from the free list. If none are available, then we resort
@@ -130,14 +130,14 @@
* size.
*
* Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is
- * called on the block, at which point we restore the rx control block to the
+ * called on the block, at which point we restore the RX control block to the
* free list and are able to reuse the DMA memory again. While the scheme may
* seem odd, it importantly keeps us out of trying to do any DMA allocations in
* the normal path of operation, even though we may still have to allocate
* message blocks and copy.
*
- * The following state machine describes the life time of a rx control block. In
- * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx
+ * The following state machine describes the lifetime of an RX control block.
+ * In the diagram we abbreviate the RX ring descriptor entry as rxd and the RX
* control block entry as rcb.
*
* | |
@@ -160,11 +160,11 @@
* +--------------------<-----| rcb loaned to MAC |
* +-------------------+
*
- * Finally, note that every rx control block has a reference count on it. One
+ * Finally, note that every RX control block has a reference count on it. One
* reference is added as long as the driver has had the GLDv3 mc_start endpoint
* called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
* no other DLPI consumers remain, then we'll decrement the reference count by
- * one. Whenever we loan up the rx control block and associated buffer to MAC,
+ * one. Whenever we loan up the RX control block and associated buffer to MAC,
* then we bump the reference count again. Even though the device is stopped,
* there may still be loaned frames in upper levels that we'll want to account
* for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
@@ -192,10 +192,10 @@
* state tracking. Effectively, we cache the HEAD register and then update it
* ourselves based on our work.
*
- * When we iterate over the rx descriptors and thus the received frames, we are
+ * When we iterate over the RX descriptors and thus the received frames, we are
* either in an interrupt context or we've been asked by MAC to poll on the
* ring. If we've been asked to poll on the ring, we have a maximum number of
- * bytes of mblk_t's to return. If processing an rx descriptor would cause us to
+ * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
* exceed that count, then we do not process it. When in interrupt context, we
* don't have a strict byte count. However, to ensure liveness, we limit the
* amount of data based on a configuration value
@@ -249,31 +249,54 @@
* differently due to the fact that all data is originated by the operating
* system and not by the device.
*
- * Like rx, there is both a descriptor ring that we use to communicate to the
- * driver and which points to the memory used to transmit a frame. Similarly,
- * there is a corresponding transmit control block. Each transmit control block
- * has a region of DMA memory allocated to it; however, the way we use it
- * varies.
+ * Like RX, there is both a descriptor ring that we use to communicate to the
+ * driver and which points to the memory used to transmit a frame. Similarly,
+ * there is a corresponding transmit control block, however, the correspondence
+ * between descriptors and control blocks is more complex and not necessarily
+ * 1-to-1.
*
* The driver is asked to process a single frame at a time. That message block
* may be made up of multiple fragments linked together by the mblk_t`b_cont
* member. The device has a hard limit of up to 8 buffers being allowed for use
- * for a single logical frame. For each fragment, we'll try and use an entry
- * from the tx descriptor ring and then we'll allocate a corresponding tx
- * control block. Depending on the size of the fragment, we may copy it around
- * or we might instead try to do DMA binding of the fragment.
- *
- * If we exceed the number of blocks that fit, we'll try to pull up the block
- * and then we'll do a DMA bind and send it out.
- *
- * If we don't have enough space in the ring or tx control blocks available,
+ * for a single non-LSO packet or LSO segment. The number of TX ring entries
+ * (and thus TX control blocks) used depends on the fragment sizes and DMA
+ * layout, as explained below.
+ *
+ * We alter our DMA strategy based on a threshold tied to the fragment size.
+ * This threshold is configurable via the tx_dma_threshold property. If the
+ * fragment is above the threshold, we DMA bind it -- consuming one TCB and
+ * potentially several data descriptors. The exact number of descriptors (equal
+ * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
+ * into page, b_wptr offset into page, and the physical layout of the dblk's
+ * memory (contiguous or not). Essentially, we are at the mercy of the DMA
+ * engine and the dblk's memory allocation. Knowing the exact number of
+ * descriptors up front is a task best not taken on by the driver itself.
+ * Instead, we attempt to DMA bind the fragment and verify the descriptor
+ * layout meets hardware constraints. If the proposed DMA bind does not satisfy
+ * the hardware constraints, then we discard it and instead copy the entire
+ * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
+ * larger than the TCB buffer).
+ *
+ * If the fragment is below or at the threshold, we copy it to the pre-allocated
+ * buffer of a TCB. We compress consecutive copy fragments into a single TCB to
+ * conserve resources. We are guaranteed that the TCB buffer is made up of only
+ * 1 DMA cookie; and therefore consumes only one descriptor on the controller.
+ *
+ * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
+ * filtering, then the TX data descriptors must be preceded by a single TX
+ * context descriptor. Because there is no DMA transfer associated with the
+ * context descriptor, we allocate a control block with a special type which
+ * indicates to the TX ring recycle code that there are no associated DMA
+ * resources to unbind when the control block is freed.
+ *
+ * If we don't have enough space in the ring or TX control blocks available,
* then we'll return the unprocessed message block to MAC. This will induce flow
* control and once we recycle enough entries, we'll once again enable sending
* on the ring.
*
* We size the working list as equal to the number of descriptors in the ring.
* We size the free list as equal to 1.5 times the number of descriptors in the
- * ring. We'll allocate a number of tx control block entries equal to the number
+ * ring. We'll allocate a number of TX control block entries equal to the number
* of entries in the free list. By default, all entries are placed in the free
* list. As we come along and try to send something, we'll allocate entries from
* the free list and add them to the working list, where they'll stay until the
@@ -325,7 +348,7 @@
* +------------------+ +------------------+
* | tcb on free list |---*------------------>| tcb on work list |
* +------------------+ . +------------------+
- * ^ . tcb allocated |
+ * ^ . N tcbs allocated[1] |
* | to send frame v
* | or fragment on |
* | wire, mblk from |
@@ -335,20 +358,27 @@
* .
* . Hardware indicates
* entry transmitted.
- * tcb recycled, mblk
+ * tcbs recycled, mblk
* from MAC freed.
*
+ * [1] We allocate N tcbs to transmit a single frame. In the non-DMA-bind
+ * case, N is 1 tcb for the context descriptor plus 1 for the data. In the
+ * DMA-bind case, N is 1 tcb for the context descriptor plus 1 tcb per
+ * b_cont in the mblk. In this case, the mblk is associated with the first
+ * data descriptor and freed as part of freeing that data descriptor.
+ *
* ------------
* Blocking MAC
* ------------
*
- * Wen performing transmit, we can run out of descriptors and ring entries. When
- * such a case happens, we return the mblk_t to MAC to indicate that we've been
- * blocked. At that point in time, MAC becomes blocked and will not transmit
- * anything out that specific ring until we notify MAC. To indicate that we're
- * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE.
+ * When performing transmit, we can run out of descriptors and ring entries.
+ * When such a case happens, we return the mblk_t to MAC to indicate that we've
+ * been blocked. At that point in time, MAC becomes blocked and will not
+ * transmit anything out that specific ring until we notify MAC. To indicate
+ * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member
+ * to B_TRUE.
*
- * When we recycle tx descriptors then we'll end up signaling MAC by calling
+ * When we recycle TX descriptors then we'll end up signaling MAC by calling
* mac_tx_ring_update() if we were blocked, letting it know that it's safe to
* start sending frames out to us again.
*/
@@ -367,13 +397,15 @@
/*
* This structure is used to maintain information and flags related to
- * transmitting a frame. The first member is the set of flags we need to or into
- * the command word (generally checksumming related). The second member controls
- * the word offsets which is required for IP and L4 checksumming.
+ * transmitting a frame. These fields are ultimately used to construct the
+ * TX data descriptor(s) and, if necessary, the TX context descriptor.
*/
typedef struct i40e_tx_context {
- enum i40e_tx_desc_cmd_bits itc_cmdflags;
- uint32_t itc_offsets;
+ enum i40e_tx_desc_cmd_bits itc_data_cmdflags;
+ uint32_t itc_data_offsets;
+ enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags;
+ uint32_t itc_ctx_tsolen;
+ uint32_t itc_ctx_mss;
} i40e_tx_context_t;
/*
@@ -395,14 +427,18 @@ i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
* i40e_static_dma_attr, is designed to be used for both the descriptor rings
* and the static buffers that we associate with control blocks. For this
* reason, we force an SGL length of one. While technically the driver supports
- * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our
+ * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
* management here. In addition, when the Intel common code wants to allocate
* memory via the i40e_allocate_virt_mem osdep function, we have it leverage
* the static dma attr.
*
- * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're
- * binding a bunch of mblk_t fragments to go out the door. Note that the main
- * difference here is that we're allowed a larger SGL length -- eight.
+ * The latter two sets of attributes, are what we use when we're binding a
+ * bunch of mblk_t fragments to go out the door. Note that the main difference
+ * here is that we're allowed a larger SGL length. For non-LSO TX, we
+ * restrict the SGL length to match the number of TX buffers available to the
+ * PF (8). For the LSO case we can go much larger, with the caveat that each
+ * MSS-sized chunk (segment) must not span more than 8 data descriptors and
+ * hence must not span more than 8 cookies.
*
* Note, we default to setting ourselves to be DMA capable here. However,
* because we could have multiple instances which have different FMA error
@@ -429,7 +465,7 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
DMA_ATTR_V0, /* version number */
0x0000000000000000ull, /* low address */
0xFFFFFFFFFFFFFFFFull, /* high address */
- 0x00000000FFFFFFFFull, /* dma counter max */
+ I40E_MAX_TX_BUFSZ - 1, /* dma counter max */
I40E_DMA_ALIGNMENT, /* alignment */
0x00000FFF, /* burst sizes */
0x00000001, /* minimum transfer size */
@@ -440,6 +476,21 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
DDI_DMA_FLAGERR /* DMA flags */
};
+static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
+ DMA_ATTR_V0, /* version number */
+ 0x0000000000000000ull, /* low address */
+ 0xFFFFFFFFFFFFFFFFull, /* high address */
+ I40E_MAX_TX_BUFSZ - 1, /* dma counter max */
+ I40E_DMA_ALIGNMENT, /* alignment */
+ 0x00000FFF, /* burst sizes */
+ 0x00000001, /* minimum transfer size */
+ 0x00000000FFFFFFFFull, /* maximum transfer size */
+ 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
+ I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */
+ 0x00000001, /* granularity */
+ DDI_DMA_FLAGERR /* DMA flags */
+};
+
/*
* Next, we have the attributes for these structures. The descriptor rings are
* all strictly little endian, while the data buffers are just arrays of bytes
@@ -668,7 +719,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
rxd->rxd_ring_size, KM_NOSLEEP);
if (rxd->rxd_work_list == NULL) {
- i40e_error(i40e, "failed to allocate rx work list for a ring "
+ i40e_error(i40e, "failed to allocate RX work list for a ring "
"of %d entries for ring %d", rxd->rxd_ring_size,
itrq->itrq_index);
goto cleanup;
@@ -677,7 +728,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
rxd->rxd_free_list_size, KM_NOSLEEP);
if (rxd->rxd_free_list == NULL) {
- i40e_error(i40e, "failed to allocate a %d entry rx free list "
+ i40e_error(i40e, "failed to allocate a %d entry RX free list "
"for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
goto cleanup;
}
@@ -765,7 +816,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
i40e_t *i40e = rxd->rxd_i40e;
/*
- * First allocate the rx descriptor ring.
+ * First allocate the RX descriptor ring.
*/
dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
VERIFY(dmasz > 0);
@@ -773,7 +824,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
&i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
B_TRUE, dmasz) == B_FALSE) {
i40e_error(i40e, "failed to allocate DMA resources "
- "for rx descriptor ring");
+ "for RX descriptor ring");
return (B_FALSE);
}
rxd->rxd_desc_ring =
@@ -799,7 +850,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
if (i40e_alloc_dma_buffer(i40e, dmap,
&i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
B_TRUE, B_FALSE, dmasz) == B_FALSE) {
- i40e_error(i40e, "failed to allocate rx dma buffer");
+ i40e_error(i40e, "failed to allocate RX dma buffer");
return (B_FALSE);
}
@@ -841,6 +892,10 @@ i40e_free_tx_dma(i40e_trqpair_t *itrq)
ddi_dma_free_handle(&tcb->tcb_dma_handle);
tcb->tcb_dma_handle = NULL;
}
+ if (tcb->tcb_lso_dma_handle != NULL) {
+ ddi_dma_free_handle(&tcb->tcb_lso_dma_handle);
+ tcb->tcb_lso_dma_handle = NULL;
+ }
}
fsz = sizeof (i40e_tx_control_block_t) *
@@ -881,7 +936,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
(i40e->i40e_tx_ring_size >> 1);
/*
- * Allocate an additional tx descriptor for the writeback head.
+ * Allocate an additional TX descriptor for the writeback head.
*/
dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
dmasz += sizeof (i40e_tx_desc_t);
@@ -890,7 +945,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
&i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
B_FALSE, B_TRUE, dmasz) == B_FALSE) {
- i40e_error(i40e, "failed to allocate DMA resources for tx "
+ i40e_error(i40e, "failed to allocate DMA resources for TX "
"descriptor ring");
return (B_FALSE);
}
@@ -905,7 +960,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
if (itrq->itrq_tcb_work_list == NULL) {
- i40e_error(i40e, "failed to allocate a %d entry tx work list "
+ i40e_error(i40e, "failed to allocate a %d entry TX work list "
"for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
goto cleanup;
}
@@ -913,14 +968,14 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
sizeof (i40e_tx_control_block_t *), KM_SLEEP);
if (itrq->itrq_tcb_free_list == NULL) {
- i40e_error(i40e, "failed to allocate a %d entry tx free list "
+ i40e_error(i40e, "failed to allocate a %d entry TX free list "
"for ring %d", itrq->itrq_tx_free_list_size,
itrq->itrq_index);
goto cleanup;
}
/*
- * We allocate enough tx control blocks to cover the free list.
+ * We allocate enough TX control blocks to cover the free list.
*/
itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
itrq->itrq_tx_free_list_size, KM_NOSLEEP);
@@ -948,18 +1003,29 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
&i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
&tcb->tcb_dma_handle);
if (ret != DDI_SUCCESS) {
- i40e_error(i40e, "failed to allocate DMA handle for tx "
+ i40e_error(i40e, "failed to allocate DMA handle for TX "
"data binding on ring %d: %d", itrq->itrq_index,
ret);
tcb->tcb_dma_handle = NULL;
goto cleanup;
}
+ ret = ddi_dma_alloc_handle(i40e->i40e_dip,
+ &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
+ &tcb->tcb_lso_dma_handle);
+ if (ret != DDI_SUCCESS) {
+ i40e_error(i40e, "failed to allocate DMA handle for TX "
+ "LSO data binding on ring %d: %d", itrq->itrq_index,
+ ret);
+ tcb->tcb_lso_dma_handle = NULL;
+ goto cleanup;
+ }
+
if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
&i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
B_TRUE, B_FALSE, dmasz) == B_FALSE) {
i40e_error(i40e, "failed to allocate %ld bytes of "
- "DMA for tx data binding on ring %d", dmasz,
+ "DMA for TX data binding on ring %d", dmasz,
itrq->itrq_index);
goto cleanup;
}
@@ -989,10 +1055,17 @@ i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
/*
- * Clean up our rx data. We have to free DMA resources first and
+ * In some cases i40e_alloc_rx_data() may have failed
+ * and in that case there is no rxd to free.
+ */
+ if (rxd == NULL)
+ continue;
+
+ /*
+ * Clean up our RX data. We have to free DMA resources first and
* then if we have no more pending RCB's, then we'll go ahead
* and clean things up. Note, we can't set the stopped flag on
- * the rx data until after we've done the first pass of the
+ * the RX data until after we've done the first pass of the
* pending resources. Otherwise we might race with
* i40e_rx_recycle on determining who should free the
* i40e_rx_data_t above.
@@ -1055,6 +1128,8 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
sizeof (ddi_dma_attr_t));
bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
sizeof (ddi_dma_attr_t));
+ bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr,
+ sizeof (ddi_dma_attr_t));
bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
sizeof (ddi_device_acc_attr_t));
bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
@@ -1063,9 +1138,13 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
if (fma == B_TRUE) {
i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
+ i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |=
+ DDI_DMA_FLAGERR;
} else {
i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
+ i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &=
+ ~DDI_DMA_FLAGERR;
}
}
@@ -1102,7 +1181,7 @@ i40e_rcb_alloc(i40e_rx_data_t *rxd)
/*
* This is the callback that we get from the OS when freemsg(9F) has been called
* on a loaned descriptor. In addition, if we take the last reference count
- * here, then we have to tear down all of the rx data.
+ * here, then we have to tear down all of the RX data.
*/
void
i40e_rx_recycle(caddr_t arg)
@@ -1768,17 +1847,18 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
* to properly program the hardware for checksum offload as well as the
* generally required flags.
*
- * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or
- * into the descriptor based on the checksum flags for this mblk_t and the
+ * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
+ * 'or' into the descriptor based on the checksum flags for this mblk_t and the
* actual information we care about.
+ *
+ * If the mblk requires LSO then we'll also gather the information that will be
+ * used to construct the Transmit Context Descriptor.
*/
static int
i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
- i40e_tx_context_t *tctx)
+ mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx)
{
- int ret;
- uint32_t flags, start;
- mac_ether_offload_info_t meo;
+ uint32_t chkflags, start, mss, lsoflags;
i40e_txq_stat_t *txs = &itrq->itrq_txstat;
bzero(tctx, sizeof (i40e_tx_context_t));
@@ -1786,37 +1866,34 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
if (i40e->i40e_tx_hcksum_enable != B_TRUE)
return (0);
- mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
- if (flags == 0)
- return (0);
+ mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
+ mac_lso_get(mp, &mss, &lsoflags);
- if ((ret = mac_ether_offload_info(mp, &meo)) != 0) {
- txs->itxs_hck_meoifail.value.ui64++;
- return (ret);
- }
+ if (chkflags == 0 && lsoflags == 0)
+ return (0);
/*
* Have we been asked to checksum an IPv4 header. If so, verify that we
* have sufficient information and then set the proper fields in the
* command structure.
*/
- if (flags & HCK_IPV4_HDRCKSUM) {
- if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
+ if (chkflags & HCK_IPV4_HDRCKSUM) {
+ if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
txs->itxs_hck_nol2info.value.ui64++;
return (-1);
}
- if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
+ if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
txs->itxs_hck_nol3info.value.ui64++;
return (-1);
}
- if (meo.meoi_l3proto != ETHERTYPE_IP) {
+ if (meo->meoi_l3proto != ETHERTYPE_IP) {
txs->itxs_hck_badl3.value.ui64++;
return (-1);
}
- tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
- tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
+ tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
+ tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
- tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
+ tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
}
@@ -1826,57 +1903,77 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
* onto seeing if we have enough information for the L4 checksum
* offload.
*/
- if (flags & HCK_PARTIALCKSUM) {
- if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) {
+ if (chkflags & HCK_PARTIALCKSUM) {
+ if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) {
txs->itxs_hck_nol4info.value.ui64++;
return (-1);
}
- if (!(flags & HCK_IPV4_HDRCKSUM)) {
- if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
+ if (!(chkflags & HCK_IPV4_HDRCKSUM)) {
+ if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
txs->itxs_hck_nol2info.value.ui64++;
return (-1);
}
- if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
+ if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
txs->itxs_hck_nol3info.value.ui64++;
return (-1);
}
- if (meo.meoi_l3proto == ETHERTYPE_IP) {
- tctx->itc_cmdflags |=
+ if (meo->meoi_l3proto == ETHERTYPE_IP) {
+ tctx->itc_data_cmdflags |=
I40E_TX_DESC_CMD_IIPT_IPV4;
- } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
- tctx->itc_cmdflags |=
+ } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) {
+ tctx->itc_data_cmdflags |=
I40E_TX_DESC_CMD_IIPT_IPV6;
} else {
txs->itxs_hck_badl3.value.ui64++;
return (-1);
}
- tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
+ tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
- tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
+ tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
}
- switch (meo.meoi_l4proto) {
+ switch (meo->meoi_l4proto) {
case IPPROTO_TCP:
- tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
+ tctx->itc_data_cmdflags |=
+ I40E_TX_DESC_CMD_L4T_EOFT_TCP;
break;
case IPPROTO_UDP:
- tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
+ tctx->itc_data_cmdflags |=
+ I40E_TX_DESC_CMD_L4T_EOFT_UDP;
break;
case IPPROTO_SCTP:
- tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
+ tctx->itc_data_cmdflags |=
+ I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
break;
default:
txs->itxs_hck_badl4.value.ui64++;
return (-1);
}
- tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) <<
+ tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) <<
I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
}
+ if (lsoflags & HW_LSO) {
+ /*
+ * LSO requires that checksum offloads are enabled. If for
+ * some reason they're not we bail out with an error.
+ */
+ if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 ||
+ (chkflags & HCK_PARTIALCKSUM) == 0) {
+ txs->itxs_lso_nohck.value.ui64++;
+ return (-1);
+ }
+
+ tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO;
+ tctx->itc_ctx_mss = mss;
+ tctx->itc_ctx_tsolen = msgsize(mp) -
+ (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen);
+ }
+
return (0);
}
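For instance (hypothetical sizes): an LSO mblk chain of msgsize() 64294 bytes carrying 14-byte Ethernet, 20-byte IPv4 and 20-byte TCP headers yields a tsolen of 64240 payload bytes for the controller to segment:

/* Hypothetical LSO sizes illustrating the itc_ctx_tsolen computation. */
uint32_t msgsz = 64294;                 /* msgsize(mp), hypothetical */
uint32_t hdrs = 14 + 20 + 20;           /* meoi_l2hlen + l3hlen + l4hlen */
uint32_t tsolen = msgsz - hdrs;         /* 64240 bytes of TCP payload */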
@@ -1925,7 +2022,20 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb)
tcb->tcb_dma.dmab_len = 0;
break;
case I40E_TX_DMA:
- (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
+ if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0)
+ (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle);
+ else if (tcb->tcb_bind_ncookies > 0)
+ (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
+ if (tcb->tcb_bind_info != NULL) {
+ kmem_free(tcb->tcb_bind_info,
+ tcb->tcb_bind_ncookies *
+ sizeof (struct i40e_dma_bind_info));
+ }
+ tcb->tcb_bind_info = NULL;
+ tcb->tcb_bind_ncookies = 0;
+ tcb->tcb_used_lso = B_FALSE;
+ break;
+ case I40E_TX_DESC:
break;
case I40E_TX_NONE:
/* Cast to pacify lint */
@@ -1935,8 +2045,10 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb)
}
tcb->tcb_type = I40E_TX_NONE;
- freemsg(tcb->tcb_mp);
- tcb->tcb_mp = NULL;
+ if (tcb->tcb_mp != NULL) {
+ freemsg(tcb->tcb_mp);
+ tcb->tcb_mp = NULL;
+ }
tcb->tcb_next = NULL;
}
@@ -1969,10 +2081,11 @@ i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
i40e_tx_control_block_t *tcb;
tcb = itrq->itrq_tcb_work_list[index];
- VERIFY(tcb != NULL);
- itrq->itrq_tcb_work_list[index] = NULL;
- i40e_tcb_reset(tcb);
- i40e_tcb_free(itrq, tcb);
+ if (tcb != NULL) {
+ itrq->itrq_tcb_work_list[index] = NULL;
+ i40e_tcb_reset(tcb);
+ i40e_tcb_free(itrq, tcb);
+ }
bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
@@ -1995,6 +2108,7 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
uint32_t wbhead, toclean, count;
i40e_tx_control_block_t *tcbhead;
i40e_t *i40e = itrq->itrq_i40e;
+ uint_t desc_per_tcb, i;
mutex_enter(&itrq->itrq_tx_lock);
@@ -2042,11 +2156,27 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
tcbhead = tcb;
/*
- * We zero this out for sanity purposes.
+ * In the DMA bind case, there may not necessarily be a 1:1
+ * mapping between tcb's and descriptors. If the tcb type
+ * indicates a DMA binding then check the number of DMA
+ * cookies to determine how many entries to clean in the
+ * descriptor ring.
*/
- bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t));
- toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size);
- count++;
+ if (tcb->tcb_type == I40E_TX_DMA)
+ desc_per_tcb = tcb->tcb_bind_ncookies;
+ else
+ desc_per_tcb = 1;
+
+ for (i = 0; i < desc_per_tcb; i++) {
+ /*
+ * We zero this out for sanity purposes.
+ */
+ bzero(&itrq->itrq_desc_ring[toclean],
+ sizeof (i40e_tx_desc_t));
+ toclean = i40e_next_desc(toclean, 1,
+ itrq->itrq_tx_ring_size);
+ count++;
+ }
}
itrq->itrq_desc_head = wbhead;
@@ -2078,10 +2208,610 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
}
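A worked example of the new accounting (hypothetical frame): one copy TCB followed by a DMA TCB whose bind produced three cookies. The walk above visits desc_per_tcb = 1 and then 3, zeroing four ring entries before itrq_desc_head is advanced:

/* Hypothetical recycle tally for the loop above. */
uint_t copy_descs = 1;                   /* an I40E_TX_COPY tcb is 1 entry */
uint_t dma_descs = 3;                    /* tcb_bind_ncookies, hypothetical */
uint_t cleaned = copy_descs + dma_descs; /* 4 descriptors zeroed in total */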
+static void
+i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp,
+ const size_t off, const size_t len)
+{
+ const void *soff = mp->b_rptr + off;
+ void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
+
+ ASSERT3U(len, >, 0);
+ ASSERT3P(soff, >=, mp->b_rptr);
+ ASSERT3P(soff, <=, mp->b_wptr);
+ ASSERT3U(len, <=, MBLKL(mp));
+ ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr);
+ ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len);
+ bcopy(soff, doff, len);
+ tcb->tcb_type = I40E_TX_COPY;
+ tcb->tcb_dma.dmab_len += len;
+ I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
+}
+
+static i40e_tx_control_block_t *
+i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp,
+ size_t off, boolean_t use_lso)
+{
+ ddi_dma_handle_t dma_handle;
+ ddi_dma_cookie_t dma_cookie;
+ uint_t i = 0, ncookies = 0, dmaflags;
+ i40e_tx_control_block_t *tcb;
+ i40e_txq_stat_t *txs = &itrq->itrq_txstat;
+
+ if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
+ txs->itxs_err_notcb.value.ui64++;
+ return (NULL);
+ }
+ tcb->tcb_type = I40E_TX_DMA;
+
+ if (use_lso == B_TRUE)
+ dma_handle = tcb->tcb_lso_dma_handle;
+ else
+ dma_handle = tcb->tcb_dma_handle;
+
+ dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING;
+ if (ddi_dma_addr_bind_handle(dma_handle, NULL,
+ (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags,
+ DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) {
+ txs->itxs_bind_fails.value.ui64++;
+ goto bffail;
+ }
+
+ tcb->tcb_bind_ncookies = ncookies;
+ tcb->tcb_used_lso = use_lso;
+
+ tcb->tcb_bind_info =
+ kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info),
+ KM_NOSLEEP);
+ if (tcb->tcb_bind_info == NULL)
+ goto bffail;
+
+ while (i < ncookies) {
+ if (i > 0)
+ ddi_dma_nextcookie(dma_handle, &dma_cookie);
+
+ tcb->tcb_bind_info[i].dbi_paddr =
+ (caddr_t)dma_cookie.dmac_laddress;
+ tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size;
+ }
+
+ return (tcb);
+
+bffail:
+ i40e_tcb_reset(tcb);
+ i40e_tcb_free(itrq, tcb);
+ return (NULL);
+}
+
+static void
+i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx,
+ caddr_t buff, size_t len, boolean_t last_desc)
+{
+ i40e_tx_desc_t *txdesc;
+ int cmd;
+
+ ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
+ itrq->itrq_desc_free--;
+ txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
+ itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
+ itrq->itrq_tx_ring_size);
+
+ cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags;
+
+ /*
+ * The last data descriptor needs the EOP bit set, so that the HW knows
+ * that we're ready to send. Additionally, we set the RS (Report
+ * Status) bit, so that we are notified when the transmit engine has
+ * completed DMA'ing all of the data descriptors and data buffers
+ * associated with this frame.
+ */
+ if (last_desc == B_TRUE) {
+ cmd |= I40E_TX_DESC_CMD_EOP;
+ cmd |= I40E_TX_DESC_CMD_RS;
+ }
+
+ /*
+ * Per the X710 manual, section 8.4.2.1.1, the buffer size
+ * must be a value from 1 to 16K minus 1, inclusive.
+ */
+ ASSERT3U(len, >=, 1);
+ ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ);
+
+ txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff);
+ txdesc->cmd_type_offset_bsz =
+ LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA |
+ ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
+ ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
+ ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
+}
+
+/*
+ * Place 'tcb' on the tail of the list represented by 'head'/'tail'.
+ */
+static inline void
+tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail,
+ i40e_tx_control_block_t *tcb)
+{
+ if (*head == NULL) {
+ *head = tcb;
+ *tail = *head;
+ } else {
+ ASSERT3P(*tail, !=, NULL);
+ ASSERT3P((*tail)->tcb_next, ==, NULL);
+ (*tail)->tcb_next = tcb;
+ *tail = tcb;
+ }
+}
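Usage is the standard head/tail append pattern; a minimal sketch (tcb_a and tcb_b are hypothetical):

/* Sketch: building a two-element TCB chain with the helper above. */
i40e_tx_control_block_t *head = NULL, *tail = NULL;
tcb_list_append(&head, &tail, tcb_a);   /* head == tail == tcb_a */
tcb_list_append(&head, &tail, tcb_b);   /* head->tcb_next == tcb_b */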
+
+/*
+ * This function takes a single packet, possibly consisting of
+ * multiple mblks, and creates a TCB chain to send to the controller.
+ * This TCB chain may span up to a maximum of 8 descriptors. A copy
+ * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or
+ * more, depending on several factors. For each fragment (individual
+ * mblk making up the packet), we determine if its size dictates a
+ * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a
+ * count of descriptors used; when that count reaches the max we force
+ * all remaining fragments into a single TCB buffer. We have a
+ * guarantee that the TCB buffer is always larger than the MTU -- so
+ * there is always enough room. Consecutive fragments below the DMA
+ * threshold are copied into a single TCB. In the event of an error
+ * this function returns NULL but leaves 'mp' alone.
+ */
+static i40e_tx_control_block_t *
+i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc)
+{
+ const mblk_t *nmp = mp;
+ uint_t needed_desc = 0;
+ boolean_t force_copy = B_FALSE;
+ i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
+ i40e_t *i40e = itrq->itrq_i40e;
+ i40e_txq_stat_t *txs = &itrq->itrq_txstat;
+
+ /* TCB buffer is always larger than MTU. */
+ ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size);
+
+ while (nmp != NULL) {
+ const size_t nmp_len = MBLKL(nmp);
+
+ /* Ignore zero-length mblks. */
+ if (nmp_len == 0) {
+ nmp = nmp->b_cont;
+ continue;
+ }
+
+ if (nmp_len < i40e->i40e_tx_dma_min || force_copy) {
+ /* Compress consecutive copies into one TCB. */
+ if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) {
+ i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
+ nmp = nmp->b_cont;
+ continue;
+ }
+
+ if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
+ txs->itxs_err_notcb.value.ui64++;
+ goto fail;
+ }
+
+ /*
+ * TCB DMA buffer is guaranteed to be one
+ * cookie by i40e_alloc_dma_buffer().
+ */
+ i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
+ needed_desc++;
+ tcb_list_append(&tcbhead, &tcbtail, tcb);
+ } else {
+ uint_t total_desc;
+
+ tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE);
+ if (tcb == NULL) {
+ i40e_error(i40e, "dma bind failed!");
+ goto fail;
+ }
+
+ /*
+ * If the new total exceeds the max or we've
+ * reached the limit and there's data left,
+ * then give up binding and copy the rest into
+ * the pre-allocated TCB buffer.
+ */
+ total_desc = needed_desc + tcb->tcb_bind_ncookies;
+ if ((total_desc > I40E_TX_MAX_COOKIE) ||
+ (total_desc == I40E_TX_MAX_COOKIE &&
+ nmp->b_cont != NULL)) {
+ i40e_tcb_reset(tcb);
+ i40e_tcb_free(itrq, tcb);
+
+ if (tcbtail != NULL &&
+ tcbtail->tcb_type == I40E_TX_COPY) {
+ tcb = tcbtail;
+ } else {
+ tcb = NULL;
+ }
+
+ force_copy = B_TRUE;
+ txs->itxs_force_copy.value.ui64++;
+ continue;
+ }
+
+ needed_desc += tcb->tcb_bind_ncookies;
+ tcb_list_append(&tcbhead, &tcbtail, tcb);
+ }
+
+ nmp = nmp->b_cont;
+ }
+
+ ASSERT3P(nmp, ==, NULL);
+ ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE);
+ ASSERT3P(tcbhead, !=, NULL);
+ *ndesc += needed_desc;
+ return (tcbhead);
+
+fail:
+ tcb = tcbhead;
+ while (tcb != NULL) {
+ i40e_tx_control_block_t *next = tcb->tcb_next;
+
+ ASSERT(tcb->tcb_type == I40E_TX_DMA ||
+ tcb->tcb_type == I40E_TX_COPY);
+
+ tcb->tcb_mp = NULL;
+ i40e_tcb_reset(tcb);
+ i40e_tcb_free(itrq, tcb);
+ tcb = next;
+ }
+
+ return (NULL);
+}
+
+/*
+ * Section 8.4.1 of the 700-series programming guide states that a
+ * segment may span up to 8 data descriptors, including both header
+ * and payload data. However, empirical evidence shows that the
+ * controller freezes the Tx queue when presented with a segment of 8
+ * descriptors. Or, at least, when the first segment contains 8
+ * descriptors. One explanation is that the controller counts the
+ * context descriptor against the first segment, even though the
+ * programming guide makes no mention of such a constraint. In any
+ * case, we limit TSO segments to 7 descriptors to prevent Tx queue
+ * freezes. We still allow non-TSO segments to utilize all 8
+ * descriptors as they have not demonstrated the faulty behavior.
+ */
+uint_t i40e_lso_num_descs = 7;
+
+#define I40E_TCB_LEFT(tcb) \
+ ((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len)
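+
+/*
+ * For example (illustrative numbers): a TCB whose DMA buffer is 2048
+ * bytes (dmab_size) with 1500 bytes already staged (dmab_len) has
+ * I40E_TCB_LEFT() of 548 bytes available for further copy
+ * compression.
+ */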
+
+/*
+ * This function is similar in spirit to i40e_non_lso_chain(), but
+ * much more complicated in reality. Like the previous function, it
+ * takes a packet (an LSO packet) as input and returns a chain of
+ * TCBs. The complication comes with the fact that we are no longer
+ * trying to fit the entire packet into 8 descriptors, but rather we
+ * must fit each MSS-size segment of the LSO packet into 8 descriptors.
+ * Except it's really 7 descriptors, see i40e_lso_num_descs.
+ *
+ * Your first inclination might be to verify that a given segment
+ * spans no more than 7 mblks, but it's actually much more subtle than
+ * that. First, let's describe what the hardware expects, and then we
+ * can expound on the software side of things.
+ *
+ * For an LSO packet the hardware expects the following:
+ *
+ * o Each MSS-sized segment must span no more than 7 descriptors.
+ *
+ * o The header size does not count towards the segment size.
+ *
+ * o If header and payload share the first descriptor, then the
+ * controller will count the descriptor twice.
+ *
+ * The most important thing to keep in mind is that the hardware does
+ * not view the segments in terms of mblks, like we do. The hardware
+ * only sees descriptors. It will iterate each descriptor in turn,
+ * keeping a tally of bytes seen and descriptors visited. If the byte
+ * count hasn't reached MSS by the time the descriptor count reaches
+ * 7, then the controller freezes the queue and we are stuck.
+ * Furthermore, the hardware picks up its tally where it left off. So
+ * if it reached MSS in the middle of a descriptor, it will start
+ * tallying the next segment in the middle of that descriptor. The
+ * hardware's view is entirely removed from the mblk chain or even the
+ * descriptor layout. Consider these facts:
+ *
+ * o The MSS will vary depending on MTU and other factors.
+ *
+ * o The dblk allocation will sit at various offsets within a
+ * memory page.
+ *
+ * o The page size itself could vary in the future (i.e. not
+ * always 4K).
+ *
+ * o Just because a dblk is virtually contiguous doesn't mean
+ * it's physically contiguous. The number of cookies
+ * (descriptors) required by a DMA bind of a single dblk is at
+ * the mercy of the page size and physical layout.
+ *
+ * o The descriptors will most often NOT start/end on an MSS
+ * boundary. Thus the hardware will often start counting the
+ * MSS mid-descriptor and finish mid-descriptor.
+ *
+ * The upshot of all this is that the driver must learn to think like
+ * the controller and verify that none of the constraints are broken.
+ * It does this by tallying up the segment just like the hardware
+ * would. This is handled by the two variables 'segsz' and 'segdesc'.
+ * After each attempt to bind a dblk, we check the constraints. If
+ * violated, we undo the DMA and force a copy until MSS is met. We
+ * have a guarantee that the TCB buffer is larger than MTU; thus
+ * ensuring we can always meet the MSS with a single copy buffer. We
+ * also copy consecutive non-DMA fragments into the same TCB buffer.
+ */
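+/*
+ * A worked example of the tally (numbers are hypothetical): with an
+ * MSS of 1500 and the 7-descriptor segment limit, a bind yielding
+ * three 600-byte cookies crosses the MSS boundary in the middle of
+ * the third cookie. The tally then restarts at segsz = 1800 % 1500 =
+ * 300 and segdesc = 1, since the third descriptor also begins the
+ * next segment; segdesc restarts at 0 only when a descriptor ends
+ * exactly on an MSS boundary.
+ */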
+static i40e_tx_control_block_t *
+i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp,
+ const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx,
+ uint_t *ndesc)
+{
+ size_t mp_len = MBLKL(mp);
+ /*
+ * The cpoff (copy offset) variable tracks the offset inside
+ * the current mp. There are cases where the entire mp is not
+ * fully copied in one go: such as the header copy followed by
+ * a non-DMA mblk, or a TCB buffer that only has enough space
+ * to copy part of the current mp.
+ */
+ size_t cpoff = 0;
+ /*
+ * The segsz and segdesc variables track the controller's view
+ * of the segment. The needed_desc variable tracks the total
+ * number of data descriptors used by the driver.
+ */
+ size_t segsz = 0;
+ uint_t segdesc = 0;
+ uint_t needed_desc = 0;
+ size_t hdrcopied = 0;
+ const size_t hdrlen =
+ meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
+ const size_t mss = tctx->itc_ctx_mss;
+ boolean_t force_copy = B_FALSE;
+ i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
+ i40e_t *i40e = itrq->itrq_i40e;
+ i40e_txq_stat_t *txs = &itrq->itrq_txstat;
+
+ /*
+ * We always copy the header in order to avoid more
+ * complicated code dealing with various edge cases.
+ */
+ if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
+ txs->itxs_err_notcb.value.ui64++;
+ goto fail;
+ }
+
+ needed_desc++;
+ tcb_list_append(&tcbhead, &tcbtail, tcb);
+
+ while (hdrcopied < hdrlen) {
+ const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len);
+ i40e_tx_copy_fragment(tcb, mp, 0, tocopy);
+ hdrcopied += tocopy;
+ cpoff += tocopy;
+ if (tocopy == mp_len) {
+ /*
+ * This is a bit of defensive programming. We
+ * should never have a chain too short to
+ * satisfy the headers -- but just in case.
+ */
+ if ((mp = mp->b_cont) == NULL) {
+ txs->itxs_tx_short.value.ui64++;
+ goto fail;
+ }
+
+ while ((mp_len = MBLKL(mp)) == 0) {
+ if ((mp = mp->b_cont) == NULL) {
+ txs->itxs_tx_short.value.ui64++;
+ goto fail;
+ }
+ }
+ cpoff = 0;
+ }
+ }
+ ASSERT3U(hdrcopied, ==, hdrlen);
+
+ /*
+ * A single descriptor containing both header and data is
+ * counted twice by the controller.
+ */
+ if (mp_len < i40e->i40e_tx_dma_min) {
+ segdesc = 2;
+ } else {
+ segdesc = 1;
+ }
+
+ while (mp != NULL) {
+ mp_len = MBLKL(mp);
+force_copy:
+ /* Ignore zero-length mblks. */
+ if (mp_len == 0) {
+ mp = mp->b_cont;
+ cpoff = 0;
+ continue;
+ }
+
+ /*
+ * We copy into the preallocated TCB buffer when the
+ * current fragment is less than the DMA threshold OR
+ * when the DMA bind can't meet the controller's
+ * segment descriptor limit.
+ */
+ if (mp_len < i40e->i40e_tx_dma_min || force_copy) {
+ size_t tocopy;
+
+ /*
+ * Our objective here is to compress
+ * consecutive copies into one TCB (until it
+ * is full). If there is no current TCB, or if
+ * it is a DMA TCB, then allocate a new one.
+ */
+ if (tcb == NULL || tcb->tcb_type != I40E_TX_COPY) {
+ if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
+ txs->itxs_err_notcb.value.ui64++;
+ goto fail;
+ }
+
+ /*
+ * The TCB DMA buffer is guaranteed to
+ * be one cookie by i40e_alloc_dma_buffer().
+ */
+ needed_desc++;
+ segdesc++;
+ ASSERT3U(segdesc, <=, i40e_lso_num_descs);
+ tcb_list_append(&tcbhead, &tcbtail, tcb);
+ } else if (segdesc == 0) {
+ /*
+ * We are copying into an existing TCB
+ * but we just crossed the MSS
+ * boundary. Make sure to increment
+ * segdesc to track the descriptor
+ * count as the hardware would.
+ */
+ segdesc++;
+ }
+
+ tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff);
+ i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy);
+ cpoff += tocopy;
+ segsz += tocopy;
+
+ /* We have consumed the current mp. */
+ if (cpoff == mp_len) {
+ mp = mp->b_cont;
+ cpoff = 0;
+ }
+
+ /* We have consumed the current TCB buffer. */
+ if (I40E_TCB_LEFT(tcb) == 0) {
+ tcb = NULL;
+ }
+
+ /*
+ * We have met MSS with this copy; restart the
+ * counters.
+ */
+ if (segsz >= mss) {
+ segsz = segsz % mss;
+ segdesc = segsz == 0 ? 0 : 1;
+ force_copy = B_FALSE;
+ }
+
+ /*
+ * We are at the controller's descriptor
+ * limit; we must copy into the current TCB
+ * until MSS is reached. The TCB buffer is
+ * always bigger than the MTU so we know it is
+ * big enough to meet the MSS.
+ */
+ if (segdesc == i40e_lso_num_descs) {
+ force_copy = B_TRUE;
+ }
+ } else {
+ uint_t tsegdesc = segdesc;
+ size_t tsegsz = segsz;
+
+ ASSERT(force_copy == B_FALSE);
+ ASSERT3U(tsegdesc, <, i40e_lso_num_descs);
+
+ tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE);
+ if (tcb == NULL) {
+ i40e_error(i40e, "dma bind failed!");
+ goto fail;
+ }
+
+ for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) {
+ struct i40e_dma_bind_info dbi =
+ tcb->tcb_bind_info[i];
+
+ tsegsz += dbi.dbi_len;
+ tsegdesc++;
+ ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
+
+ /*
+ * We've met the MSS with this portion
+ * of the DMA.
+ */
+ if (tsegsz >= mss) {
+ tsegsz = tsegsz % mss;
+ tsegdesc = tsegsz == 0 ? 0 : 1;
+ }
+
+ /*
+ * We've reached max descriptors but
+ * have not met the MSS. Undo the bind
+ * and instead copy.
+ */
+ if (tsegdesc == i40e_lso_num_descs) {
+ i40e_tcb_reset(tcb);
+ i40e_tcb_free(itrq, tcb);
+
+ if (tcbtail != NULL &&
+ I40E_TCB_LEFT(tcbtail) > 0 &&
+ tcbtail->tcb_type == I40E_TX_COPY) {
+ tcb = tcbtail;
+ } else {
+ tcb = NULL;
+ }
+
+ /*
+ * Remember, we are still on
+ * the same mp.
+ */
+ force_copy = B_TRUE;
+ txs->itxs_tso_force_copy.value.ui64++;
+ goto force_copy;
+ }
+ }
+
+ ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
+ ASSERT3U(tsegsz, <, mss);
+
+ /*
+ * We've made it through the loop without
+ * breaking the segment descriptor contract
+ * with the controller -- replace the segment
+ * tracking values with the temporary ones.
+ */
+ segdesc = tsegdesc;
+ segsz = tsegsz;
+ needed_desc += tcb->tcb_bind_ncookies;
+ cpoff = 0;
+ tcb_list_append(&tcbhead, &tcbtail, tcb);
+ mp = mp->b_cont;
+ }
+ }
+
+ ASSERT3P(mp, ==, NULL);
+ ASSERT3P(tcbhead, !=, NULL);
+ *ndesc += needed_desc;
+ return (tcbhead);
+
+fail:
+ tcb = tcbhead;
+ while (tcb != NULL) {
+ i40e_tx_control_block_t *next = tcb->tcb_next;
+
+ ASSERT(tcb->tcb_type == I40E_TX_DMA ||
+ tcb->tcb_type == I40E_TX_COPY);
+
+ tcb->tcb_mp = NULL;
+ i40e_tcb_reset(tcb);
+ i40e_tcb_free(itrq, tcb);
+ tcb = next;
+ }
+
+ return (NULL);
+}
+
/*
* We've been asked to send a message block on the wire. We'll only have a
* single chain. There will not be any b_next pointers; however, there may be
- * multiple b_cont blocks.
+ * multiple b_cont blocks. The number of b_cont blocks may exceed the
+ * controller's Tx descriptor limit.
*
* We may do one of three things with any given mblk_t chain:
*
@@ -2096,12 +2826,14 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
mblk_t *
i40e_ring_tx(void *arg, mblk_t *mp)
{
- const mblk_t *nmp;
- size_t mpsize;
- i40e_tx_control_block_t *tcb;
- i40e_tx_desc_t *txdesc;
+ size_t msglen;
+ i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL;
+ i40e_tx_context_desc_t *ctxdesc;
+ mac_ether_offload_info_t meo;
i40e_tx_context_t tctx;
- int cmd, type;
+ int type;
+ uint_t needed_desc = 0;
+ boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE;
i40e_trqpair_t *itrq = arg;
i40e_t *i40e = itrq->itrq_i40e;
@@ -2119,107 +2851,137 @@ i40e_ring_tx(void *arg, mblk_t *mp)
return (NULL);
}
+ if (mac_ether_offload_info(mp, &meo) != 0) {
+ freemsg(mp);
+ itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++;
+ return (NULL);
+ }
+
/*
* Figure out the relevant context about this frame that we might need
- * for enabling checksum, lso, etc. This also fills in information that
+ * for enabling checksum, LSO, etc. This also fills in information that
* we might set around the packet type, etc.
*/
- if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) {
+ if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) {
freemsg(mp);
itrq->itrq_txstat.itxs_err_context.value.ui64++;
return (NULL);
}
+ if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
+ use_lso = B_TRUE;
+ do_ctx_desc = B_TRUE;
+ }
/*
* For the primordial driver we can punt on doing any recycling right
* now; however, longer term we need to probably do some more pro-active
- * recycling to cut back on stalls in the tx path.
+ * recycling to cut back on stalls in the TX path.
*/
- /*
- * Do a quick size check to make sure it fits into what we think it
- * should for this device. Note that longer term this will be false,
- * particularly when we have the world of TSO.
- */
- mpsize = 0;
- for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
- mpsize += MBLKL(nmp);
+ msglen = msgsize(mp);
+
+ if (do_ctx_desc) {
+ /*
+ * If we're doing tunneling or LSO, then we'll need a TX
+ * context descriptor in addition to one or more TX data
+ * descriptors. Since there's no data DMA block or handle
+ * associated with the context descriptor, we create a special
+ * control block that behaves effectively like a NOP.
+ */
+ if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) {
+ txs->itxs_err_notcb.value.ui64++;
+ goto txfail;
+ }
+ tcb_ctx->tcb_type = I40E_TX_DESC;
+ needed_desc++;
}
- /*
- * First we allocate our tx control block and prepare the packet for
- * transmit before we do a final check for descriptors. We do it this
- * way to minimize the time under the tx lock.
- */
- tcb = i40e_tcb_alloc(itrq);
- if (tcb == NULL) {
- txs->itxs_err_notcb.value.ui64++;
- goto txfail;
+ if (!use_lso) {
+ tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc);
+ } else {
+ tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc);
}
- /*
- * For transmitting a block, we're currently going to use just a
- * single control block and bcopy all of the fragments into it. We
- * should be more intelligent about doing DMA binding or otherwise, but
- * for getting off the ground this will have to do.
- */
- ASSERT(tcb->tcb_dma.dmab_len == 0);
- ASSERT(tcb->tcb_dma.dmab_size >= mpsize);
- for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
- size_t clen = MBLKL(nmp);
- void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
+ if (tcbhead == NULL)
+ goto txfail;
- bcopy(nmp->b_rptr, coff, clen);
- tcb->tcb_dma.dmab_len += clen;
- }
- ASSERT(tcb->tcb_dma.dmab_len == mpsize);
+ tcbhead->tcb_mp = mp;
/*
- * While there's really no need to keep the mp here, but let's just do
- * it to help with our own debugging for now.
+ * The second condition ensures that 'itrq_desc_tail' never
+ * equals 'itrq_desc_head'. This enforces the rule found in
+ * the second bullet point of section 8.4.3.1.5 of the XL710
+ * PG, which declares the TAIL pointer in I40E_QTX_TAIL should
+ * never overlap with the head. This means that we only ever
+ * have 'itrq_tx_ring_size - 1' total available descriptors.
*/
- tcb->tcb_mp = mp;
- tcb->tcb_type = I40E_TX_COPY;
- I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
-
mutex_enter(&itrq->itrq_tx_lock);
- if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) {
+ if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh ||
+ (itrq->itrq_desc_free - 1) < needed_desc) {
txs->itxs_err_nodescs.value.ui64++;
mutex_exit(&itrq->itrq_tx_lock);
goto txfail;
}
- /*
- * Build up the descriptor and send it out. Thankfully at the moment
- * we only need a single desc, because we're not doing anything fancy
- * yet.
- */
- ASSERT(itrq->itrq_desc_free > 0);
- itrq->itrq_desc_free--;
- txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
- itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
- itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
- itrq->itrq_tx_ring_size);
+ if (do_ctx_desc) {
+ /*
+ * If we're enabling any offloads for this frame, then we'll
+ * need to build up a transmit context descriptor, first. The
+ * context descriptor needs to be placed in the TX ring before
+ * the data descriptor(s). See section 8.4.2, table 8-16
+ */
+ uint_t tail = itrq->itrq_desc_tail;
+ itrq->itrq_desc_free--;
+ ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail];
+ itrq->itrq_tcb_work_list[tail] = tcb_ctx;
+ itrq->itrq_desc_tail = i40e_next_desc(tail, 1,
+ itrq->itrq_tx_ring_size);
+
+ /* QW0 */
+ type = I40E_TX_DESC_DTYPE_CONTEXT;
+ ctxdesc->tunneling_params = 0;
+ ctxdesc->l2tag2 = 0;
+
+ /* QW1 */
+ ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type);
+ if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
+ ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t)
+ ((uint64_t)tctx.itc_ctx_cmdflags <<
+ I40E_TXD_CTX_QW1_CMD_SHIFT) |
+ ((uint64_t)tctx.itc_ctx_tsolen <<
+ I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
+ ((uint64_t)tctx.itc_ctx_mss <<
+ I40E_TXD_CTX_QW1_MSS_SHIFT));
+ }
+ }
- /*
- * Note, we always set EOP and RS which indicates that this is the last
- * data frame and that we should ask for it to be transmitted. We also
- * must always set ICRC, because that is an internal bit that must be
- * set to one for data descriptors. The remaining bits in the command
- * descriptor depend on checksumming and are determined based on the
- * information set up in i40e_tx_context().
- */
- type = I40E_TX_DESC_DTYPE_DATA;
- cmd = I40E_TX_DESC_CMD_EOP |
- I40E_TX_DESC_CMD_RS |
- I40E_TX_DESC_CMD_ICRC |
- tctx.itc_cmdflags;
- txdesc->buffer_addr =
- CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
- txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
- ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
- ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
- ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
+ tcb = tcbhead;
+ while (tcb != NULL) {
+ itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
+ if (tcb->tcb_type == I40E_TX_COPY) {
+ boolean_t last_desc = (tcb->tcb_next == NULL);
+
+ i40e_tx_set_data_desc(itrq, &tctx,
+ (caddr_t)tcb->tcb_dma.dmab_dma_address,
+ tcb->tcb_dma.dmab_len, last_desc);
+ } else {
+ boolean_t last_desc = B_FALSE;
+ ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);
+
+ for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
+ last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
+ (tcb->tcb_next == NULL);
+
+ i40e_tx_set_data_desc(itrq, &tctx,
+ tcb->tcb_bind_info[c].dbi_paddr,
+ tcb->tcb_bind_info[c].dbi_len,
+ last_desc);
+ }
+ }
+
+ tcb = tcb->tcb_next;
+ }
/*
* Now, finally, sync the DMA data and alert hardware.
@@ -2228,6 +2990,7 @@ i40e_ring_tx(void *arg, mblk_t *mp)
I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
itrq->itrq_desc_tail);
+
if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
DDI_FM_OK) {
/*
@@ -2239,9 +3002,9 @@ i40e_ring_tx(void *arg, mblk_t *mp)
atomic_or_32(&i40e->i40e_state, I40E_ERROR);
}
- txs->itxs_bytes.value.ui64 += mpsize;
+ txs->itxs_bytes.value.ui64 += msglen;
txs->itxs_packets.value.ui64++;
- txs->itxs_descriptors.value.ui64++;
+ txs->itxs_descriptors.value.ui64 += needed_desc;
mutex_exit(&itrq->itrq_tx_lock);
@@ -2254,10 +3017,23 @@ txfail:
* Make sure to reset their message block's, since we'll return them
* back to MAC.
*/
- if (tcb != NULL) {
+ if (tcb_ctx != NULL) {
+ tcb_ctx->tcb_mp = NULL;
+ i40e_tcb_reset(tcb_ctx);
+ i40e_tcb_free(itrq, tcb_ctx);
+ }
+
+ tcb = tcbhead;
+ while (tcb != NULL) {
+ i40e_tx_control_block_t *next = tcb->tcb_next;
+
+ ASSERT(tcb->tcb_type == I40E_TX_DMA ||
+ tcb->tcb_type == I40E_TX_COPY);
+
tcb->tcb_mp = NULL;
i40e_tcb_reset(tcb);
i40e_tcb_free(itrq, tcb);
+ tcb = next;
}
mutex_enter(&itrq->itrq_tx_lock);
diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c b/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c
index 1c8318b191..55c4159bc4 100644
--- a/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c
+++ b/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -272,8 +273,7 @@ ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
icmph->icmph_checksum = IP_CSUM(pmtu_mp,
(int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);
- (void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0,
- HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
+ mac_hcksum_set(pmtu_mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
"ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
@@ -1560,8 +1560,7 @@ ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
/*
* Can RC mode in IB guarantee its checksum correctness?
*
- * (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
- * HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
+ * mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
*/
/*
diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c
new file mode 100644
index 0000000000..eaa0c33f0f
--- /dev/null
+++ b/usr/src/uts/common/io/inotify.c
@@ -0,0 +1,1555 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2015 The MathWorks, Inc. All rights reserved.
+ */
+
+/*
+ * Support for the inotify facility, a Linux-borne facility for asynchronous
+ * notification of certain events on specified files or directories. Our
+ * implementation broadly leverages the file event monitoring facility, and
+ * would actually be quite straightforward were it not for a very serious
+ * blunder in the inotify interface: in addition to allowing for one to be
+ * notified on events on a particular file or directory, inotify also allows
+ * for one to be notified on certain events on files _within_ a watched
+ * directory -- even though those events have absolutely nothing to do with
+ * the directory itself. This leads to all sorts of madness because file
+ * operations are (of course) not undertaken on paths but rather on open
+ * files -- and the relationships between open files and the paths that resolve
+ * to those files are neither static nor isomorphic. We implement this
+ * concept by having _child watches_ when directories are watched with events
+ * in IN_CHILD_EVENTS. We add child watches when a watch on a directory is
+ * first added, and we modify those child watches dynamically as files are
+ * created, deleted, moved into or moved out of the specified directory. This
+ * mechanism works well, absent hard links. Hard links, unfortunately, break
+ * this rather badly, and the user is warned that watches on directories that
+ * have multiple directory entries referring to the same file may behave
+ * unexpectedly.
+ */
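+
+/*
+ * To make the child-watch mechanism concrete (the paths below are
+ * purely illustrative): a watch on /tmp/dir whose mask includes
+ * IN_CREATE and IN_MODIFY fires IN_CREATE when /tmp/dir/file is
+ * created -- at which point a child watch is silently added on the
+ * new file -- and a later write to /tmp/dir/file fires IN_MODIFY via
+ * that child watch, reported against the watch descriptor of
+ * /tmp/dir.
+ */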
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/inotify.h>
+#include <sys/fem.h>
+#include <sys/conf.h>
+#include <sys/stat.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vmem.h>
+#include <sys/avl.h>
+#include <sys/sysmacros.h>
+#include <sys/cyclic.h>
+#include <sys/filio.h>
+
+struct inotify_state;
+struct inotify_kevent;
+
+typedef struct inotify_watch inotify_watch_t;
+typedef struct inotify_state inotify_state_t;
+typedef struct inotify_kevent inotify_kevent_t;
+
+struct inotify_watch {
+ kmutex_t inw_lock; /* lock protecting ref count */
+ int inw_refcnt; /* reference count */
+ uint8_t inw_zombie:1; /* boolean: is zombie */
+ uint8_t inw_fired:1; /* boolean: fired one-shot */
+ uint8_t inw_active:1; /* boolean: watch is active */
+ uint8_t inw_orphaned:1; /* boolean: orphaned */
+ kcondvar_t inw_cv; /* condvar for zombifier */
+ uint32_t inw_mask; /* mask of watch */
+ int32_t inw_wd; /* watch descriptor */
+ vnode_t *inw_vp; /* underlying vnode */
+ inotify_watch_t *inw_parent; /* parent, if a child */
+ avl_node_t inw_byvp; /* watches by vnode */
+ avl_node_t inw_bywd; /* watches by descriptor */
+ avl_tree_t inw_children; /* children, if a parent */
+ char *inw_name; /* name, if a child */
+ list_node_t inw_orphan; /* orphan list */
+ cred_t *inw_cred; /* cred, if orphaned */
+ inotify_state_t *inw_state; /* corresponding state */
+};
+
+struct inotify_kevent {
+ inotify_kevent_t *ine_next; /* next event in queue */
+ struct inotify_event ine_event; /* event (variable size) */
+};
+
+#define INOTIFY_EVENT_LENGTH(ev) \
+ (sizeof (inotify_kevent_t) + (ev)->ine_event.len)
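+
+/*
+ * For example (hypothetical name): an event for a child named "ab"
+ * carries ine_event.len = roundup(3, sizeof (struct inotify_event)),
+ * so INOTIFY_EVENT_LENGTH() covers the fixed inotify_kevent_t plus
+ * the rounded-up name bytes allocated alongside it.
+ */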
+
+struct inotify_state {
+ kmutex_t ins_lock; /* lock protecting state */
+ avl_tree_t ins_byvp; /* watches by vnode */
+ avl_tree_t ins_bywd; /* watches by descriptor */
+ vmem_t *ins_wds; /* watch identifier arena */
+ int ins_maxwatches; /* maximum number of watches */
+ int ins_maxevents; /* maximum number of events */
+ int ins_nevents; /* current # of events */
+ int32_t ins_size; /* total size of events */
+ inotify_kevent_t *ins_head; /* head of event queue */
+ inotify_kevent_t *ins_tail; /* tail of event queue */
+ pollhead_t ins_pollhd; /* poll head */
+ kcondvar_t ins_cv; /* condvar for reading */
+ list_t ins_orphans; /* orphan list */
+ ddi_periodic_t ins_cleaner; /* cyclic for cleaning */
+ inotify_watch_t *ins_zombies; /* zombie watch list */
+ cred_t *ins_cred; /* creator's credentials */
+ inotify_state_t *ins_next; /* next state on global list */
+};
+
+/*
+ * Tunables (exported read-only in lx-branded zones via /proc).
+ */
+int inotify_maxwatches = 8192; /* max watches per instance */
+int inotify_maxevents = 16384; /* max events */
+int inotify_maxinstances = 128; /* max instances per user */
+
+/*
+ * Internal global variables.
+ */
+static kmutex_t inotify_lock; /* lock protecting state */
+static dev_info_t *inotify_devi; /* device info */
+static fem_t *inotify_femp; /* FEM pointer */
+static vmem_t *inotify_minor; /* minor number arena */
+static void *inotify_softstate; /* softstate pointer */
+static inotify_state_t *inotify_state; /* global list of state */
+
+static void inotify_watch_event(inotify_watch_t *, uint64_t, char *);
+static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *);
+static void inotify_watch_delete(inotify_watch_t *, uint32_t);
+static void inotify_watch_remove(inotify_state_t *state,
+ inotify_watch_t *watch);
+
+static int
+inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset,
+ cred_t *cr, caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) {
+ inotify_watch_event(watch, flag & FWRITE ?
+ IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL);
+ }
+
+ return (rval);
+}
+
+static int
+inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl,
+ int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
+ vsecattr_t *vsecp)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_create(vf, name, vap, excl, mode,
+ vpp, cr, flag, ct, vsecp)) == 0) {
+ inotify_watch_insert(watch, *vpp, name);
+ inotify_watch_event(watch, IN_CREATE, name);
+ }
+
+ return (rval);
+}
+
+static int
+inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) {
+ inotify_watch_insert(watch, svp, tnm);
+ inotify_watch_event(watch, IN_CREATE, tnm);
+ }
+
+ return (rval);
+}
+
+static int
+inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp,
+ cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_mkdir(vf, name, vap, vpp, cr,
+ ct, flags, vsecp)) == 0) {
+ inotify_watch_insert(watch, *vpp, name);
+ inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name);
+ }
+
+ return (rval);
+}
+
+static int
+inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_open(vf, mode, cr, ct)) == 0)
+ inotify_watch_event(watch, IN_OPEN, NULL);
+
+ return (rval);
+}
+
+static int
+inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
+ caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval = vnext_read(vf, uiop, ioflag, cr, ct);
+ inotify_watch_event(watch, IN_ACCESS, NULL);
+
+ return (rval);
+}
+
+static int
+inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags);
+ inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL);
+
+ return (rval);
+}
+
+int
+inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0)
+ inotify_watch_event(watch, IN_DELETE, nm);
+
+ return (rval);
+}
+
+int
+inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0)
+ inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm);
+
+ return (rval);
+}
+
+static int
+inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_setattr(vf, vap, flags, cr, ct)) == 0)
+ inotify_watch_event(watch, IN_ATTRIB, NULL);
+
+ return (rval);
+}
+
+static int
+inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
+ caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval = vnext_write(vf, uiop, ioflag, cr, ct);
+ inotify_watch_event(watch, IN_MODIFY, NULL);
+
+ return (rval);
+}
+
+static int
+inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name,
+ caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+
+ switch (vnevent) {
+ case VE_RENAME_SRC:
+ inotify_watch_event(watch, IN_MOVE_SELF, NULL);
+ inotify_watch_delete(watch, IN_MOVE_SELF);
+ break;
+ case VE_REMOVE:
+ /*
+ * Linux will apparently fire an IN_ATTRIB event when the link
+ * count changes (including when it drops to 0 on a remove).
+ * This is merely somewhat odd; what is amazing is that this
+ * IN_ATTRIB event is not visible on an inotify watch on the
+ * parent directory. (IN_ATTRIB events are normally sent to
+ * watches on the parent directory). While it's hard to
+ * believe that this constitutes desired semantics, ltp
+ * unfortunately tests this case (if implicitly); in the name
+ * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are
+ * explicitly watching the file that has been removed.
+ */
+ if (watch->inw_parent == NULL)
+ inotify_watch_event(watch, IN_ATTRIB, NULL);
+
+ /*FALLTHROUGH*/
+ case VE_RENAME_DEST:
+ inotify_watch_event(watch, IN_DELETE_SELF, NULL);
+ inotify_watch_delete(watch, IN_DELETE_SELF);
+ break;
+ case VE_RMDIR:
+ /*
+ * It seems that IN_ISDIR should really be OR'd in here, but
+ * Linux doesn't seem to do that in this case; for the sake of
+ * bug-for-bug compatibility, we don't do it either.
+ */
+ inotify_watch_event(watch, IN_DELETE_SELF, NULL);
+ inotify_watch_delete(watch, IN_DELETE_SELF);
+ break;
+ case VE_CREATE:
+ case VE_TRUNCATE:
+ case VE_RESIZE:
+ inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL);
+ break;
+ case VE_LINK:
+ inotify_watch_event(watch, IN_ATTRIB, NULL);
+ break;
+ case VE_RENAME_SRC_DIR:
+ inotify_watch_event(watch, IN_MOVED_FROM, name);
+ break;
+ case VE_RENAME_DEST_DIR:
+ if (name == NULL)
+ name = dvp->v_path;
+
+ inotify_watch_insert(watch, dvp, name);
+ inotify_watch_event(watch, IN_MOVED_TO, name);
+ break;
+ case VE_SUPPORT:
+ case VE_MOUNTEDOVER:
+ case VE_PRE_RENAME_SRC:
+ case VE_PRE_RENAME_DEST:
+ case VE_PRE_RENAME_DEST_DIR:
+ break;
+ }
+
+ return (vnext_vnevent(vf, vnevent, dvp, name, ct));
+}
+
+const fs_operation_def_t inotify_vnodesrc_template[] = {
+ VOPNAME_CLOSE, { .femop_close = inotify_fop_close },
+ VOPNAME_CREATE, { .femop_create = inotify_fop_create },
+ VOPNAME_LINK, { .femop_link = inotify_fop_link },
+ VOPNAME_MKDIR, { .femop_mkdir = inotify_fop_mkdir },
+ VOPNAME_OPEN, { .femop_open = inotify_fop_open },
+ VOPNAME_READ, { .femop_read = inotify_fop_read },
+ VOPNAME_READDIR, { .femop_readdir = inotify_fop_readdir },
+ VOPNAME_REMOVE, { .femop_remove = inotify_fop_remove },
+ VOPNAME_RMDIR, { .femop_rmdir = inotify_fop_rmdir },
+ VOPNAME_SETATTR, { .femop_setattr = inotify_fop_setattr },
+ VOPNAME_WRITE, { .femop_write = inotify_fop_write },
+ VOPNAME_VNEVENT, { .femop_vnevent = inotify_fop_vnevent },
+ NULL, NULL
+};
+
+static int
+inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs)
+{
+ if (lhs->inw_wd < rhs->inw_wd)
+ return (-1);
+
+ if (lhs->inw_wd > rhs->inw_wd)
+ return (1);
+
+ return (0);
+}
+
+static int
+inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs)
+{
+ uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp;
+
+ if (lvp < rvp)
+ return (-1);
+
+ if (lvp > rvp)
+ return (1);
+
+ return (0);
+}
+
+static void
+inotify_watch_hold(inotify_watch_t *watch)
+{
+ mutex_enter(&watch->inw_lock);
+ VERIFY(watch->inw_refcnt > 0);
+ watch->inw_refcnt++;
+ mutex_exit(&watch->inw_lock);
+}
+
+static void
+inotify_watch_release(inotify_watch_t *watch)
+{
+ mutex_enter(&watch->inw_lock);
+ VERIFY(watch->inw_refcnt > 1);
+
+ if (--watch->inw_refcnt == 1 && watch->inw_zombie) {
+ /*
+ * We're down to our last reference; kick anyone that might be
+ * waiting.
+ */
+ cv_signal(&watch->inw_cv);
+ }
+
+ mutex_exit(&watch->inw_lock);
+}
+
+static void
+inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name)
+{
+ inotify_kevent_t *event, *tail;
+ inotify_state_t *state = watch->inw_state;
+ uint32_t wd = watch->inw_wd, cookie = 0, len;
+ boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE;
+ inotify_watch_t *source = watch;
+
+ if (!(mask &= watch->inw_mask) || mask == IN_ISDIR)
+ return;
+
+ if (watch->inw_parent != NULL) {
+ /*
+ * This is an event on the child; if this isn't a valid child
+ * event, return. Otherwise, we move our watch to be our
+ * parent (which we know is around because we have a hold on
+ * it) and continue.
+ */
+ if (!(mask & IN_CHILD_EVENTS))
+ return;
+
+ name = watch->inw_name;
+ watch = watch->inw_parent;
+ wd = watch->inw_wd;
+ }
+
+ if (!removal) {
+ mutex_enter(&state->ins_lock);
+
+ if (watch->inw_zombie ||
+ watch->inw_fired || !watch->inw_active) {
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+ } else {
+ if (!watch->inw_active)
+ return;
+
+ VERIFY(MUTEX_HELD(&state->ins_lock));
+ }
+
+ /*
+ * If this is an operation on a directory and it's a child event
+ * (even if it's not on a child), we specify IN_ISDIR.
+ */
+ if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS))
+ mask |= IN_ISDIR;
+
+ if (mask & (IN_MOVED_FROM | IN_MOVED_TO))
+ cookie = (uint32_t)curthread->t_did;
+
+ if (name != NULL) {
+ /*
+ * We are in the context of a file event monitoring operation,
+ * so the name length is bounded by the kernel.
+ */
+ len = strlen(name) + 1;
+ len = roundup(len, sizeof (struct inotify_event));
+ } else {
+ len = 0;
+ }
+
+ if (state->ins_nevents >= state->ins_maxevents) {
+ /*
+ * We're at our maximum number of events -- turn our event
+ * into an IN_Q_OVERFLOW event, which will be coalesced if
+ * it's already the tail event. We clear 'len' and 'name'
+ * here so that the overflow event carries no name payload.
+ */
+ mask = IN_Q_OVERFLOW;
+ wd = (uint32_t)-1;
+ cookie = 0;
+ len = 0;
+ name = NULL;
+ }
+
+ if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd &&
+ tail->ine_event.mask == mask && tail->ine_event.cookie == cookie &&
+ ((tail->ine_event.len == 0 && len == 0) ||
+ (name != NULL && tail->ine_event.len != 0 &&
+ strcmp(tail->ine_event.name, name) == 0))) {
+ /*
+ * This is an implicitly coalesced event; we're done.
+ */
+ if (!removal)
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP);
+ event->ine_event.wd = wd;
+ event->ine_event.mask = (uint32_t)mask;
+ event->ine_event.cookie = cookie;
+ event->ine_event.len = len;
+
+ if (name != NULL)
+ (void) strcpy(event->ine_event.name, name);
+
+ if (tail != NULL) {
+ tail->ine_next = event;
+ } else {
+ VERIFY(state->ins_head == NULL);
+ state->ins_head = event;
+ cv_broadcast(&state->ins_cv);
+ }
+
+ state->ins_tail = event;
+ state->ins_nevents++;
+ state->ins_size += sizeof (event->ine_event) + len;
+
+ if (removal)
+ return;
+
+ if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) {
+ /*
+ * If this is a one-shot, we need to remove the watch. (Note
+ * that this will recurse back into inotify_watch_event() to
+ * fire the IN_IGNORED event -- but with "removal" set.)
+ */
+ watch->inw_fired = 1;
+ inotify_watch_remove(state, watch);
+ }
+
+ mutex_exit(&state->ins_lock);
+ pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN);
+}
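+
+/*
+ * An illustrative sequence: two back-to-back writes to the same
+ * watched file each arrive in inotify_watch_event() as IN_MODIFY with
+ * the same wd, no cookie and no name; the second matches the tail
+ * event and is implicitly coalesced, enqueuing nothing.
+ */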
+
+/*
+ * Destroy a watch. By the time we're in here, the watch must have exactly
+ * one reference.
+ */
+static void
+inotify_watch_destroy(inotify_watch_t *watch)
+{
+ VERIFY(MUTEX_HELD(&watch->inw_lock));
+
+ if (watch->inw_name != NULL)
+ kmem_free(watch->inw_name, strlen(watch->inw_name) + 1);
+
+ kmem_free(watch, sizeof (inotify_watch_t));
+}
+
+static int
+inotify_fem_install(vnode_t *vp, inotify_watch_t *watch)
+{
+ /*
+ * For vnodes that are devices (of type VCHR or VBLK), we silently
+ * refuse to actually install any event monitor. This is to avoid
+ * single-thread deadlock when both a special device vnode and its
+ * underlying real vnode are being watched: releasing the device
+ * vnode upon watch removal can induce an attribute update on the
+ * underlying vnode, which will bring us into inotify_watch_event()
+ * with our lock already held. While we could fail earlier and more
+ * explicitly in this case, we choose to keep with the Linux behavior
+ * on unwatchable entities and allow the watch but not generate any
+ * events for it.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK)
+ return (0);
+
+ return (fem_install(vp, inotify_femp, watch, OPARGUNIQ,
+ (void (*)(void *))inotify_watch_hold,
+ (void (*)(void *))inotify_watch_release));
+}
+
+static int
+inotify_fem_uninstall(vnode_t *vp, inotify_watch_t *watch)
+{
+ /*
+ * See inotify_fem_install(), above, for our rationale here.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK)
+ return (0);
+
+ return (fem_uninstall(vp, inotify_femp, watch));
+}
+
+/*
+ * Zombify a watch. By the time we come in here, it must be true that the
+ * watch has already been fem_uninstall()'d -- the only reference should be
+ * in the state's data structure. If we can get away with freeing it, we'll
+ * do that -- but if the reference count is greater than one due to an active
+ * vnode operation, we'll put this watch on the zombie list on the state
+ * structure.
+ */
+static void
+inotify_watch_zombify(inotify_watch_t *watch)
+{
+ inotify_state_t *state = watch->inw_state;
+
+ VERIFY(MUTEX_HELD(&state->ins_lock));
+ VERIFY(!watch->inw_zombie);
+
+ watch->inw_zombie = 1;
+
+ if (watch->inw_parent != NULL) {
+ inotify_watch_release(watch->inw_parent);
+ } else {
+ avl_remove(&state->ins_byvp, watch);
+ avl_remove(&state->ins_bywd, watch);
+ vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1);
+ watch->inw_wd = -1;
+ }
+
+ mutex_enter(&watch->inw_lock);
+
+ if (watch->inw_refcnt == 1) {
+ /*
+ * There are no operations in flight and there is no way
+ * for anyone to discover this watch -- we can destroy it.
+ */
+ inotify_watch_destroy(watch);
+ } else {
+ /*
+ * There are operations in flight; we will need to enqueue
+ * this for later destruction.
+ */
+ watch->inw_parent = state->ins_zombies;
+ state->ins_zombies = watch;
+ mutex_exit(&watch->inw_lock);
+ }
+}
+
+static inotify_watch_t *
+inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent,
+ const char *name, vnode_t *vp, uint32_t mask)
+{
+ inotify_watch_t *watch;
+ int err;
+
+ VERIFY(MUTEX_HELD(&state->ins_lock));
+
+ watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP);
+
+ watch->inw_vp = vp;
+ watch->inw_mask = mask;
+ watch->inw_state = state;
+ watch->inw_refcnt = 1;
+
+ if (parent == NULL) {
+ watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds,
+ 1, VM_BESTFIT | VM_SLEEP);
+ avl_add(&state->ins_byvp, watch);
+ avl_add(&state->ins_bywd, watch);
+
+ avl_create(&watch->inw_children,
+ (int(*)(const void *, const void *))inotify_watch_cmpvp,
+ sizeof (inotify_watch_t),
+ offsetof(inotify_watch_t, inw_byvp));
+ } else {
+ VERIFY(name != NULL);
+ inotify_watch_hold(parent);
+ watch->inw_mask &= IN_CHILD_EVENTS;
+ watch->inw_parent = parent;
+
+ /*
+ * Copy the name. Note that when the name is user-specified,
+ * its length is bounded by the copyinstr() to be MAXPATHLEN
+ * (and regardless, we know by this point that it exists in
+ * our parent).
+ */
+ watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
+ (void) strcpy(watch->inw_name, name);
+
+ avl_add(&parent->inw_children, watch);
+ }
+
+ /*
+ * Add our monitor to the vnode. We must not have the watch lock held
+ * when we do this, as it will immediately hold our watch.
+ */
+ err = inotify_fem_install(vp, watch);
+
+ VERIFY(err == 0);
+
+ return (watch);
+}
+
+/*
+ * Remove a (non-child) watch. This is called from either synchronous context
+ * via inotify_rm_watch() or monitor context via either a vnevent or a
+ * one-shot.
+ */
+static void
+inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch)
+{
+ inotify_watch_t *child;
+ int err;
+
+ VERIFY(MUTEX_HELD(&state->ins_lock));
+ VERIFY(watch->inw_parent == NULL);
+
+ err = inotify_fem_uninstall(watch->inw_vp, watch);
+ VERIFY(err == 0);
+
+ /*
+ * If we have children, we're going to remove them all and set them
+ * all to be zombies.
+ */
+ while ((child = avl_first(&watch->inw_children)) != NULL) {
+ VERIFY(child->inw_parent == watch);
+ avl_remove(&watch->inw_children, child);
+
+ err = inotify_fem_uninstall(child->inw_vp, child);
+ VERIFY(err == 0);
+
+ /*
+ * If this child watch has been orphaned, remove it from the
+ * state's list of orphans.
+ */
+ if (child->inw_orphaned) {
+ list_remove(&state->ins_orphans, child);
+ crfree(child->inw_cred);
+ }
+
+ VN_RELE(child->inw_vp);
+
+ /*
+ * We're down (or should be down) to a single reference to
+ * this child watch; it's safe to zombify it.
+ */
+ inotify_watch_zombify(child);
+ }
+
+ inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL);
+ VN_RELE(watch->inw_vp);
+
+ /*
+ * It's now safe to zombify the watch -- we know that the only reference
+ * can come from operations in flight.
+ */
+ inotify_watch_zombify(watch);
+}
+
+/*
+ * Delete a watch. Should only be called from VOP context.
+ */
+static void
+inotify_watch_delete(inotify_watch_t *watch, uint32_t event)
+{
+ inotify_state_t *state = watch->inw_state;
+ inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent;
+ int err;
+
+ if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS))
+ return;
+
+ mutex_enter(&state->ins_lock);
+
+ if (watch->inw_zombie) {
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ if ((parent = watch->inw_parent) == NULL) {
+ if (event == IN_DELETE_SELF) {
+ /*
+ * If we're here because we're being deleted and we
+ * are not a child watch, we need to delete the entire
+ * watch, children and all.
+ */
+ inotify_watch_remove(state, watch);
+ }
+
+ mutex_exit(&state->ins_lock);
+ return;
+ } else {
+ if (event == IN_DELETE_SELF &&
+ !(parent->inw_mask & IN_EXCL_UNLINK)) {
+ /*
+ * This is a child watch for a file that is being
+ * removed and IN_EXCL_UNLINK has not been specified;
+ * indicate that it is orphaned and add it to the list
+ * of orphans. (This list will be checked by the
+ * cleaning cyclic to determine when the watch has
+ * become the only hold on the vnode, at which point
+ * the watch can be zombified.) Note that we check
+ * if the watch is orphaned before we orphan it: hard
+ * links make it possible for VE_REMOVE to be called
+ * multiple times on the same vnode. (!)
+ */
+ if (!watch->inw_orphaned) {
+ watch->inw_orphaned = 1;
+ watch->inw_cred = CRED();
+ crhold(watch->inw_cred);
+ list_insert_head(&state->ins_orphans, watch);
+ }
+
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ if (watch->inw_orphaned) {
+ /*
+ * If we're here, a file was orphaned and then later
+ * moved -- which almost certainly means that hard
+ * links are on the scene. We choose the orphan over
+ * the move because we don't want to spuriously
+ * drop events if we can avoid it.
+ */
+ crfree(watch->inw_cred);
+ list_remove(&state->ins_orphans, watch);
+ }
+ }
+
+ if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) {
+ /*
+ * This watch has already been deleted from the parent.
+ */
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ avl_remove(&parent->inw_children, watch);
+ err = inotify_fem_uninstall(watch->inw_vp, watch);
+ VERIFY(err == 0);
+
+ VN_RELE(watch->inw_vp);
+
+ /*
+ * It's now safe to zombify the watch -- which won't actually delete
+ * it as we know that the reference count is greater than 1.
+ */
+ inotify_watch_zombify(watch);
+ mutex_exit(&state->ins_lock);
+}
+
+/*
+ * Insert a new child watch. Should only be called from VOP context when
+ * a child is created in a watched directory.
+ */
+static void
+inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name)
+{
+ inotify_state_t *state = watch->inw_state;
+ inotify_watch_t cmp = { .inw_vp = vp };
+
+ if (!(watch->inw_mask & IN_CHILD_EVENTS))
+ return;
+
+ mutex_enter(&state->ins_lock);
+
+ if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) {
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ VN_HOLD(vp);
+ watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask);
+ VERIFY(watch != NULL);
+
+ mutex_exit(&state->ins_lock);
+}
+
+
+static int
+inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask,
+ int32_t *wdp)
+{
+ inotify_watch_t *watch, cmp = { .inw_vp = vp };
+ uint32_t set;
+
+ set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE;
+
+ /*
+ * Lookup our vnode to determine if we already have a watch on it.
+ */
+ mutex_enter(&state->ins_lock);
+
+ if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
+ /*
+ * We don't have this watch; allocate a new one, provided that
+ * we have fewer than our limit.
+ */
+ if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) {
+ mutex_exit(&state->ins_lock);
+ return (ENOSPC);
+ }
+
+ VN_HOLD(vp);
+ watch = inotify_watch_add(state, NULL, NULL, vp, set);
+ *wdp = watch->inw_wd;
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+ }
+
+ VERIFY(!watch->inw_zombie);
+
+ if (!(mask & IN_MASK_ADD)) {
+ /*
+ * Note that if we're resetting our event mask and we're
+ * transitioning from an event mask that includes child events
+ * to one that doesn't, there will be potentially some stale
+ * child watches. This is basically fine: they won't fire,
+ * and they will correctly be removed when the watch is
+ * removed.
+ */
+ watch->inw_mask = 0;
+ }
+
+ watch->inw_mask |= set;
+
+ *wdp = watch->inw_wd;
+
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+}
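+
+/*
+ * An example of the mask handling above (masks chosen arbitrarily):
+ * adding a watch with IN_CREATE and later re-adding it with IN_DELETE
+ * alone leaves only IN_DELETE (plus IN_UNMASKABLE) in effect, whereas
+ * re-adding with IN_DELETE | IN_MASK_ADD yields IN_CREATE | IN_DELETE.
+ */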
+
+static int
+inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name)
+{
+ inotify_watch_t *watch, cmp = { .inw_vp = vp };
+ vnode_t *cvp;
+ int err;
+
+ /*
+ * Verify that the specified child doesn't have a directory component
+ * within it.
+ */
+ if (strchr(name, '/') != NULL)
+ return (EINVAL);
+
+ /*
+ * Lookup the underlying file. Note that this will succeed even if
+ * we don't have permissions to actually read the file.
+ */
+ if ((err = lookupnameat(name,
+ UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) {
+ return (err);
+ }
+
+ /*
+ * Use our vnode to find our watch, and then add our child watch to it.
+ */
+ mutex_enter(&state->ins_lock);
+
+ if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
+ /*
+ * This is unexpected -- it means that we don't have the
+ * watch that we thought we had.
+ */
+ mutex_exit(&state->ins_lock);
+ VN_RELE(cvp);
+ return (ENXIO);
+ }
+
+ /*
+ * Now lookup the child vnode in the watch; we'll only add it if it
+ * isn't already there.
+ */
+ cmp.inw_vp = cvp;
+
+ if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
+ mutex_exit(&state->ins_lock);
+ VN_RELE(cvp);
+ return (0);
+ }
+
+ watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask);
+ VERIFY(watch != NULL);
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+}
+
+static int
+inotify_rm_watch(inotify_state_t *state, int32_t wd)
+{
+ inotify_watch_t *watch, cmp = { .inw_wd = wd };
+
+ mutex_enter(&state->ins_lock);
+
+ if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
+ mutex_exit(&state->ins_lock);
+ return (EINVAL);
+ }
+
+ inotify_watch_remove(state, watch);
+ mutex_exit(&state->ins_lock);
+
+ /*
+ * Because removing a watch will generate an IN_IGNORED event (and
+ * because inotify_watch_remove() won't alone induce a pollwakeup()),
+ * we need to explicitly issue a pollwakeup().
+ */
+ pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN);
+
+ return (0);
+}
+
+static int
+inotify_activate(inotify_state_t *state, int32_t wd)
+{
+ inotify_watch_t *watch, cmp = { .inw_wd = wd };
+
+ mutex_enter(&state->ins_lock);
+
+ if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
+ mutex_exit(&state->ins_lock);
+ return (EINVAL);
+ }
+
+ watch->inw_active = 1;
+
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+}
+
+/*
+ * Called periodically as a cyclic to process the orphans and zombies.
+ */
+static void
+inotify_clean(void *arg)
+{
+ inotify_state_t *state = arg;
+ inotify_watch_t *watch, *parent, *next, **prev;
+ cred_t *savecred;
+ int err;
+
+ mutex_enter(&state->ins_lock);
+
+ for (watch = list_head(&state->ins_orphans);
+ watch != NULL; watch = next) {
+ next = list_next(&state->ins_orphans, watch);
+
+ VERIFY(!watch->inw_zombie);
+ VERIFY((parent = watch->inw_parent) != NULL);
+
+ if (watch->inw_vp->v_count > 1)
+ continue;
+
+ avl_remove(&parent->inw_children, watch);
+ err = inotify_fem_uninstall(watch->inw_vp, watch);
+ VERIFY(err == 0);
+
+ list_remove(&state->ins_orphans, watch);
+
+ /*
+ * For purposes of releasing the vnode, we need to switch our
+ * cred to be the cred of the orphaning thread (which we held
+ * at the time this watch was orphaned).
+ */
+ savecred = curthread->t_cred;
+ curthread->t_cred = watch->inw_cred;
+ VN_RELE(watch->inw_vp);
+ crfree(watch->inw_cred);
+ curthread->t_cred = savecred;
+
+ inotify_watch_zombify(watch);
+ }
+
+ prev = &state->ins_zombies;
+
+ while ((watch = *prev) != NULL) {
+ mutex_enter(&watch->inw_lock);
+
+ if (watch->inw_refcnt == 1) {
+ *prev = watch->inw_parent;
+ inotify_watch_destroy(watch);
+ continue;
+ }
+
+ prev = &watch->inw_parent;
+ mutex_exit(&watch->inw_lock);
+ }
+
+ mutex_exit(&state->ins_lock);
+}
+
+/*ARGSUSED*/
+static int
+inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
+{
+ inotify_state_t *state;
+ major_t major = getemajor(*devp);
+ minor_t minor = getminor(*devp);
+ int instances = 0;
+ char c[64];
+
+ if (minor != INOTIFYMNRN_INOTIFY)
+ return (ENXIO);
+
+ mutex_enter(&inotify_lock);
+
+ for (state = inotify_state; state != NULL; state = state->ins_next) {
+ if (state->ins_cred == cred_p)
+ instances++;
+ }
+
+ if (instances >= inotify_maxinstances) {
+ mutex_exit(&inotify_lock);
+ return (EMFILE);
+ }
+
+ minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1,
+ VM_BESTFIT | VM_SLEEP);
+
+ if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) {
+ vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);
+ mutex_exit(&inotify_lock);
+ return (ENOMEM);
+ }
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+ *devp = makedevice(major, minor);
+
+ crhold(cred_p);
+ state->ins_cred = cred_p;
+ state->ins_next = inotify_state;
+ inotify_state = state;
+
+ (void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor);
+ state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1,
+ NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
+
+ avl_create(&state->ins_bywd,
+ (int(*)(const void *, const void *))inotify_watch_cmpwd,
+ sizeof (inotify_watch_t),
+ offsetof(inotify_watch_t, inw_bywd));
+
+ avl_create(&state->ins_byvp,
+ (int(*)(const void *, const void *))inotify_watch_cmpvp,
+ sizeof (inotify_watch_t),
+ offsetof(inotify_watch_t, inw_byvp));
+
+ list_create(&state->ins_orphans, sizeof (inotify_watch_t),
+ offsetof(inotify_watch_t, inw_orphan));
+
+ state->ins_maxwatches = inotify_maxwatches;
+ state->ins_maxevents = inotify_maxevents;
+
+ mutex_exit(&inotify_lock);
+
+ state->ins_cleaner = ddi_periodic_add(inotify_clean,
+ state, NANOSEC, DDI_IPL_0);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_read(dev_t dev, uio_t *uio, cred_t *cr)
+{
+ inotify_state_t *state;
+ inotify_kevent_t *event;
+ minor_t minor = getminor(dev);
+ int err = 0, nevents = 0;
+ size_t len;
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+
+ mutex_enter(&state->ins_lock);
+
+ while (state->ins_head == NULL) {
+ if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
+ mutex_exit(&state->ins_lock);
+ return (EAGAIN);
+ }
+
+ if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) {
+ mutex_exit(&state->ins_lock);
+ return (EINTR);
+ }
+ }
+
+ /*
+ * We have events and we have our lock; return as many as we can.
+ */
+ while ((event = state->ins_head) != NULL) {
+ len = sizeof (event->ine_event) + event->ine_event.len;
+
+ if (uio->uio_resid < len) {
+ if (nevents == 0)
+ err = EINVAL;
+ break;
+ }
+
+ nevents++;
+
+ if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0)
+ break;
+
+ VERIFY(state->ins_nevents > 0);
+ state->ins_nevents--;
+
+ VERIFY(state->ins_size > 0);
+ state->ins_size -= len;
+
+ if ((state->ins_head = event->ine_next) == NULL) {
+ VERIFY(event == state->ins_tail);
+ VERIFY(state->ins_nevents == 0);
+ state->ins_tail = NULL;
+ }
+
+ kmem_free(event, INOTIFY_EVENT_LENGTH(event));
+ }
+
+ mutex_exit(&state->ins_lock);
+
+ return (err);
+}
+
+static int
+inotify_poll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ inotify_state_t *state;
+ minor_t minor = getminor(dev);
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+
+ mutex_enter(&state->ins_lock);
+
+ if (state->ins_head != NULL) {
+ *reventsp = events & (POLLRDNORM | POLLIN);
+ } else {
+ *reventsp = 0;
+ }
+
+ if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
+ *phpp = &state->ins_pollhd;
+ }
+
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
+{
+ inotify_state_t *state;
+ minor_t minor = getminor(dev);
+ file_t *fp;
+ int rval;
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+
+ switch (cmd) {
+ case INOTIFYIOC_ADD_WATCH: {
+ inotify_addwatch_t addwatch;
+
+ if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0)
+ return (EFAULT);
+
+ if ((fp = getf(addwatch.inaw_fd)) == NULL)
+ return (EBADF);
+
+ rval = inotify_add_watch(state, fp->f_vnode,
+ addwatch.inaw_mask, rv);
+
+ releasef(addwatch.inaw_fd);
+ return (rval);
+ }
+
+ case INOTIFYIOC_ADD_CHILD: {
+ inotify_addchild_t addchild;
+ char name[MAXPATHLEN];
+
+ if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0)
+ return (EFAULT);
+
+ if (copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0)
+ return (EFAULT);
+
+ if ((fp = getf(addchild.inac_fd)) == NULL)
+ return (EBADF);
+
+ rval = inotify_add_child(state, fp->f_vnode, name);
+
+ releasef(addchild.inac_fd);
+ return (rval);
+ }
+
+ case INOTIFYIOC_RM_WATCH:
+ return (inotify_rm_watch(state, arg));
+
+ case INOTIFYIOC_ACTIVATE:
+ return (inotify_activate(state, arg));
+
+ case FIONREAD: {
+ int32_t size;
+
+ mutex_enter(&state->ins_lock);
+ size = state->ins_size;
+ mutex_exit(&state->ins_lock);
+
+ if (copyout(&size, (void *)arg, sizeof (size)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ default:
+ break;
+ }
+
+ return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
+{
+ inotify_state_t *state, **sp;
+ inotify_watch_t *watch, *zombies;
+ inotify_kevent_t *event;
+ minor_t minor = getminor(dev);
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+
+ if (state->ins_pollhd.ph_list != NULL) {
+ pollwakeup(&state->ins_pollhd, POLLERR);
+ pollhead_clean(&state->ins_pollhd);
+ }
+
+ mutex_enter(&state->ins_lock);
+
+ /*
+ * First, destroy all of our watches.
+ */
+ while ((watch = avl_first(&state->ins_bywd)) != NULL)
+ inotify_watch_remove(state, watch);
+
+ /*
+ * And now destroy our event queue.
+ */
+ while ((event = state->ins_head) != NULL) {
+ state->ins_head = event->ine_next;
+ kmem_free(event, INOTIFY_EVENT_LENGTH(event));
+ }
+
+ zombies = state->ins_zombies;
+ state->ins_zombies = NULL;
+ mutex_exit(&state->ins_lock);
+
+ /*
+ * Now that our state lock is dropped, we can synchronously wait on
+ * any zombies.
+ */
+ while ((watch = zombies) != NULL) {
+ zombies = zombies->inw_parent;
+
+ mutex_enter(&watch->inw_lock);
+
+ while (watch->inw_refcnt > 1)
+ cv_wait(&watch->inw_cv, &watch->inw_lock);
+
+ inotify_watch_destroy(watch);
+ }
+
+ if (state->ins_cleaner != NULL) {
+ ddi_periodic_delete(state->ins_cleaner);
+ state->ins_cleaner = NULL;
+ }
+
+ mutex_enter(&inotify_lock);
+
+ /*
+ * Remove our state from our global list, and release our hold on
+ * the cred.
+ */
+ for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next))
+ VERIFY(*sp != NULL);
+
+ *sp = (*sp)->ins_next;
+ crfree(state->ins_cred);
+ vmem_destroy(state->ins_wds);
+
+ ddi_soft_state_free(inotify_softstate, minor);
+ vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);
+
+ mutex_exit(&inotify_lock);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+{
+ mutex_enter(&inotify_lock);
+
+ if (ddi_soft_state_init(&inotify_softstate,
+ sizeof (inotify_state_t), 0) != 0) {
+ cmn_err(CE_NOTE, "/dev/inotify failed to create soft state");
+ mutex_exit(&inotify_lock);
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_create_minor_node(devi, "inotify", S_IFCHR,
+ INOTIFYMNRN_INOTIFY, DDI_PSEUDO, NULL) == DDI_FAILURE) {
+ cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node");
+ ddi_soft_state_fini(&inotify_softstate);
+ mutex_exit(&inotify_lock);
+ return (DDI_FAILURE);
+ }
+
+ if (fem_create("inotify_fem",
+ inotify_vnodesrc_template, &inotify_femp) != 0) {
+ cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state");
+ ddi_remove_minor_node(devi, NULL);
+ ddi_soft_state_fini(&inotify_softstate);
+ mutex_exit(&inotify_lock);
+ return (DDI_FAILURE);
+ }
+
+ ddi_report_dev(devi);
+ inotify_devi = devi;
+
+ inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE,
+ UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0,
+ VM_SLEEP | VMC_IDENTIFIER);
+
+ mutex_exit(&inotify_lock);
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ switch (cmd) {
+ case DDI_DETACH:
+ break;
+
+ case DDI_SUSPEND:
+ return (DDI_SUCCESS);
+
+ default:
+ return (DDI_FAILURE);
+ }
+
+ mutex_enter(&inotify_lock);
+ fem_free(inotify_femp);
+ vmem_destroy(inotify_minor);
+
+ ddi_remove_minor_node(inotify_devi, NULL);
+ inotify_devi = NULL;
+
+ ddi_soft_state_fini(&inotify_softstate);
+ mutex_exit(&inotify_lock);
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ int error;
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = (void *)inotify_devi;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ }
+ return (error);
+}
+
+static struct cb_ops inotify_cb_ops = {
+ inotify_open, /* open */
+ inotify_close, /* close */
+ nulldev, /* strategy */
+ nulldev, /* print */
+ nodev, /* dump */
+ inotify_read, /* read */
+ nodev, /* write */
+ inotify_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ inotify_poll, /* poll */
+ ddi_prop_op, /* cb_prop_op */
+ 0, /* streamtab */
+ D_NEW | D_MP /* Driver compatibility flag */
+};
+
+static struct dev_ops inotify_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* refcnt */
+ inotify_info, /* get_dev_info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ inotify_attach, /* attach */
+ inotify_detach, /* detach */
+ nodev, /* reset */
+ &inotify_cb_ops, /* driver operations */
+ NULL, /* bus operations */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed, /* quiesce */
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops, /* module type (this is a pseudo driver) */
+ "inotify support", /* name of module */
+ &inotify_ops, /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ (void *)&modldrv,
+ NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
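/*
 * Taken together, the ioctl interface above yields a usage pattern like
 * the sketch below. This is illustrative only: the lx brand emulation
 * normally performs these steps on behalf of a Linux process, and the
 * IN_* mask value and the header providing inotify_addwatch_t and the
 * INOTIFYIOC_* commands are assumptions here. Error-path cleanup is
 * elided.
 */
#include <sys/ioctl.h>
#include <sys/inotify.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
add_watch(const char *path, uint32_t mask)
{
	inotify_addwatch_t aw;
	int ifd, tfd, wd, pending;

	if ((ifd = open("/dev/inotify", O_RDWR)) < 0)
		return (-1);

	if ((tfd = open(path, O_RDONLY)) < 0)
		return (-1);

	aw.inaw_fd = tfd;	/* fd of the vnode to watch */
	aw.inaw_mask = mask;	/* IN_* event mask */

	/* on success the ioctl returns the new watch descriptor */
	if ((wd = ioctl(ifd, INOTIFYIOC_ADD_WATCH, &aw)) < 0)
		return (-1);

	(void) close(tfd);	/* the driver holds the vnode, not the fd */

	/* FIONREAD reports the byte size of the pending event queue */
	if (ioctl(ifd, FIONREAD, &pending) == 0)
		(void) printf("%d bytes of events pending\n", pending);

	return (wd);
}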
diff --git a/usr/src/uts/common/io/inotify.conf b/usr/src/uts/common/io/inotify.conf
new file mode 100644
index 0000000000..ce9da6180f
--- /dev/null
+++ b/usr/src/uts/common/io/inotify.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+name="inotify" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c
index c10e23a8a6..cde57df235 100644
--- a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c
+++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c
@@ -996,17 +996,20 @@ static s32 ixgbe_clear_vmdq_82598(struct ixgbe_hw *hw, u32 rar, u32 vmdq)
* @vlan: VLAN id to write to VLAN filter
* @vind: VMDq output index that maps queue to VLAN id in VFTA
* @vlan_on: boolean flag to turn on/off VLAN in VFTA
+ * @vlvf_bypass: boolean flag - unused
*
* Turn on/off specified VLAN in the VLAN filter table.
**/
s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind,
- bool vlan_on)
+ bool vlan_on, bool vlvf_bypass)
{
u32 regindex;
u32 bitindex;
u32 bits;
u32 vftabyte;
+ UNREFERENCED_1PARAMETER(vlvf_bypass);
+
DEBUGFUNC("ixgbe_set_vfta_82598");
if (vlan > 4095)
diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h
index d2241c70cd..c32672187a 100644
--- a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h
+++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h
@@ -40,7 +40,8 @@ s32 ixgbe_fc_enable_82598(struct ixgbe_hw *hw);
s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw);
void ixgbe_enable_relaxed_ordering_82598(struct ixgbe_hw *hw);
s32 ixgbe_set_vmdq_82598(struct ixgbe_hw *hw, u32 rar, u32 vmdq);
-s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on);
+s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on,
+ bool vlvf_bypass);
s32 ixgbe_read_analog_reg8_82598(struct ixgbe_hw *hw, u32 reg, u8 *val);
s32 ixgbe_write_analog_reg8_82598(struct ixgbe_hw *hw, u32 reg, u8 val);
s32 ixgbe_read_i2c_eeprom_82598(struct ixgbe_hw *hw, u8 byte_offset,
diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c
index 894d0b2ac9..c550982710 100644
--- a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c
+++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c
@@ -1057,33 +1057,38 @@ s32 ixgbe_clear_vfta(struct ixgbe_hw *hw)
* ixgbe_set_vfta - Set VLAN filter table
* @hw: pointer to hardware structure
* @vlan: VLAN id to write to VLAN filter
- * @vind: VMDq output index that maps queue to VLAN id in VFTA
- * @vlan_on: boolean flag to turn on/off VLAN in VFTA
+ * @vind: VMDq output index that maps queue to VLAN id in VLVFB
+ * @vlan_on: boolean flag to turn on/off VLAN
+ * @vlvf_bypass: boolean flag indicating updating the default pool is okay
*
* Turn on/off specified VLAN in the VLAN filter table.
**/
-s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on)
+s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on,
+ bool vlvf_bypass)
{
return ixgbe_call_func(hw, hw->mac.ops.set_vfta, (hw, vlan, vind,
- vlan_on), IXGBE_NOT_IMPLEMENTED);
+ vlan_on, vlvf_bypass), IXGBE_NOT_IMPLEMENTED);
}
/**
* ixgbe_set_vlvf - Set VLAN Pool Filter
* @hw: pointer to hardware structure
* @vlan: VLAN id to write to VLAN filter
- * @vind: VMDq output index that maps queue to VLAN id in VFVFB
- * @vlan_on: boolean flag to turn on/off VLAN in VFVF
- * @vfta_changed: pointer to boolean flag which indicates whether VFTA
- * should be changed
+ * @vind: VMDq output index that maps queue to VLAN id in VLVFB
+ * @vlan_on: boolean flag to turn on/off VLAN in VLVF
+ * @vfta_delta: pointer to the difference between the current value of VFTA
+ * and the desired value
+ * @vfta: the desired value of the VFTA
+ * @vlvf_bypass: boolean flag indicating updating the default pool is okay
*
* Turn on/off specified bit in VLVF table.
**/
s32 ixgbe_set_vlvf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on,
- bool *vfta_changed)
+ u32 *vfta_delta, u32 vfta, bool vlvf_bypass)
{
return ixgbe_call_func(hw, hw->mac.ops.set_vlvf, (hw, vlan, vind,
- vlan_on, vfta_changed), IXGBE_NOT_IMPLEMENTED);
+ vlan_on, vfta_delta, vfta, vlvf_bypass),
+ IXGBE_NOT_IMPLEMENTED);
}
/**
diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h
index 24d507039d..3bee89e45e 100644
--- a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h
+++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h
@@ -125,9 +125,10 @@ s32 ixgbe_enable_mc(struct ixgbe_hw *hw);
s32 ixgbe_disable_mc(struct ixgbe_hw *hw);
s32 ixgbe_clear_vfta(struct ixgbe_hw *hw);
s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan,
- u32 vind, bool vlan_on);
+ u32 vind, bool vlan_on, bool vlvf_bypass);
s32 ixgbe_set_vlvf(struct ixgbe_hw *hw, u32 vlan, u32 vind,
- bool vlan_on, bool *vfta_changed);
+ bool vlan_on, u32 *vfta_delta, u32 vfta,
+ bool vlvf_bypass);
s32 ixgbe_fc_enable(struct ixgbe_hw *hw);
s32 ixgbe_setup_fc(struct ixgbe_hw *hw);
s32 ixgbe_set_fw_drv_ver(struct ixgbe_hw *hw, u8 maj, u8 min, u8 build,
diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c
index f342eee637..656534862c 100644
--- a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c
+++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c
@@ -3810,68 +3810,65 @@ s32 ixgbe_init_uta_tables_generic(struct ixgbe_hw *hw)
* return the VLVF index where this VLAN id should be placed
*
**/
-s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan)
+s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan, bool vlvf_bypass)
{
- u32 bits = 0;
- u32 first_empty_slot = 0;
- s32 regindex;
+ s32 regindex, first_empty_slot;
+ u32 bits;
/* short cut the special case */
if (vlan == 0)
return 0;
- /*
- * Search for the vlan id in the VLVF entries. Save off the first empty
- * slot found along the way
- */
- for (regindex = 1; regindex < IXGBE_VLVF_ENTRIES; regindex++) {
+ /* if vlvf_bypass is set we don't want to use an empty slot, we
+ * will simply bypass the VLVF if there are no entries present in the
+ * VLVF that contain our VLAN
+ */
+ first_empty_slot = vlvf_bypass ? IXGBE_ERR_NO_SPACE : 0;
+
+ /* add VLAN enable bit for comparison */
+ vlan |= IXGBE_VLVF_VIEN;
+
+ /* Search for the vlan id in the VLVF entries. Save off the first empty
+ * slot found along the way.
+ *
+ * pre-decrement loop covering (IXGBE_VLVF_ENTRIES - 1) .. 1
+ */
+ for (regindex = IXGBE_VLVF_ENTRIES; --regindex;) {
bits = IXGBE_READ_REG(hw, IXGBE_VLVF(regindex));
- if (!bits && !(first_empty_slot))
+ if (bits == vlan)
+ return regindex;
+ if (!first_empty_slot && !bits)
first_empty_slot = regindex;
- else if ((bits & 0x0FFF) == vlan)
- break;
}
- /*
- * If regindex is less than IXGBE_VLVF_ENTRIES, then we found the vlan
- * in the VLVF. Else use the first empty VLVF register for this
- * vlan id.
- */
- if (regindex >= IXGBE_VLVF_ENTRIES) {
- if (first_empty_slot)
- regindex = first_empty_slot;
- else {
- ERROR_REPORT1(IXGBE_ERROR_SOFTWARE,
- "No space in VLVF.\n");
- regindex = IXGBE_ERR_NO_SPACE;
- }
- }
+ /* If we are here then we didn't find the VLAN. Return first empty
+ * slot we found during our search, else error.
+ */
+ if (!first_empty_slot)
+ ERROR_REPORT1(IXGBE_ERROR_SOFTWARE, "No space in VLVF.\n");
- return regindex;
+ return first_empty_slot ? first_empty_slot : IXGBE_ERR_NO_SPACE;
}
/**
* ixgbe_set_vfta_generic - Set VLAN filter table
* @hw: pointer to hardware structure
* @vlan: VLAN id to write to VLAN filter
- * @vind: VMDq output index that maps queue to VLAN id in VFVFB
- * @vlan_on: boolean flag to turn on/off VLAN in VFVF
+ * @vind: VMDq output index that maps queue to VLAN id in VLVFB
+ * @vlan_on: boolean flag to turn on/off VLAN
+ * @vlvf_bypass: boolean flag indicating updating default pool is okay
*
* Turn on/off specified VLAN in the VLAN filter table.
**/
s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind,
- bool vlan_on)
+ bool vlan_on, bool vlvf_bypass)
{
- s32 regindex;
- u32 bitindex;
- u32 vfta;
- u32 targetbit;
- s32 ret_val = IXGBE_SUCCESS;
- bool vfta_changed = FALSE;
+ u32 regidx, vfta_delta, vfta;
+ s32 ret_val;
DEBUGFUNC("ixgbe_set_vfta_generic");
- if (vlan > 4095)
+ if (vlan > 4095 || vind > 63)
return IXGBE_ERR_PARAM;
/*
@@ -3886,33 +3883,33 @@ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind,
* bits[11-5]: which register
* bits[4-0]: which bit in the register
*/
- regindex = (vlan >> 5) & 0x7F;
- bitindex = vlan & 0x1F;
- targetbit = (1 << bitindex);
- vfta = IXGBE_READ_REG(hw, IXGBE_VFTA(regindex));
-
- if (vlan_on) {
- if (!(vfta & targetbit)) {
- vfta |= targetbit;
- vfta_changed = TRUE;
- }
- } else {
- if ((vfta & targetbit)) {
- vfta &= ~targetbit;
- vfta_changed = TRUE;
- }
- }
+ regidx = vlan / 32;
+ vfta_delta = 1 << (vlan % 32);
+ vfta = IXGBE_READ_REG(hw, IXGBE_VFTA(regidx));
+
+ /*
+ * vfta_delta represents the difference between the current value
+ * of vfta and the value we want in the register. Since the diff
+ * is an XOR mask we can just update the vfta using an XOR
+ */
+ vfta_delta &= vlan_on ? ~vfta : vfta;
+ vfta ^= vfta_delta;
/* Part 2
* Call ixgbe_set_vlvf_generic to set VLVFB and VLVF
*/
- ret_val = ixgbe_set_vlvf_generic(hw, vlan, vind, vlan_on,
- &vfta_changed);
- if (ret_val != IXGBE_SUCCESS)
+ ret_val = ixgbe_set_vlvf_generic(hw, vlan, vind, vlan_on, &vfta_delta,
+ vfta, vlvf_bypass);
+ if (ret_val != IXGBE_SUCCESS) {
+ if (vlvf_bypass)
+ goto vfta_update;
return ret_val;
+ }
- if (vfta_changed)
- IXGBE_WRITE_REG(hw, IXGBE_VFTA(regindex), vfta);
+vfta_update:
+ /* Update VFTA now that we are ready for traffic */
+ if (vfta_delta)
+ IXGBE_WRITE_REG(hw, IXGBE_VFTA(regidx), vfta);
return IXGBE_SUCCESS;
}
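/*
 * The vfta_delta computation above is worth unpacking: the delta starts
 * as the single bit for this VLAN, is masked so it survives only when
 * the register must actually change (set requested while clear, or
 * clear requested while set), and the XOR then flips the bit in the
 * right direction. A standalone sketch of the same logic:
 */
#include <stdint.h>

static uint32_t
vfta_apply(uint32_t vfta, uint32_t vlan, int vlan_on, uint32_t *deltap)
{
	uint32_t delta = 1u << (vlan % 32);

	delta &= vlan_on ? ~vfta : vfta;	/* nonzero only on a change */
	*deltap = delta;			/* zero means no write needed */

	return (vfta ^ delta);			/* flip the bit if needed */
}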
@@ -3921,21 +3918,25 @@ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind,
* ixgbe_set_vlvf_generic - Set VLAN Pool Filter
* @hw: pointer to hardware structure
* @vlan: VLAN id to write to VLAN filter
- * @vind: VMDq output index that maps queue to VLAN id in VFVFB
- * @vlan_on: boolean flag to turn on/off VLAN in VFVF
- * @vfta_changed: pointer to boolean flag which indicates whether VFTA
- * should be changed
+ * @vind: VMDq output index that maps queue to VLAN id in VLVFB
+ * @vlan_on: boolean flag to turn on/off VLAN in VLVF
+ * @vfta_delta: pointer to the difference between the current value of VFTA
+ * and the desired value
+ * @vfta: the desired value of the VFTA
+ * @vlvf_bypass: boolean flag indicating updating default pool is okay
*
* Turn on/off specified bit in VLVF table.
**/
s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind,
- bool vlan_on, bool *vfta_changed)
+ bool vlan_on, u32 *vfta_delta, u32 vfta,
+ bool vlvf_bypass)
{
- u32 vt;
+ u32 bits;
+ s32 vlvf_index;
DEBUGFUNC("ixgbe_set_vlvf_generic");
- if (vlan > 4095)
+ if (vlan > 4095 || vind > 63)
return IXGBE_ERR_PARAM;
/* If VT Mode is set
@@ -3945,83 +3946,60 @@ s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind,
* Or !vlan_on
* clear the pool bit and possibly the vind
*/
- vt = IXGBE_READ_REG(hw, IXGBE_VT_CTL);
- if (vt & IXGBE_VT_CTL_VT_ENABLE) {
- s32 vlvf_index;
- u32 bits;
-
- vlvf_index = ixgbe_find_vlvf_slot(hw, vlan);
- if (vlvf_index < 0)
- return vlvf_index;
-
- if (vlan_on) {
- /* set the pool bit */
- if (vind < 32) {
- bits = IXGBE_READ_REG(hw,
- IXGBE_VLVFB(vlvf_index * 2));
- bits |= (1 << vind);
- IXGBE_WRITE_REG(hw,
- IXGBE_VLVFB(vlvf_index * 2),
- bits);
- } else {
- bits = IXGBE_READ_REG(hw,
- IXGBE_VLVFB((vlvf_index * 2) + 1));
- bits |= (1 << (vind - 32));
- IXGBE_WRITE_REG(hw,
- IXGBE_VLVFB((vlvf_index * 2) + 1),
- bits);
- }
- } else {
- /* clear the pool bit */
- if (vind < 32) {
- bits = IXGBE_READ_REG(hw,
- IXGBE_VLVFB(vlvf_index * 2));
- bits &= ~(1 << vind);
- IXGBE_WRITE_REG(hw,
- IXGBE_VLVFB(vlvf_index * 2),
- bits);
- bits |= IXGBE_READ_REG(hw,
- IXGBE_VLVFB((vlvf_index * 2) + 1));
- } else {
- bits = IXGBE_READ_REG(hw,
- IXGBE_VLVFB((vlvf_index * 2) + 1));
- bits &= ~(1 << (vind - 32));
- IXGBE_WRITE_REG(hw,
- IXGBE_VLVFB((vlvf_index * 2) + 1),
- bits);
- bits |= IXGBE_READ_REG(hw,
- IXGBE_VLVFB(vlvf_index * 2));
- }
- }
+ if (!(IXGBE_READ_REG(hw, IXGBE_VT_CTL) & IXGBE_VT_CTL_VT_ENABLE))
+ return IXGBE_SUCCESS;
- /*
- * If there are still bits set in the VLVFB registers
- * for the VLAN ID indicated we need to see if the
- * caller is requesting that we clear the VFTA entry bit.
- * If the caller has requested that we clear the VFTA
- * entry bit but there are still pools/VFs using this VLAN
- * ID entry then ignore the request. We're not worried
- * about the case where we're turning the VFTA VLAN ID
- * entry bit on, only when requested to turn it off as
- * there may be multiple pools and/or VFs using the
- * VLAN ID entry. In that case we cannot clear the
- * VFTA bit until all pools/VFs using that VLAN ID have also
- * been cleared. This will be indicated by "bits" being
- * zero.
+ vlvf_index = ixgbe_find_vlvf_slot(hw, vlan, vlvf_bypass);
+ if (vlvf_index < 0)
+ return vlvf_index;
+
+ bits = IXGBE_READ_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32));
+
+ /* set the pool bit */
+ bits |= 1 << (vind % 32);
+ if (vlan_on)
+ goto vlvf_update;
+
+ /* clear the pool bit */
+ bits ^= 1 << (vind % 32);
+
+ if (!bits &&
+ !IXGBE_READ_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + 1 - vind / 32))) {
+ /* Clear VFTA first, then disable VLVF. Otherwise
+ * we run the risk of stray packets leaking into
+ * the PF via the default pool
*/
- if (bits) {
- IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index),
- (IXGBE_VLVF_VIEN | vlan));
- if ((!vlan_on) && (vfta_changed != NULL)) {
- /* someone wants to clear the vfta entry
- * but some pools/VFs are still using it.
- * Ignore it. */
- *vfta_changed = FALSE;
- }
- } else
- IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), 0);
+ if (*vfta_delta)
+ IXGBE_WRITE_REG(hw, IXGBE_VFTA(vlan / 32), vfta);
+
+ /* disable VLVF and clear remaining bit from pool */
+ IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), 0);
+ IXGBE_WRITE_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32), 0);
+
+ return IXGBE_SUCCESS;
}
+ /* If there are still bits set in the VLVFB registers
+ * for the VLAN ID indicated we need to see if the
+ * caller is requesting that we clear the VFTA entry bit.
+ * If the caller has requested that we clear the VFTA
+ * entry bit but there are still pools/VFs using this VLAN
+ * ID entry then ignore the request. We're not worried
+ * about the case where we're turning the VFTA VLAN ID
+ * entry bit on, only when requested to turn it off as
+ * there may be multiple pools and/or VFs using the
+ * VLAN ID entry. In that case we cannot clear the
+ * VFTA bit until all pools/VFs using that VLAN ID have also
+ * been cleared. This will be indicated by "bits" being
+ * zero.
+ */
+ *vfta_delta = 0;
+
+vlvf_update:
+ /* record pool change and enable VLAN ID if not already enabled */
+ IXGBE_WRITE_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32), bits);
+ IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), IXGBE_VLVF_VIEN | vlan);
+
return IXGBE_SUCCESS;
}
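/*
 * The VLVFB arithmetic above encodes the fact that each VLVF slot owns
 * two consecutive 32-bit VLVFB registers forming a 64-bit pool-enable
 * bitmap. A standalone sketch of the index math:
 */
#include <stdint.h>

static uint32_t
vlvfb_half(uint32_t vlvf_index, uint32_t vind)
{
	/* the half of the 64-bit bitmap that holds pool vind */
	return (vlvf_index * 2 + vind / 32);
}

static uint32_t
vlvfb_other_half(uint32_t vlvf_index, uint32_t vind)
{
	/* the opposite half, checked when deciding if the slot is empty */
	return (vlvf_index * 2 + 1 - vind / 32);
}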
diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h
index 069fc88c96..bd18e96f82 100644
--- a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h
+++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h
@@ -135,11 +135,12 @@ s32 ixgbe_clear_vmdq_generic(struct ixgbe_hw *hw, u32 rar, u32 vmdq);
s32 ixgbe_insert_mac_addr_generic(struct ixgbe_hw *hw, u8 *addr, u32 vmdq);
s32 ixgbe_init_uta_tables_generic(struct ixgbe_hw *hw);
s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan,
- u32 vind, bool vlan_on);
+ u32 vind, bool vlan_on, bool vlvf_bypass);
s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind,
- bool vlan_on, bool *vfta_changed);
+ bool vlan_on, u32 *vfta_delta, u32 vfta,
+ bool vlvf_bypass);
s32 ixgbe_clear_vfta_generic(struct ixgbe_hw *hw);
-s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan);
+s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan, bool vlvf_bypass);
s32 ixgbe_check_mac_link_generic(struct ixgbe_hw *hw,
ixgbe_link_speed *speed,
diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h
index 45e8a7d029..9231979ff7 100644
--- a/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h
+++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h
@@ -3715,8 +3715,9 @@ struct ixgbe_mac_operations {
s32 (*enable_mc)(struct ixgbe_hw *);
s32 (*disable_mc)(struct ixgbe_hw *);
s32 (*clear_vfta)(struct ixgbe_hw *);
- s32 (*set_vfta)(struct ixgbe_hw *, u32, u32, bool);
- s32 (*set_vlvf)(struct ixgbe_hw *, u32, u32, bool, bool *);
+ s32 (*set_vfta)(struct ixgbe_hw *, u32, u32, bool, bool);
+ s32 (*set_vlvf)(struct ixgbe_hw *, u32, u32, bool, u32 *, u32,
+ bool);
s32 (*init_uta_tables)(struct ixgbe_hw *);
void (*set_mac_anti_spoofing)(struct ixgbe_hw *, bool, int);
void (*set_vlan_anti_spoofing)(struct ixgbe_hw *, bool, int);
diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c
index 2ce4d32a30..66d836eb8f 100644
--- a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c
+++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c
@@ -321,15 +321,16 @@ static s32 ixgbe_mta_vector(struct ixgbe_hw *hw, u8 *mc_addr)
return vector;
}
-static void ixgbevf_write_msg_read_ack(struct ixgbe_hw *hw,
- u32 *msg, u16 size)
+static s32 ixgbevf_write_msg_read_ack(struct ixgbe_hw *hw, u32 *msg,
+ u32 *retmsg, u16 size)
{
struct ixgbe_mbx_info *mbx = &hw->mbx;
- u32 retmsg[IXGBE_VFMAILBOX_SIZE];
s32 retval = mbx->ops.write_posted(hw, msg, size, 0);
- if (!retval)
- mbx->ops.read_posted(hw, retmsg, size, 0);
+ if (retval)
+ return retval;
+
+ return mbx->ops.read_posted(hw, retmsg, size, 0);
}
/**
@@ -415,29 +416,29 @@ s32 ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, u8 *mc_addr_list,
return mbx->ops.write_posted(hw, msgbuf, IXGBE_VFMAILBOX_SIZE, 0);
}
-/**
+/*
* ixgbe_set_vfta_vf - Set/Unset vlan filter table address
* @hw: pointer to the HW structure
* @vlan: 12 bit VLAN ID
* @vind: unused by VF drivers
* @vlan_on: if TRUE then set bit, else clear bit
+ * @vlvf_bypass: boolean flag indicating updating default pool is okay
+ *
+ * Turn on/off specified VLAN in the VLAN filter table.
**/
-s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on)
+s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind,
+ bool vlan_on, bool vlvf_bypass)
{
- struct ixgbe_mbx_info *mbx = &hw->mbx;
u32 msgbuf[2];
s32 ret_val;
- UNREFERENCED_1PARAMETER(vind);
+ UNREFERENCED_2PARAMETER(vind, vlvf_bypass);
msgbuf[0] = IXGBE_VF_SET_VLAN;
msgbuf[1] = vlan;
/* Setting the 8 bit field MSG INFO to TRUE indicates "add" */
msgbuf[0] |= vlan_on << IXGBE_VT_MSGINFO_SHIFT;
- ret_val = mbx->ops.write_posted(hw, msgbuf, 2, 0);
- if (!ret_val)
- ret_val = mbx->ops.read_posted(hw, msgbuf, 1, 0);
-
+ ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2);
if (!ret_val && (msgbuf[0] & IXGBE_VT_MSGTYPE_ACK))
return IXGBE_SUCCESS;
@@ -628,7 +629,7 @@ void ixgbevf_rlpml_set_vf(struct ixgbe_hw *hw, u16 max_size)
msgbuf[0] = IXGBE_VF_SET_LPE;
msgbuf[1] = max_size;
- ixgbevf_write_msg_read_ack(hw, msgbuf, 2);
+ ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2);
}
/**
diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h
index edc801367d..e9b8dc34ae 100644
--- a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h
+++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h
@@ -132,7 +132,8 @@ s32 ixgbevf_set_uc_addr_vf(struct ixgbe_hw *hw, u32 index, u8 *addr);
s32 ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, u8 *mc_addr_list,
u32 mc_addr_count, ixgbe_mc_addr_itr,
bool clear);
-s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on);
+s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind,
+ bool vlan_on, bool vlvf_bypass);
void ixgbevf_rlpml_set_vf(struct ixgbe_hw *hw, u16 max_size);
int ixgbevf_negotiate_api_version(struct ixgbe_hw *hw, int api);
int ixgbevf_get_queues(struct ixgbe_hw *hw, unsigned int *num_tcs,
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c
index 2b8084801c..1be0c424e4 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c
@@ -25,7 +25,7 @@
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2012 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2013 OSN Online Service Nuernberg GmbH. All rights reserved.
@@ -57,8 +57,8 @@ static int ixgbe_alloc_rings(ixgbe_t *);
static void ixgbe_free_rings(ixgbe_t *);
static int ixgbe_alloc_rx_data(ixgbe_t *);
static void ixgbe_free_rx_data(ixgbe_t *);
-static void ixgbe_setup_rings(ixgbe_t *);
-static void ixgbe_setup_rx(ixgbe_t *);
+static int ixgbe_setup_rings(ixgbe_t *);
+static int ixgbe_setup_rx(ixgbe_t *);
static void ixgbe_setup_tx(ixgbe_t *);
static void ixgbe_setup_rx_ring(ixgbe_rx_ring_t *);
static void ixgbe_setup_tx_ring(ixgbe_tx_ring_t *);
@@ -67,6 +67,7 @@ static void ixgbe_setup_vmdq(ixgbe_t *);
static void ixgbe_setup_vmdq_rss(ixgbe_t *);
static void ixgbe_setup_rss_table(ixgbe_t *);
static void ixgbe_init_unicst(ixgbe_t *);
+static int ixgbe_init_vlan(ixgbe_t *);
static int ixgbe_unicst_find(ixgbe_t *, const uint8_t *);
static void ixgbe_setup_multicst(ixgbe_t *);
static void ixgbe_get_hw_state(ixgbe_t *);
@@ -113,6 +114,8 @@ static void ixgbe_intr_other_work(ixgbe_t *, uint32_t);
static void ixgbe_get_driver_control(struct ixgbe_hw *);
static int ixgbe_addmac(void *, const uint8_t *);
static int ixgbe_remmac(void *, const uint8_t *);
+static int ixgbe_addvlan(mac_group_driver_t, uint16_t);
+static int ixgbe_remvlan(mac_group_driver_t, uint16_t);
static void ixgbe_release_driver_control(struct ixgbe_hw *);
static int ixgbe_attach(dev_info_t *, ddi_attach_cmd_t);
@@ -273,7 +276,7 @@ static adapter_info_t ixgbe_82599eb_cap = {
128, /* default number of rx queues */
64, /* maximum number of rx groups */
1, /* minimum number of rx groups */
- 1, /* default number of rx groups */
+ 32, /* default number of rx groups */
128, /* maximum number of tx queues */
1, /* minimum number of tx queues */
8, /* default number of tx queues */
@@ -304,7 +307,7 @@ static adapter_info_t ixgbe_X540_cap = {
128, /* default number of rx queues */
64, /* maximum number of rx groups */
1, /* minimum number of rx groups */
- 1, /* default number of rx groups */
+ 32, /* default number of rx groups */
128, /* maximum number of tx queues */
1, /* minimum number of tx queues */
8, /* default number of tx queues */
@@ -1149,6 +1152,8 @@ ixgbe_init_driver_settings(ixgbe_t *ixgbe)
rx_group = &ixgbe->rx_groups[i];
rx_group->index = i;
rx_group->ixgbe = ixgbe;
+ list_create(&rx_group->vlans, sizeof (ixgbe_vlan_t),
+ offsetof(ixgbe_vlan_t, ixvl_link));
}
for (i = 0; i < ixgbe->num_tx_rings; i++) {
@@ -1898,7 +1903,8 @@ ixgbe_start(ixgbe_t *ixgbe, boolean_t alloc_buffer)
/*
* Setup the rx/tx rings
*/
- ixgbe_setup_rings(ixgbe);
+ if (ixgbe_setup_rings(ixgbe) != IXGBE_SUCCESS)
+ goto start_failure;
/*
* ixgbe_start() will be called when resetting, however if reset
@@ -1999,6 +2005,7 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg,
void *arg1, void *arg2)
{
ixgbe_t *ixgbe = (ixgbe_t *)arg1;
+ int prev = ixgbe->intr_cnt;
switch (cbaction) {
/* IRM callback */
@@ -2012,7 +2019,8 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg,
if (ixgbe_intr_adjust(ixgbe, cbaction, count) !=
DDI_SUCCESS) {
ixgbe_error(ixgbe,
- "IRM CB: Failed to adjust interrupts");
+ "IRM CB: Failed to adjust interrupts [%d %d %d]",
+ cbaction, count, prev);
goto cb_fail;
}
break;
@@ -2271,6 +2279,16 @@ ixgbe_free_rings(ixgbe_t *ixgbe)
ixgbe->tx_rings = NULL;
}
+ for (uint_t i = 0; i < ixgbe->num_rx_groups; i++) {
+ ixgbe_vlan_t *vlp;
+ ixgbe_rx_group_t *rx_group = &ixgbe->rx_groups[i];
+
+ while ((vlp = list_remove_head(&rx_group->vlans)) != NULL)
+ kmem_free(vlp, sizeof (ixgbe_vlan_t));
+
+ list_destroy(&rx_group->vlans);
+ }
+
if (ixgbe->rx_groups != NULL) {
kmem_free(ixgbe->rx_groups,
sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups);
@@ -2325,7 +2343,7 @@ ixgbe_free_rx_data(ixgbe_t *ixgbe)
/*
* ixgbe_setup_rings - Setup rx/tx rings.
*/
-static void
+static int
ixgbe_setup_rings(ixgbe_t *ixgbe)
{
/*
@@ -2335,9 +2353,12 @@ ixgbe_setup_rings(ixgbe_t *ixgbe)
* 2. Initialize necessary registers for receive/transmit;
* 3. Initialize software pointers/parameters for receive/transmit;
*/
- ixgbe_setup_rx(ixgbe);
+ if (ixgbe_setup_rx(ixgbe) != IXGBE_SUCCESS)
+ return (IXGBE_FAILURE);
ixgbe_setup_tx(ixgbe);
+
+ return (IXGBE_SUCCESS);
}
static void
@@ -2423,7 +2444,7 @@ ixgbe_setup_rx_ring(ixgbe_rx_ring_t *rx_ring)
IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rx_ring->hw_index), reg_val);
}
-static void
+static int
ixgbe_setup_rx(ixgbe_t *ixgbe)
{
ixgbe_rx_ring_t *rx_ring;
@@ -2517,6 +2538,15 @@ ixgbe_setup_rx(ixgbe_t *ixgbe)
}
/*
+ * Initialize VLAN SW and HW state if VLAN filtering is
+ * enabled.
+ */
+ if (ixgbe->vlft_enabled) {
+ if (ixgbe_init_vlan(ixgbe) != IXGBE_SUCCESS)
+ return (IXGBE_FAILURE);
+ }
+
+ /*
* Enable the receive unit. This must be done after filter
* control is set in FCTRL. On 82598, we disable the descriptor monitor.
* 82598 is the only adapter which defines this RXCTRL option.
@@ -2598,6 +2628,8 @@ ixgbe_setup_rx(ixgbe_t *ixgbe)
IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, reg_val);
}
+
+ return (IXGBE_SUCCESS);
}
static void
@@ -2829,7 +2861,7 @@ static void
ixgbe_setup_vmdq(ixgbe_t *ixgbe)
{
struct ixgbe_hw *hw = &ixgbe->hw;
- uint32_t vmdctl, i, vtctl;
+ uint32_t vmdctl, i, vtctl, vlnctl;
/*
* Setup the VMDq Control register, enable VMDq based on
@@ -2864,10 +2896,20 @@ ixgbe_setup_vmdq(ixgbe_t *ixgbe)
/*
* Enable Virtualization and Replication.
*/
- vtctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN;
+ vtctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL);
+ ixgbe->rx_def_group = vtctl & IXGBE_VT_CTL_POOL_MASK;
+ vtctl |= IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN;
IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vtctl);
/*
+ * Enable VLAN filtering and switching (VFTA and VLVF).
+ */
+ vlnctl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL);
+ vlnctl |= IXGBE_VLNCTRL_VFE;
+ IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctl);
+ ixgbe->vlft_enabled = B_TRUE;
+
+ /*
* Enable receiving packets to all VFs
*/
IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), IXGBE_VFRE_ENABLE_ALL);
@@ -2887,7 +2929,7 @@ ixgbe_setup_vmdq_rss(ixgbe_t *ixgbe)
{
struct ixgbe_hw *hw = &ixgbe->hw;
uint32_t i, mrqc;
- uint32_t vtctl, vmdctl;
+ uint32_t vtctl, vmdctl, vlnctl;
/*
* Initialize RETA/ERETA table
@@ -2969,10 +3011,21 @@ ixgbe_setup_vmdq_rss(ixgbe_t *ixgbe)
/*
* Enable Virtualization and Replication.
*/
+ vtctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL);
+ ixgbe->rx_def_group = vtctl & IXGBE_VT_CTL_POOL_MASK;
+ vtctl |= IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN;
- vtctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN;
IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vtctl);
/*
+ * Enable VLAN filtering and switching (VFTA and VLVF).
+ */
+ vlnctl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL);
+ vlnctl |= IXGBE_VLNCTRL_VFE;
+ IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctl);
+ ixgbe->vlft_enabled = B_TRUE;
+
+ /*
* Enable receiving packets to all VFs
*/
IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), IXGBE_VFRE_ENABLE_ALL);
@@ -3142,6 +3195,53 @@ ixgbe_unicst_find(ixgbe_t *ixgbe, const uint8_t *mac_addr)
}
/*
+ * Restore the HW state to match the SW state during restart.
+ */
+static int
+ixgbe_init_vlan(ixgbe_t *ixgbe)
+{
+ /*
+ * The device is starting for the first time; there is nothing
+ * to do.
+ */
+ if (!ixgbe->vlft_init) {
+ ixgbe->vlft_init = B_TRUE;
+ return (IXGBE_SUCCESS);
+ }
+
+ for (uint_t i = 0; i < ixgbe->num_rx_groups; i++) {
+ int ret;
+ boolean_t vlvf_bypass;
+ ixgbe_rx_group_t *rxg = &ixgbe->rx_groups[i];
+ struct ixgbe_hw *hw = &ixgbe->hw;
+
+ if (rxg->aupe) {
+ uint32_t vml2flt;
+
+ vml2flt = IXGBE_READ_REG(hw, IXGBE_VMOLR(rxg->index));
+ vml2flt |= IXGBE_VMOLR_AUPE;
+ IXGBE_WRITE_REG(hw, IXGBE_VMOLR(rxg->index), vml2flt);
+ }
+
+ vlvf_bypass = (rxg->index == ixgbe->rx_def_group);
+ for (ixgbe_vlan_t *vlp = list_head(&rxg->vlans); vlp != NULL;
+ vlp = list_next(&rxg->vlans, vlp)) {
+ ret = ixgbe_set_vfta(hw, vlp->ixvl_vid, rxg->index,
+ B_TRUE, vlvf_bypass);
+
+ if (ret != IXGBE_SUCCESS) {
+ ixgbe_error(ixgbe, "Failed to program VFTA"
+ " for group %u, VID: %u, ret: %d.",
+ rxg->index, vlp->ixvl_vid, ret);
+ return (IXGBE_FAILURE);
+ }
+ }
+ }
+
+ return (IXGBE_SUCCESS);
+}
+
+/*
* ixgbe_multicst_add - Add a multicst address.
*/
int
@@ -6151,6 +6251,7 @@ ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index,
mac_group_info_t *infop, mac_group_handle_t gh)
{
ixgbe_t *ixgbe = (ixgbe_t *)arg;
+ struct ixgbe_hw *hw = &ixgbe->hw;
switch (rtype) {
case MAC_RING_TYPE_RX: {
@@ -6164,6 +6265,20 @@ ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index,
infop->mgi_stop = NULL;
infop->mgi_addmac = ixgbe_addmac;
infop->mgi_remmac = ixgbe_remmac;
+
+ if ((ixgbe->classify_mode == IXGBE_CLASSIFY_VMDQ ||
+ ixgbe->classify_mode == IXGBE_CLASSIFY_VMDQ_RSS) &&
+ (hw->mac.type == ixgbe_mac_82599EB ||
+ hw->mac.type == ixgbe_mac_X540 ||
+ hw->mac.type == ixgbe_mac_X550 ||
+ hw->mac.type == ixgbe_mac_X550EM_x)) {
+ infop->mgi_addvlan = ixgbe_addvlan;
+ infop->mgi_remvlan = ixgbe_remvlan;
+ } else {
+ infop->mgi_addvlan = NULL;
+ infop->mgi_remvlan = NULL;
+ }
+
infop->mgi_count = (ixgbe->num_rx_rings / ixgbe->num_rx_groups);
break;
@@ -6263,6 +6378,228 @@ ixgbe_rx_ring_intr_disable(mac_intr_handle_t intrh)
return (0);
}
+static ixgbe_vlan_t *
+ixgbe_find_vlan(ixgbe_rx_group_t *rx_group, uint16_t vid)
+{
+ for (ixgbe_vlan_t *vlp = list_head(&rx_group->vlans); vlp != NULL;
+ vlp = list_next(&rx_group->vlans, vlp)) {
+ if (vlp->ixvl_vid == vid)
+ return (vlp);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Attempt to use a VLAN HW filter for this group. If the group is
+ * interested in untagged packets then set AUPE only. If the group is
+ * the default then only set the VFTA. Leave the VLVF slots open for
+ * reserved groups to guarantee their use of HW filtering.
+ */
+static int
+ixgbe_addvlan(mac_group_driver_t gdriver, uint16_t vid)
+{
+ ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)gdriver;
+ ixgbe_t *ixgbe = rx_group->ixgbe;
+ struct ixgbe_hw *hw = &ixgbe->hw;
+ ixgbe_vlan_t *vlp;
+ int ret;
+ boolean_t is_def_grp;
+
+ mutex_enter(&ixgbe->gen_lock);
+
+ if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
+ mutex_exit(&ixgbe->gen_lock);
+ return (ECANCELED);
+ }
+
+ /*
+ * Let's be sure VLAN filtering is enabled.
+ */
+ VERIFY3B(ixgbe->vlft_enabled, ==, B_TRUE);
+ is_def_grp = (rx_group->index == ixgbe->rx_def_group);
+
+ /*
+ * VLAN filtering is enabled but we want to receive untagged
+ * traffic on this group -- set the AUPE bit on the group and
+ * leave the VLAN tables alone.
+ */
+ if (vid == MAC_VLAN_UNTAGGED) {
+ /*
+ * We never enable AUPE on the default group; it is
+ * redundant. Untagged traffic which passes L2
+ * filtering is delivered to the default group if no
+ * other group is interested.
+ */
+ if (!is_def_grp) {
+ uint32_t vml2flt;
+
+ vml2flt = IXGBE_READ_REG(hw,
+ IXGBE_VMOLR(rx_group->index));
+ vml2flt |= IXGBE_VMOLR_AUPE;
+ IXGBE_WRITE_REG(hw, IXGBE_VMOLR(rx_group->index),
+ vml2flt);
+ rx_group->aupe = B_TRUE;
+ }
+
+ mutex_exit(&ixgbe->gen_lock);
+ return (0);
+ }
+
+ vlp = ixgbe_find_vlan(rx_group, vid);
+ if (vlp != NULL) {
+ /* Only the default group supports multiple clients. */
+ VERIFY3B(is_def_grp, ==, B_TRUE);
+ vlp->ixvl_refs++;
+ mutex_exit(&ixgbe->gen_lock);
+ return (0);
+ }
+
+ /*
+ * The default group doesn't require a VLVF entry, only a VFTA
+ * entry. All traffic passing L2 filtering (MPSAR + VFTA) is
+ * delivered to the default group if no other group is
+ * interested. The fourth argument, vlvf_bypass, tells the
+ * ixgbe common code to avoid using a VLVF slot if one isn't
+ * already allocated to this VLAN.
+ *
+ * This logic is meant to reserve VLVF slots for use by
+ * reserved groups: guaranteeing their use of HW filtering.
+ */
+ ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_TRUE, is_def_grp);
+
+ if (ret == IXGBE_SUCCESS) {
+ vlp = kmem_zalloc(sizeof (ixgbe_vlan_t), KM_SLEEP);
+ vlp->ixvl_vid = vid;
+ vlp->ixvl_refs = 1;
+ list_insert_tail(&rx_group->vlans, vlp);
+ mutex_exit(&ixgbe->gen_lock);
+ return (0);
+ }
+
+ /*
+ * We should actually never return ENOSPC because we've set
+ * things up so that every reserved group is guaranteed to
+ * have a VLVF slot.
+ */
+ if (ret == IXGBE_ERR_PARAM)
+ ret = EINVAL;
+ else if (ret == IXGBE_ERR_NO_SPACE)
+ ret = ENOSPC;
+ else
+ ret = EIO;
+
+ mutex_exit(&ixgbe->gen_lock);
+ return (ret);
+}
+
+/*
+ * Attempt to remove the VLAN HW filter associated with this group. If
+ * we are removing a HW filter for the default group then we know only
+ * the VFTA was set (VLVF is reserved for non-default/reserved
+ * groups). If the group wishes to stop receiving untagged traffic
+ * then clear the AUPE but leave the VLAN filters alone.
+ */
+static int
+ixgbe_remvlan(mac_group_driver_t gdriver, uint16_t vid)
+{
+ ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)gdriver;
+ ixgbe_t *ixgbe = rx_group->ixgbe;
+ struct ixgbe_hw *hw = &ixgbe->hw;
+ int ret;
+ ixgbe_vlan_t *vlp;
+ boolean_t is_def_grp;
+
+ mutex_enter(&ixgbe->gen_lock);
+
+ if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
+ mutex_exit(&ixgbe->gen_lock);
+ return (ECANCELED);
+ }
+
+ is_def_grp = (rx_group->index == ixgbe->rx_def_group);
+
+ /* See the AUPE comment in ixgbe_addvlan(). */
+ if (vid == MAC_VLAN_UNTAGGED) {
+ if (!is_def_grp) {
+ uint32_t vml2flt;
+
+ vml2flt = IXGBE_READ_REG(hw,
+ IXGBE_VMOLR(rx_group->index));
+ vml2flt &= ~IXGBE_VMOLR_AUPE;
+ IXGBE_WRITE_REG(hw,
+ IXGBE_VMOLR(rx_group->index), vml2flt);
+ rx_group->aupe = B_FALSE;
+ }
+ mutex_exit(&ixgbe->gen_lock);
+ return (0);
+ }
+
+ vlp = ixgbe_find_vlan(rx_group, vid);
+ if (vlp == NULL) {
+ mutex_exit(&ixgbe->gen_lock);
+ return (ENOENT);
+ }
+
+ /*
+ * See the comment in ixgbe_addvlan() about is_def_grp and
+ * vlvf_bypass.
+ */
+ if (vlp->ixvl_refs == 1) {
+ ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_FALSE,
+ is_def_grp);
+ } else {
+ /*
+ * Only the default group can have multiple clients.
+ * If there is more than one client, leave the
+ * VFTA[vid] bit alone.
+ */
+ VERIFY3B(is_def_grp, ==, B_TRUE);
+ VERIFY3U(vlp->ixvl_refs, >, 1);
+ vlp->ixvl_refs--;
+ mutex_exit(&ixgbe->gen_lock);
+ return (0);
+ }
+
+ if (ret != IXGBE_SUCCESS) {
+ mutex_exit(&ixgbe->gen_lock);
+ /* IXGBE_ERR_PARAM should be the only possible error here. */
+ if (ret == IXGBE_ERR_PARAM)
+ return (EINVAL);
+ else
+ return (EIO);
+ }
+
+ VERIFY3U(vlp->ixvl_refs, ==, 1);
+ vlp->ixvl_refs = 0;
+ list_remove(&rx_group->vlans, vlp);
+ kmem_free(vlp, sizeof (ixgbe_vlan_t));
+
+ /*
+ * Calling ixgbe_set_vfta() on a non-default group may have
+ * cleared the VFTA[vid] bit even though the default group
+ * still has clients using the vid. This happens because the
+ * ixgbe common code doesn't ref count the use of VLANs. Check
+ * for any use of vid on the default group and make sure the
+ * VFTA[vid] bit is set. This operation is idempotent: setting
+ * VFTA[vid] to true if already true won't hurt anything.
+ */
+ if (!is_def_grp) {
+ ixgbe_rx_group_t *defgrp;
+
+ defgrp = &ixgbe->rx_groups[ixgbe->rx_def_group];
+ vlp = ixgbe_find_vlan(defgrp, vid);
+ if (vlp != NULL) {
+ /* This shouldn't fail, but if it does return EIO. */
+ ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_TRUE,
+ B_TRUE);
+ if (ret != IXGBE_SUCCESS) {
+ mutex_exit(&ixgbe->gen_lock);
+ return (EIO);
+ }
+ }
+ }
+
+ mutex_exit(&ixgbe->gen_lock);
+ return (0);
+}
+
/*
* Add a mac address.
*/
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h
index ca52b10c89..baa4766c0e 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h
@@ -27,7 +27,7 @@
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 Saso Kiselkov. All rights reserved.
* Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _IXGBE_SW_H
@@ -91,6 +91,8 @@ extern "C" {
#define MAX_NUM_UNICAST_ADDRESSES 0x80
#define MAX_NUM_MULTICAST_ADDRESSES 0x1000
+#define MAX_NUM_VLAN_FILTERS 0x40
+
#define IXGBE_INTR_NONE 0
#define IXGBE_INTR_MSIX 1
#define IXGBE_INTR_MSI 2
@@ -387,6 +389,15 @@ typedef union ixgbe_ether_addr {
} mac;
} ixgbe_ether_addr_t;
+/*
+ * The list of VLANs an Rx group will accept.
+ */
+typedef struct ixgbe_vlan {
+ list_node_t ixvl_link;
+ uint16_t ixvl_vid; /* The VLAN ID */
+ uint_t ixvl_refs; /* Number of users of this VLAN */
+} ixgbe_vlan_t;
+
typedef enum {
USE_NONE,
USE_COPY,
@@ -589,6 +600,7 @@ typedef struct ixgbe_rx_ring {
struct ixgbe *ixgbe; /* Pointer to ixgbe struct */
} ixgbe_rx_ring_t;
+
/*
* Software Receive Ring Group
*/
@@ -596,6 +608,8 @@ typedef struct ixgbe_rx_group {
uint32_t index; /* Group index */
mac_group_handle_t group_handle; /* call back group handle */
struct ixgbe *ixgbe; /* Pointer to ixgbe struct */
+ boolean_t aupe; /* AUPE bit */
+ list_t vlans; /* list of VLANs to allow */
} ixgbe_rx_group_t;
/*
@@ -662,6 +676,7 @@ typedef struct ixgbe {
*/
ixgbe_rx_group_t *rx_groups; /* Array of rx groups */
uint32_t num_rx_groups; /* Number of rx groups in use */
+ uint32_t rx_def_group; /* Default Rx group index */
/*
* Transmit Rings
@@ -715,6 +730,9 @@ typedef struct ixgbe {
uint32_t mcast_count;
struct ether_addr mcast_table[MAX_NUM_MULTICAST_ADDRESSES];
+ boolean_t vlft_enabled; /* VLAN filtering enabled? */
+ boolean_t vlft_init; /* VLAN filtering initialized? */
+
ulong_t sys_page_size;
boolean_t link_check_complete;
diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c
index a3cd9dfbb1..0a5eec209f 100644
--- a/usr/src/uts/common/io/ksocket/ksocket.c
+++ b/usr/src/uts/common/io/ksocket/ksocket.c
@@ -22,7 +22,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/file.h>
@@ -932,3 +932,15 @@ ksocket_rele(ksocket_t ks)
cv_signal(&so->so_closing_cv);
}
}
+
+int
+ksocket_krecv_set(ksocket_t ks, ksocket_krecv_f cb, void *arg)
+{
+ return (so_krecv_set(KSTOSO(ks), (so_krecv_f)cb, arg));
+}
+
+void
+ksocket_krecv_unblock(ksocket_t ks)
+{
+ so_krecv_unblock(KSTOSO(ks));
+}
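/*
 * A sketch of a kernel consumer of the two new entry points; the
 * ksocket_krecv_f callback signature (socket, mblk chain, length,
 * flags, callback argument) is assumed from <sys/ksocket.h>, and
 * my_state_t / my_enqueue are hypothetical:
 */
typedef struct my_state my_state_t;	/* hypothetical consumer state */
extern void my_enqueue(my_state_t *, mblk_t *, size_t);

static boolean_t
my_krecv(ksocket_t ks, mblk_t *mp, size_t len, int flags, void *arg)
{
	my_state_t *msp = arg;

	/* take ownership of the chain; data arrives without a read call */
	my_enqueue(msp, mp, len);
	return (B_TRUE);
}

static int
my_attach(ksocket_t ks, my_state_t *msp)
{
	return (ksocket_krecv_set(ks, my_krecv, msp));
}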
diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h
index ac5251540f..516a68d358 100644
--- a/usr/src/uts/common/io/ksocket/ksocket_impl.h
+++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h
@@ -22,11 +22,17 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _INET_KSOCKET_KSOCKET_IMPL_H
#define _INET_KSOCKET_KSOCKET_IMPL_H
+/*
+ * Note that if this relationship ever changes, the logic in ksocket_krecv_set
+ * must be updated and we must maintain local state about this on whatever the
+ * new ksocket object is.
+ */
#define KSTOSO(ks) ((struct sonode *)(ks))
#define SOTOKS(so) ((ksocket_t)(uintptr_t)(so))
diff --git a/usr/src/uts/common/io/ksyms.c b/usr/src/uts/common/io/ksyms.c
index 74e71ed7e8..759b524186 100644
--- a/usr/src/uts/common/io/ksyms.c
+++ b/usr/src/uts/common/io/ksyms.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
@@ -219,6 +220,14 @@ ksyms_open(dev_t *devp, int flag, int otyp, struct cred *cred)
char *addr;
void *hptr = NULL;
ksyms_buflist_hdr_t hdr;
+
+ /*
+ * This device should never be visible in a zone, but if it somehow
+ * does get created we refuse to allow the zone to use it.
+ */
+ if (crgetzoneid(cred) != GLOBAL_ZONEID)
+ return (EACCES);
+
bzero(&hdr, sizeof (struct ksyms_buflist_hdr));
list_create(&hdr.blist, PAGESIZE,
offsetof(ksyms_buflist_t, buflist_node));
diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c
index bba41d7cf3..f258aad701 100644
--- a/usr/src/uts/common/io/mac/mac.c
+++ b/usr/src/uts/common/io/mac/mac.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2015 Garrett D'Amore <garrett@damore.org>
*/
@@ -158,7 +158,7 @@
* perimeter) across a call to any other layer from the mac layer. The call to
* any other layer could be via mi_* entry points, classifier entry points into
* the driver or via upcall pointers into layers above. The mac perimeter may
- * be acquired or held only in the down direction, for e.g. when calling into
+ * be acquired or held only in the down direction, e.g. when calling into
* a mi_* driver enty point to provide atomicity of the operation.
*
* R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
@@ -207,7 +207,7 @@
* number whenever the ring's stop routine is invoked.
* See comments in mac_rx_ring();
*
- * R17 Similarly mi_stop is another synchronization point and the driver must
+ * R17. Similarly mi_stop is another synchronization point and the driver must
* ensure that all upcalls are done and there won't be any future upcall
* before returning from mi_stop.
*
@@ -227,7 +227,7 @@
*
* cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
*
- * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
+ * mac perim -> i_dls_devnet_lock [dls_devnet_rename]
*
* Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
 * client to driver. In the case of clients that explicitly use the mac provided
@@ -460,7 +460,7 @@ mac_init(void)
mac_logging_interval = 20;
mac_flow_log_enable = B_FALSE;
mac_link_log_enable = B_FALSE;
- mac_logging_timer = 0;
+ mac_logging_timer = NULL;
/* Register to be notified of noteworthy pools events */
mac_pool_event_reg.pec_func = mac_pool_event_cb;
@@ -707,12 +707,45 @@ mac_callback_remove_wait(mac_cb_info_t *mcbi)
}
}
+void
+mac_callback_barrier(mac_cb_info_t *mcbi)
+{
+ ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+ ASSERT3U(mcbi->mcbi_barrier_cnt, <, UINT_MAX);
+
+ if (mcbi->mcbi_walker_cnt == 0) {
+ return;
+ }
+
+ mcbi->mcbi_barrier_cnt++;
+ do {
+ cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
+ } while (mcbi->mcbi_walker_cnt > 0);
+ mcbi->mcbi_barrier_cnt--;
+ cv_broadcast(&mcbi->mcbi_cv);
+}
+
+void
+mac_callback_walker_enter(mac_cb_info_t *mcbi)
+{
+ mutex_enter(mcbi->mcbi_lockp);
+ /*
+ * Incoming walkers should give precedence to timely clean-up of
+ * deleted callback entries and requested barriers.
+ */
+ while (mcbi->mcbi_del_cnt > 0 || mcbi->mcbi_barrier_cnt > 0) {
+ cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
+ }
+ mcbi->mcbi_walker_cnt++;
+ mutex_exit(mcbi->mcbi_lockp);
+}
+
/*
 * The last mac callback walker does the cleanup. Walk the list and unlink
* all the logically deleted entries and construct a temporary list of
* removed entries. Return the list of removed entries to the caller.
*/
-mac_cb_t *
+static mac_cb_t *
mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
{
mac_cb_t *p;
@@ -741,7 +774,90 @@ mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
return (rmlist);
}
-boolean_t
+void
+mac_callback_walker_exit(mac_cb_info_t *mcbi, mac_cb_t **headp,
+ boolean_t is_promisc)
+{
+ boolean_t do_wake = B_FALSE;
+
+ mutex_enter(mcbi->mcbi_lockp);
+
+ /* If walkers remain, nothing more can be done for now */
+ if (--mcbi->mcbi_walker_cnt != 0) {
+ mutex_exit(mcbi->mcbi_lockp);
+ return;
+ }
+
+ if (mcbi->mcbi_del_cnt != 0) {
+ mac_cb_t *rmlist;
+
+ rmlist = mac_callback_walker_cleanup(mcbi, headp);
+
+ if (!is_promisc) {
+ /* The "normal" non-promisc callback clean-up */
+ mac_callback_free(rmlist);
+ } else {
+ mac_cb_t *mcb, *mcb_next;
+
+ /*
+ * The promisc callbacks are in 2 lists, one off the
+ * 'mip' and another off the 'mcip' threaded by
+ * mpi_mi_link and mpi_mci_link respectively. There
+ * is, however, only a single shared total walker
+ * count, and an entry cannot be physically unlinked if
+ * a walker is active on either list. The last walker
+ * does this cleanup of logically deleted entries.
+ *
+ * With a list of callbacks deleted from above from
+ * mi_promisc_list (headp), remove the corresponding
+ * entry from mci_promisc_list (headp_pair) and free
+ * the structure.
+ */
+ for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
+ mac_promisc_impl_t *mpip;
+ mac_client_impl_t *mcip;
+
+ mcb_next = mcb->mcb_nextp;
+ mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
+ mcip = mpip->mpi_mcip;
+
+ ASSERT3P(&mcip->mci_mip->mi_promisc_cb_info,
+ ==, mcbi);
+ ASSERT3P(&mcip->mci_mip->mi_promisc_list,
+ ==, headp);
+
+ VERIFY(mac_callback_remove(mcbi,
+ &mcip->mci_promisc_list,
+ &mpip->mpi_mci_link));
+ mcb->mcb_flags = 0;
+ mcb->mcb_nextp = NULL;
+ kmem_cache_free(mac_promisc_impl_cache, mpip);
+ }
+ }
+
+ /*
+ * Wake any walker threads that could be waiting in
+ * mac_callback_walker_enter() until deleted items have been
+ * cleaned from the list.
+ */
+ do_wake = B_TRUE;
+ }
+
+ if (mcbi->mcbi_barrier_cnt != 0) {
+ /*
+ * One or more threads are waiting for all walkers to exit the
+ * callback list. Notify them, now that the list is clear.
+ */
+ do_wake = B_TRUE;
+ }
+
+ if (do_wake) {
+ cv_broadcast(&mcbi->mcbi_cv);
+ }
+ mutex_exit(mcbi->mcbi_lockp);
+}
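/*
 * The enter/exit pair above brackets traversal of a callback list
 * without holding mcbi_lockp across the walk; a sketch of the intended
 * pattern (illustrative, with the per-entry delete check elided):
 */
static void
my_walk(mac_cb_info_t *mcbi, mac_cb_t **headp)
{
	mac_cb_t *mcb;

	mac_callback_walker_enter(mcbi);

	/* deleted entries stay physically linked while walkers are active */
	for (mcb = *headp; mcb != NULL; mcb = mcb->mcb_nextp) {
		/* skip logically deleted entries, then invoke the callback */
	}

	/* the last walker out reclaims deleted entries and wakes barriers */
	mac_callback_walker_exit(mcbi, headp, B_FALSE);
}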
+
+static boolean_t
mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
mac_cb_t *mcb;
@@ -755,7 +871,7 @@ mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
return (B_FALSE);
}
-boolean_t
+static boolean_t
mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
boolean_t found;
@@ -780,40 +896,6 @@ mac_callback_free(mac_cb_t *rmlist)
}
}
-/*
- * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
- * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
- * is only a single shared total walker count, and an entry can't be physically
- * unlinked if a walker is active on either list. The last walker does this
- * cleanup of logically deleted entries.
- */
-void
-i_mac_promisc_walker_cleanup(mac_impl_t *mip)
-{
- mac_cb_t *rmlist;
- mac_cb_t *mcb;
- mac_cb_t *mcb_next;
- mac_promisc_impl_t *mpip;
-
- /*
- * Construct a temporary list of deleted callbacks by walking the
- * the mi_promisc_list. Then for each entry in the temporary list,
- * remove it from the mci_promisc_list and free the entry.
- */
- rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
- &mip->mi_promisc_list);
-
- for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
- mcb_next = mcb->mcb_nextp;
- mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
- VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
- &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
- mcb->mcb_flags = 0;
- mcb->mcb_nextp = NULL;
- kmem_cache_free(mac_promisc_impl_cache, mpip);
- }
-}
-
void
i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
{
@@ -1115,9 +1197,10 @@ mac_start(mac_handle_t mh)
if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
/*
- * Start the default ring, since it will be needed
- * to receive broadcast and multicast traffic for
- * both primary and non-primary MAC clients.
+ * Start the default group which is responsible
+ * for receiving broadcast and multicast
+ * traffic for both primary and non-primary
+ * MAC clients.
*/
ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED);
err = mac_start_group_and_rings(defgrp);
@@ -1456,7 +1539,7 @@ mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
* used by the aggr driver to access and control the underlying HW Rx group
* and rings. In this case, the aggr driver has exclusive control of the
* underlying HW Rx group/rings, it calls the following functions to
- * start/stop the HW Rx rings, disable/enable polling, add/remove mac'
+ * start/stop the HW Rx rings, disable/enable polling, add/remove MAC
* addresses, or set up the Rx callback.
*/
/* ARGSUSED */
@@ -1501,8 +1584,9 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
ASSERT(B_FALSE);
return (-1);
}
+
/*
- * The mac client did not reserve any RX group, return directly.
+ * The MAC client did not reserve an Rx group, return directly.
* This is probably because the underlying MAC does not support
* any groups.
*/
@@ -1511,7 +1595,7 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
if (grp == NULL)
return (0);
/*
- * This group must be reserved by this mac client.
+ * This group must be reserved by this MAC client.
*/
ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
(mcip == MAC_GROUP_ONLY_CLIENT(grp)));
@@ -1527,6 +1611,77 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
}
/*
+ * Get the HW ring handles of the given group index. If the MAC
+ * doesn't have a group at this index, or any groups at all, then 0 is
+ * returned and hwgh is set to NULL. This is a private client API. The
+ * MAC perimeter must be held when calling this function.
+ *
+ * mh: A handle to the MAC that owns the group.
+ *
+ * idx: The index of the HW group to be read.
+ *
+ * hwgh: If non-NULL, contains a handle to the HW group on return.
+ *
+ * hwrh: An array of ring handles pointing to the HW rings in the
+ * group. The array must be large enough to hold a handle to each ring
+ * in the group. To be safe, this array should be of size MAX_RINGS_PER_GROUP.
+ *
+ * rtype: Used to determine if we are fetching Rx or Tx rings.
+ *
+ * Returns the number of rings in the group.
+ */
+uint_t
+mac_hwrings_idx_get(mac_handle_t mh, uint_t idx, mac_group_handle_t *hwgh,
+ mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ mac_group_t *grp;
+ mac_ring_t *ring;
+ uint_t cnt = 0;
+
+ /*
+ * The MAC perimeter must be held when accessing the
+ * mi_{rx,tx}_groups fields.
+ */
+ ASSERT(MAC_PERIM_HELD(mh));
+ ASSERT(rtype == MAC_RING_TYPE_RX || rtype == MAC_RING_TYPE_TX);
+
+ if (rtype == MAC_RING_TYPE_RX) {
+ grp = mip->mi_rx_groups;
+ } else if (rtype == MAC_RING_TYPE_TX) {
+ grp = mip->mi_tx_groups;
+ }
+
+ while (grp != NULL && grp->mrg_index != idx)
+ grp = grp->mrg_next;
+
+ /*
+ * If the MAC doesn't have a group at this index or doesn't
+ * implement RINGS capab, then set hwgh to NULL and return 0.
+ */
+ if (hwgh != NULL)
+ *hwgh = NULL;
+
+ if (grp == NULL)
+ return (0);
+
+ ASSERT3U(idx, ==, grp->mrg_index);
+
+ for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) {
+ ASSERT3U(cnt, <, MAX_RINGS_PER_GROUP);
+ hwrh[cnt] = (mac_ring_handle_t)ring;
+ }
+
+ /* A group should always have at least one ring. */
+ ASSERT3U(cnt, >, 0);
+
+ if (hwgh != NULL)
+ *hwgh = (mac_group_handle_t)grp;
+
+ return (cnt);
+}
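/*
 * Sketch of a private-client caller (aggr-style) fetching the Rx rings
 * of HW group 0 under the MAC perimeter. The hwrh array must hold
 * MAX_RINGS_PER_GROUP handles, per the contract above; error handling
 * is elided and the function name is illustrative:
 */
static uint_t
my_get_group0_rings(mac_handle_t mh, mac_ring_handle_t *hwrh)
{
	mac_perim_handle_t mph;
	mac_group_handle_t hwgh;
	uint_t nrings;

	mac_perim_enter_by_mh(mh, &mph);
	nrings = mac_hwrings_idx_get(mh, 0, &hwgh, hwrh,
	    MAC_RING_TYPE_RX);
	mac_perim_exit(mph);

	return (nrings);	/* 0 with hwgh == NULL if no such group */
}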
+
+/*
* This function is called to get info about Tx/Rx rings.
*
* Return value: returns uint_t which will have various bits set
@@ -1542,6 +1697,69 @@ mac_hwring_getinfo(mac_ring_handle_t rh)
}
/*
+ * Set the passthru callback on the hardware ring.
+ */
+void
+mac_hwring_set_passthru(mac_ring_handle_t hwrh, mac_rx_t fn, void *arg1,
+ mac_resource_handle_t arg2)
+{
+ mac_ring_t *hwring = (mac_ring_t *)hwrh;
+
+ ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX);
+
+ hwring->mr_classify_type = MAC_PASSTHRU_CLASSIFIER;
+
+ hwring->mr_pt_fn = fn;
+ hwring->mr_pt_arg1 = arg1;
+ hwring->mr_pt_arg2 = arg2;
+}
+
+/*
+ * Clear the passthru callback on the hardware ring.
+ */
+void
+mac_hwring_clear_passthru(mac_ring_handle_t hwrh)
+{
+ mac_ring_t *hwring = (mac_ring_t *)hwrh;
+
+ ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX);
+
+ hwring->mr_classify_type = MAC_NO_CLASSIFIER;
+
+ hwring->mr_pt_fn = NULL;
+ hwring->mr_pt_arg1 = NULL;
+ hwring->mr_pt_arg2 = NULL;
+}
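/*
 * The two hooks above are meant to be used as a pair: a client claiming
 * exclusive use of an Rx ring points it at its own receive routine and
 * must clear the hook before releasing the ring. A sketch, where my_rx
 * is a hypothetical routine matching the mac_rx_t signature:
 */
static void my_rx(void *, mac_resource_handle_t, mblk_t *, boolean_t);

static void
my_claim_ring(mac_ring_handle_t hwrh, void *arg1)
{
	/* all packets on this ring now bypass MAC classification */
	mac_hwring_set_passthru(hwrh, my_rx, arg1, NULL);
}

static void
my_release_ring(mac_ring_handle_t hwrh)
{
	/* restore normal delivery before giving the ring back */
	mac_hwring_clear_passthru(hwrh);
}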
+
+void
+mac_client_set_flow_cb(mac_client_handle_t mch, mac_rx_t func, void *arg1)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ flow_entry_t *flent = mcip->mci_flent;
+
+ mutex_enter(&flent->fe_lock);
+ flent->fe_cb_fn = (flow_fn_t)func;
+ flent->fe_cb_arg1 = arg1;
+ flent->fe_cb_arg2 = NULL;
+ flent->fe_flags &= ~FE_MC_NO_DATAPATH;
+ mutex_exit(&flent->fe_lock);
+}
+
+void
+mac_client_clear_flow_cb(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ flow_entry_t *flent = mcip->mci_flent;
+
+ mutex_enter(&flent->fe_lock);
+ flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
+ flent->fe_cb_arg1 = NULL;
+ flent->fe_cb_arg2 = NULL;
+ flent->fe_flags |= FE_MC_NO_DATAPATH;
+ mutex_exit(&flent->fe_lock);
+}
+
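A hedged sketch of the intended pairing, judging from the aggr port
datapath comments later in this patch; the callback and argument names
are illustrative:

	mac_client_set_flow_cb(mch, aggr_recv_cb, port);
	/* ... the port client's flow datapath is now live ... */
	mac_client_clear_flow_cb(mch);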
+/*
* Export ddi interrupt handles from the HW ring to the pseudo ring and
* setup the RX callback of the mac client which exclusively controls
* HW ring.
@@ -1613,17 +1831,56 @@ mac_hwring_enable_intr(mac_ring_handle_t rh)
return (intr->mi_enable(intr->mi_handle));
}
+/*
+ * Start the HW ring pointed to by rh.
+ *
+ * This is used by special MAC clients that are themselves MACs and
+ * need to exert control over the underlying HW rings of the NIC.
+ */
int
mac_hwring_start(mac_ring_handle_t rh)
{
mac_ring_t *rr_ring = (mac_ring_t *)rh;
+ int rv = 0;
+
+ if (rr_ring->mr_state != MR_INUSE)
+ rv = mac_start_ring(rr_ring);
+
+ return (rv);
+}
+
+/*
+ * Stop the HW ring pointed to by rh. Also see mac_hwring_start().
+ */
+void
+mac_hwring_stop(mac_ring_handle_t rh)
+{
+ mac_ring_t *rr_ring = (mac_ring_t *)rh;
+
+ if (rr_ring->mr_state != MR_FREE)
+ mac_stop_ring(rr_ring);
+}
+
+/*
+ * Remove the quiesced flag from the HW ring pointed to by rh.
+ *
+ * This is used by special MAC clients that are themselves MACs and
+ * need to exert control over the underlying HW rings of the NIC.
+ */
+int
+mac_hwring_activate(mac_ring_handle_t rh)
+{
+ mac_ring_t *rr_ring = (mac_ring_t *)rh;
MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
return (0);
}
+/*
+ * Quiesce the HW ring pointed to by rh. Also see mac_hwring_activate().
+ */
void
-mac_hwring_stop(mac_ring_handle_t rh)
+mac_hwring_quiesce(mac_ring_handle_t rh)
{
mac_ring_t *rr_ring = (mac_ring_t *)rh;
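A hedged sketch contrasting the two pairs: start/stop transition the
ring between MR_FREE and MR_INUSE through the driver, while
quiesce/activate only toggle the MR_QUIESCE mark on a started ring
(hwrh is a placeholder):

	if (mac_hwring_start(hwrh) != 0)
		return;				/* driver failed to start it */
	mac_hwring_quiesce(hwrh);		/* pause; ring stays started */
	(void) mac_hwring_activate(hwrh);	/* resume delivery */
	mac_hwring_stop(hwrh);			/* fully stop the ring */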
@@ -1730,6 +1987,68 @@ mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
}
/*
+ * Program the group's HW VLAN filter if it has such support.
+ * Otherwise, the group will implicitly accept tagged traffic and
+ * there is nothing to do.
+ */
+int
+mac_hwgroup_addvlan(mac_group_handle_t gh, uint16_t vid)
+{
+ mac_group_t *group = (mac_group_t *)gh;
+
+ if (!MAC_GROUP_HW_VLAN(group))
+ return (0);
+
+ return (mac_group_addvlan(group, vid));
+}
+
+int
+mac_hwgroup_remvlan(mac_group_handle_t gh, uint16_t vid)
+{
+ mac_group_t *group = (mac_group_t *)gh;
+
+ if (!MAC_GROUP_HW_VLAN(group))
+ return (0);
+
+ return (mac_group_remvlan(group, vid));
+}
+
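A hedged usage sketch (gh and the VID are placeholders); both calls
return 0 without touching the driver when the group lacks HW VLAN
filtering, so callers need no capability check of their own:

	int err;

	if ((err = mac_hwgroup_addvlan(gh, 100)) != 0)
		cmn_err(CE_WARN, "failed to add VLAN 100 filter: %d", err);
	/* ... */
	(void) mac_hwgroup_remvlan(gh, 100);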
+/*
+ * Determine if a MAC has HW VLAN support. This is a private API
+ * consumed by aggr. In the future it might be nice to have a bitfield
+ * in mac_capab_rings_t to track which forms of HW filtering are
+ * supported by the MAC.
+ */
+boolean_t
+mac_has_hw_vlan(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ return (MAC_GROUP_HW_VLAN(mip->mi_rx_groups));
+}
+
+/*
+ * Get the number of Rx HW groups on this MAC.
+ */
+uint_t
+mac_get_num_rx_groups(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ ASSERT(MAC_PERIM_HELD(mh));
+ return (mip->mi_rx_group_count);
+}
+
+int
+mac_set_promisc(mac_handle_t mh, boolean_t value)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+
+ ASSERT(MAC_PERIM_HELD(mh));
+ return (i_mac_promisc_set(mip, value));
+}
+
+/*
* Set the RX group to be shared/reserved. Note that the group must be
* started/stopped outside of this function.
*/
@@ -2416,7 +2735,6 @@ mac_disable(mac_handle_t mh)
/*
* Called when the MAC instance has a non empty flow table, to de-multiplex
* incoming packets to the right flow.
- * The MAC's rw lock is assumed held as a READER.
*/
/* ARGSUSED */
static mblk_t *
@@ -2426,19 +2744,6 @@ mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
uint_t flags = FLOW_INBOUND;
int err;
- /*
- * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
- * to mac_flow_lookup() so that the VLAN packets can be successfully
- * passed to the non-VLAN aggregation flows.
- *
- * Note that there is possibly a race between this and
- * mac_unicast_remove/add() and VLAN packets could be incorrectly
- * classified to non-VLAN flows of non-aggregation mac clients. These
- * VLAN packets will be then filtered out by the mac module.
- */
- if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
- flags |= FLOW_IGNORE_VLAN;
-
err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
if (err != 0) {
/* no registered receive function */
@@ -3772,9 +4077,27 @@ mac_start_group_and_rings(mac_group_t *group)
for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
ASSERT(ring->mr_state == MR_FREE);
+
if ((rv = mac_start_ring(ring)) != 0)
goto error;
- ring->mr_classify_type = MAC_SW_CLASSIFIER;
+
+ /*
+ * When aggr_set_port_sdu() is called, it will remove
+ * the port client's unicast address. This will cause
+ * MAC to stop the default group's rings on the port
+		 * MAC. After it modifies the SDU, it will then re-add
+		 * the unicast address, at which time this function is
+		 * called to start the default group's rings. Normally
+		 * this function would set the classify type to
+		 * MAC_SW_CLASSIFIER, but that would break aggr, which
+		 * relies on the passthru classify mode being set for
+		 * correct delivery (see mac_rx_common()). To avoid
+		 * that, we check for a passthru callback and set the
+		 * classify type to MAC_PASSTHRU_CLASSIFIER, as it was
+		 * before the rings were stopped.
+ */
+ ring->mr_classify_type = (ring->mr_pt_fn != NULL) ?
+ MAC_PASSTHRU_CLASSIFIER : MAC_SW_CLASSIFIER;
}
return (0);
@@ -4077,12 +4400,15 @@ mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
/*
- * Driver must register group->mgi_addmac/remmac() for rx groups
- * to support multiple MAC addresses.
+ * The driver must register some form of hardware MAC
+ * filter in order for Rx groups to support multiple
+ * MAC addresses.
*/
if (rtype == MAC_RING_TYPE_RX &&
- ((group_info.mgi_addmac == NULL) ||
- (group_info.mgi_remmac == NULL))) {
+ (group_info.mgi_addmac == NULL ||
+ group_info.mgi_remmac == NULL)) {
+ DTRACE_PROBE1(mac__init__rings__no__mac__filter,
+ char *, mip->mi_name);
err = EINVAL;
goto bail;
}
@@ -4129,8 +4455,9 @@ mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
/* Update this group's status */
mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
- } else
+ } else {
group->mrg_rings = NULL;
+ }
ASSERT(ring_left == 0);
@@ -4320,6 +4647,38 @@ mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
}
/*
+ * Associate the VLAN filter to the receive group.
+ */
+int
+mac_group_addvlan(mac_group_t *group, uint16_t vlan)
+{
+ VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX);
+ VERIFY3P(group->mrg_info.mgi_addvlan, !=, NULL);
+
+ if (vlan > VLAN_ID_MAX)
+ return (EINVAL);
+
+ vlan = MAC_VLAN_UNTAGGED_VID(vlan);
+ return (group->mrg_info.mgi_addvlan(group->mrg_info.mgi_driver, vlan));
+}
+
+/*
+ * Dissociate the VLAN from the receive group.
+ */
+int
+mac_group_remvlan(mac_group_t *group, uint16_t vlan)
+{
+ VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX);
+ VERIFY3P(group->mrg_info.mgi_remvlan, !=, NULL);
+
+ if (vlan > VLAN_ID_MAX)
+ return (EINVAL);
+
+ vlan = MAC_VLAN_UNTAGGED_VID(vlan);
+ return (group->mrg_info.mgi_remvlan(group->mrg_info.mgi_driver, vlan));
+}
+
+/*
* Associate a MAC address with a receive group.
*
* The return value of this function should always be checked properly, because
@@ -4335,8 +4694,8 @@ mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
int
mac_group_addmac(mac_group_t *group, const uint8_t *addr)
{
- ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
- ASSERT(group->mrg_info.mgi_addmac != NULL);
+ VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX);
+ VERIFY3P(group->mrg_info.mgi_addmac, !=, NULL);
return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
}
@@ -4347,8 +4706,8 @@ mac_group_addmac(mac_group_t *group, const uint8_t *addr)
int
mac_group_remmac(mac_group_t *group, const uint8_t *addr)
{
- ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
- ASSERT(group->mrg_info.mgi_remmac != NULL);
+ VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX);
+ VERIFY3P(group->mrg_info.mgi_remmac, !=, NULL);
return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
}
@@ -4523,28 +4882,20 @@ i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
switch (ring->mr_type) {
case MAC_RING_TYPE_RX:
/*
- * Setup SRS on top of the new ring if the group is
- * reserved for someones exclusive use.
+		 * Set up an SRS on top of the new ring if the group is
+ * reserved for someone's exclusive use.
*/
if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
- mac_client_impl_t *mcip;
+ mac_client_impl_t *mcip = MAC_GROUP_ONLY_CLIENT(group);
- mcip = MAC_GROUP_ONLY_CLIENT(group);
- /*
- * Even though this group is reserved we migth still
- * have multiple clients, i.e a VLAN shares the
- * group with the primary mac client.
- */
- if (mcip != NULL) {
- flent = mcip->mci_flent;
- ASSERT(flent->fe_rx_srs_cnt > 0);
- mac_rx_srs_group_setup(mcip, flent, SRST_LINK);
- mac_fanout_setup(mcip, flent,
- MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver,
- mcip, NULL, NULL);
- } else {
- ring->mr_classify_type = MAC_SW_CLASSIFIER;
- }
+ VERIFY3P(mcip, !=, NULL);
+ flent = mcip->mci_flent;
+ VERIFY3S(flent->fe_rx_srs_cnt, >, 0);
+ mac_rx_srs_group_setup(mcip, flent, SRST_LINK);
+ mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
+ mac_rx_deliver, mcip, NULL, NULL);
+ } else {
+ ring->mr_classify_type = MAC_SW_CLASSIFIER;
}
break;
case MAC_RING_TYPE_TX:
@@ -4570,7 +4921,7 @@ i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
mcip = mgcp->mgc_client;
flent = mcip->mci_flent;
- is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR);
+ is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT);
mac_srs = MCIP_TX_SRS(mcip);
tx = &mac_srs->srs_tx;
mac_tx_client_quiesce((mac_client_handle_t)mcip);
@@ -4714,7 +5065,7 @@ i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
mcip = MAC_GROUP_ONLY_CLIENT(group);
ASSERT(mcip != NULL);
- ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR);
+ ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT);
mac_srs = MCIP_TX_SRS(mcip);
ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
@@ -4922,12 +5273,12 @@ mac_free_macaddr(mac_address_t *map)
mac_impl_t *mip = map->ma_mip;
ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
- ASSERT(mip->mi_addresses != NULL);
-
- map = mac_find_macaddr(mip, map->ma_addr);
+ VERIFY3P(mip->mi_addresses, !=, NULL);
- ASSERT(map != NULL);
- ASSERT(map->ma_nusers == 0);
+ VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr));
+ VERIFY3P(map, !=, NULL);
+ VERIFY3S(map->ma_nusers, ==, 0);
+ VERIFY3P(map->ma_vlans, ==, NULL);
if (map == mip->mi_addresses) {
mip->mi_addresses = map->ma_next;
@@ -4943,85 +5294,201 @@ mac_free_macaddr(mac_address_t *map)
kmem_free(map, sizeof (mac_address_t));
}
+static mac_vlan_t *
+mac_find_vlan(mac_address_t *map, uint16_t vid)
+{
+ mac_vlan_t *mvp;
+
+ for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next) {
+ if (mvp->mv_vid == vid)
+ return (mvp);
+ }
+
+ return (NULL);
+}
+
+static mac_vlan_t *
+mac_add_vlan(mac_address_t *map, uint16_t vid)
+{
+ mac_vlan_t *mvp;
+
+ /*
+ * We should never add the same {addr, VID} tuple more
+ * than once, but let's be sure.
+ */
+ for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next)
+ VERIFY3U(mvp->mv_vid, !=, vid);
+
+ /* Add the VLAN to the head of the VLAN list. */
+ mvp = kmem_zalloc(sizeof (mac_vlan_t), KM_SLEEP);
+ mvp->mv_vid = vid;
+ mvp->mv_next = map->ma_vlans;
+ map->ma_vlans = mvp;
+
+ return (mvp);
+}
+
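A hedged sketch of how these helpers compose (the EEXIST policy is
illustrative, not something this file imposes); since mac_add_vlan()
VERIFYs against duplicates, a caller would probe first:

	if (mac_find_vlan(map, vid) != NULL)
		return (EEXIST);	/* already registered */
	mvp = mac_add_vlan(map, vid);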
+static void
+mac_rem_vlan(mac_address_t *map, mac_vlan_t *mvp)
+{
+ mac_vlan_t *pre;
+
+ if (map->ma_vlans == mvp) {
+ map->ma_vlans = mvp->mv_next;
+ } else {
+ pre = map->ma_vlans;
+ while (pre->mv_next != mvp) {
+ pre = pre->mv_next;
+
+ /*
+ * We've reached the end of the list without
+ * finding mvp.
+ */
+ VERIFY3P(pre, !=, NULL);
+ }
+ pre->mv_next = mvp->mv_next;
+ }
+
+ kmem_free(mvp, sizeof (mac_vlan_t));
+}
+
/*
- * Add a MAC address reference for a client. If the desired MAC address
- * exists, add a reference to it. Otherwise, add the new address by adding
- * it to a reserved group or setting promiscuous mode. Won't try different
- * group is the group is non-NULL, so the caller must explictly share
- * default group when needed.
- *
- * Note, the primary MAC address is initialized at registration time, so
- * to add it to default group only need to activate it if its reference
- * count is still zero. Also, some drivers may not have advertised RINGS
- * capability.
+ * Create a new mac_address_t if this is the first use of the address
+ * or add a VID to an existing address. In either case, the
+ * mac_address_t acts as a list of {addr, VID} tuples where each tuple
+ * shares the same addr. If group is non-NULL then attempt to program
+ * the MAC's HW filters for this group. Otherwise, if group is NULL,
+ * then the MAC has no rings and there is nothing to program.
*/
int
-mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
- boolean_t use_hw)
+mac_add_macaddr_vlan(mac_impl_t *mip, mac_group_t *group, uint8_t *addr,
+ uint16_t vid, boolean_t use_hw)
{
- mac_address_t *map;
- int err = 0;
- boolean_t allocated_map = B_FALSE;
+ mac_address_t *map;
+ mac_vlan_t *mvp;
+ int err = 0;
+ boolean_t allocated_map = B_FALSE;
+ boolean_t hw_mac = B_FALSE;
+ boolean_t hw_vlan = B_FALSE;
ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
- map = mac_find_macaddr(mip, mac_addr);
+ map = mac_find_macaddr(mip, addr);
/*
- * If the new MAC address has not been added. Allocate a new one
- * and set it up.
+ * If this is the first use of this MAC address then allocate
+ * and initialize a new structure.
*/
if (map == NULL) {
map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
map->ma_len = mip->mi_type->mt_addr_length;
- bcopy(mac_addr, map->ma_addr, map->ma_len);
+ bcopy(addr, map->ma_addr, map->ma_len);
map->ma_nusers = 0;
map->ma_group = group;
map->ma_mip = mip;
+ map->ma_untagged = B_FALSE;
- /* add the new MAC address to the head of the address list */
+ /* Add the new MAC address to the head of the address list. */
map->ma_next = mip->mi_addresses;
mip->mi_addresses = map;
allocated_map = B_TRUE;
}
- ASSERT(map->ma_group == NULL || map->ma_group == group);
+ VERIFY(map->ma_group == NULL || map->ma_group == group);
if (map->ma_group == NULL)
map->ma_group = group;
+ if (vid == VLAN_ID_NONE) {
+ map->ma_untagged = B_TRUE;
+ mvp = NULL;
+ } else {
+ mvp = mac_add_vlan(map, vid);
+ }
+
+ /*
+ * Set the VLAN HW filter if:
+ *
+ * o the MAC's VLAN HW filtering is enabled, and
+ * o the address does not currently rely on promisc mode.
+ *
+ * This is called even when the client specifies an untagged
+ * address (VLAN_ID_NONE) because some MAC providers require
+ * setting additional bits to accept untagged traffic when
+ * VLAN HW filtering is enabled.
+ */
+ if (MAC_GROUP_HW_VLAN(group) &&
+ map->ma_type != MAC_ADDRESS_TYPE_UNICAST_PROMISC) {
+ if ((err = mac_group_addvlan(group, vid)) != 0)
+ goto bail;
+
+ hw_vlan = B_TRUE;
+ }
+
+ VERIFY3S(map->ma_nusers, >=, 0);
+ map->ma_nusers++;
+
/*
- * If the MAC address is already in use, simply account for the
- * new client.
+ * If this MAC address already has a HW filter then simply
+ * increment the counter.
*/
- if (map->ma_nusers++ > 0)
+ if (map->ma_nusers > 1)
return (0);
/*
+ * All logic from here on out is executed during initial
+ * creation only.
+ */
+ VERIFY3S(map->ma_nusers, ==, 1);
+
+ /*
* Activate this MAC address by adding it to the reserved group.
*/
if (group != NULL) {
- err = mac_group_addmac(group, (const uint8_t *)mac_addr);
- if (err == 0) {
- map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
- return (0);
+ err = mac_group_addmac(group, (const uint8_t *)addr);
+
+ /*
+ * If the driver is out of filters then we can
+ * continue and use promisc mode. For any other error,
+ * assume the driver is in a state where we can't
+ * program the filters or use promisc mode; so we must
+ * bail.
+ */
+ if (err != 0 && err != ENOSPC) {
+ map->ma_nusers--;
+ goto bail;
}
+
+ hw_mac = (err == 0);
+ }
+
+ if (hw_mac) {
+ map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+ return (0);
}
/*
* The MAC address addition failed. If the client requires a
- * hardware classified MAC address, fail the operation.
+ * hardware classified MAC address, fail the operation. This
+ * feature is only used by sun4v vsw.
*/
- if (use_hw) {
+ if (use_hw && !hw_mac) {
err = ENOSPC;
+ map->ma_nusers--;
goto bail;
}
/*
- * Try promiscuous mode.
- *
- * For drivers that don't advertise RINGS capability, do
- * nothing for the primary address.
+ * If we reach this point then either the MAC doesn't have
+	 * the RINGS capability or we are out of MAC address HW filters.
+ * In any case we must put the MAC into promiscuous mode.
+ */
+ VERIFY(group == NULL || !hw_mac);
+
+ /*
+ * The one exception is the primary address. A non-RINGS
+ * driver filters the primary address by default; promisc mode
+ * is not needed.
*/
if ((group == NULL) &&
(bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
@@ -5030,53 +5497,76 @@ mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
}
/*
- * Enable promiscuous mode in order to receive traffic
- * to the new MAC address.
+ * Enable promiscuous mode in order to receive traffic to the
+ * new MAC address. All existing HW filters still send their
+ * traffic to their respective group/SRSes. But with promisc
+ * enabled all unknown traffic is delivered to the default
+ * group where it is SW classified via mac_rx_classify().
*/
if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
return (0);
}
- /*
- * Free the MAC address that could not be added. Don't free
- * a pre-existing address, it could have been the entry
- * for the primary MAC address which was pre-allocated by
- * mac_init_macaddr(), and which must remain on the list.
- */
bail:
- map->ma_nusers--;
+ if (hw_vlan) {
+ int err2 = mac_group_remvlan(group, vid);
+
+ if (err2 != 0) {
+ cmn_err(CE_WARN, "Failed to remove VLAN %u from group"
+ " %d on MAC %s: %d.", vid, group->mrg_index,
+ mip->mi_name, err2);
+ }
+ }
+
+ if (mvp != NULL)
+ mac_rem_vlan(map, mvp);
+
if (allocated_map)
mac_free_macaddr(map);
+
return (err);
}
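A hedged sketch of the add/remove pairing defined above; mip, group,
addr, and vid are placeholders. With use_hw set to B_FALSE, an ENOSPC
from the driver's filter is absorbed by the promisc fallback, so only
a hard failure reaches the caller:

	if ((err = mac_add_macaddr_vlan(mip, group, addr, vid,
	    B_FALSE)) != 0)
		return (err);
	/* ... */
	map = mac_find_macaddr(mip, addr);
	if (map != NULL)
		(void) mac_remove_macaddr_vlan(map, vid);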
-/*
- * Remove a reference to a MAC address. This may cause to remove the MAC
- * address from an associated group or to turn off promiscuous mode.
- * The caller needs to handle the failure properly.
- */
int
-mac_remove_macaddr(mac_address_t *map)
+mac_remove_macaddr_vlan(mac_address_t *map, uint16_t vid)
{
- mac_impl_t *mip = map->ma_mip;
- int err = 0;
+ mac_vlan_t *mvp;
+ mac_impl_t *mip = map->ma_mip;
+ mac_group_t *group = map->ma_group;
+ int err = 0;
ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+ VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr));
- ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
+ if (vid == VLAN_ID_NONE) {
+ map->ma_untagged = B_FALSE;
+ mvp = NULL;
+ } else {
+ mvp = mac_find_vlan(map, vid);
+ VERIFY3P(mvp, !=, NULL);
+ }
+
+ if (MAC_GROUP_HW_VLAN(group) &&
+ map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED &&
+ ((err = mac_group_remvlan(group, vid)) != 0))
+ return (err);
+
+ if (mvp != NULL)
+ mac_rem_vlan(map, mvp);
/*
* If it's not the last client using this MAC address, only update
* the MAC clients count.
*/
- if (--map->ma_nusers > 0)
+ map->ma_nusers--;
+ if (map->ma_nusers > 0)
return (0);
/*
- * The MAC address is no longer used by any MAC client, so remove
- * it from its associated group, or turn off promiscuous mode
- * if it was enabled for the MAC address.
+ * The MAC address is no longer used by any MAC client, so
+ * remove it from its associated group. Turn off promiscuous
+ * mode if this is the last address relying on it.
*/
switch (map->ma_type) {
case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
@@ -5084,18 +5574,44 @@ mac_remove_macaddr(mac_address_t *map)
* Don't free the preset primary address for drivers that
* don't advertise RINGS capability.
*/
- if (map->ma_group == NULL)
+ if (group == NULL)
return (0);
- err = mac_group_remmac(map->ma_group, map->ma_addr);
- if (err == 0)
- map->ma_group = NULL;
+ if ((err = mac_group_remmac(group, map->ma_addr)) != 0) {
+ if (vid == VLAN_ID_NONE)
+ map->ma_untagged = B_TRUE;
+ else
+ (void) mac_add_vlan(map, vid);
+
+ /*
+ * If we fail to remove the MAC address HW
+ * filter but then also fail to re-add the
+ * VLAN HW filter then we are in a busted
+ * state and should just crash.
+ */
+ if (MAC_GROUP_HW_VLAN(group)) {
+ int err2;
+
+ err2 = mac_group_addvlan(group, vid);
+ if (err2 != 0) {
+ cmn_err(CE_WARN, "Failed to readd VLAN"
+ " %u to group %d on MAC %s: %d.",
+ vid, group->mrg_index, mip->mi_name,
+ err2);
+ }
+ }
+
+ return (err);
+ }
+
+ map->ma_group = NULL;
break;
case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
err = i_mac_promisc_set(mip, B_FALSE);
break;
default:
- ASSERT(B_FALSE);
+ panic("Unexpected ma_type 0x%x, file: %s, line %d",
+ map->ma_type, __FILE__, __LINE__);
}
if (err != 0)
@@ -5252,8 +5768,9 @@ mac_fini_macaddr(mac_impl_t *mip)
* If mi_addresses is initialized, there should be exactly one
* entry left on the list with no users.
*/
- ASSERT(map->ma_nusers == 0);
- ASSERT(map->ma_next == NULL);
+ VERIFY3S(map->ma_nusers, ==, 0);
+ VERIFY3P(map->ma_next, ==, NULL);
+ VERIFY3P(map->ma_vlans, ==, NULL);
kmem_free(map, sizeof (mac_address_t));
mip->mi_addresses = NULL;
@@ -5815,7 +6332,7 @@ mac_stop_logusage(mac_logtype_t type)
mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
(void) untimeout(mac_logging_timer);
- mac_logging_timer = 0;
+ mac_logging_timer = NULL;
/* Write log entries for each mac_impl in the list */
i_mac_log_info(&net_log_list, &lstate);
@@ -5933,7 +6450,7 @@ mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
}
/*
- * For a reserved group with multiple clients, return the primary client.
+ * For a non-default group with multiple clients, return the primary client.
*/
static mac_client_impl_t *
mac_get_grp_primary(mac_group_t *grp)
@@ -6292,13 +6809,12 @@ mac_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
break;
}
- VERIFY(mgcp == NULL);
+ ASSERT(mgcp == NULL);
mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
mgcp->mgc_client = mcip;
mgcp->mgc_next = grp->mrg_clients;
grp->mrg_clients = mgcp;
-
}
void
@@ -6319,8 +6835,27 @@ mac_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
}
/*
- * mac_reserve_rx_group()
- *
+ * Return true if any client on this group explicitly asked for HW
+ * rings (of type mask) or has a bound share.
+ */
+static boolean_t
+i_mac_clients_hw(mac_group_t *grp, uint32_t mask)
+{
+ mac_grp_client_t *mgcip;
+ mac_client_impl_t *mcip;
+ mac_resource_props_t *mrp;
+
+ for (mgcip = grp->mrg_clients; mgcip != NULL; mgcip = mgcip->mgc_next) {
+ mcip = mgcip->mgc_client;
+ mrp = MCIP_RESOURCE_PROPS(mcip);
+ if (mcip->mci_share != NULL || (mrp->mrp_mask & mask) != 0)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
* Finds an available group and exclusively reserves it for a client.
* The group is chosen to suit the flow's resource controls (bandwidth and
* fanout requirements) and the address type.
@@ -6343,7 +6878,6 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
int need_rings = 0;
mac_group_t *candidate_grp = NULL;
mac_client_impl_t *gclient;
- mac_resource_props_t *gmrp;
mac_group_t *donorgrp = NULL;
boolean_t rxhw = mrp->mrp_mask & MRP_RX_RINGS;
boolean_t unspec = mrp->mrp_mask & MRP_RXRINGS_UNSPEC;
@@ -6354,18 +6888,20 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
/*
- * Check if a group already has this mac address (case of VLANs)
+ * Check if a group already has this MAC address (case of VLANs)
* unless we are moving this MAC client from one group to another.
*/
if (!move && (map = mac_find_macaddr(mip, mac_addr)) != NULL) {
if (map->ma_group != NULL)
return (map->ma_group);
}
+
if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0)
return (NULL);
+
/*
- * If exclusive open, return NULL which will enable the
- * caller to use the default group.
+ * If this client is requesting exclusive MAC access then
+ * return NULL to ensure the client uses the default group.
*/
if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
return (NULL);
@@ -6375,6 +6911,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
mrp->mrp_nrxrings = 1;
}
+
/*
* For static grouping we allow only specifying rings=0 and
* unspecified
@@ -6383,6 +6920,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) {
return (NULL);
}
+
if (rxhw) {
/*
* We have explicitly asked for a group (with nrxrings,
@@ -6444,25 +6982,19 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
* that didn't ask for an exclusive group, but got
* one and it has enough rings (combined with what
* the donor group can donate) for the new MAC
- * client
+ * client.
*/
if (grp->mrg_state >= MAC_GROUP_STATE_RESERVED) {
/*
- * If the primary/donor group is not the default
- * group, don't bother looking for a candidate group.
- * If we don't have enough rings we will check
- * if the primary group can be vacated.
+ * If the donor group is not the default
+ * group, don't bother looking for a candidate
+ * group. If we don't have enough rings we
+ * will check if the primary group can be
+ * vacated.
*/
if (candidate_grp == NULL &&
donorgrp == MAC_DEFAULT_RX_GROUP(mip)) {
- ASSERT(!MAC_GROUP_NO_CLIENT(grp));
- gclient = MAC_GROUP_ONLY_CLIENT(grp);
- if (gclient == NULL)
- gclient = mac_get_grp_primary(grp);
- ASSERT(gclient != NULL);
- gmrp = MCIP_RESOURCE_PROPS(gclient);
- if (gclient->mci_share == 0 &&
- (gmrp->mrp_mask & MRP_RX_RINGS) == 0 &&
+ if (!i_mac_clients_hw(grp, MRP_RX_RINGS) &&
(unspec ||
(grp->mrg_cur_count + donor_grp_rcnt >=
need_rings))) {
@@ -6528,6 +7060,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
*/
mac_stop_group(grp);
}
+
/* We didn't find an exclusive group for this MAC client */
if (i >= mip->mi_rx_group_count) {
@@ -6535,12 +7068,12 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
return (NULL);
/*
- * If we found a candidate group then we switch the
- * MAC client from the candidate_group to the default
- * group and give the group to this MAC client. If
- * we didn't find a candidate_group, check if the
- * primary is in its own group and if it can make way
- * for this MAC client.
+ * If we found a candidate group then move the
+ * existing MAC client from the candidate_group to the
+ * default group and give the candidate_group to the
+ * new MAC client. If we didn't find a candidate
+ * group, then check if the primary is in its own
+ * group and if it can make way for this MAC client.
*/
if (candidate_grp == NULL &&
donorgrp != MAC_DEFAULT_RX_GROUP(mip) &&
@@ -6551,15 +7084,15 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
boolean_t prim_grp = B_FALSE;
/*
- * Switch the MAC client from the candidate group
- * to the default group.. If this group was the
- * donor group, then after the switch we need
- * to update the donor group too.
+ * Switch the existing MAC client from the
+ * candidate group to the default group. If
+ * the candidate group is the donor group,
+ * then after the switch we need to update the
+ * donor group too.
*/
grp = candidate_grp;
- gclient = MAC_GROUP_ONLY_CLIENT(grp);
- if (gclient == NULL)
- gclient = mac_get_grp_primary(grp);
+ gclient = grp->mrg_clients->mgc_client;
+ VERIFY3P(gclient, !=, NULL);
if (grp == mip->mi_rx_donor_grp)
prim_grp = B_TRUE;
if (mac_rx_switch_group(gclient, grp,
@@ -6572,7 +7105,6 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
donorgrp = MAC_DEFAULT_RX_GROUP(mip);
}
-
/*
* Now give this group with the required rings
* to this MAC client.
@@ -6620,10 +7152,10 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
/*
* mac_rx_release_group()
*
- * This is called when there are no clients left for the group.
- * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
- * and if it is a non default group, the shares are removed and
- * all rings are assigned back to default group.
+ * Release the group when it has no remaining clients. The group is
+ * stopped, its shares are removed, and all rings are assigned back
+ * to the default group. This should never be called against the
+ * default group.
*/
void
mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
@@ -6632,6 +7164,7 @@ mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
mac_ring_t *ring;
ASSERT(group != MAC_DEFAULT_RX_GROUP(mip));
+ ASSERT(MAC_GROUP_NO_CLIENT(group) == B_TRUE);
if (mip->mi_rx_donor_grp == group)
mip->mi_rx_donor_grp = MAC_DEFAULT_RX_GROUP(mip);
@@ -6683,56 +7216,7 @@ mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
}
/*
- * When we move the primary's mac address between groups, we need to also
- * take all the clients sharing the same mac address along with it (VLANs)
- * We remove the mac address for such clients from the group after quiescing
- * them. When we add the mac address we restart the client. Note that
- * the primary's mac address is removed from the group after all the
- * other clients sharing the address are removed. Similarly, the primary's
- * mac address is added before all the other client's mac address are
- * added. While grp is the group where the clients reside, tgrp is
- * the group where the addresses have to be added.
- */
-static void
-mac_rx_move_macaddr_prim(mac_client_impl_t *mcip, mac_group_t *grp,
- mac_group_t *tgrp, uint8_t *maddr, boolean_t add)
-{
- mac_impl_t *mip = mcip->mci_mip;
- mac_grp_client_t *mgcp = grp->mrg_clients;
- mac_client_impl_t *gmcip;
- boolean_t prim;
-
- prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
-
- /*
- * If the clients are in a non-default group, we just have to
- * walk the group's client list. If it is in the default group
- * (which will be shared by other clients as well, we need to
- * check if the unicast address matches mcip's unicast.
- */
- while (mgcp != NULL) {
- gmcip = mgcp->mgc_client;
- if (gmcip != mcip &&
- (grp != MAC_DEFAULT_RX_GROUP(mip) ||
- mcip->mci_unicast == gmcip->mci_unicast)) {
- if (!add) {
- mac_rx_client_quiesce(
- (mac_client_handle_t)gmcip);
- (void) mac_remove_macaddr(mcip->mci_unicast);
- } else {
- (void) mac_add_macaddr(mip, tgrp, maddr, prim);
- mac_rx_client_restart(
- (mac_client_handle_t)gmcip);
- }
- }
- mgcp = mgcp->mgc_next;
- }
-}
-
-
-/*
- * Move the MAC address from fgrp to tgrp. If this is the primary client,
- * we need to take any VLANs etc. together too.
+ * Move the MAC address from fgrp to tgrp.
*/
static int
mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp,
@@ -6741,56 +7225,86 @@ mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp,
mac_impl_t *mip = mcip->mci_mip;
uint8_t maddr[MAXMACADDRLEN];
int err = 0;
- boolean_t prim;
- boolean_t multiclnt = B_FALSE;
+ uint16_t vid;
+ mac_unicast_impl_t *muip;
+ boolean_t use_hw;
mac_rx_client_quiesce((mac_client_handle_t)mcip);
- ASSERT(mcip->mci_unicast != NULL);
+ VERIFY3P(mcip->mci_unicast, !=, NULL);
bcopy(mcip->mci_unicast->ma_addr, maddr, mcip->mci_unicast->ma_len);
- prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
- if (mcip->mci_unicast->ma_nusers > 1) {
- mac_rx_move_macaddr_prim(mcip, fgrp, NULL, maddr, B_FALSE);
- multiclnt = B_TRUE;
- }
- ASSERT(mcip->mci_unicast->ma_nusers == 1);
- err = mac_remove_macaddr(mcip->mci_unicast);
+ /*
+	 * Does the client require MAC address hardware classification?
+ */
+ use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
+ vid = i_mac_flow_vid(mcip->mci_flent);
+
+ /*
+ * You can never move an address that is shared by multiple
+ * clients. mac_datapath_setup() ensures that clients sharing
+ * an address are placed on the default group. This guarantees
+ * that a non-default group will only ever have one client and
+ * thus make full use of HW filters.
+ */
+ if (mac_check_macaddr_shared(mcip->mci_unicast))
+ return (EINVAL);
+
+ err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid);
+
if (err != 0) {
mac_rx_client_restart((mac_client_handle_t)mcip);
- if (multiclnt) {
- mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
- B_TRUE);
- }
return (err);
}
+
+ /*
+ * If this isn't the primary MAC address then the
+ * mac_address_t has been freed by the last call to
+ * mac_remove_macaddr_vlan(). In any case, NULL the reference
+ * to avoid a dangling pointer.
+ */
+ mcip->mci_unicast = NULL;
+
+ /*
+ * We also have to NULL all the mui_map references -- sun4v
+ * strikes again!
+ */
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+ for (muip = mcip->mci_unicast_list; muip != NULL; muip = muip->mui_next)
+ muip->mui_map = NULL;
+ rw_exit(&mcip->mci_rw_lock);
+
/*
- * Program the H/W Classifier first, if this fails we need
- * not proceed with the other stuff.
+ * Program the H/W Classifier first, if this fails we need not
+ * proceed with the other stuff.
*/
- if ((err = mac_add_macaddr(mip, tgrp, maddr, prim)) != 0) {
+ if ((err = mac_add_macaddr_vlan(mip, tgrp, maddr, vid, use_hw)) != 0) {
+ int err2;
+
/* Revert back the H/W Classifier */
- if ((err = mac_add_macaddr(mip, fgrp, maddr, prim)) != 0) {
- /*
- * This should not fail now since it worked earlier,
- * should we panic?
- */
- cmn_err(CE_WARN,
- "mac_rx_switch_group: switching %p back"
- " to group %p failed!!", (void *)mcip,
- (void *)fgrp);
+ err2 = mac_add_macaddr_vlan(mip, fgrp, maddr, vid, use_hw);
+
+ if (err2 != 0) {
+ cmn_err(CE_WARN, "Failed to revert HW classification"
+ " on MAC %s, for client %s: %d.", mip->mi_name,
+ mcip->mci_name, err2);
}
+
mac_rx_client_restart((mac_client_handle_t)mcip);
- if (multiclnt) {
- mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
- B_TRUE);
- }
return (err);
}
+
+ /*
+ * Get a reference to the new mac_address_t and update the
+ * client's reference. Then restart the client and add the
+	 * other clients of this MAC addr (if they exist).
+ */
mcip->mci_unicast = mac_find_macaddr(mip, maddr);
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+ for (muip = mcip->mci_unicast_list; muip != NULL; muip = muip->mui_next)
+ muip->mui_map = mcip->mci_unicast;
+ rw_exit(&mcip->mci_rw_lock);
mac_rx_client_restart((mac_client_handle_t)mcip);
- if (multiclnt)
- mac_rx_move_macaddr_prim(mcip, fgrp, tgrp, maddr, B_TRUE);
- return (err);
+ return (0);
}
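mac_check_macaddr_shared() itself is not part of this hunk; judging
from its uses here, a minimal sketch of what it presumably tests would
be:

	boolean_t
	mac_check_macaddr_shared(mac_address_t *map)
	{
		return (map->ma_nusers > 1);
	}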
/*
@@ -6811,19 +7325,34 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
mac_impl_t *mip = mcip->mci_mip;
mac_grp_client_t *mgcp;
- ASSERT(fgrp == mcip->mci_flent->fe_rx_ring_group);
+ VERIFY3P(fgrp, ==, mcip->mci_flent->fe_rx_ring_group);
if ((err = mac_rx_move_macaddr(mcip, fgrp, tgrp)) != 0)
return (err);
/*
- * The group might be reserved, but SRSs may not be set up, e.g.
- * primary and its vlans using a reserved group.
+ * If the group is marked as reserved and in use by a single
+	 * client, then there is an SRS to tear down.
*/
if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED &&
MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
mac_rx_srs_group_teardown(mcip->mci_flent, B_TRUE);
}
+
+ /*
+ * If we are moving the client from a non-default group, then
+ * we know that any additional clients on this group share the
+ * same MAC address. Since we moved the MAC address filter, we
+ * need to move these clients too.
+ *
+ * If we are moving the client from the default group and its
+ * MAC address has VLAN clients, then we must move those
+ * clients as well.
+ *
+ * In both cases the idea is the same: we moved the MAC
+ * address filter to the tgrp, so we must move all clients
+ * using that MAC address to tgrp as well.
+ */
if (fgrp != MAC_DEFAULT_RX_GROUP(mip)) {
mgcp = fgrp->mrg_clients;
while (mgcp != NULL) {
@@ -6834,20 +7363,21 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
gmcip->mci_flent->fe_rx_ring_group = tgrp;
}
mac_release_rx_group(mcip, fgrp);
- ASSERT(MAC_GROUP_NO_CLIENT(fgrp));
+ VERIFY3B(MAC_GROUP_NO_CLIENT(fgrp), ==, B_TRUE);
mac_set_group_state(fgrp, MAC_GROUP_STATE_REGISTERED);
} else {
mac_group_remove_client(fgrp, mcip);
mac_group_add_client(tgrp, mcip);
mcip->mci_flent->fe_rx_ring_group = tgrp;
+
/*
* If there are other clients (VLANs) sharing this address
- * we should be here only for the primary.
+ * then move them too.
*/
- if (mcip->mci_unicast->ma_nusers > 1) {
+ if (mac_check_macaddr_shared(mcip->mci_unicast)) {
/*
* We need to move all the clients that are using
- * this h/w address.
+ * this MAC address.
*/
mgcp = fgrp->mrg_clients;
while (mgcp != NULL) {
@@ -6861,20 +7391,24 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
}
}
}
+
/*
- * The default group will still take the multicast,
- * broadcast traffic etc., so it won't go to
+ * The default group still handles multicast and
+ * broadcast traffic; it won't transition to
* MAC_GROUP_STATE_REGISTERED.
*/
if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED)
mac_rx_group_unmark(fgrp, MR_CONDEMNED);
mac_set_group_state(fgrp, MAC_GROUP_STATE_SHARED);
}
+
next_state = mac_group_next_state(tgrp, &group_only_mcip,
MAC_DEFAULT_RX_GROUP(mip), B_TRUE);
mac_set_group_state(tgrp, next_state);
+
/*
- * If the destination group is reserved, setup the SRSs etc.
+	 * If the destination group is reserved, then set up the SRSes.
+ * Otherwise make sure to use SW classification.
*/
if (tgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
mac_rx_srs_group_setup(mcip, mcip->mci_flent, SRST_LINK);
@@ -6885,6 +7419,7 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
} else {
mac_rx_switch_grp_to_sw(tgrp);
}
+
return (0);
}
@@ -6915,6 +7450,7 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move)
boolean_t isprimary;
isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
+
/*
* When we come here for a VLAN on the primary (dladm create-vlan),
* we need to pair it along with the primary (to keep it consistent
@@ -6996,8 +7532,7 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move)
if (grp->mrg_state == MAC_GROUP_STATE_RESERVED &&
candidate_grp == NULL) {
gclient = MAC_GROUP_ONLY_CLIENT(grp);
- if (gclient == NULL)
- gclient = mac_get_grp_primary(grp);
+ VERIFY3P(gclient, !=, NULL);
gmrp = MCIP_RESOURCE_PROPS(gclient);
if (gclient->mci_share == 0 &&
(gmrp->mrp_mask & MRP_TX_RINGS) == 0 &&
@@ -7034,13 +7569,14 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move)
*/
if (need_exclgrp && candidate_grp != NULL) {
/*
- * Switch the MAC client from the candidate group
- * to the default group.
+ * Switch the MAC client from the candidate
+ * group to the default group. We know the
+ * candidate_grp came from a reserved group
+ * and thus only has one client.
*/
grp = candidate_grp;
gclient = MAC_GROUP_ONLY_CLIENT(grp);
- if (gclient == NULL)
- gclient = mac_get_grp_primary(grp);
+ VERIFY3P(gclient, !=, NULL);
mac_tx_client_quiesce((mac_client_handle_t)gclient);
mac_tx_switch_group(gclient, grp, defgrp);
mac_tx_client_restart((mac_client_handle_t)gclient);
@@ -7208,7 +7744,7 @@ mac_tx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
*/
mac_group_remove_client(fgrp, mcip);
mac_tx_dismantle_soft_rings(fgrp, flent);
- if (mcip->mci_unicast->ma_nusers > 1) {
+ if (mac_check_macaddr_shared(mcip->mci_unicast)) {
mgcp = fgrp->mrg_clients;
while (mgcp != NULL) {
gmcip = mgcp->mgc_client;
@@ -7454,7 +7990,7 @@ mac_no_active(mac_handle_t mh)
* changes and update the mac_resource_props_t for the VLAN's client.
* We need to do this since we don't support setting these properties
* on the primary's VLAN clients, but the VLAN clients have to
- * follow the primary w.r.t the rings property;
+ * follow the primary w.r.t the rings property.
*/
void
mac_set_prim_vlan_rings(mac_impl_t *mip, mac_resource_props_t *mrp)
@@ -7603,13 +8139,10 @@ mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group,
MAC_GROUP_STATE_RESERVED) {
continue;
}
- mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
- if (mcip == NULL)
- mcip = mac_get_grp_primary(tgrp);
- ASSERT(mcip != NULL);
- mrp = MCIP_RESOURCE_PROPS(mcip);
- if ((mrp->mrp_mask & MRP_RX_RINGS) != 0)
+ if (i_mac_clients_hw(tgrp, MRP_RX_RINGS))
continue;
+ mcip = tgrp->mrg_clients->mgc_client;
+ VERIFY3P(mcip, !=, NULL);
if ((tgrp->mrg_cur_count +
defgrp->mrg_cur_count) < (modify + 1)) {
continue;
@@ -7624,12 +8157,10 @@ mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group,
MAC_GROUP_STATE_RESERVED) {
continue;
}
- mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
- if (mcip == NULL)
- mcip = mac_get_grp_primary(tgrp);
- mrp = MCIP_RESOURCE_PROPS(mcip);
- if ((mrp->mrp_mask & MRP_TX_RINGS) != 0)
+ if (i_mac_clients_hw(tgrp, MRP_TX_RINGS))
continue;
+ mcip = tgrp->mrg_clients->mgc_client;
+ VERIFY3P(mcip, !=, NULL);
if ((tgrp->mrg_cur_count +
defgrp->mrg_cur_count) < (modify + 1)) {
continue;
@@ -7899,10 +8430,10 @@ mac_pool_event_cb(pool_event_t what, poolid_t id, void *arg)
* Set effective rings property. This could be called from datapath_setup/
* datapath_teardown or set-linkprop.
* If the group is reserved we just go ahead and set the effective rings.
- * Additionally, for TX this could mean the default group has lost/gained
+ * Additionally, for TX this could mean the default group has lost/gained
* some rings, so if the default group is reserved, we need to adjust the
* effective rings for the default group clients. For RX, if we are working
- * with the non-default group, we just need * to reset the effective props
+ * with the non-default group, we just need to reset the effective props
* for the default group clients.
*/
void
@@ -8032,6 +8563,7 @@ mac_check_primary_relocation(mac_client_impl_t *mcip, boolean_t rxhw)
* the first non-primary.
*/
ASSERT(mip->mi_nactiveclients == 2);
+
/*
* OK, now we have the primary that needs to be relocated.
*/
diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c
index 1ff33c3578..3b674be1d0 100644
--- a/usr/src/uts/common/io/mac/mac_bcast.c
+++ b/usr/src/uts/common/io/mac/mac_bcast.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -146,7 +147,7 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
uint64_t gen;
uint_t i;
mblk_t *mp_chain1;
- flow_entry_t *flent;
+ flow_entry_t *flent;
int err;
rw_enter(&mip->mi_rw_lock, RW_READER);
@@ -182,13 +183,6 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
*/
if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL)
break;
- /*
- * Fix the checksum for packets originating
- * from the local machine.
- */
- if ((src_mcip != NULL) &&
- (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL)
- break;
FLOW_TRY_REFHOLD(flent, err);
if (err != 0) {
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
index 66bba78e91..50316bb81e 100644
--- a/usr/src/uts/common/io/mac/mac_client.c
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2017 RackTop Systems.
*/
@@ -114,6 +114,7 @@
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
+#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <sys/modhash.h>
#include <sys/mac_impl.h>
@@ -865,9 +866,12 @@ mac_unicast_update_client_flow(mac_client_impl_t *mcip)
mac_protect_update_mac_token(mcip);
/*
- * A MAC client could have one MAC address but multiple
- * VLANs. In that case update the flow entries corresponding
- * to all VLANs of the MAC client.
+ * When there are multiple VLANs sharing the same MAC address,
+ * each gets its own MAC client, except when running on sun4v
+ * vsw. In that case the mci_flent_list is used to place
+ * multiple VLAN flows on one MAC client. If we ever get rid
+ * of vsw then this code can go, but until then we need to
+ * update all flow entries.
*/
for (flent = mcip->mci_flent_list; flent != NULL;
flent = flent->fe_client_next) {
@@ -1025,7 +1029,7 @@ mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr)
return (0);
}
- if (mac_find_macaddr(mip, (uint8_t *)addr) != 0) {
+ if (mac_find_macaddr(mip, (uint8_t *)addr) != NULL) {
i_mac_perim_exit(mip);
return (EBUSY);
}
@@ -1040,9 +1044,9 @@ mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr)
mac_capab_aggr_t aggr_cap;
/*
- * If the mac is an aggregation, other than the unicast
+ * If the MAC is an aggregation, other than the unicast
* addresses programming, aggr must be informed about this
- * primary unicst address change to change its mac address
+ * primary unicst address change to change its MAC address
* policy to be user-specified.
*/
ASSERT(map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED);
@@ -1353,7 +1357,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name,
mcip->mci_mip = mip;
mcip->mci_upper_mip = NULL;
- mcip->mci_rx_fn = mac_pkt_drop;
+ mcip->mci_rx_fn = mac_rx_def;
mcip->mci_rx_arg = NULL;
mcip->mci_rx_p_fn = NULL;
mcip->mci_rx_p_arg = NULL;
@@ -1374,7 +1378,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name,
mcip->mci_state_flags |= MCIS_IS_AGGR_PORT;
if (mip->mi_state_flags & MIS_IS_AGGR)
- mcip->mci_state_flags |= MCIS_IS_AGGR;
+ mcip->mci_state_flags |= MCIS_IS_AGGR_CLIENT;
if ((flags & MAC_OPEN_FLAGS_USE_DATALINK_NAME) != 0) {
datalink_id_t linkid;
@@ -1433,6 +1437,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name,
mcip->mci_flent = flent;
FLOW_MARK(flent, FE_MC_NO_DATAPATH);
flent->fe_mcip = mcip;
+
/*
* Place initial creation reference on the flow. This reference
* is released in the corresponding delete action viz.
@@ -1539,7 +1544,8 @@ mac_client_close(mac_client_handle_t mch, uint16_t flags)
}
/*
- * Set the rx bypass receive callback.
+ * Set the Rx bypass receive callback and return B_TRUE. Return
+ * B_FALSE if it's not possible to enable bypass.
*/
boolean_t
mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1)
@@ -1550,11 +1556,11 @@ mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1)
ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
/*
- * If the mac_client is a VLAN, we should not do DLS bypass and
- * instead let the packets come up via mac_rx_deliver so the vlan
- * header can be stripped.
+ * If the client has more than one VLAN then process packets
+ * through DLS. This should happen only when sun4v vsw is on
+ * the scene.
*/
- if (mcip->mci_nvids > 0)
+ if (mcip->mci_nvids > 1)
return (B_FALSE);
/*
@@ -1608,8 +1614,8 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg)
i_mac_perim_exit(mip);
/*
- * If we're changing the rx function on the primary mac of a vnic,
- * make sure any secondary macs on the vnic are updated as well.
+ * If we're changing the Rx function on the primary MAC of a VNIC,
+ * make sure any secondary addresses on the VNIC are updated as well.
*/
if (umip != NULL) {
ASSERT((umip->mi_state_flags & MIS_IS_VNIC) != 0);
@@ -1623,7 +1629,33 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg)
void
mac_rx_clear(mac_client_handle_t mch)
{
- mac_rx_set(mch, mac_pkt_drop, NULL);
+ mac_rx_set(mch, mac_rx_def, NULL);
+}
+
+void
+mac_rx_barrier(mac_client_handle_t mch)
+{
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+
+ i_mac_perim_enter(mip);
+
+	/* If an Rx callback is set, quiesce and restart that datapath */
+ if (mcip->mci_rx_fn != mac_rx_def) {
+ mac_rx_client_quiesce(mch);
+ mac_rx_client_restart(mch);
+ }
+
+ /* If any promisc callbacks are registered, perform a barrier there */
+ if (mcip->mci_promisc_list != NULL || mip->mi_promisc_list != NULL) {
+ mac_cb_info_t *mcbi = &mip->mi_promisc_cb_info;
+
+ mutex_enter(mcbi->mcbi_lockp);
+ mac_callback_barrier(mcbi);
+ mutex_exit(mcbi->mcbi_lockp);
+ }
+
+ i_mac_perim_exit(mip);
}
void
@@ -1787,6 +1819,14 @@ mac_client_set_rings_prop(mac_client_impl_t *mcip, mac_resource_props_t *mrp,
}
/* Let check if we can give this an excl group */
} else if (group == defgrp) {
+ /*
+ * If multiple clients share an
+ * address then they must stay on the
+ * default group.
+ */
+ if (mac_check_macaddr_shared(mcip->mci_unicast))
+ return (0);
+
ngrp = mac_reserve_rx_group(mcip, mac_addr,
B_TRUE);
/* Couldn't give it a group, that's fine */
@@ -1809,6 +1849,16 @@ mac_client_set_rings_prop(mac_client_impl_t *mcip, mac_resource_props_t *mrp,
}
if (group == defgrp && ((mrp->mrp_nrxrings > 0) || unspec)) {
+ /*
+ * We are requesting Rx rings. Try to reserve
+ * a non-default group.
+ *
+ * If multiple clients share an address then
+ * they must stay on the default group.
+ */
+ if (mac_check_macaddr_shared(mcip->mci_unicast))
+ return (EINVAL);
+
ngrp = mac_reserve_rx_group(mcip, mac_addr, B_TRUE);
if (ngrp == NULL)
return (ENOSPC);
@@ -2166,10 +2216,10 @@ mac_unicast_flow_create(mac_client_impl_t *mcip, uint8_t *mac_addr,
flent_flags = FLOW_VNIC_MAC;
/*
- * For the first flow we use the mac client's name - mci_name, for
- * subsequent ones we just create a name with the vid. This is
+ * For the first flow we use the MAC client's name - mci_name, for
+ * subsequent ones we just create a name with the VID. This is
* so that we can add these flows to the same flow table. This is
- * fine as the flow name (except for the one with the mac client's
+ * fine as the flow name (except for the one with the MAC client's
* name) is not visible. When the first flow is removed, we just replace
* its fdesc with another from the list, so we will still retain the
* flent with the MAC client's flow name.
@@ -2327,6 +2377,7 @@ mac_client_datapath_setup(mac_client_impl_t *mcip, uint16_t vid,
* The unicast MAC address must have been added successfully.
*/
ASSERT(mcip->mci_unicast != NULL);
+
/*
* Push down the sub-flows that were defined on this link
* hitherto. The flows are added to the active flow table
@@ -2338,15 +2389,23 @@ mac_client_datapath_setup(mac_client_impl_t *mcip, uint16_t vid,
ASSERT(!no_unicast);
/*
- * A unicast flow already exists for that MAC client,
- * this flow must be the same mac address but with
- * different VID. It has been checked by mac_addr_in_use().
+ * A unicast flow already exists for that MAC client
+ * so this flow must be the same MAC address but with
+ * a different VID. It has been checked by
+ * mac_addr_in_use().
*
- * We will use the SRS etc. from the mci_flent. Note that
- * We don't need to create kstat for this as except for
- * the fdesc, everything will be used from in the 1st flent.
+ * We will use the SRS etc. from the initial
+ * mci_flent. We don't need to create a kstat for
+ * this, as except for the fdesc, everything will be
+ * used from the first flent.
+ *
+ * The only time we should see multiple flents on the
+ * same MAC client is on the sun4v vsw. If we removed
+ * that code we should be able to remove the entire
+ * notion of multiple flents on a MAC client (this
+ * doesn't affect sub/user flows because they have
+ * their own list unrelated to mci_flent_list).
*/
-
if (bcmp(mac_addr, map->ma_addr, map->ma_len) != 0) {
err = EINVAL;
goto bail;
@@ -2406,7 +2465,17 @@ done_setup:
if (flent->fe_rx_ring_group != NULL)
mac_rx_group_unmark(flent->fe_rx_ring_group, MR_INCIPIENT);
FLOW_UNMARK(flent, FE_INCIPIENT);
- FLOW_UNMARK(flent, FE_MC_NO_DATAPATH);
+
+ /*
+ * If this is an aggr port client, don't enable the flow's
+ * datapath at this stage. Otherwise, bcast traffic could
+ * arrive while the aggr port is in the process of
+ * initializing. Instead, the flow's datapath is started later
+ * when mac_client_set_flow_cb() is called.
+ */
+ if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) == 0)
+ FLOW_UNMARK(flent, FE_MC_NO_DATAPATH);
+
mac_tx_client_unblock(mcip);
return (0);
bail:
@@ -2475,8 +2544,12 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
boolean_t is_vnic_primary =
(flags & MAC_UNICAST_VNIC_PRIMARY);
- /* when VID is non-zero, the underlying MAC can not be VNIC */
- ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != 0)));
+ /*
+ * When the VID is non-zero the underlying MAC cannot be a
+ * VNIC. I.e., dladm create-vlan cannot take a VNIC as
+ * argument, only the primary MAC client.
+ */
+ ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != VLAN_ID_NONE)));
/*
* Can't unicast add if the client asked only for minimal datapath
@@ -2489,18 +2562,19 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
* Check for an attempted use of the current Port VLAN ID, if enabled.
* No client may use it.
*/
- if (mip->mi_pvid != 0 && vid == mip->mi_pvid)
+ if (mip->mi_pvid != VLAN_ID_NONE && vid == mip->mi_pvid)
return (EBUSY);
/*
* Check whether it's the primary client and flag it.
*/
- if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && vid == 0)
+ if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary &&
+ vid == VLAN_ID_NONE)
mcip->mci_flags |= MAC_CLIENT_FLAGS_PRIMARY;
/*
* is_vnic_primary is true when we come here as a VLAN VNIC
- * which uses the primary mac client's address but with a non-zero
+ * which uses the primary MAC client's address but with a non-zero
* VID. In this case the MAC address is not specified by an upper
* MAC client.
*/
@@ -2552,7 +2626,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
/*
* Create a handle for vid 0.
*/
- ASSERT(vid == 0);
+ ASSERT(vid == VLAN_ID_NONE);
muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP);
muip->mui_vid = vid;
*mah = (mac_unicast_handle_t)muip;
@@ -2572,7 +2646,9 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
}
/*
- * If this is a VNIC/VLAN, disable softmac fast-path.
+ * If this is a VNIC/VLAN, disable softmac fast-path. This is
+ * only relevant to legacy devices which use softmac to
+ * interface with GLDv3.
*/
if (mcip->mci_state_flags & MCIS_IS_VNIC) {
err = mac_fastpath_disable((mac_handle_t)mip);
@@ -2620,9 +2696,11 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
(void) mac_client_set_resources(mch, mrp);
} else if (mcip->mci_state_flags & MCIS_IS_VNIC) {
/*
- * This is a primary VLAN client, we don't support
- * specifying rings property for this as it inherits the
- * rings property from its MAC.
+ * This is a VLAN client sharing the address of the
+ * primary MAC client; i.e., one created via dladm
+ * create-vlan. We don't support specifying ring
+ * properties for this type of client as it inherits
+ * these from the primary MAC client.
*/
if (is_vnic_primary) {
mac_resource_props_t *vmrp;
@@ -2681,7 +2759,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
/*
* Set the flags here so that if this is a passive client, we
- * can return and set it when we call mac_client_datapath_setup
+ * can return and set it when we call mac_client_datapath_setup
* when this becomes the active client. If we defer to using these
* flags to mac_client_datapath_setup, then for a passive client,
* we'd have to store the flags somewhere (probably fe_flags)
@@ -2918,7 +2996,7 @@ mac_client_datapath_teardown(mac_client_handle_t mch, mac_unicast_impl_t *muip,
mac_misc_stat_delete(flent);
/* Initialize the receiver function to a safe routine */
- flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
flent->fe_cb_arg1 = NULL;
flent->fe_cb_arg2 = NULL;
@@ -2984,14 +3062,14 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah)
i_mac_perim_enter(mip);
if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY) {
/*
- * Called made by the upper MAC client of a VNIC.
+ * Call made by the upper MAC client of a VNIC.
* There's nothing much to do, the unicast address will
* be removed by the VNIC driver when the VNIC is deleted,
* but let's ensure that all our transmit is done before
* the client does a mac_client_stop lest it trigger an
* assert in the driver.
*/
- ASSERT(muip->mui_vid == 0);
+ ASSERT(muip->mui_vid == VLAN_ID_NONE);
mac_tx_client_flush(mcip);
@@ -3055,6 +3133,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah)
i_mac_perim_exit(mip);
return (0);
}
+
/*
* Remove the VID from the list of client's VIDs.
*/
@@ -3081,7 +3160,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah)
* flows.
*/
flent = mac_client_get_flow(mcip, muip);
- ASSERT(flent != NULL);
+ VERIFY3P(flent, !=, NULL);
/*
* The first one is disappearing, need to make sure
@@ -3109,6 +3188,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah)
FLOW_FINAL_REFRELE(flent);
ASSERT(!(mcip->mci_state_flags & MCIS_EXCLUSIVE));
+
/*
* Enable fastpath if this is a VNIC or a VLAN.
*/
@@ -3122,7 +3202,8 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah)
mui_vid = muip->mui_vid;
mac_client_datapath_teardown(mch, muip, flent);
- if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && mui_vid == 0) {
+ if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) &&
+ mui_vid == VLAN_ID_NONE) {
mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY;
} else {
i_mac_perim_exit(mip);
@@ -3264,6 +3345,11 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type,
mac_cb_info_t *mcbi;
int rc;
+ if ((flags & MAC_PROMISC_FLAGS_NO_COPY) &&
+ (flags & MAC_PROMISC_FLAGS_DO_FIXUPS)) {
+ return (EINVAL);
+ }
+
i_mac_perim_enter(mip);
if ((rc = mac_start((mac_handle_t)mip)) != 0) {
@@ -3310,6 +3396,7 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type,
mpip->mpi_strip_vlan_tag =
((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0);
mpip->mpi_no_copy = ((flags & MAC_PROMISC_FLAGS_NO_COPY) != 0);
+ mpip->mpi_do_fixups = ((flags & MAC_PROMISC_FLAGS_DO_FIXUPS) != 0);
mcbi = &mip->mi_promisc_cb_info;
mutex_enter(mcbi->mcbi_lockp);
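
For context, a minimal consumer sketch of the new flag (hypothetical
code, not part of this change): MAC_PROMISC_FLAGS_DO_FIXUPS asks MAC to
emulate HW offloads on host-local packets before delivery, and the
guard added above rejects pairing it with MAC_PROMISC_FLAGS_NO_COPY
because the fixups must mutate a private copy.

static void
xx_promisc_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    boolean_t loopback)
{
	/* mp is a private copy; fixups were already applied if requested */
	freemsg(mp);
}

static int
xx_promisc_enable(mac_client_handle_t mch, mac_promisc_handle_t *mphp)
{
	return (mac_promisc_add(mch, MAC_CLIENT_PROMISC_ALL,
	    xx_promisc_rx, NULL, mphp, MAC_PROMISC_FLAGS_DO_FIXUPS));
}
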
@@ -3530,6 +3617,13 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint,
obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) :
msgdsize(mp_chain));
+ /*
+ * There's a chance this primary client might be part
+ * of a bridge and the packet forwarded to a local
+ * receiver -- mark the packet accordingly.
+ */
+ DB_CKSUMFLAGS(mp_chain) |= HW_LOCAL_MAC;
+
MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip);
if (mp_chain == NULL) {
cookie = 0;
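
A hedged receive-side counterpart (hypothetical, not in this change):
a consumer can test the flag to learn that a packet never crossed the
wire, for instance to decide whether checksums must be emulated before
they can be trusted.

static boolean_t
xx_pkt_is_host_local(mblk_t *mp)
{
	/* HW_LOCAL_MAC is set by the local transmit paths in this patch */
	return ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0);
}
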
@@ -3943,33 +4037,63 @@ mac_client_get_effective_resources(mac_client_handle_t mch,
* The unicast packets of MAC_CLIENT_PROMISC_FILTER callbacks are dispatched
* after classification by mac_rx_deliver().
*/
-
static void
mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
boolean_t loopback)
{
- mblk_t *mp_copy, *mp_next;
+ mblk_t *mp_next;
+ boolean_t local = (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0;
+
+ if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag ||
+ (mpip->mpi_do_fixups && local)) {
+ mblk_t *mp_copy;
- if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) {
mp_copy = copymsg(mp);
if (mp_copy == NULL)
return;
+ /*
+ * The consumer has requested we emulate HW offloads
+ * for host-local packets.
+ */
+ if (mpip->mpi_do_fixups && local) {
+ /*
+ * Remember that copymsg() doesn't copy
+ * b_next, so we are only passing a single
+ * packet to mac_hw_emul(). Also keep in mind
+ * that mp_copy will become an mblk chain if
+ * the argument is an LSO message.
+ */
+ mac_hw_emul(&mp_copy, NULL, NULL,
+ MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
+
+ if (mp_copy == NULL)
+ return;
+ }
+
if (mpip->mpi_strip_vlan_tag) {
mp_copy = mac_strip_vlan_tag_chain(mp_copy);
if (mp_copy == NULL)
return;
}
- mp_next = NULL;
- } else {
- mp_copy = mp;
- mp_next = mp->b_next;
+
+ /*
+ * There is code upstack that can't deal with message
+ * chains.
+ */
+ for (mblk_t *tmp = mp_copy; tmp != NULL; tmp = mp_next) {
+ mp_next = tmp->b_next;
+ tmp->b_next = NULL;
+ mpip->mpi_fn(mpip->mpi_arg, NULL, tmp, loopback);
+ }
+
+ return;
}
- mp_copy->b_next = NULL;
- mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback);
- if (mp_copy == mp)
- mp->b_next = mp_next;
+ mp_next = mp->b_next;
+ mp->b_next = NULL;
+ mpip->mpi_fn(mpip->mpi_arg, NULL, mp, loopback);
+ mp->b_next = mp_next;
}
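
To make the chain handling concrete, a sketch of the mac_hw_emul()
calling contract as used above (the helper is hypothetical; a real
consumer would deliver each segment, as the dispatch loop above does):

static uint_t
xx_emul_segments(mblk_t *mp)
{
	uint_t segs = 0;

	/*
	 * mac_hw_emul() takes an mblk_t ** because it may replace the
	 * message: an LSO message comes back as a b_next chain of
	 * MSS-sized segments, and on failure the message is consumed,
	 * leaving mp NULL.
	 */
	mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);

	for (mblk_t *seg = mp; seg != NULL; seg = seg->b_next)
		segs++;	/* each segment carries fully computed checksums */

	return (segs);
}
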
/*
@@ -4051,8 +4175,9 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
if (is_sender ||
mpip->mpi_type == MAC_CLIENT_PROMISC_ALL ||
- is_mcast)
+ is_mcast) {
mac_promisc_dispatch_one(mpip, mp, is_sender);
+ }
}
}
MAC_PROMISC_WALKER_DCR(mip);
@@ -4152,16 +4277,15 @@ mac_info_get(const char *name, mac_info_t *minfop)
/*
 * To get the capabilities that the MAC layer cares about, such as rings,
 * factory MAC address, or VNIC status, callers should invoke this function directly. If the
- * link is part of a bridge, then the only "capability" it has is the inability
- * to do zero copy.
+ * link is part of a bridge, then the link is unable to do zero copy.
*/
boolean_t
i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
{
mac_impl_t *mip = (mac_impl_t *)mh;
- if (mip->mi_bridge_link != NULL)
- return (cap == MAC_CAPAB_NO_ZCOPY);
+ if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY)
+ return (B_TRUE);
else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB)
return (mip->mi_getcapab(mip->mi_driver, cap, cap_data));
else
@@ -4180,8 +4304,9 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
mac_impl_t *mip = (mac_impl_t *)mh;
/*
- * if mi_nactiveclients > 1, only MAC_CAPAB_LEGACY, MAC_CAPAB_HCKSUM,
- * MAC_CAPAB_NO_NATIVEVLAN and MAC_CAPAB_NO_ZCOPY can be advertised.
+	 * Some capabilities are restricted when there is more than one
+	 * active client on the MAC resource. The ones noted below are safe,
+ * independent of that count.
*/
if (mip->mi_nactiveclients > 1) {
switch (cap) {
@@ -4189,6 +4314,7 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
return (B_TRUE);
case MAC_CAPAB_LEGACY:
case MAC_CAPAB_HCKSUM:
+ case MAC_CAPAB_LSO:
case MAC_CAPAB_NO_NATIVEVLAN:
break;
default:
@@ -4340,7 +4466,13 @@ mac_addr_len(mac_handle_t mh)
boolean_t
mac_is_vnic(mac_handle_t mh)
{
- return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC);
+ return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0);
+}
+
+boolean_t
+mac_is_overlay(mac_handle_t mh)
+{
+ return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0);
}
mac_handle_t
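
A generic aside on the `!= 0` form both predicates now use
(illustrative C, not from this change): returning the raw masked value
from a boolean_t function yields the flag's bit value rather than
B_TRUE, so a caller comparing the result against B_TRUE could be
silently wrong.

#define	XX_FLAG		0x0100		/* hypothetical state flag */

static boolean_t
xx_raw(uint_t flags)
{
	return (flags & XX_FLAG);		/* returns 0x0100 */
}

static boolean_t
xx_normalized(uint_t flags)
{
	return ((flags & XX_FLAG) != 0);	/* returns B_TRUE (1) */
}
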
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index 81278cfdee..3697d888e7 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017, Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -604,6 +604,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg)
*
* TODO: Cleanup and tighten some of the assumptions.
*/
+boolean_t mac_check_overlay = B_TRUE;
boolean_t mac_use_bw_heuristic = B_TRUE;
static int
mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
@@ -611,6 +612,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
uint64_t cpu_speed, bw = 0;
int srings = 0;
boolean_t bw_enabled = B_FALSE;
+ mac_client_impl_t *mcip = flent->fe_mcip;
ASSERT(!(flent->fe_type & FLOW_USER));
if (flent->fe_resource_props.mrp_mask & MRP_MAXBW &&
@@ -638,7 +640,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
*/
if (mac_soft_ring_enable)
srings = srings * 2;
+ } else if (mac_check_overlay == B_TRUE &&
+ (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) {
+ /* Is this a VNIC on an overlay? */
+ mac_handle_t mh = (mac_handle_t)mcip->mci_mip;
+ if (mac_is_overlay(mh) == B_TRUE) {
+ srings = mac_rx_soft_ring_10gig_count;
+ }
}
+
} else {
/*
* Soft ring computation using CPU speed and specified
@@ -1186,7 +1197,7 @@ mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs)
mac_srs->srs_tx_soft_rings = (mac_soft_ring_t **)
kmem_zalloc(sizeof (mac_soft_ring_t *) *
MAX_RINGS_PER_GROUP, KM_SLEEP);
- if (mcip->mci_state_flags & MCIS_IS_AGGR) {
+ if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) {
mac_srs_tx_t *tx = &mac_srs->srs_tx;
tx->st_soft_rings = (mac_soft_ring_t **)
@@ -1595,13 +1606,13 @@ mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp)
/*
* When the first sub-flow is added to a link, we disable polling on the
- * link and also modify the entry point to mac_rx_srs_subflow_process.
+ * link and also modify the entry point to mac_rx_srs_subflow_process().
* (polling is disabled because with the subflow added, accounting
* for polling needs additional logic, it is assumed that when a subflow is
* added, we can take some hit as a result of disabling polling rather than
* adding more complexity - if this becomes a perf. issue we need to
 * re-evaluate this logic). When the last subflow is removed, we turn back
- * polling and also reset the entry point to mac_rx_srs_process.
+ * polling and also reset the entry point to mac_rx_srs_process().
*
* In the future if there are multiple SRS, we can simply
* take one and give it to the flow rather than disabling polling and
@@ -1646,7 +1657,7 @@ mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable)
* Change the S/W classifier so that we can land in the
* correct processing function with correct argument.
* If all subflows have been removed we can revert to
- * mac_rx_srsprocess, else we need mac_rx_srs_subflow_process.
+ * mac_rx_srs_process(), else we need mac_rx_srs_subflow_process().
*/
mutex_enter(&flent->fe_lock);
flent->fe_cb_fn = (flow_fn_t)rx_func;
@@ -1977,8 +1988,6 @@ no_softrings:
}
/*
- * mac_fanout_setup:
- *
* Calls mac_srs_fanout_init() or modify() depending upon whether
* the SRS is getting initialized or re-initialized.
*/
@@ -1991,14 +2000,14 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
int i, rx_srs_cnt;
ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
/*
- * This is an aggregation port. Fanout will be setup
- * over the aggregation itself.
+ * Aggr ports do not have SRSes. This function should never be
+ * called on an aggr port.
*/
- if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
- return;
-
+ ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0);
mac_rx_srs = flent->fe_rx_srs[0];
+
/*
* Set up the fanout on the tx side only once, with the
* first rx SRS. The CPU binding, fanout, and bandwidth
@@ -2054,8 +2063,6 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
}
/*
- * mac_srs_create:
- *
* Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is
* SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side
* processing is created.
@@ -2187,7 +2194,7 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type,
* find nothing plus we have an existing backlog
* (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll
* the H/W for packets anymore (let the polling thread go to sleep).
- * 5) Once the backlog is relived (packets are processed) we reenable
+ * 5) Once the backlog is relieved (packets are processed) we reenable
* polling (by signalling the poll thread) only when the backlog
* dips below sr_poll_thres.
* 6) sr_hiwat is used exclusively when we are not polling capable
@@ -2210,7 +2217,14 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type,
mac_srs->srs_state |= SRS_SOFTRING_QUEUE;
}
- mac_srs->srs_worker = thread_create(NULL, 0,
+ /*
+ * Create the srs_worker with twice the stack of a normal kernel thread
+ * to reduce the likelihood of stack overflows in receive-side
+ * processing. (The larger stacks are not the only precaution taken
+ * against stack overflows; see the use of the MAC_RX_SRS_TOODEEP
+ * macro for details.)
+ */
+ mac_srs->srs_worker = thread_create(NULL, default_stksize << 1,
mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri);
if (is_tx_srs) {
@@ -2258,8 +2272,8 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type,
/*
* Some drivers require serialization and don't send
* packet chains in interrupt context. For such
- * drivers, we should always queue in soft ring
- * so that we get a chance to switch into a polling
+ * drivers, we should always queue in the soft ring
+ * so that we get a chance to switch into polling
* mode under backlog.
*/
ring_info = mac_hwring_getinfo((mac_ring_handle_t)ring);
@@ -2357,6 +2371,10 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
mac_rx_srs_group_setup(mcip, flent, link_type);
mac_tx_srs_group_setup(mcip, flent, link_type);
+ /* Aggr ports don't have SRSes; thus there is no soft ring fanout. */
+ if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0)
+ return;
+
pool_lock();
cpupart = mac_pset_find(mrp, &use_default);
mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
@@ -2366,9 +2384,11 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
}
/*
- * Set up the RX SRSs. If the S/W SRS is not set, set it up, if there
- * is a group associated with this MAC client, set up SRSs for individual
- * h/w rings.
+ * Set up the Rx SRSes. If there is no group associated with the
+ * client, then only set up SW classification. If the client has
+ * exclusive (MAC_GROUP_STATE_RESERVED) use of the group, then create
+ * an SRS for each HW ring. If the client is sharing a group, then
+ * make sure to tear down the HW SRSes.
*/
void
mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
@@ -2379,13 +2399,37 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
mac_ring_t *ring;
uint32_t fanout_type;
mac_group_t *rx_group = flent->fe_rx_ring_group;
+ boolean_t no_unicast;
+
+ /*
+	 * If this is an aggr port, then don't set up Rx SRSes and Rx
+ * soft rings as they won't be used. However, we still need to
+ * start the rings to receive data on them.
+ */
+ if (mcip->mci_state_flags & MCIS_IS_AGGR_PORT) {
+ if (rx_group == NULL)
+ return;
+
+ for (ring = rx_group->mrg_rings; ring != NULL;
+ ring = ring->mr_next) {
+ if (ring->mr_state != MR_INUSE)
+ (void) mac_start_ring(ring);
+ }
+
+ return;
+ }
+
+ /*
+ * Aggr ports should never have SRSes.
+ */
+ ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0);
fanout_type = mac_find_fanout(flent, link_type);
+ no_unicast = (mcip->mci_state_flags & MCIS_NO_UNICAST_ADDR) != 0;
- /* Create the SRS for S/W classification if none exists */
+ /* Create the SRS for SW classification if none exists */
if (flent->fe_rx_srs[0] == NULL) {
ASSERT(flent->fe_rx_srs_cnt == 0);
- /* Setup the Rx SRS */
mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type,
mac_rx_deliver, mcip, NULL, NULL);
mutex_enter(&flent->fe_lock);
@@ -2397,15 +2441,17 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
if (rx_group == NULL)
return;
+
/*
- * fanout for default SRS is done when default SRS are created
- * above. As each ring is added to the group, we setup the
- * SRS and fanout to it.
+	 * If the group is marked RESERVED then set up an SRS and
+ * fanout for each HW ring.
*/
switch (rx_group->mrg_state) {
case MAC_GROUP_STATE_RESERVED:
for (ring = rx_group->mrg_rings; ring != NULL;
ring = ring->mr_next) {
+ uint16_t vid = i_mac_flow_vid(mcip->mci_flent);
+
switch (ring->mr_state) {
case MR_INUSE:
case MR_FREE:
@@ -2415,20 +2461,23 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
(void) mac_start_ring(ring);
/*
- * Since the group is exclusively ours create
- * an SRS for this ring to allow the
- * individual SRS to dynamically poll the
- * ring. Do this only if the client is not
- * a VLAN MAC client, since for VLAN we do
- * s/w classification for the VID check, and
- * if it has a unicast address.
+ * If a client requires SW VLAN
+ * filtering or has no unicast address
+ * then we don't create any HW ring
+ * SRSes.
*/
- if ((mcip->mci_state_flags &
- MCIS_NO_UNICAST_ADDR) ||
- i_mac_flow_vid(mcip->mci_flent) !=
- VLAN_ID_NONE) {
+ if ((!MAC_GROUP_HW_VLAN(rx_group) &&
+ vid != VLAN_ID_NONE) || no_unicast)
break;
- }
+
+ /*
+ * When a client has exclusive use of
+ * a group, and that group's traffic
+ * is fully HW classified, we create
+ * an SRS for each HW ring in order to
+ * make use of dynamic polling of said
+ * HW rings.
+ */
mac_srs = mac_srs_create(mcip, flent,
fanout_type | link_type,
mac_rx_deliver, mcip, NULL, ring);
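
Condensed into a hypothetical predicate (names illustrative), the new
rule for when a per-HW-ring SRS is created on a RESERVED group reads:

static boolean_t
xx_want_hw_ring_srs(boolean_t grp_hw_vlan, uint16_t vid,
    boolean_t no_unicast)
{
	/* Clients with no unicast address always need SW classification. */
	if (no_unicast)
		return (B_FALSE);

	/* VLAN clients need SW classification unless the group filters VIDs. */
	if (vid != VLAN_ID_NONE && !grp_hw_vlan)
		return (B_FALSE);

	return (B_TRUE);
}
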
@@ -2444,14 +2493,9 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
break;
case MAC_GROUP_STATE_SHARED:
/*
- * Set all rings of this group to software classified.
- *
- * If the group is current RESERVED, the existing mac
- * client (the only client on this group) is using
- * this group exclusively. In that case we need to
- * disable polling on the rings of the group (if it
- * was enabled), and free the SRS associated with the
- * rings.
+ * When a group is shared by multiple clients, we must
+		 * use SW classification to ensure packets are
+ * delivered to the correct client.
*/
mac_rx_switch_grp_to_sw(rx_group);
break;
@@ -2468,46 +2512,49 @@ void
mac_tx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
uint32_t link_type)
{
- int cnt;
- int ringcnt;
- mac_ring_t *ring;
- mac_group_t *grp;
-
/*
- * If we are opened exclusively (like aggr does for aggr_ports),
- * don't set up Tx SRS and Tx soft rings as they won't be used.
- * The same thing has to be done for Rx side also. See bug:
- * 6880080
+	 * If this is an exclusive client (e.g. an aggr port), then
+	 * don't set up Tx SRSes and Tx soft rings as they won't be used.
+ * However, we still need to start the rings to send data
+ * across them.
*/
if (mcip->mci_state_flags & MCIS_EXCLUSIVE) {
- /*
- * If we have rings, start them here.
- */
- if (flent->fe_tx_ring_group == NULL)
- return;
+ mac_ring_t *ring;
+ mac_group_t *grp;
+
grp = (mac_group_t *)flent->fe_tx_ring_group;
- ringcnt = grp->mrg_cur_count;
- ring = grp->mrg_rings;
- for (cnt = 0; cnt < ringcnt; cnt++) {
- if (ring->mr_state != MR_INUSE) {
+
+ if (grp == NULL)
+ return;
+
+ for (ring = grp->mrg_rings; ring != NULL;
+ ring = ring->mr_next) {
+ if (ring->mr_state != MR_INUSE)
(void) mac_start_ring(ring);
- }
- ring = ring->mr_next;
}
+
return;
}
+
+ /*
+ * Aggr ports should never have SRSes.
+ */
+ ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0);
+
if (flent->fe_tx_srs == NULL) {
(void) mac_srs_create(mcip, flent, SRST_TX | link_type,
NULL, mcip, NULL, NULL);
}
+
mac_tx_srs_setup(mcip, flent);
}
/*
- * Remove all the RX SRSs. If we want to remove only the SRSs associated
- * with h/w rings, leave the S/W SRS alone. This is used when we want to
- * move the MAC client from one group to another, so we need to teardown
- * on the h/w SRSs.
+ * Tear down all the Rx SRSes. If hwonly is set, tear down only the
+ * Rx HW SRSes and leave the SW SRS alone. The hwonly flag is set
+ * when we wish to move a MAC client from one group to another. In
+ * that case, we need to release the current HW SRSes but keep the SW
+ * SRS for continued traffic classification.
*/
void
mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly)
@@ -2525,8 +2572,16 @@ mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly)
flent->fe_rx_srs[i] = NULL;
flent->fe_rx_srs_cnt--;
}
- ASSERT(!hwonly || flent->fe_rx_srs_cnt == 1);
- ASSERT(hwonly || flent->fe_rx_srs_cnt == 0);
+
+ /*
+ * If we are only tearing down the HW SRSes then there must be
+ * one SRS left for SW classification. Otherwise we are tearing
+ * down both HW and SW and there should be no SRSes left.
+ */
+ if (hwonly)
+ VERIFY3S(flent->fe_rx_srs_cnt, ==, 1);
+ else
+ VERIFY3S(flent->fe_rx_srs_cnt, ==, 0);
}
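
A note on the ASSERT-to-VERIFY3 conversions in this change: the
VERIFY3* macros from <sys/debug.h> remain armed in non-DEBUG kernels
and report both operand values on failure, which suits invariants like
the SRS counts here. A trivial sketch (hypothetical helper):

static void
xx_check_srs_count(int cnt, boolean_t hwonly)
{
	/*
	 * Unlike a bare ASSERT, these checks are not compiled out and
	 * the panic message includes both sides of the comparison.
	 */
	if (hwonly)
		VERIFY3S(cnt, ==, 1);
	else
		VERIFY3S(cnt, ==, 0);
}
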
/*
@@ -2828,6 +2883,7 @@ mac_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip,
* even if this is the only client in the default group, we will
* leave group as shared).
*/
+
int
mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
uint32_t link_type)
@@ -2838,7 +2894,8 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
mac_group_t *default_rgroup;
mac_group_t *default_tgroup;
int err;
- uint8_t *mac_addr;
+ uint8_t *mac_addr;
+ uint16_t vid;
mac_group_state_t next_state;
mac_client_impl_t *group_only_mcip;
mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
@@ -2850,6 +2907,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
boolean_t no_unicast;
boolean_t isprimary = flent->fe_type & FLOW_PRIMARY_MAC;
mac_client_impl_t *reloc_pmcip = NULL;
+ boolean_t use_hw;
ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
@@ -2881,15 +2939,19 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
(mrp->mrp_mask & MRP_TXRINGS_UNSPEC));
/*
- * By default we have given the primary all the rings
- * i.e. the default group. Let's see if the primary
- * needs to be relocated so that the addition of this
- * client doesn't impact the primary's performance,
- * i.e. if the primary is in the default group and
- * we add this client, the primary will lose polling.
- * We do this only for NICs supporting dynamic ring
- * grouping and only when this is the first client
- * after the primary (i.e. nactiveclients is 2)
+ * All the rings initially belong to the default group
+ * under dynamic grouping. The primary client uses the
+ * default group when it is the only client. The
+ * default group is also used as the destination for
+ * all multicast and broadcast traffic of all clients.
+ * Therefore, the primary client loses its ability to
+ * poll the softrings on addition of a second client.
+ * To avoid a performance penalty, MAC will move the
+ * primary client to a dedicated group when it can.
+ *
+ * When using static grouping, the primary client
+ * begins life on a non-default group. There is
+ * no moving needed upon addition of a second client.
*/
if (!isprimary && mip->mi_nactiveclients == 2 &&
(group_only_mcip = mac_primary_client_handle(mip)) !=
@@ -2897,6 +2959,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
reloc_pmcip = mac_check_primary_relocation(
group_only_mcip, rxhw);
}
+
/*
* Check to see if we can get an exclusive group for
* this mac address or if there already exists a
@@ -2910,6 +2973,26 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
} else if (rgroup == NULL) {
rgroup = default_rgroup;
}
+
+ /*
+ * If we are adding a second client to a
+ * non-default group then we need to move the
+ * existing client to the default group and
+ * add the new client to the default group as
+ * well.
+ */
+ if (rgroup != default_rgroup &&
+ rgroup->mrg_state == MAC_GROUP_STATE_RESERVED) {
+ group_only_mcip = MAC_GROUP_ONLY_CLIENT(rgroup);
+ err = mac_rx_switch_group(group_only_mcip, rgroup,
+ default_rgroup);
+
+ if (err != 0)
+ goto setup_failed;
+
+ rgroup = default_rgroup;
+ }
+
/*
* Check to see if we can get an exclusive group for
* this mac client. If no groups are available, use
@@ -2941,14 +3024,17 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
rgroup->mrg_cur_count);
}
}
+
flent->fe_rx_ring_group = rgroup;
/*
- * Add the client to the group. This could cause
- * either this group to move to the shared state or
- * cause the default group to move to the shared state.
- * The actions on this group are done here, while the
- * actions on the default group are postponed to
- * the end of this function.
+ * Add the client to the group and update the
+ * group's state. If rgroup != default_group
+ * then the rgroup should only ever have one
+ * client and be in the RESERVED state. But no
+ * matter what, the default_rgroup will enter
+ * the SHARED state since it has to receive
+ * all broadcast and multicast traffic. This
+ * case is handled later in the function.
*/
mac_group_add_client(rgroup, mcip);
next_state = mac_group_next_state(rgroup,
@@ -2973,28 +3059,37 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
&group_only_mcip, default_tgroup, B_FALSE);
tgroup->mrg_state = next_state;
}
- /*
- * Setup the Rx and Tx SRSes. If we got a pristine group
- * exclusively above, mac_srs_group_setup would simply create
- * the required SRSes. If we ended up sharing a previously
- * reserved group, mac_srs_group_setup would also dismantle the
- * SRSes of the previously exclusive group
- */
- mac_srs_group_setup(mcip, flent, link_type);
/* We are setting up minimal datapath only */
- if (no_unicast)
+ if (no_unicast) {
+ mac_srs_group_setup(mcip, flent, link_type);
break;
- /* Program the S/W Classifer */
+ }
+
+ /* Program software classification. */
if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0)
goto setup_failed;
- /* Program the H/W Classifier */
- if ((err = mac_add_macaddr(mip, rgroup, mac_addr,
- (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0)) != 0)
+ /* Program hardware classification. */
+ vid = i_mac_flow_vid(flent);
+ use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
+ err = mac_add_macaddr_vlan(mip, rgroup, mac_addr, vid, use_hw);
+
+ if (err != 0)
goto setup_failed;
+
mcip->mci_unicast = mac_find_macaddr(mip, mac_addr);
- ASSERT(mcip->mci_unicast != NULL);
+ VERIFY3P(mcip->mci_unicast, !=, NULL);
+
+ /*
+ * Setup the Rx and Tx SRSes. If the client has a
+ * reserved group, then mac_srs_group_setup() creates
+ * the required SRSes for the HW rings. If we have a
+ * shared group, mac_srs_group_setup() dismantles the
+ * HW SRSes of the previously exclusive group.
+ */
+ mac_srs_group_setup(mcip, flent, link_type);
+
/* (Re)init the v6 token & local addr used by link protection */
mac_protect_update_mac_token(mcip);
break;
@@ -3038,17 +3133,23 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
ASSERT(default_rgroup->mrg_state ==
MAC_GROUP_STATE_SHARED);
}
+
/*
- * If we get an exclusive group for a VLAN MAC client we
- * need to take the s/w path to make the additional check for
- * the vid. Disable polling and set it to s/w classification.
- * Similarly for clients that don't have a unicast address.
+ * A VLAN MAC client on a reserved group still
+ * requires SW classification if the MAC doesn't
+ * provide VLAN HW filtering.
+ *
+ * Clients with no unicast address also require SW
+ * classification.
*/
if (rgroup->mrg_state == MAC_GROUP_STATE_RESERVED &&
- (i_mac_flow_vid(flent) != VLAN_ID_NONE || no_unicast)) {
+ ((!MAC_GROUP_HW_VLAN(rgroup) && vid != VLAN_ID_NONE) ||
+ no_unicast)) {
mac_rx_switch_grp_to_sw(rgroup);
}
}
+
mac_set_rings_effective(mcip);
return (0);
@@ -3074,6 +3175,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
boolean_t check_default_group = B_FALSE;
mac_group_state_t next_state;
mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
+ uint16_t vid;
ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
@@ -3086,16 +3188,24 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
case SRST_LINK:
/* Stop sending packets */
mac_tx_client_block(mcip);
+ group = flent->fe_rx_ring_group;
+ vid = i_mac_flow_vid(flent);
- /* Stop the packets coming from the H/W */
+ /*
+ * Stop the packet flow from the hardware by disabling
+ * any hardware filters assigned to this client.
+ */
if (mcip->mci_unicast != NULL) {
int err;
- err = mac_remove_macaddr(mcip->mci_unicast);
+
+ err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid);
+
if (err != 0) {
- cmn_err(CE_WARN, "%s: failed to remove a MAC"
- " address because of error 0x%x",
+				cmn_err(CE_WARN, "%s: failed to remove MAC HW"
+				    " filters because of error 0x%x",
mip->mi_name, err);
}
+
mcip->mci_unicast = NULL;
}
@@ -3103,12 +3213,12 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
- /* Now quiesce and destroy all SRS and soft rings */
+ /* Quiesce and destroy all the SRSes. */
mac_rx_srs_group_teardown(flent, B_FALSE);
mac_tx_srs_group_teardown(mcip, flent, SRST_LINK);
- ASSERT((mcip->mci_flent == flent) &&
- (flent->fe_next == NULL));
+ ASSERT3P(mcip->mci_flent, ==, flent);
+ ASSERT3P(flent->fe_next, ==, NULL);
/*
* Release our hold on the group as well. We need
@@ -3116,17 +3226,17 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
* left who can use it exclusively. Also, if we
* were the last client, release the group.
*/
- group = flent->fe_rx_ring_group;
default_group = MAC_DEFAULT_RX_GROUP(mip);
if (group != NULL) {
mac_group_remove_client(group, mcip);
next_state = mac_group_next_state(group,
&grp_only_mcip, default_group, B_TRUE);
+
if (next_state == MAC_GROUP_STATE_RESERVED) {
/*
* Only one client left on this RX group.
*/
- ASSERT(grp_only_mcip != NULL);
+ VERIFY3P(grp_only_mcip, !=, NULL);
mac_set_group_state(group,
MAC_GROUP_STATE_RESERVED);
group_only_flent = grp_only_mcip->mci_flent;
@@ -3151,7 +3261,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
* to see if the primary client can get
* exclusive access to the default group.
*/
- ASSERT(group != MAC_DEFAULT_RX_GROUP(mip));
+ VERIFY3P(group, !=, MAC_DEFAULT_RX_GROUP(mip));
if (mrp->mrp_mask & MRP_RX_RINGS) {
MAC_RX_GRP_RELEASED(mip);
if (mip->mi_rx_group_type ==
@@ -3165,7 +3275,8 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
MAC_GROUP_STATE_REGISTERED);
check_default_group = B_TRUE;
} else {
- ASSERT(next_state == MAC_GROUP_STATE_SHARED);
+ VERIFY3S(next_state, ==,
+ MAC_GROUP_STATE_SHARED);
mac_set_group_state(group,
MAC_GROUP_STATE_SHARED);
mac_rx_group_unmark(group, MR_CONDEMNED);
@@ -3254,12 +3365,12 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
*/
if (check_default_group) {
default_group = MAC_DEFAULT_RX_GROUP(mip);
- ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED);
+ VERIFY3S(default_group->mrg_state, ==, MAC_GROUP_STATE_SHARED);
next_state = mac_group_next_state(default_group,
&grp_only_mcip, default_group, B_TRUE);
if (next_state == MAC_GROUP_STATE_RESERVED) {
- ASSERT(grp_only_mcip != NULL &&
- mip->mi_nactiveclients == 1);
+ VERIFY3P(grp_only_mcip, !=, NULL);
+ VERIFY3U(mip->mi_nactiveclients, ==, 1);
mac_set_group_state(default_group,
MAC_GROUP_STATE_RESERVED);
mac_rx_srs_group_setup(grp_only_mcip,
@@ -3385,7 +3496,7 @@ mac_srs_free(mac_soft_ring_set_t *mac_srs)
ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE |
SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE));
- mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE);
+ mac_drop_chain(mac_srs->srs_first, "SRS free");
mac_srs_ring_free(mac_srs);
mac_srs_soft_rings_free(mac_srs);
mac_srs_fanout_list_free(mac_srs);
@@ -3783,7 +3894,7 @@ mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring)
* is also stored in st_soft_rings[] array. That entry should
* be removed.
*/
- if (mcip->mci_state_flags & MCIS_IS_AGGR) {
+ if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) {
mac_srs_tx_t *tx = &mac_srs->srs_tx;
ASSERT(tx->st_soft_rings[tx_ring->mr_index] == remove_sring);
@@ -3812,7 +3923,7 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent)
boolean_t is_aggr;
uint_t ring_info = 0;
- is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR) != 0;
+ is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) != 0;
grp = flent->fe_tx_ring_group;
if (grp == NULL) {
ring = (mac_ring_t *)mip->mi_default_tx_ring;
@@ -3956,8 +4067,8 @@ mac_fanout_recompute_client(mac_client_impl_t *mcip, cpupart_t *cpupart)
}
/*
- * Walk through the list of mac clients for the MAC.
- * For each active mac client, recompute the number of soft rings
+ * Walk through the list of MAC clients for the MAC.
+ * For each active MAC client, recompute the number of soft rings
* associated with every client, only if current speed is different
* from the speed that was previously used for soft ring computation.
 * If the cable is disconnected while the NIC is started, we would get
@@ -3980,6 +4091,10 @@ mac_fanout_recompute(mac_impl_t *mip)
for (mcip = mip->mi_clients_list; mcip != NULL;
mcip = mcip->mci_client_next) {
+ /* Aggr port clients don't have SRSes. */
+ if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0)
+ continue;
+
if ((mcip->mci_state_flags & MCIS_SHARE_BOUND) != 0 ||
!MCIP_DATAPATH_SETUP(mcip))
continue;
@@ -3992,6 +4107,7 @@ mac_fanout_recompute(mac_impl_t *mip)
mac_set_pool_effective(use_default, cpupart, mrp, emrp);
pool_unlock();
}
+
i_mac_perim_exit(mip);
}
diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c
index aa4985fe4c..62612122d6 100644
--- a/usr/src/uts/common/io/mac/mac_flow.c
+++ b/usr/src/uts/common/io/mac/mac_flow.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/strsun.h>
@@ -229,7 +230,7 @@ mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
/* Initialize the receiver function to a safe routine */
- flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
flent->fe_index = -1;
}
(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c
index da83dc643e..ee493bbca1 100644
--- a/usr/src/uts/common/io/mac/mac_protect.c
+++ b/usr/src/uts/common/io/mac/mac_protect.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc. All rights reserved.
*/
/*
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
@@ -209,7 +209,7 @@ typedef struct slaac_addr {
} slaac_addr_t;
static void start_txn_cleanup_timer(mac_client_impl_t *);
-static boolean_t allowed_ips_set(mac_resource_props_t *, uint32_t);
+static boolean_t dynamic_method_set(mac_protect_t *, uint32_t);
#define BUMP_STAT(m, s) (m)->mci_misc_stat.mms_##s++
@@ -580,8 +580,7 @@ intercept_dhcpv4_outbound(mac_client_impl_t *mcip, ipha_t *ipha, uchar_t *end)
if (get_dhcpv4_info(ipha, end, &dh4) != 0)
return (B_TRUE);
- /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */
- if (allowed_ips_set(mrp, IPV4_VERSION))
+ if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV4))
return (B_FALSE);
if (get_dhcpv4_option(dh4, end, CD_DHCP_TYPE, &opt, &opt_len) != 0 ||
@@ -1310,8 +1309,7 @@ intercept_dhcpv6_outbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end)
if (get_dhcpv6_info(ip6h, end, &dh6) != 0)
return (B_TRUE);
- /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */
- if (allowed_ips_set(mrp, IPV6_VERSION))
+ if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV6))
return (B_FALSE);
/*
@@ -1517,6 +1515,10 @@ intercept_ra_inbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end,
{
struct nd_opt_hdr *opt;
int len, optlen;
+ mac_protect_t *protect = &MCIP_RESOURCE_PROPS(mcip)->mrp_protect;
+
+ if (!dynamic_method_set(protect, MPT_DYN_SLAAC))
+ return;
if (ip6h->ip6_hlim != 255) {
DTRACE_PROBE1(invalid__hoplimit, uint8_t, ip6h->ip6_hlim);
@@ -1755,6 +1757,7 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect,
if (*addr == INADDR_ANY)
return (B_TRUE);
+ /* If any specific addresses or subnets are allowed, check them */
for (i = 0; i < protect->mp_ipaddrcnt; i++) {
mac_ipaddr_t *v4addr = &protect->mp_ipaddrs[i];
@@ -1775,14 +1778,19 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect,
return (B_TRUE);
}
}
- return (protect->mp_ipaddrcnt == 0 ?
- check_dhcpv4_dyn_ip(mcip, *addr) : B_FALSE);
+
+ if (dynamic_method_set(protect, MPT_DYN_DHCPV4)) {
+ return (check_dhcpv4_dyn_ip(mcip, *addr));
+ }
+
+ return (B_FALSE);
}
static boolean_t
ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect,
in6_addr_t *addr)
{
+ boolean_t slaac_enabled, dhcpv6_enabled;
uint_t i;
/*
@@ -1793,7 +1801,7 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect,
IN6_ARE_ADDR_EQUAL(&mcip->mci_v6_local_addr, addr)))
return (B_TRUE);
-
+ /* If any specific addresses or subnets are allowed, check them */
for (i = 0; i < protect->mp_ipaddrcnt; i++) {
mac_ipaddr_t *v6addr = &protect->mp_ipaddrs[i];
@@ -1804,12 +1812,15 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect,
return (B_TRUE);
}
- if (protect->mp_ipaddrcnt == 0) {
- return (check_slaac_ip(mcip, addr) ||
- check_dhcpv6_dyn_ip(mcip, addr));
- } else {
- return (B_FALSE);
- }
+ slaac_enabled = dynamic_method_set(protect, MPT_DYN_SLAAC);
+ if (slaac_enabled && check_slaac_ip(mcip, addr))
+ return (B_TRUE);
+
+ dhcpv6_enabled = dynamic_method_set(protect, MPT_DYN_DHCPV6);
+ if (dhcpv6_enabled && check_dhcpv6_dyn_ip(mcip, addr))
+ return (B_TRUE);
+
+ return (B_FALSE);
}
/*
@@ -2025,6 +2036,9 @@ dhcpnospoof_check_cid(mac_protect_t *p, uchar_t *cid, uint_t cidlen)
bcmp(dcid->dc_id, cid, cidlen) == 0)
return (B_TRUE);
}
+
+ DTRACE_PROBE3(missing__cid, mac_protect_t *, p,
+ uchar_t *, cid, uint_t, cidlen);
return (B_FALSE);
}
@@ -2046,6 +2060,12 @@ dhcpnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *p,
bcmp(mcip->mci_unicast->ma_addr, dh4->chaddr, maclen) != 0) {
return (B_FALSE);
}
+
+ /* Everything after here is checking the Client Identifier */
+ if (p->mp_allcids == MPT_TRUE) {
+ return (B_TRUE);
+ }
+
if (get_dhcpv4_option(dh4, end, CD_CLIENT_ID, &cid, &optlen) == 0)
cidlen = optlen;
@@ -2082,6 +2102,11 @@ dhcpnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *p,
mtype == DHCPV6_MSG_RECONFIGURE)
return (B_TRUE);
+ /* Everything after here is checking the Client Identifier */
+ if (p->mp_allcids == MPT_TRUE) {
+ return (B_TRUE);
+ }
+
d6o = get_dhcpv6_option(&dh6[1], end - (uchar_t *)&dh6[1], NULL,
DHCPV6_OPT_CLIENTID, &cidlen);
if (d6o == NULL || (uchar_t *)d6o + cidlen > end)
@@ -2159,7 +2184,6 @@ dhcpnospoof_check(mac_client_impl_t *mcip, mac_protect_t *protect,
return (0);
fail:
- /* increment dhcpnospoof stat here */
freemsg(nmp);
return (err);
}
@@ -2487,6 +2511,11 @@ mac_protect_validate(mac_resource_props_t *mrp)
if ((err = validate_cids(p)) != 0)
return (err);
+ if (p->mp_allcids != MPT_FALSE && p->mp_allcids != MPT_TRUE &&
+ p->mp_allcids != MPT_RESET) {
+ return (EINVAL);
+ }
+
return (0);
}
@@ -2554,6 +2583,16 @@ mac_protect_update(mac_resource_props_t *new, mac_resource_props_t *curr)
cp->mp_cidcnt = 0;
}
}
+ if (np->mp_allcids == MPT_RESET) {
+ cp->mp_allcids = MPT_FALSE;
+ } else if (np->mp_allcids != 0) {
+ cp->mp_allcids = MPT_TRUE;
+ }
+ if (np->mp_dynamic == MPT_RESET) {
+ cp->mp_dynamic = 0;
+ } else if (np->mp_dynamic != 0) {
+ cp->mp_dynamic = np->mp_dynamic;
+ }
}
void
@@ -2597,15 +2636,50 @@ mac_protect_fini(mac_client_impl_t *mcip)
}
static boolean_t
-allowed_ips_set(mac_resource_props_t *mrp, uint32_t af)
+dynamic_method_set(mac_protect_t *mpt, uint32_t method)
+{
+ if (mpt->mp_dynamic != 0) {
+ return ((mpt->mp_dynamic & method) != 0);
+ } else {
+ return (mpt->mp_ipaddrcnt == 0);
+ }
+}
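
The compatibility rule encoded above, restated as a hypothetical
caller would rely on it (sketch only): with mp_dynamic unset, dynamic
address methods are implicitly allowed only while no static
allowed-ips are configured; once mp_dynamic names any method, only the
named methods pass.

static boolean_t
xx_dhcpv4_allowed(mac_protect_t *mpt)
{
	/* B_TRUE if DHCPv4 is named in mp_dynamic, or the legacy default applies */
	return (dynamic_method_set(mpt, MPT_DYN_DHCPV4));
}
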
+
+boolean_t
+mac_protect_check_addr(mac_client_handle_t mch, boolean_t isv6,
+ in6_addr_t *v6addr)
{
- int i;
+ mac_perim_handle_t perim;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_handle_t mh = (mac_handle_t)mcip->mci_mip;
- for (i = 0; i < mrp->mrp_protect.mp_ipaddrcnt; i++) {
- if (mrp->mrp_protect.mp_ipaddrs[i].ip_version == af)
- return (B_TRUE);
+ mac_perim_enter_by_mh(mh, &perim);
+
+ mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
+ mac_protect_t *p;
+ boolean_t allowed;
+
+ ASSERT(mrp != NULL);
+
+ p = &mrp->mrp_protect;
+
+ /* If mac protection/ipnospoof isn't enabled, return true */
+ if ((mrp->mrp_mask & MRP_PROTECT) == 0 ||
+ (p->mp_types & MPT_IPNOSPOOF) == 0) {
+ allowed = B_TRUE;
+ goto done;
}
- return (B_FALSE);
+
+ if (isv6) {
+ allowed = ipnospoof_check_v6(mcip, p, v6addr);
+ } else {
+ in_addr_t *v4addr = &V4_PART_OF_V6((*v6addr));
+ allowed = ipnospoof_check_v4(mcip, p, v4addr);
+ }
+
+done:
+ mac_perim_exit(perim);
+ return (allowed);
}
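
A hypothetical caller sketch for the new entry point (the wrapper name
is made up; IN6_IPADDR_TO_V4MAPPED is the usual illumos helper): v4
addresses are passed v4-mapped, matching the V4_PART_OF_V6()
extraction above.

static boolean_t
xx_addr_ok_v4(mac_client_handle_t mch, in_addr_t v4)
{
	in6_addr_t v6;

	IN6_IPADDR_TO_V4MAPPED(v4, &v6);
	return (mac_protect_check_addr(mch, B_FALSE, &v6));
}
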
mac_protect_t *
diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c
index 07201afdec..cb1a76aef6 100644
--- a/usr/src/uts/common/io/mac/mac_provider.c
+++ b/usr/src/uts/common/io/mac/mac_provider.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
*/
@@ -56,6 +57,7 @@
#include <sys/sdt.h>
#include <sys/pattr.h>
#include <sys/strsun.h>
+#include <sys/vlan.h>
/*
* MAC Provider Interface.
@@ -351,6 +353,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp)
if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
mip->mi_state_flags |= MIS_IS_AGGR;
+ if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL))
+ mip->mi_state_flags |= MIS_IS_OVERLAY;
+
mac_addr_factory_init(mip);
mac_transceiver_init(mip);
@@ -697,7 +702,6 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
mac_ring_t *mr = (mac_ring_t *)mrh;
mac_soft_ring_set_t *mac_srs;
mblk_t *bp = mp_chain;
- boolean_t hw_classified = B_FALSE;
/*
* If there are any promiscuous mode callbacks defined for
@@ -709,7 +713,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
if (mr != NULL) {
/*
* If the SRS teardown has started, just return. The 'mr'
- * continues to be valid until the driver unregisters the mac.
+ * continues to be valid until the driver unregisters the MAC.
* Hardware classified packets will not make their way up
* beyond this point once the teardown has started. The driver
* is never passed a pointer to a flow entry or SRS or any
@@ -722,11 +726,25 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
freemsgchain(mp_chain);
return;
}
- if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
- hw_classified = B_TRUE;
+
+ /*
+ * The ring is in passthru mode; pass the chain up to
+ * the pseudo ring.
+ */
+ if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) {
MR_REFHOLD_LOCKED(mr);
+ mutex_exit(&mr->mr_lock);
+ mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain,
+ B_FALSE);
+ MR_REFRELE(mr);
+ return;
}
- mutex_exit(&mr->mr_lock);
+
+ /*
+ * The passthru callback should only be set when in
+ * MAC_PASSTHRU_CLASSIFIER mode.
+ */
+ ASSERT3P(mr->mr_pt_fn, ==, NULL);
/*
* We check if an SRS is controlling this ring.
@@ -734,19 +752,24 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
* routine otherwise we need to go through mac_rx_classify
* to reach the right place.
*/
- if (hw_classified) {
+ if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
+ MR_REFHOLD_LOCKED(mr);
+ mutex_exit(&mr->mr_lock);
+ ASSERT3P(mr->mr_srs, !=, NULL);
mac_srs = mr->mr_srs;
+
/*
- * This is supposed to be the fast path.
- * All packets received though here were steered by
- * the hardware classifier, and share the same
- * MAC header info.
+ * This is the fast path. All packets received
+ * on this ring are hardware classified and
+ * share the same MAC header info.
*/
mac_srs->srs_rx.sr_lower_proc(mh,
(mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
MR_REFRELE(mr);
return;
}
+
+ mutex_exit(&mr->mr_lock);
/* We'll fall through to software classification */
} else {
flow_entry_t *flent;
@@ -1472,7 +1495,8 @@ mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm)
pr->pr_flags |= MAC_PROP_INFO_PERM;
}
-void mac_hcksum_get(mblk_t *mp, uint32_t *start, uint32_t *stuff,
+void
+mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff,
uint32_t *end, uint32_t *value, uint32_t *flags_ptr)
{
uint32_t flags;
@@ -1497,8 +1521,9 @@ void mac_hcksum_get(mblk_t *mp, uint32_t *start, uint32_t *stuff,
*flags_ptr = flags;
}
-void mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff,
- uint32_t end, uint32_t value, uint32_t flags)
+void
+mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end,
+ uint32_t value, uint32_t flags)
{
ASSERT(DB_TYPE(mp) == M_DATA);
@@ -1510,6 +1535,31 @@ void mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff,
}
void
+mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
+{
+ ASSERT3U(DB_TYPE(src), ==, M_DATA);
+ ASSERT3U(DB_TYPE(dst), ==, M_DATA);
+
+ /*
+ * Do these assignments unconditionally, rather than only when
+	 * flags is non-zero. This guards against zeroed hcksum data in
+	 * the source failing to overwrite stale data left in those
+	 * fields on the destination mblk_t. It's important to copy all
+ * possible flags (HCK_* as well as HW_*) and not just the
+ * checksum specific flags. Dropping flags during a clone
+ * could result in dropped packets. If the caller has good
+ * reason to drop those flags then it should do it manually,
+ * after the clone.
+ */
+ DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src);
+ DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
+ DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
+ DB_CKSUMEND(dst) = DB_CKSUMEND(src);
+ DB_CKSUM16(dst) = DB_CKSUM16(src);
+ DB_LSOMSS(dst) = DB_LSOMSS(src);
+}
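
A usage sketch (hypothetical helper; whether a given dup or copy
primitive preserves the dblk checksum fields is deliberately not
assumed here):

static mblk_t *
xx_dup_with_offloads(mblk_t *mp)
{
	mblk_t *copy = copymsg(mp);

	/*
	 * Copy the offload metadata explicitly so the duplicate is
	 * never handed on with stale or missing flags.
	 */
	if (copy != NULL)
		mac_hcksum_clone(mp, copy);

	return (copy);
}
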
+
+void
mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
{
ASSERT(DB_TYPE(mp) == M_DATA);
diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c
index d046930873..0e62f828a9 100644
--- a/usr/src/uts/common/io/mac/mac_sched.c
+++ b/usr/src/uts/common/io/mac/mac_sched.c
@@ -21,7 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
*/
@@ -300,9 +300,8 @@
*
* Otherwise, all fanout is performed by software. MAC divides incoming frames
* into one of three buckets -- IPv4 TCP traffic, IPv4 UDP traffic, and
- * everything else. Note, VLAN tagged traffic is considered other, regardless of
- * the interior EtherType. Regardless of the type of fanout, these three
- * categories or buckets are always used.
+ * everything else. Regardless of the type of fanout, these three categories
+ * or buckets are always used.
*
* The difference between protocol level fanout and full software ring protocol
* fanout is the number of software rings that end up getting created. The
@@ -969,6 +968,7 @@
#include <sys/types.h>
#include <sys/callb.h>
+#include <sys/pattr.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
@@ -1328,7 +1328,7 @@ int mac_srs_worker_wakeup_ticks = 0;
* b_prev may be set to the fanout hint \
* hence can't use freemsg directly \
*/ \
- mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \
+ mac_drop_chain(mp_chain, "SRS Tx max queue"); \
DTRACE_PROBE1(tx_queued_hiwat, \
mac_soft_ring_set_t *, srs); \
enqueue = 0; \
@@ -1347,11 +1347,11 @@ int mac_srs_worker_wakeup_ticks = 0;
if (!(srs->srs_type & SRST_TX)) \
mutex_exit(&srs->srs_bw->mac_bw_lock);
-#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
- mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
+#define MAC_TX_SRS_DROP_MESSAGE(srs, chain, cookie, s) { \
+ mac_drop_pkt((chain), (s)); \
/* increment freed stats */ \
- mac_srs->srs_tx.st_stat.mts_sdrops++; \
- cookie = (mac_tx_cookie_t)srs; \
+ (srs)->srs_tx.st_stat.mts_sdrops++; \
+ (cookie) = (mac_tx_cookie_t)(srs); \
}
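
Beyond taking a drop-reason string, the rewrite above is a
macro-hygiene fix: the old body referenced the caller-scope name
mac_srs instead of its own srs parameter and left its parameters
unparenthesized. A generic illustration (hypothetical macros) of why
the parentheses matter:

#define	XX_BAD_NEG(x)	-x		/* XX_BAD_NEG(a - b) -> -a - b */
#define	XX_GOOD_NEG(x)	(-(x))		/* XX_GOOD_NEG(a - b) -> -(a - b) */
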
#define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
@@ -1367,11 +1367,11 @@ int mac_srs_worker_wakeup_ticks = 0;
* can occur in situ (in the interrupt thread) or if it should be left to a
* worker thread. Note that the constant used to make this determination is
* not entirely made-up, and is a result of some emprical validation. That
- * said, the constant is left as a static variable to allow it to be
+ * said, the constant is left as a global variable to allow it to be
* dynamically tuned in the field if and as needed.
*/
-static uintptr_t mac_rx_srs_stack_needed = 10240;
-static uint_t mac_rx_srs_stack_toodeep;
+uintptr_t mac_rx_srs_stack_needed = 14336;
+uint_t mac_rx_srs_stack_toodeep;
#ifndef STACK_GROWTH_DOWN
#error Downward stack growth assumed.
@@ -1379,7 +1379,7 @@ static uint_t mac_rx_srs_stack_toodeep;
#define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
(uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
- ++mac_rx_srs_stack_toodeep)
+ (++mac_rx_srs_stack_toodeep || (mac_rx_srs_stack_toodeep = 1)))
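
The raised stack threshold aside, the reworked counter expression
merits a note: MAC_RX_SRS_TOODEEP() must stay nonzero whenever the
depth check trips, and a bare increment would yield 0 once the stat
wraps. A standalone illustration (hypothetical names):

static uint_t xx_toodeep;

static int
xx_bump_nonzero(void)
{
	/*
	 * ++xx_toodeep evaluates to 0 only at wrap-around; the ||
	 * arm then stores, and evaluates to, 1 -- so the result is
	 * never 0.
	 */
	return (++xx_toodeep || (xx_toodeep = 1));
}
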
/*
@@ -1475,16 +1475,15 @@ enum pkt_type {
#define PORTS_SIZE 4
/*
- * mac_rx_srs_proto_fanout
- *
- * This routine delivers packets destined to an SRS into one of the
+ * This routine delivers packets destined for an SRS into one of the
* protocol soft rings.
*
- * Given a chain of packets we need to split it up into multiple sub chains
- * destined into TCP, UDP or OTH soft ring. Instead of entering
- * the soft ring one packet at a time, we want to enter it in the form of a
- * chain otherwise we get this start/stop behaviour where the worker thread
- * goes to sleep and then next packets comes in forcing it to wake up etc.
+ * Given a chain of packets we need to split it into multiple sub
+ * chains destined for the TCP, UDP or OTH soft ring. Instead of
+ * entering the soft ring one packet at a time, we want to enter it
+ * as a chain; otherwise we get start/stop behaviour where the
+ * worker thread goes to sleep and then the next packet comes in,
+ * forcing it to wake up.
*/
static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
@@ -1523,9 +1522,9 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
/*
- * Special clients (eg. VLAN, non ether, etc) need DLS
- * processing in the Rx path. SRST_DLS_BYPASS will be clear for
- * such SRSs. Another way of disabling bypass is to set the
+	 * Some clients, such as non-Ethernet ones, need DLS processing in
+ * the Rx path. Such clients clear the SRST_DLS_BYPASS flag.
+ * DLS bypass may also be disabled via the
* MCIS_RX_BYPASS_DISABLE flag.
*/
dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
@@ -1537,10 +1536,11 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
bzero(sz, MAX_SR_TYPES * sizeof (size_t));
/*
- * We got a chain from SRS that we need to send to the soft rings.
- * Since squeues for TCP & IPv4 sap poll their soft rings (for
- * performance reasons), we need to separate out v4_tcp, v4_udp
- * and the rest goes in other.
+ * We have a chain from SRS that we need to split across the
+ * soft rings. The squeues for the TCP and IPv4 SAPs use their
+ * own soft rings to allow polling from the squeue. The rest of
+ * the packets are delivered on the OTH soft ring which cannot
+ * be polled.
*/
while (head != NULL) {
mp = head;
@@ -1568,9 +1568,14 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
evhp = (struct ether_vlan_header *)mp->b_rptr;
sap = ntohs(evhp->ether_type);
hdrsize = sizeof (struct ether_vlan_header);
+
/*
- * Check if the VID of the packet, if any,
- * belongs to this client.
+ * Check if the VID of the packet, if
+ * any, belongs to this client.
+ * Technically, if this packet came up
+ * via a HW classified ring then we
+ * don't need to perform this check.
+ * Perhaps a future optimization.
*/
if (!mac_client_check_flow_vid(mcip,
VLAN_ID(ntohs(evhp->ether_tci)))) {
@@ -1635,7 +1640,6 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
* performance and may bypass DLS. All other cases go through
* the 'OTH' type path without DLS bypass.
*/
-
ipha = (ipha_t *)(mp->b_rptr + hdrsize);
if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
type = OTH;
@@ -1647,11 +1651,13 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
}
ASSERT(type == UNDEF);
+
/*
- * We look for at least 4 bytes past the IP header to get
- * the port information. If we get an IP fragment, we don't
- * have the port information, and we use just the protocol
- * information.
+ * Determine the type from the IP protocol value. If
+ * classified as TCP or UDP, then update the read
+ * pointer to the beginning of the IP header.
+ * Otherwise leave the message as is for further
+ * processing by DLS.
*/
switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
@@ -1695,11 +1701,10 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
int fanout_unaligned = 0;
/*
- * mac_rx_srs_long_fanout
- *
- * The fanout routine for VLANs, and for anything else that isn't performing
- * explicit dls bypass. Returns -1 on an error (drop the packet due to a
- * malformed packet), 0 on success, with values written in *indx and *type.
+ * The fanout routine for any clients with DLS bypass disabled or for
+ * traffic classified as "other". Returns -1 on an error (drop the
+ * packet due to a malformed packet), 0 on success, with values
+ * written in *indx and *type.
*/
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
@@ -1865,16 +1870,15 @@ src_dst_based_fanout:
}
/*
- * mac_rx_srs_fanout
- *
- * This routine delivers packets destined to an SRS into a soft ring member
+ * This routine delivers packets destined for an SRS into a soft ring member
* of the set.
*
- * Given a chain of packets we need to split it up into multiple sub chains
- * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
- * the soft ring one packet at a time, we want to enter it in the form of a
- * chain otherwise we get this start/stop behaviour where the worker thread
- * goes to sleep and then next packets comes in forcing it to wake up etc.
+ * Given a chain of packets we need to split it into multiple sub
+ * chains destined for the TCP, UDP or OTH soft ring. Instead of
+ * entering the soft ring one packet at a time, we want to enter it
+ * as a chain; otherwise we get start/stop behaviour where the
+ * worker thread goes to sleep and then the next packet comes in,
+ * forcing it to wake up.
*
* Note:
* Since we know what is the maximum fanout possible, we create a 2D array
@@ -1935,10 +1939,11 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
/*
- * Special clients (eg. VLAN, non ether, etc) need DLS
- * processing in the Rx path. SRST_DLS_BYPASS will be clear for
- * such SRSs. Another way of disabling bypass is to set the
- * MCIS_RX_BYPASS_DISABLE flag.
+	 * Some clients, such as non-Ethernet ones, need DLS processing in
+ * the Rx path. Such clients clear the SRST_DLS_BYPASS flag.
+ * DLS bypass may also be disabled via the
+ * MCIS_RX_BYPASS_DISABLE flag, but this is only consumed by
+ * sun4v vsw currently.
*/
dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
@@ -1960,7 +1965,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
/*
* We got a chain from SRS that we need to send to the soft rings.
- * Since squeues for TCP & IPv4 sap poll their soft rings (for
+ * Since squeues for TCP & IPv4 SAP poll their soft rings (for
* performance reasons), we need to separate out v4_tcp, v4_udp
* and the rest goes in other.
*/
@@ -1990,9 +1995,14 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
evhp = (struct ether_vlan_header *)mp->b_rptr;
sap = ntohs(evhp->ether_type);
hdrsize = sizeof (struct ether_vlan_header);
+
/*
- * Check if the VID of the packet, if any,
- * belongs to this client.
+ * Check if the VID of the packet, if
+ * any, belongs to this client.
+ * Technically, if this packet came up
+ * via a HW classified ring then we
+ * don't need to perform this check.
+ * Perhaps a future optimization.
*/
if (!mac_client_check_flow_vid(mcip,
VLAN_ID(ntohs(evhp->ether_tci)))) {
@@ -2032,7 +2042,6 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
continue;
}
-
/*
* If we are using the default Rx ring where H/W or S/W
* classification has not happened, we need to verify if
@@ -2621,7 +2630,6 @@ again:
mac_srs->srs_state |= (SRS_PROC|proc_type);
-
/*
* mcip is NULL for broadcast and multicast flows. The promisc
* callbacks for broadcast and multicast packets are delivered from
@@ -2641,10 +2649,8 @@ again:
}
/*
- * Check if SRS itself is doing the processing
- * This direct path does not apply when subflows are present. In this
- * case, packets need to be dispatched to a soft ring according to the
- * flow's bandwidth and other resources contraints.
+ * Check if SRS itself is doing the processing. This direct
+ * path applies only when subflows are present.
*/
if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
mac_direct_rx_t proc;
@@ -2888,7 +2894,7 @@ again:
mac_srs->srs_bw->mac_bw_sz -= sz;
mac_srs->srs_bw->mac_bw_drop_bytes += sz;
mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
- mac_pkt_drop(NULL, NULL, head, B_FALSE);
+ mac_drop_chain(head, "Rx no bandwidth");
goto leave_poll;
} else {
mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
@@ -3270,9 +3276,10 @@ mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
}
/*
- * mac_rx_srs_process
- *
- * Receive side routine called from the interrupt path.
+ * MAC SRS receive side routine. If the data is coming from the
+ * network (i.e. from a NIC) then this is called in interrupt context.
+ * If the data is coming from a local sender (e.g. mac_tx_send() or
+ * bridge_forward()) then this is not called in interrupt context.
*
* loopback is set to force a context switch on the loopback
* path between MAC clients.
@@ -3332,7 +3339,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
mac_bw->mac_bw_drop_bytes += sz;
mutex_exit(&mac_bw->mac_bw_lock);
mutex_exit(&mac_srs->srs_lock);
- mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ mac_drop_chain(mp_chain, "Rx no bandwidth");
return;
} else {
if ((mac_bw->mac_bw_sz + sz) <=
@@ -3454,7 +3461,8 @@ mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
if (flag & MAC_DROP_ON_NO_DESC) {
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie,
+ "Tx no desc");
} else {
if (mac_srs->srs_first != NULL)
wakeup_worker = B_FALSE;
@@ -3517,7 +3525,8 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
if (flag & MAC_DROP_ON_NO_DESC) {
if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie,
+ "Tx SRS hiwat");
} else {
MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
mp_chain, tail, cnt, sz);
@@ -3890,7 +3899,8 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
cookie = (mac_tx_cookie_t)mac_srs;
*ret_mp = mp_chain;
} else {
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie,
+ "Tx no bandwidth");
}
mutex_exit(&mac_srs->srs_lock);
return (cookie);
@@ -4336,6 +4346,14 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
obytes += (mp->b_cont == NULL ? MBLKL(mp) :
msgdsize(mp));
+ /*
+ * Mark all packets as local so that a
+ * receiver can determine if a packet arrived
+ * from a local source or from the network.
+ * This allows some consumers to avoid
+	 * unnecessary work like checksum computation.
+ */
+ DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC;
CHECK_VID_AND_ADD_TAG(mp);
MAC_TX(mip, ring, mp, src_mcip);
@@ -4368,7 +4386,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
flow_entry_t *dst_flow_ent;
void *flow_cookie;
size_t pkt_size;
- mblk_t *mp1;
next = mp->b_next;
mp->b_next = NULL;
@@ -4378,49 +4395,25 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
CHECK_VID_AND_ADD_TAG(mp);
/*
+ * Mark all packets as local so that a receiver can
+ * determine if a packet arrived from a local source
+ * or from the network. This allows some consumers to
+	 * avoid unnecessary work like checksum computation.
+ */
+ DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC;
+
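A receiver can key off this marking; a hypothetical consumer-side sketch (the check and the skipped work are illustrative, not part of this change):

	/* Sketch: payload from a local sender never hit the wire. */
	if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
		/* e.g., skip software checksum verification */
	}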
+ /*
* Find the destination.
*/
dst_flow_ent = mac_tx_classify(mip, mp);
if (dst_flow_ent != NULL) {
- size_t hdrsize;
- int err = 0;
-
- if (mip->mi_info.mi_nativemedia == DL_ETHER) {
- struct ether_vlan_header *evhp =
- (struct ether_vlan_header *)mp->b_rptr;
-
- if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
- hdrsize = sizeof (*evhp);
- else
- hdrsize = sizeof (struct ether_header);
- } else {
- mac_header_info_t mhi;
-
- err = mac_header_info((mac_handle_t)mip,
- mp, &mhi);
- if (err == 0)
- hdrsize = mhi.mhi_hdrsize;
- }
-
/*
* Got a matching flow. It's either another
* MAC client, or a broadcast/multicast flow.
- * Make sure the packet size is within the
- * allowed size. If not drop the packet and
- * move to next packet.
*/
- if (err != 0 ||
- (pkt_size - hdrsize) > mip->mi_sdu_max) {
- oerrors++;
- DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
- mblk_t *, mp);
- freemsg(mp);
- mp = next;
- FLOW_REFRELE(dst_flow_ent);
- continue;
- }
flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
+
if (flow_cookie != NULL) {
/*
* The vnic_bcast_send function expects
@@ -4438,6 +4431,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
* bypass is set.
*/
boolean_t do_switch;
+
mac_client_impl_t *dst_mcip =
dst_flow_ent->fe_mcip;
@@ -4453,19 +4447,19 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
* check is done inside the MAC_TX()
* macro.
*/
- if (mip->mi_promisc_list != NULL)
+ if (mip->mi_promisc_list != NULL) {
mac_promisc_dispatch(mip, mp, src_mcip);
+ }
do_switch = ((src_mcip->mci_state_flags &
dst_mcip->mci_state_flags &
MCIS_CLIENT_POLL_CAPABLE) != 0);
- if ((mp1 = mac_fix_cksum(mp)) != NULL) {
- (dst_flow_ent->fe_cb_fn)(
- dst_flow_ent->fe_cb_arg1,
- dst_flow_ent->fe_cb_arg2,
- mp1, do_switch);
- }
+ (dst_flow_ent->fe_cb_fn)(
+ dst_flow_ent->fe_cb_arg1,
+ dst_flow_ent->fe_cb_arg2,
+ mp, do_switch);
+
}
FLOW_REFRELE(dst_flow_ent);
} else {
@@ -4656,6 +4650,9 @@ mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
* the packet to the promiscuous listeners of the
* client, since they expect to see the whole
* frame including the VLAN headers.
+ *
+	 * The MCIS_STRIP_DISABLE flag is only set when sun4v
+	 * vsw is in play.
*/
mp_chain = mac_strip_vlan_tag_chain(mp_chain);
}
@@ -4664,13 +4661,11 @@ mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
}
/*
- * mac_rx_soft_ring_process
- *
- * process a chain for a given soft ring. The number of packets queued
- * in the SRS and its associated soft rings (including this one) is
- * very small (tracked by srs_poll_pkt_cnt), then allow the entering
- * thread (interrupt or poll thread) to do inline processing. This
- * helps keep the latency down under low load.
+ * Process a chain for a given soft ring. If the number of packets
+ * queued in the SRS and its associated soft rings (including this
+ * one) is very small (tracked by srs_poll_pkt_cnt) then allow the
+ * entering thread (interrupt or poll thread) to process the chain
+ * inline. This is meant to reduce latency under low load.
*
* The proc and arg for each mblk is already stored in the mblk in
* appropriate places.
@@ -4729,13 +4724,13 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
/*
- * If we have a soft ring set which is doing
- * bandwidth control, we need to decrement
- * srs_size and count so it the SRS can have a
- * accurate idea of what is the real data
- * queued between SRS and its soft rings. We
- * decrement the counters only when the packet
- * gets processed by both SRS and the soft ring.
+ * If we have an SRS performing bandwidth
+ * control then we need to decrement the size
+ * and count so the SRS has an accurate count
+ * of the data queued between the SRS and its
+ * soft rings. We decrement the counters only
+ * when the packet is processed by both the
+ * SRS and the soft ring.
*/
mutex_enter(&mac_srs->srs_lock);
MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
@@ -4751,8 +4746,8 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
if ((ringp->s_ring_first == NULL) ||
(ringp->s_ring_state & S_RING_BLANK)) {
/*
- * We processed inline our packet and
- * nothing new has arrived or our
+ * We processed a single packet inline
+ * and nothing new has arrived or our
* receiver doesn't want to receive
* any packets. We are done.
*/
@@ -4821,7 +4816,7 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
if (flag & MAC_DROP_ON_NO_DESC) {
- mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ mac_drop_chain(mp_chain, "Tx softring no desc");
/* increment freed stats */
ringp->s_ring_drops += cnt;
cookie = (mac_tx_cookie_t)ringp;
@@ -4865,8 +4860,8 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
* b_prev may be set to the fanout hint
* hence can't use freemsg directly
*/
- mac_pkt_drop(NULL, NULL,
- mp_chain, B_FALSE);
+ mac_drop_chain(mp_chain,
+ "Tx softring max queue");
DTRACE_PROBE1(tx_queued_hiwat,
mac_soft_ring_t *, ringp);
enqueue = B_FALSE;
diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c
index dc8cfdd145..4655631dc1 100644
--- a/usr/src/uts/common/io/mac/mac_soft_ring.c
+++ b/usr/src/uts/common/io/mac/mac_soft_ring.c
@@ -21,7 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -207,7 +207,7 @@ mac_soft_ring_create(int id, clock_t wait, uint16_t type,
ringp->s_ring_tx_hiwat =
(mac_tx_soft_ring_hiwat > mac_tx_soft_ring_max_q_cnt) ?
mac_tx_soft_ring_max_q_cnt : mac_tx_soft_ring_hiwat;
- if (mcip->mci_state_flags & MCIS_IS_AGGR) {
+ if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) {
mac_srs_tx_t *tx = &mac_srs->srs_tx;
ASSERT(tx->st_soft_rings[
@@ -242,7 +242,7 @@ mac_soft_ring_free(mac_soft_ring_t *softring)
ASSERT((softring->s_ring_state &
(S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) ==
(S_RING_CONDEMNED | S_RING_CONDEMNED_DONE));
- mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE);
+ mac_drop_chain(softring->s_ring_first, "softring free");
softring->s_ring_tx_arg2 = NULL;
mac_soft_ring_stat_delete(softring);
mac_callback_free(softring->s_ring_notify_cb_list);
@@ -339,15 +339,14 @@ mac_soft_ring_fire(void *arg)
}
/*
- * mac_rx_soft_ring_drain
+ * Drain the soft ring pointed to by ringp.
*
- * Called when worker thread model (ST_RING_WORKER_ONLY) of processing
- * incoming packets is used. s_ring_first contain the queued packets.
- * s_ring_rx_func contains the upper level (client) routine where the
- * packets are destined and s_ring_rx_arg1/s_ring_rx_arg2 are the
- * cookie meant for the client.
+ * o s_ring_first: pointer to the queued packet chain.
+ *
+ * o s_ring_rx_func: pointer to the client's Rx routine.
+ *
+ * o s_ring_rx_{arg1,arg2}: opaque values specific to the client.
*/
-/* ARGSUSED */
static void
mac_rx_soft_ring_drain(mac_soft_ring_t *ringp)
{
@@ -392,13 +391,12 @@ mac_rx_soft_ring_drain(mac_soft_ring_t *ringp)
(*proc)(arg1, arg2, mp, NULL);
/*
- * If we have a soft ring set which is doing
- * bandwidth control, we need to decrement its
- * srs_size so it can have a accurate idea of
- * what is the real data queued between SRS and
- * its soft rings. We decrement the size for a
- * packet only when it gets processed by both
- * SRS and the soft ring.
+ * If we have an SRS performing bandwidth control, then
+ * we need to decrement the size and count so the SRS
+ * has an accurate measure of the data queued between
+ * the SRS and its soft rings. We decrement the
+ * counters only when the packet is processed by both
+ * the SRS and the soft ring.
*/
mutex_enter(&mac_srs->srs_lock);
MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
@@ -414,12 +412,10 @@ mac_rx_soft_ring_drain(mac_soft_ring_t *ringp)
}
/*
- * mac_soft_ring_worker
- *
* The soft ring worker routine to process any queued packets. In
- * normal case, the worker thread is bound to a CPU. It the soft
- * ring is dealing with TCP packets, then the worker thread will
- * be bound to the same CPU as the TCP squeue.
+ * the normal case, the worker thread is bound to a CPU. If the soft
+ * ring handles TCP packets then the worker thread is bound to the
+ * same CPU as the TCP squeue.
*/
static void
mac_soft_ring_worker(mac_soft_ring_t *ringp)
@@ -604,7 +600,7 @@ mac_soft_ring_dls_bypass(void *arg, mac_direct_rx_t rx_func, void *rx_arg1)
mac_soft_ring_t *softring = arg;
mac_soft_ring_set_t *srs;
- ASSERT(rx_func != NULL);
+ VERIFY3P(rx_func, !=, NULL);
mutex_enter(&softring->s_ring_lock);
softring->s_ring_rx_func = rx_func;
diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c
index 31972f94d8..2244218f20 100644
--- a/usr/src/uts/common/io/mac/mac_stat.c
+++ b/usr/src/uts/common/io/mac/mac_stat.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -390,8 +391,8 @@ i_mac_stat_create(void *handle, const char *modname, const char *statname,
kstat_t *ksp;
kstat_named_t *knp;
- ksp = kstat_create(modname, 0, statname, "net",
- KSTAT_TYPE_NAMED, count, 0);
+ ksp = kstat_create_zone(modname, 0, statname, "net",
+ KSTAT_TYPE_NAMED, count, 0, getzoneid());
if (ksp == NULL)
return (NULL);
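For context, illumos implements kstat_create() as a thin wrapper that passes ALL_ZONES for the zone argument, so the previous call expanded to roughly this sketch:

	/* Equivalent of the old kstat_create() call (sketch). */
	ksp = kstat_create_zone(modname, 0, statname, "net",
	    KSTAT_TYPE_NAMED, count, 0, ALL_ZONES);

Passing getzoneid() instead scopes the kstat to the creating zone.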
@@ -948,9 +949,9 @@ mac_driver_stat_create(mac_impl_t *mip)
major_t major = getmajor(mip->mi_phy_dev);
count = MAC_MOD_NKSTAT + MAC_NKSTAT + mip->mi_type->mt_statcount;
- ksp = kstat_create((const char *)ddi_major_to_name(major),
+ ksp = kstat_create_zone((const char *)ddi_major_to_name(major),
getminor(mip->mi_phy_dev) - 1, MAC_KSTAT_NAME,
- MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0);
+ MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0, getzoneid());
if (ksp == NULL)
return;
@@ -1003,6 +1004,7 @@ void
mac_ring_stat_create(mac_ring_t *ring)
{
mac_impl_t *mip = ring->mr_mip;
+ mac_group_t *grp = (mac_group_t *)ring->mr_gh;
char statname[MAXNAMELEN];
char modname[MAXNAMELEN];
@@ -1014,8 +1016,8 @@ mac_ring_stat_create(mac_ring_t *ring)
switch (ring->mr_type) {
case MAC_RING_TYPE_RX:
- (void) snprintf(statname, sizeof (statname), "mac_rx_ring%d",
- ring->mr_index);
+ (void) snprintf(statname, sizeof (statname),
+ "mac_rx_ring_%d_%d", grp->mrg_index, ring->mr_index);
i_mac_rx_ring_stat_create(ring, modname, statname);
break;
diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c
index e83af37f16..334d1d034b 100644
--- a/usr/src/uts/common/io/mac/mac_util.c
+++ b/usr/src/uts/common/io/mac/mac_util.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -47,6 +48,74 @@
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
+#include <inet/tcp.h>
+#include <inet/udp_impl.h>
+
+/*
+ * The next two functions are used for dropping packets or chains of
+ * packets, respectively. We could use one function for both but
+ * separating the use cases allows us to specify intent and prevent
+ * dropping more data than intended.
+ *
+ * The purpose of these functions is to aid the debugging effort,
+ * especially in production. Rather than use freemsg()/freemsgchain(),
+ * it's preferable to use these functions when dropping a packet in
+ * the MAC layer. These functions should only be used during
+ * unexpected conditions. That is, any time a packet is dropped
+ * outside of the regular, successful datapath. Consolidating all
+ * drops on these functions allows the user to trace one location and
+ * determine why the packet was dropped based on the msg. It also
+ * allows the user to inspect the packet before it is freed. Finally,
+ * it allows the user to avoid tracing freemsg()/freemsgchain() thus
+ * keeping the hot path running as efficiently as possible.
+ *
+ * NOTE: At this time not all MAC drops are aggregated on these
+ * functions; but that is the plan. This comment should be erased once
+ * completed.
+ */
+
+/*PRINTFLIKE2*/
+void
+mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
+{
+ va_list adx;
+ char msg[128];
+ char *msgp = msg;
+
+ ASSERT3P(mp->b_next, ==, NULL);
+
+ va_start(adx, fmt);
+ (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
+ va_end(adx);
+
+ DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
+ freemsg(mp);
+}
+
+/*PRINTFLIKE2*/
+void
+mac_drop_chain(mblk_t *chain, const char *fmt, ...)
+{
+ va_list adx;
+ char msg[128];
+ char *msgp = msg;
+
+ va_start(adx, fmt);
+ (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
+ va_end(adx);
+
+ /*
+ * We could use freemsgchain() for the actual freeing but
+ * since we are already walking the chain to fire the dtrace
+ * probe we might as well free the msg here too.
+ */
+ for (mblk_t *mp = chain, *next; mp != NULL; ) {
+ next = mp->b_next;
+ DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
+ freemsg(mp);
+ mp = next;
+ }
+}
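Because both functions fire the same SDT probe, a single probe point observes every consolidated drop. A usage sketch, assuming DTrace is run from the global zone:

	/*
	 * From user land (sketch):
	 *   dtrace -n 'sdt:::mac-drop { printf("%s", stringof(arg1)); }'
	 * arg0 is the mblk_t pointer, arg1 the reason string.
	 */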
/*
* Copy an mblk, preserving its hardware checksum flags.
@@ -55,15 +124,12 @@ static mblk_t *
mac_copymsg_cksum(mblk_t *mp)
{
mblk_t *mp1;
- uint32_t start, stuff, end, value, flags;
mp1 = copymsg(mp);
if (mp1 == NULL)
return (NULL);
- hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
- (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
- flags, KM_NOSLEEP);
+ mac_hcksum_clone(mp, mp1);
return (mp1);
}
@@ -91,224 +157,1135 @@ mac_copymsgchain_cksum(mblk_t *mp)
}
/*
- * Process the specified mblk chain for proper handling of hardware
- * checksum offload. This routine is invoked for loopback traffic
- * between MAC clients.
- * The function handles a NULL mblk chain passed as argument.
+ * Perform software checksum on a single message, if needed. The
+ * emulation performed is determined by an intersection of the mblk's
+ * flags and the emul flags requested. The emul flags are documented
+ * in mac.h.
*/
-mblk_t *
-mac_fix_cksum(mblk_t *mp_chain)
+static mblk_t *
+mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
{
- mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
+ mblk_t *skipped_hdr = NULL;
uint32_t flags, start, stuff, end, value;
+ uint16_t len;
+ uint32_t offset;
+ uint16_t etype;
+ struct ether_header *ehp;
+ ipha_t *ipha;
+ uint8_t proto;
+ const char *err = "";
- for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
- uint16_t len;
- uint32_t offset;
- struct ether_header *ehp;
- uint16_t sap;
+ /*
+ * This function should only be called from mac_hw_emul()
+ * which handles mblk chains and the shared ref case.
+ */
+ ASSERT3P(mp->b_next, ==, NULL);
- hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
- &flags);
- if (flags == 0)
- continue;
+ mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);
+
+ /*
+ * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) because
+ * we don't want to mask-out the HW_LOCAL_MAC flag.
+ */
+ flags = DB_CKSUMFLAGS(mp);
+
+ /* Why call this if checksum emulation isn't needed? */
+ ASSERT3U(flags & (HCK_FLAGS), !=, 0);
+
+ /*
+ * Ethernet, and optionally VLAN header. mac_hw_emul() has
+ * already verified we have enough data to read the L2 header.
+ */
+ ehp = (struct ether_header *)mp->b_rptr;
+ if (ntohs(ehp->ether_type) == VLAN_TPID) {
+ struct ether_vlan_header *evhp;
+
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ etype = ntohs(evhp->ether_type);
+ offset = sizeof (struct ether_vlan_header);
+ } else {
+ etype = ntohs(ehp->ether_type);
+ offset = sizeof (struct ether_header);
+ }
+
+ /*
+ * If this packet isn't IPv4, then leave it alone. We still
+ * need to add IPv6 support and we don't want to affect non-IP
+ * traffic like ARP.
+ */
+ if (etype != ETHERTYPE_IP)
+ return (mp);
+
+ ASSERT3U(MBLKL(mp), >=, offset);
+
+ /*
+ * If the first mblk of this packet contains only the ethernet
+ * header, skip past it for now. Packets with their data
+ * contained in only a single mblk can then use the fastpaths
+ * tuned to that possibility.
+ */
+ if (MBLKL(mp) == offset) {
+ offset -= MBLKL(mp);
+ /* This is guaranteed by mac_hw_emul(). */
+ ASSERT3P(mp->b_cont, !=, NULL);
+ skipped_hdr = mp;
+ mp = mp->b_cont;
+ }
+
+ /*
+ * Both full and partial checksum rely on finding the IP
+ * header in the current mblk. Our native TCP stack honors
+ * this assumption but it's prudent to guard our future
+ * clients that might not honor this contract.
+ */
+ ASSERT3U(MBLKL(mp), >=, offset + sizeof (ipha_t));
+ if (MBLKL(mp) < (offset + sizeof (ipha_t))) {
+ err = "mblk doesn't contain IP header";
+ goto bail;
+ }
+
+ /*
+ * We are about to modify the header mblk; make sure we are
+ * modifying our own copy. The code that follows assumes that
+ * the IP/ULP headers exist in this mblk (and drops the
+ * message if they don't).
+ */
+ if (DB_REF(mp) > 1) {
+ mblk_t *tmp = copyb(mp);
+
+ if (tmp == NULL) {
+ err = "copyb failed";
+ goto bail;
+ }
+
+ if (skipped_hdr != NULL) {
+ ASSERT3P(skipped_hdr->b_cont, ==, mp);
+ skipped_hdr->b_cont = tmp;
+ }
+
+ tmp->b_cont = mp->b_cont;
+ freeb(mp);
+ mp = tmp;
+ }
+
+ ipha = (ipha_t *)(mp->b_rptr + offset);
+
+ /*
+ * This code assumes a "simple" IP header (20 bytes, no
+ * options). IPv4 options are mostly a historic artifact. The
+ * one slight exception is Router Alert, but we don't expect
+ * such a packet to land here.
+ */
+ proto = ipha->ipha_protocol;
+ ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
+ if (ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION) {
+ err = "not simple IP header";
+ goto bail;
+ }
+
+ switch (proto) {
+ case IPPROTO_TCP:
+ ASSERT3U(MBLKL(mp), >=,
+ (offset + sizeof (ipha_t) + sizeof (tcph_t)));
+ if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (tcph_t))) {
+ err = "mblk doesn't contain TCP header";
+ goto bail;
+ }
+ break;
+
+ case IPPROTO_UDP:
+ ASSERT3U(MBLKL(mp), >=,
+ (offset + sizeof (ipha_t) + sizeof (udpha_t)));
+ if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (udpha_t))) {
+ err = "mblk doesn't contain UDP header";
+ goto bail;
+ }
+ break;
+
+ default:
+ err = "unexpected protocol";
+ goto bail;
+ }
+
+ if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
+ if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
+ ipaddr_t src, dst;
+ uint32_t cksum;
+ uint16_t *up;
+
+ /* Get a pointer to the ULP checksum. */
+ switch (proto) {
+ case IPPROTO_TCP:
+ /* LINTED: improper alignment cast */
+ up = IPH_TCPH_CHECKSUMP(ipha,
+ IP_SIMPLE_HDR_LENGTH);
+ break;
+
+ case IPPROTO_UDP:
+ /* LINTED: improper alignment cast */
+ up = IPH_UDPH_CHECKSUMP(ipha,
+ IP_SIMPLE_HDR_LENGTH);
+ break;
+ }
+
+ /* Pseudo-header checksum. */
+ src = ipha->ipha_src;
+ dst = ipha->ipha_dst;
+ len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
+
+ cksum = (dst >> 16) + (dst & 0xFFFF) +
+ (src >> 16) + (src & 0xFFFF);
+ cksum += htons(len);
+
+ /*
+ * The checksum value stored in the packet
+ * needs to be correct. Compute it here.
+ */
+ *up = 0;
+ cksum += (((proto) == IPPROTO_UDP) ?
+ IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
+ cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
+ offset, cksum);
+ *(up) = (uint16_t)(cksum ? cksum : ~cksum);
+
+ }
+
+ /* We always update the ULP checksum flags. */
+ if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
+ flags &= ~HCK_FULLCKSUM;
+ flags |= HCK_FULLCKSUM_OK;
+ value = 0;
+ }
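The pseudo-header arithmetic above can be isolated into a helper for clarity; a hypothetical sketch (IP_CSUM() performs the equivalent carry folding internally, and the IPPROTO_* compensation constant is added separately, as shown above):

	static uint16_t
	pseudo_hdr_sum(ipaddr_t src, ipaddr_t dst, uint16_t ulp_len_net)
	{
		uint32_t sum;

		/* Sum the 16-bit halves of both addresses plus the length. */
		sum = (src >> 16) + (src & 0xFFFF) +
		    (dst >> 16) + (dst & 0xFFFF) + ulp_len_net;

		/* Fold any carries back into the low 16 bits. */
		while ((sum >> 16) != 0)
			sum = (sum >> 16) + (sum & 0xFFFF);

		return ((uint16_t)sum);
	}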
/*
- * Since the processing of checksum offload for loopback
- * traffic requires modification of the packet contents,
- * ensure sure that we are always modifying our own copy.
+ * Out of paranoia, and for the sake of correctness,
+	 * we won't calculate the IP header checksum if it's
+ * already populated. While unlikely, it's possible to
+ * write code that might end up calling mac_sw_cksum()
+ * twice on the same mblk (performing both LSO and
+	 * checksum emulation in a single mblk chain loop --
+ * the LSO emulation inserts a new chain into the
+ * existing chain and then the loop iterates back over
+ * the new segments and emulates the checksum a second
+ * time). Normally this wouldn't be a problem, because
+ * the HCK_*_OK flags are supposed to indicate that we
+	 * don't need to perform the work. But
+ * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
+ * same value; so we cannot use these flags to
+ * determine if the IP header checksum has already
+ * been calculated or not. Luckily, if IP requests
+ * HCK_IPV4_HDRCKSUM, then the IP header checksum will
+ * be zero. So this test works just as well as
+ * checking the flag. However, in the future, we
+ * should fix the HCK_* flags.
*/
- if (DB_REF(mp) > 1) {
- mp1 = copymsg(mp);
- if (mp1 == NULL)
- continue;
- mp1->b_next = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- if (prev != NULL)
- prev->b_next = mp1;
- else
- new_chain = mp1;
- mp = mp1;
+ if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS) &&
+ ipha->ipha_hdr_checksum == 0) {
+ ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
+ flags &= ~HCK_IPV4_HDRCKSUM;
+ flags |= HCK_IPV4_HDRCKSUM_OK;
}
+ }
+
+ if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
+ uint16_t *up, partial, cksum;
+ uchar_t *ipp; /* ptr to beginning of IP header */
+
+ ipp = mp->b_rptr + offset;
+ /* LINTED: cast may result in improper alignment */
+ up = (uint16_t *)((uchar_t *)ipp + stuff);
+ partial = *up;
+ *up = 0;
+
+ ASSERT3S(end, >, start);
+ cksum = ~IP_CSUM_PARTIAL(mp, offset + start, partial);
+ *up = cksum != 0 ? cksum : ~cksum;
+ }
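To make the offsets concrete: for a plain TCP/IPv4 packet, start is 20 (the ULP begins just past the simple IP header) and stuff is 36 (20 bytes of IP header plus the TCP checksum field's offset of 16), both relative to the start of the IP header, matching how ipp is used above. These numbers are illustrative, derived from the code rather than stated by it.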
+
+ /* We always update the ULP checksum flags. */
+ if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
+ flags &= ~HCK_PARTIALCKSUM;
+ flags |= HCK_FULLCKSUM_OK;
+ value = 0;
+ }
+
+ mac_hcksum_set(mp, start, stuff, end, value, flags);
+
+ /* Don't forget to reattach the header. */
+ if (skipped_hdr != NULL) {
+ ASSERT3P(skipped_hdr->b_cont, ==, mp);
/*
- * Ethernet, and optionally VLAN header.
+ * Duplicate the HCKSUM data into the header mblk.
+ * This mimics mac_add_vlan_tag which ensures that
+		 * both the first mblk _and_ the first data-bearing
+ * mblk possess the HCKSUM information. Consumers like
+ * IP will end up discarding the ether_header mblk, so
+ * for now, it is important that the data be available
+ * in both places.
*/
- /* LINTED: improper alignment cast */
- ehp = (struct ether_header *)mp->b_rptr;
- if (ntohs(ehp->ether_type) == VLAN_TPID) {
- struct ether_vlan_header *evhp;
+ mac_hcksum_clone(mp, skipped_hdr);
+ mp = skipped_hdr;
+ }
- ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
- /* LINTED: improper alignment cast */
- evhp = (struct ether_vlan_header *)mp->b_rptr;
- sap = ntohs(evhp->ether_type);
- offset = sizeof (struct ether_vlan_header);
+ return (mp);
+
+bail:
+ if (skipped_hdr != NULL) {
+ ASSERT3P(skipped_hdr->b_cont, ==, mp);
+ mp = skipped_hdr;
+ }
+
+ mac_drop_pkt(mp, err);
+ return (NULL);
+}
+
+/*
+ * Build a single data segment from an LSO packet. The mblk chain
+ * returned, seg_head, represents the data segment and is always
+ * exactly seg_len bytes long. The lso_mp and offset input/output
+ * parameters track our position in the LSO packet. This function
+ * exists solely as a helper to mac_sw_lso().
+ *
+ * Case A
+ *
+ * The current lso_mp is larger than the requested seg_len. The
+ * beginning of seg_head may start at the beginning of lso_mp or
+ * offset into it. In either case, a single mblk is returned, and
+ * *offset is updated to reflect our new position in the current
+ * lso_mp.
+ *
+ * +----------------------------+
+ * | in *lso_mp / out *lso_mp |
+ * +----------------------------+
+ * ^ ^
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^ ^
+ * | |
+ * in *offset = 0 out *offset = seg_len
+ *
+ * |------ seg_len ----|
+ *
+ *
+ * +------------------------------+
+ * | in *lso_mp / out *lso_mp |
+ * +------------------------------+
+ * ^ ^
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^ ^
+ * | |
+ * in *offset = N out *offset = N + seg_len
+ *
+ * |------ seg_len ----|
+ *
+ *
+ *
+ * Case B
+ *
+ * The requested seg_len consumes exactly the rest of the lso_mp.
+ * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
+ * The seg_head may start at the beginning of the lso_mp or at some
+ * offset into it. In either case we return a single mblk, reset
+ * *offset to zero, and walk to the next lso_mp.
+ *
+ * +------------------------+ +------------------------+
+ * | in *lso_mp |---------->| out *lso_mp |
+ * +------------------------+ +------------------------+
+ * ^ ^ ^
+ * | | |
+ * | | out *offset = 0
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^
+ * |
+ * in *offset = 0
+ *
+ * |------ seg_len ----|
+ *
+ *
+ *
+ * +----------------------------+ +------------------------+
+ * | in *lso_mp |---------->| out *lso_mp |
+ * +----------------------------+ +------------------------+
+ * ^ ^ ^
+ * | | |
+ * | | out *offset = 0
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^
+ * |
+ * in *offset = N
+ *
+ * |------ seg_len ----|
+ *
+ *
+ * Case C
+ *
+ * The requested seg_len is greater than the current lso_mp. In
+ * this case we must consume LSO mblks until we have enough data to
+ * satisfy either case (A) or (B) above. We will return multiple
+ * mblks linked via b_cont, offset will be set based on the cases
+ * above, and lso_mp will walk forward at least one mblk, but maybe
+ * more.
+ *
+ * N.B. This diagram is not exhaustive. The seg_head may start on
+ * the beginning of an lso_mp. The seg_tail may end exactly on the
+ * boundary of an lso_mp. And there may be two (in this case the
+ * middle block wouldn't exist), three, or more mblks in the
+ * seg_head chain. This is meant as one example of what might
+ * happen. The main thing to remember is that the seg_tail mblk
+ * must fall under case (A) or (B) above.
+ *
+ * +------------------+ +----------------+ +------------------+
+ * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp |
+ * +------------------+ +----------------+ +------------------+
+ * ^ ^ ^ ^ ^ ^
+ * | | | | | |
+ * | | | | | |
+ * | | | | | |
+ * | | | | | |
+ * +------------+ +----------------+ +------------+
+ * | seg_head |--->| |--->| seg_tail |
+ * +------------+ +----------------+ +------------+
+ * ^ ^
+ * | |
+ * in *offset = N out *offset = MBLKL(seg_tail)
+ *
+ * |------------------- seg_len -------------------|
+ *
+ */
+static mblk_t *
+build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
+{
+ mblk_t *seg_head, *seg_tail, *seg_mp;
+
+ ASSERT3P(*lso_mp, !=, NULL);
+ ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);
+
+ seg_mp = dupb(*lso_mp);
+ if (seg_mp == NULL)
+ return (NULL);
+
+ seg_head = seg_mp;
+ seg_tail = seg_mp;
+
+ /* Continue where we left off from in the lso_mp. */
+ seg_mp->b_rptr += *offset;
+
+last_mblk:
+ /* Case (A) */
+ if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
+ *offset += seg_len;
+ seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
+ return (seg_head);
+ }
+
+ /* Case (B) */
+ if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
+ *offset = 0;
+ *lso_mp = (*lso_mp)->b_cont;
+ return (seg_head);
+ }
+
+ /* Case (C) */
+ ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);
+
+ /*
+ * The current LSO mblk doesn't have enough data to satisfy
+ * seg_len -- continue peeling off LSO mblks to build the new
+ * segment message. If allocation fails we free the previously
+ * allocated segment mblks and return NULL.
+ */
+ while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
+ ASSERT3U(MBLKL(seg_mp), <=, seg_len);
+ seg_len -= MBLKL(seg_mp);
+ *offset = 0;
+ *lso_mp = (*lso_mp)->b_cont;
+ seg_mp = dupb(*lso_mp);
+
+ if (seg_mp == NULL) {
+ freemsgchain(seg_head);
+ return (NULL);
+ }
+
+ seg_tail->b_cont = seg_mp;
+ seg_tail = seg_mp;
+ }
+
+ /*
+ * We've walked enough LSO mblks that we can now satisfy the
+ * remaining seg_len. At this point we need to jump back to
+ * determine if we have arrived at case (A) or (B).
+ */
+
+ /* Just to be paranoid that we didn't underflow. */
+ ASSERT3U(seg_len, <, IP_MAXPACKET);
+ ASSERT3U(seg_len, >, 0);
+ goto last_mblk;
+}
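A hypothetical caller loop for this helper (remaining, mss, lso_data, and attach_header() are illustrative names; mac_sw_lso() below is the real consumer):

	uint32_t off = 0;
	mblk_t *cur = lso_data;		/* first data mblk of the LSO msg */

	while (remaining > 0) {
		uint32_t want = MIN(remaining, mss);
		mblk_t *seg = build_data_seg(&cur, &off, want);

		if (seg == NULL)
			break;		/* dupb() failed; drop the packet */

		attach_header(seg);	/* hypothetical header fix-up */
		remaining -= want;
	}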
+
+/*
+ * Perform software segmentation of a single LSO message. Take an LSO
+ * message as input and return head/tail pointers as output. This
+ * function should not be invoked directly but instead through
+ * mac_hw_emul().
+ *
+ * The resulting chain is comprised of multiple (nsegs) MSS sized
+ * segments. Each segment will consist of two or more mblks joined by
+ * b_cont: a header and one or more data mblks. The header mblk is
+ * allocated anew for each message. The first segment's header is used
+ * as a template for the rest with adjustments made for things such as
+ * ID, sequence, length, TCP flags, etc. The data mblks reference into
+ * the existing LSO mblk (passed in as omp) by way of dupb(). Their
+ * b_rptr/b_wptr values are adjusted to reference only the fraction of
+ * the LSO message they are responsible for. At the successful
+ * completion of this function the original mblk (omp) is freed,
+ * leaving the newly created segment chain as the only remaining
+ * reference to the data.
+ */
+static void
+mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
+ uint_t *count)
+{
+ uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
+ uint32_t mss;
+ uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
+ uint32_t oleft;
+ uint_t nsegs, seg;
+ int len;
+
+ struct ether_vlan_header *oevh;
+ const ipha_t *oiph;
+ const tcph_t *otcph;
+ ipha_t *niph;
+ tcph_t *ntcph;
+ uint16_t ip_id;
+ uint32_t tcp_seq, tcp_sum, otcp_sum;
+
+ uint32_t offset;
+ mblk_t *odatamp;
+ mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
+ mblk_t *tmptail;
+
+ ASSERT3P(head, !=, NULL);
+ ASSERT3P(tail, !=, NULL);
+ ASSERT3P(count, !=, NULL);
+ ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
+
+ /* Assume we are dealing with a single LSO message. */
+ ASSERT3P(omp->b_next, ==, NULL);
+
+ /*
+ * XXX: This is a hack to deal with mac_add_vlan_tag().
+ *
+ * When VLANs are in play, mac_add_vlan_tag() creates a new
+ * mblk with just the ether_vlan_header and tacks it onto the
+ * front of 'omp'. This breaks the assumptions made below;
+ * namely that the TCP/IP headers are in the first mblk. In
+ * this case, since we already have to pay the cost of LSO
+ * emulation, we simply pull up everything. While this might
+ * seem irksome, keep in mind this will only apply in a couple
+ * of scenarios: a) an LSO-capable VLAN client sending to a
+ * non-LSO-capable client over the "MAC/bridge loopback"
+ * datapath or b) an LSO-capable VLAN client is sending to a
+ * client that, for whatever reason, doesn't have DLS-bypass
+	 * enabled. Finally, we have to check for both tagged and
+	 * untagged mblk sizes, depending on whether the mblk came via
+ * mac_promisc_dispatch() or mac_rx_deliver().
+ *
+ * In the future, two things should be done:
+ *
+ * 1. This function should make use of some yet to be
+ * implemented "mblk helpers". These helper functions would
+ * perform all the b_cont walking for us and guarantee safe
+ * access to the mblk data.
+ *
+ * 2. We should add some slop to the mblks so that
+ * mac_add_vlan_tag() can just edit the first mblk instead
+ * of allocating on the hot path.
+ */
+ if (MBLKL(omp) == sizeof (struct ether_vlan_header) ||
+ MBLKL(omp) == sizeof (struct ether_header)) {
+ mblk_t *tmp = msgpullup(omp, -1);
+
+ if (tmp == NULL) {
+ mac_drop_pkt(omp, "failed to pull up");
+ goto fail;
+ }
+
+ mac_hcksum_clone(omp, tmp);
+ freemsg(omp);
+ omp = tmp;
+ }
+
+ mss = DB_LSOMSS(omp);
+ ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
+ sizeof (struct ether_vlan_header));
+ opktlen = msgsize(omp);
+
+ /*
+ * First, get references to the IP and TCP headers and
+ * determine the total TCP length (header + data).
+ *
+ * Thanks to mac_hw_emul() we know that the first mblk must
+ * contain (at minimum) the full L2 header. However, this
+ * function assumes more than that. It assumes the L2/L3/L4
+ * headers are all contained in the first mblk of a message
+ * (i.e., no b_cont walking for headers). While this is a
+ * current reality (our native TCP stack and viona both
+ * enforce this) things may become more nuanced in the future
+ * (e.g. when introducing encap support or adding new
+ * clients). For now we guard against this case by dropping
+ * the packet.
+ */
+ oevh = (struct ether_vlan_header *)omp->b_rptr;
+ if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
+ oehlen = sizeof (struct ether_vlan_header);
+ else
+ oehlen = sizeof (struct ether_header);
+
+ ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
+ if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
+ mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
+ goto fail;
+ }
+
+ oiph = (ipha_t *)(omp->b_rptr + oehlen);
+ oiphlen = IPH_HDR_LENGTH(oiph);
+ otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
+ otcphlen = TCP_HDR_LENGTH(otcph);
+
+ /*
+ * Currently we only support LSO for TCP/IPv4.
+ */
+ if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
+		mac_drop_pkt(omp, "LSO unsupported IP version: %hhu",
+ IPH_HDR_VERSION(oiph));
+ goto fail;
+ }
+
+ if (oiph->ipha_protocol != IPPROTO_TCP) {
+		mac_drop_pkt(omp, "LSO unsupported protocol: %hhu",
+ oiph->ipha_protocol);
+ goto fail;
+ }
+
+ if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
+ mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
+ goto fail;
+ }
+
+ ohdrslen = oehlen + oiphlen + otcphlen;
+ if ((len = MBLKL(omp)) < ohdrslen) {
+ mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
+ ohdrslen);
+ goto fail;
+ }
+
+ /*
+ * Either we have data in the first mblk or it's just the
+ * header. In either case, we need to set rptr to the start of
+ * the TCP data.
+ */
+ if (len > ohdrslen) {
+ odatamp = omp;
+ offset = ohdrslen;
+ } else {
+ ASSERT3U(len, ==, ohdrslen);
+ odatamp = omp->b_cont;
+ offset = 0;
+ }
+
+ /* Make sure we still have enough data. */
+ ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);
+
+ /*
+	 * If a MAC negotiated LSO then it must negotiate both
+ * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
+ * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
+ * change during LSO segmentation (only the 3 fields of the
+ * pseudo header checksum don't change: src, dst, proto). Thus
+ * we would expect these flags (HCK_IPV4_HDRCKSUM |
+ * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
+ * function to emulate those checksums in software. However,
+ * that assumes a world where we only expose LSO if the
+ * underlying hardware exposes LSO. Moving forward the plan is
+ * to assume LSO in the upper layers and have MAC perform
+ * software LSO when the underlying provider doesn't support
+ * it. In such a world, if the provider doesn't support LSO
+ * but does support hardware checksum offload, then we could
+ * simply perform the segmentation and allow the hardware to
+ * calculate the checksums. To the hardware it's just another
+ * chain of non-LSO packets.
+ */
+ ASSERT3S(DB_TYPE(omp), ==, M_DATA);
+ ocsum_flags = DB_CKSUMFLAGS(omp);
+ ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
+ ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
+
+ /*
+ * If hardware only provides partial checksum then software
+ * must supply the pseudo-header checksum. In the case of LSO
+ * we leave the TCP length at zero to be filled in by
+ * hardware. This function must handle two scenarios.
+ *
+ * 1. Being called by a MAC client on the Rx path to segment
+ * an LSO packet and calculate the checksum.
+ *
+ * 2. Being called by a MAC provider to segment an LSO packet.
+ * In this case the LSO segmentation is performed in
+ * software (by this routine) but the MAC provider should
+ * still calculate the TCP/IP checksums in hardware.
+ *
+ * To elaborate on the second case: we cannot have the
+ * scenario where IP sends LSO packets but the underlying HW
+ * doesn't support checksum offload -- because in that case
+ * TCP/IP would calculate the checksum in software (for the
+ * LSO packet) but then MAC would segment the packet and have
+ * to redo all the checksum work. So IP should never do LSO
+ * if HW doesn't support both IP and TCP checksum.
+ */
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
+ ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
+ }
+
+ odatalen = opktlen - ohdrslen;
+
+ /*
+ * Subtract one to account for the case where the data length
+	 * is evenly divisible by the MSS. Add one to account for the
+ * fact that the division will always result in one less
+ * segment than needed.
+ */
+ nsegs = ((odatalen - 1) / mss) + 1;
+ if (nsegs < 2) {
+ mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
+ goto fail;
+ }
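A worked example of the count: with mss = 1460, odatalen = 2920 gives ((2920 - 1) / 1460) + 1 = 2 segments, while odatalen = 2921 gives 3; without the minus one, the evenly divisible case would round up to one segment too many.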
+
+ DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
+ __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
+ nsegs);
+
+ seg_chain = NULL;
+ tmptail = seg_chain;
+ oleft = odatalen;
+
+ for (uint_t i = 0; i < nsegs; i++) {
+ boolean_t last_seg = ((i + 1) == nsegs);
+ uint32_t seg_len;
+
+ /*
+ * If we fail to allocate, then drop the partially
+ * allocated chain as well as the LSO packet. Let the
+ * sender deal with the fallout.
+ */
+ if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
+ freemsgchain(seg_chain);
+ mac_drop_pkt(omp, "failed to alloc segment header");
+ goto fail;
+ }
+ ASSERT3P(nhdrmp->b_cont, ==, NULL);
+
+ if (seg_chain == NULL) {
+ seg_chain = nhdrmp;
} else {
- sap = ntohs(ehp->ether_type);
- offset = sizeof (struct ether_header);
+ ASSERT3P(tmptail, !=, NULL);
+ tmptail->b_next = nhdrmp;
}
- if (MBLKL(mp) <= offset) {
- offset -= MBLKL(mp);
- if (mp->b_cont == NULL) {
- /* corrupted packet, skip it */
- if (prev != NULL)
- prev->b_next = mp->b_next;
- else
- new_chain = mp->b_next;
- mp1 = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- mp = mp1;
- continue;
- }
- mp = mp->b_cont;
+ tmptail = nhdrmp;
+
+ /*
+		 * Calculate this segment's length. It's either the MSS
+ * or whatever remains for the last segment.
+ */
+ seg_len = last_seg ? oleft : mss;
+ ASSERT3U(seg_len, <=, mss);
+ ndatamp = build_data_seg(&odatamp, &offset, seg_len);
+
+ if (ndatamp == NULL) {
+ freemsgchain(seg_chain);
+ mac_drop_pkt(omp, "LSO failed to segment data");
+ goto fail;
}
- if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
- ipha_t *ipha = NULL;
+ /* Attach data mblk to header mblk. */
+ nhdrmp->b_cont = ndatamp;
+ DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
+ ASSERT3U(seg_len, <=, oleft);
+ oleft -= seg_len;
+ }
- /*
- * In order to compute the full and header
- * checksums, we need to find and parse
- * the IP and/or ULP headers.
- */
+ /* We should have consumed entire LSO msg. */
+ ASSERT3S(oleft, ==, 0);
+ ASSERT3P(odatamp, ==, NULL);
+
+ /*
+ * All seg data mblks are referenced by the header mblks, null
+ * out this pointer to catch any bad derefs.
+ */
+ ndatamp = NULL;
+
+ /*
+ * Set headers and checksum for first segment.
+ */
+ nhdrmp = seg_chain;
+ bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
+ niph->ipha_length = htons(oiphlen + otcphlen + mss);
+ niph->ipha_hdr_checksum = 0;
+ ip_id = ntohs(niph->ipha_ident);
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ tcp_seq = BE32_TO_U32(ntcph->th_seq);
+ tcp_seq += mss;
+
+ /*
+ * The first segment shouldn't:
+ *
+ * o indicate end of data transmission (FIN),
+ * o indicate immediate handling of the data (PUSH).
+ */
+ ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
+
+ /*
+ * If the underlying HW provides partial checksum, then make
+ * sure to correct the pseudo header checksum before calling
+ * mac_sw_cksum(). The native TCP stack doesn't include the
+ * length field in the pseudo header when LSO is in play -- so
+ * we need to calculate it here.
+ */
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ tcp_sum = BE16_TO_U16(ntcph->th_sum);
+ otcp_sum = tcp_sum;
+ tcp_sum += mss + otcphlen;
+ tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ }
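A worked example of the fold above: if the stored sum is 0xFFF0 and mss + otcphlen is 0x05C8, the 32-bit sum is 0x000105B8, which folds to 0x05B8 + 0x1 = 0x05B9.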
+
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ next_nhdrmp = nhdrmp->b_next;
+ nhdrmp->b_next = NULL;
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ nhdrmp->b_next = next_nhdrmp;
+ next_nhdrmp = NULL;
- sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
+ /*
+ * We may have freed the nhdrmp argument during
+ * checksum emulation, make sure that seg_chain
+ * references a valid mblk.
+ */
+ seg_chain = nhdrmp;
+ }
+ ASSERT3P(nhdrmp, !=, NULL);
+
+ seg = 1;
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
+ uint_t, seg);
+ seg++;
+
+ /* There better be at least 2 segs. */
+ ASSERT3P(nhdrmp->b_next, !=, NULL);
+ prev_nhdrmp = nhdrmp;
+ nhdrmp = nhdrmp->b_next;
+
+ /*
+ * Now adjust the headers of the middle segments. For each
+ * header we need to adjust the following.
+ *
+ * o IP ID
+ * o IP length
+ * o TCP sequence
+ * o TCP flags
+ * o cksum flags
+ * o cksum values (if MAC_HWCKSUM_EMUL is set)
+ */
+ for (; seg < nsegs; seg++) {
+ /*
+ * We use seg_chain as a reference to the first seg
+ * header mblk -- this first header is a template for
+ * the rest of the segments. This copy will include
+ * the now updated checksum values from the first
+ * header. We must reset these checksum values to
+ * their original to make sure we produce the correct
+ * value.
+ */
+ bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ niph->ipha_ident = htons(++ip_id);
+		ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
+ niph->ipha_length = htons(oiphlen + otcphlen + mss);
+ niph->ipha_hdr_checksum = 0;
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ U32_TO_BE32(tcp_seq, ntcph->th_seq);
+ tcp_seq += mss;
+ /*
+ * Just like the first segment, the middle segments
+ * shouldn't have these flags set.
+ */
+ ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
+
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
/*
- * IP header.
+ * First and middle segs have same
+ * pseudo-header checksum.
*/
- if (sap != ETHERTYPE_IP)
- continue;
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ }
- ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
- /* LINTED: improper alignment cast */
- ipha = (ipha_t *)(mp->b_rptr + offset);
-
- if (flags & HCK_FULLCKSUM) {
- ipaddr_t src, dst;
- uint32_t cksum;
- uint16_t *up;
- uint8_t proto;
-
- /*
- * Pointer to checksum field in ULP header.
- */
- proto = ipha->ipha_protocol;
- ASSERT(ipha->ipha_version_and_hdr_length ==
- IP_SIMPLE_HDR_VERSION);
-
- switch (proto) {
- case IPPROTO_TCP:
- /* LINTED: improper alignment cast */
- up = IPH_TCPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- break;
-
- case IPPROTO_UDP:
- /* LINTED: improper alignment cast */
- up = IPH_UDPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- break;
-
- default:
- cmn_err(CE_WARN, "mac_fix_cksum: "
- "unexpected protocol: %d", proto);
- continue;
- }
-
- /*
- * Pseudo-header checksum.
- */
- src = ipha->ipha_src;
- dst = ipha->ipha_dst;
- len = ntohs(ipha->ipha_length) -
- IP_SIMPLE_HDR_LENGTH;
-
- cksum = (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
- cksum += htons(len);
-
- /*
- * The checksum value stored in the packet needs
- * to be correct. Compute it here.
- */
- *up = 0;
- cksum += (((proto) == IPPROTO_UDP) ?
- IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
- cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
- offset, cksum);
- *(up) = (uint16_t)(cksum ? cksum : ~cksum);
-
- /*
- * Flag the packet so that it appears
- * that the checksum has already been
- * verified by the hardware.
- */
- flags &= ~HCK_FULLCKSUM;
- flags |= HCK_FULLCKSUM_OK;
- value = 0;
- }
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ next_nhdrmp = nhdrmp->b_next;
+ nhdrmp->b_next = NULL;
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ nhdrmp->b_next = next_nhdrmp;
+ next_nhdrmp = NULL;
+ /* We may have freed the original nhdrmp. */
+ prev_nhdrmp->b_next = nhdrmp;
+ }
- if (flags & HCK_IPV4_HDRCKSUM) {
- ASSERT(ipha != NULL);
- ipha->ipha_hdr_checksum =
- (uint16_t)ip_csum_hdr(ipha);
- flags &= ~HCK_IPV4_HDRCKSUM;
- flags |= HCK_IPV4_HDRCKSUM_OK;
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
+ uint_t, mss, uint_t, seg);
- }
+ ASSERT3P(nhdrmp->b_next, !=, NULL);
+ prev_nhdrmp = nhdrmp;
+ nhdrmp = nhdrmp->b_next;
+ }
+
+ /* Make sure we are on the last segment. */
+ ASSERT3U(seg, ==, nsegs);
+ ASSERT3P(nhdrmp->b_next, ==, NULL);
+
+ /*
+ * Now we set the last segment header. The difference being
+ * that FIN/PSH/RST flags are allowed.
+ */
+ bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ niph->ipha_ident = htons(++ip_id);
+ len = msgsize(nhdrmp->b_cont);
+ ASSERT3S(len, >, 0);
+ niph->ipha_length = htons(oiphlen + otcphlen + len);
+ niph->ipha_hdr_checksum = 0;
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ U32_TO_BE32(tcp_seq, ntcph->th_seq);
+
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ tcp_sum = otcp_sum;
+ tcp_sum += len + otcphlen;
+ tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ }
+
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ /* This should be the last mblk. */
+ ASSERT3P(nhdrmp->b_next, ==, NULL);
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ prev_nhdrmp->b_next = nhdrmp;
+ }
+
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
+ uint_t, seg);
+
+ /*
+ * Free the reference to the original LSO message as it is
+	 * being replaced by seg_chain.
+ */
+ freemsg(omp);
+ *head = seg_chain;
+ *tail = nhdrmp;
+ *count = nsegs;
+ return;
+
+fail:
+ *head = NULL;
+ *tail = NULL;
+ *count = 0;
+}
+
+#define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
+
+/*
+ * Emulate various hardware offload features in software. Take a chain
+ * of packets as input and emulate the hardware features specified in
+ * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
+ * pointer given as input, and its tail pointer is written to
+ * '*otail'. The number of packets in the new chain is written to
+ * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
+ * may be NULL. The 'mp_chain' argument may point to a NULL chain, in
+ * which case 'mp_chain' will simply stay a NULL chain.
+ *
+ * While unlikely, it is technically possible that this function could
+ * receive a non-NULL chain as input and return a NULL chain as output
+ * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
+ * zero). This could happen if all the packets in the chain are
+ * dropped or if we fail to allocate new mblks. In this case, there is
+ * nothing for the caller to free. In any event, the caller shouldn't
+ * assume that '*mp_chain' is non-NULL on return.
+ *
+ * This function was written with two main use cases in mind.
+ *
+ * 1. A way for MAC clients to emulate hardware offloads when they
+ * can't directly handle LSO packets or packets without fully
+ * calculated checksums.
+ *
+ * 2. A way for MAC providers (drivers) to offer LSO even when the
+ * underlying HW can't or won't supply LSO offload.
+ *
+ * At the time of this writing no provider is making use of this
+ * function. However, the plan for the future is to always assume LSO
+ * is available and then add SW LSO emulation to all providers that
+ * don't support it in HW.
+ */
+void
+mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
+{
+ mblk_t *head = NULL, *tail = NULL;
+ uint_t count = 0;
+
+ ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
+ ASSERT3P(mp_chain, !=, NULL);
+
+ for (mblk_t *mp = *mp_chain; mp != NULL; ) {
+ mblk_t *tmp, *next, *tmphead, *tmptail;
+ struct ether_header *ehp;
+ uint32_t flags;
+ uint_t len = MBLKL(mp), l2len;
+
+ /* Perform LSO/cksum one message at a time. */
+ next = mp->b_next;
+ mp->b_next = NULL;
+
+ /*
+ * For our sanity the first mblk should contain at
+ * least the full L2 header.
+ */
+ if (len < sizeof (struct ether_header)) {
+ mac_drop_pkt(mp, "packet too short (A): %u", len);
+ mp = next;
+ continue;
}
- if (flags & HCK_PARTIALCKSUM) {
- uint16_t *up, partial, cksum;
- uchar_t *ipp; /* ptr to beginning of IP header */
-
- if (mp->b_cont != NULL) {
- mblk_t *mp1;
-
- mp1 = msgpullup(mp, offset + end);
- if (mp1 == NULL)
- continue;
- mp1->b_next = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- if (prev != NULL)
- prev->b_next = mp1;
- else
- new_chain = mp1;
- mp = mp1;
- }
+ ehp = (struct ether_header *)mp->b_rptr;
+ if (ntohs(ehp->ether_type) == VLAN_TPID)
+ l2len = sizeof (struct ether_vlan_header);
+ else
+ l2len = sizeof (struct ether_header);
- ipp = mp->b_rptr + offset;
- /* LINTED: cast may result in improper alignment */
- up = (uint16_t *)((uchar_t *)ipp + stuff);
- partial = *up;
- *up = 0;
+ /*
+ * If the first mblk is solely the L2 header, then
+ * there better be more data.
+ */
+ if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
+ mac_drop_pkt(mp, "packet too short (C): %u", len);
+ mp = next;
+ continue;
+ }
- cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
- end - start, partial);
- cksum = ~cksum;
- *up = cksum ? cksum : ~cksum;
+ DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);
+
+ /*
+ * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
+ * because we don't want to mask-out the LSO flag.
+ */
+ flags = DB_CKSUMFLAGS(mp);
+
+ if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
+ uint_t tmpcount = 0;
/*
- * Since we already computed the whole checksum,
- * indicate to the stack that it has already
- * been verified by the hardware.
+ * LSO fix-up handles checksum emulation
+ * inline (if requested). It also frees mp.
*/
- flags &= ~HCK_PARTIALCKSUM;
- flags |= HCK_FULLCKSUM_OK;
- value = 0;
+ mac_sw_lso(mp, emul, &tmphead, &tmptail,
+ &tmpcount);
+ if (tmphead == NULL) {
+ /* mac_sw_lso() freed the mp. */
+ mp = next;
+ continue;
+ }
+ count += tmpcount;
+ } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
+ tmp = mac_sw_cksum(mp, emul);
+ if (tmp == NULL) {
+ /* mac_sw_cksum() freed the mp. */
+ mp = next;
+ continue;
+ }
+ tmphead = tmp;
+ tmptail = tmp;
+ count++;
+ } else {
+ /* There is nothing to emulate. */
+ tmp = mp;
+ tmphead = tmp;
+ tmptail = tmp;
+ count++;
+ }
+
+ /*
+ * The tmp mblk chain is either the start of the new
+ * chain or added to the tail of the new chain.
+ */
+ if (head == NULL) {
+ head = tmphead;
+ tail = tmptail;
+ } else {
+ /* Attach the new mblk to the end of the new chain. */
+ tail->b_next = tmphead;
+ tail = tmptail;
}
- (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
- value, flags, KM_NOSLEEP);
+ mp = next;
}
- return (new_chain);
+ *mp_chain = head;
+
+ if (otail != NULL)
+ *otail = tail;
+
+ if (ocount != NULL)
+ *ocount = count;
}
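A hedged caller sketch (chain, tail, cnt, and deliver() are hypothetical caller state; the emul flags are the real ones asserted above):

	mblk_t *tail;
	uint_t cnt;

	/* Emulate checksum offload and LSO in software, then deliver. */
	mac_hw_emul(&chain, &tail, &cnt, MAC_HWCKSUM_EMULS | MAC_LSO_EMUL);
	if (chain != NULL)
		deliver(chain, tail, cnt);	/* hypothetical consumer */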
/*
@@ -320,7 +1297,6 @@ mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
mblk_t *hmp;
struct ether_vlan_header *evhp;
struct ether_header *ehp;
- uint32_t start, stuff, end, value, flags;
ASSERT(pri != 0 || vid != 0);
@@ -350,9 +1326,7 @@ mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
* Free the original message if it's now empty. Link the
* rest of messages to the header message.
*/
- hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
- (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
- KM_NOSLEEP);
+ mac_hcksum_clone(mp, hmp);
if (MBLKL(mp) == 0) {
hmp->b_cont = mp->b_cont;
freeb(mp);
@@ -456,16 +1430,9 @@ mac_strip_vlan_tag_chain(mblk_t *mp_chain)
*/
/* ARGSUSED */
void
-mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
+mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp,
boolean_t loopback)
{
- mblk_t *mp1 = mp;
-
- while (mp1 != NULL) {
- mp1->b_prev = NULL;
- mp1->b_queue = NULL;
- mp1 = mp1->b_next;
- }
freemsgchain(mp);
}
diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c
index 950fab1272..fcea4a8f03 100644
--- a/usr/src/uts/common/io/mem.c
+++ b/usr/src/uts/common/io/mem.c
@@ -225,10 +225,19 @@ mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
case M_NULL:
case M_ZERO:
case M_FULL:
+ /* standard devices */
+ break;
+
case M_MEM:
case M_KMEM:
case M_ALLKMEM:
- /* standard devices */
+ /*
+ * These devices should never be visible in a zone, but if they
+ * somehow do get created we refuse to allow the zone to use
+ * them.
+ */
+ if (crgetzoneid(cred) != GLOBAL_ZONEID)
+ return (EACCES);
break;
default:
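The user-visible effect, as a hypothetical test run inside a non-global zone:

	#include <fcntl.h>
	#include <errno.h>
	#include <assert.h>

	int
	main(void)
	{
		/* These devices now refuse to open from a non-global zone. */
		int fd = open("/dev/allkmem", O_RDONLY);
		assert(fd == -1 && errno == EACCES);
		return (0);
	}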
diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf
index cfda434e23..6c585c6a42 100644
--- a/usr/src/uts/common/io/mr_sas/mr_sas.conf
+++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf
@@ -13,3 +13,11 @@
# Fast-Path specific flag. Default is "yes".
# mrsas-enable-fp="yes";
+flow_control="dmult" queue="qsort" tape="sctp";
+
+# MSI specific flag. To enable MSI modify the flag value to "yes"
+mrsas-enable-msi="yes";
+
+# Fast-Path specific flag. To enable Fast-Path modify the flag value to "yes"
+mrsas-enable-fp="yes";
+
diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE
new file mode 100644
index 0000000000..187088ff34
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2014, Thales UK Limited
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip
new file mode 100644
index 0000000000..cde8b65b37
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+NFAST CRYPTO ACCELERATOR DRIVER
diff --git a/usr/src/uts/common/io/nfp/autoversion.h b/usr/src/uts/common/io/nfp/autoversion.h
new file mode 100644
index 0000000000..b9021942b2
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/autoversion.h
@@ -0,0 +1,21 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+/* AUTOGENERATED - DO NOT EDIT */
+#ifndef AUTOVERSION_H
+#define AUTOVERSION_H
+
+#define VERSION_RELEASEMAJOR 2
+#define VERSION_RELEASEMINOR 26
+#define VERSION_RELEASEPATCH 40
+#define VERSION_NO "2.26.40cam999"
+#define VERSION_COMPNAME "nfdrv"
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/drvlist.c b/usr/src/uts/common/io/nfp/drvlist.c
new file mode 100644
index 0000000000..a04b1fd5b0
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/drvlist.c
@@ -0,0 +1,19 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_cmd.h"
+
+const nfpcmd_dev *nfp_drvlist[] = {
+ &i21285_cmddev,
+ &i21555_cmddev,
+ NULL
+};
+
diff --git a/usr/src/uts/common/io/nfp/hostif.c b/usr/src/uts/common/io/nfp/hostif.c
new file mode 100644
index 0000000000..684be703ea
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/hostif.c
@@ -0,0 +1,1192 @@
+/*
+
+hostif.c: nFast PCI driver for Solaris 2.5, 2.6, 2.7 and 2.8
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+06/05/1998 jsh Original solaris 2.6
+21/05/1999 jsh added support for solaris 2.5
+10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit)
+??/??/2001 jsh added support for solaris 2.8 (32 and 64 bit)
+16/10/2001 jsh moved from nfast to new structure in nfdrv
+12/02/2002 jsh added high level interrupt support
+
+*/
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/map.h>
+#include <sys/debug.h>
+#include <sys/modctl.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/open.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/pci.h>
+
+#include "nfp_common.h"
+#include "nfp_hostif.h"
+#include "nfp_osif.h"
+#include "nfp_cmd.h"
+
+#include "nfp.h"
+
+/* mapped memory attributes, no-swap endianness (byte swapping is done at a higher level) */
+static struct ddi_device_acc_attr nosw_attr = {
+ DDI_DEVICE_ATTR_V0,
+ DDI_NEVERSWAP_ACC,
+ DDI_STRICTORDER_ACC
+};
+
+/* dma attributes */
+static ddi_dma_attr_t dma_attrs = {
+ DMA_ATTR_V0, /* version number */
+ (uint64_t)0x0, /* low address */
+ (uint64_t)0xffffffff, /* high address */
+ (uint64_t)0xffffff, /* DMA counter max */
+ (uint64_t)0x1, /* alignment */
+ 0x0c, /* burst sizes */
+ 0x1, /* minimum transfer size */
+ (uint64_t)0x3ffffff, /* maximum transfer size */
+ (uint64_t)0x7fff, /* maximum segment size */
+ 1, /* no scatter/gather lists */
+ 1, /* granularity */
+ 0 /* DMA flags */
+};
+
+/*
+ * Debug message control
+ * Debug Levels:
+ * 0 = no messages
+ * 1 = Errors
+ * 2 = Subroutine calls & control flow
+ * 3 = I/O Data (verbose!)
+ * Can be set with adb or in the /etc/system file with
+ * "set nfp:nfp_debug=<value>"
+ */
+
+int nfp_debug= 1;
+
+static void *state_head; /* opaque handle top of state structs */
+
+static int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp);
+static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp);
+static int nfp_release_dev( dev_info_t *dip );
+
+static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp);
+static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp);
+static int nfp_strategy(struct buf *bp);
+
+static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp);
+static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp);
+
+static void nfp_wrtimeout (void *pdev);
+static void nfp_rdtimeout (void *pdev);
+
+static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result);
+static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
+static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+
+static void nfp_read_complete_final(nfp_dev *pdev, int ok);
+static void nfp_write_complete_final(nfp_dev *pdev, int ok);
+
+/* nfp file ops --------------------------------------------------- */
+
+static struct cb_ops nfp_cb_ops = {
+ nfp_open,
+ nfp_close,
+ nodev, /* no nfp_strategy */
+ nodev, /* no print routine */
+ nodev, /* no dump routine */
+ nfp_read,
+ nfp_write,
+ nfp_ioctl,
+ nodev, /* no devmap routine */
+ nodev, /* no mmap routine */
+ nodev, /* no segmap routine */
+ nfp_chpoll,
+ ddi_prop_op,
+ 0, /* not a STREAMS driver, no cb_str routine */
+ D_NEW | D_MP | EXTRA_CB_FLAGS, /* must be safe for multi-thread/multi-processor */
+ CB_REV,
+ nodev, /* aread */
+ nodev /* awrite */
+};
+
+static struct dev_ops nfp_ops = {
+ DEVO_REV, /* DEVO_REV indicated by manual */
+ 0, /* device reference count */
+ nfp_getinfo,
+ nulldev, /* identify */
+ nulldev, /* probe */
+ nfp_attach,
+ nfp_detach,
+ nodev, /* device reset routine */
+ &nfp_cb_ops,
+ (struct bus_ops *)0, /* bus operations */
+};
+
+extern struct mod_ops mod_driverops;
+static struct modldrv modldrv = {
+ &mod_driverops,
+ NFP_DRVNAME,
+ &nfp_ops,
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, /* MODREV_1 indicated by manual */
+ (void *)&modldrv,
+ NULL, /* termination of list of linkage structures */
+};
+
+/* interface resource allocation */
+
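+/*
+ * The DMA setup below follows the usual three-step DDI pattern:
+ * ddi_dma_alloc_handle() gets a handle constrained by dma_attrs,
+ * ddi_dma_mem_alloc() provides a buffer the device can reach, and
+ * ddi_dma_addr_bind_handle() yields the single cookie whose
+ * dmac_address is later handed to the module by NFDEV_IOCTL_ENSUREREADING.
+ */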
+int nfp_alloc_pci_push( nfp_dev *pdev ) {
+ /* allocate resources needed for PCI Push,
+ * if not already allocated.
+ * return True if successful
+ */
+ nfp_err ret;
+ uint_t cookie_count;
+ size_t real_length;
+
+ if(!pdev->read_buf) {
+ /* allocate read buffer */
+ pdev->read_buf = kmem_zalloc( NFP_READBUF_SIZE, KM_NOSLEEP );
+ }
+ if(!pdev->read_buf) {
+    nfp_log( NFP_DBG1, "nfp_alloc_pci_push: kmem_zalloc read buffer failed");
+ pdev->read_buf = NULL;
+ return 0;
+ }
+
+ if(!pdev->rd_dma_ok) {
+ /* allocate dma handle for read buffer */
+ ret = ddi_dma_alloc_handle( pdev->dip,
+ &dma_attrs,
+ DDI_DMA_DONTWAIT,
+ NULL,
+ &pdev->read_dma_handle );
+ if( ret != DDI_SUCCESS ) {
+ nfp_log( NFP_DBG1,
+ "nfp_alloc_pci_push: ddi_dma_alloc_handle failed (%d)",
+ ret );
+ return 0;
+ }
+
+    /*
+     * Allocate the memory for dma transfers.  ddi_dma_mem_alloc()
+     * replaces pdev->read_buf, so release the kmem_zalloc() buffer
+     * allocated above first rather than leak it.
+     */
+    if (pdev->read_buf) {
+      kmem_free(pdev->read_buf, NFP_READBUF_SIZE);
+      pdev->read_buf = NULL;
+    }
+    ret = ddi_dma_mem_alloc(pdev->read_dma_handle, NFP_READBUF_SIZE, &nosw_attr,
+                            DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL,
+                            (caddr_t*)&pdev->read_buf, &real_length, &pdev->acchandle);
+ if (ret != DDI_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_alloc_pci_push: ddi_dma_mem_alloc failed (%d)", ret);
+ ddi_dma_free_handle( &pdev->read_dma_handle );
+ return 0;
+ }
+
+ ret = ddi_dma_addr_bind_handle( pdev->read_dma_handle,
+ NULL, /* kernel address space */
+ (caddr_t)pdev->read_buf, real_length,
+ DDI_DMA_READ | DDI_DMA_CONSISTENT, /* dma flags */
+ DDI_DMA_DONTWAIT, NULL,
+ &pdev->read_dma_cookie, &cookie_count );
+ if( ret != DDI_DMA_MAPPED ) {
+ nfp_log( NFP_DBG1,
+ "nfp_alloc_pci_push: ddi_dma_addr_bind_handle failed (%d)",
+ ret);
+      ddi_dma_mem_free(&pdev->acchandle);
+      pdev->read_buf = NULL;
+      ddi_dma_free_handle( &pdev->read_dma_handle );
+      return 0;
+ }
+ if( cookie_count > 1 ) {
+ nfp_log( NFP_DBG1,
+ "nfp_alloc_pci_push: error:"
+ " ddi_dma_addr_bind_handle wants %d transfers",
+ cookie_count);
+      (void) ddi_dma_unbind_handle( pdev->read_dma_handle );
+      ddi_dma_mem_free(&pdev->acchandle);
+      pdev->read_buf = NULL;
+      ddi_dma_free_handle( &pdev->read_dma_handle );
+      return 0;
+ }
+ pdev->rd_dma_ok = 1;
+ }
+ return pdev->rd_dma_ok;
+}
+
+void nfp_free_pci_push( nfp_dev *pdev ) {
+ /* free resources allocated to PCI Push */
+ if( pdev->rd_dma_ok ) {
+    (void) ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL);
+    (void) ddi_dma_unbind_handle( pdev->read_dma_handle );
+    ddi_dma_mem_free(&pdev->acchandle);
+    ddi_dma_free_handle( &pdev->read_dma_handle );
+    /* the buffer itself belonged to ddi_dma_mem_alloc() and is gone now */
+    pdev->read_buf = NULL;
+    pdev->rd_dma_ok = 0;
+ }
+ if( pdev->read_buf ) {
+ kmem_free( pdev->read_buf, NFP_READBUF_SIZE );
+ pdev->read_buf = NULL;
+ }
+}
+
+/* include definition of nfp_set_ifvers() */
+#define nfp_ifvers NFDEV_IF_PCI_PUSH
+#include "nfp_ifvers.c"
+#undef nfp_ifvers
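+/*
+ * nfp_ifvers.c is compiled here with nfp_ifvers pinned to
+ * NFDEV_IF_PCI_PUSH, so the nfp_set_ifvers() it defines treats PCI
+ * push as the newest interface version this driver supports.
+ */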
+
+/*--------------------*/
+/* nfp_isr */
+/*--------------------*/
+
+static u_int nfp_isr( char *pdev_in ) {
+ /* LINTED: alignment */
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+ nfp_err ne;
+ int handled;
+
+ nfp_log( NFP_DBG3, "nfp_isr: entered");
+
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_isr: cannot find dev");
+ return DDI_INTR_UNCLAIMED;
+ }
+
+ /* The isr needs to be mutex'ed - an SMP can call us while we're still
+ * running!
+ */
+ mutex_enter(&pdev->low_mutex);
+ ne= pdev->cmddev->isr( pdev->common.cmdctx, &handled );
+ mutex_exit(&pdev->low_mutex);
+
+ if( !ne && handled )
+ return DDI_INTR_CLAIMED;
+ if (ne)
+ nfp_log( NFP_DBG1, "nfp_isr: failed");
+ else
+ nfp_log( NFP_DBG3, "nfp_isr: unclaimed");
+ return DDI_INTR_UNCLAIMED;
+}
+
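+/*
+ * When the device interrupt is registered above lock level (see
+ * nfp_attach), nfp_read_complete()/nfp_write_complete() only latch
+ * completion flags under high_mutex and trigger this soft interrupt;
+ * the *_complete_final() work is then done here at a level where
+ * biodone() and pollwakeup() can safely be called.
+ */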
+static u_int nfp_soft_isr( char *pdev_in ) {
+ /* LINTED: alignment */
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+ int rd, wr;
+
+ nfp_log( NFP_DBG3, "nfp_soft_isr: entered");
+
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_soft_isr: cannot find dev");
+ return DDI_INTR_UNCLAIMED;
+ }
+ rd= wr= 0;
+
+ mutex_enter(&pdev->high_mutex);
+ if(pdev->high_read) {
+ pdev->high_read= 0;
+ mutex_exit(&pdev->high_mutex);
+ rd= 1;
+ }
+ if(pdev->high_write) {
+ pdev->high_write= 0;
+ wr= 1;
+ }
+ mutex_exit(&pdev->high_mutex);
+
+ if(rd) {
+ nfp_log( NFP_DBG3, "nfp_soft_isr: read done");
+ nfp_read_complete_final(pdev, pdev->rd_ok);
+ }
+ if(wr) {
+ nfp_log( NFP_DBG3, "nfp_soft_isr: write done");
+ nfp_write_complete_final(pdev, pdev->wr_ok);
+ }
+ if( rd || wr )
+ return DDI_INTR_CLAIMED;
+
+  nfp_log( NFP_DBG2, "nfp_soft_isr: unclaimed");
+ return DDI_INTR_UNCLAIMED;
+}
+
+
+/*-------------------------*/
+/* nfp_read */
+/*-------------------------*/
+
+void nfp_read_complete(nfp_dev *pdev, int ok) {
+ nfp_log( NFP_DBG2,"nfp_read_complete: entering");
+
+ if(pdev->high_intr) {
+ nfp_log(NFP_DBG2, "nfp_read_complete: high_intr");
+ mutex_enter(&pdev->high_mutex);
+ nfp_log(NFP_DBG3, "nfp_read_complete: high_mutex entered");
+ if(pdev->high_read)
+      nfp_log(NFP_DBG1, "nfp_read_complete: high_read already set!");
+ pdev->high_read= 1;
+ pdev->rd_ok= ok;
+ nfp_log(NFP_DBG3, "nfp_read_complete: exiting high_mutex");
+ mutex_exit(&pdev->high_mutex);
+ ddi_trigger_softintr(pdev->soft_int_id);
+ } else
+ nfp_read_complete_final( pdev, ok );
+ nfp_log( NFP_DBG2,"nfp_read_complete: exiting");
+}
+
+static void nfp_read_complete_final(nfp_dev *pdev, int ok) {
+ nfp_log( NFP_DBG2,"nfp_read_complete_final: entering");
+ if(pdev->rdtimeout)
+ (void) untimeout(pdev->rdtimeout);
+ if(!pdev->rd_outstanding) {
+ nfp_log( NFP_DBG1,"nfp_read_complete_final: !pdev->rd_outstanding");
+ }
+ nfp_log( NFP_DBG2,"nfp_read_complete_final: pdev->rd_outstanding=0, ok %d", ok);
+ mutex_enter(&pdev->isr_mutex);
+ pdev->rd_outstanding= 0;
+ pdev->rd_ready= 1;
+ pdev->rd_ok= ok;
+ cv_broadcast(&pdev->rd_cv);
+ mutex_exit(&pdev->isr_mutex);
+ pollwakeup (&pdev->pollhead, POLLRDNORM);
+ nfp_log( NFP_DBG2,"nfp_read_complete_final: exiting");
+}
+
+static void nfp_rdtimeout( void *pdev_in )
+{
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+
+ nfp_log( NFP_DBG1, "nfp_rdtimeout: read timed out");
+
+ if (!pdev) {
+ nfp_log( NFP_DBG1, "nfp_rdtimeout: NULL pdev." );
+ return;
+ }
+ pdev->rdtimeout= 0;
+ nfp_read_complete_final(pdev, 0);
+}
+
+/* ARGSUSED */
+static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp) {
+ int ret;
+ nfp_log( NFP_DBG2, "nfp_read: entered" );
+  if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) {
+ nfp_log( NFP_DBG1, "nfp_read: unable to get nfp_dev");
+ return (ENODEV);
+ }
+ nfp_log( NFP_DBG2, "nfp_read: about to physio." );
+ ret = physio(nfp_strategy, (struct buf *)0, dev, B_READ, minphys, uiop );
+ if(ret)
+ nfp_log( NFP_DBG1, "nfp_read: physio returned %x.", ret );
+ return ret;
+}
+
+/*-------------------------*/
+/* nfp_write */
+/*-------------------------*/
+
+void nfp_write_complete( nfp_dev *pdev, int ok) {
+ nfp_log( NFP_DBG2,"nfp_write_complete: entering");
+
+ if(pdev->high_intr) {
+ mutex_enter(&pdev->high_mutex);
+ if(pdev->high_write)
+      nfp_log(NFP_DBG1, "nfp_write_complete: high_write already set!");
+ pdev->high_write= 1;
+ pdev->wr_ok= ok;
+ mutex_exit(&pdev->high_mutex);
+ ddi_trigger_softintr(pdev->soft_int_id);
+ } else
+ nfp_write_complete_final( pdev, ok );
+ nfp_log( NFP_DBG2,"nfp_write_complete: exiting");
+}
+
+static void nfp_write_complete_final( nfp_dev *pdev, int ok) {
+ struct buf *local_wr_bp;
+ nfp_log( NFP_DBG2,"nfp_write_complete_final: entering");
+ if(pdev->wrtimeout)
+ (void) untimeout(pdev->wrtimeout);
+
+ if (!pdev->wr_bp) {
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: write: wr_bp == NULL." );
+ return;
+ }
+
+ bp_mapout(pdev->wr_bp);
+ pdev->wr_bp->b_resid = ok ? 0 : pdev->wr_bp->b_bcount;
+ /* Make sure we set wr_ready before calling biodone to avoid a race */
+ pdev->wr_ready = 1;
+ bioerror(pdev->wr_bp, ok ? 0 : ENXIO);
+ local_wr_bp = pdev->wr_bp;
+ pdev->wr_bp = 0;
+ biodone(local_wr_bp);
+  nfp_log( NFP_DBG2, "nfp_write_complete_final: biodone issued");
+ pollwakeup (&pdev->pollhead, POLLWRNORM);
+
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: leaving");
+}
+
+static void nfp_wrtimeout( void *pdev_in )
+{
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+
+ nfp_log( NFP_DBG1, "nfp_wrtimeout: write timed out");
+
+ if (!pdev) {
+ nfp_log( NFP_DBG1, "nfp_wrtimeout: NULL pdev." );
+ return;
+ }
+ pdev->wrtimeout= 0;
+ nfp_write_complete_final(pdev, 0);
+}
+
+/* ARGSUSED */
+static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp) {
+ int ret;
+ nfp_log( NFP_DBG2, "nfp_write: entered." );
+ if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) {
+    nfp_log( NFP_DBG1, "nfp_write: unable to get nfp_dev.");
+ return (ENODEV);
+ }
+ nfp_log( NFP_DBG2, "nfp_write: about to physio." );
+ ret = physio(nfp_strategy, (struct buf *)0, dev, B_WRITE, minphys, uiop );
+ if(ret)
+ nfp_log( NFP_DBG1, "nfp_write: physio returned %x.", ret );
+ return ret;
+}
+
+/*-------------------------*/
+/* nfp_strategy */
+/*-------------------------*/
+
+#define NFP_STRAT_ERR(thebp,err,txt) do { \
+  nfp_log( NFP_DBG1, "nfp_strategy: " txt ".\n"); \
+  (thebp)->b_resid = (thebp)->b_bcount; \
+  bioerror ((thebp), err); \
+  biodone ((thebp)); \
+} while (0)
+
+static int nfp_strategy(struct buf *bp) {
+ register struct nfp_dev *pdev;
+ nfp_err ne;
+
+ nfp_log( NFP_DBG2, "nfp_strategy: entered." );
+ if (!(pdev = ddi_get_soft_state(state_head, getminor(bp->b_edev)))) {
+ NFP_STRAT_ERR (bp, ENXIO, "unable to get nfp_dev");
+ return (0);
+ }
+
+ if (bp->b_flags & B_READ) {
+ int count;
+ /* read */
+ if (!pdev->rd_ready) {
+ NFP_STRAT_ERR (bp,ENXIO,"read called when not ready");
+ return (0);
+ }
+ pdev->rd_ready=0;
+ pdev->rd_pending = 0;
+ if( !pdev->rd_ok) {
+ NFP_STRAT_ERR (bp,ENXIO,"read failed");
+ return (0);
+ }
+ /* copy data from module */
+ if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) {
+ nfp_log( NFP_DBG3, "nfp_strategy: copying kernel read buffer");
+ if( ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL) != DDI_SUCCESS )
+ {
+ NFP_STRAT_ERR(bp,ENXIO,"ddi_dma_sync(read_dma_handle) failed");
+ return (0);
+ }
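+      /*
+       * Pushed-buffer layout as used below: the byte count sits at
+       * offset 4 and the job data starts at offset 8 (the leading
+       * word is not examined by this driver).
+       */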
+ /* LINTED: alignment */
+ count= *(unsigned int *)(pdev->read_buf+4);
+ count= FROM_LE32_MEM(&count);
+ nfp_log( NFP_DBG3, "nfp_strategy: read count %d", count);
+ if(count<0 || count>bp->b_bcount) {
+ NFP_STRAT_ERR(bp,ENXIO,"bad read byte count from device");
+ nfp_log( NFP_DBG1, "nfp_strategy: bad read byte count (%d) from device", count);
+ return (0);
+ }
+ bp_mapin (bp);
+ bcopy( pdev->read_buf + 8, bp->b_un.b_addr, count );
+ bp_mapout (bp);
+ } else {
+ bp_mapin (bp);
+ ne= pdev->cmddev->read_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx, &count );
+ bp_mapout (bp);
+ if( ne != NFP_SUCCESS) {
+ NFP_STRAT_ERR (bp,nfp_oserr(ne),"read_block failed");
+ return (0);
+ }
+ }
+ bioerror(bp, 0);
+ bp->b_resid = 0;
+ biodone (bp);
+ } else {
+ /* write */
+ if (!pdev->wr_ready) {
+ NFP_STRAT_ERR (bp,ENXIO,"write called when not ready");
+ return (0);
+ }
+ if (pdev->wr_bp) {
+ NFP_STRAT_ERR (bp,ENXIO,"wr_bp != NULL");
+ return (0);
+ }
+ pdev->wrtimeout= timeout(nfp_wrtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000));
+ pdev->wr_bp = bp;
+ pdev->wr_ready = 0;
+ bp_mapin (bp);
+ ne= pdev->cmddev->write_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx);
+ if( ne != NFP_SUCCESS ) {
+ bp_mapout (bp);
+ (void) untimeout(pdev->wrtimeout);
+ pdev->wr_bp = 0;
+ pdev->wr_ready = 1;
+ NFP_STRAT_ERR (bp,nfp_oserr(ne),"write failed");
+ return (0);
+ }
+ }
+ nfp_log( NFP_DBG2, "nfp_strategy: leaving");
+
+ return (0);
+}
+
+
+/*--------------------*/
+/* poll / select */
+/*--------------------*/
+
+static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp) {
+ nfp_dev *pdev;
+ short revents;
+
+ if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) {
+ nfp_log( NFP_DBG1, "nfp_chpoll: unable to get nfp_dev");
+ *reventsp=0;
+ return (0);
+ }
+ nfp_log( NFP_DBG2, "nfp_chpoll: entered %x", events);
+
+ revents=0;
+ if (events&POLLWRNORM) {
+ if (pdev->wr_ready) {
+ nfp_log( NFP_DBG2, "nfp_chpoll: write ready");
+ revents|=POLLWRNORM;
+ }
+ }
+
+ if (events&POLLRDNORM) {
+ if (pdev->rd_ready) {
+ nfp_log( NFP_DBG2, "nfp_chpoll: read ready");
+ revents|=POLLRDNORM;
+ }
+ }
+
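+  /*
+   * Standard chpoll contract: if nothing is ready and the caller has
+   * not already collected an event elsewhere (anyyet == 0), return
+   * our pollhead so this thread can later be woken by pollwakeup().
+   */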
+ if (!revents && !anyyet) {
+ *phpp=&pdev->pollhead;
+ }
+ *reventsp=revents;
+
+ nfp_log( NFP_DBG2, "nfp_chpoll: leaving");
+ return (0);
+}
+
+
+/*--------------------*/
+/* ioctl */
+/*--------------------*/
+
+/* ARGSUSED */
+static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp) {
+ register struct nfp_dev *pdev;
+
+ nfp_log( NFP_DBG2, "nfp_ioctl: entered." );
+
+ if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: unable to get nfp dev.");
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ case NFDEV_IOCTL_ENQUIRY:
+ {
+ long *outp;
+ int outlen;
+ nfdev_enquiry_str enq_data;
+
+ enq_data.busno = (unsigned int)-1;
+ enq_data.slotno = (unsigned char)-1;
+
+ /* get our bus and slot num */
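+      /*
+       * The first word of the PCI "reg" property encodes the bus
+       * number in bits 23:16 and the device (slot) number in bits
+       * 15:11, per the IEEE 1275 PCI binding; the shifts below
+       * extract those fields.
+       */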
+ if (ddi_getlongprop (DDI_DEV_T_NONE,
+ pdev->dip, 0, "reg",
+ (caddr_t)&outp, &outlen) != DDI_PROP_NOT_FOUND) {
+ nfp_log( NFP_DBG2, "ddi_getlongprop('reg') ok." );
+ if( outlen > 0 ) {
+ enq_data.busno = ((*outp)>>16) & 0xff;
+ enq_data.slotno = ((*outp)>>11) & 0x1f;
+ nfp_log( NFP_DBG2, "busno %d, slotno %d.",
+ enq_data.busno, enq_data.slotno );
+ }
+ } else
+ nfp_log( NFP_DBG1, "ddi_getlongprop('reg') failed." );
+
+ if( ddi_copyout( (char *)&enq_data, (void *)arg, sizeof(enq_data), mode ) != 0 ) {
+ nfp_log( NFP_DBG1, "ddi_copyout() failed." );
+ return EFAULT;
+ }
+ }
+ break;
+
+ case NFDEV_IOCTL_ENSUREREADING:
+ {
+ unsigned int addr, len;
+ nfp_err ret;
+ if( ddi_copyin( (void *)arg, (char *)&len, sizeof(unsigned int), mode ) != 0 ) {
+ nfp_log( NFP_DBG1, "ddi_copyin() failed." );
+ return (EFAULT);
+ }
+ /* signal a read to the module */
+ nfp_log( NFP_DBG2, "nfp_ioctl: signalling read request to module, len = %x.", len );
+ if (len>8192) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: len >8192 = %x.", len );
+ return EINVAL;
+ }
+ if (pdev->rd_outstanding==1) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: not about to call read with read outstanding.");
+ return EIO;
+ }
+
+ addr= 0;
+ if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) {
+ if( len > NFP_READBUF_SIZE ) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: len > NFP_READBUF_SIZE = %x.", len );
+ return EINVAL;
+ }
+ addr= pdev->read_dma_cookie.dmac_address;
+ }
+
+ pdev->rd_outstanding = 1;
+ nfp_log( NFP_DBG2,"nfp_ioctl: pdev->rd_outstanding=1");
+
+ /* setup timeout timer */
+ pdev->rdtimeout= timeout(nfp_rdtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000));
+
+ nfp_log( NFP_DBG2, "nfp_ioctl: read request");
+ ret = pdev->cmddev->ensure_reading(addr, len, pdev->common.cmdctx);
+ if ( ret != NFP_SUCCESS ) {
+ (void) untimeout(pdev->rdtimeout);
+ pdev->rdtimeout = 0;
+ pdev->rd_outstanding = 0;
+ nfp_log( NFP_DBG1, "nfp_ioctl : cmddev->ensure_reading failed ");
+ return nfp_oserr( ret );
+ }
+ }
+ break;
+
+ case NFDEV_IOCTL_PCI_IFVERS:
+ {
+ int vers;
+
+ nfp_log( NFP_DBG2, "nfp_ioctl: NFDEV_IOCTL_PCI_IFVERS");
+
+ if( ddi_copyin( (void *)arg, (char *)&vers, sizeof(vers), mode ) != 0 ) {
+ nfp_log( NFP_DBG1, "ddi_copyin() failed." );
+ return (EFAULT);
+ }
+
+ if( pdev->rd_outstanding ) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d as read outstanding", vers);
+ return EIO;
+ }
+
+ nfp_set_ifvers(pdev, vers);
+ if( pdev->ifvers != vers ) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d", vers);
+ return EIO;
+ }
+ }
+ break;
+
+ case NFDEV_IOCTL_STATS:
+ {
+ if( ddi_copyout( (char *)&(pdev->common.stats),
+ (void *)arg,
+ sizeof(nfdev_stats_str),
+ mode ) != 0 ) {
+ nfp_log( NFP_DBG1, "ddi_copyout() failed." );
+ return EFAULT;
+ }
+ }
+ break;
+
+ default:
+ nfp_log( NFP_DBG1, "nfp_ioctl: unknown ioctl." );
+ return EINVAL;
+ }
+
+ return 0;
+}
+
+/*-------------------------*/
+/* nfp_open */
+/*-------------------------*/
+
+/* ARGSUSED */
+int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp)
+{
+ nfp_err ret;
+ register struct nfp_dev *pdev;
+
+ nfp_log( NFP_DBG2, "entered nfp_open." );
+
+ pdev = (nfp_dev *)ddi_get_soft_state(state_head, getminor(*dev));
+
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_open: unable to get nfp dev.");
+ return (ENODEV);
+ }
+
+ if( otyp != OTYP_CHR ) {
+ nfp_log( NFP_DBG1, "nfp_open: not opened as character device");
+ return (EINVAL);
+ }
+
+ mutex_enter(&pdev->busy_mutex);
+
+ if (pdev->busy) {
+ mutex_exit(&pdev->busy_mutex);
+ nfp_log( NFP_DBG1, "nfp_open: device busy");
+ return EBUSY;
+ }
+ pdev->busy= 1;
+ mutex_exit(&pdev->busy_mutex);
+
+ /* use oldest possible interface until told otherwise */
+ pdev->ifvers= NFDEV_IF_STANDARD;
+ nfp_log( NFP_DBG3, "nfp_open: setting ifvers %d", pdev->ifvers);
+ pdev->rd_ready= 0; /* drop any old data */
+
+ ret = pdev->cmddev->open(pdev->common.cmdctx);
+ if( ret != NFP_SUCCESS ) {
+ nfp_log( NFP_DBG1, "nfp_open : cmddev->open failed ");
+ return nfp_oserr( ret );
+ }
+
+ nfp_log( NFP_DBG2, "nfp_open: done");
+
+ return 0;
+}
+
+/*--------------------*/
+/* nfp_close */
+/*--------------------*/
+
+/* ARGSUSED */
+static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp) {
+ nfp_dev *pdev;
+ nfp_err ret;
+
+ nfp_log( NFP_DBG2, "nfp_close: entered");
+
+ pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor(dev));
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_close: cannot find dev.");
+ return ENODEV;
+ }
+
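+  /*
+   * Give any outstanding read up to NFP_TIMEOUT_SEC to complete
+   * before shutting down the command device underneath it.
+   */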
+ mutex_enter(&pdev->isr_mutex);
+ if(pdev->rd_outstanding) {
+ int lbolt, err;
+ nfp_get_lbolt(&lbolt, err);
+ if(!err)
+ (void) cv_timedwait(&pdev->rd_cv, &pdev->isr_mutex, lbolt + (NFP_TIMEOUT_SEC * drv_usectohz(1000000)) );
+ }
+ mutex_exit(&pdev->isr_mutex);
+ ret = pdev->cmddev->close(pdev->common.cmdctx);
+ if (ret != NFP_SUCCESS ) {
+ nfp_log( NFP_DBG1, " nfp_close : cmddev->close failed");
+ return nfp_oserr( ret );
+ }
+
+ mutex_enter(&pdev->busy_mutex);
+ pdev->busy= 0;
+ mutex_exit(&pdev->busy_mutex);
+
+ return 0;
+}
+
+/****************************************************************************
+
+ nfp driver config
+
+ ****************************************************************************/
+
+/*-------------------------*/
+/* nfp_getinfo */
+/*-------------------------*/
+
+/* ARGSUSED */
+static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) {
+ int error;
+ nfp_dev *pdev;
+
+ nfp_log( NFP_DBG2, "nfp_getinfo: entered" );
+
+ pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor((dev_t)arg));
+  if( !pdev )
+    nfp_log( NFP_DBG1, "nfp_getinfo: cannot find dev.");
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ if (pdev == NULL) {
+ *result = NULL;
+ error = DDI_FAILURE;
+ } else {
+ /*
+ * don't need to use a MUTEX even though we are
+ * accessing our instance structure; dev->dip
+ * never changes.
+ */
+ *result = pdev->dip;
+ error = DDI_SUCCESS;
+ }
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)(uintptr_t)getminor((dev_t)arg);
+ error = DDI_SUCCESS;
+ break;
+ default:
+ *result = NULL;
+ error = DDI_FAILURE;
+ }
+
+ nfp_log( NFP_DBG2, "nfp_getinfo: leaving." );
+ return (error);
+}
+
+/*-------------------------*/
+/* nfp_release */
+/*-------------------------*/
+
+static int nfp_release_dev( dev_info_t *dip ) {
+ nfp_dev *pdev;
+ int instance, i;
+ nfp_err ret;
+
+ nfp_log( NFP_DBG2, "nfp_release_dev: entering" );
+
+ instance = ddi_get_instance(dip);
+ pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance);
+ if (pdev) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: removing device" );
+
+ nfp_free_pci_push(pdev);
+
+ if( pdev->cmddev ) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: destroying cmd dev" );
+ ret = pdev->cmddev->destroy(pdev->common.cmdctx);
+ if (ret != NFP_SUCCESS) {
+ nfp_log( NFP_DBG1, " nfp_release_dev : cmddev->destroy failed ");
+ return nfp_oserr( ret );
+ }
+ }
+
+ if(pdev->high_iblock_cookie) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: removing high and soft irq" );
+ ddi_remove_softintr(pdev->soft_int_id);
+ ddi_remove_intr(pdev->dip, 0, pdev->high_iblock_cookie);
+ mutex_destroy( &pdev->busy_mutex );
+ cv_destroy( &pdev->rd_cv );
+ mutex_destroy( &pdev->isr_mutex );
+ mutex_destroy( &pdev->high_mutex );
+ } else if(pdev->iblock_cookie) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: removing irq" );
+ ddi_remove_intr(pdev->dip, 0, pdev->iblock_cookie);
+ mutex_destroy( &pdev->busy_mutex );
+ cv_destroy( &pdev->rd_cv );
+ mutex_destroy( &pdev->isr_mutex );
+ }
+ if(pdev->low_iblock_cookie) {
+ ddi_remove_intr(pdev->dip, 0, pdev->low_iblock_cookie);
+ mutex_destroy( &pdev->low_mutex);
+ }
+
+ for(i=0;i<6;i++) {
+ if( pdev->common.extra[i] ) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: unmapping BAR %d", i );
+ ddi_regs_map_free ((ddi_acc_handle_t *)&pdev->common.extra[i]);
+ }
+ }
+
+ ddi_remove_minor_node(dip, NULL);
+
+ if (pdev->conf_handle)
+ pci_config_teardown( &pdev->conf_handle );
+
+ ddi_soft_state_free(state_head, instance);
+ }
+ nfp_log( NFP_DBG2, "nfp_release: finished" );
+
+ return DDI_SUCCESS;
+}
+
+
+/*-------------------------*/
+/* nfp_attach */
+/*-------------------------*/
+
+static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) {
+ int instance;
+ nfp_dev *pdev = NULL;
+ int intres;
+ uint16_t device, vendor, sub_device, sub_vendor;
+ long *outp;
+ nfpcmd_dev const *cmddev;
+ int index, i;
+ nfp_err ret;
+
+ nfp_log( NFP_DBG2, "nfp_attach: entered." );
+
+ if (cmd != DDI_ATTACH) {
+ nfp_log( NFP_DBG1, "nfp_attach: bad command." );
+ goto bailout;
+ }
+
+ instance = ddi_get_instance(dip);
+
+ if (ddi_soft_state_zalloc(state_head, instance) != 0) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_soft_state_zalloc() failed." );
+ goto bailout;
+ }
+
+ pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance);
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_attach: cannot find dev.");
+    goto bailout;
+ }
+ pdev->dip = dip;
+
+ /* map in pci config registers */
+ if (pci_config_setup(dip, &pdev->conf_handle)) {
+ nfp_log( NFP_DBG1, "nfp_attach: pci_config_setup() failed." );
+ goto bailout;
+ }
+
+ /* find out what we have got */
+ vendor= PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_VENID );
+ device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_DEVID );
+ sub_vendor = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBVENID );
+ sub_device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBSYSID );
+
+ index= 0;
+ while( (cmddev = nfp_drvlist[index++]) != NULL ) {
+ if( cmddev->vendorid == vendor &&
+ cmddev->deviceid == device &&
+ cmddev->sub_vendorid == sub_vendor &&
+ cmddev->sub_deviceid == sub_device )
+ break;
+ }
+ if( !cmddev ) {
+    nfp_log( NFP_DBG1, "nfp_attach: unknown device." );
+ goto bailout;
+ }
+
+ /* map BARs */
+ for( i=0; i<6; i++ ) {
+ if( cmddev->bar_sizes[i] ) {
+ off_t size;
+ if( ddi_dev_regsize(dip, i+1, &size) != DDI_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_dev_regsize() failed for BAR %d", i );
+ goto bailout;
+ }
+ if( size < (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) ) {
+	nfp_log( NFP_DBG1, "nfp_attach: BAR %d too small %x (%x)", i, (int)size, (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) );
+ goto bailout;
+ }
+ if (ddi_regs_map_setup(dip, i+1, (caddr_t *)&pdev->common.bar[i],
+ 0, cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK, &nosw_attr, (ddi_acc_handle_t *)&pdev->common.extra[i] )) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_regs_map_setup() failed for BAR %d", i );
+ goto bailout;
+ }
+ nfp_log( NFP_DBG3, "nfp_attach: BAR[%d] mapped to %x (%x)", i, pdev->common.bar[i], size );
+ }
+ }
+
+ pdev->read_buf = NULL;
+ pdev->rd_dma_ok = 0;
+
+ /* attach to minor node */
+ if (ddi_create_minor_node(dip, "nfp", S_IFCHR, instance, (char *)cmddev->name, 0) == DDI_FAILURE) {
+ ddi_remove_minor_node(dip, NULL);
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_create_minor_node() failed." );
+ goto bailout;
+ }
+
+ pdev->wr_ready = 1;
+ pdev->rd_ready = 0;
+ pdev->rd_pending = 0;
+ pdev->rd_outstanding = 0;
+ pdev->busy=0;
+ pdev->cmddev= cmddev;
+
+ ret = pdev->cmddev->create(&pdev->common);
+ if( ret != NFP_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_attach: failed to create command device");
+ goto bailout;
+ }
+ pdev->common.dev= pdev;
+
+ if (ddi_intr_hilevel(dip, 0) != 0){
+ nfp_log( NFP_DBG2, "nfp_attach: high-level interrupt");
+ if( ddi_get_iblock_cookie(dip, 0, &pdev->high_iblock_cookie) ) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(high) failed." );
+ goto bailout;
+ }
+ if( ddi_get_iblock_cookie(dip, 0, &pdev->low_iblock_cookie) ) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(low) failed." );
+ goto bailout;
+ }
+ mutex_init(&pdev->high_mutex, NULL, MUTEX_DRIVER,
+ (void *)pdev->high_iblock_cookie);
+ mutex_init(&pdev->low_mutex, NULL, MUTEX_DRIVER,
+ (void *)pdev->low_iblock_cookie);
+ if (ddi_add_intr(dip, 0, NULL,
+ NULL, nfp_isr,
+ (caddr_t)pdev) != DDI_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr(high) failed." );
+ goto bailout;
+ }
+ if( ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_HIGH,
+ &pdev->iblock_cookie) ) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(soft) failed." );
+ goto bailout;
+ }
+ mutex_init(&pdev->isr_mutex, NULL, MUTEX_DRIVER,
+ (void *)pdev->iblock_cookie);
+ if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &pdev->soft_int_id,
+ &pdev->iblock_cookie, NULL,
+ nfp_soft_isr, (caddr_t)pdev) != DDI_SUCCESS)
+ goto bailout;
+ pdev->high_intr= 1;
+ } else {
+ nfp_log( NFP_DBG2, "nfp_attach: low-level interrupt");
+
+ if (ddi_get_iblock_cookie (dip, 0, &pdev->iblock_cookie)) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie() failed." );
+ goto bailout;
+ }
+
+ mutex_init(&pdev->isr_mutex, "nfp isr mutex", MUTEX_DRIVER, (void *)pdev->iblock_cookie);
+
+ if (ddi_add_intr(dip, 0, NULL,
+ (ddi_idevice_cookie_t *)NULL, nfp_isr,
+ (caddr_t)pdev) != DDI_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr() failed." );
+ goto bailout;
+ }
+ }
+ mutex_init(&pdev->busy_mutex, "nfp busy mutex", MUTEX_DRIVER, NULL );
+ cv_init(&pdev->rd_cv, "nfp read condvar", CV_DRIVER, NULL );
+
+ /* get our bus and slot num */
+ if (ddi_getlongprop (DDI_DEV_T_NONE,
+ pdev->dip, 0, "reg",
+ (caddr_t)&outp, &intres) != DDI_PROP_NOT_FOUND) {
+ nfp_log( NFP_DBG2, "nfp_attach: ddi_getlongprop('reg') ok." );
+ if( intres > 0 ) {
+ nfp_log( NFP_DBG1, "nfp_attach: found PCI nfast bus %x slot %x.",
+ ((*outp)>>16) & 0xff, ((*outp)>>11) & 0x1f );
+ }
+ }
+
+ nfp_log( NFP_DBG2, "nfp_attach: attach succeeded." );
+ return DDI_SUCCESS;
+
+bailout:
+ (void) nfp_release_dev( dip );
+
+ return DDI_FAILURE;
+}
+
+/*-------------------------*/
+/* nfp_detach */
+/*-------------------------*/
+
+/*
+ * When our driver is unloaded, nfp_detach cleans up and frees the resources
+ * we allocated in nfp_attach.
+ */
+static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) {
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ (void) nfp_release_dev(dip);
+
+ return (DDI_SUCCESS);
+}
+
+/*-------------------------*/
+/* _init */
+/*-------------------------*/
+
+int _init(void) {
+ register int error;
+
+ nfp_log( NFP_DBG2, "_init: entered" );
+
+ if ((error = ddi_soft_state_init(&state_head, sizeof (struct nfp_dev), 1)) != 0) {
+ nfp_log( NFP_DBG1, "_init: soft_state_init() failed" );
+ return (error);
+ }
+
+ if ((error = mod_install(&modlinkage)) != 0) {
+ nfp_log( NFP_DBG1, "_init: mod_install() failed" );
+ ddi_soft_state_fini(&state_head);
+ }
+
+ nfp_log( NFP_DBG2, "_init: leaving" );
+ return (error);
+}
+
+/*-------------------------*/
+/* _info */
+/*-------------------------*/
+
+int _info(struct modinfo *modinfop) {
+ nfp_log( NFP_DBG2, "_info: entered" );
+
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/*-------------------------*/
+/* _fini */
+/*-------------------------*/
+
+int _fini(void) {
+ int status;
+
+ nfp_log( NFP_DBG2, "_fini: entered" );
+
+ if ((status = mod_remove(&modlinkage)) != 0) {
+ nfp_log( NFP_DBG2, "_fini: mod_remove() failed." );
+ return (status);
+ }
+
+ ddi_soft_state_fini(&state_head);
+
+ nfp_log( NFP_DBG2, "_fini: leaving" );
+
+ return (status);
+}
+
diff --git a/usr/src/uts/common/io/nfp/i21285.c b/usr/src/uts/common/io/nfp/i21285.c
new file mode 100644
index 0000000000..f51a09188d
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21285.c
@@ -0,0 +1,310 @@
+/*
+
+i21285.c: nCipher PCI HSM intel/digital 21285 command driver
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+
+history
+
+09/10/2001 jsh Original
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_error.h"
+#include "nfp_hostif.h"
+#include "nfp_osif.h"
+#include "i21285.h"
+#include "nfp_cmd.h"
+#include "nfpci.h"
+
+/* create ------------------------------------------------------- */
+
+static nfp_err i21285_create( nfp_cdev *pdev ) {
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21285_create: entered");
+ pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */
+
+ nfp_log( NFP_DBG2, "i21285_create: enable doorbell");
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_create: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+ TO_LE32_IO( &tmp32, DOORBELL_ENABLE | POSTLIST_ENABLE);
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 );
+
+ return NFP_SUCCESS;
+}
+
+/* stop ------------------------------------------------------- */
+
+static nfp_err i21285_destroy( void * ctx ) {
+ nfp_cdev *pdev;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21285_destroy: entered");
+
+ pdev= (nfp_cdev *)ctx;
+ if(!pdev) {
+ nfp_log( NFP_DBG1, "i21285_destroy: NULL pdev");
+ return NFP_ENODEV;
+ }
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_destroy: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+ TO_LE32_IO( &tmp32, DOORBELL_DISABLE | POSTLIST_DISABLE );
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 );
+
+ return NFP_SUCCESS;
+}
+
+/* open ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21285_open( void * ctx ) {
+ nfp_log( NFP_DBG2, "i21285_open: entered");
+
+ return NFP_SUCCESS;
+}
+
+/* close ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21285_close( void * ctx ) {
+ nfp_log( NFP_DBG2, "i21285_close: entered");
+
+ return NFP_SUCCESS;
+}
+
+/* isr ------------------------------------------------------- */
+
+static nfp_err i21285_isr( void *ctx, int *handled ) {
+ nfp_cdev *pdev;
+ unsigned int doorbell;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG3, "i21285_isr: entered");
+
+ *handled= 0;
+ pdev= (nfp_cdev *)ctx;
+ if(!pdev) {
+ nfp_log( NFP_DBG1, "i21285_isr: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL);
+ doorbell= FROM_LE32_IO(&doorbell) & 0xffff;
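+  /*
+   * Each doorbell bit is acknowledged by writing it back to the
+   * doorbell register; keep servicing until no bits remain.  A read
+   * of 0xffff (all ones, e.g. from a device that has gone away) also
+   * ends the loop rather than being treated as pending work.
+   */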
+ while( doorbell && doorbell != 0xffff) {
+ *handled= 1;
+ /* service interrupts */
+ if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+ TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED);
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+ nfp_log(NFP_DBG2, "i21285_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+
+ nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+ }
+
+ if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) {
+ TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED );
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+ nfp_log(NFP_DBG2, "i21285_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 );
+ nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0);
+ }
+
+ if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED |
+ NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+ nfp_log( NFP_DBG1, "i21285_isr: unexpected interrupt %x", doorbell );
+ TO_LE32_IO( &tmp32, 0xffff & doorbell );
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+ }
+ doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL);
+ doorbell= FROM_LE32_IO(&doorbell) & 0xffff;
+ }
+  return NFP_SUCCESS;
+}
+
+/* write ------------------------------------------------------- */
+
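+/*
+ * Job submission handshake: copy the payload into the JOBS_WR_DATA
+ * window, write a control word and the length to JOBS_WR_CONTROL,
+ * read the length back to confirm the device saw it, and finally
+ * ring the doorbell with NFAST_INT_HOST_WRITE_REQUEST.
+ */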
+static nfp_err i21285_write( const char *block, int len, void *ctx ) {
+ nfp_cdev *cdev;
+ unsigned int hdr[2];
+ nfp_err ne;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21285_write: entered");
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+    nfp_log( NFP_DBG1, "i21285_write: NULL cdev");
+ return NFP_ENODEV;
+ }
+
+ nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ MEMBAR ]= %x\n", cdev->bar[ MEMBAR ]);
+ nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ IOBAR ]= %x\n", cdev->bar[ IOBAR ]);
+ if(!cdev->bar[ MEMBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_write: null BAR[%d]", MEMBAR );
+ return NFP_ENOMEM;
+ }
+ ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_user_to_dev failed");
+ return ne;
+ }
+ TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+ TO_LE32_MEM(&hdr[1], len);
+
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_write: nfp_copy_to_dev failed");
+ return ne;
+ }
+
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_dev failed");
+ return ne;
+ }
+
+ TO_LE32_MEM( &tmp32, len );
+ if ( hdr[0] != tmp32 ) {
+ nfp_log( NFP_DBG1, "i21285_write: length not written");
+ return NFP_EIO;
+ }
+
+ TO_LE32_IO( &tmp32, NFAST_INT_HOST_WRITE_REQUEST);
+
+ nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+ nfp_log( NFP_DBG2, "i21285_write: done");
+ return NFP_SUCCESS;
+}
+
+/* read ------------------------------------------------------- */
+
+static nfp_err i21285_read( char *block, int len, void *ctx, int *rcount) {
+ nfp_cdev *cdev;
+ nfp_err ne;
+ int count;
+
+ nfp_log( NFP_DBG2, "i21285_read: entered, len %d", len);
+ *rcount= 0;
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+    nfp_log( NFP_DBG1, "i21285_read: NULL cdev");
+ return NFP_ENODEV;
+ }
+
+ if(!cdev->bar[ MEMBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_read: null BAR[%d]", MEMBAR );
+ return NFP_ENOMEM;
+ }
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4);
+ if(ne) {
+ nfp_log( NFP_DBG1, "i21285_read: nfp_copy_from_dev failed.");
+ return ne;
+ }
+ count= FROM_LE32_MEM(&count);
+ if(count<0 || count>len) {
+ nfp_log( NFP_DBG1, "i21285_read: bad byte count (%d) from device", count);
+ return NFP_EIO;
+ }
+ ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count);
+ if( ne ) {
+ nfp_log( NFP_DBG1, "i21285_read: nfp_copy_to_user_from_dev failed.");
+ return ne;
+ }
+ nfp_log( NFP_DBG2, "i21285_read: done");
+ *rcount= count;
+ return NFP_SUCCESS;
+}
+
+/* chupdate ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21285_chupdate( char *data, int len, void *ctx ) {
+ nfp_log( NFP_DBG1, "i21285_chupdate: NYI");
+ return NFP_SUCCESS;
+}
+
+/* ensure reading -------------------------------------------------- */
+
+static nfp_err i21285_ensure_reading( unsigned int addr, int len, void *ctx ) {
+ nfp_cdev *cdev;
+ unsigned int hdr[2];
+ unsigned int tmp32;
+ nfp_err ne;
+
+ nfp_log( NFP_DBG2, "i21285_ensure_reading: entered");
+
+ if(addr) {
+ nfp_log( NFP_DBG2, "i21285_ensure_reading: bad addr");
+    return NFP_EINVAL;
+ }
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ if(!cdev->bar[ MEMBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: null BAR[%d]", MEMBAR );
+ return NFP_ENXIO;
+ }
+ nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+ nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
+ TO_LE32_MEM( &hdr[0], NFPCI_JOB_CONTROL);
+ TO_LE32_MEM( &hdr[1], len);
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, 8);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_to_dev failed");
+ return ne;
+ }
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_from_dev failed");
+ return ne;
+ }
+ TO_LE32_MEM( &tmp32, len );
+ if ( hdr[0] != tmp32 ) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: len not written");
+ return NFP_EIO;
+  }
+ TO_LE32_IO( &tmp32, NFAST_INT_HOST_READ_REQUEST );
+ nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+ return NFP_SUCCESS;
+}
+
+/* command device structure ------------------------------------- */
+
+
+const nfpcmd_dev i21285_cmddev = {
+ "nCipher Gen 1 PCI",
+ PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285,
+ PCI_VENDOR_ID_NCIPHER, PCI_DEVICE_ID_NFAST_GEN1,
+ { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE, 0, 0, 0 },
+ NFP_CMD_FLG_NEED_IOBUF,
+ i21285_create,
+ i21285_destroy,
+ i21285_open,
+ i21285_close,
+ i21285_isr,
+ i21285_write,
+ i21285_read,
+ i21285_chupdate,
+ i21285_ensure_reading,
+ 0, /* no debug */
+};
+
diff --git a/usr/src/uts/common/io/nfp/i21285.h b/usr/src/uts/common/io/nfp/i21285.h
new file mode 100644
index 0000000000..4ea1d853ec
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21285.h
@@ -0,0 +1,43 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#ifndef NFP_I21285_H
+#define NFP_I21285_H
+
+#ifndef PCI_VENDOR_ID_DEC
+#define PCI_VENDOR_ID_DEC 0x1011
+#endif
+#ifndef PCI_DEVICE_ID_DEC_21285
+#define PCI_DEVICE_ID_DEC_21285 0x1065
+#endif
+#ifndef PCI_VENDOR_ID_NCIPHER
+#define PCI_VENDOR_ID_NCIPHER 0x0100
+#endif
+
+#ifndef PCI_DEVICE_ID_NFAST_GEN1
+#define PCI_DEVICE_ID_NFAST_GEN1 0x0100
+#endif
+
+#define I21285_OFFSET_DOORBELL 0x60
+#define I21285_OFFSET_INTERRUPT_MASK 0x34
+
+#define DOORBELL_ENABLE 0x0
+#define DOORBELL_DISABLE 0x4
+
+#define POSTLIST_ENABLE 0x0
+#define POSTLIST_DISABLE 0x8
+
+#define IOBAR 1
+#define MEMBAR 2
+
+#define IOSIZE 0x80
+#define MEMSIZE 0x100000
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/i21555.c b/usr/src/uts/common/io/nfp/i21555.c
new file mode 100644
index 0000000000..82024dc800
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21555.c
@@ -0,0 +1,423 @@
+/*
+
+i21555.c: nCipher PCI HSM intel 21555 command driver
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+09/10/2001 jsh Original
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_error.h"
+#include "nfp_hostif.h"
+#include "nfp_osif.h"
+#include "i21555.h"
+#include "nfp_cmd.h"
+#include "nfpci.h"
+
+/* started ------------------------------------------------------
+ *
+ * Check that device is ready to talk, by checking that
+ * the i21555 has master enabled on its secondary interface
+ */
+
+static nfp_err i21555_started( nfp_cdev *pdev ) {
+ unsigned int tmp32;
+#ifdef CONFIGSPACE_DEBUG
+ unsigned int reg32[64];
+ int i;
+#endif
+ nfp_err ne;
+
+ nfp_log( NFP_DBG2, "i21555_started: entered");
+
+#ifdef CONFIGSPACE_DEBUG
+ /* Suck up all the registers */
+ for (i=0; i < 64; i++) {
+ ne = nfp_config_inl( pdev, i*4, &reg32[i] );
+ }
+
+ for (i=0; i < 16; i++) {
+ int j = i * 4;
+ nfp_log( NFP_DBG3, "i21555 config reg %2x: %08x %08x %08x %08x", j*4,
+ reg32[j], reg32[j+1], reg32[j+2], reg32[j+3]);
+ }
+#endif
+
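+  /*
+   * CFG_CMD_MASTER is the Bus Master enable bit of the standard PCI
+   * command register, read here from the 21555's secondary-side copy
+   * at config offset I21555_CFG_SEC_CMD_STATUS; the module enables it
+   * once it is ready to talk.
+   */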
+ ne = nfp_config_inl( pdev, I21555_CFG_SEC_CMD_STATUS, &tmp32 );
+ if (ne) {
+ /* succeed if PCI config reads are not implemented */
+ if (ne == NFP_EUNKNOWN)
+ return NFP_SUCCESS;
+ nfp_log( NFP_DBG1, "i21555_started: nfp_config_inl failed");
+ return ne;
+ }
+
+ tmp32= FROM_LE32_IO(&tmp32) & 0xffff;
+
+ if ( tmp32 & CFG_CMD_MASTER ) {
+ nfp_log( NFP_DBG3, "i21555_started: Yes %x", tmp32);
+ return NFP_SUCCESS;
+ } else {
+ nfp_log( NFP_DBG1, "i21555_started: device not started yet %x", tmp32);
+ return NFP_ESTARTING;
+ }
+}
+
+/* create ------------------------------------------------------- */
+
+static nfp_err i21555_create( nfp_cdev *pdev ) {
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21555_create: entered");
+ pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */
+
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_create: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+ nfp_log( NFP_DBG2, "i21555_create: enable doorbell");
+ TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_ENABLE );
+ nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 );
+ nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 );
+ return NFP_SUCCESS;
+}
+
+/* stop ------------------------------------------------------- */
+
+static nfp_err i21555_destroy( void * ctx ) {
+ nfp_cdev *pdev;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21555_destroy: entered");
+
+ pdev= (nfp_cdev *)ctx;
+ if(!pdev) {
+ nfp_log( NFP_DBG1, "i21555_destroy: NULL pdev");
+ return NFP_ENODEV;
+ }
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_destroy: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+ TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_DISABLE );
+ nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 );
+ nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 );
+
+ return NFP_SUCCESS;
+}
+
+/* open ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21555_open( void * ctx ) {
+
+ nfp_log( NFP_DBG2, "i21555_open: entered");
+
+ return NFP_SUCCESS;
+}
+
+/* close ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21555_close( void * ctx ) {
+ nfp_log( NFP_DBG2, "i21555_close: entered");
+
+ return NFP_SUCCESS;
+}
+
+/* isr ------------------------------------------------------- */
+
+static nfp_err i21555_isr( void *ctx, int *handled ) {
+ nfp_cdev *pdev;
+ nfp_err ne;
+ unsigned short doorbell;
+ unsigned short tmp16;
+
+ nfp_log( NFP_DBG3, "i21555_isr: entered");
+
+ *handled= 0;
+ pdev= (nfp_cdev *)ctx;
+ if(!pdev) {
+ nfp_log( NFP_DBG1, "i21555_isr: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ pdev->stats.isr++;
+
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_isr: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+
+ /* This interrupt may not be from our module, so check that it actually is
+ * us before handling it.
+ */
+ ne = i21555_started( pdev );
+ if (ne) {
+ if (ne != NFP_ESTARTING) {
+ nfp_log( NFP_DBG1, "i21555_isr: i21555_started failed");
+ }
+ return ne;
+ }
+
+ doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET);
+ doorbell= FROM_LE16_IO(&doorbell);
+ while( doorbell && doorbell != 0xffff) {
+ *handled= 1;
+ /* service interrupts */
+ if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+ pdev->stats.isr_write++;
+ TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED);
+ nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+
+ nfp_log( NFP_DBG2, "i21555_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+
+ nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+ }
+
+ if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) {
+ pdev->stats.isr_read++;
+ TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED);
+ nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+
+ nfp_log( NFP_DBG2, "i21555_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 );
+ nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0);
+ }
+
+ if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED |
+ NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+ TO_LE16_IO(&tmp16,doorbell);
+ nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+ nfp_log( NFP_DBG1, "i21555_isr: unexpected interrupt %x", doorbell );
+ }
+ doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET);
+ doorbell= FROM_LE16_IO(&doorbell);
+ }
+ nfp_log( NFP_DBG3, "i21555_isr: exiting");
+  return NFP_SUCCESS;
+}
+
+/* write ------------------------------------------------------- */
+
+static nfp_err i21555_write( const char *block, int len, void *ctx) {
+ nfp_cdev *cdev;
+ unsigned int hdr[2];
+ nfp_err ne;
+ unsigned short tmp16;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21555_write: entered");
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+ nfp_log( NFP_DBG1, "i21555_write: NULL cdev");
+ return NFP_ENODEV;
+ }
+
+ cdev->stats.write_fail++;
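+  /*
+   * write_fail is incremented up front and decremented again on
+   * success, so any early-return error path leaves a failure
+   * recorded; the read and ensure paths use the same idiom.
+   */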
+
+ if(!cdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_write: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+
+ ne = i21555_started( cdev );
+ if (ne) {
+ if (ne != NFP_ESTARTING) {
+ nfp_log( NFP_DBG1, "i21555_write: i21555_started failed");
+ }
+ return ne;
+ }
+
+ nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+ nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
+ nfp_log( NFP_DBG3, "i21555_write: block len %d", len );
+ ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_user_to_dev failed");
+ return ne;
+ }
+ TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+ TO_LE32_MEM(&hdr[1], len);
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_write: nfp_copy_to_dev failed");
+ return ne;
+ }
+
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_dev failed");
+ return ne;
+ }
+
+ TO_LE32_MEM(&tmp32, len);
+ if ( hdr[0] != tmp32 ) {
+ nfp_log( NFP_DBG1, "i21555_write: length not written");
+ return NFP_EIO;
+ }
+ TO_LE16_IO(&tmp16, NFAST_INT_HOST_WRITE_REQUEST >> 16);
+ nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16);
+
+ cdev->stats.write_fail--;
+ cdev->stats.write_block++;
+ cdev->stats.write_byte += len;
+
+ nfp_log( NFP_DBG2, "i21555_write: done");
+ return NFP_SUCCESS;
+}
+
+/* read ------------------------------------------------------- */
+
+static nfp_err i21555_read( char *block, int len, void *ctx, int *rcount) {
+ nfp_cdev *cdev;
+ nfp_err ne;
+ int count;
+
+ nfp_log( NFP_DBG2, "i21555_read: entered");
+ *rcount= 0;
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+    nfp_log( NFP_DBG1, "i21555_read: NULL cdev");
+ return NFP_ENODEV;
+ }
+
+ cdev->stats.read_fail++;
+
+ if(!cdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_read: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_read: nfp_copy_from_dev failed.");
+ return ne;
+ }
+ count= FROM_LE32_MEM(&count);
+ if(count<0 || count>len) {
+ nfp_log( NFP_DBG1, "i21555_read: bad byte count (%d) from device", count);
+ return NFP_EIO;
+ }
+ ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count);
+ if (ne) {
+    nfp_log( NFP_DBG1, "i21555_read: nfp_copy_to_user_from_dev failed.");
+ return ne;
+ }
+ nfp_log( NFP_DBG2, "i21555_read: done");
+ *rcount= count;
+ cdev->stats.read_fail--;
+ cdev->stats.read_block++;
+  cdev->stats.read_byte += count;	/* bytes actually returned */
+ return NFP_SUCCESS;
+}
+
+/* chupdate ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21555_chupdate( char *data, int len, void *ctx ) {
+ nfp_log( NFP_DBG1, "i21555_chupdate: NYI");
+ return NFP_SUCCESS;
+}
+
+/* ensure reading -------------------------------------------------- */
+
+static nfp_err i21555_ensure_reading( unsigned int addr, int len, void *ctx ) {
+ nfp_cdev *cdev;
+ unsigned int hdr[3];
+ unsigned short tmp16;
+ unsigned int tmp32;
+ nfp_err ne;
+ int hdr_len;
+
+ nfp_log( NFP_DBG2, "i21555_ensure_reading: entered");
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: NULL cdev");
+ return NFP_ENODEV;
+ }
+
+ cdev->stats.ensure_fail++;
+
+ if(!cdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+
+ ne = i21555_started( cdev );
+ if (ne) {
+ if (ne != NFP_ESTARTING) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: i21555_started failed");
+ }
+ return ne;
+ }
+
+ nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+ nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
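+  /*
+   * A non-zero addr selects the PCI push variant: the control block
+   * carries the host DMA address and the module delivers the reply
+   * there itself, instead of waiting for the host to pull it with a
+   * read.
+   */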
+ if(addr) {
+ nfp_log( NFP_DBG3, "i21555_ensure_reading: new format, addr %x", addr);
+ TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL_PCI_PUSH);
+ TO_LE32_MEM(&hdr[1], len);
+ TO_LE32_MEM(&hdr[2], addr);
+ hdr_len= 12;
+ } else {
+ TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+ TO_LE32_MEM(&hdr[1], len);
+ hdr_len= 8;
+ }
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, hdr_len);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_to_dev failed");
+ return ne;
+ }
+
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_from_dev failed");
+ return ne;
+ }
+
+ TO_LE32_MEM(&tmp32, len);
+
+ if ( hdr[0] != tmp32 ) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: len not written");
+ return NFP_EIO;
+ }
+ TO_LE16_IO( &tmp16, NFAST_INT_HOST_READ_REQUEST >> 16);
+ nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16);
+
+ cdev->stats.ensure_fail--;
+ cdev->stats.ensure++;
+
+ return NFP_SUCCESS;
+}
+
+/* command device structure ------------------------------------- */
+
+const nfpcmd_dev i21555_cmddev = {
+ "nCipher Gen 2 PCI",
+ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_21555,
+ PCI_VENDOR_ID_NCIPHER, PCI_SUBSYSTEM_ID_NFAST_REV1,
+ { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE_JOBS, 0, 0, 0 },
+ NFP_CMD_FLG_NEED_IOBUF,
+ i21555_create,
+ i21555_destroy,
+ i21555_open,
+ i21555_close,
+ i21555_isr,
+ i21555_write,
+ i21555_read,
+ i21555_chupdate,
+ i21555_ensure_reading,
+ i21555_debug,
+};
diff --git a/usr/src/uts/common/io/nfp/i21555.h b/usr/src/uts/common/io/nfp/i21555.h
new file mode 100644
index 0000000000..d8f3965938
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21555.h
@@ -0,0 +1,51 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#ifndef I21555_H
+#define I21555_H
+
+#ifndef PCI_VENDOR_ID_INTEL
+#define PCI_VENDOR_ID_INTEL 0x8086
+#endif
+
+#ifndef PCI_DEVICE_ID_INTEL_21555
+#define PCI_DEVICE_ID_INTEL_21555 0xb555
+#endif
+
+#ifndef PCI_VENDOR_ID_NCIPHER
+#define PCI_VENDOR_ID_NCIPHER 0x0100
+#endif
+
+#ifndef PCI_SUBSYSTEM_ID_NFAST_REV1
+#define PCI_SUBSYSTEM_ID_NFAST_REV1 0x0100
+#endif
+
+#define I21555_OFFSET_DOORBELL_PRI_SET 0x9C
+#define I21555_OFFSET_DOORBELL_SEC_SET 0x9E
+#define I21555_OFFSET_DOORBELL_PRI_CLEAR 0x98
+
+#define I21555_OFFSET_DOORBELL_PRI_SET_MASK 0xA4
+#define I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK 0xA0
+
+#define I21555_DOORBELL_PRI_ENABLE 0x0000
+#define I21555_DOORBELL_PRI_DISABLE 0xFFFF
+
+#define I21555_CFG_SEC_CMD_STATUS 0x44
+
+#define CFG_CMD_MASTER 0x0004
+
+#define IOBAR 1
+#define MEMBAR 2
+
+#define IOSIZE 0x100
+
+extern nfp_err i21555_debug( int cmd, void *ctx );
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/i21555d.c b/usr/src/uts/common/io/nfp/i21555d.c
new file mode 100644
index 0000000000..183ace8275
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21555d.c
@@ -0,0 +1,28 @@
+/*
+
+i21555d.c: nCipher PCI HSM intel 21555 debug ioctl
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+
+history
+
+15/05/2002 jsh Original, does nothing
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_error.h"
+#include "nfp_osif.h"
+#include "i21555.h"
+
+/* ARGSUSED */
+nfp_err i21555_debug( int cmd, void *ctx) {
+ nfp_log( NFP_DBG1, "i21555_debug: entered");
+
+ return NFP_EUNKNOWN;
+}
diff --git a/usr/src/uts/common/io/nfp/nfdev-common.h b/usr/src/uts/common/io/nfp/nfdev-common.h
new file mode 100644
index 0000000000..8a97bf2c63
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfdev-common.h
@@ -0,0 +1,141 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+/** \file nfdev-common.h
+ *
+ * \brief nFast device driver (not generic SCSI) ioctl struct definition file
+ * include NFDEV-$(system) for ioctl number definitions
+ *
+ * 1998.07.13 jsh Started
+ *
+ *
+ */
+
+#ifndef NFDEV_COMMON_H
+#define NFDEV_COMMON_H
+
+/**
+ * Result of the ENQUIRY ioctl.
+ */
+typedef struct nfdev_enquiry_str {
+ unsigned int busno; /**< Which bus is the PCI device on. */
+ unsigned char slotno; /**< Which slot is the PCI device in. */
+ unsigned char reserved[3]; /**< for consistent struct alignment */
+} nfdev_enquiry_str;
+
+/**
+ * Result of the STATS ioctl.
+ */
+typedef struct nfdev_stats_str {
+ unsigned long isr; /**< Count interrupts. */
+ unsigned long isr_read; /**< Count read interrupts. */
+ unsigned long isr_write; /**< Count write interrupts. */
+ unsigned long write_fail; /**< Count write failures. */
+ unsigned long write_block; /**< Count blocks written. */
+ unsigned long write_byte; /**< Count bytes written. */
+ unsigned long read_fail; /**< Count read failures. */
+ unsigned long read_block; /**< Count blocks read. */
+ unsigned long read_byte; /**< Count bytes read. */
+ unsigned long ensure_fail; /**< Count read request failures. */
+ unsigned long ensure; /**< Count read requests. */
+} nfdev_stats_str;
+
+/**
+ * Input to the CONTROL ioctl.
+ */
+typedef struct nfdev_control_str {
+ unsigned control; /**< Control flags. */
+} nfdev_control_str;
+
+/** Control bit indicating host supports MOI control */
+#define NFDEV_CONTROL_HOST_MOI 0x0001
+
+/** Index of control bits indicating desired mode
+ *
+ * Desired mode follows the M_ModuleMode enumeration.
+ */
+#define NFDEV_CONTROL_MODE_SHIFT 1
+
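+/*
+ * For example (an illustrative sketch; `m' is a hypothetical mode value),
+ * a control word requesting host MOI support in mode m would be built as
+ *
+ *   ctrl.control = NFDEV_CONTROL_HOST_MOI | (m << NFDEV_CONTROL_MODE_SHIFT);
+ */
+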
+/** Detect a backwards-compatible control value
+ *
+ * Returns true if the requested control value "makes no difference",
+ * i.e. the failure of an attempt to set it is therefore uninteresting.
+ */
+#define NFDEV_CONTROL_HARMLESS(c) ((c) <= 1)
+
+/**
+ * Result of the STATUS ioctl.
+ */
+typedef struct nfdev_status_str {
+ unsigned status; /**< Status flags. */
+ char error[8]; /**< Error string. */
+} nfdev_status_str;
+
+/** Monitor firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_MONITOR_MOI 0x0001
+
+/** Application firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_APPLICATION_MOI 0x0002
+
+/** Application firmware running and supports error reporting */
+#define NFDEV_STATUS_APPLICATION_RUNNING 0x0004
+
+/** HSM failed
+ *
+ * Consult error[] for additional information.
+ */
+#define NFDEV_STATUS_FAILED 0x0008
+
+/** Standard PCI interface. */
+#define NFDEV_IF_STANDARD 0x01
+
+/** PCI interface with results pushed from device
+ * via DMA.
+ */
+#define NFDEV_IF_PCI_PUSH 0x02
+
+/* platform independent base ioctl numbers */
+
+/** Enquiry ioctl.
+ * \return nfdev_enquiry_str describing the attached device. */
+#define NFDEV_IOCTL_NUM_ENQUIRY 0x01
+/** Channel Update ioctl.
+ * \deprecated */
+#define NFDEV_IOCTL_NUM_CHUPDATE 0x02
+/** Ensure Reading ioctl.
+ * Signal a read request to the device.
+ * \param (unsigned int) Length of data to be read.
+ */
+#define NFDEV_IOCTL_NUM_ENSUREREADING 0x03
+/** Device Count ioctl.
+ * Not implemented on all platforms.
+ * \return (int) the number of attached devices. */
+#define NFDEV_IOCTL_NUM_DEVCOUNT 0x04
+/** Internal Debug ioctl.
+ * Not implemented in release drivers. */
+#define NFDEV_IOCTL_NUM_DEBUG 0x05
+/** PCI Interface Version ioctl.
+ * \param (int) Maximum PCI interface version
+ * supported by the user of the device. */
+#define NFDEV_IOCTL_NUM_PCI_IFVERS 0x06
+/** Statistics ioctl.
+ * \return nfdev_stats_str containing the device statistics. */
+#define NFDEV_IOCTL_NUM_STATS 0x07
+
+/** Module control ioctl
+ * \param (nfdev_control_str) Value to write to HSM control register
+ */
+#define NFDEV_IOCTL_NUM_CONTROL 0x08
+
+/** Module state ioctl
+ * \return (nfdev_status_str) Values read from HSM status/error registers
+ */
+#define NFDEV_IOCTL_NUM_STATUS 0x09
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfdev-solaris.h b/usr/src/uts/common/io/nfp/nfdev-solaris.h
new file mode 100644
index 0000000000..923b902e46
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfdev-solaris.h
@@ -0,0 +1,37 @@
+/*
+
+nfdev-solaris.h: nFast solaris specific device ioctl interface.
+
+(C) Copyright nCipher Corporation Ltd 1998-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+14/07/1998 jsh Original
+
+*/
+
+#ifndef NFDEV_SOLARIS_H
+#define NFDEV_SOLARIS_H
+
+#include "nfdev-common.h"
+
+#define NFDEV_IOCTL_TYPE ('n'<<8)
+
+#define NFDEV_IOCTL_ENQUIRY ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_ENQUIRY )
+#define NFDEV_IOCTL_ENSUREREADING ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_ENSUREREADING )
+#define NFDEV_IOCTL_DEVCOUNT ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_DEVCOUNT )
+#define NFDEV_IOCTL_DEBUG ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_DEBUG )
+#define NFDEV_IOCTL_PCI_IFVERS ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_PCI_IFVERS )
+#define NFDEV_IOCTL_STATS ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_STATS )
+
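+/*
+ * Illustrative userland usage (a sketch: the device path is hypothetical
+ * and error handling is elided):
+ *
+ *   nfdev_stats_str stats;
+ *   int fd = open( "/dev/nfp0", O_RDWR );
+ *   if ( ioctl( fd, NFDEV_IOCTL_STATS, &stats ) == 0 )
+ *     printf( "%lu blocks read\n", stats.read_block );
+ */
+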
+#endif /* NFDEV_SOLARIS_H */
diff --git a/usr/src/uts/common/io/nfp/nfp.h b/usr/src/uts/common/io/nfp/nfp.h
new file mode 100644
index 0000000000..9704f04fbc
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp.h
@@ -0,0 +1,113 @@
+/*
+
+nfp.h: nFast PCI driver for Solaris 2.5, 2.6 and 2.7
+
+(C) Copyright nCipher Corporation Ltd 2001-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+06/05/1998 jsh Original solaris 2.6
+21/05/1999 jsh added support for solaris 2.5
+10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit)
+16/10/2001 jsh moved from nfast to new structure in nfdrv
+
+*/
+
+#ifndef NFP_H
+#define NFP_H
+
+#ifndef _KERNEL
+#error Hello? this is a driver, please compile with -D_KERNEL
+#endif
+
+#if ( CH_KERNELVER < 260 )
+typedef int ioctlptr_t;
+typedef unsigned short uint16_t;
+#define DDI_GET32 ddi_getl
+#define DDI_PUT32 ddi_putl
+#define DDI_GET16 ddi_getw
+#define DDI_PUT16 ddi_putw
+#define DDI_REP_GET8 ddi_rep_getb
+#define DDI_REP_PUT8 ddi_rep_putb
+#define DDI_REP_GET32 ddi_rep_getl
+#define DDI_REP_PUT32 ddi_rep_putl
+#define PCI_CONFIG_GET16 pci_config_getw
+#else /* ( CH_KERNELVER >= 260 ) */
+typedef intptr_t ioctlptr_t;
+#define DDI_GET32 ddi_get32
+#define DDI_PUT32 ddi_put32
+#define DDI_GET16 ddi_get16
+#define DDI_PUT16 ddi_put16
+#define DDI_REP_GET8 ddi_rep_get8
+#define DDI_REP_PUT8 ddi_rep_put8
+#define DDI_REP_GET32 ddi_rep_get32
+#define DDI_REP_PUT32 ddi_rep_put32
+#define PCI_CONFIG_GET16 pci_config_get16
+#endif
+
+#if ( CH_KERNELVER < 270 )
+typedef int nfp_timeout_t;
+#define EXTRA_CB_FLAGS 0
+#define VSXPRINTF(s, n, format, ap) vsprintf (s, format, ap)
+#else /* ( CH_KERNELVER >= 270 ) */
+typedef timeout_id_t nfp_timeout_t;
+#define EXTRA_CB_FLAGS D_64BIT
+#define VSXPRINTF(s, n, format, ap) vsnprintf(s, n, format, ap)
+#endif
+
+typedef struct nfp_dev {
+ int rd_ok;
+ int wr_ok;
+
+ int ifvers;
+
+ /* for PCI push read interface */
+ unsigned char *read_buf;
+ ddi_dma_handle_t read_dma_handle;
+ ddi_dma_cookie_t read_dma_cookie;
+
+ ddi_acc_handle_t acchandle;
+
+ int rd_dma_ok;
+
+ nfp_timeout_t wrtimeout;
+ nfp_timeout_t rdtimeout;
+
+ struct buf *wr_bp;
+ int wr_ready;
+ int rd_ready;
+ int rd_pending;
+ int rd_outstanding;
+ kcondvar_t rd_cv;
+
+ struct pollhead pollhead;
+ dev_info_t *dip;
+
+ ddi_iblock_cookie_t high_iblock_cookie; /* for mutex */
+ ddi_iblock_cookie_t low_iblock_cookie; /* for mutex */
+ kmutex_t high_mutex;
+ kmutex_t low_mutex;
+ int high_intr;
+ ddi_softintr_t soft_int_id;
+ int high_read;
+ int high_write;
+
+ ddi_iblock_cookie_t iblock_cookie; /* for mutex */
+ kmutex_t isr_mutex;
+
+ kmutex_t busy_mutex;
+ int busy;
+
+ ddi_acc_handle_t conf_handle;
+
+ nfp_cdev common;
+ const nfpcmd_dev *cmddev;
+} nfp_dev;
+
+extern struct nfp_dev *nfp_dev_list[];
+
+#endif /* NFP_H */
diff --git a/usr/src/uts/common/io/nfp/nfp_cmd.h b/usr/src/uts/common/io/nfp/nfp_cmd.h
new file mode 100644
index 0000000000..db8af0b2f9
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_cmd.h
@@ -0,0 +1,68 @@
+/*
+
+nfp_cmd.h: nCipher PCI HSM command driver declarations
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+10/10/2001 jsh Original
+
+*/
+
+#ifndef NFPCMD_H
+#define NFPCMD_H
+
+#include "nfp_hostif.h"
+#include "nfp_error.h"
+
+/* read and write called with userspace buffer */
+
+typedef struct nfpcmd_dev {
+ const char *name;
+ unsigned short vendorid, deviceid,
+ sub_vendorid, sub_deviceid;
+ unsigned int bar_sizes[6]; /* includes IO bit */
+ unsigned int flags;
+ nfp_err (*create)(struct nfp_cdev *pdev);
+ nfp_err (*destroy)(void * ctx);
+ nfp_err (*open)(void * ctx);
+ nfp_err (*close)(void * ctx);
+ nfp_err (*isr)(void *ctx, int *handled);
+ nfp_err (*write_block)( const char *ublock, int len, void *ctx );
+ nfp_err (*read_block)( char *ublock, int len, void *ctx, int *rcount);
+ nfp_err (*channel_update)( char *data, int len, void *ctx);
+ nfp_err (*ensure_reading)( unsigned int addr, int len, void *ctx );
+ nfp_err (*debug)( int cmd, void *ctx);
+} nfpcmd_dev;
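+
+/*
+ * A concrete instance of this operations vector is i21555_cmddev in
+ * i21555.c, which fills in the PCI IDs, BAR sizes and the i21555_*
+ * entry points.
+ */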
+
+#define NFP_CMD_FLG_NEED_IOBUF 0x1
+
+/* list of all supported drivers ---------------------------------------- */
+
+extern const nfpcmd_dev *nfp_drvlist[];
+
+extern const nfpcmd_dev i21285_cmddev;
+extern const nfpcmd_dev i21555_cmddev;
+extern const nfpcmd_dev bcm5820_cmddev;
+
+#ifndef PCI_BASE_ADDRESS_SPACE_IO
+#define PCI_BASE_ADDRESS_SPACE_IO 0x1
+#endif
+
+#define NFP_MAXDEV 16
+
+
+#define NFP_MEMBAR_MASK ~0xf
+#define NFP_IOBAR_MASK ~0x3
+/*
+ This masks off the bottom bits of the PCI_CSR_BAR which signify that the
+ BAR is an IO BAR rather than a MEM BAR
+*/
+
+#endif
+
diff --git a/usr/src/uts/common/io/nfp/nfp_common.h b/usr/src/uts/common/io/nfp/nfp_common.h
new file mode 100644
index 0000000000..d1d2100fea
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_common.h
@@ -0,0 +1,68 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#ifndef NFP_COMMON_H
+#define NFP_COMMON_H
+
+#include <sys/types.h>
+#include <sys/conf.h>
+
+typedef uint32_t UINT32;
+typedef uint8_t BYTE;
+
+#define DEFINE_NFPCI_PACKED_STRUCTS
+#include "nfpci.h"
+#include "nfdev-solaris.h"
+
+typedef int oserr_t;
+
+#if CH_BIGENDIAN
+
+/* Big Endian Sparc */
+
+#define SWP32(x) \
+( (((unsigned int)(x)>>24)&0xff) | (((unsigned int)(x)>>8)&0xff00) | (((unsigned int)(x)<<8)&0xff0000) | (((unsigned int)(x)<<24)&0xff000000) )
+
+#define SWP16(x) ( (((x)>>8)&0xff) | (((x)<<8)&0xff00) )
+
+#define FROM_LE32_IO(x) SWP32(*x)
+#define TO_LE32_IO(x,y) *x=SWP32(y)
+
+#define FROM_LE32_MEM(x) SWP32(*x)
+#define TO_LE32_MEM(x,y) *x=SWP32(y)
+
+#define FROM_LE16_IO(x) SWP16(*x)
+#define TO_LE16_IO(x,y) *x=SWP16(y)
+
+#else
+
+/* Little Endian x86 */
+
+#define FROM_LE32_IO(x) (*x)
+#define TO_LE32_IO(x,y) (*x=y)
+
+#define FROM_LE32_MEM(x) (*x)
+#define TO_LE32_MEM(x,y) (*x=y)
+
+#define FROM_LE16_IO(x) (*x)
+#define TO_LE16_IO(x,y) (*x=y)
+
+#endif /* !CH_BIGENDIAN */
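+
+/*
+ * Worked example (illustrative): on a big-endian host SWP32(0x12345678)
+ * yields 0x78563412, so TO_LE32_MEM()/TO_LE32_IO() always store values
+ * little-endian for the device, while on little-endian x86 they reduce
+ * to plain assignments.
+ */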
+
+#include <sys/types.h>
+
+#if CH_KERNELVER == 260
+#define nfp_get_lbolt( lbolt, err ) err= drv_getparm( LBOLT, lbolt )
+#else
+#define nfp_get_lbolt( lbolt, err ) { *lbolt= ddi_get_lbolt(); err= 0; }
+#endif
+
+#endif
+
diff --git a/usr/src/uts/common/io/nfp/nfp_error.h b/usr/src/uts/common/io/nfp/nfp_error.h
new file mode 100644
index 0000000000..d64cb78fd4
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_error.h
@@ -0,0 +1,48 @@
+/*
+
+nfp_error.h: nCipher PCI HSM error handling
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+05/12/2001 jsh Original
+
+*/
+
+#ifndef NFP_ERROR_H
+#define NFP_ERROR_H
+
+#include "nfp_common.h"
+
+#define NFP_SUCCESS 0x0
+#define NFP_EFAULT 0x1
+#define NFP_ENOMEM 0x2
+#define NFP_EINVAL 0x3
+#define NFP_EIO 0x4
+#define NFP_ENXIO 0x5
+#define NFP_ENODEV 0x6
+#define NFP_EINTR 0x7
+#define NFP_ESTARTING 0x8
+#define NFP_EAGAIN 0x9
+#define NFP_EUNKNOWN 0x100
+
+typedef int nfp_err;
+
+extern oserr_t nfp_oserr( nfp_err nerr );
+extern nfp_err nfp_error( oserr_t oerr );
+
+#define nfr( x) \
+ return nfp_error((x))
+
+#define nfer(x, fn, msg) \
+ { oserr_t err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return nfp_error(err); } }
+
+#define er(x, fn, msg ) \
+{ nfp_err err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return err; } }
+
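+/*
+ * Illustrative use of the helpers above (the caller name and message are
+ * hypothetical):
+ *
+ *   er( nfp_copy_from_dev( cdev, bar, offset, buf, len ),
+ *       my_read, "device copy failed" );
+ *
+ * which logs "my_read: device copy failed" at NFP_DBG1 and returns the
+ * nfp_err to the caller on failure.
+ */
+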
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfp_hostif.h b/usr/src/uts/common/io/nfp/nfp_hostif.h
new file mode 100644
index 0000000000..3e7d8187e5
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_hostif.h
@@ -0,0 +1,54 @@
+/*
+
+nfp_hostif.h: nCipher PCI HSM host interface declarations
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+10/10/2001 jsh Original
+
+*/
+
+#ifndef NFP_HOSTIF_H
+#define NFP_HOSTIF_H
+
+#include "nfdev-common.h"
+
+struct nfp_dev;
+
+/* common device structure */
+
+typedef struct nfp_cdev {
+ unsigned char *bar[6];
+ void *extra[6];
+
+ int busno;
+ int slotno;
+
+ void *cmdctx;
+
+ char *iobuf;
+
+ struct nfp_dev* dev;
+
+ struct nfdev_stats_str stats;
+
+} nfp_cdev;
+
+/* callbacks from command drivers -------------------------------------- */
+
+void nfp_read_complete( struct nfp_dev *pdev, int ok);
+void nfp_write_complete( struct nfp_dev *pdev, int ok);
+
+#define NFP_READ_MAX (8 * 1024)
+#define NFP_READBUF_SIZE (NFP_READ_MAX + 8)
+#define NFP_TIMEOUT_SEC 10
+
+#define NFP_DRVNAME "nCipher nFast PCI driver"
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfp_ifvers.c b/usr/src/uts/common/io/nfp/nfp_ifvers.c
new file mode 100644
index 0000000000..807b4f24c5
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_ifvers.c
@@ -0,0 +1,51 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+/*
+ * nfp_ifvers.c - common PCI interface versioning
+ *
+ * uses:
+ *
+ * int pdev->ifvers
+ * device interface version
+ *
+ * int nfp_ifvers
+ * interface version limit
+ *
+ * int nfp_alloc_pci_push( nfp_dev *pdev )
+ * allocates resources needed for PCI Push,
+ * if not already allocated, and returns True if successful
+ *
+ * void nfp_free_pci_push( nfp_dev *pdev )
+ * frees any resources allocated to PCI Push
+ */
+
+void nfp_set_ifvers( nfp_dev *pdev, int vers ) {
+ if( nfp_ifvers != 0 && vers > nfp_ifvers ) {
+ nfp_log( NFP_DBG2,
+ "nfp_set_ifvers: can't set ifvers %d"
+ " as nfp_ifvers wants max ifvers %d",
+ vers, nfp_ifvers);
+ return;
+ }
+ if( vers >= NFDEV_IF_PCI_PUSH ) {
+ if(!nfp_alloc_pci_push(pdev)) {
+ nfp_log( NFP_DBG1,
+ "nfp_set_ifvers: can't set ifvers %d"
+ " as resources not available",
+ vers);
+ return;
+ }
+ } else {
+ nfp_free_pci_push(pdev);
+ }
+ pdev->ifvers= vers;
+ nfp_log( NFP_DBG3, "nfp_set_ifvers: setting ifvers %d", vers);
+}
diff --git a/usr/src/uts/common/io/nfp/nfp_osif.h b/usr/src/uts/common/io/nfp/nfp_osif.h
new file mode 100644
index 0000000000..17ffe469ce
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_osif.h
@@ -0,0 +1,105 @@
+/*
+
+nfp_osif.h: nCipher PCI HSM OS interface declarations
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+10/10/2001 jsh Original
+
+*/
+
+#ifndef NFP_OSIF_H
+#define NFP_OSIF_H
+
+#include "nfp_hostif.h"
+#include "nfp_error.h"
+
+/* general typedefs ----------------------------------------------- */
+
+typedef volatile unsigned int reg32;
+typedef volatile unsigned short reg16;
+typedef volatile unsigned char reg8;
+
+/* semaphores, mutexes and events --------------------------------- */
+
+#if 0
+extern nfp_err nfp_sema_init( nfp_sema *sema, int initial);
+extern void nfp_sema_destroy( nfp_sema *sema );
+extern void nfp_sema_post( nfp_sema *sema );
+extern void nfp_sema_wait( nfp_sema *sema );
+extern int nfp_sema_wait_sig( nfp_sema *sema );
+
+extern nfp_err nfp_mutex_init( nfp_mutex *mutex );
+extern void nfp_mutex_destroy( nfp_mutex *mutex );
+extern void nfp_mutex_enter( nfp_mutex *mutex );
+extern void nfp_mutex_exit( nfp_mutex *mutex );
+
+extern nfp_err nfp_event_init( nfp_event *event );
+extern void nfp_event_destroy( nfp_event *event );
+extern void nfp_event_set( nfp_event *event );
+extern void nfp_event_clear( nfp_event *event );
+extern void nfp_event_wait( nfp_event *event );
+extern void nfp_event_wait_sig( nfp_event *event );
+
+#endif
+
+/* timeouts ------------------------------------------------------ */
+
+extern void nfp_sleep( int ms );
+
+/* memory handling ----------------------------------------------- */
+
+#define KMALLOC_DMA 0
+#define KMALLOC_CACHED 1
+
+extern void *nfp_kmalloc( int size, int flags );
+extern void *nfp_krealloc( void *ptr, int size, int flags );
+extern void nfp_kfree( void * );
+
+/* config space access ------------------------------------------------ */
+
+/* return Little Endian 32 bit config register */
+extern nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res );
+
+/* io space access ------------------------------------------------ */
+
+extern unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset );
+extern unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset );
+extern void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data );
+extern void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data );
+
+/* user and device memory space access ---------------------------- */
+
+/* NB these 2 functions are not guaranteed to be re-entrant for a given device */
+extern nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len);
+extern nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len);
+
+extern nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len );
+extern nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len );
+
+extern nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len );
+extern nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len);
+
+/* debug ------------------------------------------------------------ */
+
+#define NFP_DBG1 1
+#define NFP_DBGE NFP_DBG1
+#define NFP_DBG2 2
+#define NFP_DBG3 3
+#define NFP_DBG4 4
+
+#ifdef STRANGE_VARARGS
+extern void nfp_log();
+#else
+extern void nfp_log( int severity, const char *format, ...);
+#endif
+
+extern int nfp_debug;
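+
+/*
+ * A message is emitted when nfp_debug is at least the given level; an
+ * illustrative call (the function name and format arguments are
+ * hypothetical):
+ *
+ *   nfp_log( NFP_DBG2, "mydev_open: entered, flags %x", flags );
+ */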
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfpci.h b/usr/src/uts/common/io/nfp/nfpci.h
new file mode 100644
index 0000000000..793f5995e6
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfpci.h
@@ -0,0 +1,171 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+/*
+*
+* NFPCI.H - nFast PCI interface definition file
+*
+*
+*
+* 1998.06.09 IH Started
+*
+* The interface presented by nFast PCI devices consists of:
+*
+* A region of shared RAM used for data transfer & control information
+* A doorbell interrupt register, so both sides can give each other interrupts
+* A number of DMA channels for transferring data
+*/
+
+#ifndef NFPCI_H
+#define NFPCI_H
+
+/* Sizes of some regions */
+#define NFPCI_RAM_MINSIZE 0x00100000
+/* This is the minimum size of shared RAM. In future it may be possible to
+ negotiate larger sizes of shared RAM or auto-detect how big it is */
+#define NFPCI_RAM_MINSIZE_JOBS 0x00020000 /* standard jobs only */
+#define NFPCI_RAM_MINSIZE_KERN 0x00040000 /* standard and kernel jobs */
+
+/* Offsets within shared memory space.
+ The following main regions are:
+ jobs input area
+ jobs output area
+ kernel jobs input area
+ kernel jobs output area
+*/
+
+#define NFPCI_OFFSET_JOBS 0x00000000
+#define NFPCI_OFFSET_JOBS_WR 0x00000000
+#define NFPCI_OFFSET_JOBS_RD 0x00010000
+#define NFPCI_OFFSET_KERN 0x00020000
+#define NFPCI_OFFSET_KERN_WR 0x00020000
+#define NFPCI_OFFSET_KERN_RD 0x00030000
+
+/* Interrupts, defined by bit position in doorbell register */
+
+/* Interrupts from device to host */
+#define NFAST_INT_DEVICE_WRITE_OK 0x00000001
+#define NFAST_INT_DEVICE_WRITE_FAILED 0x00000002
+#define NFAST_INT_DEVICE_READ_OK 0x00000004
+#define NFAST_INT_DEVICE_READ_FAILED 0x00000008
+#define NFAST_INT_DEVICE_KERN_WRITE_OK 0x00000010
+#define NFAST_INT_DEVICE_KERN_WRITE_FAILED 0x00000020
+#define NFAST_INT_DEVICE_KERN_READ_OK 0x00000040
+#define NFAST_INT_DEVICE_KERN_READ_FAILED 0x00000080
+
+/* Interrupts from host to device */
+#define NFAST_INT_HOST_WRITE_REQUEST 0x00010000
+#define NFAST_INT_HOST_READ_REQUEST 0x00020000
+#define NFAST_INT_HOST_DEBUG 0x00040000
+#define NFAST_INT_HOST_KERN_WRITE_REQUEST 0x00080000
+#define NFAST_INT_HOST_KERN_READ_REQUEST 0x00100000
+
+/* Ordinary job submission ------------------------ */
+
+/* The NFPCI_OFFSET_JOBS_WR and NFPCI_OFFSET_JOBS_RD regions are defined
+ by the following (byte) address offsets... */
+
+#define NFPCI_OFFSET_CONTROL 0x0
+#define NFPCI_OFFSET_LENGTH 0x4
+#define NFPCI_OFFSET_DATA 0x8
+#define NFPCI_OFFSET_PUSH_ADDR 0x8
+
+#define NFPCI_JOBS_WR_CONTROL (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_CONTROL)
+#define NFPCI_JOBS_WR_LENGTH (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_LENGTH)
+#define NFPCI_JOBS_WR_DATA (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_DATA)
+#define NFPCI_MAX_JOBS_WR_LEN (0x0000FFF8)
+
+#define NFPCI_JOBS_RD_CONTROL (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_CONTROL)
+#define NFPCI_JOBS_RD_LENGTH (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_LENGTH)
+#define NFPCI_JOBS_RD_DATA (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_DATA)
+/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */
+#define NFPCI_JOBS_RD_PUSH_ADDR (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_PUSH_ADDR)
+#define NFPCI_MAX_JOBS_RD_LEN (0x0000FFF8)
+
+/* Kernel interface job submission ---------------- */
+
+#define NFPCI_KERN_WR_CONTROL (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_CONTROL)
+#define NFPCI_KERN_WR_LENGTH (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_LENGTH)
+#define NFPCI_KERN_WR_DATA (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_DATA)
+#define NFPCI_MAX_KERN_WR_LEN (0x0000FFF8)
+
+#define NFPCI_KERN_RD_CONTROL (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_CONTROL)
+#define NFPCI_KERN_RD_LENGTH (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_LENGTH)
+#define NFPCI_KERN_RD_DATA (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_DATA)
+/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */
+#define NFPCI_KERN_RD_ADDR (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_PUSH_ADDR)
+#define NFPCI_MAX_KERN_RD_LEN (0x0000FFF8)
+
+#ifdef DEFINE_NFPCI_PACKED_STRUCTS
+typedef struct
+{
+ UINT32 controlword;
+ UINT32 length; /* length of data to follow */
+ union {
+ BYTE data[1];
+ UINT32 addr;
+ } uu;
+}
+ NFPCI_JOBS_BLOCK;
+#endif
+
+
+#define NFPCI_JOB_CONTROL 0x00000001
+#define NFPCI_JOB_CONTROL_PCI_PUSH 0x00000002
+/*
+ The 'Control' word is analogous to the SCSI read/write address;
+ 1 = standard push/pull IO
+ 2 = push/push IO
+
+ To submit a block of job data, the host:
+ - sets the (32-bit, little-endian) word at NFPCI_JOBS_WR_CONTROL to NFPCI_JOB_CONTROL
+ - sets the word at NFPCI_JOBS_WR_LENGTH to the length of the data
+ - copies the data to NFPCI_JOBS_WR_DATA
+ - sets interrupt NFAST_INT_HOST_WRITE_REQUEST in the doorbell register
+ - awaits the NFAST_INT_DEVICE_WRITE_OK (or _FAILED) interrupts back
+
+ To read a block of jobs back, the host:
+ - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL
+ - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data
+ - sets interrupt NFAST_INT_HOST_READ_REQUEST
+ - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt
+ - reads the data from NFPCI_JOBS_RD_DATA; the module will set the word at
+ NFPCI_JOBS_RD_LENGTH to its actual length.
+
+ Optionally the host can request the PCI read data to be pushed to host PCI mapped ram:
+ - allocates a contiguous PCI addressable buffer for a NFPCI_JOBS_BLOCK of max
+ size NFPCI_MAX_JOBS_RD_LEN (or NFPCI_MAX_KERN_RD_LEN) + 8
+ - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL_PCI_PUSH
+ - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data
+ - sets the word at NFPCI_JOBS_RD_PUSH_ADDR to be the host PCI address of
+ the buffer
+ - sets interrupt NFAST_INT_HOST_READ_REQUEST
+ - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt
+ - reads the data from the buffer at NFPCI_OFFSET_DATA in the buffer. The
+ module will set NFPCI_OFFSET_LENGTH to the actual length.
+*/
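+
+/*
+ * A minimal host-side write sketch following the steps above (illustrative
+ * only; error handling elided). It uses the access helpers declared in
+ * nfp_osif.h and a MEMBAR index as defined in i21555.h:
+ *
+ *   unsigned int w;
+ *   TO_LE32_MEM( &w, NFPCI_JOB_CONTROL );
+ *   nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)&w, 4 );
+ *   TO_LE32_MEM( &w, len );
+ *   nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (const char *)&w, 4 );
+ *   nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, data, len );
+ *
+ * then raise NFAST_INT_HOST_WRITE_REQUEST in the doorbell register and
+ * await NFAST_INT_DEVICE_WRITE_OK or _FAILED.
+ */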
+
+#define NFPCI_SCRATCH_CONTROL 0
+
+#define NFPCI_SCRATCH_CONTROL_HOST_MOI (1<<0)
+#define NFPCI_SCRATCH_CONTROL_MODE_SHIFT 1
+#define NFPCI_SCRATCH_CONTROL_MODE_MASK (3<<NFPCI_SCRATCH_CONTROL_MODE_SHIFT)
+
+#define NFPCI_SCRATCH_STATUS 1
+
+#define NFPCI_SCRATCH_STATUS_MONITOR_MOI (1<<0)
+#define NFPCI_SCRATCH_STATUS_APPLICATION_MOI (1<<1)
+#define NFPCI_SCRATCH_STATUS_APPLICATION_RUNNING (1<<2)
+#define NFPCI_SCRATCH_STATUS_ERROR (1<<3)
+
+#define NFPCI_SCRATCH_ERROR_LO 2
+#define NFPCI_SCRATCH_ERROR_HI 3
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/osif.c b/usr/src/uts/common/io/nfp/osif.c
new file mode 100644
index 0000000000..fba62f9a37
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/osif.c
@@ -0,0 +1,184 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/map.h>
+#include <sys/debug.h>
+#include <sys/modctl.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/open.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/pci.h>
+
+#include "nfp_common.h"
+#include "nfp_hostif.h"
+#include "nfp_error.h"
+#include "nfp_osif.h"
+#include "nfp_cmd.h"
+#include "nfp.h"
+#include "autoversion.h"
+
+/* config space access ---------------------------------- */
+
+nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ) {
+ unsigned int tmp32;
+ if ( !pdev || !pdev->dev || !pdev->dev->conf_handle )
+ return NFP_ENODEV;
+
+/* pci_config_get32() does byte swapping, so put back to LE */
+ tmp32 = pci_config_get32( pdev->dev->conf_handle, offset );
+ TO_LE32_IO(res, tmp32);
+
+ return NFP_SUCCESS;
+}
+
+/* user space memory access ---------------------------------- */
+
+nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len) {
+ bcopy(ubuf, kbuf, len);
+ return 0;
+}
+
+nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len) {
+ bcopy(kbuf, ubuf, len);
+ return 0;
+}
+
+nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len) {
+ /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying from kernel mem */
+ return nfp_copy_to_dev( cdev, bar, offset, ubuf, len );
+}
+
+nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len) {
+ /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying to kernel mem */
+ return nfp_copy_from_dev( cdev, bar, offset, ubuf, len );
+}
+
+nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len) {
+ if( len & 0x3 || offset & 0x3 )
+ DDI_REP_GET8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR);
+ else
+ /* LINTED: alignment */
+ DDI_REP_GET32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR);
+ return NFP_SUCCESS;
+}
+
+nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len) {
+ if( len & 0x3 || offset & 0x3 )
+ DDI_REP_PUT8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR );
+ else
+ /* LINTED: alignment */
+ DDI_REP_PUT32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR );
+ return NFP_SUCCESS;
+}
+
+/* pci io space access --------------------------------------- */
+
+unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ) {
+ nfp_log( NFP_DBG3, "nfp_inl: addr %x", (uintptr_t) pdev->bar[bar] + offset);
+ /* LINTED: alignment */
+ return DDI_GET32( pdev->extra[bar], (uint32_t *)(pdev->bar[bar] + offset) );
+}
+
+unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ) {
+ nfp_log( NFP_DBG3, "nfp_inw: addr %x", (uintptr_t) pdev->bar[bar] + offset);
+ /* LINTED: alignment */
+ return DDI_GET16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset) );
+}
+
+void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ) {
+ nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data);
+ /* LINTED: alignment */
+ DDI_PUT32( pdev->extra[bar], (uint32_t *)(pdev->bar[ bar ] + offset), data );
+}
+
+void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ) {
+ nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data);
+ /* LINTED: alignment */
+ DDI_PUT16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset), data );
+}
+
+/* logging ---------------------------------------------------- */
+
+void nfp_log( int level, const char *fmt, ...)
+{
+ auto char buf[256];
+ va_list ap;
+
+ switch (level) {
+ case NFP_DBG4: if (nfp_debug < 4) break;
+ /*FALLTHROUGH*/
+ case NFP_DBG3: if (nfp_debug < 3) break;
+ /*FALLTHROUGH*/
+ case NFP_DBG2: if (nfp_debug < 2) break;
+ /*FALLTHROUGH*/
+ case NFP_DBG1: if (nfp_debug < 1) break;
+ /*FALLTHROUGH*/
+ default:
+ va_start(ap, fmt);
+ (void) vsnprintf(buf, 256, fmt, ap);
+ va_end(ap);
+ cmn_err(CE_CONT, "!" VERSION_COMPNAME " " VERSION_NO ": %s\n", buf);
+ break;
+ }
+}
+
+struct errstr {
+ int oserr;
+ nfp_err nferr;
+};
+
+
+static struct errstr errtab[] = {
+ { EFAULT, NFP_EFAULT },
+ { ENOMEM, NFP_ENOMEM },
+ { EINVAL, NFP_EINVAL },
+ { EIO, NFP_EIO },
+ { ENXIO, NFP_ENXIO },
+ { ENODEV, NFP_ENODEV },
+ { EINVAL, NFP_EUNKNOWN },
+ { 0, 0 }
+};
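+
+/*
+ * Lookups scan errtab in order, so the duplicate EINVAL entry above only
+ * affects the reverse mapping: nfp_error(EINVAL) yields NFP_EINVAL (the
+ * first match), while nfp_oserr(NFP_EUNKNOWN) yields EINVAL.
+ */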
+
+nfp_err nfp_error( int oserr )
+{
+ struct errstr *perr;
+ if(!oserr)
+ return 0;
+ perr= errtab;
+ while(perr->nferr) {
+ if(perr->oserr == oserr)
+ return perr->nferr;
+ perr++;
+ }
+ return NFP_EUNKNOWN;
+}
+
+int nfp_oserr( nfp_err nferr )
+{
+ struct errstr *perr;
+ if(nferr == NFP_SUCCESS)
+ return 0;
+ perr= errtab;
+ while(perr->nferr) {
+ if(perr->nferr == nferr)
+ return perr->oserr;
+ perr++;
+ }
+ return EIO;
+}
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c
new file mode 100644
index 0000000000..2ad3f4f591
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.c
@@ -0,0 +1,2184 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay Devices
+ *
+ * Overlay devices provide a means for creating overlay networks, a means of
+ * multiplexing multiple logical, isolated, and discrete layer two and layer
+ * three networks on top of one physical network.
+ *
+ * In general, these overlay devices encapsulate the logic to answer two
+ * different questions:
+ *
+ * 1) How should I transform a packet to put it on the wire?
+ * 2) Where should I send a transformed packet?
+ *
+ * Each overlay device is presented to the user as a GLDv3 device. While the
+ * link itself cannot have an IP interface created on top of it, it allows for
+ * additional GLDv3 devices, such as a VNIC, to be created on top of it which
+ * can be plumbed up with IP interfaces.
+ *
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * The logical overlay device that a user sees in dladm(1M) is a combination of
+ * two different components that work together. The first component is this
+ * kernel module, which is responsible for answering question one -- how should
+ * I transform a packet to put it on the wire.
+ *
+ * The second component is what we call the virtual ARP daemon, or varpd. It is
+ * a userland component that is responsible for answering the second question --
+ * Where should I send a transformed packet. Instances of the kernel overlay
+ * GLDv3 device ask varpd the question of where should a packet go.
+ *
+ * The split was done for a few reasons. Importantly, we wanted to keep the act
+ * of generating encapsulated packets in the kernel so as to ensure that the
+ * general data path was fast and also kept simple. On the flip side, while the
+ * question of where should something go may be simple, it may often be
+ * complicated and need to interface with several different external or
+ * distributed systems. In those cases, it's simpler to allow for the full
+ * flexibility of userland to be brought to bear to solve that problem and in
+ * general, the path isn't very common.
+ *
+ * The following is what makes up the logical overlay device that a user would
+ * create with dladm(1M).
+ *
+ * Kernel Userland
+ * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
+ * . +--------+ +--------+ +--------+ . . .
+ * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . .
+ * . +--------+ +--------+ +--------+ . . .
+ * . | | | . . .
+ * . | | | . . .
+ * . +------------+-----------+ . . .
+ * . | . . /dev/overlay .
+ * . +--------------+ . . . +------------+ .
+ * . | | . . . | | .
+ * . | Overlay |======*=================| Virtual | .
+ * . | GLDv3 Device |========================| ARP Daemon | .
+ * . | | . . | | .
+ * . +--------------+ . . +------------+ .
+ * . | . . | .
+ * . | . . | .
+ * . +----------------+ . . +--------+ .
+ * . | Overlay | . . | varpd | .
+ * . | Encapsulation | . . | Lookup | .
+ * . | Plugin | . . | Plugin | .
+ * . +----------------+ . . +--------+ .
+ * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
+ *
+ *
+ * This image shows the two different components and where they live.
+ * Importantly, it also shows that both the kernel overlay device and the
+ * userland varpd both support plugins. The plugins actually implement the
+ * things that users care about and the APIs have been designed to try to
+ * minimize the amount of things that a module writer needs to worry about.
+ *
+ * IDENTIFIERS
+ *
+ * Every overlay device is defined by a unique identifier which is the overlay
+ * identifier. Its purpose is similar to that of a VLAN identifier, it's a
+ * unique number that is used to differentiate between different entries on the
+ * wire.
+ *
+ * ENCAPSULATION
+ *
+ * An overlay encapsulation plugin is a kernel miscellaneous module whose
+ * purpose is to contain knowledge about how to transform packets to put them
+ * onto the wire and to take them off. An example of an encapsulation plugin is
+ * vxlan. It's also how support for things like nvgre or geneve would be brought
+ * into the system.
+ *
+ * Each encapsulation plugin defines a series of operation vectors and
+ * properties. For the full details on everything they should provide, please
+ * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
+ * for telling the system what information is required to send a packet. For
+ * example, vxlan is defined to send everything over a UDP packet and therefore
+ * requires a port and an IP address, while nvgre on the other hand is its own
+ * IP type and therefore just requires an IP address. In addition, it also
+ * provides information about the kind of socket that should be created. This is
+ * used by the kernel multiplexor; more on that in the Kernel Components
+ * section.
+ *
+ * LOOKUPS
+ *
+ * The kernel communicates requests for lookups over the character device
+ * /dev/overlay. varpd is responsible for listening for requests on that device
+ * and answering them. The character device is specific to the target path and
+ * varpd.
+ *
+ * Much as the kernel overlay module handles the bulk of the scaffolding but
+ * leaves the important work to the encapsulation plugin, varpd provides a
+ * similar role and leaves the full brunt of lookups to a userland dynamic
+ * shared object which implements the logic of lookups.
+ *
+ * Each lookup plugin defines a series of operation vectors and properties. For
+ * the full details on everything that they should provide, please read
+ * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
+ * address and asked to give an address on the physical network that it should
+ * be sent to. In addition, they handle questions related to how to handle
+ * things like broadcast and multicast traffic, etc.
+ *
+ * ----------
+ * Properties
+ * ----------
+ *
+ * A device from a dladm perspective has a unique set of properties that are
+ * combined from three different sources:
+ *
+ * 1) Generic properties that every overlay device has
+ * 2) Properties that are specific to the encapsulation plugin
+ * 3) Properties that are specific to the lookup plugin
+ *
+ * All of these are exposed in a single set of properties in dladm. Note that
+ * these are not necessarily traditional link properties. However, if something
+ * is both a traditional GLDv3 link property, say the MTU of a device, and a
+ * specific property here, than the driver ensures that all existing GLDv3
+ * specific means of manipulating it are used and wraps up its private property
+ * interfaces to ensure that works.
+ *
+ * Properties in the second and third category are prefixed with the name of
+ * their module. For example, the vxlan encapsulation module has a property
+ * called the 'listen_ip'. This property would show up in dladm as
+ * 'vxlan/listen_ip'. This allows different plugins to both use similar names
+ * for similar properties and to also have independent name spaces so that
+ * overlapping names do not conflict with anything else.
+ *
+ * While the kernel combines both sets one and two into a single coherent view,
+ * it does not do anything with respect to the properties that are owned by the
+ * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
+ * charge of bridging these two worlds into one magical experience for the user.
+ * It carries the burden of knowing about both overlay specific and varpd
+ * specific properties. Importantly, we want to maintain this distinction. We
+ * don't want to treat the kernel as an arbitrary key/value store for varpd and
+ * we want the kernel to own its own data and not have to ask userland for
+ * information that it owns.
+ *
+ * Every property in the system has the following attributes:
+ *
+ * o A name
+ * o A type
+ * o A size
+ * o Permissions
+ * o Default value
+ * o Valid value ranges
+ * o A value
+ *
+ * Everything except for the value is obtained by callers through the propinfo
+ * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
+ * currently 256 bytes.
+ *
+ * The following are the supported types of properties:
+ *
+ * OVERLAY_PROP_T_INT
+ *
+ * A signed integer, its length is 8 bytes, corresponding to a
+ * int64_t.
+ *
+ * OVERLAY_PROP_T_UINT
+ *
+ * An unsigned integer, its length is 8 bytes, corresponding to a
+ * uint64_t.
+ *
+ * OVERLAY_PROP_T_IP
+ *
+ * A struct in6_addr, it has a fixed size.
+ *
+ * OVERLAY_PROP_T_STRING
+ *
+ * A null-terminated character string encoded in either ASCII or
+ * UTF-8. Note that the size of the string includes the null
+ * terminator.
+ *
+ * The next thing that we apply to a property is its permission. The permissions
+ * are put together by the bitwise or of the following flags and values.
+ *
+ * OVERLAY_PROP_PERM_REQ
+ *
+ * This indicates a required property. A property that is required
+ * must be set by a consumer before the device can be created. If a
+ * required property has a default property, this constraint is
+ * loosened because the default property defines the value.
+ *
+ * OVERLAY_PROP_PERM_READ
+ *
+ * This indicates that a property can be read. All properties will
+ * have this value set.
+ *
+ * OVERLAY_PROP_PERM_WRITE
+ *
+ * This indicates that a property can be written to and thus
+ * updated by userland. Properties that are only intended to
+ * display information will not have OVERLAY_PROP_PERM_WRITE set.
+ *
+ * In addition, a few additional values are defined as a convenience to
+ * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
+ * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
+ * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
+ * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
+ * property should generally be a constant across its lifetime.
+ *
+ * A property may optionally have a default value. If it does have a default
+ * value, and that property is not set to be a different value, then the default
+ * value is inherited automatically. It also means that if the default value is
+ * acceptable, there is no need to set the value for a required property. For
+ * example, the vxlan module has the vxlan/listen_port property which is
+ * required, but has a default value of 4789 (the IANA assigned port). Because
+ * of that default value, there is no need for it to be set.
+ *
+ * Finally, a property may declare a list of valid values. These valid values
+ * are used for display purposes; they are not enforced by the broader system,
+ * but merely allow a means for the information to be communicated to the user
+ * through dladm(1M). Like a default value, this is optional.
+ *
+ * The general scaffolding does not do very much with respect to the getting and
+ * setting of properties. That is really owned by the individual plugins
+ * themselves.
+ *
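+ * As an illustration (a sketch; the addresses and link name here are
+ * hypothetical), a vxlan overlay using the point-to-point "direct" lookup
+ * plugin might be created with:
+ *
+ *   # dladm create-overlay -v 23 -e vxlan -s direct \
+ *       -p vxlan/listen_ip=10.0.0.1 -p direct/dest_ip=10.0.0.2 \
+ *       -p direct/dest_port=4789 overlay0
+ *
+ * where vxlan/listen_ip is owned by the encapsulation plugin and the
+ * direct/* properties are owned by the varpd lookup plugin.
+ *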
+ * -----------------------------
+ * Destinations and Plugin Types
+ * -----------------------------
+ *
+ * Both encapsulation and lookup plugins define the kinds of destinations that
+ * they know how to support. There are three different pieces of information
+ * that can be used to address to a destination currently, all of which is
+ * summarized in the type overlay_point_t. Any combination of these is
+ * supported.
+ *
+ * OVERLAY_PLUGIN_D_ETHERNET
+ *
+ * An Ethernet MAC address is required.
+ *
+ * OVERLAY_PLUGIN_D_IP
+ *
+ * An IP address is required. All IP addresses used by the overlay
+ * system are transmitted as IPv6 addresses. IPv4 addresses can be
+ * represented by using IPv4-mapped IPv6 addresses.
+ *
+ * OVERLAY_PLUGIN_D_PORT
+ *
+ * A TCP/UDP port is required.
+ *
+ * A kernel encapsulation plugin declares which of these it requires; it's
+ * a static set. On the other hand, a userland lookup plugin can be built to
+ * support all of these or any combination thereof. It gets passed the required
+ * destination type, based on the kernel encapsulation method, and then it makes
+ * the determination as to whether or not it supports it. For example, the
+ * direct plugin can support either an IP or both an IP and a port, it simply
+ * doesn't display the direct/dest_port property in the cases where a port is
+ * not required to support this.
+ *
+ * The user lookup plugins have two different modes of operation which
+ * determines how they interact with the broader system and how look ups are
+ * performed. These types are:
+ *
+ * OVERLAY_TARGET_POINT
+ *
+ * A point to point plugin has a single static definition for where
+ * to send all traffic. Every packet in the system always gets sent
+ * to the exact same destination which is programmed into the
+ * kernel when the general device is activated.
+ *
+ * OVERLAY_TARGET_DYNAMIC
+ *
+ * A dynamic plugin does not have a single static definition.
+ * Instead, for each destination, the kernel makes an asynchronous
+ * request to varpd to determine where the packet should be routed,
+ * and if a specific destination is found, then that destination is
+ * cached in the overlay device's target cache.
+ *
+ * This distinction, while important for the general overlay device's operation,
+ * is not important to the encapsulation plugins. They don't need to know about
+ * any of these pieces. It's just a concern for varpd, the userland plugin, and
+ * the general overlay scaffolding.
+ *
+ * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
+ * maintain a target cache, and instead just keeps track of the destination and
+ * always sends encapsulated packets to that address. When the target type is of
+ * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
+ * destinations. These destinations are kept around in an instance of a
+ * reference hash that is specific to the given overlay device. Entries in the
+ * cache can be invalidated and replaced by varpd and its lookup plugins.
+ *
+ * ----------------------------------
+ * Kernel Components and Architecture
+ * ----------------------------------
+ *
+ * There are multiple pieces inside the kernel that work together, there is the
+ * general overlay_dev_t structure, which is the logical GLDv3 device, but it
+ * itself has references to things like an instance of an encapsulation plugin,
+ * a pointer to a mux and a target cache. It can roughly be summarized in the
+ * following image:
+ *
+ * +------------------+
+ * | global |
+ * | overlay list |
+ * | overlay_dev_list |
+ * +------------------+
+ * |
+ * | +-----------------------+ +---------------+
+ * +->| GLDv3 Device |----------->| GLDv3 Device | -> ...
+ * | overlay_dev_t | | overlay_dev_t |
+ * | | +---------------+
+ * | |
+ * | mac_handle_t -----+---> GLDv3 handle to MAC
+ * | datalink_id_t -----+---> Datalink ID used by DLS
+ * | overlay_dev_flag_t ---+---> Device state
+ * | uint_t -----+---> Current device MTU
+ * | uint_t -----+---> In-progress RX operations
+ * | uint_t -----+---> In-progress TX operations
+ * | char[] -----+---> FMA degraded message
+ * | void * -----+---> plugin private data
+ * | overlay_target_t * ---+---------------------+
+ * | overlay_plugin_t * ---+---------+ |
+ * +-----------------------+ | |
+ * ^ | |
+ * +--------------------+ | | |
+ * | Kernel Socket | | | |
+ * | Multiplexor | | | |
+ * | overlay_mux_t | | | |
+ * | | | | |
+ * | avl_tree_t -+--+ | |
+ * | uint_t -+--> socket family | |
+ * | uint_t -+--> socket type | |
+ * | uint_t -+--> socket protocol | |
+ * | ksocket_t -+--> I/O socket | |
+ * | struct sockaddr * -+--> ksocket address | |
+ * | overlay_plugin_t --+--------+ | |
+ * +--------------------+ | | |
+ * | | |
+ * +-------------------------+ | | |
+ * | Encap Plugin |<--+-----------+ |
+ * | overlay_plugin_t | |
+ * | | |
+ * | char * ---+--> plugin name |
+ * | overlay_plugin_ops_t * -+--> plugin downcalls |
+ * | char ** (props) ---+--> property list |
+ * | uint_t ---+--> id length |
+ * | overlay_plugin_flags_t -+--> plugin flags |
+ * | overlay_plugin_dest_t --+--> destination type v
+ * +-------------------------+ +-------------------------+
+ * | Target Cache |
+ * | overlay_target_t |
+ * | |
+ * cache mode <--+- overlay_target_mode_t |
+ * dest type <--+- overlay_plugin_dest_t |
+ * cache flags <--+- overlay_target_flag_t |
+ * varpd id <--+- uint64_t |
+ * outstanding varpd reqs. <--+- uint_t |
+ * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t |
+ * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t |
+ * | +-------------------------+
+ * +-----------------------+
+ * |
+ * v
+ * +-------------------------------+ +------------------------+
+ * | Target Entry |-->| Target Entry |--> ...
+ * | overlay_target_entry_t | | overlay_target_entry_t |
+ * | | +------------------------+
+ * | |
+ * | overlay_target_entry_flags_t -+--> Entry flags
+ * | uint8_t[ETHERADDRL] ---+--> Target MAC address
+ * | overlay_target_point_t ---+--> Target underlay address
+ * | mblk_t * ---+--> outstanding mblk head
+ * | mblk_t * ---+--> outstanding mblk tail
+ * | size_t ---+--> outstanding mblk size
+ * +-------------------------------+
+ *
+ * The primary entries that we care about are the overlay_dev_t, which
+ * correspond to each overlay device that is created with dladm(1M). Globally,
+ * these devices are maintained in a simple list_t which is protected with a
+ * lock. Hence, these include important information such as the mac_handle_t
+ * and a datalink_id_t which is used to interact with the broader MAC and DLS
+ * ecosystem. We also maintain additional information such as the current state,
+ * outstanding operations, the mtu, and importantly, the plugin's private data.
+ * This is the instance of an encapsulation plugin that gets created as part of
+ * creating an overlay device. Another aspect of this is that the overlay_dev_t
+ * also includes information with respect to FMA. For more information, see the
+ * FMA section.
+ *
+ * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
+ * is the encapsulation plugin. This allows the device to make downcalls into it
+ * based on doing things like getting and setting properties. Otherwise, the
+ * plugin itself is a fairly straightforward entity. They are maintained in an
+ * (not pictured above) list. The plugins themselves mostly maintain things like
+ * the static list of properties, what kind of destination they require, and the
+ * operations vector. A given module may contain more if necessary.
+ *
+ * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
+ * maintains a ksocket and it is through the mux that we send and receive
+ * message blocks. The mux represents a socket type and address, as well as a
+ * plugin. Multiple overlay_dev_t devices may then share the same mux. For
+ * example, consider the case where you have different instances of vxlan all on
+ * the same underlay network. These would all logically share the same IP
+ * address and port that packets are sent and received on; however, what differs
+ * is the decapsulation ID.
+ *
+ * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
+ * a socket, we enable a direct callback on the ksocket. This means that
+ * whenever a message block chain is received, rather than it sitting in a
+ * queue and being handed back out to a taskq, the data comes directly into
+ * the callback function overlay_mux_recv().
+ *
+ * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
+ * function) to transmit. It receives encapsulated packets, decapsulates them to
+ * determine the overlay identifier, looks up the given device that matches that
+ * identifier, and then causes the broader MAC world to receive the packet with
+ * a call to mac_rx().
+ *
+ * Today, we don't do too much that's special with the ksocket; however, as
+ * hardware is gaining understanding of these encapsulation protocols, we'll
+ * probably want to think of better ways to get those capabilities passed down
+ * and potentially better ways to program receive filters so they get directly
+ * to us. Though, that's all fantasy future land.
+ *
+ * The next part of the puzzle is the target cache. The purpose of the target
+ * cache is to cache where we should send a packet on the underlay network,
+ * given its mac address. The target cache operates in two modes depending on
+ * whether the lookup module was declared to OVERLAY_TARGET_POINT or
+ * OVERLAY_TARGET_DYNAMIC.
+ *
+ * In the case where the target cache has been programmed to be
+ * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
+ * which has the destination to which we send everything, no matter the
+ * destination mac address.
+ *
+ * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
+ * are much more interesting and as a result, more complicated. We primarily
+ * store lists of overlay_target_entry_t's which are stored in both an avl tree
+ * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
+ * is only used for a few of the target ioctls used to dump data such that we
+ * can get a consistent iteration order for things like dladm show-overlay -t.
+ * The key that we use for the reference hashtable is based on the mac address
+ * in the cache and currently we just do a simple CRC32 to transform it into a
+ * hash.
+ *
+ * Each entry maintains a set of flags to indicate the current status of the
+ * request. The flags may indicate one of three states: that current cache entry
+ * is valid, that the current cache entry has been directed to drop all output,
+ * and that the current cache entry is invalid and may be being looked up. In
+ * the case where it's valid, we just take the destination address and run with
+ * it.
+ *
+ * If it's invalid and a lookup has not been made, then we start the process
+ * that prepares a query that will make its way up to varpd. The cache entry
+ * maintains a message block chain of outstanding message blocks and a
+ * size. These lists are populated only when we don't know the answer as to
+ * where should these be sent. The size entry is used to cap the amount of
+ * outstanding data that we don't know the answer to. If we exceed a cap on the
+ * amount of outstanding data (currently 1 Mb), then we'll drop any additional
+ * packets. Once we get an answer indicating a valid destination, we transmit
+ * any outstanding data to that place. The full story of how we look that up
+ * is discussed in the section on the Target Cache Life Cycle.
+ *
+ * ------------------------
+ * FMA and Degraded Devices
+ * ------------------------
+ *
+ * Every kernel overlay device keeps track of its FMA state. Today in FMA we
+ * cannot represent partitions between resources nor can we represent that a
+ * given minor node of a pseudo device has failed -- if we degrade the overlay
+ * device, then the entire dev_info_t is degraded. However, we still want to be
+ * able to indicate to administrators that things may go wrong.
+ *
+ * To this end, we've added a notion of a degraded state to every overlay
+ * device. This state is primarily dictated by userland and it can happen for
+ * various reasons. Generally, if a userland lookup plugin has been
+ * partitioned, or something has gone wrong such that there is no longer any
+ * userland lookup module for a device, then we'll mark it degraded.
+ *
+ * As long as any of our minor instances is degraded, we'll fire off the FMA
+ * event to note that. Once the last degraded instance is no longer degraded,
+ * we'll end up telling FMA that we're all clean.
+ *
+ * To help administrators get a better sense of which of the various minor
+ * devices has gone wrong, we store the odd_fmamsg[] character array. This
+ * character array can be fetched with dladm show-overlay -f.
+ *
+ * Note that it's important that we do not update the link status of the
+ * devices. We want to remain up as much as possible. Changing the link state
+ * while in a degraded state may end up making things worse. We may still
+ * have usable information in the target cache, and if we mark the link down,
+ * we won't be able to use it: marking the link down marks all the downstream
+ * VNICs down, which propagates to IP, and from there we end up dealing with
+ * sadness.
+ *
+ * -----------------------
+ * Target Cache Life Cycle
+ * -----------------------
+ *
+ * This section only applies when we have a lookup plugin of
+ * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
+ * OVERLAY_TARGET_POINT.
+ *
+ * While we got into the target cache in the general architecture section, it's
+ * worth going into more detail as to how this actually works, showing some
+ * examples and state machines. Recall that a target cache entry basically has
+ * the following state transition diagram:
+ *
+ * Initial state
+ * . . . . . . first access . . . varpd lookup enqueued
+ * . . .
+ * . . .
+ * +-------+ . +----------+ .
+ * | No |------*---->| Invalid |-------*----+
+ * | Entry | | Entry | |
+ * +-------+ +----------+ |
+ * varpd ^ ^ varpd |
+ * invalidate | | drop |
+ * . . . * * . . v
+ * +-------+ | | +---------+
+ * | Entry |--->-----+ +----<----| Entry |
+ * | Valid |<----------*---------<----| Pending |->-+ varpd
+ * +-------+ . +---------+ * . . drop, but
+ * . varpd ^ | other queued
+ * . success | | entries
+ * +-----+
+ *
+ * When the table is first created, it is empty. When we attempt to look up an
+ * entry and find there is no entry at all, we'll create a new table entry for
+ * it. At that point the entry is technically in an invalid state, which means
+ * that we have no valid data from varpd. In that case, we'll go ahead and
+ * queue the packet into the entry's pending chain, and queue a varpd lookup,
+ * setting the OVERLAY_ENTRY_F_PENDING flag in the process.
+ *
+ * If additional mblk_t's come in for this entry, we end up appending them to
+ * the tail of the chain, if and only if we don't exceed the threshold for the
+ * amount of space they can take up. An entry remains pending until we get a
+ * varpd reply. If varpd replies with a valid result, we move to the valid
+ * entry state, removing the OVERLAY_ENTRY_F_PENDING flag and setting one of
+ * OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
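+ *
+ * As a minimal sketch of that transition (the flag-field name is
+ * illustrative; locking and draining of the pending chain are elided):
+ *
+ *	entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ *	if (reply_says_drop)
+ *		entry->ote_flags |= OVERLAY_ENTRY_F_DROP;
+ *	else
+ *		entry->ote_flags |= OVERLAY_ENTRY_F_VALID;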
+ *
+ * Once an entry is valid, it stays valid until userland tells us to
+ * invalidate it or replace it with an ioctl -- OVERLAY_TARG_CACHE_REMOVE and
+ * OVERLAY_TARG_CACHE_SET, respectively.
+ *
+ * If the lookup fails with a call to drop the packet, then the next state is
+ * determined by the state of the queue. If the set of outstanding entries is
+ * empty, then we just transition back to the invalid state. If instead the
+ * set of outstanding entries is not empty, then we'll queue another lookup
+ * and stay in the same state, repeating this until the set of requests is
+ * drained.
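+ *
+ * A rough sketch of that drop handling (again with illustrative names, and
+ * the queueing helper is hypothetical):
+ *
+ *	entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ *	if (entry->ote_chead != NULL) {
+ *		overlay_target_queue(entry);
+ *		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+ *	}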
+ *
+ * The following images describe the flow of a given lookup and where the
+ * overlay_target_entry_t is at any given time.
+ *
+ * +-------------------+
+ * | Invalid Entry | An entry starts off as an invalid entry
+ * | de:ad:be:ef:00:00 | and only exists in the target cache.
+ * +-------------------+
+ *
+ * ~~~~
+ *
+ * +---------------------+
+ * | Global list_t | A mblk_t comes in for an entry. We
+ * | overlay_target_list | append it to the overlay_target_list.
+ * +---------------------+
+ * |
+ * v
+ * +-------------------+ +-------------------+
+ * | Pending Entry |----->| Pending Entry |--->...
+ * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 |
+ * +-------------------+ +-------------------+
+ *
+ * ~~~~
+ *
+ * +--------------------------+
+ * | /dev/overlay minor state | User land said that it would look up an
+ * | overlay_target_hdl_t | entry for us. We remove it from the
+ * +--------------------------+ global list and add it to the handle's
+ * | outstanding list.
+ * |
+ * v
+ * +-------------------+ +-------------------+
+ * | Pending Entry |----->| Pending Entry |
+ * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 |
+ * +-------------------+ +-------------------+
+ *
+ * ~~~~
+ *
+ * +-------------------+
+ * | Valid Entry | varpd returned an answer with
+ * | de:ad:be:ef:00:00 | OVERLAY_IOC_RESPOND and the target cache
+ * | 10.169.23.42:4789 | entry is now populated with a
+ * +-------------------+ destination and marked as valid
+ *
+ *
+ * The lookup mechanism is performed via a series of operations on the
+ * character pseudo-device /dev/overlay. The only thing that uses this device
+ * is the userland daemon varpd. /dev/overlay is a cloneable device, each open
+ * of it granting a new minor number which maintains its own state. We
+ * maintain this state so that if an outstanding lookup was queued to
+ * something that crashed or closed its handle without responding, we can know
+ * about this and thus handle it appropriately.
+ *
+ * When a lookup is first created, it's added to our global list of
+ * outstanding lookups. To service requests, userland is required to perform
+ * an ioctl to ask for a request; we will block it in the kernel for a set
+ * amount of time waiting for one. When we give a request to a given minor
+ * instance of the device, we remove it from the global list and append the
+ * request to the device's list of outstanding entries, for the reasons we
+ * discussed above. When a lookup comes in, we give userland a smaller amount
+ * of information specific to that packet, the overlay_targ_lookup_t. It
+ * includes a request id to identify this request, and then the overlay id,
+ * the varpd id, the header and packet size, the source and destination mac
+ * addresses, the SAP, and any potential VLAN header.
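+ *
+ * In rough shape, the information handed to userland looks like the
+ * following sketch; consult uts/common/sys/overlay_target.h for the
+ * authoritative layout:
+ *
+ *	typedef struct overlay_targ_lookup {
+ *		uint64_t	otl_reqid;
+ *		uint64_t	otl_varpdid;
+ *		uint64_t	otl_vnetid;
+ *		uint64_t	otl_hdrsize;
+ *		uint64_t	otl_pktsize;
+ *		uint8_t		otl_srcaddr[ETHERADDRL];
+ *		uint8_t		otl_dstaddr[ETHERADDRL];
+ *		uint32_t	otl_sap;
+ *		int32_t		otl_vlan;
+ *	} overlay_targ_lookup_t;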
+ *
+ * At that point, the request stays in that outstanding list until one of two
+ * ioctls is issued: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this
+ * time, userland may also perform other operations. For example, it may use
+ * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more
+ * in-depth analysis of what to do beyond what we gave it initially. This is
+ * useful for providing proxy arp and the like. Finally, there are two other
+ * ioctls that varpd can then use. The first, OVERLAY_TARG_INJECT, injects a
+ * non-jumbo frame packet up into that mac device; the second,
+ * OVERLAY_TARG_RESEND, causes us to encapsulate and send out the packet
+ * they've given us.
+ *
+ *
+ * Finally, several ioctls are provided for interrogation and management of
+ * the target cache. They allow individual entries to be retrieved or set, or
+ * the entire table to be flushed. For the full set of ioctls here and what
+ * they do, take a look at uts/common/sys/overlay_target.h.
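+ *
+ * A hedged sketch of varpd's service loop from userland follows. The request
+ * ioctl name is assumed, the response structure is illustrative, and
+ * lookup_destination() is a hypothetical helper; a failed or timed-out
+ * request ioctl simply retries:
+ *
+ *	for (;;) {
+ *		overlay_targ_lookup_t otl;
+ *		overlay_targ_resp_t otr;
+ *
+ *		if (ioctl(ovfd, OVERLAY_TARG_LOOKUP, &otl) != 0)
+ *			continue;
+ *		otr.otr_reqid = otl.otl_reqid;
+ *		lookup_destination(&otl, &otr);
+ *		(void) ioctl(ovfd, OVERLAY_TARG_RESPOND, &otr);
+ *	}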
+ *
+ * ------------------
+ * Sample Packet Flow
+ * ------------------
+ *
+ * There are a lot of pieces here; hopefully an example of how this all fits
+ * together will help clarify and elucidate what's going on. We're going to
+ * first track an outgoing packet, e.g. one that is sent from an IP interface
+ * on a VNIC on top of an overlay device, and then we'll look at what it means
+ * to respond to that.
+ *
+ *
+ * +----------------+ +--------------+ +------------------+
+ * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches |
+ * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx |
+ * +----------------+ | VNIC device | | overlay_m_tx() |
+ * +--------------+ +------------------+
+ * |
+ * . lookup . cache |
+ * . drop . miss v
+ * +---------+ . +--------+ . +------------------+
+ * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk |
+ * | mblk_t | | lookup | | in the target |
+ * +---------+ | queued | | cache |
+ * ^ +--------+ +------------------+
+ * on send | | | cache
+ * error . . * *. . lookup * . . hit
+ * | | success v
+ * | | +------------------+
+ * +-----------------+ +--------------->| call plugin |
+ * | Send out | | ovpo_encap() to |
+ * | overlay_mux_t's |<----------------------------------| get encap mblk_t |
+ * | ksocket | +------------------+
+ * +-----------------+
+ *
+ * The receive endpoint looks a little different, and works more like:
+ *
+ * +------------------+ +----------------+ +-----------+
+ * | mblk_t comes off |---->| enter netstack |--->| delivered |---+
+ * | the physical | | IP stack | | to | * . . direct
+ * | device | +----------------+ | ksocket | | callback
+ * +------------------+ +-----------+ |
+ * . overlay id |
+ * . not found v
+ * +-----------+ . +-----------------+ +--------------------+
+ * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() |
+ * | mblk_t | | ovpo_decap() to | +--------------------+
+ * +-----------+ | decap mblk_t |
+ * +-----------------+
+ * |
+ * * . . overlay id
+ * v found
+ * +--------+ +----------------+
+ * | adjust |----->| call mac_rx |
+ * | mblk_t | | on original |
+ * +--------+ | decaped packet |
+ * +----------------+
+ *
+ * ------------------
+ * Netstack Awareness
+ * ------------------
+ *
+ * In the above image we note that this enters a netstack. Today the only
+ * netstack that can be used is the global zone's, as the overlay driver
+ * itself is not exactly netstack aware. What this really means is that varpd
+ * cannot run in a non-global zone and an overlay device cannot belong to a
+ * non-global zone. Non-global zones can still have a VNIC assigned to them
+ * that's been created over the overlay device, the same way they would if it
+ * had been created over an etherstub or a physical device.
+ *
+ * The majority of the work to make it netstack aware is straightforward and the
+ * biggest thing is to create a netstack module that allows us to hook into
+ * netstack (and thus zone) creation and destruction. From there, we need to
+ * amend the target cache lookup routines that we discussed earlier to not have
+ * a global outstanding list and a global list of handles, but rather, one per
+ * netstack.
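+ *
+ * A hedged sketch of that hook, using the illumos netstack_register()
+ * interface (the NS_OVERLAY module id and the callback names here are
+ * hypothetical):
+ *
+ *	static void *
+ *	overlay_stack_init(netstackid_t stackid, netstack_t *ns);
+ *	static void
+ *	overlay_stack_fini(netstackid_t stackid, void *arg);
+ *
+ *	netstack_register(NS_OVERLAY, overlay_stack_init, NULL,
+ *	    overlay_stack_fini);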
+ *
+ * For the mux, we'll need to open the ksocket in the context of the zone; we
+ * can likely do this with a properly composed credential, but we'll need to
+ * do some more work on that path. Finally, we'll want to make sure the dld
+ * ioctls are aware of the zoneid of the caller, that we use it appropriately,
+ * and that we store it in the overlay_dev_t.
+ *
+ * -----------
+ * GLDv3 Notes
+ * -----------
+ *
+ * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
+ * relevant to us and other parts are much less relevant. For example, GLDv3
+ * is used to toggle the device into and out of promiscuous mode and to
+ * program MAC addresses for unicast and multicast hardware filters. Today,
+ * an overlay device doesn't have a notion of promiscuous mode, nor does it
+ * have a notion of unicast and multicast addresses programmed into the
+ * device. Instead, for the purposes of the hardware filter, we don't do
+ * anything and just always accept new addresses being added and removed.
+ *
+ * If the GLDv3 start function has not been called, then we will not use this
+ * device for I/O purposes. Any calls to transmit or receive should be dropped,
+ * though the GLDv3 guarantees us that transmit will not be called without
+ * calling start. Similarly, once stop is called, then no packets can be dealt
+ * with.
+ *
+ * Today we don't support the stat interfaces, though there's no good reason
+ * that we shouldn't assemble some of the stats based on what we have in the
+ * future.
+ *
+ * When it comes to link properties, many of the traditional link properties
+ * do not apply, and many others MAC handles for us. For example, we don't
+ * need to implement anything in overlay_m_getprop() to deal with returning
+ * the MTU, as MAC never calls into us for that. As such, there isn't much of
+ * anything to support in terms of properties.
+ *
+ * Today, we don't support any notion of hardware capabilities. However, if
+ * future NIC hardware or other changes to the system cause it to make sense for
+ * us to emulate logical groups, then we should do that. However, we still do
+ * implement a capab function so that we can identify ourselves as an overlay
+ * device to the broader MAC framework. This is done mostly so that a device
+ * created on top of us can have fanout rings as we don't try to lie about a
+ * speed for our device.
+ *
+ * The other question is what should be done for a device's MTU and margin. We
+ * set our minimum supported MTU to the minimum value that an IP network may
+ * be set to -- 576 -- which mimics what an etherstub does. On the flip side,
+ * we have our upper bound set to 8900. This value comes from the fact that a
+ * lot of jumbo networks use 9000 as their maximum. As such, we want to
+ * reserve 100 bytes, which isn't exactly the most accurate number, but it'll
+ * be good enough for now. Because of that, our default MTU off of these
+ * devices is 1400, as the default MTU for everything is usually 1500 or
+ * whatever the underlying device is at; defaulting to 1400 is a bit simpler
+ * than asking the netstack what all the IP interfaces are at. It also calls
+ * into question how PMTU and PMTU discovery should work here. The challenge,
+ * especially for OVERLAY_TARGET_DYNAMIC, is that the MTU to any of the
+ * destinations will vary, and it's not clear that a single bad entry means
+ * the overall MTU should be lowered. Instead, we should figure out a better
+ * way of determining these kinds of PMTU errors and appropriately alerting
+ * the administrator via FMA.
+ *
+ * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
+ * or not the underlying encapsulation device supports VLAN tags. If it does,
+ * then we'll set the margin to allow for it, otherwise, we will not.
+ */
+
+#include <sys/conf.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/policy.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/ddifm.h>
+
+#include <sys/dls.h>
+#include <sys/dld_ioc.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_ether.h>
+#include <sys/vlan.h>
+
+#include <sys/overlay_impl.h>
+
+dev_info_t *overlay_dip;
+static kmutex_t overlay_dev_lock;
+static list_t overlay_dev_list;
+static uint8_t overlay_macaddr[ETHERADDRL] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+
+typedef enum overlay_dev_prop {
+ OVERLAY_DEV_P_MTU = 0,
+ OVERLAY_DEV_P_VNETID,
+ OVERLAY_DEV_P_ENCAP,
+ OVERLAY_DEV_P_VARPDID
+} overlay_dev_prop_t;
+
+#define OVERLAY_DEV_NPROPS 4
+static const char *overlay_dev_props[] = {
+ "mtu",
+ "vnetid",
+ "encap",
+ "varpd/id"
+};
+
+#define OVERLAY_MTU_MIN 576
+#define OVERLAY_MTU_DEF 1400
+#define OVERLAY_MTU_MAX 8900
+
+overlay_dev_t *
+overlay_hold_by_dlid(datalink_id_t id)
+{
+ overlay_dev_t *o;
+
+ mutex_enter(&overlay_dev_lock);
+ for (o = list_head(&overlay_dev_list); o != NULL;
+ o = list_next(&overlay_dev_list, o)) {
+ if (id == o->odd_linkid) {
+ mutex_enter(&o->odd_lock);
+ o->odd_ref++;
+ mutex_exit(&o->odd_lock);
+ mutex_exit(&overlay_dev_lock);
+ return (o);
+ }
+ }
+
+ mutex_exit(&overlay_dev_lock);
+ return (NULL);
+}
+
+void
+overlay_hold_rele(overlay_dev_t *odd)
+{
+ mutex_enter(&odd->odd_lock);
+ ASSERT(odd->odd_ref > 0);
+ odd->odd_ref--;
+ mutex_exit(&odd->odd_lock);
+}
+
+void
+overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+ ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
+ ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+ if (flag & OVERLAY_F_IN_RX)
+ odd->odd_rxcount++;
+ if (flag & OVERLAY_F_IN_TX)
+ odd->odd_txcount++;
+ odd->odd_flags |= flag;
+}
+
+void
+overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+ boolean_t signal = B_FALSE;
+
+ ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
+ ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+ if (flag & OVERLAY_F_IN_RX) {
+ ASSERT(odd->odd_rxcount > 0);
+ odd->odd_rxcount--;
+ if (odd->odd_rxcount == 0) {
+ signal = B_TRUE;
+ odd->odd_flags &= ~OVERLAY_F_IN_RX;
+ }
+ }
+ if (flag & OVERLAY_F_IN_TX) {
+ ASSERT(odd->odd_txcount > 0);
+ odd->odd_txcount--;
+ if (odd->odd_txcount == 0) {
+ signal = B_TRUE;
+ odd->odd_flags &= ~OVERLAY_F_IN_TX;
+ }
+ }
+
+ if (signal == B_TRUE)
+ cv_broadcast(&odd->odd_iowait);
+}
+
+static void
+overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+ ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
+ ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+ while (odd->odd_flags & flag) {
+ cv_wait(&odd->odd_iowait, &odd->odd_lock);
+ }
+}
+
+void
+overlay_dev_iter(overlay_dev_iter_f func, void *arg)
+{
+ overlay_dev_t *odd;
+
+ mutex_enter(&overlay_dev_lock);
+ for (odd = list_head(&overlay_dev_list); odd != NULL;
+ odd = list_next(&overlay_dev_list, odd)) {
+ if (func(odd, arg) != 0) {
+ mutex_exit(&overlay_dev_lock);
+ return;
+ }
+ }
+ mutex_exit(&overlay_dev_lock);
+}
+
+/* ARGSUSED */
+static int
+overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
+{
+ return (ENOTSUP);
+}
+
+static int
+overlay_m_start(void *arg)
+{
+ overlay_dev_t *odd = arg;
+ overlay_mux_t *mux;
+ int ret, domain, family, prot;
+ struct sockaddr_storage storage;
+ socklen_t slen;
+
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
+ mutex_exit(&odd->odd_lock);
+ return (EAGAIN);
+ }
+ mutex_exit(&odd->odd_lock);
+
+ ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
+ &family, &prot, (struct sockaddr *)&storage, &slen);
+ if (ret != 0)
+ return (ret);
+
+ mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
+ (struct sockaddr *)&storage, slen, &ret);
+ if (mux == NULL)
+ return (ret);
+
+ overlay_mux_add_dev(mux, odd);
+ odd->odd_mux = mux;
+ mutex_enter(&odd->odd_lock);
+ ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
+ odd->odd_flags |= OVERLAY_F_IN_MUX;
+ mutex_exit(&odd->odd_lock);
+
+ return (0);
+}
+
+static void
+overlay_m_stop(void *arg)
+{
+ overlay_dev_t *odd = arg;
+
+ /*
+ * The MAC Perimeter is held here, so we don't have to worry about
+	 * synchronizing this with respect to metadata operations.
+ */
+ mutex_enter(&odd->odd_lock);
+ VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
+ VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
+ odd->odd_flags |= OVERLAY_F_MDDROP;
+ overlay_io_wait(odd, OVERLAY_F_IOMASK);
+ mutex_exit(&odd->odd_lock);
+
+ overlay_mux_remove_dev(odd->odd_mux, odd);
+ overlay_mux_close(odd->odd_mux);
+ odd->odd_mux = NULL;
+
+ mutex_enter(&odd->odd_lock);
+ odd->odd_flags &= ~OVERLAY_F_IN_MUX;
+ odd->odd_flags &= ~OVERLAY_F_MDDROP;
+ VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
+ mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_promisc(void *arg, boolean_t on)
+{
+ return (0);
+}
+
+/*
+ * For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
+{
+ return (0);
+}
+
+/*
+ * For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_unicast(void *arg, const uint8_t *macaddr)
+{
+ return (0);
+}
+
+mblk_t *
+overlay_m_tx(void *arg, mblk_t *mp_chain)
+{
+ overlay_dev_t *odd = arg;
+ mblk_t *mp, *ep;
+ int ret;
+ ovep_encap_info_t einfo;
+ struct msghdr hdr;
+
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
+ !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ mutex_exit(&odd->odd_lock);
+ freemsgchain(mp_chain);
+ return (NULL);
+ }
+ overlay_io_start(odd, OVERLAY_F_IN_TX);
+ mutex_exit(&odd->odd_lock);
+
+ bzero(&hdr, sizeof (struct msghdr));
+
+ bzero(&einfo, sizeof (ovep_encap_info_t));
+ einfo.ovdi_id = odd->odd_vid;
+ mp = mp_chain;
+ while (mp != NULL) {
+ socklen_t slen;
+ struct sockaddr_storage storage;
+
+ mp_chain = mp->b_next;
+ mp->b_next = NULL;
+ ep = NULL;
+
+ ret = overlay_target_lookup(odd, mp,
+ (struct sockaddr *)&storage, &slen);
+ if (ret != OVERLAY_TARGET_OK) {
+ if (ret == OVERLAY_TARGET_DROP)
+ freemsg(mp);
+ mp = mp_chain;
+ continue;
+ }
+
+ hdr.msg_name = &storage;
+ hdr.msg_namelen = slen;
+
+ ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
+ &einfo, &ep);
+ if (ret != 0 || ep == NULL) {
+ freemsg(mp);
+ goto out;
+ }
+
+ ASSERT(ep->b_cont == mp || ep == mp);
+ ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
+ if (ret != 0)
+ goto out;
+
+ mp = mp_chain;
+ }
+
+out:
+ mutex_enter(&odd->odd_lock);
+ overlay_io_done(odd, OVERLAY_F_IN_TX);
+ mutex_exit(&odd->odd_lock);
+ return (mp_chain);
+}
+
+/* ARGSUSED */
+static void
+overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
+{
+ miocnak(q, mp, 0, ENOTSUP);
+}
+
+/* ARGSUSED */
+static boolean_t
+overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
+{
+ /*
+ * Tell MAC we're an overlay.
+ */
+ if (cap == MAC_CAPAB_OVERLAY)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/* ARGSUSED */
+static int
+overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ uint_t pr_valsize, const void *pr_val)
+{
+ uint32_t mtu, old;
+ int err;
+ overlay_dev_t *odd = arg;
+
+ if (pr_num != MAC_PROP_MTU)
+ return (ENOTSUP);
+
+ bcopy(pr_val, &mtu, sizeof (mtu));
+ if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX)
+ return (EINVAL);
+
+ mutex_enter(&odd->odd_lock);
+ old = odd->odd_mtu;
+ odd->odd_mtu = mtu;
+ err = mac_maxsdu_update(odd->odd_mh, mtu);
+ if (err != 0)
+ odd->odd_mtu = old;
+ mutex_exit(&odd->odd_lock);
+
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ uint_t pr_valsize, void *pr_val)
+{
+ return (ENOTSUP);
+}
+
+/* ARGSUSED */
+static void
+overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ mac_prop_info_handle_t prh)
+{
+ if (pr_num != MAC_PROP_MTU)
+ return;
+
+ mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
+ mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX);
+}
+
+static mac_callbacks_t overlay_m_callbacks = {
+ .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
+ MC_PROPINFO),
+ .mc_getstat = overlay_m_stat,
+ .mc_start = overlay_m_start,
+ .mc_stop = overlay_m_stop,
+ .mc_setpromisc = overlay_m_promisc,
+ .mc_multicst = overlay_m_multicast,
+ .mc_unicst = overlay_m_unicast,
+ .mc_tx = overlay_m_tx,
+ .mc_ioctl = overlay_m_ioctl,
+ .mc_getcapab = overlay_m_getcapab,
+ .mc_getprop = overlay_m_getprop,
+ .mc_setprop = overlay_m_setprop,
+ .mc_propinfo = overlay_m_propinfo
+};
+
+static boolean_t
+overlay_valid_name(const char *name, size_t buflen)
+{
+ size_t actlen;
+ int err, i;
+
+ for (i = 0; i < buflen; i++) {
+ if (name[i] == '\0')
+ break;
+ }
+
+ if (i == 0 || i == buflen)
+ return (B_FALSE);
+ actlen = i;
+ if (strchr(name, '/') != NULL)
+ return (B_FALSE);
+ if (u8_validate((char *)name, actlen, NULL,
+ U8_VALIDATE_ENTIRE, &err) < 0)
+ return (B_FALSE);
+ return (B_TRUE);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ int err;
+ uint64_t maxid;
+ overlay_dev_t *odd, *o;
+ mac_register_t *mac;
+ overlay_ioc_create_t *oicp = karg;
+
+ if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
+ return (EINVAL);
+
+ odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
+ odd->odd_linkid = oicp->oic_linkid;
+ odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
+ if (odd->odd_plugin == NULL) {
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (ENOENT);
+ }
+ err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
+ &odd->odd_pvoid);
+ if (err != 0) {
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EINVAL);
+ }
+
+ /*
+ * Make sure that our virtual network id is valid for the given plugin
+ * that we're working with.
+ */
+ ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+ maxid = UINT64_MAX;
+ if (odd->odd_plugin->ovp_id_size != 8)
+ maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
+ if (oicp->oic_vnetid > maxid) {
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EINVAL);
+ }
+ odd->odd_vid = oicp->oic_vnetid;
+
+ mac = mac_alloc(MAC_VERSION);
+	if (mac == NULL) {
+		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+		overlay_plugin_rele(odd->odd_plugin);
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (EINVAL);
+	}
+
+ mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+ mac->m_driver = odd;
+ mac->m_dip = overlay_dip;
+ mac->m_dst_addr = NULL;
+ mac->m_callbacks = &overlay_m_callbacks;
+ mac->m_pdata = NULL;
+ mac->m_pdata_size = 0;
+
+ mac->m_priv_props = NULL;
+
+ /* Let mac handle this itself. */
+ mac->m_instance = (uint_t)-1;
+
+	/*
+	 * There is no real source address that should be used here, but saying
+	 * that we're not ethernet is going to cause its own problems. At the
+	 * end of the day, this is fine.
+	 */
+ mac->m_src_addr = overlay_macaddr;
+
+ /*
+ * Start with the default MTU as the max SDU. If the MTU is changed, the
+ * SDU will be changed to reflect that.
+ */
+ mac->m_min_sdu = 1;
+ mac->m_max_sdu = OVERLAY_MTU_DEF;
+ mac->m_multicast_sdu = 0;
+
+ /*
+ * The underlying device doesn't matter, instead this comes from the
+ * encapsulation protocol and whether or not they allow VLAN tags.
+ */
+ if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
+ mac->m_margin = VLAN_TAGSZ;
+ } else {
+ mac->m_margin = 0;
+ }
+
+	/*
+	 * Today, we have no MAC virtualization. It may make sense in the
+	 * future to go ahead and emulate some subset of this, but it doesn't
+	 * today.
+	 */
+ mac->m_v12n = MAC_VIRT_NONE;
+
+ mutex_enter(&overlay_dev_lock);
+ for (o = list_head(&overlay_dev_list); o != NULL;
+ o = list_next(&overlay_dev_list, o)) {
+ if (o->odd_linkid == oicp->oic_linkid) {
+ mutex_exit(&overlay_dev_lock);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EEXIST);
+ }
+
+ if (o->odd_vid == oicp->oic_vnetid &&
+ o->odd_plugin == odd->odd_plugin) {
+ mutex_exit(&overlay_dev_lock);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EEXIST);
+ }
+ }
+
+ err = mac_register(mac, &odd->odd_mh);
+ mac_free(mac);
+ if (err != 0) {
+ mutex_exit(&overlay_dev_lock);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (err);
+ }
+
+ err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+ crgetzoneid(cred));
+ if (err != 0) {
+ mutex_exit(&overlay_dev_lock);
+ (void) mac_unregister(odd->odd_mh);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (err);
+ }
+
+ mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
+ odd->odd_ref = 0;
+ odd->odd_flags = 0;
+ list_insert_tail(&overlay_dev_list, odd);
+ mutex_exit(&overlay_dev_lock);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ int i, ret;
+ overlay_dev_t *odd;
+ mac_perim_handle_t mph;
+ overlay_ioc_activate_t *oiap = karg;
+ overlay_ioc_propinfo_t *infop;
+ overlay_ioc_prop_t *oip;
+ overlay_prop_handle_t phdl;
+
+ odd = overlay_hold_by_dlid(oiap->oia_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
+ oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
+ phdl = (overlay_prop_handle_t)infop;
+
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
+ mutex_exit(&odd->odd_lock);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (EEXIST);
+ }
+ mutex_exit(&odd->odd_lock);
+
+ for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
+ const char *pname = odd->odd_plugin->ovp_props[i];
+ bzero(infop, sizeof (overlay_ioc_propinfo_t));
+ overlay_prop_init(phdl);
+ ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
+ if (ret != 0) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (ret);
+ }
+
+ if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
+ continue;
+ bzero(oip, sizeof (overlay_ioc_prop_t));
+ oip->oip_size = sizeof (oip->oip_value);
+ ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+ pname, oip->oip_value, &oip->oip_size);
+ if (ret != 0) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (ret);
+ }
+ if (oip->oip_size == 0) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (EINVAL);
+ }
+ }
+
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
+ mutex_exit(&odd->odd_lock);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (ENXIO);
+ }
+
+ ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
+ odd->odd_flags |= OVERLAY_F_ACTIVATED;
+
+ /*
+ * Now that we've activated ourselves, we should indicate to the world
+ * that we're up. Note that we may not be able to perform lookups at
+ * this time, but our notion of being 'up' isn't dependent on that
+ * ability.
+ */
+ mac_link_update(odd->odd_mh, LINK_STATE_UP);
+ mutex_exit(&odd->odd_lock);
+
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ overlay_ioc_delete_t *oidp = karg;
+ overlay_dev_t *odd;
+ datalink_id_t tid;
+ int ret;
+
+ odd = overlay_hold_by_dlid(oidp->oid_linkid);
+ if (odd == NULL) {
+ return (ENOENT);
+ }
+
+ mutex_enter(&odd->odd_lock);
+ /* If we're not the only hold, we're busy */
+ if (odd->odd_ref != 1) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (EBUSY);
+ }
+
+ if (odd->odd_flags & OVERLAY_F_IN_MUX) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (EBUSY);
+ }
+
+	/*
+	 * To remove this, we need to first remove it from dls and then remove
+	 * it from mac. The act of removing it from mac will check if there are
+	 * devices on top of this, e.g. vnics. If there are, then that will
+	 * fail and we'll have to go through and recreate the dls entry. Only
+	 * after mac_unregister has succeeded will we go through and actually
+	 * free everything and drop the dev lock.
+	 */
+ ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
+ if (ret != 0) {
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
+ ASSERT(oidp->oid_linkid == tid);
+ ret = mac_disable(odd->odd_mh);
+ if (ret != 0) {
+ (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+ crgetzoneid(cred));
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
+ overlay_target_quiesce(odd->odd_target);
+
+ mutex_enter(&overlay_dev_lock);
+ list_remove(&overlay_dev_list, odd);
+ mutex_exit(&overlay_dev_lock);
+
+ cv_destroy(&odd->odd_iowait);
+ mutex_destroy(&odd->odd_lock);
+ overlay_target_free(odd);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ overlay_dev_t *odd;
+ overlay_ioc_nprops_t *on = karg;
+
+ odd = overlay_hold_by_dlid(on->oipn_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+ on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+static int
+overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
+{
+ overlay_prop_handle_t phdl = arg;
+ overlay_prop_set_range_str(phdl, opp->ovp_name);
+ return (0);
+}
+
+static int
+overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
+{
+ int i;
+
+ for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+ if (strcmp(overlay_dev_props[i], name) == 0) {
+ *id = i;
+ return (0);
+ }
+ }
+
+ for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
+ if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
+ *id = i + OVERLAY_DEV_NPROPS;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+static void
+overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
+{
+ uint32_t def;
+ mac_propval_range_t range;
+ uint_t perm;
+
+ ASSERT(MAC_PERIM_HELD(odd->odd_mh));
+
+ bzero(&range, sizeof (mac_propval_range_t));
+ range.mpr_count = 1;
+ if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
+ sizeof (def), &range, &perm) != 0)
+ return;
+
+ if (perm == MAC_PROP_PERM_READ)
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+ else if (perm == MAC_PROP_PERM_WRITE)
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
+ else if (perm == MAC_PROP_PERM_RW)
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
+ range.mpr_range_uint32[0].mpur_max);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ overlay_dev_t *odd;
+ int ret;
+ mac_perim_handle_t mph;
+ uint_t propid = UINT_MAX;
+ overlay_ioc_propinfo_t *oip = karg;
+ overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
+
+ odd = overlay_hold_by_dlid(oip->oipi_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ overlay_prop_init(phdl);
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+
+ /*
+ * If the id is -1, then the property that we're looking for is named in
+ * oipi_name and we should fill in its id. Otherwise, we've been given
+ * an id and we need to turn that into a name for our plugin's sake. The
+ * id is our own fabrication for property discovery.
+ */
+ if (oip->oipi_id == -1) {
+ /*
+ * Determine if it's a known generic property or it belongs to a
+ * module by checking against the list of known names.
+ */
+ oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+ if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
+ &propid)) != 0) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+ }
+ oip->oipi_id = propid;
+ if (propid >= OVERLAY_DEV_NPROPS) {
+ ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
+ oip->oipi_name, phdl);
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+			return (ret);
+		}
+ } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
+ uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
+
+ if (id >= odd->odd_plugin->ovp_nprops) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (EINVAL);
+ }
+ ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
+ odd->odd_plugin->ovp_props[id], phdl);
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+ } else if (oip->oipi_id < -1) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (EINVAL);
+ } else {
+ ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
+ ASSERT(oip->oipi_id >= 0);
+ propid = oip->oipi_id;
+ (void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
+ sizeof (oip->oipi_name));
+ }
+
+ switch (propid) {
+ case OVERLAY_DEV_P_MTU:
+ overlay_i_propinfo_mtu(odd, phdl);
+ break;
+ case OVERLAY_DEV_P_VNETID:
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_nodefault(phdl);
+ break;
+ case OVERLAY_DEV_P_ENCAP:
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
+ overlay_prop_set_nodefault(phdl);
+ overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
+ break;
+ case OVERLAY_DEV_P_VARPDID:
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_nodefault(phdl);
+ break;
+ default:
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ENOENT);
+ }
+
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ int ret;
+ overlay_dev_t *odd;
+ mac_perim_handle_t mph;
+ overlay_ioc_prop_t *oip = karg;
+ uint_t propid, mtu;
+
+ odd = overlay_hold_by_dlid(oip->oip_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+ oip->oip_size = OVERLAY_PROP_SIZEMAX;
+ oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+ if (oip->oip_id == -1) {
+ int i;
+
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		/* Not a generic property; hand the request to the plugin. */
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, &oip->oip_size);
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+		}
+
+		propid = i;
+ } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+ uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		if (id >= odd->odd_plugin->ovp_nprops) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (EINVAL);
+ }
+ ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+ odd->odd_plugin->ovp_props[id], oip->oip_value,
+ &oip->oip_size);
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+ } else if (oip->oip_id < -1) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (EINVAL);
+ } else {
+ ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+ ASSERT(oip->oip_id >= 0);
+ propid = oip->oip_id;
+ }
+
+ ret = 0;
+ switch (propid) {
+ case OVERLAY_DEV_P_MTU:
+		/*
+		 * The MTU is always set and retrieved through MAC, to allow
+		 * MAC to do whatever it wants, as really that property belongs
+		 * to MAC. This is important for things where vnics have a hold
+		 * on the MTU.
+		 */
+ mac_sdu_get(odd->odd_mh, NULL, &mtu);
+ bcopy(&mtu, oip->oip_value, sizeof (uint_t));
+ oip->oip_size = sizeof (uint_t);
+ break;
+ case OVERLAY_DEV_P_VNETID:
+ /*
+ * While it's read-only while inside of a mux, we're not in a
+ * context that can guarantee that. Therefore we always grab the
+ * overlay_dev_t's odd_lock.
+ */
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint64_t);
+ break;
+ case OVERLAY_DEV_P_ENCAP:
+ oip->oip_size = strlcpy((char *)oip->oip_value,
+ odd->odd_plugin->ovp_name, oip->oip_size);
+ break;
+ case OVERLAY_DEV_P_VARPDID:
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_VARPD) {
+ const uint64_t val = odd->odd_target->ott_id;
+ bcopy(&val, oip->oip_value, sizeof (uint64_t));
+ oip->oip_size = sizeof (uint64_t);
+ } else {
+ oip->oip_size = 0;
+ }
+ mutex_exit(&odd->odd_lock);
+ break;
+ default:
+ ret = ENOENT;
+ }
+
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+}
+
+static void
+overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
+{
+ mutex_enter(&odd->odd_lock);
+
+ /* Simple case, not active */
+ if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ odd->odd_vid = vnetid;
+ mutex_exit(&odd->odd_lock);
+ return;
+ }
+
+ /*
+ * In the hard case, we need to set the drop flag, quiesce I/O and then
+ * we can go ahead and do everything.
+ */
+ odd->odd_flags |= OVERLAY_F_MDDROP;
+ overlay_io_wait(odd, OVERLAY_F_IOMASK);
+ mutex_exit(&odd->odd_lock);
+
+ overlay_mux_remove_dev(odd->odd_mux, odd);
+ mutex_enter(&odd->odd_lock);
+ odd->odd_vid = vnetid;
+ mutex_exit(&odd->odd_lock);
+ overlay_mux_add_dev(odd->odd_mux, odd);
+
+ mutex_enter(&odd->odd_lock);
+ ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
+	/* The device stays in the mux; we only clear the drop flag set above. */
+	odd->odd_flags &= ~OVERLAY_F_MDDROP;
+ mutex_exit(&odd->odd_lock);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ int ret;
+ overlay_dev_t *odd;
+ overlay_ioc_prop_t *oip = karg;
+ uint_t propid = UINT_MAX;
+ mac_perim_handle_t mph;
+ uint64_t maxid, *vidp;
+
+ if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
+ return (EINVAL);
+
+ odd = overlay_hold_by_dlid(oip->oip_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	mutex_enter(&odd->odd_lock);
+	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
+		mutex_exit(&odd->odd_lock);
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_exit(&odd->odd_lock);
+ if (oip->oip_id == -1) {
+ int i;
+
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		/* Not a generic property; hand the request to the plugin. */
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, oip->oip_size);
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+		}
+
+		propid = i;
+ } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+ uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		if (id >= odd->odd_plugin->ovp_nprops) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+ ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
+ odd->odd_plugin->ovp_props[id], oip->oip_value,
+ oip->oip_size);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ret);
+ } else if (oip->oip_id < -1) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ } else {
+ ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+ ASSERT(oip->oip_id >= 0);
+ propid = oip->oip_id;
+ }
+
+ ret = 0;
+ switch (propid) {
+ case OVERLAY_DEV_P_MTU:
+ ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
+ oip->oip_value, oip->oip_size);
+ break;
+ case OVERLAY_DEV_P_VNETID:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ vidp = (uint64_t *)oip->oip_value;
+ ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+ maxid = UINT64_MAX;
+ if (odd->odd_plugin->ovp_id_size != 8)
+ maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
+ 1ULL;
+		if (*vidp > maxid) {
+ ret = EINVAL;
+ break;
+ }
+ overlay_setprop_vnetid(odd, *vidp);
+ break;
+ case OVERLAY_DEV_P_ENCAP:
+ case OVERLAY_DEV_P_VARPDID:
+ ret = EPERM;
+ break;
+ default:
+ ret = ENOENT;
+ }
+
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ overlay_dev_t *odd;
+ overlay_ioc_status_t *os = karg;
+
+ odd = overlay_hold_by_dlid(os->ois_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
+ os->ois_status = OVERLAY_I_DEGRADED;
+ if (odd->odd_fmamsg != NULL) {
+ (void) strlcpy(os->ois_message, odd->odd_fmamsg,
+ OVERLAY_STATUS_BUFLEN);
+ } else {
+ os->ois_message[0] = '\0';
+ }
+
+ } else {
+ os->ois_status = OVERLAY_I_OK;
+ os->ois_message[0] = '\0';
+ }
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+static dld_ioc_info_t overlay_ioc_list[] = {
+ { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
+ overlay_i_create, secpolicy_dl_config },
+ { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
+ overlay_i_activate, secpolicy_dl_config },
+ { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
+ overlay_i_delete, secpolicy_dl_config },
+ { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_prop_t), overlay_i_getprop,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_SETPROP, DLDCOPYIN,
+ sizeof (overlay_ioc_prop_t), overlay_i_setprop,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_status_t), overlay_i_status,
+ NULL }
+};
+
+static int
+overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int fmcap = DDI_FM_EREPORT_CAPABLE;
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
+ return (DDI_FAILURE);
+
+ ddi_fm_init(dip, &fmcap, NULL);
+
+ if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
+ ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE)
+ return (DDI_FAILURE);
+
+ if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
+ DLDIOCCNT(overlay_ioc_list)) != 0) {
+ ddi_remove_minor_node(dip, OVERLAY_CTL);
+ return (DDI_FAILURE);
+ }
+
+ overlay_dip = dip;
+ return (DDI_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
+{
+ int error;
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *resp = (void *)overlay_dip;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *resp = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ break;
+ }
+
+ return (error);
+}
+
+static int
+overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ mutex_enter(&overlay_dev_lock);
+	if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
+		mutex_exit(&overlay_dev_lock);
+		return (DDI_FAILURE);
+	}
+	mutex_exit(&overlay_dev_lock);
+
+	dld_ioc_unregister(OVERLAY_IOC);
+ ddi_remove_minor_node(dip, OVERLAY_CTL);
+ ddi_fm_fini(dip);
+ overlay_dip = NULL;
+ return (DDI_SUCCESS);
+}
+
+static struct cb_ops overlay_cbops = {
+ overlay_target_open, /* cb_open */
+ overlay_target_close, /* cb_close */
+ nodev, /* cb_strategy */
+ nodev, /* cb_print */
+ nodev, /* cb_dump */
+ nodev, /* cb_read */
+ nodev, /* cb_write */
+ overlay_target_ioctl, /* cb_ioctl */
+ nodev, /* cb_devmap */
+ nodev, /* cb_mmap */
+ nodev, /* cb_segmap */
+ nochpoll, /* cb_chpoll */
+ ddi_prop_op, /* cb_prop_op */
+ NULL, /* cb_stream */
+ D_MP, /* cb_flag */
+ CB_REV, /* cb_rev */
+ nodev, /* cb_aread */
+ nodev, /* cb_awrite */
+};
+
+static struct dev_ops overlay_dev_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* devo_refcnt */
+ overlay_getinfo, /* devo_getinfo */
+ nulldev, /* devo_identify */
+ nulldev, /* devo_probe */
+ overlay_attach, /* devo_attach */
+ overlay_detach, /* devo_detach */
+ nulldev, /* devo_reset */
+ &overlay_cbops, /* devo_cb_ops */
+ NULL, /* devo_bus_ops */
+ NULL, /* devo_power */
+ ddi_quiesce_not_supported /* devo_quiesce */
+};
+
+static struct modldrv overlay_modldrv = {
+ &mod_driverops,
+ "Overlay Network Driver",
+ &overlay_dev_ops
+};
+
+static struct modlinkage overlay_linkage = {
+ MODREV_1,
+ &overlay_modldrv
+};
+
+static int
+overlay_init(void)
+{
+ mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&overlay_dev_list, sizeof (overlay_dev_t),
+ offsetof(overlay_dev_t, odd_link));
+ overlay_mux_init();
+ overlay_plugin_init();
+ overlay_target_init();
+
+ return (DDI_SUCCESS);
+}
+
+static void
+overlay_fini(void)
+{
+ overlay_target_fini();
+ overlay_plugin_fini();
+ overlay_mux_fini();
+ mutex_destroy(&overlay_dev_lock);
+ list_destroy(&overlay_dev_list);
+}
+
+int
+_init(void)
+{
+ int err;
+
+ if ((err = overlay_init()) != DDI_SUCCESS)
+ return (err);
+
+ mac_init_ops(NULL, "overlay");
+ err = mod_install(&overlay_linkage);
+ if (err != DDI_SUCCESS) {
+ overlay_fini();
+ return (err);
+ }
+
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&overlay_linkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int err;
+
+ err = mod_remove(&overlay_linkage);
+ if (err != 0)
+ return (err);
+
+ overlay_fini();
+ return (0);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf
new file mode 100644
index 0000000000..4b62fafd94
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015, Joyent, Inc.
+#
+
+name="overlay" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile
new file mode 100644
index 0000000000..800d72dc2b
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.mapfile
@@ -0,0 +1,46 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_VERSION ILLUMOSprivate {
+ global:
+ # DDI Interfaces
+ _fini;
+ _init;
+ _info;
+
+	# Encapsulation Plugin interfaces
+ overlay_plugin_alloc;
+ overlay_plugin_free;
+ overlay_plugin_register;
+ overlay_plugin_unregister;
+ local:
+ *;
+};
diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c
new file mode 100644
index 0000000000..0701d08e8b
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_fm.c
@@ -0,0 +1,82 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay device FMA operations.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/ddifm.h>
+#include <sys/overlay_impl.h>
+
+kmutex_t overlay_fm_lock;
+uint_t overlay_fm_count;
+
+void
+overlay_fm_init(void)
+{
+ overlay_fm_count = 0;
+ mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL);
+}
+
+void
+overlay_fm_fini(void)
+{
+ VERIFY(overlay_fm_count == 0);
+ mutex_destroy(&overlay_fm_lock);
+}
+
+void
+overlay_fm_degrade(overlay_dev_t *odd, const char *msg)
+{
+ mutex_enter(&overlay_fm_lock);
+ mutex_enter(&odd->odd_lock);
+
+ if (msg != NULL)
+ (void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN);
+
+ if (odd->odd_flags & OVERLAY_F_DEGRADED)
+ goto out;
+
+ odd->odd_flags |= OVERLAY_F_DEGRADED;
+ overlay_fm_count++;
+ if (overlay_fm_count == 1) {
+ ddi_fm_service_impact(overlay_dip, DDI_SERVICE_DEGRADED);
+ }
+out:
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&overlay_fm_lock);
+}
+
+void
+overlay_fm_restore(overlay_dev_t *odd)
+{
+ mutex_enter(&overlay_fm_lock);
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_DEGRADED))
+ goto out;
+
+ odd->odd_fmamsg[0] = '\0';
+ odd->odd_flags &= ~OVERLAY_F_DEGRADED;
+ overlay_fm_count--;
+ if (overlay_fm_count == 0) {
+ ddi_fm_service_impact(overlay_dip, DDI_SERVICE_RESTORED);
+ }
+out:
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&overlay_fm_lock);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c
new file mode 100644
index 0000000000..58e9f2665d
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_mux.c
@@ -0,0 +1,368 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * Overlay device ksocket multiplexer.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ksynch.h>
+#include <sys/ksocket.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/pattr.h>
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/tihdr.h>
+
+#include <sys/overlay_impl.h>
+
+#include <sys/sdt.h>
+
+#define	OVERLAY_FREEMSG(mp, reason) \
+	DTRACE_PROBE2(overlay__freemsg, mblk_t *, mp, char *, reason)
+
+static list_t overlay_mux_list;
+static kmutex_t overlay_mux_lock;
+
+void
+overlay_mux_init(void)
+{
+ list_create(&overlay_mux_list, sizeof (overlay_mux_t),
+ offsetof(overlay_mux_t, omux_lnode));
+ mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
+}
+
+void
+overlay_mux_fini(void)
+{
+ mutex_destroy(&overlay_mux_lock);
+ list_destroy(&overlay_mux_list);
+}
+
+static int
+overlay_mux_comparator(const void *a, const void *b)
+{
+ const overlay_dev_t *odl, *odr;
+ odl = a;
+ odr = b;
+ if (odl->odd_vid > odr->odd_vid)
+ return (1);
+ else if (odl->odd_vid < odr->odd_vid)
+ return (-1);
+ else
+ return (0);
+}
+
+/*
+ * This is the central receive data path. We need to decode the packet, if we
+ * can, and then deliver it to the appropriate overlay.
+ */
+/* ARGSUSED */
+static boolean_t
+overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
+ void *arg)
+{
+ mblk_t *mp, *nmp, *fmp;
+ overlay_mux_t *mux = arg;
+
+	/*
+	 * We may have received a chain of messages. Each message in the
+	 * chain will likely have a T_unitdata_ind attached to it as an
+	 * M_PROTO. If we aren't getting that, we should probably drop the
+	 * message for the moment.
+	 */
+ for (mp = mpchain; mp != NULL; mp = nmp) {
+ struct T_unitdata_ind *tudi;
+ ovep_encap_info_t infop;
+ overlay_dev_t od, *odd;
+ int ret;
+
+ nmp = mp->b_next;
+ mp->b_next = NULL;
+
+ if (DB_TYPE(mp) != M_PROTO) {
+ OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
+ freemsg(mp);
+ continue;
+ }
+
+ if (mp->b_cont == NULL) {
+ OVERLAY_FREEMSG(mp, "missing a b_cont");
+ freemsg(mp);
+ continue;
+ }
+
+ tudi = (struct T_unitdata_ind *)mp->b_rptr;
+ if (tudi->PRIM_type != T_UNITDATA_IND) {
+ OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
+ freemsg(mp);
+ continue;
+ }
+
+		/*
+		 * In the future, we'll care about the source information for
+		 * purposes of telling varpd about oob invalidation. But for
+		 * now, just drop that block.
+		 */
+ fmp = mp;
+ mp = fmp->b_cont;
+ freeb(fmp);
+
+ /*
+ * Until we have VXLAN-or-other-decap HW acceleration support
+ * (e.g. we support NICs that reach into VXLAN-encapsulated
+ * packets and check the inside-VXLAN IP packets' checksums,
+ * or do LSO with VXLAN), we should clear any HW-accelerated-
+ * performed bits.
+ *
+ * We do this, even in cases of HW_LOCAL_MAC, because we
+ * absolutely have NO context about the inner packet.
+ * It could've arrived off an external NIC and been forwarded
+ * to the overlay network, which means no context.
+ */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ /*
+ * Decap and deliver.
+ */
+ bzero(&infop, sizeof (ovep_encap_info_t));
+ ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
+ if (ret != 0) {
+ OVERLAY_FREEMSG(mp, "decap failed");
+ freemsg(mp);
+ continue;
+ }
+ if (MBLKL(mp) > infop.ovdi_hdr_size) {
+ mp->b_rptr += infop.ovdi_hdr_size;
+ } else {
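+			/*
+			 * The encapsulation header may span several mblks.
+			 * Walk the chain, trimming the header and freeing any
+			 * mblk that it fully consumes.
+			 */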
+ while (infop.ovdi_hdr_size != 0) {
+ size_t rem, blkl;
+
+ if (mp == NULL)
+ break;
+
+ blkl = MBLKL(mp);
+ rem = MIN(infop.ovdi_hdr_size, blkl);
+ infop.ovdi_hdr_size -= rem;
+ mp->b_rptr += rem;
+ if (rem == blkl) {
+ fmp = mp;
+ mp = fmp->b_cont;
+ fmp->b_cont = NULL;
+ OVERLAY_FREEMSG(mp,
+ "freed a fmp block");
+ freemsg(fmp);
+ }
+ }
+ if (mp == NULL) {
+ OVERLAY_FREEMSG(mp, "freed it all...");
+ continue;
+ }
+ }
+
+ od.odd_vid = infop.ovdi_id;
+ mutex_enter(&mux->omux_lock);
+ odd = avl_find(&mux->omux_devices, &od, NULL);
+ if (odd == NULL) {
+ mutex_exit(&mux->omux_lock);
+ OVERLAY_FREEMSG(mp, "no matching vid");
+ freemsg(mp);
+ continue;
+ }
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
+ !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&mux->omux_lock);
+ OVERLAY_FREEMSG(mp, "dev dropped");
+ freemsg(mp);
+ continue;
+ }
+ overlay_io_start(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&mux->omux_lock);
+
+ mac_rx(odd->odd_mh, NULL, mp);
+
+ mutex_enter(&odd->odd_lock);
+ overlay_io_done(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Register a given device with a socket backend. If no such device socket
+ * exists, create a new one.
+ */
+overlay_mux_t *
+overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
+ struct sockaddr *addr, socklen_t len, int *errp)
+{
+ int err;
+ overlay_mux_t *mux;
+ ksocket_t ksock;
+
+ if (errp == NULL)
+ errp = &err;
+
+ mutex_enter(&overlay_mux_lock);
+ for (mux = list_head(&overlay_mux_list); mux != NULL;
+ mux = list_next(&overlay_mux_list, mux)) {
+ if (domain == mux->omux_domain &&
+ family == mux->omux_family &&
+ protocol == mux->omux_protocol &&
+ len == mux->omux_alen &&
+ bcmp(addr, mux->omux_addr, len) == 0) {
+
+			if (opp != mux->omux_plugin) {
+				mutex_exit(&overlay_mux_lock);
+				*errp = EEXIST;
+				return (NULL);
+			}
+
+ mutex_enter(&mux->omux_lock);
+ mux->omux_count++;
+ mutex_exit(&mux->omux_lock);
+ mutex_exit(&overlay_mux_lock);
+ *errp = 0;
+ return (mux);
+ }
+ }
+
+ /*
+ * Today we aren't zone-aware and only exist in the global zone. When we
+ * allow for things to exist in the non-global zone, we'll want to use a
+ * credential that's actually specific to the zone.
+ */
+ *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
+ kcred);
+ if (*errp != 0) {
+ mutex_exit(&overlay_mux_lock);
+ return (NULL);
+ }
+
+ *errp = ksocket_bind(ksock, addr, len, kcred);
+ if (*errp != 0) {
+ mutex_exit(&overlay_mux_lock);
+ ksocket_close(ksock, kcred);
+ return (NULL);
+ }
+
+	/*
+	 * Ask our lower layer to optionally toggle anything it needs on this
+	 * socket. Because a socket is owned by a single type of plugin, we
+	 * can safely let that plugin perform any additional socket setup it
+	 * would like to do.
+	 */
+ if (opp->ovp_ops->ovpo_sockopt != NULL &&
+ (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
+ mutex_exit(&overlay_mux_lock);
+ ksocket_close(ksock, kcred);
+ return (NULL);
+ }
+
+ mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
+ list_link_init(&mux->omux_lnode);
+ mux->omux_ksock = ksock;
+ mux->omux_plugin = opp;
+ mux->omux_domain = domain;
+ mux->omux_family = family;
+ mux->omux_protocol = protocol;
+ mux->omux_addr = kmem_alloc(len, KM_SLEEP);
+ bcopy(addr, mux->omux_addr, len);
+ mux->omux_alen = len;
+ mux->omux_count = 1;
+ avl_create(&mux->omux_devices, overlay_mux_comparator,
+ sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
+ mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
+
+ /* Once this is called, we need to expect to rx data */
+ *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
+ if (*errp != 0) {
+ ksocket_close(ksock, kcred);
+ mutex_destroy(&mux->omux_lock);
+ avl_destroy(&mux->omux_devices);
+ kmem_free(mux->omux_addr, len);
+ kmem_free(mux, sizeof (overlay_mux_t));
+ return (NULL);
+ }
+
+ list_insert_tail(&overlay_mux_list, mux);
+ mutex_exit(&overlay_mux_lock);
+
+ *errp = 0;
+ return (mux);
+}
+
+void
+overlay_mux_close(overlay_mux_t *mux)
+{
+ mutex_enter(&overlay_mux_lock);
+ mutex_enter(&mux->omux_lock);
+ mux->omux_count--;
+ if (mux->omux_count != 0) {
+ mutex_exit(&mux->omux_lock);
+ mutex_exit(&overlay_mux_lock);
+ return;
+ }
+ list_remove(&overlay_mux_list, mux);
+ mutex_exit(&mux->omux_lock);
+ mutex_exit(&overlay_mux_lock);
+
+ ksocket_close(mux->omux_ksock, kcred);
+	avl_destroy(&mux->omux_devices);
+	mutex_destroy(&mux->omux_lock);
+	kmem_free(mux->omux_addr, mux->omux_alen);
+ kmem_free(mux, sizeof (overlay_mux_t));
+}
+
+void
+overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+ mutex_enter(&mux->omux_lock);
+ avl_add(&mux->omux_devices, odd);
+ mutex_exit(&mux->omux_lock);
+}
+
+void
+overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+ mutex_enter(&mux->omux_lock);
+ avl_remove(&mux->omux_devices, odd);
+ mutex_exit(&mux->omux_lock);
+}
+
+int
+overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
+{
+ int ret;
+
+ /*
+ * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
+ * that isn't actually supported by UDP at this time.
+ */
+ ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
+ if (ret != 0)
+ freemsg(mp);
+
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c
new file mode 100644
index 0000000000..348ddb92a2
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_plugin.c
@@ -0,0 +1,281 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Overlay device encapsulation plugin management
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/errno.h>
+#include <sys/sysmacros.h>
+#include <sys/modctl.h>
+
+#include <sys/overlay_impl.h>
+
+static kmem_cache_t *overlay_plugin_cache;
+static kmutex_t overlay_plugin_lock;
+static list_t overlay_plugin_list;
+
+#define OVERLAY_MODDIR "overlay"
+
+/* ARGSUSED */
+static int
+overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ overlay_plugin_t *opp = buf;
+
+ mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL);
+ list_link_init(&opp->ovp_link);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+overlay_plugin_cache_destructor(void *buf, void *arg)
+{
+ overlay_plugin_t *opp = buf;
+ ASSERT(list_link_active(&opp->ovp_link) == 0);
+ mutex_destroy(&opp->ovp_mutex);
+}
+
+void
+overlay_plugin_init(void)
+{
+ mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0);
+
+	/*
+	 * In the future we may want to have a reaper that unloads unused
+	 * modules to help the kernel reclaim memory.
+	 */
+ overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache",
+ sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor,
+ overlay_plugin_cache_destructor, NULL, NULL, NULL, 0);
+ list_create(&overlay_plugin_list, sizeof (overlay_plugin_t),
+ offsetof(overlay_plugin_t, ovp_link));
+}
+
+void
+overlay_plugin_fini(void)
+{
+ mutex_enter(&overlay_plugin_lock);
+ VERIFY(list_is_empty(&overlay_plugin_list));
+ mutex_exit(&overlay_plugin_lock);
+
+ list_destroy(&overlay_plugin_list);
+ kmem_cache_destroy(overlay_plugin_cache);
+ mutex_destroy(&overlay_plugin_lock);
+}
+
+overlay_plugin_register_t *
+overlay_plugin_alloc(uint_t version)
+{
+ overlay_plugin_register_t *ovrp;
+ /* Version 1 is the only one that exists */
+ if (version != OVEP_VERSION_ONE)
+ return (NULL);
+
+ ovrp = kmem_zalloc(sizeof (overlay_plugin_register_t), KM_SLEEP);
+ ovrp->ovep_version = version;
+ return (ovrp);
+}
+
+void
+overlay_plugin_free(overlay_plugin_register_t *ovrp)
+{
+ kmem_free(ovrp, sizeof (overlay_plugin_register_t));
+}
+
+int
+overlay_plugin_register(overlay_plugin_register_t *ovrp)
+{
+ overlay_plugin_t *opp, *ipp;
+
+ /* Sanity check parameters of the registration */
+ if (ovrp->ovep_version != OVEP_VERSION_ONE)
+ return (EINVAL);
+
+ if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL)
+ return (EINVAL);
+
+ if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0)
+ return (EINVAL);
+
+ if (ovrp->ovep_id_size < 1)
+ return (EINVAL);
+
+ /* Don't support anything that has an id size larger than 8 bytes */
+ if (ovrp->ovep_id_size > 8)
+ return (ENOTSUP);
+
+ if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID)
+ return (EINVAL);
+
+ if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0)
+ return (EINVAL);
+
+ if (ovrp->ovep_ops->ovpo_callbacks != 0)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_init == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_fini == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_encap == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_decap == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_socket == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_getprop == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_setprop == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_propinfo == NULL)
+ return (EINVAL);
+
+ opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP);
+ opp->ovp_active = 0;
+ opp->ovp_name = ovrp->ovep_name;
+ opp->ovp_ops = ovrp->ovep_ops;
+ opp->ovp_props = ovrp->ovep_props;
+ opp->ovp_id_size = ovrp->ovep_id_size;
+ opp->ovp_flags = ovrp->ovep_flags;
+ opp->ovp_dest = ovrp->ovep_dest;
+
+ opp->ovp_nprops = 0;
+ if (ovrp->ovep_props != NULL) {
+ while (ovrp->ovep_props[opp->ovp_nprops] != NULL) {
+ if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >=
+ OVERLAY_PROP_NAMELEN) {
+ kmem_cache_free(overlay_plugin_cache, opp);
+ return (EINVAL);
+ }
+ opp->ovp_nprops++;
+ }
+ }
+
+ mutex_enter(&overlay_plugin_lock);
+ for (ipp = list_head(&overlay_plugin_list); ipp != NULL;
+ ipp = list_next(&overlay_plugin_list, ipp)) {
+ if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) {
+ mutex_exit(&overlay_plugin_lock);
+ kmem_cache_free(overlay_plugin_cache, opp);
+ return (EEXIST);
+ }
+ }
+ list_insert_tail(&overlay_plugin_list, opp);
+ mutex_exit(&overlay_plugin_lock);
+
+ return (0);
+}
+
+int
+overlay_plugin_unregister(const char *name)
+{
+ overlay_plugin_t *opp;
+
+ mutex_enter(&overlay_plugin_lock);
+ for (opp = list_head(&overlay_plugin_list); opp != NULL;
+ opp = list_next(&overlay_plugin_list, opp)) {
+ if (strcmp(opp->ovp_name, name) == 0)
+ break;
+ }
+
+ if (opp == NULL) {
+ mutex_exit(&overlay_plugin_lock);
+ return (ENOENT);
+ }
+
+ mutex_enter(&opp->ovp_mutex);
+ if (opp->ovp_active > 0) {
+ mutex_exit(&opp->ovp_mutex);
+ mutex_exit(&overlay_plugin_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&opp->ovp_mutex);
+
+ list_remove(&overlay_plugin_list, opp);
+ mutex_exit(&overlay_plugin_lock);
+
+ kmem_cache_free(overlay_plugin_cache, opp);
+ return (0);
+}
+
+overlay_plugin_t *
+overlay_plugin_lookup(const char *name)
+{
+ overlay_plugin_t *opp;
+ boolean_t trymodload = B_FALSE;
+
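+	/*
+	 * Look for the plugin on the registration list. If it isn't there, it
+	 * may simply not have been loaded yet, so make a single attempt to
+	 * modload it (its _init routine is expected to register the plugin)
+	 * and then search the list one more time.
+	 */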
+ for (;;) {
+ mutex_enter(&overlay_plugin_lock);
+ for (opp = list_head(&overlay_plugin_list); opp != NULL;
+ opp = list_next(&overlay_plugin_list, opp)) {
+ if (strcmp(name, opp->ovp_name) == 0) {
+ mutex_enter(&opp->ovp_mutex);
+ opp->ovp_active++;
+ mutex_exit(&opp->ovp_mutex);
+ mutex_exit(&overlay_plugin_lock);
+ return (opp);
+ }
+ }
+ mutex_exit(&overlay_plugin_lock);
+
+ if (trymodload == B_TRUE)
+ return (NULL);
+
+ /*
+ * If we didn't find it, it may still exist, but just not have
+ * been a loaded module. In that case, we'll do one attempt to
+ * load it.
+ */
+ if (modload(OVERLAY_MODDIR, (char *)name) == -1)
+ return (NULL);
+ trymodload = B_TRUE;
+ }
+}
+
+void
+overlay_plugin_rele(overlay_plugin_t *opp)
+{
+ mutex_enter(&opp->ovp_mutex);
+ ASSERT(opp->ovp_active > 0);
+ opp->ovp_active--;
+ mutex_exit(&opp->ovp_mutex);
+}
+
+void
+overlay_plugin_walk(overlay_plugin_walk_f func, void *arg)
+{
+ overlay_plugin_t *opp;
+ mutex_enter(&overlay_plugin_lock);
+ for (opp = list_head(&overlay_plugin_list); opp != NULL;
+ opp = list_next(&overlay_plugin_list, opp)) {
+ if (func(opp, arg) != 0) {
+ mutex_exit(&overlay_plugin_lock);
+ return;
+ }
+ }
+ mutex_exit(&overlay_plugin_lock);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c
new file mode 100644
index 0000000000..ba1ea2a629
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_prop.c
@@ -0,0 +1,122 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+/*
+ * Routines for manipulating property information structures.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/overlay_impl.h>
+
+void
+overlay_prop_init(overlay_prop_handle_t phdl)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss;
+
+ infop->oipi_posssize = sizeof (mac_propval_range_t);
+ bzero(rangep, sizeof (mac_propval_range_t));
+}
+
+void
+overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ (void) strlcpy(infop->oipi_name, name, OVERLAY_PROP_NAMELEN);
+}
+
+void
+overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ infop->oipi_prot = prot;
+}
+
+void
+overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ infop->oipi_type = type;
+}
+
+int
+overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+
+ if (len > OVERLAY_PROP_SIZEMAX)
+ return (E2BIG);
+
+ if (len < 0)
+ return (EOVERFLOW);
+
+ bcopy(def, infop->oipi_default, len);
+ infop->oipi_defsize = (uint32_t)len;
+
+ return (0);
+}
+
+void
+overlay_prop_set_nodefault(overlay_prop_handle_t phdl)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ infop->oipi_default[0] = '\0';
+ infop->oipi_defsize = 0;
+}
+
+void
+overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min,
+ uint32_t max)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss;
+
+ if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32)
+ return;
+
+ if (infop->oipi_posssize + sizeof (mac_propval_uint32_range_t) >
+ sizeof (infop->oipi_poss))
+ return;
+
+ infop->oipi_posssize += sizeof (mac_propval_uint32_range_t);
+ rangep->mpr_count++;
+ rangep->mpr_type = MAC_PROPVAL_UINT32;
+ rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_min = min;
+ rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_max = max;
+}
+
+void
+overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str)
+{
+ size_t len = strlen(str) + 1; /* Account for a null terminator */
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss;
+ mac_propval_str_range_t *pstr = &rangep->u.mpr_str;
+
+ if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR)
+ return;
+
+ if (infop->oipi_posssize + len > sizeof (infop->oipi_poss))
+ return;
+
+ rangep->mpr_count++;
+ rangep->mpr_type = MAC_PROPVAL_STR;
+	(void) strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str,
+ sizeof (infop->oipi_poss) - infop->oipi_posssize);
+ pstr->mpur_nextbyte += len;
+ infop->oipi_posssize += len;
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c
new file mode 100644
index 0000000000..f4147b56d1
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_target.c
@@ -0,0 +1,1651 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay device target cache management
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/ethernet.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/sysmacros.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
+#include <sys/vlan.h>
+#include <sys/crc32.h>
+#include <sys/cred.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+#include <sys/overlay_impl.h>
+#include <sys/sdt.h>
+
+/*
+ * This is a total straw man, but at least it's a prime number. We still need
+ * to do a lot of evaluation of how these target caches should grow and
+ * shrink, as well as how they should respond to memory pressure and
+ * eviction. This just gives us a starting point that'll be 'good enough',
+ * until it's not.
+ */
+#define OVERLAY_HSIZE 823
+
+/*
+ * We use this data structure to keep track of what requests have been actively
+ * allocated to a given instance so we know what to put back on the pending
+ * list.
+ */
+typedef struct overlay_target_hdl {
+ minor_t oth_minor; /* RO */
+ zoneid_t oth_zoneid; /* RO */
+ int oth_oflags; /* RO */
+ list_node_t oth_link; /* overlay_target_lock */
+ kmutex_t oth_lock;
+ list_t oth_outstanding; /* oth_lock */
+} overlay_target_hdl_t;
+
+typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
+typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
+typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
+
+typedef struct overlay_target_ioctl {
+ int oti_cmd; /* ioctl id */
+ boolean_t oti_write; /* ioctl requires FWRITE */
+ boolean_t oti_ncopyout; /* copyout data? */
+ overlay_target_copyin_f oti_copyin; /* copyin func */
+ overlay_target_ioctl_f oti_func; /* function to call */
+	overlay_target_copyout_f oti_copyout;	/* copyout func */
+ size_t oti_size; /* size of user level structure */
+} overlay_target_ioctl_t;
+
+static kmem_cache_t *overlay_target_cache;
+static kmem_cache_t *overlay_entry_cache;
+static id_space_t *overlay_thdl_idspace;
+static void *overlay_thdl_state;
+
+/*
+ * When we support overlay devices in the NGZ, all of these will need to
+ * become zone-aware by plugging into the netstack engine and becoming
+ * per-netstack data.
+ */
+static list_t overlay_thdl_list;
+static kmutex_t overlay_target_lock;
+static kcondvar_t overlay_target_condvar;
+static list_t overlay_target_list;
+static boolean_t overlay_target_excl;
+
+/*
+ * Cap on the amount of data (in bytes) that may be queued on a single hash
+ * table entry while a lookup is outstanding.
+ */
+static int overlay_ent_size = 128 * 1024;
+
+/* ARGSUSED */
+static int
+overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
+{
+ overlay_target_t *ott = buf;
+
+ mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+overlay_target_cache_destructor(void *buf, void *arg)
+{
+ overlay_target_t *ott = buf;
+
+ cv_destroy(&ott->ott_cond);
+ mutex_destroy(&ott->ott_lock);
+}
+
+/* ARGSUSED */
+static int
+overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs)
+{
+ overlay_target_entry_t *ote = buf;
+
+ bzero(ote, sizeof (overlay_target_entry_t));
+ mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+overlay_entry_cache_destructor(void *buf, void *arg)
+{
+ overlay_target_entry_t *ote = buf;
+
+ mutex_destroy(&ote->ote_lock);
+}
+
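+/*
+ * Hash MAC addresses by their CRC-32; for our straw-man table size this
+ * gives an adequate distribution.
+ */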
+static uint64_t
+overlay_mac_hash(const void *v)
+{
+ uint32_t crc;
+ CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
+ return (crc);
+}
+
+static int
+overlay_mac_cmp(const void *a, const void *b)
+{
+ return (bcmp(a, b, ETHERADDRL));
+}
+
+/* ARGSUSED */
+static void
+overlay_target_entry_dtor(void *arg)
+{
+ overlay_target_entry_t *ote = arg;
+
+ ote->ote_flags = 0;
+ bzero(ote->ote_addr, ETHERADDRL);
+ ote->ote_ott = NULL;
+ ote->ote_odd = NULL;
+ freemsgchain(ote->ote_chead);
+ ote->ote_chead = ote->ote_ctail = NULL;
+ ote->ote_mbsize = 0;
+ ote->ote_vtime = 0;
+ kmem_cache_free(overlay_entry_cache, ote);
+}
+
+static int
+overlay_mac_avl(const void *a, const void *b)
+{
+ int i;
+ const overlay_target_entry_t *l, *r;
+ l = a;
+ r = b;
+
+ for (i = 0; i < ETHERADDRL; i++) {
+ if (l->ote_addr[i] > r->ote_addr[i])
+ return (1);
+ else if (l->ote_addr[i] < r->ote_addr[i])
+ return (-1);
+ }
+
+ return (0);
+}
+
+void
+overlay_target_init(void)
+{
+ int ret;
+ ret = ddi_soft_state_init(&overlay_thdl_state,
+ sizeof (overlay_target_hdl_t), 1);
+ VERIFY(ret == 0);
+ overlay_target_cache = kmem_cache_create("overlay_target",
+ sizeof (overlay_target_t), 0, overlay_target_cache_constructor,
+ overlay_target_cache_destructor, NULL, NULL, NULL, 0);
+ overlay_entry_cache = kmem_cache_create("overlay_entry",
+ sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor,
+ overlay_entry_cache_destructor, NULL, NULL, NULL, 0);
+ mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL);
+ list_create(&overlay_target_list, sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_qlink));
+ list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t),
+ offsetof(overlay_target_hdl_t, oth_link));
+ overlay_thdl_idspace = id_space_create("overlay_target_minors",
+ 1, INT32_MAX);
+}
+
+void
+overlay_target_fini(void)
+{
+ id_space_destroy(overlay_thdl_idspace);
+ list_destroy(&overlay_thdl_list);
+ list_destroy(&overlay_target_list);
+ cv_destroy(&overlay_target_condvar);
+ mutex_destroy(&overlay_target_lock);
+ kmem_cache_destroy(overlay_entry_cache);
+ kmem_cache_destroy(overlay_target_cache);
+ ddi_soft_state_fini(&overlay_thdl_state);
+}
+
+void
+overlay_target_free(overlay_dev_t *odd)
+{
+ if (odd->odd_target == NULL)
+ return;
+
+ if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
+ avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
+ overlay_target_entry_t *ote;
+
+		/*
+		 * Our AVL tree and hashtable contain the same elements, so we
+		 * just remove them from the tree; the entries themselves are
+		 * destroyed when we remove them from the hash table (which
+		 * happens through the refhash dtor).
+		 */
+ while ((ote = avl_first(ap)) != NULL)
+ avl_remove(ap, ote);
+
+ avl_destroy(ap);
+ for (ote = refhash_first(rp); ote != NULL;
+ ote = refhash_next(rp, ote)) {
+ refhash_remove(rp, ote);
+ }
+ refhash_destroy(rp);
+ }
+
+ ASSERT(odd->odd_target->ott_ocount == 0);
+ kmem_cache_free(overlay_target_cache, odd->odd_target);
+}
+
+int
+overlay_target_busy()
+{
+ int ret;
+
+ mutex_enter(&overlay_target_lock);
+ ret = !list_is_empty(&overlay_thdl_list);
+ mutex_exit(&overlay_target_lock);
+
+ return (ret);
+}
+
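+/*
+ * Queue an entry for varpd to service. We bump ott_ocount so that a teardown
+ * (overlay_target_quiesce) can wait for all outstanding entries to drain
+ * before the target is freed.
+ */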
+static void
+overlay_target_queue(overlay_target_entry_t *entry)
+{
+ mutex_enter(&overlay_target_lock);
+ mutex_enter(&entry->ote_ott->ott_lock);
+ if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
+ mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_exit(&overlay_target_lock);
+ return;
+ }
+ entry->ote_ott->ott_ocount++;
+ mutex_exit(&entry->ote_ott->ott_lock);
+ list_insert_tail(&overlay_target_list, entry);
+ cv_signal(&overlay_target_condvar);
+ mutex_exit(&overlay_target_lock);
+}
+
+void
+overlay_target_quiesce(overlay_target_t *ott)
+{
+ if (ott == NULL)
+ return;
+ mutex_enter(&ott->ott_lock);
+ ott->ott_flags |= OVERLAY_T_TEARDOWN;
+ while (ott->ott_ocount != 0)
+ cv_wait(&ott->ott_cond, &ott->ott_lock);
+ mutex_exit(&ott->ott_lock);
+}
+
+/*
+ * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
+ * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else
+ * at this time, say for NVGRE, we drop all packets whose destination doesn't
+ * match this. We return OVERLAY_TARGET_OK with the destination filled in,
+ * OVERLAY_TARGET_DROP if the packet should be dropped, or
+ * OVERLAY_TARGET_ASYNC if the lookup has been queued for varpd.
+ */
+int
+overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
+ socklen_t *slenp)
+{
+ int ret;
+ struct sockaddr_in6 *v6;
+ overlay_target_t *ott;
+ mac_header_info_t mhi;
+ overlay_target_entry_t *entry;
+
+ ASSERT(odd->odd_target != NULL);
+
+ /*
+ * At this point, the overlay device is in a mux which means that it's
+ * been activated. At this point, parts of the target, such as the mode
+ * and the destination are now read-only and we don't have to worry
+ * about synchronization for them.
+ */
+ ott = odd->odd_target;
+ if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
+ return (OVERLAY_TARGET_DROP);
+
+ v6 = (struct sockaddr_in6 *)sock;
+ bzero(v6, sizeof (struct sockaddr_in6));
+ v6->sin6_family = AF_INET6;
+
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ mutex_enter(&ott->ott_lock);
+ bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
+ sizeof (struct in6_addr));
+ v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
+ mutex_exit(&ott->ott_lock);
+ *slenp = sizeof (struct sockaddr_in6);
+
+ return (OVERLAY_TARGET_OK);
+ }
+
+ ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
+
+ /*
+ * Note we only want the MAC address here, therefore we won't bother
+ * using mac_vlan_header_info(). If any caller needs the vlan info at
+ * this point, this should change to a call to mac_vlan_header_info().
+ */
+ if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
+ return (OVERLAY_TARGET_DROP);
+ mutex_enter(&ott->ott_lock);
+ entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ mhi.mhi_daddr);
+ if (entry == NULL) {
+ entry = kmem_cache_alloc(overlay_entry_cache,
+ KM_NOSLEEP | KM_NORMALPRI);
+ if (entry == NULL) {
+ mutex_exit(&ott->ott_lock);
+ return (OVERLAY_TARGET_DROP);
+ }
+ bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
+ entry->ote_chead = entry->ote_ctail = mp;
+ entry->ote_mbsize = msgsize(mp);
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+ entry->ote_ott = ott;
+ entry->ote_odd = odd;
+ refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
+ mutex_exit(&ott->ott_lock);
+ overlay_target_queue(entry);
+ return (OVERLAY_TARGET_ASYNC);
+ }
+ refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+ mutex_exit(&ott->ott_lock);
+
+ mutex_enter(&entry->ote_lock);
+ if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
+ sizeof (struct in6_addr));
+ v6->sin6_port = htons(entry->ote_dest.otp_port);
+ *slenp = sizeof (struct sockaddr_in6);
+ ret = OVERLAY_TARGET_OK;
+ } else {
+ size_t mlen = msgsize(mp);
+
+ if (mlen + entry->ote_mbsize > overlay_ent_size) {
+ ret = OVERLAY_TARGET_DROP;
+ } else {
+ if (entry->ote_ctail != NULL) {
+				ASSERT(entry->ote_ctail->b_next == NULL);
+ entry->ote_ctail->b_next = mp;
+ entry->ote_ctail = mp;
+ } else {
+ entry->ote_chead = mp;
+ entry->ote_ctail = mp;
+ }
+ entry->ote_mbsize += mlen;
+ if ((entry->ote_flags &
+ OVERLAY_ENTRY_F_PENDING) == 0) {
+ entry->ote_flags |=
+ OVERLAY_ENTRY_F_PENDING;
+ overlay_target_queue(entry);
+ }
+ ret = OVERLAY_TARGET_ASYNC;
+ }
+ }
+ mutex_exit(&entry->ote_lock);
+
+ mutex_enter(&ott->ott_lock);
+ refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ mutex_exit(&ott->ott_lock);
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_info_t *oti = arg;
+
+ odd = overlay_hold_by_dlid(oti->oti_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ oti->oti_flags = 0;
+ oti->oti_needs = odd->odd_plugin->ovp_dest;
+ if (odd->odd_flags & OVERLAY_F_DEGRADED)
+ oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED;
+ if (odd->odd_flags & OVERLAY_F_ACTIVATED)
+ oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
+ oti->oti_vnetid = odd->odd_vid;
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_targ_associate_t *ota = arg;
+
+ odd = overlay_hold_by_dlid(ota->ota_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ if (ota->ota_id == 0) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+
+ if (ota->ota_mode != OVERLAY_TARGET_POINT &&
+ ota->ota_mode != OVERLAY_TARGET_DYNAMIC) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+
+ if (ota->ota_provides != odd->odd_plugin->ovp_dest) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+
+ if (ota->ota_mode == OVERLAY_TARGET_POINT) {
+ if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) ||
+ IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) ||
+ IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+ }
+
+ if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) {
+ if (ota->ota_point.otp_port == 0) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+ }
+ }
+
+ ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
+ ott->ott_flags = 0;
+ ott->ott_ocount = 0;
+ ott->ott_mode = ota->ota_mode;
+ ott->ott_dest = ota->ota_provides;
+ ott->ott_id = ota->ota_id;
+
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ bcopy(&ota->ota_point, &ott->ott_u.ott_point,
+ sizeof (overlay_target_point_t));
+ } else {
+ ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
+ overlay_mac_hash, overlay_mac_cmp,
+ overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_reflink),
+ offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+ avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_avllink));
+ }
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_VARPD) {
+ mutex_exit(&odd->odd_lock);
+ kmem_cache_free(overlay_target_cache, ott);
+ overlay_hold_rele(odd);
+ return (EEXIST);
+ }
+
+ odd->odd_flags |= OVERLAY_F_VARPD;
+ odd->odd_target = ott;
+ mutex_exit(&odd->odd_lock);
+
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+
+/* ARGSUSED */
+static int
+overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_degrade_t *otd = arg;
+
+ odd = overlay_hold_by_dlid(otd->otd_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ overlay_fm_degrade(odd, otd->otd_buf);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_id_t *otid = arg;
+
+ odd = overlay_hold_by_dlid(otid->otid_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ overlay_fm_restore(odd);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_id_t *otid = arg;
+
+ odd = overlay_hold_by_dlid(otid->otid_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ odd->odd_flags &= ~OVERLAY_F_VARPD;
+ mutex_exit(&odd->odd_lock);
+
+ overlay_hold_rele(odd);
+ return (0);
+}
+
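+/*
+ * Hand the next queued lookup to varpd. The entry is moved from the global
+ * pending list to this handle's outstanding list, where it stays until varpd
+ * responds (OVERLAY_TARG_RESPOND) or drops it (OVERLAY_TARG_DROP). If nothing
+ * is queued within roughly a second, we return ETIME so varpd can poll again.
+ */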
+static int
+overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_lookup_t *otl = arg;
+ overlay_target_entry_t *entry;
+ clock_t ret, timeout;
+ mac_header_info_t mhi;
+
+ timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
+again:
+ mutex_enter(&overlay_target_lock);
+ while (list_is_empty(&overlay_target_list)) {
+ ret = cv_timedwait(&overlay_target_condvar,
+ &overlay_target_lock, timeout);
+ if (ret == -1) {
+ mutex_exit(&overlay_target_lock);
+ return (ETIME);
+ }
+ }
+ entry = list_remove_head(&overlay_target_list);
+ mutex_exit(&overlay_target_lock);
+ mutex_enter(&entry->ote_lock);
+ if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ ASSERT(entry->ote_chead == NULL);
+ mutex_exit(&entry->ote_lock);
+ goto again;
+ }
+ ASSERT(entry->ote_chead != NULL);
+
+	/*
+	 * If we have a bogon that doesn't have a valid MAC header, drop it
+	 * and try again.
+	 */
+ if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
+ &mhi) != 0) {
+ boolean_t queue = B_FALSE;
+ mblk_t *mp = entry->ote_chead;
+ entry->ote_chead = mp->b_next;
+ mp->b_next = NULL;
+ if (entry->ote_ctail == mp)
+ entry->ote_ctail = entry->ote_chead;
+ entry->ote_mbsize -= msgsize(mp);
+ if (entry->ote_chead != NULL)
+ queue = B_TRUE;
+ mutex_exit(&entry->ote_lock);
+ if (queue == B_TRUE)
+ overlay_target_queue(entry);
+ freemsg(mp);
+ goto again;
+ }
+
+ otl->otl_dlid = entry->ote_odd->odd_linkid;
+ otl->otl_reqid = (uintptr_t)entry;
+ otl->otl_varpdid = entry->ote_ott->ott_id;
+ otl->otl_vnetid = entry->ote_odd->odd_vid;
+
+ otl->otl_hdrsize = mhi.mhi_hdrsize;
+ otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
+ bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
+ bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
+ otl->otl_dsttype = mhi.mhi_dsttype;
+ otl->otl_sap = mhi.mhi_bindsap;
+ otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
+ mutex_exit(&entry->ote_lock);
+
+ mutex_enter(&thdl->oth_lock);
+ list_insert_tail(&thdl->oth_outstanding, entry);
+ mutex_exit(&thdl->oth_lock);
+
+ return (0);
+}
+
+static int
+overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
+{
+ const overlay_targ_resp_t *otr = arg;
+ overlay_target_entry_t *entry;
+ mblk_t *mp;
+
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == otr->otr_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ return (EINVAL);
+ }
+ list_remove(&thdl->oth_outstanding, entry);
+ mutex_exit(&thdl->oth_lock);
+
+ mutex_enter(&entry->ote_lock);
+ bcopy(&otr->otr_answer, &entry->ote_dest,
+ sizeof (overlay_target_point_t));
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ mp = entry->ote_chead;
+ entry->ote_chead = NULL;
+ entry->ote_ctail = NULL;
+ entry->ote_mbsize = 0;
+ entry->ote_vtime = gethrtime();
+ mutex_exit(&entry->ote_lock);
+
+ /*
+ * For now do an in-situ drain.
+ */
+ mp = overlay_m_tx(entry->ote_odd, mp);
+ freemsgchain(mp);
+
+ mutex_enter(&entry->ote_ott->ott_lock);
+ entry->ote_ott->ott_ocount--;
+ cv_signal(&entry->ote_ott->ott_cond);
+ mutex_exit(&entry->ote_ott->ott_lock);
+
+ return (0);
+}
+
+static int
+overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
+{
+ const overlay_targ_resp_t *otr = arg;
+ overlay_target_entry_t *entry;
+ mblk_t *mp;
+ boolean_t queue = B_FALSE;
+
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == otr->otr_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ return (EINVAL);
+ }
+ list_remove(&thdl->oth_outstanding, entry);
+ mutex_exit(&thdl->oth_lock);
+
+ mutex_enter(&entry->ote_lock);
+
+ /* Safeguard against a confused varpd */
+ if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ DTRACE_PROBE1(overlay__target__valid__drop,
+ overlay_target_entry_t *, entry);
+ mutex_exit(&entry->ote_lock);
+ goto done;
+ }
+
+ mp = entry->ote_chead;
+ if (mp != NULL) {
+ entry->ote_chead = mp->b_next;
+ mp->b_next = NULL;
+ if (entry->ote_ctail == mp)
+ entry->ote_ctail = entry->ote_chead;
+ entry->ote_mbsize -= msgsize(mp);
+ }
+ if (entry->ote_chead != NULL) {
+ queue = B_TRUE;
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+ } else {
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ }
+ mutex_exit(&entry->ote_lock);
+
+ if (queue == B_TRUE)
+ overlay_target_queue(entry);
+ freemsg(mp);
+
+done:
+ mutex_enter(&entry->ote_ott->ott_lock);
+ entry->ote_ott->ott_ocount--;
+ cv_signal(&entry->ote_ott->ott_cond);
+ mutex_exit(&entry->ote_ott->ott_lock);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize,
+ int flags)
+{
+ overlay_targ_pkt_t *pkt;
+ overlay_targ_pkt32_t *pkt32;
+
+ pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP);
+ *outp = pkt;
+ *bsize = sizeof (overlay_targ_pkt_t);
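+	/*
+	 * For a 32-bit caller the structure contains a 32-bit buffer pointer,
+	 * so copy in the ILP32 form and widen otp_buf by hand.
+	 */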
+ if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
+ uintptr_t addr;
+
+ if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t),
+ flags & FKIOCTL) != 0) {
+ kmem_free(pkt, *bsize);
+ return (EFAULT);
+ }
+ pkt32 = (overlay_targ_pkt32_t *)pkt;
+ addr = pkt32->otp_buf;
+ pkt->otp_buf = (void *)addr;
+ } else {
+ if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) {
+ kmem_free(pkt, *bsize);
+ return (EFAULT);
+ }
+ }
+ return (0);
+}
+
+static int
+overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize,
+ int flags)
+{
+ if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
+ overlay_targ_pkt_t *pkt = buf;
+ overlay_targ_pkt32_t *pkt32 = buf;
+ uintptr_t addr = (uintptr_t)pkt->otp_buf;
+ pkt32->otp_buf = (caddr32_t)addr;
+ if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t),
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+ } else {
+ if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0)
+ return (EFAULT);
+ }
+ return (0);
+}
+
+static int
+overlay_target_packet(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_pkt_t *pkt = arg;
+ overlay_target_entry_t *entry;
+ mblk_t *mp;
+ size_t mlen;
+ size_t boff;
+
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == pkt->otp_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ return (EINVAL);
+ }
+ mutex_enter(&entry->ote_lock);
+ mutex_exit(&thdl->oth_lock);
+ mp = entry->ote_chead;
+ /* Protect against a rogue varpd */
+ if (mp == NULL) {
+ mutex_exit(&entry->ote_lock);
+ return (EINVAL);
+ }
+ mlen = MIN(msgsize(mp), pkt->otp_size);
+ pkt->otp_size = mlen;
+ boff = 0;
+ while (mlen > 0) {
+ size_t wlen = MIN(MBLKL(mp), mlen);
+ if (ddi_copyout(mp->b_rptr,
+ (void *)((uintptr_t)pkt->otp_buf + boff),
+ wlen, 0) != 0) {
+ mutex_exit(&entry->ote_lock);
+ return (EFAULT);
+ }
+ mlen -= wlen;
+ boff += wlen;
+ mp = mp->b_cont;
+ }
+ mutex_exit(&entry->ote_lock);
+ return (0);
+}
+
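+/*
+ * Inject a fully formed frame from varpd into the device's MAC receive path,
+ * as though it had arrived and been decapsulated off the underlay network.
+ */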
+static int
+overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_pkt_t *pkt = arg;
+ overlay_target_entry_t *entry;
+ overlay_dev_t *odd;
+ mblk_t *mp;
+
+ if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+ return (EINVAL);
+
+ mp = allocb(pkt->otp_size, 0);
+ if (mp == NULL)
+ return (ENOMEM);
+
+ if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+ freeb(mp);
+ return (EFAULT);
+ }
+ mp->b_wptr += pkt->otp_size;
+
+ if (pkt->otp_linkid != UINT64_MAX) {
+ odd = overlay_hold_by_dlid(pkt->otp_linkid);
+ if (odd == NULL) {
+ freeb(mp);
+ return (ENOENT);
+ }
+ } else {
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == pkt->otp_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ freeb(mp);
+ return (ENOENT);
+ }
+ odd = entry->ote_odd;
+ mutex_exit(&thdl->oth_lock);
+ }
+
+ mutex_enter(&odd->odd_lock);
+ overlay_io_start(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+
+ mac_rx(odd->odd_mh, NULL, mp);
+
+ mutex_enter(&odd->odd_lock);
+ overlay_io_done(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+
+ return (0);
+}
+
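+/*
+ * Transmit a frame supplied by varpd through the normal overlay transmit
+ * path, so it is encapsulated and sent out the mux like any other packet.
+ */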
+static int
+overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_pkt_t *pkt = arg;
+ overlay_target_entry_t *entry;
+ overlay_dev_t *odd;
+ mblk_t *mp;
+
+ if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+ return (EINVAL);
+
+ mp = allocb(pkt->otp_size, 0);
+ if (mp == NULL)
+ return (ENOMEM);
+
+ if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+ freeb(mp);
+ return (EFAULT);
+ }
+ mp->b_wptr += pkt->otp_size;
+
+ if (pkt->otp_linkid != UINT64_MAX) {
+ odd = overlay_hold_by_dlid(pkt->otp_linkid);
+ if (odd == NULL) {
+ freeb(mp);
+ return (ENOENT);
+ }
+ } else {
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == pkt->otp_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ freeb(mp);
+ return (ENOENT);
+ }
+ odd = entry->ote_odd;
+ mutex_exit(&thdl->oth_lock);
+ }
+
+ mp = overlay_m_tx(odd, mp);
+ freemsgchain(mp);
+
+ return (0);
+}
+
+typedef struct overlay_targ_list_int {
+ boolean_t otli_count;
+ uint32_t otli_cur;
+ uint32_t otli_nents;
+ uint32_t otli_ents[];
+} overlay_targ_list_int_t;
+
+static int
+overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
+ int flags)
+{
+ overlay_targ_list_t n;
+ overlay_targ_list_int_t *otl;
+
+ if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+
+	/*
+	 * Bound the entry count so that userland can't ask us to allocate an
+	 * absurd amount of memory.
+	 */
+ if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
+ return (EINVAL);
+ *bsize = sizeof (overlay_targ_list_int_t) +
+ sizeof (uint32_t) * n.otl_nents;
+ otl = kmem_zalloc(*bsize, KM_SLEEP);
+ otl->otli_cur = 0;
+ otl->otli_nents = n.otl_nents;
+ if (otl->otli_nents != 0) {
+ otl->otli_count = B_FALSE;
+ if (ddi_copyin((void *)((uintptr_t)ubuf +
+ offsetof(overlay_targ_list_t, otl_ents)),
+ otl->otli_ents, n.otl_nents * sizeof (uint32_t),
+ flags & FKIOCTL) != 0) {
+ kmem_free(otl, *bsize);
+ return (EFAULT);
+ }
+ } else {
+ otl->otli_count = B_TRUE;
+ }
+
+ *outp = otl;
+ return (0);
+}
+
+static int
+overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg)
+{
+ overlay_targ_list_int_t *otl = arg;
+
+ if (otl->otli_cur < otl->otli_nents)
+ otl->otli_ents[otl->otli_cur] = odd->odd_linkid;
+ otl->otli_cur++;
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_iter(overlay_target_ioctl_list_cb, arg);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags)
+{
+ overlay_targ_list_int_t *otl = buf;
+
+ if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t),
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (otl->otli_count == B_FALSE) {
+ if (ddi_copyout(otl->otli_ents,
+ (void *)((uintptr_t)ubuf +
+ offsetof(overlay_targ_list_t, otl_ents)),
+ sizeof (uint32_t) * otl->otli_nents,
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
+{
+ int ret = 0;
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_targ_cache_t *otc = arg;
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_POINT &&
+ ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ otc->otc_entry.otce_flags = 0;
+ bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest,
+ sizeof (overlay_target_point_t));
+ } else {
+ overlay_target_entry_t *ote;
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+ if (ote != NULL) {
+ mutex_enter(&ote->ote_lock);
+ if ((ote->ote_flags &
+ OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+ if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_DROP;
+ } else {
+ otc->otc_entry.otce_flags = 0;
+ bcopy(&ote->ote_dest,
+ &otc->otc_entry.otce_dest,
+ sizeof (overlay_target_point_t));
+ }
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+ mutex_exit(&ote->ote_lock);
+ } else {
+ ret = ENOENT;
+ }
+ }
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t *ote;
+ overlay_targ_cache_t *otc = arg;
+ mblk_t *mp = NULL;
+
+ if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+ return (EINVAL);
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+ if (ote == NULL) {
+ ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
+ bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
+ ote->ote_chead = ote->ote_ctail = NULL;
+ ote->ote_mbsize = 0;
+ ote->ote_ott = ott;
+ ote->ote_odd = odd;
+ mutex_enter(&ote->ote_lock);
+ refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
+ } else {
+ mutex_enter(&ote->ote_lock);
+ }
+
+ if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
+ ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
+ } else {
+ ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
+ sizeof (overlay_target_point_t));
+ mp = ote->ote_chead;
+ ote->ote_chead = NULL;
+ ote->ote_ctail = NULL;
+ ote->ote_mbsize = 0;
+ ote->ote_vtime = gethrtime();
+ }
+
+ mutex_exit(&ote->ote_lock);
+ mutex_exit(&ott->ott_lock);
+
+ if (mp != NULL) {
+ mp = overlay_m_tx(ote->ote_odd, mp);
+ freemsgchain(mp);
+ }
+
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
+{
+ int ret = 0;
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t *ote;
+ overlay_targ_cache_t *otc = arg;
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+ if (ote != NULL) {
+ mutex_enter(&ote->ote_lock);
+ ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+ mutex_exit(&ote->ote_lock);
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
+{
+ avl_tree_t *avl;
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t *ote;
+ overlay_targ_cache_t *otc = arg;
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+ avl = &ott->ott_u.ott_dyn.ott_tree;
+
+ for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
+ mutex_enter(&ote->ote_lock);
+ ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+ mutex_exit(&ote->ote_lock);
+ }
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+static int
+overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
+ int flags)
+{
+ overlay_targ_cache_iter_t base, *iter;
+
+ if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t),
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (base.otci_count > OVERLAY_TARGET_ITER_MAX)
+ return (E2BIG);
+
+ if (base.otci_count == 0)
+ return (EINVAL);
+
+ *bsize = sizeof (overlay_targ_cache_iter_t) +
+ base.otci_count * sizeof (overlay_targ_cache_entry_t);
+ iter = kmem_alloc(*bsize, KM_SLEEP);
+ bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t));
+ *outp = iter;
+
+ return (0);
+}
+
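+/*
+ * The marker records the last MAC address returned so that iteration can
+ * resume where it left off across ioctl calls; otcm_done indicates that the
+ * walk has finished and subsequent calls should return zero entries.
+ */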
+typedef struct overlay_targ_cache_marker {
+ uint8_t otcm_mac[ETHERADDRL];
+ uint16_t otcm_done;
+} overlay_targ_cache_marker_t;
+
+/* ARGSUSED */
+static int
+overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t lookup, *ent;
+ overlay_targ_cache_marker_t *mark;
+ avl_index_t where;
+ avl_tree_t *avl;
+ uint16_t written = 0;
+
+ overlay_targ_cache_iter_t *iter = arg;
+ mark = (void *)&iter->otci_marker;
+
+ if (mark->otcm_done != 0) {
+ iter->otci_count = 0;
+ return (0);
+ }
+
+ odd = overlay_hold_by_dlid(iter->otci_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC &&
+ ott->ott_mode != OVERLAY_TARGET_POINT) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+
+	/*
+	 * Holding this lock across the entire iteration probably isn't very
+	 * good. We should perhaps add an r/w lock for the AVL tree, but we'll
+	 * wait until we know it's necessary before we do more.
+	 */
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
+ bzero(out->otce_mac, ETHERADDRL);
+ out->otce_flags = 0;
+ bcopy(&ott->ott_u.ott_point, &out->otce_dest,
+ sizeof (overlay_target_point_t));
+ written++;
+ mark->otcm_done = 1;
+ }
+
+ avl = &ott->ott_u.ott_dyn.ott_tree;
+ bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
+ ent = avl_find(avl, &lookup, &where);
+
+	/*
+	 * A NULL ent means that the entry does not exist, so we want to start
+	 * with the closest node in the tree. This means that we implicitly
+	 * rely on the tree's order: the first node will be the MAC
+	 * 00:00:00:00:00:00 and the last will be ff:ff:ff:ff:ff:ff.
+	 */
+ if (ent == NULL) {
+ ent = avl_nearest(avl, where, AVL_AFTER);
+ if (ent == NULL) {
+ mark->otcm_done = 1;
+ goto done;
+ }
+ }
+
+ for (; ent != NULL && written < iter->otci_count;
+ ent = AVL_NEXT(avl, ent)) {
+ overlay_targ_cache_entry_t *out = &iter->otci_ents[written];
+ mutex_enter(&ent->ote_lock);
+ if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) {
+ mutex_exit(&ent->ote_lock);
+ continue;
+ }
+ bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
+ out->otce_flags = 0;
+ if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
+ out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
+ if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
+ bcopy(&ent->ote_dest, &out->otce_dest,
+ sizeof (overlay_target_point_t));
+ written++;
+ mutex_exit(&ent->ote_lock);
+ }
+
+ if (ent != NULL) {
+ bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
+ } else {
+ mark->otcm_done = 1;
+ }
+
+done:
+ iter->otci_count = written;
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
+ int flags)
+{
+ size_t outsize;
+ const overlay_targ_cache_iter_t *iter = buf;
+
+ outsize = sizeof (overlay_targ_cache_iter_t) +
+ iter->otci_count * sizeof (overlay_targ_cache_entry_t);
+
+ if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
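+/*
+ * Dispatch table for the target ioctls. For each command we record whether it
+ * requires FWRITE, whether results are copied back out, and optional custom
+ * copyin/copyout routines; when those are NULL, overlay_target_ioctl() does a
+ * flat copy of oti_size bytes.
+ */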
+static overlay_target_ioctl_t overlay_target_ioctab[] = {
+ { OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
+ NULL, overlay_target_info,
+ NULL, sizeof (overlay_targ_info_t) },
+ { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE,
+ NULL, overlay_target_associate,
+ NULL, sizeof (overlay_targ_associate_t) },
+ { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE,
+ NULL, overlay_target_disassociate,
+ NULL, sizeof (overlay_targ_id_t) },
+ { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE,
+ NULL, overlay_target_degrade,
+ NULL, sizeof (overlay_targ_degrade_t) },
+ { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE,
+ NULL, overlay_target_restore,
+ NULL, sizeof (overlay_targ_id_t) },
+ { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE,
+ NULL, overlay_target_lookup_request,
+ NULL, sizeof (overlay_targ_lookup_t) },
+ { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE,
+ NULL, overlay_target_lookup_respond,
+ NULL, sizeof (overlay_targ_resp_t) },
+ { OVERLAY_TARG_DROP, B_TRUE, B_FALSE,
+ NULL, overlay_target_lookup_drop,
+ NULL, sizeof (overlay_targ_resp_t) },
+ { OVERLAY_TARG_PKT, B_TRUE, B_TRUE,
+ overlay_target_pkt_copyin,
+ overlay_target_packet,
+ overlay_target_pkt_copyout,
+ sizeof (overlay_targ_pkt_t) },
+ { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE,
+ overlay_target_pkt_copyin,
+ overlay_target_inject,
+ NULL, sizeof (overlay_targ_pkt_t) },
+ { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE,
+ overlay_target_pkt_copyin,
+ overlay_target_resend,
+ NULL, sizeof (overlay_targ_pkt_t) },
+ { OVERLAY_TARG_LIST, B_FALSE, B_TRUE,
+ overlay_target_list_copyin,
+ overlay_target_ioctl_list,
+ overlay_target_list_copyout,
+ sizeof (overlay_targ_list_t) },
+ { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE,
+ NULL, overlay_target_cache_get,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_set,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_remove,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_flush,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE,
+ overlay_target_cache_iter_copyin,
+ overlay_target_cache_iter,
+ overlay_target_cache_iter_copyout,
+ sizeof (overlay_targ_cache_iter_t) },
+ { 0 }
+};
+
+int
+overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp)
+{
+ minor_t mid;
+ overlay_target_hdl_t *thdl;
+
+ if (secpolicy_dl_config(credp) != 0)
+ return (EPERM);
+
+ if (getminor(*devp) != 0)
+ return (ENXIO);
+
+ if (otype & OTYP_BLK)
+ return (EINVAL);
+
+ if (flags & ~(FREAD | FWRITE | FEXCL))
+ return (EINVAL);
+
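+	/*
+	 * Writers must open the device exclusively, so at most one consumer
+	 * (in practice, varpd) can drive lookups at a time; read-only opens
+	 * may proceed concurrently.
+	 */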
+ if ((flags & FWRITE) &&
+ !(flags & FEXCL))
+ return (EINVAL);
+
+ if (!(flags & FREAD) && !(flags & FWRITE))
+ return (EINVAL);
+
+ if (crgetzoneid(credp) != GLOBAL_ZONEID)
+ return (EPERM);
+
+ mid = id_alloc(overlay_thdl_idspace);
+ if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) {
+ id_free(overlay_thdl_idspace, mid);
+ return (ENXIO);
+ }
+
+ thdl = ddi_get_soft_state(overlay_thdl_state, mid);
+ VERIFY(thdl != NULL);
+ thdl->oth_minor = mid;
+ thdl->oth_zoneid = crgetzoneid(credp);
+ thdl->oth_oflags = flags;
+ mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_qlink));
+ *devp = makedevice(getmajor(*devp), mid);
+
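+	/*
+	 * An FEXCL open grants exclusive write access to the device; at most
+	 * one such handle may be open at any time.
+	 */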
+ mutex_enter(&overlay_target_lock);
+ if ((flags & FEXCL) && overlay_target_excl == B_TRUE) {
+ mutex_exit(&overlay_target_lock);
+ list_destroy(&thdl->oth_outstanding);
+ mutex_destroy(&thdl->oth_lock);
+ ddi_soft_state_free(overlay_thdl_state, mid);
+ id_free(overlay_thdl_idspace, mid);
+ return (EEXIST);
+ } else if ((flags & FEXCL) != 0) {
+ VERIFY(overlay_target_excl == B_FALSE);
+ overlay_target_excl = B_TRUE;
+ }
+ list_insert_tail(&overlay_thdl_list, thdl);
+ mutex_exit(&overlay_target_lock);
+
+ return (0);
+}
+
+/* ARGSUSED */
+int
+overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rvalp)
+{
+ overlay_target_ioctl_t *ioc;
+ overlay_target_hdl_t *thdl;
+
+ if (secpolicy_dl_config(credp) != 0)
+ return (EPERM);
+
+ if ((thdl = ddi_get_soft_state(overlay_thdl_state,
+ getminor(dev))) == NULL)
+ return (ENXIO);
+
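+	/*
+	 * Dispatch via the table above: verify the access mode, copy the
+	 * argument in, invoke the handler, and copy any results back out.
+	 */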
+ for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) {
+ int ret;
+ caddr_t buf;
+ size_t bufsize;
+
+ if (ioc->oti_cmd != cmd)
+ continue;
+
+ if (ioc->oti_write == B_TRUE && !(mode & FWRITE))
+ return (EBADF);
+
+ if (ioc->oti_copyin == NULL) {
+ bufsize = ioc->oti_size;
+ buf = kmem_alloc(bufsize, KM_SLEEP);
+ if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize,
+ mode & FKIOCTL) != 0) {
+ kmem_free(buf, bufsize);
+ return (EFAULT);
+ }
+ } else {
+ if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg,
+ (void **)&buf, &bufsize, mode)) != 0)
+ return (ret);
+ }
+
+ ret = ioc->oti_func(thdl, buf);
+ if (ret == 0 && ioc->oti_size != 0 &&
+ ioc->oti_ncopyout == B_TRUE) {
+ if (ioc->oti_copyout == NULL) {
+ if (ddi_copyout(buf, (void *)(uintptr_t)arg,
+ bufsize, mode & FKIOCTL) != 0)
+ ret = EFAULT;
+ } else {
+ ret = ioc->oti_copyout((void *)(uintptr_t)arg,
+ buf, bufsize, mode);
+ }
+ }
+
+ kmem_free(buf, bufsize);
+ return (ret);
+ }
+
+ return (ENOTTY);
+}
+
+/* ARGSUSED */
+int
+overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp)
+{
+ overlay_target_hdl_t *thdl;
+ overlay_target_entry_t *entry;
+ minor_t mid = getminor(dev);
+
+ if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL)
+ return (ENXIO);
+
+ mutex_enter(&overlay_target_lock);
+ list_remove(&overlay_thdl_list, thdl);
+ mutex_enter(&thdl->oth_lock);
+ while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL)
+ list_insert_tail(&overlay_target_list, entry);
+ cv_signal(&overlay_target_condvar);
+ mutex_exit(&thdl->oth_lock);
+ if ((thdl->oth_oflags & FEXCL) != 0) {
+ VERIFY(overlay_target_excl == B_TRUE);
+ overlay_target_excl = B_FALSE;
+ }
+ mutex_exit(&overlay_target_lock);
+
+ list_destroy(&thdl->oth_outstanding);
+ mutex_destroy(&thdl->oth_lock);
+ mid = thdl->oth_minor;
+ ddi_soft_state_free(overlay_thdl_state, mid);
+ id_free(overlay_thdl_idspace, mid);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
new file mode 100644
index 0000000000..92144b3985
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
@@ -0,0 +1,394 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * VXLAN encapsulation module
+ *
+ *
+ * The VXLAN header looks as follows in network byte order:
+ *
+ * |0 3| 4 |5 31|
+ * +----------+---+------------------------+
+ * | Reserved | I | Reserved |
+ * +---------------------------------------+
+ * | Virtual Network ID | Reserved |
+ * +----------------------------+----------+
+ * |0 23|24 31|
+ *
+ * All reserved fields must be 0 and the I bit must be 1. For the time
+ * being, we call the top word the VXLAN magic field. The layout of the
+ * second word is not the friendliest to operate on: the ID is a 24-bit
+ * big endian value, and we have to take care never to touch the low
+ * reserved byte.
+ *
+ * For us, VXLAN encapsulation is a fairly straightforward implementation. It
+ * only has two properties, a listen_ip and a listen_port, which determine the
+ * address and port on which we listen. While we do not have a default address
+ * to listen on, we do have a default port, which is the IANA-assigned port for
+ * VXLAN -- 4789.
+ */
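+
+/*
+ * To illustrate the encoding (as done by the encap/decap routines below):
+ * virtual network ID 0x123456 is placed in the second word as
+ * htonl(0x123456 << VXLAN_ID_SHIFT), i.e. the bytes 0x12 0x34 0x56 0x00 on
+ * the wire, which keeps the low reserved byte clear.
+ */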
+
+#include <sys/overlay_plugin.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/byteorder.h>
+#include <sys/vxlan.h>
+#include <inet/ip.h>
+#include <netinet/in.h>
+#include <sys/strsun.h>
+#include <netinet/udp.h>
+
+static const char *vxlan_ident = "vxlan";
+static uint16_t vxlan_defport = IPPORT_VXLAN;
+
+/*
+ * Controls whether we enable UDP source port hashing for fanout.
+ */
+boolean_t vxlan_fanout = B_TRUE;
+
+/*
+ * This represents the size in bytes that we want to allocate when allocating a
+ * vxlan header block. This is sized so that lower levels can try to use the
+ * message block that we allocate for the IP and UDP headers as well. The hope
+ * is that this is enough space even if the packet is itself tunneled.
+ *
+ * The vxlan_noalloc_min value represents the minimum amount of headroom that
+ * must already be present before we consider not allocating a new message
+ * block and instead passing the existing one down the stack. This number
+ * assumes that we have a VLAN tag, so an 18 byte Ethernet header, plus a 20
+ * byte IP header, an 8 byte UDP header, and an 8 byte VXLAN header
+ * (18 + 20 + 8 + 8 = 54).
+ */
+uint_t vxlan_alloc_size = 128;
+uint_t vxlan_noalloc_min = 54;
+
+static const char *vxlan_props[] = {
+ "vxlan/listen_ip",
+ "vxlan/listen_port",
+ NULL
+};
+
+typedef struct vxlan {
+ kmutex_t vxl_lock;
+ overlay_handle_t vxl_oh;
+ uint16_t vxl_lport;
+ boolean_t vxl_hladdr;
+ struct in6_addr vxl_laddr;
+} vxlan_t;
+
+static int
+vxlan_o_init(overlay_handle_t oh, void **outp)
+{
+ vxlan_t *vxl;
+
+ vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP);
+ *outp = vxl;
+ mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL);
+ vxl->vxl_oh = oh;
+ vxl->vxl_lport = vxlan_defport;
+ vxl->vxl_hladdr = B_FALSE;
+
+ return (0);
+}
+
+static void
+vxlan_o_fini(void *arg)
+{
+ vxlan_t *vxl = arg;
+
+ mutex_destroy(&vxl->vxl_lock);
+ kmem_free(arg, sizeof (vxlan_t));
+}
+
+static int
+vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr,
+ socklen_t *slenp)
+{
+ vxlan_t *vxl = arg;
+ struct sockaddr_in6 *in;
+
+ in = (struct sockaddr_in6 *)addr;
+ *dp = AF_INET6;
+ *fp = SOCK_DGRAM;
+ *pp = 0;
+ bzero(in, sizeof (struct sockaddr_in6));
+ in->sin6_family = AF_INET6;
+
+ /*
+	 * We should consider a more expressive private errno set that
+	 * providers can use.
+ */
+ mutex_enter(&vxl->vxl_lock);
+ if (vxl->vxl_hladdr == B_FALSE) {
+ mutex_exit(&vxl->vxl_lock);
+ return (EINVAL);
+ }
+ in->sin6_port = htons(vxl->vxl_lport);
+ in->sin6_addr = vxl->vxl_laddr;
+ mutex_exit(&vxl->vxl_lock);
+ *slenp = sizeof (struct sockaddr_in6);
+
+ return (0);
+}
+
+static int
+vxlan_o_sockopt(ksocket_t ksock)
+{
+ int val, err;
+ if (vxlan_fanout == B_FALSE)
+ return (0);
+
+ val = UDP_HASH_VXLAN;
+ err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val,
+ sizeof (val), kcred);
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop,
+ mblk_t **outp)
+{
+ mblk_t *ob;
+ vxlan_hdr_t *vxh;
+
+ ASSERT(einfop->ovdi_id < (1 << 24));
+
+ if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
+ /*
+ * This allocation could get hot. We may want to have a good
+ * way to cache and handle this allocation the same way that IP
+ * does with keeping around a message block per entry, or
+ * basically treating this as an immutable message block in the
+ * system. Basically freemsg() will be a nop, but we'll do the
+ * right thing with respect to the rest of the chain.
+ */
+ ob = allocb(vxlan_alloc_size, 0);
+ if (ob == NULL)
+ return (ENOMEM);
+
+ ob->b_wptr = DB_LIM(ob);
+ ob->b_rptr = ob->b_wptr;
+ ob->b_cont = mp;
+ } else {
+ ob = mp;
+ }
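+
+	/* Make room for the VXLAN header directly in front of the payload. */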
+ ob->b_rptr -= VXLAN_HDR_LEN;
+
+ vxh = (vxlan_hdr_t *)ob->b_rptr;
+	vxh->vxlan_flags = htonl(VXLAN_F_VDI);
+ vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT);
+ *outp = ob;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop)
+{
+ vxlan_hdr_t *vxh;
+
+ if (MBLKL(mp) < sizeof (vxlan_hdr_t))
+ return (EINVAL);
+ vxh = (vxlan_hdr_t *)mp->b_rptr;
+ if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0)
+ return (EINVAL);
+
+ dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT;
+ dinfop->ovdi_hdr_size = VXLAN_HDR_LEN;
+
+ return (0);
+}
+
+static int
+vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize)
+{
+ vxlan_t *vxl = arg;
+
+ /* vxlan/listen_ip */
+ if (strcmp(pr_name, vxlan_props[0]) == 0) {
+ if (*bufsize < sizeof (struct in6_addr))
+ return (EOVERFLOW);
+
+ mutex_enter(&vxl->vxl_lock);
+ if (vxl->vxl_hladdr == B_FALSE) {
+ *bufsize = 0;
+ } else {
+ bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr));
+ *bufsize = sizeof (struct in6_addr);
+ }
+ mutex_exit(&vxl->vxl_lock);
+ return (0);
+ }
+
+ /* vxlan/listen_port */
+ if (strcmp(pr_name, vxlan_props[1]) == 0) {
+ uint64_t val;
+ if (*bufsize < sizeof (uint64_t))
+ return (EOVERFLOW);
+
+ mutex_enter(&vxl->vxl_lock);
+ val = vxl->vxl_lport;
+ bcopy(&val, buf, sizeof (uint64_t));
+ *bufsize = sizeof (uint64_t);
+ mutex_exit(&vxl->vxl_lock);
+ return (0);
+ }
+
+ return (EINVAL);
+}
+
+static int
+vxlan_o_setprop(void *arg, const char *pr_name, const void *buf,
+ uint32_t bufsize)
+{
+ vxlan_t *vxl = arg;
+
+ /* vxlan/listen_ip */
+ if (strcmp(pr_name, vxlan_props[0]) == 0) {
+ const struct in6_addr *ipv6 = buf;
+ if (bufsize != sizeof (struct in6_addr))
+ return (EINVAL);
+
+ if (IN6_IS_ADDR_V4COMPAT(ipv6))
+ return (EINVAL);
+
+ if (IN6_IS_ADDR_MULTICAST(ipv6))
+ return (EINVAL);
+
+ if (IN6_IS_ADDR_6TO4(ipv6))
+ return (EINVAL);
+
+ if (IN6_IS_ADDR_V4MAPPED(ipv6)) {
+ ipaddr_t v4;
+ IN6_V4MAPPED_TO_IPADDR(ipv6, v4);
+ if (IN_MULTICAST(v4))
+ return (EINVAL);
+ }
+
+ mutex_enter(&vxl->vxl_lock);
+ vxl->vxl_hladdr = B_TRUE;
+ bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr));
+ mutex_exit(&vxl->vxl_lock);
+
+ return (0);
+ }
+
+ /* vxlan/listen_port */
+ if (strcmp(pr_name, vxlan_props[1]) == 0) {
+ const uint64_t *valp = buf;
+		if (bufsize != sizeof (uint64_t))
+ return (EINVAL);
+
+ if (*valp == 0 || *valp > UINT16_MAX)
+ return (EINVAL);
+
+ mutex_enter(&vxl->vxl_lock);
+ vxl->vxl_lport = *valp;
+ mutex_exit(&vxl->vxl_lock);
+ return (0);
+ }
+ return (EINVAL);
+}
+
+static int
+vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl)
+{
+ /* vxlan/listen_ip */
+ if (strcmp(pr_name, vxlan_props[0]) == 0) {
+ overlay_prop_set_name(phdl, vxlan_props[0]);
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP);
+ overlay_prop_set_nodefault(phdl);
+ return (0);
+ }
+
+ if (strcmp(pr_name, vxlan_props[1]) == 0) {
+ overlay_prop_set_name(phdl, vxlan_props[1]);
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ (void) overlay_prop_set_default(phdl, &vxlan_defport,
+ sizeof (vxlan_defport));
+ overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX);
+ return (0);
+ }
+
+ return (EINVAL);
+}
+
+static struct overlay_plugin_ops vxlan_o_ops = {
+ 0,
+ vxlan_o_init,
+ vxlan_o_fini,
+ vxlan_o_encap,
+ vxlan_o_decap,
+ vxlan_o_socket,
+ vxlan_o_sockopt,
+ vxlan_o_getprop,
+ vxlan_o_setprop,
+ vxlan_o_propinfo
+};
+
+static struct modlmisc vxlan_modlmisc = {
+ &mod_miscops,
+ "VXLAN encap plugin"
+};
+
+static struct modlinkage vxlan_modlinkage = {
+ MODREV_1,
+ &vxlan_modlmisc
+};
+
+int
+_init(void)
+{
+ int err;
+ overlay_plugin_register_t *ovrp;
+
+ ovrp = overlay_plugin_alloc(OVEP_VERSION);
+ if (ovrp == NULL)
+ return (ENOTSUP);
+ ovrp->ovep_name = vxlan_ident;
+ ovrp->ovep_ops = &vxlan_o_ops;
+ ovrp->ovep_id_size = VXLAN_ID_LEN;
+ ovrp->ovep_flags = OVEP_F_VLAN_TAG;
+ ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;
+ ovrp->ovep_props = vxlan_props;
+
+ if ((err = overlay_plugin_register(ovrp)) == 0) {
+ if ((err = mod_install(&vxlan_modlinkage)) != 0) {
+ (void) overlay_plugin_unregister(vxlan_ident);
+ }
+ }
+
+ overlay_plugin_free(ovrp);
+ return (err);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&vxlan_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int err;
+
+ if ((err = overlay_plugin_unregister(vxlan_ident)) != 0)
+ return (err);
+
+ return (mod_remove(&vxlan_modlinkage));
+}
diff --git a/usr/src/uts/common/io/pciex/pcie.c b/usr/src/uts/common/io/pciex/pcie.c
index 4ea5cd9778..b06e750888 100644
--- a/usr/src/uts/common/io/pciex/pcie.c
+++ b/usr/src/uts/common/io/pciex/pcie.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2019, Joyent, Inc.
*/
#include <sys/sysmacros.h>
@@ -684,6 +684,7 @@ pcie_init_pfd(dev_info_t *dip)
pfd_p->pe_bus_p = bus_p;
pfd_p->pe_severity_flags = 0;
+ pfd_p->pe_severity_mask = 0;
pfd_p->pe_orig_severity_flags = 0;
pfd_p->pe_lock = B_FALSE;
pfd_p->pe_valid = B_FALSE;
@@ -840,6 +841,7 @@ pcie_rc_init_pfd(dev_info_t *dip, pf_data_t *pfd_p)
{
pfd_p->pe_bus_p = PCIE_DIP2DOWNBUS(dip);
pfd_p->pe_severity_flags = 0;
+ pfd_p->pe_severity_mask = 0;
pfd_p->pe_orig_severity_flags = 0;
pfd_p->pe_lock = B_FALSE;
pfd_p->pe_valid = B_FALSE;
@@ -921,7 +923,7 @@ pcie_rc_init_bus(dev_info_t *dip)
bus_p->bus_aer_off = (uint16_t)-1;
/* Needed only for handle lookup */
- bus_p->bus_fm_flags |= PF_FM_READY;
+ atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_READY);
ndi_set_bus_private(dip, B_FALSE, DEVI_PORT_TYPE_PCI, bus_p);
@@ -938,6 +940,180 @@ pcie_rc_fini_bus(dev_info_t *dip)
}
/*
+ * We need to capture the supported, maximum, and current device speed and
+ * width. The way that this has been done has changed over time.
+ *
+ * Prior to PCIe Gen 3, there were only current and supported speed fields.
+ * These were found in the link status and link capabilities registers of the
+ * PCI express capability. With the change to PCIe Gen 3, the information in the
+ * link capabilities changed to the maximum value. The supported speeds vector
+ * was moved to the link capabilities 2 register.
+ *
+ * Now, a device may not implement some of these registers. To determine
+ * whether a register is present, we first need to check the revision of the
+ * PCI express capability: the link capabilities 2 register did not exist
+ * prior to version 2 of that capability.
+ */
+static void
+pcie_capture_speeds(pcie_bus_t *bus_p, pcie_req_id_t bdf, dev_info_t *rcdip)
+{
+ uint16_t vers, status;
+ uint32_t val, cap, cap2;
+
+ if (!PCIE_IS_PCIE(bus_p))
+ return;
+
+ vers = pci_cfgacc_get16(rcdip, bdf, bus_p->bus_pcie_off + PCIE_PCIECAP);
+ if (vers == PCI_EINVAL16)
+ return;
+ vers &= PCIE_PCIECAP_VER_MASK;
+
+ /*
+ * Verify the capability's version.
+ */
+ switch (vers) {
+ case PCIE_PCIECAP_VER_1_0:
+ cap2 = 0;
+ break;
+ case PCIE_PCIECAP_VER_2_0:
+ cap2 = pci_cfgacc_get32(rcdip, bdf, bus_p->bus_pcie_off +
+ PCIE_LINKCAP2);
+ if (cap2 == PCI_EINVAL32)
+ cap2 = 0;
+ break;
+ default:
+ /* Don't try and handle an unknown version */
+ return;
+ }
+
+ status = pci_cfgacc_get16(rcdip, bdf, bus_p->bus_pcie_off +
+ PCIE_LINKSTS);
+ cap = pci_cfgacc_get32(rcdip, bdf, bus_p->bus_pcie_off + PCIE_LINKCAP);
+ if (status == PCI_EINVAL16 || cap == PCI_EINVAL32)
+ return;
+
+ switch (status & PCIE_LINKSTS_SPEED_MASK) {
+ case PCIE_LINKSTS_SPEED_2_5:
+ bus_p->bus_cur_speed = PCIE_LINK_SPEED_2_5;
+ break;
+ case PCIE_LINKSTS_SPEED_5:
+ bus_p->bus_cur_speed = PCIE_LINK_SPEED_5;
+ break;
+ case PCIE_LINKSTS_SPEED_8:
+ bus_p->bus_cur_speed = PCIE_LINK_SPEED_8;
+ break;
+ case PCIE_LINKSTS_SPEED_16:
+ bus_p->bus_cur_speed = PCIE_LINK_SPEED_16;
+ break;
+ default:
+ bus_p->bus_cur_speed = PCIE_LINK_SPEED_UNKNOWN;
+ break;
+ }
+
+ switch (status & PCIE_LINKSTS_NEG_WIDTH_MASK) {
+ case PCIE_LINKSTS_NEG_WIDTH_X1:
+ bus_p->bus_cur_width = PCIE_LINK_WIDTH_X1;
+ break;
+ case PCIE_LINKSTS_NEG_WIDTH_X2:
+ bus_p->bus_cur_width = PCIE_LINK_WIDTH_X2;
+ break;
+ case PCIE_LINKSTS_NEG_WIDTH_X4:
+ bus_p->bus_cur_width = PCIE_LINK_WIDTH_X4;
+ break;
+ case PCIE_LINKSTS_NEG_WIDTH_X8:
+ bus_p->bus_cur_width = PCIE_LINK_WIDTH_X8;
+ break;
+ case PCIE_LINKSTS_NEG_WIDTH_X12:
+ bus_p->bus_cur_width = PCIE_LINK_WIDTH_X12;
+ break;
+ case PCIE_LINKSTS_NEG_WIDTH_X16:
+ bus_p->bus_cur_width = PCIE_LINK_WIDTH_X16;
+ break;
+ case PCIE_LINKSTS_NEG_WIDTH_X32:
+ bus_p->bus_cur_width = PCIE_LINK_WIDTH_X32;
+ break;
+ default:
+ bus_p->bus_cur_width = PCIE_LINK_WIDTH_UNKNOWN;
+ break;
+ }
+
+ switch (cap & PCIE_LINKCAP_MAX_WIDTH_MASK) {
+ case PCIE_LINKCAP_MAX_WIDTH_X1:
+ bus_p->bus_max_width = PCIE_LINK_WIDTH_X1;
+ break;
+ case PCIE_LINKCAP_MAX_WIDTH_X2:
+ bus_p->bus_max_width = PCIE_LINK_WIDTH_X2;
+ break;
+ case PCIE_LINKCAP_MAX_WIDTH_X4:
+ bus_p->bus_max_width = PCIE_LINK_WIDTH_X4;
+ break;
+ case PCIE_LINKCAP_MAX_WIDTH_X8:
+ bus_p->bus_max_width = PCIE_LINK_WIDTH_X8;
+ break;
+ case PCIE_LINKCAP_MAX_WIDTH_X12:
+ bus_p->bus_max_width = PCIE_LINK_WIDTH_X12;
+ break;
+ case PCIE_LINKCAP_MAX_WIDTH_X16:
+ bus_p->bus_max_width = PCIE_LINK_WIDTH_X16;
+ break;
+ case PCIE_LINKCAP_MAX_WIDTH_X32:
+ bus_p->bus_max_width = PCIE_LINK_WIDTH_X32;
+ break;
+ default:
+ bus_p->bus_max_width = PCIE_LINK_WIDTH_UNKNOWN;
+ break;
+ }
+
+ /*
+ * If we have the Link Capabilities 2, then we can get the supported
+ * speeds from it and treat the bits in Link Capabilities 1 as the
+ * maximum. If we don't, then we need to follow the Implementation Note
+ * in the standard under Link Capabilities 2. Effectively, this means
+ * that if the value 10b is set in the Link Capabilities register, the
+ * device supports both 2.5 and 5 GT/s speeds.
+ */
+ if (cap2 != 0) {
+ if (cap2 & PCIE_LINKCAP2_SPEED_2_5)
+ bus_p->bus_sup_speed |= PCIE_LINK_SPEED_2_5;
+ if (cap2 & PCIE_LINKCAP2_SPEED_5)
+ bus_p->bus_sup_speed |= PCIE_LINK_SPEED_5;
+ if (cap2 & PCIE_LINKCAP2_SPEED_8)
+ bus_p->bus_sup_speed |= PCIE_LINK_SPEED_8;
+ if (cap2 & PCIE_LINKCAP2_SPEED_16)
+ bus_p->bus_sup_speed |= PCIE_LINK_SPEED_16;
+
+ switch (cap & PCIE_LINKCAP_MAX_SPEED_MASK) {
+ case PCIE_LINKCAP_MAX_SPEED_2_5:
+ bus_p->bus_max_speed = PCIE_LINK_SPEED_2_5;
+ break;
+ case PCIE_LINKCAP_MAX_SPEED_5:
+ bus_p->bus_max_speed = PCIE_LINK_SPEED_5;
+ break;
+ case PCIE_LINKCAP_MAX_SPEED_8:
+ bus_p->bus_max_speed = PCIE_LINK_SPEED_8;
+ break;
+ case PCIE_LINKCAP_MAX_SPEED_16:
+ bus_p->bus_max_speed = PCIE_LINK_SPEED_16;
+ break;
+ default:
+ bus_p->bus_max_speed = PCIE_LINK_SPEED_UNKNOWN;
+ break;
+ }
+ } else {
+ if (cap & PCIE_LINKCAP_MAX_SPEED_5) {
+ bus_p->bus_max_speed = PCIE_LINK_SPEED_5;
+ bus_p->bus_sup_speed = PCIE_LINK_SPEED_2_5 |
+ PCIE_LINK_SPEED_5;
+ }
+
+ if (cap & PCIE_LINKCAP_MAX_SPEED_2_5) {
+ bus_p->bus_max_speed = PCIE_LINK_SPEED_2_5;
+ bus_p->bus_sup_speed = PCIE_LINK_SPEED_2_5;
+ }
+ }
+}
+
+/*
* partially init pcie_bus_t for device (dip,bdf) for accessing pci
* config space
*
@@ -1134,6 +1310,10 @@ pcie_init_bus(dev_info_t *dip, pcie_req_id_t bdf, uint8_t flags)
}
}
+ /*
+ * Save and record speed information about the device.
+ */
+
caps_done:
/* save RP dip and RP bdf */
if (PCIE_IS_RP(bus_p)) {
@@ -1170,7 +1350,7 @@ caps_done:
}
bus_p->bus_soft_state = PCI_SOFT_STATE_CLOSED;
- bus_p->bus_fm_flags = 0;
+ (void) atomic_swap_uint(&bus_p->bus_fm_flags, 0);
bus_p->bus_mps = 0;
ndi_set_bus_private(dip, B_TRUE, DEVI_PORT_TYPE_PCI, (void *)bus_p);
@@ -1226,6 +1406,8 @@ initial_done:
pcie_init_plat(dip);
+ pcie_capture_speeds(bus_p, bdf, rcdip);
+
final_done:
PCIE_DBG("Add %s(dip 0x%p, bdf 0x%x, secbus 0x%x)\n",
@@ -1318,14 +1500,15 @@ pcie_get_rc_dip(dev_info_t *dip)
return (rcdip);
}
-static boolean_t
+boolean_t
pcie_is_pci_device(dev_info_t *dip)
{
dev_info_t *pdip;
char *device_type;
pdip = ddi_get_parent(dip);
- ASSERT(pdip);
+ if (pdip == NULL)
+ return (B_FALSE);
if (ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, DDI_PROP_DONTPASS,
"device_type", &device_type) != DDI_PROP_SUCCESS)
diff --git a/usr/src/uts/common/io/pciex/pcie_fault.c b/usr/src/uts/common/io/pciex/pcie_fault.c
index a8c02caa9c..6a335db3e2 100644
--- a/usr/src/uts/common/io/pciex/pcie_fault.c
+++ b/usr/src/uts/common/io/pciex/pcie_fault.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017, Joyent, Inc.
*/
#include <sys/sysmacros.h>
@@ -919,10 +920,18 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl)
}
/*
- * Read vendor/device ID and check with cached data, if it doesn't match
- * could very well be a device that isn't responding anymore. Just
- * stop. Save the basic info in the error q for post mortem debugging
- * purposes.
+ * If this is a device used for PCI passthrough into a virtual machine,
+ * don't let any error it caused panic the system.
+ */
+ if (bus_p->bus_fm_flags & PF_FM_IS_PASSTHRU)
+ pfd_p->pe_severity_mask |= PF_ERR_PANIC;
+
+ /*
+ * Read vendor/device ID and check with cached data; if it doesn't
+ * match, it could very well mean that the device is no longer
+ * responding. In this case, we return PF_SCAN_BAD_RESPONSE; should
+ * the caller choose to panic in this case, we will have the basic
+ * info in the error queue for the purposes of postmortem debugging.
*/
if (PCIE_GET(32, bus_p, PCI_CONF_VENID) != bus_p->bus_dev_ven_id) {
char buf[FM_MAX_CLASS];
@@ -933,12 +942,12 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl)
DDI_NOSLEEP, FM_VERSION, DATA_TYPE_UINT8, 0, NULL);
/*
- * For IOV/Hotplug purposes skip gathering info fo this device,
+ * For IOV/Hotplug purposes skip gathering info for this device,
* but populate affected info and severity. Clear out any data
* that maybe been saved in the last fabric scan.
*/
pf_reset_pfd(pfd_p);
- pfd_p->pe_severity_flags = PF_ERR_PANIC_BAD_RESPONSE;
+ pfd_p->pe_severity_flags = PF_ERR_BAD_RESPONSE;
PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = PF_AFFECTED_SELF;
/* Add the snapshot to the error q */
@@ -950,6 +959,7 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl)
pf_pci_regs_gather(pfd_p, bus_p);
pf_pci_regs_clear(pfd_p, bus_p);
+
if (PCIE_IS_RP(bus_p))
pf_pci_find_rp_fault(pfd_p, bus_p);
@@ -984,6 +994,22 @@ done:
}
/*
+ * Set the passthru flag on a device bus_p. Called by passthru drivers to
+ * indicate when a device is or is no longer under passthru control.
+ */
+void
+pf_set_passthru(dev_info_t *dip, boolean_t is_passthru)
+{
+ pcie_bus_t *bus_p = PCIE_DIP2BUS(dip);
+
+ if (is_passthru) {
+ atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_IS_PASSTHRU);
+ } else {
+ atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_IS_PASSTHRU);
+ }
+}
+
+/*
* Called during postattach to initialize a device's error handling
* capabilities. If the devices has already been hardened, then there isn't
* much needed. Otherwise initialize the device's default FMA capabilities.
@@ -1026,7 +1052,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd)
DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE);
cap &= (DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE);
- bus_p->bus_fm_flags |= PF_FM_IS_NH;
+ atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_IS_NH);
if (cmd == DDI_ATTACH) {
ddi_fm_init(dip, &cap, &ibc);
@@ -1041,7 +1067,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd)
/* If ddi_fm_init fails for any reason RETURN */
if (!fmhdl) {
- bus_p->bus_fm_flags = 0;
+ (void) atomic_swap_uint(&bus_p->bus_fm_flags, 0);
return;
}
@@ -1051,7 +1077,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd)
ddi_fm_handler_register(dip, pf_dummy_cb, NULL);
}
- bus_p->bus_fm_flags |= PF_FM_READY;
+ atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_READY);
}
/* undo FMA lock, called at predetach */
@@ -1068,7 +1094,7 @@ pf_fini(dev_info_t *dip, ddi_detach_cmd_t cmd)
return;
/* no other code should set the flag to false */
- bus_p->bus_fm_flags &= ~PF_FM_READY;
+ atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_READY);
/*
* Grab the mutex to make sure device isn't in the middle of
@@ -1082,7 +1108,7 @@ pf_fini(dev_info_t *dip, ddi_detach_cmd_t cmd)
/* undo non-hardened drivers */
if (bus_p->bus_fm_flags & PF_FM_IS_NH) {
if (cmd == DDI_DETACH) {
- bus_p->bus_fm_flags &= ~PF_FM_IS_NH;
+ atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_IS_NH);
pci_ereport_teardown(dip);
/*
* ddi_fini itself calls ddi_handler_unregister,
@@ -1379,7 +1405,7 @@ pf_analyse_error(ddi_fm_error_t *derr, pf_impl_t *impl)
sts_flags = 0;
/* skip analysing error when no error info is gathered */
- if (pfd_p->pe_severity_flags == PF_ERR_PANIC_BAD_RESPONSE)
+ if (pfd_p->pe_severity_flags == PF_ERR_BAD_RESPONSE)
goto done;
switch (PCIE_PFD2BUS(pfd_p)->bus_dev_type) {
@@ -1457,6 +1483,8 @@ done:
/* Have pciev_eh adjust the severity */
pfd_p->pe_severity_flags = pciev_eh(pfd_p, impl);
+ pfd_p->pe_severity_flags &= ~pfd_p->pe_severity_mask;
+
error_flags |= pfd_p->pe_severity_flags;
}
@@ -3060,6 +3088,7 @@ pf_reset_pfd(pf_data_t *pfd_p)
pcie_bus_t *bus_p = PCIE_PFD2BUS(pfd_p);
pfd_p->pe_severity_flags = 0;
+ pfd_p->pe_severity_mask = 0;
pfd_p->pe_orig_severity_flags = 0;
/* pe_lock and pe_valid were reset in pf_send_ereport */
diff --git a/usr/src/uts/common/io/pciex/pciev.c b/usr/src/uts/common/io/pciex/pciev.c
index 18794318dd..da68026dcf 100644
--- a/usr/src/uts/common/io/pciex/pciev.c
+++ b/usr/src/uts/common/io/pciex/pciev.c
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/dditypes.h>
@@ -302,7 +306,7 @@ pciev_eh(pf_data_t *pfd_p, pf_impl_t *impl)
pcie_faulty_all = B_TRUE;
} else if (severity & (PF_ERR_NO_PANIC | PF_ERR_MATCHED_DEVICE |
- PF_ERR_PANIC | PF_ERR_PANIC_BAD_RESPONSE)) {
+ PF_ERR_PANIC | PF_ERR_BAD_RESPONSE)) {
uint16_t affected_flag, dev_affected_flags;
uint_t is_panic = 0, is_aff_dev_found = 0;
diff --git a/usr/src/uts/common/io/physmem.c b/usr/src/uts/common/io/physmem.c
index 665c9eff6c..9aaf58fb7b 100644
--- a/usr/src/uts/common/io/physmem.c
+++ b/usr/src/uts/common/io/physmem.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
@@ -807,6 +808,13 @@ physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
int ret;
static int msg_printed = 0;
+ /*
+ * This device should never be visible in a zone, but if it somehow
+ * does get created we refuse to allow the zone to use it.
+ */
+ if (crgetzoneid(credp) != GLOBAL_ZONEID)
+ return (EACCES);
+
if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
return (EINVAL);
}
diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf
index 42248e93d6..08affec609 100644
--- a/usr/src/uts/common/io/pseudo.conf
+++ b/usr/src/uts/common/io/pseudo.conf
@@ -22,8 +22,7 @@
#
# Copyright 2003 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-#
-# ident "%Z%%M% %I% %E% SMI"
+# Copyright 2014 Joyent, Inc. All rights reserved.
#
# This file is private to the pseudonex driver. It should not be edited.
#
@@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0;
# /pseudo; it has as its children the zone console pseudo nodes.
#
name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons";
+
+#
+# zfdnex is an alias for pseudo; this node is instantiated as a child of
+# /pseudo; it has as its children the zone fd pseudo nodes.
+#
+name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd";
diff --git a/usr/src/uts/common/io/pseudonex.c b/usr/src/uts/common/io/pseudonex.c
index f83b0abf39..0ae06f88cc 100644
--- a/usr/src/uts/common/io/pseudonex.c
+++ b/usr/src/uts/common/io/pseudonex.c
@@ -83,6 +83,8 @@ static int pseudonex_detach(dev_info_t *, ddi_detach_cmd_t);
static int pseudonex_open(dev_t *, int, int, cred_t *);
static int pseudonex_close(dev_t, int, int, cred_t *);
static int pseudonex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+static int pseudonex_fm_init(dev_info_t *, dev_info_t *, int,
+ ddi_iblock_cookie_t *);
static int pseudonex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *,
void *);
@@ -90,6 +92,8 @@ static void *pseudonex_state;
typedef struct pseudonex_state {
dev_info_t *pnx_devi;
+ int pnx_fmcap;
+ ddi_iblock_cookie_t pnx_fm_ibc;
} pseudonex_state_t;
static struct bus_ops pseudonex_bus_ops = {
@@ -116,7 +120,7 @@ static struct bus_ops pseudonex_bus_ops = {
NULL, /* bus_intr_ctl */
NULL, /* bus_config */
NULL, /* bus_unconfig */
- NULL, /* bus_fm_init */
+ pseudonex_fm_init, /* bus_fm_init */
NULL, /* bus_fm_fini */
NULL, /* bus_fm_access_enter */
NULL, /* bus_fm_access_exit */
@@ -228,6 +232,9 @@ pseudonex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
pnx_state = ddi_get_soft_state(pseudonex_state, instance);
pnx_state->pnx_devi = devi;
+ pnx_state->pnx_fmcap = DDI_FM_EREPORT_CAPABLE;
+ ddi_fm_init(devi, &pnx_state->pnx_fmcap, &pnx_state->pnx_fm_ibc);
+
if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance,
DDI_NT_NEXUS, 0) != DDI_SUCCESS) {
ddi_remove_minor_node(devi, NULL);
@@ -247,6 +254,10 @@ pseudonex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
if (cmd == DDI_SUSPEND)
return (DDI_SUCCESS);
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ ddi_fm_fini(devi);
ddi_remove_minor_node(devi, NULL);
ddi_soft_state_free(pseudonex_state, instance);
return (DDI_SUCCESS);
@@ -375,6 +386,19 @@ pseudonex_auto_assign(dev_info_t *child)
}
static int
+pseudonex_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap,
+ ddi_iblock_cookie_t *ibc)
+{
+ pseudonex_state_t *pnx_state;
+
+ pnx_state = ddi_get_soft_state(pseudonex_state, ddi_get_instance(dip));
+ ASSERT(pnx_state != NULL);
+ ASSERT(ibc != NULL);
+ *ibc = pnx_state->pnx_fm_ibc;
+ return (pnx_state->pnx_fmcap & cap);
+}
+
+static int
pseudonex_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
void *arg, void *result)
{
diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c
index bc8c17bedd..54bcee88bc 100644
--- a/usr/src/uts/common/io/ptm.c
+++ b/usr/src/uts/common/io/ptm.c
@@ -447,6 +447,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp)
return (0);
}
+static boolean_t
+ptmptsopencb(ptmptsopencb_arg_t arg)
+{
+ struct pt_ttys *ptmp = (struct pt_ttys *)arg;
+ boolean_t rval;
+
+ PT_ENTER_READ(ptmp);
+ rval = (ptmp->pt_nullmsg != NULL);
+ PT_EXIT_READ(ptmp);
+ return (rval);
+}
+
/*
* The wput procedure will only handle ioctl and flush messages.
*/
@@ -574,6 +586,41 @@ ptmwput(queue_t *qp, mblk_t *mp)
miocack(qp, mp, 0, 0);
break;
}
+ case PTMPTSOPENCB:
+ {
+ mblk_t *dp; /* ioctl reply data */
+ ptmptsopencb_t *ppocb;
+
+ /* only allow the kernel to invoke this ioctl */
+ if (iocp->ioc_cr != kcred) {
+ miocnak(qp, mp, 0, EINVAL);
+ break;
+ }
+
+ /* we don't support transparent ioctls */
+ ASSERT(iocp->ioc_count != TRANSPARENT);
+ if (iocp->ioc_count == TRANSPARENT) {
+ miocnak(qp, mp, 0, EINVAL);
+ break;
+ }
+
+ /* allocate a response message */
+ dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED);
+ if (dp == NULL) {
+ miocnak(qp, mp, 0, EAGAIN);
+ break;
+ }
+
+ /* initialize the ioctl results */
+ ppocb = (ptmptsopencb_t *)dp->b_rptr;
+ ppocb->ppocb_func = ptmptsopencb;
+ ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp;
+
+ /* send the reply data */
+ mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0);
+ qreply(qp, mp);
+ break;
+ }
}
break;
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg
new file mode 100644
index 0000000000..b932ffaa7c
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg
new file mode 100644
index 0000000000..9421ecc0db
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png
new file mode 100644
index 0000000000..4b8a66761a
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png
new file mode 100644
index 0000000000..3254fbdc3b
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg
new file mode 100644
index 0000000000..7bb0dbf21b
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin
new file mode 100644
index 0000000000..43014fd8ea
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin
new file mode 100644
index 0000000000..9524eb4a63
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin
Binary files differ
diff --git a/usr/src/uts/common/io/qede/qede_list.h b/usr/src/uts/common/io/qede/qede_list.h
index 2350cb4117..656d2a915f 100644
--- a/usr/src/uts/common/io/qede/qede_list.h
+++ b/usr/src/uts/common/io/qede/qede_list.h
@@ -176,4 +176,3 @@ qede_list_splice_tail(qede_list_t *list,
#define QEDE_LIST_FOR_EACH_ENTRY_SAFE OSAL_LIST_FOR_EACH_ENTRY_SAFE
#endif /* !_QEDE_LIST_H */
-
diff --git a/usr/src/uts/common/io/qede/qede_version.h b/usr/src/uts/common/io/qede/qede_version.h
index 43584f95f0..0ee38b4338 100644
--- a/usr/src/uts/common/io/qede/qede_version.h
+++ b/usr/src/uts/common/io/qede/qede_version.h
@@ -42,4 +42,3 @@
#define REVVERSION 25
#endif /* !_QEDE_VERSION_H */
-
diff --git a/usr/src/uts/common/io/random.c b/usr/src/uts/common/io/random.c
index d79b86362c..a50bbcceec 100644
--- a/usr/src/uts/common/io/random.c
+++ b/usr/src/uts/common/io/random.c
@@ -291,6 +291,9 @@ rnd_write(dev_t dev, struct uio *uiop, cred_t *credp)
if ((error = uiomove(buf, bytes, UIO_WRITE, uiop)) != 0)
return (error);
+ if (crgetzone(credp) != global_zone)
+ continue;
+
switch (devno) {
case DEVRANDOM:
if ((error = random_add_entropy(buf, bytes, 0)) != 0)
diff --git a/usr/src/uts/common/io/rsm/rsm.c b/usr/src/uts/common/io/rsm/rsm.c
index b49d5b735a..d9d40c83fd 100644
--- a/usr/src/uts/common/io/rsm/rsm.c
+++ b/usr/src/uts/common/io/rsm/rsm.c
@@ -22,8 +22,8 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2012 Milan Jurik. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
diff --git a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c
index d615f8dd62..0e4fb433cf 100644
--- a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c
+++ b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c
@@ -82,6 +82,7 @@
*/
#include <sys/note.h>
+#include <sys/debug.h>
#include <sys/scsi/scsi.h>
#include <sys/pci.h>
#include <sys/disp.h>
@@ -10552,8 +10553,7 @@ ahci_em_led_task(void *arg)
mutex_enter(&led->aelta_ctl->ahcictl_mutex);
if (ret) {
- led->aelta_ctl->ahcictl_em_state[led->aelta_port] =
- led->aelta_state;
+ led->aelta_ctl->ahcictl_em_state[led->aelta_port] = state;
led->aelta_ret = 0;
} else {
led->aelta_ret = EIO;
@@ -10763,6 +10763,7 @@ ahci_em_ioctl_set(ahci_ctl_t *ahci_ctlp, intptr_t arg)
}
task->aelta_ctl = ahci_ctlp;
task->aelta_port = (uint8_t)set.aiems_port;
task->aelta_op = set.aiems_op;
task->aelta_state = set.aiems_leds;
@@ -10839,22 +10840,19 @@ static void
ahci_em_quiesce(ahci_ctl_t *ahci_ctlp)
{
ASSERT(ahci_ctlp->ahcictl_em_flags & AHCI_EM_PRESENT);
+ VERIFY(mutex_owned(&ahci_ctlp->ahcictl_mutex));
- mutex_enter(&ahci_ctlp->ahcictl_mutex);
ahci_ctlp->ahcictl_em_flags |= AHCI_EM_QUIESCE;
- mutex_exit(&ahci_ctlp->ahcictl_mutex);
-
ddi_taskq_wait(ahci_ctlp->ahcictl_em_taskq);
}
static void
ahci_em_suspend(ahci_ctl_t *ahci_ctlp)
{
- ahci_em_quiesce(ahci_ctlp);
+ VERIFY(mutex_owned(&ahci_ctlp->ahcictl_mutex));
- mutex_enter(&ahci_ctlp->ahcictl_mutex);
+ ahci_em_quiesce(ahci_ctlp);
ahci_ctlp->ahcictl_em_flags &= ~AHCI_EM_READY;
- mutex_exit(&ahci_ctlp->ahcictl_mutex);
}
static void
@@ -10875,7 +10873,10 @@ ahci_em_fini(ahci_ctl_t *ahci_ctlp)
return;
}
+ mutex_enter(&ahci_ctlp->ahcictl_mutex);
ahci_em_quiesce(ahci_ctlp);
+ mutex_exit(&ahci_ctlp->ahcictl_mutex);
+
ddi_taskq_destroy(ahci_ctlp->ahcictl_em_taskq);
ahci_ctlp->ahcictl_em_taskq = NULL;
}
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
index e5aa96f469..05298d8b05 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
+++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
@@ -72,6 +72,7 @@
#include <sys/file.h>
#include <sys/policy.h>
#include <sys/model.h>
+#include <sys/refhash.h>
#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dr.h>
@@ -99,7 +100,6 @@
#include <sys/scsi/adapters/mpt_sas/mptsas_var.h>
#include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h>
#include <sys/scsi/adapters/mpt_sas/mptsas_smhba.h>
-#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h>
#include <sys/raidioctl.h>
#include <sys/fs/dv_node.h> /* devfs_clean */
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c
new file mode 100644
index 0000000000..a7ef2b69d7
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c
@@ -0,0 +1,565 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+static int smrt_attach(dev_info_t *, ddi_attach_cmd_t);
+static int smrt_detach(dev_info_t *, ddi_detach_cmd_t);
+static int smrt_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+static void smrt_cleanup(smrt_t *);
+static int smrt_command_comparator(const void *, const void *);
+
+/*
+ * Controller soft state. Each entry is an object of type "smrt_t".
+ */
+void *smrt_state;
+
+/*
+ * DMA attributes template. Each controller will make a copy of this template
+ * with appropriate customisations; e.g., the Scatter/Gather List Length.
+ */
+static ddi_dma_attr_t smrt_dma_attr_template = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0x0000000000000000,
+ .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
+ .dma_attr_count_max = 0x00FFFFFF,
+ .dma_attr_align = 0x20,
+ .dma_attr_burstsizes = 0x20,
+ .dma_attr_minxfer = DMA_UNIT_8,
+ .dma_attr_maxxfer = 0xFFFFFFFF,
+ /*
+ * There is some suggestion that at least some, possibly older, Smart
+ * Array controllers cannot tolerate a DMA segment that straddles a 4GB
+ * boundary.
+ */
+ .dma_attr_seg = 0xFFFFFFFF,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 512,
+ .dma_attr_flags = 0
+};
+
+/*
+ * Device memory access attributes for device control registers.
+ */
+ddi_device_acc_attr_t smrt_dev_attributes = {
+ .devacc_attr_version = DDI_DEVICE_ATTR_V0,
+ .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
+ .devacc_attr_dataorder = DDI_STRICTORDER_ACC,
+ .devacc_attr_access = 0
+};
+
+/*
+ * Character/Block Operations Structure
+ */
+static struct cb_ops smrt_cb_ops = {
+ .cb_rev = CB_REV,
+ .cb_flag = D_NEW | D_MP,
+
+ .cb_open = scsi_hba_open,
+ .cb_close = scsi_hba_close,
+
+ .cb_ioctl = smrt_ioctl,
+
+ .cb_strategy = nodev,
+ .cb_print = nodev,
+ .cb_dump = nodev,
+ .cb_read = nodev,
+ .cb_write = nodev,
+ .cb_devmap = nodev,
+ .cb_mmap = nodev,
+ .cb_segmap = nodev,
+ .cb_chpoll = nochpoll,
+ .cb_prop_op = ddi_prop_op,
+ .cb_str = NULL,
+ .cb_aread = nodev,
+ .cb_awrite = nodev
+};
+
+/*
+ * Device Operations Structure
+ */
+static struct dev_ops smrt_dev_ops = {
+ .devo_rev = DEVO_REV,
+ .devo_refcnt = 0,
+
+ .devo_attach = smrt_attach,
+ .devo_detach = smrt_detach,
+
+ .devo_cb_ops = &smrt_cb_ops,
+
+ .devo_getinfo = nodev,
+ .devo_identify = nulldev,
+ .devo_probe = nulldev,
+ .devo_reset = nodev,
+ .devo_bus_ops = NULL,
+ .devo_power = nodev,
+ .devo_quiesce = nodev
+};
+
+/*
+ * Linkage structures
+ */
+static struct modldrv smrt_modldrv = {
+ .drv_modops = &mod_driverops,
+ .drv_linkinfo = "HP Smart Array",
+ .drv_dev_ops = &smrt_dev_ops
+};
+
+static struct modlinkage smrt_modlinkage = {
+ .ml_rev = MODREV_1,
+ .ml_linkage = { &smrt_modldrv, NULL }
+};
+
+
+int
+_init()
+{
+ int r;
+
+ VERIFY0(ddi_soft_state_init(&smrt_state, sizeof (smrt_t), 0));
+
+ if ((r = scsi_hba_init(&smrt_modlinkage)) != 0) {
+ goto fail;
+ }
+
+ if ((r = mod_install(&smrt_modlinkage)) != 0) {
+ scsi_hba_fini(&smrt_modlinkage);
+ goto fail;
+ }
+
+ return (r);
+
+fail:
+ ddi_soft_state_fini(&smrt_state);
+ return (r);
+}
+
+int
+_fini()
+{
+ int r;
+
+ if ((r = mod_remove(&smrt_modlinkage)) == 0) {
+ scsi_hba_fini(&smrt_modlinkage);
+ ddi_soft_state_fini(&smrt_state);
+ }
+
+ return (r);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&smrt_modlinkage, modinfop));
+}
+
+static int
+smrt_iport_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ const char *addr;
+ dev_info_t *pdip;
+ int instance;
+ smrt_t *smrt;
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ /*
+ * Note, we cannot get to our parent via the tran's tran_hba_private
+ * member. This pointer is reset to NULL when the scsi_hba_tran_t
+ * structure is duplicated.
+ */
+ addr = scsi_hba_iport_unit_address(dip);
+ VERIFY(addr != NULL);
+ pdip = ddi_get_parent(dip);
+ instance = ddi_get_instance(pdip);
+ smrt = ddi_get_soft_state(smrt_state, instance);
+ VERIFY(smrt != NULL);
+
+ if (strcmp(addr, SMRT_IPORT_VIRT) == 0) {
+ if (smrt_logvol_hba_setup(smrt, dip) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ smrt->smrt_virt_iport = dip;
+ } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) {
+ if (smrt_phys_hba_setup(smrt, dip) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ smrt->smrt_phys_iport = dip;
+ } else {
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+static int
+smrt_iport_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ const char *addr;
+ scsi_hba_tran_t *tran;
+ smrt_t *smrt;
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ tran = ddi_get_driver_private(dip);
+ VERIFY(tran != NULL);
+ smrt = tran->tran_hba_private;
+ VERIFY(smrt != NULL);
+
+ addr = scsi_hba_iport_unit_address(dip);
+ VERIFY(addr != NULL);
+
+ if (strcmp(addr, SMRT_IPORT_VIRT) == 0) {
+ smrt_logvol_hba_teardown(smrt, dip);
+ smrt->smrt_virt_iport = NULL;
+ } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) {
+ smrt_phys_hba_teardown(smrt, dip);
+ smrt->smrt_phys_iport = NULL;
+ } else {
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+static int
+smrt_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ uint32_t instance;
+ smrt_t *smrt;
+ boolean_t check_for_interrupts = B_FALSE;
+ int r;
+ char taskq_name[64];
+
+ if (scsi_hba_iport_unit_address(dip) != NULL)
+ return (smrt_iport_attach(dip, cmd));
+
+ if (cmd != DDI_ATTACH) {
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Allocate the per-controller soft state object and get
+ * a pointer to it.
+ */
+ instance = ddi_get_instance(dip);
+ if (ddi_soft_state_zalloc(smrt_state, instance) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not allocate soft state");
+ return (DDI_FAILURE);
+ }
+ if ((smrt = ddi_get_soft_state(smrt_state, instance)) == NULL) {
+ dev_err(dip, CE_WARN, "could not get soft state");
+ ddi_soft_state_free(smrt_state, instance);
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Initialise per-controller state object.
+ */
+ smrt->smrt_dip = dip;
+ smrt->smrt_instance = instance;
+ smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER;
+ list_create(&smrt->smrt_commands, sizeof (smrt_command_t),
+ offsetof(smrt_command_t, smcm_link));
+ list_create(&smrt->smrt_finishq, sizeof (smrt_command_t),
+ offsetof(smrt_command_t, smcm_link_finish));
+ list_create(&smrt->smrt_abortq, sizeof (smrt_command_t),
+ offsetof(smrt_command_t, smcm_link_abort));
+ list_create(&smrt->smrt_volumes, sizeof (smrt_volume_t),
+ offsetof(smrt_volume_t, smlv_link));
+ list_create(&smrt->smrt_physicals, sizeof (smrt_physical_t),
+ offsetof(smrt_physical_t, smpt_link));
+ list_create(&smrt->smrt_targets, sizeof (smrt_target_t),
+ offsetof(smrt_target_t, smtg_link_ctlr));
+ avl_create(&smrt->smrt_inflight, smrt_command_comparator,
+ sizeof (smrt_command_t), offsetof(smrt_command_t,
+ smcm_node));
+ cv_init(&smrt->smrt_cv_finishq, NULL, CV_DRIVER, NULL);
+
+ smrt->smrt_init_level |= SMRT_INITLEVEL_BASIC;
+
+ /*
+ * Perform basic device setup, including identifying the board, mapping
+ * the I2O registers and the Configuration Table.
+ */
+ if (smrt_device_setup(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "device setup failed");
+ goto fail;
+ }
+
+ /*
+ * Select a Transport Method (e.g. Simple or Performant) and update
+ * the Configuration Table. This function also waits for the
+ * controller to become ready.
+ */
+ if (smrt_ctlr_init(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "controller initialisation failed");
+ goto fail;
+ }
+
+ /*
+ * Each controller may have a different Scatter/Gather Element count.
+ * Configure a per-controller set of DMA attributes with the
+ * appropriate S/G size.
+ */
+ VERIFY(smrt->smrt_sg_cnt > 0);
+ smrt->smrt_dma_attr = smrt_dma_attr_template;
+ smrt->smrt_dma_attr.dma_attr_sgllen = smrt->smrt_sg_cnt;
+
+ /*
+ * Now that we have selected a Transport Method, we can configure
+ * the appropriate interrupt handlers.
+ */
+ if (smrt_interrupts_setup(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "interrupt handler setup failed");
+ goto fail;
+ }
+
+ /*
+ * Now that we have the correct interrupt priority, we can initialise
+ * the mutex. This must be done before the interrupt handler is
+ * enabled.
+ */
+ mutex_init(&smrt->smrt_mutex, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(smrt->smrt_interrupt_pri));
+ smrt->smrt_init_level |= SMRT_INITLEVEL_MUTEX;
+
+ /*
+ * From this point forward, the controller is able to accept commands
+ * and (at least by polling) return command submissions. Setting this
+ * flag allows the rest of the driver to interact with the device.
+ */
+ smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING;
+
+ if (smrt_interrupts_enable(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "interrupt handler could not be enabled");
+ goto fail;
+ }
+
+ if (smrt_ctrl_hba_setup(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "SCSI framework setup failed");
+ goto fail;
+ }
+
+ /*
+ * Set the appropriate Interrupt Mask Register bits to start
+ * command completion interrupts from the controller.
+ */
+ smrt_intr_set(smrt, B_TRUE);
+ check_for_interrupts = B_TRUE;
+
+ /*
+ * Register the maintenance routine for periodic execution:
+ */
+ smrt->smrt_periodic = ddi_periodic_add(smrt_periodic, smrt,
+ SMRT_PERIODIC_RATE * NANOSEC, DDI_IPL_0);
+ smrt->smrt_init_level |= SMRT_INITLEVEL_PERIODIC;
+
+ (void) snprintf(taskq_name, sizeof (taskq_name), "smrt_discover_%u",
+ instance);
+ smrt->smrt_discover_taskq = ddi_taskq_create(smrt->smrt_dip, taskq_name,
+ 1, TASKQ_DEFAULTPRI, 0);
+ if (smrt->smrt_discover_taskq == NULL) {
+ dev_err(dip, CE_WARN, "failed to create discovery task queue");
+ goto fail;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_TASKQ;
+
+ if ((r = smrt_event_init(smrt)) != 0) {
+ dev_err(dip, CE_WARN, "could not initialize event subsystem "
+ "(%d)", r);
+ goto fail;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_ASYNC_EVENT;
+
+ if (scsi_hba_iport_register(dip, SMRT_IPORT_VIRT) != DDI_SUCCESS)
+ goto fail;
+
+ if (scsi_hba_iport_register(dip, SMRT_IPORT_PHYS) != DDI_SUCCESS)
+ goto fail;
+
+ /*
+ * Announce the attachment of this controller.
+ */
+ ddi_report_dev(dip);
+
+ return (DDI_SUCCESS);
+
+fail:
+ if (check_for_interrupts) {
+ if (smrt->smrt_stats.smrts_claimed_interrupts == 0) {
+ dev_err(dip, CE_WARN, "controller did not interrupt "
+ "during attach");
+ }
+ }
+ smrt_cleanup(smrt);
+ return (DDI_FAILURE);
+}
+
+static int
+smrt_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ scsi_hba_tran_t *tran = (scsi_hba_tran_t *)ddi_get_driver_private(dip);
+ smrt_t *smrt = (smrt_t *)tran->tran_hba_private;
+
+ if (scsi_hba_iport_unit_address(dip) != NULL)
+ return (smrt_iport_detach(dip, cmd));
+
+ if (cmd != DDI_DETACH) {
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * First, check to make sure that all SCSI framework targets have
+ * detached.
+ */
+ mutex_enter(&smrt->smrt_mutex);
+ if (!list_is_empty(&smrt->smrt_targets)) {
+ mutex_exit(&smrt->smrt_mutex);
+ dev_err(smrt->smrt_dip, CE_WARN, "cannot detach; targets still "
+ "using HBA");
+ return (DDI_FAILURE);
+ }
+
+ if (smrt->smrt_virt_iport != NULL || smrt->smrt_phys_iport != NULL) {
+ mutex_exit(&smrt->smrt_mutex);
+ dev_err(smrt->smrt_dip, CE_WARN, "cannot detach: iports still "
+ "attached");
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Prevent new targets from attaching now:
+ */
+ smrt->smrt_status |= SMRT_CTLR_STATUS_DETACHING;
+ mutex_exit(&smrt->smrt_mutex);
+
+ /*
+ * Clean up all remaining resources.
+ */
+ smrt_cleanup(smrt);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+smrt_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rval)
+{
+ int inst = MINOR2INST(getminor(dev));
+ int status;
+
+ if (secpolicy_sys_config(credp, B_FALSE) != 0) {
+ return (EPERM);
+ }
+
+ /*
+ * Ensure that we have a soft state object for this instance.
+ */
+ if (ddi_get_soft_state(smrt_state, inst) == NULL) {
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ default:
+ status = scsi_hba_ioctl(dev, cmd, arg, mode, credp, rval);
+ break;
+ }
+
+ return (status);
+}
+
+static void
+smrt_cleanup(smrt_t *smrt)
+{
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_ASYNC_EVENT) {
+ smrt_event_fini(smrt);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_ASYNC_EVENT;
+ }
+
+ smrt_interrupts_teardown(smrt);
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_TASKQ) {
+ ddi_taskq_destroy(smrt->smrt_discover_taskq);
+ smrt->smrt_discover_taskq = NULL;
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_TASKQ;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_PERIODIC) {
+ ddi_periodic_delete(smrt->smrt_periodic);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_PERIODIC;
+ }
+
+ smrt_ctrl_hba_teardown(smrt);
+
+ smrt_ctlr_teardown(smrt);
+
+ smrt_device_teardown(smrt);
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_BASIC) {
+ smrt_logvol_teardown(smrt);
+ smrt_phys_teardown(smrt);
+
+ cv_destroy(&smrt->smrt_cv_finishq);
+
+ VERIFY(list_is_empty(&smrt->smrt_commands));
+ list_destroy(&smrt->smrt_commands);
+ list_destroy(&smrt->smrt_finishq);
+ list_destroy(&smrt->smrt_abortq);
+
+ VERIFY(list_is_empty(&smrt->smrt_volumes));
+ list_destroy(&smrt->smrt_volumes);
+
+ VERIFY(list_is_empty(&smrt->smrt_physicals));
+ list_destroy(&smrt->smrt_physicals);
+
+ VERIFY(list_is_empty(&smrt->smrt_targets));
+ list_destroy(&smrt->smrt_targets);
+
+ VERIFY(avl_is_empty(&smrt->smrt_inflight));
+ avl_destroy(&smrt->smrt_inflight);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_BASIC;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_MUTEX) {
+ mutex_destroy(&smrt->smrt_mutex);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_MUTEX;
+ }
+
+ VERIFY0(smrt->smrt_init_level);
+
+ ddi_soft_state_free(smrt_state, ddi_get_instance(smrt->smrt_dip));
+}
+
+/*
+ * Comparator for the "smrt_inflight" AVL tree in a "smrt_t". This AVL tree
+ * allows a tag ID to be mapped back to the relevant "smrt_command_t".
+ */
+static int
+smrt_command_comparator(const void *lp, const void *rp)
+{
+ const smrt_command_t *l = lp;
+ const smrt_command_t *r = rp;
+
+ if (l->smcm_tag > r->smcm_tag) {
+ return (1);
+ } else if (l->smcm_tag < r->smcm_tag) {
+ return (-1);
+ } else {
+ return (0);
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf
new file mode 100644
index 0000000000..758ecd0779
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+scsi-no-quiesce=1;
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c
new file mode 100644
index 0000000000..b4cdd5607e
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c
@@ -0,0 +1,2023 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+/*
+ * Discovery, Resets, Periodics, and Events
+ * ----------------------------------------
+ *
+ * Discovery is the act of figuring out what logical and physical volumes exist
+ * under the controller. Discovery happens in response to the following events:
+ *
+ * o iports for virtual and physical devices being attached
+ * o Controller event notifications indicating potential topology changes
+ * o After a reset of the controller, before we can perform I/O again
+ *
+ * Because we have to perform discovery after a reset, which can happen during
+ * panic(), that also means that discovery may be run in panic context. We
+ * also need to emphasize the need for discovery to happen after a controller
+ * reset. Once a reset is initiated, we cannot be certain about the addresses
+ * of any of the existing targets until the reset has completed. The driver
+ * performs I/Os to addresses that the controller provides. The controller
+ * specification says that these addresses may change after a controller reset.
+ *
+ * Unfortunately, all of this combined means that making sure we can correctly
+ * run discovery is somewhat complicated. In non-panic contexts, discovery is
+ * always run from a taskq. We'll kick off the discovery in the taskq if
+ * nothing is pending at that time. The state is managed by bits in the
+ * smrt_status member of the smrt_t. There are four bits at this time:
+ *
+ * SMRT_CTLR_DISCOVERY_REQUESTED This flag indicates that something has
+ * requested that a discovery be performed.
+ * If no flags are set when this is set,
+ * then we will kick off discovery. All
+ * discovery requests are initiated via the
+ * smrt_discover_request() function.
+ *
+ * SMRT_CTLR_DISCOVERY_RUNNING This flag is set at the start of us
+ * running a discovery. It is removed when
+ * discovery finishes.
+ *
+ * SMRT_CTLR_DISCOVERY_PERIODIC This flag is set in a number of
+ * circumstances, which will be described
+ * in a subsequent section. This indicates
+ * that the periodic must kick off the
+ * discovery process.
+ *
+ * SMRT_CTLR_DISCOVERY_REQUIRED This flag indicates that at some point a
+ * controller reset occurred and we need to
+ * have a successful discovery to finish
+ * the act of resetting and allowing I/O to
+ * continue.
+ *
+ * In general, a request to discover kicks off the taskq to discover entries, if
+ * it hasn't already been requested or started. This also allows us to coalesce
+ * multiple requests, if needed. Note that if a request comes in when a
+ * discovery is ongoing, we do not kick off discovery again. Instead, we set
+ * the SMRT_CTLR_DISCOVERY_REQUESTED flag which will rerun discovery after the
+ * initial pass has completed.
+ *
+ * When a discovery starts, the first thing it does is clear the
+ * SMRT_CTLR_DISCOVERY_REQUESTED flag. This is important, because any
+ * additional requests for discovery that come in after this has started likely
+ * indicate that we've missed something. As such, when the discovery process
+ * finishes, if it sees the REQUESTED flag, then it will need to set the
+ * PERIODIC flag. The PERIODIC flag is used to indicate that we should run
+ * discovery again, but not kick it off immediately. Instead, it should be
+ * driven by the normal periodic behavior.
+ *
+ * If for some reason the act of discovery fails, or we fail to dispatch
+ * discovery due to a transient error, then we will flag PERIODIC so that the
+ * periodic tick will try and run things again.
+ *
+ * Now, we need to talk about SMRT_CTLR_DISCOVERY_REQUIRED. This flag is set
+ * after a reset occurs. The reset thread will be blocked on this.
+ * Importantly, none of the code in the discovery path can ask for a controller
+ * reset at this time. If at the end of a discovery, this flag is set, then we
+ * will signal the reset thread that it should check on its status by
+ * broadcasting on the smrt_cv_finishq. At that point, the reset thread will
+ * continue.
+ *
+ * Panic Context
+ * -------------
+ *
+ * All of this talk of threads and taskqs is well and good, but as an HBA
+ * driver, we have a serious responsibility to try and deal with panic sanely.
+ * In panic context, we call the discovery functions directly rather than
+ * dispatching them to the taskq and waiting for them to run.
+ *
+ * However, because our discovery relies on the target maps, which aren't safe
+ * for panic context at this time, we have to take a different approach. We
+ * leverage the fact that we have a generation number stored with every
+ * discovery. If we try to do an I/O to a device where the generation doesn't
+ * match, then we know that it disappeared and should not be used. We also
+ * sanity check the model, serial numbers, and WWNs to make sure that these are
+ * the same devices. If they are, then we'll end up updating the address
+ * structures.
+ *
+ * Now, it is possible that when we panicked, a thread was in the middle of
+ * running a discovery or even resetting the controller. Once we're in panic,
+ * those threads aren't running, so if they didn't end up producing a new view
+ * of the world for the SCSI framework, it shouldn't really matter, as we won't
+ * have updated the list of devices. Importantly, once we're in that context,
+ * we're not going to be attaching or detaching targets. If we get a request
+ * for one of these targets which has disappeared, we're going to have to give
+ * up.
+ *
+ * Request Attributes
+ * ------------------
+ *
+ * The CISS specification allows for three different kinds of attributes that
+ * describe how requests are queued to the controller. These are:
+ *
+ * HEAD OF QUEUE The request should go to the head of the
+ * controller queue. This is used for resets and
+ * aborts to ensure that they're not blocked behind
+ * additional I/O.
+ *
+ * SIMPLE This queues the request for normal processing.
+ * Commands queued this way are not special with
+ * respect to one another. We use this for all I/O
+ * and discovery commands.
+ *
+ * ORDERED This attribute is used to indicate that commands
+ * should be submitted and processed in some order.
+ * We use this primarily for the event notification
+ * commands, so that when a cancellation of the
+ * event notification returns, we know that any
+ * outstanding notification request has been
+ * honored.
+ */
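+
+/*
+ * As a concrete example of the above, a caller that has observed a potential
+ * topology change requests discovery under the mutex; coalescing and taskq
+ * dispatch are handled inside smrt_discover_request(). A minimal sketch:
+ *
+ *     mutex_enter(&smrt->smrt_mutex);
+ *     smrt_discover_request(smrt);
+ *     mutex_exit(&smrt->smrt_mutex);
+ */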
+
+static int smrt_ctlr_versions(smrt_t *, uint16_t, smrt_versions_t *);
+static void smrt_discover(void *);
+
+/*
+ * The maximum number of seconds to wait for the controller to come online.
+ */
+unsigned smrt_ciss_init_time = 90;
+
+/*
+ * A tunable that caps the number of asynchronous event notifications we will
+ * process per tick of the periodic. If events arrive faster than this rate,
+ * we do not resubmit the event notification command; it is instead picked up
+ * at the next tick of the periodic.
+ */
+uint_t smrt_event_intervention_threshold = 1000;
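+
+/*
+ * Like other driver globals, these tunables can be adjusted in /etc/system;
+ * e.g., a hypothetical adjustment of the event threshold:
+ *
+ *     set smrt:smrt_event_intervention_threshold = 2000
+ */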
+
+/*
+ * Convert a LUN Address to a BMIC Identifier. The BMIC Identifier is used
+ * when performing various physical commands and generally stays the same for
+ * a given device across inserts and removals; however, it does not survive
+ * controller resets. It is calculated from what the CISS specification calls
+ * the 'Level 2' target and bus, which otherwise have no real meaning in the
+ * SAS world.
+ */
+uint16_t
+smrt_lun_addr_to_bmic(PhysDevAddr_t *paddr)
+{
+ uint16_t id;
+
+ id = (paddr->Target[1].PeripDev.Bus - 1) << 8;
+ id += paddr->Target[1].PeripDev.Dev;
+
+ return (id);
+}
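+
+/*
+ * For example, a hypothetical device at 'Level 2' bus 2, device 5 yields the
+ * BMIC Identifier ((2 - 1) << 8) + 5 = 0x0105.
+ */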
+
+void
+smrt_write_lun_addr_phys(LUNAddr_t *lun, boolean_t masked, unsigned bus,
+ unsigned target)
+{
+ lun->PhysDev.Mode = masked ? MASK_PERIPHERIAL_DEV_ADDR :
+ PERIPHERIAL_DEV_ADDR;
+
+ lun->PhysDev.TargetId = target;
+ lun->PhysDev.Bus = bus;
+
+ bzero(&lun->PhysDev.Target, sizeof (lun->PhysDev.Target));
+}
+
+/*
+ * According to the CISS Specification, the controller is always addressed in
+ * Masked Peripheral mode with a bus and target ID of zero. This addressing is
+ * used by commands that must be directed at the controller itself, such as
+ * discovery, identification, and event notification requests.
+ */
+void
+smrt_write_controller_lun_addr(LUNAddr_t *lun)
+{
+ smrt_write_lun_addr_phys(lun, B_TRUE, 0, 0);
+}
+
+void
+smrt_write_message_common(smrt_command_t *smcm, uint8_t type, int timeout_secs)
+{
+ switch (type) {
+ case CISS_MSG_ABORT:
+ case CISS_MSG_RESET:
+ case CISS_MSG_NOP:
+ break;
+
+ default:
+ panic("unknown message type");
+ }
+
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_MSG;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_HEADOFQUEUE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE;
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout_secs);
+ smcm->smcm_va_cmd->Request.CDBLen = CISS_CDBLEN;
+ smcm->smcm_va_cmd->Request.CDB[0] = type;
+}
+
+void
+smrt_write_message_abort_one(smrt_command_t *smcm, uint32_t tag)
+{
+ smrt_tag_t cisstag;
+
+ /*
+ * When aborting a particular command, the request is addressed
+ * to the controller.
+ */
+ smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN,
+ B_TRUE, 0, 0);
+
+ smrt_write_message_common(smcm, CISS_MSG_ABORT, 0);
+
+ /*
+ * Abort a single command.
+ */
+ smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASK;
+
+ /*
+ * The CISS Specification says that the tag value for a task-level
+ * abort should be in the CDB in bytes 4-11.
+ */
+ bzero(&cisstag, sizeof (cisstag));
+ cisstag.tag_value = tag;
+ bcopy(&cisstag, &smcm->smcm_va_cmd->Request.CDB[4],
+ sizeof (cisstag));
+}
+
+void
+smrt_write_message_abort_all(smrt_command_t *smcm, LUNAddr_t *addr)
+{
+ /*
+ * When aborting all tasks for a particular Logical Volume,
+ * the command is addressed not to the controller but to
+ * the Volume itself.
+ */
+ smcm->smcm_va_cmd->Header.LUN = *addr;
+
+ smrt_write_message_common(smcm, CISS_MSG_ABORT, 0);
+
+ /*
+ * Abort all commands for a particular Logical Volume.
+ */
+ smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASKSET;
+}
+
+void
+smrt_write_message_event_notify(smrt_command_t *smcm)
+{
+ smrt_event_notify_req_t senr;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+ smcm->smcm_va_cmd->Request.Timeout = 0;
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr);
+
+ bzero(&senr, sizeof (senr));
+ senr.senr_opcode = CISS_SCMD_READ;
+ senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT;
+ senr.senr_flags = BE_32(0);
+ senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN);
+
+ bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (senr)));
+}
+
+void
+smrt_write_message_cancel_event_notify(smrt_command_t *smcm)
+{
+ smrt_event_notify_req_t senr;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_WRITE;
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(SMRT_ASYNC_CANCEL_TIMEOUT);
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr);
+
+ bzero(&senr, sizeof (senr));
+ senr.senr_opcode = CISS_SCMD_WRITE;
+ senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT_CANCEL;
+ senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN);
+
+ bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (senr)));
+}
+
+void
+smrt_write_message_reset_ctlr(smrt_command_t *smcm)
+{
+ smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN,
+ B_TRUE, 0, 0);
+
+ smrt_write_message_common(smcm, CISS_MSG_RESET, 0);
+
+ smcm->smcm_va_cmd->Request.CDB[1] = CISS_RESET_CTLR;
+}
+
+void
+smrt_write_message_nop(smrt_command_t *smcm, int timeout_secs)
+{
+ /*
+ * No-op messages are always sent to the controller.
+ */
+ smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN,
+ B_TRUE, 0, 0);
+
+ smrt_write_message_common(smcm, CISS_MSG_NOP, timeout_secs);
+}
+
+/*
+ * This routine is executed regularly by ddi_periodic_add(9F). It checks the
+ * health of the controller and looks for submitted commands that have timed
+ * out.
+ */
+void
+smrt_periodic(void *arg)
+{
+ smrt_t *smrt = arg;
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Before anything else, check whether we have a pending request to
+ * kick off discovery. We do this ahead of checking whether the
+ * controller is running, as a discovery may be required to finish a
+ * controller reset.
+ */
+ if ((smrt->smrt_status & SMRT_CTLR_DISCOVERY_PERIODIC) != 0 &&
+ (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) == 0 &&
+ (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) == 0) {
+ if (ddi_taskq_dispatch(smrt->smrt_discover_taskq,
+ smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) {
+ smrt->smrt_stats.smrts_discovery_tq_errors++;
+ } else {
+ smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_PERIODIC;
+ }
+ }
+
+ if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) {
+ /*
+ * The device is currently not active, e.g. due to an
+ * in-progress controller reset.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ return;
+ }
+
+ /*
+ * Check on the health of the controller firmware. Note that if the
+ * controller has locked up, this routine will panic the system.
+ */
+ smrt_lockup_check(smrt);
+
+ /*
+ * Reset the event notification threshold counter.
+ */
+ smrt->smrt_event_count = 0;
+
+ /*
+ * Check inflight commands to see if they have timed out.
+ */
+ for (smrt_command_t *smcm = avl_first(&smrt->smrt_inflight);
+ smcm != NULL; smcm = AVL_NEXT(&smrt->smrt_inflight, smcm)) {
+ if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) {
+ /*
+ * Polled commands are timed out by the polling
+ * routine.
+ */
+ continue;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) {
+ /*
+ * This command has been aborted; either it will
+ * complete or the controller will be reset.
+ */
+ continue;
+ }
+
+ if (list_link_active(&smcm->smcm_link_abort)) {
+ /*
+ * Already on the abort queue.
+ */
+ continue;
+ }
+
+ if (smcm->smcm_expiry == 0) {
+ /*
+ * This command has no expiry time.
+ */
+ continue;
+ }
+
+ if (gethrtime() > smcm->smcm_expiry) {
+ list_insert_tail(&smrt->smrt_abortq, smcm);
+ smcm->smcm_status |= SMRT_CMD_STATUS_TIMEOUT;
+ }
+ }
+
+ /*
+ * Process the abort queue.
+ */
+ smrt_process_abortq(smrt);
+
+ /*
+ * Check if we have an outstanding event intervention request. Note,
+ * the command in question should always be in a state such that it is
+ * usable by the system here. The command is always prepared again by
+ * the normal event notification path, even if a reset has occurred.
+ * The reset will be processed before we'd ever consider running an
+ * event again. Note, if we fail to submit this, then we leave this for
+ * the next occurrence of the periodic.
+ */
+ if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) {
+ smrt->smrt_stats.smrts_events_intervened++;
+
+ if (smrt_submit(smrt, smrt->smrt_event_cmd) == 0) {
+ smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION;
+ }
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+}
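+
+/*
+ * A minimal sketch of how attach code would register this routine with
+ * ddi_periodic_add(9F); the smrt_periodic field name and the one-second
+ * interval are illustrative assumptions:
+ *
+ *     smrt->smrt_periodic = ddi_periodic_add(smrt_periodic, smrt,
+ *         NANOSEC, DDI_IPL_0);
+ */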
+
+int
+smrt_retrieve(smrt_t *smrt)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ smrt_retrieve_simple(smrt);
+ return (DDI_SUCCESS);
+
+ case SMRT_CTLR_MODE_UNKNOWN:
+ break;
+ }
+
+ panic("unknown controller mode");
+ /* LINTED: E_FUNC_NO_RET_VAL */
+}
+
+/*
+ * Grab a new tag number for this command. We aim to avoid reusing tag numbers
+ * as much as possible, so as to avoid spurious double completion from the
+ * controller.
+ */
+static void
+smrt_set_new_tag(smrt_t *smrt, smrt_command_t *smcm)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ /*
+ * Loop until we find a tag that is not in use. The tag space is
+ * very large (~30 bits) and the maximum number of inflight commands
+ * is comparatively small (~1024 in current controllers).
+ */
+ for (;;) {
+ uint32_t new_tag = smrt->smrt_next_tag;
+
+ if (++smrt->smrt_next_tag > SMRT_MAX_TAG_NUMBER) {
+ smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER;
+ }
+
+ if (smrt_lookup_inflight(smrt, new_tag) != NULL) {
+ /*
+ * This tag is already used on an inflight command.
+ * Choose another.
+ */
+ continue;
+ }
+
+ /*
+ * Set the tag for the command and also write it into the
+ * appropriate part of the request block.
+ */
+ smcm->smcm_tag = new_tag;
+ smcm->smcm_va_cmd->Header.Tag.tag_value = new_tag;
+ return;
+ }
+}
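+
+/*
+ * For example, when smrt_next_tag holds SMRT_MAX_TAG_NUMBER, that value is
+ * still handed out (if not already inflight), and the counter then wraps to
+ * SMRT_MIN_TAG_NUMBER for subsequent allocations.
+ */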
+
+/*
+ * Submit a command to the controller.
+ */
+int
+smrt_submit(smrt_t *smrt, smrt_command_t *smcm)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+ VERIFY(smcm->smcm_type != SMRT_CMDTYPE_PREINIT);
+
+ /*
+ * Anything that asks us to ignore the running state of the controller
+ * must be wired up to poll for completion.
+ */
+ if (smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING) {
+ VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED);
+ }
+
+ /*
+ * If the controller is currently being reset, do not allow command
+ * submission. However, if this is one of the commands needed to finish
+ * reset, as indicated on the command structure, allow it.
+ */
+ if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING) &&
+ !(smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING)) {
+ return (EIO);
+ }
+
+ /*
+ * Do not allow submission of more concurrent commands than the
+ * controller supports.
+ */
+ if (avl_numnodes(&smrt->smrt_inflight) >= smrt->smrt_maxcmds) {
+ return (EAGAIN);
+ }
+
+ /*
+ * Synchronise the Command Block DMA resources to ensure that the
+ * device has a consistent view before we pass it the command.
+ */
+ if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0,
+ DDI_DMA_SYNC_FORDEV) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "DMA sync failure");
+ return (EIO);
+ }
+
+ /*
+ * Ensure that this command is not re-used without issuing a new
+ * tag number and performing any appropriate cleanup.
+ */
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_USED));
+ smcm->smcm_status |= SMRT_CMD_STATUS_USED;
+
+ /*
+ * Assign a tag that is not currently in use
+ */
+ smrt_set_new_tag(smrt, smcm);
+
+ /*
+ * Insert this command into the inflight AVL.
+ */
+ avl_index_t where;
+ if (avl_find(&smrt->smrt_inflight, smcm, &where) != NULL) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "duplicate submit tag %x",
+ smcm->smcm_tag);
+ }
+ avl_insert(&smrt->smrt_inflight, smcm, where);
+ if (smrt->smrt_stats.smrts_max_inflight <
+ avl_numnodes(&smrt->smrt_inflight)) {
+ smrt->smrt_stats.smrts_max_inflight =
+ avl_numnodes(&smrt->smrt_inflight);
+ }
+
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT));
+ smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT;
+
+ smcm->smcm_time_submit = gethrtime();
+
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ smrt_submit_simple(smrt, smcm);
+ return (0);
+
+ case SMRT_CTLR_MODE_UNKNOWN:
+ break;
+ }
+ panic("unknown controller mode");
+ /* LINTED: E_FUNC_NO_RET_VAL */
+}
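+
+/*
+ * A minimal sketch of the internal submission pattern used later in this file
+ * (e.g., for the post-reset ping): allocate a command, write the message into
+ * it, mark it polled, then submit and poll under the mutex:
+ *
+ *     smrt_command_t *smcm;
+ *
+ *     if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ *         KM_NOSLEEP)) == NULL)
+ *             return (ENOMEM);
+ *     smrt_write_message_nop(smcm, 0);
+ *     mutex_enter(&smrt->smrt_mutex);
+ *     smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ *     if (smrt_submit(smrt, smcm) == 0)
+ *             (void) smrt_poll_for(smrt, smcm);
+ *     mutex_exit(&smrt->smrt_mutex);
+ *     smrt_command_free(smcm);
+ */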
+
+static void
+smrt_process_finishq_sync(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0,
+ DDI_DMA_SYNC_FORCPU) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "finishq DMA sync failure");
+ }
+}
+
+static void
+smrt_process_finishq_one(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE));
+ smcm->smcm_status |= SMRT_CMD_STATUS_COMPLETE;
+
+ switch (smcm->smcm_type) {
+ case SMRT_CMDTYPE_INTERNAL:
+ cv_broadcast(&smcm->smcm_ctlr->smrt_cv_finishq);
+ return;
+
+ case SMRT_CMDTYPE_SCSA:
+ smrt_hba_complete(smcm);
+ return;
+
+ case SMRT_CMDTYPE_EVENT:
+ smrt_event_complete(smcm);
+ return;
+
+ case SMRT_CMDTYPE_ABORTQ:
+ /*
+ * Abort messages sent as part of abort queue processing
+ * do not require any completion activity.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ mutex_enter(&smrt->smrt_mutex);
+ return;
+
+ case SMRT_CMDTYPE_PREINIT:
+ dev_err(smrt->smrt_dip, CE_PANIC, "preinit command "
+ "completed after initialisation");
+ return;
+ }
+
+ panic("unknown command type");
+}
+
+/*
+ * Process commands in the completion queue.
+ */
+void
+smrt_process_finishq(smrt_t *smrt)
+{
+ smrt_command_t *smcm;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ while ((smcm = list_remove_head(&smrt->smrt_finishq)) != NULL) {
+ /*
+ * Synchronise the Command Block before we read from it or
+ * free it, to ensure that any writes from the controller are
+ * visible.
+ */
+ smrt_process_finishq_sync(smcm);
+
+ /*
+ * Check if this command was in line to be aborted.
+ */
+ if (list_link_active(&smcm->smcm_link_abort)) {
+ /*
+ * This command was in line, but the controller
+ * subsequently completed the command before we
+ * were able to do so.
+ */
+ list_remove(&smrt->smrt_abortq, smcm);
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_TIMEOUT;
+ }
+
+ /*
+ * Check if this command has been abandoned by the original
+ * submitter. If it has, free it now to avoid a leak.
+ */
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ABANDONED) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ mutex_enter(&smrt->smrt_mutex);
+ continue;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) {
+ /*
+ * This command will be picked up and processed
+ * by "smrt_poll_for()" once the CV is triggered
+ * at the end of processing.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE;
+ continue;
+ }
+
+ smrt_process_finishq_one(smcm);
+ }
+
+ cv_broadcast(&smrt->smrt_cv_finishq);
+}
+
+/*
+ * Process commands in the abort queue.
+ */
+void
+smrt_process_abortq(smrt_t *smrt)
+{
+ smrt_command_t *smcm;
+ smrt_command_t *abort_smcm = NULL;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (list_is_empty(&smrt->smrt_abortq)) {
+ goto out;
+ }
+
+another:
+ mutex_exit(&smrt->smrt_mutex);
+ if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_ABORTQ,
+ KM_NOSLEEP)) == NULL) {
+ /*
+ * No resources available to send abort messages. We will
+ * try again the next time around.
+ */
+ mutex_enter(&smrt->smrt_mutex);
+ goto out;
+ }
+ mutex_enter(&smrt->smrt_mutex);
+
+ while ((smcm = list_remove_head(&smrt->smrt_abortq)) != NULL) {
+ if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) {
+ /*
+ * This message is not currently inflight, so
+ * no abort is needed.
+ */
+ continue;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) {
+ /*
+ * An abort message has already been sent for
+ * this command.
+ */
+ continue;
+ }
+
+ /*
+ * Send an abort message for the command.
+ */
+ smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag);
+ if (smrt_submit(smrt, abort_smcm) != 0) {
+ /*
+ * The command could not be submitted to the
+ * controller. Put it back in the abort queue
+ * and give up for now.
+ */
+ list_insert_head(&smrt->smrt_abortq, smcm);
+ goto out;
+ }
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT;
+
+ /*
+ * Record some debugging information about the abort we
+ * sent:
+ */
+ smcm->smcm_abort_time = gethrtime();
+ smcm->smcm_abort_tag = abort_smcm->smcm_tag;
+
+ /*
+ * The abort message was sent. Release it and
+ * allocate another command.
+ */
+ abort_smcm = NULL;
+ goto another;
+ }
+
+out:
+ cv_broadcast(&smrt->smrt_cv_finishq);
+ if (abort_smcm != NULL) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(abort_smcm);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+}
+
+int
+smrt_poll_for(smrt_t *smrt, smrt_command_t *smcm)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+ VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED);
+
+ while (!(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE)) {
+ if (smcm->smcm_expiry != 0) {
+ /*
+ * This command has an expiry time. Check to see
+ * if it has already passed:
+ */
+ if (smcm->smcm_expiry < gethrtime()) {
+ return (ETIMEDOUT);
+ }
+ }
+
+ if (ddi_in_panic()) {
+ /*
+ * When the system is panicking, there are no
+ * interrupts or other threads. Drive the polling loop
+ * on our own, but with a small delay to avoid
+ * aggravating the controller while we're trying to
+ * dump.
+ */
+ (void) smrt_retrieve(smrt);
+ smrt_process_finishq(smrt);
+ drv_usecwait(100);
+ continue;
+ }
+
+ /*
+ * Wait for command completion to return through the regular
+ * interrupt handling path.
+ */
+ if (smcm->smcm_expiry == 0) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ } else {
+ /*
+ * Wait only until the expiry time for this command.
+ */
+ (void) cv_timedwait_sig_hrtime(&smrt->smrt_cv_finishq,
+ &smrt->smrt_mutex, smcm->smcm_expiry);
+ }
+ }
+
+ /*
+ * Fire the completion callback for this command. The callback
+ * is responsible for freeing the command, so it may not be
+ * referenced again once this call returns.
+ */
+ smrt_process_finishq_one(smcm);
+
+ return (0);
+}
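+
+/*
+ * Callers typically arm an expiry before polling, as the identify path below
+ * does:
+ *
+ *     smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ *     smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ */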
+
+void
+smrt_intr_set(smrt_t *smrt, boolean_t enabled)
+{
+ /*
+ * Read the Interrupt Mask Register.
+ */
+ uint32_t imr = smrt_get32(smrt, CISS_I2O_INTERRUPT_MASK);
+
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ if (enabled) {
+ imr &= ~CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
+ } else {
+ imr |= CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
+ }
+ smrt_put32(smrt, CISS_I2O_INTERRUPT_MASK, imr);
+ return;
+
+ case SMRT_CTLR_MODE_UNKNOWN:
+ break;
+ }
+ panic("unknown controller mode");
+}
+
+/*
+ * Signal to the controller that we have updated the Configuration Table by
+ * writing to the Inbound Doorbell Register. The controller will, after some
+ * number of seconds, acknowledge this by clearing the bit.
+ *
+ * If successful, return DDI_SUCCESS. If the controller takes too long to
+ * acknowledge, return DDI_FAILURE.
+ */
+int
+smrt_cfgtbl_flush(smrt_t *smrt)
+{
+ /*
+ * Read the current value of the Inbound Doorbell Register.
+ */
+ uint32_t idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL);
+
+ /*
+ * Signal the Configuration Table change to the controller.
+ */
+ idr |= CISS_IDR_BIT_CFGTBL_CHANGE;
+ smrt_put32(smrt, CISS_I2O_INBOUND_DOORBELL, idr);
+
+ /*
+ * Wait for the controller to acknowledge the change.
+ */
+ for (unsigned i = 0; i < smrt_ciss_init_time; i++) {
+ idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL);
+
+ if ((idr & CISS_IDR_BIT_CFGTBL_CHANGE) == 0) {
+ return (DDI_SUCCESS);
+ }
+
+ /*
+ * Wait for one second before trying again.
+ */
+ delay(drv_usectohz(1000000));
+ }
+
+ dev_err(smrt->smrt_dip, CE_WARN, "time out expired before controller "
+ "configuration completed");
+ return (DDI_FAILURE);
+}
+
+int
+smrt_cfgtbl_transport_has_support(smrt_t *smrt, int xport)
+{
+ VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);
+
+ /*
+ * Read the current value of the "Supported Transport Methods" field in
+ * the Configuration Table.
+ */
+ uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->TransportSupport);
+
+ /*
+ * Check that the desired transport method is supported by the
+ * controller:
+ */
+ if ((xport_active & xport) == 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "controller does not support "
+ "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ?
+ "simple" : "performant");
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_cfgtbl_transport_set(smrt_t *smrt, int xport)
+{
+ VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);
+
+ ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->TransportRequest,
+ xport);
+}
+
+int
+smrt_cfgtbl_transport_confirm(smrt_t *smrt, int xport)
+{
+ VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);
+
+ /*
+ * Read the current value of the TransportActive field in the
+ * Configuration Table.
+ */
+ uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->TransportActive);
+
+ /*
+ * Check that the desired transport method is now active:
+ */
+ if ((xport_active & xport) == 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to enable transport "
+ "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ?
+ "simple" : "performant");
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Ensure that the controller is now ready to accept commands.
+ */
+ if ((xport_active & CISS_CFGTBL_READY_FOR_COMMANDS) == 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "controller not ready to "
+ "accept commands");
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+uint32_t
+smrt_ctlr_get_maxsgelements(smrt_t *smrt)
+{
+ return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->MaxSGElements));
+}
+
+uint32_t
+smrt_ctlr_get_cmdsoutmax(smrt_t *smrt)
+{
+ return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->CmdsOutMax));
+}
+
+static uint32_t
+smrt_ctlr_get_hostdrvsup(smrt_t *smrt)
+{
+ return (ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->HostDrvrSupport));
+}
+
+int
+smrt_ctlr_init(smrt_t *smrt)
+{
+ uint8_t signature[4] = { 'C', 'I', 'S', 'S' };
+ int e;
+
+ if ((e = smrt_ctlr_wait_for_state(smrt,
+ SMRT_WAIT_STATE_READY)) != DDI_SUCCESS) {
+ return (e);
+ }
+
+ /*
+ * The configuration table contains an ASCII signature ("CISS") which
+ * should be checked as we initialise the controller.
+ * See: "9.1 Configuration Table" in CISS Specification.
+ */
+ for (unsigned i = 0; i < 4; i++) {
+ if (ddi_get8(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->Signature[i]) != signature[i]) {
+ dev_err(smrt->smrt_dip, CE_WARN, "invalid signature "
+ "detected");
+ return (DDI_FAILURE);
+ }
+ }
+
+ /*
+ * Initialise an appropriate Transport Method. For now, this driver
+ * only supports the "Simple" method.
+ */
+ if ((e = smrt_ctlr_init_simple(smrt)) != DDI_SUCCESS) {
+ return (e);
+ }
+
+ /*
+ * Save some common feature support bitfields.
+ */
+ smrt->smrt_host_support = smrt_ctlr_get_hostdrvsup(smrt);
+ smrt->smrt_bus_support = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->BusTypes);
+
+ /*
+ * Read initial controller heartbeat value and mark the current
+ * reading time.
+ */
+ smrt->smrt_last_heartbeat = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->HeartBeat);
+ smrt->smrt_last_heartbeat_time = gethrtime();
+
+ /*
+ * Determine the firmware version of the controller so that we can
+ * select which type of interrupts to use.
+ */
+ if ((e = smrt_ctlr_versions(smrt, SMRT_DISCOVER_TIMEOUT,
+ &smrt->smrt_versions)) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "could not identify "
+ "controller (%d)", e);
+ return (DDI_FAILURE);
+ }
+
+ dev_err(smrt->smrt_dip, CE_NOTE, "!firmware rev %s",
+ smrt->smrt_versions.smrtv_firmware_rev);
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_ctlr_teardown(smrt_t *smrt)
+{
+ smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING;
+
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ smrt_ctlr_teardown_simple(smrt);
+ return;
+
+ case SMRT_CTLR_MODE_UNKNOWN:
+ return;
+ }
+
+ panic("unknown controller mode");
+}
+
+int
+smrt_ctlr_wait_for_state(smrt_t *smrt, smrt_wait_state_t state)
+{
+ unsigned wait_usec = 100 * 1000;
+ unsigned wait_count = SMRT_WAIT_DELAY_SECONDS * 1000000 / wait_usec;
+
+ VERIFY(state == SMRT_WAIT_STATE_READY ||
+ state == SMRT_WAIT_STATE_UNREADY);
+
+ /*
+ * Read from the Scratchpad Register until the expected ready signature
+ * is detected. This behaviour is not described in the CISS
+ * specification.
+ *
+ * If the device is not in the desired state immediately, sleep for a
+ * tenth of a second and try again. If the device has not become ready
+ * in 300 seconds, give up.
+ */
+ for (unsigned i = 0; i < wait_count; i++) {
+ uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD);
+
+ switch (state) {
+ case SMRT_WAIT_STATE_READY:
+ if (spr == CISS_SCRATCHPAD_INITIALISED) {
+ return (DDI_SUCCESS);
+ }
+ break;
+
+ case SMRT_WAIT_STATE_UNREADY:
+ if (spr != CISS_SCRATCHPAD_INITIALISED) {
+ return (DDI_SUCCESS);
+ }
+ break;
+ }
+
+ if (ddi_in_panic()) {
+ /*
+ * There is no sleep for the panicking, so we
+ * must spin wait:
+ */
+ drv_usecwait(wait_usec);
+ } else {
+ /*
+ * Wait for a tenth of a second and try again.
+ */
+ delay(drv_usectohz(wait_usec));
+ }
+ }
+
+ dev_err(smrt->smrt_dip, CE_WARN, "time out waiting for controller "
+ "to enter state \"%s\"", state == SMRT_WAIT_STATE_READY ?
+ "ready": "unready");
+ return (DDI_FAILURE);
+}
+
+void
+smrt_lockup_check(smrt_t *smrt)
+{
+ /*
+ * Read the current controller heartbeat value.
+ */
+ uint32_t heartbeat = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->HeartBeat);
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ /*
+ * Check to see if the value is the same as last time we looked:
+ */
+ if (heartbeat != smrt->smrt_last_heartbeat) {
+ /*
+ * The heartbeat value has changed, which suggests that the
+ * firmware in the controller has not yet come to a complete
+ * stop. Record the new value, as well as the current time.
+ */
+ smrt->smrt_last_heartbeat = heartbeat;
+ smrt->smrt_last_heartbeat_time = gethrtime();
+ return;
+ }
+
+ /*
+ * The controller _might_ have been able to signal to us that it
+ * has locked up. This is a truly unfathomable state of affairs:
+ * If the firmware can tell it has flown off the rails, why not
+ * simply reset the controller?
+ */
+ uint32_t odr = smrt_get32(smrt, CISS_I2O_OUTBOUND_DOORBELL_STATUS);
+ uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD);
+ if ((odr & CISS_ODR_BIT_LOCKUP) != 0) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has "
+ "reported a critical fault (odr %08x spr %08x)",
+ odr, spr);
+ }
+
+ if (gethrtime() > smrt->smrt_last_heartbeat_time + 60 * NANOSEC) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has "
+ "stopped responding (odr %08x spr %08x)",
+ odr, spr);
+ }
+}
+
+/*
+ * Probe the controller with the IDENTIFY CONTROLLER request. This is a BMIC
+ * command, so it must be submitted to the controller and we must poll for its
+ * completion. This functionality is only presently used during controller
+ * initialisation, so it uses the special pre-initialisation path for command
+ * allocation and submission.
+ */
+static int
+smrt_ctlr_identify(smrt_t *smrt, uint16_t timeout,
+ smrt_identify_controller_t *resp)
+{
+ smrt_command_t *smcm;
+ smrt_identify_controller_req_t smicr;
+ int r;
+ size_t sz;
+
+ /*
+ * Allocate a command with a data buffer; the controller will fill it
+ * with identification information. There is some suggestion in the
+ * firmware-level specification that the buffer length should be a
+ * multiple of 512 bytes for some controllers, so we round up.
+ */
+ sz = P2ROUNDUP_TYPED(sizeof (*resp), 512, size_t);
+ if ((smcm = smrt_command_alloc_preinit(smrt, sz, KM_SLEEP)) == NULL) {
+ return (ENOMEM);
+ }
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (smicr);
+ smcm->smcm_va_cmd->Request.Timeout = timeout;
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+
+ /*
+ * Construct the IDENTIFY CONTROLLER request CDB. Note that any
+ * reserved fields in the request must be filled with zeroes.
+ */
+ bzero(&smicr, sizeof (smicr));
+ smicr.smicr_opcode = CISS_SCMD_BMIC_READ;
+ smicr.smicr_lun = 0;
+ smicr.smicr_command = CISS_BMIC_IDENTIFY_CONTROLLER;
+ bcopy(&smicr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (smicr)));
+
+ /*
+ * Send the command to the device and poll for its completion.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ if ((r = smrt_preinit_command_simple(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * This command timed out, but the driver is not presently
+ * initialised to the point where we can try to abort it.
+ * The command was created with the PREINIT type, so it
+ * does not appear in the global command tracking list.
+ * In order to avoid problems with DMA from the controller,
+ * we have to leak the command allocation.
+ */
+ smcm = NULL;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+ * The controller was reset while we were trying to identify
+ * it. Report failure.
+ */
+ r = EIO;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "identify "
+ "controller error: status 0x%x",
+ ei->CommandStatus);
+ r = EIO;
+ goto out;
+ }
+ }
+
+ if (resp != NULL) {
+ /*
+ * Copy the identify response out for the caller.
+ */
+ bcopy(smcm->smcm_internal->smcmi_va, resp, sizeof (*resp));
+ }
+
+ r = 0;
+
+out:
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+ return (r);
+}
+
+/*
+ * The firmware versions in an IDENTIFY CONTROLLER response generally take
+ * the form of a four byte ASCII string containing a dotted decimal version
+ * number; e.g., "8.00".
+ *
+ * This function sanitises the firmware version, replacing unexpected
+ * values with a question mark.
+ */
+static void
+smrt_copy_firmware_version(uint8_t *src, char *dst)
+{
+ for (unsigned i = 0; i < 4; i++) {
+ /*
+ * Make sure that this is a 7-bit clean ASCII value.
+ */
+ char c = src[i] <= 0x7f ? (char)(src[i] & 0x7f) : '?';
+
+ if (isalnum(c) || c == '.' || c == ' ') {
+ dst[i] = c;
+ } else {
+ dst[i] = '?';
+ }
+ }
+ dst[4] = '\0';
+}
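+
+/*
+ * For example, the raw bytes { '8', '.', '0', '0' } pass through unchanged as
+ * the string "8.00", while a byte outside the accepted set (say, 0xff) is
+ * replaced with '?'.
+ */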
+
+/*
+ * Using an IDENTIFY CONTROLLER request, determine firmware and controller
+ * version details. See the comments for "smrt_ctlr_identify()" for more
+ * details about calling context.
+ */
+static int
+smrt_ctlr_versions(smrt_t *smrt, uint16_t timeout, smrt_versions_t *smrtv)
+{
+ smrt_identify_controller_t smic;
+ int r;
+
+ if ((r = smrt_ctlr_identify(smrt, timeout, &smic)) != 0) {
+ return (r);
+ }
+
+ smrtv->smrtv_hardware_version = smic.smic_hardware_version;
+ smrt_copy_firmware_version(smic.smic_firmware_rev,
+ smrtv->smrtv_firmware_rev);
+ smrt_copy_firmware_version(smic.smic_recovery_rev,
+ smrtv->smrtv_recovery_rev);
+ smrt_copy_firmware_version(smic.smic_bootblock_rev,
+ smrtv->smrtv_bootblock_rev);
+
+ return (0);
+}
+
+int
+smrt_ctlr_reset(smrt_t *smrt)
+{
+ smrt_command_t *smcm, *smcm_nop;
+ int r;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (ddi_in_panic()) {
+ goto skip_check;
+ }
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ /*
+ * Don't pile on. One reset is enough. Wait until
+ * it's complete, and then return success.
+ */
+ while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ }
+ return (0);
+ }
+ smrt->smrt_status |= SMRT_CTLR_STATUS_RESETTING;
+ smrt->smrt_last_reset_start = gethrtime();
+ smrt->smrt_stats.smrts_ctlr_resets++;
+
+skip_check:
+ /*
+ * Allocate two commands: one for the soft reset message, which we
+ * cannot free until the controller has reset; and one for the ping we
+ * will use to determine when it is once again functional.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL) {
+ mutex_enter(&smrt->smrt_mutex);
+ return (ENOMEM);
+ }
+ if ((smcm_nop = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL) {
+ smrt_command_free(smcm);
+ mutex_enter(&smrt->smrt_mutex);
+ return (ENOMEM);
+ }
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send a soft reset command to the controller. If this command
+ * succeeds, there will likely be no completion notification. Instead,
+ * the device should become unavailable for some period of time and
+ * then become available again. Once available again, we know the soft
+ * reset has completed and should abort all in-flight commands.
+ */
+ smrt_write_message_reset_ctlr(smcm);
+
+ /*
+ * Disable interrupts now.
+ */
+ smrt_intr_set(smrt, B_FALSE);
+
+ dev_err(smrt->smrt_dip, CE_WARN, "attempting controller soft reset");
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "submit failed (%d)", r);
+ }
+
+ /*
+ * Mark every currently inflight command as being reset, including the
+ * soft reset command we just sent. Once we confirm the reset works,
+ * we can safely report that these commands have failed.
+ */
+ for (smrt_command_t *t = avl_first(&smrt->smrt_inflight);
+ t != NULL; t = AVL_NEXT(&smrt->smrt_inflight, t)) {
+ t->smcm_status |= SMRT_CMD_STATUS_RESET_SENT;
+ }
+
+ /*
+ * Now that we have submitted our soft reset command, prevent
+ * the rest of the driver from interacting with the controller.
+ */
+ smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING;
+
+ /*
+ * We do not expect a completion from the controller for our soft
+ * reset command, but we also cannot remove it from the inflight
+ * list until we know the controller has actually reset. To do
+ * otherwise would potentially allow the controller to scribble
+ * on the memory we were using.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+
+ if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_UNREADY) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "controller did not become unready");
+ }
+ dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller unready");
+
+ if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_READY) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "controller did not come become ready");
+ }
+ dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller ready");
+
+ /*
+ * In at least the Smart Array P420i, the controller can take 30-45
+ * seconds after the scratchpad register shows it as being available
+ * before it is ready to receive commands. In order to avoid hitting
+ * it too early with our post-reset ping, we will sleep for 10 seconds
+ * here.
+ */
+ if (ddi_in_panic()) {
+ drv_usecwait(10 * MICROSEC);
+ } else {
+ delay(drv_usectohz(10 * MICROSEC));
+ }
+
+ smrt_ctlr_teardown(smrt);
+ if (smrt_ctlr_init(smrt) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "controller transport could not be configured");
+ }
+ dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller configured");
+
+ smrt_write_message_nop(smcm_nop, 0);
+ smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLLED |
+ SMRT_CMD_IGNORE_RUNNING;
+ if ((r = smrt_submit(smrt, smcm_nop)) != 0) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "ping could not be submitted (%d)", r);
+ }
+
+ /*
+ * Interrupts are still masked at this stage. Poll manually in
+ * a way that will not trigger regular finish queue processing:
+ */
+ VERIFY(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT);
+ for (unsigned i = 0; i < 600; i++) {
+ smrt_retrieve_simple(smrt);
+
+ if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) {
+ /*
+ * Remove the ping command from the finish queue and
+ * process it manually. This processing must mirror
+ * what would have been done in smrt_process_finishq().
+ */
+ VERIFY(list_link_active(&smcm_nop->smcm_link_finish));
+ list_remove(&smrt->smrt_finishq, smcm_nop);
+ smrt_process_finishq_sync(smcm_nop);
+ smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE;
+ smrt_process_finishq_one(smcm_nop);
+ break;
+ }
+
+ if (ddi_in_panic()) {
+ drv_usecwait(100 * 1000);
+ } else {
+ delay(drv_usectohz(100 * 1000));
+ }
+ }
+
+ if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_COMPLETE)) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "ping did not complete");
+ } else if (smcm_nop->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ dev_err(smrt->smrt_dip, CE_WARN, "soft reset: ping completed "
+ "in error (status %u)",
+ (unsigned)smcm_nop->smcm_va_err->CommandStatus);
+ } else {
+ dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: ping completed");
+ }
+
+ /*
+ * Now that the controller is working again, we can abort any
+ * commands that were inflight during the reset.
+ */
+ smrt_command_t *nt;
+ for (smrt_command_t *t = avl_first(&smrt->smrt_inflight);
+ t != NULL; t = nt) {
+ nt = AVL_NEXT(&smrt->smrt_inflight, t);
+
+ if (t->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ avl_remove(&smrt->smrt_inflight, t);
+ t->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
+
+ list_insert_tail(&smrt->smrt_finishq, t);
+ }
+ }
+
+ /*
+ * Quiesce our discovery thread. Note, because
+ * SMRT_CTLR_STATUS_RESETTING is still set, nothing can cause discovery
+ * to be dispatched again.
+ */
+ if (!ddi_in_panic()) {
+ mutex_exit(&smrt->smrt_mutex);
+ ddi_taskq_wait(smrt->smrt_discover_taskq);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ /*
+ * Re-enable interrupts. Now, we must kick off a discovery to make sure
+ * that the system is in a sane state and that we can perform I/O.
+ */
+ smrt_intr_set(smrt, B_TRUE);
+ smrt->smrt_status &= ~SMRT_CTLR_STATUS_RESETTING;
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUIRED;
+
+ /*
+ * Attempt a discovery to make sure that the driver sees a realistic
+ * view of the world. If we're not in panic context, wait for the
+ * asynchronous process to complete; otherwise we're in panic context
+ * and discovery runs synchronously whether we want it to or not.
+ * Before we kick off the request to run discovery, we reset the
+ * discovery request flags, as we know that nothing else can be
+ * running discovery and we don't want to delay until the next smrt
+ * periodic tick if we can avoid it. In panic context, if this fails,
+ * then we won't make it back.
+ */
+ VERIFY0(smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING);
+ smrt->smrt_status &= ~(SMRT_CTLR_DISCOVERY_MASK);
+ smrt_discover(smrt);
+ if (!ddi_in_panic()) {
+ while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ }
+ }
+
+ smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING;
+ smrt->smrt_last_reset_finish = gethrtime();
+
+ /*
+ * Wake anybody that was waiting for the reset to complete.
+ */
+ cv_broadcast(&smrt->smrt_cv_finishq);
+
+ /*
+ * Process the completion queue one last time before we let go
+ * of the mutex.
+ */
+ smrt_process_finishq(smrt);
+
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm_nop);
+ mutex_enter(&smrt->smrt_mutex);
+ return (0);
+}
+
+int
+smrt_event_init(smrt_t *smrt)
+{
+ int ret;
+ smrt_command_t *event, *cancel;
+
+ event = smrt_command_alloc(smrt, SMRT_CMDTYPE_EVENT, KM_NOSLEEP);
+ if (event == NULL)
+ return (ENOMEM);
+ if (smrt_command_attach_internal(smrt, event, SMRT_EVENT_NOTIFY_BUFLEN,
+ KM_NOSLEEP) != 0) {
+ smrt_command_free(event);
+ return (ENOMEM);
+ }
+ smrt_write_message_event_notify(event);
+
+ cancel = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_NOSLEEP);
+ if (cancel == NULL) {
+ smrt_command_free(event);
+ return (ENOMEM);
+ }
+ if (smrt_command_attach_internal(smrt, cancel, SMRT_EVENT_NOTIFY_BUFLEN,
+ KM_NOSLEEP) != 0) {
+ smrt_command_free(event);
+ smrt_command_free(cancel);
+ return (ENOMEM);
+ }
+ smrt_write_message_cancel_event_notify(cancel);
+
+ cv_init(&smrt->smrt_event_queue, NULL, CV_DRIVER, NULL);
+
+ mutex_enter(&smrt->smrt_mutex);
+ if ((ret = smrt_submit(smrt, event)) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(event);
+ smrt_command_free(cancel);
+ return (ret);
+ }
+
+ smrt->smrt_event_cmd = event;
+ smrt->smrt_event_cancel_cmd = cancel;
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (0);
+}
+
+void
+smrt_event_complete(smrt_command_t *smcm)
+{
+ smrt_event_notify_t *sen;
+ boolean_t log, rescan;
+
+ boolean_t intervene = B_FALSE;
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+ VERIFY3P(smcm, ==, smrt->smrt_event_cmd);
+ VERIFY0(smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION);
+
+ smrt->smrt_stats.smrts_events_received++;
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
+ cv_signal(&smrt->smrt_event_queue);
+ return;
+ }
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ intervene = B_TRUE;
+ goto clean;
+ }
+
+ /*
+ * The event notification command failed for some reason. Attempt to
+ * drive on and resubmit it at the next intervention period. Because
+ * this may represent a programmer error (though it's hard to know), we
+ * wait for that period rather than panicking.
+ */
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+ intervene = B_TRUE;
+
+ smrt->smrt_stats.smrts_events_errors++;
+ dev_err(smrt->smrt_dip, CE_WARN, "!event notification request "
+ "error: status 0x%x", ei->CommandStatus);
+ goto clean;
+ }
+
+ sen = smcm->smcm_internal->smcmi_va;
+ log = rescan = B_FALSE;
+ switch (sen->sen_class) {
+ case SMRT_EVENT_CLASS_PROTOCOL:
+ /*
+ * Most of the event protocol class events aren't really
+ * actionable. However, subclass 1 indicates errors. Today,
+ * the only error is an event overflow. If there's an event
+ * overflow, then we must assume that we need to rescan.
+ */
+ if (sen->sen_subclass == SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR) {
+ rescan = B_TRUE;
+ }
+ break;
+ case SMRT_EVENT_CLASS_HOTPLUG:
+ /*
+ * We want to log all hotplug events. However we only need to
+ * scan these if the subclass indicates the event is for a disk.
+ */
+ log = B_TRUE;
+ if (sen->sen_subclass == SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE) {
+ rescan = B_TRUE;
+ }
+ break;
+ case SMRT_EVENT_CLASS_HWERROR:
+ case SMRT_EVENT_CLASS_ENVIRONMENT:
+ log = B_TRUE;
+ break;
+ case SMRT_EVENT_CLASS_PHYS:
+ log = B_TRUE;
+ /*
+ * This subclass indicates some change for physical drives. As
+ * such, this should trigger a rescan.
+ */
+ if (sen->sen_subclass == SMRT_EVENT_PHYS_SUBCLASS_STATE) {
+ rescan = B_TRUE;
+ }
+ break;
+ case SMRT_EVENT_CLASS_LOGVOL:
+ rescan = B_TRUE;
+ log = B_TRUE;
+ break;
+ default:
+ /*
+ * While there are other classes of events, it's hard to say how
+ * actionable they are for the moment. If we revamp this such
+ * that it becomes an ireport based system, then we should just
+ * always log these. We opt not to at the moment to try and be
+ * kind to the system log.
+ */
+ break;
+ }
+
+ /*
+ * Ideally, this would be an ireport that we could pass onto
+ * administrators; however, since we don't have any way to generate
+ * that, we provide a subset of the event information.
+ */
+ if (log) {
+ const char *rmsg;
+ if (rescan == B_TRUE) {
+ rmsg = "rescanning";
+ } else {
+ rmsg = "not rescanning";
+ }
+ if (sen->sen_message[0] != '\0') {
+ sen->sen_message[sizeof (sen->sen_message) - 1] = '\0';
+ dev_err(smrt->smrt_dip, CE_NOTE, "!controller event "
+ "class/sub-class/detail %x, %x, %x: %s; %s devices",
+ sen->sen_class, sen->sen_subclass, sen->sen_detail,
+ sen->sen_message, rmsg);
+ } else {
+ dev_err(smrt->smrt_dip, CE_NOTE, "!controller event "
+ "class/sub-class/detail %x, %x, %x; %s devices",
+ sen->sen_class, sen->sen_subclass, sen->sen_detail,
+ rmsg);
+ }
+ }
+
+ if (rescan)
+ smrt_discover_request(smrt);
+
+clean:
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_reuse(smcm);
+ bzero(smcm->smcm_internal->smcmi_va, SMRT_EVENT_NOTIFY_BUFLEN);
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Make sure we're not _now_ detaching or resetting.
+ */
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
+ cv_signal(&smrt->smrt_event_queue);
+ return;
+ }
+
+ if ((smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) != 0 ||
+ intervene == B_TRUE) {
+ smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
+ return;
+ }
+
+ /*
+ * Check our event count for this tick. If it's too high, leave it for
+ * the intervention path to resolve; most likely there is some serious
+ * driver or firmware error going on.
+ */
+ smrt->smrt_event_count++;
+ if (smrt->smrt_event_count > smrt_event_intervention_threshold) {
+ smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
+ return;
+ }
+
+ if (smrt_submit(smrt, smcm) != 0) {
+ smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
+ }
+}
+
+void
+smrt_event_fini(smrt_t *smrt)
+{
+ int ret;
+ smrt_command_t *event, *cancel;
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * If intervention has been requested, there is nothing for us to do. We
+ * clear the flag so nothing else accidentally sees this and takes
+ * action. We also don't need to bother sending a cancellation request,
+ * as there is no outstanding event.
+ */
+ if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) {
+ smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION;
+ goto free;
+ }
+
+ /*
+ * Submit a cancel request for the event notification queue. Because we
+ * submit both the cancellation and the regular notification event as
+ * ORDERED commands, we know that by the time the cancellation
+ * completes, the outstanding notification request will have completed
+ * as well.
+ */
+ smrt->smrt_event_cancel_cmd->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((ret = smrt_submit(smrt, smrt->smrt_event_cancel_cmd)) != 0) {
+ /*
+ * This is unfortunate. We've failed to submit the command. At
+ * this point all we can do is reset the device. If the reset
+ * succeeds, we're done and we can clear all the memory. If it
+ * fails, then all we can do is just leak the command and scream
+ * to the system, sorry.
+ */
+ if (smrt_ctlr_reset(smrt) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to reset "
+ "device after failure to submit cancellation "
+ "(%d), abandoning smrt_command_t at address %p",
+ ret, smrt->smrt_event_cmd);
+ smrt->smrt_event_cmd = NULL;
+ goto free;
+ }
+ }
+
+ smrt->smrt_event_cancel_cmd->smcm_expiry = gethrtime() +
+ SMRT_ASYNC_CANCEL_TIMEOUT * NANOSEC;
+ if ((ret = smrt_poll_for(smrt, smrt->smrt_event_cancel_cmd)) != 0) {
+ VERIFY3S(ret, ==, ETIMEDOUT);
+ VERIFY0(smrt->smrt_event_cancel_cmd->smcm_status &
+ SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out. All we can do is hope a reset will
+ * work.
+ */
+ if (smrt_ctlr_reset(smrt) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to reset "
+ "device after failure to poll for async "
+ "cancellation command abandoning smrt_command_t "
+ "event command at address %p and cancellation "
+ "command at %p", smrt->smrt_event_cmd,
+ smrt->smrt_event_cancel_cmd);
+ smrt->smrt_event_cmd = NULL;
+ smrt->smrt_event_cancel_cmd = NULL;
+ goto free;
+ }
+
+ }
+
+ /*
+ * Well, in the end, it's results that count.
+ */
+ if (smrt->smrt_event_cancel_cmd->smcm_status &
+ SMRT_CMD_STATUS_RESET_SENT) {
+ goto free;
+ }
+
+ if (smrt->smrt_event_cancel_cmd->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smrt->smrt_event_cancel_cmd->smcm_va_err;
+
+ /*
+ * This can return a CISS_CMD_TARGET_STATUS entry when the
+ * controller doesn't think a command is outstanding. It is
+ * possible we raced, so don't think too much about that case.
+ * Anything else leaves us between a rock and a hard place, the
+ * only way out is a reset.
+ */
+ if (ei->CommandStatus != CISS_CMD_TARGET_STATUS &&
+ smrt_ctlr_reset(smrt) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to reset "
+ "device after receiving an error on the async "
+ "cancellation command (%d); abandoning "
+ "smrt_command_t event command at address %p and "
+ "cancellation command at %p", ei->CommandStatus,
+ smrt->smrt_event_cmd, smrt->smrt_event_cancel_cmd);
+ smrt->smrt_event_cmd = NULL;
+ smrt->smrt_event_cancel_cmd = NULL;
+ goto free;
+ }
+ }
+
+free:
+ event = smrt->smrt_event_cmd;
+ smrt->smrt_event_cmd = NULL;
+ cancel = smrt->smrt_event_cancel_cmd;
+ smrt->smrt_event_cancel_cmd = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ if (event != NULL)
+ smrt_command_free(event);
+ if (cancel != NULL)
+ smrt_command_free(cancel);
+ cv_destroy(&smrt->smrt_event_queue);
+}
+
+/*
+ * We've been asked to do a discovery in panic context. This would have
+ * occurred because there was a controller reset. Because we can't rely on the
+ * target maps, all we can do at the moment is walk the active targets and note
+ * which ones no longer exist. If one of these targets is required for the
+ * dump, then the dump code will encounter a fatal error. If not, then we
+ * should count ourselves surprisingly lucky.
+ */
+static void
+smrt_discover_panic_check(smrt_t *smrt)
+{
+ smrt_target_t *smtg;
+
+ ASSERT(MUTEX_HELD(&smrt->smrt_mutex));
+ for (smtg = list_head(&smrt->smrt_targets); smtg != NULL;
+ smtg = list_next(&smrt->smrt_targets, smtg)) {
+ uint64_t gen;
+
+ if (smtg->smtg_physical) {
+ smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys;
+ /*
+ * Don't worry about drives that aren't visible.
+ */
+ if (!smpt->smpt_visible)
+ continue;
+ gen = smpt->smpt_gen;
+ } else {
+ smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol;
+ gen = smlv->smlv_gen;
+ }
+
+ if (gen != smrt->smrt_discover_gen) {
+ dev_err(smrt->smrt_dip, CE_WARN, "target %s "
+ "disappeared during post-panic discovery",
+ scsi_device_unit_address(smtg->smtg_scsi_dev));
+ smtg->smtg_gone = B_TRUE;
+ }
+ }
+}
+
+static void
+smrt_discover(void *arg)
+{
+ int log = 0, phys = 0;
+ smrt_t *smrt = arg;
+ uint64_t gen;
+ boolean_t runphys, runvirt;
+
+ mutex_enter(&smrt->smrt_mutex);
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_RUNNING;
+ smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUESTED;
+
+ smrt->smrt_discover_gen++;
+ gen = smrt->smrt_discover_gen;
+ runphys = smrt->smrt_phys_tgtmap != NULL;
+ runvirt = smrt->smrt_virt_tgtmap != NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ if (runphys)
+ phys = smrt_phys_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen);
+ if (runvirt)
+ log = smrt_logvol_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen);
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (phys != 0 || log != 0) {
+ if (!ddi_in_panic()) {
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
+ } else {
+ panic("smrt_t %p failed to perform discovery after "
+ "a reset in panic context, unable to continue. "
+ "logvol: %d, phys: %d", smrt, log, phys);
+ }
+ } else {
+ if (!ddi_in_panic() &&
+ smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) {
+ smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUIRED;
+ cv_broadcast(&smrt->smrt_cv_finishq);
+ }
+
+ if (ddi_in_panic()) {
+ smrt_discover_panic_check(smrt);
+ }
+ }
+ smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_RUNNING;
+ if (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUESTED)
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
+ mutex_exit(&smrt->smrt_mutex);
+}
+
+/*
+ * Request discovery, which is always run via a taskq.
+ */
+void
+smrt_discover_request(smrt_t *smrt)
+{
+ boolean_t run;
+ ASSERT(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (ddi_in_panic()) {
+ smrt_discover(smrt);
+ return;
+ }
+
+ run = (smrt->smrt_status & SMRT_CTLR_DISCOVERY_MASK) == 0;
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUESTED;
+ if (run && ddi_taskq_dispatch(smrt->smrt_discover_taskq,
+ smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) {
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
+ smrt->smrt_stats.smrts_discovery_tq_errors++;
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c
new file mode 100644
index 0000000000..1b3d7b2602
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c
@@ -0,0 +1,282 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+uint_t
+smrt_isr_hw_simple(caddr_t arg1, caddr_t arg2)
+{
+ _NOTE(ARGUNUSED(arg2))
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ smrt_t *smrt = (smrt_t *)arg1;
+ uint32_t isr = smrt_get32(smrt, CISS_I2O_INTERRUPT_STATUS);
+ hrtime_t now = gethrtime();
+
+ mutex_enter(&smrt->smrt_mutex);
+ if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) {
+ smrt->smrt_stats.smrts_unclaimed_interrupts++;
+ smrt->smrt_last_interrupt_unclaimed = now;
+
+ /*
+ * We should not be receiving interrupts from the controller
+ * while the driver is not running.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ /*
+ * Check to see if this interrupt came from the device:
+ */
+ if ((isr & CISS_ISR_BIT_SIMPLE_INTR) == 0) {
+ smrt->smrt_stats.smrts_unclaimed_interrupts++;
+ smrt->smrt_last_interrupt_unclaimed = now;
+
+ /*
+ * Check to see if the firmware has come to rest. If it has,
+ * this routine will panic the system.
+ */
+ smrt_lockup_check(smrt);
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ smrt->smrt_stats.smrts_claimed_interrupts++;
+ smrt->smrt_last_interrupt_claimed = now;
+
+ /*
+ * The interrupt was from our controller, so collect any pending
+ * command completions.
+ */
+ smrt_retrieve_simple(smrt);
+
+ /*
+ * Process any commands in the completion queue.
+ */
+ smrt_process_finishq(smrt);
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (DDI_INTR_CLAIMED);
+}
+
+/*
+ * Read tags and process completion of the associated command until the supply
+ * of tags is exhausted.
+ */
+void
+smrt_retrieve_simple(smrt_t *smrt)
+{
+ uint32_t opq;
+ uint32_t none = 0xffffffff;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ while ((opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q)) != none) {
+ uint32_t tag = CISS_OPQ_READ_TAG(opq);
+ smrt_command_t *smcm;
+
+ if ((smcm = smrt_lookup_inflight(smrt, tag)) == NULL) {
+ dev_err(smrt->smrt_dip, CE_WARN, "spurious tag %x",
+ tag);
+ continue;
+ }
+
+ avl_remove(&smrt->smrt_inflight, smcm);
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
+ if (CISS_OPQ_READ_ERROR(opq) != 0) {
+ smcm->smcm_status |= SMRT_CMD_STATUS_ERROR;
+ }
+ smcm->smcm_time_complete = gethrtime();
+
+ /*
+ * Push this command onto the completion queue.
+ */
+ list_insert_tail(&smrt->smrt_finishq, smcm);
+ }
+}
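+
+/*
+ * Note that smrt_retrieve_simple() only moves completed commands onto the
+ * finish queue; the completion callbacks themselves are fired later by
+ * smrt_process_finishq(), as in the interrupt handler above.
+ */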
+
+/*
+ * Submit a command to the controller by posting it to the Inbound Post Queue
+ * Register.
+ */
+void
+smrt_submit_simple(smrt_t *smrt, smrt_command_t *smcm)
+{
+ smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd);
+}
+
+/*
+ * Submit a command to the controller by posting it to the Inbound Post Queue
+ * Register. Immediately begin polling on the completion of that command.
+ *
+ * NOTE: This function is for controller initialisation only. It discards
+ * completions of commands other than the expected command as spurious, and
+ * will not interact correctly with the rest of the driver once it is running.
+ */
+int
+smrt_preinit_command_simple(smrt_t *smrt, smrt_command_t *smcm)
+{
+ /*
+ * The controller must be initialised to use the Simple Transport
+ * Method, but not be marked RUNNING. The command to process must be a
+ * PREINIT command with the expected tag number, marked for polling.
+ */
+ VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE);
+ VERIFY(!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING));
+ VERIFY(smcm->smcm_type == SMRT_CMDTYPE_PREINIT);
+ VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED);
+ VERIFY3U(smcm->smcm_tag, ==, SMRT_PRE_TAG_NUMBER);
+
+ /*
+ * Submit this command to the controller.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT;
+ smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd);
+
+ /*
+ * Poll the controller for completions until we see the command we just
+ * sent, or the timeout expires.
+ */
+ for (;;) {
+ uint32_t none = 0xffffffff;
+ uint32_t opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q);
+ uint32_t tag;
+
+ if (smcm->smcm_expiry != 0) {
+ /*
+ * This command has an expiry time. Check to see
+ * if it has already passed:
+ */
+ if (smcm->smcm_expiry < gethrtime()) {
+ return (ETIMEDOUT);
+ }
+ }
+
+ if (opq == none) {
+ delay(drv_usectohz(10 * 1000));
+ continue;
+ }
+
+ if ((tag = CISS_OPQ_READ_TAG(opq)) != SMRT_PRE_TAG_NUMBER) {
+ dev_err(smrt->smrt_dip, CE_WARN, "unexpected tag 0x%x"
+ " completed during driver init", tag);
+ delay(drv_usectohz(10 * 1000));
+ continue;
+ }
+
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
+ if (CISS_OPQ_READ_ERROR(opq) != 0) {
+ smcm->smcm_status |= SMRT_CMD_STATUS_ERROR;
+ }
+ smcm->smcm_time_complete = gethrtime();
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE;
+
+ return (0);
+ }
+}
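+
+/*
+ * Illustrative sketch (hypothetical caller; the 30 second timeout is an
+ * example value): preinit commands are marked for polling, and may be given
+ * an absolute hrtime expiry before submission.
+ *
+ *	smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ *	smcm->smcm_expiry = gethrtime() + 30 * (hrtime_t)NANOSEC;
+ *	if (smrt_preinit_command_simple(smrt, smcm) == ETIMEDOUT) {
+ *		... the controller never completed the command ...
+ *	}
+ */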
+
+int
+smrt_ctlr_init_simple(smrt_t *smrt)
+{
+ VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_UNKNOWN);
+
+ if (smrt_cfgtbl_transport_has_support(smrt,
+ CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+ smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_SIMPLE;
+
+ /*
+ * Disable device interrupts while we are setting up.
+ */
+ smrt_intr_set(smrt, B_FALSE);
+
+ if ((smrt->smrt_maxcmds = smrt_ctlr_get_cmdsoutmax(smrt)) == 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding "
+ "commands set to zero");
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Determine the number of Scatter/Gather List entries this controller
+ * supports. The maximum number we allow is CISS_MAXSGENTRIES: the
+ * number of elements in the static struct we use for command
+ * submission.
+ */
+ if ((smrt->smrt_sg_cnt = smrt_ctlr_get_maxsgelements(smrt)) == 0) {
+ /*
+ * The CISS specification states that if this value is
+ * zero, we should assume a value of 31 for compatibility
+ * with older firmware.
+ */
+ smrt->smrt_sg_cnt = CISS_SGCNT_FALLBACK;
+
+ } else if (smrt->smrt_sg_cnt > CISS_MAXSGENTRIES) {
+ /*
+ * If the controller supports more than we have allocated,
+ * just cap the count at the allocation size.
+ */
+ smrt->smrt_sg_cnt = CISS_MAXSGENTRIES;
+ }
+
+ /*
+ * Zero the upper 32 bits of the address in the Controller.
+ */
+ ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->Upper32Addr, 0);
+
+ /*
+ * Set the Transport Method and flush the changes to the
+ * Configuration Table.
+ */
+ smrt_cfgtbl_transport_set(smrt, CISS_CFGTBL_XPORT_SIMPLE);
+ if (smrt_cfgtbl_flush(smrt) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+
+ if (smrt_cfgtbl_transport_confirm(smrt,
+ CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Check the outstanding command cap a second time now that we have
+ * flushed out the new Transport Method. This is entirely defensive;
+ * we do not expect this value to change.
+ */
+ uint32_t check_again = smrt_ctlr_get_cmdsoutmax(smrt);
+ if (check_again != smrt->smrt_maxcmds) {
+ dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding commands "
+ "changed during initialisation (was %u, now %u)",
+ smrt->smrt_maxcmds, check_again);
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_ctlr_teardown_simple(smrt_t *smrt)
+{
+ VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE);
+
+ /*
+ * Due to the nominal simplicity of the simple mode, we have no
+ * particular teardown to perform as we do not allocate anything
+ * on the way up.
+ */
+ smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_UNKNOWN;
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c
new file mode 100644
index 0000000000..edcbfa65e2
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c
@@ -0,0 +1,362 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+
+static ddi_dma_attr_t smrt_command_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0x00000000,
+ .dma_attr_addr_hi = 0xFFFFFFFF,
+ .dma_attr_count_max = 0x00FFFFFF,
+ .dma_attr_align = 0x20,
+ .dma_attr_burstsizes = 0x20,
+ .dma_attr_minxfer = DMA_UNIT_8,
+ .dma_attr_maxxfer = 0xFFFFFFFF,
+ .dma_attr_seg = 0x0000FFFF,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 512,
+ .dma_attr_flags = 0
+};
+
+/*
+ * These device access attributes are for command block allocation, where we do
+ * not use any of the structured byte swapping facilities.
+ */
+static ddi_device_acc_attr_t smrt_command_dev_attr = {
+ .devacc_attr_version = DDI_DEVICE_ATTR_V0,
+ .devacc_attr_endian_flags = DDI_NEVERSWAP_ACC,
+ .devacc_attr_dataorder = DDI_STRICTORDER_ACC,
+ .devacc_attr_access = 0
+};
+
+
+static void smrt_contig_free(smrt_dma_t *);
+
+
+static int
+smrt_check_command_type(smrt_command_type_t type)
+{
+ /*
+ * Note that we leave out the default case in order to utilise
+ * compiler warnings about missed enum values.
+ */
+ switch (type) {
+ case SMRT_CMDTYPE_ABORTQ:
+ case SMRT_CMDTYPE_SCSA:
+ case SMRT_CMDTYPE_INTERNAL:
+ case SMRT_CMDTYPE_PREINIT:
+ case SMRT_CMDTYPE_EVENT:
+ return (type);
+ }
+
+ panic("unexpected command type");
+ /* LINTED: E_FUNC_NO_RET_VAL */
+}
+
+static int
+smrt_contig_alloc(smrt_t *smrt, smrt_dma_t *smdma, size_t sz, int kmflags,
+ void **vap, uint32_t *pap)
+{
+ caddr_t va;
+ int rv;
+ dev_info_t *dip = smrt->smrt_dip;
+ int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP :
+ DDI_DMA_DONTWAIT;
+
+ VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP);
+
+ /*
+ * Ensure we don't try to allocate a second time using the same
+ * tracking object.
+ */
+ VERIFY0(smdma->smdma_level);
+
+ if ((rv = ddi_dma_alloc_handle(dip, &smrt_command_dma_attr,
+ dma_wait, NULL, &smdma->smdma_dma_handle)) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "DMA handle allocation failed (%x)",
+ rv);
+ goto fail;
+ }
+ smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_ALLOC;
+
+ if ((rv = ddi_dma_mem_alloc(smdma->smdma_dma_handle, sz,
+ &smrt_command_dev_attr, DDI_DMA_CONSISTENT, dma_wait, NULL,
+ &va, &smdma->smdma_real_size, &smdma->smdma_acc_handle)) !=
+ DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "DMA memory allocation failed (%x)", rv);
+ goto fail;
+ }
+ smdma->smdma_level |= SMRT_DMALEVEL_MEMORY_ALLOC;
+
+ if ((rv = ddi_dma_addr_bind_handle(smdma->smdma_dma_handle,
+ NULL, va, smdma->smdma_real_size,
+ DDI_DMA_CONSISTENT | DDI_DMA_RDWR, dma_wait, NULL,
+ smdma->smdma_dma_cookies, &smdma->smdma_dma_ncookies)) !=
+ DDI_DMA_MAPPED) {
+ dev_err(dip, CE_WARN, "DMA handle bind failed (%x)", rv);
+ goto fail;
+ }
+ smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_BOUND;
+
+ VERIFY3U(smdma->smdma_dma_ncookies, ==, 1);
+ *pap = smdma->smdma_dma_cookies[0].dmac_address;
+ *vap = (void *)va;
+ return (DDI_SUCCESS);
+
+fail:
+ *vap = NULL;
+ *pap = 0;
+ smrt_contig_free(smdma);
+ return (DDI_FAILURE);
+}
+
+static void
+smrt_contig_free(smrt_dma_t *smdma)
+{
+ if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_BOUND) {
+ VERIFY3U(ddi_dma_unbind_handle(smdma->smdma_dma_handle), ==,
+ DDI_SUCCESS);
+
+ smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_BOUND;
+ }
+
+ if (smdma->smdma_level & SMRT_DMALEVEL_MEMORY_ALLOC) {
+ ddi_dma_mem_free(&smdma->smdma_acc_handle);
+
+ smdma->smdma_level &= ~SMRT_DMALEVEL_MEMORY_ALLOC;
+ }
+
+ if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_ALLOC) {
+ ddi_dma_free_handle(&smdma->smdma_dma_handle);
+
+ smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_ALLOC;
+ }
+
+ VERIFY(smdma->smdma_level == 0);
+ bzero(smdma, sizeof (*smdma));
+}
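+
+/*
+ * Illustrative sketch (hypothetical caller, not part of the driver): the
+ * smdma_level bits allow teardown from any partially constructed state, so
+ * a caller needs no per-resource error handling of its own:
+ *
+ *	smrt_dma_t dma;
+ *	void *va;
+ *	uint32_t pa;
+ *
+ *	bzero(&dma, sizeof (dma));
+ *	if (smrt_contig_alloc(smrt, &dma, 512, KM_SLEEP, &va, &pa) ==
+ *	    DDI_SUCCESS) {
+ *		... use va/pa ...
+ *		smrt_contig_free(&dma);
+ *	}
+ */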
+
+static smrt_command_t *
+smrt_command_alloc_impl(smrt_t *smrt, smrt_command_type_t type, int kmflags)
+{
+ smrt_command_t *smcm;
+
+ VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP);
+
+ if ((smcm = kmem_zalloc(sizeof (*smcm), kmflags)) == NULL) {
+ return (NULL);
+ }
+
+ smcm->smcm_ctlr = smrt;
+ smcm->smcm_type = smrt_check_command_type(type);
+
+ /*
+ * Allocate a single contiguous chunk of memory for the command block
+ * (smcm_va_cmd) and the error information block (smcm_va_err). The
+ * physical address of each block should be 32-byte aligned.
+ */
+ size_t contig_size = 0;
+ contig_size += P2ROUNDUP_TYPED(sizeof (CommandList_t), 32, size_t);
+
+ size_t errorinfo_offset = contig_size;
+ contig_size += P2ROUNDUP_TYPED(sizeof (ErrorInfo_t), 32, size_t);
+
+ if (smrt_contig_alloc(smrt, &smcm->smcm_contig, contig_size,
+ kmflags, (void **)&smcm->smcm_va_cmd, &smcm->smcm_pa_cmd) !=
+ DDI_SUCCESS) {
+ kmem_free(smcm, sizeof (*smcm));
+ return (NULL);
+ }
+
+ smcm->smcm_va_err = (void *)((caddr_t)smcm->smcm_va_cmd +
+ errorinfo_offset);
+ smcm->smcm_pa_err = smcm->smcm_pa_cmd + errorinfo_offset;
+
+ /*
+ * Ensure we asked for, and received, the correct physical alignment:
+ */
+ VERIFY0(smcm->smcm_pa_cmd & 0x1f);
+ VERIFY0(smcm->smcm_pa_err & 0x1f);
+
+ /*
+	 * Populate the fixed fields of the command block.
+ */
+ bzero(smcm->smcm_va_cmd, contig_size);
+ smcm->smcm_va_cmd->ErrDesc.Addr = smcm->smcm_pa_err;
+ smcm->smcm_va_cmd->ErrDesc.Len = sizeof (ErrorInfo_t);
+
+ return (smcm);
+}
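+
+/*
+ * The contiguous allocation made in smrt_command_alloc_impl() is laid out
+ * with each block rounded up to a 32-byte physical boundary:
+ *
+ *	offset 0:			CommandList_t (command block)
+ *	offset roundup(sizeof (CommandList_t), 32):
+ *					ErrorInfo_t (error information block)
+ */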
+
+smrt_command_t *
+smrt_command_alloc_preinit(smrt_t *smrt, size_t datasize, int kmflags)
+{
+ smrt_command_t *smcm;
+
+ if ((smcm = smrt_command_alloc_impl(smrt, SMRT_CMDTYPE_PREINIT,
+ kmflags)) == NULL) {
+ return (NULL);
+ }
+
+ /*
+ * Note that most driver infrastructure has not been initialised at
+ * this time. All commands are submitted to the controller serially,
+ * using a pre-specified tag, and are not attached to the command
+ * tracking list.
+ */
+ smcm->smcm_tag = SMRT_PRE_TAG_NUMBER;
+ smcm->smcm_va_cmd->Header.Tag.tag_value = SMRT_PRE_TAG_NUMBER;
+
+ if (smrt_command_attach_internal(smrt, smcm, datasize, kmflags) != 0) {
+ smrt_command_free(smcm);
+ return (NULL);
+ }
+
+ return (smcm);
+}
+
+smrt_command_t *
+smrt_command_alloc(smrt_t *smrt, smrt_command_type_t type, int kmflags)
+{
+ smrt_command_t *smcm;
+
+ VERIFY(type != SMRT_CMDTYPE_PREINIT);
+
+ if ((smcm = smrt_command_alloc_impl(smrt, type, kmflags)) == NULL) {
+ return (NULL);
+ }
+
+ /*
+ * Insert into the per-controller command list.
+ */
+ mutex_enter(&smrt->smrt_mutex);
+ list_insert_tail(&smrt->smrt_commands, smcm);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (smcm);
+}
+
+int
+smrt_command_attach_internal(smrt_t *smrt, smrt_command_t *smcm, size_t len,
+ int kmflags)
+{
+ smrt_command_internal_t *smcmi;
+
+ VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP);
+ VERIFY3U(len, <=, UINT32_MAX);
+
+ if ((smcmi = kmem_zalloc(sizeof (*smcmi), kmflags)) == NULL) {
+ return (ENOMEM);
+ }
+
+ if (smrt_contig_alloc(smrt, &smcmi->smcmi_contig, len, kmflags,
+ &smcmi->smcmi_va, &smcmi->smcmi_pa) != DDI_SUCCESS) {
+ kmem_free(smcmi, sizeof (*smcmi));
+ return (ENOMEM);
+ }
+
+	/*
+	 * Record the length of the internal buffer; without this, the
+	 * bzero() below would be passed an uninitialised (zero) length.
+	 */
+	smcmi->smcmi_len = len;
+	bzero(smcmi->smcmi_va, smcmi->smcmi_len);
+
+ smcm->smcm_internal = smcmi;
+
+ smcm->smcm_va_cmd->SG[0].Addr = smcmi->smcmi_pa;
+ smcm->smcm_va_cmd->SG[0].Len = (uint32_t)len;
+ smcm->smcm_va_cmd->Header.SGList = 1;
+ smcm->smcm_va_cmd->Header.SGTotal = 1;
+
+ return (0);
+}
+
+void
+smrt_command_reuse(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Make sure the command is not currently inflight, then
+ * reset the command status.
+ */
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT));
+ smcm->smcm_status = SMRT_CMD_STATUS_REUSED;
+
+ /*
+ * Ensure we are not trying to reuse a command that is in the finish or
+ * abort queue.
+ */
+ VERIFY(!list_link_active(&smcm->smcm_link_abort));
+ VERIFY(!list_link_active(&smcm->smcm_link_finish));
+
+ /*
+ * Clear the previous tag value.
+ */
+ smcm->smcm_tag = 0;
+ smcm->smcm_va_cmd->Header.Tag.tag_value = 0;
+
+ mutex_exit(&smrt->smrt_mutex);
+}
+
+void
+smrt_command_free(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ /*
+ * Ensure the object we are about to free is not currently in the
+ * inflight AVL.
+ */
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT));
+
+ if (smcm->smcm_internal != NULL) {
+ smrt_command_internal_t *smcmi = smcm->smcm_internal;
+
+ smrt_contig_free(&smcmi->smcmi_contig);
+ kmem_free(smcmi, sizeof (*smcmi));
+ }
+
+ smrt_contig_free(&smcm->smcm_contig);
+
+ if (smcm->smcm_type != SMRT_CMDTYPE_PREINIT) {
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Ensure we are not trying to free a command that is in the
+ * finish or abort queue.
+ */
+ VERIFY(!list_link_active(&smcm->smcm_link_abort));
+ VERIFY(!list_link_active(&smcm->smcm_link_finish));
+
+ list_remove(&smrt->smrt_commands, smcm);
+
+ mutex_exit(&smrt->smrt_mutex);
+ }
+
+ kmem_free(smcm, sizeof (*smcm));
+}
+
+smrt_command_t *
+smrt_lookup_inflight(smrt_t *smrt, uint32_t tag)
+{
+ smrt_command_t srch;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ bzero(&srch, sizeof (srch));
+ srch.smcm_tag = tag;
+
+ return (avl_find(&smrt->smrt_inflight, &srch, NULL));
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c
new file mode 100644
index 0000000000..9e27448b68
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c
@@ -0,0 +1,238 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+/*
+ * We must locate what the CISS specification describes as the "I2O
+ * registers". The Intelligent I/O (I2O) Architecture Specification describes
+ * this somewhat more coherently as "the memory region specified by the first
+ * base address configuration register indicating memory space (offset 10h,
+ * 14h, and so forth)".
+ */
+static int
+smrt_locate_bar(pci_regspec_t *regs, unsigned nregs,
+ unsigned *i2o_bar)
+{
+ /*
+ * Locate the first memory-mapped BAR:
+ */
+ for (unsigned i = 0; i < nregs; i++) {
+ unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK;
+
+ if (type == PCI_ADDR_MEM32 || type == PCI_ADDR_MEM64) {
+ *i2o_bar = i;
+ return (DDI_SUCCESS);
+ }
+ }
+
+ return (DDI_FAILURE);
+}
+
+static int
+smrt_locate_cfgtbl(smrt_t *smrt, pci_regspec_t *regs, unsigned nregs,
+ unsigned *ct_bar, uint32_t *baseaddr)
+{
+ uint32_t cfg_offset, mem_offset;
+ unsigned want_type;
+ uint32_t want_bar;
+
+ cfg_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_CFG_OFFSET);
+ mem_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_MEM_OFFSET);
+
+ VERIFY3U(cfg_offset, !=, 0xffffffff);
+ VERIFY3U(mem_offset, !=, 0xffffffff);
+
+ /*
+ * Locate the Configuration Table. Three different values read
+ * from two I2O registers allow us to determine the location:
+ * - the correct PCI BAR offset is in the low 16 bits of
+ * CISS_I2O_CFGTBL_CFG_OFFSET
+ * - bit 16 is 0 for a 32-bit space, and 1 for 64-bit
+ * - the memory offset from the base of this BAR is
+ * in CISS_I2O_CFGTBL_MEM_OFFSET
+ */
+ want_bar = (cfg_offset & 0xffff);
+ want_type = (cfg_offset & (1UL << 16)) ? PCI_ADDR_MEM64 :
+ PCI_ADDR_MEM32;
+
+ DTRACE_PROBE4(locate_cfgtbl, uint32_t, want_bar, unsigned,
+ want_type, uint32_t, cfg_offset, uint32_t, mem_offset);
+
+ for (unsigned i = 0; i < nregs; i++) {
+ unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK;
+ unsigned bar = PCI_REG_REG_G(regs[i].pci_phys_hi);
+
+ if (type != PCI_ADDR_MEM32 && type != PCI_ADDR_MEM64) {
+ continue;
+ }
+
+ if (bar == want_bar) {
+ *ct_bar = i;
+ *baseaddr = mem_offset;
+ return (DDI_SUCCESS);
+ }
+ }
+
+ return (DDI_FAILURE);
+}
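+
+/*
+ * Worked example (hypothetical register values): if CISS_I2O_CFGTBL_CFG_OFFSET
+ * reads 0x00000014 and CISS_I2O_CFGTBL_MEM_OFFSET reads 0x1000, the
+ * Configuration Table lives in the 32-bit memory BAR at configuration space
+ * offset 0x14, starting 0x1000 bytes into that mapping.
+ */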
+
+/*
+ * Determine the PCI vendor and device ID which is a proxy for which generation
+ * of controller we're working with.
+ */
+static int
+smrt_identify_device(smrt_t *smrt)
+{
+ ddi_acc_handle_t pci_hdl;
+
+ if (pci_config_setup(smrt->smrt_dip, &pci_hdl) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+
+ smrt->smrt_pci_vendor = pci_config_get16(pci_hdl, PCI_CONF_VENID);
+ smrt->smrt_pci_device = pci_config_get16(pci_hdl, PCI_CONF_DEVID);
+
+ pci_config_teardown(&pci_hdl);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+smrt_map_device(smrt_t *smrt)
+{
+ pci_regspec_t *regs;
+ uint_t regslen, nregs;
+ dev_info_t *dip = smrt->smrt_dip;
+ int r = DDI_FAILURE;
+
+ /*
+	 * Get the list of PCI registers from the "reg" DDI property:
+ */
+ if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+ "reg", (int **)&regs, &regslen) != DDI_PROP_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not load \"reg\" DDI prop");
+ return (DDI_FAILURE);
+ }
+ nregs = regslen * sizeof (int) / sizeof (pci_regspec_t);
+
+ if (smrt_locate_bar(regs, nregs, &smrt->smrt_i2o_bar) !=
+ DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "did not find any memory BARs");
+ goto out;
+ }
+
+ /*
+ * Map enough of the I2O memory space to enable us to talk to the
+ * device.
+ */
+ if (ddi_regs_map_setup(dip, smrt->smrt_i2o_bar, &smrt->smrt_i2o_space,
+ CISS_I2O_MAP_BASE, CISS_I2O_MAP_LIMIT - CISS_I2O_MAP_BASE,
+ &smrt_dev_attributes, &smrt->smrt_i2o_handle) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "failed to map I2O registers");
+ goto out;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_I2O_MAPPED;
+
+ if (smrt_locate_cfgtbl(smrt, regs, nregs, &smrt->smrt_ct_bar,
+ &smrt->smrt_ct_baseaddr) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not find config table");
+ goto out;
+ }
+
+ /*
+ * Map the Configuration Table.
+ */
+ if (ddi_regs_map_setup(dip, smrt->smrt_ct_bar,
+ (caddr_t *)&smrt->smrt_ct, smrt->smrt_ct_baseaddr,
+ sizeof (CfgTable_t), &smrt_dev_attributes,
+ &smrt->smrt_ct_handle) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not map config table");
+ goto out;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_CFGTBL_MAPPED;
+
+ r = DDI_SUCCESS;
+
+out:
+ ddi_prop_free(regs);
+ return (r);
+}
+
+int
+smrt_device_setup(smrt_t *smrt)
+{
+ /*
+ * Ensure that the controller is installed in such a fashion that it
+ * may become a DMA master.
+ */
+ if (ddi_slaveonly(smrt->smrt_dip) == DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "device cannot become DMA "
+ "master");
+ return (DDI_FAILURE);
+ }
+
+ if (smrt_identify_device(smrt) != DDI_SUCCESS)
+ goto fail;
+
+ if (smrt_map_device(smrt) != DDI_SUCCESS) {
+ goto fail;
+ }
+
+ return (DDI_SUCCESS);
+
+fail:
+ smrt_device_teardown(smrt);
+ return (DDI_FAILURE);
+}
+
+void
+smrt_device_teardown(smrt_t *smrt)
+{
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_CFGTBL_MAPPED) {
+ ddi_regs_map_free(&smrt->smrt_ct_handle);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_CFGTBL_MAPPED;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_I2O_MAPPED) {
+ ddi_regs_map_free(&smrt->smrt_i2o_handle);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_I2O_MAPPED;
+ }
+}
+
+uint32_t
+smrt_get32(smrt_t *smrt, offset_t off)
+{
+ VERIFY3S(off, >=, CISS_I2O_MAP_BASE);
+	VERIFY3S(off, <, CISS_I2O_MAP_LIMIT);
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space +
+ (off - CISS_I2O_MAP_BASE));
+
+ return (ddi_get32(smrt->smrt_i2o_handle, addr));
+}
+
+void
+smrt_put32(smrt_t *smrt, offset_t off, uint32_t val)
+{
+ VERIFY3S(off, >=, CISS_I2O_MAP_BASE);
+	VERIFY3S(off, <, CISS_I2O_MAP_LIMIT);
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space +
+ (off - CISS_I2O_MAP_BASE));
+
+ ddi_put32(smrt->smrt_i2o_handle, addr, val);
+}
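+
+/*
+ * All I2O register access in this driver goes through the two accessors
+ * above; e.g., the interrupt handler reads the outbound post queue with:
+ *
+ *	uint32_t opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q);
+ */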
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c
new file mode 100644
index 0000000000..8f082ffc9c
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c
@@ -0,0 +1,1457 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+/*
+ * The controller is not allowed to attach.
+ */
+static int
+smrt_ctrl_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ return (DDI_FAILURE);
+}
+
+/*
+ * The controller is not allowed to send packets.
+ */
+static int
+smrt_ctrl_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt)
+{
+ return (TRAN_BADPKT);
+}
+
+static boolean_t
+smrt_logvol_parse(const char *ua, uint_t *targp)
+{
+ long targ, lun;
+ const char *comma;
+ char *eptr;
+
+ comma = strchr(ua, ',');
+ if (comma == NULL) {
+ return (B_FALSE);
+ }
+
+ /*
+ * We expect the target number for a logical unit number to be zero for
+ * a logical volume.
+ */
+ if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' ||
+ lun != 0) {
+ return (B_FALSE);
+ }
+
+ if (ddi_strtol(ua, &eptr, 16, &targ) != 0 || eptr != comma ||
+ targ < 0 || targ >= SMRT_MAX_LOGDRV) {
+ return (B_FALSE);
+ }
+
+ *targp = (uint_t)targ;
+
+ return (B_TRUE);
+}
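+
+/*
+ * For example, the unit address "3,0" parses (in hexadecimal) to target 3
+ * with the mandatory LUN of zero, while "3,1" is rejected for its non-zero
+ * LUN.
+ */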
+
+static int
+smrt_logvol_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ _NOTE(ARGUNUSED(hba_dip))
+
+ smrt_volume_t *smlv;
+ smrt_target_t *smtg;
+ const char *ua;
+ uint_t targ;
+
+ smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private;
+ dev_info_t *dip = smrt->smrt_dip;
+
+ /*
+ * The unit address comes in the form of 'target,lun'. We expect the
+ * lun to be zero. The target is what we set when we added it to the
+ * target map earlier.
+ */
+ ua = scsi_device_unit_address(sd);
+ if (ua == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ if (!smrt_logvol_parse(ua, &targ)) {
+ return (DDI_FAILURE);
+ }
+
+ if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) {
+ dev_err(dip, CE_WARN, "could not allocate target object "
+ "due to memory exhaustion");
+ return (DDI_FAILURE);
+ }
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
+ /*
+ * We are detaching. Do not accept any more requests to
+ * attach targets from the framework.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Look for a logical volume for the SCSI unit address of this target.
+ */
+ if ((smlv = smrt_logvol_lookup_by_id(smrt, targ)) == NULL) {
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+ return (DDI_FAILURE);
+ }
+
+ smtg->smtg_lun.smtg_vol = smlv;
+ smtg->smtg_addr = &smlv->smlv_addr;
+ smtg->smtg_physical = B_FALSE;
+ list_insert_tail(&smlv->smlv_targets, smtg);
+
+ /*
+ * Link this target object to the controller:
+ */
+ smtg->smtg_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_targets, smtg);
+
+ smtg->smtg_scsi_dev = sd;
+ VERIFY(sd->sd_dev == tgt_dip);
+
+ scsi_device_hba_private_set(sd, smtg);
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (DDI_SUCCESS);
+}
+
+static void
+smrt_logvol_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ _NOTE(ARGUNUSED(hba_dip, tgt_dip))
+
+ smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private;
+ smrt_target_t *smtg = scsi_device_hba_private_get(sd);
+ smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol;
+
+ VERIFY(smtg->smtg_scsi_dev == sd);
+ VERIFY(smtg->smtg_physical == B_FALSE);
+
+ mutex_enter(&smrt->smrt_mutex);
+ list_remove(&smlv->smlv_targets, smtg);
+ list_remove(&smrt->smrt_targets, smtg);
+
+ scsi_device_hba_private_set(sd, NULL);
+
+ mutex_exit(&smrt->smrt_mutex);
+
+ kmem_free(smtg, sizeof (*smtg));
+}
+
+static int
+smrt_phys_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ _NOTE(ARGUNUSED(hba_dip))
+
+ smrt_target_t *smtg;
+ smrt_physical_t *smpt;
+ const char *ua, *comma;
+ char *eptr;
+ long lun;
+
+ smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private;
+ dev_info_t *dip = smrt->smrt_dip;
+
+ /*
+ * The unit address comes in the form of 'target,lun'. We expect the
+ * lun to be zero. The target is what we set when we added it to the
+ * target map earlier.
+ */
+ ua = scsi_device_unit_address(sd);
+ if (ua == NULL)
+ return (DDI_FAILURE);
+
+ comma = strchr(ua, ',');
+ if (comma == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ /*
+	 * Confirm the LUN is zero. We may want to check the SCSI 'lun'/'lun64'
+	 * property instead of, or in addition to, this logic.
+ */
+ if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' ||
+ lun != 0) {
+ return (DDI_FAILURE);
+ }
+
+ if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) {
+ dev_err(dip, CE_WARN, "could not allocate target object "
+ "due to memory exhaustion");
+ return (DDI_FAILURE);
+ }
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
+ /*
+ * We are detaching. Do not accept any more requests to
+ * attach targets from the framework.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Look for a physical target based on the unit address of the target
+ * (which will encode its WWN and LUN).
+ */
+ smpt = smrt_phys_lookup_by_ua(smrt, ua);
+ if (smpt == NULL) {
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+ return (DDI_FAILURE);
+ }
+
+ smtg->smtg_scsi_dev = sd;
+ smtg->smtg_physical = B_TRUE;
+ smtg->smtg_lun.smtg_phys = smpt;
+ list_insert_tail(&smpt->smpt_targets, smtg);
+ smtg->smtg_addr = &smpt->smpt_addr;
+
+ /*
+ * Link this target object to the controller:
+ */
+ smtg->smtg_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_targets, smtg);
+
+ VERIFY(sd->sd_dev == tgt_dip);
+
+ scsi_device_hba_private_set(sd, smtg);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (DDI_SUCCESS);
+}
+
+static void
+smrt_phys_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ _NOTE(ARGUNUSED(hba_dip, tgt_dip))
+
+ smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private;
+ smrt_target_t *smtg = scsi_device_hba_private_get(sd);
+ smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys;
+
+ VERIFY(smtg->smtg_scsi_dev == sd);
+ VERIFY(smtg->smtg_physical == B_TRUE);
+
+ mutex_enter(&smrt->smrt_mutex);
+ list_remove(&smpt->smpt_targets, smtg);
+ list_remove(&smrt->smrt_targets, smtg);
+
+ scsi_device_hba_private_set(sd, NULL);
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+}
+
+/*
+ * This function is called when the SCSI framework has allocated a packet and
+ * our private per-packet object.
+ *
+ * We choose not to have the framework pre-allocate memory for the CDB.
+ * Instead, we expose the CDB area within the controller command block itself.
+ *
+ * Status block memory is allocated by the framework because we passed
+ * SCSI_HBA_TRAN_SCB to scsi_hba_attach_setup(9F).
+ */
+static int
+smrt_tran_setup_pkt(struct scsi_pkt *pkt, int (*callback)(caddr_t),
+ caddr_t arg)
+{
+ _NOTE(ARGUNUSED(arg))
+
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ smrt_command_t *smcm;
+ smrt_command_scsa_t *smcms;
+ int kmflags = callback == SLEEP_FUNC ? KM_SLEEP : KM_NOSLEEP;
+
+ sd = scsi_address_device(&pkt->pkt_address);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+ VERIFY(smrt != NULL);
+ smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private;
+
+ /*
+ * Check that we have enough space in the command object for the
+ * request from the target driver:
+ */
+ if (pkt->pkt_cdblen > CISS_CDBLEN) {
+ /*
+ * The CDB member of the Request Block of a controller
+ * command is fixed at 16 bytes.
+ */
+ dev_err(smrt->smrt_dip, CE_WARN, "oversize CDB: had %u, "
+ "needed %u", CISS_CDBLEN, pkt->pkt_cdblen);
+ return (-1);
+ }
+
+ /*
+ * Allocate our command block:
+ */
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_SCSA,
+ kmflags)) == NULL) {
+ return (-1);
+ }
+ smcm->smcm_scsa = smcms;
+ smcms->smcms_command = smcm;
+ smcms->smcms_pkt = pkt;
+
+ pkt->pkt_cdbp = &smcm->smcm_va_cmd->Request.CDB[0];
+ smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen;
+
+ smcm->smcm_target = smtg;
+
+ return (0);
+}
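+
+/*
+ * Note that because pkt_cdbp above aliases the CDB array inside the command
+ * block itself, the target driver writes its CDB bytes directly into DMA-able
+ * command memory, and no separate copy is required at tran_start(9E) time.
+ */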
+
+static void
+smrt_tran_teardown_pkt(struct scsi_pkt *pkt)
+{
+ smrt_command_scsa_t *smcms = (smrt_command_scsa_t *)
+ pkt->pkt_ha_private;
+ smrt_command_t *smcm = smcms->smcms_command;
+
+ smrt_command_free(smcm);
+
+ pkt->pkt_cdbp = NULL;
+}
+
+static void
+smrt_set_arq_data(struct scsi_pkt *pkt, uchar_t key)
+{
+ struct scsi_arq_status *sts;
+
+ VERIFY3U(pkt->pkt_scblen, >=, sizeof (struct scsi_arq_status));
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ sts = (struct scsi_arq_status *)(pkt->pkt_scbp);
+ bzero(sts, sizeof (*sts));
+
+ /*
+ * Mock up a CHECK CONDITION SCSI status for the original command:
+ */
+ sts->sts_status.sts_chk = 1;
+
+ /*
+ * Pretend that we successfully performed REQUEST SENSE:
+ */
+ sts->sts_rqpkt_reason = CMD_CMPLT;
+ sts->sts_rqpkt_resid = 0;
+ sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET |
+ STATE_SENT_CMD | STATE_XFERRED_DATA;
+ sts->sts_rqpkt_statistics = 0;
+
+ /*
+ * Return the key value we were provided in the fake sense data:
+ */
+ sts->sts_sensedata.es_valid = 1;
+ sts->sts_sensedata.es_class = CLASS_EXTENDED_SENSE;
+ sts->sts_sensedata.es_key = key;
+
+ pkt->pkt_state |= STATE_ARQ_DONE;
+}
+
+/*
+ * When faking up a REPORT LUNS data structure, we simply report one LUN, LUN 0.
+ * We need 16 bytes for this: 4 for the LUN list length, 4 reserved bytes, and
+ * 8 for the actual LUN.
+ */
+static void
+smrt_fake_report_lun(smrt_command_t *smcm, struct scsi_pkt *pkt)
+{
+ size_t sz;
+ char resp[16];
+ struct buf *bp;
+
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD |
+ STATE_GOT_STATUS;
+
+ /*
+	 * Check that the request is valid. If any reserved bits are set, or if
+	 * the SELECT REPORT field is anything other than 0x00, 0x01, or 0x02,
+	 * this is an illegal request.
+ */
+ if (pkt->pkt_cdbp[1] != 0 || pkt->pkt_cdbp[3] != 0 ||
+ pkt->pkt_cdbp[4] != 0 || pkt->pkt_cdbp[5] != 0 ||
+ pkt->pkt_cdbp[10] != 0 || pkt->pkt_cdbp[11] != 0 ||
+ pkt->pkt_cdbp[2] > 0x2) {
+ smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST);
+ return;
+ }
+
+ /*
+ * Construct the actual REPORT LUNS reply. We need to indicate a single
+ * LUN of all zeros. This means that the length needs to be 8 bytes,
+ * the size of the lun. Otherwise, the rest of this structure can be
+ * zeros.
+ */
+ bzero(resp, sizeof (resp));
+ resp[3] = sizeof (scsi_lun_t);
+
+ bp = scsi_pkt2bp(pkt);
+ sz = MIN(sizeof (resp), bp->b_bcount);
+
+ bp_mapin(bp);
+ bcopy(resp, bp->b_un.b_addr, sz);
+ bp_mapout(bp);
+ pkt->pkt_state |= STATE_XFERRED_DATA;
+ pkt->pkt_resid = bp->b_bcount - sz;
+ if (pkt->pkt_scblen >= 1) {
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ }
+}
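+
+/*
+ * The fabricated REPORT LUNS response above is, byte for byte:
+ *
+ *	00 00 00 08	LUN list length (8 bytes, i.e. a single LUN)
+ *	00 00 00 00	reserved
+ *	00 00 00 00 00 00 00 00		LUN 0
+ */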
+
+static int
+smrt_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt)
+{
+ _NOTE(ARGUNUSED(sa))
+
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ smrt_command_scsa_t *smcms;
+ smrt_command_t *smcm;
+ int r;
+
+ sd = scsi_address_device(&pkt->pkt_address);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+ VERIFY(smrt != NULL);
+ smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private;
+ VERIFY(smcms != NULL);
+ smcm = smcms->smcms_command;
+ VERIFY(smcm != NULL);
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_TRAN_START) {
+ /*
+ * This is a retry of a command that has already been
+ * used once. Assign it a new tag number.
+ */
+ smrt_command_reuse(smcm);
+ }
+ smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_START;
+
+ /*
+ * The sophisticated firmware in this controller cannot possibly bear
+ * the following SCSI commands. It appears to return a response with
+ * the status STATUS_ACA_ACTIVE (0x30), which is not something we
+ * expect. Instead, fake up a failure response.
+ */
+ switch (pkt->pkt_cdbp[0]) {
+ case SCMD_FORMAT:
+ case SCMD_LOG_SENSE_G1:
+ case SCMD_MODE_SELECT:
+ case SCMD_PERSISTENT_RESERVE_IN:
+ if (smtg->smtg_physical) {
+ break;
+ }
+
+ smrt->smrt_stats.smrts_ignored_scsi_cmds++;
+ smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_IGNORED;
+
+ /*
+ * Mark the command as completed to the point where we
+ * received a SCSI status code:
+ */
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET |
+ STATE_SENT_CMD | STATE_GOT_STATUS;
+
+ /*
+ * Mock up sense data for an illegal request:
+ */
+ smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST);
+
+ scsi_hba_pkt_comp(pkt);
+ return (TRAN_ACCEPT);
+ case SCMD_REPORT_LUNS:
+ /*
+ * The SMRT controller does not accept a REPORT LUNS command for
+ * logical volumes. As such, we need to fake up a REPORT LUNS
+ * response that has a single LUN, LUN 0.
+ */
+ if (smtg->smtg_physical) {
+ break;
+ }
+
+ smrt_fake_report_lun(smcm, pkt);
+
+ scsi_hba_pkt_comp(pkt);
+ return (TRAN_ACCEPT);
+ default:
+ break;
+ }
+
+ if (pkt->pkt_flags & FLAG_NOINTR) {
+ /*
+ * We must sleep and wait for the completion of this command.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ }
+
+ /*
+ * Because we provide a tran_setup_pkt(9E) entrypoint, we must now
+ * set up the Scatter/Gather List in the Command to reflect any
+ * DMA resources passed to us by the framework.
+ */
+ if (pkt->pkt_numcookies > smrt->smrt_sg_cnt) {
+ /*
+ * More DMA cookies than we are prepared to handle.
+ */
+ dev_err(smrt->smrt_dip, CE_WARN, "too many DMA cookies (got %u;"
+ " expected %u)", pkt->pkt_numcookies, smrt->smrt_sg_cnt);
+ return (TRAN_BADPKT);
+ }
+ smcm->smcm_va_cmd->Header.SGList = pkt->pkt_numcookies;
+ smcm->smcm_va_cmd->Header.SGTotal = pkt->pkt_numcookies;
+ for (unsigned i = 0; i < pkt->pkt_numcookies; i++) {
+ smcm->smcm_va_cmd->SG[i].Addr =
+ LE_64(pkt->pkt_cookies[i].dmac_laddress);
+ smcm->smcm_va_cmd->SG[i].Len =
+ LE_32(pkt->pkt_cookies[i].dmac_size);
+ }
+
+ /*
+ * Copy logical volume address from the target object:
+ */
+ smcm->smcm_va_cmd->Header.LUN = *smcm->smcm_target->smtg_addr;
+
+ /*
+ * Initialise the command block.
+ */
+ smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen;
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(pkt->pkt_time);
+ if (pkt->pkt_numcookies > 0) {
+ /*
+ * There are DMA resources; set the transfer direction
+ * appropriately:
+ */
+ if (pkt->pkt_dma_flags & DDI_DMA_READ) {
+ smcm->smcm_va_cmd->Request.Type.Direction =
+ CISS_XFER_READ;
+ } else if (pkt->pkt_dma_flags & DDI_DMA_WRITE) {
+ smcm->smcm_va_cmd->Request.Type.Direction =
+ CISS_XFER_WRITE;
+ } else {
+ smcm->smcm_va_cmd->Request.Type.Direction =
+ CISS_XFER_NONE;
+ }
+ } else {
+ /*
+ * No DMA resources means no transfer.
+ */
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE;
+ }
+
+ /*
+ * Initialise the SCSI packet as described in tran_start(9E). We will
+ * progressively update these fields as the command moves through the
+ * submission and completion states.
+ */
+ pkt->pkt_resid = 0;
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_statistics = 0;
+ pkt->pkt_state = 0;
+
+ /*
+ * If this SCSI packet has a timeout, configure an appropriate
+ * expiry time:
+ */
+ if (pkt->pkt_time != 0) {
+ smcm->smcm_expiry = gethrtime() + pkt->pkt_time * NANOSEC;
+ }
+
+ /*
+ * Submit the command to the controller.
+ */
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+	 * If we're dumping, the target we're talking to may have disappeared
+	 * during post-panic discovery. If it is gone, fail the command with a
+	 * fatal error rather than submit it to a device that no longer
+	 * exists.
+ */
+ if (ddi_in_panic() && smtg->smtg_gone) {
+ mutex_exit(&smrt->smrt_mutex);
+
+ dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed: target "
+ "%s is gone, it did not come back after post-panic reset "
+ "device discovery", scsi_device_unit_address(sd));
+
+ return (TRAN_FATAL_ERROR);
+ }
+
+ smrt->smrt_stats.smrts_tran_starts++;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+
+ dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed %d", r);
+
+ /*
+ * Inform the SCSI framework that we could not submit
+ * the command.
+ */
+ return (r == EAGAIN ? TRAN_BUSY : TRAN_FATAL_ERROR);
+ }
+
+ /*
+ * Update the SCSI packet to reflect submission of the command.
+ */
+ pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD;
+
+ if (pkt->pkt_flags & FLAG_NOINTR) {
+ /*
+ * Poll the controller for completion of the command we
+ * submitted. Once this routine has returned, the completion
+ * callback will have been fired with either an active response
+ * (success or error) or a timeout. The command is freed by
+ * the completion callback, so it may not be referenced again
+ * after this call returns.
+ */
+ smrt_poll_for(smrt, smcm);
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (TRAN_ACCEPT);
+}
+
+static int
+smrt_tran_reset(struct scsi_address *sa, int level)
+{
+ _NOTE(ARGUNUSED(level))
+
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ smrt_command_t *smcm;
+ int r;
+
+ sd = scsi_address_device(sa);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+
+ /*
+ * The framework has requested some kind of SCSI reset. A
+ * controller-level soft reset can take a very long time -- often on
+ * the order of 30-60 seconds -- but might well be our only option if
+ * the controller is non-responsive.
+ *
+ * First, check if the controller is responding to pings.
+ */
+again:
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL) {
+ return (0);
+ }
+
+ smrt_write_message_nop(smcm, SMRT_PING_CHECK_TIMEOUT);
+
+ mutex_enter(&smrt->smrt_mutex);
+ smrt->smrt_stats.smrts_tran_resets++;
+ if (ddi_in_panic()) {
+ goto skip_check;
+ }
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ /*
+ * The controller is already resetting. Wait for that
+ * to finish.
+ */
+ while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ }
+ }
+
+skip_check:
+ /*
+ * Submit our ping to the controller.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ smcm->smcm_expiry = gethrtime() + SMRT_PING_CHECK_TIMEOUT * NANOSEC;
+ if (smrt_submit(smrt, smcm) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (0);
+ }
+
+ if ((r = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The ping command timed out. Abandon it now.
+ */
+ dev_err(smrt->smrt_dip, CE_WARN, "controller ping timed out");
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+
+ } else if ((smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) ||
+ (smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) {
+ /*
+ * The command completed in error, or a controller reset
+ * was sent while we were trying to ping.
+ */
+ dev_err(smrt->smrt_dip, CE_WARN, "controller ping error");
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ mutex_enter(&smrt->smrt_mutex);
+
+ } else {
+ VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE);
+
+ /*
+ * The controller is responsive, and a full soft reset would be
+ * extremely disruptive to the system. Given our spotty
+ * support for some SCSI commands (which can upset the target
+ * drivers) and the historically lax behaviour of the "smrt"
+ * driver, we grit our teeth and pretend we were able to
+ * perform a reset.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (1);
+ }
+
+ /*
+ * If a reset has been initiated in the last 90 seconds, try
+ * another ping.
+ */
+ if (gethrtime() < smrt->smrt_last_reset_start + 90 * NANOSEC) {
+ dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed, but "
+ "was recently reset; retrying ping");
+ mutex_exit(&smrt->smrt_mutex);
+
+ /*
+ * Sleep for a second first.
+ */
+ if (ddi_in_panic()) {
+ drv_usecwait(1 * MICROSEC);
+ } else {
+ delay(drv_usectohz(1 * MICROSEC));
+ }
+ goto again;
+ }
+
+ dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed; resetting "
+ "controller");
+ if (smrt_ctlr_reset(smrt) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "controller reset failure");
+ mutex_exit(&smrt->smrt_mutex);
+ return (0);
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (1);
+}
+
+static int
+smrt_tran_abort(struct scsi_address *sa, struct scsi_pkt *pkt)
+{
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ smrt_command_t *smcm = NULL;
+ smrt_command_t *abort_smcm;
+
+ sd = scsi_address_device(sa);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+ VERIFY(smrt != NULL);
+
+ if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL) {
+ /*
+ * No resources available to send an abort message.
+ */
+ return (0);
+ }
+
+ mutex_enter(&smrt->smrt_mutex);
+ smrt->smrt_stats.smrts_tran_aborts++;
+ if (pkt != NULL) {
+ /*
+ * The framework wants us to abort a specific SCSI packet.
+ */
+ smrt_command_scsa_t *smcms = (smrt_command_scsa_t *)
+ pkt->pkt_ha_private;
+ smcm = smcms->smcms_command;
+
+ if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) {
+ /*
+ * This message is not currently in flight, so we
+ * cannot abort it.
+ */
+ goto fail;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) {
+ /*
+ * An abort message for this command has already been
+ * sent to the controller. Return failure.
+ */
+ goto fail;
+ }
+
+ smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag);
+ } else {
+ /*
+ * The framework wants us to abort every in flight command
+ * for the target with this address.
+ */
+ smrt_write_message_abort_all(abort_smcm, smtg->smtg_addr);
+ }
+
+ /*
+ * Submit the abort message to the controller.
+ */
+ abort_smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if (smrt_submit(smrt, abort_smcm) != 0) {
+ goto fail;
+ }
+
+ if (pkt != NULL) {
+ /*
+ * Record some debugging information about the abort we
+ * sent:
+ */
+ smcm->smcm_abort_time = gethrtime();
+ smcm->smcm_abort_tag = abort_smcm->smcm_tag;
+
+ /*
+ * Mark the command as aborted so that we do not send
+ * a second abort message:
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT;
+ }
+
+ /*
+ * Poll for completion of the abort message. Note that this function
+ * only fails if we set a timeout on the command, which we have not
+ * done.
+ */
+ VERIFY0(smrt_poll_for(smrt, abort_smcm));
+
+ if ((abort_smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) ||
+ (abort_smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) {
+ /*
+ * Either the controller was reset or the abort command
+ * failed.
+ */
+ goto fail;
+ }
+
+ /*
+ * The command was successfully aborted.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(abort_smcm);
+ return (1);
+
+fail:
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(abort_smcm);
+ return (0);
+}
+
+static void
+smrt_hba_complete_status(smrt_command_t *smcm)
+{
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+ struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt;
+
+ bzero(pkt->pkt_scbp, pkt->pkt_scblen);
+
+ if (ei->ScsiStatus != STATUS_CHECK) {
+ /*
+ * If the SCSI status is not CHECK CONDITION, we don't want
+ * to try and read the sense data buffer.
+ */
+ goto simple_status;
+ }
+
+ if (pkt->pkt_scblen < sizeof (struct scsi_arq_status)) {
+ /*
+ * There is not enough room for a request sense structure.
+ * Fall back to reporting just the SCSI status code.
+ */
+ goto simple_status;
+ }
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ struct scsi_arq_status *sts = (struct scsi_arq_status *)pkt->pkt_scbp;
+
+ /*
+ * Copy in the SCSI status from the original command.
+ */
+ bcopy(&ei->ScsiStatus, &sts->sts_status, sizeof (sts->sts_status));
+
+ /*
+ * Mock up a successful REQUEST SENSE:
+ */
+ sts->sts_rqpkt_reason = CMD_CMPLT;
+ sts->sts_rqpkt_resid = 0;
+ sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET |
+ STATE_SENT_CMD | STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ sts->sts_rqpkt_statistics = 0;
+
+ /*
+ * The sense data from the controller should be copied into place
+ * starting at the "sts_sensedata" member of the auto request
+ * sense object.
+ */
+ size_t sense_len = pkt->pkt_scblen - offsetof(struct scsi_arq_status,
+ sts_sensedata);
+ if (ei->SenseLen < sense_len) {
+ /*
+ * Only copy sense data bytes that are within the region
+ * the controller marked as valid.
+ */
+ sense_len = ei->SenseLen;
+ }
+ bcopy(ei->SenseInfo, &sts->sts_sensedata, sense_len);
+
+ pkt->pkt_state |= STATE_ARQ_DONE;
+ return;
+
+simple_status:
+ if (pkt->pkt_scblen < sizeof (struct scsi_status)) {
+ /*
+ * There is not even enough room for the SCSI status byte.
+ */
+ return;
+ }
+
+ bcopy(&ei->ScsiStatus, pkt->pkt_scbp, sizeof (struct scsi_status));
+}
+
+static void
+smrt_hba_complete_log_error(smrt_command_t *smcm, const char *name)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ dev_err(smrt->smrt_dip, CE_WARN, "!SCSI command failed: %s: "
+ "SCSI op %x, CISS status %x, SCSI status %x", name,
+ (unsigned)smcm->smcm_va_cmd->Request.CDB[0],
+ (unsigned)ei->CommandStatus, (unsigned)ei->ScsiStatus);
+}
+
+/*
+ * Completion routine for commands submitted to the controller via the SCSI
+ * framework.
+ */
+void
+smrt_hba_complete(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+ struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ pkt->pkt_resid = ei->ResidualCnt;
+
+ /*
+ * Check if the controller was reset while this packet was in flight.
+ */
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ if (pkt->pkt_reason != CMD_CMPLT) {
+ /*
+ * If another error status has already been written,
+ * do not overwrite it.
+ */
+ pkt->pkt_reason = CMD_RESET;
+ }
+ pkt->pkt_statistics |= STAT_BUS_RESET | STAT_DEV_RESET;
+ goto finish;
+ }
+
+ if (!(smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) {
+ /*
+ * The command was completed without error by the controller.
+ *
+ * As per the specification, if an error was not signalled
+ * by the controller through the CISS transport method,
+ * the error information (including CommandStatus) has not
+ * been written and should not be checked.
+ */
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ goto finish;
+ }
+
+ /*
+ * Check the completion status to determine what befell this request.
+ */
+ switch (ei->CommandStatus) {
+ case CISS_CMD_SUCCESS:
+ /*
+ * In a certain sense, the specification contradicts itself.
+ * On the one hand, it suggests that a successful command
+ * will not result in a controller write to the error
+ * information block; on the other hand, it makes room
+ * for a status code (0) which denotes a successful
+ * execution.
+ *
+ * To be on the safe side, we check for that condition here.
+ */
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_DATA_UNDERRUN:
+ /*
+ * A data underrun occurred. Ideally this will result in
+ * an appropriate SCSI status and sense data.
+ */
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_TARGET_STATUS:
+ /*
+ * The command completed, but an error occurred. We need
+ * to provide the sense data to the SCSI framework.
+ */
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_DATA_OVERRUN:
+ /*
+ * Data overrun has occurred.
+ */
+ smrt_hba_complete_log_error(smcm, "data overrun");
+ pkt->pkt_reason = CMD_DATA_OVR;
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_INVALID:
+ /*
+ * One or more fields in the command has invalid data.
+ */
+ smrt_hba_complete_log_error(smcm, "invalid command");
+ pkt->pkt_reason = CMD_BADMSG;
+ pkt->pkt_state |= STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_PROTOCOL_ERR:
+ /*
+ * An error occurred in communication with the end device.
+ */
+ smrt_hba_complete_log_error(smcm, "protocol error");
+ pkt->pkt_reason = CMD_BADMSG;
+ pkt->pkt_state |= STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_HARDWARE_ERR:
+ /*
+ * A hardware error occurred.
+ */
+ smrt_hba_complete_log_error(smcm, "hardware error");
+ pkt->pkt_reason = CMD_INCOMPLETE;
+ break;
+
+ case CISS_CMD_CONNECTION_LOST:
+ /*
+ * The connection with the end device cannot be
+ * re-established.
+ */
+ smrt_hba_complete_log_error(smcm, "connection lost");
+ pkt->pkt_reason = CMD_INCOMPLETE;
+ break;
+
+ case CISS_CMD_ABORTED:
+ case CISS_CMD_UNSOLICITED_ABORT:
+ if (smcm->smcm_status & SMRT_CMD_STATUS_TIMEOUT) {
+ /*
+ * This abort was arranged by the periodic routine
+ * in response to an elapsed timeout.
+ */
+ pkt->pkt_reason = CMD_TIMEOUT;
+ pkt->pkt_statistics |= STAT_TIMEOUT;
+ } else {
+ pkt->pkt_reason = CMD_ABORTED;
+ }
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ pkt->pkt_statistics |= STAT_ABORTED;
+ break;
+
+ case CISS_CMD_TIMEOUT:
+ smrt_hba_complete_log_error(smcm, "timeout");
+ pkt->pkt_reason = CMD_TIMEOUT;
+ pkt->pkt_statistics |= STAT_TIMEOUT;
+ break;
+
+ default:
+ /*
+ * This is an error that we were not prepared to handle.
+ * Signal a generic transport-level error to the framework.
+ */
+ smrt_hba_complete_log_error(smcm, "unexpected error");
+ pkt->pkt_reason = CMD_TRAN_ERR;
+ }
+
+ /*
+ * Attempt to read a SCSI status code and any automatic
+ * request sense data that may exist:
+ */
+ smrt_hba_complete_status(smcm);
+
+finish:
+ mutex_exit(&smrt->smrt_mutex);
+ scsi_hba_pkt_comp(pkt);
+ mutex_enter(&smrt->smrt_mutex);
+}
+
+static int
+smrt_getcap(struct scsi_address *sa, char *cap, int whom)
+{
+ _NOTE(ARGUNUSED(whom))
+
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ int index;
+
+ sd = scsi_address_device(sa);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+ VERIFY(smrt != NULL);
+
+ if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) {
+ /*
+ * This capability string could not be translated to an
+ * ID number, so it must not exist.
+ */
+ return (-1);
+ }
+
+ switch (index) {
+ case SCSI_CAP_CDB_LEN:
+ /*
+ * The CDB field in the CISS request block is fixed at 16
+ * bytes.
+ */
+ return (CISS_CDBLEN);
+
+ case SCSI_CAP_DMA_MAX:
+ if (smrt->smrt_dma_attr.dma_attr_maxxfer > INT_MAX) {
+ return (INT_MAX);
+ }
+ return ((int)smrt->smrt_dma_attr.dma_attr_maxxfer);
+
+ case SCSI_CAP_SECTOR_SIZE:
+ if (smrt->smrt_dma_attr.dma_attr_granular > INT_MAX) {
+ return (-1);
+ }
+ return ((int)smrt->smrt_dma_attr.dma_attr_granular);
+
+ /*
+ * If this target corresponds to a physical device, then we always
+ * indicate that we're on a SAS interconnect. Otherwise, we default to
+ * saying that we're on a parallel bus. We can't use SAS for
+ * everything, unfortunately. When you declare yourself to be a SAS
+ * interconnect, it's expected that you have a full 16-byte WWN as the
+ * target. If not, devfsadm will not be able to enumerate the device
+ * and create /dev/[r]dsk entries.
+ */
+ case SCSI_CAP_INTERCONNECT_TYPE:
+ if (smtg->smtg_physical) {
+ return (INTERCONNECT_SAS);
+ } else {
+ return (INTERCONNECT_PARALLEL);
+ }
+
+ case SCSI_CAP_DISCONNECT:
+ case SCSI_CAP_SYNCHRONOUS:
+ case SCSI_CAP_WIDE_XFER:
+ case SCSI_CAP_ARQ:
+ case SCSI_CAP_UNTAGGED_QING:
+ case SCSI_CAP_TAGGED_QING:
+ /*
+ * These capabilities are supported by the driver and the
+ * controller. See scsi_ifgetcap(9F) for more information.
+ */
+ return (1);
+
+ case SCSI_CAP_INITIATOR_ID:
+ case SCSI_CAP_RESET_NOTIFICATION:
+ /*
+ * These capabilities are not supported.
+ */
+ return (0);
+
+ default:
+ /*
+ * The property in question is not known to this driver.
+ */
+ return (-1);
+ }
+}
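+
+/*
+ * Illustrative sketch (not part of the driver): target drivers interrogate
+ * these capabilities through the SCSA framework, e.g.
+ *
+ *	int max_xfer = scsi_ifgetcap(&pkt->pkt_address, "dma-max", 1);
+ *
+ * which the framework routes to smrt_getcap() above.
+ */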
+
+/* ARGSUSED */
+static int
+smrt_setcap(struct scsi_address *sa, char *cap, int value, int whom)
+{
+ int index;
+
+ if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) {
+ /*
+ * This capability string could not be translated to an
+ * ID number, so it must not exist.
+ */
+ return (-1);
+ }
+
+ if (whom == 0) {
+ /*
+ * When whom is 0, this is a request to set a capability for
+ * all targets. As per the recommendation in tran_setcap(9E),
+ * we do not support this mode of operation.
+ */
+ return (-1);
+ }
+
+ switch (index) {
+ case SCSI_CAP_CDB_LEN:
+ case SCSI_CAP_DMA_MAX:
+ case SCSI_CAP_SECTOR_SIZE:
+ case SCSI_CAP_INITIATOR_ID:
+ case SCSI_CAP_DISCONNECT:
+ case SCSI_CAP_SYNCHRONOUS:
+ case SCSI_CAP_WIDE_XFER:
+ case SCSI_CAP_ARQ:
+ case SCSI_CAP_UNTAGGED_QING:
+ case SCSI_CAP_TAGGED_QING:
+ case SCSI_CAP_RESET_NOTIFICATION:
+ case SCSI_CAP_INTERCONNECT_TYPE:
+ /*
+ * We do not support changing any capabilities at this time.
+ */
+ return (0);
+
+ default:
+ /*
+ * The capability in question is not known to this driver.
+ */
+ return (-1);
+ }
+}
+
+int
+smrt_ctrl_hba_setup(smrt_t *smrt)
+{
+ int flags;
+ dev_info_t *dip = smrt->smrt_dip;
+ scsi_hba_tran_t *tran;
+
+ if ((tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP)) == NULL) {
+ dev_err(dip, CE_WARN, "could not allocate SCSA resources");
+ return (DDI_FAILURE);
+ }
+
+ smrt->smrt_hba_tran = tran;
+ tran->tran_hba_private = smrt;
+
+ tran->tran_tgt_init = smrt_ctrl_tran_tgt_init;
+ tran->tran_tgt_probe = scsi_hba_probe;
+
+ tran->tran_start = smrt_ctrl_tran_start;
+
+ tran->tran_getcap = smrt_getcap;
+ tran->tran_setcap = smrt_setcap;
+
+ tran->tran_setup_pkt = smrt_tran_setup_pkt;
+ tran->tran_teardown_pkt = smrt_tran_teardown_pkt;
+ tran->tran_hba_len = sizeof (smrt_command_scsa_t);
+ tran->tran_interconnect_type = INTERCONNECT_SAS;
+
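+	/*
+	 * SCSI_HBA_ADDR_COMPLEX indicates that this driver uses complex
+	 * unit addresses rather than simple target/LUN numbers, enabling
+	 * the scsi_device_hba_private_get() accessor used in smrt_getcap()
+	 * above.
+	 */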
+ flags = SCSI_HBA_HBA | SCSI_HBA_TRAN_SCB | SCSI_HBA_ADDR_COMPLEX;
+ if (scsi_hba_attach_setup(dip, &smrt->smrt_dma_attr, tran, flags) !=
+ DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not attach to SCSA framework");
+ scsi_hba_tran_free(tran);
+ return (DDI_FAILURE);
+ }
+
+ smrt->smrt_init_level |= SMRT_INITLEVEL_SCSA;
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_ctrl_hba_teardown(smrt_t *smrt)
+{
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_SCSA) {
+ VERIFY(scsi_hba_detach(smrt->smrt_dip) != DDI_FAILURE);
+ scsi_hba_tran_free(smrt->smrt_hba_tran);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_SCSA;
+ }
+}
+
+int
+smrt_logvol_hba_setup(smrt_t *smrt, dev_info_t *iport)
+{
+ scsi_hba_tran_t *tran;
+
+ tran = ddi_get_driver_private(iport);
+ if (tran == NULL)
+ return (DDI_FAILURE);
+
+ tran->tran_tgt_init = smrt_logvol_tran_tgt_init;
+ tran->tran_tgt_free = smrt_logvol_tran_tgt_free;
+
+ tran->tran_start = smrt_tran_start;
+ tran->tran_reset = smrt_tran_reset;
+ tran->tran_abort = smrt_tran_abort;
+
+ tran->tran_hba_private = smrt;
+
+ mutex_enter(&smrt->smrt_mutex);
+ if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC,
+ 2 * MICROSEC, smrt, smrt_logvol_tgtmap_activate,
+ smrt_logvol_tgtmap_deactivate, &smrt->smrt_virt_tgtmap) !=
+ DDI_SUCCESS) {
+		mutex_exit(&smrt->smrt_mutex);
+		return (DDI_FAILURE);
+ }
+
+ smrt_discover_request(smrt);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_logvol_hba_teardown(smrt_t *smrt, dev_info_t *iport)
+{
+ ASSERT(smrt->smrt_virt_iport == iport);
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (smrt->smrt_virt_tgtmap != NULL) {
+ scsi_hba_tgtmap_t *t;
+
+ /*
+ * Ensure that we can't be racing with discovery.
+ */
+ while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) {
+ mutex_exit(&smrt->smrt_mutex);
+ ddi_taskq_wait(smrt->smrt_discover_taskq);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ t = smrt->smrt_virt_tgtmap;
+ smrt->smrt_virt_tgtmap = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ scsi_hba_tgtmap_destroy(t);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+}
+
+int
+smrt_phys_hba_setup(smrt_t *smrt, dev_info_t *iport)
+{
+ scsi_hba_tran_t *tran;
+
+ tran = ddi_get_driver_private(iport);
+ if (tran == NULL)
+ return (DDI_FAILURE);
+
+ tran->tran_tgt_init = smrt_phys_tran_tgt_init;
+ tran->tran_tgt_free = smrt_phys_tran_tgt_free;
+
+ tran->tran_start = smrt_tran_start;
+ tran->tran_reset = smrt_tran_reset;
+ tran->tran_abort = smrt_tran_abort;
+
+ tran->tran_hba_private = smrt;
+
+ mutex_enter(&smrt->smrt_mutex);
+ if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC,
+ 2 * MICROSEC, smrt, smrt_phys_tgtmap_activate,
+ smrt_phys_tgtmap_deactivate, &smrt->smrt_phys_tgtmap) !=
+ DDI_SUCCESS) {
+		mutex_exit(&smrt->smrt_mutex);
+		return (DDI_FAILURE);
+ }
+
+ smrt_discover_request(smrt);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_phys_hba_teardown(smrt_t *smrt, dev_info_t *iport)
+{
+ ASSERT(smrt->smrt_phys_iport == iport);
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (smrt->smrt_phys_tgtmap != NULL) {
+ scsi_hba_tgtmap_t *t;
+
+ /*
+ * Ensure that we can't be racing with discovery.
+ */
+ while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) {
+ mutex_exit(&smrt->smrt_mutex);
+ ddi_taskq_wait(smrt->smrt_discover_taskq);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ t = smrt->smrt_phys_tgtmap;
+ smrt->smrt_phys_tgtmap = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ scsi_hba_tgtmap_destroy(t);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c
new file mode 100644
index 0000000000..18d5b8e936
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c
@@ -0,0 +1,286 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+static char *
+smrt_interrupt_type_name(int type)
+{
+ switch (type) {
+ case DDI_INTR_TYPE_MSIX:
+ return ("MSI-X");
+ case DDI_INTR_TYPE_MSI:
+ return ("MSI");
+ case DDI_INTR_TYPE_FIXED:
+ return ("fixed");
+ default:
+ return ("?");
+ }
+}
+
+static boolean_t
+smrt_try_msix(smrt_t *smrt)
+{
+ char *fwver = smrt->smrt_versions.smrtv_firmware_rev;
+
+ /*
+	 * Generation 9 controllers use a different firmware versioning
+	 * scheme than the others. If this is a Generation 9 controller
+	 * (they all share the same PCI device ID), then we default to MSI.
+ */
+ if (smrt->smrt_pci_vendor == SMRT_VENDOR_HP &&
+ smrt->smrt_pci_device == SMRT_DEVICE_GEN9) {
+ return (B_FALSE);
+ }
+
+ if (fwver[0] == '8' && fwver[1] == '.' && isdigit(fwver[2]) &&
+ isdigit(fwver[3])) {
+ /*
+ * Version 8.00 of the Smart Array firmware appears to have
+ * broken MSI support on at least one controller. We could
+ * blindly try MSI-X everywhere, except that on at least some
+ * 6.XX firmware versions, MSI-X interrupts do not appear
+ * to be triggered for Simple Transport Method command
+ * completions.
+ *
+ * For now, assume we should try for MSI-X with all 8.XX
+ * versions of the firmware.
+ */
+ dev_err(smrt->smrt_dip, CE_NOTE, "!trying MSI-X interrupts "
+ "to work around 8.XX firmware defect");
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+static int
+smrt_interrupts_disable(smrt_t *smrt)
+{
+ if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
+ return (ddi_intr_block_disable(smrt->smrt_interrupts,
+ smrt->smrt_ninterrupts));
+ } else {
+ VERIFY3S(smrt->smrt_ninterrupts, ==, 1);
+
+ return (ddi_intr_disable(smrt->smrt_interrupts[0]));
+ }
+}
+
+int
+smrt_interrupts_enable(smrt_t *smrt)
+{
+ int ret;
+
+ VERIFY(!(smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED));
+
+ if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
+ ret = ddi_intr_block_enable(smrt->smrt_interrupts,
+ smrt->smrt_ninterrupts);
+ } else {
+ VERIFY3S(smrt->smrt_ninterrupts, ==, 1);
+
+ ret = ddi_intr_enable(smrt->smrt_interrupts[0]);
+ }
+
+ if (ret == DDI_SUCCESS) {
+ smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ENABLED;
+ }
+
+ return (ret);
+}
+
+static void
+smrt_interrupts_free(smrt_t *smrt)
+{
+ for (int i = 0; i < smrt->smrt_ninterrupts; i++) {
+ (void) ddi_intr_free(smrt->smrt_interrupts[i]);
+ }
+ smrt->smrt_ninterrupts = 0;
+ smrt->smrt_interrupt_type = 0;
+ smrt->smrt_interrupt_cap = 0;
+ smrt->smrt_interrupt_pri = 0;
+}
+
+static int
+smrt_interrupts_alloc(smrt_t *smrt, int type)
+{
+ dev_info_t *dip = smrt->smrt_dip;
+ int nintrs = 0;
+ int navail = 0;
+
+ if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not count %s interrupts",
+ smrt_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+ if (nintrs < 1) {
+ dev_err(dip, CE_WARN, "no %s interrupts supported",
+ smrt_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not count available %s "
+ "interrupts", smrt_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+ if (navail < 1) {
+ dev_err(dip, CE_WARN, "no %s interrupts available",
+ smrt_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+
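+	/*
+	 * Request exactly one interrupt of this type (DDI_INTR_ALLOC_STRICT
+	 * with a count of 1); a single handler services the controller.
+	 */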
+ if (ddi_intr_alloc(dip, smrt->smrt_interrupts, type, 0, 1,
+ &smrt->smrt_ninterrupts, DDI_INTR_ALLOC_STRICT) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "%s interrupt allocation failed",
+ smrt_interrupt_type_name(type));
+ smrt_interrupts_free(smrt);
+ return (DDI_FAILURE);
+ }
+
+ smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ALLOC;
+ smrt->smrt_interrupt_type = type;
+ return (DDI_SUCCESS);
+}
+
+int
+smrt_interrupts_setup(smrt_t *smrt)
+{
+ int types;
+ unsigned ipri;
+ uint_t (*hw_isr)(caddr_t, caddr_t);
+ dev_info_t *dip = smrt->smrt_dip;
+
+ /*
+ * Select the correct hardware interrupt service routine for the
+ * Transport Method we have configured:
+ */
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ hw_isr = smrt_isr_hw_simple;
+ break;
+ default:
+ panic("unknown controller mode");
+ }
+
+ if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) {
+		dev_err(dip, CE_WARN,
+		    "could not get supported interrupt types");
+ goto fail;
+ }
+
+ /*
+ * At least one firmware version has been released for the Smart Array
+ * line with entirely defective MSI support. The specification is
+ * somewhat unclear on the precise nature of MSI-X support with Smart
+ * Array controllers, particularly with respect to the Simple Transport
+ * Method, but for those broken firmware versions we need to try
+ * anyway.
+ */
+ if (smrt_try_msix(smrt) && (types & DDI_INTR_TYPE_MSIX)) {
+ if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSIX) ==
+ DDI_SUCCESS) {
+ goto add_handler;
+ }
+ }
+
+ /*
+ * If MSI-X is not available, or not expected to work, fall back to
+ * MSI.
+ */
+ if (types & DDI_INTR_TYPE_MSI) {
+ if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSI) ==
+ DDI_SUCCESS) {
+ goto add_handler;
+ }
+ }
+
+ /*
+ * If neither MSI-X nor MSI is available, fall back to fixed
+ * interrupts. Note that the use of fixed interrupts has been
+	 * observed, with some combinations of controllers and systems, to
+ * result in interrupts stopping completely at random times.
+ */
+ if (types & DDI_INTR_TYPE_FIXED) {
+ if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_FIXED) ==
+ DDI_SUCCESS) {
+ goto add_handler;
+ }
+ }
+
+ /*
+ * We were unable to allocate any interrupts.
+ */
+ dev_err(dip, CE_WARN, "interrupt allocation failed");
+ goto fail;
+
+add_handler:
+ /*
+ * Ensure that we have not been given a high-level interrupt, as our
+ * interrupt handlers do not support them.
+ */
+ if (ddi_intr_get_pri(smrt->smrt_interrupts[0], &ipri) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not determine interrupt priority");
+ goto fail;
+ }
+ if (ipri >= ddi_intr_get_hilevel_pri()) {
+ dev_err(dip, CE_WARN, "high level interrupts not supported");
+ goto fail;
+ }
+ smrt->smrt_interrupt_pri = ipri;
+
+ if (ddi_intr_get_cap(smrt->smrt_interrupts[0],
+ &smrt->smrt_interrupt_cap) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not get %s interrupt cap",
+ smrt_interrupt_type_name(smrt->smrt_interrupt_type));
+ goto fail;
+ }
+
+ if (ddi_intr_add_handler(smrt->smrt_interrupts[0], hw_isr,
+ (caddr_t)smrt, NULL) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "adding %s interrupt failed",
+ smrt_interrupt_type_name(smrt->smrt_interrupt_type));
+ goto fail;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ADDED;
+
+ return (DDI_SUCCESS);
+
+fail:
+ smrt_interrupts_teardown(smrt);
+ return (DDI_FAILURE);
+}
+
+void
+smrt_interrupts_teardown(smrt_t *smrt)
+{
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED) {
+ (void) smrt_interrupts_disable(smrt);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ENABLED;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ADDED) {
+ (void) ddi_intr_remove_handler(smrt->smrt_interrupts[0]);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ADDED;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ALLOC) {
+ smrt_interrupts_free(smrt);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ALLOC;
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c
new file mode 100644
index 0000000000..05963ac2e2
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c
@@ -0,0 +1,367 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+static void
+smrt_logvol_free(smrt_volume_t *smlv)
+{
+ /*
+ * By this stage of teardown, all of the SCSI target drivers
+ * must have been detached from this logical volume.
+ */
+ VERIFY(list_is_empty(&smlv->smlv_targets));
+ list_destroy(&smlv->smlv_targets);
+
+ kmem_free(smlv, sizeof (*smlv));
+}
+
+smrt_volume_t *
+smrt_logvol_lookup_by_id(smrt_t *smrt, unsigned long id)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ for (smrt_volume_t *smlv = list_head(&smrt->smrt_volumes);
+ smlv != NULL; smlv = list_next(&smrt->smrt_volumes, smlv)) {
+ if (smlv->smlv_addr.LogDev.VolId == id) {
+ return (smlv);
+ }
+ }
+
+ return (NULL);
+}
+
+static int
+smrt_read_logvols(smrt_t *smrt, smrt_report_logical_lun_t *smrll, uint64_t gen)
+{
+ smrt_report_logical_lun_ent_t *ents = smrll->smrll_data.ents;
+ uint32_t count = BE_32(smrll->smrll_datasize) /
+ sizeof (smrt_report_logical_lun_ent_t);
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
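+	/*
+	 * The controller may report more logical volumes than this driver
+	 * supports; clamp the count to our fixed maximum.
+	 */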
+ if (count > SMRT_MAX_LOGDRV) {
+ count = SMRT_MAX_LOGDRV;
+ }
+
+ for (unsigned i = 0; i < count; i++) {
+ smrt_volume_t *smlv;
+ char id[SCSI_MAXNAMELEN];
+
+ DTRACE_PROBE2(read_logvol, unsigned, i,
+ smrt_report_logical_lun_ent_t *, &ents[i]);
+
+ if ((smlv = smrt_logvol_lookup_by_id(smrt,
+ ents[i].smrle_addr.VolId)) == NULL) {
+
+ /*
+			 * This is a new Logical Volume, so add it to the list.
+ */
+ if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) ==
+ NULL) {
+ return (ENOMEM);
+ }
+
+ list_create(&smlv->smlv_targets,
+ sizeof (smrt_target_t),
+ offsetof(smrt_target_t, smtg_link_lun));
+
+ smlv->smlv_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_volumes, smlv);
+ }
+
+ /*
+ * Always make sure that the address and the generation are up
+ * to date, regardless of where this came from.
+ */
+ smlv->smlv_addr.LogDev = ents[i].smrle_addr;
+ smlv->smlv_gen = gen;
+ (void) snprintf(id, sizeof (id), "%x",
+ smlv->smlv_addr.LogDev.VolId);
+ if (!ddi_in_panic() &&
+ scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap,
+ SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) {
+ return (EIO);
+ }
+ }
+
+ return (0);
+}
+
+static int
+smrt_read_logvols_ext(smrt_t *smrt, smrt_report_logical_lun_t *smrll,
+ uint64_t gen)
+{
+ smrt_report_logical_lun_extent_t *extents =
+ smrll->smrll_data.extents;
+ uint32_t count = BE_32(smrll->smrll_datasize) /
+ sizeof (smrt_report_logical_lun_extent_t);
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (count > SMRT_MAX_LOGDRV) {
+ count = SMRT_MAX_LOGDRV;
+ }
+
+ for (unsigned i = 0; i < count; i++) {
+ smrt_volume_t *smlv;
+ char id[SCSI_MAXNAMELEN];
+
+ DTRACE_PROBE2(read_logvol_ext, unsigned, i,
+ smrt_report_logical_lun_extent_t *, &extents[i]);
+
+ if ((smlv = smrt_logvol_lookup_by_id(smrt,
+ extents[i].smrle_addr.VolId)) != NULL) {
+ if ((smlv->smlv_flags & SMRT_VOL_FLAG_WWN) &&
+ bcmp(extents[i].smrle_wwn, smlv->smlv_wwn,
+ 16) != 0) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "logical "
+ "volume %u WWN changed unexpectedly", i);
+ }
+ } else {
+ /*
+			 * This is a new Logical Volume, so add it to the list.
+ */
+ if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) ==
+ NULL) {
+ return (ENOMEM);
+ }
+
+ bcopy(extents[i].smrle_wwn, smlv->smlv_wwn, 16);
+ smlv->smlv_flags |= SMRT_VOL_FLAG_WWN;
+
+ list_create(&smlv->smlv_targets,
+ sizeof (smrt_target_t),
+ offsetof(smrt_target_t, smtg_link_lun));
+
+ smlv->smlv_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_volumes, smlv);
+ }
+
+ /*
+ * Always make sure that the address and the generation are up
+ * to date. The address may have changed on a reset.
+ */
+ smlv->smlv_addr.LogDev = extents[i].smrle_addr;
+ smlv->smlv_gen = gen;
+ (void) snprintf(id, sizeof (id), "%x",
+ smlv->smlv_addr.LogDev.VolId);
+ if (!ddi_in_panic() &&
+ scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap,
+ SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) {
+ return (EIO);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Discover the currently visible set of Logical Volumes exposed by the
+ * controller.
+ */
+int
+smrt_logvol_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen)
+{
+ smrt_command_t *smcm;
+ smrt_report_logical_lun_t *smrll;
+ smrt_report_logical_lun_req_t smrllr = { 0 };
+ int r;
+
+ /*
+ * Allocate the command to send to the device, including buffer space
+ * for the returned list of Logical Volumes.
+ */
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm,
+ sizeof (smrt_report_logical_lun_t), KM_NOSLEEP) != 0) {
+ r = ENOMEM;
+ mutex_enter(&smrt->smrt_mutex);
+ goto out;
+ }
+
+ smrll = smcm->smcm_internal->smcmi_va;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrllr);
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout);
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+
+ /*
+ * The Report Logical LUNs command is essentially a vendor-specific
+ * SCSI command, which we assemble into the CDB region of the command
+ * block.
+ */
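+	/*
+	 * Setting the extended flag requests the extended report format,
+	 * which includes a WWID for each volume; whether the controller
+	 * honoured the request is checked via the response header below.
+	 */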
+ bzero(&smrllr, sizeof (smrllr));
+ smrllr.smrllr_opcode = CISS_SCMD_REPORT_LOGICAL_LUNS;
+ smrllr.smrllr_extflag = 1;
+ smrllr.smrllr_datasize = htonl(sizeof (smrt_report_logical_lun_t));
+ bcopy(&smrllr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (smrllr)));
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send the command to the device.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ goto out;
+ }
+
+ /*
+ * Poll for completion.
+ */
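+	/* The timeout argument is expressed in seconds. */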
+ smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ if ((r = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out; abandon it now. Remove the POLLED
+ * flag so that the periodic routine will send an abort to
+ * clean it up next time around.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+ smcm = NULL;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+ * The controller was reset while we were trying to discover
+ * logical volumes. Report failure.
+ */
+ r = EIO;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "logical volume "
+ "discovery error: status 0x%x", ei->CommandStatus);
+ r = EIO;
+ goto out;
+ }
+ }
+
+ if (!ddi_in_panic() &&
+ scsi_hba_tgtmap_set_begin(smrt->smrt_virt_tgtmap) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map "
+ "observation on %s", SMRT_IPORT_VIRT);
+ r = EIO;
+ goto out;
+ }
+
+ if ((smrll->smrll_extflag & 0x1) != 0) {
+ r = smrt_read_logvols_ext(smrt, smrll, gen);
+ } else {
+ r = smrt_read_logvols(smrt, smrll, gen);
+ }
+
+ if (r == 0 && !ddi_in_panic()) {
+ if (scsi_hba_tgtmap_set_end(smrt->smrt_virt_tgtmap, 0) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to end target "
+ "map observation on %s", SMRT_IPORT_VIRT);
+ r = EIO;
+ }
+ } else if (r != 0 && !ddi_in_panic()) {
+ if (scsi_hba_tgtmap_set_flush(smrt->smrt_virt_tgtmap) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to end target "
+ "map observation on %s", SMRT_IPORT_VIRT);
+ r = EIO;
+ }
+ }
+
+ if (r == 0) {
+ /*
+ * Update the time of the last successful Logical Volume
+ * discovery:
+ */
+ smrt->smrt_last_log_discovery = gethrtime();
+ }
+
+out:
+ mutex_exit(&smrt->smrt_mutex);
+
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+ return (r);
+}
+
+void
+smrt_logvol_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type,
+ void **privpp)
+{
+ smrt_t *smrt = arg;
+ unsigned long volume;
+ char *eptr;
+
+ VERIFY(type == SCSI_TGT_SCSI_DEVICE);
+ VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume));
+ VERIFY3S(*eptr, ==, '\0');
+ VERIFY3S(volume, >=, 0);
+ VERIFY3S(volume, <, SMRT_MAX_LOGDRV);
+ mutex_enter(&smrt->smrt_mutex);
+ VERIFY(smrt_logvol_lookup_by_id(smrt, volume) != NULL);
+ mutex_exit(&smrt->smrt_mutex);
+ *privpp = NULL;
+}
+
+boolean_t
+smrt_logvol_tgtmap_deactivate(void *arg, char *addr,
+ scsi_tgtmap_tgt_type_t type, void *priv, scsi_tgtmap_deact_rsn_t reason)
+{
+ smrt_t *smrt = arg;
+ smrt_volume_t *smlv;
+ unsigned long volume;
+ char *eptr;
+
+ VERIFY(type == SCSI_TGT_SCSI_DEVICE);
+ VERIFY(priv == NULL);
+ VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume));
+ VERIFY3S(*eptr, ==, '\0');
+ VERIFY3S(volume, >=, 0);
+ VERIFY3S(volume, <, SMRT_MAX_LOGDRV);
+
+ mutex_enter(&smrt->smrt_mutex);
+ smlv = smrt_logvol_lookup_by_id(smrt, volume);
+ VERIFY(smlv != NULL);
+
+ list_remove(&smrt->smrt_volumes, smlv);
+ smrt_logvol_free(smlv);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (B_FALSE);
+}
+
+void
+smrt_logvol_teardown(smrt_t *smrt)
+{
+ smrt_volume_t *smlv;
+
+ while ((smlv = list_remove_head(&smrt->smrt_volumes)) != NULL) {
+ smrt_logvol_free(smlv);
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c
new file mode 100644
index 0000000000..8ab3927673
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c
@@ -0,0 +1,613 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017 Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+static void
+smrt_physical_free(smrt_physical_t *smpt)
+{
+ VERIFY(list_is_empty(&smpt->smpt_targets));
+ VERIFY(smpt->smpt_info != NULL);
+
+ kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info));
+ list_destroy(&smpt->smpt_targets);
+ kmem_free(smpt, sizeof (*smpt));
+}
+
+/*
+ * Determine whether an enumerated physical device should be shown to the
+ * world. Three conditions must all be satisfied:
+ *
+ * 1. The device (SAS, SATA, SES, etc.) must not have a masked CISS address.
+ *    A masked CISS address indicates a device to which we should not be
+ *    performing I/O.
+ * 2. The drive (SAS or SATA device) must not be marked as a member of a
+ *    logical volume.
+ * 3. The drive (SAS or SATA device) must not be marked as a spare.
+ */
+static boolean_t
+smrt_physical_visible(PhysDevAddr_t *addr, smrt_identify_physical_drive_t *info)
+{
+ if (addr->Mode == SMRT_CISS_MODE_MASKED) {
+ return (B_FALSE);
+ }
+
+ if ((info->sipd_more_flags & (SMRT_MORE_FLAGS_LOGVOL |
+ SMRT_MORE_FLAGS_SPARE)) != 0) {
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Note: the caller is responsible for ensuring that the unit-address form
+ * of the WWN is passed in. Any additional information targeting a specific
+ * LUN is ignored.
+ */
+smrt_physical_t *
+smrt_phys_lookup_by_ua(smrt_t *smrt, const char *ua)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ /*
+ * Sanity check that the caller has provided us enough bytes for a
+ * properly formed unit-address form of a WWN.
+ */
+ if (strlen(ua) < SCSI_WWN_UA_STRLEN)
+ return (NULL);
+
+ for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals);
+ smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) {
+ char wwnstr[SCSI_WWN_BUFLEN];
+
+ (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, wwnstr);
+ if (strncmp(wwnstr, ua, SCSI_WWN_UA_STRLEN) != 0)
+ continue;
+
+ /*
+		 * Verify that the character following the WWN in the UA
+		 * string is either a comma or a NUL. We accept the comma in
+		 * case the address is a normal UA that includes a LUN.
+ */
+ if (ua[SCSI_WWN_UA_STRLEN] != '\0' &&
+ ua[SCSI_WWN_UA_STRLEN] != ',') {
+ continue;
+ }
+
+ return (smpt);
+ }
+
+ return (NULL);
+}
+
+static smrt_physical_t *
+smrt_phys_lookup_by_wwn(smrt_t *smrt, uint64_t wwn)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals);
+ smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) {
+ if (wwn == smpt->smpt_wwn)
+ return (smpt);
+ }
+
+ return (NULL);
+}
+
+static int
+smrt_phys_identify(smrt_t *smrt, smrt_identify_physical_drive_t *info,
+ uint16_t bmic, uint16_t timeout)
+{
+ smrt_command_t *smcm = NULL;
+ smrt_identify_physical_drive_t *sipd;
+ smrt_identify_physical_drive_req_t sipdr;
+ int ret;
+ size_t sz, copysz;
+
+ sz = sizeof (smrt_identify_physical_drive_t);
+ sz = P2ROUNDUP_TYPED(sz, 512, size_t);
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm,
+ sizeof (*sipd), KM_NOSLEEP) != 0) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ sipd = smcm->smcm_internal->smcmi_va;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (sipdr);
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout);
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+
+ /*
+ * Construct the IDENTIFY PHYSICAL DEVICE request CDB. Note that any
+ * reserved fields in the request must be filled with zeroes.
+ */
+ bzero(&sipdr, sizeof (sipdr));
+ sipdr.sipdr_opcode = CISS_SCMD_BMIC_READ;
+ sipdr.sipdr_lun = 0;
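+	/*
+	 * The 16-bit BMIC drive number is split across two CDB fields: the
+	 * low byte in the first index and the high byte in the second.
+	 */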
+ sipdr.sipdr_bmic_index1 = bmic & 0x00ff;
+ sipdr.sipdr_command = CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE;
+ sipdr.sipdr_bmic_index2 = (bmic & 0xff00) >> 8;
+ bcopy(&sipdr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (sipdr)));
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send the command to the device.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((ret = smrt_submit(smrt, smcm)) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+ goto out;
+ }
+
+ /*
+ * Poll for completion.
+ */
+ smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ if ((ret = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(ret, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out; abandon it now. Remove the POLLED
+ * flag so that the periodic routine will send an abort to
+ * clean it up next time around.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+ smcm = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ goto out;
+ }
+ mutex_exit(&smrt->smrt_mutex);
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+		 * The controller was reset while we were trying to identify
+		 * the physical device. Report failure.
+ */
+ ret = EIO;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "identify physical "
+ "device error: status 0x%x", ei->CommandStatus);
+ ret = EIO;
+ goto out;
+ }
+
+ copysz = MIN(sizeof (*sipd), sz - ei->ResidualCnt);
+ } else {
+ copysz = sizeof (*sipd);
+ }
+
+	/*
+	 * Copy out only the bytes that the controller actually returned; the
+	 * caller's info buffer was zeroed at allocation.
+	 */
+	bcopy(sipd, info, copysz);
+
+ ret = 0;
+out:
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+
+ return (ret);
+}
+
+static int
+smrt_read_phys_ext(smrt_t *smrt, smrt_report_physical_lun_t *smrpl,
+ uint16_t timeout, uint64_t gen)
+{
+ smrt_report_physical_lun_extent_t *extents = smrpl->smrpl_data.extents;
+ uint32_t count = BE_32(smrpl->smrpl_datasize) /
+ sizeof (smrt_report_physical_lun_extent_t);
+ uint32_t i;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
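+	/*
+	 * Clamp the count of reported devices to the maximum number of
+	 * physical devices this driver supports.
+	 */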
+ if (count > SMRT_MAX_PHYSDEV) {
+ count = SMRT_MAX_PHYSDEV;
+ }
+
+ for (i = 0; i < count; i++) {
+ int ret;
+ smrt_physical_t *smpt;
+ smrt_identify_physical_drive_t *info;
+ smrt_report_physical_opdi_t *opdi;
+ uint16_t bmic;
+ uint64_t wwn, satawwn;
+ char name[SCSI_MAXNAMELEN];
+
+ opdi = &extents[i].srple_extdata.srple_opdi;
+
+ mutex_exit(&smrt->smrt_mutex);
+
+ /*
+ * Get the extended information about this device.
+ */
+ info = kmem_zalloc(sizeof (*info), KM_NOSLEEP);
+ if (info == NULL) {
+ mutex_enter(&smrt->smrt_mutex);
+ return (ENOMEM);
+ }
+
+ bmic = smrt_lun_addr_to_bmic(&extents[i].srple_addr);
+ ret = smrt_phys_identify(smrt, info, bmic, timeout);
+ if (ret != 0) {
+ mutex_enter(&smrt->smrt_mutex);
+ kmem_free(info, sizeof (*info));
+ return (ret);
+ }
+
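+		/*
+		 * The firmware returns the WWID as a big-endian byte array;
+		 * convert it to a host-order integer for comparisons.
+		 */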
+ wwn = *(uint64_t *)opdi->srpo_wwid;
+ wwn = BE_64(wwn);
+
+ /*
+ * SATA devices may not have a proper WWN returned from firmware
+ * based on the SATL specification. Try to fetch the proper id
+ * for SATA devices, if the drive has one. If the drive doesn't
+ * have one or the SATL refuses to give us one, we use whatever
+ * the controller told us.
+ */
+ if (opdi->srpo_dtype == SMRT_DTYPE_SATA &&
+ smrt_sata_determine_wwn(smrt, &extents[i].srple_addr,
+ &satawwn, timeout) == 0) {
+ wwn = satawwn;
+ }
+
+ mutex_enter(&smrt->smrt_mutex);
+ smpt = smrt_phys_lookup_by_wwn(smrt, wwn);
+ if (smpt != NULL) {
+ /*
+ * Sanity check that the model and serial number of this
+ * device is the same for this WWN. If it's not, the
+ * controller is probably lying about something.
+ */
+ if (bcmp(smpt->smpt_info->sipd_model, info->sipd_model,
+ sizeof (info->sipd_model)) != 0 ||
+ bcmp(smpt->smpt_info->sipd_serial,
+ info->sipd_serial, sizeof (info->sipd_serial)) !=
+ 0 || smpt->smpt_dtype != opdi->srpo_dtype) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "physical "
+ "target with wwn 0x%" PRIx64 " changed "
+ "model, serial, or type unexpectedly: "
+ "smrt_physical_t %p, phys info: %p", wwn,
+ smpt, info);
+ }
+
+ /*
+			 * When panicking, we cannot tolerate a device's
+			 * visibility changing under us, so we panic
+			 * explicitly if it does. We only worry about devices
+			 * which are used for I/O; we purposefully ignore SES
+			 * devices.
+ */
+ if (ddi_in_panic() &&
+ (opdi->srpo_dtype == SMRT_DTYPE_SATA ||
+ opdi->srpo_dtype == SMRT_DTYPE_SAS)) {
+ boolean_t visible;
+
+ visible = smrt_physical_visible(
+ &smpt->smpt_addr.PhysDev, smpt->smpt_info);
+
+ if (visible != smpt->smpt_visible) {
+ dev_err(smrt->smrt_dip, CE_PANIC,
+ "physical target with wwn 0x%"
+ PRIx64 " changed visibility status "
+ "unexpectedly", wwn);
+ }
+ }
+
+ kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info));
+ smpt->smpt_info = NULL;
+ } else {
+ smpt = kmem_zalloc(sizeof (smrt_physical_t),
+ KM_NOSLEEP);
+ if (smpt == NULL) {
+ kmem_free(info, sizeof (*info));
+ return (ENOMEM);
+ }
+
+ smpt->smpt_wwn = wwn;
+ smpt->smpt_dtype = opdi->srpo_dtype;
+ list_create(&smpt->smpt_targets, sizeof (smrt_target_t),
+ offsetof(smrt_target_t, smtg_link_lun));
+ smpt->smpt_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_physicals, smpt);
+ }
+
+ VERIFY3P(smpt->smpt_info, ==, NULL);
+
+ /*
+ * Determine if this device is supported and if it's visible to
+ * the system. Some devices may not be visible to the system
+ * because they're used in logical volumes or spares.
+ * Unsupported devices are also not visible.
+ */
+ switch (smpt->smpt_dtype) {
+ case SMRT_DTYPE_SATA:
+ case SMRT_DTYPE_SAS:
+ smpt->smpt_supported = B_TRUE;
+ smpt->smpt_visible =
+ smrt_physical_visible(&extents[i].srple_addr, info);
+ break;
+ case SMRT_DTYPE_SES:
+ smpt->smpt_supported = B_TRUE;
+ smpt->smpt_visible =
+ smrt_physical_visible(&extents[i].srple_addr, info);
+ break;
+ default:
+ smpt->smpt_visible = B_FALSE;
+ smpt->smpt_supported = B_FALSE;
+ }
+
+ smpt->smpt_info = info;
+ smpt->smpt_addr.PhysDev = extents[i].srple_addr;
+ smpt->smpt_bmic = bmic;
+ smpt->smpt_gen = gen;
+ (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, name);
+ if (!ddi_in_panic() && smpt->smpt_visible &&
+ scsi_hba_tgtmap_set_add(smrt->smrt_phys_tgtmap,
+ SCSI_TGT_SCSI_DEVICE, name, NULL) != DDI_SUCCESS) {
+ return (EIO);
+ }
+ }
+
+ return (0);
+}
+
+int
+smrt_phys_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen)
+{
+ smrt_command_t *smcm;
+ smrt_report_physical_lun_t *smrpl;
+ smrt_report_physical_lun_req_t smrplr;
+ int r;
+
+ /*
+ * Allocate the command to send to the device, including buffer space
+ * for the returned list of Physical Volumes.
+ */
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm,
+ sizeof (*smrpl), KM_NOSLEEP) != 0) {
+ r = ENOMEM;
+ mutex_enter(&smrt->smrt_mutex);
+ goto out;
+ }
+
+ smrpl = smcm->smcm_internal->smcmi_va;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrplr);
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout);
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+
+ /*
+ * The Report Physical LUNs command is essentially a vendor-specific
+ * SCSI command, which we assemble into the CDB region of the command
+ * block.
+ */
+ bzero(&smrplr, sizeof (smrplr));
+ smrplr.smrplr_opcode = CISS_SCMD_REPORT_PHYSICAL_LUNS;
+ smrplr.smrplr_extflag = SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI;
+ smrplr.smrplr_datasize = BE_32(sizeof (smrt_report_physical_lun_t));
+ bcopy(&smrplr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (smrplr)));
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send the command to the device.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ goto out;
+ }
+
+ /*
+ * Poll for completion.
+ */
+ smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ if ((r = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out; abandon it now. Remove the POLLED
+ * flag so that the periodic routine will send an abort to
+ * clean it up next time around.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+ smcm = NULL;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+		 * The controller was reset while we were trying to discover
+		 * physical devices. Report failure.
+ */
+ r = EIO;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "physical target "
+ "discovery error: status 0x%x", ei->CommandStatus);
+ r = EIO;
+ goto out;
+ }
+ }
+
+ /*
+ * If the controller doesn't support extended physical reporting, it
+ * likely doesn't even support physical devices that we'd care about
+ * exposing. As such, we treat this as an OK case.
+ */
+ if ((smrpl->smrpl_extflag & SMRT_REPORT_PHYSICAL_LUN_EXT_MASK) !=
+ SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI) {
+ r = 0;
+ goto out;
+ }
+
+ if (!ddi_in_panic() &&
+ scsi_hba_tgtmap_set_begin(smrt->smrt_phys_tgtmap) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map "
+ "observation on %s", SMRT_IPORT_PHYS);
+ r = EIO;
+ goto out;
+ }
+
+ r = smrt_read_phys_ext(smrt, smrpl, timeout, gen);
+
+ if (r == 0 && !ddi_in_panic()) {
+ if (scsi_hba_tgtmap_set_end(smrt->smrt_phys_tgtmap, 0) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to end target "
+ "map observation on %s", SMRT_IPORT_PHYS);
+ r = EIO;
+ }
+ } else if (r != 0 && !ddi_in_panic()) {
+ if (scsi_hba_tgtmap_set_flush(smrt->smrt_phys_tgtmap) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to end target "
+ "map observation on %s", SMRT_IPORT_PHYS);
+ r = EIO;
+ }
+ }
+
+ if (r == 0) {
+ smrt_physical_t *smpt, *next;
+
+ /*
+ * Prune physical devices that do not match the current
+ * generation and are not marked as visible devices. Visible
+ * devices will be dealt with as part of the target map work.
+ */
+ for (smpt = list_head(&smrt->smrt_physicals), next = NULL;
+ smpt != NULL; smpt = next) {
+ next = list_next(&smrt->smrt_physicals, smpt);
+ if (smpt->smpt_visible || smpt->smpt_gen == gen)
+ continue;
+ list_remove(&smrt->smrt_physicals, smpt);
+ smrt_physical_free(smpt);
+ }
+
+ /*
+ * Update the time of the last successful Physical Volume
+ * discovery:
+ */
+ smrt->smrt_last_phys_discovery = gethrtime();
+
+ /*
+ * Now, for each unsupported device that we haven't warned about
+ * encountering, try and give the administrator some hope of
+ * knowing about this.
+ */
+ for (smpt = list_head(&smrt->smrt_physicals), next = NULL;
+ smpt != NULL; smpt = next) {
+ if (smpt->smpt_supported || smpt->smpt_unsup_warn)
+ continue;
+ smpt->smpt_unsup_warn = B_TRUE;
+ dev_err(smrt->smrt_dip, CE_WARN, "encountered "
+ "unsupported device with device type %d",
+ smpt->smpt_dtype);
+ }
+ }
+
+out:
+ mutex_exit(&smrt->smrt_mutex);
+
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+ return (r);
+}
+
+void
+smrt_phys_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type,
+ void **privpp)
+{
+ smrt_t *smrt = arg;
+ smrt_physical_t *smpt;
+
+ VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE);
+ mutex_enter(&smrt->smrt_mutex);
+ smpt = smrt_phys_lookup_by_ua(smrt, addr);
+ VERIFY(smpt != NULL);
+ VERIFY(smpt->smpt_supported);
+ VERIFY(smpt->smpt_visible);
+ *privpp = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+}
+
+boolean_t
+smrt_phys_tgtmap_deactivate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type,
+ void *priv, scsi_tgtmap_deact_rsn_t reason)
+{
+ smrt_t *smrt = arg;
+ smrt_physical_t *smpt;
+
+ VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE);
+ VERIFY3P(priv, ==, NULL);
+
+ mutex_enter(&smrt->smrt_mutex);
+ smpt = smrt_phys_lookup_by_ua(smrt, addr);
+
+ /*
+ * If the device disappeared or became invisible, then it may have
+ * already been removed.
+ */
+ if (smpt == NULL || !smpt->smpt_visible) {
+ mutex_exit(&smrt->smrt_mutex);
+ return (B_FALSE);
+ }
+
+ list_remove(&smrt->smrt_physicals, smpt);
+ smrt_physical_free(smpt);
+ mutex_exit(&smrt->smrt_mutex);
+ return (B_FALSE);
+}
+
+void
+smrt_phys_teardown(smrt_t *smrt)
+{
+ smrt_physical_t *smpt;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+ while ((smpt = list_remove_head(&smrt->smrt_physicals)) != NULL) {
+ smrt_physical_free(smpt);
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c
new file mode 100644
index 0000000000..6224b97732
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c
@@ -0,0 +1,160 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+/*
+ * Collection of routines specific to SATA devices, and to the attempt to
+ * make them work.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+/*
+ * This is a buffer size that should easily cover all of the data that we
+ * need to properly determine the device's WWN.
+ */
+#define SMRT_SATA_INQ83_LEN 256
+
+/*
+ * We need to try to determine whether a SATA WWN exists for the device.
+ * SAT-2 defines the format of the response to an INQUIRY of VPD page 0x83,
+ * from which the WWN can be extracted.
+ */
+int
+smrt_sata_determine_wwn(smrt_t *smrt, PhysDevAddr_t *addr, uint64_t *wwnp,
+ uint16_t timeout)
+{
+ smrt_command_t *smcm;
+ int r;
+ uint8_t *inq;
+ uint64_t wwn;
+ size_t resid;
+
+ VERIFY3P(wwnp, !=, NULL);
+
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm,
+ SMRT_SATA_INQ83_LEN, KM_NOSLEEP) != 0) {
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+ return (ENOMEM);
+ }
+
+ smcm->smcm_va_cmd->Header.LUN.PhysDev = *addr;
+ smcm->smcm_va_cmd->Request.CDBLen = CDB_GROUP0;
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout);
+
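+	/*
+	 * Build an INQUIRY CDB for the Device Identification VPD page: the
+	 * EVPD bit is set in byte 1, the page code (0x83) goes in byte 2,
+	 * and the allocation length is stored big-endian in bytes 3 and 4.
+	 */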
+ smcm->smcm_va_cmd->Request.CDB[0] = SCMD_INQUIRY;
+ smcm->smcm_va_cmd->Request.CDB[1] = 1;
+ smcm->smcm_va_cmd->Request.CDB[2] = 0x83;
+ smcm->smcm_va_cmd->Request.CDB[3] = (SMRT_SATA_INQ83_LEN & 0xff00) >> 8;
+ smcm->smcm_va_cmd->Request.CDB[4] = SMRT_SATA_INQ83_LEN & 0x00ff;
+ smcm->smcm_va_cmd->Request.CDB[5] = 0;
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send the command to the device.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (r);
+ }
+
+	/*
+	 * Poll for completion, with a deadline matching the timeout used in
+	 * the other discovery paths.
+	 */
+	smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+	if ((r = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out; abandon it now. Remove the POLLED
+ * flag so that the periodic routine will send an abort to
+ * clean it up next time around.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+ mutex_exit(&smrt->smrt_mutex);
+ return (r);
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+		 * The controller was reset while we were trying to determine
+		 * the device's WWN. Report failure.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (EIO);
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "physical target "
+ "SATA WWN error: status 0x%x", ei->CommandStatus);
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (EIO);
+ }
+ resid = ei->ResidualCnt;
+ } else {
+ resid = 0;
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+
+ /*
+	 * We must have at least 16 bytes: the first four bytes are the page
+	 * header, the next four are the first designator's header, and the
+	 * last eight are the actual WWN, which according to SAT-2 will
+	 * always be first.
+ */
+ if (SMRT_SATA_INQ83_LEN - resid < 16) {
+ smrt_command_free(smcm);
+ return (EINVAL);
+ }
+ inq = smcm->smcm_internal->smcmi_va;
+
+ /*
+ * Sanity check we have the right page.
+ */
+ if (inq[1] != 0x83) {
+ smrt_command_free(smcm);
+ return (EINVAL);
+ }
+
+ /*
+ * Check to see if we have a proper Network Address Authority (NAA)
+ * based world wide number for this LUN. It is possible that firmware
+ * interposes on this and constructs a fake world wide number (WWN). If
+ * this is the case, we don't want to actually use it. We need to
+ * verify that the WWN declares the correct naming authority and is of
+ * the proper length.
+ */
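+	/*
+	 * In the first designator: byte 5 carries the association (bits 5:4,
+	 * which must be zero) and the designator type (bits 3:0, which must
+	 * be 3 for NAA); byte 7 is the designator length, which must be 8
+	 * for an 8-byte NAA WWN.
+	 */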
+ if ((inq[5] & 0x30) != 0 || (inq[5] & 0x0f) != 3 || inq[7] != 8) {
+ smrt_command_free(smcm);
+ return (ENOTSUP);
+ }
+
+ bcopy(&inq[8], &wwn, sizeof (uint64_t));
+ *wwnp = BE_64(wwn);
+
+ smrt_command_free(smcm);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c
index f968b54847..274c7caccf 100644
--- a/usr/src/uts/common/io/scsi/targets/sd.c
+++ b/usr/src/uts/common/io/scsi/targets/sd.c
@@ -3498,9 +3498,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc)
* according to the successful response to the page
* 0x2A mode sense request.
*/
- scsi_log(SD_DEVINFO(un), sd_label, CE_WARN,
- "sd_set_mmc_caps: Mode Sense returned "
- "invalid block descriptor length\n");
+ /*
+ * The following warning occurs due to the KVM CD-ROM
+ * mishandling the multi-media commands. Ignore it.
+ * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN,
+ * "sd_set_mmc_caps: Mode Sense returned "
+ * "invalid block descriptor length\n");
+ */
kmem_free(buf, BUFLEN_MODE_CDROM_CAP);
return;
}
@@ -4444,19 +4448,78 @@ static int
sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen)
{
struct scsi_inquiry *sd_inq;
- int rval = SD_SUCCESS;
+ int rval = SD_SUCCESS;
+ char *p;
+ int chk_vidlen = 0, chk_pidlen = 0;
+ int has_tail = 0;
+ static const int VSZ = sizeof (sd_inq->inq_vid);
+ static const int PSZ = sizeof (sd_inq->inq_pid);
ASSERT(un != NULL);
sd_inq = un->un_sd->sd_inq;
ASSERT(id != NULL);
/*
- * We use the inq_vid as a pointer to a buffer containing the
- * vid and pid and use the entire vid/pid length of the table
- * entry for the comparison. This works because the inq_pid
- * data member follows inq_vid in the scsi_inquiry structure.
+ * We would like to use the inq_vid as a pointer to a buffer
+ * containing the vid and pid and use the entire vid/pid length of
+ * the table entry for the comparison. However, this does not work
+ * because, while the inq_pid data member follows inq_vid in the
+ * scsi_inquiry structure, we do not control the contents of this
+ * buffer, and some broken devices violate SPC 4.3.1 and return
+ * fields with null bytes in them.
+ */
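+	/*
+	 * For example, a table id of "ATA     " (with trailing spaces) should
+	 * still match a broken device whose inquiry vid is "ATA" followed by
+	 * NUL bytes rather than the spaces SPC requires.
+	 */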
+ chk_vidlen = MIN(VSZ, idlen);
+ p = id + chk_vidlen - 1;
+	/* Check the length first to avoid reading before the buffer. */
+	while (chk_vidlen > 0 && *p == ' ') {
+ --p;
+ --chk_vidlen;
+ }
+
+ /*
+ * If it's all spaces, check the whole thing.
+ */
+ if (chk_vidlen == 0)
+ chk_vidlen = MIN(VSZ, idlen);
+
+ if (idlen > VSZ) {
+ chk_pidlen = idlen - VSZ;
+ p = id + idlen - 1;
+		while (chk_pidlen > 0 && *p == ' ') {
+ --p;
+ --chk_pidlen;
+ }
+ if (chk_pidlen == 0)
+ chk_pidlen = MIN(PSZ, idlen - VSZ);
+ }
+
+ /*
+ * There's one more thing we need to do here. If the user specified
+ * an ID with trailing spaces, we need to make sure the inquiry
+ * vid/pid has only spaces or NULs after the check length; otherwise, it
+ * can't match.
*/
- if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) {
+ if (idlen > chk_vidlen && chk_vidlen < VSZ) {
+ for (p = sd_inq->inq_vid + chk_vidlen;
+ p < sd_inq->inq_vid + VSZ; ++p) {
+ if (*p != ' ' && *p != '\0') {
+ ++has_tail;
+ break;
+ }
+ }
+ }
+ if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) {
+ for (p = sd_inq->inq_pid + chk_pidlen;
+ p < sd_inq->inq_pid + PSZ; ++p) {
+ if (*p != ' ' && *p != '\0') {
+ ++has_tail;
+ break;
+ }
+ }
+ }
+
+ if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 ||
+ (idlen > VSZ &&
+ strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) {
/*
* The user id string is compared to the inquiry vid/pid
* using a case insensitive comparison and ignoring
@@ -6723,7 +6786,7 @@ sdpower(dev_info_t *devi, int component, int level)
time_t intvlp;
struct pm_trans_data sd_pm_tran_data;
uchar_t save_state = SD_STATE_NORMAL;
- int sval;
+ int sval, tursval = 0;
uchar_t state_before_pm;
int got_semaphore_here;
sd_ssc_t *ssc;
@@ -7040,13 +7103,26 @@ sdpower(dev_info_t *devi, int component, int level)
* a deadlock on un_pm_busy_cv will occur.
*/
if (SD_PM_IS_IO_CAPABLE(un, level)) {
- sval = sd_send_scsi_TEST_UNIT_READY(ssc,
+ tursval = sd_send_scsi_TEST_UNIT_READY(ssc,
SD_DONT_RETRY_TUR | SD_BYPASS_PM);
- if (sval != 0)
+ if (tursval != 0)
sd_ssc_assessment(ssc, SD_FMT_IGNORE);
}
- if (un->un_f_power_condition_supported) {
+ /*
+ * We've encountered certain classes of drives that pass a TUR, but fail
+	 * the START STOP UNIT when using power conditions or, worse, are left
+	 * in an unusable state despite SSU passing. Strictly speaking,
+ * for SPC-4 or greater, no additional actions are required to make the
+ * drive operational when a TUR passes. If we have something that
+ * matches this condition, we continue on and presume the drive is
+ * successfully powered on.
+ */
+ if (un->un_f_power_condition_supported &&
+ SD_SCSI_VERS_IS_GE_SPC_4(un) && SD_PM_IS_IO_CAPABLE(un, level) &&
+ level == SD_SPINDLE_ACTIVE && tursval == 0) {
+ sval = 0;
+ } else if (un->un_f_power_condition_supported) {
char *pm_condition_name[] = {"STOPPED", "STANDBY",
"IDLE", "ACTIVE"};
SD_TRACE(SD_LOG_IO_PM, un,
@@ -7066,6 +7142,7 @@ sdpower(dev_info_t *devi, int component, int level)
sd_ssc_assessment(ssc, SD_FMT_STATUS_CHECK);
else
sd_ssc_assessment(ssc, SD_FMT_IGNORE);
+
}
/* Command failed, check for media present. */
@@ -31324,7 +31401,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi)
if (SD_PM_CAPABLE_IS_UNDEFINED(pm_cap)) {
un->un_f_log_sense_supported = TRUE;
if (!un->un_f_power_condition_disabled &&
- SD_INQUIRY(un)->inq_ansi == 6) {
+ SD_SCSI_VERS_IS_GE_SPC_4(un)) {
un->un_f_power_condition_supported = TRUE;
}
} else {
@@ -31342,7 +31419,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi)
/* SD_PM_CAPABLE_IS_TRUE case */
un->un_f_pm_supported = TRUE;
if (!un->un_f_power_condition_disabled &&
- SD_PM_CAPABLE_IS_SPC_4(pm_cap)) {
+ (SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap))) {
un->un_f_power_condition_supported =
TRUE;
}
diff --git a/usr/src/uts/common/io/signalfd.c b/usr/src/uts/common/io/signalfd.c
index 46d616dd79..4dce53e22c 100644
--- a/usr/src/uts/common/io/signalfd.c
+++ b/usr/src/uts/common/io/signalfd.c
@@ -107,6 +107,7 @@
#include <sys/schedctl.h>
#include <sys/id_space.h>
#include <sys/sdt.h>
+#include <sys/brand.h>
#include <sys/disp.h>
#include <sys/taskq_impl.h>
@@ -459,6 +460,9 @@ consume_signal(k_sigset_t set, uio_t *uio, boolean_t block)
lwp->lwp_extsig = 0;
mutex_exit(&p->p_lock);
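+	/*
+	 * Give a branded process the opportunity to translate the native
+	 * siginfo into its emulated form before it is copied out.
+	 */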
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_sigfd_translate)
+ BROP(p)->b_sigfd_translate(infop);
+
/* Convert k_siginfo into external, datamodel independent, struct. */
bzero(ssp, sizeof (*ssp));
ssp->ssi_signo = infop->si_signo;
diff --git a/usr/src/uts/common/io/simnet/simnet.c b/usr/src/uts/common/io/simnet/simnet.c
index 727fbbad8e..9bfe2fe7cf 100644
--- a/usr/src/uts/common/io/simnet/simnet.c
+++ b/usr/src/uts/common/io/simnet/simnet.c
@@ -21,6 +21,8 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -795,12 +797,6 @@ simnet_m_tx(void *arg, mblk_t *mp_chain)
continue;
}
- /* Fix mblk checksum as the pkt dest is local */
- if ((mp = mac_fix_cksum(mp)) == NULL) {
- sdev->sd_stats.xmit_errors++;
- continue;
- }
-
/* Hold reference for taskq receive processing per-pkt */
if (!simnet_thread_ref(sdev_rx)) {
freemsg(mp);
diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c
index ec76c6e2b9..55fd87db45 100644
--- a/usr/src/uts/common/io/stream.c
+++ b/usr/src/uts/common/io/stream.c
@@ -1451,6 +1451,16 @@ copyb(mblk_t *bp)
ndp = nbp->b_datap;
/*
+ * Copy the various checksum information that came in
+ * originally.
+ */
+ ndp->db_cksumstart = dp->db_cksumstart;
+ ndp->db_cksumend = dp->db_cksumend;
+ ndp->db_cksumstuff = dp->db_cksumstuff;
+ bcopy(dp->db_struioun.data, ndp->db_struioun.data,
+ sizeof (dp->db_struioun.data));
+
+ /*
* Well, here is a potential issue. If we are trying to
* trace a flow, and we copy the message, we might lose
* information about where this message might have been.
diff --git a/usr/src/uts/common/io/tl.c b/usr/src/uts/common/io/tl.c
index 03b93c6114..e77f33d31f 100644
--- a/usr/src/uts/common/io/tl.c
+++ b/usr/src/uts/common/io/tl.c
@@ -1419,8 +1419,9 @@ tl_closeok(tl_endpt_t *tep)
static int
tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
{
- tl_endpt_t *tep;
- minor_t minor = getminor(*devp);
+ tl_endpt_t *tep;
+ minor_t minor = getminor(*devp);
+ id_t inst_minor;
/*
* Driver is called directly. Both CLONEOPEN and MODOPEN
@@ -1440,6 +1441,14 @@ tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
minor |= TL_SOCKET;
}
+ /*
+ * Attempt to allocate a unique minor number for this instance.
+ * Avoid an uninterruptable sleep if none are available.
+ */
+ if ((inst_minor = id_alloc_nosleep(tl_minors)) == -1) {
+ return (ENOMEM);
+ }
+
tep = kmem_cache_alloc(tl_cache, KM_SLEEP);
tep->te_refcnt = 1;
tep->te_cpid = curproc->p_pid;
@@ -1451,9 +1460,7 @@ tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
tep->te_flag = minor & TL_MINOR_MASK;
tep->te_transport = &tl_transports[minor];
-
- /* Allocate a unique minor number for this instance. */
- tep->te_minor = (minor_t)id_alloc(tl_minors);
+ tep->te_minor = (minor_t)inst_minor;
/* Reserve hash handle for bind(). */
(void) mod_hash_reserve(tep->te_addrhash, &tep->te_hash_hndl);
diff --git a/usr/src/uts/common/io/usb/clients/hid/hid.c b/usr/src/uts/common/io/usb/clients/hid/hid.c
index 084fa7fedc..eccd48bf08 100644
--- a/usr/src/uts/common/io/usb/clients/hid/hid.c
+++ b/usr/src/uts/common/io/usb/clients/hid/hid.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
*/
@@ -139,6 +139,12 @@ static int hid_info(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int hid_attach(dev_info_t *, ddi_attach_cmd_t);
static int hid_detach(dev_info_t *, ddi_detach_cmd_t);
static int hid_power(dev_info_t *, int, int);
+/* These are to enable ugen support: */
+static int hid_chropen(dev_t *, int, int, cred_t *);
+static int hid_chrclose(dev_t, int, int, cred_t *);
+static int hid_read(dev_t, struct uio *, cred_t *);
+static int hid_write(dev_t, struct uio *, cred_t *);
+static int hid_poll(dev_t, short, int, short *, struct pollhead **);
/*
* Warlock is not aware of the automatic locking mechanisms for
@@ -198,18 +204,18 @@ struct streamtab hid_streamtab = {
};
struct cb_ops hid_cb_ops = {
- nulldev, /* open */
- nulldev, /* close */
+ hid_chropen, /* open */
+ hid_chrclose, /* close */
nulldev, /* strategy */
nulldev, /* print */
nulldev, /* dump */
- nulldev, /* read */
- nulldev, /* write */
+ hid_read, /* read */
+ hid_write, /* write */
nulldev, /* ioctl */
nulldev, /* devmap */
nulldev, /* mmap */
nulldev, /* segmap */
- nochpoll, /* poll */
+ hid_poll, /* poll */
ddi_prop_op, /* cb_prop_op */
&hid_streamtab, /* streamtab */
D_MP | D_MTPERQ
@@ -349,6 +355,7 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
usb_alt_if_data_t *altif_data;
char minor_name[HID_MINOR_NAME_LEN];
usb_ep_data_t *ep_data;
+ usb_ugen_info_t usb_ugen_info;
switch (cmd) {
case DDI_ATTACH:
@@ -491,6 +498,28 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
usb_free_dev_data(dip, dev_data);
hidp->hid_dev_data = NULL;
+ if (usb_owns_device(dip)) {
+ /* Get a ugen handle. */
+ bzero(&usb_ugen_info, sizeof (usb_ugen_info));
+
+ usb_ugen_info.usb_ugen_flags = 0;
+ usb_ugen_info.usb_ugen_minor_node_ugen_bits_mask =
+ (dev_t)HID_MINOR_UGEN_BITS_MASK;
+ usb_ugen_info.usb_ugen_minor_node_instance_mask =
+ (dev_t)HID_MINOR_INSTANCE_MASK;
+ hidp->hid_ugen_hdl = usb_ugen_get_hdl(dip, &usb_ugen_info);
+
+ if (usb_ugen_attach(hidp->hid_ugen_hdl, cmd) !=
+ USB_SUCCESS) {
+ USB_DPRINTF_L2(PRINT_MASK_ATTA,
+ hidp->hid_log_handle,
+ "usb_ugen_attach failed");
+
+ usb_ugen_release_hdl(hidp->hid_ugen_hdl);
+ hidp->hid_ugen_hdl = NULL;
+ }
+ }
+
/*
* Don't get the report descriptor if parsing hid descriptor earlier
* failed since device probably won't return valid report descriptor
@@ -769,6 +798,149 @@ hid_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
return (rval);
}
+static int
+hid_chropen(dev_t *devp, int flag, int sflag, cred_t *credp)
+{
+ int rval;
+ minor_t minor = getminor(*devp);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
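+	/*
+	 * Mark the device busy and bring it to full power before handing
+	 * the open off to ugen.
+	 */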
+ hid_pm_busy_component(hidp);
+ (void) pm_raise_power(hidp->hid_dip, 0, USB_DEV_OS_FULL_PWR);
+
+ mutex_enter(&hidp->hid_mutex);
+
+ rval = usb_ugen_open(hidp->hid_ugen_hdl, devp, flag,
+ sflag, credp);
+
+ mutex_exit(&hidp->hid_mutex);
+
+ if (rval != 0) {
+ hid_pm_idle_component(hidp);
+ }
+
+ return (rval);
+}
+
+static int
+hid_chrclose(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+ int rval;
+ minor_t minor = getminor(dev);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
+ mutex_enter(&hidp->hid_mutex);
+
+ rval = usb_ugen_close(hidp->hid_ugen_hdl, dev, flag,
+ otyp, credp);
+
+ mutex_exit(&hidp->hid_mutex);
+
+ if (rval == 0) {
+ hid_pm_idle_component(hidp);
+ }
+
+ return (rval);
+}
+
+static int
+hid_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int rval;
+ minor_t minor = getminor(dev);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
+ rval = usb_ugen_read(hidp->hid_ugen_hdl, dev, uiop, credp);
+
+ return (rval);
+}
+
+static int
+hid_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int rval;
+ minor_t minor = getminor(dev);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
+ rval = usb_ugen_write(hidp->hid_ugen_hdl, dev, uiop, credp);
+
+ return (rval);
+}
+
+static int
+hid_poll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ int rval;
+ minor_t minor = getminor(dev);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
+ rval = usb_ugen_poll(hidp->hid_ugen_hdl, dev, events, anyyet,
+ reventsp, phpp);
+
+ return (rval);
+}
+
/*
* hid_open :
* Open entry point: Opens the interrupt pipe. Sets up queues.
@@ -787,13 +959,21 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
hidp = ddi_get_soft_state(hid_statep, instance);
if (hidp == NULL) {
-
return (ENXIO);
}
USB_DPRINTF_L4(PRINT_MASK_OPEN, hidp->hid_log_handle,
"hid_open: Begin");
+ /*
+ * If this is a ugen device, return ENOSTR (no streams). This will
+	 * cause spec_open to try hid_chropen from our regular cb_ops instead
+ * (and thus treat us as a plain character device).
+ */
+ if (HID_IS_UGEN_OPEN(minor)) {
+ return (ENOSTR);
+ }
+
if (sflag) {
/* clone open NOT supported here */
return (ENXIO);
@@ -803,6 +983,8 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
return (EIO);
}
+ mutex_enter(&hidp->hid_mutex);
+
/*
* This is a workaround:
* Currently, if we open an already disconnected device, and send
@@ -812,7 +994,6 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
 * The consconfig_dacf module needs this interface to detect if the
 * device is already disconnected.
*/
- mutex_enter(&hidp->hid_mutex);
if (HID_IS_INTERNAL_OPEN(minor) &&
(hidp->hid_dev_state == USB_DEV_DISCONNECTED)) {
mutex_exit(&hidp->hid_mutex);
@@ -1688,6 +1869,11 @@ hid_cpr_suspend(hid_state_t *hidp)
}
mutex_exit(&hidp->hid_mutex);
+ if ((retval == USB_SUCCESS) && hidp->hid_ugen_hdl != NULL) {
+ retval = usb_ugen_detach(hidp->hid_ugen_hdl,
+ DDI_SUSPEND);
+ }
+
return (retval);
}
@@ -1699,6 +1885,10 @@ hid_cpr_resume(hid_state_t *hidp)
"hid_cpr_resume: dip=0x%p", (void *)hidp->hid_dip);
hid_restore_device_state(hidp->hid_dip, hidp);
+
+ if (hidp->hid_ugen_hdl != NULL) {
+ (void) usb_ugen_attach(hidp->hid_ugen_hdl, DDI_RESUME);
+ }
}
@@ -2136,6 +2326,12 @@ hid_detach_cleanup(dev_info_t *dip, hid_state_t *hidp)
hidp->hid_pm = NULL;
}
+ if (hidp->hid_ugen_hdl != NULL) {
+ rval = usb_ugen_detach(hidp->hid_ugen_hdl, DDI_DETACH);
+ VERIFY0(rval);
+ usb_ugen_release_hdl(hidp->hid_ugen_hdl);
+ }
+
mutex_exit(&hidp->hid_mutex);
if (hidp->hid_report_descr != NULL) {
diff --git a/usr/src/uts/common/io/usb/usba/genconsole.c b/usr/src/uts/common/io/usb/usba/genconsole.c
index 609c1d7456..5e48a2e672 100644
--- a/usr/src/uts/common/io/usb/usba/genconsole.c
+++ b/usr/src/uts/common/io/usb/usba/genconsole.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019, Joyent, Inc.
*/
/*
@@ -40,10 +41,8 @@
* layers to initialize any state information.
*/
int
-usb_console_input_init(dev_info_t *dip,
- usb_pipe_handle_t pipe_handle,
- uchar_t **state_buf,
- usb_console_info_t *console_input_info)
+usb_console_input_init(dev_info_t *dip, usb_pipe_handle_t pipe_handle,
+ uchar_t **state_buf, usb_console_info_t *console_input_info)
{
int ret;
usba_device_t *usba_device;
@@ -168,10 +167,8 @@ usb_console_input_enter(usb_console_info_t console_input_info)
/*
* Call the lower layer to save state information.
*/
- usba_device->usb_hcdi_ops->usba_hcdi_console_input_enter(
- usb_console_input);
-
- return (USB_SUCCESS);
+ return (usba_device->usb_hcdi_ops->usba_hcdi_console_input_enter(
+ usb_console_input));
}
@@ -235,10 +232,8 @@ usb_console_input_exit(usb_console_info_t console_input_info)
/*
* Restore the state information.
*/
- usba_device->usb_hcdi_ops->usba_hcdi_console_input_exit(
- usb_console_input);
-
- return (USB_SUCCESS);
+ return (usba_device->usb_hcdi_ops->usba_hcdi_console_input_exit(
+ usb_console_input));
}
/*
@@ -345,10 +340,8 @@ usb_console_output_enter(usb_console_info_t console_output_info)
/*
* Call the lower layer to save state information.
*/
- usb_device->usb_hcdi_ops->usba_hcdi_console_output_enter(
- usb_console_output);
-
- return (USB_SUCCESS);
+ return (usb_device->usb_hcdi_ops->usba_hcdi_console_output_enter(
+ usb_console_output));
}
/*
@@ -358,7 +351,7 @@ usb_console_output_enter(usb_console_info_t console_output_info)
*/
int
usb_console_write(usb_console_info_t console_output_info,
- uchar_t *buf, uint_t num_characters, uint_t *num_characters_written)
+ uchar_t *buf, uint_t num_characters, uint_t *num_characters_written)
{
usba_device_t *usb_device;
usb_console_info_impl_t *usb_console_output;
@@ -402,8 +395,6 @@ usb_console_output_exit(usb_console_info_t console_output_info)
/*
* Restore the state information.
*/
- usb_device->usb_hcdi_ops->usba_hcdi_console_output_exit(
- usb_console_output);
-
- return (USB_SUCCESS);
+ return (usb_device->usb_hcdi_ops->usba_hcdi_console_output_exit(
+ usb_console_output));
}
diff --git a/usr/src/uts/common/io/usb/usba/hubdi.c b/usr/src/uts/common/io/usb/usba/hubdi.c
index e3f3722de8..99d75edce3 100644
--- a/usr/src/uts/common/io/usb/usba/hubdi.c
+++ b/usr/src/uts/common/io/usb/usba/hubdi.c
@@ -22,7 +22,7 @@
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2019, Joyent, Inc.
*/
/*
@@ -1797,6 +1797,10 @@ usba_hubdi_power(dev_info_t *dip, int comp, int level)
retval = hubd_pwrlvl3(hubd);
break;
+ default:
+ retval = USB_FAILURE;
+
+ break;
}
mutex_exit(HUBD_MUTEX(hubd));
@@ -2133,11 +2137,11 @@ fail:
kmem_free(pathname, MAXPATHLEN);
}
- mutex_enter(HUBD_MUTEX(hubd));
- hubd_pm_idle_component(hubd, dip, 0);
- mutex_exit(HUBD_MUTEX(hubd));
+ if (hubd != NULL) {
+ mutex_enter(HUBD_MUTEX(hubd));
+ hubd_pm_idle_component(hubd, dip, 0);
+ mutex_exit(HUBD_MUTEX(hubd));
- if (hubd) {
rval = hubd_cleanup(dip, hubd);
if (rval != USB_SUCCESS) {
USB_DPRINTF_L2(DPRINT_MASK_ATTA, hubdi_log_handle,
@@ -2180,7 +2184,7 @@ usba_hubdi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
static int
hubd_setdevaddr(hubd_t *hubd, usb_port_t port)
{
- int rval;
+ int rval = USB_FAILURE;
usb_cr_t completion_reason;
usb_cb_flags_t cb_flags;
usb_pipe_handle_t ph;
@@ -2235,8 +2239,8 @@ hubd_setdevaddr(hubd_t *hubd, usb_port_t port)
for (retry = 0; retry < hubd_retry_enumerate; retry++) {
/* open child's default pipe with USBA_DEFAULT_ADDR */
- if (usb_pipe_open(child_dip, NULL, NULL,
- USB_FLAGS_SLEEP | USBA_FLAGS_PRIVILEGED, &ph) !=
+ if ((rval = usb_pipe_open(child_dip, NULL, NULL,
+ USB_FLAGS_SLEEP | USBA_FLAGS_PRIVILEGED, &ph)) !=
USB_SUCCESS) {
USB_DPRINTF_L2(DPRINT_MASK_ATTA, hubd->h_log_handle,
"hubd_setdevaddr: Unable to open default pipe");
@@ -6071,7 +6075,6 @@ hubd_ready_device(hubd_t *hubd, dev_info_t *child_dip, usba_device_t *child_ud,
return (child_dip);
}
-
/*
* hubd_create_child
* - create child dip
@@ -6480,6 +6483,8 @@ hubd_create_child(dev_info_t *dip,
goto fail_cleanup;
}
+ /* Read the BOS data */
+ usba_get_binary_object_store(child_dip, child_ud);
/* get the device string descriptor(s) */
usba_get_dev_string_descrs(child_dip, child_ud);
@@ -9198,7 +9203,7 @@ usba_hubdi_reset_device(dev_info_t *dip, usb_dev_reset_lvl_t reset_level)
usb_port_t port = 0;
dev_info_t *hdip;
usb_pipe_state_t prev_pipe_state = 0;
- usba_device_t *usba_device;
+ usba_device_t *usba_device = NULL;
hubd_reset_arg_t *arg;
int i, ph_open_cnt;
int rval = USB_FAILURE;
@@ -9372,6 +9377,7 @@ usba_hubdi_reset_device(dev_info_t *dip, usb_dev_reset_lvl_t reset_level)
== USB_SUCCESS) {
mutex_exit(HUBD_MUTEX(hubd));
/* re-open the default pipe */
+ ASSERT3P(usba_device, !=, NULL);
rval = usba_persistent_pipe_open(usba_device);
mutex_enter(HUBD_MUTEX(hubd));
if (rval != USB_SUCCESS) {
diff --git a/usr/src/uts/common/io/usb/usba/parser.c b/usr/src/uts/common/io/usb/usba/parser.c
index 965113374c..f81bcfdb39 100644
--- a/usr/src/uts/common/io/usb/usba/parser.c
+++ b/usr/src/uts/common/io/usb/usba/parser.c
@@ -23,6 +23,7 @@
* Use is subject to license terms.
*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2019, Joyent, Inc.
*/
@@ -45,16 +46,13 @@ extern usba_cfg_pwr_descr_t default_cfg_power;
extern usba_if_pwr_descr_t default_if_power;
size_t
-usb_parse_data(char *format,
- uchar_t *data,
- size_t datalen,
- void *structure,
- size_t structlen)
+usb_parse_data(char *format, const uchar_t *data, size_t datalen,
+ void *structure, size_t structlen)
{
int fmt;
int counter = 1;
int multiplier = 0;
- uchar_t *dataend = data + datalen;
+ const uchar_t *dataend = data + datalen;
char *structstart = (char *)structure;
void *structend = (void *)((intptr_t)structstart + structlen);
@@ -170,11 +168,8 @@ usb_parse_data(char *format,
size_t
-usb_parse_CV_descr(char *format,
- uchar_t *data,
- size_t datalen,
- void *structure,
- size_t structlen)
+usb_parse_CV_descr(char *format, const uchar_t *data, size_t datalen,
+ void *structure, size_t structlen)
{
return (usb_parse_data(format, data, datalen, structure,
structlen));
@@ -186,16 +181,12 @@ usb_parse_CV_descr(char *format,
* type descr_type, unless the end of the buffer or a descriptor
* of type stop_descr_type1 or stop_descr_type2 is encountered first.
*/
-static uchar_t *
-usb_nth_descr(uchar_t *buf,
- size_t buflen,
- int descr_type,
- uint_t n,
- int stop_descr_type1,
- int stop_descr_type2)
+static const uchar_t *
+usb_nth_descr(const uchar_t *buf, size_t buflen, int descr_type, uint_t n,
+ int stop_descr_type1, int stop_descr_type2)
{
- uchar_t *bufstart = buf;
- uchar_t *bufend = buf + buflen;
+ const uchar_t *bufstart = buf;
+ const uchar_t *bufend = buf + buflen;
if (buf == NULL) {
@@ -229,10 +220,8 @@ usb_nth_descr(uchar_t *buf,
size_t
-usb_parse_dev_descr(uchar_t *buf, /* from GET_DESCRIPTOR(DEVICE) */
- size_t buflen,
- usb_dev_descr_t *ret_descr,
- size_t ret_buf_len)
+usb_parse_dev_descr(const uchar_t *buf, size_t buflen,
+ usb_dev_descr_t *ret_descr, size_t ret_buf_len)
{
if ((buf == NULL) || (ret_descr == NULL) ||
(buflen < 2) || (buf[1] != USB_DESCR_TYPE_DEV)) {
@@ -246,10 +235,8 @@ usb_parse_dev_descr(uchar_t *buf, /* from GET_DESCRIPTOR(DEVICE) */
size_t
-usb_parse_cfg_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
- size_t buflen,
- usb_cfg_descr_t *ret_descr,
- size_t ret_buf_len)
+usb_parse_cfg_descr(const uchar_t *buf, size_t buflen,
+ usb_cfg_descr_t *ret_descr, size_t ret_buf_len)
{
if ((buf == NULL) || (ret_descr == NULL) ||
(buflen < 2) || (buf[1] != USB_DESCR_TYPE_CFG)) {
@@ -263,13 +250,10 @@ usb_parse_cfg_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t
-usba_parse_cfg_pwr_descr(
- uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
- size_t buflen,
- usba_cfg_pwr_descr_t *ret_descr,
- size_t ret_buf_len)
+usba_parse_cfg_pwr_descr(const uchar_t *buf, size_t buflen,
+ usba_cfg_pwr_descr_t *ret_descr, size_t ret_buf_len)
{
- uchar_t *bufend = buf + buflen;
+ const uchar_t *bufend = buf + buflen;
if ((buf == NULL) || (ret_descr == NULL)) {
@@ -298,13 +282,10 @@ usba_parse_cfg_pwr_descr(
size_t
-usb_parse_ia_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
- size_t buflen,
- size_t first_if,
- usb_ia_descr_t *ret_descr,
- size_t ret_buf_len)
+usb_parse_ia_descr(const uchar_t *buf, size_t buflen, size_t first_if,
+ usb_ia_descr_t *ret_descr, size_t ret_buf_len)
{
- uchar_t *bufend = buf + buflen;
+ const uchar_t *bufend = buf + buflen;
if ((buf == NULL) || (ret_descr == NULL)) {
@@ -332,14 +313,10 @@ usb_parse_ia_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t
-usb_parse_if_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
- size_t buflen,
- uint_t if_number,
- uint_t alt_if_setting,
- usb_if_descr_t *ret_descr,
- size_t ret_buf_len)
+usb_parse_if_descr(const uchar_t *buf, size_t buflen, uint_t if_number,
+ uint_t alt_if_setting, usb_if_descr_t *ret_descr, size_t ret_buf_len)
{
- uchar_t *bufend = buf + buflen;
+ const uchar_t *bufend = buf + buflen;
if ((buf == NULL) || (ret_descr == NULL)) {
@@ -367,14 +344,10 @@ usb_parse_if_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
}
size_t
-usba_parse_if_pwr_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
- size_t buflen,
- uint_t if_number,
- uint_t alt_if_setting,
- usba_if_pwr_descr_t *ret_descr,
- size_t ret_buf_len)
+usba_parse_if_pwr_descr(const uchar_t *buf, size_t buflen, uint_t if_number,
+ uint_t alt_if_setting, usba_if_pwr_descr_t *ret_descr, size_t ret_buf_len)
{
- uchar_t *bufend = buf + buflen;
+ const uchar_t *bufend = buf + buflen;
if ((buf == NULL) || (ret_descr == NULL)) {
@@ -422,15 +395,11 @@ usba_parse_if_pwr_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
* the first endpoint
*/
size_t
-usb_parse_ep_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
- size_t buflen,
- uint_t if_number,
- uint_t alt_if_setting,
- uint_t ep_index,
- usb_ep_descr_t *ret_descr,
- size_t ret_buf_len)
+usb_parse_ep_descr(const uchar_t *buf, size_t buflen, uint_t if_number,
+ uint_t alt_if_setting, uint_t ep_index, usb_ep_descr_t *ret_descr,
+ size_t ret_buf_len)
{
- uchar_t *bufend = buf + buflen;
+ const uchar_t *bufend = buf + buflen;
if ((buf == NULL) || (ret_descr == NULL)) {
@@ -473,14 +442,12 @@ usb_parse_ep_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
*/
/*ARGSUSED*/
size_t
-usba_ascii_string_descr(uchar_t *buf, /* from GET_DESCRIPTOR(STRING) */
- size_t buflen,
- char *ret_descr,
- size_t ret_buf_len)
+usba_ascii_string_descr(const uchar_t *buf, size_t buflen, char *ret_descr,
+ size_t ret_buf_len)
{
- int i = 1;
- char *retstart = ret_descr;
- uchar_t *bufend = buf + buflen;
+ int i = 1;
+ char *retstart = ret_descr;
+ const uchar_t *bufend = buf + buflen;
if ((buf == NULL) || (ret_descr == NULL) ||
(ret_buf_len == 0) || (buflen < 2) ||
@@ -501,15 +468,10 @@ usba_ascii_string_descr(uchar_t *buf, /* from GET_DESCRIPTOR(STRING) */
size_t
-usb_parse_CV_cfg_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
- size_t buflen,
- char *fmt,
- uint_t descr_type,
- uint_t descr_index,
- void *ret_descr,
- size_t ret_buf_len)
+usb_parse_CV_cfg_descr(const uchar_t *buf, size_t buflen, char *fmt,
+ uint_t descr_type, uint_t descr_index, void *ret_descr, size_t ret_buf_len)
{
- uchar_t *bufend = buf + buflen;
+ const uchar_t *bufend = buf + buflen;
if ((buf == NULL) || (ret_descr == NULL) || (fmt == NULL) ||
(buflen < 2) || ((buf = usb_nth_descr(buf, buflen, descr_type,
@@ -525,17 +487,11 @@ usb_parse_CV_cfg_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t
-usb_parse_CV_if_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
- size_t buflen,
- char *fmt,
- uint_t if_number,
- uint_t alt_if_setting,
- uint_t descr_type,
- uint_t descr_index,
- void *ret_descr,
- size_t ret_buf_len)
+usb_parse_CV_if_descr(const uchar_t *buf, size_t buflen, char *fmt,
+ uint_t if_number, uint_t alt_if_setting, uint_t descr_type,
+ uint_t descr_index, void *ret_descr, size_t ret_buf_len)
{
- uchar_t *bufend = buf + buflen;
+ const uchar_t *bufend = buf + buflen;
if ((buf == NULL) || (ret_descr == NULL) || (fmt == NULL)) {
@@ -570,18 +526,11 @@ usb_parse_CV_if_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t
-usb_parse_CV_ep_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
- size_t buflen,
- char *fmt,
- uint_t if_number,
- uint_t alt_if_setting,
- uint_t ep_index,
- uint_t descr_type,
- uint_t descr_index,
- void *ret_descr,
- size_t ret_buf_len)
+usb_parse_CV_ep_descr(const uchar_t *buf, size_t buflen, char *fmt,
+ uint_t if_number, uint_t alt_if_setting, uint_t ep_index, uint_t descr_type,
+ uint_t descr_index, void *ret_descr, size_t ret_buf_len)
{
- uchar_t *bufend = buf + buflen;
+ const uchar_t *bufend = buf + buflen;
if ((buf == NULL) || (ret_descr == NULL) || (fmt == NULL)) {
diff --git a/usr/src/uts/common/io/usb/usba/usba.c b/usr/src/uts/common/io/usb/usba/usba.c
index 7cc68e79df..6a37f8430a 100644
--- a/usr/src/uts/common/io/usb/usba/usba.c
+++ b/usr/src/uts/common/io/usb/usba/usba.c
@@ -24,6 +24,7 @@
*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
* Copyright 2016 James S. Blachly, MD <james.blachly@gmail.com>
+ * Copyright 2019 Joyent, Inc.
*/
@@ -776,6 +777,8 @@ usba_free_usba_device(usba_device_t *usba_device)
strlen(usba_device->usb_serialno_str) + 1);
}
+ usba_free_binary_object_store(usba_device);
+
usba_unset_usb_address(usba_device);
}
@@ -2262,6 +2265,17 @@ usba_ready_device_node(dev_info_t *child_dip)
}
}
+ if (usba_device->usb_port_status == USBA_FULL_SPEED_DEV) {
+ /* create boolean property */
+ rval = ndi_prop_create_boolean(DDI_DEV_T_NONE, child_dip,
+ "full-speed");
+ if (rval != DDI_PROP_SUCCESS) {
+ USB_DPRINTF_L2(DPRINT_MASK_USBA, usba_log_handle,
+ "usba_ready_device_node: "
+ "full speed prop update failed");
+ }
+ }
+
if (usba_device->usb_port_status == USBA_HIGH_SPEED_DEV) {
/* create boolean property */
rval = ndi_prop_create_boolean(DDI_DEV_T_NONE, child_dip,
@@ -2283,6 +2297,8 @@ usba_ready_device_node(dev_info_t *child_dip)
}
}
+ usba_add_binary_object_store_props(child_dip, usba_device);
+
USB_DPRINTF_L4(DPRINT_MASK_USBA, usba_log_handle,
"%s%d at port %d: %s, dip=0x%p",
ddi_node_name(ddi_get_parent(child_dip)),
@@ -2906,7 +2922,6 @@ usba_get_mfg_prod_sn_str(
return (buffer);
}
-
/*
* USB enumeration statistic functions
*/
diff --git a/usr/src/uts/common/io/usb/usba/usba10_calls.c b/usr/src/uts/common/io/usb/usba/usba10_calls.c
index 2bdcfd11c4..9fe39418e8 100644
--- a/usr/src/uts/common/io/usb/usba/usba10_calls.c
+++ b/usr/src/uts/common/io/usb/usba/usba10_calls.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019, Joyent, Inc.
*/
@@ -80,7 +81,7 @@ usba10_usb_free_descr_tree(
size_t
usba10_usb_parse_data(
char *format,
- uchar_t *data,
+ const uchar_t *data,
size_t datalen,
void *structure,
size_t structlen)
diff --git a/usr/src/uts/common/io/usb/usba/usba_bos.c b/usr/src/uts/common/io/usb/usba/usba_bos.c
new file mode 100644
index 0000000000..df8bd00680
--- /dev/null
+++ b/usr/src/uts/common/io/usb/usba/usba_bos.c
@@ -0,0 +1,420 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * Routines to access, parse, and manage the USB Binary Object Store
+ */
+
+#define USBA_FRAMEWORK
+#include <sys/usb/usba/usba_impl.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+
+static size_t
+usba_bos_parse_bos_descr(const uchar_t *buf, size_t buflen,
+ usb_bos_descr_t *bosp, size_t rlen)
+{
+ if (buf == NULL || bosp == NULL || buflen < USB_BOS_PACKED_SIZE ||
+ buf[1] != USB_DESCR_TYPE_BOS) {
+ return (USB_PARSE_ERROR);
+ }
+
+ return (usb_parse_data("ccsc", buf, buflen, bosp, rlen));
+}
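The "ccsc" string drives usb_parse_data(), whose format language (its implementation is touched in parser.c later in this diff, and it appears to support a numeric repeat prefix such as the "16c" used for the container UUID below) unpacks packed little-endian wire bytes into aligned structure fields. As a standalone illustration of what that unpacking amounts to for the five-byte BOS header, assuming only the field layout the USB 3.x specification gives this descriptor:

#include <stdint.h>
#include <stddef.h>

/*
 * Userland sketch of the "ccsc" unpacking: 'c' is one byte, 's' is a
 * little-endian 16-bit value. bos_descr_t mirrors the USB 3.x BOS
 * descriptor layout; this helper is illustrative, not the kernel code.
 */
typedef struct bos_descr {
	uint8_t		bLength;
	uint8_t		bDescriptorType;
	uint16_t	wTotalLength;
	uint8_t		bNumDeviceCaps;
} bos_descr_t;

static size_t
bos_unpack(const uint8_t *buf, size_t buflen, bos_descr_t *bp)
{
	if (buflen < 5)
		return (0);

	bp->bLength = buf[0];					/* c */
	bp->bDescriptorType = buf[1];				/* c */
	bp->wTotalLength = (uint16_t)(buf[2] | (buf[3] << 8));	/* s */
	bp->bNumDeviceCaps = buf[4];				/* c */

	return (5);	/* bytes consumed; cf. USB_BOS_PACKED_SIZE */
}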
+
+static boolean_t
+usba_bos_parse_usb2ext(const uchar_t *buf, size_t buflen, usb_bos_t *bosp)
+{
+ size_t len;
+
+ if (buflen != USB_BOS_USB2EXT_PACKED_SIZE) {
+ return (B_FALSE);
+ }
+
+ len = usb_parse_data("cccl", buf, buflen, &bosp->ubos_caps.ubos_usb2,
+ sizeof (usb_bos_usb2ext_t));
+ return (len == sizeof (usb_bos_usb2ext_t));
+}
+
+static boolean_t
+usba_bos_parse_superspeed(const uchar_t *buf, size_t buflen, usb_bos_t *bosp)
+{
+ size_t len;
+
+ if (buflen != USB_BOS_SSUSB_PACKED_SIZE) {
+ return (B_FALSE);
+ }
+
+ len = usb_parse_data("ccccsccs", buf, buflen,
+ &bosp->ubos_caps.ubos_ssusb, sizeof (usb_bos_ssusb_t));
+ return (len == sizeof (usb_bos_ssusb_t));
+}
+
+static boolean_t
+usba_bos_parse_container(const uchar_t *buf, size_t buflen, usb_bos_t *bosp)
+{
+ size_t len;
+
+ if (buflen != USB_BOS_CONTAINER_PACKED_SIZE) {
+ return (B_FALSE);
+ }
+
+ len = usb_parse_data("cccc16c", buf, buflen,
+ &bosp->ubos_caps.ubos_container, sizeof (usb_bos_container_t));
+ return (len == sizeof (usb_bos_container_t));
+}
+
+static boolean_t
+usba_bos_parse_precision_time(const uchar_t *buf, size_t buflen,
+ usb_bos_t *bosp)
+{
+ size_t len;
+
+ if (buflen != USB_BOS_PRECISION_TIME_PACKED_SIZE) {
+ return (B_FALSE);
+ }
+
+ len = usb_parse_data("ccc", buf, buflen, &bosp->ubos_caps.ubos_time,
+ sizeof (usb_bos_precision_time_t));
+ /*
+ * The actual size of this structure will usually be rounded up to four
+ * bytes by the compiler, therefore we need to compare against the
+ * packed size.
+ */
+ return (len == USB_BOS_PRECISION_TIME_PACKED_SIZE);
+}
+
+/*
+ * Validate that the BOS looks reasonable. This means the following:
+ *
+ * - We read the whole length of the descriptor
+ * - The total number of capabilities doesn't exceed the expected value
+ * - The length of each device capabilities fits within our expected range
+ *
+ * After we finish that up, go through and save all of the valid BOS
+ * descriptors, unpacking the ones that we actually understand.
+ */
+static boolean_t
+usba_bos_save(usba_device_t *ud, const mblk_t *mp, usb_bos_descr_t *bdesc)
+{
+ size_t len = MBLKL(mp);
+ const uchar_t *buf = mp->b_rptr;
+ uint_t ncaps, nalloc;
+ usb_bos_t *bos;
+
+ if (bdesc->bLength != USB_BOS_PACKED_SIZE ||
+ bdesc->bNumDeviceCaps == 0 || len < USB_BOS_PACKED_SIZE ||
+ len < bdesc->wTotalLength) {
+ return (B_FALSE);
+ }
+
+ len = MIN(len, bdesc->wTotalLength);
+ buf += USB_BOS_PACKED_SIZE;
+ len -= USB_BOS_PACKED_SIZE;
+
+ if (len < USB_DEV_CAP_PACKED_SIZE) {
+ return (B_FALSE);
+ }
+
+ ncaps = 0;
+ while (len > 0) {
+ usb_dev_cap_descr_t dev;
+
+ if (usb_parse_data("ccc", buf, len, &dev, sizeof (dev)) !=
+ USB_DEV_CAP_PACKED_SIZE) {
+ return (B_FALSE);
+ }
+
+ if (dev.bDescriptorType != USB_DESCR_TYPE_DEV_CAPABILITY ||
+ dev.bLength > len) {
+ return (B_FALSE);
+ }
+
+ ncaps++;
+ len -= dev.bLength;
+ buf += dev.bLength;
+ }
+
+ if (ncaps != bdesc->bNumDeviceCaps) {
+ return (B_FALSE);
+ }
+
+ nalloc = ncaps;
+ bos = kmem_zalloc(sizeof (usb_bos_t) * nalloc, KM_SLEEP);
+ buf = mp->b_rptr + USB_BOS_PACKED_SIZE;
+ len = MIN(MBLKL(mp), bdesc->wTotalLength) - USB_BOS_PACKED_SIZE;
+ ncaps = 0;
+ while (len > 0) {
+ usb_dev_cap_descr_t dev;
+ boolean_t valid;
+
+ if (usb_parse_data("ccc", buf, len, &dev, sizeof (dev)) !=
+ USB_DEV_CAP_PACKED_SIZE) {
+ goto fail;
+ }
+
+ bos[ncaps].ubos_length = dev.bLength;
+ bos[ncaps].ubos_type = dev.bDevCapabilityType;
+
+ valid = B_FALSE;
+ switch (dev.bDevCapabilityType) {
+ case USB_BOS_TYPE_USB2_EXT:
+ valid = usba_bos_parse_usb2ext(buf, dev.bLength,
+ &bos[ncaps]);
+ break;
+ case USB_BOS_TYPE_SUPERSPEED:
+ valid = usba_bos_parse_superspeed(buf, dev.bLength,
+ &bos[ncaps]);
+ break;
+ case USB_BOS_TYPE_CONTAINER:
+ valid = usba_bos_parse_container(buf, dev.bLength,
+ &bos[ncaps]);
+ break;
+ case USB_BOS_TYPE_PRECISION_TIME:
+ valid = usba_bos_parse_precision_time(buf, dev.bLength,
+ &bos[ncaps]);
+ break;
+ default:
+ /*
+			 * Override the type to one that we know isn't used,
+			 * to indicate that the caller can't rely on the type
+			 * that's present here.
+ */
+ bos[ncaps].ubos_type = USB_BOS_TYPE_INVALID;
+ bcopy(buf, bos[ncaps].ubos_caps.ubos_raw, dev.bLength);
+ valid = B_TRUE;
+ break;
+ }
+
+ if (valid) {
+ ncaps++;
+ } else {
+ bos[ncaps].ubos_length = 0;
+ bos[ncaps].ubos_type = USB_BOS_TYPE_INVALID;
+ bzero(bos[ncaps].ubos_caps.ubos_raw,
+ sizeof (bos[ncaps].ubos_caps.ubos_raw));
+ }
+ len -= dev.bLength;
+ buf += dev.bLength;
+ }
+
+ ud->usb_bos_nalloc = nalloc;
+ ud->usb_bos_nents = ncaps;
+ ud->usb_bos = bos;
+
+ return (B_TRUE);
+
+fail:
+ kmem_free(bos, sizeof (usb_bos_t) * nalloc);
+ return (B_FALSE);
+}
+
+/*
+ * Read the Binary Object Store (BOS) data from the device and attempt to parse
+ * it. Do not fail to attach the device if we cannot get all of the information
+ * at this time. While certain aspects of the BOS are required for Windows,
+ * which suggests that we could actually rely on it, we haven't historically.
+ */
+void
+usba_get_binary_object_store(dev_info_t *dip, usba_device_t *ud)
+{
+ int rval;
+ mblk_t *mp = NULL;
+ usb_cr_t completion_reason;
+ usb_cb_flags_t cb_flags;
+ usb_pipe_handle_t ph;
+ size_t size;
+ usb_bos_descr_t bos;
+
+ /*
+	 * The BOS is only supported on USB 3.x devices, so we only proceed when
+	 * the bcdUSB field reports a version greater than 2.0. Note that USB
+	 * 3.x devices operating over a USB 2.x link will report version 2.1 in
+	 * the bcdUSB field.
+ */
+ if (ud->usb_dev_descr->bcdUSB <= 0x200) {
+ return;
+ }
+
+ ph = usba_get_dflt_pipe_handle(dip);
+
+ /*
+ * First get just the BOS descriptor itself.
+ */
+ rval = usb_pipe_sync_ctrl_xfer(dip, ph,
+ USB_DEV_REQ_DEV_TO_HOST | USB_DEV_REQ_TYPE_STANDARD,
+ USB_REQ_GET_DESCR, /* bRequest */
+ (USB_DESCR_TYPE_BOS << 8), /* wValue */
+ 0, /* wIndex */
+ USB_BOS_PACKED_SIZE, /* wLength */
+ &mp, USB_ATTRS_SHORT_XFER_OK,
+ &completion_reason, &cb_flags, 0);
+
+ if (rval != USB_SUCCESS) {
+ return;
+ }
+
+ size = usba_bos_parse_bos_descr(mp->b_rptr, MBLKL(mp), &bos,
+ sizeof (bos));
+ freemsg(mp);
+ mp = NULL;
+ if (size < USB_BOS_PACKED_SIZE) {
+ return;
+ }
+
+ /*
+ * Check to see if there are any capabilities and if it's worth getting
+ * the whole BOS.
+ */
+ if (bos.bLength != USB_BOS_PACKED_SIZE || bos.bNumDeviceCaps == 0) {
+ return;
+ }
+
+ rval = usb_pipe_sync_ctrl_xfer(dip, ph,
+ USB_DEV_REQ_DEV_TO_HOST | USB_DEV_REQ_TYPE_STANDARD,
+ USB_REQ_GET_DESCR, /* bRequest */
+ (USB_DESCR_TYPE_BOS << 8), /* wValue */
+ 0, /* wIndex */
+ bos.wTotalLength, /* wLength */
+ &mp, USB_ATTRS_SHORT_XFER_OK,
+ &completion_reason, &cb_flags, 0);
+
+ if (rval != USB_SUCCESS) {
+ return;
+ }
+
+ size = usba_bos_parse_bos_descr(mp->b_rptr, MBLKL(mp), &bos,
+ sizeof (bos));
+ if (size < USB_BOS_PACKED_SIZE) {
+ freemsg(mp);
+ return;
+ }
+
+ if (!usba_bos_save(ud, mp, &bos)) {
+ freemsg(mp);
+ return;
+ }
+
+ ud->usb_bos_mp = mp;
+}
+
+static void
+usba_add_superspeed_props(dev_info_t *dip, usb_bos_ssusb_t *ssusb)
+{
+ char *supported[4];
+ uint_t nsup = 0;
+ char *min;
+
+ if (ssusb->wSpeedsSupported & USB_BOS_SSUSB_SPEED_LOW) {
+ supported[nsup++] = "low-speed";
+ }
+
+ if (ssusb->wSpeedsSupported & USB_BOS_SSUSB_SPEED_FULL) {
+ supported[nsup++] = "full-speed";
+ }
+
+ if (ssusb->wSpeedsSupported & USB_BOS_SSUSB_SPEED_HIGH) {
+ supported[nsup++] = "high-speed";
+ }
+
+ if (ssusb->wSpeedsSupported & USB_BOS_SSUSB_SPEED_SUPER) {
+ supported[nsup++] = "super-speed";
+ }
+
+ if (nsup != 0 && ndi_prop_update_string_array(DDI_DEV_T_NONE, dip,
+ "usb-supported-speeds", supported, nsup) != DDI_PROP_SUCCESS) {
+ USB_DPRINTF_L2(DPRINT_MASK_USBA, NULL, "failed to add "
+ "usb-supported-speeds property");
+ }
+
+ switch (ssusb->bFunctionalitySupport) {
+ case 0:
+ min = "low-speed";
+ break;
+ case 1:
+ min = "full-speed";
+ break;
+ case 2:
+ min = "high-speed";
+ break;
+ case 3:
+ min = "super-speed";
+ break;
+ default:
+ min = NULL;
+ }
+
+ if (min != NULL && ndi_prop_update_string(DDI_DEV_T_NONE, dip,
+ "usb-minimum-speed", min) != DDI_PROP_SUCCESS) {
+ USB_DPRINTF_L2(DPRINT_MASK_USBA, NULL, "failed to add "
+ "usb-minimum-speed property");
+ }
+}
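Once attached to the child dip, these properties are visible to userland through libdevinfo (or prtconf -v). A minimal observer sketch, assuming only the public libdevinfo API:

#include <libdevinfo.h>
#include <stdio.h>
#include <string.h>

/*
 * Print the usb-supported-speeds string array for every device node
 * that carries it. Sketch only; error handling is minimal.
 */
static int
show_speeds(di_node_t node, void *arg)
{
	char *strs, *drv;
	int nstr, i;

	nstr = di_prop_lookup_strings(DDI_DEV_T_ANY, node,
	    "usb-supported-speeds", &strs);
	if (nstr > 0) {
		drv = di_driver_name(node);
		(void) printf("%s%d:", drv != NULL ? drv : "?",
		    di_instance(node));
		for (i = 0; i < nstr; i++) {
			(void) printf(" %s", strs);
			strs += strlen(strs) + 1;
		}
		(void) printf("\n");
	}
	return (DI_WALK_CONTINUE);
}

int
main(void)
{
	di_node_t root = di_init("/", DINFOCPYALL);

	if (root == DI_NODE_NIL)
		return (1);
	(void) di_walk_node(root, DI_WALK_CLDFIRST, NULL, show_speeds);
	di_fini(root);
	return (0);
}

Built with -ldevinfo, this prints one line per node that advertises the property.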
+
+static void
+usba_add_container_props(dev_info_t *dip, usb_bos_container_t *cp)
+{
+ if (ndi_prop_update_byte_array(DDI_DEV_T_NONE, dip, "usb-container-id",
+ cp->ContainerId, sizeof (cp->ContainerId)) != DDI_PROP_SUCCESS) {
+ USB_DPRINTF_L2(DPRINT_MASK_USBA, NULL, "failed to add "
+ "usb-container-id property");
+ }
+}
+
+void
+usba_add_binary_object_store_props(dev_info_t *dip, usba_device_t *ud)
+{
+ uint_t i;
+
+ if (ud->usb_bos == NULL) {
+ return;
+ }
+
+ for (i = 0; i < ud->usb_bos_nents; i++) {
+ usb_bos_t *bos = &ud->usb_bos[i];
+
+ switch (bos->ubos_type) {
+ case USB_BOS_TYPE_SUPERSPEED:
+ usba_add_superspeed_props(dip,
+ &bos->ubos_caps.ubos_ssusb);
+ break;
+ case USB_BOS_TYPE_CONTAINER:
+ usba_add_container_props(dip,
+ &bos->ubos_caps.ubos_container);
+ break;
+ default:
+ /*
+ * This is a capability that we're not going to add
+ * devinfo properties to describe.
+ */
+ continue;
+ }
+ }
+}
+
+void
+usba_free_binary_object_store(usba_device_t *ud)
+{
+ if (ud->usb_bos_mp != NULL) {
+ freemsg(ud->usb_bos_mp);
+ ud->usb_bos_mp = NULL;
+ }
+
+ if (ud->usb_bos != NULL) {
+ kmem_free(ud->usb_bos, sizeof (usb_bos_t) * ud->usb_bos_nalloc);
+ ud->usb_bos = NULL;
+ ud->usb_bos_nalloc = ud->usb_bos_nents = 0;
+ }
+}
diff --git a/usr/src/uts/common/io/usb/usba/usba_devdb.c b/usr/src/uts/common/io/usb/usba/usba_devdb.c
index 4fd1748bf0..e3d14f90c6 100644
--- a/usr/src/uts/common/io/usb/usba/usba_devdb.c
+++ b/usr/src/uts/common/io/usb/usba/usba_devdb.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019, Joyent, Inc.
*/
@@ -140,13 +141,13 @@ usba_devdb_get_conf_rec(struct _buf *file, usba_configrec_t **rec)
token_t token;
char tokval[MAXPATHLEN];
usba_configrec_t *cfgrec;
- config_field_t cfgvar;
+ config_field_t cfgvar = USB_NONE;
u_longlong_t llptr;
u_longlong_t value;
enum {
USB_NEWVAR, USB_CONFIG_VAR, USB_VAR_EQUAL, USB_VAR_VALUE,
USB_ERROR
- } parse_state = USB_NEWVAR;
+ } parse_state = USB_NEWVAR;
cfgrec = (usba_configrec_t *)kmem_zalloc(
sizeof (usba_configrec_t), KM_SLEEP);
diff --git a/usr/src/uts/common/io/usb/usba/usba_ugen.c b/usr/src/uts/common/io/usb/usba/usba_ugen.c
index 745497f590..bcc658a001 100644
--- a/usr/src/uts/common/io/usb/usba/usba_ugen.c
+++ b/usr/src/uts/common/io/usb/usba/usba_ugen.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -409,11 +409,9 @@ usb_ugen_attach(usb_ugen_hdl_t usb_ugen_hdl, ddi_attach_cmd_t cmd)
return (DDI_SUCCESS);
fail:
- if (ugenp) {
- USB_DPRINTF_L2(UGEN_PRINT_ATTA, ugenp->ug_log_hdl,
- "attach fail");
- (void) ugen_cleanup(ugenp);
- }
+ USB_DPRINTF_L2(UGEN_PRINT_ATTA, ugenp->ug_log_hdl,
+ "attach fail");
+ (void) ugen_cleanup(ugenp);
return (DDI_FAILURE);
}
@@ -2545,7 +2543,7 @@ ugen_epx_ctrl_req(ugen_state_t *ugenp, ugen_ep_t *epp,
goto fail;
}
-done:
+
*wait = B_TRUE;
return (USB_SUCCESS);
diff --git a/usr/src/uts/common/io/usb/usba/usbai.c b/usr/src/uts/common/io/usb/usba/usbai.c
index 1ff8507ff1..f6ac391bd8 100644
--- a/usr/src/uts/common/io/usb/usba/usbai.c
+++ b/usr/src/uts/common/io/usb/usba/usbai.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019, Joyent, Inc.
*/
@@ -115,8 +116,8 @@ usba_usbai_destroy()
*/
usb_log_handle_t
usb_alloc_log_hdl(dev_info_t *dip, char *name,
- uint_t *errlevel, uint_t *mask, uint_t *instance_filter,
- usb_flags_t flags)
+ uint_t *errlevel, uint_t *mask, uint_t *instance_filter,
+ usb_flags_t flags)
{
usba_log_handle_impl_t *hdl;
@@ -147,8 +148,8 @@ usb_alloc_log_hdl(dev_info_t *dip, char *name,
/*ARGSUSED*/
usb_log_handle_t
usb_alloc_log_handle(dev_info_t *dip, char *name,
- uint_t *errlevel, uint_t *mask, uint_t *instance_filter,
- uint_t reserved, usb_flags_t flags)
+ uint_t *errlevel, uint_t *mask, uint_t *instance_filter,
+ uint_t reserved, usb_flags_t flags)
{
return (usb_alloc_log_hdl(dip, name, errlevel, mask,
instance_filter, flags));
@@ -215,7 +216,7 @@ static void
usb_vprintf(dev_info_t *dip, int level, char *label, char *fmt, va_list ap)
{
size_t len;
- int instance;
+ int instance = 0;
char driver_name[USBA_DRVNAME_LEN];
char *msg_ptr;
@@ -383,7 +384,7 @@ usb_vprintf(dev_info_t *dip, int level, char *label, char *fmt, va_list ap)
int
usba_vlog(usb_log_handle_t, uint_t, uint_t, char *, va_list)
- __KVPRINTFLIKE(4);
+ __KVPRINTFLIKE(4);
/* When usba10_calls.c goes away, this function can be made static again. */
int
@@ -579,7 +580,7 @@ usba_async_req_raise_power(void *arg)
/* usb function to perform async pm_request_power_change */
int
usb_req_raise_power(dev_info_t *dip, int comp, int level,
- void (*callback)(void *, int), void *arg, usb_flags_t flags)
+ void (*callback)(void *, int), void *arg, usb_flags_t flags)
{
usba_pm_req_t *pmrq;
@@ -633,7 +634,7 @@ usba_async_req_lower_power(void *arg)
/* usb function to perform async pm_request_power_change */
int
usb_req_lower_power(dev_info_t *dip, int comp, int level,
- void (*callback)(void *, int), void *arg, usb_flags_t flags)
+ void (*callback)(void *, int), void *arg, usb_flags_t flags)
{
usba_pm_req_t *pmrq;
@@ -1100,7 +1101,7 @@ usb_unregister_hotplug_cbs(dev_info_t *dip)
/*ARGSUSED*/
int
usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata,
- usb_flags_t flags)
+ usb_flags_t flags)
{
usba_device_t *usba_device;
usba_evdata_t *evdata;
diff --git a/usr/src/uts/common/io/usb/usba/usbai_pipe_mgmt.c b/usr/src/uts/common/io/usb/usba/usbai_pipe_mgmt.c
index 296fcab878..455774b1b4 100644
--- a/usr/src/uts/common/io/usb/usba/usbai_pipe_mgmt.c
+++ b/usr/src/uts/common/io/usb/usba/usbai_pipe_mgmt.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
@@ -540,7 +540,7 @@ usba_init_pipe_handle(dev_info_t *dip,
}
/* fix up the MaxPacketSize if it is the default endpoint descr */
- if ((ep == &usba_default_ep_descr) && usba_device) {
+ if (ep == &usba_default_ep_descr) {
uint16_t maxpktsize;
maxpktsize = usba_device->usb_dev_descr->bMaxPacketSize0;
diff --git a/usr/src/uts/common/io/usb/usba/usbai_register.c b/usr/src/uts/common/io/usb/usba/usbai_register.c
index 6d22a188df..8b75a7619b 100644
--- a/usr/src/uts/common/io/usb/usba/usbai_register.c
+++ b/usr/src/uts/common/io/usb/usba/usbai_register.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2019, Joyent, Inc.
*/
/*
@@ -1313,7 +1313,7 @@ usba_make_alts_sparse(usb_alt_if_data_t **array, uint_t *n_elements)
uint8_t largest_value;
uint8_t curr_value;
uint_t in_order = 0;
- usb_alt_if_data_t *orig_addr = *array; /* Non-sparse array base ptr */
+ usb_alt_if_data_t *orig_addr; /* Non-sparse array base ptr */
usb_alt_if_data_t *repl_array; /* Base ptr to sparse array */
uint_t n_repl_elements; /* Number elements in the new array */
uint_t i;
@@ -1328,6 +1328,7 @@ usba_make_alts_sparse(usb_alt_if_data_t **array, uint_t *n_elements)
"make_sparse: array=0x%p, n_orig_elements=%d",
(void *)array, n_orig_elements);
+ orig_addr = *array;
curr_value = orig_addr[0].altif_descr.bAlternateSetting;
smallest_value = largest_value = curr_value;
@@ -1635,7 +1636,7 @@ usba_dump_descr_tree(dev_info_t *dip, usb_client_dev_data_t *usb_reg,
usb_cfg_descr_t *config_descr; /* and its USB descriptor. */
char *string;
char *name_string = NULL;
- int name_string_size;
+ int name_string_size = 0;
if ((usb_reg == NULL) || ((log_handle == NULL) && (dip == NULL))) {
diff --git a/usr/src/uts/common/io/usb/usba/usbai_req.c b/usr/src/uts/common/io/usb/usba/usbai_req.c
index 4792d32efb..3a99185225 100644
--- a/usr/src/uts/common/io/usb/usba/usbai_req.c
+++ b/usr/src/uts/common/io/usb/usba/usbai_req.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
@@ -113,7 +114,7 @@ _usba_check_req(usba_pipe_handle_data_t *ph_data, usb_opaque_t req,
mblk_t *data;
usb_cr_t *cr;
usb_req_attrs_t attrs;
- usb_opaque_t cb, exc_cb;
+ usb_opaque_t cb = NULL, exc_cb = NULL;
uint_t timeout = 0;
uchar_t direction = ph_data->p_ep.bEndpointAddress &
USB_EP_DIR_MASK;
@@ -144,6 +145,8 @@ _usba_check_req(usba_pipe_handle_data_t *ph_data, usb_opaque_t req,
case USB_EP_ATTR_ISOCH:
cr = &isoc_req->isoc_completion_reason;
break;
+ default:
+ return (USB_INVALID_REQUEST);
}
*cr = USB_CR_UNSPECIFIED_ERR;
@@ -220,6 +223,8 @@ _usba_check_req(usba_pipe_handle_data_t *ph_data, usb_opaque_t req,
cb = (usb_opaque_t)isoc_req->isoc_cb;
exc_cb = (usb_opaque_t)isoc_req->isoc_exc_cb;
break;
+ default:
+ return (USB_INVALID_REQUEST);
}
USB_DPRINTF_L4(DPRINT_MASK_USBAI, usbai_log_handle,
diff --git a/usr/src/uts/common/io/usb/usba/usbai_util.c b/usr/src/uts/common/io/usb/usba/usbai_util.c
index dd942e35f2..58fbd472ae 100644
--- a/usr/src/uts/common/io/usb/usba/usbai_util.c
+++ b/usr/src/uts/common/io/usb/usba/usbai_util.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
@@ -301,7 +302,7 @@ usb_get_string_descr(dev_info_t *dip,
usba_get_dflt_pipe_handle(dip),
USB_DEV_REQ_DEV_TO_HOST,
USB_REQ_GET_DESCR,
- USB_DESCR_TYPE_STRING << 8 | index & 0xff,
+ (USB_DESCR_TYPE_STRING << 8) | (index & 0xff),
langid,
4,
&data, USB_ATTRS_SHORT_XFER_OK,
@@ -345,7 +346,7 @@ usb_get_string_descr(dev_info_t *dip,
usba_get_dflt_pipe_handle(dip),
USB_DEV_REQ_DEV_TO_HOST,
USB_REQ_GET_DESCR,
- USB_DESCR_TYPE_STRING << 8 | index & 0xff,
+ (USB_DESCR_TYPE_STRING << 8) | (index & 0xff),
langid,
length,
&data, USB_ATTRS_SHORT_XFER_OK,
@@ -2009,7 +2010,7 @@ usb_serialize_access(
usb_serialization_t tokenp, uint_t how_to_wait, uint_t delta_timeout)
{
int rval = 1; /* Must be initialized > 0 */
- clock_t abs_timeout;
+ clock_t abs_timeout = 0;
usba_serialization_impl_t *impl_tokenp;
impl_tokenp = (usba_serialization_impl_t *)tokenp;
diff --git a/usr/src/uts/common/io/usb/usba10/usba10.c b/usr/src/uts/common/io/usb/usba10/usba10.c
index 9c8b0bed49..0c8e4af630 100644
--- a/usr/src/uts/common/io/usb/usba10/usba10.c
+++ b/usr/src/uts/common/io/usb/usba10/usba10.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019, Joyent, Inc.
*/
@@ -107,7 +108,7 @@ usb_free_descr_tree(
size_t
usb_parse_data(
char *format,
- uchar_t *data,
+ const uchar_t *data,
size_t datalen,
void *structure,
size_t structlen)
diff --git a/usr/src/uts/common/io/vioif/vioif.c b/usr/src/uts/common/io/vioif/vioif.c
index d5dd1e8e39..ec6684f040 100644
--- a/usr/src/uts/common/io/vioif/vioif.c
+++ b/usr/src/uts/common/io/vioif/vioif.c
@@ -12,7 +12,7 @@
/*
* Copyright 2013 Nexenta Inc. All rights reserved.
* Copyright (c) 2014, 2016 by Delphix. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
/* Based on the NetBSD virtio driver by Minoura Makoto. */
@@ -60,7 +60,6 @@
#include <sys/dlpi.h>
#include <sys/taskq.h>
-#include <sys/cyclic.h>
#include <sys/pattr.h>
#include <sys/strsun.h>
@@ -216,6 +215,9 @@ static struct modlinkage modlinkage = {
},
};
+/* Interval for the periodic TX reclaim */
+uint_t vioif_reclaim_ms = 200;
+
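Since vioif_reclaim_ms is an ordinary global rather than a compile-time constant, the reclaim interval can be tuned without rebuilding the module; for example, a line of this shape in /etc/system would change it at boot (the value shown is illustrative, not a recommendation):

set vioif:vioif_reclaim_ms = 100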
ddi_device_acc_attr_t vioif_attr = {
DDI_DEVICE_ATTR_V0,
DDI_NEVERSWAP_ACC, /* virtio is always native byte order */
@@ -278,7 +280,11 @@ struct vioif_softc {
struct virtqueue *sc_tx_vq;
struct virtqueue *sc_ctrl_vq;
- unsigned int sc_tx_stopped:1;
+ /* TX virtqueue management resources */
+ kmutex_t sc_tx_lock;
+ boolean_t sc_tx_corked;
+ boolean_t sc_tx_drain;
+ timeout_id_t sc_tx_reclaim_tid;
/* Feature bits. */
unsigned int sc_rx_csum:1;
@@ -406,6 +412,8 @@ static char *vioif_priv_props[] = {
NULL
};
+static void vioif_reclaim_restart(struct vioif_softc *);
+
/* Add up to ddi? */
static ddi_dma_cookie_t *
vioif_dma_curr_cookie(ddi_dma_handle_t dmah)
@@ -707,27 +715,26 @@ exit_txalloc:
}
/* ARGSUSED */
-int
+static int
vioif_multicst(void *arg, boolean_t add, const uint8_t *macaddr)
{
return (DDI_SUCCESS);
}
/* ARGSUSED */
-int
+static int
vioif_promisc(void *arg, boolean_t on)
{
return (DDI_SUCCESS);
}
/* ARGSUSED */
-int
+static int
vioif_unicst(void *arg, const uint8_t *macaddr)
{
return (DDI_FAILURE);
}
-
static uint_t
vioif_add_rx(struct vioif_softc *sc, int kmflag)
{
@@ -902,23 +909,25 @@ static uint_t
vioif_reclaim_used_tx(struct vioif_softc *sc)
{
struct vq_entry *ve;
- struct vioif_tx_buf *buf;
uint32_t len;
- mblk_t *mp;
uint_t num_reclaimed = 0;
while ((ve = virtio_pull_chain(sc->sc_tx_vq, &len))) {
+ struct vioif_tx_buf *buf;
+ mblk_t *mp;
+
/* We don't chain descriptors for tx, so don't expect any. */
- ASSERT(!ve->qe_next);
+ ASSERT(ve->qe_next == NULL);
buf = &sc->sc_txbufs[ve->qe_index];
mp = buf->tb_mp;
buf->tb_mp = NULL;
if (mp != NULL) {
- for (int i = 0; i < buf->tb_external_num; i++)
+ for (uint_t i = 0; i < buf->tb_external_num; i++) {
(void) ddi_dma_unbind_handle(
buf->tb_external_mapping[i].vbm_dmah);
+ }
}
virtio_free_chain(ve);
@@ -929,14 +938,107 @@ vioif_reclaim_used_tx(struct vioif_softc *sc)
num_reclaimed++;
}
- if (sc->sc_tx_stopped && num_reclaimed > 0) {
- sc->sc_tx_stopped = 0;
- mac_tx_update(sc->sc_mac_handle);
+ /* Return ring to transmitting state if descriptors were reclaimed. */
+ if (num_reclaimed > 0) {
+ boolean_t do_update = B_FALSE;
+
+ mutex_enter(&sc->sc_tx_lock);
+ if (sc->sc_tx_corked) {
+ /*
+ * TX was corked on a lack of available descriptors.
+ * That dire state has passed so the TX interrupt can
+ * be disabled and MAC can be notified that
+ * transmission is possible again.
+ */
+ sc->sc_tx_corked = B_FALSE;
+ virtio_stop_vq_intr(sc->sc_tx_vq);
+ do_update = B_TRUE;
+ }
+ mutex_exit(&sc->sc_tx_lock);
+
+ /* Notify MAC outside the above lock */
+ if (do_update) {
+ mac_tx_update(sc->sc_mac_handle);
+ }
}
return (num_reclaimed);
}
+static void
+vioif_reclaim_periodic(void *arg)
+{
+ struct vioif_softc *sc = arg;
+ uint_t num_reclaimed;
+
+ num_reclaimed = vioif_reclaim_used_tx(sc);
+
+ mutex_enter(&sc->sc_tx_lock);
+ sc->sc_tx_reclaim_tid = 0;
+ /*
+ * If used descriptors were reclaimed or TX descriptors appear to be
+ * outstanding, the ring is considered active and periodic reclamation
+ * is necessary for now.
+ */
+ if (num_reclaimed != 0 || vq_num_used(sc->sc_tx_vq) != 0) {
+ /* Do not reschedule if the ring is being drained. */
+ if (!sc->sc_tx_drain) {
+ vioif_reclaim_restart(sc);
+ }
+ }
+ mutex_exit(&sc->sc_tx_lock);
+}
+
+static void
+vioif_reclaim_restart(struct vioif_softc *sc)
+{
+ ASSERT(MUTEX_HELD(&sc->sc_tx_lock));
+ ASSERT(!sc->sc_tx_drain);
+
+ if (sc->sc_tx_reclaim_tid == 0) {
+ sc->sc_tx_reclaim_tid = timeout(vioif_reclaim_periodic, sc,
+ MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
+ }
+}
+
+static void
+vioif_tx_drain(struct vioif_softc *sc)
+{
+ mutex_enter(&sc->sc_tx_lock);
+ sc->sc_tx_drain = B_TRUE;
+ /* Put a stop to the periodic reclaim if it is running */
+ if (sc->sc_tx_reclaim_tid != 0) {
+ timeout_id_t tid = sc->sc_tx_reclaim_tid;
+
+ /*
+ * With sc_tx_drain set, there is no risk that a racing
+ * vioif_reclaim_periodic() call will reschedule itself.
+ *
+ * Being part of the mc_stop hook also guarantees that
+ * vioif_tx() will not be called to restart it.
+ */
+ sc->sc_tx_reclaim_tid = 0;
+ mutex_exit(&sc->sc_tx_lock);
+ (void) untimeout(tid);
+ mutex_enter(&sc->sc_tx_lock);
+ }
+ virtio_stop_vq_intr(sc->sc_tx_vq);
+ mutex_exit(&sc->sc_tx_lock);
+
+ /*
+ * Wait for all of the TX descriptors to be processed by the host so
+ * they can be reclaimed.
+ */
+ while (vq_num_used(sc->sc_tx_vq) != 0) {
+ (void) vioif_reclaim_used_tx(sc);
+ delay(5);
+ }
+
+ VERIFY(!sc->sc_tx_corked);
+ VERIFY3U(sc->sc_tx_reclaim_tid, ==, 0);
+ VERIFY3U(vq_num_used(sc->sc_tx_vq), ==, 0);
+}
+
/* sc will be used to update stat counters. */
/* ARGSUSED */
static inline void
@@ -1178,28 +1280,60 @@ exit_tx_external:
return (B_TRUE);
}
-mblk_t *
+static mblk_t *
vioif_tx(void *arg, mblk_t *mp)
{
struct vioif_softc *sc = arg;
- mblk_t *nmp;
+ mblk_t *nmp;
+
+ /*
+ * Prior to attempting to send any more frames, do a reclaim to pick up
+ * any descriptors which have been processed by the host.
+ */
+ if (vq_num_used(sc->sc_tx_vq) != 0) {
+ (void) vioif_reclaim_used_tx(sc);
+ }
while (mp != NULL) {
nmp = mp->b_next;
mp->b_next = NULL;
if (!vioif_send(sc, mp)) {
- sc->sc_tx_stopped = 1;
+ /*
+ * If there are no descriptors available, try to
+ * reclaim some, allowing a retry of the send if some
+ * are found.
+ */
mp->b_next = nmp;
- break;
+ if (vioif_reclaim_used_tx(sc) != 0) {
+ continue;
+ }
+
+ /*
+ * Otherwise, enable the TX ring interrupt so that as
+ * soon as a descriptor becomes available, transmission
+ * can begin again. For safety, make sure the periodic
+ * reclaim is running as well.
+ */
+ mutex_enter(&sc->sc_tx_lock);
+ sc->sc_tx_corked = B_TRUE;
+ virtio_start_vq_intr(sc->sc_tx_vq);
+ vioif_reclaim_restart(sc);
+ mutex_exit(&sc->sc_tx_lock);
+ return (mp);
}
mp = nmp;
}
- return (mp);
+ /* Ensure the periodic reclaim has been started. */
+ mutex_enter(&sc->sc_tx_lock);
+ vioif_reclaim_restart(sc);
+ mutex_exit(&sc->sc_tx_lock);
+
+ return (NULL);
}
-int
+static int
vioif_start(void *arg)
{
struct vioif_softc *sc = arg;
@@ -1211,10 +1345,11 @@ vioif_start(void *arg)
virtio_start_vq_intr(sc->sc_rx_vq);
/*
- * Don't start interrupts on sc_tx_vq. We use VIRTIO_F_NOTIFY_ON_EMPTY,
- * so the device will send a transmit interrupt when the queue is empty
- * and we can reclaim it in one sweep.
+ * Starting interrupts on the TX virtqueue is unnecessary at this time.
+	 * Descriptor reclamation is handled during transmit, via a periodic
+ * timer, and when resources are tight, via the then-enabled interrupt.
*/
+ sc->sc_tx_drain = B_FALSE;
/*
* Clear any data that arrived early on the receive queue and populate
@@ -1228,15 +1363,17 @@ vioif_start(void *arg)
return (DDI_SUCCESS);
}
-void
+static void
vioif_stop(void *arg)
{
struct vioif_softc *sc = arg;
+ /* Ensure all TX descriptors have been processed and reclaimed */
+ vioif_tx_drain(sc);
+
virtio_stop_vq_intr(sc->sc_rx_vq);
}
-/* ARGSUSED */
static int
vioif_stat(void *arg, uint_t stat, uint64_t *val)
{
@@ -1519,8 +1656,7 @@ vioif_dev_features(struct vioif_softc *sc)
VIRTIO_NET_F_HOST_ECN |
VIRTIO_NET_F_MAC |
VIRTIO_NET_F_STATUS |
- VIRTIO_F_RING_INDIRECT_DESC |
- VIRTIO_F_NOTIFY_ON_EMPTY);
+ VIRTIO_F_RING_INDIRECT_DESC);
vioif_show_features(sc, "Host features: ", host_features);
vioif_show_features(sc, "Negotiated features: ",
@@ -1535,7 +1671,7 @@ vioif_dev_features(struct vioif_softc *sc)
return (DDI_SUCCESS);
}
-static int
+static boolean_t
vioif_has_feature(struct vioif_softc *sc, uint32_t feature)
{
return (virtio_has_feature(&sc->sc_virtio, feature));
@@ -1585,7 +1721,7 @@ vioif_get_mac(struct vioif_softc *sc)
* Virtqueue interrupt handlers
*/
/* ARGSUSED */
-uint_t
+static uint_t
vioif_rx_handler(caddr_t arg1, caddr_t arg2)
{
struct virtio_softc *vsc = (void *) arg1;
@@ -1604,7 +1740,7 @@ vioif_rx_handler(caddr_t arg1, caddr_t arg2)
}
/* ARGSUSED */
-uint_t
+static uint_t
vioif_tx_handler(caddr_t arg1, caddr_t arg2)
{
struct virtio_softc *vsc = (void *)arg1;
@@ -1612,9 +1748,8 @@ vioif_tx_handler(caddr_t arg1, caddr_t arg2)
struct vioif_softc, sc_virtio);
/*
- * The return value of this function is not needed but makes debugging
- * interrupts simpler because you can use it to detect if anything was
- * reclaimed in this handler.
+ * The TX interrupt could race with other reclamation activity, so
+ * interpreting the return value is unimportant.
*/
(void) vioif_reclaim_used_tx(sc);
@@ -1770,6 +1905,9 @@ vioif_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
goto exit_alloc2;
virtio_stop_vq_intr(sc->sc_tx_vq);
+ mutex_init(&sc->sc_tx_lock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(sc->sc_virtio.sc_intr_prio));
+
if (vioif_has_feature(sc, VIRTIO_NET_F_CTRL_VQ)) {
sc->sc_ctrl_vq = virtio_alloc_vq(&sc->sc_virtio, 2,
VIOIF_CTRL_QLEN, 0, "ctrl");
diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c
new file mode 100644
index 0000000000..198c14d4be
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/frameio.c
@@ -0,0 +1,465 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+/*
+ * Frame I/O utility functions
+ */
+
+#include <sys/frameio.h>
+
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+#include <sys/inttypes.h>
+
+static kmem_cache_t *frameio_cache;
+
+int
+frameio_init(void)
+{
+ frameio_cache = kmem_cache_create("frameio_cache",
+ sizeof (frameio_t) + sizeof (framevec_t) * FRAMEIO_NVECS_MAX,
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (frameio_cache == NULL)
+ return (1);
+
+ return (0);
+}
+
+void
+frameio_fini(void)
+{
+ if (frameio_cache != NULL)
+ kmem_cache_destroy(frameio_cache);
+}
+
+frameio_t *
+frameio_alloc(int kmflags)
+{
+ return (kmem_cache_alloc(frameio_cache, kmflags));
+}
+
+void
+frameio_free(frameio_t *fio)
+{
+ kmem_cache_free(frameio_cache, fio);
+}
+
+/*
+ * Ensure that we don't see any garbage in the framevecs that we're nominally
+ * supposed to work with. Specifically we want to make sure that the buflen and
+ * the address are not zero.
+ */
+static int
+frameio_hdr_check_vecs(frameio_t *fio)
+{
+ int i;
+ for (i = 0; i < fio->fio_nvecs; i++)
+ if (fio->fio_vecs[i].fv_buf == NULL ||
+ fio->fio_vecs[i].fv_buflen == 0)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * We have to copy in framevec32_t's. To work around the data model issues
+ * without copying the data twice, we first copy the framevec32_t data into
+ * the standard fio_vecs space. Next we work backwards, copying each
+ * framevec32_t to a temporary framevec_t and then overwriting the
+ * frameio_t's data in place. It is important to do this in reverse so that
+ * we don't clobber unread data, since the framevec_t is larger than the
+ * framevec32_t.
+ */
+static int
+frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr)
+{
+ framevec32_t *vec32p;
+ framevec_t fv;
+ int i;
+
+ vec32p = (framevec32_t *)&fio->fio_vecs[0];
+
+ if (ddi_copyin(addr, vec32p, sizeof (framevec32_t) * fio->fio_nvecs,
+ 0) != 0)
+ return (EFAULT);
+
+ for (i = fio->fio_nvecs - 1; i >= 0; i--) {
+ fv.fv_buf = (void *)(uintptr_t)vec32p[i].fv_buf;
+ fv.fv_buflen = vec32p[i].fv_buflen;
+ fv.fv_actlen = vec32p[i].fv_actlen;
+ fio->fio_vecs[i].fv_buf = fv.fv_buf;
+ fio->fio_vecs[i].fv_buflen = fv.fv_buflen;
+ fio->fio_vecs[i].fv_actlen = fv.fv_actlen;
+ }
+
+ return (frameio_hdr_check_vecs(fio));
+}
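The back-to-front expansion can be demonstrated in isolation. A standalone sketch, assuming nothing beyond standard C, of widening packed 32-bit entries in place inside a buffer sized for the wide type:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	NELEM	4

int
main(void)
{
	/* Buffer sized for the wide type, initially holding packed data. */
	uint64_t buf[NELEM];
	uint32_t val;
	int i;

	for (i = 0; i < NELEM; i++) {
		val = (uint32_t)(i + 1);
		memcpy((char *)buf + i * sizeof (uint32_t), &val,
		    sizeof (val));
	}

	/*
	 * Expand back-to-front: writing the wide slot at index i touches
	 * bytes at offset 8 * i and above, while every packed entry still
	 * to be read (indices below i) sits beneath offset 4 * i, so no
	 * unread data is ever clobbered. A forward walk would overwrite
	 * packed entries before they were read.
	 */
	for (i = NELEM - 1; i >= 0; i--) {
		memcpy(&val, (char *)buf + i * sizeof (uint32_t),
		    sizeof (val));
		buf[i] = val;
	}

	for (i = 0; i < NELEM; i++)
		(void) printf("%llu\n", (unsigned long long)buf[i]);

	return (0);
}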
+
+/*
+ * Copy in a frame io header into fio with space for up to nvecs. If the frameio
+ * contains more vectors than specified it will be ignored. mode should contain
+ * information about the datamodel.
+ */
+int
+frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode)
+{
+ int model = ddi_model_convert_from(mode & FMODELS);
+ int cpf = mode & FKIOCTL ? FKIOCTL : 0;
+ size_t fsize = model == DDI_MODEL_ILP32 ?
+ sizeof (frameio32_t) : sizeof (frameio_t);
+
+ /*
+ * The start of the header is the same in all data models for the
+	 * current version.
+ */
+ if (ddi_copyin(addr, fio, fsize, cpf) != 0)
+ return (EFAULT);
+
+ if (fio->fio_version != FRAMEIO_VERSION_ONE)
+ return (EINVAL);
+
+ if (fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0)
+ return (EINVAL);
+
+ if (fio->fio_nvpf == 0)
+ return (EINVAL);
+
+ if (fio->fio_nvecs % fio->fio_nvpf != 0)
+ return (EINVAL);
+
+ if (fio->fio_nvecs > max_vecs)
+ return (EOVERFLOW);
+
+ addr = (void *)((uintptr_t)addr + fsize);
+ if (model == DDI_MODEL_ILP32) {
+ if (cpf != 0)
+ return (EINVAL);
+ return (frameio_hdr_copyin_ilp32(fio, addr));
+ }
+
+ if (ddi_copyin(addr, &fio->fio_vecs[0],
+ sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0)
+ return (EFAULT);
+
+ return (frameio_hdr_check_vecs(fio));
+}
+
+static mblk_t *
+frameio_allocb(size_t sz)
+{
+ mblk_t *mp;
+
+ mp = allocb(sz, 0);
+ if (mp == NULL)
+ return (NULL);
+
+ mp->b_datap->db_type = M_DATA;
+ return (mp);
+}
+
+static int
+framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf)
+{
+ mblk_t *mp;
+ cpf = cpf != 0 ? FKIOCTL : 0;
+
+ mp = frameio_allocb(fv->fv_buflen);
+
+ if (mp == NULL) {
+ freemsg(mp);
+ return (EAGAIN);
+ }
+
+ if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen,
+ cpf) != 0) {
+ freemsg(mp);
+ return (EFAULT);
+ }
+
+ mp->b_wptr += fv->fv_buflen;
+ *mpp = mp;
+ return (0);
+}
+
+/*
+ * Read a set of frame vectors that make up a single message boundary and return
+ * that as a single message in *mpp that consists of multiple data parts.
+ */
+static int
+frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf)
+{
+ int nparts = fio->fio_nvpf;
+ int part, error;
+ mblk_t *mp;
+
+ *mpp = NULL;
+ cpf = cpf != 0 ? FKIOCTL : 0;
+
+ /*
+ * Construct the initial frame
+ */
+ for (part = 0; part < nparts; part++) {
+ error = framevec_mblk_read(fv, &mp, cpf);
+ if (error != 0) {
+ freemsg(*mpp);
+ return (error);
+ }
+
+ if (*mpp == NULL)
+ *mpp = mp;
+ else
+ linkb(*mpp, mp);
+ fv++;
+ }
+
+ return (0);
+}
+
+/*
+ * Read data from a series of frameio vectors into a message block chain. A
+ * given frameio request has a number of discrete messages divided into
+ * individual vectors based on fio->fio_nvpf. Each discrete message will
+ * be constructed into a message block chain pointed to by b_next.
+ *
+ * If we get an EAGAIN while trying to construct a given message block, what we
+ * return depends on what else we've done so far. If we have successfully
+ * completed at least one message, then we free the partially constructed
+ * message and return the completed ones. If no messages have been completed we
+ * return EAGAIN. If instead we encounter a different error, say EFAULT, then
+ * all of the fv_actlen values are undefined.
+ */
+int
+frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf)
+{
+ int error = ENOTSUP;
+ int nframes = fio->fio_nvecs / fio->fio_nvpf;
+ int frame;
+ framevec_t *fv;
+ mblk_t *mp, *bmp = NULL;
+
+ /*
+ * Protect against bogus kernel subsystems.
+ */
+ VERIFY(fio->fio_nvecs > 0);
+ VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0);
+
+ *mpp = NULL;
+ cpf = cpf != 0 ? FKIOCTL : 0;
+
+ fv = &fio->fio_vecs[0];
+ for (frame = 0; frame < nframes; frame++) {
+ error = frameio_mblk_read(fio, fv, &mp, cpf);
+ if (error != 0)
+ goto failed;
+
+ if (bmp != NULL)
+ bmp->b_next = mp;
+ else
+ *mpp = mp;
+ bmp = mp;
+ }
+
+ *nvecs = nframes;
+ return (0);
+failed:
+ /*
+ * On EAGAIN we've already taken care of making sure that we have no
+ * leftover messages, eg. they were never linked in.
+ */
+ if (error == EAGAIN) {
+ if (frame != 0)
+ error = 0;
+		if (nvecs != NULL)
+ *nvecs = frame;
+ ASSERT(*mpp != NULL);
+ } else {
+ for (mp = *mpp; mp != NULL; mp = bmp) {
+ bmp = mp->b_next;
+ freemsg(mp);
+ }
+ if (nvecs != NULL)
+ *nvecs = 0;
+ *mpp = NULL;
+ }
+ return (error);
+}
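A consumer of this contract has to distinguish partial success from hard failure. The following caller sketch is hypothetical (vnd_frames_in() and its arguments are invented for illustration; only the frameio_* and STREAMS calls are from this file or standard kernel interfaces):

/*
 * Hypothetical kernel caller: copy a frameio request in from userland,
 * build the message chain, then consume it. On a partial success,
 * frameio_mblk_chain_read() returns 0 with nread holding the number of
 * completed frames.
 */
static int
vnd_frames_in(const void *uaddr, uint_t mode)
{
	frameio_t *fio;
	mblk_t *chain = NULL, *mp;
	int nread = 0, err;

	fio = frameio_alloc(KM_SLEEP);
	err = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, uaddr, mode);
	if (err == 0) {
		err = frameio_mblk_chain_read(fio, &chain, &nread,
		    mode & FKIOCTL);
	}

	/* Consume or free the b_next-linked chain of discrete messages. */
	while ((mp = chain) != NULL) {
		chain = chain->b_next;
		mp->b_next = NULL;
		freemsg(mp);	/* a real consumer would send these down */
	}

	frameio_free(fio);
	return (err);
}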
+
+size_t
+frameio_frame_length(frameio_t *fio, framevec_t *fv)
+{
+ int i;
+ size_t len = 0;
+
+ for (i = 0; i < fio->fio_nvpf; i++, fv++)
+ len += fv->fv_buflen;
+
+ return (len);
+}
+
+/*
+ * Write a portion of an mblk to the current frame vector.
+ */
+static int
+framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff,
+ size_t foff, int cpf)
+{
+ ASSERT(len <= MBLKL(mp) - moff);
+ ASSERT(len <= fv->fv_buflen - fv->fv_actlen);
+ cpf = cpf != 0 ? FKIOCTL : 0;
+
+ if (ddi_copyout(mp->b_rptr + moff, (caddr_t)fv->fv_buf + foff, len,
+ cpf) != 0)
+ return (EFAULT);
+ fv->fv_actlen += len;
+
+ return (0);
+}
+
+/*
+ * Because copying this out to the user might fail we don't want to update the
+ * b_rptr in case we need to copy it out again.
+ */
+static int
+framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf)
+{
+ int err;
+ size_t msize, blksize, len, moff, foff;
+
+ msize = msgsize(mp);
+ if (msize > frameio_frame_length(fio, fv))
+ return (EOVERFLOW);
+
+ moff = 0;
+ foff = 0;
+ blksize = MBLKL(mp);
+ fv->fv_actlen = 0;
+ while (msize != 0) {
+ len = MIN(blksize, fv->fv_buflen - fv->fv_actlen);
+ err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf);
+ if (err != 0)
+ return (err);
+
+ msize -= len;
+ blksize -= len;
+ moff += len;
+ foff += len;
+
+ if (blksize == 0 && msize != 0) {
+ mp = mp->b_cont;
+ ASSERT(mp != NULL);
+ moff = 0;
+ blksize = MBLKL(mp);
+ }
+
+ if (fv->fv_buflen == fv->fv_actlen && msize != 0) {
+ fv++;
+ fv->fv_actlen = 0;
+ foff = 0;
+ }
+ }
+
+ return (0);
+}
+
+int
+frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map,
+ mblk_t *mp, int *nwrite, int cpf)
+{
+ int mcount = 0;
+ int ret = 0;
+
+ if (map != MAP_BLK_FRAME)
+ return (EINVAL);
+
+ while (mp != NULL && mcount < fio->fio_nvecs) {
+ ret = framevec_map_blk(fio, &fio->fio_vecs[mcount], mp, cpf);
+ if (ret != 0)
+ break;
+ mcount += fio->fio_nvpf;
+ mp = mp->b_next;
+ }
+
+ if (ret != 0 && mcount == 0) {
+ if (nwrite != NULL)
+ *nwrite = 0;
+ return (ret);
+ }
+
+ if (nwrite != NULL)
+ *nwrite = mcount / fio->fio_nvpf;
+
+ return (0);
+}
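+
+/*
+ * An illustrative sketch (not a caller from this file) of mapping a chain of
+ * received messages onto a caller-supplied frameio request; chain and
+ * nwritten are hypothetical locals and the cpf value of 0 assumes a copyout
+ * to userland:
+ *
+ *	int nwritten, err;
+ *
+ *	err = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, chain, &nwritten, 0);
+ *	if (err != 0)
+ *		return (err);		// no frame was mapped
+ *	// nwritten frames now have their fv_actlen entries filled in
+ */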
+
+/*
+ * Copy out nframes worth of frameio header data back to userland.
+ */
+int
+frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode)
+{
+ int i;
+ int model = ddi_model_convert_from(mode & FMODELS);
+ framevec32_t *vec32p;
+ framevec32_t f;
+
+ if (fio->fio_nvecs / fio->fio_nvpf < nframes)
+ return (EINVAL);
+
+ fio->fio_nvecs = nframes * fio->fio_nvpf;
+
+ if (model == DDI_MODEL_NONE) {
+ if (ddi_copyout(fio, addr,
+ sizeof (frameio_t) + fio->fio_nvecs * sizeof (framevec_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ return (0);
+ }
+
+ ASSERT(model == DDI_MODEL_ILP32);
+
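+	/*
+	 * Narrow each 64-bit framevec_t into a framevec32_t in place. This is
+	 * safe because the 32-bit entries are smaller and each source entry is
+	 * staged through a local copy before being written back, so a write
+	 * can only land on entries that have already been read.
+	 */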
+ vec32p = (framevec32_t *)&fio->fio_vecs[0];
+ for (i = 0; i < fio->fio_nvecs; i++) {
+ f.fv_buf = (caddr32_t)(uintptr_t)fio->fio_vecs[i].fv_buf;
+ if (fio->fio_vecs[i].fv_buflen > UINT_MAX ||
+ fio->fio_vecs[i].fv_actlen > UINT_MAX)
+ return (EOVERFLOW);
+ f.fv_buflen = fio->fio_vecs[i].fv_buflen;
+ f.fv_actlen = fio->fio_vecs[i].fv_actlen;
+ vec32p[i].fv_buf = f.fv_buf;
+ vec32p[i].fv_buflen = f.fv_buflen;
+ vec32p[i].fv_actlen = f.fv_actlen;
+ }
+
+ if (ddi_copyout(fio, addr,
+ sizeof (frameio32_t) + fio->fio_nvecs * sizeof (framevec32_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ return (0);
+}
+
+void
+frameio_mark_consumed(frameio_t *fio, int nframes)
+{
+ int i;
+
+ ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes);
+ for (i = 0; i < nframes * fio->fio_nvpf; i++)
+ fio->fio_vecs[i].fv_actlen = fio->fio_vecs[i].fv_buflen;
+}
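+
+/*
+ * A sketch of intended use (the caller here is hypothetical): once the first
+ * n frames of a request have been fully processed, a caller might mark them
+ * consumed and then copy the updated headers back out:
+ *
+ *	frameio_mark_consumed(fio, n);
+ *	err = frameio_hdr_copyout(fio, n, arg, mode);
+ */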
diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c
new file mode 100644
index 0000000000..d03c7ce4ec
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/vnd.c
@@ -0,0 +1,5857 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+/*
+ * vnd - virtual (machine) networking datapath
+ *
+ * vnd's purpose is to provide a highly performant data path for Layer 2
+ * network traffic and to exist side by side with an active IP netstack, each
+ * servicing different datalinks. vnd provides many of the same capabilities
+ * as the current TCP/IP stack does and some specific to layer two.
+ * Specifically:
+ *
+ * o Use of the DLD fastpath
+ * o Packet capture hooks
+ * o Ability to use hardware capabilities
+ * o Useful interfaces for handling multiple frames
+ *
+ * The following image shows where vnd fits into today's networking stack:
+ *
+ * +---------+----------+----------+
+ * | libdlpi | libvnd | libsocket|
+ * +---------+----------+----------+
+ * | · · VFS |
+ * | VFS · VFS +----------+
+ * | · | sockfs |
+ * +---------+----------+----------+
+ * | | VND | IP |
+ * | +----------+----------+
+ * | DLD/DLS |
+ * +-------------------------------+
+ * | MAC |
+ * +-------------------------------+
+ * | GLDv3 |
+ * +-------------------------------+
+ *
+ * -----------------------------------------
+ * A Tale of Two Devices - DDI Device Basics
+ * -----------------------------------------
+ *
+ * vnd presents itself to userland as a character device; however, it also is a
+ * STREAMS device so that it can interface with dld and the rest of the
+ * networking stack. Users never interface with the STREAMS devices directly;
+ * they are purely an implementation detail of vnd. Opening the STREAMS device
+ * requires kcred and as such userland cannot interact with it or push it onto
+ * the stream head.
+ *
+ * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every
+ * clone gets its own minor number; however, minor nodes are not created in the
+ * devices tree for these instances. In this state a user may do two different
+ * things. They may issue ioctls that affect global state or they may issue
+ * ioctls that try to attach it to a given datalink. Once a minor device has
+ * been attached to a datalink, all operations on it are scoped to that context,
+ * therefore subsequent global operations are not permitted.
+ *
+ * A given device can be linked into the /devices and /dev name space via a link
+ * ioctl. That ioctl causes a minor node to be created in /devices and then it
+ * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar
+ * to, but simpler than, IP's persistence mechanism.
+ *
+ * ---------------------
+ * Binding to a datalink
+ * ---------------------
+ *
+ * Datalinks are backed by the dld (datalink device) and dls (datalink services)
+ * drivers. These drivers provide a STREAMS device for datalinks on the system
+ * which are exposed through /dev/net. Userland generally manipulates datalinks
+ * through libdlpi. When an IP interface is being plumbed up what actually
+ * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink
+ * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may
+ * then negotiate with dld and dls to obtain access to various capabilities
+ * and fast paths via a series of STREAMS messages.
+ *
+ * In vnd, we do the same thing, but we leave our STREAMS module as an
+ * implementation detail of the system. We don't want users to be able to
+ * arbitrarily push the vnd STREAMS module onto any stream, so we explicitly
+ * require kcred to manipulate it. Thus, when a user issues a request to attach
+ * a datalink to a minor instance of the character device, that vnd minor
+ * instance itself does a layered open (ldi_open_by_name(9F)) of the specified
+ * datalink. vnd does that open using the passed in credentials from the ioctl,
+ * not kcred. This ensures that users who don't have permission to open the
+ * device cannot. Once that's been opened, we push on the vnd STREAMS module.
+ *
+ * Once the vnd STREAMS instance has been created for this device, eg. the
+ * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl
+ * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices.
+ * This association begins the STREAM device's initialization. We start up an
+ * asynchronous state machine that takes care of all the different aspects of
+ * plumbing up the device with dld and dls and enabling the MAC fast path. We
+ * need to guarantee to consumers of the character device that by the time their
+ * ioctl returns, the data path has been fully initialized.
+ *
+ * The state progression is fairly linear. There are two general steady states.
+ * The first is VND_S_ONLINE, which means that everything is jacked up and good
+ * to go. The alternative is VND_S_ZOMBIE, which means that the streams device
+ * encountered an error or we have finished tearing it down and the character
+ * device can clean it up. The following is our state progression and the
+ * meaning of each state:
+ *
+ * |
+ * |
+ * V
+ * +---------------+
+ * | VNS_S_INITIAL | This is our initial state. Every
+ * +---------------+ vnd STREAMS device starts here.
+ * | While in this state, only dlpi
+ * | M_PROTO and M_IOCTL messages can be
+ * | sent or received. All STREAMS based
+ * | data messages are dropped.
+ * | We transition out of this state by
+ * | sending a DL_INFO_REQ to obtain
+ * | information about the underlying
+ * | link.
+ * v
+ * +-----------------+
+ * +--<-| VNS_S_INFO_SENT | In this state, we verify and
+ * | +-----------------+ record information about the
+ * | | underlying device. If the device is
+ * | | not suitable, eg. not of type
+ * v | DL_ETHER, then we immediately
+ * | | become a ZOMBIE. To leave this
+ * | | state we request exclusive active
+ * | | access to the device via
+ * v | DL_EXCLUSIVE_REQ.
+ * | v
+ * | +----------------------+
+ * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether
+ * | +----------------------+ or not we were able to obtain
+ * | | | exclusive access to the device. If
+ * | | | we were not able to, then we leave,
+ * v | | as that means that something like
+ * | | | IP is already plumbed up on top of
+ * | | | the datalink. We leave this state
+ * | | | by progressing through to the
+ * | | | appropriate DLPI primitive, either
+ * v | | DLPI_ATTACH_REQ or DLPI_BIND_REQ
+ * | | | depending on the style of the
+ * | | | datalink.
+ * | | v
+ * | | +-------------------+
+ * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were
+ * | | +-------------------+ able to perform a standard DLPI
+ * | | | attach and if so, go ahead and
+ * v | | send a DLPI_BIND_REQ.
+ * | v v
+ * | +-------------------+
+ * +--<-| VNS_S_BIND_SENT | In this state we see the result of
+ * | +-------------------+ our attempt to bind to PPA 0 of the
+ * v | underlying device. Because we're
+ * | | trying to be a layer two datapath,
+ * | | the specific attachment point isn't
+ * | | too important as we're going to
+ * v | have to enable promiscuous mode. We
+ * | | transition out of this by sending
+ *            |            |             our first of four promiscuous mode
+ * | | requests.
+ * v v
+ * | +------------------------+
+ * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we
+ * | +------------------------+ were able to enable promiscuous
+ * | | mode at the physical level. We
+ * | | transition out of this by enabling
+ * | | multicast and broadcast promiscuous
+ * v | mode.
+ * | v
+ * | +--------------------------+
+ * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we
+ * | +--------------------------+ have enabled DL_PROMISC_MULTI and
+ *      v          |                       move onto the third promiscuous
+ * | | mode request.
+ * | v
+ * | +----------------------------+
+ * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we
+ * | +----------------------------+ enabled RX_ONLY promiscuous mode.
+ * | | We specifically do this as we don't
+ * v | want to receive our own traffic
+ * | | that we'll send out. We leave this
+ * | | state by enabling the final flag
+ * | | DL_PROMISC_FIXUPS.
+ * | v
+ * | +--------------------------+
+ * +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we
+ * | +--------------------------+ enabled FIXUP promiscuous mode.
+ * | | We specifically do this as we need
+ * v | to ensure that traffic which is
+ * | | received by being looped back to us
+ * | | correctly has checksums fixed. We
+ * | | leave this state by requesting the
+ * | | dld/dls capabilities that we can
+ * v | process.
+ * | v
+ * | +--------------------+
+ * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of
+ * | +--------------------+ capabilities that dld advertised
+ *      |      |                       and enable the ones that we
+ *      v      |                       currently support for use. See the
+ *      |      |                       section later on regarding capabilities
+ * | | for more information. We leave this
+ * | | state by sending an enable request.
+ * v v
+ * | +--------------------+
+ * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability
+ * | +--------------------+ initialization. Once finished, we
+ * | | transition to the next state. If
+ * v | the dld fast path is not available,
+ * | | we become a zombie.
+ * | v
+ * | +--------------+
+ * | | VNS_S_ONLINE | This is a vnd STREAMS device's
+ * | +--------------+ steady state. It will normally
+ * | | reside in this state while it is in
+ * | | active use. It will only transition
+ * v | to the next state when the STREAMS
+ * | | device is closed by the character
+ * | | device. In this state, all data
+ * | | flows over the dld fast path.
+ * | v
+ * | +---------------------+
+ * +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of
+ * | +---------------------+ disabling capabilities and
+ * | | flushing all data. At this point
+ * | | any additional data that we receive
+ * | | will be dropped. We leave this
+ * v | state by trying to remove multicast
+ * | | promiscuity.
+ * | |
+ * | v
+ * | +---------------------------------+
+ * +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have
+ * | +---------------------------------+ successfully removed multicast
+ * | | promiscuous mode. If we have
+ * | | failed, we still carry on but only
+ * | | warn. We leave this state by trying
+ * | | to disable SAP level promiscuous
+ * | | mode.
+ * | v
+ * | +---------------------------+
+ * +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have
+ * | +---------------------------+ successfully removed SAP level
+ * | | promiscuous mode. If we have
+ * | | failed, we still carry on but only
+ * | | warn. Note that we don't worry
+ * | | about either of
+ * | | DL_PROMISC_FIXUPS or
+ * | | DL_PROMISC_RX_ONLY. If these are
+ * | | the only two entries left, then we
+ *      |          | should not have anything that MAC is
+ * | | doing for us at this point,
+ * | | therefore it's safe for us to
+ * | | proceed to unbind, which is how we
+ * | | leave this state via a
+ * | v DL_UNBIND_REQ.
+ * | +-------------------+
+ * +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind
+ * | +-------------------+ request went. Regardless of its
+ * | | success, we always transition to
+ * | | a zombie state.
+ * | v
+ * | +--------------+
+ * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS
+ * +--------------+ device is waiting to finish being
+ * reaped. Because we have no more
+ * ways to receive data it should be
+ * safe to destroy all remaining data
+ * structures.
+ *
+ * If the stream association fails for any reason the state machine reaches
+ * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the
+ * STREAMS ioctl to the character device. That will fail the user ioctl and
+ * propagate the vnd_errno_t back to userland. If, on the other hand, the
+ * association succeeds, then the vnd STREAMS device will be fully plumbed up
+ * and ready to transmit and receive message blocks. Consumers will be able to
+ * start using the other cbops(9E) entry points once the attach has fully
+ * finished, which will occur after the original user attach ioctl to the
+ * character device returns.
+ *
+ * It's quite important that we end up sending the full series of STREAMS
+ * messages when tearing down. While it's tempting to say that we should just
+ * rely on the STREAMS device being closed to properly ensure that we have no
+ * more additional data, that's not sufficient due to our use of direct
+ * callbacks. DLS does not ensure that by the time we change the direct
+ * callback (vnd_mac_input) that all callers to it will have been quiesced.
+ * However, it does guarantee that if we disable promiscuous mode ourselves and
+ * we turn off the main data path via DL_UNBIND_REQ that it will work.
+ * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do
+ * it as part of tearing down the STREAMS device. This ensures that we'll
+ * quiesce all data before we destroy our data structures and thus we should
+ * eliminate the race in changing the data function.
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * There are several different devices and structures in the vnd driver. There
+ * is a per-netstack component, pieces related to the character device that
+ * consumers see, the internal STREAMS device state, and the data queues
+ * themselves. The following ASCII art picture describes their relationships and
+ * some of the major pieces of data that contain them. These are not exhaustive,
+ * e.g. synchronization primitives are left out.
+ *
+ * +----------------+ +-----------------+
+ * | global | | global |
+ * | device list | | netstack list |
+ * | vnd_dev_list | | vnd_nsd_list |
+ * +----------------+ +-----------------+
+ * | |
+ * | v
+ * | +-------------------+ +-------------------+
+ * | | per-netstack data | ---> | per-netstack data | --> ...
+ * | | vnd_pnsd_t | | vnd_pnsd_t |
+ * | | | +-------------------+
+ * | | |
+ *    |    |  netstackid_t   ---+----> Netstack ID
+ * | | vnd_pnsd_flags_t -+----> Status flags
+ * | | zoneid_t ---+----> Zone ID for this netstack
+ * | | hook_family_t ---+----> VND IPv4 Hooks
+ * | | hook_family_t ---+----> VND IPv6 Hooks
+ * | | list_t ----+ |
+ * | +------------+------+
+ * | |
+ * | v
+ * | +------------------+ +------------------+
+ * | | character device | ---> | character device | -> ...
+ * +---------->| vnd_dev_t | | vnd_dev_t |
+ * | | +------------------+
+ * | |
+ * | minor_t ---+--> device minor number
+ * | ldi_handle_t ---+--> handle to /dev/net/%datalink
+ * | vnd_dev_flags_t -+--> device flags, non blocking, etc.
+ * | char[] ---+--> name if linked
+ * | vnd_str_t * -+ |
+ * +--------------+---+
+ * |
+ * v
+ * +-------------------------+
+ * | STREAMS device |
+ * | vnd_str_t |
+ * | |
+ * | vnd_str_state_t ---+---> State machine state
+ * | gsqueue_t * ---+---> mblk_t Serialization queue
+ * | vnd_str_stat_t ---+---> per-device kstats
+ * | vnd_str_capab_t ---+----------------------------+
+ * | vnd_data_queue_t ---+ | |
+ * | vnd_data_queue_t -+ | | v
+ * +-------------------+-+---+ +---------------------+
+ * | | | Stream capabilities |
+ * | | | vnd_str_capab_t |
+ * | | | |
+ * | | supported caps <--+-- vnd_capab_flags_t |
+ * | | dld cap handle <--+-- void * |
+ * | | direct tx func <--+-- vnd_dld_tx_t |
+ * | | +---------------------+
+ * | |
+ * +----------------+ +-------------+
+ * | |
+ * v v
+ * +-------------------+ +-------------------+
+ * | Read data queue | | Write data queue |
+ * | vnd_data_queue_t | | vnd_data_queue_t |
+ * | | | |
+ * | size_t ----+--> Current size | size_t ----+--> Current size
+ * | size_t ----+--> Max size | size_t ----+--> Max size
+ * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head
+ * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail
+ * +-------------------+ +-------------------+
+ *
+ *
+ * Globally, we maintain two lists. One list contains all of the character
+ * device soft states. The other maintains a list of all our netstack soft
+ * states. Each netstack maintains a list of active devices that have been
+ * associated with a datalink in its netstack.
+ *
+ * Recall that a given minor instance of the character device exists in one of
+ * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node,
+ * or it can be associated with a given datalink. When minor instances are in
+ * the former state, they do not exist in a given vnd_pnsd_t's list of devices.
+ * As part of attaching to a datalink, the given vnd_dev_t will be inserted into
+ * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a
+ * vnd_str_t, to be created and associated to a vnd_dev_t.
+ *
+ * The character device, and its vnd_dev_t, is the interface to the rest of the
+ * system. The vnd_dev_t keeps track of various aspects like whether various
+ * operations, such as read, write and the frameio ioctls, are considered
+ * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for
+ * keeping track of things like the name of the device, if any, in /dev. The
+ * vnd_str_t, on the other hand, manages aspects like buffer sizes and the
+ * actual
+ * data queues. However, ioctls that manipulate these properties all go through
+ * the vnd_dev_t to its associated vnd_str_t.
+ *
+ * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One
+ * for frames to transmit (write queue) and one for frames received (read
+ * queue). These data queues have a maximum size and attempting to add data
+ * beyond that maximum size will result in data being dropped. The sizes are
+ * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits
+ * in those buffers or has a reservation in those buffers while they are in vnd
+ * and waiting to be consumed by the user or by mac.
+ *
+ * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the
+ * available, negotiated, and currently active features.
+ *
+ * ----------------------
+ * Data Path and gsqueues
+ * ----------------------
+ *
+ * There's a lot of plumbing in vnd to get to the point where we can send data,
+ * but vnd's bread and butter is the data path, so it's worth diving into it in
+ * more detail. Data enters and exits the system from two ends.
+ *
+ * The first end is the vnd consumer. This comes in the form of read and write
+ * system calls as well as the frame I/O ioctls. The read and write system calls
+ * operate on a single frame at a time. Think of a frame as a single message
+ * that has come in off the wire, which may itself comprise multiple mblk_t's
+ * linked together in the kernel. readv(2) and writev(2) have the same
+ * limitations as read(2) and write(2). We enforce this as the system is
+ * required to fill up every uio(9S) buffer before moving onto the next one.
+ * This means that if you have a MTU sized buffer and two frames come in which
+ * are less than half of the MTU they must fill up the given iovec. Even if we
+ * didn't want to do this, we have no way of informing the supplier of the
+ * iovecs that they were only partially filled or where one frame ends and
+ * another begins. That's life; as such, we have frame I/O, which solves this
+ * problem. It allows for multiple frames to be consumed as well as for frames
+ * to be broken down into multiple vector components.
+ *
+ * The second end is the mac direct calls. As part of negotiating capabilities
+ * via dld, we give mac a function of ours to call when packets are received
+ * [vnd_mac_input()] and a callback to indicate that flow has been restored
+ * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can
+ * transmit data with. As part of the contract with mac, mac is allowed to flow
+ * control us by returning a cookie from the transmit function. When that
+ * happens,
+ * all outbound traffic is halted until our callback function is called and we
+ * can schedule drains.
+ *
+ * It's worth looking at these in further detail. We'll start with the rx path.
+ *
+ *
+ * |
+ * * . . . packets from gld
+ * |
+ * v
+ * +-------------+
+ * | mac |
+ * +-------------+
+ * |
+ * v
+ * +-------------+
+ * | dld |
+ * +-------------+
+ * |
+ * * . . . dld direct callback
+ * |
+ * v
+ * +---------------+
+ * | vnd_mac_input |
+ * +---------------+
+ * |
+ * v
+ * +---------+ +-------------+
+ * | dropped |<--*---------| vnd_hooks |
+ * | by | . +-------------+
+ * | hooks | . drop probe |
+ * +---------+ kstat bump * . . . Do we have free
+ * | buffer space?
+ * |
+ * no . | . yes
+ * . + .
+ * +---*--+------*-------+
+ * | |
+ * * . . drop probe * . . recv probe
+ * | kstat bump | kstat bump
+ * v |
+ * +---------+ * . . fire pollin
+ * | freemsg | v
+ * +---------+ +-----------------------+
+ * | vnd_str_t`vns_dq_read |
+ * +-----------------------+
+ * ^ ^
+ * +----------+ | | +---------+
+ * | read(9E) |-->-+ +--<--| frameio |
+ * +----------+ +---------+
+ *
+ * The rx path is rather linear. Packets come into us from mac. We always run
+ * them through the various hooks, and if they come out of that, we inspect the
+ * read data queue. If there is not enough space for a packet, we drop it.
+ * Otherwise, we append it to the data queue, and fire read notifications
+ * targeting anyone polling or doing blocking I/O on this device. Those
+ * consumers then drain the head of the data queue.
+ *
+ * The tx path is more complicated due to mac flow control. After any call into
+ * mac, we may have to potentially suspend writes and buffer data for an
+ * arbitrary amount of time. As such, we need to carefully track the total
+ * amount of outstanding data so that we don't waste kernel memory. This is
+ * further complicated by the fact that mac will asynchronously tell us when our
+ * flow has been resumed.
+ *
+ * For data to be able to enter the system, it needs to be able to take a
+ * reservation from the write data queue. Once the reservation has been
+ * obtained, we enter the gsqueue so that we can actually append it. We use
+ * gsqueues (serialization queues) to ensure that packets are manipulated in
+ * order as we deal with draining and appending packets. We also leverage its
+ * worker thread to help us do draining after mac has restored our flow.
+ *
+ * The following image describes the flow:
+ *
+ * +-----------+ +--------------+ +-------------------------+ +------+
+ * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done |
+ * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+
+ * +-----------+ +--------------+ . +-------------------------+
+ * | ^ .
+ * | | . reserve space from gsqueue
+ * | | |
+ * queue . . . * | space v
+ * full | * . . . avail +------------------------+
+ * v | | vnd_squeue_tx_append() |
+ * +--------+ +------------+ +------------------------+
+ * | EAGAIN |<--*------| Non-block? |<-+ |
+ * +--------+ . +------------+ | v
+ * . yes v | wait +--------------+
+ * no . .* * . . for | append chain |
+ * +----+ space | to outgoing |
+ * | mblk chain |
+ * from gsqueue +--------------+
+ * | |
+ * | +-------------------------------------------------+
+ * | |
+ * | | yes . . .
+ * v v .
+ * +-----------------------+ +--------------+ . +------+
+ * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done |
+ * +-----------------------+ +--------------+ +------+
+ * | |
+ * +---------------------------------|---------------------+
+ * | | tx |
+ * | no . . * queue . . *
+ * | flow controlled . | empty * . fire pollout
+ * | . v | if mblk_t's
+ * +-------------+ . +---------------------+ | sent
+ * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+
+ * | flags | +---------------------+ |
+ * +-------------+ More data | | | More data |
+ * and limit ^ v * . . and limit ^
+ * not reached . . * | | reached |
+ * +----+ | |
+ * v |
+ * +----------+ +-------------+ +---------------------------+
+ * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with |
+ * | control | | block flags | | vnd_squeue_tx_drain() and |
+ * | callback | +-------------+ | GSQUEUE_FILL flag, iff |
+ * +----------+ | not already scheduled |
+ * +---------------------------+
+ *
+ * The final path taken for a given write(9E)/frameio ioctl depends on whether
+ * or not the vnd_dev_t is non-blocking. That controls the initial path of
+ * trying to take a reservation in write data queue. If the device is in
+ * non-blocking mode, we'll return EAGAIN when there is not enough space
+ * available, otherwise, the calling thread blocks on the data queue.
+ *
+ * Today when we call into vnd_squeue_tx_drain() we will not try to drain the
+ * entire queue, as that could be quite large and we don't necessarily want to
+ * keep the thread that's doing the drain occupied until it has finished. Not
+ * only could more data be coming in, but the draining thread could be a
+ * userland thread that has more work to do. We have two limits today. There is
+ * an upper
+ * bound on the total amount of data and the total number of mblk_t chains. If
+ * we hit either limit, then we will schedule another drain in the gsqueue and
+ * go from there.
+ *
+ * It's worth taking some time to describe how we interact with gsqueues. vnd
+ * has a gsqueue_set_t for itself. It's important that it has its own set, as
+ * the profile of work that vnd does is different from other sub-systems in the
+ * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue.
+ * Unlike TCP/IP, which uses a gsqueue per TCP connection, we end up
+ * maintaining one for a given device. Because of that, we want to use a
+ * pseudo-random one to try and spread out the load, and picking one at random
+ * is likely to be just as good as any fancy algorithm we might come up with,
+ * especially as any two devices could have radically different transmit
+ * profiles.
+ *
+ * While some of the write path may seem complicated, it does allow us to
+ * maintain an important property. Once we have acknowledged a write(9E) or
+ * frameio ioctl, we will not drop the packet, excepting something like ipf via
+ * the firewall hooks.
+ *
+ * There is one other source of flow control that can exist in the system which
+ * is in the form of a barrier. The barrier is an internal mechanism used for
+ * ensuring that a gsqueue is drained for a given device. We use this as part
+ * of tearing down. Specifically we disable the write path so nothing new can be
+ * inserted into the gsqueue and then insert a barrier block. Once the barrier
+ * block comes out of the gsqueue, we know that nothing remains in the gsqueue
+ * that could refer to the vnd_str_t being destroyed.
+ *
+ * ---------------------
+ * vnd, zones, netstacks
+ * ---------------------
+ *
+ * vnd devices are scoped to datalinks and datalinks are scoped to a netstack.
+ * Because of that, vnd is also a netstack module. It registers with the
+ * netstack sub-system and receives callbacks every time a netstack is created,
+ * shut down, and destroyed. The netstack callbacks drive the creation and
+ * destruction of the vnd_pnsd_t structures.
+ *
+ * Recall from the earlier architecture diagrams that every vnd device is scoped
+ * to a netstack and known about by a given vnd_pnsd_t. When that netstack is
+ * torn down, we also tear down any vnd devices that are hanging around. When
+ * the netstack is torn down, we know that any zones that are scoped to that
+ * netstack are being shut down and have no processes remaining. This is going
+ * to be the case whether they are shared or exclusive stack zones. We have to
+ * perform a careful dance.
+ *
+ * There are two different callbacks that happen on tear down, the first is a
+ * shutdown callback, the second is a destroy callback. When the shutdown
+ * callback is fired we need to prepare for the netstack to go away and ensure
+ * that nothing can continue to persist itself.
+ *
+ * More specifically, when we get notice of a stack being shutdown we first
+ * remove the netstack from the global netstack list to ensure that no one new
+ * can come in and find the netstack and get a reference to it. After that, we
+ * notify the neti hooks that they're going away. Once that's all done, we get
+ * to the heart of the matter.
+ *
+ * When shutting down there could be any number of outstanding contexts that
+ * have a reference on the vnd_pnsd_t and on the individual links. However, we
+ * know that no one new will be able to find the vnd_pnsd_t. To account for
+ * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with
+ * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device
+ * to the netstack's list. If this is set, then they must not append to it.
+ * Once this is set, we know that the netstack's list of devices can never grow,
+ * only shrink.
+ *
+ * Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that
+ * the container for the device is being destroyed and that we should not allow
+ * additional references to the device to be created, whether via open, or
+ * linking. The presence of this bit also allows things like the list ioctl and
+ * sdev to know not to consider its existence. At the conclusion of this being
+ * set, we know that no one else should be able to obtain a new reference to the
+ * device.
+ *
+ * Once that has been set for all devices, we go through and remove any existing
+ * links that have been established in sdev. Because doing that may cause the
+ * final reference on the device, which still holds a reference to the
+ * netstack, to be dropped, we have to restart our walk due to dropped locks.
+ * We know that
+ * this walk will eventually complete because the device cannot be relinked and
+ * no new devices will be attached in this netstack due to VND_NS_CONDEMNED.
+ * Once that's finished, the shutdown callback returns.
+ *
+ * When we reach the destroy callback, we simply wait for references on the
+ * netstack to disappear. Because the zone has been shut down, all processes in
+ * it that have open references have been terminated and reaped. Any threads
+ * that are newly trying to reference it will fail. However, there is one thing
+ * that can halt this that we have no control over, which is the global zone
+ * holding open a reference to the device. In this case the zone halt will hang
+ * in vnd_stack_destroy. Once the last reference is dropped, we finish
+ * destroying the netinfo hooks and free the vnd_pnsd_t.
+ *
+ * ----
+ * sdev
+ * ----
+ *
+ * vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd
+ * for both the global and non-global zones. In any given zone we always supply
+ * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone
+ * will also have an entry per-link in that zone under /dev/vnd/%datalink, eg.
+ * if a link was named net0, there would be a /dev/vnd/net0. The global zone can
+ * also see every link for every zone, ala /dev/net, under
+ * /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device
+ * named net0, the global zone would have /dev/vnd/turin/net0.
+ *
+ * The sdev plugin has three interfaces that it supplies back to sdev. One is to
+ * validate that a given node is still valid. The next is a callback from sdev
+ * to say that it is no longer using the node. The third and final one is from
+ * sdev where it asks us to fill a directory. All of the heavy lifting is done
+ * in directory filling and in validation. We opt not to maintain a reference on
+ * the device while there is an sdev node present. This makes the removal of
+ * nodes much simpler and most of the possible failure modes shouldn't cause any
+ * real problems. For example, the open path has to handle both dev_t's which no
+ * longer exist and which are no longer linked.
+ *
+ * -----
+ * hooks
+ * -----
+ *
+ * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd
+ * provides these for L3 IPv4 and IPv6 traffic. Each netstack provides these hooks
+ * in a minimal fashion. While we will allow traffic to be filtered through the
+ * hooks, we do not provide means for packet injection or additional inspection
+ * at this time. There are a total of four different events created:
+ *
+ * o IPv4 physical in
+ * o IPv4 physical out
+ * o IPv6 physical in
+ * o IPv6 physical out
+ *
+ * ---------------
+ * Synchronization
+ * ---------------
+ *
+ * To make our synchronization simpler, we've put more effort into making the
+ * metadata/setup paths do more work. That work allows the data paths to make
+ * assumptions around synchronization that simplify the general case. Each major
+ * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is
+ * annotated with the protection that its members receives. The following
+ * annotations are used:
+ *
+ * A	Atomics; these values are only modified using atomic operations.
+ * Currently this only applies to kstat values.
+ * E Existence; no lock is needed to access this member, it does not
+ * change while the structure is valid.
+ * GL Global Lock; these members are protected by the global
+ * vnd_dev_lock.
+ * L Locked; access to the member is controlled by a lock that is in
+ * the structure.
+ * NSL netstack lock; this member is protected by the containing
+ * netstack. This only applies to the vnd_dev_t`vdd_nslink.
+ * X This member is special, and is discussed in this section.
+ *
+ * In addition to locking, we also have reference counts on the vnd_dev_t and
+ * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure.
+ * With rare exception, once a reference count is decremented, the consumer
+ * should not assume that the data is valid any more. The only exception to this
+ * is the case where we're removing an extant reference count from a link into
+ * /devices or /dev. Reference counts are obtained on these structures as a part
+ * of looking them up.
+ *
+ * # Global Lock Ordering
+ * ######################
+ *
+ * The following is the order that you must take locks in vnd:
+ *
+ * 1) vnd`vnd_dev_lock
+ * 2) vnd_pnsd_t`vpnd_lock
+ * 3) vnd_dev_t`vdd_lock
+ * 4) vnd_str_t`vns_lock
+ * 5) vnd_data_queue_t`vdq_lock
+ *
+ * One must adhere to the following rules:
+ *
+ * o You must acquire a lower numbered lock before a higher numbered lock.
+ * o It is NOT legal to hold two locks of the same level concurrently, eg. you
+ *   cannot hold two different vnd_dev_t's vdd_lock at the same time.
+ * o You may release locks in any order.
+ * o If you release a lock, you must honor the locking rules before acquiring
+ * it again.
+ * o You should not hold any locks when calling any of the rele functions.
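+ *
+ * For example (an illustrative sketch; vdp and vqp are hypothetical locals),
+ * a path that needs both a vnd_dev_t and one of its data queues must take the
+ * locks in increasing level order:
+ *
+ *	mutex_enter(&vdp->vdd_lock);		(level 3)
+ *	mutex_enter(&vqp->vdq_lock);		(level 5)
+ *	...
+ *	mutex_exit(&vqp->vdq_lock);
+ *	mutex_exit(&vdp->vdd_lock);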
+ *
+ * # Special Considerations
+ * ########################
+ *
+ * While most of the locking is what's expected, it's worth going into the
+ * special nature that a few members hold. Today, only two structures have
+ * special considerations: the vnd_dev_t and the vnd_str_t. All members with
+ * special considerations have an additional annotation that describes how you
+ * should interact with it.
+ *
+ * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is
+ * attached or in the process of attaching. If a code path requires an attached
+ * vnd_dev_t, eg. the data path and tear down path, then it is always legal to
+ * dereference these members without a lock held. When they are added to the
+ * system, they should be set under the vdd_lock and done as part
+ * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the
+ * lifetime of the vnd_dev_t.
+ *
+ * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it
+ * always exists as it is a part of the structure. The only time that it's valid
+ * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag
+ * set or during tear down. Outside of those paths which are naturally
+ * serialized, there is no explicit locking around the member.
+ *
+ * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not
+ * initially set as part of creating the structure, but are set as part of
+ * responding to the association ioctl. Anything in the data path or metadata
+ * path that requires association may assume that they exist, as we do not kick
+ * off the state machine until they're set.
+ *
+ * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The
+ * members are designed to be used as part of various operations with the
+ * gsqueues. A lock isn't needed to use them, but to work with them, the
+ * appropriate flag in the vnd_str_t`vns_flags must have been set by the current
+ * thread. Otherwise, it is always fair game to refer to their addresses. Their
+ * contents are ignored by vnd, but some members are manipulated by the gsqueue
+ * subsystem.
+ */
+
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/modctl.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/ddi.h>
+#include <sys/ethernet.h>
+#include <sys/stropts.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/ksynch.h>
+#include <sys/taskq_impl.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/dlpi.h>
+#include <sys/cred.h>
+#include <sys/id_space.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+#include <sys/policy.h>
+#include <sys/sunldi.h>
+#include <sys/cred.h>
+#include <sys/strsubr.h>
+#include <sys/poll.h>
+#include <sys/neti.h>
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+#include <sys/vlan.h>
+#include <sys/dld.h>
+#include <sys/mac_client.h>
+#include <sys/netstack.h>
+#include <sys/fs/sdev_plugin.h>
+#include <sys/kstat.h>
+#include <sys/atomic.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/gsqueue.h>
+#include <sys/ht.h>
+
+#include <inet/ip.h>
+#include <inet/ip6.h>
+
+#include <sys/vnd.h>
+
+/*
+ * Globals
+ */
+static dev_info_t *vnd_dip;
+static taskq_t *vnd_taskq;
+static kmem_cache_t *vnd_str_cache;
+static kmem_cache_t *vnd_dev_cache;
+static kmem_cache_t *vnd_pnsd_cache;
+static id_space_t *vnd_minors;
+static int vnd_list_init = 0;
+static sdev_plugin_hdl_t vnd_sdev_hdl;
+static gsqueue_set_t *vnd_sqset;
+
+static kmutex_t vnd_dev_lock;
+static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */
+static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */
+
+/*
+ * STREAMS ioctls
+ *
+ * The STREAMS ioctls are internal to vnd. No one outside of vnd should be
+ * using them; as such, they aren't a part of the header file.
+ */
+#define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80)
+
+/*
+ * Private ioctl to associate a given streams instance with a minor instance of
+ * the character device.
+ */
+#define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1)
+
+typedef struct vnd_strioc_associate {
+ minor_t vsa_minor; /* minor device node */
+ netstackid_t vsa_nsid; /* netstack id */
+ vnd_errno_t vsa_errno; /* errno */
+} vnd_strioc_associate_t;
+
+typedef enum vnd_strioc_state {
+ VSS_UNKNOWN = 0,
+ VSS_COPYIN = 1,
+ VSS_COPYOUT = 2,
+} vnd_strioc_state_t;
+
+typedef struct vnd_strioc {
+ vnd_strioc_state_t vs_state;
+ caddr_t vs_addr;
+} vnd_strioc_t;
+
+/*
+ * VND SQUEUE TAGS: start at 0x42 so we don't overlap with extent tags, though
+ * really, overlap is, at the end of the day, inevitable.
+ */
+#define VND_SQUEUE_TAG_TX_DRAIN 0x42
+#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43
+#define VND_SQUEUE_TAG_VND_WRITE 0x44
+#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45
+#define VND_SQUEUE_TAG_STRBARRIER 0x46
+
+/*
+ * vnd reserved names. These are names which are reserved by vnd and thus
+ * shouldn't be used by some external program.
+ */
+static char *vnd_reserved_names[] = {
+ "ctl",
+ "zone",
+ NULL
+};
+
+/*
+ * vnd's DTrace probe macros
+ *
+ * DTRACE_VND* are all for a stable provider. We also have an unstable internal
+ * set of probes for reference count manipulation.
+ */
+#define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \
+ DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3);
+
+#define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \
+ DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4);
+
+#define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5) \
+ DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5);
+
+#define DTRACE_VND_REFINC(vdp) \
+ DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+#define DTRACE_VND_REFDEC(vdp) \
+ DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+
+
+/*
+ * Tunables
+ */
+size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */
+size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */
+
+/*
+ * These numbers are designed as per-device tunables that are applied when a new
+ * vnd device is attached. They're a rough stab at what may be a reasonable
+ * amount of work to do in one burst in an squeue.
+ */
+size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */
+size_t vnd_flush_nburst = 10; /* 10 frames */
+
+/*
+ * Constants related to our sdev plugins
+ */
+#define VND_SDEV_NAME "vnd"
+#define VND_SDEV_ROOT "/dev/vnd"
+#define VND_SDEV_ZROOT "/dev/vnd/zone"
+
+/*
+ * vnd relies on privileges, not mode bits to limit access. As such, device
+ * files are read-write to everyone.
+ */
+#define VND_SDEV_MODE (S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | \
+ S_IROTH | S_IWOTH)
+
+/*
+ * Statistic macros
+ */
+#define VND_STAT_INC(vsp, field, val) \
+ atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val)
+#define VND_LATENCY_1MS 1000000
+#define VND_LATENCY_10MS 10000000
+#define VND_LATENCY_100MS 100000000
+#define VND_LATENCY_1S 1000000000
+#define VND_LATENCY_10S 10000000000
+
+/*
+ * Constants for vnd hooks
+ */
+static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+#define IPV4_MCAST_LEN 3
+static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E };
+#define IPV6_MCAST_LEN 2
+static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 };
+
+/*
+ * vnd internal data structures and types
+ */
+
+struct vnd_str;
+struct vnd_dev;
+struct vnd_pnsd;
+
+/*
+ * As part of opening the device stream we need to properly communicate with our
+ * underlying stream. This is a bit of an asynchronous dance and we need to
+ * properly work with dld to get everything set up. We have to initiate the
+ * conversation with dld and as such we keep track of our state here.
+ */
+typedef enum vnd_str_state {
+ VNS_S_INITIAL = 0,
+ VNS_S_INFO_SENT,
+ VNS_S_EXCLUSIVE_SENT,
+ VNS_S_ATTACH_SENT,
+ VNS_S_BIND_SENT,
+ VNS_S_SAP_PROMISC_SENT,
+ VNS_S_MULTI_PROMISC_SENT,
+ VNS_S_RX_ONLY_PROMISC_SENT,
+ VNS_S_FIXUP_PROMISC_SENT,
+ VNS_S_CAPAB_Q_SENT,
+ VNS_S_CAPAB_E_SENT,
+ VNS_S_ONLINE,
+ VNS_S_SHUTTING_DOWN,
+ VNS_S_MULTICAST_PROMISCOFF_SENT,
+ VNS_S_SAP_PROMISCOFF_SENT,
+ VNS_S_UNBIND_SENT,
+ VNS_S_ZOMBIE
+} vnd_str_state_t;
+
+typedef enum vnd_str_flags {
+ VNS_F_NEED_ZONE = 0x1,
+ VNS_F_TASKQ_DISPATCHED = 0x2,
+ VNS_F_CONDEMNED = 0x4,
+ VNS_F_FLOW_CONTROLLED = 0x8,
+ VNS_F_DRAIN_SCHEDULED = 0x10,
+ VNS_F_BARRIER = 0x20,
+ VNS_F_BARRIER_DONE = 0x40
+} vnd_str_flags_t;
+
+typedef enum vnd_capab_flags {
+ VNS_C_HCKSUM = 0x1,
+ VNS_C_DLD = 0x2,
+ VNS_C_DIRECT = 0x4,
+ VNS_C_HCKSUM_BADVERS = 0x8
+} vnd_capab_flags_t;
+
+/*
+ * Definitions to interact with direct callbacks
+ */
+typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *,
+ mac_header_info_t *);
+typedef uintptr_t vnd_mac_cookie_t;
+/* DLD Direct capability function */
+typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t);
+/* DLD Direct tx function */
+typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t);
+/* DLD Direct function to set flow control callback */
+typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t),
+ void *);
+/* DLD Direct function to see if flow controlled still */
+typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t);
+
+/*
+ * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of.
+ */
+typedef struct vnd_str_capab {
+ vnd_capab_flags_t vsc_flags;
+ t_uscalar_t vsc_hcksum_opts;
+ vnd_dld_cap_t vsc_capab_f;
+ void *vsc_capab_hdl;
+ vnd_dld_tx_t vsc_tx_f;
+ void *vsc_tx_hdl;
+ vnd_dld_set_fcb_t vsc_set_fcb_f;
+ void *vsc_set_fcb_hdl;
+ vnd_dld_is_fc_t vsc_is_fc_f;
+ void *vsc_is_fc_hdl;
+ vnd_mac_cookie_t vsc_fc_cookie;
+ void *vsc_tx_fc_hdl;
+} vnd_str_capab_t;
+
+/*
+ * The vnd_data_queue is a simple construct for storing a series of messages in
+ * a queue.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_data_queue {
+ struct vnd_str *vdq_vns; /* E */
+ kmutex_t vdq_lock;
+ kcondvar_t vdq_ready; /* Uses vdq_lock */
+ ssize_t vdq_max; /* L */
+ ssize_t vdq_cur; /* L */
+ mblk_t *vdq_head; /* L */
+ mblk_t *vdq_tail; /* L */
+} vnd_data_queue_t;
+
+typedef struct vnd_str_stat {
+ kstat_named_t vks_rbytes;
+ kstat_named_t vks_rpackets;
+ kstat_named_t vks_obytes;
+ kstat_named_t vks_opackets;
+ kstat_named_t vks_nhookindrops;
+ kstat_named_t vks_nhookoutdrops;
+ kstat_named_t vks_ndlpidrops;
+ kstat_named_t vks_ndataindrops;
+ kstat_named_t vks_ndataoutdrops;
+ kstat_named_t vks_tdrops;
+ kstat_named_t vks_linkname;
+ kstat_named_t vks_zonename;
+ kstat_named_t vks_nmacflow;
+ kstat_named_t vks_tmacflow;
+ kstat_named_t vks_mac_flow_1ms;
+ kstat_named_t vks_mac_flow_10ms;
+ kstat_named_t vks_mac_flow_100ms;
+ kstat_named_t vks_mac_flow_1s;
+ kstat_named_t vks_mac_flow_10s;
+} vnd_str_stat_t;
+
+/*
+ * vnd stream structure
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_str {
+ kmutex_t vns_lock;
+ kcondvar_t vns_cancelcv; /* Uses vns_lock */
+ kcondvar_t vns_barriercv; /* Uses vns_lock */
+ kcondvar_t vns_stcv; /* Uses vns_lock */
+ vnd_str_state_t vns_state; /* L */
+ vnd_str_state_t vns_laststate; /* L */
+ vnd_errno_t vns_errno; /* L */
+ vnd_str_flags_t vns_flags; /* L */
+ vnd_str_capab_t vns_caps; /* L */
+ taskq_ent_t vns_tqe; /* L */
+ vnd_data_queue_t vns_dq_read; /* E */
+ vnd_data_queue_t vns_dq_write; /* E */
+ mblk_t *vns_dlpi_inc; /* L */
+ queue_t *vns_rq; /* E */
+ queue_t *vns_wq; /* E */
+ queue_t *vns_lrq; /* E */
+ t_uscalar_t vns_dlpi_style; /* L */
+ t_uscalar_t vns_minwrite; /* L */
+ t_uscalar_t vns_maxwrite; /* L */
+ hrtime_t vns_fclatch; /* L */
+ hrtime_t vns_fcupdate; /* L */
+ kstat_t *vns_kstat; /* E */
+ gsqueue_t *vns_squeue; /* E */
+ mblk_t vns_drainblk; /* E + X */
+ mblk_t vns_barrierblk; /* E + X */
+ vnd_str_stat_t vns_ksdata; /* A */
+ size_t vns_nflush; /* L */
+ size_t vns_bsize; /* L */
+ struct vnd_dev *vns_dev; /* E + X */
+ struct vnd_pnsd *vns_nsd; /* E + X */
+} vnd_str_t;
+
+typedef enum vnd_dev_flags {
+ VND_D_ATTACH_INFLIGHT = 0x001,
+ VND_D_ATTACHED = 0x002,
+ VND_D_LINK_INFLIGHT = 0x004,
+ VND_D_LINKED = 0x008,
+ VND_D_CONDEMNED = 0x010,
+ VND_D_ZONE_DYING = 0x020,
+ VND_D_OPENED = 0x040
+} vnd_dev_flags_t;
+
+/*
+ * This represents the data associated with a minor device instance.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_dev {
+ kmutex_t vdd_lock;
+ list_node_t vdd_link; /* GL */
+ list_node_t vdd_nslink; /* NSL */
+ int vdd_ref; /* L */
+ vnd_dev_flags_t vdd_flags; /* L */
+ minor_t vdd_minor; /* E */
+ dev_t vdd_devid; /* E */
+ ldi_ident_t vdd_ldiid; /* E */
+ ldi_handle_t vdd_ldih; /* X */
+ cred_t *vdd_cr; /* X */
+ vnd_str_t *vdd_str; /* L */
+ struct pollhead vdd_ph; /* E */
+ struct vnd_pnsd *vdd_nsd; /* E + X */
+ char vdd_datalink[VND_NAMELEN]; /* L */
+ char vdd_lname[VND_NAMELEN]; /* L */
+} vnd_dev_t;
+
+typedef enum vnd_pnsd_flags {
+ VND_NS_CONDEMNED = 0x1
+} vnd_pnsd_flags_t;
+
+/*
+ * Per netstack data structure.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_pnsd {
+ list_node_t vpnd_link; /* protected by global dev lock */
+ zoneid_t vpnd_zid; /* E */
+ netstackid_t vpnd_nsid; /* E */
+ boolean_t vpnd_hooked; /* E */
+ net_handle_t vpnd_neti_v4; /* E */
+ hook_family_t vpnd_family_v4; /* E */
+ hook_event_t vpnd_event_in_v4; /* E */
+ hook_event_t vpnd_event_out_v4; /* E */
+ hook_event_token_t vpnd_token_in_v4; /* E */
+ hook_event_token_t vpnd_token_out_v4; /* E */
+ net_handle_t vpnd_neti_v6; /* E */
+ hook_family_t vpnd_family_v6; /* E */
+ hook_event_t vpnd_event_in_v6; /* E */
+ hook_event_t vpnd_event_out_v6; /* E */
+ hook_event_token_t vpnd_token_in_v6; /* E */
+ hook_event_token_t vpnd_token_out_v6; /* E */
+ kmutex_t vpnd_lock; /* Protects remaining members */
+ kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */
+ int vpnd_ref; /* L */
+ vnd_pnsd_flags_t vpnd_flags; /* L */
+ list_t vpnd_dev_list; /* L */
+} vnd_pnsd_t;
+
+static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *);
+
+/*
+ * Drop function signature.
+ */
+typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *);
+
+static void
+vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_ndataindrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_ndataoutdrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_nhookindrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_nhookoutdrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+/* ARGSUSED */
+static void
+vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ panic("illegal vnd drop");
+}
+
+/* ARGSUSED */
+static void
+vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain,
+ mac_header_info_t *mhip)
+{
+ mblk_t *mp;
+
+ while (mp_chain != NULL) {
+ mp = mp_chain;
+ mp_chain = mp->b_next;
+ vnd_drop_hook_in(vsp, mp, "stream not associated");
+ }
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup(netstackid_t nsid)
+{
+ vnd_pnsd_t *nsp;
+
+ mutex_enter(&vnd_dev_lock);
+ for (nsp = list_head(&vnd_nsd_list); nsp != NULL;
+ nsp = list_next(&vnd_nsd_list, nsp)) {
+ if (nsp->vpnd_nsid == nsid) {
+ mutex_enter(&nsp->vpnd_lock);
+ VERIFY(nsp->vpnd_ref >= 0);
+ nsp->vpnd_ref++;
+ mutex_exit(&nsp->vpnd_lock);
+ break;
+ }
+ }
+ mutex_exit(&vnd_dev_lock);
+ return (nsp);
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zid(zoneid_t zid)
+{
+ netstack_t *ns;
+ vnd_pnsd_t *nsp;
+ ns = netstack_find_by_zoneid(zid);
+ if (ns == NULL)
+ return (NULL);
+ nsp = vnd_nsd_lookup(ns->netstack_stackid);
+ netstack_rele(ns);
+ return (nsp);
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zonename(char *zname)
+{
+ zone_t *zonep;
+ vnd_pnsd_t *nsp;
+
+ zonep = zone_find_by_name(zname);
+ if (zonep == NULL)
+ return (NULL);
+
+ nsp = vnd_nsd_lookup_by_zid(zonep->zone_id);
+ zone_rele(zonep);
+ return (nsp);
+}
+
+static void
+vnd_nsd_ref(vnd_pnsd_t *nsp)
+{
+ mutex_enter(&nsp->vpnd_lock);
+ /*
+ * This can only be used on something that has been obtained through
+ * some other means. As such, the caller should already have a reference
+ * before adding another one. This function should not be used as a
+ * means of creating the initial reference.
+ */
+ VERIFY(nsp->vpnd_ref > 0);
+ nsp->vpnd_ref++;
+ mutex_exit(&nsp->vpnd_lock);
+ cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+static void
+vnd_nsd_rele(vnd_pnsd_t *nsp)
+{
+ mutex_enter(&nsp->vpnd_lock);
+ VERIFY(nsp->vpnd_ref > 0);
+ nsp->vpnd_ref--;
+ mutex_exit(&nsp->vpnd_lock);
+ cv_broadcast(&nsp->vpnd_ref_change);
+}
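+
+/*
+ * An illustrative sketch of the lookup/rele pattern (the surrounding error
+ * handling is hypothetical): vnd_nsd_lookup() returns with a reference held,
+ * which the caller must eventually drop via vnd_nsd_rele().
+ *
+ *	vnd_pnsd_t *nsp = vnd_nsd_lookup(nsid);
+ *	if (nsp == NULL)
+ *		return (ENOENT);
+ *	// ... use nsp ...
+ *	vnd_nsd_rele(nsp);
+ */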
+
+static vnd_dev_t *
+vnd_dev_lookup(minor_t m)
+{
+ vnd_dev_t *vdp;
+ mutex_enter(&vnd_dev_lock);
+ for (vdp = list_head(&vnd_dev_list); vdp != NULL;
+ vdp = list_next(&vnd_dev_list, vdp)) {
+ if (vdp->vdd_minor == m) {
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_ref > 0);
+ vdp->vdd_ref++;
+ DTRACE_VND_REFINC(vdp);
+ mutex_exit(&vdp->vdd_lock);
+ break;
+ }
+ }
+ mutex_exit(&vnd_dev_lock);
+ return (vdp);
+}
+
+static void
+vnd_dev_free(vnd_dev_t *vdp)
+{
+ /*
+ * When the STREAM exists we need to go through and make sure
+ * communication gets torn down. As part of closing the stream, we
+ * guarantee that nothing else should be able to enter the stream layer
+ * at this point. That means no one should be able to call
+	 * read(), write(), or one of the frameio ioctls.
+ */
+ if (vdp->vdd_flags & VND_D_ATTACHED) {
+ (void) ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ crfree(vdp->vdd_cr);
+ vdp->vdd_cr = NULL;
+
+ /*
+		 * We have to remove ourselves from our parent's list now. It
+		 * is really quite important that we have already set the
+		 * condemned flag here so that our containing netstack
+		 * basically knows that we're on the way down and knows not to
+		 * wait for us. It's also important that we do that before we
+		 * put a rele on the device as that is the point at which it
+		 * will check again.
+ */
+ mutex_enter(&vdp->vdd_nsd->vpnd_lock);
+ list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp);
+ mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+ vnd_nsd_rele(vdp->vdd_nsd);
+ vdp->vdd_nsd = NULL;
+ }
+ ASSERT(vdp->vdd_flags & VND_D_CONDEMNED);
+ id_free(vnd_minors, vdp->vdd_minor);
+ mutex_destroy(&vdp->vdd_lock);
+ kmem_cache_free(vnd_dev_cache, vdp);
+}
+
+static void
+vnd_dev_ref(vnd_dev_t *vdp)
+{
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_ref > 0);
+ vdp->vdd_ref++;
+ DTRACE_VND_REFINC(vdp);
+ mutex_exit(&vdp->vdd_lock);
+}
+
+/*
+ * As part of releasing the hold on this we may tear down a given vnd_dev_t.
+ * As such we need to make sure that we grab the list lock before grabbing
+ * the vnd_dev_t's lock to ensure proper lock ordering.
+ */
+static void
+vnd_dev_rele(vnd_dev_t *vdp)
+{
+ mutex_enter(&vnd_dev_lock);
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_ref > 0);
+ vdp->vdd_ref--;
+ DTRACE_VND_REFDEC(vdp);
+ if (vdp->vdd_ref > 0) {
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+ return;
+ }
+
+ /*
+ * Now we can remove this from the list and drop the list lock. No one
+ * else can find this device and reference it: as its reference count is
+ * zero, it by definition does not have any remaining entries in /devices
+ * that could lead someone back to it.
+ */
+ vdp->vdd_flags |= VND_D_CONDEMNED;
+ list_remove(&vnd_dev_list, vdp);
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+
+ vnd_dev_free(vdp);
+}
+
+/*
+ * Insert a message block chain if there's space, otherwise drop it. Return
+ * one to indicate that anyone waiting for data would now find it, e.g. the
+ * caller should consider a broadcast.
+ */
+static int
+vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved,
+ vnd_dropper_f dropf)
+{
+ size_t msize;
+
+ ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+ if (reserved == B_FALSE) {
+ msize = msgsize(mp);
+ if (vqp->vdq_cur + msize > vqp->vdq_max) {
+ dropf(vqp->vdq_vns, mp, "buffer full");
+ return (0);
+ }
+ vqp->vdq_cur += msize;
+ }
+
+ if (vqp->vdq_head == NULL) {
+ ASSERT(vqp->vdq_tail == NULL);
+ vqp->vdq_head = mp;
+ vqp->vdq_tail = mp;
+ } else {
+ vqp->vdq_tail->b_next = mp;
+ vqp->vdq_tail = mp;
+ }
+
+ return (1);
+}
+
+/*
+ * Remove a message block chain. If the amount of space in the buffer
+ * has changed we return 1. We have no way of knowing whether or not there is
+ * enough space overall for a given writer who is blocked, so we always end up
+ * having to return true and thus tell consumers that they should consider
+ * signalling.
+ */
+static int
+vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp)
+{
+ size_t msize;
+ mblk_t *mp;
+
+ ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+ ASSERT(mpp != NULL);
+ if (vqp->vdq_head == NULL) {
+ ASSERT(vqp->vdq_tail == NULL);
+ *mpp = NULL;
+ return (0);
+ }
+
+ mp = vqp->vdq_head;
+ msize = msgsize(mp);
+
+ vqp->vdq_cur -= msize;
+ if (mp->b_next == NULL) {
+ vqp->vdq_head = NULL;
+ vqp->vdq_tail = NULL;
+ /*
+ * We can't be certain that this is always going to be zero.
+ * Someone may have taken a reservation of space on the data
+ * queue, e.g. claimed space but not yet pushed anything on.
+ */
+ ASSERT(vqp->vdq_cur >= 0);
+ } else {
+ vqp->vdq_head = mp->b_next;
+ ASSERT(vqp->vdq_cur > 0);
+ }
+ mp->b_next = NULL;
+ *mpp = mp;
+ return (1);
+}
+
+/*
+ * Reserve space in the queue. This will bump up the size of the queue and
+ * entitle the user to push something on later without bumping the space.
+ */
+static int
+vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+ ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+ ASSERT(size >= 0);
+
+ if (size == 0)
+ return (0);
+
+ if (size + vqp->vdq_cur > vqp->vdq_max)
+ return (0);
+
+ vqp->vdq_cur += size;
+ return (1);
+}
+
+static void
+vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+ ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+ ASSERT(size > 0);
+ ASSERT(size <= vqp->vdq_cur);
+
+ vqp->vdq_cur -= size;
+}
+
+static void
+vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf)
+{
+ mblk_t *mp, *next;
+
+ mutex_enter(&vqp->vdq_lock);
+ for (mp = vqp->vdq_head; mp != NULL; mp = next) {
+ next = mp->b_next;
+ mp->b_next = NULL;
+ dropf(vqp->vdq_vns, mp, "vnd_dq_flush");
+ }
+ vqp->vdq_cur = 0;
+ vqp->vdq_head = NULL;
+ vqp->vdq_tail = NULL;
+ mutex_exit(&vqp->vdq_lock);
+}
+
+static boolean_t
+vnd_dq_is_empty(vnd_data_queue_t *vqp)
+{
+ boolean_t ret;
+
+ mutex_enter(&vqp->vdq_lock);
+ if (vqp->vdq_head == NULL)
+ ret = B_TRUE;
+ else
+ ret = B_FALSE;
+ mutex_exit(&vqp->vdq_lock);
+
+ return (ret);
+}
+
+/*
+ * Get a network uint16_t from the message and translate it into something the
+ * host understands.
+ */
+static int
+vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out)
+{
+ size_t mpsize;
+ uint8_t *bp;
+
+ mpsize = msgsize(mp);
+ /* Check for overflow */
+ if (off + sizeof (uint16_t) > mpsize)
+ return (1);
+
+ mpsize = MBLKL(mp);
+ while (off >= mpsize) {
+ mp = mp->b_cont;
+ off -= mpsize;
+ mpsize = MBLKL(mp);
+ }
+
+ /*
+ * Data is in network order. Note the second byte of data might be in
+ * the next mp.
+ */
+ bp = mp->b_rptr + off;
+ *out = *bp << 8;
+ if (off + 1 == mpsize) {
+ mp = mp->b_cont;
+ bp = mp->b_rptr;
+ } else {
+ bp++;
+ }
+
+ *out |= *bp;
+ return (0);
+}
+
+/*
+ * Given an mblk chain find the mblk and address of a particular offset.
+ */
+static int
+vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp)
+{
+ size_t mpsize;
+
+ if (off >= msgsize(mp))
+ return (1);
+
+ mpsize = MBLKL(mp);
+ while (off >= mpsize) {
+ mp = mp->b_cont;
+ off -= mpsize;
+ mpsize = MBLKL(mp);
+ }
+ *mpp = mp;
+ *offp = (uintptr_t)mp->b_rptr + off;
+
+ return (0);
+}
+
+/*
+ * Fetch the destination mac address and set *dstpp to point to it. If the
+ * data is not contiguous in the first mblk_t, copy it into datap and set
+ * *dstpp to that instead.
+ */
+static int
+vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap)
+{
+ int i;
+
+ if (MBLKL(mp) >= ETHERADDRL) {
+ *dstpp = mp->b_rptr;
+ return (0);
+ }
+
+ *dstpp = datap;
+ for (i = 0; i < ETHERADDRL; i += 2, datap += 2) {
+ if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0)
+ return (1);
+ }
+
+ return (0);
+}
+
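+/*
+ * Run the registered packet hooks against a packet, doing just enough
+ * parsing of the ethernet header to classify it first. Returns non-zero
+ * when the caller should stop processing the packet, either because it was
+ * malformed or because a hook dropped it.
+ */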
+static int
+vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4,
+ hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6,
+ hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop)
+{
+ uint16_t etype;
+ hook_pkt_event_t info;
+ size_t offset, mblen;
+ uint8_t *dstp;
+ uint8_t dstaddr[6];
+ hook_event_t he;
+ hook_event_token_t het;
+ net_handle_t neti;
+
+ /*
+ * Before we can ask if we're interested we have to do enough work to
+ * determine the ethertype.
+ */
+
+ /* Bytes 12 and 13 are either the VLAN TPID or the ethertype */
+ if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) {
+ ddrop(vsp, *mpp, "packet has incomplete ethernet header");
+ *mpp = NULL;
+ return (1);
+ }
+
+ if (etype == ETHERTYPE_VLAN) {
+ /* Actual ethertype is another four bytes in */
+ if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) {
+ ddrop(vsp, *mpp,
+ "packet has incomplete ethernet vlan header");
+ *mpp = NULL;
+ return (1);
+ }
+ offset = sizeof (struct ether_vlan_header);
+ } else {
+ offset = sizeof (struct ether_header);
+ }
+
+ /*
+ * At the moment we only hook on the kinds of things that the IP module
+ * would normally.
+ */
+ if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6)
+ return (0);
+
+ if (etype == ETHERTYPE_IP) {
+ neti = netiv4;
+ he = hev4;
+ het = hetv4;
+ } else {
+ neti = netiv6;
+ he = hev6;
+ het = hetv6;
+ }
+
+ if (!he.he_interested)
+ return (0);
+
+ if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) {
+ ddrop(vsp, *mpp, "packet has incomplete ethernet header");
+ *mpp = NULL;
+ return (1);
+ }
+
+ /*
+ * Now that we know we're interested, we have to do some additional
+ * sanity checking for IPF's sake, ala ip_check_length(). Specifically
+ * we need to check to make sure that the remaining packet size,
+ * excluding MAC, is at least the size of an IP header.
+ */
+ mblen = msgsize(*mpp);
+ if ((etype == ETHERTYPE_IP &&
+ mblen - offset < IP_SIMPLE_HDR_LENGTH) ||
+ (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) {
+ ddrop(vsp, *mpp, "packet has invalid IP header");
+ *mpp = NULL;
+ return (1);
+ }
+
+ info.hpe_protocol = neti;
+ info.hpe_ifp = (phy_if_t)vsp;
+ info.hpe_ofp = (phy_if_t)vsp;
+ info.hpe_mp = mpp;
+ info.hpe_flags = 0;
+
+ if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0)
+ info.hpe_flags |= HPE_BROADCAST;
+ else if (etype == ETHERTYPE_IP &&
+ bcmp(vnd_ipv4_mcast, dstp, IPV4_MCAST_LEN) == 0)
+ info.hpe_flags |= HPE_MULTICAST;
+ else if (etype == ETHERTYPE_IPV6 &&
+ bcmp(vnd_ipv6_mcast, dstp, IPV6_MCAST_LEN) == 0)
+ info.hpe_flags |= HPE_MULTICAST;
+
+ if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb,
+ (uintptr_t *)&info.hpe_hdr) != 0) {
+ ddrop(vsp, *mpp, "packet too small -- "
+ "unable to find payload");
+ *mpp = NULL;
+ return (1);
+ }
+
+ if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) {
+ hdrop(vsp, *mpp, "dropped by hooks");
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Allocate a DLPI M_PROTO message of the given length with the given
+ * primitive. This should not be used for DL_INFO_REQ: that request must go
+ * down as M_PCPROTO and be sized to hold the larger dl_info_ack_t, which
+ * vnd_st_sinfo() does by hand.
+ */
+static mblk_t *
+vnd_dlpi_alloc(size_t len, t_uscalar_t prim)
+{
+ mblk_t *mp;
+ mp = allocb(len, BPRI_MED);
+ if (mp == NULL)
+ return (NULL);
+
+ mp->b_datap->db_type = M_PROTO;
+ mp->b_wptr = mp->b_rptr + len;
+ bzero(mp->b_rptr, len);
+ ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
+
+ return (mp);
+}
+
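+/*
+ * Append a DLPI control message to the stream's inbound queue, a singly
+ * linked list chained through b_next that is drained by the state machine
+ * taskq.
+ */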
+static void
+vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp)
+{
+ mblk_t **mpp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ ASSERT(mp->b_next == NULL);
+ mpp = &vsp->vns_dlpi_inc;
+ while (*mpp != NULL)
+ mpp = &((*mpp)->b_next);
+ *mpp = mp;
+}
+
+static mblk_t *
+vnd_dlpi_inc_pop(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vsp->vns_dlpi_inc;
+ if (mp != NULL) {
+ VERIFY(mp->b_next == NULL || mp->b_next != mp);
+ vsp->vns_dlpi_inc = mp->b_next;
+ mp->b_next = NULL;
+ }
+ return (mp);
+}
+
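+/*
+ * The vnd_st_* functions below implement the DLPI setup state machine that
+ * vnd_str_state_transition() drives. Each vnd_st_s<name>() sends the next
+ * request down the stream, and the matching vnd_st_<name>() consumes the ack
+ * or nack that vnd_s_rput() queued for us.
+ */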
+static int
+vnd_st_sinfo(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ dl_info_req_t *dlir;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
+ BPRI_HI);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+ vsp->vns_state = VNS_S_INFO_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+
+ mp->b_datap->db_type = M_PCPROTO;
+ dlir = (dl_info_req_t *)mp->b_rptr;
+ mp->b_wptr = (uchar_t *)&dlir[1];
+ dlir->dl_primitive = DL_INFO_REQ;
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+static int
+vnd_st_info(vnd_str_t *vsp)
+{
+ dl_info_ack_t *dlia;
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ dlia = (dl_info_ack_t *)mp->b_rptr;
+ vsp->vns_dlpi_style = dlia->dl_provider_style;
+ vsp->vns_minwrite = dlia->dl_min_sdu;
+ vsp->vns_maxwrite = dlia->dl_max_sdu;
+
+ /*
+ * At this time we only support DL_ETHER devices.
+ */
+ if (dlia->dl_mac_type != DL_ETHER) {
+ freemsg(mp);
+ vsp->vns_errno = VND_E_NOTETHER;
+ return (1);
+ }
+
+ /*
+ * Because vnd operates on entire packets, we need to manually account
+ * for the ethernet header information. We add the size of the
+ * ether_vlan_header to account for this, regardless of whether the packet
+ * is VLAN tagged or not.
+ */
+ vsp->vns_maxwrite += sizeof (struct ether_vlan_header);
+
+ freemsg(mp);
+ return (0);
+}
+
+static int
+vnd_st_sexclusive(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ vsp->vns_state = VNS_S_EXCLUSIVE_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+ return (0);
+}
+
+static int
+vnd_st_exclusive(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_exclusive");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (cprim != DL_EXCLUSIVE_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_exclusive: got ack/nack for wrong primitive");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (prim == DL_ERROR_ACK)
+ vsp->vns_errno = VND_E_DLEXCL;
+
+ freemsg(mp);
+ return (prim == DL_ERROR_ACK);
+}
+
+/*
+ * Send down a DLPI_ATTACH_REQ.
+ */
+static int
+vnd_st_sattach(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0;
+ vsp->vns_state = VNS_S_ATTACH_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+static int
+vnd_st_attach(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (cprim != DL_ATTACH_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_attach: Got ack/nack for wrong primitive");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (prim == DL_ERROR_ACK)
+ vsp->vns_errno = VND_E_ATTACHFAIL;
+
+ freemsg(mp);
+ return (prim == DL_ERROR_ACK);
+}
+
+static int
+vnd_st_sbind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ dl_bind_req_t *dbrp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
+ DL_BIND_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+ dbrp = (dl_bind_req_t *)(mp->b_rptr);
+ dbrp->dl_sap = 0;
+ dbrp->dl_service_mode = DL_CLDLS;
+
+ vsp->vns_state = VNS_S_BIND_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+static int
+vnd_st_bind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive;
+
+ if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (prim == DL_ERROR_ACK)
+ vsp->vns_errno = VND_E_BINDFAIL;
+
+ freemsg(mp);
+ return (prim == DL_ERROR_ACK);
+}
+
+static int
+vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next)
+{
+ mblk_t *mp;
+ dl_promiscon_req_t *dprp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ dprp = (dl_promiscon_req_t *)mp->b_rptr;
+ dprp->dl_level = type;
+
+ vsp->vns_state = next;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+static int
+vnd_st_promisc(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_promisc");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (cprim != DL_PROMISCON_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_promisc: Got ack/nack for wrong primitive");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (prim == DL_ERROR_ACK)
+ vsp->vns_errno = VND_E_PROMISCFAIL;
+
+ freemsg(mp);
+ return (prim == DL_ERROR_ACK);
+}
+
+static int
+vnd_st_scapabq(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+ mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ vsp->vns_state = VNS_S_CAPAB_Q_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
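+/*
+ * This is the stream's direct (fastpath) receive entry point, installed via
+ * DLD_CAPAB_DIRECT once association completes. For each packet we restore
+ * the mac header, strip any VLAN tag, run the packet hooks, and push the
+ * packet onto the read queue, waking up any blocked readers and pollers.
+ */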
+/* ARGSUSED */
+static void
+vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain,
+ mac_header_info_t *mhip)
+{
+ int signal = 0;
+ mblk_t *mp;
+ vnd_pnsd_t *nsp = vsp->vns_nsd;
+
+ ASSERT(vsp != NULL);
+ ASSERT(mp_chain != NULL);
+
+ for (mp = mp_chain; mp != NULL; mp = mp_chain) {
+ uint16_t vid;
+ mp_chain = mp->b_next;
+ mp->b_next = NULL;
+
+ /*
+ * If we were operating in a traditional dlpi context then we
+ * would have enabled DLIOCRAW and rather than the fast path, we
+ * would come through dld_str_rx_raw. That function does two
+ * things that we have to consider doing ourselves. The first is
+ * that it adjusts the b_rptr back to account for dld bumping us
+ * past the mac header. It also tries to account for cases where
+ * mac provides an illusion of the mac header. Fortunately, dld
+ * only allows the fastpath when the media type is the same as
+ * the native type. Therefore all we have to do here is adjust
+ * the b_rptr.
+ */
+ ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
+ mp->b_rptr -= mhip->mhi_hdrsize;
+ vid = VLAN_ID(mhip->mhi_tci);
+ if (mhip->mhi_istagged && vid != VLAN_ID_NONE) {
+ /*
+ * This is an overlapping copy. Do not use bcopy(9F).
+ */
+ (void) memmove(mp->b_rptr + 4, mp->b_rptr, 12);
+ mp->b_rptr += 4;
+ }
+
+ if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
+ nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4,
+ nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6,
+ nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0)
+ continue;
+
+ VND_STAT_INC(vsp, vks_rpackets, 1);
+ VND_STAT_INC(vsp, vks_rbytes, msgsize(mp));
+ DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL,
+ vnd_str_t *, vsp, mblk_t *, mp);
+ mutex_enter(&vsp->vns_dq_read.vdq_lock);
+ signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE,
+ vnd_drop_in);
+ mutex_exit(&vsp->vns_dq_read.vdq_lock);
+ }
+
+ if (signal != 0) {
+ cv_broadcast(&vsp->vns_dq_read.vdq_ready);
+ pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM);
+ }
+}
+
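+/*
+ * Account for time spent flow controlled, bucketing the blocked time into
+ * the cumulative latency thresholds that we track in the kstats.
+ */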
+static void
+vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff)
+{
+ VND_STAT_INC(vsp, vks_nmacflow, 1);
+ VND_STAT_INC(vsp, vks_tmacflow, diff);
+ if (diff >= VND_LATENCY_1MS)
+ VND_STAT_INC(vsp, vks_mac_flow_1ms, 1);
+ if (diff >= VND_LATENCY_10MS)
+ VND_STAT_INC(vsp, vks_mac_flow_10ms, 1);
+ if (diff >= VND_LATENCY_100MS)
+ VND_STAT_INC(vsp, vks_mac_flow_100ms, 1);
+ if (diff >= VND_LATENCY_1S)
+ VND_STAT_INC(vsp, vks_mac_flow_1s, 1);
+ if (diff >= VND_LATENCY_10S)
+ VND_STAT_INC(vsp, vks_mac_flow_10s, 1);
+}
+
+/*
+ * This is a callback from MAC that indicates that we are allowed to send
+ * packets again.
+ */
+static void
+vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie)
+{
+ vnd_str_t *vsp = arg;
+ hrtime_t now;
+
+ mutex_enter(&vsp->vns_lock);
+ now = gethrtime();
+
+ /*
+ * Check for the case that we beat vnd_squeue_tx_one to the punch.
+ * There's also an additional case here that we got notified because
+ * we're sharing a device that ran out of tx descriptors, even though it
+ * wasn't because of us.
+ */
+ if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) {
+ vsp->vns_fcupdate = now;
+ mutex_exit(&vsp->vns_lock);
+ return;
+ }
+
+ ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED);
+ ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie);
+ vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED;
+ vsp->vns_caps.vsc_fc_cookie = NULL;
+ vsp->vns_fclatch = 0;
+ DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t,
+ vsp->vns_dq_write.vdq_cur, uintptr_t, cookie);
+ /*
+ * If someone has asked to flush the squeue and thus inserted a barrier,
+ * then we shouldn't schedule a drain.
+ */
+ if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) {
+ vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
+ gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk,
+ vnd_squeue_tx_drain, vsp, GSQUEUE_FILL,
+ VND_SQUEUE_TAG_MAC_FLOW_CONTROL);
+ }
+ mutex_exit(&vsp->vns_lock);
+}
+
+static void
+vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp)
+{
+ ASSERT(MUTEX_HELD(&vsp->vns_lock));
+ VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
+ DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0);
+}
+
+static void
+vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph)
+{
+ ASSERT(MUTEX_HELD(&vsp->vns_lock));
+ VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
+ DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0);
+}
+
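+/*
+ * Enable the DLD direct capability: register rxfunc as our receive callback
+ * and capture the returned transmit, flow control notification, and flow
+ * control query entry points in the stream's capability structure. All of
+ * this must happen under the mac perimeter.
+ */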
+static int
+vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc)
+{
+ int ret;
+ dld_capab_direct_t d;
+ mac_perim_handle_t mph;
+ vnd_str_capab_t *c = &vsp->vns_caps;
+
+ bzero(&d, sizeof (d));
+ d.di_rx_cf = (uintptr_t)rxfunc;
+ d.di_rx_ch = vsp;
+ d.di_flags = DI_DIRECT_RAW;
+
+ vnd_mac_enter(vsp, &mph);
+
+ /*
+ * If we're coming in here for a second pass, we need to make sure that
+ * we remove any existing flow control notification callback; otherwise
+ * we'd register a duplicate and leave the old one behind with stale data.
+ */
+ if (c->vsc_tx_fc_hdl != NULL) {
+ ASSERT(c->vsc_set_fcb_hdl != NULL);
+ (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL,
+ c->vsc_tx_fc_hdl);
+ c->vsc_tx_fc_hdl = NULL;
+ }
+
+ if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl,
+ DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) {
+ c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df;
+ c->vsc_tx_hdl = d.di_tx_dh;
+ c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df;
+ c->vsc_set_fcb_hdl = d.di_tx_cb_dh;
+ c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df;
+ c->vsc_is_fc_hdl = d.di_tx_fctl_dh;
+ c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl,
+ vnd_mac_flow_control, vsp);
+ c->vsc_flags |= VNS_C_DIRECT;
+ ret = 0;
+ } else {
+ vsp->vns_errno = VND_E_DIRECTFAIL;
+ ret = 1;
+ }
+ vnd_mac_exit(vsp, mph);
+ return (ret);
+}
+
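+/*
+ * Process a DL_CAPABILITY_ACK, walking its chain of subcapabilities. We note
+ * hardware checksum support and require the DLD capability so that we can
+ * enable the direct function call path; without it we fail the setup with
+ * VND_E_DIRECTNOTSUP.
+ */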
+static int
+vnd_st_capabq(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ dl_capability_ack_t *cap;
+ dl_capability_sub_t *subp;
+ dl_capab_hcksum_t *hck;
+ dl_capab_dld_t *dld;
+ unsigned char *rp;
+ int ret = 0;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+
+ rp = mp->b_rptr;
+ cap = (dl_capability_ack_t *)rp;
+ if (cap->dl_sub_length == 0)
+ goto done;
+
+ /* Don't try to process something too big */
+ if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vsp->vns_errno = VND_E_CAPACKINVAL;
+ ret = 1;
+ goto done;
+ }
+
+ rp += cap->dl_sub_offset;
+
+ while (cap->dl_sub_length > 0) {
+ subp = (dl_capability_sub_t *)rp;
+ /* Sanity check something crazy from down below */
+ if (subp->dl_length + sizeof (dl_capability_sub_t) >
+ cap->dl_sub_length) {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vsp->vns_errno = VND_E_SUBCAPINVAL;
+ ret = 1;
+ goto done;
+ }
+
+ switch (subp->dl_cap) {
+ case DL_CAPAB_HCKSUM:
+ hck = (dl_capab_hcksum_t *)(rp +
+ sizeof (dl_capability_sub_t));
+ if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) {
+ vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS;
+ break;
+ }
+ if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) !=
+ B_TRUE) {
+ vsp->vns_errno = VND_E_CAPABPASS;
+ ret = 1;
+ goto done;
+ }
+ vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM;
+ vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags;
+ break;
+ case DL_CAPAB_DLD:
+ dld = (dl_capab_dld_t *)(rp +
+ sizeof (dl_capability_sub_t));
+ if (dld->dld_version != DLD_CURRENT_VERSION) {
+ vsp->vns_errno = VND_E_DLDBADVERS;
+ ret = 1;
+ goto done;
+ }
+ if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) !=
+ B_TRUE) {
+ vsp->vns_errno = VND_E_CAPABPASS;
+ ret = 1;
+ goto done;
+ }
+ vsp->vns_caps.vsc_flags |= VNS_C_DLD;
+ vsp->vns_caps.vsc_capab_f =
+ (vnd_dld_cap_t)dld->dld_capab;
+ vsp->vns_caps.vsc_capab_hdl =
+ (void *)dld->dld_capab_handle;
+ /*
+ * At this point in time, we have to set up a direct
+ * function that drops all input. This validates that
+ * we'll be able to set up direct input and that we can
+ * easily switch over to the real data function once
+ * we've plumbed everything up.
+ */
+ if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) {
+ /* vns_errno set by vnd_dld_cap_enable */
+ ret = 1;
+ goto done;
+ }
+ break;
+ default:
+ /* Ignore unsupported cap */
+ break;
+ }
+
+ rp += sizeof (dl_capability_sub_t) + subp->dl_length;
+ cap->dl_sub_length -= sizeof (dl_capability_sub_t) +
+ subp->dl_length;
+ }
+
+done:
+ /* Make sure we enabled direct callbacks */
+ if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) {
+ vsp->vns_errno = VND_E_DIRECTNOTSUP;
+ ret = 1;
+ }
+
+ freemsg(mp);
+ return (ret);
+}
+
+static void
+vnd_st_sonline(vnd_str_t *vsp)
+{
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ vsp->vns_state = VNS_S_ONLINE;
+ cv_broadcast(&vsp->vns_stcv);
+}
+
+static void
+vnd_st_shutdown(vnd_str_t *vsp)
+{
+ mac_perim_handle_t mph;
+ vnd_str_capab_t *vsc = &vsp->vns_caps;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+ /*
+ * At this point in time we know that there is no one transmitting as
+ * our final reference has been torn down and that vnd_s_close inserted
+ * a barrier to validate that everything is flushed.
+ */
+ if (vsc->vsc_flags & VNS_C_DIRECT) {
+ vnd_mac_enter(vsp, &mph);
+ vsc->vsc_flags &= ~VNS_C_DIRECT;
+ (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL,
+ vsc->vsc_tx_fc_hdl);
+ vsc->vsc_tx_fc_hdl = NULL;
+ (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT,
+ NULL, DLD_DISABLE);
+ vnd_mac_exit(vsp, mph);
+ }
+}
+
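+/*
+ * Send down a DL_PROMISCOFF_REQ of the given type and advance to the next
+ * state. Returns B_FALSE if the message couldn't be allocated, in which
+ * case the caller should keep driving the state machine forward itself.
+ */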
+static boolean_t
+vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next)
+{
+ boolean_t ret = B_TRUE;
+ mblk_t *mp;
+ dl_promiscoff_req_t *dprp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_promiscoff_req_t), DL_PROMISCOFF_REQ);
+ if (mp == NULL) {
+ cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for "
+ "promiscoff request");
+ ret = B_FALSE;
+ goto next;
+ }
+
+ dprp = (dl_promiscoff_req_t *)mp->b_rptr;
+ dprp->dl_level = type;
+
+ putnext(vsp->vns_wq, mp);
+next:
+ vsp->vns_state = next;
+ cv_broadcast(&vsp->vns_stcv);
+ return (ret);
+}
+
+static void
+vnd_st_promiscoff(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+ /*
+ * During tear down we try to keep driving forward, so unlike other cases
+ * we guard against the incoming message being NULL: we may have gotten
+ * here due to an earlier failure, in which case there's nothing to do.
+ */
+ mp = vnd_dlpi_inc_pop(vsp);
+ if (mp == NULL)
+ return;
+
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_promiscoff");
+ return;
+ }
+
+ if (cprim != DL_PROMISCOFF_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_promiscoff: Got ack/nack for wrong primitive");
+ return;
+ }
+
+ if (prim == DL_ERROR_ACK) {
+ cmn_err(CE_WARN, "!failed to disable promiscuous mode during "
+ "vnd teardown");
+ }
+}
+
+static boolean_t
+vnd_st_sunbind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ boolean_t ret = B_TRUE;
+
+ mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
+ if (mp == NULL) {
+ cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for "
+ "unbind request");
+ ret = B_FALSE;
+ goto next;
+ }
+
+ putnext(vsp->vns_wq, mp);
+next:
+ vsp->vns_state = VNS_S_UNBIND_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ return (ret);
+}
+
+static void
+vnd_st_unbind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ /*
+ * During tear down we try to keep driving forward, so unlike other cases
+ * we guard against the incoming message being NULL: we may have gotten
+ * here due to an earlier failure, in which case there's nothing to do.
+ */
+ mp = vnd_dlpi_inc_pop(vsp);
+ if (mp == NULL)
+ goto next;
+
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_unbind");
+ goto next;
+ }
+
+ if (cprim != DL_UNBIND_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_unbind: Got ack/nack for wrong primitive");
+ goto next;
+ }
+
+ if (prim == DL_ERROR_ACK) {
+ cmn_err(CE_WARN, "!failed to unbind stream during vnd "
+ "teardown");
+ }
+
+next:
+ vsp->vns_state = VNS_S_ZOMBIE;
+ cv_broadcast(&vsp->vns_stcv);
+}
+
+/*
+ * Perform state transitions. This is a one way shot down the flow chart
+ * described in the big theory statement.
+ */
+static void
+vnd_str_state_transition(void *arg)
+{
+ boolean_t died = B_FALSE;
+ vnd_str_t *vsp = arg;
+ mblk_t *mp;
+
+ mutex_enter(&vsp->vns_lock);
+ if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL &&
+ vsp->vns_state != VNS_S_SHUTTING_DOWN)) {
+ mutex_exit(&vsp->vns_lock);
+ return;
+ }
+
+ /*
+ * When trying to shut down, or unwinding from a failed enabling, rather
+ * than immediately entering the ZOMBIE state, we may instead opt to try
+ * and enter the next state in the progression. This is especially
+ * important when trying to tear everything down.
+ */
+loop:
+ DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp,
+ vnd_str_state_t, vsp->vns_state);
+ switch (vsp->vns_state) {
+ case VNS_S_INITIAL:
+ VERIFY(vsp->vns_dlpi_inc == NULL);
+ if (vnd_st_sinfo(vsp) != 0)
+ died = B_TRUE;
+ break;
+ case VNS_S_INFO_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_info(vsp) == 0) {
+ if (vnd_st_sexclusive(vsp) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_EXCLUSIVE_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_exclusive(vsp) == 0) {
+ if (vsp->vns_dlpi_style == DL_STYLE2) {
+ if (vnd_st_sattach(vsp) != 0)
+ died = B_TRUE;
+ } else {
+ if (vnd_st_sbind(vsp) != 0)
+ died = B_TRUE;
+ }
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_ATTACH_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_attach(vsp) == 0) {
+ if (vnd_st_sbind(vsp) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_BIND_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_bind(vsp) == 0) {
+ if (vnd_st_spromisc(vsp, DL_PROMISC_SAP,
+ VNS_S_SAP_PROMISC_SENT) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_SAP_PROMISC_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_promisc(vsp) == 0) {
+ if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI,
+ VNS_S_MULTI_PROMISC_SENT) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_MULTI_PROMISC_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_promisc(vsp) == 0) {
+ if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY,
+ VNS_S_RX_ONLY_PROMISC_SENT) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_RX_ONLY_PROMISC_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_promisc(vsp) == 0) {
+ if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS,
+ VNS_S_FIXUP_PROMISC_SENT) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_FIXUP_PROMISC_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_promisc(vsp) == 0) {
+ if (vnd_st_scapabq(vsp) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_CAPAB_Q_SENT:
+ if (vnd_st_capabq(vsp) != 0)
+ died = B_TRUE;
+ else
+ vnd_st_sonline(vsp);
+ break;
+ case VNS_S_SHUTTING_DOWN:
+ vnd_st_shutdown(vsp);
+ if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI,
+ VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE)
+ goto loop;
+ break;
+ case VNS_S_MULTICAST_PROMISCOFF_SENT:
+ vnd_st_promiscoff(vsp);
+ if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP,
+ VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE)
+ goto loop;
+ break;
+ case VNS_S_SAP_PROMISCOFF_SENT:
+ vnd_st_promiscoff(vsp);
+ if (vnd_st_sunbind(vsp) == B_FALSE)
+ goto loop;
+ break;
+ case VNS_S_UNBIND_SENT:
+ vnd_st_unbind(vsp);
+ break;
+ case VNS_S_ZOMBIE:
+ while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL)
+ vnd_drop_ctl(vsp, mp, "vsp received data as a zombie");
+ break;
+ default:
+ panic("vnd_str_t entered an unknown state");
+ }
+
+ if (died == B_TRUE) {
+ ASSERT(vsp->vns_errno != VND_E_SUCCESS);
+ vsp->vns_laststate = vsp->vns_state;
+ vsp->vns_state = VNS_S_ZOMBIE;
+ cv_broadcast(&vsp->vns_stcv);
+ }
+
+ mutex_exit(&vsp->vns_lock);
+}
+
+static void
+vnd_dlpi_taskq_dispatch(void *arg)
+{
+ vnd_str_t *vsp = arg;
+ int run = 1;
+
+ while (run != 0) {
+ vnd_str_state_transition(vsp);
+ mutex_enter(&vsp->vns_lock);
+ if (vsp->vns_flags & VNS_F_CONDEMNED ||
+ vsp->vns_dlpi_inc == NULL) {
+ run = 0;
+ vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED;
+ }
+ if (vsp->vns_flags & VNS_F_CONDEMNED)
+ cv_signal(&vsp->vns_cancelcv);
+ mutex_exit(&vsp->vns_lock);
+ }
+}
+
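+/*
+ * The netinfo entry points below are all stubs that fail. vnd registers
+ * with the netinfo framework only so that the physical in/out packet hooks
+ * can fire; it does not implement the interface, route, or checksum
+ * queries.
+ */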
+/* ARGSUSED */
+static int
+vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getptmue(net_handle_t neti)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+ size_t nelem, net_ifaddr_t type[], void *storage)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+ zoneid_t *zid)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+ uint64_t *flags)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static phy_if_t
+vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy)
+{
+ return ((phy_if_t)-1);
+}
+
+/* ARGSUSED */
+static phy_if_t
+vnd_neti_phylookup(net_handle_t neti, const char *name)
+{
+ return ((phy_if_t)-1);
+}
+
+/* ARGSUSED */
+static lif_if_t
+vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static phy_if_t
+vnd_neti_route(net_handle_t neti, struct sockaddr *address,
+ struct sockaddr *next)
+{
+ return ((phy_if_t)-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp)
+{
+ return (-1);
+}
+
+static net_protocol_t vnd_neti_info_v4 = {
+ NETINFO_VERSION,
+ NHF_VND_INET,
+ vnd_neti_getifname,
+ vnd_neti_getmtu,
+ vnd_neti_getptmue,
+ vnd_neti_getlifaddr,
+ vnd_neti_getlifzone,
+ vnd_neti_getlifflags,
+ vnd_neti_phygetnext,
+ vnd_neti_phylookup,
+ vnd_neti_lifgetnext,
+ vnd_neti_inject,
+ vnd_neti_route,
+ vnd_neti_ispchksum,
+ vnd_neti_isvchksum
+};
+
+static net_protocol_t vnd_neti_info_v6 = {
+ NETINFO_VERSION,
+ NHF_VND_INET6,
+ vnd_neti_getifname,
+ vnd_neti_getmtu,
+ vnd_neti_getptmue,
+ vnd_neti_getlifaddr,
+ vnd_neti_getlifzone,
+ vnd_neti_getlifflags,
+ vnd_neti_phygetnext,
+ vnd_neti_phylookup,
+ vnd_neti_lifgetnext,
+ vnd_neti_inject,
+ vnd_neti_route,
+ vnd_neti_ispchksum,
+ vnd_neti_isvchksum
+};
+
+
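+/*
+ * Register our netinfo protocols, hook families, and the four physical
+ * in/out events for v4 and v6. On any failure we unwind, in reverse order,
+ * everything that was registered before the failing step and return
+ * non-zero.
+ */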
+static int
+vnd_netinfo_init(vnd_pnsd_t *nsp)
+{
+ nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid,
+ &vnd_neti_info_v4);
+ ASSERT(nsp->vpnd_neti_v4 != NULL);
+
+ nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid,
+ &vnd_neti_info_v6);
+ ASSERT(nsp->vpnd_neti_v6 != NULL);
+
+ nsp->vpnd_family_v4.hf_version = HOOK_VERSION;
+ nsp->vpnd_family_v4.hf_name = "vnd_inet";
+
+ if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) {
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_family_v6.hf_version = HOOK_VERSION;
+ nsp->vpnd_family_v6.hf_name = "vnd_inet6";
+
+ if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) {
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_event_in_v4.he_version = HOOK_VERSION;
+ nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN;
+ nsp->vpnd_event_in_v4.he_flags = 0;
+ nsp->vpnd_event_in_v4.he_interested = B_FALSE;
+
+ nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ if (nsp->vpnd_token_in_v4 == NULL) {
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_family_v6);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_event_in_v6.he_version = HOOK_VERSION;
+ nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN;
+ nsp->vpnd_event_in_v6.he_flags = 0;
+ nsp->vpnd_event_in_v6.he_interested = B_FALSE;
+
+ nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ if (nsp->vpnd_token_in_v6 == NULL) {
+ (void) net_event_shutdown(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_event_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_family_v6);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_event_out_v4.he_version = HOOK_VERSION;
+ nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT;
+ nsp->vpnd_event_out_v4.he_flags = 0;
+ nsp->vpnd_event_out_v4.he_interested = B_FALSE;
+
+ nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_out_v4);
+ if (nsp->vpnd_token_out_v4 == NULL) {
+ (void) net_event_shutdown(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_shutdown(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_event_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_family_v6);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_event_out_v6.he_version = HOOK_VERSION;
+ nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT;
+ nsp->vpnd_event_out_v6.he_flags = 0;
+ nsp->vpnd_event_out_v6.he_interested = B_FALSE;
+
+ nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_out_v6);
+ if (nsp->vpnd_token_out_v6 == NULL) {
+ (void) net_event_shutdown(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_out_v4);
+ (void) net_event_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_out_v4);
+ (void) net_event_shutdown(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_shutdown(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_event_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_family_v6);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ return (0);
+}
+
+static void
+vnd_netinfo_shutdown(vnd_pnsd_t *nsp)
+{
+ int ret;
+
+ ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+ VERIFY(ret == 0);
+ ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+ VERIFY(ret == 0);
+ ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+ VERIFY(ret == 0);
+ ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
+ VERIFY(ret == 0);
+}
+
+static void
+vnd_netinfo_fini(vnd_pnsd_t *nsp)
+{
+ int ret;
+
+ ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+ VERIFY(ret == 0);
+ ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+ VERIFY(ret == 0);
+ ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+ VERIFY(ret == 0);
+ ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
+ VERIFY(ret == 0);
+ ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+ VERIFY(ret == 0);
+ ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
+ VERIFY(ret == 0);
+ ret = net_protocol_unregister(nsp->vpnd_neti_v4);
+ VERIFY(ret == 0);
+ ret = net_protocol_unregister(nsp->vpnd_neti_v6);
+ VERIFY(ret == 0);
+}
+
+/* ARGSUSED */
+static void
+vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy)
+{
+ vnd_str_t *vsp = arg;
+
+ VERIFY(bmp == &vsp->vns_barrierblk);
+ mutex_enter(&vsp->vns_lock);
+ VERIFY(vsp->vns_flags & VNS_F_BARRIER);
+ VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE));
+ vsp->vns_flags |= VNS_F_BARRIER_DONE;
+ mutex_exit(&vsp->vns_lock);
+
+ /*
+ * For better or worse, we have to broadcast here as we could have a
+ * thread that's blocked for completion as well as one that's blocked
+ * waiting to do a barrier itself.
+ */
+ cv_broadcast(&vsp->vns_barriercv);
+}
+
+/*
+ * This is a data barrier for the stream while it is in fastpath mode. It blocks
+ * and ensures that there is nothing else in the squeue.
+ */
+static void
+vnd_strbarrier(vnd_str_t *vsp)
+{
+ mutex_enter(&vsp->vns_lock);
+ while (vsp->vns_flags & VNS_F_BARRIER)
+ cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
+ vsp->vns_flags |= VNS_F_BARRIER;
+ mutex_exit(&vsp->vns_lock);
+
+ gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk,
+ vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER);
+
+ mutex_enter(&vsp->vns_lock);
+ while (!(vsp->vns_flags & VNS_F_BARRIER_DONE))
+ cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
+ vsp->vns_flags &= ~VNS_F_BARRIER;
+ vsp->vns_flags &= ~VNS_F_BARRIER_DONE;
+ mutex_exit(&vsp->vns_lock);
+
+ /*
+ * We have to broadcast in case anyone is waiting for the barrier
+ * themselves.
+ */
+ cv_broadcast(&vsp->vns_barriercv);
+}
+
+/*
+ * Based on the type of message that we're dealing with, we do one of
+ * several things. If it looks like something we know about, we handle it
+ * in one of our state transition taskq threads. Otherwise, we simply
+ * putnext.
+ */
+static int
+vnd_s_rput(queue_t *q, mblk_t *mp)
+{
+ t_uscalar_t prim;
+ int dispatch = 0;
+ vnd_str_t *vsp = q->q_ptr;
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ if (MBLKL(mp) < sizeof (t_uscalar_t)) {
+ vnd_drop_ctl(vsp, mp, "PROTO message too short");
+ break;
+ }
+
+ prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
+ if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) {
+ vnd_drop_ctl(vsp, mp,
+ "received an unsupported dlpi DATA req");
+ break;
+ }
+
+ /*
+ * Enqueue the entry and fire off a taskq dispatch.
+ */
+ mutex_enter(&vsp->vns_lock);
+ vnd_dlpi_inc_push(vsp, mp);
+ if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) {
+ dispatch = 1;
+ vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ }
+ mutex_exit(&vsp->vns_lock);
+ if (dispatch != 0)
+ taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch,
+ vsp, 0, &vsp->vns_tqe);
+ break;
+ case M_DATA:
+ vnd_drop_in(vsp, mp, "M_DATA via put(9E)");
+ break;
+ default:
+ putnext(vsp->vns_rq, mp);
+ }
+ return (0);
+}
+
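+/*
+ * Handle the VND_STRIOC_ASSOCIATE transparent ioctl. This begins the
+ * streams copyin/copyout dance: we reply here with an M_COPYIN request,
+ * vnd_striocdata() handles the resulting M_IOCDATA and performs the actual
+ * association, and vnd_stroutdata() acks once the result has been copied
+ * back out to the caller.
+ */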
+/* ARGSUSED */
+static void
+vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp)
+{
+ int error;
+ vnd_strioc_t *visp;
+
+ if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE ||
+ iocp->ioc_count != TRANSPARENT) {
+ error = EINVAL;
+ goto nak;
+ }
+
+ /*
+ * All streams ioctls that we support must use kcred as a means to
+ * distinguish that this is a layered open by the kernel as opposed to
+ * one by a user who has done an I_PUSH of the module.
+ */
+ if (iocp->ioc_cr != kcred) {
+ error = EPERM;
+ goto nak;
+ }
+
+ if (mp->b_cont == NULL) {
+ error = EAGAIN;
+ goto nak;
+ }
+
+ visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP);
+ ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t));
+ visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr;
+ visp->vs_state = VSS_COPYIN;
+
+ mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL);
+ qreply(q, mp);
+
+ return;
+
+nak:
+ if (mp->b_cont != NULL) {
+ freemsg(mp->b_cont);
+ mp->b_cont = NULL;
+ }
+
+ iocp->ioc_error = error;
+ mp->b_datap->db_type = M_IOCNAK;
+ iocp->ioc_count = 0;
+ qreply(q, mp);
+}
+
+static void
+vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+ vnd_str_state_t state;
+ struct copyreq *crp;
+ vnd_strioc_associate_t *vss;
+ vnd_dev_t *vdp = NULL;
+ vnd_pnsd_t *nsp = NULL;
+ char iname[2*VND_NAMELEN];
+ zone_t *zone;
+ vnd_strioc_t *visp;
+
+ visp = (vnd_strioc_t *)csp->cp_private;
+
+ /* If it's not ours, it's not our problem */
+ if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
+ if (q->q_next != NULL) {
+ putnext(q, mp);
+ } else {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA");
+ }
+ kmem_free(visp, sizeof (vnd_strioc_t));
+ return;
+ }
+
+ /* The nak is already sent for us */
+ if (csp->cp_rval != 0) {
+ vnd_drop_ctl(vsp, mp, "M_COPYIN failed");
+ kmem_free(visp, sizeof (vnd_strioc_t));
+ return;
+ }
+
+ /* Data is sitting for us in b_cont */
+ if (mp->b_cont == NULL ||
+ MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) {
+ kmem_free(visp, sizeof (vnd_strioc_t));
+ miocnak(q, mp, 0, EINVAL);
+ return;
+ }
+
+ vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr;
+ vdp = vnd_dev_lookup(vss->vsa_minor);
+ if (vdp == NULL) {
+ vss->vsa_errno = VND_E_NODEV;
+ goto nak;
+ }
+
+ nsp = vnd_nsd_lookup(vss->vsa_nsid);
+ if (nsp == NULL) {
+ vss->vsa_errno = VND_E_NONETSTACK;
+ goto nak;
+ }
+
+ mutex_enter(&vsp->vns_lock);
+ if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) {
+ mutex_exit(&vsp->vns_lock);
+ vss->vsa_errno = VND_E_ASSOCIATED;
+ goto nak;
+ }
+
+ vsp->vns_nsd = nsp;
+ vsp->vns_flags &= ~VNS_F_NEED_ZONE;
+ vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ mutex_exit(&vsp->vns_lock);
+
+ taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0,
+ &vsp->vns_tqe);
+
+ /* At this point we need to wait until we have transitioned to ONLINE */
+ mutex_enter(&vsp->vns_lock);
+ while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE)
+ cv_wait(&vsp->vns_stcv, &vsp->vns_lock);
+ state = vsp->vns_state;
+ mutex_exit(&vsp->vns_lock);
+
+ if (state == VNS_S_ZOMBIE) {
+ vss->vsa_errno = vsp->vns_errno;
+ goto nak;
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ mutex_enter(&vsp->vns_lock);
+ VERIFY(vdp->vdd_str == NULL);
+ /*
+ * Now initialize the remaining kstat properties and let's go ahead and
+ * create it.
+ */
+ (void) snprintf(iname, sizeof (iname), "z%d_%d",
+ vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor);
+ vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net",
+ KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
+ if (vsp->vns_kstat == NULL) {
+ vss->vsa_errno = VND_E_KSTATCREATE;
+ mutex_exit(&vsp->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ goto nak;
+ }
+ vdp->vdd_str = vsp;
+ vsp->vns_dev = vdp;
+
+ /*
+ * Now, it's time to do the last thing that can fail, changing out the
+ * input function. After this we know that we can receive data, so we
+ * should make sure that we're ready.
+ */
+ if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) {
+ vss->vsa_errno = VND_E_DIRECTFAIL;
+ vdp->vdd_str = NULL;
+ vsp->vns_dev = NULL;
+ mutex_exit(&vsp->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ goto nak;
+ }
+
+ zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid);
+ ASSERT(zone != NULL);
+ vsp->vns_kstat->ks_data = &vsp->vns_ksdata;
+ /* Account for zone name */
+ vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1;
+ /* Account for eventual link name */
+ vsp->vns_kstat->ks_data_size += VND_NAMELEN;
+ kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name);
+ kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+ vdp->vdd_lname);
+ zone_rele(zone);
+ kstat_install(vsp->vns_kstat);
+
+ mutex_exit(&vsp->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * Note that the vnd_str_t does not keep a permanent hold on the
+ * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what
+ * the netstack goes through to take care of everything.
+ */
+ vss->vsa_errno = VND_E_SUCCESS;
+nak:
+ if (vdp != NULL)
+ vnd_dev_rele(vdp);
+ if (nsp != NULL)
+ vnd_nsd_rele(nsp);
+ /*
+ * Change the copyin request to a copyout. Note that we can't use
+ * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's
+ * okay, as the copyin vs. copyout is basically the same.
+ */
+ DB_TYPE(mp) = M_COPYOUT;
+ visp->vs_state = VSS_COPYOUT;
+ crp = (struct copyreq *)mp->b_rptr;
+ crp->cq_private = (void *)visp;
+ crp->cq_addr = visp->vs_addr;
+ crp->cq_size = sizeof (vnd_strioc_associate_t);
+ qreply(q, mp);
+}
+
+static void
+vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+ ASSERT(csp->cp_private != NULL);
+ kmem_free(csp->cp_private, sizeof (vnd_strioc_t));
+ if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
+ if (q->q_next != NULL) {
+ putnext(q, mp);
+ } else {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA");
+ }
+ return;
+ }
+
+ /* The nak is already sent for us */
+ if (csp->cp_rval != 0) {
+ vnd_drop_ctl(vsp, mp, "M_COPYOUT failed");
+ return;
+ }
+
+ /* Ack and let's be done with it all */
+ miocack(q, mp, 0, 0);
+}
+
+static int
+vnd_s_wput(queue_t *q, mblk_t *mp)
+{
+ vnd_str_t *vsp = q->q_ptr;
+ struct copyresp *crp;
+ vnd_strioc_state_t vstate;
+ vnd_strioc_t *visp;
+
+ switch (DB_TYPE(mp)) {
+ case M_IOCTL:
+ vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr);
+ return (0);
+ case M_IOCDATA:
+ crp = (struct copyresp *)mp->b_rptr;
+ ASSERT(crp->cp_private != NULL);
+ visp = (vnd_strioc_t *)crp->cp_private;
+ vstate = visp->vs_state;
+ ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT);
+ if (vstate == VSS_COPYIN)
+ vnd_striocdata(q, vsp, mp,
+ (struct copyresp *)mp->b_rptr);
+ else
+ vnd_stroutdata(q, vsp, mp,
+ (struct copyresp *)mp->b_rptr);
+ return (0);
+ default:
+ break;
+ }
+ if (q->q_next != NULL)
+ putnext(q, mp);
+ else
+ vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput");
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp)
+{
+ vnd_str_t *vsp;
+ uint_t rand;
+
+ if (q->q_ptr != NULL)
+ return (EINVAL);
+
+ if (!(sflag & MODOPEN))
+ return (ENXIO);
+
+ if (credp != kcred)
+ return (EPERM);
+
+ vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP);
+ bzero(vsp, sizeof (*vsp));
+ mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL);
+ cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL);
+ cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL);
+ vsp->vns_state = VNS_S_INITIAL;
+
+ mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_enter(&vnd_dev_lock);
+ vsp->vns_dq_read.vdq_max = vnd_vdq_default_size;
+ vsp->vns_dq_read.vdq_vns = vsp;
+ vsp->vns_dq_write.vdq_max = vnd_vdq_default_size;
+ vsp->vns_dq_write.vdq_vns = vsp;
+ mutex_exit(&vnd_dev_lock);
+ vsp->vns_rq = q;
+ vsp->vns_wq = WR(q);
+ q->q_ptr = WR(q)->q_ptr = vsp;
+ vsp->vns_flags = VNS_F_NEED_ZONE;
+ vsp->vns_nflush = vnd_flush_nburst;
+ vsp->vns_bsize = vnd_flush_burst_size;
+
+ (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand));
+ vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand);
+
+ /*
+ * We create our kstat and initialize all of its fields now, but we
+ * don't install it until we actually do the zone association so we can
+ * get everything.
+ */
+ kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname",
+ KSTAT_DATA_STRING);
+ kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename",
+ KSTAT_DATA_STRING);
+ kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms,
+ "flowcontrol_100ms", KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s",
+ KSTAT_DATA_UINT64);
+ qprocson(q);
+ /*
+ * Now that we've called qprocson, grab the lower module for making sure
+ * that we don't have any pass through modules.
+ */
+ vsp->vns_lrq = RD(vsp->vns_wq->q_next);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vnd_s_close(queue_t *q, int flag, cred_t *credp)
+{
+ vnd_str_t *vsp;
+ mblk_t *mp;
+
+ VERIFY(WR(q)->q_next != NULL);
+
+ vsp = q->q_ptr;
+ ASSERT(vsp != NULL);
+
+ /*
+ * We need to transition ourselves down. This means that we have a few
+ * important things to do in the process: tearing down our input and
+ * output buffers, making sure we've drained the current squeue, and
+ * disabling the fast path. Before we disable the fast path, we should
+ * make sure the squeue is drained. Because we're in streams close, we
+ * know that no more packets can come into us from userland, but we can
+ * still receive more from below. As such, the following is the exact
+ * order of things that we do:
+ *
+ * 1) Flush the vns_dq_read.
+ * 2) Insert the drain mblk.
+ * 3) When it's been received, tear down the fast path by kicking
+ * off the state machine.
+ * 4) One final flush of both vns_dq_read and vns_dq_write.
+ */
+
+ vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in);
+ vnd_strbarrier(vsp);
+ mutex_enter(&vsp->vns_lock);
+ vsp->vns_state = VNS_S_SHUTTING_DOWN;
+ if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) {
+ vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp,
+ 0, &vsp->vns_tqe);
+ }
+ while (vsp->vns_state != VNS_S_ZOMBIE)
+ cv_wait(&vsp->vns_stcv, &vsp->vns_lock);
+ mutex_exit(&vsp->vns_lock);
+
+ qprocsoff(q);
+ mutex_enter(&vsp->vns_lock);
+ vsp->vns_flags |= VNS_F_CONDEMNED;
+ while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)
+ cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock);
+
+ while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL)
+ vnd_drop_ctl(vsp, mp, "vnd_s_close");
+ mutex_exit(&vsp->vns_lock);
+
+ q->q_ptr = NULL;
+ vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in);
+ vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out);
+ mutex_destroy(&vsp->vns_dq_read.vdq_lock);
+ mutex_destroy(&vsp->vns_dq_write.vdq_lock);
+
+ if (vsp->vns_kstat != NULL)
+ kstat_delete(vsp->vns_kstat);
+ mutex_destroy(&vsp->vns_lock);
+ cv_destroy(&vsp->vns_stcv);
+ cv_destroy(&vsp->vns_barriercv);
+ cv_destroy(&vsp->vns_cancelcv);
+ kmem_cache_free(vnd_str_cache, vsp);
+
+ return (0);
+}
+
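+/*
+ * Transmit a single message block chain via the direct tx function. Returns
+ * NULL when no flow control action is required of the caller; otherwise we
+ * mark the stream flow controlled and return the mac flow control cookie.
+ */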
+static vnd_mac_cookie_t
+vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp)
+{
+ hrtime_t txtime;
+ vnd_mac_cookie_t vc;
+
+ VND_STAT_INC(vsp, vks_opackets, 1);
+ VND_STAT_INC(vsp, vks_obytes, msgsize(mp));
+ DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL,
+ vnd_str_t *, vsp, mblk_t *, mp);
+ /* Actually tx now */
+ txtime = gethrtime();
+ vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl,
+ mp, 0, MAC_DROP_ON_NO_DESC);
+
+ /*
+ * We need to check two different conditions before we immediately set
+ * the flow control flag. The first thing that we need to do is verify
+ * that this is an instance of hard flow control, so to speak. The flow
+ * control callbacks won't always fire in cases where we still get a
+ * cookie returned. The explicit check for flow control will guarantee
+ * us that we'll get a subsequent notification callback.
+ *
+ * The second case comes about because we do not hold the
+ * vnd_str_t`vns_lock across calls to tx, we need to determine if a flow
+ * control notification already came across for us in a different thread
+ * calling vnd_mac_flow_control(). To deal with this, we record a
+ * timestamp every time that we change the flow control state. We grab
+ * txtime here before we transmit because that guarantees that the
+ * hrtime_t of the call to vnd_mac_flow_control() will be after txtime.
+ *
+ * If the flow control notification beat us to the punch, the value of
+ * vns_fcupdate will be larger than the value of txtime, and we should
+ * just record the statistics. However, if we didn't beat it to the
+ * punch (txtime > vns_fcupdate), then we know that it's safe to wait
+ * for a notification.
+ */
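+	/*
+	 * For example, suppose thread A is transmitting here while thread B
+	 * delivers the flow control notification:
+	 *
+	 *	A: txtime = gethrtime();
+	 *	A: vc = vsc_tx_f(...);		(mac indicates flow control)
+	 *	B: vnd_mac_flow_control();	(sets vns_fcupdate > txtime)
+	 *	A: vns_fcupdate - txtime > 0	(just record the statistics)
+	 *
+	 * Had B not yet run, txtime would exceed vns_fcupdate and A would
+	 * instead set VNS_F_FLOW_CONTROLLED and wait for the notification.
+	 */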
+ if (vc != NULL) {
+ hrtime_t diff;
+
+ if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl,
+ vc) == 0)
+ return (NULL);
+ mutex_enter(&vsp->vns_lock);
+ diff = vsp->vns_fcupdate - txtime;
+ if (diff > 0) {
+ mutex_exit(&vsp->vns_lock);
+ vnd_mac_flow_control_stat(vsp, diff);
+ return (NULL);
+ }
+ vsp->vns_flags |= VNS_F_FLOW_CONTROLLED;
+ vsp->vns_caps.vsc_fc_cookie = vc;
+ vsp->vns_fclatch = txtime;
+ vsp->vns_fcupdate = txtime;
+ DTRACE_VND3(flow__blocked, vnd_str_t *, vsp,
+ uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc);
+ mutex_exit(&vsp->vns_lock);
+ }
+
+ return (vc);
+}
+
+/* ARGSUSED */
+static void
+vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
+{
+ mblk_t *mp;
+ int nmps;
+ size_t mptot, nflush, bsize;
+ boolean_t blocked, empty;
+ vnd_data_queue_t *vqp;
+ vnd_str_t *vsp = arg;
+
+ mutex_enter(&vsp->vns_lock);
+ /*
+ * We either enter here via an squeue or via vnd_squeue_tx_append(). In
+ * the former case we need to mark that there is no longer an active
+ * user of the drain block.
+ */
+ if (drain_mp != NULL) {
+ VERIFY(drain_mp == &vsp->vns_drainblk);
+ VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED);
+ vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED;
+ }
+
+ /*
+ * If we're still flow controlled or under a flush barrier, nothing to
+ * do.
+ */
+ if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) {
+ mutex_exit(&vsp->vns_lock);
+ return;
+ }
+
+ nflush = vsp->vns_nflush;
+ bsize = vsp->vns_bsize;
+ mutex_exit(&vsp->vns_lock);
+
+ /*
+ * We're potentially going deep into the networking layer; make sure the
+ * guest can't run concurrently.
+ */
+ ht_begin_unsafe();
+
+ nmps = 0;
+ mptot = 0;
+ blocked = B_FALSE;
+ vqp = &vsp->vns_dq_write;
+ while (nmps < nflush && mptot <= bsize) {
+ mutex_enter(&vqp->vdq_lock);
+ if (vnd_dq_pop(vqp, &mp) == 0) {
+ mutex_exit(&vqp->vdq_lock);
+ break;
+ }
+ mutex_exit(&vqp->vdq_lock);
+
+ nmps++;
+ mptot += msgsize(mp);
+ if (vnd_squeue_tx_one(vsp, mp) != NULL) {
+ blocked = B_TRUE;
+ break;
+ }
+ }
+
+ ht_end_unsafe();
+
+ empty = vnd_dq_is_empty(&vsp->vns_dq_write);
+
+ /*
+ * If the queue is not empty, we're not blocked, and there isn't a drain
+ * scheduled, put it into the squeue with the drain block and
+ * GSQUEUE_FILL.
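+	 * GSQUEUE_FILL (like squeue's SQ_FILL) should merely enqueue the drain
+	 * block for the worker to pick up later rather than processing it
+	 * inline, which lets other squeue work interleave between bursts of
+	 * nflush frames.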
+ */
+ if (blocked == B_FALSE && empty == B_FALSE) {
+ mutex_enter(&vsp->vns_lock);
+ if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) {
+ mblk_t *mp = &vsp->vns_drainblk;
+ vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
+ gsqueue_enter_one(vsp->vns_squeue,
+ mp, vnd_squeue_tx_drain, vsp,
+ GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN);
+ }
+ mutex_exit(&vsp->vns_lock);
+ }
+
+ /*
+ * If we drained some amount of data, we need to signal the data queue.
+ */
+ if (nmps > 0) {
+ cv_broadcast(&vsp->vns_dq_write.vdq_ready);
+ pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT);
+ }
+}
+
+/* ARGSUSED */
+static void
+vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy)
+{
+ vnd_str_t *vsp = arg;
+ vnd_data_queue_t *vqp = &vsp->vns_dq_write;
+ vnd_pnsd_t *nsp = vsp->vns_nsd;
+ size_t len = msgsize(mp);
+
+ /*
+ * Before we append this packet, we should run it through the firewall
+ * rules.
+ */
+ if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
+ nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6,
+ nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out,
+ vnd_drop_out) != 0) {
+ /*
+ * Because we earlier reserved space for this packet and it's
+ * not making the cut, we need to go through and unreserve that
+ * space. Also note that the message block will likely be freed
+ * by the time we return from vnd_hook so we cannot rely on it.
+ */
+ mutex_enter(&vqp->vdq_lock);
+ vnd_dq_unreserve(vqp, len);
+ mutex_exit(&vqp->vdq_lock);
+ return;
+ }
+
+ /*
+ * We earlier reserved space for this packet. So for now simply append
+ * it and call drain. We know that no other drain can be going on right
+ * now thanks to the squeue.
+ */
+ mutex_enter(&vqp->vdq_lock);
+ (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic);
+ mutex_exit(&vqp->vdq_lock);
+ vnd_squeue_tx_drain(vsp, NULL, NULL, NULL);
+}
+
+/*
+ * We need to see if this is a valid name of sorts for us. That means a few
+ * things. First off, we can't assume that what we've been given has actually
+ * been null terminated. More importantly, it must be a valid name as far as
+ * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We
+ * further constrain ourselves to simply alphanumeric characters and a few
+ * additional ones: ':', '-', and '_'.
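+ *
+ * For example, names like "net0", "vnd:1", or "my-link_2" pass this check,
+ * while "a@b", "a/b", "a b", and a buffer with no null terminator are all
+ * rejected.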
+ */
+static int
+vnd_validate_name(const char *buf, size_t buflen)
+{
+ int i, len;
+
+ /* First make sure a null terminator exists */
+ for (i = 0; i < buflen; i++)
+ if (buf[i] == '\0')
+ break;
+ len = i;
+ if (i == 0 || i == buflen)
+ return (0);
+
+ for (i = 0; i < len; i++)
+ if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' &&
+ buf[i] != '_')
+ return (0);
+
+ return (1);
+}
+
+static int
+vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag)
+{
+ vnd_ioc_attach_t via;
+ vnd_strioc_associate_t vss;
+ vnd_pnsd_t *nsp;
+ zone_t *zonep;
+ zoneid_t zid;
+ char buf[2*VND_NAMELEN];
+ int ret, rp;
+
+ if (secpolicy_net_config(credp, B_FALSE) != 0)
+ return (EPERM);
+
+ if (secpolicy_net_rawaccess(credp) != 0)
+ return (EPERM);
+
+ if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0)
+ return (EFAULT);
+ via.via_errno = VND_E_SUCCESS;
+
+ if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) {
+ via.via_errno = VND_E_BADNAME;
+ ret = EIO;
+ goto errcopyout;
+ }
+
+ /*
+ * Only the global zone can request to create a device in a different
+ * zone.
+ */
+ zid = crgetzoneid(credp);
+ if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 &&
+ zid != via.via_zoneid) {
+ via.via_errno = VND_E_PERM;
+ ret = EIO;
+ goto errcopyout;
+ }
+
+ if (via.via_zoneid == -1)
+ via.via_zoneid = zid;
+
+	/*
+	 * Establish the name we'll use. We want to be extra paranoid about the
+	 * device we're opening, so check it now.
+	 */
+ if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) {
+ zonep = zone_find_by_id(via.via_zoneid);
+ if (zonep == NULL) {
+ via.via_errno = VND_E_NOZONE;
+ ret = EIO;
+ goto errcopyout;
+ }
+ if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name,
+ via.via_name) >= sizeof (buf)) {
+ zone_rele(zonep);
+ via.via_errno = VND_E_BADNAME;
+ ret = EIO;
+ goto errcopyout;
+ }
+ (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s",
+ zonep->zone_name, via.via_name);
+ zone_rele(zonep);
+ zonep = NULL;
+ } else {
+ if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >=
+ sizeof (buf)) {
+ via.via_errno = VND_E_BADNAME;
+ ret = EIO;
+ goto errcopyout;
+ }
+ (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name);
+ }
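+
+	/*
+	 * As an example, attaching "net0" within our own zone produces the
+	 * path "/dev/net/net0", while the global zone attaching on behalf of
+	 * a zone named "web01" produces "/dev/net/zone/web01/net0" (the names
+	 * here are illustrative only).
+	 */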
+
+ /*
+ * If our zone is dying then the netstack will have been removed from
+ * this list.
+ */
+ nsp = vnd_nsd_lookup_by_zid(via.via_zoneid);
+ if (nsp == NULL) {
+ via.via_errno = VND_E_NOZONE;
+ ret = EIO;
+ goto errcopyout;
+ }
+
+ /*
+	 * Note that we mark the attach as in flight even though we haven't
+	 * actually finished the process of attaching the ldi handle.
+ */
+ mutex_enter(&vdp->vdd_lock);
+ if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_nsd_rele(nsp);
+ via.via_errno = VND_E_ATTACHED;
+ ret = EIO;
+ goto errcopyout;
+ }
+ vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT;
+ ASSERT(vdp->vdd_cr == NULL);
+ crhold(credp);
+ vdp->vdd_cr = credp;
+ ASSERT(vdp->vdd_nsd == NULL);
+ vdp->vdd_nsd = nsp;
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * Place an additional hold on the vnd_pnsd_t as we go through and do
+ * all of the rest of our work. This will be the hold that we keep for
+ * as long as this thing is attached.
+ */
+ vnd_nsd_ref(nsp);
+
+ ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr,
+ &vdp->vdd_ldih, vdp->vdd_ldiid);
+ if (ret != 0) {
+ if (ret == ENODEV)
+ via.via_errno = VND_E_NODATALINK;
+ goto err;
+ }
+
+ /*
+	 * Unfortunately the I_PUSH interface doesn't give us a way to detect
+	 * whether or not we're coming in from a layered device. We really want
+	 * to make sure that a normal user can't push on our streams module.
+	 * Currently the only idea I have for this is to make sure that the
+	 * credp is kcred, which is really terrible.
+ */
+ ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL,
+ kcred, &rp);
+ if (ret != 0) {
+ rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ VERIFY(rp == 0);
+ via.via_errno = VND_E_STRINIT;
+ ret = EIO;
+ goto err;
+ }
+
+ vss.vsa_minor = vdp->vdd_minor;
+ vss.vsa_nsid = nsp->vpnd_nsid;
+
+ ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss,
+ FKIOCTL, kcred, &rp);
+ if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) {
+ rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ VERIFY(rp == 0);
+ if (ret == 0) {
+ via.via_errno = vss.vsa_errno;
+ ret = EIO;
+ }
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_nsd->vpnd_lock);
+
+	/*
+	 * There's a chance that our netstack was condemned while we've had a
+	 * hold on it. As such we need to check for that and, if so, error out.
+	 */
+ if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) {
+ mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+ rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ VERIFY(rp == 0);
+ ret = EIO;
+ via.via_errno = VND_E_NOZONE;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_str != NULL);
+ vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT;
+ vdp->vdd_flags |= VND_D_ATTACHED;
+ (void) strlcpy(vdp->vdd_datalink, via.via_name,
+ sizeof (vdp->vdd_datalink));
+ list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp);
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+ vnd_nsd_rele(nsp);
+
+ return (0);
+
+err:
+ mutex_enter(&vdp->vdd_lock);
+ vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT;
+ crfree(vdp->vdd_cr);
+ vdp->vdd_cr = NULL;
+ vdp->vdd_nsd = NULL;
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * We have two holds to drop here. One for our original reference and
+ * one for the hold this operation would have represented.
+ */
+ vnd_nsd_rele(nsp);
+ vnd_nsd_rele(nsp);
+errcopyout:
+ if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0)
+ ret = EFAULT;
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag)
+{
+ int ret = 0;
+ vnd_ioc_link_t vil;
+ char mname[2*VND_NAMELEN];
+ char **c;
+ vnd_dev_t *v;
+ zoneid_t zid;
+
+	/* Not just anyone can link something */
+ if (secpolicy_net_config(credp, B_FALSE) != 0)
+ return (EPERM);
+
+ if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0)
+ return (EFAULT);
+
+ if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) {
+ ret = EIO;
+ vil.vil_errno = VND_E_BADNAME;
+ goto errcopyout;
+ }
+
+ c = vnd_reserved_names;
+ while (*c != NULL) {
+ if (strcmp(vil.vil_name, *c) == 0) {
+ ret = EIO;
+ vil.vil_errno = VND_E_BADNAME;
+ goto errcopyout;
+ }
+ c++;
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_NOTATTACHED;
+ goto errcopyout;
+ }
+
+ if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_NOZONE;
+ goto errcopyout;
+ }
+
+ if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_LINKED;
+ goto errcopyout;
+ }
+ vdp->vdd_flags |= VND_D_LINK_INFLIGHT;
+ zid = vdp->vdd_nsd->vpnd_zid;
+ mutex_exit(&vdp->vdd_lock);
+
+ if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >=
+ sizeof (mname)) {
+ ret = EIO;
+ vil.vil_errno = VND_E_BADNAME;
+ goto errcopyout;
+ }
+
+ mutex_enter(&vnd_dev_lock);
+ for (v = list_head(&vnd_dev_list); v != NULL;
+ v = list_next(&vnd_dev_list, v)) {
+ if (!(v->vdd_flags & VND_D_LINKED))
+ continue;
+
+ if (v->vdd_nsd->vpnd_zid == zid &&
+ strcmp(v->vdd_lname, vil.vil_name) == 0) {
+ mutex_exit(&vnd_dev_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_LINKEXISTS;
+ goto error;
+ }
+ }
+
+ /*
+	 * We set the name and mark ourselves linked while holding the list
+	 * lock to ensure that no other user can mistakenly find our name.
+ */
+ (void) snprintf(mname, sizeof (mname), "z%d:%s", zid,
+ vil.vil_name);
+ mutex_enter(&vdp->vdd_lock);
+
+ /*
+ * Because we dropped our lock, we need to double check whether or not
+	 * the zone was marked as dying while we were here. If it wasn't, then
+	 * it's safe for us to link it in.
+ */
+ if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_NOZONE;
+ goto error;
+ }
+
+ (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname));
+ if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor,
+ DDI_PSEUDO, 0) != DDI_SUCCESS) {
+ ret = EIO;
+ vil.vil_errno = VND_E_MINORNODE;
+ } else {
+ vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT;
+ vdp->vdd_flags |= VND_D_LINKED;
+ kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+ vdp->vdd_lname);
+ ret = 0;
+ }
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+
+ if (ret == 0) {
+ /*
+ * Add a reference to represent that this device is linked into
+ * the file system name space to ensure that it doesn't
+ * disappear.
+ */
+ vnd_dev_ref(vdp);
+ return (0);
+ }
+
+error:
+ mutex_enter(&vdp->vdd_lock);
+ vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT;
+ vdp->vdd_lname[0] = '\0';
+ mutex_exit(&vdp->vdd_lock);
+
+errcopyout:
+ if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0)
+ ret = EFAULT;
+ return (ret);
+}
+
+/*
+ * Common unlink function. This is used both from the ioctl path and from the
+ * netstack shutdown path. The caller is required to hold the mutex on the
+ * vnd_dev_t; this function releases it on the caller's behalf. The only thing
+ * the caller is allowed to do afterward is to potentially rele the vnd_dev_t
+ * if they have their own hold. Note that only the ioctl path has its own
+ * hold.
+ */
+static void
+vnd_dev_unlink(vnd_dev_t *vdp)
+{
+ char mname[2*VND_NAMELEN];
+
+ ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+ (void) snprintf(mname, sizeof (mname), "z%d:%s",
+ vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname);
+ ddi_remove_minor_node(vnd_dip, mname);
+ vdp->vdd_lname[0] = '\0';
+ vdp->vdd_flags &= ~VND_D_LINKED;
+ kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+ vdp->vdd_lname);
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * This rele corresponds to the reference that we took in
+ * vnd_ioctl_link.
+ */
+ vnd_dev_rele(vdp);
+}
+
+static int
+vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag)
+{
+ int ret;
+ zoneid_t zid;
+ vnd_ioc_unlink_t viu;
+
+	/* Not just anyone can unlink something */
+ if (secpolicy_net_config(credp, B_FALSE) != 0)
+ return (EPERM);
+
+ zid = crgetzoneid(credp);
+
+ if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0)
+ return (EFAULT);
+
+ viu.viu_errno = VND_E_SUCCESS;
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_LINKED)) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ viu.viu_errno = VND_E_NOTLINKED;
+ goto err;
+ }
+ VERIFY(vdp->vdd_flags & VND_D_ATTACHED);
+
+ if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ viu.viu_errno = VND_E_PERM;
+ goto err;
+ }
+
+ /* vnd_dev_unlink releases the vdp mutex for us */
+ vnd_dev_unlink(vdp);
+ ret = 0;
+err:
+ if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ int ret;
+ vnd_ioc_buf_t vib;
+
+ if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ mutex_enter(&vnd_dev_lock);
+ if (vib.vib_size > vnd_vdq_hard_max) {
+ mutex_exit(&vnd_dev_lock);
+ vib.vib_errno = VND_E_BUFTOOBIG;
+ ret = EIO;
+ goto err;
+ }
+ mutex_exit(&vnd_dev_lock);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_NOTATTACHED;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ if (vib.vib_size < vdp->vdd_str->vns_minwrite) {
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_BUFTOOSMALL;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ vdp->vdd_str->vns_dq_read.vdq_max = (size_t)vib.vib_size;
+ mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ mutex_exit(&vdp->vdd_lock);
+ ret = 0;
+
+err:
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ int ret;
+ vnd_ioc_buf_t vib;
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_NOTATTACHED;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max;
+ mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ mutex_exit(&vdp->vdd_lock);
+ ret = 0;
+
+err:
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ vnd_ioc_buf_t vib;
+
+ mutex_enter(&vnd_dev_lock);
+ vib.vib_size = vnd_vdq_hard_max;
+ mutex_exit(&vnd_dev_lock);
+
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ int ret;
+ vnd_ioc_buf_t vib;
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_NOTATTACHED;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max;
+ mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ mutex_exit(&vdp->vdd_lock);
+ ret = 0;
+
+err:
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ int ret;
+ vnd_ioc_buf_t vib;
+
+ if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ mutex_enter(&vnd_dev_lock);
+ if (vib.vib_size > vnd_vdq_hard_max) {
+ mutex_exit(&vnd_dev_lock);
+ vib.vib_errno = VND_E_BUFTOOBIG;
+ ret = EIO;
+ goto err;
+ }
+ mutex_exit(&vnd_dev_lock);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_NOTATTACHED;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ if (vib.vib_size < vdp->vdd_str->vns_minwrite) {
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_BUFTOOSMALL;
+ ret = EIO;
+ goto err;
+ }
+ mutex_exit(&vdp->vdd_str->vns_lock);
+
+ mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ vdp->vdd_str->vns_dq_write.vdq_max = (size_t)vib.vib_size;
+ mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ mutex_exit(&vdp->vdd_lock);
+ ret = 0;
+
+err:
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min)
+{
+ vnd_ioc_buf_t vib;
+
+ vib.vib_errno = 0;
+ mutex_enter(&vdp->vdd_lock);
+ if (vdp->vdd_flags & VND_D_ATTACHED) {
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ if (min == B_TRUE)
+ vib.vib_size = vdp->vdd_str->vns_minwrite;
+ else
+ vib.vib_size = vdp->vdd_str->vns_maxwrite;
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ } else {
+ vib.vib_errno = VND_E_NOTATTACHED;
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode)
+{
+ int ret, nonblock, nwrite;
+ frameio_t *fio;
+ vnd_data_queue_t *vqp;
+ mblk_t *mp;
+
+ fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+ if (fio == NULL)
+ return (EAGAIN);
+
+ ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr,
+ mode);
+ if (ret != 0) {
+ frameio_free(fio);
+ return (ret);
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ frameio_free(fio);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ nonblock = mode & (FNONBLOCK | FNDELAY);
+
+ vqp = &vdp->vdd_str->vns_dq_read;
+ mutex_enter(&vqp->vdq_lock);
+
+ /* Check empty case */
+ if (vqp->vdq_cur == 0) {
+ if (nonblock != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EWOULDBLOCK);
+ }
+ while (vqp->vdq_cur == 0) {
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EINTR);
+ }
+ }
+ }
+
+ ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head,
+ &nwrite, mode & FKIOCTL);
+ if (ret != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (ret);
+ }
+
+ ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode);
+ if (ret != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (ret);
+ }
+
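+	/*
+	 * Only now that the header has made it back out to userland do we pop
+	 * and free the frames we consumed; had the copyout above failed, the
+	 * frames would have remained queued for a later retry.
+	 */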
+ while (nwrite > 0) {
+ (void) vnd_dq_pop(vqp, &mp);
+ freemsg(mp);
+ nwrite--;
+ }
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+
+ return (0);
+}
+
+static int
+vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode)
+{
+ frameio_t *fio;
+ int ret, nonblock, nframes, i, nread;
+ size_t maxwrite, minwrite, total, flen;
+ mblk_t *mp_chain, *mp, *nmp;
+ vnd_data_queue_t *vqp;
+
+ fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+ if (fio == NULL)
+ return (EAGAIN);
+
+ ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode);
+ if (ret != 0) {
+ frameio_free(fio);
+ return (ret);
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ frameio_free(fio);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ nonblock = mode & (FNONBLOCK | FNDELAY);
+
+ /*
+ * Make sure no single frame is larger than we can accept.
+ */
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ minwrite = vdp->vdd_str->vns_minwrite;
+ maxwrite = vdp->vdd_str->vns_maxwrite;
+ mutex_exit(&vdp->vdd_str->vns_lock);
+
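+	/*
+	 * fio_nvecs is the total number of I/O vectors and fio_nvpf is the
+	 * number of vectors per frame; for example, 8 vectors at 2 per frame
+	 * describe 4 frames, with frame i starting at fio_vecs[i * fio_nvpf].
+	 */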
+	nframes = fio->fio_nvecs / fio->fio_nvpf;
+ total = 0;
+ for (i = 0; i < nframes; i++) {
+ flen = frameio_frame_length(fio,
+ &fio->fio_vecs[i*fio->fio_nvpf]);
+ if (flen < minwrite || flen > maxwrite) {
+ frameio_free(fio);
+ return (ERANGE);
+ }
+ total += flen;
+ }
+
+ vqp = &vdp->vdd_str->vns_dq_write;
+ mutex_enter(&vqp->vdq_lock);
+ while (vnd_dq_reserve(vqp, total) == 0) {
+ if (nonblock != 0) {
+ frameio_free(fio);
+ mutex_exit(&vqp->vdq_lock);
+ return (EAGAIN);
+ }
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EINTR);
+ }
+ }
+ mutex_exit(&vqp->vdq_lock);
+
+ /*
+ * We've reserved our space, let's copyin and go from here.
+ */
+ ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL);
+	if (ret != 0) {
+		frameio_free(fio);
+		mutex_enter(&vqp->vdq_lock);
+		vnd_dq_unreserve(vqp, total);
+		mutex_exit(&vqp->vdq_lock);
+		cv_broadcast(&vqp->vdq_ready);
+		pollwakeup(&vdp->vdd_ph, POLLOUT);
+		return (ret);
+	}
+
+ for (mp = mp_chain; mp != NULL; mp = nmp) {
+ nmp = mp->b_next;
+ mp->b_next = NULL;
+ gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+ vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+ VND_SQUEUE_TAG_VND_WRITE);
+ }
+
+ /*
+ * Update the frameio structure to indicate that we wrote those frames.
+ */
+ frameio_mark_consumed(fio, nread);
+ ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode);
+ frameio_free(fio);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode)
+{
+ const char *link;
+ uint32_t vers = 1;
+ ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+ /*
+ * Copy all of the members out to userland.
+ */
+ if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (vdp->vdd_flags & VND_D_LINKED)
+ link = vdp->vdd_lname;
+ else
+ link = "<anonymous>";
+ if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink,
+ sizeof (arg->vii_datalink), mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone,
+ sizeof (zoneid_t), mode & FKIOCTL) != 0)
+ return (EFAULT);
+ return (0);
+}
+
+static int
+vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode)
+{
+ vnd_ioc_list_t vl;
+ vnd_ioc_list32_t vl32;
+ zoneid_t zid;
+ vnd_dev_t *vdp;
+ vnd_ioc_info_t *vip;
+ int found, cancopy, ret;
+
+ if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
+ if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ vl.vl_nents = vl32.vl_nents;
+ vl.vl_actents = vl32.vl_actents;
+ vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents;
+ } else {
+ if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ }
+
+ cancopy = vl.vl_nents;
+ vip = vl.vl_ents;
+ found = 0;
+ zid = crgetzoneid(credp);
+ mutex_enter(&vnd_dev_lock);
+ for (vdp = list_head(&vnd_dev_list); vdp != NULL;
+ vdp = list_next(&vnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if (vdp->vdd_flags & VND_D_ATTACHED &&
+ !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) &&
+ (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) {
+ found++;
+ if (cancopy > 0) {
+ ret = vnd_ioctl_list_copy_info(vdp, vip, mode);
+ if (ret != 0) {
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+ return (ret);
+ }
+ cancopy--;
+ vip++;
+ }
+ }
+ mutex_exit(&vdp->vdd_lock);
+ }
+ mutex_exit(&vnd_dev_lock);
+
+ if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents,
+ sizeof (uint_t), mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ return (0);
+}
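+
+/*
+ * As a sketch of the expected usage (illustrative only), a consumer calls
+ * VND_IOC_LIST twice: first with vl_nents set to zero to learn vl_actents,
+ * then again with a buffer sized to hold that many entries:
+ *
+ *	vnd_ioc_list_t vl;
+ *
+ *	bzero(&vl, sizeof (vl));
+ *	(void) ioctl(fd, VND_IOC_LIST, &vl);
+ *	vl.vl_ents = calloc(vl.vl_actents, sizeof (vnd_ioc_info_t));
+ *	vl.vl_nents = vl.vl_actents;
+ *	(void) ioctl(fd, VND_IOC_LIST, &vl);
+ */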
+
+/* ARGSUSED */
+static int
+vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rvalp)
+{
+ int ret;
+ minor_t m;
+ vnd_dev_t *vdp;
+
+ m = getminor(dev);
+ ASSERT(m != 0);
+
+ /*
+ * Make sure no one has come in on an ioctl from the strioc case.
+ */
+ if ((cmd & VND_STRIOC) == VND_STRIOC)
+ return (ENOTTY);
+
+	/*
+	 * As with close, it seems that if this minor isn't found, it indicates
+	 * a programmer error somehow.
+	 */
+ vdp = vnd_dev_lookup(m);
+ if (vdp == NULL)
+ return (ENXIO);
+
+ switch (cmd) {
+ case VND_IOC_ATTACH:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_attach(vdp, arg, credp, mode);
+ break;
+ case VND_IOC_LINK:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_link(vdp, arg, credp, mode);
+ break;
+ case VND_IOC_UNLINK:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_unlink(vdp, arg, credp, mode);
+ break;
+ case VND_IOC_GETRXBUF:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_getrxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_SETRXBUF:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_setrxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_GETTXBUF:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_gettxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_SETTXBUF:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_settxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_GETMAXBUF:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ if (crgetzoneid(credp) != GLOBAL_ZONEID) {
+ ret = EPERM;
+ break;
+ }
+ ret = vnd_ioctl_getmaxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_GETMINTU:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE);
+ break;
+ case VND_IOC_GETMAXTU:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE);
+ break;
+ case VND_IOC_FRAMEIO_READ:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_frameio_read(vdp, arg, mode);
+ break;
+ case VND_IOC_FRAMEIO_WRITE:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_frameio_write(vdp, arg, mode);
+ break;
+ case VND_IOC_LIST:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_list(arg, credp, mode);
+ break;
+ default:
+ ret = ENOTTY;
+ break;
+ }
+
+ vnd_dev_rele(vdp);
+ return (ret);
+}
+
+static int
+vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+ vnd_dev_t *vdp;
+ minor_t m;
+ zoneid_t zid;
+
+ if (flag & (FEXCL | FNDELAY))
+ return (ENOTSUP);
+
+ if (otyp & OTYP_BLK)
+ return (ENOTSUP);
+
+ zid = crgetzoneid(credp);
+ m = getminor(*devp);
+
+ /*
+ * If we have an open of a non-zero instance then we need to look that
+ * up in our list of entries.
+ */
+ if (m != 0) {
+
+ /*
+ * We don't check for rawaccess globally as a user could be
+ * doing a list ioctl on the control node which doesn't require
+ * this privilege.
+ */
+ if (secpolicy_net_rawaccess(credp) != 0)
+ return (EPERM);
+
+ vdp = vnd_dev_lookup(m);
+ if (vdp == NULL)
+ return (ENOENT);
+
+ /*
+ * We need to check to make sure that the user is allowed to
+		 * open this node. At this point it should be a linked handle,
+		 * as that's all we're allowed to access.
+ */
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_LINKED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENOENT);
+ }
+
+ if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENOENT);
+ }
+
+ if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENOENT);
+ }
+
+ if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (EBUSY);
+ }
+
+ if (!(vdp->vdd_flags & VND_D_OPENED)) {
+ vdp->vdd_flags |= VND_D_OPENED;
+ vdp->vdd_ref++;
+ DTRACE_VND_REFINC(vdp);
+ }
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+
+ return (0);
+ }
+
+ if (flag & FEXCL)
+ return (ENOTSUP);
+
+ /*
+	 * We need to clone ourselves and set up a new state.
+ */
+ vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP);
+ bzero(vdp, sizeof (vnd_dev_t));
+
+ if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) {
+ kmem_cache_free(vnd_dev_cache, vdp);
+ return (EINVAL);
+ }
+
+ vdp->vdd_minor = id_alloc(vnd_minors);
+ mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL);
+ list_link_init(&vdp->vdd_link);
+ vdp->vdd_ref = 1;
+ *devp = makedevice(getmajor(*devp), vdp->vdd_minor);
+ vdp->vdd_devid = *devp;
+ DTRACE_VND_REFINC(vdp);
+ vdp->vdd_flags |= VND_D_OPENED;
+
+ mutex_enter(&vnd_dev_lock);
+ list_insert_head(&vnd_dev_list, vdp);
+ mutex_exit(&vnd_dev_lock);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vnd_close(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+ minor_t m;
+ vnd_dev_t *vdp;
+
+ m = getminor(dev);
+ if (m == 0)
+ return (ENXIO);
+
+ vdp = vnd_dev_lookup(m);
+ if (vdp == NULL)
+ return (ENXIO);
+
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_flags & VND_D_OPENED);
+ vdp->vdd_flags &= ~VND_D_OPENED;
+ mutex_exit(&vdp->vdd_lock);
+
+ /* Remove the hold from the previous open. */
+ vnd_dev_rele(vdp);
+
+ /* And now from lookup */
+ vnd_dev_rele(vdp);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vnd_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int nonblock, error = 0;
+ size_t mpsize;
+ vnd_dev_t *vdp;
+ vnd_data_queue_t *vqp;
+ mblk_t *mp = NULL;
+ offset_t u_loffset;
+
+	/*
+	 * If we have more than one iovec we refuse to do anything; vectored
+	 * I/O is what frameio is for.
+	 */
+ if (uiop->uio_iovcnt > 1)
+ return (EINVAL);
+
+ vdp = vnd_dev_lookup(getminor(dev));
+ if (vdp == NULL)
+ return (ENXIO);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+ nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY);
+
+ vqp = &vdp->vdd_str->vns_dq_read;
+ mutex_enter(&vqp->vdq_lock);
+
+ /* Check empty case */
+ if (vqp->vdq_cur == 0) {
+ if (nonblock != 0) {
+ error = EWOULDBLOCK;
+ goto err;
+ }
+ while (vqp->vdq_cur == 0) {
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ error = EINTR;
+ goto err;
+ }
+ }
+ }
+
+ /* Ensure our buffer is big enough */
+ mp = vqp->vdq_head;
+ ASSERT(mp != NULL);
+ mpsize = msgsize(mp);
+ if (mpsize > uiop->uio_resid) {
+ error = EOVERFLOW;
+ goto err;
+ }
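+	/*
+	 * Reads are thus all or nothing: a frame that does not fit in the
+	 * caller's buffer is left on the queue and the read fails with
+	 * EOVERFLOW rather than returning a truncated frame.
+	 */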
+
+ u_loffset = uiop->uio_loffset;
+ while (mp != NULL) {
+ if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) {
+ error = EFAULT;
+ uiop->uio_loffset = u_loffset;
+ mp = NULL;
+ goto err;
+ }
+ mpsize -= MBLKL(mp);
+ mp = mp->b_cont;
+ }
+ ASSERT(mpsize == 0);
+ (void) vnd_dq_pop(vqp, &mp);
+ freemsg(mp);
+err:
+ mutex_exit(&vqp->vdq_lock);
+ vnd_dev_rele(vdp);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+vnd_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int nonblock, error;
+ vnd_dev_t *vdp;
+ mblk_t *mp;
+ ssize_t iosize, origsize;
+ vnd_data_queue_t *vqp;
+
+ if (uiop->uio_iovcnt > 1)
+ return (EINVAL);
+
+ vdp = vnd_dev_lookup(getminor(dev));
+ if (vdp == NULL)
+ return (ENXIO);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+ nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY);
+
+	VERIFY(vdp->vdd_str != NULL);
+	mutex_enter(&vdp->vdd_str->vns_lock);
+	if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite ||
+	    uiop->uio_resid < vdp->vdd_str->vns_minwrite) {
+		mutex_exit(&vdp->vdd_str->vns_lock);
+		vnd_dev_rele(vdp);
+		return (ERANGE);
+	}
+	mutex_exit(&vdp->vdd_str->vns_lock);
+
+ /*
+ * Reserve space in the data queue if we can. If we can't, block or
+ * return EAGAIN. If we can, go and squeue_enter.
+ */
+ vqp = &vdp->vdd_str->vns_dq_write;
+ mutex_enter(&vqp->vdq_lock);
+ while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) {
+ if (nonblock != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ vnd_dev_rele(vdp);
+ return (EAGAIN);
+ }
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ vnd_dev_rele(vdp);
+ return (EINTR);
+ }
+ }
+ mutex_exit(&vqp->vdq_lock);
+
+ /*
+	 * Now that we've reserved the space, try to allocate kernel memory for
+	 * the block and copy it in. To take care of all this we use the
+	 * strmakedata subroutine for now.
+ */
+ origsize = iosize = uiop->uio_resid;
+ error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0,
+ &mp);
+
+ /*
+ * strmakedata() will return an error or it may only consume a portion
+ * of the data.
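+	 * In either case we treat the whole write as having failed: we
+	 * unreserve the space taken above and return ENOSR rather than
+	 * transmitting a partial frame.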
+ */
+	if (error != 0 || uiop->uio_resid != 0) {
+		mutex_enter(&vqp->vdq_lock);
+		vnd_dq_unreserve(vqp, origsize);
+		mutex_exit(&vqp->vdq_lock);
+		cv_broadcast(&vqp->vdq_ready);
+		pollwakeup(&vdp->vdd_ph, POLLOUT);
+		vnd_dev_rele(vdp);
+		return (ENOSR);
+	}
+
+ gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+ vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+ VND_SQUEUE_TAG_VND_WRITE);
+
+ vnd_dev_rele(vdp);
+ return (0);
+}
+
+static int
+vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ short ready = 0;
+ vnd_dev_t *vdp;
+ vnd_data_queue_t *vqp;
+
+ vdp = vnd_dev_lookup(getminor(dev));
+ if (vdp == NULL)
+ return (ENXIO);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ if ((events & POLLIN) || (events & POLLRDNORM)) {
+ vqp = &vdp->vdd_str->vns_dq_read;
+ mutex_enter(&vqp->vdq_lock);
+ if (vqp->vdq_head != NULL)
+ ready |= events & (POLLIN | POLLRDNORM);
+ mutex_exit(&vqp->vdq_lock);
+ }
+
+ if (events & POLLOUT) {
+ vqp = &vdp->vdd_str->vns_dq_write;
+ mutex_enter(&vqp->vdq_lock);
+ if (vqp->vdq_cur != vqp->vdq_max)
+ ready |= POLLOUT;
+ mutex_exit(&vqp->vdq_lock);
+ }
+
+ if ((ready == 0 && !anyyet) || (events & POLLET)) {
+ *phpp = &vdp->vdd_ph;
+ }
+ *reventsp = ready;
+ vnd_dev_rele(vdp);
+ return (0);
+}
+
+/* ARGSUSED */
+static void *
+vnd_stack_init(netstackid_t stackid, netstack_t *ns)
+{
+ vnd_pnsd_t *nsp;
+
+ nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP);
+ bzero(nsp, sizeof (*nsp));
+ nsp->vpnd_nsid = stackid;
+ nsp->vpnd_zid = netstackid_to_zoneid(stackid);
+ nsp->vpnd_flags = 0;
+ mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t),
+ offsetof(vnd_dev_t, vdd_nslink));
+ if (vnd_netinfo_init(nsp) == 0)
+ nsp->vpnd_hooked = B_TRUE;
+
+ mutex_enter(&vnd_dev_lock);
+ list_insert_tail(&vnd_nsd_list, nsp);
+ mutex_exit(&vnd_dev_lock);
+
+ return (nsp);
+}
+
+/* ARGSUSED */
+static void
+vnd_stack_shutdown(netstackid_t stackid, void *arg)
+{
+ vnd_pnsd_t *nsp = arg;
+ vnd_dev_t *vdp;
+
+ ASSERT(nsp != NULL);
+ /*
+	 * After shutdown no one should be able to find their way to this
+ * netstack again.
+ */
+ mutex_enter(&vnd_dev_lock);
+ list_remove(&vnd_nsd_list, nsp);
+ mutex_exit(&vnd_dev_lock);
+
+ /*
+ * Make sure hooks know that they're going away.
+ */
+ if (nsp->vpnd_hooked == B_TRUE)
+ vnd_netinfo_shutdown(nsp);
+
+	/*
+	 * Now we need to go through and notify each device that its zone is in
+	 * the teardown phase. See the big theory statement section on vnd,
+	 * zones, netstacks, and sdev for more information about this.
+	 */
+ mutex_enter(&nsp->vpnd_lock);
+ nsp->vpnd_flags |= VND_NS_CONDEMNED;
+ for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+ vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_CONDEMNED))
+ vdp->vdd_flags |= VND_D_ZONE_DYING;
+ mutex_exit(&vdp->vdd_lock);
+ }
+ mutex_exit(&nsp->vpnd_lock);
+
+ /*
+ * Next we remove all the links as we know nothing new can be added to
+	 * the list and that none of the extant devices can obtain additional
+ * links.
+ */
+restart:
+ mutex_enter(&nsp->vpnd_lock);
+ for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+ vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if ((vdp->vdd_flags & VND_D_CONDEMNED) ||
+ !(vdp->vdd_flags & VND_D_LINKED)) {
+ mutex_exit(&vdp->vdd_lock);
+ continue;
+ }
+
+ /*
+ * We drop our lock here and restart afterwards. Note that as
+ * part of unlinking we end up doing a rele of the vnd_dev_t. If
+		 * this is the final hold on the vnd_dev_t then it might try and
+		 * remove itself. Our locking rules require that we not hold any
+		 * locks when we call any of the rele functions.
+ *
+ * Note that the unlink function requires holders to call into
+ * it with the vnd_dev_t->vdd_lock held and will take care of it
+ * for us. Because we don't have a hold on it, we're done at
+ * this point.
+ */
+ mutex_exit(&nsp->vpnd_lock);
+ /* Forcibly unlink */
+ vnd_dev_unlink(vdp);
+ goto restart;
+ }
+ mutex_exit(&nsp->vpnd_lock);
+}
+
+/* ARGSUSED */
+static void
+vnd_stack_destroy(netstackid_t stackid, void *arg)
+{
+ vnd_pnsd_t *nsp = arg;
+
+ ASSERT(nsp != NULL);
+
+ /*
+	 * Now that we've unlinked everything we just have to wait for it all
+	 * to finish exiting. Since it's no longer the kernel itself that's
+	 * doing this, we just need to wait for our reference count to
+ * equal zero and then we're free. If the global zone is holding open a
+ * reference to a vnd device for another zone, that's bad, but there's
+ * nothing much we can do. See the section on 'vnd, zones, netstacks' in
+ * the big theory statement for more information.
+ */
+ mutex_enter(&nsp->vpnd_lock);
+ while (nsp->vpnd_ref != 0)
+ cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock);
+ mutex_exit(&nsp->vpnd_lock);
+
+	/*
+	 * During shutdown we removed ourselves from the list, and now we have
+	 * no more references, so we can safely say that there is nothing left
+	 * and destroy everything that we had sitting around.
+	 */
+ */
+ if (nsp->vpnd_hooked == B_TRUE)
+ vnd_netinfo_fini(nsp);
+
+ mutex_destroy(&nsp->vpnd_lock);
+ list_destroy(&nsp->vpnd_dev_list);
+ kmem_cache_free(vnd_pnsd_cache, nsp);
+}
+
+/*
+ * Convert a node with a name of the form /dev/vnd/zone/%zonename or
+ * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack.
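+ *
+ * For example (names illustrative), the directory "/dev/vnd/zone/web01"
+ * resolves via its final component to the netstack of the zone "web01",
+ * while the device "/dev/vnd/zone/web01/net0" first has the trailing "net0"
+ * component stripped before the same lookup is done.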
+ */
+static vnd_pnsd_t *
+vnd_sdev_ctx_to_ns(sdev_ctx_t ctx)
+{
+ enum vtype vt;
+ const char *path = sdev_ctx_path(ctx);
+ char *zstart, *dup;
+ size_t duplen;
+ vnd_pnsd_t *nsp;
+
+ vt = sdev_ctx_vtype(ctx);
+ ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0);
+
+ if (vt == VDIR) {
+ zstart = strrchr(path, '/');
+ ASSERT(zstart != NULL);
+ zstart++;
+ return (vnd_nsd_lookup_by_zonename(zstart));
+ }
+
+ ASSERT(vt == VCHR);
+
+ dup = strdup(path);
+ duplen = strlen(dup) + 1;
+ zstart = strrchr(dup, '/');
+ *zstart = '\0';
+ zstart--;
+ zstart = strrchr(dup, '/');
+ zstart++;
+ nsp = vnd_nsd_lookup_by_zonename(zstart);
+ kmem_free(dup, duplen);
+
+ return (nsp);
+}
+
+static sdev_plugin_validate_t
+vnd_sdev_validate_dir(sdev_ctx_t ctx)
+{
+ vnd_pnsd_t *nsp;
+
+ if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0)
+ return (SDEV_VTOR_VALID);
+
+ if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) {
+ ASSERT(getzoneid() == GLOBAL_ZONEID);
+ ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
+ return (SDEV_VTOR_VALID);
+ }
+
+ nsp = vnd_sdev_ctx_to_ns(ctx);
+ if (nsp == NULL)
+ return (SDEV_VTOR_INVALID);
+ vnd_nsd_rele(nsp);
+
+ return (SDEV_VTOR_VALID);
+}
+
+static sdev_plugin_validate_t
+vnd_sdev_validate(sdev_ctx_t ctx)
+{
+ enum vtype vt;
+ vnd_dev_t *vdp;
+ minor_t minor;
+
+ vt = sdev_ctx_vtype(ctx);
+ if (vt == VDIR)
+ return (vnd_sdev_validate_dir(ctx));
+ ASSERT(vt == VCHR);
+
+ if (strcmp("ctl", sdev_ctx_name(ctx)) == 0)
+ return (SDEV_VTOR_VALID);
+
+ if (sdev_ctx_minor(ctx, &minor) != 0)
+ return (SDEV_VTOR_STALE);
+
+ vdp = vnd_dev_lookup(minor);
+ if (vdp == NULL)
+ return (SDEV_VTOR_STALE);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_LINKED) ||
+ (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (SDEV_VTOR_STALE);
+ }
+
+ if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (SDEV_VTOR_STALE);
+ }
+
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (SDEV_VTOR_VALID);
+}
+
+/*
+ * This function is a no-op. sdev never has holds on our devices as they can go
+ * away at any time and specfs has to deal with that fact.
+ */
+/* ARGSUSED */
+static void
+vnd_sdev_inactive(sdev_ctx_t ctx)
+{
+}
+
+static int
+vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx)
+{
+ int ret;
+ vnd_dev_t *vdp;
+
+ mutex_enter(&nsp->vpnd_lock);
+ for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+ vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if ((vdp->vdd_flags & VND_D_LINKED) &&
+ !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
+ ret = sdev_plugin_mknod(ctx, vdp->vdd_lname,
+ VND_SDEV_MODE, vdp->vdd_devid);
+ if (ret != 0 && ret != EEXIST) {
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&nsp->vpnd_lock);
+ vnd_nsd_rele(nsp);
+ return (ret);
+ }
+ }
+ mutex_exit(&vdp->vdd_lock);
+ }
+ mutex_exit(&nsp->vpnd_lock);
+
+ return (0);
+}
+
+static int
+vnd_sdev_filldir_root(sdev_ctx_t ctx)
+{
+ zoneid_t zid;
+ vnd_pnsd_t *nsp;
+ int ret;
+
+ zid = getzoneid();
+ nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid));
+ ASSERT(nsp != NULL);
+ ret = vnd_sdev_fillzone(nsp, ctx);
+ vnd_nsd_rele(nsp);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * Checking the zone id is not sufficient as the global zone could be
+ * reaching down into a non-global zone's mounted /dev.
+ */
+ if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) {
+ ret = sdev_plugin_mkdir(ctx, "zone");
+ if (ret != 0 && ret != EEXIST)
+ return (ret);
+ }
+
+	/*
+	 * Always add an entry for the control node. There's no need to
+	 * reference count it since it always exists and is always what we
+	 * clone from.
+	 */
+ ret = sdev_plugin_mknod(ctx, "ctl", VND_SDEV_MODE,
+ makedevice(ddi_driver_major(vnd_dip), 0));
+ if (ret != 0 && ret != EEXIST)
+ return (ret);
+
+ return (0);
+}
+
+static int
+vnd_sdev_filldir_zroot(sdev_ctx_t ctx)
+{
+ int ret;
+ vnd_pnsd_t *nsp;
+ zone_t *zonep;
+
+ ASSERT(getzoneid() == GLOBAL_ZONEID);
+ ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
+
+ mutex_enter(&vnd_dev_lock);
+ for (nsp = list_head(&vnd_nsd_list); nsp != NULL;
+ nsp = list_next(&vnd_nsd_list, nsp)) {
+ mutex_enter(&nsp->vpnd_lock);
+ if (list_is_empty(&nsp->vpnd_dev_list)) {
+ mutex_exit(&nsp->vpnd_lock);
+ continue;
+ }
+ mutex_exit(&nsp->vpnd_lock);
+ zonep = zone_find_by_id(nsp->vpnd_zid);
+		/*
+		 * The zone must be in the process of being torn down, so skip
+		 * it.
+		 */
+ if (zonep == NULL)
+ continue;
+ ret = sdev_plugin_mkdir(ctx, zonep->zone_name);
+ zone_rele(zonep);
+ if (ret != 0 && ret != EEXIST) {
+ mutex_exit(&vnd_dev_lock);
+ return (ret);
+ }
+ }
+ mutex_exit(&vnd_dev_lock);
+ return (0);
+}
+
+static int
+vnd_sdev_filldir(sdev_ctx_t ctx)
+{
+ int ret;
+ vnd_pnsd_t *nsp;
+
+ ASSERT(sdev_ctx_vtype(ctx) == VDIR);
+ if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0)
+ return (vnd_sdev_filldir_root(ctx));
+
+ if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0)
+ return (vnd_sdev_filldir_zroot(ctx));
+
+ ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx),
+ strlen(VND_SDEV_ZROOT)) == 0);
+ nsp = vnd_sdev_ctx_to_ns(ctx);
+ if (nsp == NULL)
+ return (0);
+
+ ret = vnd_sdev_fillzone(nsp, ctx);
+ vnd_nsd_rele(nsp);
+
+ return (ret);
+}
+
+static sdev_plugin_ops_t vnd_sdev_ops = {
+ SDEV_PLUGIN_VERSION,
+ SDEV_PLUGIN_SUBDIR,
+ vnd_sdev_validate,
+ vnd_sdev_filldir,
+ vnd_sdev_inactive
+};
+
+static int
+vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int errp = 0;
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ /*
+ * Only allow one instance.
+ */
+ if (vnd_dip != NULL)
+ return (DDI_FAILURE);
+
+ vnd_dip = dip;
+ if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) !=
+ DDI_SUCCESS) {
+ vnd_dip = NULL;
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
+ DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
+ ddi_remove_minor_node(vnd_dip, NULL);
+ vnd_dip = NULL;
+ return (DDI_FAILURE);
+ }
+
+ vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops,
+ &errp);
+ if (vnd_sdev_hdl == NULL) {
+ ddi_remove_minor_node(vnd_dip, NULL);
+ ddi_prop_remove_all(vnd_dip);
+ vnd_dip = NULL;
+ return (DDI_FAILURE);
+ }
+
+ vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY);
+
+ return (DDI_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ mutex_enter(&vnd_dev_lock);
+ if (!list_is_empty(&vnd_dev_list)) {
+ mutex_exit(&vnd_dev_lock);
+ return (DDI_FAILURE);
+ }
+ mutex_exit(&vnd_dev_lock);
+
+ return (DDI_FAILURE);
+}
+
+/* ARGSUSED */
+static int
+vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
+{
+ int error;
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = (void *)vnd_dip;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ break;
+ }
+ return (error);
+}
+
+static void
+vnd_ddi_fini(void)
+{
+ netstack_unregister(NS_VND);
+ if (vnd_taskq != NULL)
+ taskq_destroy(vnd_taskq);
+ if (vnd_str_cache != NULL)
+ kmem_cache_destroy(vnd_str_cache);
+ if (vnd_dev_cache != NULL)
+ kmem_cache_destroy(vnd_dev_cache);
+ if (vnd_pnsd_cache != NULL)
+ kmem_cache_destroy(vnd_pnsd_cache);
+ if (vnd_minors != NULL)
+ id_space_destroy(vnd_minors);
+ if (vnd_list_init != 0) {
+ list_destroy(&vnd_nsd_list);
+ list_destroy(&vnd_dev_list);
+ mutex_destroy(&vnd_dev_lock);
+ vnd_list_init = 0;
+ }
+ frameio_fini();
+}
+
+static int
+vnd_ddi_init(void)
+{
+ if (frameio_init() != 0)
+ return (DDI_FAILURE);
+
+ vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (vnd_str_cache == NULL) {
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+ vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (vnd_dev_cache == NULL) {
+ kmem_cache_destroy(vnd_str_cache);
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+ vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache",
+ sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (vnd_pnsd_cache == NULL) {
+ kmem_cache_destroy(vnd_dev_cache);
+ kmem_cache_destroy(vnd_str_cache);
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+
+ vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0);
+ if (vnd_taskq == NULL) {
+ kmem_cache_destroy(vnd_pnsd_cache);
+ kmem_cache_destroy(vnd_dev_cache);
+ kmem_cache_destroy(vnd_str_cache);
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+
+ vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX);
+ if (vnd_minors == NULL) {
+ taskq_destroy(vnd_taskq);
+ kmem_cache_destroy(vnd_pnsd_cache);
+ kmem_cache_destroy(vnd_dev_cache);
+ kmem_cache_destroy(vnd_str_cache);
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+
+ mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&vnd_dev_list, sizeof (vnd_dev_t),
+ offsetof(vnd_dev_t, vdd_link));
+ list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t),
+ offsetof(vnd_pnsd_t, vpnd_link));
+ vnd_list_init = 1;
+
+ netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown,
+ vnd_stack_destroy);
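+	/*
+	 * From this point on, vnd_stack_init() runs as each netstack is
+	 * created, vnd_stack_shutdown() as one begins to be torn down, and
+	 * vnd_stack_destroy() once the last reference on it is dropped; see
+	 * those functions above.
+	 */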
+
+ return (DDI_SUCCESS);
+}
+
+static struct module_info vnd_minfo = {
+ 0, /* module id */
+ "vnd", /* module name */
+ 1, /* smallest packet size */
+ INFPSZ, /* largest packet size (infinite) */
+ 1, /* high watermark */
+ 0 /* low watermark */
+};
+
+static struct qinit vnd_r_qinit = {
+ vnd_s_rput,
+ NULL,
+ vnd_s_open,
+ vnd_s_close,
+ NULL,
+ &vnd_minfo,
+ NULL
+};
+
+static struct qinit vnd_w_qinit = {
+ vnd_s_wput,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &vnd_minfo,
+ NULL
+};
+
+static struct streamtab vnd_strtab = {
+ &vnd_r_qinit,
+ &vnd_w_qinit,
+ NULL,
+ NULL
+};
+
+static struct cb_ops vnd_cb_ops = {
+ vnd_open, /* open */
+ vnd_close, /* close */
+ nulldev, /* strategy */
+ nulldev, /* print */
+ nodev, /* dump */
+ vnd_read, /* read */
+ vnd_write, /* write */
+ vnd_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ vnd_chpoll, /* poll */
+ ddi_prop_op, /* cb_prop_op */
+ NULL, /* streamtab */
+ D_MP /* Driver compatibility flag */
+};
+
+static struct dev_ops vnd_dev_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* refcnt */
+ vnd_info, /* get_dev_info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ vnd_attach, /* attach */
+ vnd_detach, /* detach */
+ nodev, /* reset */
+ &vnd_cb_ops, /* driver operations */
+ NULL, /* bus operations */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed /* quiesce */
+};
+
+static struct modldrv vnd_modldrv = {
+ &mod_driverops,
+ "Virtual Networking Datapath Driver",
+ &vnd_dev_ops
+};
+
+static struct fmodsw vnd_fmodfsw = {
+ "vnd",
+ &vnd_strtab,
+ D_NEW | D_MP
+};
+
+static struct modlstrmod vnd_modlstrmod = {
+ &mod_strmodops,
+ "Virtual Networking Datapath Driver",
+ &vnd_fmodfsw
+};
+
+static struct modlinkage vnd_modlinkage = {
+ MODREV_1,
+ &vnd_modldrv,
+ &vnd_modlstrmod,
+ NULL
+};
+
+int
+_init(void)
+{
+ int error;
+
+ /*
+	 * We need to do all of our global initialization in _init as opposed
+	 * to attach and detach. The problem here is that because vnd can be
+	 * used from a stream context while being detached, we cannot rely on
+	 * attach having run to create everything. Alas, it all goes in _init,
+	 * just like our friend ip.
+ */
+ if ((error = vnd_ddi_init()) != DDI_SUCCESS)
+ return (error);
+ error = mod_install((&vnd_modlinkage));
+ if (error != 0)
+ vnd_ddi_fini();
+ return (error);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&vnd_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int error;
+
+ error = mod_remove(&vnd_modlinkage);
+ if (error == 0)
+ vnd_ddi_fini();
+ return (error);
+}
diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf
new file mode 100644
index 0000000000..65872e1ddf
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/vnd.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
+#
+
+name="vnd" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c
index d671153967..e532a551e7 100644
--- a/usr/src/uts/common/io/vnic/vnic_dev.c
+++ b/usr/src/uts/common/io/vnic/vnic_dev.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
*/
@@ -354,7 +354,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
rw_enter(&vnic_lock, RW_WRITER);
- /* does a VNIC with the same id already exist? */
+ /* Does a VNIC with the same id already exist? */
err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
(mod_hash_val_t *)&vnic);
if (err == 0) {
@@ -370,6 +370,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
bzero(vnic, sizeof (*vnic));
+ vnic->vn_ls = LINK_STATE_UNKNOWN;
vnic->vn_id = vnic_id;
vnic->vn_link_id = linkid;
vnic->vn_vrid = vrid;
@@ -455,6 +456,20 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
} else {
vnic->vn_hcksum_txflags = 0;
}
+
+ /*
+ * Check for LSO capabilities. LSO implementations
+ * depend on hardware checksumming, so the same
+ * requirement is enforced here.
+ */
+ if (vnic->vn_hcksum_txflags != 0) {
+ if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_LSO,
+ &vnic->vn_cap_lso)) {
+ vnic->vn_cap_lso.lso_flags = 0;
+ }
+ } else {
+ vnic->vn_cap_lso.lso_flags = 0;
+ }
}
/* register with the MAC module */
@@ -580,11 +595,12 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
vnic->vn_enabled = B_TRUE;
if (is_anchor) {
- mac_link_update(vnic->vn_mh, LINK_STATE_UP);
+ vnic->vn_ls = LINK_STATE_UP;
} else {
- mac_link_update(vnic->vn_mh,
- mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE));
+ vnic->vn_ls = mac_client_stat_get(vnic->vn_mch,
+ MAC_STAT_LINK_STATE);
}
+ mac_link_update(vnic->vn_mh, vnic->vn_ls);
rw_exit(&vnic_lock);
@@ -824,6 +840,15 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
HCKSUM_INET_PARTIAL);
break;
}
+ case MAC_CAPAB_LSO: {
+ mac_capab_lso_t *cap_lso = cap_data;
+
+ if (vnic->vn_cap_lso.lso_flags == 0) {
+ return (B_FALSE);
+ }
+ *cap_lso = vnic->vn_cap_lso;
+ break;
+ }
case MAC_CAPAB_VNIC: {
mac_capab_vnic_t *vnic_capab = cap_data;
@@ -1092,6 +1117,34 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
err = vnic_set_secondary_macs(vn, &msa);
break;
}
+ case MAC_PROP_PRIVATE: {
+ long val, i;
+ const char *v;
+
+ if (vn->vn_link_id != DATALINK_INVALID_LINKID ||
+ strcmp(pr_name, "_linkstate") != 0) {
+ err = ENOTSUP;
+ break;
+ }
+
+ for (v = pr_val, i = 0; i < pr_valsize; i++, v++) {
+ if (*v == '\0')
+ break;
+ }
+ if (i == pr_valsize) {
+ err = EINVAL;
+ break;
+ }
+
+ (void) ddi_strtol(pr_val, (char **)NULL, 0, &val);
+ if (val != LINK_STATE_UP && val != LINK_STATE_DOWN) {
+ err = EINVAL;
+ break;
+ }
+ vn->vn_ls = val;
+ mac_link_update(vn->vn_mh, vn->vn_ls);
+ break;
+ }
default:
err = ENOTSUP;
break;
@@ -1117,6 +1170,18 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
case MAC_PROP_SECONDARY_ADDRS:
ret = vnic_get_secondary_macs(vn, pr_valsize, pr_val);
break;
+ case MAC_PROP_PRIVATE:
+ if (vn->vn_link_id != DATALINK_INVALID_LINKID) {
+ ret = EINVAL;
+ break;
+ }
+
+ if (strcmp(pr_name, "_linkstate") != 0) {
+ ret = EINVAL;
+ break;
+ }
+ (void) snprintf(pr_val, pr_valsize, "%d", vn->vn_ls);
+ break;
default:
ret = ENOTSUP;
break;
@@ -1126,7 +1191,8 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
}
/* ARGSUSED */
-static void vnic_m_propinfo(void *m_driver, const char *pr_name,
+static void
+vnic_m_propinfo(void *m_driver, const char *pr_name,
mac_prop_id_t pr_num, mac_prop_info_handle_t prh)
{
vnic_t *vn = m_driver;
@@ -1169,6 +1235,18 @@ static void vnic_m_propinfo(void *m_driver, const char *pr_name,
mac_perim_exit(mph);
}
break;
+ case MAC_PROP_PRIVATE:
+ if (vn->vn_link_id != DATALINK_INVALID_LINKID)
+ break;
+
+ if (strcmp(pr_name, "_linkstate") == 0) {
+ char buf[16];
+
+ mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+ (void) snprintf(buf, sizeof (buf), "%d", vn->vn_ls);
+ mac_prop_info_set_default_str(prh, buf);
+ }
+ break;
}
}
@@ -1241,8 +1319,9 @@ vnic_notify_cb(void *arg, mac_notify_type_t type)
break;
case MAC_NOTE_LINK:
- mac_link_update(vnic->vn_mh,
- mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE));
+ vnic->vn_ls = mac_client_stat_get(vnic->vn_mch,
+ MAC_STAT_LINK_STATE);
+ mac_link_update(vnic->vn_mh, vnic->vn_ls);
break;
default:
diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c
new file mode 100644
index 0000000000..2da310ab8d
--- /dev/null
+++ b/usr/src/uts/common/io/zfd.c
@@ -0,0 +1,1154 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Zone File Descriptor Driver.
+ *
+ * This driver is derived from the zcons driver which is in turn derived from
+ * the pts/ptm drivers. The purpose is to expose file descriptors within the
+ * zone which are connected to zoneadmd and used for logging or an interactive
+ * connection to a process within the zone.
+ *
+ * Its implementation is straightforward. Each instance of the driver
+ * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd
+ * uses these devices unidirectionally to provide stdin, stdout and stderr to
+ * the process within the zone.
+ *
+ * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd,
+ * using the devctl framework; thus the driver does not need to maintain any
+ * sort of "admin" node.
+ *
+ * The driver shuttles I/O from master side to slave side and back. In a break
+ * from the pts/ptm semantics, if one side is not open, I/O directed towards
+ * it will simply be discarded. This is so that if zoneadmd is not holding the
+ * master side fd open (i.e. it has died somehow), processes in the zone do not
+ * experience any errors and I/O to the fd does not cause the process to hang.
+ *
+ * The driver can also act as a multiplexer so that data written to the
+ * slave side within the zone is also redirected back to another zfd device
+ * inside the zone for consumption (i.e. it can be read). The intention is
+ * that a logging process within the zone can consume data that is being
+ * written by an application onto the primary stream. This is essentially
+ * a tee off of the primary stream into a log stream. This tee can also be
+ * configured to be flow controlled via an ioctl. Flow control happens on the
+ * primary stream and is used to ensure that the log stream receives all of
+ * the messages off the primary stream when consumption of the data off of
+ * the log stream gets behind. Configuring for flow control implies that the
+ * application writing to the primary stream will be blocked when the log
+ * consumer gets behind. Note that closing the log stream (e.g. when the zone
+ * halts) will cause the loss of all messages queued in the stream.
+ *
+ * The zone's zfd device configuration is driven by zoneadmd and a zone mode.
+ * The mode, which is controlled by the zone attribute "zlog-mode", is somewhat
+ * of a misnomer since its purpose has evolved. The attribute can have a
+ * variety of values, but the lowest two positions are used to control how many
+ * zfd devices are created inside the zone and whether the primary stream is a
+ * tty.
+ *
+ * Here is a summary of how the 4 modes control what zfd devices are created
+ * and how they're used:
+ *
+ * t-: 1 stdio zdev (0) configured as a tty
+ * --: 3 stdio zdevs (0, 1, 2), not configured as a tty
+ * tn: 1 stdio zdev (0) configured as a tty, 1 additional zdev (1)
+ * -n: 3 stdio zdevs (0, 1, 2), not tty, 2 additional zdevs (3, 4)
+ *
+ * With the 't' flag set, stdin/out/err is multiplexed onto a single full-duplex
+ * stream which is configured as a tty. That is, ptem, ldterm and ttycompat are
+ * autopushed onto the stream when the slave side is opened. There is only a
+ * single zfd dev (0) needed for the primary stream.
+ *
+ * When the 'n' flag is set, it is assumed that output logging will be done
+ * within the zone itself. In this configuration 1 or 2 additional zfd devices,
+ * depending on tty mode ('t' flag), are created within the zone. An
+ * application can then configure the zfd streams driver into a multiplexer.
+ * Output from the stdout/stderr zfd(s) will be teed into the corresponding
+ * logging zfd(s) within the zone.
+ *
+ * The following is a diagram of how this works for a '-n' configuration:
+ *
+ *
+ * zoneadmd (for zlogin -I stdout)
+ * GZ: ^
+ * |
+ * --------------------------
+ * ^
+ * NGZ: |
+ * app >1 -> zfd1 -> zfd3 -> logger (for logger to consume app's stdout)
+ *
+ * There would be a similar path for the app's stderr into zfd4 for the logger
+ * to consume stderr.
+ */
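+
+/*
+ * As a user-level sketch of the mux setup (the device paths, fds, and
+ * error handling are illustrative assumptions, not part of this driver):
+ * the ZFD_MUX ioctl is issued on the stream that will become the log
+ * stream, carrying the dev_t of the primary stream's slave node, and
+ * ZFD_MUX_FLOWCON optionally enables flow control on the tee:
+ *
+ *	int logfd = open("/dev/zfd/3", O_RDONLY);
+ *	struct stat st;
+ *
+ *	(void) stat("/dev/zfd/1", &st);
+ *	(void) ioctl(logfd, ZFD_MUX, st.st_rdev);
+ *	(void) ioctl(logfd, ZFD_MUX_FLOWCON, 1);
+ */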
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/conf.h>
+#include <sys/cred.h>
+#include <sys/ddi.h>
+#include <sys/debug.h>
+#include <sys/devops.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kstr.h>
+#include <sys/modctl.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/stream.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/zfd.h>
+#include <sys/vnode.h>
+#include <sys/fs/snode.h>
+#include <sys/zone.h>
+#include <sys/sdt.h>
+
+static kmutex_t zfd_mux_lock;
+
+static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
+static int zfd_attach(dev_info_t *, ddi_attach_cmd_t);
+static int zfd_detach(dev_info_t *, ddi_detach_cmd_t);
+
+static int zfd_open(queue_t *, dev_t *, int, int, cred_t *);
+static int zfd_close(queue_t *, int, cred_t *);
+static void zfd_wput(queue_t *, mblk_t *);
+static void zfd_rsrv(queue_t *);
+static void zfd_wsrv(queue_t *);
+
+/*
+ * The instance number is encoded in the minor number of the dev_t; the lowest
+ * bit of the minor number is used to track the master vs. slave side of the
+ * fd. The rest of the bits in the minor number are the instance.
+ */
+#define ZFD_MASTER_MINOR 0
+#define ZFD_SLAVE_MINOR 1
+
+#define ZFD_INSTANCE(x) (getminor((x)) >> 1)
+#define ZFD_NODE(x) (getminor((x)) & 0x01)
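+
+/*
+ * For example, instance 2 yields minor 4 ((2 << 1) | ZFD_MASTER_MINOR)
+ * for its master node and minor 5 ((2 << 1) | ZFD_SLAVE_MINOR) for its
+ * slave node; ZFD_INSTANCE() recovers 2 from either minor.
+ */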
+
+/*
+ * This macro converts a zfd_state_t pointer to the associated slave minor
+ * node's dev_t.
+ */
+#define ZFD_STATE_TO_SLAVEDEV(x) \
+ (makedevice(ddi_driver_major((x)->zfd_devinfo), \
+ (minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR)))
+
+int zfd_debug = 0;
+#define DBG(a) if (zfd_debug) cmn_err(CE_NOTE, a)
+#define DBG1(a, b) if (zfd_debug) cmn_err(CE_NOTE, a, b)
+
+/*
+ * ZFD Pseudo Terminal Module: stream data structure definitions,
+ * based on zcons.
+ */
+static struct module_info zfd_info = {
+ 0x20FD, /* ZOFD - 8445 */
+ "zfd",
+ 0, /* min packet size */
+ INFPSZ, /* max packet size - infinity */
+ 2048, /* high water */
+ 128 /* low water */
+};
+
+static struct qinit zfd_rinit = {
+ NULL,
+ (int (*)()) zfd_rsrv,
+ zfd_open,
+ zfd_close,
+ NULL,
+ &zfd_info,
+ NULL
+};
+
+static struct qinit zfd_winit = {
+ (int (*)()) zfd_wput,
+ (int (*)()) zfd_wsrv,
+ NULL,
+ NULL,
+ NULL,
+ &zfd_info,
+ NULL
+};
+
+static struct streamtab zfd_tab_info = {
+ &zfd_rinit,
+ &zfd_winit,
+ NULL,
+ NULL
+};
+
+#define ZFD_CONF_FLAG (D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL)
+
+/*
+ * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops)
+ */
+DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \
+ nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \
+ ddi_quiesce_not_needed);
+
+/*
+ * Module linkage information for the kernel.
+ */
+
+static struct modldrv modldrv = {
+ &mod_driverops, /* Type of module (this is a pseudo driver) */
+ "Zone FD driver", /* description of module */
+ &zfd_ops /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modldrv,
+ NULL
+};
+
+typedef enum {
+ ZFD_NO_MUX,
+ ZFD_PRIMARY_STREAM,
+ ZFD_LOG_STREAM
+} zfd_mux_type_t;
+
+typedef struct zfd_state {
+ dev_info_t *zfd_devinfo; /* instance info */
+ queue_t *zfd_master_rdq; /* GZ read queue */
+ queue_t *zfd_slave_rdq; /* in-zone read queue */
+ int zfd_state; /* ZFD_STATE_MOPEN, ZFD_STATE_SOPEN */
+ int zfd_tty; /* ZFD_MAKETTY - strm mods will push */
+ boolean_t zfd_is_flowcon; /* primary stream flow stopped */
+ boolean_t zfd_allow_flowcon; /* use flow control */
+ zfd_mux_type_t zfd_muxt; /* state type: none, primary, log */
+ struct zfd_state *zfd_inst_pri; /* log state's primary ptr */
+ struct zfd_state *zfd_inst_log; /* primary state's log ptr */
+} zfd_state_t;
+
+#define ZFD_STATE_MOPEN 0x01
+#define ZFD_STATE_SOPEN 0x02
+
+static void *zfd_soft_state;
+
+/*
+ * List of STREAMS modules that are autopushed onto a slave instance when its
+ * opened, but only if the ZFD_MAKETTY ioctl has first been received by the
+ * master.
+ */
+static char *zfd_mods[] = {
+ "ptem",
+ "ldterm",
+ "ttcompat",
+ NULL
+};
+
+int
+_init(void)
+{
+ int err;
+
+ if ((err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t),
+ 0)) != 0) {
+ return (err);
+ }
+
+ if ((err = mod_install(&modlinkage)) != 0)
+		ddi_soft_state_fini(&zfd_soft_state);
+
+ mutex_init(&zfd_mux_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (err);
+}
+
+
+int
+_fini(void)
+{
+ int err;
+
+ if ((err = mod_remove(&modlinkage)) != 0) {
+ return (err);
+ }
+
+ ddi_soft_state_fini(&zfd_soft_state);
+ mutex_destroy(&zfd_mux_lock);
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+static int
+zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ zfd_state_t *zfds;
+ int instance;
+ char masternm[ZFD_NAME_LEN], slavenm[ZFD_NAME_LEN];
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ instance = ddi_get_instance(dip);
+ if (ddi_soft_state_zalloc(zfd_soft_state, instance) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+
+ (void) snprintf(masternm, sizeof (masternm), "%s%d", ZFD_MASTER_NAME,
+ instance);
+ (void) snprintf(slavenm, sizeof (slavenm), "%s%d", ZFD_SLAVE_NAME,
+ instance);
+
+ /*
+ * Create the master and slave minor nodes.
+ */
+ if ((ddi_create_minor_node(dip, slavenm, S_IFCHR,
+ instance << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) ||
+ (ddi_create_minor_node(dip, masternm, S_IFCHR,
+ instance << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)) {
+ ddi_remove_minor_node(dip, NULL);
+ ddi_soft_state_free(zfd_soft_state, instance);
+ return (DDI_FAILURE);
+ }
+
+ VERIFY((zfds = ddi_get_soft_state(zfd_soft_state, instance)) != NULL);
+ zfds->zfd_devinfo = dip;
+ zfds->zfd_tty = 0;
+ zfds->zfd_muxt = ZFD_NO_MUX;
+ zfds->zfd_inst_log = NULL;
+ return (DDI_SUCCESS);
+}
+
+static int
+zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ zfd_state_t *zfds;
+ int instance;
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ instance = ddi_get_instance(dip);
+ if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL)
+ return (DDI_FAILURE);
+
+ if ((zfds->zfd_state & ZFD_STATE_MOPEN) ||
+ (zfds->zfd_state & ZFD_STATE_SOPEN)) {
+ DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip);
+ return (DDI_FAILURE);
+ }
+
+ ddi_remove_minor_node(dip, NULL);
+ ddi_soft_state_free(zfd_soft_state, instance);
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * zfd_getinfo()
+ * getinfo(9e) entrypoint.
+ */
+/*ARGSUSED*/
+static int
+zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ zfd_state_t *zfds;
+ int instance = ZFD_INSTANCE((dev_t)arg);
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ if ((zfds = ddi_get_soft_state(zfd_soft_state,
+ instance)) == NULL)
+ return (DDI_FAILURE);
+ *result = zfds->zfd_devinfo;
+ return (DDI_SUCCESS);
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)(uintptr_t)instance;
+ return (DDI_SUCCESS);
+ }
+ return (DDI_FAILURE);
+}
+
+/*
+ * Return the equivalent queue from the other side of the relationship.
+ * e.g.: given the slave's write queue, return the master's write queue.
+ */
+static queue_t *
+zfd_switch(queue_t *qp)
+{
+ zfd_state_t *zfds = qp->q_ptr;
+ ASSERT(zfds != NULL);
+
+ if (qp == zfds->zfd_master_rdq)
+ return (zfds->zfd_slave_rdq);
+ else if (OTHERQ(qp) == zfds->zfd_master_rdq && zfds->zfd_slave_rdq
+ != NULL)
+ return (OTHERQ(zfds->zfd_slave_rdq));
+ else if (qp == zfds->zfd_slave_rdq)
+ return (zfds->zfd_master_rdq);
+ else if (OTHERQ(qp) == zfds->zfd_slave_rdq && zfds->zfd_master_rdq
+ != NULL)
+ return (OTHERQ(zfds->zfd_master_rdq));
+ else
+ return (NULL);
+}
+
+/*
+ * For debugging and outputting messages. Returns the name of the side of
+ * the relationship associated with this queue.
+ */
+static const char *
+zfd_side(queue_t *qp)
+{
+ zfd_state_t *zfds = qp->q_ptr;
+ ASSERT(zfds != NULL);
+
+ if (qp == zfds->zfd_master_rdq ||
+ OTHERQ(qp) == zfds->zfd_master_rdq) {
+ return ("master");
+ }
+ ASSERT(qp == zfds->zfd_slave_rdq || OTHERQ(qp) == zfds->zfd_slave_rdq);
+ return ("slave");
+}
+
+/*ARGSUSED*/
+static int
+zfd_master_open(zfd_state_t *zfds,
+ queue_t *rqp, /* pointer to the read side queue */
+ dev_t *devp, /* pointer to stream tail's dev */
+ int oflag, /* the user open(2) supplied flags */
+ int sflag, /* open state flag */
+ cred_t *credp) /* credentials */
+{
+ mblk_t *mop;
+ struct stroptions *sop;
+
+ /*
+ * Enforce exclusivity on the master side; the only consumer should
+ * be the zoneadmd for the zone.
+ */
+ if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0)
+ return (EBUSY);
+
+ if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) {
+ DBG("zfd_master_open(): mop allocation failed\n");
+ return (ENOMEM);
+ }
+
+ zfds->zfd_state |= ZFD_STATE_MOPEN;
+
+ /*
+ * q_ptr stores driver private data; stash the soft state data on both
+ * read and write sides of the queue.
+ */
+ WR(rqp)->q_ptr = rqp->q_ptr = zfds;
+ qprocson(rqp);
+
+ /*
+ * Following qprocson(), the master side is fully plumbed into the
+ * STREAM and may send/receive messages. Setting zfds->zfd_master_rdq
+ * will allow the slave to send messages to us (the master).
+ * This cannot occur before qprocson() because the master is not
+ * ready to process them until that point.
+ */
+ zfds->zfd_master_rdq = rqp;
+
+ /*
+ * set up hi/lo water marks on stream head read queue and add
+ * controlling tty as needed.
+ */
+ mop->b_datap->db_type = M_SETOPTS;
+ mop->b_wptr += sizeof (struct stroptions);
+ sop = (struct stroptions *)(void *)mop->b_rptr;
+ if (oflag & FNOCTTY)
+ sop->so_flags = SO_HIWAT | SO_LOWAT;
+ else
+ sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY;
+ sop->so_hiwat = 512;
+ sop->so_lowat = 256;
+ putnext(rqp, mop);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfd_slave_open(zfd_state_t *zfds,
+ queue_t *rqp, /* pointer to the read side queue */
+ dev_t *devp, /* pointer to stream tail's dev */
+ int oflag, /* the user open(2) supplied flags */
+ int sflag, /* open state flag */
+ cred_t *credp) /* credentials */
+{
+ mblk_t *mop;
+ struct stroptions *sop;
+ /*
+ * The slave side can be opened as many times as needed.
+ */
+ if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+ ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds));
+ return (0);
+ }
+
+ /* A log stream is read-only */
+ if (zfds->zfd_muxt == ZFD_LOG_STREAM &&
+ (oflag & (FREAD | FWRITE)) != FREAD)
+ return (EINVAL);
+
+ if (zfds->zfd_tty == 1) {
+ major_t major;
+ minor_t minor;
+ minor_t lastminor;
+ uint_t anchorindex;
+
+ /*
+ * Set up sad(7D) so that the necessary STREAMS modules will
+ * be in place. A wrinkle is that 'ptem' must be anchored
+ * in place (see streamio(7i)) because we always want the
+ * fd to have terminal semantics.
+ */
+ minor =
+ ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR;
+ major = ddi_driver_major(zfds->zfd_devinfo);
+ lastminor = 0;
+ anchorindex = 1;
+ if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor,
+ &anchorindex, zfd_mods) != 0) {
+ DBG("zfd_slave_open(): kstr_autopush() failed\n");
+ return (EIO);
+ }
+ }
+
+ if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) {
+ DBG("zfd_slave_open(): mop allocation failed\n");
+ return (ENOMEM);
+ }
+
+ zfds->zfd_state |= ZFD_STATE_SOPEN;
+
+ /*
+ * q_ptr stores driver private data; stash the soft state data on both
+ * read and write sides of the queue.
+ */
+ WR(rqp)->q_ptr = rqp->q_ptr = zfds;
+
+ qprocson(rqp);
+
+ /*
+ * Must follow qprocson(), since we aren't ready to process until then.
+ */
+ zfds->zfd_slave_rdq = rqp;
+
+ /*
+ * set up hi/lo water marks on stream head read queue and add
+ * controlling tty as needed.
+ */
+ mop->b_datap->db_type = M_SETOPTS;
+ mop->b_wptr += sizeof (struct stroptions);
+ sop = (struct stroptions *)(void *)mop->b_rptr;
+ sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY;
+ sop->so_hiwat = 512;
+ sop->so_lowat = 256;
+ putnext(rqp, mop);
+
+ return (0);
+}
+
+/*
+ * open(9e) entrypoint; checks sflag, and rejects anything out of the ordinary.
+ */
+static int
+zfd_open(queue_t *rqp, /* pointer to the read side queue */
+ dev_t *devp, /* pointer to stream tail's dev */
+ int oflag, /* the user open(2) supplied flags */
+ int sflag, /* open state flag */
+ cred_t *credp) /* credentials */
+{
+ int instance = ZFD_INSTANCE(*devp);
+ int ret;
+ zfd_state_t *zfds;
+
+ if (sflag != 0)
+ return (EINVAL);
+
+ if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL)
+ return (ENXIO);
+
+ switch (ZFD_NODE(*devp)) {
+ case ZFD_MASTER_MINOR:
+ ret = zfd_master_open(zfds, rqp, devp, oflag, sflag, credp);
+ break;
+ case ZFD_SLAVE_MINOR:
+ ret = zfd_slave_open(zfds, rqp, devp, oflag, sflag, credp);
+ /*
+ * If we just opened the log stream and flow control has
+ * been enabled, we want to make sure the primary stream can
+ * start flowing.
+ */
+ if (ret == 0 && zfds->zfd_muxt == ZFD_LOG_STREAM &&
+ zfds->zfd_inst_pri->zfd_allow_flowcon) {
+ zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE;
+ if (zfds->zfd_inst_pri->zfd_master_rdq != NULL)
+ qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq));
+ }
+ break;
+ default:
+ ret = ENXIO;
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * close(9e) entrypoint.
+ */
+/*ARGSUSED1*/
+static int
+zfd_close(queue_t *rqp, int flag, cred_t *credp)
+{
+ queue_t *wqp;
+ mblk_t *bp;
+ zfd_state_t *zfds;
+ major_t major;
+ minor_t minor;
+
+ zfds = (zfd_state_t *)rqp->q_ptr;
+
+ if (rqp == zfds->zfd_master_rdq) {
+ DBG("Closing master side");
+
+ zfds->zfd_master_rdq = NULL;
+ zfds->zfd_state &= ~ZFD_STATE_MOPEN;
+
+ /*
+ * qenable slave side write queue so that it can flush
+ * its messages as master's read queue is going away
+ */
+ if (zfds->zfd_slave_rdq != NULL) {
+ qenable(WR(zfds->zfd_slave_rdq));
+ }
+
+ qprocsoff(rqp);
+ WR(rqp)->q_ptr = rqp->q_ptr = NULL;
+
+ } else if (rqp == zfds->zfd_slave_rdq) {
+
+ DBG("Closing slave side");
+ zfds->zfd_state &= ~ZFD_STATE_SOPEN;
+ zfds->zfd_slave_rdq = NULL;
+
+ wqp = WR(rqp);
+ while ((bp = getq(wqp)) != NULL) {
+ if (zfds->zfd_master_rdq != NULL)
+ putnext(zfds->zfd_master_rdq, bp);
+ else if (bp->b_datap->db_type == M_IOCTL)
+ miocnak(wqp, bp, 0, 0);
+ else
+ freemsg(bp);
+ }
+
+ /*
+ * Qenable master side write queue so that it can flush its
+		 * messages as the slave's read queue is going away.
+ */
+ if (zfds->zfd_master_rdq != NULL)
+ qenable(WR(zfds->zfd_master_rdq));
+
+ /*
+ * Qenable primary stream if necessary.
+ */
+ if (zfds->zfd_muxt == ZFD_LOG_STREAM &&
+ zfds->zfd_inst_pri->zfd_allow_flowcon) {
+ zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE;
+ if (zfds->zfd_inst_pri->zfd_master_rdq != NULL)
+ qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq));
+ }
+
+ qprocsoff(rqp);
+ WR(rqp)->q_ptr = rqp->q_ptr = NULL;
+
+ if (zfds->zfd_tty == 1) {
+ /*
+ * Clear the sad configuration so that reopening
+ * doesn't fail to set up sad configuration.
+ */
+ major = ddi_driver_major(zfds->zfd_devinfo);
+ minor = ddi_get_instance(zfds->zfd_devinfo) << 1 |
+ ZFD_SLAVE_MINOR;
+ (void) kstr_autopush(CLR_AUTOPUSH, &major, &minor,
+ NULL, NULL, NULL);
+ }
+ }
+
+ return (0);
+}
+
+static void
+handle_mflush(queue_t *qp, mblk_t *mp)
+{
+ mblk_t *nmp;
+ DBG1("M_FLUSH on %s side", zfd_side(qp));
+
+ if (*mp->b_rptr & FLUSHW) {
+ DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp));
+ flushq(qp, FLUSHDATA);
+ *mp->b_rptr &= ~FLUSHW;
+ if ((*mp->b_rptr & FLUSHR) == 0) {
+ /*
+			 * FLUSHW only. Change to FLUSHR and putnext the
+			 * other side; then we are done.
+ */
+ *mp->b_rptr |= FLUSHR;
+ if (zfd_switch(RD(qp)) != NULL) {
+ putnext(zfd_switch(RD(qp)), mp);
+ return;
+ }
+ } else if ((zfd_switch(RD(qp)) != NULL) &&
+ (nmp = copyb(mp)) != NULL) {
+ /*
+ * It is a FLUSHRW; we copy the mblk and send
+ * it to the other side, since we still need to use
+ * the mblk in FLUSHR processing, below.
+ */
+ putnext(zfd_switch(RD(qp)), nmp);
+ }
+ }
+
+ if (*mp->b_rptr & FLUSHR) {
+ DBG("qreply(qp) turning FLUSHR around\n");
+ qreply(qp, mp);
+ return;
+ }
+ freemsg(mp);
+}
+
+/*
+ * Evaluate the various conditionals to determine if we're teeing into a log
+ * stream and if the primary stream should be flow controlled. This function
+ * can set the zfd_is_flowcon flag as a side effect.
+ *
+ * When teeing with flow control, we always queue the teed msg here and if
+ * the queue is getting full, we set zfd_is_flowcon. The primary stream will
+ * always queue when zfd_is_flowcon and will also not be served when
+ * zfd_is_flowcon is set. This causes backpressure on the primary stream
+ * until the teed queue can drain.
+ */
+static void
+zfd_tee_handler(zfd_state_t *zfds, unsigned char type, mblk_t *mp)
+{
+ queue_t *log_qp;
+ zfd_state_t *log_zfds;
+ mblk_t *lmp;
+
+ if (zfds->zfd_muxt != ZFD_PRIMARY_STREAM)
+ return;
+
+ if (type != M_DATA)
+ return;
+
+ log_zfds = zfds->zfd_inst_log;
+ if (log_zfds == NULL)
+ return;
+
+ ASSERT(log_zfds->zfd_muxt == ZFD_LOG_STREAM);
+
+ if ((log_zfds->zfd_state & ZFD_STATE_SOPEN) == 0) {
+ if (zfds->zfd_allow_flowcon)
+ zfds->zfd_is_flowcon = B_TRUE;
+ return;
+ }
+
+ /* The zfd_slave_rdq is null until the log dev is opened in the zone */
+ log_qp = RD(log_zfds->zfd_slave_rdq);
+ DTRACE_PROBE2(zfd__tee__check, void *, log_qp, void *, zfds);
+
+ if (!zfds->zfd_allow_flowcon) {
+ /*
+		 * We're not supposed to tee with flow control, so if the
+		 * tee is full we simply skip teeing into the log stream.
+ */
+ if ((log_qp->q_flag & QFULL) != 0)
+ return;
+ }
+
+ /*
+ * Tee the message into the log stream.
+ */
+ lmp = dupmsg(mp);
+ if (lmp == NULL) {
+ if (zfds->zfd_allow_flowcon)
+ zfds->zfd_is_flowcon = B_TRUE;
+ return;
+ }
+
+ if (log_qp->q_first == NULL && bcanputnext(log_qp, lmp->b_band)) {
+ putnext(log_qp, lmp);
+ } else {
+ if (putq(log_qp, lmp) == 0) {
+ /* The logger queue is full, free the msg. */
+ freemsg(lmp);
+ }
+ /*
+ * If we're supposed to tee with flow control and the tee is
+ * over the high water mark then we want the primary stream to
+ * stop flowing. We'll stop queueing the primary stream after
+ * the log stream has drained.
+ */
+ if (zfds->zfd_allow_flowcon &&
+ log_qp->q_count > log_qp->q_hiwat) {
+ zfds->zfd_is_flowcon = B_TRUE;
+ }
+ }
+}
+
+/*
+ * wput(9E) is symmetric for master and slave sides, so this handles both
+ * without splitting the codepath. (The only exception to this is the
+ * processing of zfd ioctls, which is restricted to the master side.)
+ *
+ * zfd_wput() looks at the other side; if there is no process holding that
+ * side open, it frees the message. This prevents processes from hanging
+ * if no one is holding the fd open. Otherwise, it putnext's high
+ * priority messages, putnext's normal messages if possible, and otherwise
+ * enqueues the messages; in the case that something is enqueued, wsrv(9E)
+ * will take care of eventually shuttling I/O to the other side.
+ *
+ * When configured as a multiplexer, then anything written to the stream
+ * from inside the zone is also teed off to the corresponding log stream
+ * for consumption within the zone (i.e. the log stream can be read, but never
+ * written to, by an application inside the zone).
+ */
+static void
+zfd_wput(queue_t *qp, mblk_t *mp)
+{
+ unsigned char type = mp->b_datap->db_type;
+ zfd_state_t *zfds;
+ struct iocblk *iocbp;
+ boolean_t must_queue = B_FALSE;
+
+ ASSERT(qp->q_ptr);
+
+ DBG1("entering zfd_wput, %s side", zfd_side(qp));
+
+ /*
+ * Process zfd ioctl messages if qp is the master side's write queue.
+ */
+ zfds = (zfd_state_t *)qp->q_ptr;
+
+ if (type == M_IOCTL) {
+ iocbp = (struct iocblk *)(void *)mp->b_rptr;
+
+ switch (iocbp->ioc_cmd) {
+ case ZFD_MAKETTY:
+ zfds->zfd_tty = 1;
+ miocack(qp, mp, 0, 0);
+ return;
+ case ZFD_EOF:
+ if (zfds->zfd_slave_rdq != NULL)
+ (void) putnextctl(zfds->zfd_slave_rdq,
+ M_HANGUP);
+ miocack(qp, mp, 0, 0);
+ return;
+ case ZFD_HAS_SLAVE:
+ if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+ miocack(qp, mp, 0, 0);
+ } else {
+ miocack(qp, mp, 0, ENOTTY);
+ }
+ return;
+ case ZFD_MUX: {
+ /*
+			 * Set up the multiplexer configuration for the two
+ * streams.
+ *
+ * We expect to be called on the stream that will
+ * become the log stream and be passed one data block
+ * with the minor number of the slave side of the
+ * primary stream.
+ */
+ int to;
+ int instance;
+ zfd_state_t *prim_zfds;
+
+ if (iocbp->ioc_count != TRANSPARENT ||
+ mp->b_cont == NULL) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ /* Get the primary slave minor device number */
+ to = *(int *)mp->b_cont->b_rptr;
+ instance = ZFD_INSTANCE(to);
+
+ if ((prim_zfds = ddi_get_soft_state(zfd_soft_state,
+ instance)) == NULL) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ /* Disallow changing primary/log once set. */
+ mutex_enter(&zfd_mux_lock);
+ if (zfds->zfd_muxt != ZFD_NO_MUX ||
+ prim_zfds->zfd_muxt != ZFD_NO_MUX) {
+ mutex_exit(&zfd_mux_lock);
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ zfds->zfd_muxt = ZFD_LOG_STREAM;
+ zfds->zfd_inst_pri = prim_zfds;
+ prim_zfds->zfd_muxt = ZFD_PRIMARY_STREAM;
+ prim_zfds->zfd_inst_log = zfds;
+ mutex_exit(&zfd_mux_lock);
+ DTRACE_PROBE2(zfd__mux__link, void *, prim_zfds,
+ void *, zfds);
+
+ miocack(qp, mp, 0, 0);
+ return;
+ }
+ case ZFD_MUX_FLOWCON: {
+ /*
+ * We expect this ioctl to be issued against the
+ * log stream. We don't use the primary stream since
+ * there can be other streams modules pushed onto that
+ * stream which would interfere with the ioctl.
+ */
+ int val;
+ zfd_state_t *prim_zfds;
+
+ if (iocbp->ioc_count != TRANSPARENT ||
+ mp->b_cont == NULL) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ if (zfds->zfd_muxt != ZFD_LOG_STREAM) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+ prim_zfds = zfds->zfd_inst_pri;
+
+ /* Get the flow control setting */
+ val = *(int *)mp->b_cont->b_rptr;
+ if (val != 0 && val != 1) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ prim_zfds->zfd_allow_flowcon = (boolean_t)val;
+ if (!prim_zfds->zfd_allow_flowcon)
+ prim_zfds->zfd_is_flowcon = B_FALSE;
+
+ DTRACE_PROBE1(zfd__mux__flowcon, void *, prim_zfds);
+ miocack(qp, mp, 0, 0);
+ return;
+ }
+ default:
+ break;
+ }
+ }
+
+ /* if on the write side, may need to tee */
+ if (zfds->zfd_slave_rdq != NULL && qp == WR(zfds->zfd_slave_rdq)) {
+ /* tee output to any attached log stream */
+ zfd_tee_handler(zfds, type, mp);
+
+ /* high-priority msgs are not subject to flow control */
+ if (zfds->zfd_is_flowcon && type == M_DATA)
+ must_queue = B_TRUE;
+ }
+
+ if (zfd_switch(RD(qp)) == NULL) {
+ DBG1("wput to %s side (no one listening)", zfd_side(qp));
+ switch (type) {
+ case M_FLUSH:
+ handle_mflush(qp, mp);
+ break;
+ case M_IOCTL:
+ miocnak(qp, mp, 0, 0);
+ break;
+ default:
+ freemsg(mp);
+ break;
+ }
+ return;
+ }
+
+ if (type >= QPCTL) {
+ DBG1("(hipri) wput, %s side", zfd_side(qp));
+ switch (type) {
+ case M_READ: /* supposedly from ldterm? */
+ DBG("zfd_wput: tossing M_READ\n");
+ freemsg(mp);
+ break;
+ case M_FLUSH:
+ handle_mflush(qp, mp);
+ break;
+ default:
+ /*
+ * Put this to the other side.
+ */
+ ASSERT(zfd_switch(RD(qp)) != NULL);
+ putnext(zfd_switch(RD(qp)), mp);
+ break;
+ }
+ DBG1("done (hipri) wput, %s side", zfd_side(qp));
+ return;
+ }
+
+ /*
+ * If the primary stream has been stopped for flow control then
+	 * enqueue the msg; otherwise, only putnext if there isn't already
+	 * something in the queue. If we didn't do this, messages could wind
+	 * up out of order.
+ */
+ if (!must_queue && qp->q_first == NULL &&
+ bcanputnext(RD(zfd_switch(qp)), mp->b_band)) {
+ putnext(RD(zfd_switch(qp)), mp);
+ } else {
+ /*
+		 * Enqueue the msg here; zfd_wsrv() will handle it after
+		 * zfd_rsrv() performs the qenable on the proper queue.
+ */
+ (void) putq(qp, mp);
+ }
+
+ DBG1("done wput, %s side", zfd_side(qp));
+}
+
+/*
+ * Read server
+ *
+ * For primary stream:
+ * Under normal execution rsrv(9E) is symmetric for master and slave, so
+ * zfd_rsrv() can handle both without splitting up the codepath. We do this by
+ * enabling the write side of the partner. This triggers the partner to send
+ * messages queued on its write side to this queue's read side.
+ *
+ * For log stream:
+ * Internally we've queued up the msgs that we've teed off to the log stream
+ * so when we're invoked we need to pass these along.
+ */
+static void
+zfd_rsrv(queue_t *qp)
+{
+ zfd_state_t *zfds;
+ zfds = (zfd_state_t *)qp->q_ptr;
+
+ /*
+ * log stream server
+ */
+ if (zfds->zfd_muxt == ZFD_LOG_STREAM && zfds->zfd_slave_rdq != NULL) {
+ queue_t *log_qp;
+ mblk_t *mp;
+
+ log_qp = RD(zfds->zfd_slave_rdq);
+
+ if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+ zfd_state_t *pzfds = zfds->zfd_inst_pri;
+
+ while ((mp = getq(qp)) != NULL) {
+ if (bcanputnext(log_qp, mp->b_band)) {
+ putnext(log_qp, mp);
+ } else {
+ (void) putbq(log_qp, mp);
+ break;
+ }
+ }
+
+ if (log_qp->q_count < log_qp->q_lowat) {
+ DTRACE_PROBE(zfd__flow__on);
+ pzfds->zfd_is_flowcon = B_FALSE;
+ if (pzfds->zfd_master_rdq != NULL)
+ qenable(RD(pzfds->zfd_master_rdq));
+ }
+ } else {
+ /* No longer open, drain the queue */
+ while ((mp = getq(qp)) != NULL) {
+ freemsg(mp);
+ }
+ flushq(qp, FLUSHALL);
+ }
+ return;
+ }
+
+ /*
+	 * Care must be taken here, as either the master or the slave side
+ * qptr could be NULL.
+ */
+ ASSERT(qp == zfds->zfd_master_rdq || qp == zfds->zfd_slave_rdq);
+ if (zfd_switch(qp) == NULL) {
+ DBG("zfd_rsrv: other side isn't listening\n");
+ return;
+ }
+ qenable(WR(zfd_switch(qp)));
+}
+
+/*
+ * Write server
+ *
+ * This routine is symmetric for master and slave, so it handles both without
+ * splitting up the codepath.
+ *
+ * If there are messages on this queue that can be sent to the other, send
+ * them via putnext(). Else, if queued messages cannot be sent, leave them
+ * on this queue.
+ */
+static void
+zfd_wsrv(queue_t *qp)
+{
+ queue_t *swq;
+ mblk_t *mp;
+ zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr;
+
+ ASSERT(zfds != NULL);
+
+ /*
+	 * Partner has no read queue, so take the data and throw it away.
+ */
+ if (zfd_switch(RD(qp)) == NULL) {
+ DBG("zfd_wsrv: other side isn't listening");
+ while ((mp = getq(qp)) != NULL) {
+ if (mp->b_datap->db_type == M_IOCTL)
+ miocnak(qp, mp, 0, 0);
+ else
+ freemsg(mp);
+ }
+ flushq(qp, FLUSHALL);
+ return;
+ }
+
+ swq = RD(zfd_switch(qp));
+
+ /*
+ * while there are messages on this write queue...
+ */
+ while (!zfds->zfd_is_flowcon && (mp = getq(qp)) != NULL) {
+ /*
+ * Due to the way zfd_wput is implemented, we should never
+ * see a high priority control message here.
+ */
+ ASSERT(mp->b_datap->db_type < QPCTL);
+
+ if (bcanputnext(swq, mp->b_band)) {
+ putnext(swq, mp);
+ } else {
+ (void) putbq(qp, mp);
+ break;
+ }
+ }
+}
diff --git a/usr/src/uts/common/klm/klmmod.c b/usr/src/uts/common/klm/klmmod.c
index 51ed43e198..58e0f2d874 100644
--- a/usr/src/uts/common/klm/klmmod.c
+++ b/usr/src/uts/common/klm/klmmod.c
@@ -12,6 +12,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
/*
@@ -278,6 +279,10 @@ lm_svc(struct lm_svc_args *args)
rfs4_lease_time = args->grace;
}
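+	/*
+	 * The user-level lockd sets n_v4_only to -1 for a zone that runs
+	 * without rpcbind/rpc.statd and needs only NFSv4 locking (see the
+	 * lm_svc_args comment in nfssys.h).
+	 */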
+ if (args->n_v4_only == -1) {
+ g->nlm_v4_only = B_TRUE;
+ }
+
mutex_exit(&g->lock);
err = nlm_svc_starting(g, fp, netid, &knc);
mutex_enter(&g->lock);
diff --git a/usr/src/uts/common/klm/mapfile-mod b/usr/src/uts/common/klm/mapfile-mod
index 0debe6d986..b7789d81fd 100644
--- a/usr/src/uts/common/klm/mapfile-mod
+++ b/usr/src/uts/common/klm/mapfile-mod
@@ -11,6 +11,7 @@
#
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+# Copyright 2017 Joyent, Inc.
#
@@ -49,6 +50,11 @@ SYMBOL_SCOPE {
nlm_frlock;
nlm_register_lock_locally;
nlm_shrlock;
+# These four functions are available for use within a branded zone.
+ nlm_nsm_clnt_init;
+ nlm_netbuf_to_netobj;
+ sm_mon_1;
+ sm_unmon_1;
local:
*;
diff --git a/usr/src/uts/common/klm/nlm_dispatch.c b/usr/src/uts/common/klm/nlm_dispatch.c
index a0ca2a56c4..8fa9940eae 100644
--- a/usr/src/uts/common/klm/nlm_dispatch.c
+++ b/usr/src/uts/common/klm/nlm_dispatch.c
@@ -11,6 +11,7 @@
/*
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc. All rights reserved.
*/
/*
@@ -412,13 +413,13 @@ nlm_prog_3_dtable[] = {
0,
0 },
- { /* 16: not used */
- NLM_SVC_FUNC(0),
- (xdrproc_t)0,
- (xdrproc_t)0,
+ { /* 16: Linux NLMPROC_NSM_NOTIFY (same handling as NLM_SM_NOTIFY1) */
+ NLM_SVC_FUNC(nlm_sm_notify1_2_svc),
+ (xdrproc_t)xdr_nlm_sm_status,
+ (xdrproc_t)xdr_void,
NULL,
0,
- 0 },
+ NLM_DISP_NOREMOTE },
{ /* 17: NLM_SM_NOTIFY1 */
NLM_SVC_FUNC(nlm_sm_notify1_2_svc),
diff --git a/usr/src/uts/common/klm/nlm_impl.c b/usr/src/uts/common/klm/nlm_impl.c
index 1e9033a17c..e787f70ebd 100644
--- a/usr/src/uts/common/klm/nlm_impl.c
+++ b/usr/src/uts/common/klm/nlm_impl.c
@@ -28,6 +28,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc. All rights reserved.
*/
/*
@@ -57,6 +58,7 @@
#include <sys/queue.h>
#include <sys/bitmap.h>
#include <sys/sdt.h>
+#include <sys/brand.h>
#include <netinet/in.h>
#include <rpc/rpc.h>
@@ -202,6 +204,12 @@ static struct nlm_knc nlm_netconfigs[] = { /* (g) */
};
/*
+ * NLM functions which can be called by a brand hook.
+ */
+void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *);
+void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *);
+
+/*
* NLM misc. function
*/
static void nlm_copy_netbuf(struct netbuf *, struct netbuf *);
@@ -210,8 +218,6 @@ static void nlm_kmem_reclaim(void *);
static void nlm_pool_shutdown(void);
static void nlm_suspend_zone(struct nlm_globals *);
static void nlm_resume_zone(struct nlm_globals *);
-static void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *);
-static void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *);
/*
* NLM thread functions
@@ -1839,6 +1845,12 @@ nlm_host_unmonitor(struct nlm_globals *g, struct nlm_host *host)
return;
host->nh_flags &= ~NLM_NH_MONITORED;
+
+ if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_rpc_statd != NULL) {
+ ZBROP(curzone)->b_rpc_statd(SM_UNMON, g, host);
+ return;
+ }
+
stat = nlm_nsm_unmon(&g->nlm_nsm, host->nh_name);
if (stat != RPC_SUCCESS) {
NLM_WARN("NLM: Failed to contact statd, stat=%d\n", stat);
@@ -1877,6 +1889,11 @@ nlm_host_monitor(struct nlm_globals *g, struct nlm_host *host, int state)
host->nh_flags |= NLM_NH_MONITORED;
mutex_exit(&host->nh_lock);
+ if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_rpc_statd != NULL) {
+ ZBROP(curzone)->b_rpc_statd(SM_MON, g, host);
+ return;
+ }
+
/*
* Before we begin monitoring the host register the network address
* associated with this hostname.
@@ -2353,6 +2370,13 @@ nlm_svc_starting(struct nlm_globals *g, struct file *fp,
VERIFY(g->run_status == NLM_ST_STARTING);
VERIFY(g->nlm_gc_thread == NULL);
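+
+	/*
+	 * With no rpcbind in the zone there is no statd to register with,
+	 * so NSM setup is skipped below. The (void *)-1 ns_addr_handle
+	 * acts as a sentinel so that nlm_svc_stopping() knows there is no
+	 * NSM state to tear down.
+	 */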
+ if (g->nlm_v4_only) {
+ NLM_WARN("Zone %d has no rpcbind, NLM is v4 only", getzoneid());
+ bzero(&g->nlm_nsm, sizeof (struct nlm_nsm));
+ g->nlm_nsm.ns_addr_handle = (void *)-1;
+ goto v4_only;
+ }
+
error = nlm_nsm_init_local(&g->nlm_nsm);
if (error != 0) {
NLM_ERR("Failed to initialize NSM handler "
@@ -2389,6 +2413,7 @@ nlm_svc_starting(struct nlm_globals *g, struct file *fp,
"(rpcerr=%d)\n", stat);
goto shutdown_lm;
}
+v4_only:
g->grace_threshold = ddi_get_lbolt() +
SEC_TO_TICK(g->grace_period);
@@ -2512,7 +2537,9 @@ nlm_svc_stopping(struct nlm_globals *g)
ASSERT(TAILQ_EMPTY(&g->nlm_slocks));
- nlm_nsm_fini(&g->nlm_nsm);
+ /* If started with rpcbind (the normal case) */
+ if (g->nlm_nsm.ns_addr_handle != (void *)-1)
+ nlm_nsm_fini(&g->nlm_nsm);
g->lockd_pid = 0;
g->run_status = NLM_ST_DOWN;
}
@@ -2781,14 +2808,14 @@ nlm_cprresume(void)
rw_exit(&lm_lck);
}
-static void
+void
nlm_nsm_clnt_init(CLIENT *clnt, struct nlm_nsm *nsm)
{
(void) clnt_tli_kinit(clnt, &nsm->ns_knc, &nsm->ns_addr, 0,
NLM_RPC_RETRIES, kcred);
}
-static void
+void
nlm_netbuf_to_netobj(struct netbuf *addr, int *family, netobj *obj)
{
/* LINTED pointer alignment */
diff --git a/usr/src/uts/common/klm/nlm_impl.h b/usr/src/uts/common/klm/nlm_impl.h
index 6b2df7f8b0..2ac711f3c7 100644
--- a/usr/src/uts/common/klm/nlm_impl.h
+++ b/usr/src/uts/common/klm/nlm_impl.h
@@ -30,6 +30,7 @@
/*
* Copyright 2012 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/*
@@ -459,6 +460,7 @@ struct nlm_globals {
int cn_idle_tmo; /* (z) */
int grace_period; /* (z) */
int retrans_tmo; /* (z) */
+ boolean_t nlm_v4_only; /* (z) */
kmutex_t clean_lock; /* (c) */
TAILQ_ENTRY(nlm_globals) nlm_link; /* (g) */
};
diff --git a/usr/src/uts/common/krtld/kobj.c b/usr/src/uts/common/krtld/kobj.c
index 62eaabfb03..1038875bbc 100644
--- a/usr/src/uts/common/krtld/kobj.c
+++ b/usr/src/uts/common/krtld/kobj.c
@@ -2180,6 +2180,7 @@ static void
free_module_data(struct module *mp)
{
struct module_list *lp, *tmp;
+ hotinline_desc_t *hid, *next;
int ksyms_exported = 0;
lp = mp->head;
@@ -2189,6 +2190,15 @@ free_module_data(struct module *mp)
kobj_free((char *)tmp, sizeof (*tmp));
}
+ /* release hotinlines */
+ hid = mp->hi_calls;
+ while (hid != NULL) {
+ next = hid->hid_next;
+ kobj_free(hid->hid_symname, strlen(hid->hid_symname) + 1);
+ kobj_free(hid, sizeof (hotinline_desc_t));
+ hid = next;
+ }
+
rw_enter(&ksyms_lock, RW_WRITER);
if (mp->symspace) {
if (vmem_contains(ksyms_arena, mp->symspace, mp->symsize)) {
@@ -3034,8 +3044,18 @@ do_symbols(struct module *mp, Elf64_Addr bss_base)
if (sp->st_shndx == SHN_UNDEF) {
resolved = 0;
+ /*
+			 * Skip over sdt probes and smap calls;
+			 * they're relocated later.
+ */
if (strncmp(name, sdt_prefix, strlen(sdt_prefix)) == 0)
continue;
+#if defined(__x86)
+ if (strcmp(name, "smap_enable") == 0 ||
+ strcmp(name, "smap_disable") == 0)
+ continue;
+#endif /* defined(__x86) */
+
/*
* If it's not a weak reference and it's
diff --git a/usr/src/uts/common/mapfiles/README b/usr/src/uts/common/mapfiles/README
new file mode 100644
index 0000000000..5b65771325
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/README
@@ -0,0 +1,68 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+Kernel Module Build Time Symbol Verification
+--------------------------------------------
+
+Historically, kernel modules have all been built as relocatable objects.
+They are not dynamic objects and dependency information is always noted
+in individual makefiles. Along with this, there has never been any
+verification of the symbols that are being used. This means that it's
+possible for a kernel module author to refer to a symbol that doesn't
+exist and not find out until they try to install the module.
+
+To help find these problems at build time, we provide an opt-in system
+for modules to use, leveraging the link-editor's '-z defs' option. This
+option ensures that there are no undefined symbols at link-edit time.
+To provide these definitions we supply a series of mapfiles in this
+directory.
+
+These mapfiles are not the traditional versioning mapfiles like those in
+usr/src/lib/README.mapfiles! Please review the following differences
+closely:
+
+* These mapfiles do not declare any versions!
+* These mapfiles do not use the 'SYMBOL_VERSION' directive, instead they
+ use the 'SYMBOL_SCOPE' directive.
+* These mapfiles do not hide symbols! Library mapfiles always have
+ something to catch all local symbols. That should *never* be used
+  here. These mapfiles should not affect visibility.
+* All symbols in these mapfiles should be marked 'EXTERN' to indicate
+ that they are not provided by the kernel module but by another.
+* These mapfiles do not declare what is or isn't a public interface,
+ though they are often grouped around interfaces, to make it easier for
+ a driver author to get this right.
+
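+For example, an entry in one of these mapfiles takes the following shape
+(the 'mac_rx' symbol is purely illustrative here):
+
+	$mapfile_version 2
+
+	SYMBOL_SCOPE {
+		global:
+			mac_rx		{ FLAGS = EXTERN };
+	};
+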
+Mapfiles are organized based on kernel module. For example the GLDv3
+device driver interface is provided by the 'mac' module and thus is
+found in the file 'mac.mapfile'. The DDI is currently in the 'ddi'
+mapfile. Functions that are found in genunix and unix that aren't in
+the DDI should not be put in that mapfile.
+
+Note, the existing files may not be complete. These are intended to only
+have the public interfaces provided by modules and thus should not
+include every symbol in them. As the need arises, add new symbols or
+modules as appropriate.
+
+To opt a module into this, first declare the series of mapfiles that the
+module should be checked against, using the MAPFILES variable in its
+makefile. This should be a series of one or more files, for example:
+
+MAPFILES += ddi mac
+
+Next, you should add an include of Makefile.mapfile right before you
+include Makefile.targ. You can do this with the following line:
+
+include $(UTSBASE)/Makefile.mapfile
diff --git a/usr/src/uts/common/mapfiles/ddi.mapfile b/usr/src/uts/common/mapfiles/ddi.mapfile
new file mode 100644
index 0000000000..1377af5857
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/ddi.mapfile
@@ -0,0 +1,192 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+#	usr/src/uts/common/mapfiles/README
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+#
+# This file contains core functions provided by the DDI and also items
+# required as part of the platform's runtime ABI (think compiler
+# functions).
+#
+
+$mapfile_version 2
+
+SYMBOL_SCOPE {
+ global:
+ __divdi3 { FLAGS = EXTERN };
+ __stack_chk_fail { FLAGS = EXTERN };
+ __stack_chk_guard { FLAGS = EXTERN };
+ allocb { FLAGS = EXTERN };
+ assfail { FLAGS = EXTERN };
+ assfail3 { FLAGS = EXTERN };
+ atomic_dec_32_nv { FLAGS = EXTERN };
+ bcmp { FLAGS = EXTERN };
+ bcopy { FLAGS = EXTERN };
+ bzero { FLAGS = EXTERN };
+ cmn_err { FLAGS = EXTERN };
+ cv_broadcast { FLAGS = EXTERN };
+ cv_destroy { FLAGS = EXTERN };
+ cv_init { FLAGS = EXTERN };
+ cv_reltimedwait { FLAGS = EXTERN };
+ ddi_cb_register { FLAGS = EXTERN };
+ ddi_cb_unregister { FLAGS = EXTERN };
+ ddi_dev_regsize { FLAGS = EXTERN };
+ ddi_dma_addr_bind_handle { FLAGS = EXTERN };
+ ddi_dma_alloc_handle { FLAGS = EXTERN };
+ ddi_dma_free_handle { FLAGS = EXTERN };
+ ddi_dma_mem_alloc { FLAGS = EXTERN };
+ ddi_dma_mem_free { FLAGS = EXTERN };
+ ddi_dma_nextcookie { FLAGS = EXTERN };
+ ddi_dma_sync { FLAGS = EXTERN };
+ ddi_dma_unbind_handle { FLAGS = EXTERN };
+ ddi_fls { FLAGS = EXTERN };
+ ddi_fm_acc_err_clear { FLAGS = EXTERN };
+ ddi_fm_acc_err_get { FLAGS = EXTERN };
+ ddi_fm_dma_err_get { FLAGS = EXTERN };
+ ddi_fm_ereport_post { FLAGS = EXTERN };
+ ddi_fm_fini { FLAGS = EXTERN };
+ ddi_fm_handler_register { FLAGS = EXTERN };
+ ddi_fm_handler_unregister { FLAGS = EXTERN };
+ ddi_fm_init { FLAGS = EXTERN };
+ ddi_fm_service_impact { FLAGS = EXTERN };
+ ddi_get_driver_private { FLAGS = EXTERN };
+ ddi_get_instance { FLAGS = EXTERN };
+ ddi_get_lbolt { FLAGS = EXTERN };
+ ddi_get_lbolt64 { FLAGS = EXTERN };
+ ddi_get_name { FLAGS = EXTERN };
+ ddi_get_parent { FLAGS = EXTERN };
+ ddi_get16 { FLAGS = EXTERN };
+ ddi_get32 { FLAGS = EXTERN };
+ ddi_get64 { FLAGS = EXTERN };
+ ddi_intr_add_handler { FLAGS = EXTERN };
+ ddi_intr_alloc { FLAGS = EXTERN };
+ ddi_intr_block_disable { FLAGS = EXTERN };
+ ddi_intr_block_enable { FLAGS = EXTERN };
+ ddi_intr_disable { FLAGS = EXTERN };
+ ddi_intr_enable { FLAGS = EXTERN };
+ ddi_intr_free { FLAGS = EXTERN };
+ ddi_intr_get_cap { FLAGS = EXTERN };
+ ddi_intr_get_navail { FLAGS = EXTERN };
+ ddi_intr_get_nintrs { FLAGS = EXTERN };
+ ddi_intr_get_pri { FLAGS = EXTERN };
+ ddi_intr_get_supported_types { FLAGS = EXTERN };
+ ddi_intr_remove_handler { FLAGS = EXTERN };
+ ddi_periodic_add { FLAGS = EXTERN };
+ ddi_periodic_delete { FLAGS = EXTERN };
+ ddi_power { FLAGS = EXTERN };
+ ddi_prop_free { FLAGS = EXTERN };
+ ddi_prop_get_int { FLAGS = EXTERN };
+ ddi_prop_lookup_int_array { FLAGS = EXTERN };
+ ddi_prop_op { FLAGS = EXTERN };
+ ddi_prop_remove_all { FLAGS = EXTERN };
+ ddi_prop_update_int_array { FLAGS = EXTERN };
+ ddi_prop_update_string { FLAGS = EXTERN };
+ ddi_ptob { FLAGS = EXTERN };
+ ddi_put16 { FLAGS = EXTERN };
+ ddi_put32 { FLAGS = EXTERN };
+ ddi_quiesce_not_supported { FLAGS = EXTERN };
+ ddi_regs_map_free { FLAGS = EXTERN };
+ ddi_regs_map_setup { FLAGS = EXTERN };
+ ddi_set_driver_private { FLAGS = EXTERN };
+ ddi_strtol { FLAGS = EXTERN };
+ ddi_taskq_create { FLAGS = EXTERN };
+ ddi_taskq_destroy { FLAGS = EXTERN };
+ ddi_taskq_dispatch { FLAGS = EXTERN };
+ delay { FLAGS = EXTERN };
+ desballoc { FLAGS = EXTERN };
+ dev_err { FLAGS = EXTERN };
+ drv_usectohz { FLAGS = EXTERN };
+ drv_usecwait { FLAGS = EXTERN };
+ fm_ena_generate { FLAGS = EXTERN };
+ freeb { FLAGS = EXTERN };
+ freemsg { FLAGS = EXTERN };
+ freemsgchain { FLAGS = EXTERN };
+ gethrtime { FLAGS = EXTERN };
+ kmem_alloc { FLAGS = EXTERN };
+ kmem_free { FLAGS = EXTERN };
+ kmem_zalloc { FLAGS = EXTERN };
+ kstat_create { FLAGS = EXTERN };
+ kstat_delete { FLAGS = EXTERN };
+ kstat_install { FLAGS = EXTERN };
+ kstat_named_init { FLAGS = EXTERN };
+ list_create { FLAGS = EXTERN };
+ list_destroy { FLAGS = EXTERN };
+ list_head { FLAGS = EXTERN };
+ list_insert_tail { FLAGS = EXTERN };
+ list_next { FLAGS = EXTERN };
+ list_remove { FLAGS = EXTERN };
+ list_remove_head { FLAGS = EXTERN };
+ memcpy { FLAGS = EXTERN };
+ memset { FLAGS = EXTERN };
+ miocack { FLAGS = EXTERN };
+ miocnak { FLAGS = EXTERN };
+ mod_driverops { FLAGS = EXTERN };
+ mod_info { FLAGS = EXTERN };
+ mod_install { FLAGS = EXTERN };
+ mod_remove { FLAGS = EXTERN };
+ msgpullup { FLAGS = EXTERN };
+ msgsize { FLAGS = EXTERN };
+ mutex_destroy { FLAGS = EXTERN };
+ mutex_enter { FLAGS = EXTERN };
+ mutex_exit { FLAGS = EXTERN };
+ mutex_init { FLAGS = EXTERN };
+ mutex_owned { FLAGS = EXTERN };
+ mutex_tryenter { FLAGS = EXTERN };
+ nochpoll { FLAGS = EXTERN };
+ nodev { FLAGS = EXTERN };
+ nulldev { FLAGS = EXTERN };
+ panic { FLAGS = EXTERN };
+ pci_config_get16 { FLAGS = EXTERN };
+ pci_config_get32 { FLAGS = EXTERN };
+ pci_config_get64 { FLAGS = EXTERN };
+ pci_config_get8 { FLAGS = EXTERN };
+ pci_config_put16 { FLAGS = EXTERN };
+ pci_config_put32 { FLAGS = EXTERN };
+ pci_config_put64 { FLAGS = EXTERN };
+ pci_config_put8 { FLAGS = EXTERN };
+ pci_config_setup { FLAGS = EXTERN };
+ pci_config_teardown { FLAGS = EXTERN };
+ pci_ereport_post { FLAGS = EXTERN };
+ pci_ereport_setup { FLAGS = EXTERN };
+ pci_ereport_teardown { FLAGS = EXTERN };
+ pci_lcap_locate { FLAGS = EXTERN };
+ qreply { FLAGS = EXTERN };
+ rw_destroy { FLAGS = EXTERN };
+ rw_enter { FLAGS = EXTERN };
+ rw_exit { FLAGS = EXTERN };
+ rw_init { FLAGS = EXTERN };
+ snprintf { FLAGS = EXTERN };
+ sprintf { FLAGS = EXTERN };
+ strcat { FLAGS = EXTERN };
+ strcmp { FLAGS = EXTERN };
+ strcpy { FLAGS = EXTERN };
+ strlen { FLAGS = EXTERN };
+ timeout { FLAGS = EXTERN };
+ untimeout { FLAGS = EXTERN };
+ vsnprintf { FLAGS = EXTERN };
+ vsprintf { FLAGS = EXTERN };
+};
diff --git a/usr/src/uts/common/mapfiles/dtrace.mapfile.awk b/usr/src/uts/common/mapfiles/dtrace.mapfile.awk
new file mode 100644
index 0000000000..b8a7e2d372
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/dtrace.mapfile.awk
@@ -0,0 +1,34 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# This script is designed to assemble a mapfile for DTrace probes.
+#
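+# The input is expected to be a symbol listing with the symbol name in
+# the first field; every __dtrace_probe_* symbol encountered becomes an
+# EXTERN entry in the emitted mapfile.
+#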
+BEGIN {
+ print "#"
+ print "# This file is autogenerated by dtrace.mapfile.awk"
+ print "#"
+ print "$mapfile_version 2"
+ print "SYMBOL_SCOPE {"
+ print " global:"
+}
+
+/__dtrace_probe_/ {
+ printf "\t%s\t{ FLAGS = EXTERN };\n", $1
+}
+
+END {
+ print "};"
+}
diff --git a/usr/src/uts/common/mapfiles/kernel.mapfile b/usr/src/uts/common/mapfiles/kernel.mapfile
new file mode 100644
index 0000000000..6bddb3c7ef
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/kernel.mapfile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+#	usr/src/uts/common/mapfiles/README
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+#
+# This file contains functions provided by the kernel that various
+# modules use. This is a combination of things in both unix and genunix.
+#
+
+$mapfile_version 2
+
+SYMBOL_SCOPE {
+ global:
+ bt_getlowbit { FLAGS = EXTERN };
+ servicing_interrupt { FLAGS = EXTERN };
+};
diff --git a/usr/src/uts/common/mapfiles/mac.mapfile b/usr/src/uts/common/mapfiles/mac.mapfile
new file mode 100644
index 0000000000..d40c09b311
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/mac.mapfile
@@ -0,0 +1,57 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+#	usr/src/uts/common/mapfiles/README
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_SCOPE {
+ global:
+ mac_alloc { FLAGS = EXTERN };
+ mac_fini_ops { FLAGS = EXTERN };
+ mac_free { FLAGS = EXTERN };
+ mac_hcksum_get { FLAGS = EXTERN };
+ mac_hcksum_set { FLAGS = EXTERN };
+ mac_init_ops { FLAGS = EXTERN };
+ mac_link_update { FLAGS = EXTERN };
+ mac_lso_get { FLAGS = EXTERN };
+ mac_maxsdu_update { FLAGS = EXTERN };
+ mac_prop_info_set_default_link_flowctrl { FLAGS = EXTERN };
+ mac_prop_info_set_default_str { FLAGS = EXTERN };
+ mac_prop_info_set_default_uint8 { FLAGS = EXTERN };
+ mac_prop_info_set_perm { FLAGS = EXTERN };
+ mac_prop_info_set_range_uint32 { FLAGS = EXTERN };
+ mac_ring_intr_set { FLAGS = EXTERN };
+ mac_register { FLAGS = EXTERN };
+ mac_rx { FLAGS = EXTERN };
+ mac_rx_ring { FLAGS = EXTERN };
+ mac_transceiver_info_set_present { FLAGS = EXTERN };
+ mac_transceiver_info_set_usable { FLAGS = EXTERN };
+ mac_tx_ring_update { FLAGS = EXTERN };
+ mac_tx_update { FLAGS = EXTERN };
+ mac_unregister { FLAGS = EXTERN };
+};
diff --git a/usr/src/uts/common/mapfiles/random.mapfile b/usr/src/uts/common/mapfiles/random.mapfile
new file mode 100644
index 0000000000..d3d8bc89fa
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/random.mapfile
@@ -0,0 +1,37 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+#	usr/src/uts/common/mapfiles/README
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_SCOPE {
+ global:
+ random_get_bytes { FLAGS = EXTERN };
+ random_get_blocking_bytes { FLAGS = EXTERN };
+ random_get_pseudo_bytes { FLAGS = EXTERN };
+};
diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h
index 9ac3066362..6a4f538c97 100644
--- a/usr/src/uts/common/netinet/in.h
+++ b/usr/src/uts/common/netinet/in.h
@@ -3,6 +3,7 @@
* Use is subject to license terms.
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
/*
* Copyright (c) 1982, 1986 Regents of the University of California.
@@ -225,6 +226,7 @@ typedef uint16_t sa_family_t;
#define IPPORT_SLP 427
#define IPPORT_MIP 434
#define IPPORT_SMB 445 /* a.k.a. microsoft-ds */
+#define IPPORT_VXLAN 4789
/*
* Internet Key Exchange (IKE) ports
@@ -268,6 +270,11 @@ typedef uint16_t sa_family_t;
#define IPPORT_RESERVED 1024
#define IPPORT_USERRESERVED 5000
+#ifdef _KERNEL
+#define IPPORT_DYNAMIC_MIN 49152
+#define IPPORT_DYNAMIC_MAX 65535
+#endif
+
/*
* Link numbers
*/
diff --git a/usr/src/uts/common/netinet/udp.h b/usr/src/uts/common/netinet/udp.h
index c65a9bad3a..74cff75d43 100644
--- a/usr/src/uts/common/netinet/udp.h
+++ b/usr/src/uts/common/netinet/udp.h
@@ -1,6 +1,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
/*
@@ -17,9 +18,6 @@
#ifndef _NETINET_UDP_H
#define _NETINET_UDP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-/* udp.h 1.7 88/08/19 SMI; from UCB 7.1 6/5/86 */
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -36,6 +34,16 @@ struct udphdr {
#define UDP_EXCLBIND 0x0101 /* for internal use only */
#define UDP_RCVHDR 0x0102 /* for internal use only */
#define UDP_NAT_T_ENDPOINT 0x0103 /* for internal use only */
+#define UDP_SRCPORT_HASH 0x0104 /* for internal use only */
+#define UDP_SND_TO_CONNECTED 0x0105 /* for internal use only */
+
+/*
+ * Hash definitions for UDP_SRCPORT_HASH that tell UDP how the source
+ * port hash should be computed when UDP_SRCPORT_HASH is set.
+ */
+#define UDP_HASH_DISABLE 0x0000 /* for internal use only */
+#define UDP_HASH_VXLAN 0x0001 /* for internal use only */
+
/*
* Following option in UDP_ namespace required to be exposed through
* <xti.h> (It also requires exposing options not implemented). The options
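A hedged sketch of how an in-kernel consumer might set the new private option pair; ksocket_setsockopt() is the assumed entry point, and the function shown is illustrative rather than the actual overlay code:

    #include <sys/ksocket.h>
    #include <netinet/in.h>
    #include <netinet/udp.h>

    /*
     * Ask UDP to hash source ports for a tunnel socket using the
     * VXLAN input-keying rules.
     */
    static int
    tunnel_enable_srcport_hash(ksocket_t ks, cred_t *cr)
    {
            uint_t hash = UDP_HASH_VXLAN;

            return (ksocket_setsockopt(ks, IPPROTO_UDP, UDP_SRCPORT_HASH,
                &hash, sizeof (hash), cr));
    }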
diff --git a/usr/src/uts/common/nfs/nfssys.h b/usr/src/uts/common/nfs/nfssys.h
index e9a2746017..7d2401856c 100644
--- a/usr/src/uts/common/nfs/nfssys.h
+++ b/usr/src/uts/common/nfs/nfssys.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -122,13 +123,20 @@ struct nfs_revauth_args32 {
enum lm_fmly { LM_INET, LM_INET6, LM_LOOPBACK };
enum lm_proto { LM_TCP, LM_UDP };
+/*
+ * The 'n_v4_only' member was formerly called 'debug'. This member is not used
+ * in the kernel. To avoid a new version of this user/kernel interface
+ * structure, the member was renamed in a binary compatible way. It is now used
+ * by the user-level code to indicate that the zone is not running
+ * rpcbind/rpc.statd and that only NFSv4 locking is needed.
+ */
struct lm_svc_args {
int version; /* keep this first */
int fd;
enum lm_fmly n_fmly; /* protocol family */
enum lm_proto n_proto; /* protocol */
dev_t n_rdev; /* device ID */
- int debug; /* debugging level */
+ int n_v4_only; /* NFSv4 locking only */
time_t timout; /* client handle life (asynch RPCs) */
int grace; /* secs in grace period */
time_t retransmittimeout; /* retransmission interval */
@@ -141,7 +149,7 @@ struct lm_svc_args32 {
enum lm_fmly n_fmly; /* protocol family */
enum lm_proto n_proto; /* protocol */
dev32_t n_rdev; /* device ID */
- int32_t debug; /* debugging level */
+ int32_t n_v4_only; /* NFSv4 locking only */
time32_t timout; /* client handle life (asynch RPCs) */
int32_t grace; /* secs in grace period */
time32_t retransmittimeout; /* retransmission interval */
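A user-level sketch of the renamed member in use, under the assumption that LM_SVC_CUR_VERS is the current interface-version macro; the helper is illustrative:

    #include <nfs/nfssys.h>

    /*
     * Initialize lock-manager service args for a zone that runs
     * neither rpcbind nor rpc.statd, so only NFSv4 locking applies.
     */
    static void
    lm_args_init(struct lm_svc_args *args)
    {
            args->version = LM_SVC_CUR_VERS;  /* assumed version macro */
            args->n_v4_only = 1;              /* NFSv4 locking only */
    }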
diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c
index e598e0d08d..891c4e0836 100644
--- a/usr/src/uts/common/os/acct.c
+++ b/usr/src/uts/common/os/acct.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -47,6 +48,7 @@
#include <sys/time.h>
#include <sys/msacct.h>
#include <sys/zone.h>
+#include <sys/brand.h>
/*
* Each zone has its own accounting settings (on or off) and associated
@@ -373,7 +375,7 @@ acct_compress(ulong_t t)
* On exit, write a record on the accounting file.
*/
void
-acct(char st)
+acct(int st)
{
struct vnode *vp;
struct cred *cr;
@@ -402,6 +404,21 @@ acct(char st)
* This only gets called from exit after all lwp's have exited so no
* cred locking is needed.
*/
+
+ /* If there is a brand-specific hook, use it instead */
+ if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) {
+ ZBROP(curzone)->b_acct_out(vp, st);
+ mutex_exit(&ag->aclock);
+ return;
+ }
+
+ /*
+ * The 'st' status value was traditionally masked to its low byte by
+ * our caller, but we now accept the unmasked value so the brand hook
+ * above can see it. Processes not using the brand hook mask the
+ * status here, preserving the traditional behavior.
+ */
+ st &= 0xff;
+
p = curproc;
ua = PTOU(p);
bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm));
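A sketch of what a brand-side b_acct_out hook might look like, matching the call above; the brand name and body are placeholders:

    #include <sys/vnode.h>

    /*
     * Receive the accounting vnode and the unmasked wait status, and
     * emit a brand-native accounting record (elided here).
     */
    static void
    mybrand_acct_out(struct vnode *vp, int st)
    {
            /* write the record via vp; 'st' is the raw wait status */
    }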
diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c
index 773ecc9c6a..ecf396f926 100644
--- a/usr/src/uts/common/os/brand.c
+++ b/usr/src/uts/common/os/brand.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/kmem.h>
@@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = {
};
#else /* !__sparcv9 */
struct brand_mach_ops native_mach_ops = {
- NULL, NULL, NULL, NULL
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL
};
#endif /* !__sparcv9 */
@@ -53,7 +54,8 @@ brand_t native_brand = {
BRAND_VER_1,
"native",
NULL,
- &native_mach_ops
+ &native_mach_ops,
+ 0
};
/*
@@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp)
mutex_exit(&brand_list_lock);
}
-void
-brand_setbrand(proc_t *p)
+int
+brand_setbrand(proc_t *p, boolean_t lwps_ok)
{
brand_t *bp = p->p_zone->zone_brand;
+ void *brand_data = NULL;
- ASSERT(bp != NULL);
- ASSERT(p->p_brand == &native_brand);
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+ VERIFY(bp != NULL);
/*
- * We should only be called from exec(), when we know the process
- * is single-threaded.
+ * Process branding occurs during fork() and exec(). When it happens
+ * during fork(), the LWP count will always be 0 since branding is
+ * performed as part of getproc(), before LWPs have been associated.
+ * The same is not true during exec(), where a multi-LWP process may
+ * undergo branding just prior to gexec(), ensuring that exec-related
+ * brand hooks are available. While it may seem
+ * complicated to brand a multi-LWP process, the two possible outcomes
+ * simplify things:
+ *
+ * 1. The exec() succeeds: LWPs besides the caller will be killed and
+ * any further branding will occur in a single-LWP context.
+ * 2. The exec() fails: The process will be promptly unbranded since
+ * the hooks are no longer needed.
+ *
+ * To prevent inconsistent brand state from being encountered during
+ * the exec(), LWPs beyond the caller which are associated with this
+ * process must be held temporarily. They will be released either when
+ * they are killed in the exec() success, or when the brand is cleared
+ * after exec() failure.
*/
- ASSERT(p->p_tlist == p->p_tlist->t_forw);
+ if (lwps_ok) {
+ /*
+ * We've been called from an exec() context where tolerating
+ * the existence of multiple LWPs during branding is necessary.
+ */
+ VERIFY(p == curproc);
+ VERIFY(p->p_tlist != NULL);
+ if (p->p_tlist != p->p_tlist->t_forw) {
+ /*
+ * Multiple LWPs are present. Hold all but the caller.
+ */
+ if (!holdlwps(SHOLDFORK1)) {
+ return (-1);
+ }
+ }
+ } else {
+ /*
+ * Processes branded during fork() should not have LWPs at all.
+ */
+ VERIFY(p->p_tlist == NULL);
+ }
+
+ if (bp->b_data_size > 0) {
+ brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP);
+ }
+
+ mutex_enter(&p->p_lock);
+ ASSERT(!PROC_IS_BRANDED(p));
p->p_brand = bp;
+ p->p_brand_data = brand_data;
ASSERT(PROC_IS_BRANDED(p));
BROP(p)->b_setbrand(p);
+ mutex_exit(&p->p_lock);
+ return (0);
}
void
-brand_clearbrand(proc_t *p, boolean_t no_lwps)
+brand_clearbrand(proc_t *p, boolean_t lwps_ok)
{
brand_t *bp = p->p_zone->zone_brand;
- klwp_t *lwp = NULL;
- ASSERT(bp != NULL);
- ASSERT(!no_lwps || (p->p_tlist == NULL));
+ void *brand_data;
- /*
- * If called from exec_common() or proc_exit(),
- * we know the process is single-threaded.
- * If called from fork_fail, p_tlist is NULL.
- */
- if (!no_lwps) {
- ASSERT(p->p_tlist == p->p_tlist->t_forw);
- lwp = p->p_tlist->t_lwp;
- }
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+ VERIFY(bp != NULL);
+ VERIFY(PROC_IS_BRANDED(p));
- ASSERT(PROC_IS_BRANDED(p));
- BROP(p)->b_proc_exit(p, lwp);
+ if (BROP(p)->b_clearbrand != NULL)
+ BROP(p)->b_clearbrand(p, lwps_ok);
+
+ mutex_enter(&p->p_lock);
p->p_brand = &native_brand;
+ brand_data = p->p_brand_data;
+ p->p_brand_data = NULL;
+
+ if (lwps_ok) {
+ VERIFY(p == curproc);
+ /*
+ * A process with multiple LWPs is being de-branded after
+ * failing an exec. The other LWPs were held as part of the
+ * procedure, so they must be resumed now.
+ */
+ if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) {
+ continuelwps(p);
+ }
+ } else {
+ /*
+ * While clearing the brand, it's ok for one LWP to be present.
+ * This happens when a native binary is executed inside a
+ * branded zone, since the brand will be removed during the
+ * course of a successful exec.
+ */
+ VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw);
+ }
+ mutex_exit(&p->p_lock);
+
+ if (brand_data != NULL) {
+ kmem_free(brand_data, bp->b_data_size);
+ }
}
#if defined(__sparcv9)
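A caller-side sketch of the reworked pairing, under the assumptions noted in the comments above (do_exec() is a hypothetical stand-in for the exec path):

    /*
     * exec()-time branding passes lwps_ok == B_TRUE and must handle
     * failure; fork()-time branding passes B_FALSE and has no LWPs.
     */
    static int
    exec_brand_sketch(proc_t *p)
    {
            if (brand_setbrand(p, B_TRUE) != 0)
                    return (-1);            /* holdlwps() interrupted */

            if (do_exec(p) != 0) {          /* hypothetical exec step */
                    brand_clearbrand(p, B_TRUE);  /* resumes held LWPs */
                    return (-1);
            }
            return (0);
    }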
@@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
return (ENOSYS);
/* For all other operations this must be a branded process. */
- if (p->p_brand == &native_brand)
+ if (!PROC_IS_BRANDED(p))
return (ENOSYS);
ASSERT(p->p_brand == pbrand);
@@ -600,16 +671,16 @@ restoreexecenv(struct execenv *ep, stack_t *sp)
/*ARGSUSED*/
int
brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
- intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file,
- cred_t *cred, int brand_action, struct brand *pbrand, char *bname,
- char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32)
+ intpdata_t *idatap, int level, size_t *execsz, int setid,
+ caddr_t exec_file, cred_t *cred, int *brand_action, struct brand *pbrand,
+ char *bname, char *brandlib, char *brandlib32)
{
vnode_t *nvp;
Ehdr ehdr;
Addr uphdr_vaddr;
intptr_t voffset;
- int interp;
+ char *interp;
int i, err;
struct execenv env;
struct execenv origenv;
@@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
klwp_t *lwp = ttolwp(curthread);
brand_proc_data_t *spd;
brand_elf_data_t sed, *sedp;
- char *linker;
uintptr_t lddata; /* lddata of executable's linker */
ASSERT(curproc->p_brand == pbrand);
@@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
*/
if (args->to_model == DATAMODEL_NATIVE) {
args->emulator = brandlib;
- linker = brandlinker;
}
#if defined(_LP64)
else {
args->emulator = brandlib32;
- linker = brandlinker32;
}
#endif /* _LP64 */
@@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
if (args->to_model == DATAMODEL_NATIVE) {
err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
&voffset, exec_file, &interp, &env.ex_bssbase,
- &env.ex_brkbase, &env.ex_brksize, NULL);
+ &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
}
#if defined(_LP64)
else {
@@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
Elf32_Addr uphdr_vaddr32;
err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
&voffset, exec_file, &interp, &env.ex_bssbase,
- &env.ex_brkbase, &env.ex_brksize, NULL);
+ &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
Ehdr32to64(&ehdr32, &ehdr);
if (uphdr_vaddr32 == (Elf32_Addr)-1)
@@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
#endif /* _LP64 */
if (err != 0) {
restoreexecenv(&origenv, &orig_sigaltstack);
+
+ if (interp != NULL)
+ kmem_free(interp, MAXPATHLEN);
+
return (err);
}
@@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
sedp->sed_phent = ehdr.e_phentsize;
sedp->sed_phnum = ehdr.e_phnum;
- if (interp) {
+ if (interp != NULL) {
if (ehdr.e_type == ET_DYN) {
/*
* This is a shared object executable, so we
@@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
* it in and store relevant information about it in the
* aux vector, where the brand library can find it.
*/
- if ((err = lookupname(linker, UIO_SYSSPACE,
+ if ((err = lookupname(interp, UIO_SYSSPACE,
FOLLOW, NULLVPP, &nvp)) != 0) {
- uprintf("%s: not found.", brandlinker);
+ uprintf("%s: not found.", interp);
restoreexecenv(&origenv, &orig_sigaltstack);
+ kmem_free(interp, MAXPATHLEN);
return (err);
}
+
+ kmem_free(interp, MAXPATHLEN);
+
if (args->to_model == DATAMODEL_NATIVE) {
err = mapexec_brand(nvp, args, &ehdr,
&uphdr_vaddr, &voffset, exec_file, &interp,
- NULL, NULL, NULL, &lddata);
+ NULL, NULL, NULL, &lddata, NULL);
}
#if defined(_LP64)
else {
@@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
Elf32_Addr uphdr_vaddr32;
err = mapexec32_brand(nvp, args, &ehdr32,
&uphdr_vaddr32, &voffset, exec_file, &interp,
- NULL, NULL, NULL, &lddata);
+ NULL, NULL, NULL, &lddata, NULL);
Ehdr32to64(&ehdr32, &ehdr);
if (uphdr_vaddr32 == (Elf32_Addr)-1)
@@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
/*
* Third, the /proc aux vectors set up by elfexec() point to
- * brand emulation library and it's linker. Copy these to the
+ * brand emulation library and its linker. Copy these to the
* /proc brand specific aux vector, and update the regular
- * /proc aux vectors to point to the executable (and it's
+ * /proc aux vectors to point to the executable (and its
* linker). This will enable debuggers to access the
* executable via the usual /proc or elf notes aux vectors.
*
@@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand)
}
/*ARGSUSED*/
-int
+void
brand_solaris_initlwp(klwp_t *l, struct brand *pbrand)
{
ASSERT(l->lwp_procp->p_brand == pbrand);
ASSERT(l->lwp_procp->p_brand_data != NULL);
ASSERT(l->lwp_brand == NULL);
l->lwp_brand = (void *)-1;
- return (0);
}
/*ARGSUSED*/
void
brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand)
{
- proc_t *p = l->lwp_procp;
-
ASSERT(l->lwp_procp->p_brand == pbrand);
ASSERT(l->lwp_procp->p_brand_data != NULL);
ASSERT(l->lwp_brand != NULL);
-
- /*
- * We should never be called for the last thread in a process.
- * (That case is handled by brand_solaris_proc_exit().)
- * Therefore this lwp must be exiting from a multi-threaded
- * process.
- */
- ASSERT(p->p_tlist != p->p_tlist->t_forw);
-
- l->lwp_brand = NULL;
}
/*ARGSUSED*/
void
-brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand)
+brand_solaris_proc_exit(struct proc *p, struct brand *pbrand)
{
ASSERT(p->p_brand == pbrand);
ASSERT(p->p_brand_data != NULL);
- /*
- * When called from proc_exit(), we know that process is
- * single-threaded and free our lwp brand data.
- * otherwise just free p_brand_data and return.
- */
- if (l != NULL) {
- ASSERT(p->p_tlist == p->p_tlist->t_forw);
- ASSERT(p->p_tlist->t_lwp == l);
- (void) brand_solaris_freelwp(l, pbrand);
- }
-
/* upon exit, free our proc brand data */
kmem_free(p->p_brand_data, sizeof (brand_proc_data_t));
p->p_brand_data = NULL;
@@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand)
ASSERT(p->p_tlist == p->p_tlist->t_forw);
p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP);
- (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand);
}
diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c
index 805813037d..1280c8a1b6 100644
--- a/usr/src/uts/common/os/clock_highres.c
+++ b/usr/src/uts/common/os/clock_highres.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2015, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
*/
#include <sys/timer.h>
@@ -41,6 +41,9 @@
static clock_backend_t clock_highres;
+/* minimum non-privileged interval (200us) */
+long clock_highres_interval_min = 200000;
+
/*ARGSUSED*/
static int
clock_highres_settime(timespec_t *ts)
@@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts)
static int
clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *))
{
- /*
- * CLOCK_HIGHRES timers of sufficiently high resolution can deny
- * service; only allow privileged users to create such timers.
- * Sites that do not wish to have this restriction should
- * give users the "proc_clock_highres" privilege.
- */
- if (secpolicy_clock_highres(CRED()) != 0) {
- it->it_arg = NULL;
- return (EPERM);
- }
-
it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP);
it->it_fire = fire;
@@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags,
cpu_t *cpu;
cpupart_t *cpupart;
int pset;
+ boolean_t value_need_clamp = B_FALSE;
+ boolean_t intval_need_clamp = B_FALSE;
+ cred_t *cr = CRED();
+ struct itimerspec clamped;
+
+ /*
+ * CLOCK_HIGHRES timers of sufficiently high resolution can deny
+ * service; only allow privileged users to create such timers.
+ * Non-privileged users (those without the "proc_clock_highres"
+ * privilege) can create timers with lower resolution; if they
+ * attempt to use a very low time value (< 200us), the
+ * timer will be clamped to 200us.
+ */
+ if (when->it_value.tv_sec == 0 &&
+ when->it_value.tv_nsec > 0 &&
+ when->it_value.tv_nsec < clock_highres_interval_min)
+ value_need_clamp = B_TRUE;
+
+ if (when->it_interval.tv_sec == 0 &&
+ when->it_interval.tv_nsec > 0 &&
+ when->it_interval.tv_nsec < clock_highres_interval_min)
+ intval_need_clamp = B_TRUE;
+
+ if ((value_need_clamp || intval_need_clamp) &&
+ secpolicy_clock_highres(cr) != 0) {
+ clamped.it_value.tv_sec = when->it_value.tv_sec;
+ clamped.it_interval.tv_sec = when->it_interval.tv_sec;
+
+ if (value_need_clamp) {
+ clamped.it_value.tv_nsec = clock_highres_interval_min;
+ } else {
+ clamped.it_value.tv_nsec = when->it_value.tv_nsec;
+ }
+
+ if (intval_need_clamp) {
+ clamped.it_interval.tv_nsec =
+ clock_highres_interval_min;
+ } else {
+ clamped.it_interval.tv_nsec = when->it_interval.tv_nsec;
+ }
+
+ when = &clamped;
+ }
cyctime.cyt_when = ts2hrt(&when->it_value);
cyctime.cyt_interval = ts2hrt(&when->it_interval);
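From userland the change looks like this (a sketch; CLOCK_HIGHRES and the POSIX timer calls are real, the surrounding function is illustrative): an unprivileged timer_create(CLOCK_HIGHRES, ...) no longer fails with EPERM, but sub-200us values are silently clamped.

    #include <signal.h>
    #include <time.h>

    void
    arm_fast_timer(timer_t tid)
    {
            struct itimerspec its = { 0 };

            its.it_value.tv_nsec = 50000;     /* requests 50us ... */
            its.it_interval.tv_nsec = 50000;  /* ... clamped to 200us */
            (void) timer_settime(tid, 0, &its, NULL);
    }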
diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c
index 909a6c2860..1a3502a710 100644
--- a/usr/src/uts/common/os/contract.c
+++ b/usr/src/uts/common/os/contract.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
@@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,
avl_index_t where;
klwp_t *curlwp = ttolwp(curthread);
- ASSERT(author == curproc);
+ /*
+ * It's possible that author is not curproc if the zone is creating
+ * a new process as a child of zsched.
+ */
mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL);
diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c
index d5e272c16a..a147b1cf0f 100644
--- a/usr/src/uts/common/os/core.c
+++ b/usr/src/uts/common/os/core.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2019 Joyent Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type)
/*
* Determine what rootvp to use.
*/
+ mutex_enter(&curproc->p_lock);
if (core_type == CORE_PROC) {
rootvp = (PTOU(curproc)->u_rdir == NULL ?
curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir);
@@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type)
VN_HOLD(startvp);
if (rootvp != rootdir)
VN_HOLD(rootvp);
+ mutex_exit(&curproc->p_lock);
if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp,
startvp, CRED())) != 0) {
pn_free(&pn);
@@ -793,7 +795,7 @@ clock_t core_delay_usec = 10000;
* using core_write() below, and so it has the same failure semantics.
*/
int
-core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size,
+core_seg(proc_t *p, vnode_t *vp, u_offset_t offset, caddr_t addr, size_t size,
rlim64_t rlimit, cred_t *credp)
{
caddr_t eaddr;
@@ -801,6 +803,11 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size,
size_t len;
int err = 0;
+ if (offset > OFF_MAX || offset + size > OFF_MAX ||
+ offset + size < offset) {
+ return (EOVERFLOW);
+ }
+
eaddr = addr + size;
for (base = addr; base < eaddr; base += len) {
len = eaddr - base;
@@ -841,15 +848,20 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size,
* unexpectedly returns zero but no progress has been made, we return ENOSPC.
*/
int
-core_write(vnode_t *vp, enum uio_seg segflg, offset_t offset,
+core_write(vnode_t *vp, enum uio_seg segflg, u_offset_t offset,
const void *buf, size_t len, rlim64_t rlimit, cred_t *credp)
{
ssize_t resid = len;
int error = 0;
+ if (offset > OFF_MAX || offset + len > OFF_MAX ||
+ offset + len < offset) {
+ return (EOVERFLOW);
+ }
+
while (len != 0) {
- error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, offset,
- segflg, 0, rlimit, credp, &resid);
+ error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len,
+ (offset_t)offset, segflg, 0, rlimit, credp, &resid);
if (error != 0)
break;
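The guard used in both functions above, isolated as a sketch: for unsigned arithmetic, the final comparison catches wraparound, and the OFF_MAX checks keep the result representable as a signed offset_t.

    static int
    core_off_ok(u_offset_t off, size_t len)
    {
            if (off > OFF_MAX || off + len > OFF_MAX || off + len < off)
                    return (0);
            return (1);
    }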
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 87c0896814..620f26034f 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -108,7 +109,8 @@ kmutex_t cpu_lock;
cpu_t *cpu_list; /* list of all CPUs */
cpu_t *clock_cpu_list; /* used by clock to walk CPUs */
cpu_t *cpu_active; /* list of active CPUs */
-static cpuset_t cpu_available; /* set of available CPUs */
+cpuset_t cpu_active_set; /* cached set of active CPUs */
+cpuset_t cpu_available; /* set of available CPUs */
cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */
cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */
@@ -386,36 +388,56 @@ force_thread_migrate(kthread_id_t tp)
/*
* Set affinity for a specified CPU.
- * A reference count is incremented and the affinity is held until the
- * reference count is decremented to zero by thread_affinity_clear().
- * This is so regions of code requiring affinity can be nested.
- * Caller needs to ensure that cpu_id remains valid, which can be
- * done by holding cpu_lock across this call, unless the caller
- * specifies CPU_CURRENT in which case the cpu_lock will be acquired
- * by thread_affinity_set and CPU->cpu_id will be the target CPU.
+ *
+ * Specifying a cpu_id of CPU_CURRENT, allowed _only_ when setting affinity for
+ * curthread, will set affinity to the CPU on which the thread is currently
+ * running. For other cpu_id values, the caller must ensure that the
+ * referenced CPU remains valid, which can be done by holding cpu_lock across
+ * this call.
+ *
+ * CPU affinity is guaranteed after return of thread_affinity_set(). If a
+ * caller setting affinity to CPU_CURRENT requires that its thread not migrate
+ * CPUs prior to a successful return, it should take extra precautions (such as
+ * its own call to kpreempt_disable) to ensure that safety.
+ *
+ * CPU_BEST can be used to pick a "best" CPU to migrate to, including
+ * potentially the current CPU.
+ *
+ * A CPU affinity reference count is maintained by thread_affinity_set and
+ * thread_affinity_clear (incrementing and decrementing it, respectively),
+ * maintaining CPU affinity while the count is non-zero, and allowing regions
+ * of code which require affinity to be nested.
*/
void
thread_affinity_set(kthread_id_t t, int cpu_id)
{
- cpu_t *cp;
- int c;
+ cpu_t *cp;
ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL));
- if ((c = cpu_id) == CPU_CURRENT) {
- mutex_enter(&cpu_lock);
- cpu_id = CPU->cpu_id;
+ if (cpu_id == CPU_CURRENT) {
+ VERIFY3P(t, ==, curthread);
+ kpreempt_disable();
+ cp = CPU;
+ } else if (cpu_id == CPU_BEST) {
+ VERIFY3P(t, ==, curthread);
+ kpreempt_disable();
+ cp = disp_choose_best_cpu();
+ } else {
+ /*
+ * We should be asserting that cpu_lock is held here, but
+ * the NCA code doesn't acquire it. The following assert
+ * should be uncommented when the NCA code is fixed.
+ *
+ * ASSERT(MUTEX_HELD(&cpu_lock));
+ */
+ VERIFY((cpu_id >= 0) && (cpu_id < NCPU));
+ cp = cpu[cpu_id];
+
+ /* user must provide a good cpu_id */
+ VERIFY(cp != NULL);
}
- /*
- * We should be asserting that cpu_lock is held here, but
- * the NCA code doesn't acquire it. The following assert
- * should be uncommented when the NCA code is fixed.
- *
- * ASSERT(MUTEX_HELD(&cpu_lock));
- */
- ASSERT((cpu_id >= 0) && (cpu_id < NCPU));
- cp = cpu[cpu_id];
- ASSERT(cp != NULL); /* user must provide a good cpu_id */
+
/*
* If there is already a hard affinity requested, and this affinity
* conflicts with that, panic.
@@ -432,13 +454,14 @@ thread_affinity_set(kthread_id_t t, int cpu_id)
* Make sure we're running on the right CPU.
*/
if (cp != t->t_cpu || t != curthread) {
+ ASSERT(cpu_id != CPU_CURRENT);
force_thread_migrate(t); /* drops thread lock */
} else {
thread_unlock(t);
}
- if (c == CPU_CURRENT)
- mutex_exit(&cpu_lock);
+ if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST)
+ kpreempt_enable();
}
/*
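A sketch of the documented usage pattern (the worker callback is hypothetical): affinity is guaranteed once thread_affinity_set() returns, so the callback may assume it stays on this CPU until the matching clear.

    static void
    do_on_this_cpu(void (*worker)(cpu_t *))
    {
            thread_affinity_set(curthread, CPU_CURRENT);
            worker(CPU);            /* cannot migrate until cleared */
            thread_affinity_clear(curthread);
    }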
@@ -1194,7 +1217,7 @@ cpu_online(cpu_t *cp)
* Handle on-line request.
* This code must put the new CPU on the active list before
* starting it because it will not be paused, and will start
- * using the active list immediately. The real start occurs
+ * using the active list immediately. The real start occurs
* when the CPU_QUIESCED flag is turned off.
*/
@@ -1473,8 +1496,8 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
if (t->t_cpu == cp && t->t_bound_cpu != cp)
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
@@ -1516,10 +1539,9 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
- if (t->t_cpu == cp && t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
- }
+ if (t->t_cpu == cp && t->t_bound_cpu != cp)
+ t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri);
+
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
t = t->t_next;
@@ -1724,6 +1746,7 @@ cpu_list_init(cpu_t *cp)
cp->cpu_part = &cp_default;
CPUSET_ADD(cpu_available, cp->cpu_id);
+ CPUSET_ADD(cpu_active_set, cp->cpu_id);
}
/*
@@ -1895,6 +1918,7 @@ cpu_add_active_internal(cpu_t *cp)
cp->cpu_prev_onln = cpu_active->cpu_prev_onln;
cpu_active->cpu_prev_onln->cpu_next_onln = cp;
cpu_active->cpu_prev_onln = cp;
+ CPUSET_ADD(cpu_active_set, cp->cpu_id);
if (pp->cp_cpulist) {
cp->cpu_next_part = pp->cp_cpulist;
@@ -1965,6 +1989,7 @@ cpu_remove_active(cpu_t *cp)
}
cp->cpu_next_onln = cp;
cp->cpu_prev_onln = cp;
+ CPUSET_DEL(cpu_active_set, cp->cpu_id);
cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
@@ -2704,13 +2729,18 @@ cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind,
return (0);
}
-#if CPUSET_WORDS > 1
-/*
- * Functions for implementing cpuset operations when a cpuset is more
- * than one word. On platforms where a cpuset is a single word these
- * are implemented as macros in cpuvar.h.
- */
+cpuset_t *
+cpuset_alloc(int kmflags)
+{
+ return (kmem_alloc(sizeof (cpuset_t), kmflags));
+}
+
+void
+cpuset_free(cpuset_t *s)
+{
+ kmem_free(s, sizeof (cpuset_t));
+}
void
cpuset_all(cpuset_t *s)
@@ -2722,43 +2752,66 @@ cpuset_all(cpuset_t *s)
}
void
-cpuset_all_but(cpuset_t *s, uint_t cpu)
+cpuset_all_but(cpuset_t *s, const uint_t cpu)
{
cpuset_all(s);
CPUSET_DEL(*s, cpu);
}
void
-cpuset_only(cpuset_t *s, uint_t cpu)
+cpuset_only(cpuset_t *s, const uint_t cpu)
{
CPUSET_ZERO(*s);
CPUSET_ADD(*s, cpu);
}
+long
+cpu_in_set(const cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ return (BT_TEST(s->cpub, cpu));
+}
+
+void
+cpuset_add(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_SET(s->cpub, cpu);
+}
+
+void
+cpuset_del(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_CLEAR(s->cpub, cpu);
+}
+
int
-cpuset_isnull(cpuset_t *s)
+cpuset_isnull(const cpuset_t *s)
{
int i;
- for (i = 0; i < CPUSET_WORDS; i++)
+ for (i = 0; i < CPUSET_WORDS; i++) {
if (s->cpub[i] != 0)
return (0);
+ }
return (1);
}
int
-cpuset_cmp(cpuset_t *s1, cpuset_t *s2)
+cpuset_isequal(const cpuset_t *s1, const cpuset_t *s2)
{
int i;
- for (i = 0; i < CPUSET_WORDS; i++)
+ for (i = 0; i < CPUSET_WORDS; i++) {
if (s1->cpub[i] != s2->cpub[i])
return (0);
+ }
return (1);
}
uint_t
-cpuset_find(cpuset_t *s)
+cpuset_find(const cpuset_t *s)
{
uint_t i;
@@ -2778,7 +2831,7 @@ cpuset_find(cpuset_t *s)
}
void
-cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid)
+cpuset_bounds(const cpuset_t *s, uint_t *smallestid, uint_t *largestid)
{
int i, j;
uint_t bit;
@@ -2822,7 +2875,72 @@ cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid)
*smallestid = *largestid = CPUSET_NOTINSET;
}
-#endif /* CPUSET_WORDS */
+void
+cpuset_atomic_del(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_CLEAR(s->cpub, (cpu))
+}
+
+void
+cpuset_atomic_add(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_SET(s->cpub, (cpu))
+}
+
+long
+cpuset_atomic_xadd(cpuset_t *s, const uint_t cpu)
+{
+ long res;
+
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_SET_EXCL(s->cpub, cpu, res);
+ return (res);
+}
+
+long
+cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu)
+{
+ long res;
+
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res);
+ return (res);
+}
+
+void
+cpuset_or(cpuset_t *dst, cpuset_t *src)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] |= src->cpub[i];
+ }
+}
+
+void
+cpuset_xor(cpuset_t *dst, cpuset_t *src)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] ^= src->cpub[i];
+ }
+}
+
+void
+cpuset_and(cpuset_t *dst, cpuset_t *src)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] &= src->cpub[i];
+ }
+}
+
+void
+cpuset_zero(cpuset_t *dst)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] = 0;
+ }
+}
+
/*
* Unbind threads bound to specified CPU.
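A sketch exercising the now-unconditional cpuset interface (error handling elided; every call is one of the functions defined above):

    static void
    cpuset_example(void)
    {
            cpuset_t *a = cpuset_alloc(KM_SLEEP);
            cpuset_t *b = cpuset_alloc(KM_SLEEP);

            cpuset_only(a, 0);              /* a = { CPU 0 } */
            cpuset_all_but(b, 1);           /* b = all but CPU 1 */
            cpuset_and(a, b);               /* a &= b */
            if (!cpuset_isnull(a) && cpu_in_set(a, 0))
                    cpuset_atomic_del(a, 0);
            cpuset_free(a);
            cpuset_free(b);
    }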
@@ -3112,9 +3230,9 @@ cpu_get_state_str(cpu_t *cpu)
static void
cpu_stats_kstat_create(cpu_t *cp)
{
- int instance = cp->cpu_id;
- char *module = "cpu";
- char *class = "misc";
+ int instance = cp->cpu_id;
+ char *module = "cpu";
+ char *class = "misc";
kstat_t *ksp;
zoneid_t zoneid;
@@ -3350,18 +3468,18 @@ cpu_stat_ks_update(kstat_t *ksp, int rw)
cso->cpu_sysinfo.cpu[CPU_USER] = msnsecs[CMS_USER];
if (cso->cpu_sysinfo.cpu[CPU_KERNEL] < msnsecs[CMS_SYSTEM])
cso->cpu_sysinfo.cpu[CPU_KERNEL] = msnsecs[CMS_SYSTEM];
- cso->cpu_sysinfo.cpu[CPU_WAIT] = 0;
- cso->cpu_sysinfo.wait[W_IO] = 0;
+ cso->cpu_sysinfo.cpu[CPU_WAIT] = 0;
+ cso->cpu_sysinfo.wait[W_IO] = 0;
cso->cpu_sysinfo.wait[W_SWAP] = 0;
cso->cpu_sysinfo.wait[W_PIO] = 0;
- cso->cpu_sysinfo.bread = CPU_STATS(cp, sys.bread);
- cso->cpu_sysinfo.bwrite = CPU_STATS(cp, sys.bwrite);
- cso->cpu_sysinfo.lread = CPU_STATS(cp, sys.lread);
- cso->cpu_sysinfo.lwrite = CPU_STATS(cp, sys.lwrite);
- cso->cpu_sysinfo.phread = CPU_STATS(cp, sys.phread);
- cso->cpu_sysinfo.phwrite = CPU_STATS(cp, sys.phwrite);
- cso->cpu_sysinfo.pswitch = CPU_STATS(cp, sys.pswitch);
- cso->cpu_sysinfo.trap = CPU_STATS(cp, sys.trap);
+ cso->cpu_sysinfo.bread = CPU_STATS(cp, sys.bread);
+ cso->cpu_sysinfo.bwrite = CPU_STATS(cp, sys.bwrite);
+ cso->cpu_sysinfo.lread = CPU_STATS(cp, sys.lread);
+ cso->cpu_sysinfo.lwrite = CPU_STATS(cp, sys.lwrite);
+ cso->cpu_sysinfo.phread = CPU_STATS(cp, sys.phread);
+ cso->cpu_sysinfo.phwrite = CPU_STATS(cp, sys.phwrite);
+ cso->cpu_sysinfo.pswitch = CPU_STATS(cp, sys.pswitch);
+ cso->cpu_sysinfo.trap = CPU_STATS(cp, sys.trap);
cso->cpu_sysinfo.intr = 0;
for (i = 0; i < PIL_MAX; i++)
cso->cpu_sysinfo.intr += CPU_STATS(cp, sys.intr[i]);
diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c
index 25727d54c5..0bd6cfd44f 100644
--- a/usr/src/uts/common/os/cred.c
+++ b/usr/src/uts/common/os/cred.c
@@ -729,6 +729,14 @@ crgetzoneid(const cred_t *cr)
cr->cr_zone->zone_id);
}
+zoneid_t
+crgetzonedid(const cred_t *cr)
+{
+ return (cr->cr_zone == NULL ?
+ (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) :
+ cr->cr_zone->zone_did);
+}
+
projid_t
crgetprojid(const cred_t *cr)
{
diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c
index 0aa54eeaee..316dffc326 100644
--- a/usr/src/uts/common/os/cyclic.c
+++ b/usr/src/uts/common/os/cyclic.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, Joyent Inc. All rights reserved.
+ * Copyright 2018 Joyent Inc.
*/
/*
@@ -112,6 +112,7 @@
* cyclic_remove() <-- Removes a cyclic
* cyclic_bind() <-- Change a cyclic's CPU or partition binding
* cyclic_reprogram() <-- Reprogram a cyclic's expiration
+ * cyclic_move_here() <-- Shuffle cyclic to current CPU
*
* Inter-subsystem Interfaces
*
@@ -3111,6 +3112,61 @@ cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)
return (1);
}
+/*
+ * void cyclic_move_here(cyclic_id_t)
+ *
+ * Overview
+ *
+ * cyclic_move_here() attempts to shuffle a cyclic onto the current CPU.
+ *
+ * Arguments and notes
+ *
+ * The first argument is a cyclic_id returned from cyclic_add().
+ * cyclic_move_here() may _not_ be called on a cyclic_id returned from
+ * cyclic_add_omni() or one bound to a CPU or partition via cyclic_bind().
+ *
+ * This cyclic shuffling is performed on a best-effort basis. If for some
+ * reason the current CPU is unsuitable or the thread migrates between CPUs
+ * during the call, the function may return with the cyclic residing on some
+ * other CPU.
+ *
+ * Return value
+ *
+ * None; cyclic_move_here() always reports success.
+ *
+ * Caller's context
+ *
+ * cpu_lock must be held by the caller, and the caller must not be in
+ * interrupt context. The caller may not hold any locks which are also
+ * grabbed by any cyclic handler.
+ */
+void
+cyclic_move_here(cyclic_id_t id)
+{
+ cyc_id_t *idp = (cyc_id_t *)id;
+ cyc_cpu_t *cc = idp->cyi_cpu;
+ cpu_t *dest = CPU;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ CYC_PTRACE("move_here", idp, dest);
+ VERIFY3P(cc, !=, NULL);
+ VERIFY3U(cc->cyp_cyclics[idp->cyi_ndx].cy_flags &
+ (CYF_CPU_BOUND|CYF_PART_BOUND), ==, 0);
+
+ if (cc->cyp_cpu == dest) {
+ return;
+ }
+
+ /* Is the destination CPU suitable for a migration target? */
+ if (dest->cpu_cyclic == NULL ||
+ dest->cpu_cyclic->cyp_state == CYS_OFFLINE ||
+ (dest->cpu_flags & CPU_ENABLE) == 0) {
+ return;
+ }
+
+ cyclic_juggle_one_to(idp, dest->cpu_cyclic);
+}
+
hrtime_t
cyclic_getres()
{
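A sketch of the documented calling convention; 'cid' is assumed to come from a prior cyclic_add() with no CPU or partition binding:

    static void
    pull_cyclic_here(cyclic_id_t cid)
    {
            mutex_enter(&cpu_lock);
            cyclic_move_here(cid);  /* best effort; may stay put */
            mutex_exit(&cpu_lock);
    }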
diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c
index c3c0481e7f..a4b35dcb5b 100644
--- a/usr/src/uts/common/os/ddi_intr_irm.c
+++ b/usr/src/uts/common/os/ddi_intr_irm.c
@@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)
/* Log callback errors */
if (ret != DDI_SUCCESS) {
- cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n",
+ cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n",
ddi_driver_name(req_p->ireq_dip),
ddi_get_instance(req_p->ireq_dip), (int)action, ret);
}
diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c
index f51e2c5ca1..24b6f0e2eb 100644
--- a/usr/src/uts/common/os/exec.c
+++ b/usr/src/uts/common/os/exec.c
@@ -26,7 +26,7 @@
/* Copyright (c) 1988 AT&T */
/* All Rights Reserved */
/*
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -99,6 +99,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */
#endif
#define PSUIDFLAGS (SNOCD|SUGID)
+#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */
/*
* These are consumed within the specific exec modules, but are defined here
@@ -143,7 +144,7 @@ exec_common(const char *fname, const char **argp, const char **envp,
proc_t *p = ttoproc(curthread);
klwp_t *lwp = ttolwp(curthread);
struct user *up = PTOU(p);
- long execsz; /* temporary count of exec size */
+ size_t execsz; /* temporary count of exec size */
int i;
int error;
char exec_file[MAXCOMLEN+1];
@@ -265,8 +266,10 @@ exec_common(const char *fname, const char **argp, const char **envp,
* only if the pathname does not contain a "/" the resolved path
* points to a file in the current working (attribute) directory.
*/
- if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
+ mutex_enter(&p->p_lock);
+ if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 &&
strchr(resolvepn.pn_path, '/') == NULL) {
+ mutex_exit(&p->p_lock);
if (dir != NULL)
VN_RELE(dir);
error = EACCES;
@@ -275,6 +278,7 @@ exec_common(const char *fname, const char **argp, const char **envp,
VN_RELE(vp);
goto out;
}
+ mutex_exit(&p->p_lock);
bzero(exec_file, MAXCOMLEN+1);
(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
@@ -322,14 +326,43 @@ exec_common(const char *fname, const char **argp, const char **envp,
ua.argp = argp;
ua.envp = envp;
- /* If necessary, brand this process before we start the exec. */
- if (brandme)
- brand_setbrand(p);
+ /* If necessary, brand this process/lwp before we start the exec. */
+ if (brandme) {
+ void *brand_data = NULL;
+
+ /*
+ * Process branding may fail if multiple LWPs are present and
+ * holdlwps() cannot complete successfully.
+ */
+ error = brand_setbrand(p, B_TRUE);
+
+ if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) {
+ brand_data = BROP(p)->b_lwpdata_alloc(p);
+ if (brand_data == NULL) {
+ error = 1;
+ }
+ }
+
+ if (error == 0) {
+ mutex_enter(&p->p_lock);
+ BROP(p)->b_initlwp(lwp, brand_data);
+ mutex_exit(&p->p_lock);
+ } else {
+ VN_RELE(vp);
+ if (dir != NULL) {
+ VN_RELE(dir);
+ }
+ pn_free(&resolvepn);
+ goto fail;
+ }
+ }
if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
- exec_file, p->p_cred, brand_action)) != 0) {
- if (brandme)
- brand_clearbrand(p, B_FALSE);
+ exec_file, p->p_cred, &brand_action)) != 0) {
+ if (brandme) {
+ BROP(p)->b_freelwp(lwp);
+ brand_clearbrand(p, B_TRUE);
+ }
VN_RELE(vp);
if (dir != NULL)
VN_RELE(dir);
@@ -361,7 +394,7 @@ exec_common(const char *fname, const char **argp, const char **envp,
/*
* Clear contract template state
*/
- lwp_ctmpl_clear(lwp);
+ lwp_ctmpl_clear(lwp, B_TRUE);
/*
* Save the directory in which we found the executable for expanding
@@ -385,6 +418,8 @@ exec_common(const char *fname, const char **argp, const char **envp,
* pending held signals remain held, so don't clear t_hold.
*/
mutex_enter(&p->p_lock);
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0);
lwp->lwp_oldcontext = 0;
lwp->lwp_ustack = 0;
lwp->lwp_old_stk_ctl = 0;
@@ -444,8 +479,10 @@ exec_common(const char *fname, const char **argp, const char **envp,
TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
/* Unbrand ourself if necessary. */
- if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
+ if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) {
+ BROP(p)->b_freelwp(lwp);
brand_clearbrand(p, B_FALSE);
+ }
setregs(&args);
@@ -566,10 +603,10 @@ gexec(
struct uarg *args,
struct intpdata *idatap,
int level,
- long *execsz,
+ size_t *execsz,
caddr_t exec_file,
struct cred *cred,
- int brand_action)
+ int *brand_action)
{
struct vnode *vp, *execvp = NULL;
proc_t *pp = ttoproc(curthread);
@@ -890,8 +927,14 @@ gexec(
if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
args->traceinval = 1;
}
- if (pp->p_proc_flag & P_PR_PTRACE)
+
+ /*
+ * If legacy ptrace is enabled, generate the SIGTRAP.
+ */
+ if (pp->p_proc_flag & P_PR_PTRACE) {
psignal(pp, SIGTRAP);
+ }
+
if (args->traceinval)
prinvalidate(&pp->p_user);
}
@@ -1448,7 +1491,7 @@ noexec(
struct uarg *args,
struct intpdata *idatap,
int level,
- long *execsz,
+ size_t *execsz,
int setid,
caddr_t exec_file,
struct cred *cred)
@@ -1555,6 +1598,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
return (0);
}
+/*
+ * Add a fixed size byte array to the stack (only from kernel space).
+ */
+static int
+stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len)
+{
+ int error;
+
+ if (STK_AVAIL(args) < sizeof (int))
+ return (E2BIG);
+ *--args->stk_offp = args->stk_strp - args->stk_base;
+
+ if (len > STK_AVAIL(args))
+ return (E2BIG);
+ bcopy(sp, args->stk_strp, len);
+
+ args->stk_strp += len;
+
+ return (0);
+}
+
static int
stk_getptr(uarg_t *args, char *src, char **dst)
{
@@ -1591,6 +1655,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
size_t size, pad;
char *argv = (char *)uap->argp;
char *envp = (char *)uap->envp;
+ uint8_t rdata[RANDOM_LEN];
/*
* Copy interpreter's name and argument to argv[0] and argv[1].
@@ -1673,8 +1738,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
args->ne = args->na - argc;
/*
- * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
- * AT_SUN_EMULATOR strings to the stack.
+ * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME,
+ * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM
+ * array, to the stack.
*/
if (auxvpp != NULL && *auxvpp != NULL) {
if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
@@ -1687,6 +1753,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
if (args->emulator != NULL &&
(error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
return (error);
+
+ /*
+ * For the AT_RANDOM aux vector we provide 16 bytes of random
+ * data.
+ */
+ (void) random_get_pseudo_bytes(rdata, sizeof (rdata));
+
+ if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0)
+ return (error);
+
+ if (args->brand_nroot != NULL &&
+ (error = stk_add(args, args->brand_nroot,
+ UIO_SYSSPACE)) != 0)
+ return (error);
}
/*
@@ -1793,7 +1873,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
/*
* Fill in the aux vector now that we know the user stack addresses
* for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
- * AT_SUN_EMULATOR strings.
+ * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array.
*/
if (auxvpp != NULL && *auxvpp != NULL) {
if (args->to_model == DATAMODEL_NATIVE) {
@@ -1806,6 +1886,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
if (args->emulator != NULL)
ADDAUX(*a,
AT_SUN_EMULATOR, (long)&ustrp[*--offp])
+ ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp])
+ if (args->brand_nroot != NULL) {
+ ADDAUX(*a,
+ AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp])
+ }
} else {
auxv32_t **a = (auxv32_t **)auxvpp;
ADDAUX(*a,
@@ -1818,6 +1903,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
if (args->emulator != NULL)
ADDAUX(*a, AT_SUN_EMULATOR,
(int)(uintptr_t)&ustrp[*--offp])
+ ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp])
+ if (args->brand_nroot != NULL) {
+ ADDAUX(*a, AT_SUN_BRAND_NROOT,
+ (int)(uintptr_t)&ustrp[*--offp])
+ }
}
}
@@ -1961,6 +2051,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
usrstack = (char *)USRSTACK32;
}
+ if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack)
+ usrstack = (char *)args->maxstack;
+
ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
#if defined(__sparc)
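A userland sketch of consuming the AT_RANDOM entry added above; illumos lacks Linux's getauxval(), so walking the aux vector directly is one assumed approach:

    #include <sys/auxv.h>

    static const uint8_t *
    find_at_random(auxv_t *auxv)
    {
            for (; auxv->a_type != AT_NULL; auxv++) {
                    if (auxv->a_type == AT_RANDOM)
                            return ((const uint8_t *)auxv->a_un.a_ptr);
            }
            return (NULL);
    }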
diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c
index 1b9359da47..06e0117cd6 100644
--- a/usr/src/uts/common/os/exit.c
+++ b/usr/src/uts/common/os/exit.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -138,6 +138,27 @@ rexit(int rval)
}
/*
+ * Bump the init_restarts kstat and let interested parties know about the
+ * restart.
+ */
+static void
+restart_init_notify(zone_t *zone)
+{
+ nvlist_t *nvl = NULL;
+
+ zone->zone_proc_init_restarts++;
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 &&
+ nvlist_add_uint32(nvl, ZONE_CB_RESTARTS,
+ zone->zone_proc_init_restarts) == 0) {
+ zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS,
+ ZONE_EVENT_INIT_RESTART_SC, nvl);
+ }
+
+ nvlist_free(nvl);
+}
+
+/*
* Called by proc_exit() when a zone's init exits, presumably because
* it failed. As long as the given zone is still in the "running"
* state, we will re-exec() init, but first we need to reset things
@@ -230,7 +251,7 @@ restart_init(int what, int why)
siginfofree(lwp->lwp_curinfo);
lwp->lwp_curinfo = NULL;
}
- lwp_ctmpl_clear(lwp);
+ lwp_ctmpl_clear(lwp, B_FALSE);
/*
* Reset both the process root directory and the current working
@@ -260,6 +281,8 @@ restart_init(int what, int why)
ASSERT(p == curproc);
(void) freectty(B_TRUE);
+ restart_init_notify(p->p_zone);
+
/*
* Now exec() the new init(1M) on top of the current process. If we
* succeed, the caller will treat this like a successful system call.
@@ -320,6 +343,119 @@ proc_is_exiting(proc_t *p)
}
/*
+ * Return B_TRUE if the zone's init was restarted, B_FALSE if exit processing
+ * should proceed.
+ */
+static boolean_t
+zone_init_exit(zone_t *z, int why, int what)
+{
+ /*
+ * Typically we don't let the zone's init exit unless zone_start_init()
+ * failed its exec, or we are shutting down the zone or the machine;
+ * the various flags handled within this function refine that behavior.
+ *
+ * Since we are single threaded, we don't need to lock the following
+ * accesses to zone_proc_initpid.
+ */
+ if (z->zone_boot_err != 0 ||
+ zone_status_get(z) >= ZONE_IS_SHUTTING_DOWN ||
+ zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) {
+ /*
+ * Clear the zone's init pid and proceed with exit processing.
+ */
+ z->zone_proc_initpid = -1;
+ return (B_FALSE);
+ }
+
+ /*
+ * There are a variety of configuration flags on the zone to control
+ * init exit behavior.
+ *
+ * If the init process should be restarted, the "zone_restart_init"
+ * member will be set.
+ */
+ if (!z->zone_restart_init) {
+ /*
+ * The zone has been setup to halt when init exits.
+ */
+ z->zone_init_status = wstat(why, what);
+ (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred());
+ z->zone_proc_initpid = -1;
+ return (B_FALSE);
+ }
+
+ /*
+ * At this point we know we're configured to restart init, but there
+ * are various modifiers to that behavior.
+ */
+
+ if (z->zone_reboot_on_init_exit) {
+ /*
+ * Some init programs in branded zones do not tolerate a
+ * restart in the traditional manner; setting
+ * "zone_reboot_on_init_exit" will cause the entire zone to be
+ * rebooted instead.
+ */
+
+ if (z->zone_restart_init_0) {
+ /*
+ * Some init programs in branded zones only want to
+ * restart if they exit 0, otherwise the zone should
+ * shutdown. Setting the "zone_restart_init_0" member
+ * controls this behavior.
+ */
+ if (why == CLD_EXITED && what == 0) {
+ /* Trigger a zone reboot */
+ (void) zone_kadmin(A_REBOOT, 0, NULL,
+ zone_kcred());
+ } else {
+ /* Shutdown instead of reboot */
+ (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
+ zone_kcred());
+ }
+ } else {
+ /* Trigger a zone reboot */
+ (void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred());
+ }
+
+ z->zone_init_status = wstat(why, what);
+ z->zone_proc_initpid = -1;
+ return (B_FALSE);
+ }
+
+ if (z->zone_restart_init_0) {
+ /*
+ * Some init programs in branded zones only want to restart if
+ * they exit 0, otherwise the zone should shutdown. Setting the
+ * "zone_restart_init_0" member controls this behavior.
+ *
+ * In this case we only restart init if it exited successfully.
+ */
+ if (why == CLD_EXITED && what == 0 &&
+ restart_init(what, why) == 0) {
+ return (B_TRUE);
+ }
+ } else {
+ /*
+ * No restart modifiers on the zone, attempt to restart init.
+ */
+ if (restart_init(what, why) == 0) {
+ return (B_TRUE);
+ }
+ }
+
+ /*
+ * The restart failed, the zone will shut down.
+ */
+ z->zone_init_status = wstat(why, what);
+ (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred());
+ z->zone_proc_initpid = -1;
+ return (B_FALSE);
+}
+
+/*
* Return value:
* 1 - exitlwps() failed, call (or continue) lwp_exit()
* 0 - restarting init. Return through system call path
@@ -366,45 +502,36 @@ proc_exit(int why, int what)
}
mutex_exit(&p->p_lock);
- DTRACE_PROC(lwp__exit);
- DTRACE_PROC1(exit, int, why);
+ if (p->p_pid == z->zone_proc_initpid) {
+ /* If zone's init restarts, we're done here. */
+ if (zone_init_exit(z, why, what))
+ return (0);
+ }
/*
- * Will perform any brand specific proc exit processing, since this
- * is always the last lwp, will also perform lwp_exit and free brand
- * data
+ * Delay firing probes (and performing brand cleanup) until after the
+ * zone_proc_initpid check. Cases which result in zone shutdown or
+ * restart via zone_kadmin eventually result in a call back to
+ * proc_exit.
*/
- if (PROC_IS_BRANDED(p)) {
- lwp_detach_brand_hdlrs(lwp);
- brand_clearbrand(p, B_FALSE);
- }
+ DTRACE_PROC(lwp__exit);
+ DTRACE_PROC1(exit, int, why);
/*
- * Don't let init exit unless zone_start_init() failed its exec, or
- * we are shutting down the zone or the machine.
- *
- * Since we are single threaded, we don't need to lock the
- * following accesses to zone_proc_initpid.
+ * Will perform any brand specific proc exit processing. Since this
+ * is always the last lwp, will also perform lwp exit/free and proc
+ * exit. Brand data will be freed when the process is reaped.
*/
- if (p->p_pid == z->zone_proc_initpid) {
- if (z->zone_boot_err == 0 &&
- zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
- zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) {
- if (z->zone_restart_init == B_TRUE) {
- if (restart_init(what, why) == 0)
- return (0);
- } else {
- (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
- CRED());
- }
- }
-
+ if (PROC_IS_BRANDED(p)) {
+ BROP(p)->b_lwpexit(lwp);
+ BROP(p)->b_proc_exit(p);
/*
- * Since we didn't or couldn't restart init, we clear
- * the zone's init state and proceed with exit
- * processing.
+ * To ensure that b_proc_exit has access to brand-specific data
+ * contained by the one remaining lwp, call the freelwp hook as
+ * the last part of this clean-up process.
*/
- z->zone_proc_initpid = -1;
+ BROP(p)->b_freelwp(lwp);
+ lwp_detach_brand_hdlrs(lwp);
}
lwp_pcb_exit();
@@ -565,7 +692,7 @@ proc_exit(int why, int what)
semexit(p);
rv = wstat(why, what);
- acct(rv & 0xff);
+ acct(rv);
exacct_commit_proc(p, rv);
/*
@@ -658,10 +785,22 @@ proc_exit(int why, int what)
if ((q = p->p_child) != NULL && p != proc_init) {
struct proc *np;
struct proc *initp = proc_init;
+ pid_t zone_initpid = 1;
+ struct proc *zoneinitp = NULL;
boolean_t setzonetop = B_FALSE;
- if (!INGLOBALZONE(curproc))
- setzonetop = B_TRUE;
+ if (!INGLOBALZONE(curproc)) {
+ zone_initpid = curproc->p_zone->zone_proc_initpid;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+ zoneinitp = prfind(zone_initpid);
+ if (zoneinitp != NULL) {
+ initp = zoneinitp;
+ } else {
+ zone_initpid = 1;
+ setzonetop = B_TRUE;
+ }
+ }
pgdetach(p);
@@ -673,7 +812,8 @@ proc_exit(int why, int what)
*/
delete_ns(q->p_parent, q);
- q->p_ppid = 1;
+ q->p_ppid = zone_initpid;
+
q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
if (setzonetop) {
mutex_enter(&q->p_lock);
@@ -847,8 +987,50 @@ proc_exit(int why, int what)
mutex_exit(&p->p_lock);
if (!evaporate) {
- p->p_pidflag &= ~CLDPEND;
- sigcld(p, sqp);
+ /*
+ * The brand specific code only happens when the brand has a
+ * function to call in place of sigcld and the parent of the
+ * exiting process is not the global zone init. If the parent
+ * is the global zone init, then the process was reparented,
+ * and we don't want brand code delivering possibly strange
+ * signals to init. Also, init is not branded, so any brand
+ * specific exit data will not be picked up by init anyway.
+ */
+ if (PROC_IS_BRANDED(p) &&
+ BROP(p)->b_exit_with_sig != NULL &&
+ p->p_ppid != 1) {
+ /*
+ * The code for _fini that could unload the brand_t
+ * blocks until the count of zones using the module
+ * reaches zero. Zones decrement the refcount on their
+ * brands only after all user tasks in that zone have
+ * exited and been waited on. The decrement on the
+ * brand's refcount happens in zone_destroy(). That
+ * depends on zone_shutdown() having been completed.
+ * zone_shutdown() includes a call to zone_empty(),
+ * where the zone waits for itself to reach the state
+ * ZONE_IS_EMPTY. This state is only set in either
+ * zone_shutdown(), when there are no user processes as
+ * the zone enters this function, or in
+ * zone_task_rele(). zone_task_rele() is called from
+ * code triggered by waiting on processes, not by the
+ * processes exiting through proc_exit(). This means
+ * all the branded processes that could exist for a
+ * specific brand_t must exit and get reaped before the
+ * refcount on the brand_t can reach 0. _fini will
+ * never unload the corresponding brand module before
+ * proc_exit finishes execution for all processes
+ * branded with a particular brand_t, which makes the
+ * operation below safe to do. Brands that wish to use
+ * this mechanism must wait in _fini as described
+ * above.
+ */
+ BROP(p)->b_exit_with_sig(p, sqp);
+ } else {
+ p->p_pidflag &= ~CLDPEND;
+ sigcld(p, sqp);
+ }
+
} else {
/*
* Do what sigcld() would do if the disposition
@@ -927,10 +1109,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
int
waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
{
- int found;
proc_t *cp, *pp;
- int proc_gone;
int waitflag = !(options & WNOWAIT);
+ boolean_t have_brand_helper = B_FALSE;
/*
* Obsolete flag, defined here only for binary compatibility
@@ -958,7 +1139,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
pp = ttoproc(curthread);
/*
- * lock parent mutex so that sibling chain can be searched.
+ * Any time you are looking for a process, you take pidlock to prevent
+ * the process tree from changing as you look.
*/
mutex_enter(&pidlock);
@@ -978,10 +1160,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
return (ECHILD);
}
- while (pp->p_child != NULL) {
+ if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) {
+ have_brand_helper = B_TRUE;
+ }
+
+ while (pp->p_child != NULL || have_brand_helper) {
+ boolean_t brand_wants_wait = B_FALSE;
+ int proc_gone = 0;
+ int found = 0;
+
+ /*
+ * Give the brand a chance to return synthetic results from
+ * this waitid() call before we do the real thing.
+ */
+ if (have_brand_helper) {
+ int ret;
+
+ if (BROP(pp)->b_waitid_helper(idtype, id, ip, options,
+ &brand_wants_wait, &ret) == 0) {
+ mutex_exit(&pidlock);
+ return (ret);
+ }
- proc_gone = 0;
+ if (pp->p_child == NULL) {
+ goto no_real_children;
+ }
+ }
+ /*
+ * Look for interesting children in the newstate list.
+ */
+ VERIFY(pp->p_child != NULL);
for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
continue;
@@ -989,6 +1198,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
continue;
if (idtype == P_PGID && id != cp->p_pgrp)
continue;
+ if (PROC_IS_BRANDED(pp)) {
+ if (BROP(pp)->b_wait_filter != NULL &&
+ BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
+ continue;
+ }
switch (cp->p_wcode) {
@@ -1033,12 +1247,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
* Wow! None of the threads on the p_sibling_ns list were
* interesting threads. Check all the kids!
*/
- found = 0;
for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
if (idtype == P_PID && id != cp->p_pid)
continue;
if (idtype == P_PGID && id != cp->p_pgrp)
continue;
+ if (PROC_IS_BRANDED(pp)) {
+ if (BROP(pp)->b_wait_filter != NULL &&
+ BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
+ continue;
+ }
switch (cp->p_wcode) {
case CLD_TRAPPED:
@@ -1107,11 +1325,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
break;
}
+no_real_children:
/*
* If we found no interesting processes at all,
* break out and return ECHILD.
*/
- if (found + proc_gone == 0)
+ if (!brand_wants_wait && (found + proc_gone == 0))
break;
if (options & WNOHANG) {
@@ -1130,7 +1349,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
* change state while we wait, we don't wait at all.
* Get out with ECHILD according to SVID.
*/
- if (found == proc_gone)
+ if (!brand_wants_wait && (found == proc_gone))
break;
if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
@@ -1226,6 +1445,12 @@ freeproc(proc_t *p)
p->p_killsqp = NULL;
}
+ /* Clear any remaining brand data */
+ if (PROC_IS_BRANDED(p)) {
+ brand_clearbrand(p, B_FALSE);
+ }
+
prfree(p); /* inform /proc */
/*
diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c
index 76eddd4e50..41e7e63d2b 100644
--- a/usr/src/uts/common/os/fio.c
+++ b/usr/src/uts/common/os/fio.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent Inc.
+ * Copyright 2017, Joyent Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -386,6 +386,7 @@ flist_grow(int maxfd)
dst->uf_flag = src->uf_flag;
dst->uf_busy = src->uf_busy;
dst->uf_portfd = src->uf_portfd;
+ dst->uf_gen = src->uf_gen;
}
/*
@@ -487,7 +488,7 @@ free_afd(afd_t *afd) /* called below and from thread_free() */
afd->a_fd[i] = -1;
}
-static void
+void
set_active_fd(int fd)
{
afd_t *afd = &curthread->t_activefd;
@@ -575,13 +576,12 @@ is_active_fd(kthread_t *t, int fd)
}
/*
- * Convert a user supplied file descriptor into a pointer to a file
- * structure. Only task is to check range of the descriptor (soft
- * resource limit was enforced at open time and shouldn't be checked
- * here).
+ * Convert a user supplied file descriptor into a pointer to a file structure.
+ * Only task is to check range of the descriptor (soft resource limit was
+ * enforced at open time and shouldn't be checked here).
*/
file_t *
-getf(int fd)
+getf_gen(int fd, uf_entry_gen_t *genp)
{
uf_info_t *fip = P_FINFO(curproc);
uf_entry_t *ufp;
@@ -607,6 +607,9 @@ getf(int fd)
return (NULL);
}
ufp->uf_refcnt++;
+ if (genp != NULL) {
+ *genp = ufp->uf_gen;
+ }
set_active_fd(fd); /* record the active file descriptor */
@@ -615,6 +618,12 @@ getf(int fd)
return (fp);
}
+file_t *
+getf(int fd)
+{
+ return (getf_gen(fd, NULL));
+}
+
/*
* Close whatever file currently occupies the file descriptor slot
* and install the new file, usually NULL, in the file descriptor slot.
@@ -667,6 +676,7 @@ closeandsetf(int fd, file_t *newfp)
ASSERT(ufp->uf_flag == 0);
fd_reserve(fip, fd, 1);
ufp->uf_file = newfp;
+ ufp->uf_gen++;
UF_EXIT(ufp);
mutex_exit(&fip->fi_lock);
return (0);
@@ -852,7 +862,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)
*/
cfip->fi_nfiles = nfiles = flist_minsize(pfip);
- cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);
+ cfip->fi_list = nfiles == 0 ? NULL :
+ kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);
for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;
fd++, pufp++, cufp++) {
@@ -860,6 +871,7 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)
cufp->uf_alloc = pufp->uf_alloc;
cufp->uf_flag = pufp->uf_flag;
cufp->uf_busy = pufp->uf_busy;
+ cufp->uf_gen = pufp->uf_gen;
if (pufp->uf_file == NULL) {
ASSERT(pufp->uf_flag == 0);
if (pufp->uf_busy) {
@@ -1028,6 +1040,9 @@ ufalloc_file(int start, file_t *fp)
fd_reserve(fip, fd, 1);
ASSERT(ufp->uf_file == NULL);
ufp->uf_file = fp;
+ if (fp != NULL) {
+ ufp->uf_gen++;
+ }
UF_EXIT(ufp);
mutex_exit(&fip->fi_lock);
return (fd);
@@ -1183,6 +1198,7 @@ setf(int fd, file_t *fp)
} else {
UF_ENTER(ufp, fip, fd);
ASSERT(ufp->uf_busy);
+ ufp->uf_gen++;
}
ASSERT(ufp->uf_fpollinfo == NULL);
ASSERT(ufp->uf_flag == 0);
@@ -1212,8 +1228,7 @@ f_getfl(int fd, int *flagp)
error = EBADF;
else {
vnode_t *vp = fp->f_vnode;
- int flag = fp->f_flag |
- ((fp->f_flag2 & ~FEPOLLED) << 16);
+ int flag = fp->f_flag | (fp->f_flag2 << 16);
/*
* BSD fcntl() FASYNC compatibility.
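
The new uf_gen field is a per-slot generation counter: closeandsetf(),
ufalloc_file(), and setf() bump it whenever a file is installed in a slot, so
a caller can detect that a descriptor was closed and its slot reused during a
window in which it was not held. A usage sketch (the caller and ESTALE choice
are hypothetical):

    /*
     * Sketch: revalidate an fd across a window where it was not held.
     * If the generation changed, the same fd number now names a
     * different open file.
     */
    int
    mydrv_revalidate(int fd, uf_entry_gen_t gen)
    {
    	uf_entry_gen_t newgen;
    	file_t *fp;

    	if ((fp = getf_gen(fd, &newgen)) == NULL)
    		return (EBADF);
    	if (newgen != gen) {
    		releasef(fd);
    		return (ESTALE);	/* slot was reused */
    	}
    	/* ... use fp ... */
    	releasef(fd);
    	return (0);
    }
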
diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c
index a63931459f..7e198910b4 100644
--- a/usr/src/uts/common/os/fork.c
+++ b/usr/src/uts/common/os/fork.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -84,6 +84,7 @@ static int64_t cfork(int, int, int);
static int getproc(proc_t **, pid_t, uint_t);
#define GETPROC_USER 0x0
#define GETPROC_KERNEL 0x1
+#define GETPROC_ZSCHED 0x2
static void fork_fail(proc_t *);
static void forklwp_fail(proc_t *);
@@ -705,7 +706,7 @@ fork_fail(proc_t *cp)
if (PTOU(curproc)->u_cwd)
refstr_rele(PTOU(curproc)->u_cwd);
if (PROC_IS_BRANDED(cp)) {
- brand_clearbrand(cp, B_TRUE);
+ brand_clearbrand(cp, B_FALSE);
}
}
@@ -754,7 +755,7 @@ forklwp_fail(proc_t *p)
kmem_free(t->t_door, sizeof (door_data_t));
t->t_door = NULL;
}
- lwp_ctmpl_clear(ttolwp(t));
+ lwp_ctmpl_clear(ttolwp(t), B_FALSE);
/*
* Remove the thread from the all threads list.
@@ -791,6 +792,9 @@ extern struct as kas;
/*
* fork a kernel process.
+ *
+ * Passing a pid argument of -1 indicates that the new process should be
+ * launched as a child of 'zsched' within the zone.
*/
int
newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
@@ -809,6 +813,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
rctl_set_t *init_set;
ASSERT(pid != 1);
+ ASSERT(pid >= 0);
if (getproc(&p, pid, GETPROC_KERNEL) < 0)
return (EAGAIN);
@@ -852,8 +857,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
rctl_set_t *init_set;
task_t *tk, *tk_old;
klwp_t *lwp;
+ boolean_t pzsched = B_FALSE;
+ int flag = GETPROC_USER;
+
+ /* Handle a new user-level thread as child of zsched. */
+ if (pid < 0) {
+ VERIFY(curzone != global_zone);
+ flag = GETPROC_ZSCHED;
+ pzsched = B_TRUE;
+ pid = 0;
+ }
- if (getproc(&p, pid, GETPROC_USER) < 0)
+ if (getproc(&p, pid, flag) < 0)
return (EAGAIN);
/*
* init creates a new task, distinct from the task
@@ -914,7 +929,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
}
t = lwptot(lwp);
- ctp = contract_process_fork(sys_process_tmpl, p, curproc,
+ ctp = contract_process_fork(sys_process_tmpl, p,
+ (pzsched ? curproc->p_zone->zone_zsched : curproc),
B_FALSE);
ASSERT(ctp != NULL);
if (ct != NULL)
@@ -955,7 +971,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
return (-1); /* no point in starting new processes */
- pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
+ if (flags & GETPROC_ZSCHED) {
+ pp = curproc->p_zone->zone_zsched;
+ } else {
+ pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
+ }
task = pp->p_task;
proj = task->tk_proj;
zone = pp->p_zone;
@@ -1016,6 +1036,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
cp->p_t1_lgrpid = LGRP_NONE;
cp->p_tr_lgrpid = LGRP_NONE;
+ /* Default to native brand initially */
+ cp->p_brand = &native_brand;
+
if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
if (nproc == v.v_proc) {
CPU_STATS_ADDQ(CPU, sys, procovf, 1);
@@ -1083,9 +1106,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
cp->p_sessp = pp->p_sessp;
sess_hold(pp);
- cp->p_brand = pp->p_brand;
- if (PROC_IS_BRANDED(pp))
- BROP(pp)->b_copy_procdata(cp, pp);
cp->p_bssbase = pp->p_bssbase;
cp->p_brkbase = pp->p_brkbase;
cp->p_brksize = pp->p_brksize;
@@ -1170,6 +1190,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
mutex_exit(&cp->p_lock);
mutex_exit(&pidlock);
+ if (PROC_IS_BRANDED(pp)) {
+ /*
+	 * Process branding can fail only when the operation is
+	 * complicated by the presence of multiple LWPs. With an
+	 * LWP count of 0, this newly allocated process has no
+	 * reason to fail branding.
+ */
+ VERIFY0(brand_setbrand(cp, B_FALSE));
+
+ BROP(pp)->b_copy_procdata(cp, pp);
+ }
+
avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
offsetof(contract_t, ct_ctlist));
@@ -1187,6 +1219,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
*/
fcnt_add(P_FINFO(pp), 1);
+ mutex_enter(&pp->p_lock);
if (PTOU(pp)->u_cdir) {
VN_HOLD(PTOU(pp)->u_cdir);
} else {
@@ -1200,6 +1233,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
VN_HOLD(PTOU(pp)->u_rdir);
if (PTOU(pp)->u_cwd)
refstr_hold(PTOU(pp)->u_cwd);
+ mutex_exit(&pp->p_lock);
/*
* copy the parent's uarea.
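
With the GETPROC_ZSCHED change above, a caller inside a non-global zone can
request that a new user-level process be parented to the zone's zsched rather
than to curproc by passing a pid of -1. A hedged sketch of such a call (the
start function and scheduling class id are placeholders):

    /*
     * Sketch: spawn a user-level process as a child of zsched.  Only
     * legal from within a non-global zone, and cid must name a
     * non-system scheduling class for this path.
     */
    VERIFY(curzone != global_zone);
    if (newproc(mybrand_start, NULL, cid, minclsyspri, NULL, -1) != 0)
    	return (EAGAIN);
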
diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c
index de2a4f26c4..07fd623a95 100644
--- a/usr/src/uts/common/os/grow.c
+++ b/usr/src/uts/common/os/grow.c
@@ -21,7 +21,7 @@
/*
* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -55,6 +55,7 @@
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>
+#include <sys/brand.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -570,6 +571,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
return (0);
}
+caddr_t
+map_userlimit(proc_t *pp, struct as *as, int flags)
+{
+ if (flags & _MAP_LOW32) {
+ if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) {
+ return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp));
+ } else {
+ return ((caddr_t)_userlimit32);
+ }
+ }
+
+ return (as->a_userlimit);
+}
+
/*
* Used for MAP_ANON - fast way to get anonymous pages
@@ -585,8 +600,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
return (EACCES);
if ((flags & MAP_FIXED) != 0) {
- caddr_t userlimit;
-
/*
* Use the user address. First verify that
* the address to be used is page aligned.
@@ -595,9 +608,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
return (EINVAL);
- userlimit = flags & _MAP_LOW32 ?
- (caddr_t)USERLIMIT32 : as->a_userlimit;
- switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
+ switch (valid_usr_range(*addrp, len, uprot, as,
+ map_userlimit(as->a_proc, as, flags))) {
case RANGE_OKAY:
break;
case RANGE_BADPROT:
@@ -638,7 +650,7 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
#define RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \
!(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint))
-static int
+int
smmap_common(caddr_t *addrp, size_t len,
int prot, int flags, struct file *fp, offset_t pos)
{
@@ -780,8 +792,6 @@ smmap_common(caddr_t *addrp, size_t len,
* If the user specified an address, do some simple checks here
*/
if ((flags & MAP_FIXED) != 0) {
- caddr_t userlimit;
-
/*
* Use the user address. First verify that
* the address to be used is page aligned.
@@ -789,10 +799,8 @@ smmap_common(caddr_t *addrp, size_t len,
*/
if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
return (EINVAL);
-
- userlimit = flags & _MAP_LOW32 ?
- (caddr_t)USERLIMIT32 : as->a_userlimit;
- switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
+ switch (valid_usr_range(*addrp, len, uprot, as,
+ map_userlimit(curproc, as, flags))) {
case RANGE_OKAY:
break;
case RANGE_BADPROT:
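
map_userlimit() centralizes the _MAP_LOW32 ceiling and lets a brand substitute
its own via the b_map32limit hook. A sketch of such a hook (the constant is
purely illustrative):

    /*
     * Hypothetical brand hook: report a brand-specific 32-bit mapping
     * ceiling instead of the native _userlimit32.
     */
    static uint32_t
    mybrand_map32limit(proc_t *p)
    {
    	return (0xfffe0000);	/* illustrative ceiling */
    }
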
diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c
deleted file mode 100644
index 2dad0cb940..0000000000
--- a/usr/src/uts/common/os/id_space.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/id_space.h>
-#include <sys/debug.h>
-
-/*
- * ID Spaces
- *
- * The id_space_t provides a simple implementation of a managed range of
- * integer identifiers using a vmem arena. An ID space guarantees that the
- * next identifer returned by an allocation is larger than the previous one,
- * unless there are no larger slots remaining in the range. In this case,
- * the ID space will return the first available slot in the lower part of the
- * range (viewing the previous identifier as a partitioning element). If no
- * slots are available, id_alloc()/id_allocff() will sleep until an
- * identifier becomes available. Accordingly, id_space allocations must be
- * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/
- * id_allocff_nosleep() will return -1 if no slots are available or if the
- * system is low on memory. If id_alloc_nosleep() fails, callers should
- * not try to extend the ID space. This is to avoid making a possible
- * low-memory situation worse.
- *
- * As an ID space is designed for representing a range of id_t's, there
- * is a preexisting maximal range: [0, MAXUID]. ID space requests outside
- * that range will fail on a DEBUG kernel. The id_allocff*() functions
- * return the first available id, and should be used when there is benefit
- * to having a compact allocated range.
- *
- * (Presently, the id_space_t abstraction supports only direct allocations; ID
- * reservation, in which an ID is allocated but placed in a internal
- * dictionary for later use, should be added when a consuming subsystem
- * arrives.)
- */
-
-#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1))
-#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1))
-
-/*
- * Create an arena to represent the range [low, high).
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_space_t *
-id_space_create(const char *name, id_t low, id_t high)
-{
- ASSERT(low >= 0);
- ASSERT(low < high);
-
- return (vmem_create(name, ID_TO_ADDR(low), high - low, 1,
- NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER));
-}
-
-/*
- * Destroy a previously created ID space.
- * No restrictions on caller's context.
- */
-void
-id_space_destroy(id_space_t *isp)
-{
- vmem_destroy(isp);
-}
-
-void
-id_space_extend(id_space_t *isp, id_t low, id_t high)
-{
- (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP);
-}
-
-/*
- * Allocate an id_t from specified ID space.
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_t
-id_alloc(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space.
- * Returns -1 on failure (see module block comments for more information on
- * failure modes).
- */
-id_t
-id_alloc_nosleep(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space using FIRSTFIT.
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_t
-id_allocff(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space using FIRSTFIT
- * Returns -1 on failure (see module block comments for more information on
- * failure modes).
- */
-id_t
-id_allocff_nosleep(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT)));
-}
-
-/*
- * Allocate a specific identifier if possible, returning the id if
- * successful, or -1 on failure.
- */
-id_t
-id_alloc_specific_nosleep(id_space_t *isp, id_t id)
-{
- void *minaddr = ID_TO_ADDR(id);
- void *maxaddr = ID_TO_ADDR(id + 1);
-
- /*
- * Note that even though we're vmem_free()ing this later, it
- * should be OK, since there's no quantum cache.
- */
- return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0,
- minaddr, maxaddr, VM_NOSLEEP)));
-}
-
-/*
- * Free a previously allocated ID.
- * No restrictions on caller's context.
- */
-void
-id_free(id_space_t *isp, id_t id)
-{
- vmem_free(isp, ID_TO_ADDR(id), 1);
-}
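
Although id_space.c is removed from uts/common/os here, the interfaces its
block comment describes remain in use (the implementation is presumably
relocated to shared common code rather than dropped). A usage sketch per the
documented API:

    /*
     * Sketch: typical id_space usage as described above.
     */
    id_space_t *ids = id_space_create("my_ids", 0, 1000);	/* [0, 1000) */
    id_t id = id_alloc(ids);	/* may sleep; increases when possible */

    /* ... use id ... */
    id_free(ids, id);
    id_space_destroy(ids);
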
diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c
index 86cb867da8..bf917ef716 100644
--- a/usr/src/uts/common/os/ipc.c
+++ b/usr/src/uts/common/os/ipc.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm)
(IPC_ZONE_USAGE(perm, service) == 0)));
}
+/*
+ * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID.
+ */
+void
+ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm)
+{
+ ASSERT(service->ipcs_count > 0);
+ ASSERT(MUTEX_HELD(&service->ipcs_lock));
+
+ ipc_remove(service, perm);
+ mutex_exit(&service->ipcs_lock);
+
+ /* perform any per-service removal actions */
+ service->ipcs_rmid(perm);
+
+ ipc_rele(service, perm);
+}
/*
* Common code to perform an IPC_RMID. Returns an errno value on
@@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr)
/*
* Nothing can fail from this point on.
*/
- ipc_remove(service, perm);
- mutex_exit(&service->ipcs_lock);
-
- /* perform any per-service removal actions */
- service->ipcs_rmid(perm);
-
- ipc_rele(service, perm);
+ ipc_rmsvc(service, perm);
return (0);
}
diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c
index bc0cda418b..ed2c7fc346 100644
--- a/usr/src/uts/common/os/kmem.c
+++ b/usr/src/uts/common/os/kmem.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017, Joyent, Inc.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2018, Joyent, Inc.
@@ -1011,6 +1012,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
size_t kmem_content_log_size; /* content log size [2% of memory] */
size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */
size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */
+size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */
size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */
size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
@@ -1018,6 +1020,14 @@ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */
size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */
size_t kmem_minfirewall; /* hardware-enforced redzone threshold */
+#ifdef DEBUG
+int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */
+#else
+int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */
+#endif
+
+int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */
+
#ifdef _LP64
size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */
#else
@@ -1098,6 +1108,7 @@ kmem_log_header_t *kmem_transaction_log;
kmem_log_header_t *kmem_content_log;
kmem_log_header_t *kmem_failure_log;
kmem_log_header_t *kmem_slab_log;
+kmem_log_header_t *kmem_zerosized_log;
static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
@@ -2853,8 +2864,33 @@ kmem_alloc(size_t size, int kmflag)
/* fall through to kmem_cache_alloc() */
} else {
- if (size == 0)
+ if (size == 0) {
+ if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
+ return (NULL);
+
+ /*
+ * If this is a sleeping allocation or one that has
+ * been specified to panic on allocation failure, we
+ * consider it to be deprecated behavior to allocate
+ * 0 bytes. If we have been configured to panic under
+ * this condition, we panic; if to warn, we warn -- and
+			 * regardless, we log to the kmem_zerosized_log
+			 * that this condition has occurred (which gives us
+ * enough information to be able to debug it).
+ */
+ if (kmem_panic && kmem_panic_zerosized)
+ panic("attempted to kmem_alloc() size of 0");
+
+ if (kmem_warn_zerosized) {
+ cmn_err(CE_WARN, "kmem_alloc(): sleeping "
+ "allocation with size of 0; "
+ "see kmem_zerosized_log for details");
+ }
+
+ kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL);
+
return (NULL);
+ }
buf = vmem_alloc(kmem_oversize_arena, size,
kmflag & KM_VMFLAGS);
@@ -4397,8 +4433,8 @@ kmem_init(void)
}
kmem_failure_log = kmem_log_init(kmem_failure_log_size);
-
kmem_slab_log = kmem_log_init(kmem_slab_log_size);
+ kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);
/*
* Initialize STREAMS message caches so allocb() is available.
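
The zero-size handling above gives kmem_alloc() the following contract, which
a defensive caller can make explicit (sketch):

    /*
     * Zero-sized behavior after this change:
     *   kmem_alloc(0, KM_NOSLEEP)  returns NULL quietly;
     *   kmem_alloc(0, KM_SLEEP)    returns NULL, logs to
     *                              kmem_zerosized_log, warns if
     *                              kmem_warn_zerosized (the DEBUG
     *                              default), and panics instead if
     *                              kmem_panic_zerosized is set.
     */
    void *
    my_alloc(size_t n)
    {
    	if (n == 0)
    		return (NULL);	/* treat as a caller bug */
    	return (kmem_alloc(n, KM_SLEEP));
    }
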
diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c
index 93c04cff8d..b09b2d3558 100644
--- a/usr/src/uts/common/os/kstat_fr.c
+++ b/usr/src/uts/common/os/kstat_fr.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -198,6 +198,9 @@ struct {
kstat_named_t pagesfree;
kstat_named_t pageslocked;
kstat_named_t pagestotal;
+ kstat_named_t lowmemscan;
+ kstat_named_t zonecapscan;
+ kstat_named_t nthrottle;
} system_pages_kstat = {
{ "physmem", KSTAT_DATA_ULONG },
{ "nalloc", KSTAT_DATA_ULONG },
@@ -219,6 +222,9 @@ struct {
{ "pagesfree", KSTAT_DATA_ULONG },
{ "pageslocked", KSTAT_DATA_ULONG },
{ "pagestotal", KSTAT_DATA_ULONG },
+ { "low_mem_scan", KSTAT_DATA_ULONG },
+ { "zone_cap_scan", KSTAT_DATA_ULONG },
+ { "n_throttle", KSTAT_DATA_ULONG },
};
static int header_kstat_update(kstat_t *, int);
@@ -912,6 +918,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw)
system_pages_kstat.pageslocked.value.ul = (ulong_t)(availrmem_initial -
availrmem);
system_pages_kstat.pagestotal.value.ul = (ulong_t)total_pages;
+ system_pages_kstat.lowmemscan.value.ul = (ulong_t)low_mem_scan;
+ system_pages_kstat.zonecapscan.value.ul = (ulong_t)zone_cap_scan;
+ system_pages_kstat.nthrottle.value.ul = (ulong_t)n_throttle;
/*
* pp_kernel represents total pages used by the kernel since the
* startup. This formula takes into account the boottime kernel
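
The three new counters are exported through the unix:0:system_pages kstat. A
minimal userland sketch reading one of them via libkstat (compile with
-lkstat):

    #include <kstat.h>
    #include <stdio.h>

    int
    main(void)
    {
    	kstat_ctl_t *kc = kstat_open();
    	kstat_t *ksp;
    	kstat_named_t *kn;

    	if (kc == NULL)
    		return (1);
    	if ((ksp = kstat_lookup(kc, "unix", 0, "system_pages")) != NULL &&
    	    kstat_read(kc, ksp, NULL) != -1 &&
    	    (kn = kstat_data_lookup(ksp, "low_mem_scan")) != NULL) {
    		(void) printf("low_mem_scan = %lu\n", kn->value.ul);
    	}
    	(void) kstat_close(kc);
    	return (0);
    }
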
diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c
index 6288f47bed..6f6aced619 100644
--- a/usr/src/uts/common/os/lgrp.c
+++ b/usr/src/uts/common/os/lgrp.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -90,6 +91,7 @@
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>
+#include <sys/ht.h>
lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
@@ -520,6 +522,8 @@ lgrp_main_mp_init(void)
{
klgrpset_t changed;
+ ht_init();
+
/*
* Update lgroup topology (if necessary)
*/
diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c
index 149f5f8a88..06c03dd38e 100644
--- a/usr/src/uts/common/os/logsubr.c
+++ b/usr/src/uts/common/os/logsubr.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2013 Gary Mills
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -249,8 +250,7 @@ log_init(void)
*/
printf("\rSunOS Release %s Version %s %u-bit\n",
utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *));
- printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. "
- "All rights reserved.\n");
+ printf("Copyright (c) 2010-2019, Joyent Inc. All rights reserved.\n");
#ifdef DEBUG
printf("DEBUG enabled\n");
#endif
@@ -491,7 +491,7 @@ log_console(log_t *lp, log_ctl_t *lc)
mblk_t *
log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg,
- size_t size, int on_intr)
+ size_t size, int on_intr)
{
mblk_t *mp = NULL;
mblk_t *mp2;
diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c
index b2adae570f..341e4ae356 100644
--- a/usr/src/uts/common/os/lwp.c
+++ b/usr/src/uts/common/os/lwp.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
#include <sys/param.h>
@@ -57,6 +57,8 @@
#include <sys/lgrp.h>
#include <sys/rctl.h>
#include <sys/contract_impl.h>
+#include <sys/contract/process.h>
+#include <sys/contract/process_impl.h>
#include <sys/cpc_impl.h>
#include <sys/sdt.h>
#include <sys/cmn_err.h>
@@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
ret_tidhash_t *ret_tidhash = NULL;
int i;
int rctlfail = 0;
- boolean_t branded = 0;
+ void *brand_data = NULL;
struct ctxop *ctx = NULL;
ASSERT(cid != sysdccid); /* system threads must start in SYS */
@@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
*/
lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
+ /*
+ * If necessary, speculatively allocate lwp brand data. This is done
+ * ahead of time so p_lock need not be dropped during lwp branding.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) {
+ if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) {
+ mutex_enter(&p->p_lock);
+ err = 1;
+ atomic_inc_32(&p->p_zone->zone_ffmisc);
+ goto error;
+ }
+ }
+
mutex_enter(&p->p_lock);
grow:
/*
@@ -630,18 +645,6 @@ grow:
} while (lwp_hash_lookup(p, t->t_tid) != NULL);
}
- /*
- * If this is a branded process, let the brand do any necessary lwp
- * initialization.
- */
- if (PROC_IS_BRANDED(p)) {
- if (BROP(p)->b_initlwp(lwp)) {
- err = 1;
- atomic_inc_32(&p->p_zone->zone_ffmisc);
- goto error;
- }
- branded = 1;
- }
if (t->t_tid == 1) {
kpreempt_disable();
@@ -654,7 +657,6 @@ grow:
}
}
- p->p_lwpcnt++;
t->t_waitfor = -1;
/*
@@ -696,8 +698,27 @@ grow:
t->t_post_sys = 1;
/*
+ * Perform lwp branding
+ *
+ * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be
+ * continuously held between when the tidhash is sized and when the lwp
+ * is inserted into it. Operations requiring p->p_lock to be
+ * temporarily dropped can be performed in b_initlwp_post.
+ */
+ if (PROC_IS_BRANDED(p)) {
+ BROP(p)->b_initlwp(lwp, brand_data);
+ /*
+ * The b_initlwp hook is expected to consume any preallocated
+ * brand_data in a way that prepares it for deallocation by the
+ * b_freelwp hook.
+ */
+ brand_data = NULL;
+ }
+
+ /*
* Insert the new thread into the list of all threads.
*/
+ p->p_lwpcnt++;
if ((tx = p->p_tlist) == NULL) {
t->t_back = t;
t->t_forw = t;
@@ -718,6 +739,13 @@ grow:
lep->le_start = t->t_start;
lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1);
+ /*
+ * Complete lwp branding
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) {
+ BROP(p)->b_initlwp_post(lwp);
+ }
+
lwp_fp_init(lwp);
if (state == TS_RUN) {
@@ -755,8 +783,9 @@ error:
if (cid != NOCLASS && bufp != NULL)
CL_FREE(cid, bufp);
- if (branded)
- BROP(p)->b_freelwp(lwp);
+ if (brand_data != NULL) {
+ BROP(p)->b_lwpdata_free(brand_data);
+ }
mutex_exit(&p->p_lock);
t->t_state = TS_FREE;
@@ -829,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)
int i;
for (i = 0; i < ct_ntypes; i++) {
- dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]);
+ ct_template_t *tmpl = src->lwp_ct_active[i];
+
+ /*
+ * If the process contract template is setup to be preserved
+ * across exec, then if we're forking, perform an implicit
+ * template_clear now. This ensures that future children of
+ * this child will remain in the same contract unless they're
+ * explicitly setup differently. We know we're forking if the
+ * two LWPs belong to different processes.
+ */
+ if (i == CTT_PROCESS && tmpl != NULL) {
+ ctmpl_process_t *ctp = tmpl->ctmpl_data;
+
+ if (dst->lwp_procp != src->lwp_procp &&
+ (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0)
+ tmpl = NULL;
+ }
+
+ dst->lwp_ct_active[i] = ctmpl_dup(tmpl);
dst->lwp_ct_latest[i] = NULL;
}
}
@@ -838,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)
* Clear an LWP's contract template state.
*/
void
-lwp_ctmpl_clear(klwp_t *lwp)
+lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec)
{
ct_template_t *tmpl;
int i;
for (i = 0; i < ct_ntypes; i++) {
- if ((tmpl = lwp->lwp_ct_active[i]) != NULL) {
- ctmpl_free(tmpl);
- lwp->lwp_ct_active[i] = NULL;
- }
-
if (lwp->lwp_ct_latest[i] != NULL) {
contract_rele(lwp->lwp_ct_latest[i]);
lwp->lwp_ct_latest[i] = NULL;
}
+
+ if ((tmpl = lwp->lwp_ct_active[i]) != NULL) {
+ /*
+ * If we're exec-ing a new program and the process
+ * contract template is setup to be preserved across
+ * exec, then don't clear it.
+ */
+ if (is_exec && i == CTT_PROCESS) {
+ ctmpl_process_t *ctp = tmpl->ctmpl_data;
+
+ if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0)
+ continue;
+ }
+
+ ctmpl_free(tmpl);
+ lwp->lwp_ct_active[i] = NULL;
+ }
}
}
@@ -893,13 +953,6 @@ lwp_exit(void)
if (t->t_upimutex != NULL)
upimutex_cleanup();
- /*
- * Perform any brand specific exit processing, then release any
- * brand data associated with the lwp
- */
- if (PROC_IS_BRANDED(p))
- BROP(p)->b_lwpexit(lwp);
-
lwp_pcb_exit();
mutex_enter(&p->p_lock);
@@ -943,6 +996,18 @@ lwp_exit(void)
DTRACE_PROC(lwp__exit);
/*
+ * Perform any brand specific exit processing, then release any
+ * brand data associated with the lwp
+ */
+ if (PROC_IS_BRANDED(p)) {
+ mutex_exit(&p->p_lock);
+ BROP(p)->b_lwpexit(lwp);
+ BROP(p)->b_freelwp(lwp);
+ mutex_enter(&p->p_lock);
+ prbarrier(p);
+ }
+
+ /*
* If the lwp is a detached lwp or if the process is exiting,
* remove (lwp_hash_out()) the lwp from the lwp directory.
* Otherwise null out the lwp's le_thread pointer in the lwp
@@ -1103,7 +1168,7 @@ lwp_cleanup(void)
}
kpreempt_enable();
- lwp_ctmpl_clear(ttolwp(t));
+ lwp_ctmpl_clear(ttolwp(t), B_FALSE);
}
int
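
The reworked lwp branding splits into three hooks: allocation happens before
p_lock is taken, b_initlwp runs under p_lock without dropping it, and
b_initlwp_post may drop and retake p_lock. A sketch of a conforming brand
(mybrand_lwp_t and the mybrand_* names are hypothetical):

    typedef struct mybrand_lwp { int bl_flags; } mybrand_lwp_t; /* hypothetical */

    static void *
    mybrand_lwpdata_alloc(proc_t *p)
    {
    	/* Called without p_lock held, so KM_SLEEP is acceptable. */
    	return (kmem_zalloc(sizeof (mybrand_lwp_t), KM_SLEEP));
    }

    static void
    mybrand_initlwp(klwp_t *lwp, void *brand_data)
    {
    	/* Must not drop p->p_lock; just consume the preallocation. */
    	lwp->lwp_brand = brand_data;
    }

    static void
    mybrand_initlwp_post(klwp_t *lwp)
    {
    	/* Work that may temporarily drop p->p_lock belongs here. */
    }
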
diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c
index 7bc41b6954..3364d1e523 100644
--- a/usr/src/uts/common/os/main.c
+++ b/usr/src/uts/common/os/main.c
@@ -158,7 +158,7 @@ exec_init(const char *initpath, const char *args)
int error = 0, count = 0;
proc_t *p = ttoproc(curthread);
klwp_t *lwp = ttolwp(curthread);
- int brand_action;
+ int brand_action = EBA_NONE;
if (args == NULL)
args = "";
@@ -288,7 +288,15 @@ exec_init(const char *initpath, const char *args)
*/
sigemptyset(&curthread->t_hold);
- brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
+ /*
+ * Only instruct exec_common to brand the process if necessary. It is
+ * possible that the init process is already properly branded due to the
+ * proc_exit -> restart_init -> exec_init call chain.
+ */
+ if (ZONE_IS_BRANDED(p->p_zone) &&
+ p->p_brand != p->p_zone->zone_brand) {
+ brand_action = EBA_BRAND;
+ }
again:
error = exec_common((const char *)exec_fnamep,
(const char **)uap, NULL, brand_action);
diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c
index 3571747e9c..6be46fa422 100644
--- a/usr/src/uts/common/os/mem_config.c
+++ b/usr/src/uts/common/os/mem_config.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/types.h>
@@ -1638,7 +1639,7 @@ delthr_get_freemem(struct mem_handle *mhp)
* Put pressure on pageout.
*/
page_needfree(free_get);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
mutex_enter(&mhp->mh_mutex);
(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c
index 142c10754e..0410e6f47b 100644
--- a/usr/src/uts/common/os/mmapobj.c
+++ b/usr/src/uts/common/os/mmapobj.c
@@ -1381,10 +1381,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,
}
if (num_segs++ == 0) {
/*
- * The p_vaddr of the first PT_LOAD segment
- * must either be NULL or within the first
- * page in order to be interpreted.
- * Otherwise, its an invalid file.
+ * While ELF doesn't specify the meaning of
+ * p_vaddr for PT_LOAD segments in ET_DYN
+					 * objects, we mandate that it is either NULL or
+ * (to accommodate some historical binaries)
+ * within the first page. (Note that there
+ * exist non-native ET_DYN objects that violate
+ * this constraint that we nonetheless must be
+ * able to execute; see the ET_DYN handling in
+ * mapelfexec() for details.)
*/
if (e_type == ET_DYN &&
((caddr_t)((uintptr_t)vaddr &
diff --git a/usr/src/uts/common/os/modctl.c b/usr/src/uts/common/os/modctl.c
index e2a3335eb4..f1003f7834 100644
--- a/usr/src/uts/common/os/modctl.c
+++ b/usr/src/uts/common/os/modctl.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017 Joyent, Inc.
*/
/*
@@ -3470,6 +3471,11 @@ mod_load(struct modctl *mp, int usepath)
retval = install_stubs_by_name(mp, mp->mod_modname);
/*
+ * Perform hotinlines before module is started.
+ */
+ do_hotinlines(mp->mod_mp);
+
+ /*
* Now that the module is loaded, we need to give DTrace
* a chance to notify its providers. This is done via
* the dtrace_modload function pointer.
diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c
index 3605104ae7..a04294eed5 100644
--- a/usr/src/uts/common/os/modsysfile.c
+++ b/usr/src/uts/common/os/modsysfile.c
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2017 Nexenta Systems, Inc.
*/
@@ -57,10 +58,12 @@ struct hwc_class *hcl_head; /* head of list of classes */
static kmutex_t hcl_lock; /* for accessing list of classes */
#define DAFILE "/etc/driver_aliases"
+#define PPTFILE "/etc/ppt_aliases"
#define CLASSFILE "/etc/driver_classes"
#define DACFFILE "/etc/dacf.conf"
static char class_file[] = CLASSFILE;
+static char pptfile[] = PPTFILE;
static char dafile[] = DAFILE;
static char dacffile[] = DACFFILE;
@@ -2150,14 +2153,13 @@ hwc_parse_now(char *fname, struct par_list **pl, ddi_prop_t **props)
return (0); /* always return success */
}
-void
-make_aliases(struct bind **bhash)
+static void
+parse_aliases(struct bind **bhash, struct _buf *file)
{
enum {
AL_NEW, AL_DRVNAME, AL_DRVNAME_COMMA, AL_ALIAS, AL_ALIAS_COMMA
} state;
- struct _buf *file;
char tokbuf[MAXPATHLEN];
char drvbuf[MAXPATHLEN];
token_t token;
@@ -2166,9 +2168,6 @@ make_aliases(struct bind **bhash)
static char dupwarn[] = "!Driver alias \"%s\" conflicts with "
"an existing driver name or alias.";
- if ((file = kobj_open_file(dafile)) == (struct _buf *)-1)
- return;
-
state = AL_NEW;
major = DDI_MAJOR_T_NONE;
while (!done) {
@@ -2253,8 +2252,22 @@ make_aliases(struct bind **bhash)
kobj_file_err(CE_WARN, file, tok_err, tokbuf);
}
}
+}
- kobj_close_file(file);
+void
+make_aliases(struct bind **bhash)
+{
+ struct _buf *file;
+
+ if ((file = kobj_open_file(pptfile)) != (struct _buf *)-1) {
+ parse_aliases(bhash, file);
+ kobj_close_file(file);
+ }
+
+ if ((file = kobj_open_file(dafile)) != (struct _buf *)-1) {
+ parse_aliases(bhash, file);
+ kobj_close_file(file);
+ }
}
diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c
index b555bb82b7..eba6147fab 100644
--- a/usr/src/uts/common/os/pid.c
+++ b/usr/src/uts/common/os/pid.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -112,6 +113,18 @@ pid_lookup(pid_t pid)
return (pidp);
}
+struct pid *
+pid_find(pid_t pid)
+{
+ struct pid *pidp;
+
+ mutex_enter(&pidlinklock);
+ pidp = pid_lookup(pid);
+ mutex_exit(&pidlinklock);
+
+ return (pidp);
+}
+
void
pid_setmin(void)
{
@@ -522,6 +535,20 @@ sprunlock(proc_t *p)
THREAD_KPRI_RELEASE();
}
+/*
+ * Undo effects of sprlock but without dropping p->p_lock
+ */
+void
+sprunprlock(proc_t *p)
+{
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ cv_signal(&pr_pid_cv[p->p_slot]);
+ p->p_proc_flag &= ~P_PR_LOCK;
+ THREAD_KPRI_RELEASE();
+}
+
void
pid_init(void)
{
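
sprunprlock() complements the existing sprlock()/sprunlock() pair for callers
that must release P_PR_LOCK but continue working under p_lock. A usage sketch
(illustrative; real consumers are in the proc/brand code):

    mutex_enter(&p->p_lock);
    sprlock_proc(p);	/* sets P_PR_LOCK; may drop/retake p_lock */
    /* ... operate on the held process ... */
    sprunprlock(p);		/* clears P_PR_LOCK, keeps p->p_lock */
    /* ... further work that still requires p->p_lock ... */
    mutex_exit(&p->p_lock);
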
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index d3d362a8a7..861c748cff 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -56,6 +56,7 @@
#include <sys/mntent.h>
#include <sys/contract_impl.h>
#include <sys/dld_ioc.h>
+#include <sys/brand.h>
/*
* There are two possible layers of privilege routines and two possible
@@ -1244,6 +1245,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner)
void
secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
{
+ proc_t *p = curproc;
+
+ /*
+ * Allow the brand to override this behaviour.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) {
+ /*
+ * This brand hook will return 0 if handling is complete, or
+ * some other value if the brand would like us to fall back to
+ * the usual behaviour.
+ */
+ if (BROP(p)->b_setid_clear(vap, cr) == 0) {
+ return;
+ }
+ }
+
if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
secpolicy_vnode_setid_retain(cr,
(vap->va_mode & S_ISUID) != 0 &&
@@ -2092,6 +2109,13 @@ secpolicy_meminfo(const cred_t *cr)
}
int
+secpolicy_fs_import(const cred_t *cr)
+{
+ return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL));
+}
+
+int
secpolicy_pfexec_register(const cred_t *cr)
{
return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL));
@@ -2607,3 +2631,11 @@ secpolicy_ppp_config(const cred_t *cr)
return (secpolicy_net_config(cr, B_FALSE));
return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL));
}
+
+int
+secpolicy_hyprlofs_control(const cred_t *cr)
+{
+ if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL))
+ return (EPERM);
+ return (0);
+}
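
The b_setid_clear hook allows a brand to keep setid bits where the native
policy would clear them. A hypothetical implementation honoring the documented
return convention (0 = handled, nonzero = fall back):

    /*
     * Hypothetical brand hook: retain S_ISUID/S_ISGID when policy
     * permits; otherwise defer to the native clearing logic.
     */
    static int
    mybrand_setid_clear(vattr_t *vap, cred_t *cr)
    {
    	if (secpolicy_vnode_setid_retain(cr, B_FALSE) == 0)
    		return (0);	/* handled: leave the bits alone */
    	return (-1);		/* fall back to default behaviour */
    }
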
diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs
index bc1787c9ca..854fb602da 100644
--- a/usr/src/uts/common/os/priv_defs
+++ b/usr/src/uts/common/os/priv_defs
@@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP
Allows a process to perform privileged mappings through a
graphics device.
+privilege PRIV_HYPRLOFS_CONTROL
+
+ Allows a process to manage hyprlofs entries.
+
privilege PRIV_IPC_DAC_READ
Allows a process to read a System V IPC
@@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES
Allows a process to open the real console device directly.
Allows a process to open devices that have been exclusively opened.
+privilege PRIV_SYS_FS_IMPORT
+
+ Allows a process to import a potentially untrusted file system.
+
privilege PRIV_SYS_IPC_CONFIG
Allows a process to increase the size of a System V IPC Message
diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c
index 09b80323d5..e0a1126567 100644
--- a/usr/src/uts/common/os/rctl.c
+++ b/usr/src/uts/common/os/rctl.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
#include <sys/atomic.h>
@@ -194,6 +195,8 @@ id_space_t *rctl_ids;
kmem_cache_t *rctl_cache; /* kmem cache for rctl structures */
kmem_cache_t *rctl_val_cache; /* kmem cache for rctl values */
+extern rctl_hndl_t rc_process_maxlockedmem;
+
kmutex_t rctl_lists_lock;
rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
@@ -2872,12 +2875,12 @@ rctl_init(void)
* rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
* int chargeproc)
*
- * Increments the amount of locked memory on a project, and
- * zone. If proj is non-NULL the project must be held by the
- * caller; if it is NULL the proj and zone of proc_t p are used.
- * If chargeproc is non-zero, then the charged amount is cached
- * on p->p_locked_mem so that the charge can be migrated when a
- * process changes projects.
+ * Increments the amount of locked memory on a process, project, and
+ * zone. If 'proj' is non-NULL, the project must be held by the
+ * caller; if it is NULL, the project and zone of process 'p' are used.
+ * If 'chargeproc' is non-zero, then the charged amount is added
+ * to p->p_locked_mem. The cached value is also used to migrate
+ * the charge when a process changes projects.
*
* Return values
* 0 - success
@@ -2895,6 +2898,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
ASSERT(p != NULL);
ASSERT(MUTEX_HELD(&p->p_lock));
+
if (proj != NULL) {
projp = proj;
zonep = proj->kpj_zone;
@@ -2938,11 +2942,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
}
}
- zonep->zone_locked_mem += inc;
- projp->kpj_data.kpd_locked_mem += inc;
if (chargeproc != 0) {
+ /* Check for overflow */
+ if ((p->p_locked_mem + inc) < p->p_locked_mem) {
+ ret = EAGAIN;
+ goto out;
+ }
+ if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p,
+ &e, inc, 0) & RCT_DENY) {
+ ret = EAGAIN;
+ goto out;
+ }
+
p->p_locked_mem += inc;
}
+
+ zonep->zone_locked_mem += inc;
+ projp->kpj_data.kpd_locked_mem += inc;
out:
mutex_exit(&zonep->zone_mem_lock);
return (ret);
diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c
index 9b7324fe7b..c62540d2b4 100644
--- a/usr/src/uts/common/os/rctl_proc.c
+++ b/usr/src/uts/common/os/rctl_proc.c
@@ -21,6 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/types.h>
@@ -32,6 +33,7 @@
#include <sys/port_kernel.h>
#include <sys/signal.h>
#include <sys/var.h>
+#include <sys/policy.h>
#include <sys/vmparam.h>
#include <sys/machparam.h>
@@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl;
rctl_hndl_t rc_process_semopm;
rctl_hndl_t rc_process_portev;
rctl_hndl_t rc_process_sigqueue;
+rctl_hndl_t rc_process_maxlockedmem;
/*
* process.max-cpu-time / RLIMIT_CPU
@@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = {
};
/*
+ * process.max-locked-memory
+ */
+/*ARGSUSED*/
+static int
+proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
+ struct rctl_val *rv, rctl_qty_t i, uint_t f)
+{
+ if (secpolicy_lock_memory(CRED()) == 0)
+ return (0);
+ return ((p->p_locked_mem + i) > rv->rcv_value);
+}
+
+static rctl_ops_t proc_maxlockedmem_ops = {
+ rcop_no_action,
+ rcop_no_usage,
+ rcop_no_set,
+ proc_maxlockedmem_test
+};
+
+/*
* void rctlproc_default_init()
*
* Overview
@@ -383,6 +406,11 @@ rctlproc_init(void)
rctl_add_default_limit("process.max-sigqueue-size",
_SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
+ rc_process_maxlockedmem = rctl_register("process.max-locked-memory",
+ RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS |
+ RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES,
+ ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops);
+
/*
* Place minimal set of controls on "sched" process for inheritance by
* processes created via newproc().
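
A process (with sufficient privilege) can install a value for the new
process.max-locked-memory control through libc's rctl interfaces. A userland
sketch (set_locked_mem_cap is a hypothetical wrapper):

    #include <rctl.h>
    #include <stdlib.h>

    int
    set_locked_mem_cap(rctl_qty_t bytes)
    {
    	rctlblk_t *blk = malloc(rctlblk_size());

    	if (blk == NULL)
    		return (-1);
    	rctlblk_set_value(blk, bytes);
    	/* Installing a privileged value requires privilege. */
    	rctlblk_set_privilege(blk, RCPRIV_PRIVILEGED);
    	rctlblk_set_local_action(blk, RCTL_LOCAL_DENY, 0);
    	if (setrctl("process.max-locked-memory", NULL, blk,
    	    RCTL_INSERT) != 0) {
    		free(blk);
    		return (-1);
    	}
    	free(blk);
    	return (0);
    }
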
diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c
index c1d6569f11..15e77d39f7 100644
--- a/usr/src/uts/common/os/sched.c
+++ b/usr/src/uts/common/os/sched.c
@@ -27,6 +27,10 @@
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
+/*
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
@@ -646,16 +650,17 @@ top:
klwp_t *lwp = ttolwp(tp);
/*
- * Swapout eligible lwps (specified by the scheduling
- * class) which don't have TS_DONT_SWAP set. Set the
- * "intent to swap" flag (TS_SWAPENQ) on threads
- * which have TS_DONT_SWAP set so that they can be
+ * Swapout eligible lwps (specified by the scheduling class)
+ * which don't have TS_DONT_SWAP set. Set the "intent to swap"
+ * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP
+ * set or are currently on a split stack so that they can be
* swapped if and when they reach a safe point.
*/
thread_lock(tp);
thread_pri = CL_SWAPOUT(tp, swapflags);
if (thread_pri != -1) {
- if (tp->t_schedflag & TS_DONT_SWAP) {
+ if ((tp->t_schedflag & TS_DONT_SWAP) ||
+ (tp->t_flag & T_SPLITSTK)) {
tp->t_schedflag |= TS_SWAPENQ;
tp->t_trapret = 1;
aston(tp);
diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c
index 5721083751..18b396a765 100644
--- a/usr/src/uts/common/os/schedctl.c
+++ b/usr/src/uts/common/os/schedctl.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -327,12 +328,17 @@ schedctl_sigblock(kthread_t *t)
/*
- * If the sc_sigblock field is set for the specified thread, set
- * its signal mask to block all maskable signals, then clear the
- * sc_sigblock field. This finishes what user-level code requested
- * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
- * Called from signal-related code either by the current thread for
- * itself or by a thread that holds the process's p_lock (/proc code).
+ * If the sc_sigblock field is set for the specified thread, set its signal
+ * mask to block all maskable signals, then clear the sc_sigblock field. This
+ * accomplishes what user-level code requested to be done when it set
+ * tdp->sc_shared->sc_sigblock non-zero.
+ *
+ * This is generally called by signal-related code in the current thread. In
+ * order to call against a thread other than curthread, p_lock for the
+ * containing process must be held. Even then, the caller is not protected
+ * from races with the thread in question updating its own fields. It is the
+ * responsibility of the caller to perform additional synchronization.
*/
void
schedctl_finish_sigblock(kthread_t *t)
diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c
index bacc595f78..5deae96d73 100644
--- a/usr/src/uts/common/os/shm.c
+++ b/usr/src/uts/common/os/shm.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
size_t share_size;
struct shm_data ssd;
uintptr_t align_hint;
+ long curprot;
/*
* Pick a share pagesize to use, if (!isspt(sp)).
@@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
}
}
+ curprot = sp->shm_opts & SHM_PROT_MASK;
if (!isspt(sp)) {
error = sptcreate(size, &segspt, sp->shm_amp, prot,
flags, share_szc);
@@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
}
sp->shm_sptinfo->sptas = segspt->s_as;
sp->shm_sptseg = segspt;
- sp->shm_sptprot = prot;
- } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
+ sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot;
+ } else if ((prot & curprot) != curprot) {
/*
* Ensure we're attaching to an ISM segment with
* fewer or equal permissions than what we're
@@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg)
}
break;
+ /* Stage segment for removal, but don't remove until last detach */
+ case SHM_RMID:
+ if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0)
+ break;
+
+ /*
+ * If attached, just mark it as a pending remove, otherwise
+ * we must perform the normal ipc_rmid now.
+ */
+ if ((sp->shm_perm.ipc_ref - 1) > 0) {
+ sp->shm_opts |= SHM_RM_PENDING;
+ } else {
+ mutex_exit(lock);
+ return (ipc_rmid(shm_svc, shmid, cr));
+ }
+ break;
+
default:
error = EINVAL;
break;
@@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap)
sp->shm_ismattch--;
sp->shm_dtime = gethrestime_sec();
sp->shm_lpid = pp->p_pid;
+ if ((sp->shm_opts & SHM_RM_PENDING) != 0 &&
+ sp->shm_perm.ipc_ref == 2) {
+ /*
+ * If this is the last detach of the segment across the whole
+ * system then now we can perform the delayed IPC_RMID.
+		 * The ipc_ref count has one for the original 'get' and one for
+ * each 'attach' (see 'stat' handling in shmctl).
+ */
+ sp->shm_opts &= ~SHM_RM_PENDING;
+ mutex_enter(&shm_svc->ipcs_lock);
+ ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */
+ ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock));
+ ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0);
+
+ /* Lock was dropped, need to retake it for following rele. */
+ (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
+ }
ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */
kmem_free(sap, sizeof (segacct_t));
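
The delayed-remove lifecycle enabled above follows Linux IPC_RMID semantics,
where a removed-but-attached segment survives until its last detach; SHM_RMID
is the in-kernel command, presumably issued by the lx brand's shmctl
emulation rather than by native userland. A sketch of the sequence:

    id = shmget(key, size, IPC_CREAT | 0600);	/* ipc_ref == 1 */
    addr = shmat(id, NULL, 0);			/* ipc_ref == 2 */
    (void) shmctl(id, SHM_RMID, NULL);		/* marks SHM_RM_PENDING */
    (void) shmdt(addr);				/* last detach: segment
    						   destroyed via ipc_rmsvc() */
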
diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c
index 453b1f22d4..67a93581dd 100644
--- a/usr/src/uts/common/os/sig.c
+++ b/usr/src/uts/common/os/sig.c
@@ -22,7 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -60,6 +60,7 @@
#include <sys/cyclic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
+#include <sys/brand.h>
#include <sys/signalfd.h>
const k_sigset_t nullsmask = {0, 0, 0};
@@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig)
}
/*
+ * Return true if the signal can safely be ignored.
+ * That is, if the signal is included in the p_ignore mask and ignoring it is
+ * not forbidden by the process's brand.
+ */
+static int
+sig_ignorable(proc_t *p, klwp_t *lwp, int sig)
+{
+ return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */
+ !(PROC_IS_BRANDED(p) && /* allowed by brand */
+ BROP(p)->b_sig_ignorable != NULL &&
+ BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE));
+}
+
+/*
* Return true if the signal can safely be discarded on generation.
* That is, if there is no need for the signal on the receiving end.
* The answer is true if the process is a zombie or
@@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig)
* the signal is not being accepted via sigwait()
*/
static int
-sig_discardable(proc_t *p, int sig)
+sig_discardable(proc_t *p, kthread_t *tp, int sig)
{
kthread_t *t = p->p_tlist;
+ klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp;
return (t == NULL || /* if zombie or ... */
- (sigismember(&p->p_ignore, sig) && /* signal is ignored */
+ (sig_ignorable(p, lwp, sig) && /* signal is ignored */
t->t_forw == t && /* and single-threaded */
!tracing(p, sig) && /* and no /proc tracing */
!signal_is_blocked(t, sig) && /* and signal not blocked */
@@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig)
!(ttoproc(t)->p_proc_flag & P_PR_LOCK)) {
ttoproc(t)->p_stopsig = 0;
t->t_dtrace_stop = 0;
- t->t_schedflag |= TS_XSTART | TS_PSTART;
+ t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
setrun_locked(t);
} else if (t != curthread && t->t_state == TS_ONPROC) {
aston(t); /* make it do issig promptly */
@@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig)
}
}
- if (sig_discardable(p, sig)) {
+ if (sig_discardable(p, t, sig)) {
DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist,
proc_t *, p, int, sig);
return;
@@ -497,7 +514,7 @@ issig_justlooking(void)
if (sigismember(&set, sig) &&
(tracing(p, sig) ||
sigismember(&t->t_sigwait, sig) ||
- !sigismember(&p->p_ignore, sig))) {
+ !sig_ignorable(p, lwp, sig))) {
/*
* Don't promote a signal that will stop
* the process when lwp_nostop is set.
@@ -623,6 +640,28 @@ issig_forreal(void)
}
/*
+ * The brand hook name 'b_issig_stop' is a misnomer.
+			 * Give the brand a chance to alter (or suppress) delivery
+ * of this signal.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) {
+ int r;
+
+ /*
+ * The brand hook will return 0 if it would like
+ * us to drive on, -1 if we should restart
+ * the loop to check other conditions, or 1 if we
+ * should terminate the loop.
+ */
+ r = BROP(p)->b_issig_stop(p, lwp);
+ if (r < 0) {
+ continue;
+ } else if (r > 0) {
+ break;
+ }
+ }
+
+ /*
* Honor requested stop before dealing with the
* current signal; a debugger may change it.
* Do not want to go back to loop here since this is a special
@@ -656,7 +695,7 @@ issig_forreal(void)
lwp->lwp_cursig = 0;
lwp->lwp_extsig = 0;
if (sigismember(&t->t_sigwait, sig) ||
- (!sigismember(&p->p_ignore, sig) &&
+ (!sig_ignorable(p, lwp, sig) &&
!isjobstop(sig))) {
if (p->p_flag & (SEXITLWPS|SKILLED)) {
sig = SIGKILL;
@@ -708,7 +747,7 @@ issig_forreal(void)
toproc = 0;
if (tracing(p, sig) ||
sigismember(&t->t_sigwait, sig) ||
- !sigismember(&p->p_ignore, sig)) {
+ !sig_ignorable(p, lwp, sig)) {
if (sigismember(&t->t_extsig, sig))
ext = 1;
break;
@@ -722,7 +761,7 @@ issig_forreal(void)
toproc = 1;
if (tracing(p, sig) ||
sigismember(&t->t_sigwait, sig) ||
- !sigismember(&p->p_ignore, sig)) {
+ !sig_ignorable(p, lwp, sig)) {
if (sigismember(&p->p_extsig, sig))
ext = 1;
break;
@@ -954,6 +993,16 @@ stop(int why, int what)
}
break;
+ case PR_BRAND:
+ /*
+ * We have been stopped by the brand code for a brand-private
+ * reason. This is an asynchronous stop affecting only this
+ * LWP.
+ */
+ VERIFY(PROC_IS_BRANDED(p));
+ flags &= ~TS_BSTART;
+ break;
+
default: /* /proc stop */
flags &= ~TS_PSTART;
/*
@@ -1065,7 +1114,7 @@ stop(int why, int what)
}
}
- if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) {
+ if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) {
/*
* Do process-level notification when all lwps are
* either stopped on events of interest to /proc
@@ -1171,6 +1220,13 @@ stop(int why, int what)
if (why == PR_CHECKPOINT)
del_one_utstop();
+ /*
+ * Allow the brand to post notification of this stop condition.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) {
+ BROP(p)->b_stop_notify(p, lwp, why, what);
+ }
+
thread_lock(t);
ASSERT((t->t_schedflag & TS_ALLSTART) == 0);
t->t_schedflag |= flags;
@@ -1192,7 +1248,7 @@ stop(int why, int what)
(p->p_flag & (SEXITLWPS|SKILLED))) {
p->p_stopsig = 0;
thread_lock(t);
- t->t_schedflag |= TS_XSTART | TS_PSTART;
+ t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
setrun_locked(t);
thread_unlock_nopreempt(t);
} else if (why == PR_JOBCONTROL) {
@@ -1327,7 +1383,7 @@ psig(void)
* this signal from pending to current (we dropped p->p_lock).
* This can happen only in a multi-threaded process.
*/
- if (sigismember(&p->p_ignore, sig) ||
+ if (sig_ignorable(p, lwp, sig) ||
(func == SIG_DFL && sigismember(&stopdefault, sig))) {
lwp->lwp_cursig = 0;
lwp->lwp_extsig = 0;
@@ -1771,9 +1827,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp)
/*
* This can only happen when the parent is init.
* (See call to sigcld(q, NULL) in exit().)
- * Use KM_NOSLEEP to avoid deadlock.
+		 * Use KM_NOSLEEP to avoid deadlock. The child proc's
+		 * initpid can be 1 for zlogin.
*/
- ASSERT(pp == proc_init);
+ ASSERT(pp->p_pidp->pid_id ==
+ cp->p_zone->zone_proc_initpid ||
+ pp->p_pidp->pid_id == 1);
winfo(cp, &info, 0);
sigaddq(pp, NULL, &info, KM_NOSLEEP);
} else {
@@ -1804,6 +1863,15 @@ sigcld_repost()
sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
mutex_enter(&pidlock);
+ if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) {
+ /*
+ * Allow the brand to inject synthetic SIGCLD signals.
+ */
+ if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) {
+ mutex_exit(&pidlock);
+ return;
+ }
+ }
for (cp = pp->p_child; cp; cp = cp->p_sibling) {
if (cp->p_pidflag & CLDPEND) {
post_sigcld(cp, sqp);
@@ -2115,7 +2183,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp)
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(sig >= 1 && sig < NSIG);
- if (sig_discardable(p, sig))
+ if (sig_discardable(p, t, sig))
siginfofree(sigqp);
else
sigaddqins(p, t, sigqp);
@@ -2141,7 +2209,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags)
* blocking the signal (it *could* change it's mind while
* the signal is pending) then don't bother creating one.
*/
- if (!sig_discardable(p, sig) &&
+ if (!sig_discardable(p, t, sig) &&
(sigismember(&p->p_siginfo, sig) ||
(curproc->p_ct_process != p->p_ct_process) ||
(sig == SIGCLD && SI_FROMKERNEL(infop))) &&
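
The b_issig_stop hook used above steers issig_forreal() by its return value:
0 means deliver as usual, -1 restarts the signal-check loop, and 1 leaves the
loop. A hypothetical brand implementation (mybrand_lwp_t and
bl_stop_requested are invented for illustration):

    static int
    mybrand_issig_stop(proc_t *p, klwp_t *lwp)
    {
    	mybrand_lwp_t *bl = lwp->lwp_brand;	/* hypothetical data */

    	if (bl != NULL && bl->bl_stop_requested) {
    		stop(PR_BRAND, 0);	/* brand-private async stop */
    		return (-1);		/* re-check after resuming */
    	}
    	return (0);			/* deliver normally */
    }
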
diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c
index 6084676b17..6dc7230bed 100644
--- a/usr/src/uts/common/os/smb_subr.c
+++ b/usr/src/uts/common/os/smb_subr.c
@@ -25,7 +25,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
+ */
#include <sys/smbios_impl.h>
#include <sys/cmn_err.h>
@@ -43,13 +45,13 @@ smb_strerror(int err)
void *
smb_alloc(size_t len)
{
- return (kmem_alloc(len, KM_SLEEP));
+ return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL);
}
void *
smb_zalloc(size_t len)
{
- return (kmem_zalloc(len, KM_SLEEP));
+ return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL);
}
void
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index d4c2f7023d..68afeef013 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -78,6 +78,7 @@
#include <sys/policy.h>
#include <sys/dld.h>
#include <sys/zone.h>
+#include <sys/limits.h>
#include <c2/audit.h>
/*
@@ -986,12 +987,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
* (registered in sd_wakeq).
*/
struiod_t uiod;
+ struct iovec buf[IOV_MAX_STACK];
+ int iovlen = 0;
if (first)
stp->sd_wakeq &= ~RSLEEP;
- (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
- sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+ if (uiop->uio_iovcnt > IOV_MAX_STACK) {
+ iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
+ uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP);
+ } else {
+ uiod.d_iov = buf;
+ }
+
+ (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
uiod.d_mp = 0;
/*
* Mark that a thread is in rwnext on the read side
@@ -1030,6 +1039,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
if ((bp = uiod.d_mp) != NULL) {
*errorp = 0;
ASSERT(MUTEX_HELD(&stp->sd_lock));
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (bp);
}
error = 0;
@@ -1049,8 +1060,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
} else {
*errorp = error;
ASSERT(MUTEX_HELD(&stp->sd_lock));
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (NULL);
}
+
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
+
/*
* Try a getq in case a rwnext() generated mblk
* has bubbled up via strrput().
@@ -2545,6 +2562,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
int b_flag, int pri, int flags)
{
struiod_t uiod;
+ struct iovec buf[IOV_MAX_STACK];
+ int iovlen = 0;
mblk_t *mp;
queue_t *wqp = stp->sd_wrq;
int error = 0;
@@ -2636,13 +2655,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
mp->b_flag |= b_flag;
mp->b_band = (uchar_t)pri;
- (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
- sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+ if (uiop->uio_iovcnt > IOV_MAX_STACK) {
+ iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
+ uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP);
+ } else {
+ uiod.d_iov = buf;
+ }
+
+ (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
uiod.d_uio.uio_offset = 0;
uiod.d_mp = mp;
error = rwnext(wqp, &uiod);
if (! uiod.d_mp) {
uioskip(uiop, *iosize);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (error);
}
ASSERT(mp == uiod.d_mp);
@@ -2660,17 +2687,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
error = 0;
} else {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (error);
}
/* Have to check canput before consuming data from the uio */
if (pri == 0) {
if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (EWOULDBLOCK);
}
} else {
if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (EWOULDBLOCK);
}
}
@@ -2678,6 +2711,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
/* Copyin data from the uio */
if ((error = struioget(wqp, mp, &uiod, 0)) != 0) {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (error);
}
uioskip(uiop, *iosize);
@@ -2694,6 +2729,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
putnext(wqp, mp);
stream_runservice(stp);
}
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (0);
}
@@ -3179,6 +3216,7 @@ job_control_type(int cmd)
case JAGENT: /* Obsolete */
case JTRUN: /* Obsolete */
case JXTPROTO: /* Obsolete */
+ case TIOCSETLD:
return (JCSETP);
}
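
The strget() and strput() changes above replace the fixed-size iovec duplication with a hybrid scheme: vectors of up to IOV_MAX_STACK entries use an on-stack buffer, larger ones are duplicated on the heap, and every exit path frees the heap copy. A self-contained userland sketch of that pattern (the IOV_MAX_STACK value and function name here are illustrative):

#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

#define IOV_MAX_STACK 16 /* assumed value; the kernel's lives in sys/limits.h */

static int
dup_iovs_sketch(const struct iovec *iov, int iovcnt)
{
    struct iovec buf[IOV_MAX_STACK];
    struct iovec *dup = buf;
    size_t iovlen = 0;

    if (iovcnt > IOV_MAX_STACK) {
        iovlen = iovcnt * sizeof (struct iovec);
        if ((dup = malloc(iovlen)) == NULL)
            return (-1);
    }
    memcpy(dup, iov, iovcnt * sizeof (struct iovec));

    /* ... operate on the duplicate ... */

    if (iovlen != 0)    /* heap copy only; the stack buffer needs no free */
        free(dup);
    return (0);
}
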
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index 1ffb561428..ac1ee2d1ce 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -26,6 +26,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -8470,6 +8471,12 @@ mblk_copycred(mblk_t *mp, const mblk_t *src)
dbp->db_cpid = cpid;
}
+
+/*
+ * Now that NIC drivers are expected to deal only with M_DATA mblks, the
+ * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their
+ * respective mac_hcksum_set and mac_hcksum_get counterparts.
+ */
int
hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd,
uint32_t start, uint32_t stuff, uint32_t end, uint32_t value,
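
For reference, a hedged sketch of the preferred interface named in the comment above, assuming the mac_hcksum_get()/mac_hcksum_set() declarations from <sys/mac_provider.h>; the helper name is illustrative:

#include <sys/stream.h>
#include <sys/mac_provider.h>

/*
 * Copy hardware checksum state between M_DATA mblks using the
 * mac_hcksum routines rather than the deprecated hcksum_* ones.
 */
static void
copy_hcksum_sketch(mblk_t *dst, mblk_t *src)
{
    uint32_t start, stuff, end, value, flags;

    mac_hcksum_get(src, &start, &stuff, &end, &value, &flags);
    mac_hcksum_set(dst, start, stuff, end, value, flags);
}
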
diff --git a/usr/src/uts/common/os/subr.c b/usr/src/uts/common/os/subr.c
index 8ca338a986..ee7293db9a 100644
--- a/usr/src/uts/common/os/subr.c
+++ b/usr/src/uts/common/os/subr.c
@@ -23,8 +23,12 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* All Rights Reserved */
#include <sys/types.h>
#include <sys/sysmacros.h>
@@ -308,46 +312,60 @@ uchar_t bcd_to_byte[256] = { /* CSTYLED */
/*
* Hot-patch a single instruction in the kernel's text.
- * If you want to patch multiple instructions you must
- * arrange to do it so that all intermediate stages are
- * sane -- we don't stop other cpus while doing this.
+ *
+ * If you want to patch multiple instructions you must arrange to do it so that
+ * all intermediate stages are sane -- we don't stop other cpus while doing
+ * this.
+ *
* Size must be 1, 2, or 4 bytes with iaddr aligned accordingly.
+ *
+ * The instruction itself might straddle a page boundary, so we have to account
+ * for that.
*/
void
hot_patch_kernel_text(caddr_t iaddr, uint32_t new_instr, uint_t size)
{
+ const uintptr_t pageoff = (uintptr_t)iaddr & PAGEOFFSET;
+ const boolean_t straddles = (pageoff + size > PAGESIZE);
+ const size_t mapsize = straddles ? PAGESIZE * 2 : PAGESIZE;
+ caddr_t ipageaddr = iaddr - pageoff;
caddr_t vaddr;
page_t **ppp;
- uintptr_t off = (uintptr_t)iaddr & PAGEOFFSET;
- vaddr = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
+ vaddr = vmem_alloc(heap_arena, mapsize, VM_SLEEP);
- (void) as_pagelock(&kas, &ppp, iaddr - off, PAGESIZE, S_WRITE);
+ (void) as_pagelock(&kas, &ppp, ipageaddr, mapsize, S_WRITE);
hat_devload(kas.a_hat, vaddr, PAGESIZE,
- hat_getpfnum(kas.a_hat, iaddr - off),
- PROT_READ | PROT_WRITE, HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
+ hat_getpfnum(kas.a_hat, ipageaddr), PROT_READ | PROT_WRITE,
+ HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
+
+ if (straddles) {
+ hat_devload(kas.a_hat, vaddr + PAGESIZE, PAGESIZE,
+ hat_getpfnum(kas.a_hat, ipageaddr + PAGESIZE),
+ PROT_READ | PROT_WRITE, HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
+ }
switch (size) {
case 1:
- *(uint8_t *)(vaddr + off) = new_instr;
+ *(uint8_t *)(vaddr + pageoff) = new_instr;
break;
case 2:
- *(uint16_t *)(vaddr + off) = new_instr;
+ *(uint16_t *)(vaddr + pageoff) = new_instr;
break;
case 4:
- *(uint32_t *)(vaddr + off) = new_instr;
+ *(uint32_t *)(vaddr + pageoff) = new_instr;
break;
default:
panic("illegal hot-patch");
}
membar_enter();
- sync_icache(vaddr + off, size);
+ sync_icache(vaddr + pageoff, size);
sync_icache(iaddr, size);
- as_pageunlock(&kas, ppp, iaddr - off, PAGESIZE, S_WRITE);
- hat_unload(kas.a_hat, vaddr, PAGESIZE, HAT_UNLOAD_UNLOCK);
- vmem_free(heap_arena, vaddr, PAGESIZE);
+ as_pageunlock(&kas, ppp, ipageaddr, mapsize, S_WRITE);
+ hat_unload(kas.a_hat, vaddr, mapsize, HAT_UNLOAD_UNLOCK);
+ vmem_free(heap_arena, vaddr, mapsize);
}
/*
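
The straddle handling above is easy to sanity-check: a 4-byte patch at page offset 0xffd runs past the 0x1000 boundary, so two pages must be mapped. A standalone worked example (the address and page size are illustrative):

#include <stdio.h>
#include <stdint.h>

#define PAGESIZE 4096u /* assumed x86 page size */

int
main(void)
{
    uintptr_t iaddr = 0x1000ffdu; /* hypothetical instruction address */
    unsigned int size = 4;        /* patching a 4-byte instruction */
    uintptr_t pageoff = iaddr & (PAGESIZE - 1);
    int straddles = (pageoff + size > PAGESIZE);
    size_t mapsize = straddles ? 2 * PAGESIZE : PAGESIZE;

    /* 0xffd + 4 = 0x1001 > 0x1000, so mapsize comes out to 8192 */
    (void) printf("pageoff=0x%lx straddles=%d mapsize=%zu\n",
        (unsigned long)pageoff, straddles, mapsize);
    return (0);
}
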
diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c
index c39819156d..e0cc20fa45 100644
--- a/usr/src/uts/common/os/sunddi.c
+++ b/usr/src/uts/common/os/sunddi.c
@@ -5903,6 +5903,12 @@ ddi_ffs(long mask)
return (ffs(mask));
}
+int
+ddi_ffsll(long long mask)
+{
+ return (ffs(mask));
+}
+
/*
* Find last bit set. Take mask and clear
* all but the most significant bit, and
@@ -5914,8 +5920,14 @@ ddi_ffs(long mask)
int
ddi_fls(long mask)
{
+ return (ddi_flsll(mask));
+}
+
+int
+ddi_flsll(long long mask)
+{
while (mask) {
- long nx;
+ long long nx;
if ((nx = (mask & (mask - 1))) == 0)
break;
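
The loop that ddi_fls() now shares with ddi_flsll() relies on mask & (mask - 1) clearing the lowest set bit; repeating that until one bit remains isolates the most significant bit. A standalone sketch of the computation (names are illustrative):

#include <stdio.h>

static int
flsll_sketch(unsigned long long mask)
{
    unsigned long long nx;
    int pos = 0;

    while (mask != 0) {
        if ((nx = (mask & (mask - 1))) == 0)
            break;
        mask = nx;
    }
    /* mask is now zero or a single bit; count its 1-based position. */
    while (mask != 0) {
        mask >>= 1;
        pos++;
    }
    return (pos);
}

int
main(void)
{
    (void) printf("%d\n", flsll_sketch(0x500ULL)); /* prints 11 */
    return (0);
}
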
diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c
index fb8bf07077..fb64000e4d 100644
--- a/usr/src/uts/common/os/sysent.c
+++ b/usr/src/uts/common/os/sysent.c
@@ -23,6 +23,7 @@
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012 Milan Jurik. All rights reserved.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright (c) 2018, Joyent, Inc.
*/
@@ -61,8 +62,7 @@ struct mmaplf32a;
int access(char *, int);
int alarm(int);
int auditsys(struct auditcalls *, rval_t *);
-int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
- uintptr_t);
+int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
intptr_t brk(caddr_t);
int chdir(char *);
int chmod(char *, int);
@@ -647,7 +647,7 @@ struct sysent sysent[NSYSCALL] =
SYSENT_NOSYS(),
SYSENT_C("llseek", llseek32, 4)),
/* 176 */ SYSENT_LOADABLE(), /* inst_sync */
- /* 177 */ SYSENT_CI("brandsys", brandsys, 6),
+ /* 177 */ SYSENT_CI("brandsys", brandsys, 5),
/* 178 */ SYSENT_LOADABLE(), /* kaio */
/* 179 */ SYSENT_LOADABLE(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
@@ -1002,7 +1002,7 @@ struct sysent sysent32[NSYSCALL] =
/* 174 */ SYSENT_CI("pwrite", pwrite32, 4),
/* 175 */ SYSENT_C("llseek", llseek32, 4),
/* 176 */ SYSENT_LOADABLE32(), /* inst_sync */
- /* 177 */ SYSENT_CI("brandsys", brandsys, 6),
+ /* 177 */ SYSENT_CI("brandsys", brandsys, 5),
/* 178 */ SYSENT_LOADABLE32(), /* kaio */
/* 179 */ SYSENT_LOADABLE32(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
@@ -1094,18 +1094,20 @@ char **syscallnames;
systrace_sysent_t *systrace_sysent;
void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t,
- uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
/*ARGSUSED*/
void
systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
- uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+ uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5,
+ uintptr_t arg6, uintptr_t arg7)
{}
/*ARGSUSED*/
int64_t
dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
- uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+ uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6,
+ uintptr_t arg7)
{
systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum];
dtrace_id_t id;
@@ -1113,7 +1115,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
proc_t *p;
if ((id = sy->stsy_entry) != DTRACE_IDNONE)
- (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5);
+ (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7);
/*
* We want to explicitly allow DTrace consumers to stop a process
@@ -1127,14 +1130,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
}
mutex_exit(&p->p_lock);
- rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5);
+ rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7);
if (ttolwp(curthread)->lwp_errno != 0)
rval = -1;
if ((id = sy->stsy_return) != DTRACE_IDNONE)
(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval,
- (uintptr_t)((int64_t)rval >> 32), 0, 0, 0);
+ (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0);
return (rval);
}
@@ -1146,7 +1150,8 @@ systrace_sysent_t *systrace_sysent32;
/*ARGSUSED*/
int64_t
dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
- uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+ uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6,
+ uintptr_t arg7)
{
systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum];
dtrace_id_t id;
@@ -1154,7 +1159,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
proc_t *p;
if ((id = sy->stsy_entry) != DTRACE_IDNONE)
- (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5);
+ (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6,
+ arg7);
/*
* We want to explicitly allow DTrace consumers to stop a process
@@ -1168,14 +1174,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
}
mutex_exit(&p->p_lock);
- rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5);
+ rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6,
+ arg7);
if (ttolwp(curthread)->lwp_errno != 0)
rval = -1;
if ((id = sy->stsy_return) != DTRACE_IDNONE)
(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval,
- (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0);
+ (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0);
return (rval);
}
@@ -1203,5 +1210,5 @@ dtrace_systrace_rtt(void)
}
if ((id = sy->stsy_return) != DTRACE_IDNONE)
- (*systrace_probe)(id, 0, 0, 0, 0, 0, 0);
+ (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0);
}
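
The widened dispatch above means every probe and underlying handler now carries eight uintptr_t arguments, so arg6/arg7 forward without a prototype mismatch. A minimal sketch of that shape (names are illustrative):

#include <stdio.h>
#include <stdint.h>

typedef int64_t (*syscall8_t)(uintptr_t, uintptr_t, uintptr_t, uintptr_t,
    uintptr_t, uintptr_t, uintptr_t, uintptr_t);

static int64_t
handler_sketch(uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3,
    uintptr_t a4, uintptr_t a5, uintptr_t a6, uintptr_t a7)
{
    /* A real handler would dispatch; summing just exercises all eight. */
    return ((int64_t)(a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7));
}

int
main(void)
{
    syscall8_t fn = handler_sketch;

    (void) printf("%lld\n", (long long)fn(1, 2, 3, 4, 5, 6, 7, 8));
    return (0);
}
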
diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c
index b25a6cbcf1..5453ebf380 100644
--- a/usr/src/uts/common/os/timer.c
+++ b/usr/src/uts/common/os/timer.c
@@ -25,11 +25,12 @@
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/timer.h>
#include <sys/systm.h>
+#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/debug.h>
@@ -81,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it)
* waiters. p_lock must be held on entry; it will not be dropped by
* timer_unlock().
*/
+/* ARGSUSED */
static void
timer_unlock(proc_t *p, itimer_t *it)
{
@@ -123,6 +125,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
timer_lock(p, it);
}
+ ASSERT(p->p_itimer_sz > tid);
ASSERT(p->p_itimer[tid] == it);
p->p_itimer[tid] = NULL;
@@ -137,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
it->it_backend->clk_timer_delete(it);
- if (it->it_portev) {
+ if (it->it_flags & IT_PORT) {
mutex_enter(&it->it_mutex);
if (it->it_portev) {
port_kevent_t *pev;
@@ -199,18 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
static itimer_t *
timer_grab(proc_t *p, timer_t tid)
{
- itimer_t **itp, *it;
+ itimer_t *it;
- if (tid >= timer_max || tid < 0)
+ if (tid < 0) {
return (NULL);
+ }
mutex_enter(&p->p_lock);
-
- if ((itp = p->p_itimer) == NULL || (it = itp[tid]) == NULL) {
+ if (p->p_itimer == NULL || tid >= p->p_itimer_sz ||
+ (it = p->p_itimer[tid]) == NULL) {
mutex_exit(&p->p_lock);
return (NULL);
}
+ /* This may drop p_lock temporarily. */
timer_lock(p, it);
if (it->it_lock & ITLK_REMOVE) {
@@ -232,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid)
* should not be held on entry; timer_release() will acquire p_lock but
* will drop it before returning.
*/
-static void
+void
timer_release(proc_t *p, itimer_t *it)
{
mutex_enter(&p->p_lock);
@@ -245,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it)
* p_lock should not be held on entry; timer_delete_grabbed() will acquire
* p_lock, but will drop it before returning.
*/
-static void
+void
timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it)
{
mutex_enter(&p->p_lock);
@@ -258,6 +263,13 @@ clock_timer_init()
{
clock_timer_cache = kmem_cache_create("timer_cache",
sizeof (itimer_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ /*
+ * Push the timer_max limit up to at least 4 * NCPU. Due to the way
+ * NCPU is defined, proper initialization of the timer limit is
+ * performed at runtime.
+ */
+ timer_max = MAX(NCPU * 4, timer_max);
}
void
@@ -453,6 +465,9 @@ timer_fire(itimer_t *it)
it->it_pending = 1;
port_send_event((port_kevent_t *)it->it_portev);
mutex_exit(&it->it_mutex);
+ } else if (it->it_flags & IT_CALLBACK) {
+ it->it_cb_func(it);
+ ASSERT(MUTEX_NOT_HELD(&it->it_mutex));
} else if (it->it_flags & IT_SIGNAL) {
it->it_pending = 1;
mutex_exit(&it->it_mutex);
@@ -466,159 +481,175 @@ timer_fire(itimer_t *it)
mutex_exit(&p->p_lock);
}
-int
-timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
+/*
+ * Allocate an itimer_t and find an appropriate slot for it in p_itimer.
+ * Acquires p_lock and holds it on return, regardless of success.
+ */
+static itimer_t *
+timer_alloc(proc_t *p, timer_t *id)
{
- struct sigevent ev;
- proc_t *p = curproc;
- clock_backend_t *backend;
- itimer_t *it, **itp;
- sigqueue_t *sigq;
- cred_t *cr = CRED();
- int error = 0;
- timer_t i;
- port_notify_t tim_pnevp;
- port_kevent_t *pkevp = NULL;
+ itimer_t *it, **itp = NULL;
+ uint_t i;
- if ((backend = CLOCK_BACKEND(clock)) == NULL)
- return (set_errno(EINVAL));
+ ASSERT(MUTEX_NOT_HELD(&p->p_lock));
- if (evp != NULL) {
- /*
- * short copyin() for binary compatibility
- * fetch oldsigevent to determine how much to copy in.
- */
- if (get_udatamodel() == DATAMODEL_NATIVE) {
- if (copyin(evp, &ev, sizeof (struct oldsigevent)))
- return (set_errno(EFAULT));
+ it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP);
+ bzero(it, sizeof (itimer_t));
+ mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);
- if (ev.sigev_notify == SIGEV_PORT ||
- ev.sigev_notify == SIGEV_THREAD) {
- if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp,
- sizeof (port_notify_t)))
- return (set_errno(EFAULT));
+ mutex_enter(&p->p_lock);
+retry:
+ if (p->p_itimer != NULL) {
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if (p->p_itimer[i] == NULL) {
+ itp = &(p->p_itimer[i]);
+ break;
}
-#ifdef _SYSCALL32_IMPL
- } else {
- struct sigevent32 ev32;
- port_notify32_t tim_pnevp32;
+ }
+ }
- if (copyin(evp, &ev32, sizeof (struct oldsigevent32)))
- return (set_errno(EFAULT));
- ev.sigev_notify = ev32.sigev_notify;
- ev.sigev_signo = ev32.sigev_signo;
+ /*
+ * A suitable slot was not found. If possible, allocate (or resize)
+ * the p_itimer array and try again.
+ */
+ if (itp == NULL) {
+ uint_t target_sz = _TIMER_ALLOC_INIT;
+ itimer_t **itp_new;
+
+ if (p->p_itimer != NULL) {
+ ASSERT(p->p_itimer_sz != 0);
+
+ target_sz = p->p_itimer_sz * 2;
+ }
+ /*
+ * Protect against exceeding the max or overflow
+ */
+ if (target_sz > timer_max || target_sz > INT_MAX ||
+ target_sz < p->p_itimer_sz) {
+ kmem_cache_free(clock_timer_cache, it);
+ return (NULL);
+ }
+ mutex_exit(&p->p_lock);
+ itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *),
+ KM_SLEEP);
+ mutex_enter(&p->p_lock);
+ if (target_sz <= p->p_itimer_sz) {
/*
- * See comment in sigqueue32() on handling of 32-bit
- * sigvals in a 64-bit kernel.
+ * A racing thread performed the resize while we were
+ * waiting outside p_lock. Discard our now-useless
+ * allocation and retry.
*/
- ev.sigev_value.sival_int = ev32.sigev_value.sival_int;
- if (ev.sigev_notify == SIGEV_PORT ||
- ev.sigev_notify == SIGEV_THREAD) {
- if (copyin((void *)(uintptr_t)
- ev32.sigev_value.sival_ptr,
- (void *)&tim_pnevp32,
- sizeof (port_notify32_t)))
- return (set_errno(EFAULT));
- tim_pnevp.portnfy_port =
- tim_pnevp32.portnfy_port;
- tim_pnevp.portnfy_user =
- (void *)(uintptr_t)tim_pnevp32.portnfy_user;
+ kmem_free(itp_new, target_sz * sizeof (itimer_t *));
+ goto retry;
+ } else {
+ /*
+ * Instantiate the larger allocation and select the
+ * first fresh entry for use.
+ */
+ if (p->p_itimer != NULL) {
+ uint_t old_sz;
+
+ old_sz = p->p_itimer_sz;
+ bcopy(p->p_itimer, itp_new,
+ old_sz * sizeof (itimer_t *));
+ kmem_free(p->p_itimer,
+ old_sz * sizeof (itimer_t *));
+
+ /*
+ * Short circuit to use the first free entry in
+ * the new allocation. It's possible that
+ * other lower-indexed timers were freed while
+ * p_lock was dropped, but skipping over them
+ * is not harmful at all. In the common case,
+ * we skip the need to walk over an array
+ * filled with timers before arriving at the
+ * slot we know is fresh from the allocation.
+ */
+ i = old_sz;
+ } else {
+ /*
+ * For processes lacking any existing timers,
+ * we can simply select the first entry.
+ */
+ i = 0;
}
-#endif
+ p->p_itimer = itp_new;
+ p->p_itimer_sz = target_sz;
}
- switch (ev.sigev_notify) {
- case SIGEV_NONE:
- break;
- case SIGEV_SIGNAL:
- if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG)
- return (set_errno(EINVAL));
- break;
- case SIGEV_THREAD:
- case SIGEV_PORT:
- break;
- default:
- return (set_errno(EINVAL));
- }
- } else {
- /*
- * Use the clock's default sigevent (this is a structure copy).
- */
- ev = backend->clk_default;
}
+ ASSERT(i <= INT_MAX);
+ *id = (timer_t)i;
+ return (it);
+}
+
+/*
+ * Set up a timer
+ *
+ * This allocates an itimer_t (including a timer_t ID and slot in the process),
+ * wires it up according to the provided sigevent, and associates it with the
+ * desired clock backend. Upon successful completion, the timer will be
+ * locked, preventing it from being armed via timer_settime() or deleted via
+ * timer_delete(). This gives the caller a chance to perform any last minute
+ * manipulations (such as configuring the IT_CALLBACK functionality and/or
+ * copying the timer_t out to userspace) before using timer_release() to unlock
+ * it or timer_delete_grabbed() to delete it.
+ */
+int
+timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp,
+ itimer_t **itp, timer_t *tidp)
+{
+ proc_t *p = curproc;
+ int error = 0;
+ itimer_t *it;
+ sigqueue_t *sigq;
+ timer_t tid;
+
/*
- * We'll allocate our timer and sigqueue now, before we grab p_lock.
- * If we can't find an empty slot, we'll free them before returning.
+ * We'll allocate our sigqueue now, before we grab p_lock.
+ * If we can't find an empty slot, we'll free it before returning.
*/
- it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP);
- bzero(it, sizeof (itimer_t));
- mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);
sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
- mutex_enter(&p->p_lock);
-
/*
- * If this is this process' first timer, we need to attempt to allocate
- * an array of timerstr_t pointers. We drop p_lock to perform the
- * allocation; if we return to discover that p_itimer is non-NULL,
- * we will free our allocation and drive on.
+ * Allocate a timer and choose a slot for it. This acquires p_lock.
*/
- if ((itp = p->p_itimer) == NULL) {
- mutex_exit(&p->p_lock);
- itp = kmem_zalloc(timer_max * sizeof (itimer_t *), KM_SLEEP);
- mutex_enter(&p->p_lock);
-
- if (p->p_itimer == NULL)
- p->p_itimer = itp;
- else {
- kmem_free(itp, timer_max * sizeof (itimer_t *));
- itp = p->p_itimer;
- }
- }
-
- for (i = 0; i < timer_max && itp[i] != NULL; i++)
- continue;
+ it = timer_alloc(p, &tid);
+ ASSERT(MUTEX_HELD(&p->p_lock));
- if (i == timer_max) {
- /*
- * We couldn't find a slot. Drop p_lock, free the preallocated
- * timer and sigqueue, and return an error.
- */
+ if (it == NULL) {
mutex_exit(&p->p_lock);
- kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
-
- return (set_errno(EAGAIN));
+ return (EAGAIN);
}
- ASSERT(i < timer_max && itp[i] == NULL);
+ ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL);
+ ASSERT(evp != NULL);
/*
* If we develop other notification mechanisms, this will need
* to call into (yet another) backend.
*/
- sigq->sq_info.si_signo = ev.sigev_signo;
- if (evp == NULL)
- sigq->sq_info.si_value.sival_int = i;
- else
- sigq->sq_info.si_value = ev.sigev_value;
+ sigq->sq_info.si_signo = evp->sigev_signo;
+ sigq->sq_info.si_value = evp->sigev_value;
sigq->sq_info.si_code = SI_TIMER;
sigq->sq_info.si_pid = p->p_pid;
sigq->sq_info.si_ctid = PRCTID(p);
sigq->sq_info.si_zoneid = getzoneid();
- sigq->sq_info.si_uid = crgetruid(cr);
+ sigq->sq_info.si_uid = crgetruid(CRED());
sigq->sq_func = timer_signal;
sigq->sq_next = NULL;
sigq->sq_backptr = it;
it->it_sigq = sigq;
it->it_backend = backend;
it->it_lock = ITLK_LOCKED;
- itp[i] = it;
-
- if (ev.sigev_notify == SIGEV_THREAD ||
- ev.sigev_notify == SIGEV_PORT) {
+ if (evp->sigev_notify == SIGEV_THREAD ||
+ evp->sigev_notify == SIGEV_PORT) {
int port;
+ port_kevent_t *pkevp = NULL;
+
+ ASSERT(pnp != NULL);
/*
* This timer is programmed to use event port notification when
@@ -638,18 +669,17 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
*/
it->it_flags |= IT_PORT;
- port = tim_pnevp.portnfy_port;
+ port = pnp->portnfy_port;
/* associate timer as event source with the port */
error = port_associate_ksource(port, PORT_SOURCE_TIMER,
(port_source_t **)&it->it_portsrc, timer_close_port,
(void *)it, NULL);
if (error) {
- itp[i] = NULL; /* clear slot */
mutex_exit(&p->p_lock);
kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
- return (set_errno(error));
+ return (error);
}
/* allocate an event structure/slot */
@@ -658,23 +688,24 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
if (error) {
(void) port_dissociate_ksource(port, PORT_SOURCE_TIMER,
(port_source_t *)it->it_portsrc);
- itp[i] = NULL; /* clear slot */
mutex_exit(&p->p_lock);
kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
- return (set_errno(error));
+ return (error);
}
/* initialize event data */
- port_init_event(pkevp, i, tim_pnevp.portnfy_user,
+ port_init_event(pkevp, tid, pnp->portnfy_user,
timer_port_callback, it);
it->it_portev = pkevp;
it->it_portfd = port;
} else {
- if (ev.sigev_notify == SIGEV_SIGNAL)
+ if (evp->sigev_notify == SIGEV_SIGNAL)
it->it_flags |= IT_SIGNAL;
}
+ /* Populate the slot now that the timer is prepped. */
+ p->p_itimer[tid] = it;
mutex_exit(&p->p_lock);
/*
@@ -687,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
it->it_lwp = ttolwp(curthread);
it->it_proc = p;
- if (copyout(&i, tid, sizeof (timer_t)) != 0) {
- error = EFAULT;
- goto err;
- }
-
- /*
- * If we're here, then we have successfully created the timer; we
- * just need to release the timer and return.
- */
- timer_release(p, it);
-
+ *itp = it;
+ *tidp = tid;
return (0);
err:
@@ -708,11 +730,115 @@ err:
* impossible for a removal to be pending.
*/
ASSERT(!(it->it_lock & ITLK_REMOVE));
- timer_delete_grabbed(p, i, it);
+ timer_delete_grabbed(p, tid, it);
- return (set_errno(error));
+ return (error);
}
+
+int
+timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp)
+{
+ int error = 0;
+ proc_t *p = curproc;
+ clock_backend_t *backend;
+ struct sigevent ev;
+ itimer_t *it;
+ timer_t tid;
+ port_notify_t tim_pnevp;
+
+ if ((backend = CLOCK_BACKEND(clock)) == NULL)
+ return (set_errno(EINVAL));
+
+ if (evp != NULL) {
+ /*
+ * short copyin() for binary compatibility
+ * fetch oldsigevent to determine how much to copy in.
+ */
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(evp, &ev, sizeof (struct oldsigevent)))
+ return (set_errno(EFAULT));
+
+ if (ev.sigev_notify == SIGEV_PORT ||
+ ev.sigev_notify == SIGEV_THREAD) {
+ if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp,
+ sizeof (port_notify_t)))
+ return (set_errno(EFAULT));
+ }
+#ifdef _SYSCALL32_IMPL
+ } else {
+ struct sigevent32 ev32;
+ port_notify32_t tim_pnevp32;
+
+ if (copyin(evp, &ev32, sizeof (struct oldsigevent32)))
+ return (set_errno(EFAULT));
+ ev.sigev_notify = ev32.sigev_notify;
+ ev.sigev_signo = ev32.sigev_signo;
+ /*
+ * See comment in sigqueue32() on handling of 32-bit
+ * sigvals in a 64-bit kernel.
+ */
+ ev.sigev_value.sival_int = ev32.sigev_value.sival_int;
+ if (ev.sigev_notify == SIGEV_PORT ||
+ ev.sigev_notify == SIGEV_THREAD) {
+ if (copyin((void *)(uintptr_t)
+ ev32.sigev_value.sival_ptr,
+ (void *)&tim_pnevp32,
+ sizeof (port_notify32_t)))
+ return (set_errno(EFAULT));
+ tim_pnevp.portnfy_port =
+ tim_pnevp32.portnfy_port;
+ tim_pnevp.portnfy_user =
+ (void *)(uintptr_t)tim_pnevp32.portnfy_user;
+ }
+#endif
+ }
+ switch (ev.sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG)
+ return (set_errno(EINVAL));
+ break;
+ case SIGEV_THREAD:
+ case SIGEV_PORT:
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+ } else {
+ /*
+ * Use the clock's default sigevent (this is a structure copy).
+ */
+ ev = backend->clk_default;
+ }
+
+ if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) {
+ return (set_errno(error));
+ }
+
+ /*
+ * Populate si_value with the timer ID if no sigevent was passed in.
+ */
+ if (evp == NULL) {
+ it->it_sigq->sq_info.si_value.sival_int = tid;
+ }
+
+ if (copyout(&tid, tidp, sizeof (timer_t)) != 0) {
+ timer_delete_grabbed(p, tid, it);
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * If we're here, then we have successfully created the timer; we
+ * just need to release the timer and return.
+ */
+ timer_release(p, it);
+
+ return (0);
+}
+
+
int
timer_gettime(timer_t tid, itimerspec_t *val)
{
@@ -832,20 +958,23 @@ timer_getoverrun(timer_t tid)
void
timer_lwpexit(void)
{
- timer_t i;
+ uint_t i;
proc_t *p = curproc;
klwp_t *lwp = ttolwp(curthread);
- itimer_t *it, **itp;
+ itimer_t *it;
ASSERT(MUTEX_HELD(&p->p_lock));
- if ((itp = p->p_itimer) == NULL)
+ if (p->p_itimer == NULL) {
return;
+ }
- for (i = 0; i < timer_max; i++) {
- if ((it = itp[i]) == NULL)
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if ((it = p->p_itimer[i]) == NULL) {
continue;
+ }
+ /* This may drop p_lock temporarily. */
timer_lock(p, it);
if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) {
@@ -876,20 +1005,22 @@ timer_lwpexit(void)
void
timer_lwpbind()
{
- timer_t i;
+ uint_t i;
proc_t *p = curproc;
klwp_t *lwp = ttolwp(curthread);
- itimer_t *it, **itp;
+ itimer_t *it;
ASSERT(MUTEX_HELD(&p->p_lock));
- if ((itp = p->p_itimer) == NULL)
+ if (p->p_itimer == NULL) {
return;
+ }
- for (i = 0; i < timer_max; i++) {
- if ((it = itp[i]) == NULL)
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if ((it = p->p_itimer[i]) == NULL)
continue;
+ /* This may drop p_lock temporarily. */
timer_lock(p, it);
if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) {
@@ -911,16 +1042,19 @@ timer_lwpbind()
void
timer_exit(void)
{
- timer_t i;
+ uint_t i;
proc_t *p = curproc;
ASSERT(p->p_itimer != NULL);
+ ASSERT(p->p_itimer_sz != 0);
- for (i = 0; i < timer_max; i++)
- (void) timer_delete(i);
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ (void) timer_delete((timer_t)i);
+ }
- kmem_free(p->p_itimer, timer_max * sizeof (itimer_t *));
+ kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *));
p->p_itimer = NULL;
+ p->p_itimer_sz = 0;
}
/*
@@ -977,7 +1111,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose)
for (tid = 0; tid < timer_max; tid++) {
if ((it = timer_grab(p, tid)) == NULL)
continue;
- if (it->it_portev) {
+ if (it->it_flags & IT_PORT) {
mutex_enter(&it->it_mutex);
if (it->it_portfd == port) {
port_kevent_t *pev;
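
The timer changes above replace the fixed timer_max-sized p_itimer array with one that starts small and doubles when full. A userland sketch of the slot-or-grow logic, with the kernel's p_lock drop-and-retry race handling elided (the names and initial size are illustrative):

#include <stdlib.h>
#include <string.h>

#define ALLOC_INIT 8 /* assumed starting size; the kernel uses _TIMER_ALLOC_INIT */

/* Return a free slot index in the table, growing it if needed; -1 on failure. */
static int
slot_alloc_sketch(void ***tblp, unsigned int *szp)
{
    unsigned int i, newsz;
    void **newtbl;

    for (i = 0; i < *szp; i++) {
        if ((*tblp)[i] == NULL)
            return ((int)i);
    }

    newsz = (*szp == 0) ? ALLOC_INIT : *szp * 2;
    if ((newtbl = calloc(newsz, sizeof (void *))) == NULL)
        return (-1);
    if (*tblp != NULL) {
        memcpy(newtbl, *tblp, *szp * sizeof (void *));
        free(*tblp);
    }
    i = *szp; /* first fresh entry in the enlarged table */
    *tblp = newtbl;
    *szp = newsz;
    return ((int)i);
}
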
diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c
index 61acc6cf97..53be806026 100644
--- a/usr/src/uts/common/os/timers.c
+++ b/usr/src/uts/common/os/timers.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
/*
@@ -1172,6 +1173,14 @@ timespectohz64(timespec_t *tv)
void
hrt2ts(hrtime_t hrt, timestruc_t *tsp)
{
+#if defined(__amd64)
+ /*
+ * The cleverness explained above is unnecessary on x86_64 CPUs where
+ * modern compilers are able to optimize down to faster operations.
+ */
+ tsp->tv_sec = hrt / NANOSEC;
+ tsp->tv_nsec = hrt % NANOSEC;
+#else
uint32_t sec, nsec, tmp;
tmp = (uint32_t)(hrt >> 30);
@@ -1193,20 +1202,28 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp)
}
tsp->tv_sec = (time_t)sec;
tsp->tv_nsec = nsec;
+#endif /* defined(__amd64) */
}
/*
* Convert from timestruc_t to hrtime_t.
- *
- * The code below is equivalent to:
- *
- * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec;
- *
- * but requires no integer multiply.
*/
hrtime_t
ts2hrt(const timestruc_t *tsp)
{
+#if defined(__amd64) || defined(__i386)
+ /*
+ * On modern x86 CPUs, the simple version is faster.
+ */
+ return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec);
+#else
+ /*
+ * The code below is equivalent to:
+ *
+ * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec;
+ *
+ * but requires no integer multiply.
+ */
hrtime_t hrt;
hrt = tsp->tv_sec;
@@ -1215,6 +1232,7 @@ ts2hrt(const timestruc_t *tsp)
hrt = (hrt << 7) - hrt - hrt - hrt;
hrt = (hrt << 9) + tsp->tv_nsec;
return (hrt);
+#endif /* defined(__amd64) || defined(__i386) */
}
/*
@@ -1246,6 +1264,13 @@ tv2hrt(struct timeval *tvp)
void
hrt2tv(hrtime_t hrt, struct timeval *tvp)
{
+#if defined(__amd64)
+ /*
+ * Like hrt2ts, the simple version is faster on x86_64.
+ */
+ tvp->tv_sec = hrt / NANOSEC;
+ tvp->tv_usec = (hrt % NANOSEC) / (NANOSEC / MICROSEC);
+#else
uint32_t sec, nsec, tmp;
uint32_t q, r, t;
@@ -1267,17 +1292,17 @@ hrt2tv(hrtime_t hrt, struct timeval *tvp)
sec++;
}
tvp->tv_sec = (time_t)sec;
-/*
- * this routine is very similar to hr2ts, but requires microseconds
- * instead of nanoseconds, so an interger divide by 1000 routine
- * completes the conversion
- */
+ /*
+ * this routine is very similar to hr2ts, but requires microseconds
+ * instead of nanoseconds, so an interger divide by 1000 routine
+ * completes the conversion
+ */
t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12);
q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14);
q = q >> 9;
r = nsec - q*1000;
tvp->tv_usec = q + ((r + 24) >> 10);
-
+#endif /* defined(__amd64) */
}
int
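
The amd64 paths above rely on the compiler lowering a divide/modulo by the constant NANOSEC into multiply-by-reciprocal sequences, making the simple form faster than the shift-and-add cleverness. A worked example of the simplified hrt2ts() split (the input value is illustrative):

#include <stdio.h>

#define NANOSEC 1000000000LL

int
main(void)
{
    long long hrt = 3500000001LL; /* 3.5 seconds plus 1 ns */
    long long sec = hrt / NANOSEC;
    long long nsec = hrt % NANOSEC;

    (void) printf("sec=%lld nsec=%lld\n", sec, nsec); /* sec=3 nsec=500000001 */
    return (0);
}
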
diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c
index 608208bbca..f5ee76a2cb 100644
--- a/usr/src/uts/common/os/vm_pageout.c
+++ b/usr/src/uts/common/os/vm_pageout.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -58,6 +59,7 @@
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>
+#include <sys/zone.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -73,7 +75,7 @@ static int checkpage(page_t *, int);
* algorithm. They are initialized to 0, and then computed at boot time
* based on the size of the system. If they are patched non-zero in
* a loaded vmunix they are left alone and may thus be changed per system
- * using adb on the loaded system.
+ * using mdb on the loaded system.
*/
pgcnt_t slowscan = 0;
pgcnt_t fastscan = 0;
@@ -81,6 +83,7 @@ pgcnt_t fastscan = 0;
static pgcnt_t handspreadpages = 0;
static int loopfraction = 2;
static pgcnt_t looppages;
+/* See comment below describing 4% and 80% */
static int min_percent_cpu = 4;
static int max_percent_cpu = 80;
static pgcnt_t maxfastscan = 0;
@@ -98,14 +101,34 @@ pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;
+/* kstats */
+uint64_t low_mem_scan;
+uint64_t zone_cap_scan;
+uint64_t n_throttle;
+
+clock_t zone_pageout_ticks; /* tunable to change zone pagescan ticks */
+
/*
* Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
* are the number of ticks in each wakeup cycle that gives the
* equivalent of some underlying %CPU duty cycle.
- * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
- * awakened every 25 clock ticks. So, converting from %CPU to ticks
- * per wakeup cycle would be x% of 25, that is (x * 100) / 25.
- * So, for example, 4% == 1 tick and 80% == 20 ticks.
+ *
+ * For example, when RATETOSCHEDPAGING is 4 (the default), then schedpaging()
+ * will run 4 times/sec to update pageout scanning parameters and kickoff
+ * the pageout_scanner() thread if necessary.
+ *
+ * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU). When
+ * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed
+ * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1).
+ *
+ * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When
+ * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed
+ * by the scanner in a 1 second interval is 80% of a CPU
+ * (RATETOSCHEDPAGING * 20). There is no point making max_pageout_ticks >25
+ * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec.
+ *
+ * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks
+ * will be 200, so the CPU percentages are the same as when hz is 100.
*
* min_pageout_ticks:
* ticks/wakeup equivalent of min_percent_cpu.
@@ -117,19 +140,29 @@ pgcnt_t desscan;
* Number of clock ticks budgeted for each wakeup cycle.
* Computed each time around by schedpaging().
* Varies between min_pageout_ticks .. max_pageout_ticks,
- * depending on memory pressure.
- *
- * pageout_lbolt:
- * Timestamp of the last time pageout_scanner woke up and started
- * (or resumed) scanning for not recently referenced pages.
+ * depending on memory pressure or zones over their cap.
*/
static clock_t min_pageout_ticks;
static clock_t max_pageout_ticks;
static clock_t pageout_ticks;
-static clock_t pageout_lbolt;
-static uint_t reset_hands;
+#define MAX_PSCAN_THREADS 16
+static boolean_t reset_hands[MAX_PSCAN_THREADS];
+
+/*
+ * These can be tuned in /etc/system or set with mdb.
+ * 'des_page_scanners' is the desired number of page scanner threads. The
+ * system will bring the actual number of threads into line with the desired
+ * number. If des_page_scanners is set to an invalid value, the system will
+ * correct the setting.
+ */
+uint_t des_page_scanners;
+uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */
+
+uint_t n_page_scanners;
+static pgcnt_t pscan_region_sz; /* informational only */
+
#define PAGES_POLL_MASK 1023
@@ -145,33 +178,37 @@ static uint_t reset_hands;
* pageout_sample_pages:
* The accumulated number of pages scanned during sampling.
*
- * pageout_sample_ticks:
- * The accumulated clock ticks for the sample.
+ * pageout_sample_etime:
+ * The accumulated number of nanoseconds for the sample.
*
* pageout_rate:
- * Rate in pages/nanosecond, computed at the end of sampling.
+ * Rate in pages/second, computed at the end of sampling.
*
* pageout_new_spread:
- * The new value to use for fastscan and handspreadpages.
- * Calculated after enough samples have been taken.
+ * The new value to use for maxfastscan and (perhaps) handspreadpages.
+ * Intended to be the number of pages that can be scanned per sec using ~10%
+ * of a CPU. Calculated after enough samples have been taken.
+ * pageout_rate / 10
*/
typedef hrtime_t hrrate_t;
-static uint64_t pageout_sample_lim = 4;
-static uint64_t pageout_sample_cnt = 0;
+static uint_t pageout_sample_lim = 4;
+static uint_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;
-static clock_t pageout_cycle_ticks;
-static hrtime_t sample_start, sample_end;
static hrtime_t pageout_sample_etime = 0;
+/* True if page scanner is first starting up */
+#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
+
/*
* Record number of times a pageout_scanner wakeup cycle finished because it
* timed out (exceeded its CPU budget), rather than because it visited
- * its budgeted number of pages.
+ * its budgeted number of pages. This is only done when scanning under low
+ * free memory conditions, not when scanning for zones over their cap.
*/
uint64_t pageout_timeouts = 0;
@@ -194,25 +231,35 @@ kcondvar_t memavail_cv;
#define LOOPPAGES total_pages
/*
- * Set up the paging constants for the clock algorithm.
- * Called after the system is initialized and the amount of memory
- * and number of paging devices is known.
+ * Local boolean to control scanning when zones are over their cap. Avoids
+ * accessing the zone_num_over_cap variable except within schedpaging(), which
+ * only runs periodically. This is here only to reduce our access to
+ * zone_num_over_cap, since it is already accessed a lot during paging, and
+ * the page scanner accesses the zones_over variable on each page during a
+ * scan. There is no lock needed for zone_num_over_cap since schedpaging()
+ * doesn't modify the variable, it only cares if the variable is 0 or non-0.
+ */
+static boolean_t zones_over = B_FALSE;
+
+/*
+ * Set up the paging constants for the page scanner clock-hand algorithm.
+ * Called at startup after the system is initialized and the amount of memory
+ * and number of paging devices is known (recalc will be 0). Called again once
+ * PAGE_SCAN_STARTUP is true after the scanner has collected enough samples
+ * (recalc will be 1).
+ *
+ * Will also be called after a memory dynamic reconfiguration operation and
+ * recalc will be 1 in those cases too.
*
- * lotsfree is 1/64 of memory, but at least 512K.
+ * lotsfree is 1/64 of memory, but at least 512K (ha!).
* desfree is 1/2 of lotsfree.
* minfree is 1/2 of desfree.
- *
- * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
- *
- * lotsfree = btop(512K)
- * desfree = btop(200K)
- * minfree = btop(100K)
- * throttlefree = INT_MIN
- * max_percent_cpu = 4
*/
void
setupclock(int recalc)
{
+ uint_t i;
+ pgcnt_t sz, tmp;
static spgcnt_t init_lfree, init_dfree, init_mfree;
static spgcnt_t init_tfree, init_preserve, init_mpgio;
@@ -221,8 +268,8 @@ setupclock(int recalc)
looppages = LOOPPAGES;
/*
- * setupclock can now be called to recalculate the paging
- * parameters in the case of dynamic addition of memory.
+ * setupclock can be called to recalculate the paging
+ * parameters in the case of dynamic reconfiguration of memory.
* So to make sure we make the proper calculations, if such a
* situation should arise, we save away the initial values
* of each parameter so we can recall them when needed. This
@@ -311,105 +358,98 @@ setupclock(int recalc)
maxpgio = init_mpgio;
/*
- * The clock scan rate varies between fastscan and slowscan
- * based on the amount of free memory available. Fastscan
- * rate should be set based on the number pages that can be
- * scanned per sec using ~10% of processor time. Since this
- * value depends on the processor, MMU, Mhz etc., it is
- * difficult to determine it in a generic manner for all
- * architectures.
+ * When the system is in a low memory state, the page scan rate varies
+ * between fastscan and slowscan based on the amount of free memory
+ * available. When only zones are over their memory cap, the scan rate
+ * is always fastscan.
*
- * Instead of trying to determine the number of pages scanned
- * per sec for every processor, fastscan is set to be the smaller
- * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
- * time is limited to ~4% of processor time.
+ * The fastscan rate should be set based on the number of pages that can
+ * be scanned per sec using ~10% of a CPU. Since this value depends on
+ * the processor, MMU, Ghz etc., it must be determined dynamically.
*
- * Setting fastscan to be 1/2 of memory allows pageout to scan
- * all of memory in ~2 secs. This implies that user pages not
- * accessed within 1 sec (assuming, handspreadpages == fastscan)
- * can be reclaimed when free memory is very low. Stealing pages
- * not accessed within 1 sec seems reasonable and ensures that
- * active user processes don't thrash.
+ * When the scanner first starts up, fastscan will be set to 0 and
+ * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages).
+ * However, once the scanner has collected enough samples, then fastscan
+ * is set to be the smaller of 1/2 of memory (looppages / loopfraction)
+ * or maxfastscan (which is set from pageout_new_spread). Thus,
+ * MAXHANDSPREADPAGES is irrelevant after the scanner is fully
+ * initialized.
*
- * Smaller values of fastscan result in scanning fewer pages
- * every second and consequently pageout may not be able to free
- * sufficient memory to maintain the minimum threshold. Larger
- * values of fastscan result in scanning a lot more pages which
- * could lead to thrashing and higher CPU usage.
+ * pageout_new_spread is calculated when the scanner first starts
+ * running. During this initial sampling period the nscan_limit
+ * is set to the total_pages of system memory. Thus, the scanner could
+ * theoretically scan all of memory in one pass. However, each sample
+ * is also limited by the %CPU budget. This is controlled by
+ * pageout_ticks which is set in schedpaging(). During the sampling
+ * period, pageout_ticks is set to max_pageout_ticks. This tick value
+ * is derived from the max_percent_cpu (80%) described above. On a
+ * system with more than a small amount of memory (~8GB), the scanner's
+ * %CPU will be the limiting factor in calculating pageout_new_spread.
*
- * Fastscan needs to be limited to a maximum value and should not
- * scale with memory to prevent pageout from consuming too much
- * time for scanning on slow CPU's and avoid thrashing, as a
- * result of scanning too many pages, on faster CPU's.
- * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
- * (the upper bound for fastscan) based on the average number
- * of pages that can potentially be scanned in ~1 sec (using ~4%
- * of the CPU) on some of the following machines that currently
- * run Solaris 2.x:
+ * At the end of the sampling period, the pageout_rate indicates how
+ * many pages could be scanned per second. The pageout_new_spread is
+ * then set to be 1/10th of that (i.e. approximating 10% of a CPU).
+ * Of course, this value could still be more than the physical memory
+ * on the system. If so, fastscan is set to 1/2 of memory, as
+ * mentioned above.
*
- * average memory scanned in ~1 sec
+ * All of this leads up to the setting of handspreadpages, which is
+ * set to fastscan. This is the distance, in pages, between the front
+ * and back hands during scanning. It will dictate which pages will
+ * be considered "hot" on the backhand and which pages will be "cold"
+ * and reclaimed.
*
- * 25 Mhz SS1+: 23 Meg
- * LX: 37 Meg
- * 50 Mhz SC2000: 68 Meg
+ * If the scanner is limited by desscan, then at the highest rate it
+ * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the
+ * scanner is limited by the %CPU, then at the highest rate (20% of a
+ * CPU per cycle) the number of pages scanned could be much less.
*
- * 40 Mhz 486: 26 Meg
- * 66 Mhz 486: 42 Meg
+ * Thus, if the scanner is limited by desscan, then the handspreadpages
+ * setting means 1sec between the front and back hands, but if the
+ * scanner is limited by %CPU, it could be several seconds between the
+ * two hands.
*
- * When free memory falls just below lotsfree, the scan rate
- * goes from 0 to slowscan (i.e., pageout starts running). This
+ * The basic assumption is that at the worst case, stealing pages
+ * not accessed within 1 sec seems reasonable and ensures that active
+ * user processes don't thrash. This is especially true when the system
+ * is in a low memory state.
+ *
+ * There are some additional factors to consider for the case of
+ * scanning when zones are over their cap. In this situation it is
+ * also likely that the machine will have a large physical memory which
+ * will take many seconds to fully scan (due to the %CPU and desscan
+ * limits per cycle). It is probable that there will be few (or 0)
+ * pages attributed to these zones in any single scanning cycle. The
+ * result is that reclaiming enough pages for these zones might take
+ * several additional seconds (this is generally not a problem since
+ * the zone physical cap is just a soft cap).
+ *
+ * This is similar to the typical multi-processor situation in which
+ * pageout is often unable to maintain the minimum paging thresholds
+ * under heavy load due to the fact that user processes running on
+ * other CPU's can be dirtying memory at a much faster pace than
+ * pageout can find pages to free.
+ *
+ * One potential approach to address both of these cases is to enable
+ * more than one CPU to run the page scanner, in such a manner that the
+ * various clock hands don't overlap. However, this also makes it more
+ * difficult to determine the values for fastscan, slowscan and
+ * handspreadpages. This is left as a future enhancement, if necessary.
+ *
+ * When free memory falls just below lotsfree, the scan rate goes from
+ * 0 to slowscan (i.e., the page scanner starts running). This
* transition needs to be smooth and is achieved by ensuring that
* pageout scans a small number of pages to satisfy the transient
* memory demand. This is set to not exceed 100 pages/sec (25 per
* wakeup) since scanning that many pages has no noticeable impact
* on system performance.
*
- * In addition to setting fastscan and slowscan, pageout is
- * limited to using ~4% of the CPU. This results in increasing
- * the time taken to scan all of memory, which in turn means that
- * user processes have a better opportunity of preventing their
- * pages from being stolen. This has a positive effect on
- * interactive and overall system performance when memory demand
- * is high.
- *
- * Thus, the rate at which pages are scanned for replacement will
- * vary linearly between slowscan and the number of pages that
- * can be scanned using ~4% of processor time instead of varying
- * linearly between slowscan and fastscan.
- *
- * Also, the processor time used by pageout will vary from ~1%
- * at slowscan to ~4% at fastscan instead of varying between
- * ~1% at slowscan and ~10% at fastscan.
- *
- * The values chosen for the various VM parameters (fastscan,
- * handspreadpages, etc) are not universally true for all machines,
- * but appear to be a good rule of thumb for the machines we've
- * tested. They have the following ranges:
- *
- * cpu speed: 20 to 70 Mhz
- * page size: 4K to 8K
- * memory size: 16M to 5G
- * page scan rate: 4000 - 17400 4K pages per sec
- *
- * The values need to be re-examined for machines which don't
- * fall into the various ranges (e.g., slower or faster CPUs,
- * smaller or larger pagesizes etc) shown above.
- *
- * On an MP machine, pageout is often unable to maintain the
- * minimum paging thresholds under heavy load. This is due to
- * the fact that user processes running on other CPU's can be
- * dirtying memory at a much faster pace than pageout can find
- * pages to free. The memory demands could be met by enabling
- * more than one CPU to run the clock algorithm in such a manner
- * that the various clock hands don't overlap. This also makes
- * it more difficult to determine the values for fastscan, slowscan
- * and handspreadpages.
- *
- * The swapper is currently used to free up memory when pageout
- * is unable to meet memory demands by swapping out processes.
- * In addition to freeing up memory, swapping also reduces the
- * demand for memory by preventing user processes from running
- * and thereby consuming memory.
+ * The swapper is currently used to free up memory when pageout is
+ * unable to meet memory demands. It does this by swapping out entire
+ * processes. In addition to freeing up memory, swapping also reduces
+ * the demand for memory because the swapped out processes cannot
+ * run, and thereby consume memory. However, this is a pathological
+ * state and performance will generally be considered unacceptable.
*/
if (init_mfscan == 0) {
if (pageout_new_spread != 0)
@@ -419,12 +459,13 @@ setupclock(int recalc)
} else {
maxfastscan = init_mfscan;
}
- if (init_fscan == 0)
+ if (init_fscan == 0) {
fastscan = MIN(looppages / loopfraction, maxfastscan);
- else
+ } else {
fastscan = init_fscan;
- if (fastscan > looppages / loopfraction)
- fastscan = looppages / loopfraction;
+ if (fastscan > looppages / loopfraction)
+ fastscan = looppages / loopfraction;
+ }
/*
* Set slow scan time to 1/10 the fast scan time, but
@@ -444,12 +485,10 @@ setupclock(int recalc)
* decreases as the scan rate rises. It must be < the amount
* of pageable memory.
*
- * Since pageout is limited to ~4% of the CPU, setting handspreadpages
- * to be "fastscan" results in the front hand being a few secs
- * (varies based on the processor speed) ahead of the back hand
- * at fastscan rates. This distance can be further reduced, if
- * necessary, by increasing the processor time used by pageout
- * to be more than ~4% and preferrably not more than ~10%.
+ * Since pageout is limited to the %CPU per cycle, setting
+ * handspreadpages to be "fastscan" results in the front hand being
+ * a few secs (varies based on the processor speed) ahead of the back
+ * hand at fastscan rates.
*
* As a result, user processes have a much better chance of
* referencing their pages before the back hand examines them.
@@ -471,29 +510,78 @@ setupclock(int recalc)
if (handspreadpages >= looppages)
handspreadpages = looppages - 1;
+ if (recalc == 0) {
+ /*
+ * Setup basic values at initialization.
+ */
+ pscan_region_sz = total_pages;
+ des_page_scanners = n_page_scanners = 1;
+ reset_hands[0] = B_TRUE;
+ return;
+ }
+
/*
- * If we have been called to recalculate the parameters,
- * set a flag to re-evaluate the clock hand pointers.
+ * Recalculating
+ *
+ * We originally set the number of page scanners to 1. Now that we
+ * know what the handspreadpages is for a scanner, figure out how many
+ * scanners we should run. We want to ensure that the regions don't
+ * overlap and that they are not touching.
+ *
+ * A default 64GB region size is used as the initial value to calculate
+ * how many scanner threads we should create on lower memory systems.
+ * The idea is to limit the number of threads to a practical value
+ * (e.g. a 64GB machine really only needs one scanner thread). For very
+ * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
+ * threads.
+ *
+ * The scanner threads themselves are evenly spread out around the
+ * memory "clock" in pageout_scanner when we reset the hands, and each
+ * thread will scan all of memory.
*/
- if (recalc)
- reset_hands = 1;
+ sz = (btop(64ULL * 0x40000000ULL));
+ if (sz < handspreadpages) {
+ /*
+ * 64GB is smaller than the separation between the front
+ * and back hands; use double handspreadpages.
+ */
+ sz = handspreadpages << 1;
+ }
+ if (sz > total_pages) {
+ sz = total_pages;
+ }
+ /* Record region size for inspection with mdb, otherwise unused */
+ pscan_region_sz = sz;
+
+ tmp = sz;
+ for (i = 1; tmp < total_pages; i++) {
+ tmp += sz;
+ }
+
+ if (i > MAX_PSCAN_THREADS)
+ i = MAX_PSCAN_THREADS;
+
+ des_page_scanners = i;
}
/*
* Pageout scheduling.
*
* Schedpaging controls the rate at which the page out daemon runs by
- * setting the global variables nscan and desscan RATETOSCHEDPAGING
- * times a second. Nscan records the number of pages pageout has examined
- * in its current pass; schedpaging resets this value to zero each time
- * it runs. Desscan records the number of pages pageout should examine
- * in its next pass; schedpaging sets this value based on the amount of
- * currently available memory.
+ * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING
+ * times a second. The pageout_ticks variable controls the percent of one
+ * CPU that each page scanner thread should consume (see min_percent_cpu
+ * and max_percent_cpu descriptions). The desscan variable records the number
+ * of pages pageout should examine in its next pass; schedpaging sets this
+ * value based on the amount of currently available memory. In addition, the
+ * nscan variable records the number of pages pageout has examined in its
+ * current pass; schedpaging resets this value to zero each time it runs.
*/
-#define RATETOSCHEDPAGING 4 /* hz that is */
+#define RATETOSCHEDPAGING 4 /* times/second */
-static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
+/* held while pageout_scanner or schedpaging are modifying shared data */
+static kmutex_t pageout_mutex;
/*
* Pool of available async pageout putpage requests.
@@ -506,7 +594,7 @@ static kcondvar_t push_cv;
static int async_list_size = 256; /* number of async request structs */
-static void pageout_scanner(void);
+static void pageout_scanner(void *);
/*
* If a page is being shared more than "po_share" times
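
The recalc path in setupclock() above sizes scanner regions at 64GB (or twice handspreadpages if that is larger) and derives the desired thread count by counting how many regions cover physical memory, capped at MAX_PSCAN_THREADS. A worked example with an illustrative memory size:

#include <stdio.h>

#define MAX_PSCAN_THREADS 16

int
main(void)
{
    unsigned long long total_pages = 104857600ULL; /* 400GB of 4K pages */
    unsigned long long sz = 16777216ULL;           /* 64GB region, in 4K pages */
    unsigned long long tmp = sz;
    unsigned int i;

    for (i = 1; tmp < total_pages; i++)
        tmp += sz;
    if (i > MAX_PSCAN_THREADS)
        i = MAX_PSCAN_THREADS;

    (void) printf("scanners=%u\n", i); /* 400GB / 64GB rounds up to 7 */
    return (0);
}
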
@@ -535,67 +623,153 @@ schedpaging(void *arg)
if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
kcage_cageout_wakeup();
- if (mutex_tryenter(&pageout_mutex)) {
- /* pageout() not running */
- nscan = 0;
- vavail = freemem - deficit;
- if (pageout_new_spread != 0)
- vavail -= needfree;
- if (vavail < 0)
- vavail = 0;
- if (vavail > lotsfree)
- vavail = lotsfree;
+ (void) atomic_swap_ulong(&nscan, 0);
+ vavail = freemem - deficit;
+ if (pageout_new_spread != 0)
+ vavail -= needfree;
+ if (vavail < 0)
+ vavail = 0;
+ if (vavail > lotsfree)
+ vavail = lotsfree;
+ /*
+ * Fix for 1161438 (CRS SPR# 73922). All variables
+ * in the original calculation for desscan were 32 bit signed
+ * ints. As freemem approaches 0x0 on a system with 1 Gig or
+ * more of memory, the calculation can overflow. When this
+ * happens, desscan becomes negative and pageout_scanner()
+ * stops paging out.
+ */
+ if ((needfree) && (pageout_new_spread == 0)) {
/*
- * Fix for 1161438 (CRS SPR# 73922). All variables
- * in the original calculation for desscan were 32 bit signed
- * ints. As freemem approaches 0x0 on a system with 1 Gig or
- * more of memory, the calculation can overflow. When this
- * happens, desscan becomes negative and pageout_scanner()
- * stops paging out.
+ * If we've not yet collected enough samples to
+ * calculate a spread, kick into high gear anytime
+ * needfree is non-zero. Note that desscan will not be
+ * the limiting factor for systems with larger memory;
+ * the %CPU will limit the scan. That will also be
+ * maxed out below.
*/
- if ((needfree) && (pageout_new_spread == 0)) {
- /*
- * If we've not yet collected enough samples to
- * calculate a spread, use the old logic of kicking
- * into high gear anytime needfree is non-zero.
- */
- desscan = fastscan / RATETOSCHEDPAGING;
- } else {
- /*
- * Once we've calculated a spread based on system
- * memory and usage, just treat needfree as another
- * form of deficit.
- */
- spgcnt_t faststmp, slowstmp, result;
+ desscan = fastscan / RATETOSCHEDPAGING;
+ } else {
+ /*
+ * Once we've calculated a spread based on system
+ * memory and usage, just treat needfree as another
+ * form of deficit.
+ */
+ spgcnt_t faststmp, slowstmp, result;
+
+ slowstmp = slowscan * vavail;
+ faststmp = fastscan * (lotsfree - vavail);
+ result = (slowstmp + faststmp) /
+ nz(lotsfree) / RATETOSCHEDPAGING;
+ desscan = (pgcnt_t)result;
+ }
+
+ /*
+ * If we've not yet collected enough samples to calculate a
+ * spread, also kick %CPU to the max.
+ */
+ if (pageout_new_spread == 0) {
+ pageout_ticks = max_pageout_ticks;
+ } else {
+ pageout_ticks = min_pageout_ticks +
+ (lotsfree - vavail) *
+ (max_pageout_ticks - min_pageout_ticks) /
+ nz(lotsfree);
+ }
- slowstmp = slowscan * vavail;
- faststmp = fastscan * (lotsfree - vavail);
- result = (slowstmp + faststmp) /
- nz(lotsfree) / RATETOSCHEDPAGING;
- desscan = (pgcnt_t)result;
+ if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
+ /*
+ * We have finished the pagescan initialization and the desired
+ * number of page scanners has changed, either because
+ * initialization just finished, because of a memory DR, or
+ * because des_page_scanners has been modified on the fly (i.e.
+ * by mdb). If we need more scanners, start them now, otherwise
+ * the excess scanners will terminate on their own when they
+ * reset their hands.
+ */
+ uint_t i;
+ uint_t curr_nscan = n_page_scanners;
+ pgcnt_t max = total_pages / handspreadpages;
+
+ if (des_page_scanners > max)
+ des_page_scanners = max;
+
+ if (des_page_scanners > MAX_PSCAN_THREADS) {
+ des_page_scanners = MAX_PSCAN_THREADS;
+ } else if (des_page_scanners == 0) {
+ des_page_scanners = 1;
}
- pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
- (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
+ /*
+ * Each thread has its own entry in the reset_hands array, so
+ * we don't need any locking in pageout_scanner to check the
+ * thread's reset_hands entry. Thus, we use a pre-allocated
+ * fixed size reset_hands array and upper limit on the number
+ * of pagescan threads.
+ *
+ * The reset_hands entries need to be true before we start new
+ * scanners, but if we're reducing, we don't want a race on the
+ * recalculation for the existing threads, so we set
+ * n_page_scanners first.
+ */
+ n_page_scanners = des_page_scanners;
+ for (i = 0; i < MAX_PSCAN_THREADS; i++) {
+ reset_hands[i] = B_TRUE;
+ }
- if (freemem < lotsfree + needfree ||
- pageout_sample_cnt < pageout_sample_lim) {
- TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
- "pageout_cv_signal:freemem %ld", freemem);
- cv_signal(&proc_pageout->p_cv);
- } else {
- /*
- * There are enough free pages, no need to
- * kick the scanner thread. And next time
- * around, keep more of the `highly shared'
- * pages.
- */
- cv_signal_pageout();
- if (po_share > MIN_PO_SHARE) {
- po_share >>= 1;
+ if (des_page_scanners > curr_nscan) {
+ /* Create additional pageout scanner threads. */
+ for (i = curr_nscan; i < des_page_scanners; i++) {
+ (void) lwp_kernel_create(proc_pageout,
+ pageout_scanner, (void *)(uintptr_t)i,
+ TS_RUN, curthread->t_pri);
}
}
+ }
+
+ zones_over = B_FALSE;
+
+ if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
+ if (!PAGE_SCAN_STARTUP)
+ low_mem_scan++;
+ DTRACE_PROBE(schedpage__wake__low);
+ WAKE_PAGEOUT_SCANNER();
+
+ } else if (zone_num_over_cap > 0) {
+ /* One or more zones are over their cap. */
+
+ /* No page limit */
+ desscan = total_pages;
+
+ /*
+ * Increase the scanning CPU% to the max. This implies
+ * 80% of one CPU/sec if the scanner can run each
+ * opportunity. It can also be tuned by setting
+ * zone_pageout_ticks in /etc/system or with mdb.
+ */
+ pageout_ticks = (zone_pageout_ticks != 0) ?
+ zone_pageout_ticks : max_pageout_ticks;
+
+ zones_over = B_TRUE;
+ zone_cap_scan++;
+
+ DTRACE_PROBE(schedpage__wake__zone);
+ WAKE_PAGEOUT_SCANNER();
+
+ } else {
+ /*
+ * There are enough free pages, no need to
+ * kick the scanner thread. And next time
+ * around, keep more of the `highly shared'
+ * pages.
+ */
+ cv_signal_pageout();
+
+ mutex_enter(&pageout_mutex);
+ if (po_share > MIN_PO_SHARE) {
+ po_share >>= 1;
+ }
mutex_exit(&pageout_mutex);
}
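
The desscan computation in the hunk above linearly interpolates the per-cycle scan target between slowscan and fastscan as available memory (vavail) falls from lotsfree to zero. A minimal user-space sketch of that arithmetic follows; the tunable values and the RATETOSCHEDPAGING constant are illustrative assumptions, not the kernel's actual settings:

    /*
     * Sketch: desscan interpolates between slowscan and fastscan as
     * vavail falls from lotsfree to 0.  All values are illustrative.
     */
    #include <stdio.h>

    #define RATETOSCHEDPAGING   4   /* assumed schedpaging() wakeups/sec */

    static long long
    nz(long long x)
    {
        return (x != 0 ? x : 1);    /* guard against divide-by-zero */
    }

    int
    main(void)
    {
        long long lotsfree = 1024 * 1024;   /* pages */
        long long slowscan = 8192;          /* pages/sec */
        long long fastscan = 256 * 1024;    /* pages/sec */
        long long vavail;

        for (vavail = lotsfree; vavail >= 0; vavail -= lotsfree / 4) {
            long long desscan = (slowscan * vavail +
                fastscan * (lotsfree - vavail)) / nz(lotsfree) /
                RATETOSCHEDPAGING;
            (void) printf("vavail=%8lld -> desscan=%7lld pages/cycle\n",
                vavail, desscan);
        }
        return (0);
    }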
@@ -617,36 +791,46 @@ ulong_t push_list_size; /* # of requests on pageout queue */
#define FRONT 1
#define BACK 2
-int dopageout = 1; /* must be non-zero to turn page stealing on */
+int dopageout = 1; /* /etc/system tunable to disable page reclamation */
/*
* The page out daemon, which runs as process 2.
*
- * As long as there are at least lotsfree pages,
- * this process is not run. When the number of free
- * pages stays in the range desfree to lotsfree,
- * this daemon runs through the pages in the loop
- * at a rate determined in schedpaging(). Pageout manages
- * two hands on the clock. The front hand moves through
- * memory, clearing the reference bit,
- * and stealing pages from procs that are over maxrss.
- * The back hand travels a distance behind the front hand,
- * freeing the pages that have not been referenced in the time
- * since the front hand passed. If modified, they are pushed to
- * swap before being freed.
+ * Page out occurs when either:
+ * a) there is less than lotsfree pages,
+ * b) there are one or more zones over their physical memory cap.
+ *
+ * The daemon treats physical memory as a circular array of pages and scans the
+ * pages using a 'two-handed clock' algorithm. The front hand moves through
+ * the pages, clearing the reference bit. The back hand travels a distance
+ * (handspreadpages) behind the front hand, freeing the pages that have not
+ * been referenced in the time since the front hand passed. If modified, they
+ * are first written to their backing store before being freed.
+ *
+ * In order to make page invalidation more responsive on machines with larger
+ * memory, multiple pageout_scanner threads may be created. In this case, the
+ * threads are evenly distributed around the memory "clock face" so that
+ * memory can be reclaimed more quickly (otherwise there can be large regions
+ * in which no pages are reclaimed by a single thread for long stretches,
+ * leading to lag which causes undesirable behavior such as htable stealing).
+ *
+ * As long as there are at least lotsfree pages and no zones over their cap,
+ * pageout_scanner threads are not run. When pageout_scanner threads are
+ * running for case (a), all pages are considered for pageout. For case (b),
+ * only pages belonging to a zone over its cap will be considered for pageout.
*
- * There are 2 threads that act on behalf of the pageout process.
- * One thread scans pages (pageout_scanner) and frees them up if
+ * There are multiple threads that act on behalf of the pageout process.
+ * A set of threads (pageout_scanner) scans pages and frees them up if
* they don't require any VOP_PUTPAGE operation. If a page must be
* written back to its backing store, the request is put on a list
* and the other (pageout) thread is signaled. The pageout thread
* grabs VOP_PUTPAGE requests from the list, and processes them.
* Some filesystems may require resources for the VOP_PUTPAGE
* operations (like memory) and hence can block the pageout
- * thread, but the scanner thread can still operate. There is still
+ * thread, but the pageout_scanner threads can still operate. There is still
* no guarantee that memory deadlocks cannot occur.
*
- * For now, this thing is in very rough form.
+ * The pageout_scanner parameters are determined in schedpaging().
*/
void
pageout()
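
A minimal sketch of the hand-placement geometry described in the comment above, assuming a circular array of n_pages and evenly spaced scanner instances; the helper and its names are illustrative, not the kernel code:

    /*
     * Sketch: per-instance hand placement on the memory "clock face".
     * Instance i parks its backhand i * (n_pages / n_scanners) pages in;
     * the fronthand leads by handspread, capped below one full lap.
     */
    typedef unsigned long pgcnt_t;

    struct hands {
        pgcnt_t back;
        pgcnt_t front;
    };

    static struct hands
    place_hands(pgcnt_t n_pages, pgcnt_t handspread, unsigned int inst,
        unsigned int n_scanners)
    {
        struct hands h;
        pgcnt_t offset = n_pages / n_scanners;

        h.back = (offset * inst) % n_pages;
        if (handspread >= n_pages)
            handspread = n_pages - 1;       /* no more than 360 degrees */
        h.front = (h.back + handspread) % n_pages;
        return (h);
    }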
@@ -684,9 +868,9 @@ pageout()
pageout_pri = curthread->t_pri;
- /* Create the pageout scanner thread. */
- (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
- pageout_pri - 1);
+ /* Create the (first) pageout scanner thread. */
+ (void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0,
+ TS_RUN, pageout_pri - 1);
/*
* kick off pageout scheduler.
@@ -720,6 +904,7 @@ pageout()
arg->a_next = NULL;
mutex_exit(&push_lock);
+ DTRACE_PROBE(pageout__push);
if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
pushes++;
@@ -740,32 +925,24 @@ pageout()
* Kernel thread that scans pages looking for ones to free
*/
static void
-pageout_scanner(void)
+pageout_scanner(void *a)
{
struct page *fronthand, *backhand;
- uint_t count;
+ uint_t count, iter = 0;
callb_cpr_t cprinfo;
- pgcnt_t nscan_limit;
+ pgcnt_t nscan_cnt, nscan_limit;
pgcnt_t pcount;
+ uint_t inst = (uint_t)(uintptr_t)a;
+ hrtime_t sample_start, sample_end;
+ clock_t pageout_lbolt;
+ kmutex_t pscan_mutex;
- CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
- mutex_enter(&pageout_mutex);
+ VERIFY3U(inst, <, MAX_PSCAN_THREADS);
- /*
- * The restart case does not attempt to point the hands at roughly
- * the right point on the assumption that after one circuit things
- * will have settled down - and restarts shouldn't be that often.
- */
+ mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
- /*
- * Set the two clock hands to be separated by a reasonable amount,
- * but no more than 360 degrees apart.
- */
- backhand = page_first();
- if (handspreadpages >= total_pages)
- fronthand = page_nextn(backhand, total_pages - 1);
- else
- fronthand = page_nextn(backhand, handspreadpages);
+ CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
+ mutex_enter(&pscan_mutex);
min_pageout_ticks = MAX(1,
((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
@@ -776,71 +953,116 @@ loop:
cv_signal_pageout();
CALLB_CPR_SAFE_BEGIN(&cprinfo);
- cv_wait(&proc_pageout->p_cv, &pageout_mutex);
- CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
+ cv_wait(&proc_pageout->p_cv, &pscan_mutex);
+ CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
if (!dopageout)
goto loop;
- if (reset_hands) {
- reset_hands = 0;
+ if (reset_hands[inst]) {
+ struct page *first;
+ pgcnt_t offset = total_pages / n_page_scanners;
- backhand = page_first();
- if (handspreadpages >= total_pages)
+ reset_hands[inst] = B_FALSE;
+ if (inst >= n_page_scanners) {
+ /*
+ * The desired number of page scanners has been
+ * reduced and this instance is no longer wanted.
+ * Exit the lwp.
+ */
+ VERIFY3U(inst, !=, 0);
+ mutex_exit(&pscan_mutex);
+ mutex_enter(&curproc->p_lock);
+ lwp_exit();
+ }
+
+ /*
+ * The reset case repositions the hands at the proper place
+ * on the memory clock face to prevent creep into another
+ * thread's active region or when the number of threads has
+ * changed.
+ *
+ * Set the two clock hands to be separated by a reasonable
+ * amount, but no more than 360 degrees apart.
+ *
+ * If inst == 0, backhand starts at first page, otherwise
+ * it is (inst * offset) around the memory "clock face" so that
+ * we spread out each scanner instance evenly.
+ */
+ first = page_first();
+ backhand = page_nextn(first, offset * inst);
+ if (handspreadpages >= total_pages) {
fronthand = page_nextn(backhand, total_pages - 1);
- else
+ } else {
fronthand = page_nextn(backhand, handspreadpages);
+ }
}
+ /*
+ * This CPU kstat is only incremented here and we're obviously on this
+ * CPU, so no lock.
+ */
CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
count = 0;
- TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
- "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
- freemem, lotsfree, nscan, desscan);
-
/* Kernel probe */
TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
pcount = 0;
- if (pageout_sample_cnt < pageout_sample_lim) {
+ nscan_cnt = 0;
+ if (PAGE_SCAN_STARTUP) {
nscan_limit = total_pages;
} else {
nscan_limit = desscan;
}
+
+ DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
+ page_t *, backhand, page_t *, fronthand);
+
pageout_lbolt = ddi_get_lbolt();
sample_start = gethrtime();
/*
* Scan the appropriate number of pages for a single duty cycle.
- * However, stop scanning as soon as there is enough free memory.
- * For a short while, we will be sampling the performance of the
- * scanner and need to keep running just to get sample data, in
- * which case we keep going and don't pay attention to whether
- * or not there is enough free memory.
+ * Only scan while at least one of these is true:
+ * 1) one or more zones is over its cap
+ * 2) there is not enough free memory
+ * 3) during page scan startup when determining sample data
*/
-
- while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
- pageout_sample_cnt < pageout_sample_lim)) {
+ while (nscan_cnt < nscan_limit &&
+ (zones_over ||
+ freemem < lotsfree + needfree ||
+ PAGE_SCAN_STARTUP)) {
int rvfront, rvback;
+ DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
+
/*
* Check to see if we have exceeded our %CPU budget
* for this wakeup, but not on every single page visited,
* just every once in a while.
*/
if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
+ clock_t pageout_cycle_ticks;
+
pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
if (pageout_cycle_ticks >= pageout_ticks) {
- ++pageout_timeouts;
+ /*
+ * This is where we normally break out of the
+ * loop when scanning zones or sampling.
+ */
+ if (!zones_over) {
+ atomic_inc_64(&pageout_timeouts);
+ }
+ DTRACE_PROBE1(pageout__timeout, uint_t, inst);
break;
}
}
/*
* If checkpage manages to add a page to the free list,
- * we give ourselves another couple of trips around the loop.
+ * we give ourselves another couple of trips around memory.
*/
if ((rvfront = checkpage(fronthand, FRONT)) == 1)
count = 0;
@@ -850,7 +1072,8 @@ loop:
++pcount;
/*
- * protected by pageout_mutex instead of cpu_stat_lock
+ * This CPU kstat is only incremented here and we're obviously
+ * on this CPU, so no lock.
*/
CPU_STATS_ADDQ(CPU, vm, scan, 1);
@@ -858,7 +1081,7 @@ loop:
* Don't include ineligible pages in the number scanned.
*/
if (rvfront != -1 || rvback != -1)
- nscan++;
+ nscan_cnt++;
backhand = page_next(backhand);
@@ -868,56 +1091,89 @@ loop:
*/
if ((fronthand = page_next(fronthand)) == page_first()) {
- TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
- "pageout_hand_wrap:freemem %ld whichhand %d",
- freemem, FRONT);
+ DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
/*
- * protected by pageout_mutex instead of cpu_stat_lock
+ * Every pageout_reset_cnt wraps we reposition our hands within
+ * our region to prevent creep into another thread's region.
+ */
+ if ((++iter % pageout_reset_cnt) == 0)
+ reset_hands[inst] = B_TRUE;
+
+ /*
+ * This CPU kstat is only incremented here and we're
+ * obviously on this CPU, so no lock.
*/
CPU_STATS_ADDQ(CPU, vm, rev, 1);
- if (++count > 1) {
+
+ /*
+ * If scanning because the system is low on memory,
+ * then when we wraparound memory we want to try to
+ * reclaim more pages.
+ * If scanning only because zones are over their cap,
+ * then wrapping is common and we simply keep going.
+ */
+ if (freemem < lotsfree + needfree && ++count > 1) {
/*
+ * The system is low on memory.
* Extremely unlikely, but it happens.
- * We went around the loop at least once
- * and didn't get far enough.
+ * We went around memory at least once
+ * and didn't reclaim enough.
* If we are still skipping `highly shared'
* pages, skip fewer of them. Otherwise,
* give up till the next clock tick.
*/
+ mutex_enter(&pageout_mutex);
if (po_share < MAX_PO_SHARE) {
po_share <<= 1;
+ mutex_exit(&pageout_mutex);
} else {
/*
- * Really a "goto loop", but
- * if someone is TRACing or
- * TNF_PROBE_ing, at least
- * make records to show
- * where we are.
+ * Really a "goto loop", but if someone
+ * is tracing or TNF_PROBE_ing, hit
+ * those probes first.
*/
+ mutex_exit(&pageout_mutex);
break;
}
}
}
}
+ atomic_add_long(&nscan, nscan_cnt);
+
sample_end = gethrtime();
- TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
- "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
- freemem, lotsfree, nscan, desscan, count);
+ DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
+ uint_t, inst);
/* Kernel probe */
TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
- tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);
+ tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free,
+ freemem);
- if (pageout_sample_cnt < pageout_sample_lim) {
+ /*
+ * The following two blocks are only relevant when the scanner is
+ * first started up. After the scanner runs for a while, neither of
+ * the conditions will ever be true again.
+ *
+ * The global variables used below are only modified by this thread and
+ * only during initial scanning when there is a single page scanner
+ * thread running. Thus, we don't use any locking.
+ */
+ if (PAGE_SCAN_STARTUP) {
+ VERIFY3U(inst, ==, 0);
pageout_sample_pages += pcount;
pageout_sample_etime += sample_end - sample_start;
++pageout_sample_cnt;
- }
- if (pageout_sample_cnt >= pageout_sample_lim &&
- pageout_new_spread == 0) {
+
+ } else if (pageout_new_spread == 0) {
+ uint_t i;
+
+ /*
+ * We have run enough samples, set the spread.
+ */
+ VERIFY3U(inst, ==, 0);
pageout_rate = (hrrate_t)pageout_sample_pages *
(hrrate_t)(NANOSEC) / pageout_sample_etime;
pageout_new_spread = pageout_rate / 10;
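
As a worked (hypothetical) example of the sampling arithmetic above: visiting 2,000,000 pages in 4 seconds of accumulated scan time yields pageout_rate = 500,000 pages/sec and a spread of 50,000. A compact restatement, with invented numbers:

    /*
     * Sketch of the startup-sampling arithmetic.  The names mirror the
     * kernel globals; the numbers are invented for illustration.
     */
    typedef long long hrrate_t;
    #define NANOSEC 1000000000LL

    static hrrate_t
    compute_spread(void)
    {
        hrrate_t sample_pages = 2000000;        /* pages visited */
        hrrate_t sample_etime = 4 * NANOSEC;    /* ns spent scanning */
        hrrate_t rate = sample_pages * NANOSEC / sample_etime;

        return (rate / 10);     /* 500000 pages/sec -> spread 50000 */
    }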
@@ -931,9 +1187,8 @@ loop:
* Look at the page at hand. If it is locked (e.g., for physical i/o),
* system (u., page table) or free, then leave it alone. Otherwise,
* if we are running the front hand, turn off the page's reference bit.
- * If the proc is over maxrss, we take it. If running the back hand,
- * check whether the page has been reclaimed. If not, free the page,
- * pushing it to disk first if necessary.
+ * If running the back hand, check whether the page has been reclaimed.
+ * If not, free the page, pushing it to disk first if necessary.
*
* Return values:
* -1 if the page is not a candidate at all,
@@ -947,6 +1202,7 @@ checkpage(struct page *pp, int whichhand)
int isfs = 0;
int isexec = 0;
int pagesync_flag;
+ zoneid_t zid = ALL_ZONES;
/*
* Skip pages:
@@ -989,6 +1245,21 @@ checkpage(struct page *pp, int whichhand)
return (-1);
}
+ if (zones_over) {
+ ASSERT(pp->p_zoneid == ALL_ZONES ||
+ (pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID));
+ if (pp->p_zoneid == ALL_ZONES ||
+ zone_pdata[pp->p_zoneid].zpers_over == 0) {
+ /*
+ * Cross-zone shared page, or zone not over its cap.
+ * Leave the page alone.
+ */
+ page_unlock(pp);
+ return (-1);
+ }
+ zid = pp->p_zoneid;
+ }
+
/*
* Maintain statistics for what we are freeing
*/
@@ -1016,31 +1287,24 @@ checkpage(struct page *pp, int whichhand)
recheck:
/*
- * If page is referenced; make unreferenced but reclaimable.
- * If this page is not referenced, then it must be reclaimable
- * and we can add it to the free list.
+ * If the page is referenced, the fronthand clears the reference bit,
+ * making it unreferenced and thus reclaimable on a later pass.
+ * For the backhand, a process referenced the page since the front hand
+ * went by, so it's not a candidate for freeing up.
*/
if (ppattr & P_REF) {
- TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
- "pageout_isref:pp %p whichhand %d", pp, whichhand);
+ DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand);
if (whichhand == FRONT) {
- /*
- * Checking of rss or madvise flags needed here...
- *
- * If not "well-behaved", fall through into the code
- * for not referenced.
- */
hat_clrref(pp);
}
- /*
- * Somebody referenced the page since the front
- * hand went by, so it's not a candidate for
- * freeing up.
- */
page_unlock(pp);
return (0);
}
+ /*
+ * This page is not referenced, so it must be reclaimable and we can
+ * add it to the free list. This can be done by either hand.
+ */
+
VM_STAT_ADD(pageoutvmstats.checkpage[0]);
/*
@@ -1073,8 +1337,9 @@ recheck:
u_offset_t offset = pp->p_offset;
/*
- * XXX - Test for process being swapped out or about to exit?
- * [Can't get back to process(es) using the page.]
+ * Note: There is no way to test for the process being
+ * swapped out or about to exit, since we can't get back to
+ * the process(es) from the page.
*/
/*
@@ -1092,6 +1357,11 @@ recheck:
VN_RELE(vp);
return (0);
}
+ if (isfs) {
+ zone_pageout_stat(zid, ZPO_DIRTY);
+ } else {
+ zone_pageout_stat(zid, ZPO_ANONDIRTY);
+ }
return (1);
}
@@ -1102,8 +1372,7 @@ recheck:
* the pagesync but before it was unloaded we catch it
* and handle the page properly.
*/
- TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
- "pageout_free:pp %p whichhand %d", pp, whichhand);
+ DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand);
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
ppattr = hat_page_getattr(pp, P_MOD | P_REF);
if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
@@ -1120,8 +1389,10 @@ recheck:
} else {
CPU_STATS_ADD_K(vm, fsfree, 1);
}
+ zone_pageout_stat(zid, ZPO_FS);
} else {
CPU_STATS_ADD_K(vm, anonfree, 1);
+ zone_pageout_stat(zid, ZPO_ANON);
}
return (1); /* freed a page! */
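
The zone-cap filtering added to checkpage() above reduces to a small predicate over the zone_pdata array (documented in the zone.c changes below). A sketch, assuming the kernel declarations (zoneid_t, ALL_ZONES, zone_pdata) are in scope; the helper name is illustrative:

    /*
     * Sketch: zone-cap eligibility for a page during a zone scan.
     * Cross-zone shared pages (ALL_ZONES) and pages belonging to
     * zones under their cap are left alone.
     */
    static int
    zone_cap_eligible(zoneid_t zid)
    {
        if (zid == ALL_ZONES)
            return (0);             /* cross-zone shared page */
        return (zone_pdata[zid].zpers_over != 0);
    }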
diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c
index c177ecfd75..ad35fd7187 100644
--- a/usr/src/uts/common/os/vmem.c
+++ b/usr/src/uts/common/os/vmem.c
@@ -1627,7 +1627,7 @@ vmem_destroy(vmem_t *vmp)
leaked = vmem_size(vmp, VMEM_ALLOC);
if (leaked != 0)
- cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s",
+ cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s",
vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?
"identifiers" : "bytes");
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index c759f7e010..1db130797c 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent Inc. All rights reserved.
+ * Copyright (c) 2019, Joyent, Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -106,14 +106,16 @@
* removed from the list of active zones. zone_destroy() returns, and
* the zone can be recreated.
*
- * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
- * callbacks are executed, and all memory associated with the zone is
- * freed.
+ * ZONE_IS_FREE (internal state): All references have been dropped and
+ * the zone_t is no longer in the zone_active nor zone_deathrow lists.
+ * The zone_t is in the process of being freed. This state exists
+ * only for publishing a sysevent to indicate that the zone by this
+ * name can be booted again.
*
- * Threads can wait for the zone to enter a requested state by using
- * zone_status_wait() or zone_status_timedwait() with the desired
- * state passed in as an argument. Zone state transitions are
- * uni-directional; it is not possible to move back to an earlier state.
+ * Threads can wait for the zone to enter a requested state (other than
+ * ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait()
+ * with the desired state passed in as an argument. Zone state transitions
+ * are uni-directional; it is not possible to move back to an earlier state.
*
*
* Zone-Specific Data:
@@ -252,6 +254,8 @@
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
/*
* This constant specifies the number of seconds that threads waiting for
@@ -312,6 +316,7 @@ static id_space_t *zoneid_space;
* 'global_zone'.
*/
zone_t zone0;
+zone_zfs_io_t zone0_zp_zfs;
zone_t *global_zone = NULL; /* Set when the global zone is initialized */
/*
@@ -327,8 +332,8 @@ static list_t zone_active;
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;
-/* number of zones is limited by virtual interface limit in IP */
-uint_t maxzones = 8192;
+/* This can be dynamically reduced if various subsystems hit internal limits. */
+uint_t maxzones = MAX_ZONES;
/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;
@@ -350,6 +355,7 @@ const char *zone_status_table[] = {
ZONE_EVENT_SHUTTING_DOWN, /* down */
ZONE_EVENT_SHUTTING_DOWN, /* dying */
ZONE_EVENT_UNINITIALIZED, /* dead */
+ ZONE_EVENT_FREE, /* free */
};
/*
@@ -372,8 +378,12 @@ static char *zone_ref_subsys_names[] = {
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
+rctl_hndl_t rc_zone_cpu_baseline;
+rctl_hndl_t rc_zone_cpu_burst_time;
+rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
@@ -389,6 +399,7 @@ static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);
+static void zone_status_set(zone_t *, zone_status_t);
typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
@@ -419,8 +430,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
* Version 5 alters the zone_boot system call, and converts its old
* bootargs parameter to be set by the zone_setattr API instead.
* Version 6 adds the flag argument to zone_create.
+ * Version 7 adds the requested zoneid to zone_create.
*/
-static const int ZONE_SYSCALL_API_VERSION = 6;
+static const int ZONE_SYSCALL_API_VERSION = 7;
+
+/*
+ * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent"
+ * data which can be referenced independently of the zone_t structure. This
+ * data falls into two categories;
+ * 1) pages and RSS data associated with processes inside a zone
+ * 2) in-flight ZFS I/O data
+ *
+ * Each member of zone_persist_t stores the zone's current page usage, its page
+ * limit, a flag indicating if the zone is over its physical memory cap and
+ * various page-related statistics. The zpers_over flag is the interface for
+ * the page scanner to use when reclaiming pages for zones that are over their
+ * cap. The zone_persist_t structure also includes a mutex and a reference to a
+ * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data.
+ *
+ * All zone physical memory cap data is stored in this array instead of within
+ * the zone structure itself. This is because zone structures come and go, but
+ * paging-related work can be asynchronous to any particular zone. In
+ * particular:
+ * 1) Page scanning to reclaim pages occurs from a kernel thread that is not
+ * associated with any zone.
+ * 2) Freeing segkp pages can occur long after the zone which first
+ * instantiated those pages has gone away.
+ * We want to be able to account for pages/zone without constantly having to
+ * take extra locks and finding the relevant zone structure, particularly during
+ * page scanning.
+ *
+ * The page scanner can run when "zone_num_over_cap" is non-zero. It can
+ * do a direct lookup of a zoneid into the "zone_pdata" array to determine
+ * if that zone is over its cap.
+ *
+ * There is no locking for the page scanner to perform these two checks.
+ * We cannot have the page scanner blocking normal paging activity for
+ * running processes. Because the physical memory cap is a soft cap, it is
+ * fine for the scanner to simply read the current state of the counter and
+ * the zone's zpers_over entry in the array. The scanner should never modify
+ * either of these items. Internally the entries and the counter are managed
+ * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We
+ * take care to ensure that we only take the zone_physcap_lock mutex when a
+ * zone is transitioning over/under its physical memory cap.
+ *
+ * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage
+ * the "zone_pdata" array and associated counter.
+ *
+ * The zone_persist_t structure tracks the zone's physical cap and physical
+ * usage in terms of pages. These values are currently defined as uint32.
+ * Thus, the maximum number of pages we can track is UINT_MAX - 1 (4,294,967,294)
+ * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a
+ * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size.
+ * In the future we may need to expand these counters to 64-bit, but for now
+ * we're using 32-bit to conserve memory, since this array is statically
+ * allocated within the kernel based on the maximum number of zones supported.
+ *
+ * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under
+ * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we
+ * had to continuously find the zone structure associated with an I/O that has
+ * just completed. To avoid that overhead, we track the I/O data within the
+ * zone_zfs_io_t instead. We can directly access that data without having to
+ * lookup the full zone_t structure.
+ */
+uint_t zone_num_over_cap;
+zone_persist_t zone_pdata[MAX_ZONES];
+static kmutex_t zone_physcap_lock;
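
A sketch of the over-cap transition described in the comment above: only the transition takes zone_physcap_lock, while scanner reads remain lock-free. The helper below is a hypothetical rendering of zone_incr_capped, not the actual implementation:

    /*
     * Hypothetical rendering of the over-cap transition.  Readers (the
     * page scanner) never take the lock; only transitions do, and the
     * state is re-checked once the lock is held.
     */
    static void
    zone_incr_capped(zoneid_t zid)
    {
        zone_persist_t *zp = &zone_pdata[zid];

        if (zp->zpers_pg_cnt >= zp->zpers_pg_limit &&
            zp->zpers_over == 0) {
            mutex_enter(&zone_physcap_lock);
            if (zp->zpers_over == 0) {      /* re-check under the lock */
                zp->zpers_over = 1;
                zone_num_over_cap++;
            }
            mutex_exit(&zone_physcap_lock);
        }
    }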
/*
* Certain filesystems (such as NFS and autofs) need to know which zone
@@ -1379,6 +1454,127 @@ static rctl_ops_t zone_cpu_cap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_cpu_base_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_base(p->p_zone));
+}
+
+/*
+ * The zone cpu base is used to set the baseline CPU for the zone
+ * so we can track when the zone is bursting.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_base(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_base_ops = {
+ rcop_no_action,
+ zone_cpu_base_get,
+ zone_cpu_base_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_burst_time(p->p_zone));
+}
+
+/*
+ * The zone cpu burst time is used to set the amount of time the zone's
+ * CPU(s) may spend bursting.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_burst_time(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_burst_time_ops = {
+ rcop_no_action,
+ zone_cpu_burst_time_get,
+ zone_cpu_burst_time_set,
+ rcop_no_test
+};
+
+/*
+ * zone.zfs-io-pri resource control support (IO priority).
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
+{
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
+ rctl_qty_t r = 0;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri;
+ mutex_exit(&zp->zpers_zfs_lock);
+
+ return (r);
+}
+
+/*ARGSUSED*/
+static int
+zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+ zone_persist_t *zp;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ /*
+ * Set the I/O priority to the new value.
+ */
+ zp = &zone_pdata[zone->zone_id];
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv;
+ mutex_exit(&zp->zpers_zfs_lock);
+ return (0);
+}
+
+static rctl_ops_t zone_zfs_io_pri_ops = {
+ rcop_no_action,
+ zone_zfs_io_pri_get,
+ zone_zfs_io_pri_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
rctl_qty_t nlwps;
@@ -1705,6 +1901,57 @@ static rctl_ops_t zone_max_swap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+{
+ rctl_qty_t q;
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ q = ptob(zp->zpers_pg_cnt);
+ return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zoneid_t zid;
+ uint_t pg_val;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+ if (e->rcep_p.zone == NULL)
+ return (0);
+ zid = e->rcep_p.zone->zone_id;
+ if (nv == UINT64_MAX) {
+ pg_val = UINT32_MAX;
+ } else {
+ uint64_t pages = btop(nv);
+
+ /*
+ * Return from RCTLOP_SET is always ignored so just clamp an
+ * out-of-range value to our largest "limited" value.
+ */
+ if (pages >= UINT32_MAX) {
+ pg_val = UINT32_MAX - 1;
+ } else {
+ pg_val = (uint_t)pages;
+ }
+ }
+ zone_pdata[zid].zpers_pg_limit = pg_val;
+ return (0);
+}
+
+static rctl_ops_t zone_phys_mem_ops = {
+ rcop_no_action,
+ zone_phys_mem_usage,
+ zone_phys_mem_set,
+ rcop_no_test
+};
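
A compact restatement of the clamping rule in zone_phys_mem_set() above, with btop() modeled for an assumed 4 KB page size: a request of UINT64_MAX maps to the UINT32_MAX "unlimited" sentinel, and anything at or above UINT32_MAX pages (16 TiB at 4 KB) clamps to the largest limited value:

    /*
     * Sketch of the byte-to-page clamp, with btop() modeled for an
     * assumed 4 KB page size.  UINT32_MAX is the "unlimited" sentinel.
     */
    #include <stdint.h>

    static uint32_t
    phys_mem_to_pg_limit(uint64_t bytes)
    {
        uint64_t pages;

        if (bytes == UINT64_MAX)
            return (UINT32_MAX);            /* unlimited */
        pages = bytes >> 12;                /* btop() at 4 KB pages */
        if (pages >= UINT32_MAX)
            return (UINT32_MAX - 1);        /* largest "limited" value */
        return ((uint32_t)pages);
    }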
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
@@ -1798,6 +2045,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
}
static int
+zone_physmem_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit);
+ return (0);
+}
+
+static int
zone_nprocs_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
@@ -1826,7 +2088,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)
}
static kstat_t *
-zone_kstat_create_common(zone_t *zone, char *name,
+zone_rctl_kstat_create_common(zone_t *zone, char *name,
int (*updatefunc) (kstat_t *, int))
{
kstat_t *ksp;
@@ -1851,16 +2113,200 @@ zone_kstat_create_common(zone_t *zone, char *name,
return (ksp);
}
+static int
+zone_vfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_vfs_kstat_t *zvp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_vfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the VFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the slow ops
+ * counters are updated directly by the VFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zvp->zv_nread.value.ui64 = kiop->nread;
+ zvp->zv_reads.value.ui64 = kiop->reads;
+ zvp->zv_rtime.value.ui64 = kiop->rtime;
+ zvp->zv_rcnt.value.ui64 = kiop->rcnt;
+ zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+ zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+ zvp->zv_writes.value.ui64 = kiop->writes;
+ zvp->zv_wtime.value.ui64 = kiop->wtime;
+ zvp->zv_wcnt.value.ui64 = kiop->wcnt;
+ zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+
+ scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_vfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_vfs_kstat_t *zvp;
+
+ if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+ zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_vfs_lock;
+ zone->zone_vfs_stats = zvp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+ kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_vfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
+static int
+zone_zfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_zfs_kstat_t *zzp = ksp->ks_data;
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp == NULL) {
+ zzp->zz_nread.value.ui64 = 0;
+ zzp->zz_reads.value.ui64 = 0;
+ zzp->zz_rtime.value.ui64 = 0;
+ zzp->zz_rlentime.value.ui64 = 0;
+ zzp->zz_nwritten.value.ui64 = 0;
+ zzp->zz_writes.value.ui64 = 0;
+ zzp->zz_waittime.value.ui64 = 0;
+ } else {
+ kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats;
+
+ /*
+ * Extract the ZFS statistics from the kstat_io_t structure
+ * used by kstat_runq_enter() and related functions. Since the
+ * I/O throttle counters are updated directly by the ZFS layer,
+ * there's no need to copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zzp->zz_nread.value.ui64 = kiop->nread;
+ zzp->zz_reads.value.ui64 = kiop->reads;
+ zzp->zz_rtime.value.ui64 = kiop->rtime;
+ zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+ zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+ zzp->zz_writes.value.ui64 = kiop->writes;
+ zzp->zz_waittime.value.ui64 =
+ zp->zpers_zfsp->zpers_zfs_rd_waittime;
+ }
+ mutex_exit(&zp->zpers_zfs_lock);
+
+ scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_zfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_zfs_kstat_t *zzp;
+
+ if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+ zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_zfs_lock;
+ zone->zone_zfs_stats = zzp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+ kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_zfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
static int
zone_mcap_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_mcap_kstat_t *zmp = ksp->ks_data;
+ zone_persist_t *zp;
if (rw == KSTAT_WRITE)
return (EACCES);
+ zp = &zone_pdata[zone->zone_id];
+
+ zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit);
+ zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+ zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+ zmp->zm_nover.value.ui64 = zp->zpers_nover;
+#ifndef DEBUG
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out);
+#else
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty +
+ zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty);
+#endif
zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
@@ -1893,6 +2339,12 @@ zone_mcap_kstat_create(zone_t *zone)
/* The kstat "name" field is not large enough for a full zonename */
kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+ kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
@@ -1942,9 +2394,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw)
zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
+ zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
+
zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
+ zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts;
zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
return (0);
@@ -1985,9 +2440,13 @@ zone_misc_kstat_create(zone_t *zone)
KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
+ KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_init_restarts, "init_restarts",
+ KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
ksp->ks_update = zone_misc_kstat_update;
@@ -2000,13 +2459,25 @@ zone_misc_kstat_create(zone_t *zone)
static void
zone_kstat_create(zone_t *zone)
{
- zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
+ zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
"lockedmem", zone_lockedmem_kstat_update);
- zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
+ zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
"swapresv", zone_swapresv_kstat_update);
- zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
+ zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
+ "physicalmem", zone_physmem_kstat_update);
+ zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
"nprocs", zone_nprocs_kstat_update);
+ if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+ zone->zone_vfs_stats = kmem_zalloc(
+ sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
+ zone->zone_zfs_stats = kmem_zalloc(
+ sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ }
+
if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
zone->zone_mcap_stats = kmem_zalloc(
sizeof (zone_mcap_kstat_t), KM_SLEEP);
@@ -2038,8 +2509,15 @@ zone_kstat_delete(zone_t *zone)
sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_swapresv_kstat,
sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_physmem_kstat,
+ sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_nprocs_kstat,
sizeof (zone_kstat_t));
+
+ zone_kstat_delete_common(&zone->zone_vfs_ksp,
+ sizeof (zone_vfs_kstat_t));
+ zone_kstat_delete_common(&zone->zone_zfs_ksp,
+ sizeof (zone_zfs_kstat_t));
zone_kstat_delete_common(&zone->zone_mcap_ksp,
sizeof (zone_mcap_kstat_t));
zone_kstat_delete_common(&zone->zone_misc_ksp,
@@ -2101,8 +2579,12 @@ zone_zsd_init(void)
zone0.zone_initname = initname;
zone0.zone_lockedmem_kstat = NULL;
zone0.zone_swapresv_kstat = NULL;
+ zone0.zone_physmem_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
+ zone_pdata[0].zpers_zfsp = &zone0_zp_zfs;
+ zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1;
+
list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
offsetof(zone_ref_t, zref_linkage));
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
@@ -2209,6 +2691,21 @@ zone_init(void)
RCTL_GLOBAL_INFINITE,
MAXCAP, MAXCAP, &zone_cpu_cap_ops);
+ rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ MAXCAP, MAXCAP, &zone_cpu_base_ops);
+
+ rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
+
+ rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ 16384, 16384, &zone_zfs_io_pri_ops);
+
rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
INT_MAX, INT_MAX, &zone_lwps_ops);
@@ -2250,6 +2747,20 @@ zone_init(void)
rde = rctl_dict_lookup("zone.cpu-shares");
(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+ /*
+ * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
+ * this at the head of the rctl_dict_entry for `zone.zfs-io-priority'.
+ */
+ dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+ bzero(dval, sizeof (rctl_val_t));
+ dval->rcv_value = 1;
+ dval->rcv_privilege = RCPRIV_PRIVILEGED;
+ dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
+ dval->rcv_action_recip_pid = -1;
+
+ rde = rctl_dict_lookup("zone.zfs-io-priority");
+ (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+
rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2260,6 +2771,11 @@ zone_init(void)
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_max_swap_ops);
+ rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+ RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+ RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+ &zone_phys_mem_ops);
+
rc_zone_max_lofi = rctl_register("zone.max-lofi",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2281,6 +2797,9 @@ zone_init(void)
zone0.zone_ntasks = 1;
mutex_exit(&p0.p_lock);
zone0.zone_restart_init = B_TRUE;
+ zone0.zone_reboot_on_init_exit = B_FALSE;
+ zone0.zone_restart_init_0 = B_FALSE;
+ zone0.zone_init_status = -1;
zone0.zone_brand = &native_brand;
rctl_prealloc_destroy(gp);
/*
@@ -2362,6 +2881,8 @@ zone_init(void)
static void
zone_free(zone_t *zone)
{
+ zone_dl_t *zdl;
+
ASSERT(zone != global_zone);
ASSERT(zone->zone_ntasks == 0);
ASSERT(zone->zone_nlwps == 0);
@@ -2377,6 +2898,9 @@ zone_free(zone_t *zone)
*/
cpucaps_zone_remove(zone);
+ /* Clear physical memory capping data. */
+ bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t));
+
ASSERT(zone->zone_cpucap == NULL);
/* remove from deathrow list */
@@ -2390,8 +2914,30 @@ zone_free(zone_t *zone)
list_destroy(&zone->zone_ref_list);
zone_free_zsd(zone);
zone_free_datasets(zone);
+
+ /*
+ * While dlmgmtd should have removed all of these, it could have left
+ * something behind or crashed, in which case it's not safe for us to
+ * assume that the list is empty (list_destroy() ASSERTs that it is). We
+ * clean up for our userland comrades, which may have crashed or, worse,
+ * been disabled by SMF.
+ */
+ while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
+ if (zdl->zdl_net != NULL)
+ nvlist_free(zdl->zdl_net);
+ kmem_free(zdl, sizeof (zone_dl_t));
+ }
list_destroy(&zone->zone_dl_list);
+ /*
+ * This zone_t can no longer inhibit creation of another zone_t
+ * with the same name or debug ID. Generate a sysevent so that
+ * userspace tools know it is safe to carry on.
+ */
+ mutex_enter(&zone_status_lock);
+ zone_status_set(zone, ZONE_IS_FREE);
+ mutex_exit(&zone_status_lock);
+
cpu_uarray_free(zone->zone_ustate);
if (zone->zone_rootvp != NULL)
@@ -2436,11 +2982,17 @@ zone_free(zone_t *zone)
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
+ timestruc_t now;
+ uint64_t t;
nvlist_t *nvl = NULL;
ASSERT(MUTEX_HELD(&zone_status_lock));
- ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
- status >= zone_status_get(zone));
+ ASSERT((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE ||
+ status == ZONE_IS_FREE) && status >= zone_status_get(zone));
+
+ /* Current time since Jan 1 1970; consumers expect nanoseconds. */
+ gethrestime(&now);
+ t = (now.tv_sec * NANOSEC) + now.tv_nsec;
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
@@ -2449,12 +3001,14 @@ zone_status_set(zone_t *zone, zone_status_t status)
nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
zone_status_table[zone->zone_status]) ||
nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
- nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
+ nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
#ifdef DEBUG
(void) printf(
"Failed to allocate and send zone state change event.\n");
+#else
+ /* EMPTY */
#endif
}
nvlist_free(nvl);
@@ -2474,6 +3028,38 @@ zone_status_get(zone_t *zone)
return (zone->zone_status);
}
+/*
+ * Publish a zones-related sysevent for purposes other than zone state changes.
+ * While it is unfortunate that zone_event_chan is associated with
+ * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be
+ * the only ones with class "status" and subclass "change".
+ */
+void
+zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass,
+ nvlist_t *ev_nvl)
+{
+ nvlist_t *nvl = NULL;
+ timestruc_t now;
+ uint64_t t;
+
+ gethrestime(&now);
+ t = (now.tv_sec * NANOSEC) + now.tv_nsec;
+
+ if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 ||
+ nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 ||
+ nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 ||
+ nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 ||
+ sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com",
+ "kernel", nvl, EVCH_SLEEP) != 0) {
+#ifdef DEBUG
+ (void) printf("Failed to allocate and send zone misc event.\n");
+#else
+ /* EMPTY */
+#endif
+ }
+ nvlist_free(nvl);
+}
+
static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
@@ -2527,9 +3113,14 @@ zone_set_brand(zone_t *zone, const char *brand)
return (EINVAL);
}
- /* set up the brand specific data */
+ /*
+ * Set up the brand specific data.
+ * Note that it's possible that the hook has to drop the
+ * zone_status_lock and reacquire it before returning, so we can't
+ * assume the lock has been held the entire time.
+ */
zone->zone_brand = bp;
- ZBROP(zone)->b_init_brand_data(zone);
+ ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
mutex_exit(&zone_status_lock);
return (0);
@@ -2602,18 +3193,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
}
static int
-zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
-{
- uint64_t mcap;
- int err = 0;
-
- if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
- zone->zone_phys_mcap = mcap;
-
- return (err);
-}
-
-static int
zone_set_sched_class(zone_t *zone, const char *new_class)
{
char sched_class[PC_CLNMSZ];
@@ -3020,6 +3599,12 @@ getzoneid(void)
return (curproc->p_zone->zone_id);
}
+zoneid_t
+getzonedid(void)
+{
+ return (curproc->p_zone->zone_did);
+}
+
/*
* Internal versions of zone_find_by_*(). These don't zone_hold() or
* check the validity of a zone's state.
@@ -3766,6 +4351,17 @@ zone_start_init(void)
*/
z->zone_proc_initpid = p->p_pid;
+ if (z->zone_setup_app_contract == B_TRUE) {
+ /*
+ * Normally a process cannot modify its own contract, but we're
+ * just starting the zone's init process and its contract is
+ * always initialized from the sys_process_tmpl template, so
+ * this is the simplest way to setup init's contract to kill
+ * the process if any other process in the contract exits.
+ */
+ p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
+ }
+
/*
* We maintain zone_boot_err so that we can return the cause of the
* failure back to the caller of the zone_boot syscall.
@@ -3794,9 +4390,54 @@ zone_start_init(void)
lwp_exit();
}
} else {
+ id_t cid = curthread->t_cid;
+
if (zone_status_get(z) == ZONE_IS_BOOTING)
zone_status_set(z, ZONE_IS_RUNNING);
mutex_exit(&zone_status_lock);
+
+ mutex_enter(&class_lock);
+ ASSERT(cid < loaded_classes);
+ if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+ z->zone_fixed_hipri) {
+ /*
+ * If the zone is using FX then by default all
+ * processes start at the lowest priority and stay
+ * there. We provide a mechanism for the zone to
+ * indicate that it should run at "high priority". In
+ * this case we setup init to run at the highest FX
+ * priority (which is one level higher than the
+ * non-fixed scheduling classes can use).
+ */
+ pcparms_t pcparms;
+
+ pcparms.pc_cid = cid;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+ FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+ FX_DOUPRILIM | FX_DOUPRI;
+
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ (void) parmsset(&pcparms, curthread);
+
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+ /*
+ * zsched always starts the init lwp at priority
+ * minclsyspri - 1. This priority gets set in t_pri and
+ * is invalid for RT, but RT never uses t_pri. However
+ * t_pri is used by procfs, so we always see processes
+ * within an RT zone with an invalid priority value.
+ * We fix that up now.
+ */
+ curthread->t_pri = RTGPPRIO0;
+ }
+ mutex_exit(&class_lock);
+
/* cause the process to return to userland. */
lwp_rtt();
}
@@ -4282,8 +4923,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
error = EINVAL;
name = nvpair_name(nvp);
- if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
- != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
+ if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
+ strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
+ nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
goto out;
}
if ((hndl = rctl_hndl_lookup(name)) == -1) {
@@ -4402,7 +5044,7 @@ zone_create(const char *zone_name, const char *zone_root,
caddr_t rctlbuf, size_t rctlbufsz,
caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
int match, uint32_t doi, const bslabel_t *label,
- int flags)
+ int flags, zoneid_t zone_did)
{
struct zsched_arg zarg;
nvlist_t *rctls = NULL;
@@ -4474,6 +5116,7 @@ zone_create(const char *zone_name, const char *zone_root,
zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
zone->zone_id = zoneid;
+ zone->zone_did = zone_did;
zone->zone_status = ZONE_IS_UNINITIALIZED;
zone->zone_pool = pool_default;
zone->zone_pool_mod = gethrtime();
@@ -4481,6 +5124,9 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_ncpus = 0;
zone->zone_ncpus_online = 0;
zone->zone_restart_init = B_TRUE;
+ zone->zone_reboot_on_init_exit = B_FALSE;
+ zone->zone_restart_init_0 = B_FALSE;
+ zone->zone_init_status = -1;
zone->zone_brand = &native_brand;
zone->zone_initname = NULL;
mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -4547,8 +5193,13 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_max_swap_ctl = UINT64_MAX;
zone->zone_max_lofi = 0;
zone->zone_max_lofi_ctl = UINT64_MAX;
- zone0.zone_lockedmem_kstat = NULL;
- zone0.zone_swapresv_kstat = NULL;
+ zone->zone_lockedmem_kstat = NULL;
+ zone->zone_swapresv_kstat = NULL;
+ zone->zone_physmem_kstat = NULL;
+
+ zone_pdata[zoneid].zpers_zfsp =
+ kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP);
+ zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1;
zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
@@ -4557,6 +5208,13 @@ zone_create(const char *zone_name, const char *zone_root,
*/
zone->zone_rctls = NULL;
+ /*
+ * Ensure page count is 0 (in case zoneid has wrapped).
+ * Initialize physical memory cap as unlimited.
+ */
+ zone_pdata[zoneid].zpers_pg_cnt = 0;
+ zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX;
+
if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
zone_free(zone);
return (zone_create_error(error, 0, extended_error));
@@ -4705,8 +5363,8 @@ zone_create(const char *zone_name, const char *zone_root,
/*
* The process, task, and project rctls are probably wrong;
* we need an interface to get the default values of all rctls,
- * and initialize zsched appropriately. I'm not sure that that
- * makes much of a difference, though.
+ * and initialize zsched appropriately. However, we allow zoneadmd
+ * to pass down both zone and project rctls for the zone's init.
*/
error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
if (error != 0) {
@@ -4845,6 +5503,7 @@ zone_boot(zoneid_t zoneid)
static int
zone_empty(zone_t *zone)
{
+ int cnt = 0;
int waitstatus;
/*
@@ -4855,7 +5514,16 @@ zone_empty(zone_t *zone)
ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
while ((waitstatus = zone_status_timedwait_sig(zone,
ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
- killall(zone->zone_id);
+ boolean_t force = B_FALSE;
+
+ /* Every 30 seconds, try harder */
+ if (cnt++ >= 30) {
+ cmn_err(CE_WARN, "attempt to force kill zone %d\n",
+ zone->zone_id);
+ force = B_TRUE;
+ cnt = 0;
+ }
+ killall(zone->zone_id, force);
}
/*
* return EINTR if we were signaled
@@ -5184,6 +5852,7 @@ zone_destroy(zoneid_t zoneid)
zone_status_t status;
clock_t wait_time;
boolean_t log_refcounts;
+ zone_persist_t *zp;
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
@@ -5217,6 +5886,12 @@ zone_destroy(zoneid_t zoneid)
zone_hold(zone);
mutex_exit(&zonehash_lock);
+ zp = &zone_pdata[zoneid];
+ mutex_enter(&zp->zpers_zfs_lock);
+ kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t));
+ zp->zpers_zfsp = NULL;
+ mutex_exit(&zp->zpers_zfs_lock);
+
/*
* wait for zsched to exit
*/
@@ -5606,14 +6281,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
error = EFAULT;
}
break;
- case ZONE_ATTR_PHYS_MCAP:
- size = sizeof (zone->zone_phys_mcap);
- if (bufsize > size)
- bufsize = size;
- if (buf != NULL &&
- copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
- error = EFAULT;
- break;
case ZONE_ATTR_SCHED_CLASS:
mutex_enter(&class_lock);
@@ -5677,6 +6344,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
}
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_DID:
+ size = sizeof (zoneid_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
+ error = EFAULT;
+ break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ size = sizeof (boolean_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
+ bufsize) != 0)
+ error = EFAULT;
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -5708,10 +6392,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
return (set_errno(EPERM));
/*
- * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
- * global zone.
+ * No attributes can be set on the global zone.
*/
- if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
+ if (zoneid == GLOBAL_ZONEID) {
return (set_errno(EINVAL));
}
@@ -5724,11 +6407,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
mutex_exit(&zonehash_lock);
/*
- * At present most attributes can only be set on non-running,
+ * At present attributes can only be set on non-running,
* non-global zones.
*/
zone_status = zone_status_get(zone);
- if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
+ if (zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
@@ -5741,6 +6424,14 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
zone->zone_restart_init = B_FALSE;
err = 0;
break;
+ case ZONE_ATTR_INITRESTART0:
+ zone->zone_restart_init_0 = B_TRUE;
+ err = 0;
+ break;
+ case ZONE_ATTR_INITREBOOT:
+ zone->zone_reboot_on_init_exit = B_TRUE;
+ err = 0;
+ break;
case ZONE_ATTR_BOOTARGS:
err = zone_set_bootargs(zone, (const char *)buf);
break;
@@ -5753,9 +6444,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_SECFLAGS:
err = zone_set_secflags(zone, (psecflags_t *)buf);
break;
- case ZONE_ATTR_PHYS_MCAP:
- err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
- break;
case ZONE_ATTR_SCHED_CLASS:
err = zone_set_sched_class(zone, (const char *)buf);
break;
@@ -5783,6 +6471,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
err = zone_set_network(zoneid, zbuf);
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_APP_SVC_CT:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_setup_app_contract = (boolean_t)buf;
+ err = 0;
+ }
+ break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_fixed_hipri = (boolean_t)buf;
+ err = 0;
+ }
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
@@ -6486,6 +7190,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
zs.doi = zs32.doi;
zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
zs.flags = zs32.flags;
+ zs.zoneid = zs32.zoneid;
#else
panic("get_udatamodel() returned bogus result\n");
#endif
@@ -6496,7 +7201,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
(caddr_t)zs.rctlbuf, zs.rctlbufsz,
(caddr_t)zs.zfsbuf, zs.zfsbufsz,
zs.extended_error, zs.match, zs.doi,
- zs.label, zs.flags));
+ zs.label, zs.flags, zs.zoneid));
case ZONE_BOOT:
return (zone_boot((zoneid_t)(uintptr_t)arg1));
case ZONE_DESTROY:
@@ -6597,6 +7302,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp)
bcopy(zone->zone_name, zone_name, zone_namelen);
zoneid = zone->zone_id;
uniqid = zone->zone_uniqid;
+ arg.status = zone->zone_init_status;
/*
* zoneadmd may be down, but at least we can empty out the zone.
* We can ignore the return value of zone_empty() since we're called
@@ -6774,7 +7480,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
* zone_ki_call_zoneadmd() will do a more thorough job of this
* later.
*/
- killall(zone->zone_id);
+ killall(zone->zone_id, B_FALSE);
/*
* Now, create the thread to contact zoneadmd and do the rest of the
* work. This thread can't be created in our zone otherwise
@@ -6837,16 +7543,15 @@ zone_shutdown_global(void)
}
/*
- * Returns true if the named dataset is visible in the current zone.
+ * Returns true if the named dataset is visible in the specified zone.
* The 'write' parameter is set to 1 if the dataset is also writable.
*/
int
-zone_dataset_visible(const char *dataset, int *write)
+zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
{
static int zfstype = -1;
zone_dataset_t *zd;
size_t len;
- zone_t *zone = curproc->p_zone;
const char *name = NULL;
vfs_t *vfsp = NULL;
@@ -6914,7 +7619,8 @@ zone_dataset_visible(const char *dataset, int *write)
vfs_list_read_lock();
vfsp = zone->zone_vfslist;
do {
- ASSERT(vfsp);
+ if (vfsp == NULL)
+ break;
if (vfsp->vfs_fstype == zfstype) {
name = refstr_value(vfsp->vfs_resource);
@@ -6951,6 +7657,18 @@ zone_dataset_visible(const char *dataset, int *write)
}
/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+ zone_t *zone = curproc->p_zone;
+
+ return (zone_dataset_visible_inzone(zone, dataset, write));
+}
+
+/*
* zone_find_by_any_path() -
*
* kernel-private routine similar to zone_find_by_path(), but which
@@ -7052,6 +7770,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
zone_t *zone;
zone_t *thiszone;
+ /*
+ * Only the GZ may add a datalink to a zone's list.
+ */
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (set_errno(EPERM));
+
+ /*
+ * Only a process with the datalink config priv may add a
+ * datalink to a zone's list.
+ */
+ if (secpolicy_dl_config(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ /*
+ * When links exist in the GZ, they aren't added to the GZ's
+ * zone_dl_list. We must enforce this because link_activate()
+ * depends on zone_check_datalink() returning only NGZs.
+ */
+ if (zoneid == GLOBAL_ZONEID)
+ return (set_errno(EINVAL));
+
if ((thiszone = zone_find_by_id(zoneid)) == NULL)
return (set_errno(ENXIO));
@@ -7084,6 +7823,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
zone_t *zone;
int err = 0;
+ /*
+ * Only the GZ may remove a datalink from a zone's list.
+ */
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (set_errno(EPERM));
+
+ /*
+ * Only a process with the datalink config priv may remove a
+ * datalink from a zone's list.
+ */
+ if (secpolicy_dl_config(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ /*
+ * If we can't add a datalink to the GZ's zone_dl_list then we
+ * certainly can't remove them either.
+ */
+ if (zoneid == GLOBAL_ZONEID)
+ return (set_errno(EINVAL));
+
if ((zone = zone_find_by_id(zoneid)) == NULL)
return (set_errno(EINVAL));
@@ -7101,25 +7860,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
}
/*
- * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
- * the linkid. Otherwise we just check if the specified zoneidp has been
- * assigned the supplied linkid.
+ *
+ * This function may be used in two ways:
+ *
+ * 1. to get the zoneid of the zone this link is under, or
+ *
+ * 2. to verify that the link is under a specific zone.
+ *
+ * The first use is achieved by passing a zoneid of ALL_ZONES. The
+ * function then iterates the datalink list of every zone on the
+ * system until it finds the linkid. If the linkid is found then the
+ * function returns 0 and zoneidp is updated. Otherwise, ENXIO is
+ * returned and zoneidp is not modified. The use of ALL_ZONES is
+ * limited to callers in the GZ to prevent leaking information to
+ * NGZs. If an NGZ passes ALL_ZONES its query is implicitly changed
+ * to the second type in the list above.
+ *
+ * The second use is achieved by passing a specific zoneid. The GZ can
+ * use this to verify a link is under a particular zone. An NGZ can
+ * use this to verify a link is under itself. But an NGZ cannot use
+ * this to determine if a link is under some other zone as that would
+ * result in information leakage. If the link exists under the zone
+ * then 0 is returned. Otherwise, ENXIO is returned.
*/
int
zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
{
zone_t *zone;
+ zoneid_t zoneid = *zoneidp;
+ zoneid_t caller = getzoneid();
int err = ENXIO;
- if (*zoneidp != ALL_ZONES) {
- if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
- if (zone_dl_exists(zone, linkid))
+ /*
+ * Only the GZ may enquire about all zones; an NGZ may only
+ * enquire about itself.
+ */
+ if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID)
+ zoneid = caller;
+
+ if (zoneid != caller && caller != GLOBAL_ZONEID)
+ return (err);
+
+ if (zoneid != ALL_ZONES) {
+ if ((zone = zone_find_by_id(zoneid)) != NULL) {
+ if (zone_dl_exists(zone, linkid)) {
+ /*
+ * We need to set this in case an NGZ
+ * passes ALL_ZONES.
+ */
+ *zoneidp = zoneid;
err = 0;
+ }
zone_rele(zone);
}
return (err);
}
+ ASSERT(caller == GLOBAL_ZONEID);
mutex_enter(&zonehash_lock);
for (zone = list_head(&zone_active); zone != NULL;
zone = list_next(&zone_active, zone)) {
@@ -7130,6 +7927,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
}
}
mutex_exit(&zonehash_lock);
+
return (err);
}
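
As a minimal sketch (not part of this change; the helper and the specific
zoneid are hypothetical), a global-zone caller might exercise the two query
modes described above like so:

/*
 * Illustrative only: the two uses of zone_check_datalink().
 */
static void
example_datalink_queries(datalink_id_t linkid)
{
	zoneid_t zid = ALL_ZONES;

	/* Use 1 (GZ only): discover which zone the link is under. */
	if (zone_check_datalink(&zid, linkid) == 0)
		cmn_err(CE_NOTE, "link %u is under zone %d", linkid, zid);

	/* Use 2: verify the link is under a specific zone. */
	zid = 5;		/* hypothetical zoneid */
	if (zone_check_datalink(&zid, linkid) == 0)
		cmn_err(CE_NOTE, "link %u confirmed under zone %d",
		    linkid, zid);
}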
@@ -7150,6 +7948,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
zone_dl_t *zdl;
datalink_id_t *idptr = idarray;
+ /*
+ * Only the GZ or the owning zone may look at the datalink list.
+ */
+ if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid))
+ return (set_errno(EPERM));
+
if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
return (set_errno(EFAULT));
if ((zone = zone_find_by_id(zoneid)) == NULL)
@@ -7175,6 +7979,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
mutex_exit(&zone->zone_lock);
zone_rele(zone);
+ /*
+ * Prevent returning negative nump values -- we should never
+ * have this many links anyway.
+ */
+ if (num > INT_MAX)
+ return (set_errno(EOVERFLOW));
+
/* Increased or decreased, caller should be notified. */
if (num != dlcount) {
if (copyout(&num, nump, sizeof (num)) != 0)
@@ -7388,3 +8199,231 @@ done:
else
return (0);
}
+
+static void
+zone_incr_capped(zoneid_t zid)
+{
+ zone_persist_t *zp = &zone_pdata[zid];
+
+ /* See if over (unlimited is UINT32_MAX), or already marked that way. */
+ if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) {
+ return;
+ }
+
+ mutex_enter(&zone_physcap_lock);
+ /* Recheck setting under mutex */
+ if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) {
+ zp->zpers_over = 1;
+ zp->zpers_nover++;
+ zone_num_over_cap++;
+ DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
+ }
+ mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * We want some hysteresis when the zone is going under its cap so that we're
+ * not continuously toggling page scanning back and forth by a single page
+ * around the cap. Using ~1% of the zone's page limit seems to be a good
+ * quantity. This table shows various zone memory caps and the number of
+ * pages (assuming a 4k page size). Given this, we choose to shift the page
+ * limit by 7 places to get a hysteresis that is slightly less than 1%.
+ *
+ * cap pages pages 1% shift7 shift7
+ * 128M 32768 0x0008000 327 256 0x00100
+ * 512M 131072 0x0020000 1310 1024 0x00400
+ * 1G 262144 0x0040000 2621 2048 0x00800
+ * 4G 1048576 0x0100000 10485 8192 0x02000
+ * 8G 2097152 0x0200000 20971 16384 0x04000
+ * 16G 4194304 0x0400000 41943 32768 0x08000
+ * 32G 8388608 0x0800000 83886 65536 0x10000
+ * 64G 16777216 0x1000000 167772 131072 0x20000
+ */
+static void
+zone_decr_capped(zoneid_t zid)
+{
+ zone_persist_t *zp = &zone_pdata[zid];
+ uint32_t adjusted_limit;
+
+ /*
+ * See if under, or already marked that way. There is no need to
+ * check for an unlimited cap (zpers_pg_limit == UINT32_MAX)
+ * since we'll never set zpers_over in zone_incr_capped().
+ */
+ if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) {
+ return;
+ }
+
+ adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7);
+
+ /* Recheck, accounting for our hysteresis. */
+ if (zp->zpers_pg_cnt >= adjusted_limit) {
+ return;
+ }
+
+ mutex_enter(&zone_physcap_lock);
+ /* Recheck under mutex. */
+ if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) {
+ zp->zpers_over = 0;
+ ASSERT(zone_num_over_cap > 0);
+ zone_num_over_cap--;
+ DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid);
+ }
+ mutex_exit(&zone_physcap_lock);
+}
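
A worked instance of the shift-by-7 hysteresis, taken from the 1G row of the
table above (illustrative only, not part of the change):

/*
 * With a 1G cap and 4k pages, zpers_pg_limit = 262144, so
 *	adjusted_limit = 262144 - (262144 >> 7)
 *	               = 262144 - 2048 = 260096
 * i.e. page scanning is not switched back off until usage falls
 * about 2048 pages (8MB, ~0.78%) below the cap.
 */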
+
+/*
+ * For zone_add_page() and zone_rm_page(), access to the page we're touching is
+ * controlled by our caller's locking.
+ * On x86 our callers already did: ASSERT(x86_hm_held(pp))
+ * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp))
+ */
+void
+zone_add_page(page_t *pp)
+{
+ uint_t pcnt;
+ zone_persist_t *zp;
+ zoneid_t zid;
+
+ /* Skip pages in segkmem, etc. (KV_KVP, ...) */
+ if (PP_ISKAS(pp))
+ return;
+
+ ASSERT(!PP_ISFREE(pp));
+
+ zid = curzone->zone_id;
+ if (pp->p_zoneid == zid) {
+ /* Another mapping to this page for this zone, do nothing */
+ return;
+ }
+
+ if (pp->p_szc == 0) {
+ pcnt = 1;
+ } else {
+ /* large page */
+ pcnt = page_get_pagecnt(pp->p_szc);
+ }
+
+ if (pp->p_share == 0) {
+ /* First mapping to this page. */
+ pp->p_zoneid = zid;
+ zp = &zone_pdata[zid];
+ ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX);
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt);
+ zone_incr_capped(zid);
+ return;
+ }
+
+ if (pp->p_zoneid != ALL_ZONES) {
+ /*
+ * The page is now being shared across a different zone.
+ * Decrement the original zone's usage.
+ */
+ zid = pp->p_zoneid;
+ pp->p_zoneid = ALL_ZONES;
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
+ }
+ zone_decr_capped(zid);
+ }
+}
+
+void
+zone_rm_page(page_t *pp)
+{
+ uint_t pcnt;
+ zone_persist_t *zp;
+ zoneid_t zid;
+
+ /* Skip pages in segkmem, etc. (KV_KVP, ...) */
+ if (PP_ISKAS(pp))
+ return;
+
+ zid = pp->p_zoneid;
+ if (zid == ALL_ZONES || pp->p_share != 0)
+ return;
+
+ /* This is the last mapping to the page for a zone. */
+ if (pp->p_szc == 0) {
+ pcnt = 1;
+ } else {
+ /* large page */
+ pcnt = (int64_t)page_get_pagecnt(pp->p_szc);
+ }
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
+ }
+ zone_decr_capped(zid);
+ pp->p_zoneid = ALL_ZONES;
+}
+
+void
+zone_pageout_stat(int zid, zone_pageout_op_t op)
+{
+ zone_persist_t *zp;
+
+ if (zid == ALL_ZONES)
+ return;
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+
+#ifndef DEBUG
+ atomic_add_64(&zp->zpers_pg_out, 1);
+#else
+ switch (op) {
+ case ZPO_DIRTY:
+ atomic_add_64(&zp->zpers_pg_fsdirty, 1);
+ break;
+ case ZPO_FS:
+ atomic_add_64(&zp->zpers_pg_fs, 1);
+ break;
+ case ZPO_ANON:
+ atomic_add_64(&zp->zpers_pg_anon, 1);
+ break;
+ case ZPO_ANONDIRTY:
+ atomic_add_64(&zp->zpers_pg_anondirty, 1);
+ break;
+ default:
+ cmn_err(CE_PANIC, "Invalid pageout operator %d", op);
+ break;
+ }
+#endif
+}
+
+/*
+ * Return the zone's physical memory cap and current free memory (in pages).
+ */
+void
+zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free)
+{
+ zone_persist_t *zp;
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+
+ /*
+ * If memory or swap limits are set on the zone, use those, otherwise
+ * use the system values. physmem and freemem are also in pages.
+ */
+ if (zp->zpers_pg_limit == UINT32_MAX) {
+ *memcap = physmem;
+ *free = freemem;
+ } else {
+ int64_t freemem;
+
+ *memcap = (pgcnt_t)zp->zpers_pg_limit;
+ freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt;
+ if (freemem > 0) {
+ *free = (pgcnt_t)freemem;
+ } else {
+ *free = (pgcnt_t)0;
+ }
+ }
+}
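
A minimal sketch of a consumer of zone_get_physmem_data() (the helper and
its 1% threshold are hypothetical, not part of this change):

/*
 * Illustrative only: gauge whether a zone is short on memory.
 */
static boolean_t
example_zone_memory_low(int zid)
{
	pgcnt_t memcap, nfree;

	zone_get_physmem_data(zid, &memcap, &nfree);

	/* Treat less than 1% of the cap remaining as "low". */
	return (nfree < memcap / 100);
}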
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c b/usr/src/uts/common/refhash/refhash.c
index 8f96c2d9f1..e2de00597e 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c
+++ b/usr/src/uts/common/refhash/refhash.c
@@ -10,16 +10,18 @@
*/
/*
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
-#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h>
+#include <sys/refhash.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/list.h>
#include <sys/ddi.h>
+#define RHL_F_DEAD 0x01
+
#ifdef lint
extern refhash_link_t *obj_to_link(refhash_t *, void *);
extern void *link_to_obj(refhash_t *, refhash_link_t *);
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 8d26a71342..909160f2db 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -21,7 +21,7 @@
#
# Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2018, Joyent, Inc.
+# Copyright 2019, Joyent, Inc.
# Copyright 2013 Garrett D'Amore <garrett@damore.org>
# Copyright 2013 Saso Kiselkov. All rights reserved.
# Copyright 2015 Igor Kozhukhov <ikozhukhov@gmail.com>
@@ -258,6 +258,7 @@ CHKHDRS= \
flock.h \
flock_impl.h \
fork.h \
+ frameio.h \
fss.h \
fsspriocntl.h \
fsid.h \
@@ -283,6 +284,7 @@ CHKHDRS= \
idmap.h \
ieeefp.h \
id_space.h \
+ inotify.h \
instance.h \
int_const.h \
int_fmtio.h \
@@ -351,6 +353,7 @@ CHKHDRS= \
lgrp.h \
lgrp_user.h \
libc_kernel.h \
+ limits.h \
link.h \
list.h \
list_impl.h \
@@ -435,6 +438,9 @@ CHKHDRS= \
ontrap.h \
open.h \
openpromio.h \
+ overlay.h \
+ overlay_common.h \
+ overlay_target.h \
panic.h \
param.h \
pathconf.h \
@@ -511,6 +517,7 @@ CHKHDRS= \
sema_impl.h \
semaphore.h \
sendfile.h \
+ sensors.h \
ser_sync.h \
session.h \
sha1.h \
@@ -659,6 +666,8 @@ CHKHDRS= \
vmem.h \
vmem_impl.h \
vmsystm.h \
+ vnd.h \
+ vnd_errno.h \
vnic.h \
vnic_impl.h \
vnode.h \
@@ -670,11 +679,13 @@ CHKHDRS= \
vuid_queue.h \
vuid_state.h \
vuid_store.h \
+ vxlan.h \
wait.h \
waitq.h \
watchpoint.h \
winlockio.h \
zcons.h \
+ zfd.h \
zone.h \
xti_inet.h \
xti_osi.h \
@@ -840,13 +851,14 @@ FSHDRS= \
autofs.h \
decomp.h \
dv_node.h \
- sdev_impl.h \
fifonode.h \
hsfs_isospec.h \
hsfs_node.h \
hsfs_rrip.h \
hsfs_spec.h \
hsfs_susp.h \
+ hyprlofs.h \
+ hyprlofs_info.h \
lofs_info.h \
lofs_node.h \
mntdata.h \
@@ -856,6 +868,8 @@ FSHDRS= \
pc_label.h \
pc_node.h \
pxfs_ki.h \
+ sdev_impl.h \
+ sdev_plugin.h \
snode.h \
swapnode.h \
tmp.h \
@@ -980,6 +994,7 @@ SATAGENHDRS= \
SYSEVENTHDRS= \
ap_driver.h \
+ datalink.h \
dev.h \
domain.h \
dr.h \
diff --git a/usr/src/uts/common/sys/acct.h b/usr/src/uts/common/sys/acct.h
index f00884681b..e01ad61025 100644
--- a/usr/src/uts/common/sys/acct.h
+++ b/usr/src/uts/common/sys/acct.h
@@ -22,6 +22,7 @@
/*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -88,7 +89,7 @@ extern int acct(const char *);
#if defined(_KERNEL)
-void acct(char);
+void acct(int);
int sysacct(char *);
struct vnode;
diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h
index 547c9cc241..80733aa31e 100644
--- a/usr/src/uts/common/sys/aggr_impl.h
+++ b/usr/src/uts/common/sys/aggr_impl.h
@@ -21,6 +21,8 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_AGGR_IMPL_H
@@ -54,25 +56,47 @@ extern "C" {
*/
#define MAC_PSEUDO_RING_INUSE 0x01
+#define MAX_GROUPS_PER_PORT 128
+
+/*
+ * VLAN filters placed on the Rx pseudo group.
+ */
+typedef struct aggr_vlan {
+ list_node_t av_link;
+ uint16_t av_vid; /* VLAN ID */
+ uint_t av_refs; /* num aggr clients using this VID */
+} aggr_vlan_t;
+
typedef struct aggr_unicst_addr_s {
uint8_t aua_addr[ETHERADDRL];
struct aggr_unicst_addr_s *aua_next;
} aggr_unicst_addr_t;
typedef struct aggr_pseudo_rx_ring_s {
- mac_ring_handle_t arr_rh; /* filled in by aggr_fill_ring() */
- struct aggr_port_s *arr_port;
- mac_ring_handle_t arr_hw_rh;
- uint_t arr_flags;
- uint64_t arr_gen;
+ mac_ring_handle_t arr_rh; /* set by aggr_fill_ring() */
+ struct aggr_port_s *arr_port;
+ struct aggr_pseudo_rx_group_s *arr_grp;
+ mac_ring_handle_t arr_hw_rh;
+ uint_t arr_flags;
+ uint64_t arr_gen;
} aggr_pseudo_rx_ring_t;
+/*
+ * An aggr pseudo group abstracts the underlying ports' HW groups. For
+ * example, if each port has 8 groups (mac_group_t), then the aggr
+ * will create 8 pseudo groups. Each pseudo group represents a
+ * collection of HW groups: one group from each port. If you have
+ * three ports then the pseudo group stands in for three HW groups.
+ */
typedef struct aggr_pseudo_rx_group_s {
+ uint_t arg_index;
struct aggr_grp_s *arg_grp; /* filled in by aggr_fill_group() */
mac_group_handle_t arg_gh; /* filled in by aggr_fill_group() */
aggr_unicst_addr_t *arg_macaddr;
aggr_pseudo_rx_ring_t arg_rings[MAX_RINGS_PER_GROUP];
uint_t arg_ring_cnt;
+ uint_t arg_untagged; /* num clients untagged */
+ list_t arg_vlans; /* VLANs on this group */
} aggr_pseudo_rx_group_t;
typedef struct aggr_pseudo_tx_ring_s {
@@ -106,12 +130,13 @@ typedef struct aggr_port_s {
lp_collector_enabled : 1,
lp_promisc_on : 1,
lp_no_link_update : 1,
- lp_rx_grp_added : 1,
lp_tx_grp_added : 1,
lp_closing : 1,
- lp_pad_bits : 24;
+ lp_pad_bits : 25;
mac_handle_t lp_mh;
- mac_client_handle_t lp_mch;
+
+ mac_client_handle_t lp_mch;
+
const mac_info_t *lp_mip;
mac_notify_handle_t lp_mnh;
uint_t lp_tx_idx; /* idx in group's tx array */
@@ -123,13 +148,19 @@ typedef struct aggr_port_s {
aggr_lacp_port_t lp_lacp; /* LACP state */
lacp_stats_t lp_lacp_stats;
uint32_t lp_margin;
- mac_promisc_handle_t lp_mphp;
+
mac_unicast_handle_t lp_mah;
/* List of non-primary addresses that requires promiscous mode set */
aggr_unicst_addr_t *lp_prom_addr;
- /* handle of the underlying HW RX group */
- mac_group_handle_t lp_hwgh;
+
+ /*
+ * References to the underlying HW Rx groups of this port.
+ * Used by aggr to program HW classification for the pseudo
+ * groups.
+ */
+ mac_group_handle_t lp_hwghs[MAX_GROUPS_PER_PORT];
+
int lp_tx_ring_cnt;
/* handles of the underlying HW TX rings */
mac_ring_handle_t *lp_tx_rings;
@@ -176,7 +207,7 @@ typedef struct aggr_grp_s {
lg_lso : 1,
lg_pad_bits : 8;
aggr_port_t *lg_ports; /* list of configured ports */
- aggr_port_t *lg_mac_addr_port;
+ aggr_port_t *lg_mac_addr_port; /* using address of this port */
mac_handle_t lg_mh;
zoneid_t lg_zoneid;
uint_t lg_nattached_ports;
@@ -186,11 +217,18 @@ typedef struct aggr_grp_s {
uint_t lg_tx_ports_size; /* size of lg_tx_ports */
uint32_t lg_tx_policy; /* outbound policy */
uint8_t lg_mac_tx_policy;
- uint64_t lg_ifspeed;
link_state_t lg_link_state;
+
+
+ /*
+ * The lg_stat_lock must be held when accessing these fields.
+ */
+ kmutex_t lg_stat_lock;
+ uint64_t lg_ifspeed;
link_duplex_t lg_link_duplex;
uint64_t lg_stat[MAC_NSTAT];
uint64_t lg_ether_stat[ETHER_NSTAT];
+
aggr_lacp_mode_t lg_lacp_mode; /* off, active, or passive */
Agg_t aggr; /* 802.3ad data */
uint32_t lg_hcksum_txflags;
@@ -213,7 +251,9 @@ typedef struct aggr_grp_s {
kthread_t *lg_lacp_rx_thread;
boolean_t lg_lacp_done;
- aggr_pseudo_rx_group_t lg_rx_group;
+ uint_t lg_rx_group_count;
+ aggr_pseudo_rx_group_t lg_rx_groups[MAX_GROUPS_PER_PORT];
+
aggr_pseudo_tx_group_t lg_tx_group;
kmutex_t lg_tx_flowctl_lock;
@@ -335,8 +375,11 @@ extern void aggr_grp_port_hold(aggr_port_t *);
extern void aggr_grp_port_rele(aggr_port_t *);
extern void aggr_grp_port_wait(aggr_grp_t *);
-extern int aggr_port_addmac(aggr_port_t *, const uint8_t *);
-extern void aggr_port_remmac(aggr_port_t *, const uint8_t *);
+extern int aggr_port_addmac(aggr_port_t *, uint_t, const uint8_t *);
+extern void aggr_port_remmac(aggr_port_t *, uint_t, const uint8_t *);
+
+extern int aggr_port_addvlan(aggr_port_t *, uint_t, uint16_t);
+extern int aggr_port_remvlan(aggr_port_t *, uint_t, uint16_t);
extern mblk_t *aggr_ring_tx(void *, mblk_t *);
extern mblk_t *aggr_find_tx_ring(void *, mblk_t *,
diff --git a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h
index 1fb5011970..b3b2898987 100644
--- a/usr/src/uts/common/sys/auxv.h
+++ b/usr/src/uts/common/sys/auxv.h
@@ -78,6 +78,9 @@ typedef struct {
#define AT_FLAGS 8 /* processor flags */
#define AT_ENTRY 9 /* a.out entry point */
+/* First introduced on Linux */
+#define AT_RANDOM 25 /* address of 16 random bytes */
+
/*
* These relate to the original PPC ABI document; Linux reused
* the values for other things (see below), so disambiguation of
@@ -90,19 +93,18 @@ typedef struct {
* These are the values from LSB 1.3, the first five are also described
* in the draft amd64 ABI.
*
- * At the time of writing, Solaris doesn't place any of these values into
- * the aux vector, except AT_CLKTCK which is placed on the aux vector for
- * lx branded processes; also, we do similar things via AT_SUN_ values.
+ * At the time of writing, illumos doesn't place any of these values into the
+ * aux vector, except where noted. We do similar things via AT_SUN_ values.
*
* AT_NOTELF 10 program is not ELF?
- * AT_UID 11 real user id
- * AT_EUID 12 effective user id
- * AT_GID 13 real group id
- * AT_EGID 14 effective group id
+ * AT_UID 11 real user id (provided in LX)
+ * AT_EUID 12 effective user id (provided in LX)
+ * AT_GID 13 real group id (provided in LX)
+ * AT_EGID 14 effective group id (provided in LX)
*
* AT_PLATFORM 15
* AT_HWCAP 16
- * AT_CLKTCK 17 c.f. _SC_CLK_TCK
+ * AT_CLKTCK 17 c.f. _SC_CLK_TCK (provided in LX)
* AT_FPUCW 18
*
* AT_DCACHEBSIZE 19 (moved from 10)
@@ -110,6 +112,16 @@ typedef struct {
* AT_UCACHEBSIZE 21 (moved from 12)
*
* AT_IGNOREPPC 22
+ *
+ * On Linux:
+ * AT_* values 18 through 22 are reserved
+ * AT_SECURE 23 secure mode boolean (provided in LX)
+ * AT_BASE_PLATFORM 24 string identifying real platform, may
+ * differ from AT_PLATFORM.
+ * AT_HWCAP2 26 extension of AT_HWCAP
+ * AT_EXECFN 31 filename of program
+ * AT_SYSINFO 32
+ * AT_SYSINFO_EHDR 33 The vDSO location
*/
/*
@@ -186,6 +198,8 @@ extern uint_t getisax(uint32_t *, uint_t);
#define AT_SUN_BRAND_AUX1 2020
#define AT_SUN_BRAND_AUX2 2021
#define AT_SUN_BRAND_AUX3 2022
+#define AT_SUN_BRAND_AUX4 2025
+#define AT_SUN_BRAND_NROOT 2024
/*
* Aux vector for comm page
diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h
index badc3faff8..df22f492bf 100644
--- a/usr/src/uts/common/sys/brand.h
+++ b/usr/src/uts/common/sys/brand.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _SYS_BRAND_H
@@ -102,29 +103,106 @@ struct brand_mach_ops;
struct intpdata;
struct execa;
+/*
+ * Common structure to define hooks for brand operation.
+ *
+ * Required Fields:
+ * b_init_brand_data - Setup zone brand data during zone_setbrand
+ * b_free_brand_data - Free zone brand data during zone_destroy
+ * b_brandsys - Syscall handler for brandsys
+ * b_setbrand - Initialize process brand data
+ * b_getattr - Get brand-custom zone attribute
+ * b_setattr - Set brand-custom zone attribute
+ * b_copy_procdata - Copy process brand data during fork
+ * b_proc_exit - Perform process brand exit processing
+ * b_exec - Reset branded process state on exec
+ * b_lwp_setrval - Set return code for forked child
+ * b_initlwp - Initialize lwp brand data (cannot drop p->p_lock)
+ * b_forklwp - Copy lwp brand data during fork
+ * b_freelwp - Free lwp brand data
+ * b_lwpexit - Perform lwp-specific brand exit processing
+ * b_elfexec - Load and execute ELF binary
+ * b_sigset_native_to_brand - Convert sigset native->brand
+ * b_sigset_brand_to_native - Convert sigset brand->native
+ * b_nsig - Maximum signal number
+ * b_sendsig - Update process state after sendsig
+ *
+ * Optional Fields:
+ * b_lwpdata_alloc - Speculatively allocate data for use in b_initlwp
+ * b_lwpdata_free - Free data allocated by b_lwpdata_alloc if errors occur
+ * during lwp creation before b_initlwp could be called.
+ * b_initlwp_post - Complete lwp branding (can temporarily drop p->p_lock)
+ * b_exit_with_sig - Instead of sending SIGCLD, exit with custom behavior
+ * b_psig_to_proc - Custom additional behavior during psig
+ * b_wait_filter - Filter processes from being matched by waitid
+ * b_native_exec - Provide interpreter path prefix for executables
+ * b_ptrace_exectrap - Custom behavior for legacy ptrace traps
+ * b_map32limit - Specify alternate limit for MAP_32BIT mappings
+ * b_stop_notify - Hook process stop events
+ * b_waitid_helper - Generate synthetic results for waitid
+ * b_sigcld_repost - Post synthetic SIGCLD signals
+ * b_issig_stop - Alter/suppress signal delivery during issig
+ * b_sig_ignorable - Disallow discarding of signals
+ * b_savecontext - Alter context during savecontext
+ * b_restorecontext - Alter context during restorecontext
+ * b_sendsig_stack - Override stack used for signal delivery
+ * b_setid_clear - Override setid_clear behavior
+ * b_pagefault - Trap pagefault events
+ * b_intp_parse_arg - Controls interpreter argument handling (allow 1 or all)
+ * b_clearbrand - Perform any actions necessary when clearing the brand.
+ * b_rpc_statd - Upcall to rpc.statd running within the zone
+ * b_acct_out - Output properly formatted accounting record
+ */
struct brand_ops {
- void (*b_init_brand_data)(zone_t *);
+ void (*b_init_brand_data)(zone_t *, kmutex_t *);
void (*b_free_brand_data)(zone_t *);
int (*b_brandsys)(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
- uintptr_t, uintptr_t, uintptr_t);
+ uintptr_t);
void (*b_setbrand)(struct proc *);
int (*b_getattr)(zone_t *, int, void *, size_t *);
int (*b_setattr)(zone_t *, int, void *, size_t);
void (*b_copy_procdata)(struct proc *, struct proc *);
- void (*b_proc_exit)(struct proc *, klwp_t *);
+ void (*b_proc_exit)(struct proc *);
void (*b_exec)();
void (*b_lwp_setrval)(klwp_t *, int, int);
- int (*b_initlwp)(klwp_t *);
+ void *(*b_lwpdata_alloc)(struct proc *);
+ void (*b_lwpdata_free)(void *);
+ void (*b_initlwp)(klwp_t *, void *);
+ void (*b_initlwp_post)(klwp_t *);
void (*b_forklwp)(klwp_t *, klwp_t *);
void (*b_freelwp)(klwp_t *);
void (*b_lwpexit)(klwp_t *);
- int (*b_elfexec)(struct vnode *vp, struct execa *uap,
- struct uarg *args, struct intpdata *idata, int level,
- long *execsz, int setid, caddr_t exec_file,
- struct cred *cred, int brand_action);
+ int (*b_elfexec)(struct vnode *, struct execa *, struct uarg *,
+ struct intpdata *, int, size_t *, int, caddr_t, struct cred *,
+ int *);
void (*b_sigset_native_to_brand)(sigset_t *);
void (*b_sigset_brand_to_native)(sigset_t *);
+ void (*b_sigfd_translate)(k_siginfo_t *);
int b_nsig;
+ void (*b_exit_with_sig)(proc_t *, sigqueue_t *);
+ boolean_t (*b_wait_filter)(proc_t *, proc_t *);
+ boolean_t (*b_native_exec)(uint8_t, const char **);
+ uint32_t (*b_map32limit)(proc_t *);
+ void (*b_stop_notify)(proc_t *, klwp_t *, ushort_t, ushort_t);
+ int (*b_waitid_helper)(idtype_t, id_t, k_siginfo_t *, int,
+ boolean_t *, int *);
+ int (*b_sigcld_repost)(proc_t *, sigqueue_t *);
+ int (*b_issig_stop)(proc_t *, klwp_t *);
+ boolean_t (*b_sig_ignorable)(proc_t *, klwp_t *, int);
+ void (*b_savecontext)(ucontext_t *);
+#if defined(_SYSCALL32_IMPL)
+ void (*b_savecontext32)(ucontext32_t *);
+#endif
+ void (*b_restorecontext)(ucontext_t *);
+ caddr_t (*b_sendsig_stack)(int);
+ void (*b_sendsig)(int);
+ int (*b_setid_clear)(vattr_t *vap, cred_t *cr);
+ int (*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type,
+ enum seg_rw);
+ boolean_t b_intp_parse_arg;
+ void (*b_clearbrand)(proc_t *, boolean_t);
+ void (*b_rpc_statd)(int, void *, void *);
+ void (*b_acct_out)(struct vnode *, int);
};
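
A hypothetical sketch of how a brand module might wire up this structure,
assuming BRAND_VER_1 and designated initializers; all demo_* names are
illustrative and not part of this change:

/*
 * Only a few required hooks shown; optional hooks may be left NULL.
 */
extern void demo_init_brand_data(zone_t *, kmutex_t *);
extern void demo_free_brand_data(zone_t *);
extern int demo_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
    uintptr_t);
extern struct brand_mach_ops demo_mach_ops;

typedef struct demo_zone_data {
	int dzd_flags;		/* hypothetical per-zone brand state */
} demo_zone_data_t;

static struct brand_ops demo_bops = {
	.b_init_brand_data = demo_init_brand_data,
	.b_free_brand_data = demo_free_brand_data,
	.b_brandsys = demo_brandsys,
	.b_nsig = NSIG,
	/* remaining required hooks elided for brevity */
};

static brand_t demo_brand = {
	.b_version = BRAND_VER_1,
	.b_name = "demo",
	.b_ops = &demo_bops,
	.b_machops = &demo_mach_ops,
	.b_data_size = sizeof (demo_zone_data_t),
};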
/*
@@ -135,6 +213,7 @@ typedef struct brand {
char *b_name;
struct brand_ops *b_ops;
struct brand_mach_ops *b_machops;
+ size_t b_data_size;
} brand_t;
extern brand_t native_brand;
@@ -165,7 +244,7 @@ extern brand_t *brand_register_zone(struct brand_attr *);
extern brand_t *brand_find_name(char *);
extern void brand_unregister_zone(brand_t *);
extern int brand_zone_count(brand_t *);
-extern void brand_setbrand(proc_t *);
+extern int brand_setbrand(proc_t *, boolean_t);
extern void brand_clearbrand(proc_t *, boolean_t);
/*
@@ -178,17 +257,16 @@ extern int brand_solaris_cmd(int, uintptr_t, uintptr_t, uintptr_t,
extern void brand_solaris_copy_procdata(proc_t *, proc_t *,
struct brand *);
extern int brand_solaris_elfexec(vnode_t *, execa_t *, uarg_t *,
- intpdata_t *, int, long *, int, caddr_t, cred_t *, int,
- struct brand *, char *, char *, char *, char *, char *);
+ intpdata_t *, int, size_t *, int, caddr_t, cred_t *, int *,
+ struct brand *, char *, char *, char *);
extern void brand_solaris_exec(struct brand *);
extern int brand_solaris_fini(char **, struct modlinkage *,
struct brand *);
extern void brand_solaris_forklwp(klwp_t *, klwp_t *, struct brand *);
extern void brand_solaris_freelwp(klwp_t *, struct brand *);
-extern int brand_solaris_initlwp(klwp_t *, struct brand *);
+extern void brand_solaris_initlwp(klwp_t *, struct brand *);
extern void brand_solaris_lwpexit(klwp_t *, struct brand *);
-extern void brand_solaris_proc_exit(struct proc *, klwp_t *,
- struct brand *);
+extern void brand_solaris_proc_exit(struct proc *, struct brand *);
extern void brand_solaris_setbrand(proc_t *, struct brand *);
#if defined(_SYSCALL32)
diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h
index e20e0e0c35..b6b5c20e44 100644
--- a/usr/src/uts/common/sys/buf.h
+++ b/usr/src/uts/common/sys/buf.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*
* Copyright 2017 RackTop Systems.
*/
@@ -188,6 +189,7 @@ struct biostats {
#define B_STARTED 0x2000000 /* io:::start probe called for buf */
#define B_ABRWRITE 0x4000000 /* Application based recovery active */
#define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */
+#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */
/*
* There is some confusion over the meaning of B_FREE and B_INVAL and what
@@ -200,6 +202,12 @@ struct biostats {
* between the sole use of these two flags. In both cases, IO will be done
* if the page is not yet committed to storage.
*
+ * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is
+ * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no
+ * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then
+ * the mapping for the page is only invalidated for the current process.
+ * In this case, the page is not destroyed unless this was the final mapping.
+ *
* In order to discard pages without writing them back, (B_INVAL | B_TRUNC)
* should be used.
*
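
A minimal sketch of the intended flag combination (illustrative only; the
wrapper function is hypothetical):

/*
 * Invalidate only the current process's mappings of the pages
 * backing [off, off + len); pages shared with other processes
 * survive unless this was the final mapping.
 */
static void
example_invalidate_private_view(vnode_t *vp, offset_t off, size_t len,
    cred_t *cr)
{
	(void) VOP_PUTPAGE(vp, off, len, B_INVAL | B_INVALCURONLY, cr, NULL);
}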
diff --git a/usr/src/uts/common/sys/contract/process.h b/usr/src/uts/common/sys/contract/process.h
index 21cf94dcf9..2c70d7c9f1 100644
--- a/usr/src/uts/common/sys/contract/process.h
+++ b/usr/src/uts/common/sys/contract/process.h
@@ -21,13 +21,12 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_CONTRACT_PROCESS_H
#define _SYS_CONTRACT_PROCESS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/contract.h>
#include <sys/time.h>
@@ -55,7 +54,8 @@ typedef struct cont_process cont_process_t;
#define CT_PR_NOORPHAN 0x2 /* kill when contract is abandoned */
#define CT_PR_PGRPONLY 0x4 /* only kill process group on fatal errors */
#define CT_PR_REGENT 0x8 /* automatically detach inherited contracts */
-#define CT_PR_ALLPARAM 0xf
+#define CT_PR_KEEP_EXEC 0x10 /* preserve template across exec */
+#define CT_PR_ALLPARAM 0x1f
/*
* ctr_ev_* flags
diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h
index 6063ff4380..6bc042108c 100644
--- a/usr/src/uts/common/sys/cpucaps.h
+++ b/usr/src/uts/common/sys/cpucaps.h
@@ -22,6 +22,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011, 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_CPUCAPS_H
@@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *);
*/
extern int cpucaps_project_set(kproject_t *, rctl_qty_t);
extern int cpucaps_zone_set(zone_t *, rctl_qty_t);
+extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t);
+extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t);
/*
* Get current CPU usage for a project/zone.
*/
extern rctl_qty_t cpucaps_project_get(kproject_t *);
extern rctl_qty_t cpucaps_zone_get(zone_t *);
+extern rctl_qty_t cpucaps_zone_get_base(zone_t *);
+extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *);
/*
* Scheduling class hooks into CPU caps framework.
diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h
index 95afd21827..2cd4ed644d 100644
--- a/usr/src/uts/common/sys/cpucaps_impl.h
+++ b/usr/src/uts/common/sys/cpucaps_impl.h
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011, 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_CPUCAPS_IMPL_H
@@ -66,8 +67,12 @@ typedef struct cpucap {
waitq_t cap_waitq; /* waitq for capped threads */
kstat_t *cap_kstat; /* cpucaps specific kstat */
int64_t cap_gen; /* zone cap specific */
+ hrtime_t cap_chk_value; /* effective CPU usage cap */
hrtime_t cap_value; /* scaled CPU usage cap */
hrtime_t cap_usage; /* current CPU usage */
+ hrtime_t cap_base; /* base CPU for burst */
+ u_longlong_t cap_burst_limit; /* max secs (in ticks) for a burst */
+ u_longlong_t cap_bursting; /* # of ticks currently bursting */
disp_lock_t cap_usagelock; /* protects cap_usage above */
/*
* Per cap statistics.
@@ -75,6 +80,7 @@ typedef struct cpucap {
hrtime_t cap_maxusage; /* maximum cap usage */
u_longlong_t cap_below; /* # of ticks spend below the cap */
u_longlong_t cap_above; /* # of ticks spend above the cap */
+ u_longlong_t cap_above_base; /* # of ticks spent above the base */
} cpucap_t;
/*
diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h
index 8565ca053e..7ac2fafe2f 100644
--- a/usr/src/uts/common/sys/cpuvar.h
+++ b/usr/src/uts/common/sys/cpuvar.h
@@ -23,6 +23,7 @@
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2017 RackTop Systems.
*/
@@ -98,11 +99,11 @@ typedef struct cpu {
/*
* Links to other CPUs. It is safe to walk these lists if
* one of the following is true:
- * - cpu_lock held
- * - preemption disabled via kpreempt_disable
- * - PIL >= DISP_LEVEL
- * - acting thread is an interrupt thread
- * - all other CPUs are paused
+ * - cpu_lock held
+ * - preemption disabled via kpreempt_disable
+ * - PIL >= DISP_LEVEL
+ * - acting thread is an interrupt thread
+ * - all other CPUs are paused
*/
struct cpu *cpu_next; /* next existing CPU */
struct cpu *cpu_prev; /* prev existing CPU */
@@ -130,7 +131,7 @@ typedef struct cpu {
*/
char cpu_runrun; /* scheduling flag - set to preempt */
char cpu_kprunrun; /* force kernel preemption */
- pri_t cpu_chosen_level; /* priority at which cpu */
+ pri_t cpu_chosen_level; /* priority at which cpu */
/* was chosen for scheduling */
kthread_t *cpu_dispthread; /* thread selected for dispatch */
disp_lock_t cpu_thread_lock; /* dispatcher lock on current thread */
@@ -286,7 +287,7 @@ extern cpu_core_t cpu_core[];
* list in avintr.c.
*/
#define INTR_ACTIVE(cpup, level) \
- ((level) <= LOCK_LEVEL ? \
+ ((level) <= LOCK_LEVEL ? \
((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup)))
/*
@@ -389,7 +390,6 @@ extern cpu_core_t cpu_core[];
#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */
/* Note: inside ifdef: _KERNEL || _KMEMUSER || _BOOT */
-#if defined(_MACHDEP)
/*
* Macros for manipulating sets of CPUs as a bitmap. Note that this
@@ -405,34 +405,60 @@ extern cpu_core_t cpu_core[];
#define CPUSET_WORDS BT_BITOUL(NCPU)
#define CPUSET_NOTINSET ((uint_t)-1)
-#if CPUSET_WORDS > 1
-
-typedef struct cpuset {
+#if defined(_MACHDEP)
+struct cpuset {
ulong_t cpub[CPUSET_WORDS];
-} cpuset_t;
+};
+#else
+struct cpuset;
+#endif
+
+typedef struct cpuset cpuset_t;
+
+extern cpuset_t *cpuset_alloc(int);
+extern void cpuset_free(cpuset_t *);
+
+/*
+ * Functions for manipulating cpusets. These were previously considered
+ * private when some cpuset_t handling was performed in the CPUSET_* macros.
+ * They are now acceptable to use in non-_MACHDEP code.
+ */
+extern void cpuset_all(cpuset_t *);
+extern void cpuset_all_but(cpuset_t *, const uint_t);
+extern int cpuset_isnull(const cpuset_t *);
+extern int cpuset_isequal(const cpuset_t *, const cpuset_t *);
+extern void cpuset_only(cpuset_t *, const uint_t);
+extern long cpu_in_set(const cpuset_t *, const uint_t);
+extern void cpuset_add(cpuset_t *, const uint_t);
+extern void cpuset_del(cpuset_t *, const uint_t);
+extern uint_t cpuset_find(const cpuset_t *);
+extern void cpuset_bounds(const cpuset_t *, uint_t *, uint_t *);
+extern void cpuset_atomic_del(cpuset_t *, const uint_t);
+extern void cpuset_atomic_add(cpuset_t *, const uint_t);
+extern long cpuset_atomic_xadd(cpuset_t *, const uint_t);
+extern long cpuset_atomic_xdel(cpuset_t *, const uint_t);
+extern void cpuset_or(cpuset_t *, cpuset_t *);
+extern void cpuset_xor(cpuset_t *, cpuset_t *);
+extern void cpuset_and(cpuset_t *, cpuset_t *);
+extern void cpuset_zero(cpuset_t *);
+
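
A minimal sketch using the now-public cpuset API (assumes the int argument
to cpuset_alloc() is a KM_* allocation flag; the helper is hypothetical):

static void
example_cpuset_usage(void)
{
	cpuset_t *set = cpuset_alloc(KM_SLEEP);

	cpuset_zero(set);
	cpuset_add(set, CPU->cpu_id);
	ASSERT(cpu_in_set(set, CPU->cpu_id));
	cpuset_del(set, CPU->cpu_id);
	cpuset_free(set);
}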
+
+#if defined(_MACHDEP)
/*
- * Private functions for manipulating cpusets that do not fit in a
- * single word. These should not be used directly; instead the
- * CPUSET_* macros should be used so the code will be portable
- * across different definitions of NCPU.
+ * Prior to the cpuset_t restructuring, the CPUSET_* macros contained
+ * significant logic, rather than directly invoking the backend functions.
+ * They are maintained here so that existing _MACHDEP code can use them.
*/
-extern void cpuset_all(cpuset_t *);
-extern void cpuset_all_but(cpuset_t *, uint_t);
-extern int cpuset_isnull(cpuset_t *);
-extern int cpuset_cmp(cpuset_t *, cpuset_t *);
-extern void cpuset_only(cpuset_t *, uint_t);
-extern uint_t cpuset_find(cpuset_t *);
-extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *);
#define CPUSET_ALL(set) cpuset_all(&(set))
#define CPUSET_ALL_BUT(set, cpu) cpuset_all_but(&(set), cpu)
#define CPUSET_ONLY(set, cpu) cpuset_only(&(set), cpu)
-#define CPU_IN_SET(set, cpu) BT_TEST((set).cpub, cpu)
-#define CPUSET_ADD(set, cpu) BT_SET((set).cpub, cpu)
-#define CPUSET_DEL(set, cpu) BT_CLEAR((set).cpub, cpu)
+#define CPU_IN_SET(set, cpu) cpu_in_set(&(set), cpu)
+#define CPUSET_ADD(set, cpu) cpuset_add(&(set), cpu)
+#define CPUSET_DEL(set, cpu) cpuset_del(&(set), cpu)
#define CPUSET_ISNULL(set) cpuset_isnull(&(set))
-#define CPUSET_ISEQUAL(set1, set2) cpuset_cmp(&(set1), &(set2))
+#define CPUSET_ISEQUAL(set1, set2) cpuset_isequal(&(set1), &(set2))
/*
* Find one CPU in the cpuset.
@@ -460,86 +486,24 @@ extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *);
* deleting a cpu that's not in the cpuset)
*/
-#define CPUSET_ATOMIC_DEL(set, cpu) BT_ATOMIC_CLEAR((set).cpub, (cpu))
-#define CPUSET_ATOMIC_ADD(set, cpu) BT_ATOMIC_SET((set).cpub, (cpu))
-
-#define CPUSET_ATOMIC_XADD(set, cpu, result) \
- BT_ATOMIC_SET_EXCL((set).cpub, cpu, result)
-
-#define CPUSET_ATOMIC_XDEL(set, cpu, result) \
- BT_ATOMIC_CLEAR_EXCL((set).cpub, cpu, result)
-
-
-#define CPUSET_OR(set1, set2) { \
- int _i; \
- for (_i = 0; _i < CPUSET_WORDS; _i++) \
- (set1).cpub[_i] |= (set2).cpub[_i]; \
-}
-
-#define CPUSET_XOR(set1, set2) { \
- int _i; \
- for (_i = 0; _i < CPUSET_WORDS; _i++) \
- (set1).cpub[_i] ^= (set2).cpub[_i]; \
-}
-
-#define CPUSET_AND(set1, set2) { \
- int _i; \
- for (_i = 0; _i < CPUSET_WORDS; _i++) \
- (set1).cpub[_i] &= (set2).cpub[_i]; \
-}
-
-#define CPUSET_ZERO(set) { \
- int _i; \
- for (_i = 0; _i < CPUSET_WORDS; _i++) \
- (set).cpub[_i] = 0; \
-}
-
-#elif CPUSET_WORDS == 1
-
-typedef ulong_t cpuset_t; /* a set of CPUs */
-
-#define CPUSET(cpu) (1UL << (cpu))
-
-#define CPUSET_ALL(set) ((void)((set) = ~0UL))
-#define CPUSET_ALL_BUT(set, cpu) ((void)((set) = ~CPUSET(cpu)))
-#define CPUSET_ONLY(set, cpu) ((void)((set) = CPUSET(cpu)))
-#define CPU_IN_SET(set, cpu) ((set) & CPUSET(cpu))
-#define CPUSET_ADD(set, cpu) ((void)((set) |= CPUSET(cpu)))
-#define CPUSET_DEL(set, cpu) ((void)((set) &= ~CPUSET(cpu)))
-#define CPUSET_ISNULL(set) ((set) == 0)
-#define CPUSET_ISEQUAL(set1, set2) ((set1) == (set2))
-#define CPUSET_OR(set1, set2) ((void)((set1) |= (set2)))
-#define CPUSET_XOR(set1, set2) ((void)((set1) ^= (set2)))
-#define CPUSET_AND(set1, set2) ((void)((set1) &= (set2)))
-#define CPUSET_ZERO(set) ((void)((set) = 0))
-
-#define CPUSET_FIND(set, cpu) { \
- cpu = (uint_t)(lowbit(set) - 1); \
-}
-
-#define CPUSET_BOUNDS(set, smallest, largest) { \
- smallest = (uint_t)(lowbit(set) - 1); \
- largest = (uint_t)(highbit(set) - 1); \
-}
+#define CPUSET_ATOMIC_DEL(set, cpu) cpuset_atomic_del(&(set), cpu)
+#define CPUSET_ATOMIC_ADD(set, cpu) cpuset_atomic_add(&(set), cpu)
-#define CPUSET_ATOMIC_DEL(set, cpu) atomic_and_ulong(&(set), ~CPUSET(cpu))
-#define CPUSET_ATOMIC_ADD(set, cpu) atomic_or_ulong(&(set), CPUSET(cpu))
+#define CPUSET_ATOMIC_XADD(set, cpu, result) \
+ (result) = cpuset_atomic_xadd(&(set), cpu)
-#define CPUSET_ATOMIC_XADD(set, cpu, result) \
- { result = atomic_set_long_excl(&(set), (cpu)); }
+#define CPUSET_ATOMIC_XDEL(set, cpu, result) \
+ (result) = cpuset_atomic_xdel(&(set), cpu)
-#define CPUSET_ATOMIC_XDEL(set, cpu, result) \
- { result = atomic_clear_long_excl(&(set), (cpu)); }
+#define CPUSET_OR(set1, set2) cpuset_or(&(set1), &(set2))
-#else /* CPUSET_WORDS <= 0 */
+#define CPUSET_XOR(set1, set2) cpuset_xor(&(set1), &(set2))
-#error NCPU is undefined or invalid
+#define CPUSET_AND(set1, set2) cpuset_and(&(set1), &(set2))
-#endif /* CPUSET_WORDS */
-
-extern cpuset_t cpu_seqid_inuse;
+#define CPUSET_ZERO(set) cpuset_zero(&(set))
-#endif /* _MACHDEP */
+#endif /* _MACHDEP */
#endif /* _KERNEL || _KMEMUSER || _BOOT */
#define CPU_CPR_OFFLINE 0x0
@@ -550,10 +514,14 @@ extern cpuset_t cpu_seqid_inuse;
#if defined(_KERNEL) || defined(_KMEMUSER)
+extern cpuset_t cpu_seqid_inuse;
+
extern struct cpu *cpu[]; /* indexed by CPU number */
extern struct cpu **cpu_seq; /* indexed by sequential CPU id */
extern cpu_t *cpu_list; /* list of CPUs */
extern cpu_t *cpu_active; /* list of active CPUs */
+extern cpuset_t cpu_active_set; /* cached set of active CPUs */
+extern cpuset_t cpu_available; /* cached set of available CPUs */
extern int ncpus; /* number of CPUs present */
extern int ncpus_online; /* number of CPUs not quiesced */
extern int max_ncpus; /* max present before ncpus is known */
@@ -572,13 +540,19 @@ extern struct cpu *curcpup(void);
#endif
/*
- * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id
- * as the target and to grab cpu_lock instead of requiring the caller
- * to grab it.
+ * CPU_CURRENT indicates to thread_affinity_set() to use whatever curthread's
+ * current CPU is; holding cpu_lock is not required.
*/
#define CPU_CURRENT -3
/*
+ * CPU_BEST can be used by thread_affinity_set() callers to set affinity to a
+ * good CPU (in particular, an ht_acquire()-friendly choice); holding cpu_lock
+ * is not required.
+ */
+#define CPU_BEST -4
+
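
A minimal sketch of a CPU_BEST caller, per the comment above (the wrapper
function is hypothetical):

static void
example_best_cpu_section(void)
{
	/* No cpu_lock needed for CPU_BEST. */
	thread_affinity_set(curthread, CPU_BEST);
	/* ... work that benefits from an uncontended CPU ... */
	thread_affinity_clear(curthread);
}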
+/*
* Per-CPU statistics
*
* cpu_stats_t contains numerous system and VM-related statistics, in the form
@@ -613,7 +587,7 @@ extern struct cpu *curcpup(void);
*/
#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++)
-#endif /* _KERNEL || _KMEMUSER */
+#endif /* defined(_KERNEL) || defined(_KMEMUSER) */
/*
* CPU support routines (not for genassym.c)
diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h
index fb79dfecde..1f938132e0 100644
--- a/usr/src/uts/common/sys/cred.h
+++ b/usr/src/uts/common/sys/cred.h
@@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *);
extern gid_t crgetrgid(const cred_t *);
extern gid_t crgetsgid(const cred_t *);
extern zoneid_t crgetzoneid(const cred_t *);
+extern zoneid_t crgetzonedid(const cred_t *);
extern projid_t crgetprojid(const cred_t *);
extern cred_t *crgetmapped(const cred_t *);
diff --git a/usr/src/uts/common/sys/cyclic.h b/usr/src/uts/common/sys/cyclic.h
index 5f28543f9f..270a09449f 100644
--- a/usr/src/uts/common/sys/cyclic.h
+++ b/usr/src/uts/common/sys/cyclic.h
@@ -23,6 +23,7 @@
* Use is subject to license terms.
*
* Copyright 2017 RackTop Systems.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_CYCLIC_H
@@ -81,6 +82,7 @@ extern cyclic_id_t cyclic_add_omni(cyc_omni_handler_t *);
extern void cyclic_remove(cyclic_id_t);
extern void cyclic_bind(cyclic_id_t, cpu_t *, cpupart_t *);
extern int cyclic_reprogram(cyclic_id_t, hrtime_t);
+extern void cyclic_move_here(cyclic_id_t);
extern hrtime_t cyclic_getres();
extern int cyclic_offline(cpu_t *cpu);
diff --git a/usr/src/uts/common/sys/disp.h b/usr/src/uts/common/sys/disp.h
index b324f4d323..cb3711edcd 100644
--- a/usr/src/uts/common/sys/disp.h
+++ b/usr/src/uts/common/sys/disp.h
@@ -23,6 +23,8 @@
* Use is subject to license terms.
*
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -63,11 +65,11 @@ typedef struct _disp {
/*
* Priorities:
* disp_maxrunpri is the maximum run priority of runnable threads
- * on this queue. It is -1 if nothing is runnable.
+ * on this queue. It is -1 if nothing is runnable.
*
* disp_max_unbound_pri is the maximum run priority of threads on
* this dispatch queue but runnable by any CPU. This may be left
- * artificially high, then corrected when some CPU tries to take
+ * artificially high, then corrected when some CPU tries to take
* an unbound thread. It is -1 if nothing is runnable.
*/
pri_t disp_maxrunpri; /* maximum run priority */
@@ -151,8 +153,7 @@ extern void dq_srundec(kthread_t *);
extern void cpu_rechoose(kthread_t *);
extern void cpu_surrender(kthread_t *);
extern void kpreempt(int);
-extern struct cpu *disp_lowpri_cpu(struct cpu *, struct lgrp_ld *, pri_t,
- struct cpu *);
+extern struct cpu *disp_lowpri_cpu(struct cpu *, kthread_t *, pri_t);
extern int disp_bound_threads(struct cpu *, int);
extern int disp_bound_anythreads(struct cpu *, int);
extern int disp_bound_partition(struct cpu *, int);
@@ -167,6 +168,8 @@ extern void resume_from_zombie(kthread_t *)
extern void disp_swapped_enq(kthread_t *);
extern int disp_anywork(void);
+extern struct cpu *disp_choose_best_cpu(void);
+
#define KPREEMPT_SYNC (-1)
#define kpreempt_disable() \
{ \
@@ -183,6 +186,8 @@ extern int disp_anywork(void);
#endif /* _KERNEL */
+#define CPU_IDLE_PRI (-1)
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h
index f5c990e7c0..2178ad1f0d 100644
--- a/usr/src/uts/common/sys/dktp/dadk.h
+++ b/usr/src/uts/common/sys/dktp/dadk.h
@@ -65,6 +65,8 @@ struct dadk {
kstat_t *dad_errstats; /* error stats */
kmutex_t dad_cmd_mutex;
int dad_cmd_count;
+ uint32_t dad_err_cnt; /* number of recent errors */
+ hrtime_t dad_last_log; /* time of last error log */
};
#define DAD_SECSIZ dad_phyg.g_secsiz
diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h
index 6449f39a35..5be223ce93 100644
--- a/usr/src/uts/common/sys/dld.h
+++ b/usr/src/uts/common/sys/dld.h
@@ -192,6 +192,7 @@ typedef struct dld_ioc_rename {
datalink_id_t dir_linkid1;
datalink_id_t dir_linkid2;
char dir_link[MAXLINKNAMELEN];
+ boolean_t dir_zoneinit;
} dld_ioc_rename_t;
/*
@@ -204,6 +205,7 @@ typedef struct dld_ioc_rename {
typedef struct dld_ioc_zid {
zoneid_t diz_zid;
datalink_id_t diz_linkid;
+ boolean_t diz_transient;
} dld_ioc_zid_t;
/*
@@ -356,6 +358,7 @@ typedef struct dld_ioc_led {
#define DLD_CAPAB_POLL 0x00000002
#define DLD_CAPAB_PERIM 0x00000003
#define DLD_CAPAB_LSO 0x00000004
+#define DLD_CAPAB_IPCHECK 0x00000005
#define DLD_ENABLE 0x00000001
#define DLD_DISABLE 0x00000002
@@ -382,6 +385,7 @@ typedef struct dld_ioc_led {
*/
typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t);
+#define DI_DIRECT_RAW 0x1
/*
* Direct Tx/Rx capability.
*/
@@ -406,8 +410,16 @@ typedef struct dld_capab_direct_s {
/* flow control "can I put on a ring" callback */
uintptr_t di_tx_fctl_df; /* canput-like callback */
void *di_tx_fctl_dh;
+
+ /* flags that control our behavior */
+ uint_t di_flags;
} dld_capab_direct_t;
+typedef struct dld_capab_ipcheck_s {
+ uintptr_t ipc_allowed_df;
+ void *ipc_allowed_dh;
+} dld_capab_ipcheck_t;
+
/*
* Polling/softring capability.
*/
diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h
index 035eea893a..336fa9cb67 100644
--- a/usr/src/uts/common/sys/dld_impl.h
+++ b/usr/src/uts/common/sys/dld_impl.h
@@ -53,7 +53,8 @@ typedef enum {
typedef enum {
DLD_UNINITIALIZED,
DLD_PASSIVE,
- DLD_ACTIVE
+ DLD_ACTIVE,
+ DLD_EXCLUSIVE
} dld_passivestate_t;
/*
@@ -256,6 +257,8 @@ extern void dld_str_rx_unitdata(void *, mac_resource_handle_t,
extern void dld_str_notify_ind(dld_str_t *);
extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *,
uintptr_t, uint16_t);
+extern mac_tx_cookie_t str_mdata_raw_fastpath_put(dld_str_t *, mblk_t *,
+ uintptr_t, uint16_t);
extern int dld_flow_ctl_callb(dld_str_t *, uint64_t,
int (*func)(), void *);
diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h
index 2f519a8eda..093a4dc0c3 100644
--- a/usr/src/uts/common/sys/dld_ioc.h
+++ b/usr/src/uts/common/sys/dld_ioc.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_DLD_IOC_H
@@ -59,6 +60,7 @@ extern "C" {
#define IPTUN_IOC 0x454A
#define BRIDGE_IOC 0xB81D
#define IBPART_IOC 0x6171
+#define OVERLAY_IOC 0x2005
/* GLDv3 modules use these macros to generate unique ioctl commands */
#define DLDIOC(cmdid) DLD_IOC_CMD(DLD_IOC, (cmdid))
@@ -68,6 +70,7 @@ extern "C" {
#define IPTUNIOC(cmdid) DLD_IOC_CMD(IPTUN_IOC, (cmdid))
#define BRIDGEIOC(cmdid) DLD_IOC_CMD(BRIDGE_IOC, (cmdid))
#define IBPARTIOC(cmdid) DLD_IOC_CMD(IBPART_IOC, (cmdid))
+#define OVERLAYIOC(cmdid) DLD_IOC_CMD(OVERLAY_IOC, (cmdid))
#ifdef _KERNEL
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index 5bc2bd41c5..d76daffeb7 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -107,6 +108,7 @@ typedef struct dl_ipnetinfo {
#define DL_PASSIVE_REQ 0x114 /* Allow access to aggregated link */
#define DL_INTR_MODE_REQ 0x115 /* Request Rx processing in INTR mode */
#define DL_NOTIFY_CONF 0x116 /* Notification from upstream */
+#define DL_EXCLUSIVE_REQ 0x117 /* Make bind active */
/*
* Primitives used for Connectionless Service
@@ -388,6 +390,8 @@ typedef struct dl_ipnetinfo {
#define DL_PROMISC_PHYS 0x01 /* promiscuous mode at phys level */
#define DL_PROMISC_SAP 0x02 /* promiscuous mode at sap level */
#define DL_PROMISC_MULTI 0x03 /* promiscuous mode for multicast */
+#define DL_PROMISC_RX_ONLY 0x04 /* above only enabled for rx */
+#define DL_PROMISC_FIXUPS 0x05 /* above will be fixed up */
/*
* DLPI notification codes for DL_NOTIFY_REQ primitives.
@@ -673,11 +677,11 @@ typedef struct {
#define HCKSUM_ENABLE 0x01 /* Set to enable hardware checksum */
/* capability */
#define HCKSUM_INET_PARTIAL 0x02 /* Partial 1's complement checksum */
- /* ability */
+ /* ability for TCP/UDP packets. */
#define HCKSUM_INET_FULL_V4 0x04 /* Full 1's complement checksum */
- /* ability for IPv4 packets. */
+ /* ability for IPv4 TCP/UDP packets. */
#define HCKSUM_INET_FULL_V6 0x08 /* Full 1's complement checksum */
- /* ability for IPv6 packets. */
+ /* ability for IPv6 TCP/UDP packets. */
#define HCKSUM_IPHDRCKSUM 0x10 /* IPv4 Header checksum offload */
/* capability */
#ifdef _KERNEL
@@ -1107,6 +1111,13 @@ typedef struct {
} dl_intr_mode_req_t;
/*
+ * DL_EXCLUSIVE_REQ, M_PROTO type
+ */
+typedef struct {
+ t_uscalar_t dl_primitive;
+} dl_exclusive_req_t;
+
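
A minimal sketch of a DLPI consumer issuing the new primitive, assuming
mexchange() from <sys/strsun.h> and a write queue wq supplied by the caller:

static void
example_send_exclusive_req(queue_t *wq)
{
	mblk_t *mp;

	/* Allocate an M_PROTO message with dl_primitive filled in. */
	mp = mexchange(NULL, NULL, DL_EXCLUSIVE_REQ_SIZE, M_PROTO,
	    DL_EXCLUSIVE_REQ);
	if (mp != NULL)
		putnext(wq, mp);
}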
+/*
* CONNECTION-ORIENTED SERVICE PRIMITIVES
*/
@@ -1528,6 +1539,7 @@ union DL_primitives {
dl_control_ack_t control_ack;
dl_passive_req_t passive_req;
dl_intr_mode_req_t intr_mode_req;
+ dl_exclusive_req_t exclusive_req;
};
#define DL_INFO_REQ_SIZE sizeof (dl_info_req_t)
@@ -1596,6 +1608,7 @@ union DL_primitives {
#define DL_CONTROL_ACK_SIZE sizeof (dl_control_ack_t)
#define DL_PASSIVE_REQ_SIZE sizeof (dl_passive_req_t)
#define DL_INTR_MODE_REQ_SIZE sizeof (dl_intr_mode_req_t)
+#define DL_EXCLUSIVE_REQ_SIZE sizeof (dl_exclusive_req_t)
#ifdef _KERNEL
/*
diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h
index 6bd2bbe35a..81f9e2abac 100644
--- a/usr/src/uts/common/sys/dls.h
+++ b/usr/src/uts/common/sys/dls.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_DLS_H
@@ -85,6 +86,8 @@ typedef struct dls_link_s dls_link_t;
#define DLS_PROMISC_SAP 0x00000001
#define DLS_PROMISC_MULTI 0x00000002
#define DLS_PROMISC_PHYS 0x00000004
+#define DLS_PROMISC_RX_ONLY 0x00000008
+#define DLS_PROMISC_FIXUPS 0x00000010
extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *);
extern void dls_close(dld_str_t *);
@@ -106,11 +109,13 @@ extern void str_notify(void *, mac_notify_type_t);
extern int dls_devnet_open(const char *,
dls_dl_handle_t *, dev_t *);
+extern int dls_devnet_open_in_zone(const char *,
+ dls_dl_handle_t *, dev_t *, zoneid_t);
extern void dls_devnet_close(dls_dl_handle_t);
extern boolean_t dls_devnet_rebuild();
extern int dls_devnet_rename(datalink_id_t, datalink_id_t,
- const char *);
+ const char *, boolean_t);
extern int dls_devnet_create(mac_handle_t, datalink_id_t,
zoneid_t);
extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *,
@@ -122,12 +127,13 @@ extern int dls_devnet_hold_by_dev(dev_t, dls_dl_handle_t *);
extern void dls_devnet_rele(dls_dl_handle_t);
extern void dls_devnet_prop_task_wait(dls_dl_handle_t);
+extern const char *dls_devnet_link(dls_dl_handle_t);
extern const char *dls_devnet_mac(dls_dl_handle_t);
extern uint16_t dls_devnet_vid(dls_dl_handle_t);
extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t);
extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *);
extern int dls_devnet_phydev(datalink_id_t, dev_t *);
-extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t);
+extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t);
extern zoneid_t dls_devnet_getzid(dls_dl_handle_t);
extern zoneid_t dls_devnet_getownerzid(dls_dl_handle_t);
extern boolean_t dls_devnet_islinkvisible(datalink_id_t, zoneid_t);
@@ -141,6 +147,8 @@ extern int dls_mgmt_update(const char *, uint32_t, boolean_t,
extern int dls_mgmt_get_linkinfo(datalink_id_t, char *,
datalink_class_t *, uint32_t *, uint32_t *);
extern int dls_mgmt_get_linkid(const char *, datalink_id_t *);
+extern int dls_mgmt_get_linkid_in_zone(const char *,
+ datalink_id_t *, zoneid_t);
extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t,
datalink_media_t, uint32_t);
extern int dls_devnet_macname2linkid(const char *,
diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h
index 60f51c47b5..329f8dd08e 100644
--- a/usr/src/uts/common/sys/dls_impl.h
+++ b/usr/src/uts/common/sys/dls_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_DLS_IMPL_H
@@ -46,11 +47,12 @@ typedef struct dls_multicst_addr_s {
} dls_multicst_addr_t;
struct dls_link_s { /* Protected by */
- char dl_name[MAXNAMELEN]; /* SL */
+ char dl_name[MAXNAMELEN]; /* RO */
uint_t dl_ddi_instance; /* SL */
mac_handle_t dl_mh; /* SL */
mac_client_handle_t dl_mch; /* SL */
mac_unicast_handle_t dl_mah; /* SL */
+ mac_notify_handle_t dl_mnh; /* SL */
const mac_info_t *dl_mip; /* SL */
uint_t dl_ref; /* SL */
mod_hash_t *dl_str_hash; /* SL, modhash lock */
@@ -61,6 +63,7 @@ struct dls_link_s { /* Protected by */
uint_t dl_zone_ref;
link_tagmode_t dl_tagmode; /* atomic */
uint_t dl_nonip_cnt; /* SL */
+ uint_t dl_exclusive; /* SL */
};
typedef struct dls_head_s {
@@ -96,13 +99,16 @@ extern void dls_create_str_kstats(dld_str_t *);
extern int dls_stat_update(kstat_t *, dls_link_t *, int);
extern int dls_stat_create(const char *, int, const char *,
zoneid_t, int (*)(struct kstat *, int), void *,
- kstat_t **);
+ kstat_t **, zoneid_t);
+extern void dls_stat_delete(kstat_t *);
extern int dls_devnet_open_by_dev(dev_t, dls_link_t **,
dls_dl_handle_t *);
extern int dls_devnet_hold_link(datalink_id_t, dls_dl_handle_t *,
dls_link_t **);
extern void dls_devnet_rele_link(dls_dl_handle_t, dls_link_t *);
+extern int dls_devnet_hold_tmp_by_link(dls_link_t *,
+ dls_dl_handle_t *);
extern void dls_init(void);
extern int dls_fini(void);
@@ -126,6 +132,7 @@ extern void dls_mgmt_init(void);
extern void dls_mgmt_fini(void);
extern int dls_mgmt_get_phydev(datalink_id_t, dev_t *);
+extern int dls_exclusive_set(dld_str_t *, boolean_t);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h
index b4032c24d6..6fec277991 100644
--- a/usr/src/uts/common/sys/dls_mgmt.h
+++ b/usr/src/uts/common/sys/dls_mgmt.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#ifndef _DLS_MGMT_H
@@ -46,13 +47,15 @@ typedef enum {
DATALINK_CLASS_SIMNET = 0x20,
DATALINK_CLASS_BRIDGE = 0x40,
DATALINK_CLASS_IPTUN = 0x80,
- DATALINK_CLASS_PART = 0x100
+ DATALINK_CLASS_PART = 0x100,
+ DATALINK_CLASS_OVERLAY = 0x200
} datalink_class_t;
#define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \
DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \
DATALINK_CLASS_ETHERSTUB | DATALINK_CLASS_SIMNET | \
- DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART)
+ DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART | \
+ DATALINK_CLASS_OVERLAY)
/*
* A combination of flags and media.
@@ -111,10 +114,14 @@ typedef uint64_t datalink_media_t;
#define DLMGMT_CMD_BASE 128
/*
- * Indicate the link mapping is active or persistent
+ * Indicate if the link mapping is active, persistent, or transient. A
+ * transient link is an active link with a twist -- it is an active
+ * link which is destroyed along with the zone rather than reassigned
+ * to the GZ.
*/
#define DLMGMT_ACTIVE 0x01
#define DLMGMT_PERSIST 0x02
+#define DLMGMT_TRANSIENT 0x04
/* upcall argument */
typedef struct dlmgmt_door_arg {
@@ -165,6 +172,7 @@ typedef struct dlmgmt_door_getname {
typedef struct dlmgmt_door_getlinkid {
int ld_cmd;
char ld_link[MAXLINKNAMELEN];
+ zoneid_t ld_zoneid;
} dlmgmt_door_getlinkid_t;
typedef struct dlmgmt_door_getnext_s {
@@ -225,6 +233,7 @@ typedef struct dlmgmt_getattr_retval_s {
char lr_attrval[MAXLINKATTRVALLEN];
} dlmgmt_getattr_retval_t;
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/elf.h b/usr/src/uts/common/sys/elf.h
index 4bd884e9c2..1a2ca397ef 100644
--- a/usr/src/uts/common/sys/elf.h
+++ b/usr/src/uts/common/sys/elf.h
@@ -500,6 +500,11 @@ typedef struct {
#define PT_GNU_STACK 0x6474e551 /* Indicates stack executability */
#define PT_GNU_RELRO 0x6474e552 /* Read-only after relocation */
+/*
+ * Linux specific program headers not even used by Linux (!!)
+ */
+#define PT_PAX_FLAGS 0x65041580 /* PaX flags (see below) */
+
#define PT_LOSUNW 0x6ffffffa
#define PT_SUNWBSS 0x6ffffffa /* Sun Specific segment (unused) */
#define PT_SUNWSTACK 0x6ffffffb /* describes the stack segment */
@@ -515,6 +520,45 @@ typedef struct {
#define PF_W 0x2
#define PF_X 0x1
+/*
+ * PaX is a regrettable series of never-integrated Linux patches for a
+ * facility to provide additional protections on memory pages for purposes of
+ * increasing security, and for allowing binaries to demand (or refuse) those
+ * protections via the PT_PAX_FLAGS program header. (Portents of its
+ * rudderless existence, "PaX" is a term of indefinite origin written by an
+ * unknown group of people.) This facility is unfortunate in any number of
+ * ways, and was largely obviated by the broad adoption of non-executable
+ * stacks at any rate -- but it lives on in binaries that continue to mark
+ * themselves to explicitly refuse the (never-integrated, now-obviated)
+ * facility. One might cringe that PaX overloads the meaning of the p_flags
+ * to specify protections, but that is the least of its transgressions:
+ * instead of using one p_type constant to explicitly enable a series of
+ * protections and another to explicitly disable others, it insists on
+ * conflating both actions into PT_PAX_FLAGS. The resulting doubling of
+ * constant definitions (two constant definitions for every protection instead
+ * of merely one) assures that the values can't even fit in the eight
+ * PF_MASKOS bits putatively defined to provide a modicum of cleanliness for
+ * such filthy functionality. And were all of this not enough, there is one
+ * final nomenclature insult to be added to this semantic injury: the
+ * constants for the p_flags don't even embed "_PAX_" in their name -- despite
+ * the fact that this is their only purpose! We resist the temptation to
+ * right this final wrong here; we grit our teeth and provide exactly the
+ * Linux definitions -- or rather, what would have been the Linux definitions
+ * had this belching jalopy ever been permitted to crash itself into mainline.
+ */
+#define PF_PAGEEXEC 0x00000010 /* PaX: enable PAGEEXEC */
+#define PF_NOPAGEEXEC 0x00000020 /* PaX: disable PAGEEXEC */
+#define PF_SEGMEXEC 0x00000040 /* PaX: enable SEGMEXEC */
+#define PF_NOSEGMEXEC 0x00000080 /* PaX: disable SEGMEXEC */
+#define PF_MPROTECT 0x00000100 /* PaX: enable MPROTECT */
+#define PF_NOMPROTECT 0x00000200 /* PaX: disable MPROTECT */
+#define PF_RANDEXEC 0x00000400 /* PaX: enable RANDEXEC */
+#define PF_NORANDEXEC 0x00000800 /* PaX: disable RANDEXEC */
+#define PF_EMUTRAMP 0x00001000 /* PaX: enable EMUTRAMP */
+#define PF_NOEMUTRAMP 0x00002000 /* PaX: disable EMUTRAMP */
+#define PF_RANDMMAP 0x00004000 /* PaX: enable RANDMMAP */
+#define PF_NORANDMMAP 0x00008000 /* PaX: disable RANDMMAP */
+
#define PF_MASKOS 0x0ff00000 /* OS specific values */
#define PF_MASKPROC 0xf0000000 /* processor specific values */
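Because PT_PAX_FLAGS conflates enable and disable actions in p_flags, a consumer must test the two directions separately. A hypothetical sketch of checking whether a binary explicitly refuses MPROTECT while walking its program headers; the function and parameter names are invented for illustration:

	#include <sys/types.h>
	#include <sys/elf.h>

	static boolean_t
	pax_refuses_mprotect(const Elf64_Ehdr *ehdr, const Elf64_Phdr *phdr)
	{
		uint_t i;

		for (i = 0; i < ehdr->e_phnum; i++) {
			if (phdr[i].p_type != PT_PAX_FLAGS)
				continue;
			/* enable and disable are distinct bits; test disable */
			return ((phdr[i].p_flags & PF_NOMPROTECT) != 0);
		}
		return (B_FALSE);
	}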
diff --git a/usr/src/uts/common/sys/eventfd.h b/usr/src/uts/common/sys/eventfd.h
index 1b0d961b0b..b64a101348 100644
--- a/usr/src/uts/common/sys/eventfd.h
+++ b/usr/src/uts/common/sys/eventfd.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright (c) 2015 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2017, Joyent, Inc.
*/
/*
@@ -47,6 +47,13 @@ typedef uint64_t eventfd_t;
#define EVENTFDIOC (('e' << 24) | ('f' << 16) | ('d' << 8))
#define EVENTFDIOC_SEMAPHORE (EVENTFDIOC | 1) /* toggle sem state */
+/*
+ * Kernel-internal method to write to eventfd while bypassing overflow limits,
+ * therefore avoiding potential to block as well. This is used to fulfill AIO
+ * behavior in LX related to eventfd notification.
+ */
+#define EVENTFDIOC_POST (EVENTFDIOC | 2)
+
#ifndef _KERNEL
extern int eventfd(unsigned int, int);
@@ -58,6 +65,7 @@ extern int eventfd_write(int, eventfd_t);
#define EVENTFDMNRN_EVENTFD 0
#define EVENTFDMNRN_CLONE 1
#define EVENTFD_VALMAX (ULLONG_MAX - 1ULL)
+#define EVENTFD_VALOVERFLOW ULLONG_MAX
#endif /* _KERNEL */
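For reference, the user-visible interface declared above follows Linux eventfd semantics. A minimal usage sketch in non-semaphore mode, assuming eventfd_read() exists in libc alongside the declared eventfd() and eventfd_write():

	#include <sys/eventfd.h>
	#include <stdio.h>

	int
	main(void)
	{
		eventfd_t val;
		int fd = eventfd(0, 0);		/* initial count 0, no flags */

		if (fd == -1)
			return (1);
		(void) eventfd_write(fd, 3);	/* add 3 to the counter */
		(void) eventfd_read(fd, &val);	/* reads 3, resets count to 0 */
		(void) printf("%llu\n", (unsigned long long)val);
		return (0);
	}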
diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h
index 8056f9a8e8..12115b7e27 100644
--- a/usr/src/uts/common/sys/exec.h
+++ b/usr/src/uts/common/sys/exec.h
@@ -26,6 +26,10 @@
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
#ifndef _SYS_EXEC_H
#define _SYS_EXEC_H
@@ -79,7 +83,7 @@ typedef struct uarg {
ssize_t arglen;
char *fname;
char *pathname;
- ssize_t auxsize;
+ size_t auxsize;
caddr_t stackend;
size_t stk_align;
size_t stk_size;
@@ -102,10 +106,13 @@ typedef struct uarg {
vnode_t *ex_vp;
char *emulator;
char *brandname;
+ const char *brand_nroot;
char *auxp_auxflags; /* addr of auxflags auxv on the user stack */
char *auxp_brand; /* address of first brand auxv on user stack */
cred_t *pfcred;
boolean_t scrubenv;
+ uintptr_t maxstack;
+ boolean_t stk_prot_override;
uintptr_t commpage;
} uarg_t;
@@ -175,8 +182,8 @@ struct execsw {
int exec_maglen;
int (*exec_func)(struct vnode *vp, struct execa *uap,
struct uarg *args, struct intpdata *idata, int level,
- long *execsz, int setid, caddr_t exec_file,
- struct cred *cred, int brand_action);
+ size_t *execsz, int setid, caddr_t exec_file,
+ struct cred *cred, int *brand_action);
int (*exec_core)(struct vnode *vp, struct proc *p,
struct cred *cred, rlim64_t rlimit, int sig,
core_content_t content);
@@ -213,8 +220,8 @@ extern int exece(const char *fname, const char **argp, const char **envp);
extern int exec_common(const char *fname, const char **argp,
const char **envp, int brand_action);
extern int gexec(vnode_t **vp, struct execa *uap, struct uarg *args,
- struct intpdata *idata, int level, long *execsz, caddr_t exec_file,
- struct cred *cred, int brand_action);
+ struct intpdata *idata, int level, size_t *execsz, caddr_t exec_file,
+ struct cred *cred, int *brand_action);
extern struct execsw *allocate_execsw(char *name, char *magic,
size_t magic_size);
extern struct execsw *findexecsw(char *magic);
@@ -239,26 +246,32 @@ extern void exec_set_sp(size_t);
* when compiling the 32-bit compatability elf code in the elfexec module.
*/
extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
- long *, int, caddr_t, cred_t *, int);
+ size_t *, int, caddr_t, cred_t *, int *);
extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *, Addr *,
- intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *);
+ intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *,
+ uintptr_t *, uintptr_t *);
+extern int elfreadhdr(vnode_t *, cred_t *, Ehdr *, uint_t *, caddr_t *,
+ size_t *);
#endif /* !_ELF32_COMPAT */
#if defined(_LP64)
extern int elf32exec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
- long *, int, caddr_t, cred_t *, int);
+ size_t *, int, caddr_t, cred_t *, int *);
extern int mapexec32_brand(vnode_t *, uarg_t *, Elf32_Ehdr *, Elf32_Addr *,
- intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *);
+ intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *,
+ uintptr_t *, uintptr_t *);
+extern int elf32readhdr(vnode_t *, cred_t *, Elf32_Ehdr *, uint_t *, caddr_t *,
+ size_t *);
#endif /* _LP64 */
/*
* Utility functions for exec module core routines:
*/
-extern int core_seg(proc_t *, vnode_t *, offset_t, caddr_t,
- size_t, rlim64_t, cred_t *);
+extern int core_seg(proc_t *, vnode_t *, u_offset_t, caddr_t, size_t,
+ rlim64_t, cred_t *);
-extern int core_write(vnode_t *, enum uio_seg, offset_t,
- const void *, size_t, rlim64_t, cred_t *);
+extern int core_write(vnode_t *, enum uio_seg, u_offset_t, const void *,
+ size_t, rlim64_t, cred_t *);
/* a.out stuff */
diff --git a/usr/src/uts/common/sys/file.h b/usr/src/uts/common/sys/file.h
index ec0741fe08..556a7ab2a1 100644
--- a/usr/src/uts/common/sys/file.h
+++ b/usr/src/uts/common/sys/file.h
@@ -27,13 +27,13 @@
/* All Rights Reserved */
/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
-/* Copyright 2015 Joyent, Inc. */
+/* Copyright 2017 Joyent, Inc. */
#ifndef _SYS_FILE_H
#define _SYS_FILE_H
#include <sys/t_lock.h>
-#ifdef _KERNEL
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
#include <sys/model.h>
#include <sys/user.h>
#endif
@@ -122,11 +122,6 @@ typedef struct fpollinfo {
#if defined(_KERNEL) || defined(_FAKE_KERNEL)
/*
- * This is a flag that is set on f_flag2, but is never user-visible
- */
-#define FEPOLLED 0x8000
-
-/*
* Fake flags for driver ioctl calls to inform them of the originating
* process' model. See <sys/model.h>
*
@@ -200,6 +195,7 @@ struct vattr;
struct uf_info;
extern file_t *getf(int);
+extern file_t *getf_gen(int, uf_entry_gen_t *);
extern void releasef(int);
extern void areleasef(int, struct uf_info *);
#ifndef _BOOT
@@ -226,6 +222,7 @@ extern void fcnt_add(struct uf_info *, int);
extern void close_exec(struct uf_info *);
extern void clear_stale_fd(void);
extern void clear_active_fd(int);
+extern void set_active_fd(int);
extern void free_afd(afd_t *afd);
extern int fgetstartvp(int, char *, struct vnode **);
extern int fsetattrat(int, char *, int, struct vattr *);
diff --git a/usr/src/uts/common/sys/frameio.h b/usr/src/uts/common/sys/frameio.h
new file mode 100644
index 0000000000..54e6dbeedf
--- /dev/null
+++ b/usr/src/uts/common/sys/frameio.h
@@ -0,0 +1,107 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FRAMEIO_H
+#define _SYS_FRAMEIO_H
+
+/*
+ * Frame I/O definitions
+ */
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+/* Kernel only headers */
+#include <sys/stream.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * An individual frame vector component. Collections of these are used to make
+ * ioctls.
+ */
+typedef struct framevec {
+ void *fv_buf; /* Buffer with data */
+ size_t fv_buflen; /* Size of the buffer */
+ size_t fv_actlen; /* Amount of buffer consumed, ignore on error */
+} framevec_t;
+
+/*
+ * The base unit used with frameio.
+ */
+typedef struct frameio {
+ uint_t fio_version; /* Should always be FRAMEIO_CURRENT_VERSION */
+ uint_t fio_nvpf; /* How many vectors make up one frame */
+ uint_t fio_nvecs; /* The total number of vectors */
+	framevec_t fio_vecs[];	/* C99 flexible array member */
+} frameio_t;
+
+
+#define FRAMEIO_VERSION_ONE 1
+#define FRAMEIO_CURRENT_VERSION FRAMEIO_VERSION_ONE
+
+#define FRAMEIO_NVECS_MAX 32
+
+/*
+ * Definitions for kernel modules to include as helpers. These are consolidation
+ * private.
+ */
+#ifdef _KERNEL
+
+/*
+ * 32-bit versions for 64-bit kernels
+ */
+typedef struct framevec32 {
+ caddr32_t fv_buf;
+ size32_t fv_buflen;
+ size32_t fv_actlen;
+} framevec32_t;
+
+typedef struct frameio32 {
+ uint_t fio_version;
+ uint_t fio_vecspframe;
+ uint_t fio_nvecs;
+ framevec32_t fio_vecs[];
+} frameio32_t;
+
+/*
+ * Describe the different ways that vectors should map to frames.
+ */
+typedef enum frameio_write_mblk_map {
+ MAP_BLK_FRAME
+} frameio_write_mblk_map_t;
+
+int frameio_init(void);
+void frameio_fini(void);
+frameio_t *frameio_alloc(int);
+void frameio_free(frameio_t *);
+int frameio_hdr_copyin(frameio_t *, int, const void *, uint_t);
+int frameio_mblk_chain_read(frameio_t *, mblk_t **, int *, int);
+int frameio_mblk_chain_write(frameio_t *, frameio_write_mblk_map_t, mblk_t *,
+ int *, int);
+int frameio_hdr_copyout(frameio_t *, int, void *, uint_t);
+size_t frameio_frame_length(frameio_t *, framevec_t *);
+void frameio_mark_consumed(frameio_t *, int);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FRAMEIO_H */
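Since fio_vecs is a flexible array member, a frameio_t must be allocated with room for all of its vectors. A sketch of the sizing arithmetic a consumer might use; this is illustrative only, as the module's own frameio_alloc() is the real allocation path:

	#include <sys/frameio.h>
	#include <sys/kmem.h>
	#include <sys/debug.h>

	static frameio_t *
	frameio_make(uint_t nvecs)
	{
		size_t sz = sizeof (frameio_t) + nvecs * sizeof (framevec_t);
		frameio_t *fio;

		ASSERT3U(nvecs, <=, FRAMEIO_NVECS_MAX);
		fio = kmem_zalloc(sz, KM_SLEEP);
		fio->fio_version = FRAMEIO_CURRENT_VERSION;
		fio->fio_nvpf = 1;		/* one vector per frame */
		fio->fio_nvecs = nvecs;
		return (fio);
	}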
diff --git a/usr/src/uts/common/sys/fs/fifonode.h b/usr/src/uts/common/sys/fs/fifonode.h
index d8b158ce3c..1ea8563e1c 100644
--- a/usr/src/uts/common/sys/fs/fifonode.h
+++ b/usr/src/uts/common/sys/fs/fifonode.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -83,6 +84,7 @@ struct fifonode {
struct msgb *fn_tail; /* last message to read */
fifolock_t *fn_lock; /* pointer to per fifo lock */
uint_t fn_count; /* Number of bytes on fn_mp */
+ uint_t fn_hiwat; /* pipe (fifofast) high water */
kcondvar_t fn_wait_cv; /* fifo conditional variable */
ushort_t fn_wcnt; /* number of writers */
ushort_t fn_rcnt; /* number of readers */
@@ -135,6 +137,8 @@ typedef struct fifodata {
#define FIFOPOLLRBAND 0x20000
#define FIFOSTAYFAST 0x40000 /* don't turn into stream mode */
#define FIFOWAITMODE 0x80000 /* waiting for the possibility to change mode */
+/* Data on loan, block reads. Use in conjunction with FIFOSTAYFAST. */
+#define FIFORDBLOCK 0x100000
#define FIFOHIWAT (16 * 1024)
#define FIFOLOWAT (0)
@@ -147,16 +151,6 @@ typedef struct fifodata {
#if defined(_KERNEL)
-/*
- * Fifohiwat defined as a variable is to allow tuning of the high
- * water mark if needed. It is not meant to be released.
- */
-#if FIFODEBUG
-extern int Fifohiwat;
-#else /* FIFODEBUG */
-#define Fifohiwat FIFOHIWAT
-#endif /* FIFODEBUG */
-
extern struct vnodeops *fifo_vnodeops;
extern const struct fs_operation_def fifo_vnodeops_template[];
extern struct kmem_cache *fnode_cache;
@@ -181,6 +175,8 @@ extern void fifo_fastoff(fifonode_t *);
extern struct streamtab *fifo_getinfo();
extern void fifo_wakereader(fifonode_t *, fifolock_t *);
extern void fifo_wakewriter(fifonode_t *, fifolock_t *);
+extern boolean_t fifo_stayfast_enter(fifonode_t *);
+extern void fifo_stayfast_exit(fifonode_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h
new file mode 100644
index 0000000000..b8c4149df2
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/hyprlofs.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_HYPRLOFS_H
+#define _SYS_FS_HYPRLOFS_H
+
+#include <sys/param.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * hyprlofs ioctl numbers.
+ */
+#define HYPRLOFS_IOC ('H' << 8)
+
+#define HYPRLOFS_ADD_ENTRIES (HYPRLOFS_IOC | 1)
+#define HYPRLOFS_RM_ENTRIES (HYPRLOFS_IOC | 2)
+#define HYPRLOFS_RM_ALL (HYPRLOFS_IOC | 3)
+#define HYPRLOFS_GET_ENTRIES (HYPRLOFS_IOC | 4)
+
+typedef struct {
+ char *hle_path;
+ uint_t hle_plen;
+ char *hle_name;
+ uint_t hle_nlen;
+} hyprlofs_entry_t;
+
+typedef struct {
+ hyprlofs_entry_t *hle_entries;
+ uint_t hle_len;
+} hyprlofs_entries_t;
+
+typedef struct {
+ char hce_path[MAXPATHLEN];
+ char hce_name[MAXPATHLEN];
+} hyprlofs_curr_entry_t;
+
+typedef struct {
+ hyprlofs_curr_entry_t *hce_entries;
+ uint_t hce_cnt;
+} hyprlofs_curr_entries_t;
+
+#ifdef _KERNEL
+typedef struct {
+ caddr32_t hle_path;
+ uint_t hle_plen;
+ caddr32_t hle_name;
+ uint_t hle_nlen;
+} hyprlofs_entry32_t;
+
+typedef struct {
+ caddr32_t hle_entries;
+ uint_t hle_len;
+} hyprlofs_entries32_t;
+
+typedef struct {
+ caddr32_t hce_entries;
+ uint_t hce_cnt;
+} hyprlofs_curr_entries32_t;
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_HYPRLOFS_H */
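A sketch of driving HYPRLOFS_ADD_ENTRIES from user level; the mount point, backing path, and alias name are hypothetical:

	#include <sys/fs/hyprlofs.h>
	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		hyprlofs_entry_t e;
		hyprlofs_entries_t ents;
		int rv, fd = open("/hyprlofs", O_RDONLY);	/* hypothetical mount */

		if (fd == -1)
			return (1);
		e.hle_path = "/var/tmp/backing-file";		/* hypothetical target */
		e.hle_plen = (uint_t)strlen(e.hle_path);
		e.hle_name = "alias-name";			/* name under the mount */
		e.hle_nlen = (uint_t)strlen(e.hle_name);
		ents.hle_entries = &e;
		ents.hle_len = 1;
		rv = ioctl(fd, HYPRLOFS_ADD_ENTRIES, &ents);
		(void) close(fd);
		return (rv == 0 ? 0 : 1);
	}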
diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h
new file mode 100644
index 0000000000..38389f77d9
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h
@@ -0,0 +1,174 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_HYPRLOFS_INFO_H
+#define _SYS_FS_HYPRLOFS_INFO_H
+
+#include <sys/t_lock.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <sys/vfs_opreg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * hlnode is the file system dependent node for hyprlofs.
+ * It is modeled on the tmpfs tmpnode.
+ *
+ * hln_rwlock protects access to the directory list at hln_dir
+ * as well as synchronizing read/writes to directory hlnodes.
+ * hln_tlock protects updates to hln_mode and hln_nlink.
+ * hln_tlock doesn't require any hlnode locks.
+ */
+typedef struct hlnode {
+ struct hlnode *hln_back; /* linked list of hlnodes */
+ struct hlnode *hln_forw; /* linked list of hlnodes */
+ union {
+ struct {
+ struct hldirent *un_dirlist; /* dirent list */
+ uint_t un_dirents; /* number of dirents */
+ } un_dirstruct;
+ vnode_t *un_realvp; /* real vnode */
+ } un_hlnode;
+ vnode_t *hln_vnode; /* vnode for this hlnode */
+ int hln_gen; /* pseudo gen num for hlfid */
+ int hln_looped; /* flag indicating loopback */
+ vattr_t hln_attr; /* attributes */
+ krwlock_t hln_rwlock; /* rw - serialize mods and */
+ /* directory updates */
+ kmutex_t hln_tlock; /* time, flag, and nlink lock */
+} hlnode_t;
+
+/*
+ * hyprlofs per-mount data structure.
+ * All fields are protected by hlm_contents.
+ */
+typedef struct {
+ vfs_t *hlm_vfsp; /* filesystem's vfs struct */
+ hlnode_t *hlm_rootnode; /* root hlnode */
+ char *hlm_mntpath; /* name of hyprlofs mount point */
+ dev_t hlm_dev; /* unique dev # of mounted `device' */
+ uint_t hlm_gen; /* pseudo generation number for files */
+ kmutex_t hlm_contents; /* lock for hlfsmount structure */
+} hlfsmount_t;
+
+/*
+ * hyprlofs directories are made up of a linked list of hldirent structures
+ * hanging off directory hlnodes. File names are not fixed length,
+ * but are null terminated.
+ */
+typedef struct hldirent {
+ hlnode_t *hld_hlnode; /* hlnode for this file */
+ struct hldirent *hld_next; /* next directory entry */
+ struct hldirent *hld_prev; /* prev directory entry */
+ uint_t hld_offset; /* "offset" of dir entry */
+ uint_t hld_hash; /* a hash of td_name */
+ struct hldirent *hld_link; /* linked via the hash table */
+ hlnode_t *hld_parent; /* parent, dir we are in */
+ char *hld_name; /* must be null terminated */
+ /* max length is MAXNAMELEN */
+} hldirent_t;
+
+/*
+ * hlfid overlays the fid structure (for VFS_VGET)
+ */
+typedef struct {
+ uint16_t hlfid_len;
+ ino32_t hlfid_ino;
+ int32_t hlfid_gen;
+} hlfid_t;
+
+/*
+ * File system independent to hyprlofs conversion macros
+ */
+#define VFSTOHLM(vfsp) ((hlfsmount_t *)(vfsp)->vfs_data)
+#define VTOHLM(vp) ((hlfsmount_t *)(vp)->v_vfsp->vfs_data)
+#define VTOHLN(vp) ((hlnode_t *)(vp)->v_data)
+#define HLNTOV(tp) ((tp)->hln_vnode)
+#define REALVP(vp) ((vnode_t *)VTOHLN(vp)->hln_realvp)
+#define hlnode_hold(tp) VN_HOLD(HLNTOV(tp))
+#define hlnode_rele(tp) VN_RELE(HLNTOV(tp))
+
+#define hln_dir un_hlnode.un_dirstruct.un_dirlist
+#define hln_dirents un_hlnode.un_dirstruct.un_dirents
+#define hln_realvp un_hlnode.un_realvp
+
+/*
+ * Attributes
+ */
+#define hln_mask hln_attr.va_mask
+#define hln_type hln_attr.va_type
+#define hln_mode hln_attr.va_mode
+#define hln_uid hln_attr.va_uid
+#define hln_gid hln_attr.va_gid
+#define hln_fsid hln_attr.va_fsid
+#define hln_nodeid hln_attr.va_nodeid
+#define hln_nlink hln_attr.va_nlink
+#define hln_size hln_attr.va_size
+#define hln_atime hln_attr.va_atime
+#define hln_mtime hln_attr.va_mtime
+#define hln_ctime hln_attr.va_ctime
+#define hln_rdev hln_attr.va_rdev
+#define hln_blksize hln_attr.va_blksize
+#define hln_nblocks hln_attr.va_nblocks
+#define hln_seq hln_attr.va_seq
+
+/*
+ * enums
+ */
+enum de_op { DE_CREATE, DE_MKDIR }; /* direnter ops */
+enum dr_op { DR_REMOVE, DR_RMDIR }; /* dirremove ops */
+
+/*
+ * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs
+ * leaves free for the rest of the system. The default value for
+ * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a
+ * different number of pages. Since hyprlofs doesn't actually use much
+ * memory, it's unlikely this ever needs to be patched.
+ */
+#define HYPRLOFSMINFREE 8 * 1024 * 1024 /* 8 Megabytes */
+
+extern size_t hyprlofs_minfree; /* Anonymous memory in pages */
+
+extern void hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *,
+ cred_t *);
+extern int hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *);
+extern int hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op,
+ cred_t *);
+extern void hyprlofs_dirinit(hlnode_t *, hlnode_t *);
+extern void hyprlofs_dirtrunc(hlnode_t *);
+extern int hyprlofs_taccess(void *, int, cred_t *);
+extern int hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op,
+ vnode_t *, vattr_t *, hlnode_t **, cred_t *);
+
+extern struct vnodeops *hyprlofs_vnodeops;
+extern const struct fs_operation_def hyprlofs_vnodeops_template[];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_HYPRLOFS_INFO_H */
diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h
index 9f9ce5c8c1..d1c5f674f1 100644
--- a/usr/src/uts/common/sys/fs/sdev_impl.h
+++ b/usr/src/uts/common/sys/fs/sdev_impl.h
@@ -37,6 +37,7 @@ extern "C" {
#include <sys/vfs_opreg.h>
#include <sys/list.h>
#include <sys/nvpair.h>
+#include <sys/fs/sdev_plugin.h>
#include <sys/sunddi.h>
/*
@@ -129,6 +130,21 @@ typedef struct sdev_local_data {
struct sdev_dprof sdev_lprof; /* profile for multi-inst */
} sdev_local_data_t;
+/* sdev_flags */
+typedef enum sdev_flags {
+ SDEV_BUILD = 0x0001, /* directory cache out-of-date */
+ SDEV_GLOBAL = 0x0002, /* global /dev nodes */
+ SDEV_PERSIST = 0x0004, /* backing store persisted node */
+ SDEV_NO_NCACHE = 0x0008, /* do not include in neg. cache */
+ SDEV_DYNAMIC = 0x0010, /* special-purpose vnode ops */
+ /* (ex: pts) */
+ SDEV_VTOR = 0x0020, /* validate sdev_nodes during search */
+ SDEV_ATTR_INVALID = 0x0040, /* invalid node attributes, */
+ /* need update */
+ SDEV_SUBDIR = 0x0080, /* match all subdirs under here */
+ SDEV_ZONED = 0x0100 /* zoned subdir */
+} sdev_flags_t;
+
/*
* /dev filesystem sdev_node defines
*/
@@ -151,7 +167,7 @@ typedef struct sdev_node {
ino64_t sdev_ino; /* inode */
uint_t sdev_nlink; /* link count */
int sdev_state; /* state of this node */
- int sdev_flags; /* flags bit */
+ sdev_flags_t sdev_flags; /* flags bit */
kmutex_t sdev_lookup_lock; /* node creation synch lock */
kcondvar_t sdev_lookup_cv; /* node creation sync cv */
@@ -162,7 +178,7 @@ typedef struct sdev_node {
struct sdev_global_data sdev_globaldata;
struct sdev_local_data sdev_localdata;
} sdev_instance_data;
-
+ list_node_t sdev_plist; /* link on plugin list */
void *sdev_private;
} sdev_node_t;
@@ -193,29 +209,11 @@ typedef enum {
SDEV_READY
} sdev_node_state_t;
-/* sdev_flags */
-#define SDEV_BUILD 0x0001 /* directory cache out-of-date */
-#define SDEV_GLOBAL 0x0002 /* global /dev nodes */
-#define SDEV_PERSIST 0x0004 /* backing store persisted node */
-#define SDEV_NO_NCACHE 0x0008 /* do not include in neg. cache */
-#define SDEV_DYNAMIC 0x0010 /* special-purpose vnode ops */
- /* (ex: pts) */
-#define SDEV_VTOR 0x0020 /* validate sdev_nodes during search */
-#define SDEV_ATTR_INVALID 0x0040 /* invalid node attributes, */
- /* need update */
-#define SDEV_SUBDIR 0x0080 /* match all subdirs under here */
-#define SDEV_ZONED 0x0100 /* zoned subdir */
-
/* sdev_lookup_flags */
#define SDEV_LOOKUP 0x0001 /* node creation in progress */
#define SDEV_READDIR 0x0002 /* VDIR readdir in progress */
#define SDEV_LGWAITING 0x0004 /* waiting for devfsadm completion */
-#define SDEV_VTOR_INVALID -1
-#define SDEV_VTOR_SKIP 0
-#define SDEV_VTOR_VALID 1
-#define SDEV_VTOR_STALE 2
-
/* convenient macros */
#define SDEV_IS_GLOBAL(dv) \
(dv->sdev_flags & SDEV_GLOBAL)
@@ -368,8 +366,13 @@ extern void sdev_devfsadmd_thread(struct sdev_node *, struct sdev_node *,
extern int devname_profile_update(char *, size_t);
extern struct sdev_data *sdev_find_mntinfo(char *);
void sdev_mntinfo_rele(struct sdev_data *);
+typedef void (*sdev_mnt_walk_f)(struct sdev_node *, void *);
+void sdev_mnt_walk(sdev_mnt_walk_f, void *);
extern struct vnodeops *devpts_getvnodeops(void);
extern struct vnodeops *devvt_getvnodeops(void);
+extern void sdev_plugin_nodeready(struct sdev_node *);
+extern int sdev_plugin_init(void);
+extern int sdev_plugin_fini(void);
/*
* boot states - warning, the ordering here is significant
@@ -515,6 +518,23 @@ extern void sdev_nc_path_exists(sdev_nc_list_t *, char *);
extern void sdev_modctl_dump_files(void);
/*
+ * plugin and legacy vtab stuff
+ */
+/* directory dependent vop table */
+typedef struct sdev_vop_table {
+ char *vt_name; /* subdirectory name */
+ const fs_operation_def_t *vt_service; /* vnodeops table */
+ struct vnodeops **vt_global_vops; /* global container for vop */
+ int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */
+ int vt_flags;
+} sdev_vop_table_t;
+
+extern struct sdev_vop_table vtab[];
+extern struct vnodeops *sdev_get_vop(struct sdev_node *);
+extern void sdev_set_no_negcache(struct sdev_node *);
+extern void *sdev_get_vtor(struct sdev_node *dv);
+
+/*
* globals
*/
extern kmutex_t sdev_lock;
@@ -527,6 +547,7 @@ extern struct vnodeops *devipnet_vnodeops;
extern struct vnodeops *devvt_vnodeops;
extern struct sdev_data *sdev_origins; /* mount info for global /dev instance */
extern struct vnodeops *devzvol_vnodeops;
+extern int sdev_vnodeops_tbl_size;
extern const fs_operation_def_t sdev_vnodeops_tbl[];
extern const fs_operation_def_t devpts_vnodeops_tbl[];
diff --git a/usr/src/uts/common/sys/fs/sdev_plugin.h b/usr/src/uts/common/sys/fs/sdev_plugin.h
new file mode 100644
index 0000000000..f4ed813c1e
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/sdev_plugin.h
@@ -0,0 +1,106 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+#ifndef _SYS_SDEV_PLUGIN_H
+#define _SYS_SDEV_PLUGIN_H
+
+/*
+ * Kernel sdev plugin interface
+ */
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/vnode.h>
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef uintptr_t sdev_plugin_hdl_t;
+typedef uintptr_t sdev_ctx_t;
+
+/*
+ * Valid return values for sdev_plugin_validate_t.
+ */
+typedef enum sdev_plugin_validate {
+ SDEV_VTOR_INVALID = -1,
+ SDEV_VTOR_SKIP = 0,
+ SDEV_VTOR_VALID = 1,
+ SDEV_VTOR_STALE = 2
+} sdev_plugin_validate_t;
+
+/*
+ * Valid flags
+ */
+typedef enum sdev_plugin_flags {
+ SDEV_PLUGIN_NO_NCACHE = 0x1,
+ SDEV_PLUGIN_SUBDIR = 0x2
+} sdev_plugin_flags_t;
+
+#define SDEV_PLUGIN_FLAGS_MASK 0x3
+
+/*
+ * Functions a module must implement
+ */
+typedef sdev_plugin_validate_t (*sp_valid_f)(sdev_ctx_t);
+typedef int (*sp_filldir_f)(sdev_ctx_t);
+typedef void (*sp_inactive_f)(sdev_ctx_t);
+
+#define SDEV_PLUGIN_VERSION 1
+
+typedef struct sdev_plugin_ops {
+ int spo_version;
+ sdev_plugin_flags_t spo_flags;
+ sp_valid_f spo_validate;
+ sp_filldir_f spo_filldir;
+ sp_inactive_f spo_inactive;
+} sdev_plugin_ops_t;
+
+extern sdev_plugin_hdl_t sdev_plugin_register(const char *, sdev_plugin_ops_t *,
+ int *);
+extern int sdev_plugin_unregister(sdev_plugin_hdl_t);
+
+typedef enum sdev_ctx_flags {
+ SDEV_CTX_GLOBAL = 0x2 /* node belongs to the GZ */
+} sdev_ctx_flags_t;
+
+/*
+ * Context helper functions
+ */
+extern sdev_ctx_flags_t sdev_ctx_flags(sdev_ctx_t);
+extern const char *sdev_ctx_name(sdev_ctx_t);
+extern const char *sdev_ctx_path(sdev_ctx_t);
+extern int sdev_ctx_minor(sdev_ctx_t, minor_t *);
+extern enum vtype sdev_ctx_vtype(sdev_ctx_t);
+
+/*
+ * Callbacks to manipulate nodes
+ */
+extern int sdev_plugin_mkdir(sdev_ctx_t, char *);
+extern int sdev_plugin_mknod(sdev_ctx_t, char *, mode_t, dev_t);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SDEV_PLUGIN_H */
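A sketch of what a minimal plugin registration against this interface might look like; the plugin name, directory contents, and modes are invented for illustration:

	#include <sys/fs/sdev_plugin.h>
	#include <sys/mkdev.h>

	/* Hypothetical callbacks for a /dev/example directory. */
	static sdev_plugin_validate_t
	example_validate(sdev_ctx_t ctx)
	{
		return (SDEV_VTOR_VALID);
	}

	static int
	example_filldir(sdev_ctx_t ctx)
	{
		/* Populate one node; the name and 0600 mode are illustrative. */
		return (sdev_plugin_mknod(ctx, "ctl", S_IFCHR | 0600,
		    makedevice(0, 0)));
	}

	static void
	example_inactive(sdev_ctx_t ctx)
	{
	}

	static sdev_plugin_ops_t example_ops = {
		.spo_version = SDEV_PLUGIN_VERSION,
		.spo_flags = SDEV_PLUGIN_NO_NCACHE,
		.spo_validate = example_validate,
		.spo_filldir = example_filldir,
		.spo_inactive = example_inactive
	};

	/* Called from a module's _init(); errp receives an errno on failure. */
	static sdev_plugin_hdl_t
	example_register(int *errp)
	{
		return (sdev_plugin_register("example", &example_ops, errp));
	}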
diff --git a/usr/src/uts/common/sys/fs/tmp.h b/usr/src/uts/common/sys/fs/tmp.h
index fb07de6588..f4cee09244 100644
--- a/usr/src/uts/common/sys/fs/tmp.h
+++ b/usr/src/uts/common/sys/fs/tmp.h
@@ -23,7 +23,7 @@
* All rights reserved. Use is subject to license terms.
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
*/
#ifndef _SYS_FS_TMP_H
@@ -43,8 +43,10 @@ struct tmount {
struct vfs *tm_vfsp; /* filesystem's vfs struct */
struct tmpnode *tm_rootnode; /* root tmpnode */
char *tm_mntpath; /* name of tmpfs mount point */
- ulong_t tm_anonmax; /* file system max anon reservation */
- pgcnt_t tm_anonmem; /* pages of reserved anon memory */
+ size_t tm_anonmax; /* file system max anon reservation */
+ size_t tm_anonmem; /* bytes of reserved anon memory */
+ /* and allocated kmem for the fs */
+ size_t tm_allocmem; /* bytes alloced from tmp_kmem_ funcs */
dev_t tm_dev; /* unique dev # of mounted `device' */
uint_t tm_gen; /* pseudo generation number for files */
kmutex_t tm_contents; /* lock for tmount structure */
@@ -58,6 +60,7 @@ struct tmount {
#define VTOTM(vp) ((struct tmount *)(vp)->v_vfsp->vfs_data)
#define VTOTN(vp) ((struct tmpnode *)(vp)->v_data)
#define TNTOV(tp) ((tp)->tn_vnode)
+#define TNTOTM(tp) (VTOTM(TNTOV(tp)))
#define tmpnode_hold(tp) VN_HOLD(TNTOV(tp))
#define tmpnode_rele(tp) VN_RELE(TNTOV(tp))
@@ -69,41 +72,39 @@ enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */
/*
* tmpfs_minfree is the amount (in pages) of anonymous memory that tmpfs
- * leaves free for the rest of the system. E.g. in a system with 32MB of
- * configured swap space, if 16MB were reserved (leaving 16MB free),
- * tmpfs could allocate up to 16MB - tmpfs_minfree. The default value
- * for tmpfs_minfree is btopr(TMPMINFREE) but it can cautiously patched
- * to a different number of pages.
- * NB: If tmpfs allocates too much swap space, other processes will be
- * unable to execute.
+ * leaves free for the rest of the system. In antiquity, this number could be
+ * relevant on a system-wide basis, as physical DRAM was routinely exhausted;
+ * however, in more modern times, the relative growth of DRAM with respect to
+ * application footprint means that this number is only likely to become a
+ * factor in a virtualized OS environment (e.g., a zone) -- and even then only
+ * when DRAM and swap have both been capped low to allow for maximum tenancy.
+ * TMPMINFREE -- the value from which tmpfs_minfree is derived -- should
+ * therefore be configured to a value that is roughly the smallest practical
+ * value for memory + swap minus the largest reasonable size for tmpfs in such
+ * a configuration. As of this writing, the smallest practical memory + swap
+ * configuration is 128MB, and it seems reasonable to allow tmpfs to consume
+ * no more than seven-eighths of this, yielding a TMPMINFREE of 16MB. Care
+ * should be exercised in changing this: tuning this value too high will
+ * result in spurious ENOSPC errors in tmpfs in small zones (a problem that
+ * can induce cascading failure surprisingly often); tuning this value too low
+ * will allow tmpfs consumption alone to induce application-level
+ * memory allocation failure.
*/
-#define TMPMINFREE 2 * 1024 * 1024 /* 2 Megabytes */
+#define TMPMINFREE 16 * 1024 * 1024 /* 16 Megabytes */
extern size_t tmpfs_minfree; /* Anonymous memory in pages */
-/*
- * tmpfs can allocate only a certain percentage of kernel memory,
- * which is used for tmpnodes, directories, file names, etc.
- * This is statically set as TMPMAXFRACKMEM of physical memory.
- * The actual number of allocatable bytes can be patched in tmpfs_maxkmem.
- */
-#define TMPMAXFRACKMEM 25 /* 1/25 of physical memory */
-
-extern size_t tmp_kmemspace;
-extern size_t tmpfs_maxkmem; /* Allocatable kernel memory in bytes */
-
extern void tmpnode_init(struct tmount *, struct tmpnode *,
struct vattr *, struct cred *);
+extern void tmpnode_cleanup(struct tmpnode *tp);
extern int tmpnode_trunc(struct tmount *, struct tmpnode *, ulong_t);
extern void tmpnode_growmap(struct tmpnode *, ulong_t);
extern int tdirlookup(struct tmpnode *, char *, struct tmpnode **,
struct cred *);
extern int tdirdelete(struct tmpnode *, struct tmpnode *, char *,
enum dr_op, struct cred *);
-extern void tdirinit(struct tmpnode *, struct tmpnode *);
+extern int tdirinit(struct tmpnode *, struct tmpnode *);
extern void tdirtrunc(struct tmpnode *);
-extern void *tmp_memalloc(size_t, int);
-extern void tmp_memfree(void *, size_t);
extern int tmp_resv(struct tmount *, struct tmpnode *, size_t, int);
extern int tmp_taccess(void *, int, struct cred *);
extern int tmp_sticky_remove_access(struct tmpnode *, struct tmpnode *,
@@ -114,6 +115,9 @@ extern int tdirenter(struct tmount *, struct tmpnode *, char *,
enum de_op, struct tmpnode *, struct tmpnode *, struct vattr *,
struct tmpnode **, struct cred *, caller_context_t *);
+extern void *tmp_kmem_zalloc(struct tmount *, size_t, int);
+extern void tmp_kmem_free(struct tmount *, void *, size_t);
+
#define TMP_MUSTHAVE 0x01
#ifdef __cplusplus
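The tmp_kmem_zalloc()/tmp_kmem_free() pair replaces the removed global tmp_memalloc()/tmp_memfree() so kmem consumption can be charged to the owning mount via tm_allocmem. A rough sketch of that accounting shape, assuming the tm_contents lock shown above; this is not the actual tmpfs implementation:

	#include <sys/fs/tmp.h>
	#include <sys/kmem.h>
	#include <sys/debug.h>

	/* Sketch only: per-mount charged allocation. */
	static void *
	sketch_tmp_kmem_zalloc(struct tmount *tm, size_t size, int flag)
	{
		void *buf = kmem_zalloc(size, flag);

		if (buf != NULL) {
			mutex_enter(&tm->tm_contents);
			tm->tm_allocmem += size;	/* charge this mount */
			mutex_exit(&tm->tm_contents);
		}
		return (buf);
	}

	static void
	sketch_tmp_kmem_free(struct tmount *tm, void *buf, size_t size)
	{
		kmem_free(buf, size);
		mutex_enter(&tm->tm_contents);
		ASSERT3U(tm->tm_allocmem, >=, size);
		tm->tm_allocmem -= size;
		mutex_exit(&tm->tm_contents);
	}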
diff --git a/usr/src/uts/common/sys/fx.h b/usr/src/uts/common/sys/fx.h
index 2d4e1aa7fb..4a48af52a1 100644
--- a/usr/src/uts/common/sys/fx.h
+++ b/usr/src/uts/common/sys/fx.h
@@ -21,13 +21,12 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_FX_H
#define _SYS_FX_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/thread.h>
#include <sys/ddi.h>
@@ -145,7 +144,14 @@ typedef struct fxkparms {
uint_t fx_cflags;
} fxkparms_t;
+/*
+ * control flags (kparms->fx_cflags).
+ */
+#define FX_DOUPRILIM 0x01 /* change user priority limit */
+#define FX_DOUPRI 0x02 /* change user priority */
+#define FX_DOTQ 0x04 /* change FX time quantum */
+#define FXMAXUPRI 60 /* maximum user priority setting */
/*
* Interface for partner private code. This is not a public interface.
diff --git a/usr/src/uts/common/sys/gsqueue.h b/usr/src/uts/common/sys/gsqueue.h
new file mode 100644
index 0000000000..91ab46fc44
--- /dev/null
+++ b/usr/src/uts/common/sys/gsqueue.h
@@ -0,0 +1,59 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#ifndef _SYS_GSQUEUE_H
+#define _SYS_GSQUEUE_H
+
+/*
+ * Standard interfaces to serialization queues for everyone (except IP).
+ */
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct gsqueue gsqueue_t;
+typedef struct gsqueue_set gsqueue_set_t;
+
+typedef void (*gsqueue_cb_f)(gsqueue_set_t *, gsqueue_t *, void *, boolean_t);
+typedef void (*gsqueue_proc_f)(void *, mblk_t *, gsqueue_t *, void *);
+
+extern gsqueue_set_t *gsqueue_set_create(pri_t);
+extern void gsqueue_set_destroy(gsqueue_set_t *);
+extern gsqueue_t *gsqueue_set_get(gsqueue_set_t *, uint_t);
+
+extern uintptr_t gsqueue_set_cb_add(gsqueue_set_t *, gsqueue_cb_f, void *);
+extern int gsqueue_set_cb_remove(gsqueue_set_t *, uintptr_t);
+
+#define GSQUEUE_FILL 0x0001
+#define GSQUEUE_NODRAIN 0x0002
+#define GSQUEUE_PROCESS 0x0004
+
+extern void gsqueue_enter_one(gsqueue_t *, mblk_t *, gsqueue_proc_f, void *,
+ int, uint8_t);
+
+#define GSQUEUE_DEFAULT_PRIORITY MAXCLSYSPRI
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_GSQUEUE_H */
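A sketch of the intended call pattern: create a set once, select a queue by some hash (so related work serializes on the same queue), and enqueue messages with a processing function. Details beyond the signatures above are assumptions:

	#include <sys/gsqueue.h>
	#include <sys/stream.h>

	static gsqueue_set_t *example_set;	/* created once, e.g. at attach */

	static void
	example_proc(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy)
	{
		freemsg(mp);		/* a real consumer would process mp here */
	}

	static void
	example_init(void)
	{
		example_set = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY);
	}

	static void
	example_enqueue(mblk_t *mp, uint_t hash)
	{
		gsqueue_t *gsp = gsqueue_set_get(example_set, hash);

		gsqueue_enter_one(gsp, mp, example_proc, NULL,
		    GSQUEUE_PROCESS, 0);
	}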
diff --git a/usr/src/uts/common/sys/hook_impl.h b/usr/src/uts/common/sys/hook_impl.h
index d8a15f0fe5..f3337bbacf 100644
--- a/usr/src/uts/common/sys/hook_impl.h
+++ b/usr/src/uts/common/sys/hook_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018, Joyent, Inc.
*/
/*
@@ -171,7 +172,7 @@ typedef struct hook_family_int {
cvwaitlock_t hfi_lock;
SLIST_ENTRY(hook_family_int) hfi_entry;
hook_event_int_head_t hfi_head;
- hook_family_t hfi_family;
+ hook_family_t hfi_family;
kstat_t *hfi_kstat;
struct hook_stack *hfi_stack;
hook_notify_head_t hfi_nhead;
@@ -209,6 +210,7 @@ typedef struct hook_stack_head hook_stack_head_t;
#define Hn_ARP "arp"
#define Hn_IPV4 "inet"
#define Hn_IPV6 "inet6"
+#define Hn_VIONA "viona_inet"
extern int hook_run(hook_family_int_t *, hook_event_token_t, hook_data_t);
extern int hook_register(hook_family_int_t *, char *, hook_t *);
diff --git a/usr/src/uts/common/sys/id_space.h b/usr/src/uts/common/sys/id_space.h
index d56fcceb5a..46d25f207f 100644
--- a/usr/src/uts/common/sys/id_space.h
+++ b/usr/src/uts/common/sys/id_space.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All Rights reserved.
*/
#ifndef _ID_SPACE_H
@@ -34,8 +35,6 @@ extern "C" {
#include <sys/mutex.h>
#include <sys/vmem.h>
-#ifdef _KERNEL
-
typedef vmem_t id_space_t;
id_space_t *id_space_create(const char *, id_t, id_t);
@@ -48,8 +47,6 @@ id_t id_allocff_nosleep(id_space_t *);
id_t id_alloc_specific_nosleep(id_space_t *, id_t);
void id_free(id_space_t *, id_t);
-#endif /* _KERNEL */
-
#ifdef __cplusplus
}
#endif
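With the #ifdef _KERNEL guards removed, non-kernel consumers (e.g., code built against fake-kernel shims) can use the same calls; usage itself is unchanged. A minimal sketch:

	#include <sys/id_space.h>

	/* Sketch: carve out an ID namespace [100, 200) and allocate one. */
	static id_t
	example_id(void)
	{
		id_space_t *ids = id_space_create("example_ids", 100, 200);
		id_t id = id_alloc(ids);	/* sleeps until an ID is free */

		id_free(ids, id);		/* return it when done */
		id_space_destroy(ids);
		return (id);
	}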
diff --git a/usr/src/uts/common/sys/inotify.h b/usr/src/uts/common/sys/inotify.h
new file mode 100644
index 0000000000..8acc1a7280
--- /dev/null
+++ b/usr/src/uts/common/sys/inotify.h
@@ -0,0 +1,153 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Header file to support the inotify facility. Note that this facility
+ * is designed to be binary compatible with the Linux inotify facility; values
+ * for constants here should therefore exactly match those found in Linux, and
+ * this facility shouldn't be extended independently of Linux.
+ */
+
+#ifndef _SYS_INOTIFY_H
+#define _SYS_INOTIFY_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Events that can be explicitly requested on any inotify watch.
+ */
+#define IN_ACCESS 0x00000001
+#define IN_MODIFY 0x00000002
+#define IN_ATTRIB 0x00000004
+#define IN_CLOSE_WRITE 0x00000008
+#define IN_CLOSE_NOWRITE 0x00000010
+#define IN_OPEN 0x00000020
+#define IN_MOVED_FROM 0x00000040
+#define IN_MOVED_TO 0x00000080
+#define IN_CREATE 0x00000100
+#define IN_DELETE 0x00000200
+#define IN_DELETE_SELF 0x00000400
+#define IN_MOVE_SELF 0x00000800
+
+/*
+ * Events that can be sent to an inotify watch -- requested or not.
+ */
+#define IN_UNMOUNT 0x00002000
+#define IN_Q_OVERFLOW 0x00004000
+#define IN_IGNORED 0x00008000
+
+/*
+ * Flags that can modify an inotify event.
+ */
+#define IN_ONLYDIR 0x01000000
+#define IN_DONT_FOLLOW 0x02000000
+#define IN_EXCL_UNLINK 0x04000000
+#define IN_MASK_ADD 0x20000000
+#define IN_ISDIR 0x40000000
+#define IN_ONESHOT 0x80000000
+
+/*
+ * Helpful constants.
+ */
+#define IN_CLOSE (IN_CLOSE_WRITE | IN_CLOSE_NOWRITE)
+#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO)
+#define IN_ALL_EVENTS \
+ (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
+ IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | IN_MOVED_TO | \
+ IN_DELETE | IN_CREATE | IN_DELETE_SELF | IN_MOVE_SELF)
+
+#define IN_CHILD_EVENTS \
+ (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
+ IN_CLOSE_NOWRITE | IN_MODIFY | IN_OPEN)
+
+/*
+ * To assure binary compatibility with Linux, these values are fixed at their
+ * Linux equivalents, not their native ones.
+ */
+#define IN_CLOEXEC 02000000 /* LX_O_CLOEXEC */
+#define IN_NONBLOCK 04000 /* LX_O_NONBLOCK */
+
+struct inotify_event {
+ int32_t wd; /* watch descriptor */
+ uint32_t mask; /* mask of events */
+ uint32_t cookie; /* event association cookie, if any */
+ uint32_t len; /* size of name field */
+ char name[]; /* optional NUL-terminated name */
+};
+
+/*
+ * These ioctl values are specific to the native implementation; applications
+ * shouldn't be using them directly, and they should therefore be safe to
+ * change without breaking apps.
+ */
+#define INOTIFYIOC (('i' << 24) | ('n' << 16) | ('y' << 8))
+#define INOTIFYIOC_ADD_WATCH (INOTIFYIOC | 1) /* add watch */
+#define INOTIFYIOC_RM_WATCH (INOTIFYIOC | 2) /* remove watch */
+#define INOTIFYIOC_ADD_CHILD (INOTIFYIOC | 3) /* add child watch */
+#define INOTIFYIOC_ACTIVATE (INOTIFYIOC | 4) /* activate watch */
+
+#ifndef _LP64
+#ifndef _LITTLE_ENDIAN
+#define INOTIFY_PTR(type, name) uint32_t name##pad; type *name
+#else
+#define INOTIFY_PTR(type, name) type *name; uint32_t name##pad
+#endif
+#else
+#define INOTIFY_PTR(type, name) type *name
+#endif
+
+typedef struct inotify_addwatch {
+ int inaw_fd; /* open fd for object */
+ uint32_t inaw_mask; /* desired mask */
+} inotify_addwatch_t;
+
+typedef struct inotify_addchild {
+ INOTIFY_PTR(char, inac_name); /* pointer to name */
+ int inac_fd; /* open fd for parent */
+} inotify_addchild_t;
+
+#ifndef _KERNEL
+
+extern int inotify_init(void);
+extern int inotify_init1(int);
+extern int inotify_add_watch(int, const char *, uint32_t);
+extern int inotify_rm_watch(int, int);
+
+#else
+
+#define IN_UNMASKABLE \
+ (IN_UNMOUNT | IN_Q_OVERFLOW | IN_IGNORED | IN_ISDIR)
+
+#define IN_MODIFIERS \
+ (IN_EXCL_UNLINK | IN_ONESHOT)
+
+#define IN_FLAGS \
+ (IN_ONLYDIR | IN_DONT_FOLLOW | IN_MASK_ADD)
+
+#define IN_REMOVAL (1ULL << 32)
+#define INOTIFYMNRN_INOTIFY 0
+#define INOTIFYMNRN_CLONE 1
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_INOTIFY_H */
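A minimal user-level sketch of the Linux-compatible interface declared above; the watched path is illustrative, and a production consumer would loop over the buffer, since one read can return multiple events:

	#include <sys/inotify.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		char buf[4096];
		int fd, wd;

		if ((fd = inotify_init()) == -1)
			return (1);
		wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
		if (wd == -1)
			return (1);
		if (read(fd, buf, sizeof (buf)) > 0) {
			struct inotify_event *ev = (struct inotify_event *)buf;

			(void) printf("wd %d mask 0x%x name %s\n", ev->wd,
			    ev->mask, ev->len > 0 ? ev->name : "");
		}
		(void) inotify_rm_watch(fd, wd);
		(void) close(fd);
		return (0);
	}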
diff --git a/usr/src/uts/common/sys/ipc_impl.h b/usr/src/uts/common/sys/ipc_impl.h
index 0569c3e967..d7dc365c09 100644
--- a/usr/src/uts/common/sys/ipc_impl.h
+++ b/usr/src/uts/common/sys/ipc_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
*/
#ifndef _IPC_IMPL_H
@@ -226,6 +227,7 @@ int ipc_commit_begin(ipc_service_t *, key_t, int, kipc_perm_t *);
kmutex_t *ipc_commit_end(ipc_service_t *, kipc_perm_t *);
void ipc_cleanup(ipc_service_t *, kipc_perm_t *);
+void ipc_rmsvc(ipc_service_t *, kipc_perm_t *);
int ipc_rmid(ipc_service_t *, int, cred_t *);
int ipc_ids(ipc_service_t *, int *, uint_t, uint_t *);
diff --git a/usr/src/uts/common/sys/ipd.h b/usr/src/uts/common/sys/ipd.h
index bad74f8b81..f21c3fb5af 100644
--- a/usr/src/uts/common/sys/ipd.h
+++ b/usr/src/uts/common/sys/ipd.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc. All rights reserved.
*/
/*
@@ -35,7 +35,7 @@ extern "C" {
#endif
#define IPD_DEV_PATH "/dev/ipd"
-#define IPD_MAX_DELAY 10000 /* 10 ms in us */
+#define IPD_MAX_DELAY 1000000 /* 1 second in microseconds */
typedef struct ipd_ioc_perturb {
zoneid_t ipip_zoneid;
diff --git a/usr/src/uts/common/sys/iso/signal_iso.h b/usr/src/uts/common/sys/iso/signal_iso.h
index bf89ef0d33..0a76ee19a7 100644
--- a/usr/src/uts/common/sys/iso/signal_iso.h
+++ b/usr/src/uts/common/sys/iso/signal_iso.h
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -95,7 +96,7 @@ extern "C" {
/* insert new signals here, and move _SIGRTM* appropriately */
#define _SIGRTMIN 42 /* first (highest-priority) realtime signal */
-#define _SIGRTMAX 73 /* last (lowest-priority) realtime signal */
+#define _SIGRTMAX 74 /* last (lowest-priority) realtime signal */
extern long _sysconf(int); /* System Private interface to sysconf() */
#define SIGRTMIN ((int)_sysconf(_SC_SIGRT_MIN)) /* first realtime signal */
#define SIGRTMAX ((int)_sysconf(_SC_SIGRT_MAX)) /* last realtime signal */
diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h
index 41b70f6a6e..0ea1a396b9 100644
--- a/usr/src/uts/common/sys/klwp.h
+++ b/usr/src/uts/common/sys/klwp.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#ifndef _SYS_KLWP_H
@@ -191,7 +191,14 @@ typedef struct _klwp {
struct ct_template *lwp_ct_active[CTT_MAXTYPE]; /* active templates */
struct contract *lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */
- void *lwp_brand; /* per-lwp brand data */
+ /*
+ * Branding:
+ * lwp_brand - per-lwp brand data
+ * lwp_brand_syscall - brand syscall interposer
+ */
+ void *lwp_brand;
+ int (*lwp_brand_syscall)(void);
+
struct psinfo *lwp_spymaster; /* if an agent LWP, our spymaster */
} klwp_t;
diff --git a/usr/src/uts/common/sys/kobj.h b/usr/src/uts/common/sys/kobj.h
index 2396ef4625..d52a54f6b7 100644
--- a/usr/src/uts/common/sys/kobj.h
+++ b/usr/src/uts/common/sys/kobj.h
@@ -24,6 +24,9 @@
*
* Copyright 2017 RackTop Systems.
*/
+/*
+ * Copyright (c) 2017 Joyent, Inc.
+ */
#ifndef _SYS_KOBJ_H
#define _SYS_KOBJ_H
@@ -47,6 +50,12 @@ struct module_list {
struct module *mp;
};
+typedef struct hotinline_desc {
+ char *hid_symname; /* symbol name */
+ uintptr_t hid_instr_offset; /* offset of call in text */
+ struct hotinline_desc *hid_next; /* next hotinline */
+} hotinline_desc_t;
+
typedef unsigned short symid_t; /* symbol table index */
typedef unsigned char *reloc_dest_t;
@@ -99,6 +108,8 @@ struct module {
caddr_t textwin;
caddr_t textwin_base;
+ hotinline_desc_t *hi_calls;
+
sdt_probedesc_t *sdt_probes;
size_t sdt_nprobes;
char *sdt_tab;
@@ -187,6 +198,7 @@ extern int kobj_read_file(struct _buf *, char *, unsigned, unsigned);
extern int kobj_get_filesize(struct _buf *, uint64_t *size);
extern uintptr_t kobj_getelfsym(char *, void *, int *);
extern void kobj_set_ctf(struct module *, caddr_t data, size_t size);
+extern void do_hotinlines(struct module *);
extern int kobj_filbuf(struct _buf *);
extern void kobj_sync(void);
diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h
index 5d8827f1ae..d720caa631 100644
--- a/usr/src/uts/common/sys/ksocket.h
+++ b/usr/src/uts/common/sys/ksocket.h
@@ -21,6 +21,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _SYS_KSOCKET_H_
@@ -122,6 +123,11 @@ extern int ksocket_close(ksocket_t, struct cred *);
extern void ksocket_hold(ksocket_t);
extern void ksocket_rele(ksocket_t);
+typedef boolean_t (*ksocket_krecv_f)(ksocket_t, struct msgb *, size_t, int,
+ void *);
+extern int ksocket_krecv_set(ksocket_t, ksocket_krecv_f, void *);
+extern void ksocket_krecv_unblock(ksocket_t);
+
#ifdef __cplusplus
}
#endif
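A sketch of installing a direct-receive callback with the new interface, assuming an already-open ksocket_t; the meaning of the callback's boolean return is an assumption beyond what the header shows:

	#include <sys/ksocket.h>
	#include <sys/stream.h>

	/*
	 * Hypothetical callback: consume `msgsz' bytes of `mp' delivered on
	 * `ks'. Returning B_TRUE is assumed to mean the data was accepted.
	 */
	static boolean_t
	example_krecv(ksocket_t ks, mblk_t *mp, size_t msgsz, int flags,
	    void *arg)
	{
		freemsg(mp);
		return (B_TRUE);
	}

	static int
	example_set_cb(ksocket_t ks, void *state)
	{
		return (ksocket_krecv_set(ks, example_krecv, state));
	}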
diff --git a/usr/src/uts/common/sys/limits.h b/usr/src/uts/common/sys/limits.h
new file mode 100644
index 0000000000..88625d1829
--- /dev/null
+++ b/usr/src/uts/common/sys/limits.h
@@ -0,0 +1,32 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright 2015 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_LIMITS_H
+#define _SYS_LIMITS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define IOV_MAX 1024
+
+#ifdef _KERNEL
+#define IOV_MAX_STACK 16 /* max. IOV on-stack allocation */
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LIMITS_H */
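IOV_MAX_STACK exists so hot paths can keep small iovec arrays on the stack and fall back to kmem only for large counts. A sketch of that idiom (not code from this change):

	#include <sys/limits.h>
	#include <sys/uio.h>
	#include <sys/kmem.h>

	static int
	example_iov(int iovcnt)
	{
		struct iovec buf[IOV_MAX_STACK], *aiov = buf;
		size_t aiovlen = 0;

		if (iovcnt > IOV_MAX_STACK) {
			aiovlen = iovcnt * sizeof (struct iovec);
			aiov = kmem_alloc(aiovlen, KM_SLEEP);
		}

		/* ... copyin() and use aiov[0 .. iovcnt - 1] ... */

		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (0);
	}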
diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h
index 0907d6deff..afe554ba03 100644
--- a/usr/src/uts/common/sys/mac.h
+++ b/usr/src/uts/common/sys/mac.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright (c) 2015 Garrett D'Amore <garrett@damore.org>
*/
@@ -101,6 +101,14 @@ typedef struct mac_propval_uint32_range_s {
} mac_propval_uint32_range_t;
/*
+ * Defines ranges which are a series of C style strings.
+ */
+typedef struct mac_propval_str_range_s {
+ uint32_t mpur_nextbyte;
+ char mpur_data[1];
+} mac_propval_str_range_t;
+
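The packing convention is not spelled out here; the following sketch assumes mpur_data holds back-to-back NUL-terminated strings with mpur_nextbyte as the next write offset, and omits bounds checking:

/*
 * Sketch only; the back-to-back string layout and the use of
 * mpur_nextbyte as a write offset are assumptions, and checking
 * against the enclosing buffer's size is omitted.
 */
static void
str_range_append(mac_propval_str_range_t *sr, const char *s)
{
	size_t len = strlen(s) + 1;

	bcopy(s, &sr->mpur_data[sr->mpur_nextbyte], len);
	sr->mpur_nextbyte += len;
}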
+/*
* Data type of property values.
*/
typedef enum {
@@ -120,6 +128,7 @@ typedef struct mac_propval_range_s {
mac_propval_type_t mpr_type; /* type of value */
union {
mac_propval_uint32_range_t mpr_uint32[1];
+ mac_propval_str_range_t mpr_str;
} u;
} mac_propval_range_t;
@@ -614,6 +623,36 @@ typedef struct mactype_register_s {
} mactype_register_t;
/*
+ * Flags to describe the hardware emulation desired by a client when
+ * calling mac_hw_emul().
+ *
+ * MAC_HWCKSUM_EMUL
+ *
+ * If an mblk is marked with HCK_* flags, then calculate those
+ * checksums and update the checksum flags.
+ *
+ * MAC_IPCKSUM_EMUL
+ *
+ * Like MAC_HWCKSUM_EMUL, except only calculate the IPv4 header
+ * checksum. We still update both the IPv4 and ULP checksum
+ * flags.
+ *
+ * MAC_LSO_EMUL
+ *
+ * If an mblk is marked with HW_LSO, then segment the LSO mblk
+ * into a new chain of mblks which reference the original data
+ * block. This flag DOES NOT imply MAC_HWCKSUM_EMUL. If the
+ * caller needs both then it must set both.
+ */
+typedef enum mac_emul {
+ MAC_HWCKSUM_EMUL = (1 << 0),
+ MAC_IPCKSUM_EMUL = (1 << 1),
+ MAC_LSO_EMUL = (1 << 2)
+} mac_emul_t;
+
+#define MAC_HWCKSUM_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL)
+
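A hedged sketch of a caller combining these flags; mac_hw_emul() itself is declared in mac_client.h (also updated in this change), and deliver_chain() is a hypothetical stand-in for the caller's delivery path:

/*
 * Sketch only: ask MAC to emulate LSO segmentation and hardware
 * checksum on a chain before local delivery. Per the comment above,
 * MAC_LSO_EMUL does not imply MAC_HWCKSUM_EMUL, so both are set.
 */
static void
deliver_chain(mblk_t *chain, mblk_t *tail, uint_t cnt)
{
	/* hypothetical local delivery path */
}

static void
emul_and_deliver(mblk_t *chain)
{
	mblk_t *tail = NULL;
	uint_t cnt = 0;

	mac_hw_emul(&chain, &tail, &cnt, MAC_LSO_EMUL | MAC_HWCKSUM_EMUL);
	if (chain != NULL)
		deliver_chain(chain, tail, cnt);
}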
+/*
* Driver interface functions.
*/
extern int mac_open_by_linkid(datalink_id_t,
diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h
index 0fc4939503..8fff314bfe 100644
--- a/usr/src/uts/common/sys/mac_client.h
+++ b/usr/src/uts/common/sys/mac_client.h
@@ -22,7 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -88,6 +88,7 @@ typedef enum {
} mac_client_promisc_type_t;
/* flags passed to mac_unicast_add() */
+
#define MAC_UNICAST_NODUPCHECK 0x0001
#define MAC_UNICAST_PRIMARY 0x0002
#define MAC_UNICAST_HW 0x0004
@@ -115,6 +116,7 @@ typedef enum {
#define MAC_PROMISC_FLAGS_NO_PHYS 0x0002
#define MAC_PROMISC_FLAGS_VLAN_TAG_STRIP 0x0004
#define MAC_PROMISC_FLAGS_NO_COPY 0x0008
+#define MAC_PROMISC_FLAGS_DO_FIXUPS 0x0010
/* flags passed to mac_tx() */
#define MAC_DROP_ON_NO_DESC 0x01 /* freemsg() if no tx descs */
@@ -136,6 +138,7 @@ extern void mac_multicast_remove(mac_client_handle_t, const uint8_t *);
extern void mac_rx_set(mac_client_handle_t, mac_rx_t, void *);
extern void mac_rx_clear(mac_client_handle_t);
+extern void mac_rx_barrier(mac_client_handle_t);
extern void mac_secondary_dup(mac_client_handle_t, mac_client_handle_t);
extern void mac_secondary_cleanup(mac_client_handle_t);
extern mac_tx_cookie_t mac_tx(mac_client_handle_t, mblk_t *,
@@ -198,6 +201,8 @@ extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *);
extern void mac_client_set_rings(mac_client_handle_t, int, int);
+extern void mac_hw_emul(mblk_t **, mblk_t **, uint_t *, mac_emul_t);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h
index 9b3b4fe369..21e8620121 100644
--- a/usr/src/uts/common/sys/mac_client_impl.h
+++ b/usr/src/uts/common/sys/mac_client_impl.h
@@ -24,7 +24,7 @@
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_MAC_CLIENT_IMPL_H
@@ -57,7 +57,7 @@ typedef struct mac_unicast_impl_s { /* Protected by */
uint16_t mui_vid; /* SL */
} mac_unicast_impl_t;
-#define MAC_CLIENT_FLAGS_PRIMARY 0X0001
+#define MAC_CLIENT_FLAGS_PRIMARY 0x0001
#define MAC_CLIENT_FLAGS_VNIC_PRIMARY 0x0002
#define MAC_CLIENT_FLAGS_MULTI_PRIMARY 0x0004
#define MAC_CLIENT_FLAGS_PASSIVE_PRIMARY 0x0008
@@ -83,6 +83,7 @@ typedef struct mac_promisc_impl_s { /* Protected by */
boolean_t mpi_no_phys; /* WO */
boolean_t mpi_strip_vlan_tag; /* WO */
boolean_t mpi_no_copy; /* WO */
+ boolean_t mpi_do_fixups; /* WO */
} mac_promisc_impl_t;
typedef union mac_tx_percpu_s {
@@ -131,12 +132,17 @@ struct mac_client_impl_s { /* Protected by */
uint32_t mci_flags; /* SL */
krwlock_t mci_rw_lock;
mac_unicast_impl_t *mci_unicast_list; /* mci_rw_lock */
+
/*
* The mac_client_impl_t may be shared by multiple clients, i.e.,
* multiple VLANs sharing the same MAC client. In this case the
- * address/vid tubles differ and are each associated with their
+ * address/vid tuples differ and are each associated with their
* own flow entry, but the rest underlying components SRS, etc,
* are common.
+ *
+ * This is only needed to support sun4v vsw. There are several
+ * places in MAC where we could simplify the code if we removed
+ * sun4v support.
*/
flow_entry_t *mci_flent_list; /* mci_rw_lock */
uint_t mci_nflents; /* mci_rw_lock */
@@ -313,6 +319,74 @@ extern int mac_tx_percpu_cnt;
(((mcip)->mci_state_flags & MCIS_TAG_DISABLE) == 0 && \
(mcip)->mci_nvids == 1) \
+/*
+ * MAC Client Implementation State (mci_state_flags)
+ *
+ * MCIS_IS_VNIC
+ *
+ * The client is a VNIC.
+ *
+ * MCIS_EXCLUSIVE
+ *
+ * The client has exclusive control over the MAC, such that it is
+ * the sole client of the MAC.
+ *
+ * MCIS_TAG_DISABLE
+ *
+ * MAC will not add VLAN tags to outgoing traffic. If this flag
+ * is set it is up to the client to add the correct VLAN tag.
+ *
+ * MCIS_STRIP_DISABLE
+ *
+ * MAC will not strip the VLAN tags on incoming traffic before
+ * passing it to mci_rx_fn. This only applies to non-bypass
+ * traffic.
+ *
+ * MCIS_IS_AGGR_PORT
+ *
+ * The client represents a port on an aggr.
+ *
+ * MCIS_CLIENT_POLL_CAPABLE
+ *
+ * The client is capable of polling the Rx TCP/UDP softrings.
+ *
+ * MCIS_DESC_LOGGED
+ *
+ * This flag is set when the client's link info has been logged
+ * by the mac_log_linkinfo() timer. This ensures that the
+ * client's link info is only logged once.
+ *
+ * MCIS_SHARE_BOUND
+ *
+ * This client has an HIO share bound to it.
+ *
+ * MCIS_DISABLE_TX_VID_CHECK
+ *
+ * MAC will not check the VID of the client's Tx traffic.
+ *
+ * MCIS_USE_DATALINK_NAME
+ *
+ * The client is using the same name as its underlying MAC. This
+ * happens when dlmgmtd is unreachable during client creation.
+ *
+ * MCIS_UNICAST_HW
+ *
+ * The client requires MAC address hardware classification. This
+ * is only used by sun4v vsw.
+ *
+ * MCIS_IS_AGGR_CLIENT
+ *
+ * The client sits atop an aggr.
+ *
+ * MCIS_RX_BYPASS_DISABLE
+ *
+ * Do not allow the client to enable DLS bypass.
+ *
+ * MCIS_NO_UNICAST_ADDR
+ *
+ * This client has no MAC unicast address associated with it.
+ */
/* MCI state flags */
#define MCIS_IS_VNIC 0x0001
#define MCIS_EXCLUSIVE 0x0002
@@ -325,7 +399,7 @@ extern int mac_tx_percpu_cnt;
#define MCIS_DISABLE_TX_VID_CHECK 0x0100
#define MCIS_USE_DATALINK_NAME 0x0200
#define MCIS_UNICAST_HW 0x0400
-#define MCIS_IS_AGGR 0x0800
+#define MCIS_IS_AGGR_CLIENT 0x0800
#define MCIS_RX_BYPASS_DISABLE 0x1000
#define MCIS_NO_UNICAST_ADDR 0x2000
@@ -337,8 +411,7 @@ extern int mac_tx_percpu_cnt;
extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *);
extern void mac_client_init(void);
extern void mac_client_fini(void);
-extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *,
- mac_client_impl_t *);
+extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, mac_client_impl_t *);
extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *);
diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h
index 6b409513a6..97b3fd685a 100644
--- a/usr/src/uts/common/sys/mac_client_priv.h
+++ b/usr/src/uts/common/sys/mac_client_priv.h
@@ -22,7 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -58,6 +58,9 @@ extern const mac_info_t *mac_info(mac_handle_t);
extern boolean_t mac_info_get(const char *, mac_info_t *);
extern boolean_t mac_promisc_get(mac_handle_t);
+extern boolean_t mac_protect_check_addr(mac_client_handle_t, boolean_t,
+ in6_addr_t *);
+
extern int mac_start(mac_handle_t);
extern void mac_stop(mac_handle_t);
@@ -121,9 +124,17 @@ extern void mac_tx_client_quiesce(mac_client_handle_t);
extern void mac_tx_client_condemn(mac_client_handle_t);
extern void mac_tx_client_restart(mac_client_handle_t);
extern void mac_srs_perm_quiesce(mac_client_handle_t, boolean_t);
+extern uint_t mac_hwrings_idx_get(mac_handle_t, uint_t, mac_group_handle_t *,
+ mac_ring_handle_t *, mac_ring_type_t);
extern int mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *,
mac_ring_handle_t *, mac_ring_type_t);
extern uint_t mac_hwring_getinfo(mac_ring_handle_t);
+extern void mac_hwring_set_passthru(mac_ring_handle_t, mac_rx_t, void *,
+ mac_resource_handle_t);
+extern void mac_hwring_clear_passthru(mac_ring_handle_t);
+extern void mac_client_set_flow_cb(mac_client_handle_t, mac_rx_t, void *);
+extern void mac_client_clear_flow_cb(mac_client_handle_t);
+
extern void mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t,
mac_ring_handle_t);
extern void mac_hwring_teardown(mac_ring_handle_t);
@@ -131,6 +142,8 @@ extern int mac_hwring_disable_intr(mac_ring_handle_t);
extern int mac_hwring_enable_intr(mac_ring_handle_t);
extern int mac_hwring_start(mac_ring_handle_t);
extern void mac_hwring_stop(mac_ring_handle_t);
+extern int mac_hwring_activate(mac_ring_handle_t);
+extern void mac_hwring_quiesce(mac_ring_handle_t);
extern mblk_t *mac_hwring_poll(mac_ring_handle_t, int);
extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *);
extern int mac_hwring_getstat(mac_ring_handle_t, uint_t, uint64_t *);
@@ -144,6 +157,13 @@ extern void mac_hwring_set_default(mac_handle_t, mac_ring_handle_t);
extern int mac_hwgroup_addmac(mac_group_handle_t, const uint8_t *);
extern int mac_hwgroup_remmac(mac_group_handle_t, const uint8_t *);
+extern int mac_hwgroup_addvlan(mac_group_handle_t, uint16_t);
+extern int mac_hwgroup_remvlan(mac_group_handle_t, uint16_t);
+
+extern boolean_t mac_has_hw_vlan(mac_handle_t);
+
+extern uint_t mac_get_num_rx_groups(mac_handle_t);
+extern int mac_set_promisc(mac_handle_t, boolean_t);
extern void mac_set_upper_mac(mac_client_handle_t, mac_handle_t,
mac_resource_props_t *);
@@ -171,6 +191,7 @@ extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t);
extern void *mac_get_devinfo(mac_handle_t);
extern boolean_t mac_is_vnic(mac_handle_t);
+extern boolean_t mac_is_overlay(mac_handle_t);
extern uint32_t mac_no_notification(mac_handle_t);
extern int mac_set_prop(mac_handle_t, mac_prop_id_t, char *, void *, uint_t);
diff --git a/usr/src/uts/common/sys/mac_flow.h b/usr/src/uts/common/sys/mac_flow.h
index e290ba7dbe..d37752ec23 100644
--- a/usr/src/uts/common/sys/mac_flow.h
+++ b/usr/src/uts/common/sys/mac_flow.h
@@ -22,7 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc. All rights reserved.
*/
#ifndef _MAC_FLOW_H
@@ -155,6 +155,14 @@ typedef enum {
#define MPT_MAXIPADDR MPT_MAXCNT
#define MPT_MAXCID MPT_MAXCNT
#define MPT_MAXCIDLEN 256
+#define MPT_FALSE 0x00000000
+#define MPT_TRUE 0x00000001
+
+/* Dynamic address detection types */
+#define MPT_DYN_DHCPV4 0x00000001
+#define MPT_DYN_DHCPV6 0x00000002
+#define MPT_DYN_SLAAC 0x00000004
+#define MPT_DYN_ALL 0x00000007
typedef struct mac_ipaddr_s {
uint32_t ip_version;
@@ -175,11 +183,13 @@ typedef struct mac_dhcpcid_s {
} mac_dhcpcid_t;
typedef struct mac_protect_s {
- uint32_t mp_types;
- uint32_t mp_ipaddrcnt;
- mac_ipaddr_t mp_ipaddrs[MPT_MAXIPADDR];
- uint32_t mp_cidcnt;
- mac_dhcpcid_t mp_cids[MPT_MAXCID];
+ uint32_t mp_types; /* Enabled protection types */
+ uint32_t mp_ipaddrcnt; /* Count of allowed IPs */
+ mac_ipaddr_t mp_ipaddrs[MPT_MAXIPADDR]; /* Allowed IPs */
+ uint32_t mp_cidcnt; /* Count of allowed DHCP CIDs */
+ mac_dhcpcid_t mp_cids[MPT_MAXCID]; /* Allowed DHCP CIDs */
+ uint32_t mp_allcids; /* Whether to allow all CIDs through */
+ uint32_t mp_dynamic; /* Enabled dynamic address methods */
} mac_protect_t;
/* The default priority for links */
diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h
index 774c4fad9a..ce09304699 100644
--- a/usr/src/uts/common/sys/mac_impl.h
+++ b/usr/src/uts/common/sys/mac_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _SYS_MAC_IMPL_H
@@ -108,6 +108,7 @@ typedef struct mac_cb_info_s {
kcondvar_t mcbi_cv;
uint_t mcbi_del_cnt; /* Deleted callback cnt */
uint_t mcbi_walker_cnt; /* List walker count */
+ uint_t mcbi_barrier_cnt; /* Barrier waiter count */
} mac_cb_info_t;
typedef struct mac_notify_cb_s {
@@ -123,40 +124,18 @@ typedef struct mac_notify_cb_s {
*/
typedef boolean_t (*mcb_func_t)(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
-#define MAC_CALLBACK_WALKER_INC(mcbi) { \
- mutex_enter((mcbi)->mcbi_lockp); \
- (mcbi)->mcbi_walker_cnt++; \
- mutex_exit((mcbi)->mcbi_lockp); \
-}
+#define MAC_CALLBACK_WALKER_INC(mcbi) \
+ mac_callback_walker_enter(mcbi)
-#define MAC_CALLBACK_WALKER_INC_HELD(mcbi) (mcbi)->mcbi_walker_cnt++;
-
-#define MAC_CALLBACK_WALKER_DCR(mcbi, headp) { \
- mac_cb_t *rmlist; \
- \
- mutex_enter((mcbi)->mcbi_lockp); \
- if (--(mcbi)->mcbi_walker_cnt == 0 && (mcbi)->mcbi_del_cnt != 0) { \
- rmlist = mac_callback_walker_cleanup((mcbi), headp); \
- mac_callback_free(rmlist); \
- cv_broadcast(&(mcbi)->mcbi_cv); \
- } \
- mutex_exit((mcbi)->mcbi_lockp); \
-}
+#define MAC_CALLBACK_WALKER_DCR(mcbi, headp) \
+ mac_callback_walker_exit(mcbi, headp, B_FALSE)
-#define MAC_PROMISC_WALKER_INC(mip) \
- MAC_CALLBACK_WALKER_INC(&(mip)->mi_promisc_cb_info)
-
-#define MAC_PROMISC_WALKER_DCR(mip) { \
- mac_cb_info_t *mcbi; \
- \
- mcbi = &(mip)->mi_promisc_cb_info; \
- mutex_enter(mcbi->mcbi_lockp); \
- if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) { \
- i_mac_promisc_walker_cleanup(mip); \
- cv_broadcast(&mcbi->mcbi_cv); \
- } \
- mutex_exit(mcbi->mcbi_lockp); \
-}
+#define MAC_PROMISC_WALKER_INC(mip) \
+ mac_callback_walker_enter(&(mip)->mi_promisc_cb_info)
+
+#define MAC_PROMISC_WALKER_DCR(mip) \
+ mac_callback_walker_exit(&(mip)->mi_promisc_cb_info, \
+ &(mip)->mi_promisc_list, B_TRUE)
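The new functions keep the old macros' contract: walkers hold the list stable while iterating, and the last walker out performs any deferred removals. A hedged sketch of the bracket as a consumer would use it; notify_cb() and the mcb_nextp link field are assumptions here:

/*
 * Sketch of the walker bracket. Removals requested during the walk
 * are deferred until mac_callback_walker_exit() drops the last
 * walker reference.
 */
static void
notify_cb(mac_cb_t *mcb)
{
	/* hypothetical consumer of each callback element */
}

static void
walk_callbacks(mac_cb_info_t *mcbi, mac_cb_t **headp)
{
	mac_cb_t *mcb;

	MAC_CALLBACK_WALKER_INC(mcbi);
	for (mcb = *headp; mcb != NULL; mcb = mcb->mcb_nextp)
		notify_cb(mcb);
	MAC_CALLBACK_WALKER_DCR(mcbi, headp);
}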
typedef struct mactype_s {
const char *mt_ident;
@@ -208,9 +187,18 @@ struct mac_ring_s {
mac_ring_t *mr_next; /* next ring in the chain */
mac_group_handle_t mr_gh; /* reference to group */
- mac_classify_type_t mr_classify_type; /* HW vs SW */
+ mac_classify_type_t mr_classify_type;
struct mac_soft_ring_set_s *mr_srs; /* associated SRS */
- mac_ring_handle_t mr_prh; /* associated pseudo ring hdl */
+ mac_ring_handle_t mr_prh; /* associated pseudo ring hdl */
+
+ /*
+ * Ring passthru callback and arguments. See the
+ * MAC_PASSTHRU_CLASSIFIER comment in mac_provider.h.
+ */
+ mac_rx_t mr_pt_fn;
+ void *mr_pt_arg1;
+ mac_resource_handle_t mr_pt_arg2;
+
uint_t mr_refcnt; /* Ring references */
/* ring generation no. to guard against drivers using stale rings */
uint64_t mr_gen_num;
@@ -244,7 +232,7 @@ struct mac_ring_s {
(mr)->mr_refcnt++; \
}
-#define MR_REFRELE(mr) { \
+#define MR_REFRELE(mr) { \
mutex_enter(&(mr)->mr_lock); \
ASSERT((mr)->mr_refcnt != 0); \
(mr)->mr_refcnt--; \
@@ -255,8 +243,8 @@ struct mac_ring_s {
}
/*
- * Per mac client flow information associated with a RX group.
- * The entire structure is SL protected.
+ * Used to attach MAC clients to an Rx group. The members are SL
+ * protected.
*/
typedef struct mac_grp_client {
struct mac_grp_client *mgc_next;
@@ -270,15 +258,20 @@ typedef struct mac_grp_client {
((g)->mrg_clients->mgc_next == NULL)) ? \
(g)->mrg_clients->mgc_client : NULL)
+#define MAC_GROUP_HW_VLAN(g) \
+ (((g) != NULL) && \
+ ((g)->mrg_info.mgi_addvlan != NULL) && \
+ ((g)->mrg_info.mgi_remvlan != NULL))
+
/*
* Common ring group data structure for ring control and management.
- * The entire structure is SL protected
+ * The entire structure is SL protected.
*/
struct mac_group_s {
int mrg_index; /* index in the list */
mac_ring_type_t mrg_type; /* ring type */
mac_group_state_t mrg_state; /* state of the group */
- mac_group_t *mrg_next; /* next ring in the chain */
+ mac_group_t *mrg_next; /* next group in the chain */
mac_handle_t mrg_mh; /* reference to MAC */
mac_ring_t *mrg_rings; /* grouped rings */
uint_t mrg_cur_count; /* actual size of group */
@@ -300,7 +293,7 @@ struct mac_group_s {
mac_ring_handle_t mrh = rh; \
mac_impl_t *mimpl = (mac_impl_t *)mhp; \
/* \
- * Send packets through a selected tx ring, or through the \
+ * Send packets through a selected tx ring, or through the \
* default handler if there is no selected ring. \
*/ \
if (mrh == NULL) \
@@ -322,9 +315,9 @@ struct mac_group_s {
#define MAC_TX(mip, rh, mp, src_mcip) { \
mac_ring_handle_t rhandle = (rh); \
/* \
- * If there is a bound Hybrid I/O share, send packets through \
+ * If there is a bound Hybrid I/O share, send packets through \
* the default tx ring. (When there's a bound Hybrid I/O share, \
- * the tx rings of this client are mapped in the guest domain \
+ * the tx rings of this client are mapped in the guest domain \
* and not accessible from here.) \
*/ \
_NOTE(CONSTANTCONDITION) \
@@ -333,7 +326,7 @@ struct mac_group_s {
if (mip->mi_promisc_list != NULL) \
mac_promisc_dispatch(mip, mp, src_mcip); \
/* \
- * Grab the proper transmit pointer and handle. Special \
+ * Grab the proper transmit pointer and handle. Special \
* optimization: we can test mi_bridge_link itself atomically, \
* and if that indicates no bridge send packets through tx ring.\
*/ \
@@ -360,17 +353,23 @@ typedef struct mac_mcast_addrs_s {
} mac_mcast_addrs_t;
typedef enum {
- MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1, /* hardware steering */
+ MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1, /* HW classification */
MAC_ADDRESS_TYPE_UNICAST_PROMISC /* promiscuous mode */
} mac_address_type_t;
+typedef struct mac_vlan_s {
+ struct mac_vlan_s *mv_next;
+ uint16_t mv_vid;
+} mac_vlan_t;
+
typedef struct mac_address_s {
mac_address_type_t ma_type; /* address type */
- int ma_nusers; /* number of users */
- /* of that address */
+ int ma_nusers; /* num users of addr */
struct mac_address_s *ma_next; /* next address */
uint8_t ma_addr[MAXMACADDRLEN]; /* address value */
size_t ma_len; /* address length */
+ mac_vlan_t *ma_vlans; /* VLANs on this addr */
+ boolean_t ma_untagged; /* accept untagged? */
mac_group_t *ma_group; /* associated group */
mac_impl_t *ma_mip; /* MAC handle */
} mac_address_t;
@@ -487,7 +486,7 @@ struct mac_impl_s {
mac_capab_led_t mi_led;
/*
- * MAC address list. SL protected.
+ * MAC address and VLAN lists. SL protected.
*/
mac_address_t *mi_addresses;
@@ -654,6 +653,7 @@ struct mac_impl_s {
#define MIS_LEGACY 0x0040
#define MIS_NO_ACTIVE 0x0080
#define MIS_POLL_DISABLE 0x0100
+#define MIS_IS_OVERLAY 0x0200
#define mi_getstat mi_callbacks->mc_getstat
#define mi_start mi_callbacks->mc_start
@@ -722,23 +722,35 @@ typedef struct mac_client_impl_s mac_client_impl_t;
extern void mac_init(void);
extern int mac_fini(void);
+/*
+ * MAC packet/chain drop functions to aggregate all dropped-packet
+ * debugging to a single surface.
+ */
+/*PRINTFLIKE2*/
+extern void mac_drop_pkt(mblk_t *, const char *, ...)
+ __KPRINTFLIKE(2);
+
+/*PRINTFLIKE2*/
+extern void mac_drop_chain(mblk_t *, const char *, ...)
+ __KPRINTFLIKE(2);
+
extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *);
extern boolean_t mac_ip_hdr_length_v6(ip6_t *, uint8_t *, uint16_t *,
uint8_t *, ip6_frag_t **);
extern mblk_t *mac_copymsgchain_cksum(mblk_t *);
-extern mblk_t *mac_fix_cksum(mblk_t *);
extern void mac_packet_print(mac_handle_t, mblk_t *);
extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *,
mac_header_info_t *);
extern void mac_tx_notify(mac_impl_t *);
-extern boolean_t mac_callback_find(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
-extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
-extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
-extern void mac_callback_remove_wait(mac_cb_info_t *);
-extern void mac_callback_free(mac_cb_t *);
-extern mac_cb_t *mac_callback_walker_cleanup(mac_cb_info_t *, mac_cb_t **);
+extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
+extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
+extern void mac_callback_remove_wait(mac_cb_info_t *);
+extern void mac_callback_barrier(mac_cb_info_t *);
+extern void mac_callback_free(mac_cb_t *);
+extern void mac_callback_walker_enter(mac_cb_info_t *);
+extern void mac_callback_walker_exit(mac_cb_info_t *, mac_cb_t **, boolean_t);
/* in mac_bcast.c */
extern void mac_bcast_init(void);
@@ -759,6 +771,8 @@ extern void mac_client_bcast_refresh(mac_client_impl_t *, mac_multicst_t,
*/
extern int mac_group_addmac(mac_group_t *, const uint8_t *);
extern int mac_group_remmac(mac_group_t *, const uint8_t *);
+extern int mac_group_addvlan(mac_group_t *, uint16_t);
+extern int mac_group_remvlan(mac_group_t *, uint16_t);
extern int mac_rx_group_add_flow(mac_client_impl_t *, flow_entry_t *,
mac_group_t *);
extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *);
@@ -779,6 +793,7 @@ extern void mac_rx_switch_grp_to_sw(mac_group_t *);
* MAC address functions are used internally by MAC layer.
*/
extern mac_address_t *mac_find_macaddr(mac_impl_t *, uint8_t *);
+extern mac_address_t *mac_find_macaddr_vlan(mac_impl_t *, uint8_t *, uint16_t);
extern boolean_t mac_check_macaddr_shared(mac_address_t *);
extern int mac_update_macaddr(mac_address_t *, uint8_t *);
extern void mac_freshen_macaddr(mac_address_t *, uint8_t *);
@@ -829,7 +844,7 @@ extern void mac_flow_set_name(flow_entry_t *, const char *);
extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t);
extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t);
extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *);
-extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t);
+extern void mac_rx_def(void *, mac_resource_handle_t, mblk_t *, boolean_t);
extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *);
extern void i_mac_share_alloc(mac_client_impl_t *);
@@ -849,7 +864,6 @@ extern void mac_tx_client_block(mac_client_impl_t *);
extern void mac_tx_client_unblock(mac_client_impl_t *);
extern void mac_tx_invoke_callbacks(mac_client_impl_t *, mac_tx_cookie_t);
extern int i_mac_promisc_set(mac_impl_t *, boolean_t);
-extern void i_mac_promisc_walker_cleanup(mac_impl_t *);
extern mactype_t *mactype_getplugin(const char *);
extern void mac_addr_factory_init(mac_impl_t *);
extern void mac_addr_factory_fini(mac_impl_t *);
@@ -863,8 +877,9 @@ extern int mac_start_group(mac_group_t *);
extern void mac_stop_group(mac_group_t *);
extern int mac_start_ring(mac_ring_t *);
extern void mac_stop_ring(mac_ring_t *);
-extern int mac_add_macaddr(mac_impl_t *, mac_group_t *, uint8_t *, boolean_t);
-extern int mac_remove_macaddr(mac_address_t *);
+extern int mac_add_macaddr_vlan(mac_impl_t *, mac_group_t *, uint8_t *,
+ uint16_t, boolean_t);
+extern int mac_remove_macaddr_vlan(mac_address_t *, uint16_t);
extern void mac_set_group_state(mac_group_t *, mac_group_state_t);
extern void mac_group_add_client(mac_group_t *, mac_client_impl_t *);
diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h
index 4c91c03967..2dea3a4758 100644
--- a/usr/src/uts/common/sys/mac_provider.h
+++ b/usr/src/uts/common/sys/mac_provider.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright (c) 2018, Joyent, Inc.
*/
#ifndef _SYS_MAC_PROVIDER_H
@@ -108,6 +108,7 @@ typedef enum {
MAC_CAPAB_NO_ZCOPY = 0x00100000, /* boolean only, no data */
MAC_CAPAB_LEGACY = 0x00200000, /* data is mac_capab_legacy_t */
MAC_CAPAB_VRRP = 0x00400000, /* data is mac_capab_vrrp_t */
+ MAC_CAPAB_OVERLAY = 0x00800000, /* boolean only, no data */
MAC_CAPAB_TRANSCEIVER = 0x01000000, /* mac_capab_transceiver_t */
MAC_CAPAB_LED = 0x02000000 /* data is mac_capab_led_t */
} mac_capab_t;
@@ -242,16 +243,59 @@ typedef struct mac_callbacks_s {
/*
* Virtualization Capabilities
*/
+
/*
- * The ordering of entries below is important. MAC_HW_CLASSIFIER
- * is the cutoff below which are entries which don't depend on
- * H/W. MAC_HW_CLASSIFIER and entries after that are cases where
- * H/W has been updated through add/modify/delete APIs.
+ * The type of ring classification. This is used by MAC to determine
+ * what, if any, processing it has to do upon receiving traffic on a
+ * particular Rx ring.
+ *
+ * MAC_NO_CLASSIFIER
+ *
+ * No classification has been set. No traffic should cross an Rx
+ * ring in this state.
+ *
+ * MAC_SW_CLASSIFIER
+ *
+ * The driver delivers traffic for multiple clients to this ring.
+ * All traffic must be software classified by MAC to guarantee
+ * delivery to the correct client. This classification type may
+ * be chosen for several reasons.
+ *
+ * o The driver provides only one group and there are multiple
+ * clients using the MAC.
+ *
+ * o The driver provides some hardware filtering but not enough
+ * to fully classify the traffic. E.g., a VLAN VNIC requires L2
+ * unicast address filtering as well as VLAN filtering, but
+ * some drivers may only support the former.
+ *
+ * o The ring belongs to the default group. The default group
+ * acts as a spillover for all clients that can't reserve an
+ * exclusive group. It also handles multicast traffic for all
+ * clients. For these reasons, the default group's rings are
+ * always software classified.
+ *
+ * MAC_HW_CLASSIFIER
+ *
+ * The driver delivers traffic for a single MAC client across
+ * this ring. With this guarantee, MAC can simply pass the
+ * traffic up the stack or even allow polling of the ring.
+ *
+ * MAC_PASSTHRU_CLASSIFIER
+ *
+ * The ring is in "passthru" mode. In this mode we bypass all of
+ * the typical MAC processing and pass the traffic directly to
+ * the mr_pt_fn callback (see mac_rx_common()). This is used in
+ * cases where there is another module acting as MAC provider on
+ * behalf of the driver. E.g., link aggregation uses this mode to
+ * take full control of the port's rings, allowing it to enforce
+ * the LACP protocol and aggregate rings across discrete drivers.
*/
typedef enum {
MAC_NO_CLASSIFIER = 0,
MAC_SW_CLASSIFIER,
- MAC_HW_CLASSIFIER
+ MAC_HW_CLASSIFIER,
+ MAC_PASSTHRU_CLASSIFIER
} mac_classify_type_t;
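A hedged sketch of how an upper provider such as link aggregation might drive a port ring into passthru mode via the new mac_client_priv.h interfaces elsewhere in this change; aggr_rx_ring_cb() and the assumed mac_rx_t signature are illustrative:

/*
 * Hypothetical receive callback; the mac_rx_t signature is assumed.
 */
static void
aggr_rx_ring_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    boolean_t loopback)
{
	/* deliver mp into the upper MAC */
}

static void
port_ring_take(mac_ring_handle_t rh, void *arg)
{
	mac_hwring_set_passthru(rh, aggr_rx_ring_cb, arg, NULL);
}

static void
port_ring_release(mac_ring_handle_t rh)
{
	mac_hwring_clear_passthru(rh);
}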
typedef void (*mac_rx_func_t)(void *, mac_resource_handle_t, mblk_t *,
@@ -281,6 +325,28 @@ typedef enum {
} mac_ring_type_t;
/*
+ * The value VLAN_ID_NONE (VID 0) means a client does not have
+ * membership to any VLAN. However, this statement is true for both
+ * untagged packets and priority tagged packets leading to confusion
+ * over what semantic is intended. To the provider, VID 0 is a valid
+ * VID when priority tagging is in play. To MAC and everything above,
+ * VLAN_ID_NONE almost universally implies untagged traffic. Thus, we
+ * convert VLAN_ID_NONE to a sentinel value (MAC_VLAN_UNTAGGED) at the
+ * border between MAC and MAC provider. This informs the provider that
+ * the client is interested in untagged traffic and the provider
+ * should set any relevant bits to receive such traffic.
+ *
+ * Currently, the API between MAC and the provider passes the VID as a
+ * uint16_t. In the future this could actually be the entire TCI mask
+ * (PCP, DEI, and VID). This current scheme is safe in that potential
+ * future world as well; as 0xFFFF is not a valid TCI (the 0xFFF VID
+ * is reserved and never transmitted across networks).
+ */
+#define MAC_VLAN_UNTAGGED UINT16_MAX
+#define MAC_VLAN_UNTAGGED_VID(vid) \
+ (((vid) == VLAN_ID_NONE) ? MAC_VLAN_UNTAGGED : (vid))
+
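A minimal sketch of the conversion at the border; the MAC_GROUP_HW_VLAN() check and the mgi_addvlan entry point come from mac_impl.h in this same change:

/*
 * Sketch: MAC converts VLAN_ID_NONE to the sentinel before asking
 * the provider to program a VLAN filter for the client.
 */
static int
group_addvlan_sketch(mac_group_t *grp, uint16_t vid)
{
	if (!MAC_GROUP_HW_VLAN(grp))
		return (ENOTSUP);
	return (grp->mrg_info.mgi_addvlan(grp->mrg_info.mgi_driver,
	    MAC_VLAN_UNTAGGED_VID(vid)));
}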
+/*
* Grouping type of a ring group
*
* MAC_GROUP_TYPE_STATIC: The ring group can not be re-grouped.
@@ -342,6 +408,7 @@ typedef struct mac_ring_info_s {
mac_ring_poll_t poll;
} mrfunion;
mac_ring_stat_t mri_stat;
+
/*
* mri_flags will have some bits set to indicate some special
* property/feature of a ring like serialization needed for a
@@ -358,6 +425,8 @@ typedef struct mac_ring_info_s {
* #defines for mri_flags. The flags are temporary flags that are provided
* only to workaround issues in specific drivers, and they will be
* removed in the future.
+ *
+ * These are consumed only by sun4v and neptune (nxge).
*/
#define MAC_RING_TX_SERIALIZE 0x1
#define MAC_RING_RX_ENQUEUE 0x2
@@ -366,6 +435,8 @@ typedef int (*mac_group_start_t)(mac_group_driver_t);
typedef void (*mac_group_stop_t)(mac_group_driver_t);
typedef int (*mac_add_mac_addr_t)(void *, const uint8_t *);
typedef int (*mac_rem_mac_addr_t)(void *, const uint8_t *);
+typedef int (*mac_add_vlan_filter_t)(mac_group_driver_t, uint16_t);
+typedef int (*mac_rem_vlan_filter_t)(mac_group_driver_t, uint16_t);
struct mac_group_info_s {
mac_group_driver_t mgi_driver; /* Driver reference */
@@ -374,9 +445,11 @@ struct mac_group_info_s {
uint_t mgi_count; /* Count of rings */
mac_intr_t mgi_intr; /* Optional per-group intr */
- /* Only used for rx groups */
+ /* Only used for Rx groups */
mac_add_mac_addr_t mgi_addmac; /* Add a MAC address */
mac_rem_mac_addr_t mgi_remmac; /* Remove a MAC address */
+ mac_add_vlan_filter_t mgi_addvlan; /* Add a VLAN filter */
+ mac_rem_vlan_filter_t mgi_remvlan; /* Remove a VLAN filter */
};
/*
@@ -558,11 +631,12 @@ extern void mac_prop_info_set_range_uint32(
extern void mac_prop_info_set_perm(mac_prop_info_handle_t,
uint8_t);
-extern void mac_hcksum_get(mblk_t *, uint32_t *,
+extern void mac_hcksum_get(const mblk_t *, uint32_t *,
uint32_t *, uint32_t *, uint32_t *,
uint32_t *);
extern void mac_hcksum_set(mblk_t *, uint32_t, uint32_t,
uint32_t, uint32_t, uint32_t);
+extern void mac_hcksum_clone(const mblk_t *, mblk_t *);
extern void mac_lso_get(mblk_t *, uint32_t *, uint32_t *);
diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h
index 0d49a2ff4d..65819c1209 100644
--- a/usr/src/uts/common/sys/mman.h
+++ b/usr/src/uts/common/sys/mman.h
@@ -340,6 +340,7 @@ struct memcntl_mha32 {
#define MS_SYNC 0x4 /* wait for msync */
#define MS_ASYNC 0x1 /* return immediately */
#define MS_INVALIDATE 0x2 /* invalidate caches */
+#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */
#if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__)
/* functions to mctl */
diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h
index 88c98dc5a4..7196f7b3ac 100644
--- a/usr/src/uts/common/sys/mntent.h
+++ b/usr/src/uts/common/sys/mntent.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*
* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
@@ -47,6 +48,7 @@ extern "C" {
#define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */
#define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */
#define MNTTYPE_LOFS "lofs" /* Loop back file system */
+#define MNTTYPE_HYPRLOFS "hyprlofs" /* Hyperlofs file system */
#define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */
#define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */
#define MNTTYPE_SWAP "swap" /* Swap file system */
diff --git a/usr/src/uts/common/sys/netconfig.h b/usr/src/uts/common/sys/netconfig.h
index 6407534a3b..658f9f3f6b 100644
--- a/usr/src/uts/common/sys/netconfig.h
+++ b/usr/src/uts/common/sys/netconfig.h
@@ -28,6 +28,7 @@
*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_NETCONFIG_H
diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h
index b21504109c..92bd5b897d 100644
--- a/usr/src/uts/common/sys/neti.h
+++ b/usr/src/uts/common/sys/neti.h
@@ -21,6 +21,8 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018, Joyent, Inc.
*/
#ifndef _SYS_NETI_H
@@ -46,6 +48,9 @@ struct msgb; /* avoiding sys/stream.h here */
#define NHF_INET "NHF_INET"
#define NHF_INET6 "NHF_INET6"
#define NHF_ARP "NHF_ARP"
+#define NHF_VND_INET "NHF_VND_INET"
+#define NHF_VND_INET6 "NHF_VND_INET6"
+#define NHF_VIONA "NHF_VIONA"
/*
* Event identification
@@ -61,7 +66,7 @@ struct msgb; /* avoiding sys/stream.h here */
/*
* Network NIC hardware checksum capability
*/
-#define NET_HCK_NONE 0x00
+#define NET_HCK_NONE 0x00
#define NET_HCK_L3_FULL 0x01
#define NET_HCK_L3_PART 0x02
#define NET_HCK_L4_FULL 0x10
diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h
index 7ee33318cd..b327e69fad 100644
--- a/usr/src/uts/common/sys/netstack.h
+++ b/usr/src/uts/common/sys/netstack.h
@@ -88,7 +88,8 @@ typedef id_t netstackid_t;
#define NS_IPSECESP 16
#define NS_IPNET 17
#define NS_ILB 18
-#define NS_MAX (NS_ILB+1)
+#define NS_VND 19
+#define NS_MAX (NS_VND+1)
/*
* State maintained for each module which tracks the state of
diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h
new file mode 100644
index 0000000000..12d0dbca51
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay.h
@@ -0,0 +1,96 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_H
+#define _SYS_OVERLAY_H
+
+/*
+ * Overlay device support
+ */
+
+#include <sys/param.h>
+#include <sys/dld_ioc.h>
+#include <sys/mac.h>
+#include <sys/overlay_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define OVERLAY_IOC_CREATE OVERLAYIOC(1)
+#define OVERLAY_IOC_DELETE OVERLAYIOC(2)
+#define OVERLAY_IOC_PROPINFO OVERLAYIOC(3)
+#define OVERLAY_IOC_GETPROP OVERLAYIOC(4)
+#define OVERLAY_IOC_SETPROP OVERLAYIOC(5)
+#define OVERLAY_IOC_NPROPS OVERLAYIOC(6)
+#define OVERLAY_IOC_ACTIVATE OVERLAYIOC(7)
+#define OVERLAY_IOC_STATUS OVERLAYIOC(8)
+
+typedef struct overlay_ioc_create {
+ datalink_id_t oic_linkid;
+ uint32_t oic_filler;
+ uint64_t oic_vnetid;
+ char oic_encap[MAXLINKNAMELEN];
+} overlay_ioc_create_t;
+
+typedef struct overlay_ioc_activate {
+ datalink_id_t oia_linkid;
+} overlay_ioc_activate_t;
+
+typedef struct overlay_ioc_delete {
+ datalink_id_t oid_linkid;
+} overlay_ioc_delete_t;
+
+typedef struct overlay_ioc_nprops {
+ datalink_id_t oipn_linkid;
+ int32_t oipn_nprops;
+} overlay_ioc_nprops_t;
+
+typedef struct overlay_ioc_propinfo {
+ datalink_id_t oipi_linkid;
+ int32_t oipi_id;
+ char oipi_name[OVERLAY_PROP_NAMELEN];
+ uint_t oipi_type;
+ uint_t oipi_prot;
+ uint8_t oipi_default[OVERLAY_PROP_SIZEMAX];
+ uint32_t oipi_defsize;
+ uint32_t oipi_posssize;
+ uint8_t oipi_poss[OVERLAY_PROP_SIZEMAX];
+} overlay_ioc_propinfo_t;
+
+typedef struct overlay_ioc_prop {
+ datalink_id_t oip_linkid;
+ int32_t oip_id;
+ char oip_name[OVERLAY_PROP_NAMELEN];
+ uint8_t oip_value[OVERLAY_PROP_SIZEMAX];
+ uint32_t oip_size;
+} overlay_ioc_prop_t;
+
+typedef enum overlay_status {
+ OVERLAY_I_OK = 0x00,
+ OVERLAY_I_DEGRADED = 0x01
+} overlay_status_t;
+
+typedef struct overlay_ioc_status {
+ datalink_id_t ois_linkid;
+ uint_t ois_status;
+ char ois_message[OVERLAY_STATUS_BUFLEN];
+} overlay_ioc_status_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_H */
diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h
new file mode 100644
index 0000000000..d638096006
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_common.h
@@ -0,0 +1,65 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_COMMON_H
+#define _SYS_OVERLAY_COMMON_H
+
+/*
+ * Common overlay definitions
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum overlay_target_mode {
+ OVERLAY_TARGET_NONE = 0x0,
+ OVERLAY_TARGET_POINT,
+ OVERLAY_TARGET_DYNAMIC
+} overlay_target_mode_t;
+
+typedef enum overlay_plugin_dest {
+ OVERLAY_PLUGIN_D_INVALID = 0x0,
+ OVERLAY_PLUGIN_D_ETHERNET = 0x1,
+ OVERLAY_PLUGIN_D_IP = 0x2,
+ OVERLAY_PLUGIN_D_PORT = 0x4,
+ OVERLAY_PLUGIN_D_MASK = 0x7
+} overlay_plugin_dest_t;
+
+typedef enum overlay_prop_type {
+ OVERLAY_PROP_T_INT = 0x1, /* signed int */
+ OVERLAY_PROP_T_UINT, /* unsigned int */
+ OVERLAY_PROP_T_IP, /* sinaddr6 */
+ OVERLAY_PROP_T_STRING /* OVERLAY_PROP_SIZEMAX */
+} overlay_prop_type_t;
+
+typedef enum overlay_prop_prot {
+ OVERLAY_PROP_PERM_REQ = 0x1,
+ OVERLAY_PROP_PERM_READ = 0x2,
+ OVERLAY_PROP_PERM_WRITE = 0x4,
+ OVERLAY_PROP_PERM_RW = 0x6,
+ OVERLAY_PROP_PERM_RRW = 0x7,
+ OVERLAY_PROP_PERM_MASK = 0x7
+} overlay_prop_prot_t;
+
+#define OVERLAY_PROP_NAMELEN 64
+#define OVERLAY_PROP_SIZEMAX 256
+#define OVERLAY_STATUS_BUFLEN 256
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_COMMON_H */
diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h
new file mode 100644
index 0000000000..7fb8b8da1d
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_impl.h
@@ -0,0 +1,205 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_IMPL_H
+#define _SYS_OVERLAY_IMPL_H
+
+/*
+ * Overlay device support
+ */
+
+#include <sys/overlay.h>
+#include <sys/overlay_common.h>
+#include <sys/overlay_plugin.h>
+#include <sys/overlay_target.h>
+#include <sys/ksynch.h>
+#include <sys/list.h>
+#include <sys/avl.h>
+#include <sys/ksocket.h>
+#include <sys/socket.h>
+#include <sys/refhash.h>
+#include <sys/ethernet.h>
+#include <sys/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define OVEP_VERSION_ONE 0x1
+
+typedef struct overlay_plugin {
+ kmutex_t ovp_mutex;
+ list_node_t ovp_link; /* overlay_plugin_lock */
+ uint_t ovp_active; /* ovp_mutex */
+ const char *ovp_name; /* RO */
+ const overlay_plugin_ops_t *ovp_ops; /* RO */
+ const char *const *ovp_props; /* RO */
+ uint_t ovp_nprops; /* RO */
+ uint_t ovp_id_size; /* RO */
+ overlay_plugin_flags_t ovp_flags; /* RO */
+ overlay_plugin_dest_t ovp_dest; /* RO */
+} overlay_plugin_t;
+
+typedef struct overlay_mux {
+ list_node_t omux_lnode;
+ ksocket_t omux_ksock; /* RO */
+ overlay_plugin_t *omux_plugin; /* RO: associated encap */
+ int omux_domain; /* RO: socket domain */
+ int omux_family; /* RO: socket family */
+ int omux_protocol; /* RO: socket protocol */
+ struct sockaddr *omux_addr; /* RO: socket address */
+ socklen_t omux_alen; /* RO: sockaddr len */
+ kmutex_t omux_lock; /* Protects everything below */
+ uint_t omux_count; /* Active instances */
+ avl_tree_t omux_devices; /* Tree of devices */
+} overlay_mux_t;
+
+typedef enum overlay_target_flag {
+ OVERLAY_T_TEARDOWN = 0x1
+} overlay_target_flag_t;
+
+typedef struct overlay_target {
+ kmutex_t ott_lock;
+ kcondvar_t ott_cond;
+ overlay_target_mode_t ott_mode; /* RO */
+ overlay_plugin_dest_t ott_dest; /* RO */
+ uint64_t ott_id; /* RO */
+ overlay_target_flag_t ott_flags; /* ott_lock */
+ uint_t ott_ocount; /* ott_lock */
+ union { /* ott_lock */
+ overlay_target_point_t ott_point;
+ struct overlay_target_dyn {
+ refhash_t *ott_dhash;
+ avl_tree_t ott_tree;
+ } ott_dyn;
+ } ott_u;
+} overlay_target_t;
+
+typedef enum overlay_dev_flag {
+ OVERLAY_F_ACTIVATED = 0x01, /* Activate ioctl completed */
+ OVERLAY_F_IN_MUX = 0x02, /* Currently in a mux */
+ OVERLAY_F_IN_TX = 0x04, /* Currently doing tx */
+ OVERLAY_F_IN_RX = 0x08, /* Currently doing rx */
+ OVERLAY_F_IOMASK = 0x0c, /* A mask for rx and tx */
+ OVERLAY_F_MDDROP = 0x10, /* Drop traffic for metadata update */
+ OVERLAY_F_STOPMASK = 0x1e, /* None set when stopping */
+ OVERLAY_F_VARPD = 0x20, /* varpd plugin exists */
+ OVERLAY_F_DEGRADED = 0x40, /* device is degraded */
+ OVERLAY_F_MASK = 0x7f /* mask of everything */
+} overlay_dev_flag_t;
+
+typedef struct overlay_dev {
+ kmutex_t odd_lock;
+ kcondvar_t odd_iowait;
+ list_node_t odd_link; /* overlay_dev_lock */
+ mac_handle_t odd_mh; /* RO */
+ overlay_plugin_t *odd_plugin; /* RO */
+ datalink_id_t odd_linkid; /* RO */
+ void *odd_pvoid; /* RO -- only used by plugin */
+ uint_t odd_ref; /* protected by odd_lock */
+ uint_t odd_mtu; /* protected by odd_lock */
+ overlay_dev_flag_t odd_flags; /* protected by odd_lock */
+ uint_t odd_rxcount; /* protected by odd_lock */
+ uint_t odd_txcount; /* protected by odd_lock */
+ overlay_mux_t *odd_mux; /* protected by odd_lock */
+ uint64_t odd_vid; /* RO if active else odd_lock */
+ avl_node_t odd_muxnode; /* managed by mux */
+ overlay_target_t *odd_target; /* See big theory statement */
+ char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */
+} overlay_dev_t;
+
+typedef enum overlay_target_entry_flags {
+ OVERLAY_ENTRY_F_PENDING = 0x01, /* lookup in progress */
+ OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */
+ OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */
+ OVERLAY_ENTRY_F_VALID_MASK = 0x06
+} overlay_target_entry_flags_t;
+
+typedef struct overlay_target_entry {
+ kmutex_t ote_lock;
+ refhash_link_t ote_reflink; /* hashtable link */
+ avl_node_t ote_avllink; /* iteration link */
+ list_node_t ote_qlink;
+ overlay_target_entry_flags_t ote_flags; /* RW: state flags */
+ uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */
+ overlay_target_t *ote_ott; /* RO */
+ overlay_dev_t *ote_odd; /* RO */
+ overlay_target_point_t ote_dest; /* RW: destination */
+ mblk_t *ote_chead; /* RW: blocked mb chain head */
+ mblk_t *ote_ctail; /* RW: blocked mb chain tail */
+ size_t ote_mbsize; /* RW: outstanding mblk size */
+ hrtime_t ote_vtime; /* RW: valid timestamp */
+} overlay_target_entry_t;
+
+
+#define OVERLAY_CTL "overlay"
+
+extern dev_info_t *overlay_dip;
+
+extern mblk_t *overlay_m_tx(void *, mblk_t *);
+
+typedef int (*overlay_dev_iter_f)(overlay_dev_t *, void *);
+extern void overlay_dev_iter(overlay_dev_iter_f, void *);
+
+extern void overlay_plugin_init(void);
+extern overlay_plugin_t *overlay_plugin_lookup(const char *);
+extern void overlay_plugin_rele(overlay_plugin_t *);
+extern void overlay_plugin_fini(void);
+typedef int (*overlay_plugin_walk_f)(overlay_plugin_t *, void *);
+extern void overlay_plugin_walk(overlay_plugin_walk_f, void *);
+
+extern void overlay_io_start(overlay_dev_t *, overlay_dev_flag_t);
+extern void overlay_io_done(overlay_dev_t *, overlay_dev_flag_t);
+
+extern void overlay_mux_init(void);
+extern void overlay_mux_fini(void);
+
+extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int,
+ struct sockaddr *, socklen_t, int *);
+extern void overlay_mux_close(overlay_mux_t *);
+extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *);
+extern void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *);
+extern int overlay_mux_tx(overlay_mux_t *, struct msghdr *, mblk_t *);
+
+extern void overlay_prop_init(overlay_prop_handle_t);
+
+extern void overlay_target_init(void);
+extern int overlay_target_busy(void);
+extern int overlay_target_open(dev_t *, int, int, cred_t *);
+extern int overlay_target_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+extern int overlay_target_close(dev_t, int, int, cred_t *);
+extern void overlay_target_free(overlay_dev_t *);
+
+#define OVERLAY_TARGET_OK 0
+#define OVERLAY_TARGET_DROP 1
+#define OVERLAY_TARGET_ASYNC 2
+extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *,
+ socklen_t *);
+extern void overlay_target_quiesce(overlay_target_t *);
+extern void overlay_target_fini(void);
+
+extern void overlay_fm_init(void);
+extern void overlay_fm_fini(void);
+extern void overlay_fm_degrade(overlay_dev_t *, const char *);
+extern void overlay_fm_restore(overlay_dev_t *);
+
+extern overlay_dev_t *overlay_hold_by_dlid(datalink_id_t);
+extern void overlay_hold_rele(overlay_dev_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_IMPL_H */
diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h
new file mode 100644
index 0000000000..07efaa05df
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_plugin.h
@@ -0,0 +1,324 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_PLUGIN_H
+#define _SYS_OVERLAY_PLUGIN_H
+
+/*
+ * overlay plugin interface for encapsulation/decapsulation modules
+ *
+ * This header file defines how encapsulation and decapsulation plugins
+ * interact within the broader system. At this time, these interfaces are
+ * considered private to illumos and therefore are subject to change. As we gain
+ * more experience with a few of the different encapsulation formats, say nvgre
+ * or geneve, then we can move to make this a more-stable interface.
+ *
+ * A plugin is a general kernel module that uses the miscellaneous mod-linkage.
+ *
+ * In its _init(9E) routine, it must register itself with the overlay
+ * subsystem. To do this, it allocates an overlay_plugin_register_t via
+ * overlay_plugin_alloc(), which it then fills out with the required
+ * information before attempting to register with the system via a call to
+ * overlay_plugin_register(). If that succeeds, it should then call
+ * mod_install(9F). If mod_install(9F) fails, then it should call
+ * overlay_plugin_unregister(). Regardless of success or failure, it should call
+ * overlay_plugin_free() to ensure that any memory that may be associated with
+ * the registration is freed.
+ *
+ * When the module's _fini(9E) is called, overlay_plugin_unregister() should be
+ * called first. It may return an error, such as EBUSY. In such cases, it should
+ * be returned as the return status of _fini(9E). This is necessary: it
+ * ensures that if the module is in use, it doesn't get unloaded out from
+ * under the broader subsystem while it's still in use. A driver can use
+ * this to know that there are no current instances of its private data.
+ *
+ * ------------------
+ * Plugin Definitions
+ * ------------------
+ *
+ * A plugin is required to fill in both an operations vector and a series of
+ * information to the callback routine. Here are the routines and their
+ * purposes. The full signatures are available below.
+ *
+ * overlay_plugin_init_t
+ *
+ * This interface is used to create a new instance of a plugin. An instance
+ * of a plugin will be created for each overlay device that is created. For
+ * example, if one device is created with VXLAN ID 23 and another with
+ * ID 42, then there will be two different calls to this function.
+ *
+ * This function gives the plugin a chance to create a private data
+ * structure that will be returned on subsequent calls to the system.
+ *
+ * overlay_plugin_fini_t
+ *
+ * This is the opposite of overlay_plugin_init_t. It will be called when it
+ * is safe to remove any private data that is associated with this instance
+ * of the plugin.
+ *
+ * overlay_plugin_propinfo_t
+ *
+ * This is called with the name of a property that was registered when
+ * the plugin was created, to request information about that property.
+ * The plugin is
+ * responsible for filling out information such as setting the name, the
+ * type of property it is, the protection of the property (can a user
+ * update it?), whether the property is required, an optional default value
+ * for the property, and an optional set of values or ranges that are
+ * allowed.
+ *
+ * overlay_plugin_getprop_t
+ *
+ * Return the value of the named property from the current instance of the
+ * plugin.
+ *
+ * overlay_plugin_setprop_t
+ *
+ * Set the value of the named property to the specified value for the
+ * current instance of the plugin. Note that it is the plugin's
+ * responsibility to ensure that the value of the property is valid and to
+ * update state as appropriate.
+ *
+ * overlay_plugin_socket_t
+ *
+ * Every overlay device has a corresponding socket that it uses to send and
+ * receive traffic. This routine is used to get the parameters that should
+ * be used to define such a socket. The actual socket may be multiplexed
+ * with other uses of it.
+ *
+ * overlay_plugin_sockopt_t
+ *
+ * Allow a plugin to set any necessary socket options that it needs on the
+ * kernel socket that is being used by a mux. This will only be called
+ * once for a given mux; if additional devices are added to the mux, it
+ * will not be called again.
+ *
+ * overlay_plugin_encap_t
+ *
+ * In this routine you're given a message block and information about the
+ * packet, such as the identifier, and are asked to fill out a message
+ * block that represents the encapsulation header, optionally manipulating
+ * the input message if required.
+ *
+ * overlay_plugin_decap_t
+ *
+ * In this routine, you're given the encapsulated message block. The
+ * requirement is to decapsulate it, determine the correct overlay
+ * identifier for this network, and fill in the header size so the
+ * broader system knows how much of the data should be considered
+ * consumed.
+ *
+ * ovpo_callbacks
+ *
+ * This should be set to zero, it's reserved for future use.
+ *
+ * Once these properties are defined, the module should define the following
+ * members in the overlay_plugin_register_t.
+ *
+ * ovep_version
+ *
+ * Should be set to the value of the macro OVEP_VERSION.
+ *
+ * ovep_name
+ *
+ * Should be set to a character string that has the name of the module.
+ * Generally this should match the name of the kernel module; however, this
+ * is the name that users will use to refer to this module when creating
+ * devices.
+ *
+ * overlay_plugin_ops_t
+ *
+ * Should be set to the functions as described above.
+ *
+ * ovep_props
+ *
+ * This is an array of character strings that holds the names of the
+ * properties of the encapsulation plugin.
+ *
+ *
+ * ovep_id_size
+ *
+ * This is the size in bytes of the valid range for the identifier. The
+ * valid identifier range is treated as an ovep_id_size-byte unsigned
+ * integer, [ 0, 1 << (ovep_id_size * 8) ).
+ *
+ * ovep_flags
+ *
+ * A series of flags that indicate optional features that are supported.
+ * Valid flags include:
+ *
+ * OVEP_F_VLAN_TAG
+ *
+ * The encapsulation format allows for the encapsulated
+ * packet to maintain a VLAN tag.
+ *
+ * ovep_dest
+ *
+ * Describes the kind of destination that the overlay plugin supports for
+ * sending traffic. For example, vxlan uses UDP, therefore it requires both
+ * an IP address and a port; however, nvgre uses the gre header and
+ * therefore only requires an IP address. The following flags may be
+ * combined:
+ *
+ * OVERLAY_PLUGIN_D_ETHERNET
+ *
+ * Indicates that to send a packet to its destination, we
+ * require a link-layer ethernet address.
+ *
+ * OVERLAY_PLUGIN_D_IP
+ *
+ * Indicates that to send a packet to its destination, we
+ * require an IP address. Note, all IP addresses are
+ * transmitted as IPv6 addresses and for an IPv4
+ * destination, using an IPv4-mapped IPv6 address is the
+ * expected way to transmit that.
+ *
+ * OVERLAY_PLUGIN_D_PORT
+ *
+ * Indicates that to send a packet to its destination, a
+ * port is required, this usually indicates that the
+ * protocol uses something like TCP or UDP.
+ *
+ *
+ * -------------------------------------------------
+ * Downcalls, Upcalls, and Synchronization Guarantees
+ * -------------------------------------------------
+ *
+ * Every instance of a given module is independent. The kernel may, and at
+ * some point likely will, perform downcalls into different instances in
+ * parallel. No locking is provided by the framework for synchronization
+ * across instances. If a module finds itself needing that, it will be up to it
+ * to provide it.
+ *
+ * In a given instance, the kernel may call into entry points in parallel. If
+ * the instance has private data, it should likely synchronize it. The one
+ * guarantee that we do make is that calls to getprop and setprop will be
+ * serialized by a caller holding the MAC perimeter.
+ *
+ * While servicing a downcall from the general overlay device framework, a
+ * kernel module should not make any upcalls, excepting those functions that
+ * are defined in this header file, e.g. the property-related callbacks.
+ * Importantly, it cannot make any assumptions about what locks may or may
+ * not be held by the broader system. The only locks that are safe for it
+ * to use are its own.
+ *
+ * ----------------
+ * Downcall Context
+ * ----------------
+ *
+ * For all of the downcalls, excepting the overlay_plugin_encap_t and
+ * overlay_plugin_decap_t, the calls will be made either in kernel or user
+ * context; the module should not assume either way.
+ *
+ * overlay_plugin_encap_t and overlay_plugin_decap_t may be called in user,
+ * kernel or interrupt context; however, it is guaranteed that the interrupt
+ * will be below LOCK_LEVEL, and therefore it is safe to grab locks.
+ */
+
+#include <sys/stream.h>
+#include <sys/mac_provider.h>
+#include <sys/ksocket.h>
+#include <sys/overlay_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define OVEP_VERSION 0x1
+
+typedef enum overlay_plugin_flags {
+ OVEP_F_VLAN_TAG = 0x01 /* Supports VLAN Tags */
+} overlay_plugin_flags_t;
+
+/*
+ * The ID space could easily be more than a 64-bit number, even
+ * though today it's a 24- to 64-bit value. How should we
+ * future-proof ourselves here?
+ */
+typedef struct ovep_encap_info {
+ uint64_t ovdi_id;
+ size_t ovdi_hdr_size;
+} ovep_encap_info_t;
+
+typedef struct __overlay_prop_handle *overlay_prop_handle_t;
+typedef struct __overlay_handle *overlay_handle_t;
+
+/*
+ * Plugins are guaranteed that calls to setprop are serialized. However, any
+ * number of other calls can be going on in parallel otherwise.
+ */
+typedef int (*overlay_plugin_encap_t)(void *, mblk_t *,
+ ovep_encap_info_t *, mblk_t **);
+typedef int (*overlay_plugin_decap_t)(void *, mblk_t *,
+ ovep_encap_info_t *);
+typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **);
+typedef void (*overlay_plugin_fini_t)(void *);
+typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *,
+ struct sockaddr *, socklen_t *);
+typedef int (*overlay_plugin_sockopt_t)(ksocket_t);
+typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *,
+ uint32_t *);
+typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *,
+ uint32_t);
+typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t);
+
+typedef struct overlay_plugin_ops {
+ uint_t ovpo_callbacks;
+ overlay_plugin_init_t ovpo_init;
+ overlay_plugin_fini_t ovpo_fini;
+ overlay_plugin_encap_t ovpo_encap;
+ overlay_plugin_decap_t ovpo_decap;
+ overlay_plugin_socket_t ovpo_socket;
+ overlay_plugin_sockopt_t ovpo_sockopt;
+ overlay_plugin_getprop_t ovpo_getprop;
+ overlay_plugin_setprop_t ovpo_setprop;
+ overlay_plugin_propinfo_t ovpo_propinfo;
+} overlay_plugin_ops_t;
+
+typedef struct overlay_plugin_register {
+ uint_t ovep_version;
+ const char *ovep_name;
+ const overlay_plugin_ops_t *ovep_ops;
+ const char **ovep_props;
+ uint_t ovep_id_size;
+ uint_t ovep_flags;
+ uint_t ovep_dest;
+} overlay_plugin_register_t;
+
+/*
+ * Functions that interact with registration
+ */
+extern overlay_plugin_register_t *overlay_plugin_alloc(uint_t);
+extern void overlay_plugin_free(overlay_plugin_register_t *);
+extern int overlay_plugin_register(overlay_plugin_register_t *);
+extern int overlay_plugin_unregister(const char *);
+
+/*
+ * Property information callbacks
+ */
+extern void overlay_prop_set_name(overlay_prop_handle_t, const char *);
+extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t);
+extern void overlay_prop_set_type(overlay_prop_handle_t, overlay_prop_type_t);
+extern int overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t);
+extern void overlay_prop_set_nodefault(overlay_prop_handle_t);
+extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t,
+ uint32_t);
+extern void overlay_prop_set_range_str(overlay_prop_handle_t, const char *);
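+
+/*
+ * A propinfo callback might be implemented along the following lines (a
+ * sketch; the property name is hypothetical, and the type and protection
+ * constants are assumed to be those defined in overlay_common.h):
+ *
+ *	static int
+ *	mymod_o_propinfo(const char *name, overlay_prop_handle_t phdl)
+ *	{
+ *		if (strcmp(name, "mymod/listen_port") != 0)
+ *			return (EINVAL);
+ *		overlay_prop_set_name(phdl, "mymod/listen_port");
+ *		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
+ *		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ *		overlay_prop_set_nodefault(phdl);
+ *		overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX);
+ *		return (0);
+ *	}
+ */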
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_PLUGIN_H */
diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h
new file mode 100644
index 0000000000..ae92ef3532
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_target.h
@@ -0,0 +1,293 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.
+ */
+
+#ifndef _OVERLAY_TARGET_H
+#define _OVERLAY_TARGET_H
+
+/*
+ * Overlay device varpd ioctl interface (/dev/overlay)
+ */
+
+#include <sys/types.h>
+#include <sys/ethernet.h>
+#include <netinet/in.h>
+#include <sys/overlay_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct overlay_target_point {
+ uint8_t otp_mac[ETHERADDRL];
+ struct in6_addr otp_ip;
+ uint16_t otp_port;
+} overlay_target_point_t;
+
+#define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8))
+
+#define OVERLAY_TARG_INFO (OVERLAY_TARG_IOCTL | 0x01)
+
+typedef enum overlay_targ_info_flags {
+ OVERLAY_TARG_INFO_F_ACTIVE = 0x01,
+ OVERLAY_TARG_INFO_F_DEGRADED = 0x02
+} overlay_targ_info_flags_t;
+
+/*
+ * Get target information about an overlay device
+ */
+typedef struct overlay_targ_info {
+ datalink_id_t oti_linkid;
+ uint32_t oti_needs;
+ uint64_t oti_flags;
+ uint64_t oti_vnetid;
+} overlay_targ_info_t;
+
+/*
+ * Declare an association between a given varpd instance and a datalink.
+ */
+#define OVERLAY_TARG_ASSOCIATE (OVERLAY_TARG_IOCTL | 0x02)
+
+typedef struct overlay_targ_associate {
+ datalink_id_t ota_linkid;
+ uint32_t ota_mode;
+ uint64_t ota_id;
+ uint32_t ota_provides;
+ overlay_target_point_t ota_point;
+} overlay_targ_associate_t;
+
+/*
+ * Remove an association from a device. If the device has already been started,
+ * this implies OVERLAY_TARG_DEGRADE.
+ */
+#define OVERLAY_TARG_DISASSOCIATE (OVERLAY_TARG_IOCTL | 0x3)
+
+/*
+ * Tells the kernel that while a varpd instance still exists, it basically isn't
+ * making any forward progress, so the device should consider itself degraded.
+ */
+#define OVERLAY_TARG_DEGRADE (OVERLAY_TARG_IOCTL | 0x4)
+
+typedef struct overlay_targ_degrade {
+ datalink_id_t otd_linkid;
+ uint32_t otd_pad;
+ char otd_buf[OVERLAY_STATUS_BUFLEN];
+} overlay_targ_degrade_t;
+
+/*
+ * Tells the kernel to remove the degraded status that it set on a device.
+ */
+#define OVERLAY_TARG_RESTORE (OVERLAY_TARG_IOCTL | 0x5)
+
+typedef struct overlay_targ_id {
+ datalink_id_t otid_linkid;
+} overlay_targ_id_t;
+
+/*
+ * The following ioctls are all used to support dynamic lookups from userland,
+ * generally serviced by varpd.
+ *
+ * The way this is designed to work is that userland will have threads sitting
+ * in OVERLAY_TARG_LOOKUP ioctls waiting to service requests. A thread will sit
+ * waiting for work for up to approximately one second before it is sent back
+ * out to userland, giving userland a chance to clean itself up or, more
+ * generally, to come back into the kernel for work. Once these threads
+ * return, they will have a request with which more action can be done. The
+ * following ioctls can all be used to answer the request.
+ *
+ * OVERLAY_TARG_RESPOND - overlay_targ_resp_t
+ *
+ * The overlay_targ_resp_t has the appropriate information from
+ * which a reply can be generated. The information is filled into
+ * an overlay_targ_point_t as appropriate based on the
+ * overlay_plugin_dest_t type.
+ *
+ *
+ * OVERLAY_TARG_DROP - overlay_targ_resp_t
+ *
+ * The overlay_targ_resp_t should identify a request for which to
+ * drop a packet.
+ *
+ *
+ * OVERLAY_TARG_INJECT - overlay_targ_pkt_t
+ *
+ * The overlay_targ_pkt_t injects a fully formed packet into the
+ * virtual network. It may be identified either by its datalink id
+ * or by the request id. If both are specified, the datalink id
+ * will be used. Note that an injection is not considered a reply;
+ * if this corresponds to a request, then that individual packet
+ * must still be dropped.
+ *
+ *
+ * OVERLAY_TARG_PKT - overlay_targ_pkt_t
+ *
+ * This ioctl can be used to copy data from a given request into a
+ * user buffer. This can be used in combination with
+ * OVERLAY_TARG_INJECT to implement services such as proxy ARP.
+ *
+ *
+ * OVERLAY_TARG_RESEND - overlay_targ_pkt_t
+ *
+ * This ioctl is similar to OVERLAY_TARG_INJECT, except that instead
+ * of receiving the packet on the local mac handle, it queues it for
+ * retransmission. This is useful if you have a packet that
+ * was originally destined for some broadcast or multicast address
+ * that you now want to send to a unicast address.
+ */
+#define OVERLAY_TARG_LOOKUP (OVERLAY_TARG_IOCTL | 0x10)
+#define OVERLAY_TARG_RESPOND (OVERLAY_TARG_IOCTL | 0x11)
+#define OVERLAY_TARG_DROP (OVERLAY_TARG_IOCTL | 0x12)
+#define OVERLAY_TARG_INJECT (OVERLAY_TARG_IOCTL | 0x13)
+#define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14)
+#define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15)
+
+typedef struct overlay_targ_lookup {
+ uint64_t otl_dlid;
+ uint64_t otl_reqid;
+ uint64_t otl_varpdid;
+ uint64_t otl_vnetid;
+ uint64_t otl_hdrsize;
+ uint64_t otl_pktsize;
+ uint8_t otl_srcaddr[ETHERADDRL];
+ uint8_t otl_dstaddr[ETHERADDRL];
+ uint32_t otl_dsttype;
+ uint32_t otl_sap;
+ int32_t otl_vlan;
+} overlay_targ_lookup_t;
+
+typedef struct overlay_targ_resp {
+ uint64_t otr_reqid;
+ overlay_target_point_t otr_answer;
+} overlay_targ_resp_t;
+
+typedef struct overlay_targ_pkt {
+ uint64_t otp_linkid;
+ uint64_t otp_reqid;
+ uint64_t otp_size;
+ void *otp_buf;
+} overlay_targ_pkt_t;
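+
+/*
+ * A userland lookup service might drive this interface with a loop along
+ * the following lines (a sketch; error handling and the answer computation
+ * are elided, and "fd" is assumed to be an open descriptor on the overlay
+ * device):
+ *
+ *	overlay_targ_lookup_t otl;
+ *	overlay_targ_resp_t otr;
+ *
+ *	for (;;) {
+ *		if (ioctl(fd, OVERLAY_TARG_LOOKUP, &otl) != 0)
+ *			continue;	(e.g. the ~1s wait expired)
+ *		otr.otr_reqid = otl.otl_reqid;
+ *		(fill in otr.otr_answer based on otl.otl_dstaddr)
+ *		(void) ioctl(fd, OVERLAY_TARG_RESPOND, &otr);
+ *	}
+ */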
+
+#ifdef _KERNEL
+
+typedef struct overlay_targ_pkt32 {
+ uint64_t otp_linkid;
+ uint64_t otp_reqid;
+ uint64_t otp_size;
+ caddr32_t otp_buf;
+} overlay_targ_pkt32_t;
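+
+/*
+ * A sketch of how an ioctl handler might normalise the ILP32 variant into
+ * the native form (illustrative; error paths abbreviated):
+ *
+ *	overlay_targ_pkt_t otp;
+ *
+ *	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
+ *		overlay_targ_pkt32_t otp32;
+ *
+ *		if (copyin((void *)arg, &otp32, sizeof (otp32)) != 0)
+ *			return (EFAULT);
+ *		otp.otp_linkid = otp32.otp_linkid;
+ *		otp.otp_reqid = otp32.otp_reqid;
+ *		otp.otp_size = otp32.otp_size;
+ *		otp.otp_buf = (void *)(uintptr_t)otp32.otp_buf;
+ *	} else if (copyin((void *)arg, &otp, sizeof (otp)) != 0) {
+ *		return (EFAULT);
+ *	}
+ */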
+
+#endif /* _KERNEL */
+
+/*
+ * This provides a way to get a list of active overlay devices independently
+ * of dlmgmtd. At the end of the day the kernel always knows what will exist,
+ * and this allows varpd, which is built on libdladm, to avoid calling back
+ * into dlmgmtd via libdladm and creating an unfortunate dependency cycle.
+ */
+
+#define OVERLAY_TARG_LIST (OVERLAY_TARG_IOCTL | 0x20)
+
+typedef struct overlay_targ_list {
+ uint32_t otl_nents;
+ uint32_t otl_ents[];
+} overlay_targ_list_t;
+
+/*
+ * The following family of ioctls all manipulate the target cache of a given
+ * device.
+ *
+ * OVERLAY_TARG_CACHE_GET - overlay_targ_cache_t
+ *
+ * The overlay_targ_cache_t should have its link identifier and
+ * the desired mac address filled in. On return, the otc_dest
+ * member will be filled in if the entry exists in the table.
+ *
+ *
+ * OVERLAY_TARG_CACHE_SET - overlay_targ_cache_t
+ *
+ * The cache table entry of the mac address referred to by otc_mac
+ * and otc_linkid will be filled in with the details provided in
+ * the otc_dest member.
+ *
+ * OVERLAY_TARG_CACHE_REMOVE - overlay_targ_cache_t
+ *
+ * Removes the cache entry identified by otc_mac from the table.
+ * Note that this does not stop any in-flight lookups or deal with
+ * any data that is awaiting a lookup.
+ *
+ *
+ * OVERLAY_TARG_CACHE_FLUSH - overlay_targ_cache_t
+ *
+ * Similar to OVERLAY_TARG_CACHE_REMOVE, but functions on the
+ * entire table identified by otc_linkid. All other parameters are
+ * ignored.
+ *
+ *
+ * OVERLAY_TARG_CACHE_ITER - overlay_targ_cache_iter_t
+ *
+ * Iterates over the contents of a target cache identified by
+ * otci_linkid. Iteration is guaranteed to be exactly once for
+ * items which are in the hashtable at the beginning and end of
+ * iteration. For items which are added or removed after iteration
+ * has begun, only at-most-once semantics are guaranteed. Consumers
+ * should ensure that otci_marker is zeroed before starting
+ * iteration and should preserve its contents across calls.
+ *
+ * Before calling in, otci_count should be set to the number of
+ * entries that space has been allocated for in otci_ents. The
+ * value will be updated to indicate the total number written out.
+ */
+
+#define OVERLAY_TARG_CACHE_GET (OVERLAY_TARG_IOCTL | 0x30)
+#define OVERLAY_TARG_CACHE_SET (OVERLAY_TARG_IOCTL | 0x31)
+#define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32)
+#define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33)
+#define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34)
+
+/*
+ * This is a pretty arbitrary number that we're constraining ourselves to
+ * for iteration. Basically the goal is to make sure that we can't have a user
+ * ask us to allocate too much memory on their behalf at any time. A more
+ * dynamic form may be necessary some day.
+ */
+#define OVERLAY_TARGET_ITER_MAX 500
+
+#define OVERLAY_TARGET_CACHE_DROP 0x01
+
+typedef struct overlay_targ_cache_entry {
+ uint8_t otce_mac[ETHERADDRL];
+ uint16_t otce_flags;
+ overlay_target_point_t otce_dest;
+} overlay_targ_cache_entry_t;
+
+typedef struct overlay_targ_cache {
+ datalink_id_t otc_linkid;
+ overlay_targ_cache_entry_t otc_entry;
+} overlay_targ_cache_t;
+
+typedef struct overlay_targ_cache_iter {
+ datalink_id_t otci_linkid;
+ uint32_t otci_pad;
+ uint64_t otci_marker;
+ uint16_t otci_count;
+ uint8_t otci_pad2[3];
+ overlay_targ_cache_entry_t otci_ents[];
+} overlay_targ_cache_iter_t;
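+
+/*
+ * A cache iteration sketch (illustrative; "fd" and "link" are assumptions,
+ * and the loop treats an otci_count of zero as the end of the table):
+ *
+ *	char buf[sizeof (overlay_targ_cache_iter_t) +
+ *	    32 * sizeof (overlay_targ_cache_entry_t)];
+ *	overlay_targ_cache_iter_t *iter = (overlay_targ_cache_iter_t *)buf;
+ *
+ *	bzero(buf, sizeof (buf));	(zeroes otci_marker as required)
+ *	iter->otci_linkid = link;
+ *	do {
+ *		iter->otci_count = 32;
+ *		if (ioctl(fd, OVERLAY_TARG_CACHE_ITER, iter) != 0)
+ *			break;
+ *		(process iter->otci_ents[0 .. iter->otci_count - 1])
+ *	} while (iter->otci_count > 0);
+ */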
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _OVERLAY_TARGET_H */
diff --git a/usr/src/uts/common/sys/param.h b/usr/src/uts/common/sys/param.h
index 282d84b912..66bd91f76f 100644
--- a/usr/src/uts/common/sys/param.h
+++ b/usr/src/uts/common/sys/param.h
@@ -116,7 +116,7 @@ extern "C" {
#define DEFAULT_MAXPID 999999
#define DEFAULT_JUMPPID 100000
#else
-#define DEFAULT_MAXPID 30000
+#define DEFAULT_MAXPID 99999
#define DEFAULT_JUMPPID 0
#endif
diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h
index 1269aeca10..587a51f0aa 100644
--- a/usr/src/uts/common/sys/pattr.h
+++ b/usr/src/uts/common/sys/pattr.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_PATTR_H
@@ -106,6 +107,25 @@ typedef struct pattr_hcksum_s {
#define HW_LSO_FLAGS HW_LSO /* All LSO flags, currently only one */
/*
+ * The packet originates from a MAC on the same machine as the
+ * receiving MAC. There are two ways this can happen.
+ *
+ * 1. MAC loopback: When a packet is destined for a MAC client on the
+ * same MAC as the sender. This datapath is taken in
+ * mac_tx_send().
+ *
+ * 2. Bridge Fwd: When a packet is destined for a MAC client on the
+ * same bridge as the sender. This datapath is taken in
+ * bridge_forward().
+ *
+ * Presented with this flag, a receiver can then decide whether or not
+ * it needs to emulate some or all of the HW offloads that the NIC
+ * would have performed otherwise -- or whether it should accept the
+ * packet as-is.
+ */
+#define HW_LOCAL_MAC 0x100
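+
+/*
+ * A receiver might test for the flag along these lines (a sketch; the
+ * emulation step is hypothetical, and reading the attribute flags via
+ * DB_CKSUMFLAGS() is one option among several):
+ *
+ *	if ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0) {
+ *		(emulate any offloads the hardware would have performed)
+ *	}
+ */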
+
+/*
* Structure used for zerocopy attribute.
*/
typedef struct pattr_zcopy_s {
diff --git a/usr/src/uts/common/sys/pci.h b/usr/src/uts/common/sys/pci.h
index 66ce71bcc2..d62d19c3a5 100644
--- a/usr/src/uts/common/sys/pci.h
+++ b/usr/src/uts/common/sys/pci.h
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019, Joyent, Inc.
*/
#ifndef _SYS_PCI_H
@@ -168,6 +168,7 @@ extern "C" {
/*
* PCI status register bits
*/
+#define PCI_STAT_READY 0x1 /* Immediate Readiness */
#define PCI_STAT_INTR 0x8 /* Interrupt state */
#define PCI_STAT_CAP 0x10 /* Implements Capabilities */
#define PCI_STAT_66MHZ 0x20 /* 66 MHz capable */
@@ -928,6 +929,8 @@ typedef struct pcix_attr {
#define PCI_MSI_MME_SHIFT 0x4 /* Shift for MME bits */
#define PCI_MSI_64BIT_MASK 0x0080 /* 64bit support mask in MSI ctrl reg */
#define PCI_MSI_PVM_MASK 0x0100 /* PVM support mask in MSI ctrl reg */
+#define PCI_MSI_EMD_MASK 0x0200 /* EMD Capable Mask */
+#define PCI_MSI_EMD_ENABLE 0x0400 /* EMD Enable bit */
/*
* PCI Extended Message Signalled Interrupts (MSI-X) capability entry offsets
diff --git a/usr/src/uts/common/sys/pci_cap.h b/usr/src/uts/common/sys/pci_cap.h
index 730e10d77b..9804913241 100644
--- a/usr/src/uts/common/sys/pci_cap.h
+++ b/usr/src/uts/common/sys/pci_cap.h
@@ -82,12 +82,12 @@ typedef enum {
#define PCI_CAP_GET32(h, i, b, o) ((uint32_t) \
pci_cap_get(h, PCI_CAP_CFGSZ_32, i, b, o))
-#define PCI_CAP_PUT8(h, i, b, o, d) ((uint8_t) \
- pci_cap_put(h, PCI_CAP_CFGSZ_8, i, b, o, d))
-#define PCI_CAP_PUT16(h, i, b, o, d) ((uint16_t) \
- pci_cap_put(h, PCI_CAP_CFGSZ_16, i, b, o, d))
-#define PCI_CAP_PUT32(h, i, b, o, d) ((uint32_t) \
- pci_cap_put(h, PCI_CAP_CFGSZ_32, i, b, o, d))
+#define PCI_CAP_PUT8(h, i, b, o, d) \
+ pci_cap_put(h, PCI_CAP_CFGSZ_8, i, b, o, d)
+#define PCI_CAP_PUT16(h, i, b, o, d) \
+ pci_cap_put(h, PCI_CAP_CFGSZ_16, i, b, o, d)
+#define PCI_CAP_PUT32(h, i, b, o, d) \
+ pci_cap_put(h, PCI_CAP_CFGSZ_32, i, b, o, d)
#define PCI_XCAP_GET8(h, i, b, o) ((uint8_t) \
pci_cap_get(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o))
@@ -96,12 +96,12 @@ typedef enum {
#define PCI_XCAP_GET32(h, i, b, o) ((uint32_t) \
pci_cap_get(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o))
-#define PCI_XCAP_PUT8(h, i, b, o, d) ((uint8_t) \
- pci_cap_put(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o, d))
-#define PCI_XCAP_PUT16(h, i, b, o, d) ((uint16_t) \
- pci_cap_put(h, PCI_CAP_CFGSZ_16, PCI_CAP_XCFG_SPC(i), b, o, d))
-#define PCI_XCAP_PUT32(h, i, b, o, d) ((uint32_t) \
- pci_cap_put(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o, d))
+#define PCI_XCAP_PUT8(h, i, b, o, d) \
+ pci_cap_put(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o, d)
+#define PCI_XCAP_PUT16(h, i, b, o, d) \
+ pci_cap_put(h, PCI_CAP_CFGSZ_16, PCI_CAP_XCFG_SPC(i), b, o, d)
+#define PCI_XCAP_PUT32(h, i, b, o, d) \
+ pci_cap_put(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o, d)
extern int pci_cap_probe(ddi_acc_handle_t h, uint16_t index,
diff --git a/usr/src/uts/common/sys/pcie.h b/usr/src/uts/common/sys/pcie.h
index 05b70a56fa..a26729c523 100644
--- a/usr/src/uts/common/sys/pcie.h
+++ b/usr/src/uts/common/sys/pcie.h
@@ -22,6 +22,9 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright 2019, Joyent, Inc.
+ */
#ifndef _SYS_PCIE_H
#define _SYS_PCIE_H
@@ -49,6 +52,7 @@ extern "C" {
#define PCIE_SLOTCTL 0x18 /* Slot Control */
#define PCIE_SLOTSTS 0x1A /* Slot Status */
#define PCIE_ROOTCTL 0x1C /* Root Control */
+#define PCIE_ROOTCAP 0x1E /* Root Capabilities */
#define PCIE_ROOTSTS 0x20 /* Root Status */
#define PCIE_DEVCAP2 0x24 /* Device Capability 2 */
#define PCIE_DEVCTL2 0x28 /* Device Control 2 */
@@ -125,6 +129,9 @@ extern "C" {
#define PCIE_DEVCAP_EP_L1_LAT_MAX 0x1C0 /* > 64 us */
#define PCIE_DEVCAP_EP_L1_LAT_MASK 0x700 /* EP L1 Acceptable Latency */
+/*
+ * As of PCIe 2.x these three bits are now undefined.
+ */
#define PCIE_DEVCAP_ATTN_BUTTON 0x1000 /* Attention Button Present */
#define PCIE_DEVCAP_ATTN_INDICATOR 0x2000 /* Attn Indicator Present */
#define PCIE_DEVCAP_PWR_INDICATOR 0x4000 /* Power Indicator Present */
@@ -140,6 +147,8 @@ extern "C" {
#define PCIE_DEVCAP_PLMT_SCL_1_BY_1000 0xC000000 /* 0.001x Scale */
#define PCIE_DEVCAP_PLMT_SCL_MASK 0xC000000 /* Power Limit Scale */
+#define PCIE_DEVCAP_FLR 0x10000000 /* Function Level Reset */
+
/*
* Device Control Register (2 bytes)
*/
@@ -174,6 +183,9 @@ extern "C" {
#define PCIE_DEVCTL_MAX_READ_REQ_MASK 0x7000 /* Max_Read_Request_Size */
#define PCIE_DEVCTL_MAX_READ_REQ_SHIFT 0xC
+#define PCIE_DEVCTL_BRIDGE_RETRY 0x8000 /* Bridge can return CRS */
+#define PCIE_DEVCTL_INITIATE_FLR 0x8000 /* Start Function Level Reset */
+
/*
* Device Status Register (2 bytes)
*/
@@ -183,11 +195,20 @@ extern "C" {
#define PCIE_DEVSTS_UR_DETECTED 0x8 /* Unsupported Req Detected */
#define PCIE_DEVSTS_AUX_POWER 0x10 /* AUX Power Detected */
#define PCIE_DEVSTS_TRANS_PENDING 0x20 /* Transactions Pending */
+#define PCIE_DEVSTS_EPR_DETECTED 0x40 /* Emergency Power Reduction */
/*
* Link Capability Register (4 bytes)
*/
-#define PCIE_LINKCAP_MAX_SPEED_2_5 0x1 /* 2.5 Gb/s Speed */
+#define PCIE_LINKCAP_MAX_SPEED_2_5 0x1 /* 2.5 GT/s Speed */
+/*
+ * In version 2 of PCI Express, this value indicated that both 5.0 GT/s and
+ * 2.5 GT/s speeds were supported. Its use as the maximum link speed was
+ * added with PCIe v3.
+ */
+#define PCIE_LINKCAP_MAX_SPEED_5 0x2 /* 5.0 GT/s Speed */
+#define PCIE_LINKCAP_MAX_SPEED_8 0x3 /* 8.0 GT/s Speed */
+#define PCIE_LINKCAP_MAX_SPEED_16 0x4 /* 16.0 GT/s Speed */
#define PCIE_LINKCAP_MAX_SPEED_MASK 0xF /* Maximum Link Speed */
#define PCIE_LINKCAP_MAX_WIDTH_X1 0x010
#define PCIE_LINKCAP_MAX_WIDTH_X2 0x020
@@ -199,6 +220,7 @@ extern "C" {
#define PCIE_LINKCAP_MAX_WIDTH_MASK 0x3f0 /* Maximum Link Width */
#define PCIE_LINKCAP_ASPM_SUP_L0S 0x400 /* L0s Entry Supported */
+#define PCIE_LINKCAP_ASPM_SUP_L1 0x800 /* L1 Entry Supported */
#define PCIE_LINKCAP_ASPM_SUP_L0S_L1 0xC00 /* L0s and L1 Supported */
#define PCIE_LINKCAP_ASPM_SUP_MASK 0xC00 /* ASPM Support */
@@ -222,9 +244,12 @@ extern "C" {
#define PCIE_LINKCAP_L1_EXIT_LAT_MAX 0x38000 /* > 64 us */
#define PCIE_LINKCAP_L1_EXIT_LAT_MASK 0x38000 /* L1 Exit Latency */
-/* PCIe v1.1 spec based */
+#define PCIE_LINKCAP_CLOCK_POWER_MGMT 0x40000 /* Clock Power Management */
+#define PCIE_LINKCAP_SDER_CAP 0x80000 /* Surprise Down Err report */
#define PCIE_LINKCAP_DLL_ACTIVE_REP_CAPABLE 0x100000 /* DLL Active */
/* Capable bit */
+#define PCIE_LINKCAP_LINK_BW_NOTIFY_CAP 0x200000 /* Link Bandwidth Notify Cap */
+#define PCIE_LINKCAP_ASPM_OPTIONAL 0x400000 /* ASPM Opt. Comp. */
#define PCIE_LINKCAP_PORT_NUMBER 0xFF000000 /* Port Number */
#define PCIE_LINKCAP_PORT_NUMBER_SHIFT 24 /* Port Number Shift */
@@ -247,11 +272,23 @@ extern "C" {
#define PCIE_LINKCTL_RETRAIN_LINK 0x20 /* Retrain Link */
#define PCIE_LINKCTL_COMMON_CLK_CFG 0x40 /* Common Clock Configuration */
#define PCIE_LINKCTL_EXT_SYNCH 0x80 /* Extended Synch */
+#define PCIE_LINKCTL_CLOCK_POWER_MGMT 0x100 /* Enable Clock Power Mgmt. */
+#define PCIE_LINKCTL_HW_WIDTH_DISABLE 0x200 /* hw auto width disable */
+#define PCIE_LINKCTL_LINK_BW_INTR_EN 0x400 /* Link bw mgmt intr */
+#define PCIE_LINKCTL_LINK_AUTO_BW_INTR_EN 0x800 /* Auto bw intr */
+
+#define PCIE_LINKCTL_DRS_SIG_CTRL_NO_REP 0x00
+#define PCIE_LINKCTL_DRS_SIG_CTRL_IE 0x4000
+#define PCIE_LINKCTL_DRS_SIG_CTRL_DRS_FRS 0x8000
+#define PCIE_LINKCTL_DRS_SIG_CTRL_MASK 0xC000 /* DRS Signaling Control */
/*
* Link Status Register (2 bytes)
*/
-#define PCIE_LINKSTS_SPEED_2_5 0x1 /* Link Speed */
+#define PCIE_LINKSTS_SPEED_2_5 0x1 /* 2.5 GT/s Link Speed */
+#define PCIE_LINKSTS_SPEED_5 0x2 /* 5.0 GT/s Link Speed */
+#define PCIE_LINKSTS_SPEED_8 0x3 /* 8.0 GT/s Link Speed */
+#define PCIE_LINKSTS_SPEED_16 0x4 /* 16.0 GT/s Link Speed */
#define PCIE_LINKSTS_SPEED_MASK 0xF /* Link Speed */
#define PCIE_LINKSTS_NEG_WIDTH_X1 0x010
@@ -263,12 +300,13 @@ extern "C" {
#define PCIE_LINKSTS_NEG_WIDTH_X32 0x200
#define PCIE_LINKSTS_NEG_WIDTH_MASK 0x3F0 /* Negotiated Link Width */
+/* This bit is undefined as of PCIe 2.x */
#define PCIE_LINKSTS_TRAINING_ERROR 0x400 /* Training Error */
#define PCIE_LINKSTS_LINK_TRAINING 0x800 /* Link Training */
#define PCIE_LINKSTS_SLOT_CLK_CFG 0x1000 /* Slot Clock Configuration */
-
-/* PCIe v1.1 spec based */
#define PCIE_LINKSTS_DLL_LINK_ACTIVE 0x2000 /* DLL Link Active */
+#define PCIE_LINKSTS_LINK_BW_MGMT 0x4000 /* Link bw mgmt status */
+#define PCIE_LINKSTS_AUTO_BW 0x8000 /* Link auto BW status */
/*
* Slot Capability Register (4 bytes)
@@ -311,6 +349,7 @@ extern "C" {
#define PCIE_SLOTCTL_PWR_CONTROL 0x0400 /* Power controller Control */
#define PCIE_SLOTCTL_EMI_LOCK_CONTROL 0x0800 /* EMI Lock control */
#define PCIE_SLOTCTL_DLL_STATE_EN 0x1000 /* DLL State Changed En */
+#define PCIE_SLOTCTL_AUTO_SLOT_PL_DIS 0x2000 /* Auto Slot Power Limit Dis */
#define PCIE_SLOTCTL_ATTN_INDICATOR_MASK 0x00C0 /* Attn Indicator mask */
#define PCIE_SLOTCTL_PWR_INDICATOR_MASK 0x0300 /* Power Indicator mask */
#define PCIE_SLOTCTL_INTR_MASK 0x103f /* Supported intr mask */
@@ -354,6 +393,12 @@ extern "C" {
#define PCIE_ROOTCTL_SYS_ERR_ON_NFE_EN 0x2 /* Sys Err on NF Err Enable */
#define PCIE_ROOTCTL_SYS_ERR_ON_FE_EN 0x4 /* Sys Err on Fatal Err En */
#define PCIE_ROOTCTL_PME_INTERRUPT_EN 0x8 /* PME Interrupt Enable */
+#define PCIE_ROOTCTL_CRS_SW_VIS_EN 0x10 /* CRS SW Visibility EN */
+
+/*
+ * Root Capabilities register (2 bytes)
+ */
+#define PCIE_ROOTCAP_CRS_SW_VIS 0x01 /* CRS SW Visible */
/*
* Root Status Register (4 bytes)
@@ -378,14 +423,25 @@ extern "C" {
#define PCIE_DEVCAP2_LTR_MECH 0x800
#define PCIE_DEVCAP2_TPH_COMP_SHIFT 12
#define PCIE_DEVCAP2_TPH_COMP_MASK 0x3
+#define PCIE_DEVCAP2_LNSYS_CLS_SHIFT 14
+#define PCIE_DEVCAP2_LNSYS_CLS_MASK 0x3
+#define PCIE_DEVCAP2_10B_TAG_COMP_SUP 0x10000
+#define PCIE_DEVCAP2_10B_TAG_REQ_SUP 0x20000
+#define PCIE_DEVCAP2_OBFF_SHIFT 18
+#define PCIE_DEVCAP2_OBFF_MASK 0x3
#define PCIE_DEVCAP2_EXT_FMT_FIELD 0x100000
#define PCIE_DEVCAP2_END_END_TLP_PREFIX 0x200000
#define PCIE_DEVCAP2_MAX_END_END_SHIFT 22
#define PCIE_DEVCAP2_MAX_END_END_MASK 0x3
+#define PCIE_DEVCAP2_EPR_SUP_SHIFT 24
+#define PCIE_DEVCAP2_EPR_SUP_MASK 0x3
+#define PCIE_DEVCAP2_EPR_INIT_REQ 0x4000000
+#define PCIE_DEVCAP2_FRS_SUP 0x80000000
/*
* Device Control 2 Register (2 bytes)
*/
+#define PCIE_DEVCTL2_COM_TO_RANGE_MASK 0xf
#define PCIE_DEVCTL2_COM_TO_RANGE_0 0x0
#define PCIE_DEVCTL2_COM_TO_RANGE_1 0x1
#define PCIE_DEVCTL2_COM_TO_RANGE_2 0x2
@@ -402,11 +458,65 @@ extern "C" {
#define PCIE_DEVCTL2_IDO_REQ_EN 0x100
#define PCIE_DEVCTL2_IDO_COMPL_EN 0x200
#define PCIE_DEVCTL2_LTR_MECH_EN 0x400
+#define PCIE_DEVCTL2_EPR_REQ 0x800
+#define PCIE_DEVCTL2_10BTAG_REQ_EN 0x1000
+#define PCIE_DEVCTL2_OBFF_MASK 0x6000
+#define PCIE_DEVCTL2_OBFF_DISABLE 0x0000
+#define PCIE_DEVCTL2_OBFF_EN_VARA 0x2000
+#define PCIE_DEVCTL2_OBFF_EN_VARB 0x4000
+#define PCIE_DEVCTL2_OBFF_EN_WAKE 0x6000
#define PCIE_DEVCTL2_END_END_TLP_PREFIX 0x8000
-
-
+/*
+ * Link Capability 2 Register (4 bytes)
+ */
+#define PCIE_LINKCAP2_SPEED_2_5 0x02
+#define PCIE_LINKCAP2_SPEED_5 0x04
+#define PCIE_LINKCAP2_SPEED_8 0x08
+#define PCIE_LINKCAP2_SPEED_16 0x10
+#define PCIE_LINKCAP2_SPEED_MASK 0xfe
+#define PCIE_LINKCAP2_CROSSLINK 0x100
+#define PCIE_LINKCAP2_LSKP_OSGSS_MASK 0xfe00
+#define PCIE_LINKCAP2_LSKP_OSGSS_2_5 0x0200
+#define PCIE_LINKCAP2_LSKP_OSGSS_5 0x0400
+#define PCIE_LINKCAP2_LSKP_OSGSS_8 0x0800
+#define PCIE_LINKCAP2_LSKP_OSGSS_16 0x1000
+#define PCIE_LINKCAP2_LSKP_OSRSS_MASK 0x7f0000
+#define PCIE_LINKCAP2_LSKP_OSRSS_2_5 0x010000
+#define PCIE_LINKCAP2_LSKP_OSRSS_5 0x020000
+#define PCIE_LINKCAP2_LSKP_OSRSS_8 0x040000
+#define PCIE_LINKCAP2_LSKP_OSRSS_16 0x080000
+#define PCIE_LINKCAP2_RTPD_SUP 0x800000
+#define PCIE_LINKCAP2_TRTPD_SUP 0x01000000
+#define PCIE_LINKCAP2_DRS 0x80000000
+
+/*
+ * Link Control 2 Register (2 bytes)
+ */
+#define PCIE_LINKCTL2_TARGET_SPEED_MASK 0x000f
+#define PCIE_LINKCTL2_ENTER_COMPLIANCE 0x0010
+#define PCIE_LINKCTL2_HW_AUTO_SPEED_DIS 0x0020
+#define PCIE_LINKCTL2_SELECT_DEEMPH 0x0040
+#define PCIE_LINKCTL2_TX_MARGIN_MASK 0x0380
+#define PCIE_LINKCTL2_ENTER_MOD_COMP 0x0400
+#define PCIE_LINKCTL2_COMP_SOS 0x0800
+#define PCIE_LINKCTL2_COMP_DEEMPH_MASK 0xf000
+
+/*
+ * Link Status 2 Register (2 bytes)
+ */
+#define PCIE_LINKSTS2_CUR_DEEMPH 0x0001
+#define PCIE_LINKSTS2_EQ8GT_COMP 0x0002
+#define PCIE_LINKSTS2_EQ8GT_P1_SUC 0x0004
+#define PCIE_LINKSTS2_EQ8GT_P2_SUC 0x0008
+#define PCIE_LINKSTS2_EQ8GT_P3_SUC 0x0010
+#define PCIE_LINKSTS2_LINK_EQ_REQ 0x0020
+#define PCIE_LINKSTS2_RETIMER_PRES_DET 0x0040
+#define PCIE_LINKSTS2_2RETIMER_PRES_DET 0x0080
+#define PCIE_LINKSTS2_XLINK_RES 0x0300
+#define PCIE_LINKSTS2_DS_COMP_PRES_MASK 0x7000
+#define PCIE_LINKSTS2_DRS_MSG_RX 0x8000
/*
* PCI-Express Enhanced Capabilities Link Entry Bit Offsets
@@ -441,6 +551,28 @@ extern "C" {
#define PCIE_EXT_CAP_ID_ACS 0xD /* Access Control Services */
#define PCIE_EXT_CAP_ID_ARI 0xE /* Alternative Routing ID */
#define PCIE_EXT_CAP_ID_ATS 0xF /* Address Translation Svcs */
+#define PCIE_EXT_CAP_ID_SRIOV 0x10 /* Single Root I/O Virt. */
+#define PCIE_EXT_CAP_ID_MRIOV 0x11 /* Multi Root I/O Virt. */
+#define PCIE_EXT_CAP_ID_MULTICAST 0x12 /* Multicast Services */
+#define PCIE_EXT_CAP_ID_EA 0x14 /* Enhanced Allocation */
+#define PCIE_EXT_CAP_ID_RESIZE_BAR 0x15 /* Resizable BAR */
+#define PCIE_EXT_CAP_ID_DPA 0x16 /* Dynamic Power Allocation */
+#define PCIE_EXT_CAP_ID_TPH_REQ 0x17 /* TPH Requester */
+#define PCIE_EXT_CAP_ID_LTR 0x18 /* Latency Tolerance Report */
+#define PCIE_EXT_CAP_ID_PCIE2 0x19 /* PCI Express Capability 2 */
+#define PCIE_EXT_CAP_ID_PASID 0x1B /* PASID */
+#define PCIE_EXT_CAP_ID_LNR 0x1C /* LNR */
+#define PCIE_EXT_CAP_ID_DPC 0x1D /* DPC */
+#define PCIE_EXT_CAP_ID_L1PM 0x1E /* L1 PM Substates */
+#define PCIE_EXT_CAP_ID_PTM 0x1F /* Precision Time Management */
+#define PCIE_EXT_CAP_ID_FRS 0x21 /* Function Ready Stat. Queue */
+#define PCIE_EXT_CAP_ID_RTR 0x22 /* Readiness Time Reporting */
+#define PCIE_EXT_CAP_ID_DVS 0x23 /* Designated Vendor-Specific */
+#define PCIE_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */
+#define PCIE_EXT_CAP_ID_PL16GTE 0x26 /* Physical Layer 16.0 GT/s */
+#define PCIE_EXT_CAP_ID_LANE_MARGIN 0x27 /* Lane Margining */
+#define PCIE_EXT_CAP_ID_HIERARCHY_ID 0x28 /* Hierarchy ID */
+#define PCIE_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Mgmt */
/*
* PCI-Express Advanced Error Reporting Extended Capability Offsets
@@ -545,10 +677,10 @@ extern "C" {
* AER Secondary Uncorrectable Error Register
*/
#define PCIE_AER_SUCE_TA_ON_SC 0x1 /* Target Abort on Split Comp */
-#define PCIE_AER_SUCE_MA_ON_SC 0x2 /* Master Abort on Split Comp */
+#define PCIE_AER_SUCE_MA_ON_SC 0x2 /* Master Abort on Split Comp */
#define PCIE_AER_SUCE_RCVD_TA 0x4 /* Received Target Abort */
-#define PCIE_AER_SUCE_RCVD_MA 0x8 /* Received Master Abort */
-#define PCIE_AER_SUCE_USC_ERR 0x20 /* Unexpected Split Comp Err */
+#define PCIE_AER_SUCE_RCVD_MA 0x8 /* Received Master Abort */
+#define PCIE_AER_SUCE_USC_ERR 0x20 /* Unexpected Split Comp Err */
#define PCIE_AER_SUCE_USC_MSG_DATA_ERR 0x40 /* USC Message Data Error */
#define PCIE_AER_SUCE_UC_DATA_ERR 0x80 /* Uncorrectable Data Error */
#define PCIE_AER_SUCE_UC_ATTR_ERR 0x100 /* UC Attribute Err */
diff --git a/usr/src/uts/common/sys/pcie_impl.h b/usr/src/uts/common/sys/pcie_impl.h
index 1f08fad51d..faebc9d020 100644
--- a/usr/src/uts/common/sys/pcie_impl.h
+++ b/usr/src/uts/common/sys/pcie_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019, Joyent, Inc.
*/
/*
@@ -285,6 +286,29 @@ typedef struct pf_root_fault {
typedef struct pf_data pf_data_t;
+typedef enum pcie_link_width {
+ PCIE_LINK_WIDTH_UNKNOWN,
+ PCIE_LINK_WIDTH_X1,
+ PCIE_LINK_WIDTH_X2,
+ PCIE_LINK_WIDTH_X4,
+ PCIE_LINK_WIDTH_X8,
+ PCIE_LINK_WIDTH_X12,
+ PCIE_LINK_WIDTH_X16,
+ PCIE_LINK_WIDTH_X32
+} pcie_link_width_t;
+
+/*
+ * Note, this member should always be treated as a bit field, as a device may
+ * support multiple speeds.
+ */
+typedef enum pcie_link_speed {
+ PCIE_LINK_SPEED_UNKNOWN = 0x00,
+ PCIE_LINK_SPEED_2_5 = 0x01,
+ PCIE_LINK_SPEED_5 = 0x02,
+ PCIE_LINK_SPEED_8 = 0x04,
+ PCIE_LINK_SPEED_16 = 0x08
+} pcie_link_speed_t;
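+
+/*
+ * A sketch of mapping the negotiated speed in the Link Status register onto
+ * this enum (illustrative only):
+ *
+ *	static pcie_link_speed_t
+ *	pcie_linksts_to_speed(uint16_t linksts)
+ *	{
+ *		switch (linksts & PCIE_LINKSTS_SPEED_MASK) {
+ *		case PCIE_LINKSTS_SPEED_2_5:
+ *			return (PCIE_LINK_SPEED_2_5);
+ *		case PCIE_LINKSTS_SPEED_5:
+ *			return (PCIE_LINK_SPEED_5);
+ *		case PCIE_LINKSTS_SPEED_8:
+ *			return (PCIE_LINK_SPEED_8);
+ *		case PCIE_LINKSTS_SPEED_16:
+ *			return (PCIE_LINK_SPEED_16);
+ *		default:
+ *			return (PCIE_LINK_SPEED_UNKNOWN);
+ *		}
+ *	}
+ */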
+
/*
* For hot plugged device, these data are init'ed during probe
* For non-hotplugged device, these data are init'ed in pci_autoconfig (on x86),
@@ -339,6 +363,15 @@ typedef struct pcie_bus {
/* workaround for PCI/PCI-X devs behind PCIe2PCI Bridge */
pcie_req_id_t bus_pcie2pci_secbus;
+
+ /*
+ * Link speed specific fields.
+ */
+ pcie_link_width_t bus_max_width;
+ pcie_link_width_t bus_cur_width;
+ pcie_link_speed_t bus_sup_speed;
+ pcie_link_speed_t bus_max_speed;
+ pcie_link_speed_t bus_cur_speed;
} pcie_bus_t;
/*
@@ -365,6 +398,7 @@ struct pf_data {
boolean_t pe_lock;
boolean_t pe_valid;
uint32_t pe_severity_flags; /* Severity of error */
+ uint32_t pe_severity_mask;
uint32_t pe_orig_severity_flags; /* Original severity */
pf_affected_dev_t *pe_affected_dev;
pcie_bus_t *pe_bus_p;
@@ -393,6 +427,7 @@ typedef struct pf_impl {
/* bus_fm_flags field */
#define PF_FM_READY (1 << 0) /* bus_fm_lock initialized */
#define PF_FM_IS_NH (1 << 1) /* known as non-hardened */
+#define PF_FM_IS_PASSTHRU (1 << 2) /* device is controlled by VM */
/*
* PCIe fabric handle lookup address flags. Used to define what type of
@@ -421,11 +456,10 @@ typedef struct pf_impl {
#define PF_ERR_MATCHED_PARENT (1 << 5) /* Error Handled By Parent */
#define PF_ERR_PANIC (1 << 6) /* Error should panic system */
#define PF_ERR_PANIC_DEADLOCK (1 << 7) /* deadlock detected */
-#define PF_ERR_PANIC_BAD_RESPONSE (1 << 8) /* Device no response */
+#define PF_ERR_BAD_RESPONSE (1 << 8) /* Device bad/no response */
#define PF_ERR_MATCH_DOM (1 << 9) /* Error Handled By IO domain */
-#define PF_ERR_FATAL_FLAGS \
- (PF_ERR_PANIC | PF_ERR_PANIC_DEADLOCK | PF_ERR_PANIC_BAD_RESPONSE)
+#define PF_ERR_FATAL_FLAGS (PF_ERR_PANIC | PF_ERR_PANIC_DEADLOCK)
#define PF_HDL_FOUND 1
#define PF_HDL_NOTFOUND 2
@@ -529,6 +563,7 @@ extern void pcie_enable_errors(dev_info_t *dip);
extern void pcie_disable_errors(dev_info_t *dip);
extern int pcie_enable_ce(dev_info_t *dip);
extern boolean_t pcie_bridge_is_link_disabled(dev_info_t *);
+extern boolean_t pcie_is_pci_device(dev_info_t *dip);
extern pcie_bus_t *pcie_init_bus(dev_info_t *dip, pcie_req_id_t bdf,
uint8_t flags);
@@ -587,6 +622,7 @@ extern void pf_eh_enter(pcie_bus_t *bus_p);
extern void pf_eh_exit(pcie_bus_t *bus_p);
extern int pf_scan_fabric(dev_info_t *rpdip, ddi_fm_error_t *derr,
pf_data_t *root_pfd_p);
+extern void pf_set_passthru(dev_info_t *, boolean_t);
extern void pf_init(dev_info_t *, ddi_iblock_cookie_t, ddi_attach_cmd_t);
extern void pf_fini(dev_info_t *, ddi_detach_cmd_t);
extern int pf_hdl_lookup(dev_info_t *, uint64_t, uint32_t, uint64_t,
diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h
index de15be4d60..816d6995cf 100644
--- a/usr/src/uts/common/sys/policy.h
+++ b/usr/src/uts/common/sys/policy.h
@@ -108,6 +108,7 @@ int secpolicy_ipc_owner(const cred_t *, const struct kipc_perm *);
int secpolicy_kmdb(const cred_t *);
int secpolicy_lock_memory(const cred_t *);
int secpolicy_meminfo(const cred_t *);
+int secpolicy_fs_import(const cred_t *);
int secpolicy_modctl(const cred_t *, int);
int secpolicy_net(const cred_t *, int, boolean_t);
int secpolicy_net_bindmlp(const cred_t *);
@@ -176,6 +177,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *,
const vattr_t *, cred_t *);
int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t);
int secpolicy_xvm_control(const cred_t *);
+int secpolicy_hyprlofs_control(const cred_t *);
int secpolicy_basic_exec(const cred_t *, vnode_t *);
int secpolicy_basic_fork(const cred_t *);
diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h
index 67b47f9a1e..3e0eb3b21f 100644
--- a/usr/src/uts/common/sys/poll_impl.h
+++ b/usr/src/uts/common/sys/poll_impl.h
@@ -25,7 +25,7 @@
*/
/*
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
*/
#ifndef _SYS_POLL_IMPL_H
@@ -140,6 +140,7 @@ struct pollstate {
pollstate_t *ps_contend_nextp; /* next in contender list */
pollstate_t **ps_contend_pnextp; /* pointer-to-previous-next */
int ps_flags; /* state flags */
+ short ps_implicit_ev; /* implicit poll event interest */
};
/* pollstate flags */
@@ -225,6 +226,7 @@ struct polldat {
int pd_nsets; /* num of xref sets, used by poll(2) */
xref_t *pd_ref; /* ptr to xref info, 1 for each set */
port_kevent_t *pd_portev; /* associated port event struct */
+ uf_entry_gen_t pd_gen; /* fd generation at cache time */
uint64_t pd_epolldata; /* epoll data, if any */
};
@@ -256,6 +258,7 @@ struct pollcache {
/* pc_flag */
#define PC_POLLWAKE 0x02 /* pollwakeup() occurred */
+#define PC_EPOLL 0x04 /* pollcache is epoll-enabled */
#if defined(_KERNEL)
/*
diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h
index 712bd7cb24..7d2209132d 100644
--- a/usr/src/uts/common/sys/proc.h
+++ b/usr/src/uts/common/sys/proc.h
@@ -315,6 +315,7 @@ typedef struct proc {
size_t p_swrss; /* resident set size before last swap */
struct aio *p_aio; /* pointer to async I/O struct */
struct itimer **p_itimer; /* interval timers */
+ uint_t p_itimer_sz; /* max allocated interval timers */
timeout_id_t p_alarmid; /* alarm's timeout id */
caddr_t p_usrstack; /* top of the process stack */
uint_t p_stkprot; /* stack memory protection */
@@ -358,6 +359,7 @@ typedef struct proc {
struct zone *p_zone; /* zone in which process lives */
struct vnode *p_execdir; /* directory that p_exec came from */
struct brand *p_brand; /* process's brand */
+
void *p_brand_data; /* per-process brand state */
psecflags_t p_secflags; /* per-process security flags */
@@ -374,7 +376,6 @@ typedef struct proc {
*/
struct user p_user; /* (see sys/user.h) */
} proc_t;
-
#define PROC_T /* headers relying on proc_t are OK */
#ifdef _KERNEL
@@ -640,6 +641,7 @@ extern int signal_is_blocked(kthread_t *, int);
extern int sigcheck(proc_t *, kthread_t *);
extern void sigdefault(proc_t *);
+extern struct pid *pid_find(pid_t pid);
extern void pid_setmin(void);
extern pid_t pid_allocate(proc_t *, pid_t, int);
extern int pid_rele(struct pid *);
@@ -655,6 +657,7 @@ extern int sprtrylock_proc(proc_t *);
extern void sprwaitlock_proc(proc_t *);
extern void sprlock_proc(proc_t *);
extern void sprunlock(proc_t *);
+extern void sprunprlock(proc_t *);
extern void pid_init(void);
extern proc_t *pid_entry(int);
extern int pid_slot(proc_t *);
@@ -729,6 +732,10 @@ extern kthread_t *thread_unpin(void);
extern void thread_init(void);
extern void thread_load(kthread_t *, void (*)(), caddr_t, size_t);
+extern void thread_splitstack(void (*)(void *), void *, size_t);
+extern void thread_splitstack_run(caddr_t, void (*)(void *), void *);
+extern void thread_splitstack_cleanup(void);
+
extern void tsd_create(uint_t *, void (*)(void *));
extern void tsd_destroy(uint_t *);
extern void *tsd_getcreate(uint_t *, void (*)(void *), void *(*)(void));
@@ -770,7 +777,7 @@ extern void pokelwps(proc_t *);
extern void continuelwps(proc_t *);
extern int exitlwps(int);
extern void lwp_ctmpl_copy(klwp_t *, klwp_t *);
-extern void lwp_ctmpl_clear(klwp_t *);
+extern void lwp_ctmpl_clear(klwp_t *, boolean_t);
extern klwp_t *forklwp(klwp_t *, proc_t *, id_t);
extern void lwp_load(klwp_t *, gregset_t, uintptr_t);
extern void lwp_setrval(klwp_t *, int, int);
diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h
index dfb54eaef5..99da92ab79 100644
--- a/usr/src/uts/common/sys/procfs.h
+++ b/usr/src/uts/common/sys/procfs.h
@@ -236,6 +236,7 @@ typedef struct pstatus {
#define PR_FAULTED 6
#define PR_SUSPENDED 7
#define PR_CHECKPOINT 8
+#define PR_BRAND 9
/*
* lwp ps(1) information file. /proc/<pid>/lwp/<lwpid>/lwpsinfo
diff --git a/usr/src/uts/common/sys/prsystm.h b/usr/src/uts/common/sys/prsystm.h
index 7adc920da2..75259dc421 100644
--- a/usr/src/uts/common/sys/prsystm.h
+++ b/usr/src/uts/common/sys/prsystm.h
@@ -28,7 +28,7 @@
/* All Rights Reserved */
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _SYS_PRSYSTM_H
@@ -86,7 +86,7 @@ extern void prgetcred(proc_t *, struct prcred *);
extern void prgetpriv(proc_t *, struct prpriv *);
extern size_t prgetprivsize(void);
extern void prgetsecflags(proc_t *, struct prsecflags *);
-extern int prnsegs(struct as *, int);
+extern uint_t prnsegs(struct as *, int);
extern void prexit(proc_t *);
extern void prfree(proc_t *);
extern void prlwpexit(kthread_t *);
diff --git a/usr/src/uts/common/sys/ptms.h b/usr/src/uts/common/sys/ptms.h
index 55987fe6d7..8b97fd7e3b 100644
--- a/usr/src/uts/common/sys/ptms.h
+++ b/usr/src/uts/common/sys/ptms.h
@@ -126,6 +126,12 @@ extern void ptms_logp(char *, uintptr_t);
#define DDBGP(a, b)
#endif
+typedef struct __ptmptsopencb_arg *ptmptsopencb_arg_t;
+typedef struct ptmptsopencb {
+ boolean_t (*ppocb_func)(ptmptsopencb_arg_t);
+ ptmptsopencb_arg_t ppocb_arg;
+} ptmptsopencb_t;
+
#endif /* _KERNEL */
typedef struct pt_own {
@@ -157,6 +163,19 @@ typedef struct pt_own {
#define ZONEPT (('P'<<8)|4) /* set zone of master/slave pair */
#define OWNERPT (('P'<<8)|5) /* set owner/group for slave device */
+#ifdef _KERNEL
+/*
+ * kernel ioctl commands
+ *
+ * PTMPTSOPENCB: Returns a callback function pointer and opaque argument.
+ * The return value of the callback function when it's invoked
+ * with the opaque argument passed to it will indicate if the
+ * pts slave device is currently open.
+ */
+#define PTMPTSOPENCB (('P'<<8)|6) /* check if the slave is open */
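+
+/*
+ * A consumer sketch (illustrative): having obtained a ptmptsopencb_t
+ * "ppocb" via the PTMPTSOPENCB ioctl, the slave's open state can be
+ * checked with:
+ *
+ *	if (ppocb.ppocb_func(ppocb.ppocb_arg)) {
+ *		(the pts slave device is currently open)
+ *	}
+ */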
+
+#endif /* _KERNEL */
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h b/usr/src/uts/common/sys/refhash.h
index 2069e6d3f1..b7427a454d 100644
--- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h
+++ b/usr/src/uts/common/sys/refhash.h
@@ -10,11 +10,11 @@
*/
/*
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
-#ifndef _SYS_SCSI_ADAPTERS_MPTHASH_H
-#define _SYS_SCSI_ADAPTERS_MPTHASH_H
+#ifndef _SYS_REFHASH_H
+#define _SYS_REFHASH_H
#include <sys/types.h>
#include <sys/list.h>
@@ -58,4 +58,4 @@ extern void *refhash_first(refhash_t *);
extern void *refhash_next(refhash_t *, void *);
extern boolean_t refhash_obj_valid(refhash_t *hp, const void *);
-#endif /* _SYS_SCSI_ADAPTERS_MPTHASH_H */
+#endif /* _SYS_REFHASH_H */
diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h
index 13166f378d..d65ca00f69 100644
--- a/usr/src/uts/common/sys/resource.h
+++ b/usr/src/uts/common/sys/resource.h
@@ -23,6 +23,7 @@
*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
diff --git a/usr/src/uts/common/sys/rt.h b/usr/src/uts/common/sys/rt.h
index d4233aecb5..2ed7320a09 100644
--- a/usr/src/uts/common/sys/rt.h
+++ b/usr/src/uts/common/sys/rt.h
@@ -22,6 +22,7 @@
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -75,6 +76,16 @@ typedef struct rtkparms {
int rt_tqsig; /* real-time time quantum signal */
uint_t rt_cflags; /* real-time control flags */
} rtkparms_t;
+
+#define RTGPPRIO0 100 /* Global priority for RT priority 0 */
+
+/*
+ * control flags (kparms->rt_cflags).
+ */
+#define RT_DOPRI 0x01 /* change priority */
+#define RT_DOTQ 0x02 /* change RT time quantum */
+#define RT_DOSIG 0x04 /* change RT time quantum signal */
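+
+/*
+ * e.g., to request both a priority and a time quantum change (a sketch;
+ * the field names other than rt_tqsig and rt_cflags are assumed from the
+ * full rtkparms_t definition above, and the values are arbitrary):
+ *
+ *	rtkparms_t rtk;
+ *
+ *	rtk.rt_pri = 10;
+ *	rtk.rt_tqntm = 100;
+ *	rtk.rt_cflags = RT_DOPRI | RT_DOTQ;
+ */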
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h
new file mode 100644
index 0000000000..afb7a94c58
--- /dev/null
+++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h
@@ -0,0 +1,147 @@
+/*-
+ * Copyright (c) 2012-2015 LSI Corp.
+ * Copyright (c) 2013-2016 Avago Technologies
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2000-2015 LSI Corporation.
+ * Copyright (c) 2013-2016 Avago Technologies
+ * All rights reserved.
+ *
+ *
+ * Name: mpi2_pci.h
+ * Title: MPI PCIe Attached Devices structures and definitions.
+ * Creation Date: October 9, 2012
+ *
+ * mpi2_pci.h Version: 02.00.02
+ *
+ * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25
+ * prefix are for use only on MPI v2.5 products, and must not be used
+ * with MPI v2.0 products. Unless otherwise noted, names beginning with
+ * MPI2 or Mpi2 are for use with both MPI v2.0 and MPI v2.5 products.
+ *
+ * Version History
+ * ---------------
+ *
+ * Date Version Description
+ * -------- -------- ------------------------------------------------------
+ * 03-16-15 02.00.00 Initial version.
+ * 02-17-16 02.00.01 Removed AHCI support.
+ * Removed SOP support.
+ * 07-01-16 02.00.02 Added MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP to
+ * NVME Encapsulated Request.
+ * --------------------------------------------------------------------------
+ */
+
+#ifndef MPI2_PCI_H
+#define MPI2_PCI_H
+
+
+/*
+ * Values for the PCIe DeviceInfo field used in PCIe Device Status Change Event
+ * data and PCIe Configuration pages.
+ */
+#define MPI26_PCIE_DEVINFO_DIRECT_ATTACH (0x00000010)
+
+#define MPI26_PCIE_DEVINFO_MASK_DEVICE_TYPE (0x0000000F)
+#define MPI26_PCIE_DEVINFO_NO_DEVICE (0x00000000)
+#define MPI26_PCIE_DEVINFO_PCI_SWITCH (0x00000001)
+#define MPI26_PCIE_DEVINFO_NVME (0x00000003)
+
+
+/****************************************************************************
+* NVMe Encapsulated message
+****************************************************************************/
+
+/* NVME Encapsulated Request Message */
+typedef struct _MPI26_NVME_ENCAPSULATED_REQUEST
+{
+ U16 DevHandle; /* 0x00 */
+ U8 ChainOffset; /* 0x02 */
+ U8 Function; /* 0x03 */
+ U16 EncapsulatedCommandLength; /* 0x04 */
+ U8 Reserved1; /* 0x06 */
+ U8 MsgFlags; /* 0x07 */
+ U8 VP_ID; /* 0x08 */
+ U8 VF_ID; /* 0x09 */
+ U16 Reserved2; /* 0x0A */
+ U32 Reserved3; /* 0x0C */
+ U64 ErrorResponseBaseAddress; /* 0x10 */
+ U16 ErrorResponseAllocationLength; /* 0x18 */
+ U16 Flags; /* 0x1A */
+ U32 DataLength; /* 0x1C */
+ U8 NVMe_Command[4]; /* 0x20 */ /* variable length */
+
+} MPI26_NVME_ENCAPSULATED_REQUEST, MPI2_POINTER PTR_MPI26_NVME_ENCAPSULATED_REQUEST,
+ Mpi26NVMeEncapsulatedRequest_t, MPI2_POINTER pMpi26NVMeEncapsulatedRequest_t;
+
+/* defines for the Flags field */
+#define MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP (0x0020)
+/* Submission Queue Type*/
+#define MPI26_NVME_FLAGS_SUBMISSIONQ_MASK (0x0010)
+#define MPI26_NVME_FLAGS_SUBMISSIONQ_IO (0x0000)
+#define MPI26_NVME_FLAGS_SUBMISSIONQ_ADMIN (0x0010)
+/* Error Response Address Space */
+#define MPI26_NVME_FLAGS_MASK_ERROR_RSP_ADDR (0x000C)
+#define MPI26_NVME_FLAGS_SYSTEM_RSP_ADDR (0x0000)
+#define MPI26_NVME_FLAGS_IOCPLB_RSP_ADDR (0x0008)
+#define MPI26_NVME_FLAGS_IOCPLBNTA_RSP_ADDR (0x000C)
+/* Data Direction*/
+#define MPI26_NVME_FLAGS_DATADIRECTION_MASK (0x0003)
+#define MPI26_NVME_FLAGS_NODATATRANSFER (0x0000)
+#define MPI26_NVME_FLAGS_WRITE (0x0001)
+#define MPI26_NVME_FLAGS_READ (0x0002)
+#define MPI26_NVME_FLAGS_BIDIRECTIONAL (0x0003)
+
+
+/* NVMe Encapsulated Reply Message */
+typedef struct _MPI26_NVME_ENCAPSULATED_ERROR_REPLY
+{
+ U16 DevHandle; /* 0x00 */
+ U8 MsgLength; /* 0x02 */
+ U8 Function; /* 0x03 */
+ U16 EncapsulatedCommandLength; /* 0x04 */
+ U8 Reserved1; /* 0x06 */
+ U8 MsgFlags; /* 0x07 */
+ U8 VP_ID; /* 0x08 */
+ U8 VF_ID; /* 0x09 */
+ U16 Reserved2; /* 0x0A */
+ U16 Reserved3; /* 0x0C */
+ U16 IOCStatus; /* 0x0E */
+ U32 IOCLogInfo; /* 0x10 */
+ U16 ErrorResponseCount; /* 0x14 */
+ U16 Reserved4; /* 0x16 */
+} MPI26_NVME_ENCAPSULATED_ERROR_REPLY,
+ MPI2_POINTER PTR_MPI26_NVME_ENCAPSULATED_ERROR_REPLY,
+ Mpi26NVMeEncapsulatedErrorReply_t,
+ MPI2_POINTER pMpi26NVMeEncapsulatedErrorReply_t;
+
+
+#endif
+
+
diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
index 0050c8c00f..be8bf675b8 100644
--- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
+++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
@@ -58,10 +58,10 @@
#include <sys/byteorder.h>
#include <sys/queue.h>
+#include <sys/refhash.h>
#include <sys/isa_defs.h>
#include <sys/sunmdi.h>
#include <sys/mdi_impldefs.h>
-#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h>
#include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h>
#include <sys/scsi/adapters/mpt_sas/mpi/mpi2_tool.h>
#include <sys/scsi/adapters/mpt_sas/mpi/mpi2_cnfg.h>
diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h
new file mode 100644
index 0000000000..5aba743834
--- /dev/null
+++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h
@@ -0,0 +1,750 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#ifndef _SMRT_H
+#define _SMRT_H
+
+#include <sys/types.h>
+#include <sys/pci.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/conf.h>
+#include <sys/map.h>
+#include <sys/modctl.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/scsi/scsi.h>
+#include <sys/scsi/impl/spc3_types.h>
+#include <sys/devops.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sdt.h>
+#include <sys/policy.h>
+#include <sys/ctype.h>
+
+#if !defined(_LITTLE_ENDIAN) || !defined(_BIT_FIELDS_LTOH)
+/*
+ * This driver contains a number of multi-byte bit fields and other structs
+ * that are only correct on a system with the same ordering as x86.
+ */
+#error "smrt: driver works only on little endian systems"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Some structures are statically sized based on the expected number of logical
+ * drives and controllers in the system. These definitions are used throughout
+ * other driver-specific header files, and must appear prior to their
+ * inclusion.
+ */
+#define SMRT_MAX_LOGDRV 64 /* Maximum number of logical drives */
+#define SMRT_MAX_PHYSDEV 128 /* Maximum number of physical devices */
+
+#include <sys/scsi/adapters/smrt/smrt_ciss.h>
+#include <sys/scsi/adapters/smrt/smrt_scsi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern ddi_device_acc_attr_t smrt_dev_attributes;
+
+typedef enum smrt_init_level {
+ SMRT_INITLEVEL_BASIC = (0x1 << 0),
+ SMRT_INITLEVEL_I2O_MAPPED = (0x1 << 1),
+ SMRT_INITLEVEL_CFGTBL_MAPPED = (0x1 << 2),
+ SMRT_INITLEVEL_PERIODIC = (0x1 << 3),
+ SMRT_INITLEVEL_INT_ALLOC = (0x1 << 4),
+ SMRT_INITLEVEL_INT_ADDED = (0x1 << 5),
+ SMRT_INITLEVEL_INT_ENABLED = (0x1 << 6),
+ SMRT_INITLEVEL_SCSA = (0x1 << 7),
+ SMRT_INITLEVEL_MUTEX = (0x1 << 8),
+ SMRT_INITLEVEL_TASKQ = (0x1 << 9),
+ SMRT_INITLEVEL_ASYNC_EVENT = (0x1 << 10),
+} smrt_init_level_t;
+
+/*
+ * Commands issued to the controller carry a (generally 32-bit, though with
+ * two reserved signalling bits) identifying tag number. In order to avoid
+ * having the controller confuse us by double-reporting the completion of a
+ * particular tag, we try to reuse them as infrequently as possible. In
+ * practice, this means looping through a range of values. The minimum and
+ * maximum value are defined below. A single command tag value is set aside
+ * for polled commands sent prior to full initialisation of the driver.
+ */
+#define SMRT_PRE_TAG_NUMBER 0x00000bad
+#define SMRT_MIN_TAG_NUMBER 0x00001000
+#define SMRT_MAX_TAG_NUMBER 0x0fffffff
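+
+/*
+ * A sketch of the wrap-around this implies when assigning the next tag
+ * (illustrative; locking elided):
+ *
+ *	tag = smrt->smrt_next_tag;
+ *	if (++smrt->smrt_next_tag > SMRT_MAX_TAG_NUMBER)
+ *		smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER;
+ */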
+
+/*
+ * Character strings that represent the names of the iports used for both
+ * physical and virtual volumes.
+ */
+#define SMRT_IPORT_PHYS "p0"
+#define SMRT_IPORT_VIRT "v0"
+
+/*
+ * Definitions to support waiting for the controller to converge on a
+ * particular state: ready or not ready. These are used with
+ * smrt_ctlr_wait_for_state().
+ */
+#define SMRT_WAIT_DELAY_SECONDS 120
+typedef enum smrt_wait_state {
+ SMRT_WAIT_STATE_READY = 1,
+ SMRT_WAIT_STATE_UNREADY
+} smrt_wait_state_t;
+
+typedef enum smrt_ctlr_mode {
+ SMRT_CTLR_MODE_UNKNOWN = 0,
+ SMRT_CTLR_MODE_SIMPLE
+} smrt_ctlr_mode_t;
+
+/*
+ * In addition to Logical Volumes, we also expose the controller at a
+ * pseudo target address on the SCSI bus we are essentially pretending to be.
+ */
+#define SMRT_CONTROLLER_TARGET 128
+
+/*
+ * When waiting for volume discovery to complete, we wait for a maximum
+ * duration (in seconds) before giving up.
+ */
+#define SMRT_DISCOVER_TIMEOUT 30
+
+/*
+ * The maintenance routine which checks for controller lockup and aborts
+ * commands that have passed their timeout runs periodically. The time is
+ * expressed in seconds.
+ */
+#define SMRT_PERIODIC_RATE 5
+
+/*
+ * At times, we need to check if the controller is still responding. To do
+ * that, we send a Nop message to the controller and make sure it completes
+ * successfully. So that we don't wait forever, we set a timeout (in seconds).
+ */
+#define SMRT_PING_CHECK_TIMEOUT 60
+
+/*
+ * When detaching the device, we may need to have an asynchronous event
+ * cancellation be issued. While this should be relatively smooth, we don't
+ * want to wait forever for it. As such we set a timeout in seconds.
+ */
+#define SMRT_ASYNC_CANCEL_TIMEOUT 60
+
+/*
+ * HP PCI vendor ID and Generation 9 device ID. Used to identify generations of
+ * supported controllers.
+ */
+#define SMRT_VENDOR_HP 0x103c
+#define SMRT_DEVICE_GEN9 0x3238
+
+typedef enum smrt_controller_status {
+ /*
+ * An attempt is being made to detach the controller instance.
+ */
+ SMRT_CTLR_STATUS_DETACHING = (0x1 << 0),
+
+ /*
+ * The controller is believed to be functioning correctly. The driver
+ * is to allow command submission, process interrupts, and perform
+ * periodic background maintenance.
+ */
+ SMRT_CTLR_STATUS_RUNNING = (0x1 << 1),
+
+ /*
+ * The controller is currently being reset.
+ */
+ SMRT_CTLR_STATUS_RESETTING = (0x1 << 2),
+
+ /*
+ * Our async event notification command is currently in need of help
+ * from the broader driver. This will be set by smrt_event_complete()
+ * to indicate that the command is not being processed due to a
+ * controller reset or because another fatal error occurred. The
+ * periodic will have to pick up and recover this for us. It is only
+ * safe for the driver to manipulate the event command outside of
+ * smrt_event_complete() if this flag is set.
+ */
+ SMRT_CTLR_ASYNC_INTERVENTION = (0x1 << 3),
+
+ /*
+ * See the theory statement on discovery and resets in smrt_ciss.c for
+ * an explanation of these values.
+ */
+ SMRT_CTLR_DISCOVERY_REQUESTED = (0x1 << 4),
+ SMRT_CTLR_DISCOVERY_RUNNING = (0x1 << 5),
+ SMRT_CTLR_DISCOVERY_PERIODIC = (0x1 << 6),
+ SMRT_CTLR_DISCOVERY_REQUIRED = (0x1 << 7),
+} smrt_controller_status_t;
+
+#define SMRT_CTLR_DISCOVERY_MASK (SMRT_CTLR_DISCOVERY_REQUESTED | \
+ SMRT_CTLR_DISCOVERY_RUNNING | SMRT_CTLR_DISCOVERY_PERIODIC)
+
+typedef struct smrt_stats {
+ uint64_t smrts_tran_aborts;
+ uint64_t smrts_tran_resets;
+ uint64_t smrts_tran_starts;
+ uint64_t smrts_ctlr_resets;
+ unsigned smrts_max_inflight;
+ uint64_t smrts_unclaimed_interrupts;
+ uint64_t smrts_claimed_interrupts;
+ uint64_t smrts_ignored_scsi_cmds;
+ uint64_t smrts_events_received;
+ uint64_t smrts_events_errors;
+ uint64_t smrts_events_intervened;
+ uint64_t smrts_discovery_tq_errors;
+} smrt_stats_t;
+
+typedef struct smrt_versions {
+ uint8_t smrtv_hardware_version;
+
+ /*
+ * These strings must be large enough to hold the 4 byte version string
+ * retrieved from an IDENTIFY CONTROLLER response, as well as the
+ * terminating NUL byte:
+ */
+ char smrtv_firmware_rev[5];
+ char smrtv_recovery_rev[5];
+ char smrtv_bootblock_rev[5];
+} smrt_versions_t;
+
+typedef struct smrt smrt_t;
+typedef struct smrt_command smrt_command_t;
+typedef struct smrt_command_internal smrt_command_internal_t;
+typedef struct smrt_command_scsa smrt_command_scsa_t;
+typedef struct smrt_pkt smrt_pkt_t;
+
+/*
+ * Per-Controller Structure
+ */
+struct smrt {
+ dev_info_t *smrt_dip;
+ int smrt_instance;
+ smrt_controller_status_t smrt_status;
+ smrt_stats_t smrt_stats;
+
+ /*
+ * Controller configuration discovered during initialisation.
+ */
+ uint32_t smrt_host_support;
+ uint32_t smrt_bus_support;
+ uint32_t smrt_maxcmds;
+ uint32_t smrt_sg_cnt;
+ smrt_versions_t smrt_versions;
+ uint16_t smrt_pci_vendor;
+ uint16_t smrt_pci_device;
+
+ /*
+ * iport specific data
+ */
+ dev_info_t *smrt_virt_iport;
+ dev_info_t *smrt_phys_iport;
+ scsi_hba_tgtmap_t *smrt_virt_tgtmap;
+ scsi_hba_tgtmap_t *smrt_phys_tgtmap;
+
+ /*
+ * The transport mode of the controller.
+ */
+ smrt_ctlr_mode_t smrt_ctlr_mode;
+
+ /*
+ * The current initialisation level of the driver. Bits in this field
+ * are set during initialisation and unset during cleanup of the
+ * allocated resources.
+ */
+ smrt_init_level_t smrt_init_level;
+
+ /*
+ * Essentially everything is protected by "smrt_mutex". When the
+ * completion queue is updated, threads sleeping on "smrt_cv_finishq"
+ * are awoken.
+ */
+ kmutex_t smrt_mutex;
+ kcondvar_t smrt_cv_finishq;
+
+ /*
+ * List of enumerated logical volumes (smrt_volume_t).
+ */
+ list_t smrt_volumes;
+
+ /*
+ * List of enumerated physical devices (smrt_physical_t).
+ */
+ list_t smrt_physicals;
+
+ /*
+ * List of attached SCSA target drivers (smrt_target_t).
+ */
+ list_t smrt_targets;
+
+ /*
+ * Controller Heartbeat Tracking
+ */
+ uint32_t smrt_last_heartbeat;
+ hrtime_t smrt_last_heartbeat_time;
+
+ hrtime_t smrt_last_interrupt_claimed;
+ hrtime_t smrt_last_interrupt_unclaimed;
+ hrtime_t smrt_last_reset_start;
+ hrtime_t smrt_last_reset_finish;
+
+ /*
+ * Command object tracking. These lists, and all commands within the
+ * lists, are protected by "smrt_mutex".
+ */
+ uint32_t smrt_next_tag;
+ avl_tree_t smrt_inflight;
+ list_t smrt_commands; /* List of all commands. */
+ list_t smrt_finishq; /* List of completed commands. */
+ list_t smrt_abortq; /* List of commands to abort. */
+
+ /*
+ * Discovery coordination
+ */
+ ddi_taskq_t *smrt_discover_taskq;
+ hrtime_t smrt_last_phys_discovery;
+ hrtime_t smrt_last_log_discovery;
+ uint64_t smrt_discover_gen;
+
+ /*
+ * Controller interrupt handler registration.
+ */
+ int smrt_interrupt_type;
+ int smrt_interrupt_cap;
+ uint_t smrt_interrupt_pri;
+ ddi_intr_handle_t smrt_interrupts[1];
+ int smrt_ninterrupts;
+
+ ddi_periodic_t smrt_periodic;
+
+ scsi_hba_tran_t *smrt_hba_tran;
+
+ ddi_dma_attr_t smrt_dma_attr;
+
+ /*
+ * Access to the I2O Registers:
+ */
+ unsigned smrt_i2o_bar;
+ caddr_t smrt_i2o_space;
+ ddi_acc_handle_t smrt_i2o_handle;
+
+ /*
+ * Access to the Configuration Table:
+ */
+ unsigned smrt_ct_bar;
+ uint32_t smrt_ct_baseaddr;
+ CfgTable_t *smrt_ct;
+ ddi_acc_handle_t smrt_ct_handle;
+
+ /*
+ * Asynchronous Event State
+ */
+ uint32_t smrt_event_count;
+ smrt_command_t *smrt_event_cmd;
+ smrt_command_t *smrt_event_cancel_cmd;
+ kcondvar_t smrt_event_queue;
+};
+
+/*
+ * Logical Volume Structure
+ */
+typedef enum smrt_volume_flags {
+ SMRT_VOL_FLAG_WWN = (0x1 << 0),
+} smrt_volume_flags_t;
+
+typedef struct smrt_volume {
+ LUNAddr_t smlv_addr;
+ smrt_volume_flags_t smlv_flags;
+
+ uint8_t smlv_wwn[16];
+ uint64_t smlv_gen;
+
+ smrt_t *smlv_ctlr;
+ list_node_t smlv_link;
+
+ /*
+ * List of SCSA targets currently attached to this Logical Volume:
+ */
+ list_t smlv_targets;
+} smrt_volume_t;
+
+typedef struct smrt_physical {
+ LUNAddr_t smpt_addr;
+ uint64_t smpt_wwn;
+ uint8_t smpt_dtype;
+ uint16_t smpt_bmic;
+ uint64_t smpt_gen;
+ boolean_t smpt_supported;
+ boolean_t smpt_visible;
+ boolean_t smpt_unsup_warn;
+ list_node_t smpt_link;
+ list_t smpt_targets;
+ smrt_t *smpt_ctlr;
+ smrt_identify_physical_drive_t *smpt_info;
+} smrt_physical_t;
+
+/*
+ * Per-Target Structure
+ */
+typedef struct smrt_target {
+ struct scsi_device *smtg_scsi_dev;
+
+ boolean_t smtg_physical;
+
+ /*
+ * This is only used when performing discovery during panic, as we need
+ * a mechanism to determine if the set of drives has shifted.
+ */
+ boolean_t smtg_gone;
+
+ /*
+ * Linkage back to the device that this target represents. This may be
+ * either a smrt_volume_t or a smrt_physical_t. We keep a pointer to the
+ * address, as that's the one thing we generally care about.
+ */
+ union {
+ smrt_physical_t *smtg_phys;
+ smrt_volume_t *smtg_vol;
+ } smtg_lun;
+ list_node_t smtg_link_lun;
+ LUNAddr_t *smtg_addr;
+
+ /*
+ * Linkage back to the controller:
+ */
+ smrt_t *smtg_ctlr;
+ list_node_t smtg_link_ctlr;
+} smrt_target_t;
+
+/*
+ * DMA Resource Tracking Structure
+ */
+typedef enum smrt_dma_level {
+ SMRT_DMALEVEL_HANDLE_ALLOC = (0x1 << 0),
+ SMRT_DMALEVEL_MEMORY_ALLOC = (0x1 << 1),
+ SMRT_DMALEVEL_HANDLE_BOUND = (0x1 << 2),
+} smrt_dma_level_t;
+
+typedef struct smrt_dma {
+ smrt_dma_level_t smdma_level;
+ size_t smdma_real_size;
+ ddi_dma_handle_t smdma_dma_handle;
+ ddi_acc_handle_t smdma_acc_handle;
+ ddi_dma_cookie_t smdma_dma_cookies[1];
+ uint_t smdma_dma_ncookies;
+} smrt_dma_t;
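+
+/*
+ * Illustrative sketch (an assumption, not the driver's actual cleanup
+ * routine): teardown would unwind in the reverse order of the level
+ * bits above, e.g.:
+ *
+ *	if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_BOUND)
+ *		VERIFY3U(ddi_dma_unbind_handle(smdma->smdma_dma_handle),
+ *		    ==, DDI_SUCCESS);
+ *	if (smdma->smdma_level & SMRT_DMALEVEL_MEMORY_ALLOC)
+ *		ddi_dma_mem_free(&smdma->smdma_acc_handle);
+ *	if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_ALLOC)
+ *		ddi_dma_free_handle(&smdma->smdma_dma_handle);
+ */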
+
+
+typedef enum smrt_command_status {
+ /*
+ * When a command is submitted to the controller, it is marked USED
+ * to avoid accidental reuse of the command without reinitialising
+ * critical fields. The submitted command is also marked INFLIGHT
+ * to reflect its inclusion in the "smrt_inflight" AVL tree. When
+ * the command is completed by the controller, INFLIGHT is unset.
+ */
+ SMRT_CMD_STATUS_USED = (0x1 << 0),
+ SMRT_CMD_STATUS_INFLIGHT = (0x1 << 1),
+
+ /*
+ * This flag is set during abort queue processing to record that this
+ * command was aborted in response to an expired timeout, and not some
+ * other cancellation. If the controller is able to abort the command,
+ * we use this flag to let the SCSI framework know that the command
+ * timed out.
+ */
+ SMRT_CMD_STATUS_TIMEOUT = (0x1 << 2),
+
+ /*
+ * The controller set the error bit when completing this command.
+ * Details of the particular fault may be read from the error
+ * information written by the controller.
+ */
+ SMRT_CMD_STATUS_ERROR = (0x1 << 3),
+
+ /*
+ * This command has been abandoned by the original submitter. This
+ * could happen if the command did not complete in a timely fashion.
+ * When it reaches the finish queue it will be freed without further
+ * processing.
+ */
+ SMRT_CMD_STATUS_ABANDONED = (0x1 << 4),
+
+ /*
+ * This command has made it through the completion queue and had final
+ * processing performed.
+ */
+ SMRT_CMD_STATUS_COMPLETE = (0x1 << 5),
+
+ /*
+ * A polled message will be ignored by the regular processing of the
+ * completion queue. The blocking function doing the polling is
+ * responsible for watching the command on which it has set the POLLED
+ * flag. Regular completion queue processing (which might happen in
+ * the polling function, or it might happen in the interrupt handler)
+ * will set POLL_COMPLETE once it is out of the finish queue
+ * altogether.
+ */
+ SMRT_CMD_STATUS_POLLED = (0x1 << 6),
+ SMRT_CMD_STATUS_POLL_COMPLETE = (0x1 << 7),
+
+ /*
+ * An abort message has been sent to the controller in an attempt to
+ * cancel this command.
+ */
+ SMRT_CMD_STATUS_ABORT_SENT = (0x1 << 8),
+
+ /*
+ * This command has been passed to our tran_start(9E) handler.
+ */
+ SMRT_CMD_STATUS_TRAN_START = (0x1 << 9),
+
+ /*
+ * This command was for a SCSI command that we are explicitly avoiding
+ * sending to the controller.
+ */
+ SMRT_CMD_STATUS_TRAN_IGNORED = (0x1 << 10),
+
+ /*
+ * This command has been submitted once, and subsequently passed to
+ * smrt_command_reuse().
+ */
+ SMRT_CMD_STATUS_REUSED = (0x1 << 11),
+
+ /*
+ * A controller reset has been issued, so a response for this command
+ * is not expected. If one arrives before the controller reset has
+ * taken effect, it likely cannot be trusted.
+ */
+ SMRT_CMD_STATUS_RESET_SENT = (0x1 << 12),
+
+ /*
+	 * Certain commands related to discovery and pinging need to be run in
+	 * the window after a reset has occurred, but before the controller is
+	 * considered to be running again.  Such commands can use this flag to
+	 * bypass the normal smrt_submit() check.
+ */
+ SMRT_CMD_IGNORE_RUNNING = (0x1 << 13),
+} smrt_command_status_t;
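+
+/*
+ * Illustrative sketch (an assumed caller flow, for exposition only): a
+ * blocking submitter using the POLLED protocol described above might
+ * look like:
+ *
+ *	smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ *	if (smrt_submit(smrt, smcm) != 0)
+ *		return (EIO);
+ *	return (smrt_poll_for(smrt, smcm));
+ */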
+
+typedef enum smrt_command_type {
+ SMRT_CMDTYPE_INTERNAL = 1,
+ SMRT_CMDTYPE_EVENT,
+ SMRT_CMDTYPE_ABORTQ,
+ SMRT_CMDTYPE_SCSA,
+ SMRT_CMDTYPE_PREINIT,
+} smrt_command_type_t;
+
+struct smrt_command {
+ uint32_t smcm_tag;
+ smrt_command_type_t smcm_type;
+ smrt_command_status_t smcm_status;
+
+ smrt_t *smcm_ctlr;
+ smrt_target_t *smcm_target;
+
+ list_node_t smcm_link; /* Linkage for allocated list. */
+ list_node_t smcm_link_finish; /* Linkage for completion list. */
+ list_node_t smcm_link_abort; /* Linkage for abort list. */
+ avl_node_t smcm_node; /* Inflight AVL membership. */
+
+ hrtime_t smcm_time_submit;
+ hrtime_t smcm_time_complete;
+
+ hrtime_t smcm_expiry;
+
+ /*
+ * The time at which an abort message was sent to try and terminate
+ * this command, as well as the tag of the abort message itself:
+ */
+ hrtime_t smcm_abort_time;
+ uint32_t smcm_abort_tag;
+
+ /*
+ * Ancillary data objects. Only one of these will be allocated for any
+ * given command, but we nonetheless resist the temptation to use a
+ * union of pointers in order to make incorrect usage obvious.
+ */
+ smrt_command_scsa_t *smcm_scsa;
+ smrt_command_internal_t *smcm_internal;
+
+ /*
+ * Physical allocation tracking for the actual command to send to the
+ * controller.
+ */
+ smrt_dma_t smcm_contig;
+
+ CommandList_t *smcm_va_cmd;
+ uint32_t smcm_pa_cmd;
+
+ ErrorInfo_t *smcm_va_err;
+ uint32_t smcm_pa_err;
+};
+
+/*
+ * Commands issued internally to the driver (as opposed to by the HBA
+ * framework) generally require a buffer in which to assemble the command body
+ * and receive the response from the controller.  The following object
+ * tracks this (optional) extra buffer.
+ */
+struct smrt_command_internal {
+ smrt_dma_t smcmi_contig;
+
+ void *smcmi_va;
+ uint32_t smcmi_pa;
+ size_t smcmi_len;
+};
+
+/*
+ * Commands issued via the SCSI framework have a number of additional
+ * properties.
+ */
+struct smrt_command_scsa {
+ struct scsi_pkt *smcms_pkt;
+ smrt_command_t *smcms_command;
+};
+
+
+/*
+ * CISS transport routines.
+ */
+void smrt_periodic(void *);
+void smrt_lockup_check(smrt_t *);
+int smrt_submit(smrt_t *, smrt_command_t *);
+void smrt_submit_simple(smrt_t *, smrt_command_t *);
+int smrt_retrieve(smrt_t *);
+void smrt_retrieve_simple(smrt_t *);
+int smrt_poll_for(smrt_t *, smrt_command_t *);
+int smrt_preinit_command_simple(smrt_t *, smrt_command_t *);
+
+/*
+ * Interrupt service routines.
+ */
+int smrt_interrupts_setup(smrt_t *);
+int smrt_interrupts_enable(smrt_t *);
+void smrt_interrupts_teardown(smrt_t *);
+uint32_t smrt_isr_hw_simple(caddr_t, caddr_t);
+
+/*
+ * Interrupt enable/disable routines.
+ */
+void smrt_intr_set(smrt_t *, boolean_t);
+
+/*
+ * Controller initialisation routines.
+ */
+int smrt_ctlr_init(smrt_t *);
+void smrt_ctlr_teardown(smrt_t *);
+int smrt_ctlr_reset(smrt_t *);
+int smrt_ctlr_wait_for_state(smrt_t *, smrt_wait_state_t);
+int smrt_ctlr_init_simple(smrt_t *);
+void smrt_ctlr_teardown_simple(smrt_t *);
+int smrt_cfgtbl_flush(smrt_t *);
+int smrt_cfgtbl_transport_has_support(smrt_t *, int);
+void smrt_cfgtbl_transport_set(smrt_t *, int);
+int smrt_cfgtbl_transport_confirm(smrt_t *, int);
+uint32_t smrt_ctlr_get_cmdsoutmax(smrt_t *);
+uint32_t smrt_ctlr_get_maxsgelements(smrt_t *);
+
+/*
+ * Device enumeration and lookup routines.
+ */
+void smrt_discover_request(smrt_t *);
+
+int smrt_logvol_discover(smrt_t *, uint16_t, uint64_t);
+void smrt_logvol_teardown(smrt_t *);
+smrt_volume_t *smrt_logvol_lookup_by_id(smrt_t *, unsigned long);
+void smrt_logvol_tgtmap_activate(void *, char *, scsi_tgtmap_tgt_type_t,
+ void **);
+boolean_t smrt_logvol_tgtmap_deactivate(void *, char *, scsi_tgtmap_tgt_type_t,
+ void *, scsi_tgtmap_deact_rsn_t);
+
+int smrt_phys_discover(smrt_t *, uint16_t, uint64_t);
+smrt_physical_t *smrt_phys_lookup_by_ua(smrt_t *, const char *);
+void smrt_phys_teardown(smrt_t *);
+void smrt_phys_tgtmap_activate(void *, char *, scsi_tgtmap_tgt_type_t,
+ void **);
+boolean_t smrt_phys_tgtmap_deactivate(void *, char *, scsi_tgtmap_tgt_type_t,
+ void *, scsi_tgtmap_deact_rsn_t);
+
+/*
+ * SCSI framework routines.
+ */
+int smrt_ctrl_hba_setup(smrt_t *);
+void smrt_ctrl_hba_teardown(smrt_t *);
+
+int smrt_logvol_hba_setup(smrt_t *, dev_info_t *);
+void smrt_logvol_hba_teardown(smrt_t *, dev_info_t *);
+int smrt_phys_hba_setup(smrt_t *, dev_info_t *);
+void smrt_phys_hba_teardown(smrt_t *, dev_info_t *);
+
+void smrt_hba_complete(smrt_command_t *);
+
+void smrt_process_finishq(smrt_t *);
+void smrt_process_abortq(smrt_t *);
+
+/*
+ * Command block management.
+ */
+smrt_command_t *smrt_command_alloc(smrt_t *, smrt_command_type_t,
+ int);
+smrt_command_t *smrt_command_alloc_preinit(smrt_t *, size_t, int);
+int smrt_command_attach_internal(smrt_t *, smrt_command_t *, size_t,
+ int);
+void smrt_command_free(smrt_command_t *);
+smrt_command_t *smrt_lookup_inflight(smrt_t *, uint32_t);
+void smrt_command_reuse(smrt_command_t *);
+
+/*
+ * Device message construction routines.
+ */
+void smrt_write_lun_addr_phys(LUNAddr_t *, boolean_t, unsigned, unsigned);
+void smrt_write_controller_lun_addr(LUNAddr_t *);
+uint16_t smrt_lun_addr_to_bmic(PhysDevAddr_t *);
+void smrt_write_message_abort_one(smrt_command_t *, uint32_t);
+void smrt_write_message_abort_all(smrt_command_t *, LUNAddr_t *);
+void smrt_write_message_nop(smrt_command_t *, int);
+void smrt_write_message_event_notify(smrt_command_t *);
+
+/*
+ * Device management routines.
+ */
+int smrt_device_setup(smrt_t *);
+void smrt_device_teardown(smrt_t *);
+uint32_t smrt_get32(smrt_t *, offset_t);
+void smrt_put32(smrt_t *, offset_t, uint32_t);
+
+/*
+ * SATA related routines.
+ */
+int smrt_sata_determine_wwn(smrt_t *, PhysDevAddr_t *, uint64_t *, uint16_t);
+
+/*
+ * Asynchronous Event Notification
+ */
+int smrt_event_init(smrt_t *);
+void smrt_event_fini(smrt_t *);
+void smrt_event_complete(smrt_command_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SMRT_H */
diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h
new file mode 100644
index 0000000000..e1f1db68b3
--- /dev/null
+++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h
@@ -0,0 +1,345 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (C) 2013 Hewlett-Packard Development Company, L.P.
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#ifndef _SMRT_CISS_H
+#define _SMRT_CISS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Maximum number of Scatter/Gather List entries. These entries are statically
+ * allocated for all commands.
+ */
+#define CISS_MAXSGENTRIES 64
+
+/*
+ * If the controller advertises a value of 0 for the maximum S/G list length it
+ * supports, the specification states that we should assume a value of 31.
+ */
+#define CISS_SGCNT_FALLBACK 31
+
+/*
+ * The CDB field in the request block is fixed at 16 bytes in length. (See
+ * "3.2. Request Block" in the CISS specification.)
+ */
+#define CISS_CDBLEN 16
+
+/*
+ * Command Status Values. These are listed in "Table 2 Command Status" in "3.3
+ * Error Info" of the CISS specification.
+ */
+#define CISS_CMD_SUCCESS 0x00
+#define CISS_CMD_TARGET_STATUS 0x01
+#define CISS_CMD_DATA_UNDERRUN 0x02
+#define CISS_CMD_DATA_OVERRUN 0x03
+#define CISS_CMD_INVALID 0x04
+#define CISS_CMD_PROTOCOL_ERR 0x05
+#define CISS_CMD_HARDWARE_ERR 0x06
+#define CISS_CMD_CONNECTION_LOST 0x07
+#define CISS_CMD_ABORTED 0x08
+#define CISS_CMD_ABORT_FAILED 0x09
+#define CISS_CMD_UNSOLICITED_ABORT 0x0a
+#define CISS_CMD_TIMEOUT 0x0b
+#define CISS_CMD_UNABORTABLE 0x0c
+
+/*
+ * Request Transfer Directions, used in "RequestBlock.Type.Direction":
+ */
+#define CISS_XFER_NONE 0x00
+#define CISS_XFER_WRITE 0x01
+#define CISS_XFER_READ 0x02
+#define CISS_XFER_RSVD 0x03
+
+/*
+ * Request Attributes, used in "RequestBlock.Type.Attribute":
+ */
+#define CISS_ATTR_UNTAGGED 0x00
+#define CISS_ATTR_SIMPLE 0x04
+#define CISS_ATTR_HEADOFQUEUE 0x05
+#define CISS_ATTR_ORDERED 0x06
+
+/*
+ * Request Type, used in "RequestBlock.Type.Type":
+ */
+#define CISS_TYPE_CMD 0x00
+#define CISS_TYPE_MSG 0x01
+
+/*
+ * I2O Space Register Offsets
+ *
+ * The name "I2O", and these register offsets, appear to be amongst the last
+ * vestiges of a long-defunct attempt at standardising mainframe-style I/O
+ * channels in the Intel server space: the Intelligent Input/Output (I2O)
+ * Architecture Specification.
+ *
+ * The draft of version 1.5 of this specification, in section "4.2.1.5.1
+ * Extensions for PCI", suggests that the following are memory offsets into
+ * "the memory region specified by the first base address configuration
+ * register indicating memory space (offset 10h, 14h, and so forth)". These
+ * match up with the offsets of the first two BARs in a PCI configuration space
+ * type 0 header.
+ *
+ * The specification also calls out the Inbound Post List FIFO, write-only at
+ * offset 40h; the Outbound Post List FIFO, read-only at offset 44h; the
+ * Interrupt Status Register, at offset 30h; and the Interrupt Mask Register,
+ * at offset 34h.
+ *
+ * This ill-fated attempt to increase the proprietary complexity of (and
+ * presumably, thus, the gross margin on) computer systems is all but extinct.
+ * The transport layer of this storage controller is all that's left of their
+ * religion.
+ */
+#define CISS_I2O_INBOUND_DOORBELL 0x20
+#define CISS_I2O_INTERRUPT_STATUS 0x30
+#define CISS_I2O_INTERRUPT_MASK 0x34
+#define CISS_I2O_INBOUND_POST_Q 0x40
+#define CISS_I2O_OUTBOUND_POST_Q 0x44
+#define CISS_I2O_OUTBOUND_DOORBELL_STATUS 0x9c
+#define CISS_I2O_OUTBOUND_DOORBELL_CLEAR 0xa0
+#define CISS_I2O_SCRATCHPAD 0xb0
+#define CISS_I2O_CFGTBL_CFG_OFFSET 0xb4
+#define CISS_I2O_CFGTBL_MEM_OFFSET 0xb8
+
+/*
+ * Rather than make a lot of small mappings for each part of the address
+ * space we wish to access, we will make one large mapping. If more
+ * offsets are added to the I2O list above, this space should be extended
+ * appropriately.
+ */
+#define CISS_I2O_MAP_BASE 0x20
+#define CISS_I2O_MAP_LIMIT 0x100
+
+/*
+ * The Scratchpad Register (I2O_SCRATCHPAD) is not mentioned in the CISS
+ * specification. It serves at least two known functions:
+ * - Signalling controller readiness
+ * - Exposing a debugging code when the controller firmware locks up
+ */
+#define CISS_SCRATCHPAD_INITIALISED 0xffff0000
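+
+/*
+ * Illustrative sketch (an assumed readiness wait; the delay interval is
+ * arbitrary):
+ *
+ *	while (smrt_get32(smrt, CISS_I2O_SCRATCHPAD) !=
+ *	    CISS_SCRATCHPAD_INITIALISED)
+ *		delay(drv_usectohz(100 * 1000));
+ */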
+
+/*
+ * Outbound Doorbell Register Values.
+ *
+ * These are read from the Outbound Doorbell Set/Status Register
+ * (CISS_I2O_OUTBOUND_DOORBELL_STATUS), but cleared by writing to the Clear
+ * Register (CISS_I2O_OUTBOUND_DOORBELL_CLEAR).
+ */
+#define CISS_ODR_BIT_INTERRUPT (1UL << 0)
+#define CISS_ODR_BIT_LOCKUP (1UL << 1)
+
+/*
+ * Inbound Doorbell Register Values.
+ *
+ * These are written to and read from the Inbound Doorbell Register
+ * (CISS_I2O_INBOUND_DOORBELL).
+ */
+#define CISS_IDR_BIT_CFGTBL_CHANGE (1UL << 0)
+
+/*
+ * Interrupt Mask Register Values.
+ *
+ * These are written to and read from the Interrupt Mask Register
+ * (CISS_I2O_INTERRUPT_MASK). Note that a 1 bit in this register masks or
+ * disables the interrupt in question; to enable the interrupt the bit must be
+ * set to 0.
+ */
+#define CISS_IMR_BIT_SIMPLE_INTR_DISABLE (1UL << 3)
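+
+/*
+ * Illustrative sketch (an assumption about how a routine such as
+ * smrt_intr_set() might honour the inverted mask semantics described
+ * above):
+ *
+ *	uint32_t imr = smrt_get32(smrt, CISS_I2O_INTERRUPT_MASK);
+ *	if (enabled)
+ *		imr &= ~CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
+ *	else
+ *		imr |= CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
+ *	smrt_put32(smrt, CISS_I2O_INTERRUPT_MASK, imr);
+ */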
+
+/*
+ * Interrupt Status Register Values.
+ *
+ * These are read from the Interrupt Status Register
+ * (CISS_I2O_INTERRUPT_STATUS).
+ */
+#define CISS_ISR_BIT_SIMPLE_INTR (1UL << 3)
+
+/*
+ * Transport Methods.
+ *
+ * These bit positions are used in the Configuration Table to detect controller
+ * support for a particular method, via "TransportSupport"; to request that the
+ * controller enable a particular method, via "TransportRequest"; and to detect
+ * whether the controller has acknowledged the request and enabled the desired
+ * method, via "TransportActive".
+ *
+ * See: "9.1 Configuration Table" in the CISS Specification.
+ */
+#define CISS_CFGTBL_READY_FOR_COMMANDS (1UL << 0)
+#define CISS_CFGTBL_XPORT_SIMPLE (1UL << 1)
+#define CISS_CFGTBL_XPORT_PERFORMANT (1UL << 2)
+#define CISS_CFGTBL_XPORT_MEMQ (1UL << 4)
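+
+/*
+ * Illustrative sketch (an assumed handshake, following the comment
+ * above and the helpers declared in smrt.h):
+ *
+ *	if (smrt_cfgtbl_transport_has_support(smrt,
+ *	    CISS_CFGTBL_XPORT_SIMPLE) == DDI_SUCCESS) {
+ *		smrt_cfgtbl_transport_set(smrt, CISS_CFGTBL_XPORT_SIMPLE);
+ *		(void) smrt_cfgtbl_flush(smrt);
+ *		if (smrt_cfgtbl_transport_confirm(smrt,
+ *		    CISS_CFGTBL_XPORT_SIMPLE) == DDI_SUCCESS)
+ *			... the Simple method is now active ...
+ *	}
+ */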
+
+/*
+ * In the Simple Transport Method, when the appropriate interrupt status bit is
+ * set (CISS_ISR_BIT_SIMPLE_INTR), the Outbound Post Queue register is
+ * repeatedly read for notifications of the completion of commands previously
+ * submitted to the controller. These macros help break up the read value into
+ * its component fields: the tag number, and whether or not the command
+ * completed in error.
+ */
+#define CISS_OPQ_READ_TAG(x) ((x) >> 2)
+#define CISS_OPQ_READ_ERROR(x) ((x) & (1UL << 1))
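+
+/*
+ * Illustrative sketch (an assumed retrieval loop; the empty-queue
+ * sentinel of 0xffffffff is a common convention for this register):
+ *
+ *	uint32_t opq;
+ *	while ((opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q)) !=
+ *	    0xffffffff) {
+ *		uint32_t tag = CISS_OPQ_READ_TAG(opq);
+ *		boolean_t err = CISS_OPQ_READ_ERROR(opq) != 0;
+ *		... look up the tag in smrt_inflight and complete it ...
+ *	}
+ */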
+
+/*
+ * Physical devices that are reported may be marked as 'masked'. A masked device
+ * is one that the driver can see, but to which it must not issue any I/O.
+ */
+#define SMRT_CISS_MODE_MASKED 3
+
+/*
+ * The following packed structures are used to ease the manipulation of
+ * requests and responses from the controller.
+ */
+#pragma pack(1)
+
+typedef struct smrt_tag {
+ uint32_t reserved:1;
+ uint32_t error:1;
+ uint32_t tag_value:30;
+ uint32_t unused;
+} smrt_tag_t;
+
+typedef union SCSI3Addr {
+ struct {
+ uint8_t Dev;
+ uint8_t Bus:6;
+ uint8_t Mode:2;
+ } PeripDev;
+ struct {
+ uint8_t DevLSB;
+ uint8_t DevMSB:6;
+ uint8_t Mode:2;
+ } LogDev;
+ struct {
+ uint8_t Dev:5;
+ uint8_t Bus:3;
+ uint8_t Targ:6;
+ uint8_t Mode:2;
+ } LogUnit;
+} SCSI3Addr_t;
+
+typedef struct PhysDevAddr {
+ uint32_t TargetId:24;
+ uint32_t Bus:6;
+ uint32_t Mode:2;
+ SCSI3Addr_t Target[2];
+} PhysDevAddr_t;
+
+typedef struct LogDevAddr {
+ uint32_t VolId:30;
+ uint32_t Mode:2;
+ uint8_t reserved[4];
+} LogDevAddr_t;
+
+typedef union LUNAddr {
+ uint8_t LunAddrBytes[8];
+ SCSI3Addr_t SCSI3Lun[4];
+ PhysDevAddr_t PhysDev;
+ LogDevAddr_t LogDev;
+} LUNAddr_t;
+
+typedef struct CommandListHeader {
+ uint8_t ReplyQueue;
+ uint8_t SGList;
+ uint16_t SGTotal;
+ smrt_tag_t Tag;
+ LUNAddr_t LUN;
+} CommandListHeader_t;
+
+typedef struct RequestBlock {
+ uint8_t CDBLen;
+ struct {
+ uint8_t Type:3;
+ uint8_t Attribute:3;
+ uint8_t Direction:2;
+ } Type;
+ uint16_t Timeout;
+ uint8_t CDB[CISS_CDBLEN];
+} RequestBlock_t;
+
+typedef struct ErrDescriptor {
+ uint64_t Addr;
+ uint32_t Len;
+} ErrDescriptor_t;
+
+typedef struct SGDescriptor {
+ uint64_t Addr;
+ uint32_t Len;
+ uint32_t Ext;
+} SGDescriptor_t;
+
+typedef struct CommandList {
+ CommandListHeader_t Header;
+ RequestBlock_t Request;
+ ErrDescriptor_t ErrDesc;
+ SGDescriptor_t SG[CISS_MAXSGENTRIES];
+} CommandList_t;
+
+typedef union MoreErrInfo {
+ struct {
+ uint8_t Reserved[3];
+ uint8_t Type;
+ uint32_t ErrorInfo;
+ } Common_Info;
+ struct {
+ uint8_t Reserved[2];
+ uint8_t offense_size;
+ uint8_t offense_num;
+ uint32_t offense_value;
+ } Invalid_Cmd;
+} MoreErrInfo_t;
+
+typedef struct ErrorInfo {
+ uint8_t ScsiStatus;
+ uint8_t SenseLen;
+ uint16_t CommandStatus;
+ uint32_t ResidualCnt;
+ MoreErrInfo_t MoreErrInfo;
+ uint8_t SenseInfo[MAX_SENSE_LENGTH];
+} ErrorInfo_t;
+
+typedef struct CfgTable {
+ uint8_t Signature[4];
+ uint32_t SpecValence;
+ uint32_t TransportSupport;
+ uint32_t TransportActive;
+ uint32_t TransportRequest;
+ uint32_t Upper32Addr;
+ uint32_t CoalIntDelay;
+ uint32_t CoalIntCount;
+ uint32_t CmdsOutMax;
+ uint32_t BusTypes;
+ uint32_t TransportMethodOffset;
+ uint8_t ServerName[16];
+ uint32_t HeartBeat;
+ uint32_t HostDrvrSupport;
+ uint32_t MaxSGElements;
+ uint32_t MaxLunSupport;
+ uint32_t MaxPhyDevSupport;
+ uint32_t MaxPhyDrvPerLun;
+ uint32_t MaxPerfModeCmdsOutMax;
+ uint32_t MaxBlockFetchCount;
+} CfgTable_t;
+
+#pragma pack()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SMRT_CISS_H */
diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h
new file mode 100644
index 0000000000..47ef99b2e0
--- /dev/null
+++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h
@@ -0,0 +1,371 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (C) 2013 Hewlett-Packard Development Company, L.P.
+ * Copyright (c) 2017 Joyent, Inc.
+ */
+
+#ifndef _SMRT_SCSI_H
+#define _SMRT_SCSI_H
+
+#include <sys/types.h>
+
+#include <sys/scsi/adapters/smrt/smrt_ciss.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* CISS LUN Addressing MODEs */
+#define PERIPHERIAL_DEV_ADDR 0x0
+#define LOGICAL_VOL_ADDR 0x1
+#define MASK_PERIPHERIAL_DEV_ADDR 0x3
+#define CISS_PHYS_MODE 0x0
+
+/*
+ * Vendor-specific SCSI Commands
+ *
+ * These command opcodes are for use in the opcode byte of the CDB in a request
+ * of type CISS_TYPE_CMD. They are custom SCSI commands, using the
+ * vendor-specific part of the opcode space; i.e., 0xC0 through 0xFF.
+ */
+#define CISS_SCMD_READ 0xC0
+#define CISS_SCMD_WRITE 0xC1
+#define CISS_SCMD_REPORT_LOGICAL_LUNS 0xC2
+#define CISS_SCMD_REPORT_PHYSICAL_LUNS 0xC3
+
+/*
+ * These command opcodes are _not_ in the usual vendor-specific space, but are
+ * nonetheless vendor-specific. They allow BMIC commands to be written to and
+ * read from the controller. If a command transfers no data, the specification
+ * suggests that BMIC_WRITE (0x27) is appropriate.
+ */
+#define CISS_SCMD_BMIC_READ 0x26
+#define CISS_SCMD_BMIC_WRITE 0x27
+
+/*
+ * CISS Messages
+ *
+ * The CISS specification describes several directives that do not behave like
+ * SCSI commands. They are sent in requests of type CISS_TYPE_MSG.
+ *
+ * The Abort, Reset, and Nop, messages are defined in "8. Messages" in the CISS
+ * Specification.
+ */
+#define CISS_MSG_ABORT 0x0
+#define CISS_ABORT_TASK 0x0
+#define CISS_ABORT_TASKSET 0x1
+
+#define CISS_MSG_RESET 0x1
+#define CISS_RESET_CTLR 0x0
+#define CISS_RESET_BUS 0x1
+#define CISS_RESET_TGT 0x3
+#define CISS_RESET_LUN 0x4
+
+#define CISS_MSG_NOP 0x3
+
+/*
+ * BMIC Commands
+ *
+ * These commands allow for the use of non-standard facilities specific to the
+ * Smart Array firmware. They are sent to the controller through a specially
+ * constructed CDB with the CISS_SCMD_BMIC_READ or CISS_SCMD_BMIC_WRITE opcode.
+ */
+#define CISS_BMIC_IDENTIFY_CONTROLLER 0x11
+#define CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE 0x15
+#define CISS_BMIC_NOTIFY_ON_EVENT 0xD0
+#define CISS_BMIC_NOTIFY_ON_EVENT_CANCEL 0xD1
+
+/*
+ * Device and Phy type codes. These are used across many commands, including
+ * IDENTIFY PHYSICAL DEVICE and the REPORT PHYSICAL LUN extended reporting.
+ */
+#define SMRT_DTYPE_PSCSI 0x00
+#define SMRT_DTYPE_SATA 0x01
+#define SMRT_DTYPE_SAS 0x02
+#define SMRT_DTYPE_SATA_BW 0x03
+#define SMRT_DTYPE_SAS_BW 0x04
+#define SMRT_DTYPE_EXPANDER 0x05
+#define SMRT_DTYPE_SES 0x06
+#define SMRT_DTYPE_CONTROLLER 0x07
+#define SMRT_DTYPE_SGPIO 0x08
+#define SMRT_DTYPE_NVME 0x09
+#define SMRT_DTYPE_NOPHY 0xFF
+
+/*
+ * The following packed structures are used to ease the manipulation of SCSI
+ * and BMIC commands sent to, and status information returned from, the
+ * controller.
+ */
+#pragma pack(1)
+
+typedef struct smrt_report_logical_lun_ent {
+ LogDevAddr_t smrle_addr;
+} smrt_report_logical_lun_ent_t;
+
+typedef struct smrt_report_logical_lun_extent {
+ LogDevAddr_t smrle_addr;
+ uint8_t smrle_wwn[16];
+} smrt_report_logical_lun_extent_t;
+
+typedef struct smrt_report_logical_lun {
+ uint32_t smrll_datasize; /* Big Endian */
+ uint8_t smrll_extflag;
+ uint8_t smrll_reserved1[3];
+ union {
+ smrt_report_logical_lun_ent_t ents[SMRT_MAX_LOGDRV];
+ smrt_report_logical_lun_extent_t extents[SMRT_MAX_LOGDRV];
+ } smrll_data;
+} smrt_report_logical_lun_t;
+
+typedef struct smrt_report_logical_lun_req {
+ uint8_t smrllr_opcode;
+ uint8_t smrllr_extflag;
+ uint8_t smrllr_reserved1[4];
+ uint32_t smrllr_datasize; /* Big Endian */
+ uint8_t smrllr_reserved2;
+ uint8_t smrllr_control;
+} smrt_report_logical_lun_req_t;
+
+typedef struct smrt_report_physical_lun_ent {
+ PhysDevAddr_t srple_addr;
+} smrt_report_physical_lun_ent_t;
+
+/*
+ * This structure represents the 'physical node identifier' extended option for
+ * REPORT PHYSICAL LUNS. This is triggered when the extended flag is set to
+ * 0x1. Note that for SAS devices the 'other physical device info' structure
+ * should always be used instead.
+ */
+typedef struct smrt_report_physical_pnid {
+ uint8_t srpp_node[8];
+ uint8_t srpp_port[8];
+} smrt_report_physical_pnid_t;
+
+/*
+ * This structure represents the 'other physical device info' extended option
+ * for REPORT PHYSICAL LUNS. This is triggered when the extended flag is set
+ * to 0x2.
+ */
+typedef struct smrt_report_physical_opdi {
+ uint8_t srpo_wwid[8];
+ uint8_t srpo_dtype;
+ uint8_t srpo_flags;
+ uint8_t srpo_multilun;
+ uint8_t srpo_paths;
+ uint32_t srpo_iohdl;
+} smrt_report_physical_opdi_t;
+
+typedef struct smrt_report_physical_lun_extent {
+ PhysDevAddr_t srple_addr;
+ union {
+ smrt_report_physical_pnid_t srple_pnid;
+ smrt_report_physical_opdi_t srple_opdi;
+ } srple_extdata;
+} smrt_report_physical_lun_extent_t;
+
+/*
+ * Values that can be ORed together into smrplr_extflag. smrpl_extflag in the
+ * response indicates whether any extended processing was done.
+ */
+#define SMRT_REPORT_PHYSICAL_LUN_EXT_NONE 0x00
+#define SMRT_REPORT_PHYSICAL_LUN_EXT_PNID 0x01
+#define SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI 0x02
+#define SMRT_REPORT_PHYSICAL_LUN_EXT_MASK 0x0f
+#define SMRT_REPORT_PHYSICAL_LUN_CTRL_ONLY (1 << 6)
+#define SMRT_REPORT_PHYSICAL_LUN_ALL_PATHS (1 << 7)
+
+typedef struct smrt_report_physical_lun {
+ uint32_t smrpl_datasize; /* Big Endian */
+ uint8_t smrpl_extflag;
+ uint8_t smrpl_reserved1[3];
+ union {
+ smrt_report_physical_lun_ent_t ents[SMRT_MAX_PHYSDEV];
+ smrt_report_physical_lun_extent_t extents[SMRT_MAX_PHYSDEV];
+ } smrpl_data;
+} smrt_report_physical_lun_t;
+
+
+typedef struct smrt_report_physical_lun_req {
+ uint8_t smrplr_opcode;
+ uint8_t smrplr_extflag;
+ uint8_t smrplr_reserved[1];
+ uint32_t smrplr_datasize; /* Big Endian */
+ uint8_t smrplr_reserved2;
+ uint8_t smrplr_control;
+} smrt_report_physical_lun_req_t;
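+
+/*
+ * Illustrative sketch (hypothetical request construction, not the
+ * driver's actual code; the BE_32() conversion from <sys/byteorder.h>
+ * is an assumption).  Note the big endian size field:
+ *
+ *	smrt_report_physical_lun_req_t req;
+ *	bzero(&req, sizeof (req));
+ *	req.smrplr_opcode = CISS_SCMD_REPORT_PHYSICAL_LUNS;
+ *	req.smrplr_extflag = SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI;
+ *	req.smrplr_datasize = BE_32(sizeof (smrt_report_physical_lun_t));
+ */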
+
+/*
+ * Request structure for the BMIC command IDENTIFY CONTROLLER. This structure
+ * is written into the CDB with the CISS_SCMD_BMIC_READ SCSI opcode. Reserved
+ * fields should be filled with zeroes.
+ */
+typedef struct smrt_identify_controller_req {
+ uint8_t smicr_opcode;
+ uint8_t smicr_lun;
+ uint8_t smicr_reserved1[4];
+ uint8_t smicr_command;
+ uint8_t smicr_reserved2[2];
+ uint8_t smicr_reserved3[1];
+ uint8_t smicr_reserved4[6];
+} smrt_identify_controller_req_t;
+
+/*
+ * Response structure for IDENTIFY CONTROLLER. This structure is used to
+ * interpret the response the controller will write into the data buffer.
+ */
+typedef struct smrt_identify_controller {
+ uint8_t smic_logical_drive_count;
+ uint32_t smic_config_signature;
+ uint8_t smic_firmware_rev[4];
+ uint8_t smic_recovery_rev[4];
+ uint8_t smic_hardware_version;
+ uint8_t smic_bootblock_rev[4];
+
+ /*
+ * These are obsolete for SAS controllers:
+ */
+ uint32_t smic_drive_present_map;
+ uint32_t smic_external_drive_map;
+
+ uint32_t smic_board_id;
+} smrt_identify_controller_t;
+
+/*
+ * Request structure for IDENTIFY PHYSICAL DEVICE. This structure is written
+ * into the CDB with the CISS_SCMD_BMIC_READ SCSI opcode. Reserved fields
+ * should be filled with zeroes. Note, the lower 8 bits of the BMIC ID are in
+ * index1, whereas the upper 8 bits are in index2; however, the controller may
+ * only support 8 bits worth of devices (and this driver does not support that
+ * many devices).
+ */
+typedef struct smrt_identify_physical_drive_req {
+ uint8_t sipdr_opcode;
+ uint8_t sipdr_lun;
+ uint8_t sipdr_bmic_index1;
+ uint8_t sipdr_reserved1[3];
+ uint8_t sipdr_command;
+ uint8_t sipdr_reserved2[2];
+ uint8_t sipdr_bmic_index2;
+ uint8_t sipdr_reserved4[6];
+} smrt_identify_physical_drive_req_t;
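+
+/*
+ * Illustrative sketch (assumed encoding, following the comment above):
+ * splitting a 16-bit BMIC ID across the two index bytes:
+ *
+ *	req.sipdr_bmic_index1 = bmic & 0xff;
+ *	req.sipdr_bmic_index2 = (bmic >> 8) & 0xff;
+ */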
+
+/*
+ * Relevant values for the sipd_more_flags member.
+ */
+#define SMRT_MORE_FLAGS_LOGVOL (1 << 5)
+#define SMRT_MORE_FLAGS_SPARE (1 << 6)
+
+/*
+ * Response structure for IDENTIFY PHYSICAL DEVICE. This structure is used to
+ * describe aspects of a physical drive. Note, not all fields are valid in all
+ * firmware revisions.
+ */
+typedef struct smrt_identify_physical_drive {
+ uint8_t sipd_scsi_bus; /* Invalid for SAS */
+ uint8_t sipd_scsi_id; /* Invalid for SAS */
+ uint16_t sipd_lblk_size;
+ uint32_t sipd_nblocks;
+ uint32_t sipd_rsrvd_blocsk;
+ uint8_t sipd_model[40];
+ uint8_t sipd_serial[40];
+ uint8_t sipd_firmware[8];
+ uint8_t sipd_scsi_inquiry;
+ uint8_t sipd_compaq_stamp;
+ uint8_t sipd_last_failure;
+ uint8_t sipd_flags;
+ uint8_t sipd_more_flags;
+ uint8_t sipd_scsi_lun; /* Invalid for SAS */
+ uint8_t sipd_yet_more_flags;
+ uint8_t sipd_even_more_flags;
+ uint32_t sipd_spi_speed_rules;
+ uint8_t sipd_phys_connector[2];
+ uint8_t sipd_phys_box_on_bus;
+ uint8_t sipd_phys_bay_in_box;
+ uint32_t sipd_rpm;
+ uint8_t sipd_device_type;
+ uint8_t sipd_sata_version;
+ uint64_t sipd_big_nblocks;
+ uint64_t sipd_ris_slba;
+ uint32_t sipd_ris_size;
+ uint8_t sipd_wwid[20];
+ uint8_t sipd_controller_phy_map[32];
+ uint16_t sipd_phy_count;
+ uint8_t sipd_phy_connected_dev_type[256];
+ uint8_t sipd_phy_to_drive_bay[256];
+ uint16_t sipd_phy_to_attached_dev[256];
+ uint8_t sipd_box_index;
+ uint8_t sipd_drive_support;
+ uint16_t sipd_extra_flags;
+ uint8_t sipd_neogiated_link_rate[256];
+ uint8_t sipd_phy_to_phy_map[256];
+ uint8_t sipd_pad[312];
+} smrt_identify_physical_drive_t;
+
+/*
+ * Note that this structure describes the CISS version of the command. There
+ * also exists a BMIC version, but it has a slightly different structure. This
+ * structure is also used for the cancellation request; however, in that case,
+ * the senr_flags field is reserved.
+ */
+typedef struct smrt_event_notify_req {
+ uint8_t senr_opcode;
+ uint8_t senr_subcode;
+ uint8_t senr_reserved1[2];
+ uint32_t senr_flags; /* Big Endian */
+ uint32_t senr_size; /* Big Endian */
+ uint8_t senr_control;
+} smrt_event_notify_req_t;
+
+/*
+ * When receiving event notifications, the buffer must be 512 bytes in size.
+ * We make sure that we always allocate a buffer of this size, even though we
+ * define a structure that is much shorter and only uses the fields that we end
+ * up caring about. This size requirement comes from the specification.
+ */
+#define SMRT_EVENT_NOTIFY_BUFLEN 512
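+
+/*
+ * Illustrative sketch (an assumption about buffer handling): the
+ * receive buffer is allocated at the full, specified size, even though
+ * only the leading smrt_event_notify_t fields are interpreted:
+ *
+ *	void *buf = kmem_zalloc(SMRT_EVENT_NOTIFY_BUFLEN, KM_SLEEP);
+ *	smrt_event_notify_t *sen = buf;
+ */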
+
+#define SMRT_EVENT_CLASS_PROTOCOL 0
+#define SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR 1
+
+#define SMRT_EVENT_CLASS_HOTPLUG 1
+#define SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE 0
+
+#define SMRT_EVENT_CLASS_HWERROR 2
+#define SMRT_EVENT_CLASS_ENVIRONMENT 3
+
+#define SMRT_EVENT_CLASS_PHYS 4
+#define SMRT_EVENT_PHYS_SUBCLASS_STATE 0
+
+#define SMRT_EVENT_CLASS_LOGVOL 5
+
+typedef struct smrt_event_notify {
+ uint32_t sen_timestamp;
+ uint16_t sen_class;
+ uint16_t sen_subclass;
+ uint16_t sen_detail;
+ uint8_t sen_data[64];
+ char sen_message[80];
+ uint32_t sen_tag;
+ uint16_t sen_date;
+ uint16_t sen_year;
+ uint32_t sen_time;
+ uint16_t sen_pre_power_time;
+ LUNAddr_t sen_addr;
+} smrt_event_notify_t;
+
+#pragma pack()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SMRT_SCSI_H */
diff --git a/usr/src/uts/common/sys/scsi/generic/inquiry.h b/usr/src/uts/common/sys/scsi/generic/inquiry.h
index ddfd683169..fcbf00d5dc 100644
--- a/usr/src/uts/common/sys/scsi/generic/inquiry.h
+++ b/usr/src/uts/common/sys/scsi/generic/inquiry.h
@@ -362,7 +362,8 @@ struct scsi_inquiry {
#define DTYPE_NOTPRESENT (DPQ_NEVER | DTYPE_UNKNOWN)
/*
- * Defined Response Data Formats:
+ * Defined Versions for inquiry data. These represent the base version that a
+ * device supports.
*/
#define RDF_LEVEL0 0x00 /* no conformance claim (SCSI-1) */
#define RDF_CCS 0x01 /* Obsolete (pseudo-spec) */
@@ -370,7 +371,8 @@ struct scsi_inquiry {
#define RDF_SCSI_SPC 0x03 /* ANSI INCITS 301-1997 (SPC) */
#define RDF_SCSI_SPC2 0x04 /* ANSI INCITS 351-2001 (SPC-2) */
#define RDF_SCSI_SPC3 0x05 /* ANSI INCITS 408-2005 (SPC-3) */
-#define RDF_SCSI_SPC4 0x06 /* t10 (SPC-4) */
+#define RDF_SCSI_SPC4 0x06 /* ANSI INCITS 513-2015 (SPC-4) */
+#define RDF_SCSI_SPC5 0x07 /* t10 (SPC-5) */
/*
* Defined Target Port Group Select values:
@@ -436,6 +438,7 @@ struct vpd_desc {
#define PM_CAPABLE_SPC2 RDF_SCSI_SPC2
#define PM_CAPABLE_SPC3 RDF_SCSI_SPC3
#define PM_CAPABLE_SPC4 RDF_SCSI_SPC4
+#define PM_CAPABLE_SPC5 RDF_SCSI_SPC5
#define PM_CAPABLE_LOG_MASK 0xffff0000 /* use upper 16 bit to */
/* indicate log specifics */
#define PM_CAPABLE_LOG_SUPPORTED 0x10000 /* Log page 0xE might be */
diff --git a/usr/src/uts/common/sys/scsi/targets/sddef.h b/usr/src/uts/common/sys/scsi/targets/sddef.h
index 57e1e01aec..c4af129a32 100644
--- a/usr/src/uts/common/sys/scsi/targets/sddef.h
+++ b/usr/src/uts/common/sys/scsi/targets/sddef.h
@@ -775,6 +775,12 @@ _NOTE(MUTEX_PROTECTS_DATA(sd_lun::un_fi_mutex,
#define SD_FM_LOG(un) (((struct sd_fm_internal *)\
((un)->un_fm_private))->fm_log_level)
+/*
+ * Version Related Macros
+ */
+#define SD_SCSI_VERS_IS_GE_SPC_4(un) \
+ (SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC4 || \
+ SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC5)
/*
* Values for un_ctype
@@ -1862,6 +1868,10 @@ struct sd_fm_internal {
#define SD_PM_CAPABLE_IS_SPC_4(pm_cap) \
((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4)
+#define SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap) \
+ (((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4) || \
+ ((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC5))
+
#define SD_PM_CAP_LOG_SUPPORTED(pm_cap) \
((pm_cap & PM_CAPABLE_LOG_SUPPORTED) ? TRUE : FALSE)
diff --git a/usr/src/uts/common/sys/sensors.h b/usr/src/uts/common/sys/sensors.h
new file mode 100644
index 0000000000..b9ca9f1f3f
--- /dev/null
+++ b/usr/src/uts/common/sys/sensors.h
@@ -0,0 +1,81 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019, Joyent, Inc.
+ */
+
+#ifndef _SYS_SENSORS_H
+#define _SYS_SENSORS_H
+
+/*
+ * Consolidated sensor ioctls for various parts of the operating system. These
+ * interfaces should not be relied on at all. They are evolving and will change
+ * as we add more to the system for this. This may eventually become a larger
+ * framework, though it's more likely we'll consolidate that in userland.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * List of different possible kinds of sensors.
+ */
+#define SENSOR_KIND_UNKNOWN 0x00
+#define SENSOR_KIND_TEMPERATURE 0x01
+
+/*
+ * List of units that sensors may have.
+ */
+#define SENSOR_UNIT_UNKNOWN 0x00
+#define SENSOR_UNIT_CELSIUS 0x01
+#define SENSOR_UNIT_FAHRENHEIT 0x02
+#define SENSOR_UNIT_KELVIN 0x03
+
+#define SENSOR_IOCTL (('s' << 24) | ('e' << 16) | ('n' << 8))
+
+/*
+ * Ask the sensor what kind of sensor it is.
+ */
+#define SENSOR_IOCTL_TYPE (SENSOR_IOCTL | 0x01)
+
+typedef struct sensor_ioctl_kind {
+ uint64_t sik_kind;
+} sensor_ioctl_kind_t;
+
+/*
+ * Ask the sensor for a temperature measurement. The sensor is responsible for
+ * returning the units it's in. A temperature measurement is broken down into a
+ * signed value and a notion of its granularity. The sit_gran member indicates
+ * the granularity: the number of increments per degree in the temperature
+ * measurement (the sit_temp member). sit_gran is signed, and the sign
+ * indicates whether one needs to multiply or divide by the granularity. For
+ * example, a sit_gran of 10 means that sit_temp is in 10ths of a degree, and
+ * that to get the actual value in degrees one would divide sit_temp by 10.
+ * Conversely, a negative value means one must multiply: a sit_gran of -2
+ * indicates that each increment of sit_temp represents two degrees, so the
+ * temperature in degrees is sit_temp multiplied by two.
+ */
+#define SENSOR_IOCTL_TEMPERATURE (SENSOR_IOCTL | 0x02)
+
+typedef struct sensor_ioctl_temperature {
+ uint32_t sit_unit;
+ int32_t sit_gran;
+ int64_t sit_temp;
+} sensor_ioctl_temperature_t;
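+
+/*
+ * Illustrative sketch (hypothetical consumer, not part of the
+ * framework): converting a reading to whole degrees per the sign
+ * convention described above, discarding any fractional part:
+ *
+ *	int64_t degrees = sit->sit_temp;
+ *	if (sit->sit_gran > 1)
+ *		degrees /= sit->sit_gran;
+ *	else if (sit->sit_gran < -1)
+ *		degrees *= -sit->sit_gran;
+ */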
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SENSORS_H */
diff --git a/usr/src/uts/common/sys/shm.h b/usr/src/uts/common/sys/shm.h
index 0219fc2cf7..8f530afda2 100644
--- a/usr/src/uts/common/sys/shm.h
+++ b/usr/src/uts/common/sys/shm.h
@@ -21,6 +21,7 @@
*/
/*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2016 Joyent, Inc.
*
* Copyright 2003 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
@@ -120,6 +121,10 @@ struct shmid_ds {
#define SHM_LOCK 3 /* Lock segment in core */
#define SHM_UNLOCK 4 /* Unlock segment */
+#if defined(_KERNEL)
+#define SHM_RMID 5 /* Private RMID for lx support */
+#endif
+
#if !defined(_KERNEL)
int shmget(key_t, size_t, int);
int shmids(int *, uint_t, uint_t *);
diff --git a/usr/src/uts/common/sys/shm_impl.h b/usr/src/uts/common/sys/shm_impl.h
index 4d8cdcede5..1eae2ca0a4 100644
--- a/usr/src/uts/common/sys/shm_impl.h
+++ b/usr/src/uts/common/sys/shm_impl.h
@@ -21,13 +21,12 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#ifndef _SYS_SHM_IMPL_H
#define _SYS_SHM_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/ipc_impl.h>
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <sys/shm.h>
@@ -70,7 +69,11 @@ typedef struct kshmid {
time_t shm_ctime; /* last change time */
struct sptinfo *shm_sptinfo; /* info about ISM segment */
struct seg *shm_sptseg; /* pointer to ISM segment */
- long shm_sptprot; /* was reserved (still a "long") */
+ ulong_t shm_opts;
+ /*
+ * Composed of: sptprot (uchar_t) and
+ * RM_PENDING flag (1 bit).
+ */
} kshmid_t;
/*
@@ -78,6 +81,14 @@ typedef struct kshmid {
*/
#define SHMSA_ISM 1 /* uses shared page table */
+/*
+ * shm_opts definitions
+ * Low byte in shm_opts is used for sptprot (see PROT_ALL). The upper bits are
+ * used for additional options.
+ */
+#define SHM_PROT_MASK 0xff
+#define SHM_RM_PENDING 0x100
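+
+/*
+ * Illustrative sketch (assumed accessor pattern, not actual kernel
+ * code), given a kshmid_t *sp:
+ *
+ *	uchar_t prot = (uchar_t)(sp->shm_opts & SHM_PROT_MASK);
+ *	boolean_t rm_pending = (sp->shm_opts & SHM_RM_PENDING) != 0;
+ */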
+
typedef struct sptinfo {
struct as *sptas; /* dummy as ptr. for spt segment */
} sptinfo_t;
diff --git a/usr/src/uts/common/sys/signal.h b/usr/src/uts/common/sys/signal.h
index aece147bec..b12dff6034 100644
--- a/usr/src/uts/common/sys/signal.h
+++ b/usr/src/uts/common/sys/signal.h
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -158,8 +159,8 @@ struct sigaction32 {
* use of these symbols by applications is injurious
* to binary compatibility
*/
-#define NSIG 74 /* valid signals range from 1 to NSIG-1 */
-#define MAXSIG 73 /* size of u_signal[], NSIG-1 <= MAXSIG */
+#define NSIG 75 /* valid signals range from 1 to NSIG-1 */
+#define MAXSIG 74 /* size of u_signal[], NSIG-1 <= MAXSIG */
#endif /* defined(__EXTENSIONS__) || !defined(_XPG4_2) */
#define MINSIGSTKSZ 2048
diff --git a/usr/src/uts/common/sys/smbios.h b/usr/src/uts/common/sys/smbios.h
index d28141e668..43163a7507 100644
--- a/usr/src/uts/common/sys/smbios.h
+++ b/usr/src/uts/common/sys/smbios.h
@@ -526,6 +526,10 @@ typedef struct smbios_processor {
#define SMB_PRU_LGA36471 0x36 /* LGA3647-1 */
#define SMB_PRU_SP3 0x37 /* socket SP3 */
#define SMB_PRU_SP3r2 0x38 /* socket SP3r2 */
+#define SMB_PRU_LGA2066 0x39 /* Socket LGA2066 */
+#define SMB_PRU_BGA1392 0x3A /* Socket BGA1392 */
+#define SMB_PRU_BGA1510 0x3B /* Socket BGA1510 */
+#define SMB_PRU_BGA1528 0x3C /* Socket BGA1528 */
#define SMB_PRC_RESERVED 0x0001 /* reserved */
#define SMB_PRC_UNKNOWN 0x0002 /* unknown */
@@ -707,6 +711,7 @@ typedef struct smbios_processor {
#define SMB_PRF_ZARCH 0xCC /* z/Architecture */
#define SMB_PRF_CORE_I5 0xCD /* Intel Core i5 */
#define SMB_PRF_CORE_I3 0xCE /* Intel Core i3 */
+#define SMB_PRF_CORE_I9 0xCF /* Intel Core i9 */
#define SMB_PRF_C7M 0xD2 /* VIA C7-M */
#define SMB_PRF_C7D 0xD3 /* VIA C7-D */
#define SMB_PRF_C7 0xD4 /* VIA C7 */
@@ -872,6 +877,7 @@ typedef struct smbios_port {
#define SMB_POC_BNC 0x20 /* BNC */
#define SMB_POC_1394 0x21 /* 1394 */
#define SMB_POC_SATA 0x22 /* SAS/SATA plug receptacle */
+#define SMB_POC_USB_C 0x23 /* USB Type-C receptacle */
#define SMB_POC_PC98 0xA0 /* PC-98 */
#define SMB_POC_PC98HR 0xA1 /* PC-98Hireso */
#define SMB_POC_PCH98 0xA2 /* PC-H98 */
@@ -913,6 +919,8 @@ typedef struct smbios_port {
#define SMB_POT_NETWORK 0x1F /* Network port */
#define SMB_POT_SATA 0x20 /* SATA */
#define SMB_POT_SAS 0x21 /* SAS */
+#define SMB_POT_MFDP 0x22 /* MFDP (Multi-Function Display Port) */
+#define SMB_POT_THUNDERBOLT 0x23 /* Thunderbolt */
#define SMB_POT_8251 0xA0 /* 8251 compatible */
#define SMB_POT_8251F 0xA1 /* 8251 FIFO compatible */
#define SMB_POT_OTHER 0xFF /* other */
@@ -933,6 +941,8 @@ typedef struct smbios_slot {
uint16_t smbl_sg; /* segment group number */
uint8_t smbl_bus; /* bus number */
uint8_t smbl_df; /* device/function number */
+ uint8_t smbl_dbw; /* data bus width */
+ uint8_t smbl_npeers; /* PCIe bifurcation peers */
} smbios_slot_t;
#define SMB_SLT_OTHER 0x01 /* other */
@@ -1036,6 +1046,21 @@ typedef struct smbios_slot {
#define SMB_SLCH2_PME 0x01 /* slot supports PME# signal */
#define SMB_SLCH2_HOTPLUG 0x02 /* slot supports hot-plug devices */
#define SMB_SLCH2_SMBUS 0x04 /* slot supports SMBus signal */
+#define SMB_SLCH2_BIFUR 0x08 /* slot supports PCIe bifurcation */
+
+/*
+ * SMBIOS 7.10.9 Slot Peer Devices
+ *
+ * This structure represents an optional peer device that may be part of an
+ * SMBIOS 3.2 slot.
+ */
+typedef struct smbios_slot_peer {
+ uint16_t smblp_group; /* peer segment group number */
+ uint8_t smblp_bus; /* peer bus number */
+ uint8_t smblp_device; /* peer device number */
+ uint8_t smblp_function; /* peer function number */
+ uint8_t smblp_data_width; /* peer data bus width */
+} smbios_slot_peer_t;
/*
* SMBIOS On-Board Device Information. See DSP0134 Section 7.11 for more
@@ -1189,6 +1214,17 @@ typedef struct smbios_memdevice {
uint16_t smbmd_minvolt; /* minimum voltage */
uint16_t smbmd_maxvolt; /* maximum voltage */
uint16_t smbmd_confvolt; /* configured voltage */
+ uint8_t smbmd_memtech; /* memory technology */
+ uint32_t smbmd_opcap_flags; /* operating mode capability */
+ const char *smbmd_firmware_rev; /* firmware rev */
+ uint16_t smbmd_modmfg_id; /* JEDEC module mfg id */
+ uint16_t smbmd_modprod_id; /* JEDEC module product id */
+ uint16_t smbmd_cntrlmfg_id; /* JEDEC controller mfg id */
+ uint16_t smbmd_cntrlprod_id; /* JEDEC controller prod id */
+ uint64_t smbmd_nvsize; /* non-volatile size in bytes */
+ uint64_t smbmd_volatile_size; /* volatile size in bytes */
+ uint64_t smbmd_cache_size; /* cache size in bytes */
+ uint64_t smbmd_logical_size; /* logical size in bytes */
} smbios_memdevice_t;
#define SMB_MDFF_OTHER 0x01 /* other */
@@ -1234,6 +1270,7 @@ typedef struct smbios_memdevice {
#define SMB_MDT_LPDDR2 0x1C /* LPDDR2 */
#define SMB_MDT_LPDDR3 0x1D /* LPDDR3 */
#define SMB_MDT_LPDDR4 0x1E /* LPDDR4 */
+#define SMB_MDT_LOGNV 0x1F /* Logical non-volatile device */
#define SMB_MDF_OTHER 0x0002 /* other */
#define SMB_MDF_UNKNOWN 0x0004 /* unknown */
@@ -1256,6 +1293,20 @@ typedef struct smbios_memdevice {
#define SMB_MDR_QUAD 0x04 /* quad */
#define SMB_MDR_OCTAL 0x08 /* octal */
+#define SMB_MTECH_OTHER 0x01 /* other */
+#define SMB_MTECH_UNKNOWN 0x02 /* unknown */
+#define SMB_MTECH_DRAM 0x03 /* DRAM */
+#define SMB_MTECH_NVDIMM_N 0x04 /* NVDIMM-N */
+#define SMB_MTECH_NVDIMM_F 0x05 /* NVDIMM-F */
+#define SMB_MTECH_NVDIMM_P 0x06 /* NVDIMM-P */
+#define SMB_MTECH_INTCPM 0x07 /* Intel persistent memory */
+
+#define SMB_MOMC_OTHER 0x01 /* other */
+#define SMB_MOMC_UNKNOWN 0x02 /* unknown */
+#define SMB_MOMC_VOLATILE 0x04 /* Volatile memory */
+#define SMB_MOMC_BYTE_PM 0x08 /* Byte-accessible persistent memory */
+#define SMB_MOMC_BLOCK_PM 0x10 /* Block-accessible persistent memory */
+
/*
* SMBIOS Memory Array Mapped Address. See DSP0134 Section 7.20 for more
* information. We convert start/end addresses into addr/size for convenience.
@@ -1626,7 +1677,8 @@ typedef struct smbios_memdevice_ext {
#define SMB_VERSION_28 0x0208 /* SMBIOS encoding for DMTF spec 2.8 */
#define SMB_VERSION_30 0x0300 /* SMBIOS encoding for DMTF spec 3.0 */
#define SMB_VERSION_31 0x0301 /* SMBIOS encoding for DMTF spec 3.1 */
-#define SMB_VERSION SMB_VERSION_31 /* SMBIOS latest version definitions */
+#define SMB_VERSION_32 0x0302 /* SMBIOS encoding for DMTF spec 3.2 */
+#define SMB_VERSION SMB_VERSION_32 /* SMBIOS latest version definitions */
#define SMB_O_NOCKSUM 0x1 /* do not verify header checksums */
#define SMB_O_NOVERS 0x2 /* do not verify header versions */
@@ -1686,6 +1738,10 @@ extern int smbios_info_cache(smbios_hdl_t *, id_t, smbios_cache_t *);
extern int smbios_info_port(smbios_hdl_t *, id_t, smbios_port_t *);
extern int smbios_info_extport(smbios_hdl_t *, id_t, smbios_port_ext_t *);
extern int smbios_info_slot(smbios_hdl_t *, id_t, smbios_slot_t *);
+extern int smbios_info_slot_peers(smbios_hdl_t *, id_t, uint_t *,
+ smbios_slot_peer_t **);
+extern void smbios_info_slot_peers_free(smbios_hdl_t *, uint_t,
+ smbios_slot_peer_t *);
extern int smbios_info_obdevs(smbios_hdl_t *, id_t, int, smbios_obdev_t *);
extern int smbios_info_obdevs_ext(smbios_hdl_t *, id_t, smbios_obdev_ext_t *);
extern int smbios_info_strtab(smbios_hdl_t *, id_t, int, const char *[]);
@@ -1785,6 +1841,9 @@ extern const char *smbios_memdevice_type_desc(uint_t);
extern const char *smbios_memdevice_flag_name(uint_t);
extern const char *smbios_memdevice_flag_desc(uint_t);
extern const char *smbios_memdevice_rank_desc(uint_t);
+extern const char *smbios_memdevice_memtech_desc(uint_t);
+extern const char *smbios_memdevice_op_capab_name(uint_t);
+extern const char *smbios_memdevice_op_capab_desc(uint_t);
extern const char *smbios_onboard_type_desc(uint_t);
diff --git a/usr/src/uts/common/sys/smbios_impl.h b/usr/src/uts/common/sys/smbios_impl.h
index 66edfb027a..df61892a82 100644
--- a/usr/src/uts/common/sys/smbios_impl.h
+++ b/usr/src/uts/common/sys/smbios_impl.h
@@ -21,7 +21,7 @@
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright (c) 2018, Joyent, Inc.
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -201,8 +201,8 @@ typedef struct smb_cache {
#define SMB_CACHE_SIZE(s) (((s) & 0x8000) ? \
((uint32_t)((s) & 0x7FFF) * 64 * 1024) : ((uint32_t)(s) * 1024))
-#define SMB_CACHE_EXT_SIZE(s) (((s) & 0x80000000U) ? \
- ((uint64_t)((s) & 0x7FFFFFFFULL) * 64ULL * 1024ULL) : \
+#define SMB_CACHE_EXT_SIZE(s) (((s) & 0x80000000U) ? \
+ ((uint64_t)((s) & 0x7FFFFFFFULL) * 64ULL * 1024ULL) : \
((uint64_t)(s) * 1024ULL))
#define SMB_CACHE_CFG_MODE(c) (((c) >> 8) & 3)
@@ -226,6 +226,13 @@ typedef struct smb_port {
/*
* SMBIOS implementation structure for SMB_TYPE_SLOT.
*/
+typedef struct smb_slot_peer {
+ uint16_t smbspb_group_no; /* segment group number */
+ uint8_t smbspb_bus; /* bus number */
+ uint8_t smbspb_df; /* device/function number */
+ uint8_t smbspb_width; /* electrical width */
+} smb_slot_peer_t;
+
typedef struct smb_slot {
smb_header_t smbsl_hdr; /* structure header */
uint8_t smbsl_name; /* reference designation */
@@ -239,6 +246,10 @@ typedef struct smb_slot {
uint16_t smbsl_sg; /* segment group number */
uint8_t smbsl_bus; /* bus number */
uint8_t smbsl_df; /* device/function number */
+ /* Added in SMBIOS 3.2+ */
+ uint8_t smbsl_dbw; /* Data bus width */
+ uint8_t smbsl_npeers; /* Peer bdf groups */
+ smb_slot_peer_t smbsl_peers[]; /* bifurcation peers */
} smb_slot_t;
/*
@@ -343,6 +354,18 @@ typedef struct smb_memdevice {
uint16_t smbmdev_minvolt; /* minimum voltage */
uint16_t smbmdev_maxvolt; /* maximum voltage */
uint16_t smbmdev_confvolt; /* configured voltage */
+ /* Added in SMBIOS 3.2 */
+ uint8_t smbmdev_memtech; /* memory technology */
+ uint16_t smbmdev_opmode; /* memory operating mode capability */
+ uint8_t smbmdev_fwver; /* firmware version */
+ uint16_t smbmdev_modulemfgid; /* module manufacturer ID */
+ uint16_t smbmdev_moduleprodid; /* module product ID */
+ uint16_t smbmdev_memsysmfgid; /* memory controller manufacturer id */
+ uint16_t smbmdev_memsysprodid; /* memory controller product id */
+ uint64_t smbmdev_nvsize; /* non-volatile memory size */
+ uint64_t smbmdev_volsize; /* volatile memory size */
+ uint64_t smbmdev_cachesize; /* cache size */
+ uint64_t smbmdev_logicalsize; /* logical size */
} smb_memdevice_t;
#define SMB_MDS_KBYTES 0x8000 /* size in specified in kilobytes */
@@ -627,7 +650,7 @@ typedef struct smb_struct {
const smb_header_t *smbst_hdr; /* address of raw structure data */
const uchar_t *smbst_str; /* address of string data (if any) */
const uchar_t *smbst_end; /* address of 0x0000 ending tag */
- struct smb_struct *smbst_next; /* next structure in hash chain */
+ struct smb_struct *smbst_next; /* next structure in hash chain */
uint16_t *smbst_strtab; /* string index -> offset table */
uint_t smbst_strtablen; /* length of smbst_strtab */
} smb_struct_t;
@@ -788,6 +811,20 @@ typedef struct smb_base_cache {
uint8_t smbba_flags; /* cache flags (SMB_CAF_*) */
} smb_base_cache_t;
+typedef struct smb_base_slot {
+ const char *smbbl_name; /* reference designation */
+ uint8_t smbbl_type; /* slot type */
+ uint8_t smbbl_width; /* slot data bus width */
+ uint8_t smbbl_usage; /* current usage */
+ uint8_t smbbl_length; /* slot length */
+ uint16_t smbbl_id; /* slot ID */
+ uint8_t smbbl_ch1; /* slot characteristics 1 */
+ uint8_t smbbl_ch2; /* slot characteristics 2 */
+ uint16_t smbbl_sg; /* segment group number */
+ uint8_t smbbl_bus; /* bus number */
+ uint8_t smbbl_df; /* device/function number */
+} smb_base_slot_t;
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h
index 93b0af97e8..d6e13d4823 100644
--- a/usr/src/uts/common/sys/socket.h
+++ b/usr/src/uts/common/sys/socket.h
@@ -22,6 +22,7 @@
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -39,6 +40,9 @@
/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
#ifndef _SYS_SOCKET_H
#define _SYS_SOCKET_H
@@ -204,6 +208,7 @@ struct so_snd_bufinfo {
#define SO_SRCADDR 0x2001 /* Internal: AF_UNIX source address */
#define SO_FILEP 0x2002 /* Internal: AF_UNIX file pointer */
#define SO_UNIX_CLOSE 0x2003 /* Internal: AF_UNIX peer closed */
+#define SO_REUSEPORT 0x2004 /* allow simultaneous port reuse */
#endif /* _KERNEL */
/*
@@ -303,8 +308,9 @@ struct linger {
#define AF_INET_OFFLOAD 30 /* Sun private; do not use */
#define AF_TRILL 31 /* TRILL interface */
#define AF_PACKET 32 /* PF_PACKET Linux socket interface */
+#define AF_LX_NETLINK 33 /* Linux-compatible netlink */
-#define AF_MAX 32
+#define AF_MAX 33
/*
* Protocol families, same as address families for now.
@@ -344,6 +350,7 @@ struct linger {
#define PF_INET_OFFLOAD AF_INET_OFFLOAD /* Sun private; do not use */
#define PF_TRILL AF_TRILL
#define PF_PACKET AF_PACKET
+#define PF_LX_NETLINK AF_LX_NETLINK
#define PF_MAX AF_MAX
@@ -429,6 +436,7 @@ struct msghdr32 {
/* with left over data */
#define MSG_XPG4_2 0x8000 /* Private: XPG4.2 flag */
+/* Obsolete but kept for compilation compatibility. Use IOV_MAX. */
#define MSG_MAXIOVLEN 16
#ifdef _KERNEL
diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h
index ac07bad909..6794b5687b 100644
--- a/usr/src/uts/common/sys/socketvar.h
+++ b/usr/src/uts/common/sys/socketvar.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -102,6 +103,7 @@ struct sockaddr_ux {
typedef struct sonodeops sonodeops_t;
typedef struct sonode sonode_t;
+typedef boolean_t (*so_krecv_f)(sonode_t *, mblk_t *, size_t, int, void *);
struct sodirect_s;
@@ -244,6 +246,10 @@ struct sonode {
struct sof_instance *so_filter_top; /* top of stack */
struct sof_instance *so_filter_bottom; /* bottom of stack */
clock_t so_filter_defertime; /* time when deferred */
+
+ /* Kernel direct receive callbacks */
+ so_krecv_f so_krecv_cb; /* recv callback */
+ void *so_krecv_arg; /* recv cb arg */
};
#define SO_HAVE_DATA(so) \
@@ -297,15 +303,16 @@ struct sonode {
#define SS_OOBPEND 0x00002000 /* OOB pending or present - poll */
#define SS_HAVEOOBDATA 0x00004000 /* OOB data present */
#define SS_HADOOBDATA 0x00008000 /* OOB data consumed */
-#define SS_CLOSING 0x00010000 /* in process of closing */
+#define SS_CLOSING 0x00010000 /* in process of closing */
#define SS_FIL_DEFER 0x00020000 /* filter deferred notification */
#define SS_FILOP_OK 0x00040000 /* socket can attach filters */
#define SS_FIL_RCV_FLOWCTRL 0x00080000 /* filter asserted rcv flow ctrl */
+
#define SS_FIL_SND_FLOWCTRL 0x00100000 /* filter asserted snd flow ctrl */
#define SS_FIL_STOP 0x00200000 /* no more filter actions */
-
#define SS_SODIRECT 0x00400000 /* transport supports sodirect */
+#define SS_FILOP_UNSF 0x00800000 /* block attaching unsafe filters */
#define SS_SENTLASTREADSIG 0x01000000 /* last rx signal has been sent */
#define SS_SENTLASTWRITESIG 0x02000000 /* last tx signal has been sent */
@@ -321,7 +328,8 @@ struct sonode {
/*
* Sockets that can fall back to TPI must ensure that fall back is not
- * initiated while a thread is using a socket.
+ * initiated while a thread is using a socket. Otherwise this disables all
+ * future filter attachment.
*/
#define SO_BLOCK_FALLBACK(so, fn) \
ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \
@@ -337,6 +345,24 @@ struct sonode {
} \
}
+/*
+ * Sockets that can fall back to TPI must ensure that fall back is not
+ * initiated while a thread is using a socket. Otherwise this disables all
+ * future unsafe filter attachment. Safe filters can still attach after
+ * we execute the function in which this macro is used.
+ */
+#define SO_BLOCK_FALLBACK_SAFE(so, fn) \
+ ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \
+ rw_enter(&(so)->so_fallback_rwlock, RW_READER); \
+ if ((so)->so_state & SS_FALLBACK_COMP) { \
+ rw_exit(&(so)->so_fallback_rwlock); \
+ return (fn); \
+ } else if (((so)->so_state & SS_FILOP_UNSF) == 0) { \
+ mutex_enter(&(so)->so_lock); \
+ (so)->so_state |= SS_FILOP_UNSF; \
+ mutex_exit(&(so)->so_lock); \
+ }
+
#define SO_UNBLOCK_FALLBACK(so) { \
rw_exit(&(so)->so_fallback_rwlock); \
}
@@ -368,6 +394,7 @@ struct sonode {
/* The modes below are only for non-streams sockets */
#define SM_ACCEPTSUPP 0x400 /* can handle accept() */
#define SM_SENDFILESUPP 0x800 /* Private: proto supp sendfile */
+#define SM_DEFERERR 0x1000 /* Private: defer so_error delivery */
/*
* Socket versions. Used by the socket library when calling _so_socket().
@@ -946,6 +973,15 @@ extern struct sonode *socreate(struct sockparams *, int, int, int, int,
extern int so_copyin(const void *, void *, size_t, int);
extern int so_copyout(const void *, void *, size_t, int);
+/*
+ * Functions to manipulate the use of direct receive callbacks. This should not
+ * be used outside of sockfs and ksocket. These are generally considered a
+ * use-once interface for a socket and will cause all outstanding data on
+ * the socket to be flushed.
+ */
+extern int so_krecv_set(sonode_t *, so_krecv_f, void *);
+extern void so_krecv_unblock(sonode_t *);
+
#endif
/*
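The new kernel direct receive hooks let a caller-supplied function replace a socket's normal delivery path. A minimal sketch of a consumer follows; my_rcv(), my_state_t, and the flow-control meaning of the boolean return value are assumptions, while so_krecv_set() and so_krecv_unblock() come from the declarations above.

	/*
	 * Sketch: register a direct receive callback on a sonode. Returning
	 * B_FALSE is assumed to assert receive-side flow control until
	 * so_krecv_unblock() is called.
	 */
	static boolean_t
	my_rcv(sonode_t *so, mblk_t *mp, size_t len, int flags, void *arg)
	{
		my_state_t *msp = arg;	/* hypothetical consumer state */

		my_enqueue(msp, mp, len);	/* hypothetical queue routine */
		return (B_TRUE);
	}

		int err = so_krecv_set(so, my_rcv, msp);
		if (err != 0)
			return (err);
		/* ... later, when ready to accept more data ... */
		so_krecv_unblock(so);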
diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h
index 9f6d8b499b..c4dd6539de 100644
--- a/usr/src/uts/common/sys/sockfilter.h
+++ b/usr/src/uts/common/sys/sockfilter.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_SOCKFILTER_H
@@ -129,6 +130,15 @@ typedef struct sof_ops {
#define SOF_VERSION 1
+/*
+ * Flag indicating that the filter module is safe to attach after bind,
+ * getsockname, getsockopt or setsockopt calls. By default filters are unsafe
+ * and may not be attached after any socket operation. However, a safe filter
+ * can still be attached after one of the above calls. This makes attaching
+ * the filter less dependent on the initial socket setup order.
+ */
+#define SOF_ATT_SAFE 0x1
+
extern int sof_register(int, const char *, const sof_ops_t *, int);
extern int sof_unregister(const char *);
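Registering a filter as attach-safe is a one-flag change. A hedged sketch follows; the "myfilter" name and myfilter_ops contents are hypothetical, while SOF_VERSION, SOF_ATT_SAFE, and sof_register() come from this header.

	static const sof_ops_t myfilter_ops = {
		/* ... filter entry points elided ... */
	};

	int
	myfilter_attach(void)
	{
		/* Safe to attach after bind/getsockname/get-setsockopt. */
		return (sof_register(SOF_VERSION, "myfilter", &myfilter_ops,
		    SOF_ATT_SAFE));
	}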
diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h
index f1bd429815..89b355970e 100644
--- a/usr/src/uts/common/sys/squeue.h
+++ b/usr/src/uts/common/sys/squeue.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
#ifndef _SYS_SQUEUE_H
@@ -29,6 +30,17 @@
extern "C" {
#endif
+/*
+ * Originally in illumos, we had an IP-centric view of the serialization queue
+ * abstraction. While that has useful properties, the implementation of squeues
+ * hardcodes various parts of IP into it, which makes it unsuitable for other
+ * consumers. To enable them, we created another interface, but opted not to
+ * port all of the functionality that IP uses in the form of ip_squeue.c. As
+ * other consumers come to need the functionality that IP has in squeues,
+ * we'll come up with more generic methods and add that functionality to
+ * <sys/gsqueue.h>. Please do not continue to use this header.
+ */
+
#include <sys/types.h>
#include <sys/processor.h>
#include <sys/stream.h>
@@ -76,16 +88,17 @@ typedef enum {
struct ip_recv_attr_s;
extern void squeue_init(void);
-extern squeue_t *squeue_create(clock_t, pri_t);
+extern squeue_t *squeue_create(pri_t, boolean_t);
extern void squeue_bind(squeue_t *, processorid_t);
extern void squeue_unbind(squeue_t *);
extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *,
uint32_t, struct ip_recv_attr_s *, int, uint8_t);
extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t);
+extern void squeue_destroy(squeue_t *);
struct conn_s;
extern int squeue_synch_enter(struct conn_s *, mblk_t *);
-extern void squeue_synch_exit(struct conn_s *);
+extern void squeue_synch_exit(struct conn_s *, int);
#ifdef __cplusplus
}
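The signature changes above decouple squeue creation from IP's tick-based wait behavior and add an explicit teardown path. A hedged sketch of the new pairing, assuming (per the sq_isip field in <sys/squeue_impl.h>) that the boolean_t selects the IP-centric feature set:

	squeue_t *sqp;

	sqp = squeue_create(minclsyspri, B_FALSE);	/* non-IP consumer */
	/* ... deliver work to the squeue with squeue_enter() ... */
	squeue_destroy(sqp);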
diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h
index 22550886eb..2bb717fb52 100644
--- a/usr/src/uts/common/sys/squeue_impl.h
+++ b/usr/src/uts/common/sys/squeue_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#ifndef _SYS_SQUEUE_IMPL_H
@@ -84,7 +85,6 @@ typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t,
struct ip_recv_attr_s *, int, uint8_t);
typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t);
-extern void squeue_worker_wakeup(squeue_t *);
extern int ip_squeue_flag;
struct squeue_s {
@@ -99,14 +99,11 @@ struct squeue_s {
ill_rx_ring_t *sq_rx_ring; /* The Rx ring tied to this sq */
ill_t *sq_ill; /* The ill this squeue is tied to */
- clock_t sq_curr_time; /* Current tick (lbolt) */
+ hrtime_t sq_awoken; /* time of worker wake req */
kcondvar_t sq_worker_cv; /* cond var. worker thread blocks on */
kcondvar_t sq_poll_cv; /* cond variable poll_thr waits on */
kcondvar_t sq_synch_cv; /* cond var. synch thread waits on */
kcondvar_t sq_ctrlop_done_cv; /* cond variable for ctrl ops */
- clock_t sq_wait; /* lbolts to wait after a fill() */
- timeout_id_t sq_tid; /* timer id of pending timeout() */
- clock_t sq_awaken; /* time async thread was awakened */
processorid_t sq_bind; /* processor to bind to */
kthread_t *sq_worker; /* kernel thread id */
@@ -117,6 +114,7 @@ struct squeue_s {
squeue_set_t *sq_set; /* managed by squeue creator */
pri_t sq_priority; /* squeue thread priority */
+ boolean_t sq_isip; /* use IP-centric features */
/* Keep the debug-only fields at the end of the structure */
#ifdef DEBUG
@@ -140,7 +138,6 @@ struct squeue_s {
#define SQS_USER 0x00000010 /* A non interrupt user */
#define SQS_BOUND 0x00000020 /* Worker thread is bound */
#define SQS_REENTER 0x00000040 /* Re entered thread */
-#define SQS_TMO_PROG 0x00000080 /* Timeout is being set */
#define SQS_POLL_CAPAB 0x00000100 /* Squeue can control interrupts */
#define SQS_ILL_BOUND 0x00000200 /* Squeue bound to an ill */
@@ -165,6 +162,7 @@ struct squeue_s {
#define SQS_POLL_RESTART_DONE 0x01000000
#define SQS_POLL_THR_QUIESCE 0x02000000
#define SQS_PAUSE 0x04000000 /* The squeue has been paused */
+#define SQS_EXIT 0x08000000 /* squeue is being torn down */
#define SQS_WORKER_THR_CONTROL \
(SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP)
diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h
index 4be8d794fc..7488d3dee8 100644
--- a/usr/src/uts/common/sys/stream.h
+++ b/usr/src/uts/common/sys/stream.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc. All rights reserved.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -644,16 +645,13 @@ struct stroptions {
/*
* Structure for rw (read/write) procedure calls. A pointer
* to a struiod_t is passed as a parameter to the rwnext() call.
- *
- * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
- * as there isn't a formal definition of IOV_MAX ???
*/
#define DEF_IOV_MAX 16
struct struiod {
mblk_t *d_mp; /* pointer to mblk (chain) */
uio_t d_uio; /* uio info */
- iovec_t d_iov[DEF_IOV_MAX]; /* iov referenced by uio */
+ iovec_t *d_iov; /* iov referenced by uio */
};
/*
diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h
index ce86badfc1..f3bc1ed407 100644
--- a/usr/src/uts/common/sys/strsubr.h
+++ b/usr/src/uts/common/sys/strsubr.h
@@ -25,6 +25,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_STRSUBR_H
@@ -1239,10 +1240,17 @@ extern void strsignal_nolock(stdata_t *, int, uchar_t);
struct multidata_s;
struct pdesc_s;
+
+/*
+ * Now that NIC drivers are expected to deal only with M_DATA mblks, the
+ * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their
+ * respective mac_hcksum_set and mac_hcksum_get counterparts.
+ */
extern int hcksum_assoc(mblk_t *, struct multidata_s *, struct pdesc_s *,
uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, int);
extern void hcksum_retrieve(mblk_t *, struct multidata_s *, struct pdesc_s *,
uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *);
+
extern void lso_info_set(mblk_t *, uint32_t, uint32_t);
extern void lso_info_cleanup(mblk_t *);
extern unsigned int bcksum(uchar_t *, int, unsigned int);
diff --git a/usr/src/uts/common/sys/sunddi.h b/usr/src/uts/common/sys/sunddi.h
index 1d94c8fd2c..3026dc961a 100644
--- a/usr/src/uts/common/sys/sunddi.h
+++ b/usr/src/uts/common/sys/sunddi.h
@@ -24,6 +24,7 @@
* Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019, Joyent, Inc.
*/
#ifndef _SYS_SUNDDI_H
@@ -202,13 +203,13 @@ extern "C" {
#define DDI_NT_KEYBOARD "ddi_keyboard" /* keyboard device */
-#define DDI_NT_PARALLEL "ddi_parallel" /* parallel port */
+#define DDI_NT_PARALLEL "ddi_parallel" /* parallel port */
#define DDI_NT_PRINTER "ddi_printer" /* printer device */
#define DDI_NT_UGEN "ddi_generic:usb" /* USB generic drv */
-#define DDI_NT_SMP "ddi_sas_smp" /* smp devcies */
+#define DDI_NT_SMP "ddi_sas_smp" /* smp devices */
#define DDI_NT_NEXUS "ddi_ctl:devctl" /* nexus drivers */
@@ -260,6 +261,11 @@ extern "C" {
#define DDI_NT_INTRCTL "ddi_tool_intr" /* tool intr access */
/*
+ * Various device types used for sensors.
+ */
+#define DDI_NT_SENSOR_TEMP_CPU "ddi_sensor:temperature:cpu"
+
+/*
* DDI event definitions
*/
#define EC_DEVFS "EC_devfs" /* Event class devfs */
@@ -839,7 +845,7 @@ ddi_prop_op_nblocks_blksize(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
* allocated by property provider via kmem_alloc. Requester
* is responsible for freeing returned property via kmem_free.
*
- * Arguments:
+ * Arguments:
*
* dev: Input: dev_t of property.
* dip: Input: dev_info_t pointer of child.
@@ -850,7 +856,7 @@ ddi_prop_op_nblocks_blksize(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
* valuep: Output: Addr of callers buffer pointer.
* lengthp:Output: *lengthp will contain prop length on exit.
*
- * Possible Returns:
+ * Possible Returns:
*
* DDI_PROP_SUCCESS: Prop found and returned.
* DDI_PROP_NOT_FOUND: Prop not found
@@ -1585,8 +1591,14 @@ int
ddi_ffs(long mask);
int
+ddi_ffsll(long long mask);
+
+int
ddi_fls(long mask);
+int
+ddi_flsll(long long mask);
+
/*
* The ddi_soft_state* routines comprise generic storage management utilities
* for driver soft state structures. Two types of soft_state indexes are
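A short worked example of the new 64-bit bit-scan entry points, assuming they mirror the 1-based ddi_ffs()/ddi_fls() semantics (a return of 0 means no bit is set):

	long long mask = 0x8010;	/* bits 4 and 15 set */

	int low = ddi_ffsll(mask);	/* 5: lowest set bit is bit 4 */
	int high = ddi_flsll(mask);	/* 16: highest set bit is bit 15 */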
diff --git a/usr/src/uts/common/sys/sysconfig.h b/usr/src/uts/common/sys/sysconfig.h
index 3a68d76ebe..d5b65ef78c 100644
--- a/usr/src/uts/common/sys/sysconfig.h
+++ b/usr/src/uts/common/sys/sysconfig.h
@@ -25,6 +25,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#ifndef _SYS_SYSCONFIG_H
@@ -101,6 +102,8 @@ extern int mach_sysconfig(int);
#define _CONFIG_EPHID_MAX 47 /* maximum ephemeral uid */
+#define _CONFIG_NPROC_NCPU 48 /* NCPU (sometimes > NPROC_MAX) */
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/sysevent.h b/usr/src/uts/common/sys/sysevent.h
index 304745ed08..c2be00ad27 100644
--- a/usr/src/uts/common/sys/sysevent.h
+++ b/usr/src/uts/common/sys/sysevent.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_SYSEVENT_H
@@ -67,10 +68,12 @@ extern "C" {
#define SE_KERN_PID 0
#define SUNW_VENDOR "SUNW"
+#define ILLUMOS_VENDOR "ILLUMOS"
#define SE_USR_PUB "usr:"
#define SE_KERN_PUB "kern:"
#define SUNW_KERN_PUB SUNW_VENDOR ":" SE_KERN_PUB
#define SUNW_USR_PUB SUNW_VENDOR ":" SE_USR_PUB
+#define ILLUMOS_KERN_PUB ILLUMOS_VENDOR ":" SE_KERN_PUB
/*
* Event header and attribute value limits
diff --git a/usr/src/uts/common/sys/sysevent/datalink.h b/usr/src/uts/common/sys/sysevent/datalink.h
new file mode 100644
index 0000000000..592ef5bdde
--- /dev/null
+++ b/usr/src/uts/common/sys/sysevent/datalink.h
@@ -0,0 +1,54 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_SYSEVENT_DATALINK_H
+#define _SYS_SYSEVENT_DATALINK_H
+
+/*
+ * Datalink System Event payloads
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Event schema for EC_DATALINK_LINK_STATE
+ *
+ * Event Class - EC_DATALINK
+ * Event Sub-Class - EC_DATALINK_LINK_STATE
+ *
+ * Attribute Name - DATALINK_EV_LINK_NAME
+ * Attribute Type - SE_DATA_TYPE_STRING
+ * Attribute Value - [Name of the datalink]
+ *
+ * Attribute Name - DATALINK_EV_LINK_ID
+ * Attribute Type - SE_DATA_TYPE_INT32
+ * Attribute Value - [datalink_id_t for the device]
+ *
+ * Attribute Name - DATALINK_EV_ZONE_ID
+ * Attribute Type - SE_DATA_TYPE_INT32
+ * Attribute Value - [zoneid_t of the zone the datalink is in]
+ */
+
+#define DATALINK_EV_LINK_NAME "link"
+#define DATALINK_EV_LINK_ID "linkid"
+#define DATALINK_EV_ZONE_ID "zone"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSEVENT_DATALINK_H */
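For illustration, a hedged sketch of assembling the attribute list for an ESC_DATALINK_LINK_STATE event according to the schema above; the publishing mechanism itself is elided, and linkid/zoneid are assumed to be in scope:

	nvlist_t *nvl;

	VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP));
	VERIFY0(nvlist_add_string(nvl, DATALINK_EV_LINK_NAME, "net0"));
	VERIFY0(nvlist_add_int32(nvl, DATALINK_EV_LINK_ID, linkid));
	VERIFY0(nvlist_add_int32(nvl, DATALINK_EV_ZONE_ID, zoneid));
	/* ... publish the event, then nvlist_free(nvl) ... */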
diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h
index cf6e040ee9..8995ba4aa0 100644
--- a/usr/src/uts/common/sys/sysevent/eventdefs.h
+++ b/usr/src/uts/common/sys/sysevent/eventdefs.h
@@ -212,9 +212,11 @@ extern "C" {
#define ESC_ZFS_HISTORY_EVENT "ESC_ZFS_history_event"
/*
- * datalink subclass definitions.
+ * datalink subclass definitions. Supporting attributes for datalink state are
+ * found in sys/sysevent/datalink.h.
*/
#define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */
+#define ESC_DATALINK_LINK_STATE "ESC_datalink_link_state" /* link state */
/*
 * VRRP subclass definitions. Supporting attributes (name/value pairs) are
diff --git a/usr/src/uts/common/sys/systrace.h b/usr/src/uts/common/sys/systrace.h
index d43974451e..17e509d4d8 100644
--- a/usr/src/uts/common/sys/systrace.h
+++ b/usr/src/uts/common/sys/systrace.h
@@ -22,13 +22,12 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_SYSTRACE_H
#define _SYS_SYSTRACE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dtrace.h>
#ifdef __cplusplus
@@ -47,16 +46,18 @@ extern systrace_sysent_t *systrace_sysent;
extern systrace_sysent_t *systrace_sysent32;
extern void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t,
- uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
extern void systrace_stub(dtrace_id_t, uintptr_t, uintptr_t,
- uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
extern int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1,
- uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5);
+ uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5,
+ uintptr_t arg6, uintptr_t arg7);
#ifdef _SYSCALL32_IMPL
extern int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1,
- uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5);
+ uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5,
+ uintptr_t arg6, uintptr_t arg7);
#endif
#endif
diff --git a/usr/src/uts/common/sys/termios.h b/usr/src/uts/common/sys/termios.h
index 0c07623ce6..b955e5f3f2 100644
--- a/usr/src/uts/common/sys/termios.h
+++ b/usr/src/uts/common/sys/termios.h
@@ -363,6 +363,24 @@ extern pid_t tcgetsid(int);
#define TCSETSF (_TIOC|16)
/*
+ * Linux terminal ioctls we need to be aware of
+ */
+#define TIOCSETLD (_TIOC|123) /* set line discipline parms */
+#define TIOCGETLD (_TIOC|124) /* get line discipline parms */
+
+/*
+ * On Solaris, VMIN and VTIME overlap with VEOF and VEOL. This is
+ * perfectly legal, except that Linux expects them to be separate, so we
+ * keep them separately.
+ */
+struct lx_cc {
+ unsigned char veof; /* veof value */
+ unsigned char veol; /* veol value */
+ unsigned char vmin; /* vmin value */
+ unsigned char vtime; /* vtime value */
+};
+
+/*
* NTP PPS ioctls
*/
#define TIOCGPPS (_TIOC|125)
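A hedged sketch of how lx emulation code might use the private ioctls above; fd is assumed to be an open tty descriptor and error handling is elided:

	struct lx_cc cc;

	if (ioctl(fd, TIOCGETLD, &cc) == 0) {
		/* cc.vmin/cc.vtime are kept separately from VEOF/VEOL */
		cc.vtime = 0;
		cc.vmin = 1;
		(void) ioctl(fd, TIOCSETLD, &cc);
	}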
diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h
index f9f1d6462b..6cc474f864 100644
--- a/usr/src/uts/common/sys/thread.h
+++ b/usr/src/uts/common/sys/thread.h
@@ -71,7 +71,10 @@ typedef struct ctxop {
void (*exit_op)(void *); /* invoked during {thread,lwp}_exit() */
void (*free_op)(void *, int); /* function which frees the context */
void *arg; /* argument to above functions, ctx pointer */
- struct ctxop *next; /* next context ops */
+ struct ctxop *next; /* next context ops */
+ struct ctxop *prev; /* previous context ops */
+ hrtime_t save_ts; /* timestamp of last save */
+ hrtime_t restore_ts; /* timestamp of last restore */
} ctxop_t;
/*
@@ -351,6 +354,8 @@ typedef struct _kthread {
kmutex_t t_wait_mutex; /* used in CV wait functions */
char *t_name; /* thread name */
+
+ uint64_t t_unsafe; /* unsafe to run with HT VCPU thread */
} kthread_t;
/*
@@ -372,7 +377,7 @@ typedef struct _kthread {
#define T_WOULDBLOCK 0x0020 /* for lockfs */
#define T_DONTBLOCK 0x0040 /* for lockfs */
#define T_DONTPEND 0x0080 /* for lockfs */
-#define T_SYS_PROF 0x0100 /* profiling on for duration of system call */
+#define T_SPLITSTK 0x0100 /* kernel stack is currently split */
#define T_WAITCVSEM 0x0200 /* waiting for a lwp_cv or lwp_sema on sleepq */
#define T_WATCHPT 0x0400 /* thread undergoing a watchpoint emulation */
#define T_PANIC 0x0800 /* thread initiated a system panic */
@@ -401,6 +406,7 @@ typedef struct _kthread {
#define TP_CHANGEBIND 0x1000 /* thread has a new cpu/cpupart binding */
#define TP_ZTHREAD 0x2000 /* this is a kernel thread for a zone */
#define TP_WATCHSTOP 0x4000 /* thread is stopping via holdwatch() */
+#define TP_KTHREAD 0x8000 /* in-kernel worker thread for a process */
/*
* Thread scheduler flag (t_schedflag) definitions.
@@ -413,6 +419,7 @@ typedef struct _kthread {
#define TS_SIGNALLED 0x0010 /* thread was awakened by cv_signal() */
#define TS_PROJWAITQ 0x0020 /* thread is on its project's waitq */
#define TS_ZONEWAITQ 0x0040 /* thread is on its zone's waitq */
+#define TS_VCPU 0x0080 /* thread will enter guest context */
#define TS_CSTART 0x0100 /* setrun() by continuelwps() */
#define TS_UNPAUSE 0x0200 /* setrun() by unpauselwps() */
#define TS_XSTART 0x0400 /* setrun() by SIGCONT */
@@ -420,8 +427,9 @@ typedef struct _kthread {
#define TS_RESUME 0x1000 /* setrun() by CPR resume process */
#define TS_CREATE 0x2000 /* setrun() by syslwp_create() */
#define TS_RUNQMATCH 0x4000 /* exact run queue balancing by setbackdq() */
+#define TS_BSTART 0x8000 /* setrun() by brand */
#define TS_ALLSTART \
- (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE)
+ (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE|TS_BSTART)
#define TS_ANYWAITQ (TS_PROJWAITQ|TS_ZONEWAITQ)
/*
@@ -449,6 +457,10 @@ typedef struct _kthread {
#define ISTOPPED(t) ((t)->t_state == TS_STOPPED && \
!((t)->t_schedflag & TS_PSTART))
+/* True if thread is stopped for a brand-specific reason */
+#define BSTOPPED(t) ((t)->t_state == TS_STOPPED && \
+ !((t)->t_schedflag & TS_BSTART))
+
/* True if thread is asleep and wakeable */
#define ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \
((t)->t_flag & T_WAKEABLE)))
@@ -599,6 +611,7 @@ int thread_setname(kthread_t *, const char *);
int thread_vsetname(kthread_t *, const char *, ...);
extern int default_binding_mode;
+extern int default_stksize;
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h
index 81b4753049..a69bf4dd63 100644
--- a/usr/src/uts/common/sys/time.h
+++ b/usr/src/uts/common/sys/time.h
@@ -15,10 +15,11 @@
* Use is subject to license terms.
*
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_TIME_H
@@ -247,8 +248,8 @@ struct itimerval32 {
#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
-#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC))
-#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC))
+#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC))
+#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC))
#define NSEC2SEC(n) ((n) / (NANOSEC / SEC))
#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC))
@@ -264,6 +265,14 @@ typedef longlong_t hrtime_t;
#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+/*
+ * Unsigned counterpart to hrtime_t
+ */
+typedef u_longlong_t uhrtime_t;
+
+#define HRTIME_MAX LLONG_MAX
+#define UHRTIME_MAX ULLONG_MAX
+
#include <sys/time_impl.h>
#include <sys/mutex.h>
diff --git a/usr/src/uts/common/sys/timer.h b/usr/src/uts/common/sys/timer.h
index ec349c962f..748e0c0627 100644
--- a/usr/src/uts/common/sys/timer.h
+++ b/usr/src/uts/common/sys/timer.h
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#ifndef _SYS_TIMER_H
@@ -34,6 +34,9 @@
#include <sys/types.h>
#include <sys/proc.h>
#include <sys/thread.h>
+#include <sys/param.h>
+#include <sys/siginfo.h>
+#include <sys/port.h>
#ifdef __cplusplus
extern "C" {
@@ -42,7 +45,13 @@ extern "C" {
#ifdef _KERNEL
#define _TIMER_MAX 32
-extern int timer_max; /* patchable via /etc/system */
+/*
+ * Max timers per process. This is patchable via /etc/system and can be
+ * updated via kmdb. Sticking to positive powers of 2 is recommended.
+ */
+extern int timer_max;
+
+#define _TIMER_ALLOC_INIT 8 /* initial size for p_itimer array */
/*
* Bit values for the it_lock field.
@@ -56,6 +65,7 @@ extern int timer_max; /* patchable via /etc/system */
*/
#define IT_SIGNAL 0x01
#define IT_PORT 0x02 /* use event port notification */
+#define IT_CALLBACK 0x04 /* custom callback function */
struct clock_backend;
@@ -83,14 +93,27 @@ struct itimer {
struct clock_backend *it_backend;
void (*it_fire)(itimer_t *);
kmutex_t it_mutex;
- void *it_portev; /* port_kevent_t pointer */
- void *it_portsrc; /* port_source_t pointer */
- int it_portfd; /* port file descriptor */
+ union {
+ struct {
+ void *_it_portev; /* port_kevent_t pointer */
+ void *_it_portsrc; /* port_source_t pointer */
+ int _it_portfd; /* port file descriptor */
+ } _it_ev_port;
+ struct {
+ void (*_it_cb_func)(itimer_t *);
+ uintptr_t _it_cb_data[2];
+ } _it_ev_cb;
+ } _it_ev_data;
};
#define it_sigq __data.__proc.__it_sigq
#define it_lwp __data.__proc.__it_lwp
#define it_frontend __data.__it_frontend
+#define it_portev _it_ev_data._it_ev_port._it_portev
+#define it_portsrc _it_ev_data._it_ev_port._it_portsrc
+#define it_portfd _it_ev_data._it_ev_port._it_portfd
+#define it_cb_func _it_ev_data._it_ev_cb._it_cb_func
+#define it_cb_data _it_ev_data._it_ev_cb._it_cb_data
typedef struct clock_backend {
struct sigevent clk_default;
@@ -107,7 +130,11 @@ typedef struct clock_backend {
extern void clock_add_backend(clockid_t clock, clock_backend_t *backend);
extern clock_backend_t *clock_get_backend(clockid_t clock);
+extern void timer_release(struct proc *, itimer_t *);
+extern void timer_delete_grabbed(struct proc *, timer_t tid, itimer_t *it);
extern void timer_lwpbind();
+extern int timer_setup(clock_backend_t *, struct sigevent *, port_notify_t *,
+ itimer_t **, timer_t *);
extern void timer_func(sigqueue_t *);
extern void timer_exit(void);
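For in-kernel consumers, the IT_CALLBACK path is assumed to work roughly as sketched below: when the flag is set, the firing path invokes it_cb_func instead of posting a signal or port event. my_timer_fire() and my_cookie are hypothetical; only the it_cb_func/it_cb_data accessors come from the diff above.

	static void
	my_timer_fire(itimer_t *it)
	{
		uintptr_t cookie = it->it_cb_data[0];

		/* ... dispatch work keyed by cookie ... */
	}

		it->it_cb_func = my_timer_fire;
		it->it_cb_data[0] = (uintptr_t)my_cookie;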
diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h
index 904b52cac4..75d000b831 100644
--- a/usr/src/uts/common/sys/uadmin.h
+++ b/usr/src/uts/common/sys/uadmin.h
@@ -23,6 +23,7 @@
*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -159,7 +160,7 @@ extern kmutex_t ualock;
extern void mdboot(int, int, char *, boolean_t);
extern void mdpreboot(int, int, char *);
extern int kadmin(int, int, void *, cred_t *);
-extern void killall(zoneid_t);
+extern void killall(zoneid_t, boolean_t);
#endif
extern int uadmin(int, int, uintptr_t);
diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h
index bca1ed1fa3..9584be559f 100644
--- a/usr/src/uts/common/sys/uio.h
+++ b/usr/src/uts/common/sys/uio.h
@@ -145,7 +145,8 @@ typedef struct uioa_s {
*/
typedef enum xuio_type {
UIOTYPE_ASYNCIO,
- UIOTYPE_ZEROCOPY
+ UIOTYPE_ZEROCOPY,
+ UIOTYPE_PEEKSIZE
} xuio_type_t;
typedef struct xuio {
@@ -175,6 +176,15 @@ typedef struct xuio {
int xu_zc_rw; /* read or write buffer */
void *xu_zc_priv; /* fs specific */
} xu_zc;
+
+ /*
+ * Peek Size Support -- facilitate peeking at the size of a
+ * waiting message on a socket.
+ */
+ struct {
+ ssize_t xu_ps_size; /* size of waiting msg */
+ boolean_t xu_ps_set; /* was size calculated? */
+ } xu_ps;
} xu_ext;
} xuio_t;
diff --git a/usr/src/uts/common/sys/usb/clients/hid/hidminor.h b/usr/src/uts/common/sys/usb/clients/hid/hidminor.h
index c96f914a70..f1b209faad 100644
--- a/usr/src/uts/common/sys/usb/clients/hid/hidminor.h
+++ b/usr/src/uts/common/sys/usb/clients/hid/hidminor.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#ifndef _SYS_USB_HIDMINOR_H
@@ -44,21 +44,28 @@ extern "C" {
* transparent.
*
* So we change minor node numbering scheme to be:
- * external node minor num == instance << 1
- * internal node minor num == instance << 1 | 0x1
+ * external node minor num == instance << 9
+ * internal node minor num == instance << 9 | 0x100
* (There are only internal nodes for keyboard/mouse now.)
+ *
+ * The low 8 bits are used for ugen minor numbering (hence the use
+ * of the first bit of the next byte for the "internal" flag).
*/
-#define HID_MINOR_BITS_MASK 0x1
+#define HID_MINOR_BITS_MASK 0x1ff
+#define HID_MINOR_UGEN_BITS_MASK 0xff
#define HID_MINOR_INSTANCE_MASK ~HID_MINOR_BITS_MASK
-#define HID_MINOR_INSTANCE_SHIFT 1
+#define HID_MINOR_INSTANCE_SHIFT 9
-#define HID_MINOR_INTERNAL 0x1
+#define HID_MINOR_INTERNAL 0x100
#define HID_MINOR_MAKE_INTERNAL(minor) \
((minor) | HID_MINOR_INTERNAL)
#define HID_IS_INTERNAL_OPEN(minor) \
(((minor) & HID_MINOR_INTERNAL))
+#define HID_IS_UGEN_OPEN(minor) \
+ (((minor) & HID_MINOR_UGEN_BITS_MASK))
+
#define HID_MINOR_TO_INSTANCE(minor) \
(((minor) & HID_MINOR_INSTANCE_MASK) >> \
HID_MINOR_INSTANCE_SHIFT)
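A worked example of the new numbering: for instance 2, the external node is minor 1024 (2 << 9), the internal node is minor 1280 (1024 | 0x100), and any nonzero low byte indicates a ugen open. A hedged sketch of an open(9E)-side decode:

	minor_t minor = getminor(*devp);
	int instance = HID_MINOR_TO_INSTANCE(minor);

	if (HID_IS_UGEN_OPEN(minor)) {
		/* hand off to the ugen support code */
	} else if (HID_IS_INTERNAL_OPEN(minor)) {
		/* internal (keyboard/mouse) stream */
	}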
diff --git a/usr/src/uts/common/sys/usb/clients/hid/hidvar.h b/usr/src/uts/common/sys/usb/clients/hid/hidvar.h
index e9a25ea894..ee68f0088a 100644
--- a/usr/src/uts/common/sys/usb/clients/hid/hidvar.h
+++ b/usr/src/uts/common/sys/usb/clients/hid/hidvar.h
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#ifndef _SYS_USB_HIDVAR_H
@@ -33,6 +33,7 @@ extern "C" {
#endif
#include <sys/usb/usba/usbai_private.h>
+#include <sys/usb/usba/usba_ugen.h>
/*
* HID : This header file contains the internal structures
@@ -222,6 +223,8 @@ typedef struct hid_state {
queue_t *hid_inuse_rq;
int hid_internal_flag; /* see below */
int hid_external_flag; /* see below */
+
+ usb_ugen_hdl_t hid_ugen_hdl; /* ugen support */
} hid_state_t;
/* warlock directives, stable data */
diff --git a/usr/src/uts/common/sys/usb/usba/bos.h b/usr/src/uts/common/sys/usb/usba/bos.h
new file mode 100644
index 0000000000..417dd1e60c
--- /dev/null
+++ b/usr/src/uts/common/sys/usb/usba/bos.h
@@ -0,0 +1,242 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _SYS_USB_BOS_H
+#define _SYS_USB_BOS_H
+
+/*
+ * This header contains definitions that relate to the USB Binary Object Store.
+ * While this functionality was originally introduced with WUSB, it was used in
+ * USB 3.x as a way to provide additional device related information. This is
+ * currently separate from the primary usbai headers as this functionality is
+ * not currently used by client device drivers themselves, but only by the hub
+ * driver for private functionality.
+ *
+ * This data is all derived from the USB 3.1 specification, Chapter 9.6.2 Binary
+ * Device Object Store (BOS).
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Capability list, see USB 3.1 r1.0, Table 9-14.
+ */
+#define USB_BOS_TYPE_INVALID 0x00 /* Internal, synthetic value */
+#define USB_BOS_TYPE_WUSB 0x01
+#define USB_BOS_TYPE_USB2_EXT 0x02
+#define USB_BOS_TYPE_SUPERSPEED 0x03
+#define USB_BOS_TYPE_CONTAINER 0x04
+#define USB_BOS_TYPE_PLATFORM 0x05
+#define USB_BOS_TYPE_PD_CAP 0x06
+#define USB_BOS_TYPE_BATTERY_INFO 0x07
+#define USB_BOS_TYPE_PD_CONSUMER_CAP 0x08
+#define USB_BOS_TYPE_PD_PRODUCER_CAP 0x09
+#define USB_BOS_TYPE_SUPERSPEED_PLUS 0x0a
+#define USB_BOS_TYPE_PRECISION_TIME 0x0b
+#define USB_BOS_TYPE_WUSB_EXT 0x0c
+
+/*
+ * General Binary Object Store (BOS) descriptor. This is returned at the start
+ * of the BOS tree. See USB 3.1/Table 9-12.
+ */
+typedef struct usb_bos_descr {
+ uint8_t bLength; /* Descriptor size */
+ uint8_t bDescriptorType; /* Set to USB_DESCR_TYPE_BOS */
+ uint16_t wTotalLength; /* Total length */
+ uint8_t bNumDeviceCaps; /* Number of caps that follow */
+} usb_bos_descr_t;
+
+/*
+ * This is the size of the usb_bos_descr_t in terms of packed bytes.
+ */
+#define USB_BOS_PACKED_SIZE 5
+
+/*
+ * This represents a Device Capability Descriptor. bNumDeviceCaps of these
+ * follow the usb_bos_descr_t. This structure is the generic header of each
+ * device capability. Capability specific ones follow this. See USB 3.1/Table
+ * 9-14.
+ */
+typedef struct usb_dev_cap_descr {
+ uint8_t bLength; /* Descriptor size */
+ uint8_t bDescriptorType; /* USB_TYPE_DEV_CAPABILITY */
+ uint8_t bDevCapabilityType; /* USB_BOS_TYPE_* value */
+} usb_dev_cap_descr_t;
+
+#define USB_DEV_CAP_PACKED_SIZE 3
+
+/*
+ * SuperSpeed devices include this descriptor to describe additional
+ * capabilities that they have when operating in USB 2.0 High-Speed mode. See
+ * USB 3.1/9.6.2.1 USB 2.0 Extension.
+ */
+typedef struct usb_bos_usb2ext {
+ uint8_t bLength;
+ uint8_t bDescriptorType;
+ uint8_t bDevCapabilityType;
+ uint32_t bmAttributes; /* Bitfield defined below */
+} usb_bos_usb2ext_t;
+
+#define USB_BOS_USB2EXT_PACKED_SIZE 7
+
+#define USB_BOS_USB2EXT_LPM 0x02
+
+/*
+ * SuperSpeed devices include this descriptor to describe various hardware
+ * attributes related to basic USB 3.0 SuperSpeed functionality. See USB
+ * 3.1/9.6.2.2 SuperSpeed USB Device Capability.
+ */
+typedef struct usb_bos_ssusb {
+ uint8_t bLength;
+ uint8_t bDescriptorType;
+ uint8_t bDevCapabilityType;
+ uint8_t bmAttributes; /* Capability bitfield */
+ uint16_t wSpeedsSupported; /* speed bitmap defined below */
+ uint8_t bFunctionalitySupport; /* Minimum supported speed */
+ uint8_t bU1DevExitLat; /* Exit latency in us */
+ uint16_t bU2DevExitLat; /* Exit latency in us */
+} usb_bos_ssusb_t;
+
+#define USB_BOS_SSUSB_PACKED_SIZE 10
+
+#define USB_BOS_SSUB_CAP_LTM 0x02
+
+#define USB_BOS_SSUSB_SPEED_LOW (1 << 0)
+#define USB_BOS_SSUSB_SPEED_FULL (1 << 1)
+#define USB_BOS_SSUSB_SPEED_HIGH (1 << 2)
+#define USB_BOS_SSUSB_SPEED_SUPER (1 << 3)
+
+/*
+ * This structure is used to indicate a UUID for a given device that could
+ * register on multiple ports. For example, a hub that appears on both a USB 2.x
+ * and USB 3.x port like a hub. This UUID allows one to know that the device is
+ * the same. See USB 3.1/9.6.2.3 Container ID.
+ */
+typedef struct usb_bos_container {
+ uint8_t bLength;
+ uint8_t bDescriptorType;
+ uint8_t bDevCapabilityType;
+ uint8_t bReserved;
+ uint8_t ContainerId[16];
+} usb_bos_container_t;
+
+#define USB_BOS_CONTAINER_PACKED_SIZE 20
+
+/*
+ * This structure is used to indicate a platform-specific capability. For more
+ * information, see USB 3.1/9.6.2.4 Platform Descriptor.
+ */
+typedef struct usb_bos_platform {
+ uint8_t bLength;
+ uint8_t bDescriptorType;
+ uint8_t bDevCapabilityType;
+ uint8_t bReserved;
+ uint8_t PlatformCapabilityUUID[16];
+ uint8_t CapabilityData[];
+} usb_bos_platform_t;
+
+#define USB_BOS_PLATFORM_MIN_PACKED_SIZE 20
+
+/*
+ * This structure is used to indicate capabilities and attributes of a
+ * SuperSpeedPlus link. This describes the USB 3.1+ speed needs and minimum
+ * attributes of the device. See USB 3.1/9.6.2.5 SuperSpeedPlus USB Device
+ * Capability.
+ */
+typedef struct usb_bos_ssplus {
+ uint8_t bLength;
+ uint8_t bDescriptortype;
+ uint8_t bDevCapabilityType;
+ uint8_t bReserved;
+ uint32_t bmAttributes;
+ uint16_t wFunctionalitySupport;
+ uint16_t wReserved;
+ uint32_t bmSublinkSpeedAttr[];
+} usb_bos_ssplus_t;
+
+#define USB_BOS_SSPLUS_MIN_PACKED_SIZE 16
+
+/*
+ * These macros take apart the bmAttributes fields.
+ */
+#define USB_BOS_SSPLUS_NSSAC(x) (((x) & 0xf) + 1)
+#define USB_BOS_SSPLUS_NSSIC(x) ((((x) & 0xf0) >> 4) + 1)
+
+/*
+ * These macros take apart the wFunctionalitySupport member.
+ */
+#define USB_BOS_SSPLUS_MIN_SSAI(x) ((x) & 0x0f)
+#define USB_BOS_SSPLUS_MIN_RX_LANE(x) (((x) >> 8) & 0xf)
+#define USB_BOS_SSPLUS_MIN_TX_LANE(x) (((x) >> 12) & 0xf)
+
+/*
+ * These macros are used to take apart the bmSublinkSpeedAttr members. At
+ * least one of them always exists in each descriptor; however, there
+ * could be more based on the value in NSSAC.
+ */
+#define USB_BOS_SSPLUS_ATTR_SSID(x) ((x) & 0xf)
+#define USB_BOS_SSPLUS_ATTR_LSE(x) (((x) >> 4) & 0x3)
+#define USB_BOS_SSPLUS_ATTR_LSE_BITPS 0
+#define USB_BOS_SSPLUS_ATTR_LSE_KBITPS 1
+#define USB_BOS_SSPLUS_ATTR_LSE_GBITPS 2
+
+/*
+ * These two macros take apart the sublink type. bit 6 indicates whether or not
+ * the links are symmetric or asymmetric. It is asymmetric if the value is set
+ * to one (USB_BOS_SSPLUS_ATTR_ST_ASYM), symmetric otherwise. If it is
+ * asymmetric, then bit 7 indicates whether or not it's a tx or rx link.
+ */
+#define USB_BOS_SSPLUS_ATTR_ST_ASYM (1 << 6)
+#define USB_BOS_SSPLUS_ATTR_ST_TX (1 << 7)
+
+#define USB_BOS_SSPLUS_ATTR_LP(x) (((x) >> 14) & 0x3)
+#define USB_BOS_SSPLUS_ATTR_LP_SS 0x0
+#define USB_BOS_SSPLUS_ATTR_LP_SSPLUS 0x1
+
+#define USB_BOS_SSPLUS_ATTR_LSM(x) ((x) >> 16)
+
+typedef struct usb_bos_precision_time {
+ uint8_t bLength;
+ uint8_t bDescriptorType;
+ uint8_t bDevCapabilityType;
+} usb_bos_precision_time_t;
+
+#define USB_BOS_PRECISION_TIME_PACKED_SIZE 3
+
+/*
+ * This structure serves as an internal, parsed representation of a USB bos
+ * descriptor.
+ */
+typedef struct usb_bos {
+ uint8_t ubos_length;
+ uint8_t ubos_type;
+ union {
+ usb_bos_usb2ext_t ubos_usb2;
+ usb_bos_ssusb_t ubos_ssusb;
+ usb_bos_container_t ubos_container;
+ usb_bos_platform_t ubos_platform;
+ usb_bos_ssplus_t ubos_ssplus;
+ usb_bos_precision_time_t ubos_time;
+ uint8_t ubos_raw[256];
+ } ubos_caps;
+} usb_bos_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_USB_BOS_H */
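To make the SuperSpeedPlus macros concrete, a hedged sketch of walking the variable-length attribute array; the descriptor is assumed to have been length-checked against bLength already:

	static void
	walk_ssplus(const usb_bos_ssplus_t *ssp)
	{
		uint_t i, nssac = USB_BOS_SSPLUS_NSSAC(ssp->bmAttributes);

		for (i = 0; i < nssac; i++) {
			uint32_t attr = ssp->bmSublinkSpeedAttr[i];
			uint_t ssid = USB_BOS_SSPLUS_ATTR_SSID(attr);
			uint_t lse = USB_BOS_SSPLUS_ATTR_LSE(attr);
			uint_t lsm = USB_BOS_SSPLUS_ATTR_LSM(attr);

			/* lse selects the unit (b/s, kb/s, Gb/s) for lsm */
		}
	}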
diff --git a/usr/src/uts/common/sys/usb/usba/usba10.h b/usr/src/uts/common/sys/usb/usba/usba10.h
index 947dd65379..42e78cd35e 100644
--- a/usr/src/uts/common/sys/usb/usba/usba10.h
+++ b/usr/src/uts/common/sys/usb/usba/usba10.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019, Joyent, Inc.
*/
#ifndef _SYS_USB_USBA10_H
@@ -139,7 +140,7 @@ usba10_usb_free_descr_tree(
size_t
usba10_usb_parse_data(
char *format,
- uchar_t *data,
+ const uchar_t *data,
size_t datalen,
void *structure,
size_t structlen);
diff --git a/usr/src/uts/common/sys/usb/usba/usba_impl.h b/usr/src/uts/common/sys/usb/usba/usba_impl.h
index 784bb32d44..ddb6f7346d 100644
--- a/usr/src/uts/common/sys/usb/usba/usba_impl.h
+++ b/usr/src/uts/common/sys/usb/usba/usba_impl.h
@@ -23,6 +23,7 @@
* Use is subject to license terms.
*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2019, Joyent, Inc.
*/
#ifndef _SYS_USB_USBA_USBA_IMPL_H
@@ -34,6 +35,7 @@
#include <sys/usb/usba/hubdi.h>
#include <sys/usb/usba/usba_private.h>
#include <sys/usb/usba/usba_types.h>
+#include <sys/usb/usba/bos.h>
#include <sys/taskq.h>
#include <sys/disp.h>
@@ -301,6 +303,13 @@ void usba_rem_root_hub(dev_info_t *dip);
void usba_get_dev_string_descrs(dev_info_t *, usba_device_t *);
/*
+ * Retrieve the binary object store for the device.
+ */
+void usba_get_binary_object_store(dev_info_t *, usba_device_t *);
+void usba_add_binary_object_store_props(dev_info_t *, usba_device_t *);
+void usba_free_binary_object_store(usba_device_t *);
+
+/*
* Check if we are not in interrupt context and have
* USB_FLAGS_SLEEP flags set.
*/
diff --git a/usr/src/uts/common/sys/usb/usba/usba_private.h b/usr/src/uts/common/sys/usb/usba/usba_private.h
index 4e56e4aa47..406ee3824c 100644
--- a/usr/src/uts/common/sys/usb/usba/usba_private.h
+++ b/usr/src/uts/common/sys/usb/usba/usba_private.h
@@ -23,6 +23,7 @@
* Use is subject to license terms.
*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2019, Joyent, Inc.
*/
#ifndef _SYS_USB_USBA_USBA_PRIVATE_H
@@ -88,21 +89,21 @@ extern "C" {
* extended in a later rev of the spec.
*/
size_t usb_parse_dev_descr(
- uchar_t *buf, /* from GET_DESCRIPTOR(DEVICE) */
+ const uchar_t *buf, /* from GET_DESCRIPTOR(DEVICE) */
size_t buflen,
usb_dev_descr_t *ret_descr,
size_t ret_buf_len);
size_t usb_parse_cfg_descr(
- uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
+ const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t buflen,
usb_cfg_descr_t *ret_descr,
size_t ret_buf_len);
size_t usb_parse_ia_descr(
- uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
+ const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t buflen,
size_t first_if,
usb_ia_descr_t *ret_descr,
@@ -110,7 +111,7 @@ size_t usb_parse_ia_descr(
size_t usb_parse_if_descr(
- uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
+ const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t buflen,
uint_t if_index,
uint_t alt_if_setting,
@@ -123,7 +124,7 @@ size_t usb_parse_if_descr(
* the first endpoint
*/
size_t usb_parse_ep_descr(
- uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
+ const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t buflen,
uint_t if_index,
uint_t alt_if_setting,
@@ -160,7 +161,7 @@ size_t usb_parse_ep_descr(
#define USB_DESCR_TYPE_ANY -1 /* Wild card */
size_t usb_parse_CV_cfg_descr(
- uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
+ const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t buflen,
char *fmt,
uint_t descr_type,
@@ -170,7 +171,7 @@ size_t usb_parse_CV_cfg_descr(
size_t usb_parse_CV_if_descr(
- uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
+ const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t buflen,
char *fmt,
uint_t if_index,
@@ -182,7 +183,7 @@ size_t usb_parse_CV_if_descr(
size_t usb_parse_CV_ep_descr(
- uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
+ const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */
size_t buflen,
char *fmt,
uint_t if_index,
@@ -199,7 +200,7 @@ size_t usb_parse_CV_ep_descr(
*/
size_t usb_parse_CV_descr(
char *format,
- uchar_t *data,
+ const uchar_t *data,
size_t datalen,
void *structure,
size_t structlen);
@@ -270,8 +271,7 @@ typedef enum usba_event {
USBA_EVENT_TAG_HOT_REMOVAL = 0,
USBA_EVENT_TAG_HOT_INSERTION = 1,
USBA_EVENT_TAG_PRE_SUSPEND = 2,
- USBA_EVENT_TAG_POST_RESUME = 3,
- USBA_EVENT_TAG_CPR = -1
+ USBA_EVENT_TAG_POST_RESUME = 3
} usba_event_t;
#define USBA_PRE_SUSPEND_EVENT "SUNW,USBA:USBA_PRE_SUSPEND"
@@ -409,11 +409,11 @@ typedef struct usba_if_pwr_descr {
uint16_t TransitionTimeFromD3; /* D3 -> D0 transition time */
} usba_if_pwr_descr_t;
-size_t usba_parse_cfg_pwr_descr(uchar_t *, size_t, usba_cfg_pwr_descr_t *,
- size_t);
+size_t usba_parse_cfg_pwr_descr(const uchar_t *, size_t, usba_cfg_pwr_descr_t *,
+ size_t);
-size_t usba_parse_if_pwr_descr(uchar_t *, size_t buflen, uint_t,
- uint_t, usba_if_pwr_descr_t *, size_t);
+size_t usba_parse_if_pwr_descr(const uchar_t *, size_t buflen, uint_t,
+ uint_t, usba_if_pwr_descr_t *, size_t);
/*
* Returns (at ret_descr) a null-terminated string. Null termination is
@@ -423,7 +423,7 @@ size_t usba_parse_if_pwr_descr(uchar_t *, size_t buflen, uint_t,
* XXX is this needed when there is usb_get_string_descriptor
* If so, then more comments about how it differs?
*/
-size_t usba_ascii_string_descr(uchar_t *, size_t, char *, size_t);
+size_t usba_ascii_string_descr(const uchar_t *, size_t, char *, size_t);
/*
diff --git a/usr/src/uts/common/sys/usb/usba/usba_types.h b/usr/src/uts/common/sys/usb/usba/usba_types.h
index c99a958c1a..e09bacb860 100644
--- a/usr/src/uts/common/sys/usb/usba/usba_types.h
+++ b/usr/src/uts/common/sys/usb/usba/usba_types.h
@@ -22,6 +22,7 @@
* Use is subject to license terms.
*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2019, Joyent, Inc.
*/
#ifndef _SYS_USB_USBA_USBA_TYPES_H
@@ -31,6 +32,7 @@
#include <sys/taskq.h>
#include <sys/usb/usba/usba_private.h>
#include <sys/usb/usba/usbai_private.h>
+#include <sys/usb/usba/bos.h>
#ifdef __cplusplus
extern "C" {
@@ -241,7 +243,7 @@ typedef struct usb_client_dev_data_list {
} usb_client_dev_data_list_t;
/*
- * This structure uniquely identifies a USB device
+ * This structure uniquely identifies a USB device
* with all interfaces, or just one interface of a USB device.
* usba_device is associated with a devinfo node
*
@@ -363,6 +365,14 @@ typedef struct usba_device {
* Private data for HCD drivers
*/
void *usb_hcd_private;
+
+ /*
+ * Binary Object Store data
+ */
+ mblk_t *usb_bos_mp;
+ uint_t usb_bos_nalloc;
+ uint_t usb_bos_nents;
+ usb_bos_t *usb_bos;
} usba_device_t;
#define USBA_CLIENT_FLAG_SIZE 1
diff --git a/usr/src/uts/common/sys/usb/usbai.h b/usr/src/uts/common/sys/usb/usbai.h
index b37d8f230f..6c90a50b81 100644
--- a/usr/src/uts/common/sys/usb/usbai.h
+++ b/usr/src/uts/common/sys/usb/usbai.h
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _SYS_USB_USBAI_H
@@ -789,7 +789,7 @@ void usb_client_detach(
size_t usb_parse_data(
char *format,
- uchar_t *data,
+ const uchar_t *data,
size_t datalen,
void *structure,
size_t structlen);
diff --git a/usr/src/uts/common/sys/user.h b/usr/src/uts/common/sys/user.h
index 0b997c518c..15b4d0b247 100644
--- a/usr/src/uts/common/sys/user.h
+++ b/usr/src/uts/common/sys/user.h
@@ -82,6 +82,21 @@ extern "C" {
#endif
/*
+ * File Descriptor assignment generation.
+ *
+ * Certain file descriptor consumers (namely epoll) need to be able to detect
+ * when the resource underlying an fd changes due to (re)assignment. Checks
+ * comparing old and new file_t pointers work OK, but could easily be fooled by
+ * an entry freed-to and reused-from the cache. To better detect such
+ * assignments, a generation number is kept in the uf_entry. Whenever a
+ * non-NULL file_t is assigned to the entry, the generation is incremented,
+ * indicating the change. There is a minute possibility that a rollover of the
+ * value could cause assignments to evade detection by consumers, but it is
+ * considered acceptably small.
+ */
+typedef uint_t uf_entry_gen_t;
+
+/*
* Entry in the per-process list of open files.
* Note: only certain fields are copied in flist_grow() and flist_fork().
* This is indicated in brackets in the structure member comments.
@@ -96,11 +111,13 @@ typedef struct uf_entry {
short uf_busy; /* file is allocated [grow, fork] */
kcondvar_t uf_wanted_cv; /* waiting for setf() [never copied] */
kcondvar_t uf_closing_cv; /* waiting for close() [never copied] */
- struct portfd *uf_portfd; /* associated with port [grow] */
+ struct portfd *uf_portfd; /* associated with port [grow] */
+ uf_entry_gen_t uf_gen; /* assigned fd generation [grow,fork] */
/* Avoid false sharing - pad to coherency granularity (64 bytes) */
char uf_pad[64 - sizeof (kmutex_t) - 2 * sizeof (void*) -
2 * sizeof (int) - 2 * sizeof (short) -
- 2 * sizeof (kcondvar_t) - sizeof (struct portfd *)];
+ 2 * sizeof (kcondvar_t) - sizeof (struct portfd *) -
+ sizeof (uf_entry_gen_t)];
} uf_entry_t;
/*
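A hedged sketch of the check an fd-caching consumer such as epoll is now able to make; looking up the uf_entry and the locking protocol around it are elided, and cached_fp/cached_gen are values the consumer recorded when it first saw the fd:

	if (ufp->uf_file != cached_fp || ufp->uf_gen != cached_gen) {
		/* the fd was closed and/or reassigned since we cached it */
	}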
@@ -185,9 +202,9 @@ typedef struct { /* kernel syscall set type */
* This value should not be changed in a patch.
*/
#if defined(__sparc)
-#define __KERN_NAUXV_IMPL 20
+#define __KERN_NAUXV_IMPL 24
#elif defined(__i386) || defined(__amd64)
-#define __KERN_NAUXV_IMPL 25
+#define __KERN_NAUXV_IMPL 28
#endif
struct execsw;
diff --git a/usr/src/uts/common/sys/vm.h b/usr/src/uts/common/sys/vm.h
index a8ca2ad377..0f7dfa9fd0 100644
--- a/usr/src/uts/common/sys/vm.h
+++ b/usr/src/uts/common/sys/vm.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -57,6 +58,8 @@ int queue_io_request(struct vnode *, u_offset_t);
extern kmutex_t memavail_lock;
extern kcondvar_t memavail_cv;
+#define WAKE_PAGEOUT_SCANNER() cv_broadcast(&proc_pageout->p_cv)
+
#endif /* defined(_KERNEL) */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h
index 1aa4a8ee6d..afbf438eff 100644
--- a/usr/src/uts/common/sys/vm_usage.h
+++ b/usr/src/uts/common/sys/vm_usage.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_VM_USAGE_H
@@ -79,8 +80,12 @@ extern "C" {
/* zoneid */
#define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */
/* euser */
+#define VMUSAGE_A_ZONE 0x4000 /* rss/swap for a specified zone */
-#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */
+#define VMUSAGE_MASK 0x7fff /* all valid flags for getvmusage() */
+
+#define VMUSAGE_ZONE_FLAGS (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | \
+ VMUSAGE_A_ZONE)
typedef struct vmusage {
id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */
diff --git a/usr/src/uts/common/sys/vmsystm.h b/usr/src/uts/common/sys/vmsystm.h
index c274bae805..2292310bda 100644
--- a/usr/src/uts/common/sys/vmsystm.h
+++ b/usr/src/uts/common/sys/vmsystm.h
@@ -19,6 +19,9 @@
* CDDL HEADER END
*/
/*
+ * Copyright (c) 2017, Joyent, Inc. All rights reserved.
+ */
+/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -58,6 +61,9 @@ extern pgcnt_t desscan; /* desired pages scanned per second */
extern pgcnt_t slowscan;
extern pgcnt_t fastscan;
extern pgcnt_t pushes; /* number of pages pushed to swap device */
+extern uint64_t low_mem_scan; /* num times page scan due to low memory */
+extern uint64_t zone_cap_scan; /* num times page scan due to zone cap */
+extern uint64_t n_throttle; /* num times page create throttled */
/* writable copies of tunables */
extern pgcnt_t maxpgio; /* max paging i/o per sec before start swaps */
@@ -159,6 +165,8 @@ extern void *boot_virt_alloc(void *addr, size_t size);
extern size_t exec_get_spslew(void);
+extern caddr_t map_userlimit(proc_t *pp, struct as *as, int flags);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/vnd.h b/usr/src/uts/common/sys/vnd.h
new file mode 100644
index 0000000000..bc7c9c3122
--- /dev/null
+++ b/usr/src/uts/common/sys/vnd.h
@@ -0,0 +1,141 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VND_H
+#define _SYS_VND_H
+
+#include <sys/types.h>
+#include <sys/vnd_errno.h>
+#include <sys/frameio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * We distinguish between normal ioctls and private ioctls that we issue to
+ * our STREAMS version. STREAMS ioctls have the upper bit set in the lowest
+ * byte. Note that there are no STREAMS ioctls for userland, and all
+ * definitions related to them are not present in this file.
+ */
+#define VND_IOC (('v' << 24) | ('n' << 16) | ('d' << 8))
+
+/*
+ * Attach the current minor instance to a given dlpi datalink identified by a
+ * vnd_ioc_name_t argument. This fails if it's already been attached. Note that
+ * unlike the other ioctls, this is passed directly as opposed to every other
+ * function which is passed as a pointer to the value.
+ */
+#define VND_IOC_ATTACH (VND_IOC | 0x1)
+
+#define VND_NAMELEN 32
+
+typedef struct vnd_ioc_attach {
+ char via_name[VND_NAMELEN];
+ zoneid_t via_zoneid;
+ uint32_t via_errno;
+} vnd_ioc_attach_t;
+
+/*
+ * Link the current minor instance into the /devices name space.
+ *
+ * This ioctl adds entries into /devices with a name of the form z%d:%s
+ * (vil_zid, vil_name). The device will be namespaced to the zone. The global
+ * zone will be able to see all minor nodes. In the zone, only the /dev
+ * entries will exist. At this time, a given device can only have one link at
+ * a time. Note that a user cannot specify the zone; rather, it is the zone
+ * that the device was attached in.
+ */
+#define VND_IOC_LINK (VND_IOC | 0x2)
+
+typedef struct vnd_ioc_link {
+ char vil_name[VND_NAMELEN];
+ uint32_t vil_errno;
+} vnd_ioc_link_t;
+
+/*
+ * Unlink the opened minor instance from the /devices name space. A zone may
+ * use this to unlink an extant entry in /dev; however, it will not be able
+ * to link it in again.
+ */
+#define VND_IOC_UNLINK (VND_IOC | 0x3)
+typedef struct vnd_ioc_unlink {
+ uint32_t viu_errno;
+} vnd_ioc_unlink_t;
+
+/*
+ * Controls to get and set the current receive and transmit buffer sizes.
+ */
+typedef struct vnd_ioc_buf {
+ uint64_t vib_size;
+ uint32_t vib_filler;
+ uint32_t vib_errno;
+} vnd_ioc_buf_t;
+
+#define VND_IOC_GETRXBUF (VND_IOC | 0x04)
+#define VND_IOC_SETRXBUF (VND_IOC | 0x05)
+#define VND_IOC_GETMAXBUF (VND_IOC | 0x06)
+#define VND_IOC_GETTXBUF (VND_IOC | 0x07)
+#define VND_IOC_SETTXBUF (VND_IOC | 0x08)
+#define VND_IOC_GETMINTU (VND_IOC | 0x09)
+#define VND_IOC_GETMAXTU (VND_IOC | 0x0a)
+
+/*
+ * Information and listing ioctls
+ *
+ * This gets information about all of the active vnd instances. vl_actents is
+ * always updated to the number of active instances, and vl_nents is the
+ * number of vnd_ioc_info_t elements allocated in vl_ents.
+ */
+typedef struct vnd_ioc_info {
+ uint32_t vii_version;
+ zoneid_t vii_zone;
+ char vii_name[VND_NAMELEN];
+ char vii_datalink[VND_NAMELEN];
+} vnd_ioc_info_t;
+
+typedef struct vnd_ioc_list {
+ uint_t vl_nents;
+ uint_t vl_actents;
+ vnd_ioc_info_t *vl_ents;
+} vnd_ioc_list_t;
+
+#ifdef _KERNEL
+
+typedef struct vnd_ioc_list32 {
+ uint_t vl_nents;
+ uint_t vl_actents;
+ caddr32_t vl_ents;
+} vnd_ioc_list32_t;
+
+#endif /* _KERNEL */
+
+#define VND_IOC_LIST (VND_IOC | 0x20)
+
+/*
+ * Framed I/O ioctls
+ *
+ * Users should use the standard frameio_t as opposed to a vnd specific type.
+ * This is a consolidation-private ioctl pending further stability in the form
+ * of specific system work.
+ */
+#define VND_IOC_FRAMEIO_READ (VND_IOC | 0x30)
+#define VND_IOC_FRAMEIO_WRITE (VND_IOC | 0x31)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VND_H */
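As a usage illustration, a hedged sketch of linking an already-attached vnd minor into /dev from userland; fd is assumed to be open on the attached minor, and error handling is abbreviated:

	vnd_ioc_link_t vil;

	(void) memset(&vil, 0, sizeof (vil));
	(void) strlcpy(vil.vil_name, "vnd0", sizeof (vil.vil_name));
	if (ioctl(fd, VND_IOC_LINK, &vil) != 0) {
		/* vil.vil_errno holds the vnd_errno_t-specific failure */
	}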
diff --git a/usr/src/uts/common/sys/vnd_errno.h b/usr/src/uts/common/sys/vnd_errno.h
new file mode 100644
index 0000000000..89e5fc2543
--- /dev/null
+++ b/usr/src/uts/common/sys/vnd_errno.h
@@ -0,0 +1,72 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VND_ERRNO_H
+#define _SYS_VND_ERRNO_H
+
+/*
+ * This header contains all of the available vnd errors.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum vnd_errno {
+ VND_E_SUCCESS = 0, /* no error */
+ VND_E_NOMEM, /* no memory */
+ VND_E_NODATALINK, /* no such datalink */
+ VND_E_NOTETHER, /* not DL_ETHER */
+ VND_E_DLPIINVAL, /* Unknown DLPI failures */
+ VND_E_ATTACHFAIL, /* DL_ATTACH_REQ failed */
+ VND_E_BINDFAIL, /* DL_BIND_REQ failed */
+ VND_E_PROMISCFAIL, /* DL_PROMISCON_REQ failed */
+ VND_E_DIRECTFAIL, /* DLD_CAPAB_DIRECT enable failed */
+ VND_E_CAPACKINVAL, /* bad dl_capability_ack_t */
+ VND_E_SUBCAPINVAL, /* bad dl_capability_sub_t */
+ VND_E_DLDBADVERS, /* bad dld version */
+ VND_E_KSTATCREATE, /* failed to create kstats */
+ VND_E_NODEV, /* no such vnd link */
+ VND_E_NONETSTACK, /* netstack doesn't exist */
+ VND_E_ASSOCIATED, /* device already associated */
+ VND_E_ATTACHED, /* device already attached */
+ VND_E_LINKED, /* device already linked */
+ VND_E_BADNAME, /* invalid name */
+ VND_E_PERM, /* can't touch this */
+ VND_E_NOZONE, /* no such zone */
+ VND_E_STRINIT, /* failed to initialize vnd stream module */
+ VND_E_NOTATTACHED, /* device not attached */
+ VND_E_NOTLINKED, /* device not linked */
+ VND_E_LINKEXISTS, /* another device has the same link name */
+ VND_E_MINORNODE, /* failed to create minor node */
+ VND_E_BUFTOOBIG, /* requested buffer size is too large */
+ VND_E_BUFTOOSMALL, /* requested buffer size is too small */
+ VND_E_DLEXCL, /* unable to get dlpi excl access */
+ VND_E_DIRECTNOTSUP,
+	/* DLD direct capability not supported over data link */
+ VND_E_BADPROPSIZE, /* invalid property size */
+ VND_E_BADPROP, /* invalid property */
+ VND_E_PROPRDONLY, /* property is read only */
+ VND_E_SYS, /* unexpected system error */
+ VND_E_CAPABPASS,
+ /* capabilities invalid, pass-through module detected */
+ VND_E_UNKNOWN /* unknown error */
+} vnd_errno_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VND_ERRNO_H */
diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h
index 7e50091347..4c8d49c621 100644
--- a/usr/src/uts/common/sys/vnic_impl.h
+++ b/usr/src/uts/common/sys/vnic_impl.h
@@ -21,7 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_VNIC_IMPL_H
@@ -64,7 +64,9 @@ typedef struct vnic_s {
mac_notify_handle_t vn_mnh;
uint32_t vn_hcksum_txflags;
+ mac_capab_lso_t vn_cap_lso;
uint32_t vn_mtu;
+ link_state_t vn_ls;
} vnic_t;
#define vn_mch vn_mc_handles[0]
diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h
index 51b4f7af18..b527558895 100644
--- a/usr/src/uts/common/sys/vnode.h
+++ b/usr/src/uts/common/sys/vnode.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright (c) 2018, Joyent, Inc.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2017 RackTop Systems.
*/
@@ -805,12 +805,14 @@ typedef enum vnevent {
VE_RMDIR = 4, /* Remove of directory vnode's name */
VE_CREATE = 5, /* Create with vnode's name which exists */
VE_LINK = 6, /* Link with vnode's name as source */
- VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */
+ VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */
VE_MOUNTEDOVER = 8, /* File or Filesystem got mounted over vnode */
VE_TRUNCATE = 9, /* Truncate */
VE_PRE_RENAME_SRC = 10, /* Pre-rename, with vnode as source */
VE_PRE_RENAME_DEST = 11, /* Pre-rename, with vnode as target/dest. */
- VE_PRE_RENAME_DEST_DIR = 12 /* Pre-rename with vnode as target dir */
+ VE_PRE_RENAME_DEST_DIR = 12, /* Pre-rename with vnode as target dir */
+ VE_RENAME_SRC_DIR = 13, /* Rename with vnode as source dir */
+ VE_RESIZE = 14 /* Resize/truncate to non-zero offset */
} vnevent_t;
/*
@@ -1370,7 +1372,8 @@ void vnevent_remove(vnode_t *, vnode_t *, char *, caller_context_t *);
void vnevent_rmdir(vnode_t *, vnode_t *, char *, caller_context_t *);
void vnevent_create(vnode_t *, caller_context_t *);
void vnevent_link(vnode_t *, caller_context_t *);
-void vnevent_rename_dest_dir(vnode_t *, caller_context_t *ct);
+void vnevent_rename_dest_dir(vnode_t *, vnode_t *, char *,
+ caller_context_t *ct);
void vnevent_mountedover(vnode_t *, caller_context_t *);
void vnevent_truncate(vnode_t *, caller_context_t *);
int vnevent_support(vnode_t *, caller_context_t *);
@@ -1380,6 +1383,7 @@ void vnevent_pre_rename_dest(vnode_t *, vnode_t *, char *,
caller_context_t *);
void vnevent_pre_rename_dest_dir(vnode_t *, vnode_t *, char *,
caller_context_t *);
+void vnevent_resize(vnode_t *, caller_context_t *);
/* Vnode specific data */
void vsd_create(uint_t *, void (*)(void *));
@@ -1482,6 +1486,7 @@ extern struct vnode kvps[];
typedef enum {
KV_KVP, /* vnode for all segkmem pages */
KV_ZVP, /* vnode for all ZFS pages */
+ KV_VVP, /* vnode for all VMM pages */
#if defined(__sparc)
KV_MPVP, /* vnode for all page_t meta-pages */
KV_PROMVP, /* vnode for all PROM pages */
diff --git a/usr/src/uts/common/sys/vxlan.h b/usr/src/uts/common/sys/vxlan.h
new file mode 100644
index 0000000000..d87786b507
--- /dev/null
+++ b/usr/src/uts/common/sys/vxlan.h
@@ -0,0 +1,47 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_VXLAN_H
+#define _SYS_VXLAN_H
+
+/*
+ * Common VXLAN information
+ */
+
+#include <sys/inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Sizes in bytes */
+#define VXLAN_HDR_LEN 8
+#define VXLAN_ID_LEN 3
+
+#define VXLAN_F_VDI 0x08000000
+#define VXLAN_ID_SHIFT 8
+
+#pragma pack(1)
+typedef struct vxlan_hdr {
+ uint32_t vxlan_flags;
+ uint32_t vxlan_id;
+} vxlan_hdr_t;
+#pragma pack()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VXLAN_H */
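
The two words above travel in network byte order; the 24-bit VNI occupies the
upper bits of vxlan_id (hence VXLAN_ID_SHIFT), and VXLAN_F_VDI is the
"VNI present" flag. A minimal userland sketch of encoding and decoding with
these macros (the helper names are illustrative, not part of the header):

	#include <sys/vxlan.h>
	#include <arpa/inet.h>		/* htonl()/ntohl() */

	/* Encode a 24-bit VNI into a wire-format VXLAN header. */
	static void
	vxlan_encode(vxlan_hdr_t *hp, uint32_t vni)
	{
		hp->vxlan_flags = htonl(VXLAN_F_VDI);
		hp->vxlan_id = htonl(vni << VXLAN_ID_SHIFT);
	}

	/* Decode a header; fails if the VNI-present flag is missing. */
	static int
	vxlan_decode(const vxlan_hdr_t *hp, uint32_t *vnip)
	{
		if ((ntohl(hp->vxlan_flags) & VXLAN_F_VDI) == 0)
			return (-1);
		*vnip = ntohl(hp->vxlan_id) >> VXLAN_ID_SHIFT;
		return (0);
	}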
diff --git a/usr/src/uts/common/sys/zfd.h b/usr/src/uts/common/sys/zfd.h
new file mode 100644
index 0000000000..e08d75ecba
--- /dev/null
+++ b/usr/src/uts/common/sys/zfd.h
@@ -0,0 +1,78 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_ZFD_H
+#define _SYS_ZFD_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Minor node name of the global zone side (often called the "master" side)
+ * of the zfd dev.
+ */
+#define ZFD_MASTER_NAME "master"
+
+/*
+ * Minor node name of the non-global zone side (often called the "slave"
+ * side) of the zfd dev.
+ */
+#define ZFD_SLAVE_NAME "slave"
+
+#define ZFD_NAME_LEN 16
+
+/*
+ * ZFD_IOC forms the base for all zfd ioctls.
+ */
+#define ZFD_IOC (('Z' << 24) | ('f' << 16) | ('d' << 8))
+
+/*
+ * This ioctl tells the slave side it should push the TTY stream modules
+ * so that the fd looks like a tty.
+ */
+#define ZFD_MAKETTY (ZFD_IOC | 0)
+
+/*
+ * This ioctl puts a hangup into the stream so that the slave side sees EOF.
+ */
+#define ZFD_EOF (ZFD_IOC | 1)
+
+/*
+ * This ioctl succeeds if the slave side is open.
+ */
+#define ZFD_HAS_SLAVE (ZFD_IOC | 2)
+
+/*
+ * This ioctl links two streams into a multiplexer configuration for in-zone
+ * logging.
+ */
+#define ZFD_MUX (ZFD_IOC | 3)
+
+/*
+ * This ioctl controls the flow control setting for the log multiplexer stream
+ * (1 = true, 0 = false). The default is false which implies teeing into the
+ * log stream is "best-effort" but data will be discarded if the stream
+ * becomes full. If set and the log stream begins to fill up, the primary
+ * stream will stop flowing.
+ */
+#define ZFD_MUX_FLOWCON (ZFD_IOC | 4)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFD_H */
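
A slave-side consumer opens its minor node and drives it with the ioctls
defined above. A hedged sketch (only the ioctl names come from this header;
the device path argument is an assumption about how the minor nodes are
exposed under /dev):

	#include <sys/zfd.h>
	#include <stropts.h>		/* ioctl() */
	#include <fcntl.h>
	#include <unistd.h>

	static int
	zfd_open_as_tty(const char *path)
	{
		/* path names the slave minor node (assumed location) */
		int fd = open(path, O_RDWR);

		if (fd < 0)
			return (-1);
		/* Push the TTY stream modules so the fd acts like a tty. */
		if (ioctl(fd, ZFD_MAKETTY) != 0) {
			(void) close(fd);
			return (-1);
		}
		return (fd);
	}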
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 56fa4b8d87..a4ec347ce4 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -20,9 +20,9 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2018 Joyent, Inc.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright 2018, Joyent, Inc.
*/
#ifndef _SYS_ZONE_H
@@ -43,6 +43,7 @@
#include <sys/secflags.h>
#include <netinet/in.h>
#include <sys/cpu_uarray.h>
+#include <sys/nvpair.h>
#ifdef __cplusplus
extern "C" {
@@ -52,15 +53,27 @@ extern "C" {
* NOTE
*
* The contents of this file are private to the implementation of
- * Solaris and are subject to change at any time without notice.
+ * illumos and are subject to change at any time without notice.
* Applications and drivers using these interfaces may fail to
* run on future releases.
*/
/* Available both in kernel and for user space */
-/* zone id restrictions and special ids */
-#define MAX_ZONEID 9999
+/*
+ * zone id restrictions and special ids.
+ * See 'maxzones' for run-time zone limit.
+ *
+ * The current 8k value for MAX_ZONES was originally derived from the virtual
+ * interface limit in IP when "shared-stack" was the only supported networking
+ * for zones. The virtual interface limit is the number of addresses allowed
+ * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k
+ * zone limit is still a reasonable choice at this time, given other limits
+ * within the kernel. Since we only support 8192 zones (which includes GZ),
+ * there is no point in allowing MAX_ZONEID > 8k.
+ */
+#define MAX_ZONES 8192
+#define MAX_ZONEID (MAX_ZONES - 1)
#define MIN_USERZONEID 1 /* lowest user-creatable zone ID */
#define MIN_ZONEID 0 /* minimum zone ID on system */
#define GLOBAL_ZONEID 0
@@ -99,14 +112,18 @@ extern "C" {
#define ZONE_ATTR_INITNAME 9
#define ZONE_ATTR_BOOTARGS 10
#define ZONE_ATTR_BRAND 11
-#define ZONE_ATTR_PHYS_MCAP 12
-#define ZONE_ATTR_SCHED_CLASS 13
-#define ZONE_ATTR_FLAGS 14
-#define ZONE_ATTR_HOSTID 15
-#define ZONE_ATTR_FS_ALLOWED 16
-#define ZONE_ATTR_NETWORK 17
-#define ZONE_ATTR_INITNORESTART 20
+#define ZONE_ATTR_SCHED_CLASS 12
+#define ZONE_ATTR_FLAGS 13
+#define ZONE_ATTR_HOSTID 14
+#define ZONE_ATTR_FS_ALLOWED 15
+#define ZONE_ATTR_NETWORK 16
+#define ZONE_ATTR_DID 17
+#define ZONE_ATTR_INITNORESTART 18
+#define ZONE_ATTR_APP_SVC_CT 19
+#define ZONE_ATTR_SCHED_FIXEDHI 20
#define ZONE_ATTR_SECFLAGS 21
+#define ZONE_ATTR_INITRESTART0 22
+#define ZONE_ATTR_INITREBOOT 23
/* Start of the brand-specific attribute namespace */
#define ZONE_ATTR_BRAND_ATTRS 32768
@@ -122,13 +139,18 @@ extern "C" {
#define ZONE_EVENT_READY "ready"
#define ZONE_EVENT_RUNNING "running"
#define ZONE_EVENT_SHUTTING_DOWN "shutting_down"
+#define ZONE_EVENT_FREE "free"
#define ZONE_CB_NAME "zonename"
#define ZONE_CB_NEWSTATE "newstate"
#define ZONE_CB_OLDSTATE "oldstate"
+#define ZONE_CB_RESTARTS "restarts"
#define ZONE_CB_TIMESTAMP "when"
#define ZONE_CB_ZONEID "zoneid"
+#define ZONE_EVENT_INIT_CLASS "init"
+#define ZONE_EVENT_INIT_RESTART_SC "restart"
+
/*
* Exit values that may be returned by scripts or programs invoked by various
* zone commands.
@@ -187,6 +209,7 @@ typedef struct {
uint32_t doi; /* DOI for label */
caddr32_t label; /* label associated with zone */
int flags;
+ zoneid_t zoneid; /* requested zoneid */
} zone_def32;
#endif
typedef struct {
@@ -203,6 +226,7 @@ typedef struct {
uint32_t doi; /* DOI for label */
const bslabel_t *label; /* label associated with zone */
int flags;
+ zoneid_t zoneid; /* requested zoneid */
} zone_def;
/* extended error information */
@@ -227,7 +251,8 @@ typedef enum {
ZONE_IS_EMPTY,
ZONE_IS_DOWN,
ZONE_IS_DYING,
- ZONE_IS_DEAD
+ ZONE_IS_DEAD,
+ ZONE_IS_FREE /* transient state for zone sysevent */
} zone_status_t;
#define ZONE_MIN_STATE ZONE_IS_UNINITIALIZED
#define ZONE_MAX_STATE ZONE_IS_DEAD
@@ -247,9 +272,12 @@ typedef enum zone_cmd {
typedef struct zone_cmd_arg {
uint64_t uniqid; /* unique "generation number" */
zone_cmd_t cmd; /* requested action */
- uint32_t _pad; /* need consistent 32/64 bit alignmt */
+ int status; /* init status on shutdown */
+ uint32_t debug; /* enable brand hook debug */
char locale[MAXPATHLEN]; /* locale in which to render messages */
char bootbuf[BOOTARGS_MAX]; /* arguments passed to zone_boot() */
+ /* Needed for 32/64 zoneadm -> zoneadmd door arg size check. */
+ int pad;
} zone_cmd_arg_t;
/*
@@ -384,7 +412,7 @@ typedef struct zone_dataset {
} zone_dataset_t;
/*
- * structure for zone kstats
+ * structure for rctl zone kstats
*/
typedef struct zone_kstat {
kstat_named_t zk_zonename;
@@ -395,12 +423,57 @@ typedef struct zone_kstat {
struct cpucap;
typedef struct {
+ hrtime_t cycle_start;
+ uint_t cycle_cnt;
+ hrtime_t zone_avg_cnt;
+} sys_zio_cntr_t;
+
+typedef struct {
+ kstat_named_t zv_zonename;
+ kstat_named_t zv_nread;
+ kstat_named_t zv_reads;
+ kstat_named_t zv_rtime;
+ kstat_named_t zv_rlentime;
+ kstat_named_t zv_rcnt;
+ kstat_named_t zv_nwritten;
+ kstat_named_t zv_writes;
+ kstat_named_t zv_wtime;
+ kstat_named_t zv_wlentime;
+ kstat_named_t zv_wcnt;
+ kstat_named_t zv_10ms_ops;
+ kstat_named_t zv_100ms_ops;
+ kstat_named_t zv_1s_ops;
+ kstat_named_t zv_10s_ops;
+ kstat_named_t zv_delay_cnt;
+ kstat_named_t zv_delay_time;
+} zone_vfs_kstat_t;
+
+typedef struct {
+ kstat_named_t zz_zonename;
+ kstat_named_t zz_nread;
+ kstat_named_t zz_reads;
+ kstat_named_t zz_rtime;
+ kstat_named_t zz_rlentime;
+ kstat_named_t zz_nwritten;
+ kstat_named_t zz_writes;
+ kstat_named_t zz_waittime;
+} zone_zfs_kstat_t;
+
+typedef struct {
kstat_named_t zm_zonename;
+ kstat_named_t zm_rss;
+ kstat_named_t zm_phys_cap;
+ kstat_named_t zm_swap;
+ kstat_named_t zm_swap_cap;
+ kstat_named_t zm_nover;
+ kstat_named_t zm_pagedout;
kstat_named_t zm_pgpgin;
kstat_named_t zm_anonpgin;
kstat_named_t zm_execpgin;
kstat_named_t zm_fspgin;
kstat_named_t zm_anon_alloc_fail;
+ kstat_named_t zm_pf_throttle;
+ kstat_named_t zm_pf_throttle_usec;
} zone_mcap_kstat_t;
typedef struct {
@@ -415,8 +488,10 @@ typedef struct {
kstat_named_t zm_ffnoproc;
kstat_named_t zm_ffnomem;
kstat_named_t zm_ffmisc;
+ kstat_named_t zm_mfseglim;
kstat_named_t zm_nested_intp;
kstat_named_t zm_init_pid;
+ kstat_named_t zm_init_restarts;
kstat_named_t zm_boot_time;
} zone_misc_kstat_t;
@@ -459,6 +534,7 @@ typedef struct zone {
*/
list_node_t zone_linkage;
zoneid_t zone_id; /* ID of zone */
+ zoneid_t zone_did; /* persistent debug ID of zone */
uint_t zone_ref; /* count of zone_hold()s on zone */
uint_t zone_cred_ref; /* count of zone_hold_cred()s on zone */
/*
@@ -511,10 +587,11 @@ typedef struct zone {
kcondvar_t zone_cv; /* used to signal state changes */
struct proc *zone_zsched; /* Dummy kernel "zsched" process */
pid_t zone_proc_initpid; /* pid of "init" for this zone */
- char *zone_initname; /* fs path to 'init' */
+ uint_t zone_proc_init_restarts; /* times init restarted */
+ char *zone_initname; /* fs path to 'init' */
+ int zone_init_status; /* init's exit status */
int zone_boot_err; /* for zone_boot() if boot fails */
char *zone_bootargs; /* arguments passed via zone_boot() */
- uint64_t zone_phys_mcap; /* physical memory cap */
/*
* zone_kthreads is protected by zone_status_lock.
*/
@@ -552,9 +629,13 @@ typedef struct zone {
tsol_mlp_list_t zone_mlps; /* MLPs on zone-private addresses */
boolean_t zone_restart_init; /* Restart init if it dies? */
+ boolean_t zone_reboot_on_init_exit; /* Reboot if init dies? */
+ boolean_t zone_restart_init_0; /* Restart only if it exits 0 */
+ boolean_t zone_setup_app_contract; /* setup contract? */
struct brand *zone_brand; /* zone's brand */
void *zone_brand_data; /* store brand specific data */
id_t zone_defaultcid; /* dflt scheduling class id */
+ boolean_t zone_fixed_hipri; /* fixed sched. hi prio */
kstat_t *zone_swapresv_kstat;
kstat_t *zone_lockedmem_kstat;
/*
@@ -563,8 +644,24 @@ typedef struct zone {
list_t zone_dl_list;
netstack_t *zone_netstack;
struct cpucap *zone_cpucap; /* CPU caps data */
+
+ /*
+ * kstats and counters for VFS ops and bytes.
+ */
+ kmutex_t zone_vfs_lock; /* protects VFS statistics */
+ kstat_t *zone_vfs_ksp;
+ kstat_io_t zone_vfs_rwstats;
+ zone_vfs_kstat_t *zone_vfs_stats;
+
+ /*
+ * kstats for ZFS I/O ops and bytes.
+ */
+ kmutex_t zone_zfs_lock; /* protects ZFS statistics */
+ kstat_t *zone_zfs_ksp;
+ zone_zfs_kstat_t *zone_zfs_stats;
+
/*
- * Solaris Auditing per-zone audit context
+ * illumos Auditing per-zone audit context
*/
struct au_kcontext *zone_audit_kctxt;
/*
@@ -581,7 +678,11 @@ typedef struct zone {
/* zone_rctls->rcs_lock */
kstat_t *zone_nprocs_kstat;
- kmutex_t zone_mcap_lock; /* protects mcap statistics */
+ /*
+ * kstats and counters for physical memory capping.
+ */
+ kstat_t *zone_physmem_kstat;
+ kmutex_t zone_mcap_lock; /* protects mcap statistics */
kstat_t *zone_mcap_ksp;
zone_mcap_kstat_t *zone_mcap_stats;
uint64_t zone_pgpgin; /* pages paged in */
@@ -606,6 +707,8 @@ typedef struct zone {
uint32_t zone_ffnomem; /* as_dup/memory error */
uint32_t zone_ffmisc; /* misc. other error */
+ uint32_t zone_mfseglim; /* map failure (# segs limit) */
+
uint32_t zone_nested_intp; /* nested interp. kstat */
struct loadavg_s zone_loadavg; /* loadavg for this zone */
@@ -633,6 +736,53 @@ typedef struct zone {
} zone_t;
/*
+ * Data and counters used for ZFS fair-share disk IO.
+ */
+typedef struct zone_zfs_io {
+ uint16_t zpers_zfs_io_pri; /* ZFS IO priority - 16k max */
+ uint_t zpers_zfs_queued[2]; /* sync I/O enqueued count */
+ sys_zio_cntr_t zpers_rd_ops; /* Counters for ZFS reads, */
+ sys_zio_cntr_t zpers_wr_ops; /* writes, and */
+ sys_zio_cntr_t zpers_lwr_ops; /* logical writes. */
+ kstat_io_t zpers_zfs_rwstats;
+ uint64_t zpers_io_util; /* IO utilization metric */
+ uint64_t zpers_zfs_rd_waittime;
+ uint8_t zpers_io_delay; /* IO delay on logical r/w */
+ uint8_t zpers_zfs_weight; /* used to prevent starvation */
+ uint8_t zpers_io_util_above_avg; /* IO util percent > avg. */
+} zone_zfs_io_t;
+
+/*
+ * "Persistent" zone data which can be accessed idependently of the zone_t.
+ */
+typedef struct zone_persist {
+ kmutex_t zpers_zfs_lock; /* Protects zpers_zfsp references */
+ zone_zfs_io_t *zpers_zfsp; /* ZFS fair-share IO data */
+ uint8_t zpers_over; /* currently over cap */
+ uint32_t zpers_pg_cnt; /* current RSS in pages */
+	uint32_t zpers_pg_limit;	/* current RSS limit in pages */
+ uint32_t zpers_nover; /* # of times over phys. cap */
+#ifndef DEBUG
+ uint64_t zpers_pg_out; /* # pages flushed */
+#else
+ /*
+ * To conserve memory, some detailed kstats are only kept for DEBUG
+ * builds.
+ */
+ uint64_t zpers_zfs_rd_waittime;
+
+ uint64_t zpers_pg_anon; /* # clean anon pages flushed */
+ uint64_t zpers_pg_anondirty; /* # dirty anon pages flushed */
+ uint64_t zpers_pg_fs; /* # clean fs pages flushed */
+ uint64_t zpers_pg_fsdirty; /* # dirty fs pages flushed */
+#endif
+} zone_persist_t;
+
+typedef enum zone_pageout_op {
+ ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY
+} zone_pageout_op_t;
+
+/*
* Special value of zone_psetid to indicate that pools are disabled.
*/
#define ZONE_PS_INVAL PS_MYID
@@ -662,6 +812,7 @@ extern zone_t *zone_find_by_name(char *);
extern zone_t *zone_find_by_any_path(const char *, boolean_t);
extern zone_t *zone_find_by_path(const char *);
extern zoneid_t getzoneid(void);
+extern zoneid_t getzonedid(void);
extern zone_t *zone_find_by_id_nolock(zoneid_t);
extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *);
extern int zone_check_datalink(zoneid_t *, datalink_id_t);
@@ -842,6 +993,7 @@ extern int zone_ncpus_online_get(zone_t *);
* Returns true if the named pool/dataset is visible in the current zone.
*/
extern int zone_dataset_visible(const char *, int *);
+extern int zone_dataset_visible_inzone(zone_t *, const char *, int *);
/*
* zone version of kadmin()
@@ -854,10 +1006,25 @@ extern void mount_completed(zone_t *);
extern int zone_walk(int (*)(zone_t *, void *), void *);
+struct page;
+extern void zone_add_page(struct page *);
+extern void zone_rm_page(struct page *);
+extern void zone_pageout_stat(int, zone_pageout_op_t);
+extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *);
+
+/* Interfaces for page scanning */
+extern uint_t zone_num_over_cap;
+extern zone_persist_t zone_pdata[MAX_ZONES];
+
extern rctl_hndl_t rc_zone_locked_mem;
extern rctl_hndl_t rc_zone_max_swap;
+extern rctl_hndl_t rc_zone_phys_mem;
extern rctl_hndl_t rc_zone_max_lofi;
+/* For publishing sysevents related to a particular zone */
+extern void zone_sysevent_publish(zone_t *, const char *, const char *,
+ nvlist_t *);
+
#endif /* _KERNEL */
#ifdef __cplusplus
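
The zone_vfs_kstat_t, zone_zfs_kstat_t, zone_mcap_kstat_t and
zone_misc_kstat_t structures added above are published through the kstat
framework, so they can be read from userland with libkstat (link with
-lkstat). A sketch; the module name "zone_misc" and the instance/name
convention (instance == zoneid, name == zone name) are assumptions about
how the kernel registers these kstats:

	#include <kstat.h>

	static int
	zone_kstat_read_ui32(int zoneid, char *zname, char *stat,
	    uint32_t *valp)
	{
		kstat_ctl_t *kc = kstat_open();
		kstat_t *ksp;
		kstat_named_t *knp;

		if (kc == NULL)
			return (-1);
		ksp = kstat_lookup(kc, "zone_misc", zoneid, zname);
		if (ksp == NULL || kstat_read(kc, ksp, NULL) == -1 ||
		    (knp = kstat_data_lookup(ksp, stat)) == NULL) {
			(void) kstat_close(kc);
			return (-1);
		}
		*valp = knp->value.ui32;
		(void) kstat_close(kc);
		return (0);
	}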
diff --git a/usr/src/uts/common/syscall/brandsys.c b/usr/src/uts/common/syscall/brandsys.c
index 9b4bd38baa..245ef9f14f 100644
--- a/usr/src/uts/common/syscall/brandsys.c
+++ b/usr/src/uts/common/syscall/brandsys.c
@@ -23,7 +23,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
#include <sys/brand.h>
#include <sys/systm.h>
@@ -35,7 +37,7 @@
*/
int64_t
brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
- uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
+ uintptr_t arg4)
{
struct proc *p = curthread->t_procp;
int64_t rval = 0;
@@ -49,7 +51,7 @@ brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
return (set_errno(ENOSYS));
if ((err = ZBROP(p->p_zone)->b_brandsys(cmd, &rval, arg1, arg2, arg3,
- arg4, arg5, arg6)) != 0)
+ arg4)) != 0)
return (set_errno(err));
return (rval);
diff --git a/usr/src/uts/common/syscall/chdir.c b/usr/src/uts/common/syscall/chdir.c
index 84c924f570..deb5532b50 100644
--- a/usr/src/uts/common/syscall/chdir.c
+++ b/usr/src/uts/common/syscall/chdir.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -62,7 +63,7 @@
/*
* Change current working directory (".").
*/
-static int chdirec(vnode_t *, int ischroot, int do_traverse);
+static int chdirec(vnode_t *, boolean_t ischroot, boolean_t do_traverse);
int
chdir(char *fname)
@@ -78,7 +79,7 @@ lookup:
return (set_errno(error));
}
- error = chdirec(vp, 0, 1);
+ error = chdirec(vp, B_FALSE, B_TRUE);
if (error) {
if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
goto lookup;
@@ -102,7 +103,7 @@ fchdir(int fd)
vp = fp->f_vnode;
VN_HOLD(vp);
releasef(fd);
- error = chdirec(vp, 0, 0);
+ error = chdirec(vp, B_FALSE, B_FALSE);
if (error)
return (set_errno(error));
return (0);
@@ -125,7 +126,7 @@ lookup:
return (set_errno(error));
}
- error = chdirec(vp, 1, 1);
+ error = chdirec(vp, B_TRUE, B_TRUE);
if (error) {
if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
goto lookup;
@@ -152,18 +153,18 @@ fchroot(int fd)
vp = fp->f_vnode;
VN_HOLD(vp);
releasef(fd);
- error = chdirec(vp, 1, 0);
+ error = chdirec(vp, B_TRUE, B_FALSE);
if (error)
return (set_errno(error));
return (0);
}
static int
-chdirec(vnode_t *vp, int ischroot, int do_traverse)
+chdirec_common(proc_t *pp, vnode_t *vp, boolean_t ischroot,
+ boolean_t do_traverse)
{
int error;
vnode_t *oldvp;
- proc_t *pp = curproc;
vnode_t **vpp;
refstr_t *cwd;
int newcwd = 1;
@@ -194,7 +195,7 @@ chdirec(vnode_t *vp, int ischroot, int do_traverse)
if (ischroot) {
struct vattr tattr;
struct vattr rattr;
- vnode_t *zonevp = curproc->p_zone->zone_rootvp;
+ vnode_t *zonevp = pp->p_zone->zone_rootvp;
tattr.va_mask = AT_FSID|AT_NODEID;
if (error = VOP_GETATTR(vp, &tattr, 0, CRED(), NULL))
@@ -243,3 +244,15 @@ bad:
VN_RELE(vp);
return (error);
}
+
+int
+chdir_proc(proc_t *pp, vnode_t *vp, boolean_t ischroot, boolean_t do_traverse)
+{
+ return (chdirec_common(pp, vp, ischroot, do_traverse));
+}
+
+static int
+chdirec(vnode_t *vp, boolean_t ischroot, boolean_t do_traverse)
+{
+ return (chdirec_common(curproc, vp, ischroot, do_traverse));
+}
diff --git a/usr/src/uts/common/syscall/fcntl.c b/usr/src/uts/common/syscall/fcntl.c
index 7b787a4acb..b029d92f1b 100644
--- a/usr/src/uts/common/syscall/fcntl.c
+++ b/usr/src/uts/common/syscall/fcntl.c
@@ -54,7 +54,8 @@
#include <sys/cmn_err.h>
-static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
+/* This is global so that it can be used by brand emulation. */
+int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *);
static void fd_too_big(proc_t *);
diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c
index 1ee4b6a395..721f884a7e 100644
--- a/usr/src/uts/common/syscall/memcntl.c
+++ b/usr/src/uts/common/syscall/memcntl.c
@@ -115,13 +115,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
* MS_SYNC used to be defined to be zero but is now non-zero.
* For binary compatibility we still accept zero
* (the absence of MS_ASYNC) to mean the same thing.
+ * Binary compatibility is not an issue for MS_INVALCURPROC.
*/
iarg = (uintptr_t)arg;
if ((iarg & ~MS_INVALIDATE) == 0)
iarg |= MS_SYNC;
- if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
- ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
+ if (((iarg &
+ ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) ||
+ ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) ||
+ ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) ==
+ (MS_INVALIDATE|MS_INVALCURPROC))) {
error = set_errno(EINVAL);
} else {
error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
diff --git a/usr/src/uts/common/syscall/open.c b/usr/src/uts/common/syscall/open.c
index edb04c824b..874e31869c 100644
--- a/usr/src/uts/common/syscall/open.c
+++ b/usr/src/uts/common/syscall/open.c
@@ -74,12 +74,12 @@ copen(int startfd, char *fname, int filemode, int createmode)
if (filemode & (FSEARCH|FEXEC)) {
/*
- * Must be one or the other and neither FREAD nor FWRITE
+ * Must be one or the other.
* Must not be any of FAPPEND FCREAT FTRUNC FXATTR FXATTRDIROPEN
- * XXX: Should these just be silently ignored?
+ * XXX: Should these just be silently ignored like we
+ * silently ignore FREAD|FWRITE?
*/
- if ((filemode & (FREAD|FWRITE)) ||
- (filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) ||
+ if ((filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) ||
(filemode & (FAPPEND|FCREAT|FTRUNC|FXATTR|FXATTRDIROPEN)))
return (set_errno(EINVAL));
}
diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c
index ae34556f14..2214bacaf8 100644
--- a/usr/src/uts/common/syscall/poll.c
+++ b/usr/src/uts/common/syscall/poll.c
@@ -29,7 +29,7 @@
/*
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
- * Copyright 2015, Joyent, Inc.
+ * Copyright (c) 2017, Joyent, Inc.
*/
/*
@@ -317,20 +317,57 @@ polllock(pollhead_t *php, kmutex_t *lp)
return (0);
}
-static int
-poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
+int
+poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds)
+{
+ pollfd_t *pollfdp;
+ nfds_t old_nfds;
+
+ /*
+ * NOTE: for performance, buffers are saved across poll() calls.
+ * The theory is that if a process polls heavily, it tends to poll
+ * on the same set of descriptors. Therefore, we only reallocate
+ * buffers when nfds changes. There is no hysteresis control,
+ * because there is no data to suggest that this is necessary;
+ * the penalty of reallocating is not *that* great in any event.
+ */
+ old_nfds = ps->ps_nfds;
+ if (nfds != old_nfds) {
+ kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
+ pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
+ ps->ps_pollfd = pollfdp;
+ ps->ps_nfds = nfds;
+ }
+
+ pollfdp = ps->ps_pollfd;
+ if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
+ return (EFAULT);
+ }
+
+ if (fds == NULL) {
+ /*
+ * If the process has page 0 mapped, then the copyin() above
+ * will succeed even if fds is NULL. However, our cached
+ * poll lists are keyed by the address of the passed-in fds
+ * structure, and we use the value NULL to indicate an unused
+ * poll cache list entry. As such, we elect not to support
+ * NULL as a valid (user) memory address and fail the poll()
+ * call.
+ */
+ return (EFAULT);
+ }
+ return (0);
+}
+
+int
+poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp,
+ int *fdcnt)
{
kthread_t *t = curthread;
- klwp_t *lwp = ttolwp(t);
- proc_t *p = ttoproc(t);
- int fdcnt = 0;
- int i;
hrtime_t deadline; /* hrtime value when we want to return */
pollfd_t *pollfdp;
- pollstate_t *ps;
pollcache_t *pcp;
int error = 0;
- nfds_t old_nfds;
int cacheindex = 0; /* which cache set is used */
/*
@@ -340,33 +377,34 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
deadline = -1;
} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
deadline = 0;
+ } else if (tsp->tv_sec >= HRTIME_MAX/NANOSEC) {
+ /* Use an indefinite timeout if tv_sec would cause overflow */
+ deadline = -1;
} else {
+ /*
+ * The above check, when combined with the protections offered
+ * by itimerspecfix (ensuring that neither field is negative
+ * and that tv_nsec represents less than a whole second), will
+ * prevent overflow during the conversion from timespec_t to
+ * uhrtime_t.
+ */
+ uhrtime_t utime = tsp->tv_sec * NANOSEC;
+ utime += tsp->tv_nsec;
+
/* They must wait at least a tick. */
- deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
- deadline = MAX(deadline, nsec_per_tick);
- deadline += gethrtime();
- }
+ utime = MAX(utime, nsec_per_tick);
- /*
- * Reset our signal mask, if requested.
- */
- if (ksetp != NULL) {
- mutex_enter(&p->p_lock);
- schedctl_finish_sigblock(t);
- lwp->lwp_sigoldmask = t->t_hold;
- t->t_hold = *ksetp;
- t->t_flag |= T_TOMASK;
/*
- * Call cv_reltimedwait_sig() just to check for signals.
- * We will return immediately with either 0 or -1.
+ * Since utime has an upper bound of HRTIME_MAX, adding the
+ * gethrtime() result cannot incur an overflow as the unsigned
+ * type has an adequate bound.
*/
- if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
- TR_CLOCK_TICK)) {
- mutex_exit(&p->p_lock);
- error = EINTR;
- goto pollout;
+ utime += (uhrtime_t)gethrtime();
+ if (utime > HRTIME_MAX) {
+ deadline = -1;
+ } else {
+ deadline = (hrtime_t)utime;
}
- mutex_exit(&p->p_lock);
}
/*
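
The deadline handling added above reduces to a small, self-contained
conversion; the following sketch mirrors the same clamping logic outside the
kernel (the uhrtime_t typedef and the HRTIME_MAX fallback are assumptions
standing in for the kernel's own definitions):

	#include <sys/time.h>		/* hrtime_t, timespec_t, NANOSEC */
	#include <limits.h>

	typedef unsigned long long uhrtime_t;
	#ifndef HRTIME_MAX
	#define	HRTIME_MAX	((hrtime_t)LLONG_MAX)
	#endif

	/*
	 * Convert a validated timespec into an absolute deadline; -1 means
	 * "wait indefinitely" and is used whenever the arithmetic would
	 * overflow hrtime_t.
	 */
	static hrtime_t
	poll_deadline(const timespec_t *tsp, hrtime_t now, hrtime_t tick)
	{
		uhrtime_t utime;

		if (tsp == NULL)
			return (-1);
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			return (0);
		if (tsp->tv_sec >= HRTIME_MAX / NANOSEC)
			return (-1);	/* tv_sec alone would overflow */

		utime = (uhrtime_t)tsp->tv_sec * NANOSEC + tsp->tv_nsec;
		if (utime < (uhrtime_t)tick)
			utime = (uhrtime_t)tick; /* wait at least a tick */
		utime += (uhrtime_t)now;
		return (utime > HRTIME_MAX ? -1 : (hrtime_t)utime);
	}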
@@ -374,6 +412,7 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
* If yes then bypass all the other stuff and make it sleep.
*/
if (nfds == 0) {
+ *fdcnt = 0;
/*
* Sleep until we have passed the requested future
* time or until interrupted by a signal.
@@ -385,66 +424,14 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
&t->t_delay_lock, deadline)) > 0)
continue;
mutex_exit(&t->t_delay_lock);
- error = (error == 0) ? EINTR : 0;
+ return ((error == 0) ? EINTR : 0);
}
- goto pollout;
- }
-
- if (nfds > p->p_fno_ctl) {
- mutex_enter(&p->p_lock);
- (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
- p->p_rctls, p, RCA_SAFE);
- mutex_exit(&p->p_lock);
- error = EINVAL;
- goto pollout;
- }
-
- /*
- * Need to allocate memory for pollstate before anything because
- * the mutex and cv are created in this space
- */
- ps = pollstate_create();
-
- if (ps->ps_pcache == NULL)
- ps->ps_pcache = pcache_alloc();
- pcp = ps->ps_pcache;
-
- /*
- * NOTE: for performance, buffers are saved across poll() calls.
- * The theory is that if a process polls heavily, it tends to poll
- * on the same set of descriptors. Therefore, we only reallocate
- * buffers when nfds changes. There is no hysteresis control,
- * because there is no data to suggest that this is necessary;
- * the penalty of reallocating is not *that* great in any event.
- */
- old_nfds = ps->ps_nfds;
- if (nfds != old_nfds) {
-
- kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
- pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
- ps->ps_pollfd = pollfdp;
- ps->ps_nfds = nfds;
+ return (0);
}
+ VERIFY(ps != NULL);
pollfdp = ps->ps_pollfd;
- if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
- error = EFAULT;
- goto pollout;
- }
-
- if (fds == NULL) {
- /*
- * If the process has page 0 mapped, then the copyin() above
- * will succeed even if fds is NULL. However, our cached
- * poll lists are keyed by the address of the passed-in fds
- * structure, and we use the value NULL to indicate an unused
- * poll cache list entry. As such, we elect not to support
- * NULL as a valid (user) memory address and fail the poll()
- * call.
- */
- error = EINVAL;
- goto pollout;
- }
+ VERIFY(pollfdp != NULL);
/*
* If this thread polls for the first time, allocate ALL poll
@@ -460,10 +447,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
/*
* poll and cache this poll fd list in ps_pcacheset[0].
*/
- error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
- if (fdcnt || error) {
+ error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex);
+ if (error || *fdcnt) {
mutex_exit(&ps->ps_lock);
- goto pollout;
+ return (error);
}
} else {
pollcacheset_t *pcset = ps->ps_pcacheset;
@@ -488,11 +475,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
* the callee will guarantee the consistency
* of cached poll list and cache content.
*/
- error = pcacheset_resolve(ps, nfds, &fdcnt,
+ error = pcacheset_resolve(ps, nfds, fdcnt,
cacheindex);
if (error) {
mutex_exit(&ps->ps_lock);
- goto pollout;
+ return (error);
}
break;
}
@@ -509,11 +496,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
* found an unused entry. Use it to cache
* this poll list.
*/
- error = pcacheset_cache_list(ps, fds, &fdcnt,
+ error = pcacheset_cache_list(ps, fds, fdcnt,
cacheindex);
- if (fdcnt || error) {
+ if (error || *fdcnt) {
mutex_exit(&ps->ps_lock);
- goto pollout;
+ return (error);
}
break;
}
@@ -527,10 +514,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
cacheindex = pcacheset_replace(ps);
ASSERT(cacheindex < ps->ps_nsets);
pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
- error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
+ error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex);
if (error) {
mutex_exit(&ps->ps_lock);
- goto pollout;
+ return (error);
}
}
}
@@ -548,8 +535,8 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
mutex_enter(&pcp->pc_lock);
for (;;) {
pcp->pc_flag = 0;
- error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
- if (fdcnt || error) {
+ error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex);
+ if (error || *fdcnt) {
mutex_exit(&pcp->pc_lock);
mutex_exit(&ps->ps_lock);
break;
@@ -595,13 +582,116 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
mutex_enter(&pcp->pc_lock);
}
+ return (error);
+}
+
+/*
+ * This is the system call trap that poll(),
+ * select() and pselect() are built upon.
+ * It is a private interface between libc and the kernel.
+ */
+int
+pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ timespec_t ts;
+ timespec_t *tsp;
+ k_sigset_t kset;
+ pollstate_t *ps = NULL;
+ pollfd_t *pollfdp = NULL;
+ int error = 0, fdcnt = 0;
+
+ /*
+ * Copy in timeout
+ */
+ if (timeoutp == NULL) {
+ tsp = NULL;
+ } else {
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(timeoutp, &ts, sizeof (ts)))
+ return (set_errno(EFAULT));
+ } else {
+ timespec32_t ts32;
+
+ if (copyin(timeoutp, &ts32, sizeof (ts32)))
+ return (set_errno(EFAULT));
+ TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+ }
+
+ if (itimerspecfix(&ts))
+ return (set_errno(EINVAL));
+ tsp = &ts;
+ }
+
+ /*
+ * Copy in and reset signal mask, if requested.
+ */
+ if (setp != NULL) {
+ sigset_t set;
+
+ if (copyin(setp, &set, sizeof (set)))
+ return (set_errno(EFAULT));
+ sigutok(&set, &kset);
+
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(t);
+ lwp->lwp_sigoldmask = t->t_hold;
+ t->t_hold = kset;
+ t->t_flag |= T_TOMASK;
+ /*
+ * Call cv_reltimedwait_sig() just to check for signals.
+ * We will return immediately with either 0 or -1.
+ */
+ if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
+ TR_CLOCK_TICK)) {
+ mutex_exit(&p->p_lock);
+ error = EINTR;
+ goto pollout;
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * Initialize pollstate and copy in pollfd data if present.
+ * If nfds == 0, we will skip all of the copying and check steps and
+ * proceed directly into poll_common to process the supplied timeout.
+ */
+ if (nfds != 0) {
+ if (nfds > p->p_fno_ctl) {
+ mutex_enter(&p->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+ p->p_rctls, p, RCA_SAFE);
+ mutex_exit(&p->p_lock);
+ error = EINVAL;
+ goto pollout;
+ }
+
+ /*
+ * Need to allocate memory for pollstate before anything
+ * because the mutex and cv are created in this space
+ */
+ ps = pollstate_create();
+ if (ps->ps_pcache == NULL)
+ ps->ps_pcache = pcache_alloc();
+
+ if ((error = poll_copyin(ps, fds, nfds)) != 0)
+ goto pollout;
+ pollfdp = ps->ps_pollfd;
+ }
+
+ /*
+ * Perform the actual poll.
+ */
+ error = poll_common(ps, fds, nfds, tsp, &fdcnt);
+
pollout:
/*
- * If we changed the signal mask but we received
- * no signal then restore the signal mask.
- * Otherwise psig() will deal with the signal mask.
+ * If we changed the signal mask but we received no signal then restore
+ * the signal mask. Otherwise psig() will deal with the signal mask.
*/
- if (ksetp != NULL) {
+ if (setp != NULL) {
mutex_enter(&p->p_lock);
if (lwp->lwp_cursig == 0) {
t->t_hold = lwp->lwp_sigoldmask;
@@ -612,12 +702,10 @@ pollout:
if (error)
return (set_errno(error));
-
/*
* Copy out the events and return the fdcnt to the user.
*/
- if (nfds != 0 &&
- copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
+ if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
return (set_errno(EFAULT));
#ifdef DEBUG
@@ -625,7 +713,7 @@ pollout:
* Another sanity check:
*/
if (fdcnt) {
- int reventcnt = 0;
+ int i, reventcnt = 0;
for (i = 0; i < nfds; i++) {
if (pollfdp[i].fd < 0) {
@@ -638,6 +726,8 @@ pollout:
}
ASSERT(fdcnt == reventcnt);
} else {
+ int i;
+
for (i = 0; i < nfds; i++) {
ASSERT(pollfdp[i].revents == 0);
}
@@ -648,52 +738,6 @@ pollout:
}
/*
- * This is the system call trap that poll(),
- * select() and pselect() are built upon.
- * It is a private interface between libc and the kernel.
- */
-int
-pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
-{
- timespec_t ts;
- timespec_t *tsp;
- sigset_t set;
- k_sigset_t kset;
- k_sigset_t *ksetp;
- model_t datamodel = get_udatamodel();
-
- if (timeoutp == NULL)
- tsp = NULL;
- else {
- if (datamodel == DATAMODEL_NATIVE) {
- if (copyin(timeoutp, &ts, sizeof (ts)))
- return (set_errno(EFAULT));
- } else {
- timespec32_t ts32;
-
- if (copyin(timeoutp, &ts32, sizeof (ts32)))
- return (set_errno(EFAULT));
- TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
- }
-
- if (itimerspecfix(&ts))
- return (set_errno(EINVAL));
- tsp = &ts;
- }
-
- if (setp == NULL)
- ksetp = NULL;
- else {
- if (copyin(setp, &set, sizeof (set)))
- return (set_errno(EFAULT));
- sigutok(&set, &kset);
- ksetp = &kset;
- }
-
- return (poll_common(fds, nfds, tsp, ksetp));
-}
-
-/*
* Clean up any state left around by poll(2). Called when a thread exits.
*/
void
@@ -1277,8 +1321,8 @@ pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp,
* be OK too.
*/
ASSERT(curthread->t_pollcache == NULL);
- error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents,
- &memphp, NULL);
+ error = VOP_POLL(fp->f_vnode, pollfdp->events | ps->ps_implicit_ev, 0,
+ &pollfdp->revents, &memphp, NULL);
if (error) {
return (error);
}
@@ -1992,7 +2036,8 @@ retry:
* flag.
*/
ASSERT(curthread->t_pollcache == NULL);
- error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0,
+ error = VOP_POLL(fp->f_vnode,
+ pollfdp[entry].events | ps->ps_implicit_ev, 0,
&pollfdp[entry].revents, &php, NULL);
/*
* releasef after completely done with this cached
@@ -2291,6 +2336,7 @@ pollstate_create()
} else {
ASSERT(ps->ps_depth == 0);
ASSERT(ps->ps_flags == 0);
+ ASSERT(ps->ps_implicit_ev == 0);
ASSERT(ps->ps_pc_stack[0] == 0);
}
return (ps);
@@ -3025,7 +3071,7 @@ plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp,
php = NULL;
ASSERT(curthread->t_pollcache == NULL);
error = VOP_POLL(fp->f_vnode,
- pollfdp[i].events, 0,
+ pollfdp[i].events | psp->ps_implicit_ev, 0,
&pollfdp[i].revents, &php, NULL);
if (error) {
return (error);
diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c
index 3e0e63f4c0..09f3266ab4 100644
--- a/usr/src/uts/common/syscall/rusagesys.c
+++ b/usr/src/uts/common/syscall/rusagesys.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc. All rights reserved.
*/
/*
diff --git a/usr/src/uts/common/syscall/rw.c b/usr/src/uts/common/syscall/rw.c
index a28894b2c9..23f03e841d 100644
--- a/usr/src/uts/common/syscall/rw.c
+++ b/usr/src/uts/common/syscall/rw.c
@@ -22,7 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -50,6 +50,7 @@
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/nbmlock.h>
+#include <sys/limits.h>
#define COPYOUT_MAX_CACHE (1<<17) /* 128K */
@@ -607,19 +608,12 @@ out:
return (bcount);
}
-/*
- * XXX -- The SVID refers to IOV_MAX, but doesn't define it. Grrrr....
- * XXX -- However, SVVS expects readv() and writev() to fail if
- * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source),
- * XXX -- so I guess that's the "interface".
- */
-#define DEF_IOV_MAX 16
-
ssize_t
readv(int fdes, struct iovec *iovp, int iovcnt)
{
struct uio auio;
- struct iovec aiov[DEF_IOV_MAX];
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ int aiovlen = 0;
file_t *fp;
register vnode_t *vp;
struct cpu *cp;
@@ -630,9 +624,14 @@ readv(int fdes, struct iovec *iovp, int iovcnt)
u_offset_t fileoff;
int in_crit = 0;
- if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+ if (iovcnt <= 0 || iovcnt > IOV_MAX)
return (set_errno(EINVAL));
+ if (iovcnt > IOV_MAX_STACK) {
+ aiovlen = iovcnt * sizeof (iovec_t);
+ aiov = kmem_alloc(aiovlen, KM_SLEEP);
+ }
+
#ifdef _SYSCALL32_IMPL
/*
* 32-bit callers need to have their iovec expanded,
@@ -640,36 +639,63 @@ readv(int fdes, struct iovec *iovp, int iovcnt)
* of data in a single call.
*/
if (get_udatamodel() == DATAMODEL_ILP32) {
- struct iovec32 aiov32[DEF_IOV_MAX];
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ int aiov32len;
ssize32_t count32;
- if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+ aiov32len = iovcnt * sizeof (iovec32_t);
+ if (aiovlen != 0)
+ aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
+
+ if (copyin(iovp, aiov32, aiov32len)) {
+ if (aiovlen != 0) {
+ kmem_free(aiov32, aiov32len);
+ kmem_free(aiov, aiovlen);
+ }
return (set_errno(EFAULT));
+ }
count32 = 0;
for (i = 0; i < iovcnt; i++) {
ssize32_t iovlen32 = aiov32[i].iov_len;
count32 += iovlen32;
- if (iovlen32 < 0 || count32 < 0)
+ if (iovlen32 < 0 || count32 < 0) {
+ if (aiovlen != 0) {
+ kmem_free(aiov32, aiov32len);
+ kmem_free(aiov, aiovlen);
+ }
return (set_errno(EINVAL));
+ }
aiov[i].iov_len = iovlen32;
aiov[i].iov_base =
(caddr_t)(uintptr_t)aiov32[i].iov_base;
}
+
+ if (aiovlen != 0)
+ kmem_free(aiov32, aiov32len);
} else
#endif
- if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+ if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EFAULT));
+ }
count = 0;
for (i = 0; i < iovcnt; i++) {
ssize_t iovlen = aiov[i].iov_len;
count += iovlen;
- if (iovlen < 0 || count < 0)
+ if (iovlen < 0 || count < 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EINVAL));
+ }
}
- if ((fp = getf(fdes)) == NULL)
+ if ((fp = getf(fdes)) == NULL) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EBADF));
+ }
if (((fflag = fp->f_flag) & FREAD) == 0) {
error = EBADF;
goto out;
@@ -768,6 +794,8 @@ out:
if (in_crit)
nbl_end_crit(vp);
releasef(fdes);
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
if (error)
return (set_errno(error));
return (count);
@@ -777,7 +805,8 @@ ssize_t
writev(int fdes, struct iovec *iovp, int iovcnt)
{
struct uio auio;
- struct iovec aiov[DEF_IOV_MAX];
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ int aiovlen = 0;
file_t *fp;
register vnode_t *vp;
struct cpu *cp;
@@ -788,9 +817,14 @@ writev(int fdes, struct iovec *iovp, int iovcnt)
u_offset_t fileoff;
int in_crit = 0;
- if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+ if (iovcnt <= 0 || iovcnt > IOV_MAX)
return (set_errno(EINVAL));
+ if (iovcnt > IOV_MAX_STACK) {
+ aiovlen = iovcnt * sizeof (iovec_t);
+ aiov = kmem_alloc(aiovlen, KM_SLEEP);
+ }
+
#ifdef _SYSCALL32_IMPL
/*
* 32-bit callers need to have their iovec expanded,
@@ -798,36 +832,62 @@ writev(int fdes, struct iovec *iovp, int iovcnt)
* of data in a single call.
*/
if (get_udatamodel() == DATAMODEL_ILP32) {
- struct iovec32 aiov32[DEF_IOV_MAX];
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ int aiov32len;
ssize32_t count32;
- if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+ aiov32len = iovcnt * sizeof (iovec32_t);
+ if (aiovlen != 0)
+ aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
+
+ if (copyin(iovp, aiov32, aiov32len)) {
+ if (aiovlen != 0) {
+ kmem_free(aiov32, aiov32len);
+ kmem_free(aiov, aiovlen);
+ }
return (set_errno(EFAULT));
+ }
count32 = 0;
for (i = 0; i < iovcnt; i++) {
ssize32_t iovlen = aiov32[i].iov_len;
count32 += iovlen;
- if (iovlen < 0 || count32 < 0)
+ if (iovlen < 0 || count32 < 0) {
+ if (aiovlen != 0) {
+ kmem_free(aiov32, aiov32len);
+ kmem_free(aiov, aiovlen);
+ }
return (set_errno(EINVAL));
+ }
aiov[i].iov_len = iovlen;
aiov[i].iov_base =
(caddr_t)(uintptr_t)aiov32[i].iov_base;
}
+ if (aiovlen != 0)
+ kmem_free(aiov32, aiov32len);
} else
#endif
- if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+ if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EFAULT));
+ }
count = 0;
for (i = 0; i < iovcnt; i++) {
ssize_t iovlen = aiov[i].iov_len;
count += iovlen;
- if (iovlen < 0 || count < 0)
+ if (iovlen < 0 || count < 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EINVAL));
+ }
}
- if ((fp = getf(fdes)) == NULL)
+ if ((fp = getf(fdes)) == NULL) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EBADF));
+ }
if (((fflag = fp->f_flag) & FWRITE) == 0) {
error = EBADF;
goto out;
@@ -917,6 +977,8 @@ out:
if (in_crit)
nbl_end_crit(vp);
releasef(fdes);
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
if (error)
return (set_errno(error));
return (count);
@@ -927,7 +989,8 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
off_t extended_offset)
{
struct uio auio;
- struct iovec aiov[DEF_IOV_MAX];
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ int aiovlen = 0;
file_t *fp;
register vnode_t *vp;
struct cpu *cp;
@@ -936,25 +999,35 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
int error = 0;
int i;
+ /*
+ * In a 64-bit kernel, this interface supports native 64-bit
+ * applications as well as 32-bit applications using both standard and
+ * large-file access. For 32-bit large-file aware applications, the
+ * offset is passed as two parameters which are joined into the actual
+ * offset used. The 64-bit libc always passes 0 for the extended_offset.
+ * Note that off_t is a signed value, but the preadv/pwritev API treats
+ * the offset as a position in the file for the operation, so passing
+ * a negative value will likely fail the maximum offset checks below
+ * because we convert it to an unsigned value which will be larger than
+ * the maximum valid offset.
+ */
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
(u_offset_t)offset;
#else /* _SYSCALL32_IMPL || _ILP32 */
u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
#endif /* _SYSCALL32_IMPL || _ILP32 */
-#ifdef _SYSCALL32_IMPL
- const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
- extended_offset == 0?
- MAXOFF32_T : MAXOFFSET_T;
-#else /* _SYSCALL32_IMPL */
- const u_offset_t maxoff = MAXOFF32_T;
-#endif /* _SYSCALL32_IMPL */
int in_crit = 0;
- if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+ if (iovcnt <= 0 || iovcnt > IOV_MAX)
return (set_errno(EINVAL));
+ if (iovcnt > IOV_MAX_STACK) {
+ aiovlen = iovcnt * sizeof (iovec_t);
+ aiov = kmem_alloc(aiovlen, KM_SLEEP);
+ }
+
#ifdef _SYSCALL32_IMPL
/*
* 32-bit callers need to have their iovec expanded,
@@ -962,61 +1035,104 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
* of data in a single call.
*/
if (get_udatamodel() == DATAMODEL_ILP32) {
- struct iovec32 aiov32[DEF_IOV_MAX];
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ int aiov32len;
ssize32_t count32;
- if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+ aiov32len = iovcnt * sizeof (iovec32_t);
+ if (aiovlen != 0)
+ aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
+
+ if (copyin(iovp, aiov32, aiov32len)) {
+ if (aiovlen != 0) {
+ kmem_free(aiov32, aiov32len);
+ kmem_free(aiov, aiovlen);
+ }
return (set_errno(EFAULT));
+ }
count32 = 0;
for (i = 0; i < iovcnt; i++) {
ssize32_t iovlen32 = aiov32[i].iov_len;
count32 += iovlen32;
- if (iovlen32 < 0 || count32 < 0)
+ if (iovlen32 < 0 || count32 < 0) {
+ if (aiovlen != 0) {
+ kmem_free(aiov32, aiov32len);
+ kmem_free(aiov, aiovlen);
+ }
return (set_errno(EINVAL));
+ }
aiov[i].iov_len = iovlen32;
aiov[i].iov_base =
(caddr_t)(uintptr_t)aiov32[i].iov_base;
}
+ if (aiovlen != 0)
+ kmem_free(aiov32, aiov32len);
} else
#endif /* _SYSCALL32_IMPL */
- if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+ if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EFAULT));
+ }
count = 0;
for (i = 0; i < iovcnt; i++) {
ssize_t iovlen = aiov[i].iov_len;
count += iovlen;
- if (iovlen < 0 || count < 0)
+ if (iovlen < 0 || count < 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EINVAL));
+ }
}
- if ((bcount = (ssize_t)count) < 0)
+ if ((bcount = count) < 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EINVAL));
- if ((fp = getf(fdes)) == NULL)
+ }
+ if ((fp = getf(fdes)) == NULL) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EBADF));
+ }
if (((fflag = fp->f_flag) & FREAD) == 0) {
error = EBADF;
goto out;
}
vp = fp->f_vnode;
rwflag = 0;
- if (vp->v_type == VREG) {
+ /*
+	 * Behaviour is the same as read(2); see the comments in read(2).
+ */
+ if (vp->v_type == VREG) {
if (bcount == 0)
goto out;
- /*
- * return EINVAL for offsets that cannot be
- * represented in an off_t.
- */
- if (fileoff > maxoff) {
- error = EINVAL;
+ /* Handle offset past maximum offset allowed for file. */
+ if (fileoff >= OFFSET_MAX(fp)) {
+ struct vattr va;
+ va.va_mask = AT_SIZE;
+
+ error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL);
+ if (error == 0) {
+ if (fileoff >= va.va_size) {
+ count = 0;
+ } else {
+ error = EOVERFLOW;
+ }
+ }
goto out;
}
- if (fileoff + bcount > maxoff)
- bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
+ ASSERT(bcount == count);
+
+ /* Note: modified count used in nbl_conflict() call below. */
+ if ((fileoff + count) > OFFSET_MAX(fp))
+ count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
+
} else if (vp->v_type == VFIFO) {
error = ESPIPE;
goto out;
@@ -1033,8 +1149,7 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
error = nbl_svmand(vp, fp->f_cred, &svmand);
if (error != 0)
goto out;
- if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
- NULL)) {
+ if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) {
error = EACCES;
goto out;
}
@@ -1042,33 +1157,6 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
(void) VOP_RWLOCK(vp, rwflag, NULL);
- /*
- * Behaviour is same as read(2). Please see comments in
- * read(2).
- */
-
- if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
- struct vattr va;
- va.va_mask = AT_SIZE;
- if ((error =
- VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
- VOP_RWUNLOCK(vp, rwflag, NULL);
- goto out;
- }
- if (fileoff >= va.va_size) {
- VOP_RWUNLOCK(vp, rwflag, NULL);
- count = 0;
- goto out;
- } else {
- VOP_RWUNLOCK(vp, rwflag, NULL);
- error = EOVERFLOW;
- goto out;
- }
- }
- if ((vp->v_type == VREG) &&
- (fileoff + count > OFFSET_MAX(fp))) {
- count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
- }
auio.uio_loffset = fileoff;
auio.uio_iov = aiov;
auio.uio_iovcnt = iovcnt;
@@ -1099,6 +1187,8 @@ out:
if (in_crit)
nbl_end_crit(vp);
releasef(fdes);
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
if (error)
return (set_errno(error));
return (count);
@@ -1109,7 +1199,8 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
off_t extended_offset)
{
struct uio auio;
- struct iovec aiov[DEF_IOV_MAX];
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ int aiovlen = 0;
file_t *fp;
register vnode_t *vp;
struct cpu *cp;
@@ -1118,25 +1209,26 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
int error = 0;
int i;
+ /*
+ * See the comment in preadv for how the offset is handled.
+ */
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
(u_offset_t)offset;
#else /* _SYSCALL32_IMPL || _ILP32 */
u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
#endif /* _SYSCALL32_IMPL || _ILP32 */
-#ifdef _SYSCALL32_IMPL
- const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
- extended_offset == 0?
- MAXOFF32_T : MAXOFFSET_T;
-#else /* _SYSCALL32_IMPL */
- const u_offset_t maxoff = MAXOFF32_T;
-#endif /* _SYSCALL32_IMPL */
int in_crit = 0;
- if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+ if (iovcnt <= 0 || iovcnt > IOV_MAX)
return (set_errno(EINVAL));
+ if (iovcnt > IOV_MAX_STACK) {
+ aiovlen = iovcnt * sizeof (iovec_t);
+ aiov = kmem_alloc(aiovlen, KM_SLEEP);
+ }
+
#ifdef _SYSCALL32_IMPL
/*
* 32-bit callers need to have their iovec expanded,
@@ -1144,58 +1236,92 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
* of data in a single call.
*/
if (get_udatamodel() == DATAMODEL_ILP32) {
- struct iovec32 aiov32[DEF_IOV_MAX];
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ int aiov32len;
ssize32_t count32;
- if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+ aiov32len = iovcnt * sizeof (iovec32_t);
+ if (aiovlen != 0)
+ aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
+
+ if (copyin(iovp, aiov32, aiov32len)) {
+ if (aiovlen != 0) {
+ kmem_free(aiov32, aiov32len);
+ kmem_free(aiov, aiovlen);
+ }
return (set_errno(EFAULT));
+ }
count32 = 0;
for (i = 0; i < iovcnt; i++) {
ssize32_t iovlen32 = aiov32[i].iov_len;
count32 += iovlen32;
- if (iovlen32 < 0 || count32 < 0)
+ if (iovlen32 < 0 || count32 < 0) {
+ if (aiovlen != 0) {
+ kmem_free(aiov32, aiov32len);
+ kmem_free(aiov, aiovlen);
+ }
return (set_errno(EINVAL));
+ }
aiov[i].iov_len = iovlen32;
aiov[i].iov_base =
(caddr_t)(uintptr_t)aiov32[i].iov_base;
}
+ if (aiovlen != 0)
+ kmem_free(aiov32, aiov32len);
} else
#endif /* _SYSCALL32_IMPL */
- if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+ if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EFAULT));
+ }
count = 0;
for (i = 0; i < iovcnt; i++) {
ssize_t iovlen = aiov[i].iov_len;
count += iovlen;
- if (iovlen < 0 || count < 0)
+ if (iovlen < 0 || count < 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EINVAL));
+ }
}
- if ((bcount = (ssize_t)count) < 0)
+ if ((bcount = count) < 0) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EINVAL));
- if ((fp = getf(fdes)) == NULL)
+ }
+ if ((fp = getf(fdes)) == NULL) {
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
return (set_errno(EBADF));
+ }
if (((fflag = fp->f_flag) & FWRITE) == 0) {
error = EBADF;
goto out;
}
vp = fp->f_vnode;
rwflag = 1;
- if (vp->v_type == VREG) {
+ /*
+ * The kernel's write(2) code checks the rctl & OFFSET_MAX and returns
+ * EFBIG when fileoff exceeds either limit. We do the same.
+ */
+ if (vp->v_type == VREG) {
if (bcount == 0)
goto out;
/*
- * return EINVAL for offsets that cannot be
- * represented in an off_t.
+ * Don't allow pwritev to cause file size to exceed the proper
+ * offset limit.
*/
- if (fileoff > maxoff) {
- error = EINVAL;
+ if (fileoff >= OFFSET_MAX(fp)) {
+ error = EFBIG;
goto out;
}
+
/*
* Take appropriate action if we are trying
* to write above the resource limit.
@@ -1218,17 +1344,13 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
error = EFBIG;
goto out;
}
- /*
- * Don't allow pwritev to cause file sizes to exceed
- * maxoff.
- */
- if (fileoff == maxoff) {
- error = EFBIG;
- goto out;
- }
- if (fileoff + bcount > maxoff)
- bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
+ ASSERT(bcount == count);
+
+ /* Note: modified count used in nbl_conflict() call below. */
+ if ((fileoff + count) > OFFSET_MAX(fp))
+ count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
+
} else if (vp->v_type == VFIFO) {
error = ESPIPE;
goto out;
@@ -1245,8 +1367,7 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
error = nbl_svmand(vp, fp->f_cred, &svmand);
if (error != 0)
goto out;
- if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
- NULL)) {
+ if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) {
error = EACCES;
goto out;
}
@@ -1254,34 +1375,6 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
(void) VOP_RWLOCK(vp, rwflag, NULL);
-
- /*
- * Behaviour is same as write(2). Please see comments for
- * write(2).
- */
-
- if (vp->v_type == VREG) {
- if (fileoff >= curproc->p_fsz_ctl) {
- VOP_RWUNLOCK(vp, rwflag, NULL);
- mutex_enter(&curproc->p_lock);
- /* see above rctl_action comment */
- (void) rctl_action(
- rctlproc_legacy[RLIMIT_FSIZE],
- curproc->p_rctls,
- curproc, RCA_UNSAFE_SIGINFO);
- mutex_exit(&curproc->p_lock);
- error = EFBIG;
- goto out;
- }
- if (fileoff >= OFFSET_MAX(fp)) {
- VOP_RWUNLOCK(vp, rwflag, NULL);
- error = EFBIG;
- goto out;
- }
- if (fileoff + count > OFFSET_MAX(fp))
- count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
- }
-
auio.uio_loffset = fileoff;
auio.uio_iov = aiov;
auio.uio_iovcnt = iovcnt;
@@ -1308,6 +1401,8 @@ out:
if (in_crit)
nbl_end_crit(vp);
releasef(fdes);
+ if (aiovlen != 0)
+ kmem_free(aiov, aiovlen);
if (error)
return (set_errno(error));
return (count);
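
The allocation pattern repeated across readv, writev, preadv and pwritev
above distills to one idiom: a fixed stack array for the common case,
kmem_alloc() only when iovcnt exceeds IOV_MAX_STACK, and aiovlen doubling as
the "heap-allocated" marker so every exit path frees correctly. A condensed
sketch (do_iov_io is an illustrative placeholder for the real work; the
copyin of the user iovec is elided for brevity):

	static ssize_t
	iov_io(int fdes, struct iovec *iovp, int iovcnt)
	{
		struct iovec buf[IOV_MAX_STACK], *aiov = buf;
		int aiovlen = 0;
		ssize_t rv;

		if (iovcnt <= 0 || iovcnt > IOV_MAX)
			return (set_errno(EINVAL));

		if (iovcnt > IOV_MAX_STACK) {
			/* Too large for the stack; use the kmem arena. */
			aiovlen = iovcnt * sizeof (iovec_t);
			aiov = kmem_alloc(aiovlen, KM_SLEEP);
		}

		rv = do_iov_io(fdes, aiov, iovcnt);

		/* aiovlen != 0 records that aiov came from kmem_alloc(). */
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (rv);
	}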
diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c
index 0cfafbf13f..16c6fdd27e 100644
--- a/usr/src/uts/common/syscall/sendfile.c
+++ b/usr/src/uts/common/syscall/sendfile.c
@@ -82,7 +82,7 @@ extern sotpi_info_t *sotpi_sototpi(struct sonode *);
* 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
* more than 2GB of data.
*/
-int
+static int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
int copy_cnt, ssize32_t *count)
{
@@ -343,7 +343,7 @@ sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
return (0);
}
-ssize32_t
+static ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
size32_t *xferred, int fildes)
{
@@ -390,7 +390,7 @@ sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
}
#endif
-int
+static int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
@@ -680,7 +680,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
}
-int
+static int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
int copy_cnt, ssize_t *count)
{
@@ -1174,6 +1174,17 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
} else {
maxblk = (int)vp->v_stream->sd_maxblk;
}
+
+ /*
+ * We need to make sure that the socket that we're sending on
+ * supports sendfile behavior. sockfs doesn't know that the APIs
+ * we want to use are coming from sendfile, so we can't rely on
+ * it to check for us.
+ */
+ if ((so->so_mode & SM_SENDFILESUPP) == 0) {
+ error = EOPNOTSUPP;
+ goto err;
+ }
break;
case VREG:
break;
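
From userland this check surfaces as sendfilev(3EXT), linked with
-lsendfile, failing with EOPNOTSUPP on sockets whose protocol lacks sendfile
support. A hedged sketch of a caller that detects this and falls back
(copy_fallback is an assumed helper, not a real API):

	#include <sys/sendfile.h>
	#include <errno.h>

	static ssize_t
	send_one(int sock, int fd, off_t off, size_t len)
	{
		struct sendfilevec sfv = { 0 };
		size_t xferred = 0;
		ssize_t n;

		sfv.sfv_fd = fd;
		sfv.sfv_off = off;
		sfv.sfv_len = len;

		n = sendfilev(sock, &sfv, 1, &xferred);
		if (n == -1 && errno == EOPNOTSUPP) {
			/* Socket can't do sendfile; copy via a buffer. */
			return (copy_fallback(sock, fd, off, len));
		}
		return (n);
	}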
diff --git a/usr/src/uts/common/syscall/stat.c b/usr/src/uts/common/syscall/stat.c
index 4085104cc7..93f26121bc 100644
--- a/usr/src/uts/common/syscall/stat.c
+++ b/usr/src/uts/common/syscall/stat.c
@@ -61,7 +61,7 @@
* to VOP_GETATTR
*/
-static int
+int
cstatat_getvp(int fd, char *name, int follow, vnode_t **vp, cred_t **cred)
{
vnode_t *startvp;
diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c
index 03f2fabe13..e09f4e85a2 100644
--- a/usr/src/uts/common/syscall/sysconfig.c
+++ b/usr/src/uts/common/syscall/sysconfig.c
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -111,6 +112,9 @@ sysconfig(int which)
case _CONFIG_NPROC_MAX:
return (max_ncpus);
+ case _CONFIG_NPROC_NCPU:
+ return (NCPU); /* Private sysconfig for direct NCPU access */
+
case _CONFIG_STACK_PROT:
return (curproc->p_stkprot & ~PROT_USER);
@@ -167,44 +171,29 @@ sysconfig(int which)
/*
* If the non-global zone has a phys. memory cap, use that.
* We always report the system-wide value for the global zone,
- * even though rcapd can be used on the global zone too.
+ * even though memory capping can be used on the global zone
+ * too.
*/
- if (!INGLOBALZONE(curproc) &&
- curproc->p_zone->zone_phys_mcap != 0)
- return (MIN(btop(curproc->p_zone->zone_phys_mcap),
- physinstalled));
+ if (!INGLOBALZONE(curproc)) {
+ pgcnt_t cap, free;
+
+ zone_get_physmem_data(curzone->zone_id, &cap, &free);
+ return (MIN(cap, physinstalled));
+ }
return (physinstalled);
case _CONFIG_AVPHYS_PAGES:
/*
- * If the non-global zone has a phys. memory cap, use
- * the phys. memory cap - zone's current rss. We always
- * report the system-wide value for the global zone, even
- * though rcapd can be used on the global zone too.
+ * If the non-global zone has a phys. memory cap, use its
+ * free value. We always report the system-wide value for the
+ * global zone, even though memory capping can be used on the
+ * global zone too.
*/
- if (!INGLOBALZONE(curproc) &&
- curproc->p_zone->zone_phys_mcap != 0) {
- pgcnt_t cap, rss, free;
- vmusage_t in_use;
- size_t cnt = 1;
-
- cap = btop(curproc->p_zone->zone_phys_mcap);
- if (cap > physinstalled)
- return (freemem);
-
- if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt,
- FKIOCTL) != 0)
- in_use.vmu_rss_all = 0;
- rss = btop(in_use.vmu_rss_all);
- /*
- * Because rcapd implements a soft cap, it is possible
- * for rss to be temporarily over the cap.
- */
- if (cap > rss)
- free = cap - rss;
- else
- free = 0;
+ if (!INGLOBALZONE(curproc)) {
+ pgcnt_t cap, free;
+
+ zone_get_physmem_data(curzone->zone_id, &cap, &free);
return (MIN(free, freemem));
}
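
Both sysconfig cases now delegate to a single helper. The sketch below captures the contract this diff appears to assume for zone_get_physmem_data(); the body is an illustrative guess, not the real implementation. zpers_pg_cnt is referenced later in the vm_usage.c hunk of this same changeset, while zpers_pg_limit is a hypothetical field name:

    /*
     * Assumed contract (sketch only): report a zone's physical-memory
     * cap and the pages still free under that cap, both in pages. The
     * callers above clamp cap to physinstalled and free to freemem.
     */
    void
    zone_get_physmem_data(zoneid_t zid, pgcnt_t *capp, pgcnt_t *freep)
    {
            pgcnt_t cap = zone_pdata[zid].zpers_pg_limit;   /* hypothetical */
            pgcnt_t used = zone_pdata[zid].zpers_pg_cnt;

            *capp = cap;
            *freep = (used < cap) ? (cap - used) : 0;
    }
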
diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c
index 858305504d..dfe7f22d44 100644
--- a/usr/src/uts/common/syscall/uadmin.c
+++ b/usr/src/uts/common/syscall/uadmin.c
@@ -22,7 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/param.h>
@@ -78,7 +78,7 @@ volatile int fastreboot_dryrun = 0;
* system with many zones.
*/
void
-killall(zoneid_t zoneid)
+killall(zoneid_t zoneid, boolean_t force)
{
proc_t *p;
@@ -108,7 +108,7 @@ killall(zoneid_t zoneid)
p->p_stat != SIDL &&
p->p_stat != SZOMB) {
mutex_enter(&p->p_lock);
- if (sigismember(&p->p_sig, SIGKILL)) {
+ if (!force && sigismember(&p->p_sig, SIGKILL)) {
mutex_exit(&p->p_lock);
p = p->p_next;
} else {
@@ -245,12 +245,13 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
*/
zone_shutdown_global();
- killall(ALL_ZONES);
+ killall(ALL_ZONES, B_FALSE);
/*
* If we are calling kadmin() from a kernel context then we
* do not release these resources.
*/
if (ttoproc(curthread) != &p0) {
+ mutex_enter(&curproc->p_lock);
VN_RELE(PTOU(curproc)->u_cdir);
if (PTOU(curproc)->u_rdir)
VN_RELE(PTOU(curproc)->u_rdir);
@@ -260,6 +261,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
PTOU(curproc)->u_cdir = rootdir;
PTOU(curproc)->u_rdir = NULL;
PTOU(curproc)->u_cwd = NULL;
+ mutex_exit(&curproc->p_lock);
}
/*
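
killall() now takes a force flag: the shutdown path above passes B_FALSE and keeps the old behavior of skipping processes that already have SIGKILL pending, while a B_TRUE caller re-signals them anyway. A hedged sketch of an assumed caller, such as a zone-halt path that escalates when processes ignore the first pass; zone_proc_count() is a hypothetical helper, while killall() and its new boolean_t argument are from this diff:

    static void
    zone_kill_everything(zoneid_t zid)
    {
            killall(zid, B_FALSE);          /* skip already-signaled procs */
            delay(SEC_TO_TICK(1));          /* grace period */
            if (zone_proc_count(zid) > 0)   /* hypothetical helper */
                    killall(zid, B_TRUE);   /* force: re-signal everything */
    }
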
diff --git a/usr/src/uts/common/syscall/umount.c b/usr/src/uts/common/syscall/umount.c
index a2deedb163..b25f89b6d5 100644
--- a/usr/src/uts/common/syscall/umount.c
+++ b/usr/src/uts/common/syscall/umount.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -125,6 +126,7 @@ umount2(char *pathp, int flag)
struct pathname pn;
struct vfs *vfsp;
int error;
+ boolean_t altroot;
/*
* Some flags are disallowed through the system call interface.
@@ -154,9 +156,12 @@ umount2(char *pathp, int flag)
* isn't in an environment with an alternate root (to the zone's root)
* directory, i.e. chroot(2).
*/
- if (secpolicy_fs_unmount(CRED(), NULL) != 0 ||
- (PTOU(curproc)->u_rdir != NULL &&
- PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp) ||
+ mutex_enter(&curproc->p_lock);
+ altroot = (PTOU(curproc)->u_rdir != NULL &&
+ PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp);
+ mutex_exit(&curproc->p_lock);
+
+ if (secpolicy_fs_unmount(CRED(), NULL) != 0 || altroot ||
(vfsp = vfs_mntpoint2vfsp(pn.pn_path)) == NULL) {
vnode_t *fsrootvp;
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h
index a2509e7bb6..3735139068 100644
--- a/usr/src/uts/common/vm/hat.h
+++ b/usr/src/uts/common/vm/hat.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *);
* call.
*
* int hat_pageunload(pp, forceflag)
- * unload all translations attached to pp.
+ * Unload all translations attached to pp. On x86 the bulk of the work is
+ * done by hat_page_inval.
+ *
+ * void hat_page_inval(pp, pgsz, curhat)
+ * Unload translations attached to pp. If curhat is provided, only the
+ * translation for that process is unloaded, otherwise all are unloaded.
*
* uint_t hat_pagesync(pp, flags)
* get hw stats from hardware into page struct and reset hw stats
@@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t);
void hat_page_clrattr(struct page *, uint_t);
uint_t hat_page_getattr(struct page *, uint_t);
int hat_pageunload(struct page *, uint_t);
+void hat_page_inval(struct page *, uint_t, struct hat *);
uint_t hat_pagesync(struct page *, uint_t);
ulong_t hat_page_getshare(struct page *);
int hat_page_checkshare(struct page *, ulong_t);
@@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t);
*/
#define HAT_ADV_PGUNLOAD 0x00
#define HAT_FORCE_PGUNLOAD 0x01
+#define HAT_CURPROC_PGUNLOAD 0x02
/*
* Attributes for hat_page_*attr, hat_setstats and
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 8747b96acc..ae9b0be758 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -229,6 +230,7 @@ struct as;
* p_nrm
* p_mapping
* p_share
+ * p_zoneid
*
* The following field is file system dependent. How it is used and
* the locking strategies applied are up to the individual file system
@@ -527,9 +529,8 @@ typedef struct page {
pfn_t p_pagenum; /* physical page number */
uint_t p_share; /* number of translations */
-#if defined(_LP64)
- uint_t p_sharepad; /* pad for growing p_share */
-#endif
+ short p_zoneid; /* zone page use tracking */
+ short p_pad1; /* TBD */
uint_t p_slckcnt; /* number of softlocks */
#if defined(__sparc)
uint_t p_kpmref; /* number of kpm mapping sharers */
diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c
index 7e48602189..7305c9c85a 100644
--- a/usr/src/uts/common/vm/page_lock.c
+++ b/usr/src/uts/common/vm/page_lock.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
@@ -140,9 +141,8 @@ static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE];
& (VPH_TABLE_SIZE - 1))
/*
- * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
- * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
- * VPH_TABLE_SIZE + 1.
+ * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes,
+ * one for kvps[KV_ZVP], and one for other kvps[] users.
*/
kmutex_t vph_mutex[VPH_TABLE_SIZE + 2];
@@ -888,10 +888,10 @@ static int page_vnode_mutex_stress = 0;
kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
- if (vp == &kvp)
+ if (vp == &kvp || vp == &kvps[KV_VVP])
return (&vph_mutex[VPH_TABLE_SIZE + 0]);
- if (vp == &zvp)
+ if (vp == &kvps[KV_ZVP])
return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
if (page_vnode_mutex_stress != 0)
diff --git a/usr/src/uts/common/vm/page_retire.c b/usr/src/uts/common/vm/page_retire.c
index 76be970a45..f4e8d0737f 100644
--- a/usr/src/uts/common/vm/page_retire.c
+++ b/usr/src/uts/common/vm/page_retire.c
@@ -22,6 +22,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -851,9 +852,8 @@ page_retire_incr_pend_count(void *datap)
{
PR_INCR_KSTAT(pr_pending);
- if ((datap == &kvp) || (datap == &zvp)) {
+ if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP])
PR_INCR_KSTAT(pr_pending_kas);
- }
}
void
@@ -861,9 +861,8 @@ page_retire_decr_pend_count(void *datap)
{
PR_DECR_KSTAT(pr_pending);
- if ((datap == &kvp) || (datap == &zvp)) {
+ if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP])
PR_DECR_KSTAT(pr_pending_kas);
- }
}
/*
diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c
index 439c859d96..0b116d6eba 100644
--- a/usr/src/uts/common/vm/seg_kmem.c
+++ b/usr/src/uts/common/vm/seg_kmem.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -122,6 +122,11 @@ vmem_t *static_alloc_arena; /* arena for allocating static memory */
vmem_t *zio_arena = NULL; /* arena for allocating zio memory */
vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */
+#if defined(__amd64)
+vmem_t *kvmm_arena; /* arena for vmm VA */
+struct seg kvmmseg; /* Segment for vmm memory */
+#endif
+
/*
* seg_kmem driver can map part of the kernel heap with large pages.
* Currently this functionality is implemented for sparc platforms only.
@@ -440,7 +445,7 @@ segkmem_badop()
/*ARGSUSED*/
static faultcode_t
segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
- enum fault_type type, enum seg_rw rw)
+ enum fault_type type, enum seg_rw rw)
{
pgcnt_t npages;
spgcnt_t pg;
@@ -655,13 +660,19 @@ segkmem_dump(struct seg *seg)
segkmem_dump_range, seg->s_as);
vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
segkmem_dump_range, seg->s_as);
+ /*
+ * We don't want to dump pages attached to kzioseg since they
+ * contain file data from ZFS. If this page's segment is
+ * kzioseg return instead of writing it to the dump device.
+ *
+	 * The same applies to vmm memory allocations.

+ */
} else if (seg == &kzioseg) {
- /*
- * We don't want to dump pages attached to kzioseg since they
- * contain file data from ZFS. If this page's segment is
- * kzioseg return instead of writing it to the dump device.
- */
return;
+#if defined(__amd64)
+ } else if (seg == &kvmmseg) {
+ return;
+#endif
} else {
segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
}
@@ -677,7 +688,7 @@ segkmem_dump(struct seg *seg)
/*ARGSUSED*/
static int
segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len,
- page_t ***ppp, enum lock_type type, enum seg_rw rw)
+ page_t ***ppp, enum lock_type type, enum seg_rw rw)
{
page_t **pplist, *pp;
pgcnt_t npages;
@@ -802,21 +813,18 @@ struct seg_ops segkmem_ops = {
};
int
-segkmem_zio_create(struct seg *seg)
-{
- ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
- seg->s_ops = &segkmem_ops;
- seg->s_data = &zvp;
- kas.a_size += seg->s_size;
- return (0);
-}
-
-int
segkmem_create(struct seg *seg)
{
ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
seg->s_ops = &segkmem_ops;
- seg->s_data = &kvp;
+ if (seg == &kzioseg)
+ seg->s_data = &kvps[KV_ZVP];
+#if defined(__amd64)
+ else if (seg == &kvmmseg)
+ seg->s_data = &kvps[KV_VVP];
+#endif
+ else
+ seg->s_data = &kvps[KV_KVP];
kas.a_size += seg->s_size;
return (0);
}
@@ -858,7 +866,7 @@ segkmem_page_create(void *addr, size_t size, int vmflag, void *arg)
*/
void *
segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
- page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
+ page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
{
page_t *ppl;
caddr_t addr = inaddr;
@@ -968,10 +976,10 @@ segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
return (segkmem_alloc_vn(vmp, size, vmflag, &kvp));
}
-void *
+static void *
segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
{
- return (segkmem_alloc_vn(vmp, size, vmflag, &zvp));
+ return (segkmem_alloc_vn(vmp, size, vmflag, &kvps[KV_ZVP]));
}
/*
@@ -980,8 +988,8 @@ segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
* we currently don't have a special kernel segment for non-paged
* kernel memory that is exported by drivers to user space.
*/
-static void
-segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
+void
+segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
void (*func)(page_t *))
{
page_t *pp;
@@ -1038,21 +1046,15 @@ segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
}
void
-segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, void (*func)(page_t *))
-{
- segkmem_free_vn(vmp, inaddr, size, &kvp, func);
-}
-
-void
segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
{
- segkmem_free_vn(vmp, inaddr, size, &kvp, NULL);
+ segkmem_xfree(vmp, inaddr, size, &kvp, NULL);
}
-void
+static void
segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size)
{
- segkmem_free_vn(vmp, inaddr, size, &zvp, NULL);
+ segkmem_xfree(vmp, inaddr, size, &kvps[KV_ZVP], NULL);
}
void
@@ -1534,8 +1536,21 @@ segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size)
ASSERT(zio_alloc_arena != NULL);
}
-#ifdef __sparc
+#if defined(__amd64)
+
+void
+segkmem_kvmm_init(void *base, size_t size)
+{
+ ASSERT(base != NULL);
+ ASSERT(size != 0);
+
+ kvmm_arena = vmem_create("kvmm_arena", base, size, 1024 * 1024,
+ NULL, NULL, NULL, 0, VM_SLEEP);
+
+ ASSERT(kvmm_arena != NULL);
+}
+#elif defined(__sparc)
static void *
segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag)
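
segkmem_kvmm_init() only creates the VA arena; consumers are expected to carve addresses out of it themselves via the standard vmem interfaces. A minimal sketch of how a vmm driver might take VA from kvmm_arena (the wrapper names are illustrative; backing the range with pages is not shown in this diff):

    #if defined(__amd64)
    /* Sketch: reserve a VA range for guest memory from kvmm_arena. */
    void *
    kvmm_va_alloc(size_t len)
    {
            /* Rounded to the 1 MB quantum from vmem_create() above. */
            return (vmem_alloc(kvmm_arena, len, VM_SLEEP));
    }

    void
    kvmm_va_free(void *addr, size_t len)
    {
            vmem_free(kvmm_arena, addr, len);
    }
    #endif
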
diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h
index 1db85826b1..9a20101670 100644
--- a/usr/src/uts/common/vm/seg_kmem.h
+++ b/usr/src/uts/common/vm/seg_kmem.h
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2017 RackTop Systems.
*/
@@ -65,12 +65,18 @@ extern vmem_t *static_arena; /* arena for caches to import static memory */
extern vmem_t *static_alloc_arena; /* arena for allocating static memory */
extern vmem_t *zio_arena; /* arena for zio caches */
extern vmem_t *zio_alloc_arena; /* arena for zio caches */
+
+#if defined(__amd64)
+extern struct seg kvmmseg; /* Segment for vmm mappings */
+extern vmem_t *kvmm_arena; /* arena for vmm VA */
+extern void segkmem_kvmm_init(void *, size_t);
+#endif
+
extern struct vnode kvps[];
/*
- * segkmem page vnodes
+ * segkmem page vnodes (please don't add more defines here...)
*/
#define kvp (kvps[KV_KVP])
-#define zvp (kvps[KV_ZVP])
#if defined(__sparc)
#define mpvp (kvps[KV_MPVP])
#define promvp (kvps[KV_PROMVP])
@@ -83,16 +89,14 @@ extern void *segkmem_xalloc(vmem_t *, void *, size_t, int, uint_t,
extern void *segkmem_alloc(vmem_t *, size_t, int);
extern void *segkmem_alloc_permanent(vmem_t *, size_t, int);
extern void segkmem_free(vmem_t *, void *, size_t);
-extern void segkmem_xfree(vmem_t *, void *, size_t, void (*)(page_t *));
+extern void segkmem_xfree(vmem_t *, void *, size_t,
+ struct vnode *, void (*)(page_t *));
extern void *boot_alloc(void *, size_t, uint_t);
extern void boot_mapin(caddr_t addr, size_t size);
extern void kernelheap_init(void *, void *, char *, void *, void *);
extern void segkmem_gc(void);
-extern void *segkmem_zio_alloc(vmem_t *, size_t, int);
-extern int segkmem_zio_create(struct seg *);
-extern void segkmem_zio_free(vmem_t *, void *, size_t);
extern void segkmem_zio_init(void *, size_t);
/*
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 8046d10212..da6393f792 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -7313,7 +7313,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = svd->vpage;
offset = svd->offset + (uintptr_t)(addr - seg->s_base);
bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
- ((flags & MS_INVALIDATE) ? B_INVAL : 0);
+ ((flags & MS_INVALIDATE) ? B_INVAL : 0) |
+ ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0);
if (attr) {
pageprot = attr & ~(SHARED|PRIVATE);
@@ -7338,11 +7339,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = &svd->vpage[seg_page(seg, addr)];
} else if (svd->vp && svd->amp == NULL &&
- (flags & MS_INVALIDATE) == 0) {
+ (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) {
/*
- * No attributes, no anonymous pages and MS_INVALIDATE flag
- * is not on, just use one big request.
+ * No attributes, no anonymous pages and MS_INVAL* flags
+ * are not on, just use one big request.
*/
err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
bflags, svd->cred, NULL);
@@ -7394,7 +7395,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
* might race in and lock the page after we unlock and before
* we do the PUTPAGE, then PUTPAGE simply does nothing.
*/
- if (flags & MS_INVALIDATE) {
+ if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) {
if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
page_unlock(pp);
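
segvn_sync() above translates the private MS_INVALCURPROC msync flag into B_INVAL | B_INVALCURONLY, so only the calling process's translations are torn down while other processes keep theirs. A hedged userland sketch, assuming MS_INVALCURPROC is exposed through <sys/mman.h> the same way MS_INVALIDATE is:

    #include <sys/mman.h>
    #include <errno.h>

    /*
     * Sketch: drop only this process's cached view of a mapped file,
     * leaving other processes' mappings intact. Whether applications
     * can see MS_INVALCURPROC is an assumption here.
     */
    static int
    invalidate_my_mapping(void *addr, size_t len)
    {
            if (msync(addr, len, MS_INVALCURPROC) != 0)
                    return (errno);
            return (0);
    }
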
diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c
index 853b092e6d..ec6d2b8920 100644
--- a/usr/src/uts/common/vm/vm_as.c
+++ b/usr/src/uts/common/vm/vm_as.c
@@ -58,6 +58,7 @@
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>
+#include <sys/ddi.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -72,6 +73,8 @@
clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
+ulong_t as_user_seg_limit = 0xffff; /* max segments in a (non-kas) AS */
+
static struct kmem_cache *as_cache;
static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
@@ -853,8 +856,6 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
int as_lock_held;
klwp_t *lwp = ttolwp(curthread);
-
-
retry:
/*
* Indicate that the lwp is not to be stopped while waiting for a
@@ -1724,6 +1725,20 @@ as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
p->p_rctls, p, RCA_UNSAFE_ALL);
return (ENOMEM);
}
+
+ /*
+ * Keep the number of segments in a userspace AS constrained to
+ * a reasonable limit. Linux enforces a value slightly less
+ * than 64k in order to avoid ELF limits if/when a process
+ * dumps core. While SunOS avoids that specific problem with
+ * other tricks, the limit is still valuable to keep kernel
+ * memory consumption in check.
+ */
+ if (avl_numnodes(&as->a_segtree) >= as_user_seg_limit) {
+ AS_LOCK_EXIT(as);
+ atomic_inc_32(&p->p_zone->zone_mfseglim);
+ return (ENOMEM);
+ }
}
if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
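
Since as_user_seg_limit is an ordinary kernel global, it should be tunable the usual way for such variables, via /etc/system, on a system whose workload legitimately needs more mappings per process. The value below is purely illustrative:

    * /etc/system sketch: raise the per-AS segment limit (default
    * 0xffff in this diff) for a workload with very many mappings.
    set as_user_seg_limit = 0x20000
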
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 78d1cb1a58..abccf82057 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -22,6 +22,7 @@
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -440,10 +441,26 @@ init_pages_pp_maximum()
}
}
+/*
+ * In the past, we limited the maximum pages that could be gotten to essentially
+ * 1/2 of the total pages on the system. However, this is too conservative for
+ * some cases. For example, a large virtual machine may need to use a
+ * significant portion of the system's memory. In practice,
+ * allowing more than 1/2 of the total pages is fine, but becomes problematic
+ * as we approach or exceed 75% of the pages on the system. Thus, we limit the
+ * maximum to 23/32 of the total pages, which is ~72%.
+ */
void
set_max_page_get(pgcnt_t target_total_pages)
{
- max_page_get = target_total_pages / 2;
+ max_page_get = (target_total_pages >> 5) * 23;
+ ASSERT3U(max_page_get, >, 0);
+}
+
+pgcnt_t
+get_max_page_get(void)
+{
+ return (max_page_get);
}
static pgcnt_t pending_delete;
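
A concrete check of the new formula, assuming 4 KiB pages on a 32 GiB machine: target_total_pages = 32 GiB / 4 KiB = 8388608 pages, so max_page_get = (8388608 >> 5) * 23 = 262144 * 23 = 6029312 pages, i.e. exactly 23 GiB, or 71.875% of memory, versus the previous cap of 4194304 pages (16 GiB, 50%).
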
@@ -1460,6 +1477,8 @@ page_create_throttle(pgcnt_t npages, int flags)
uint_t i;
pgcnt_t tf; /* effective value of throttlefree */
+ atomic_inc_64(&n_throttle);
+
/*
* Normal priority allocations.
*/
@@ -1492,7 +1511,7 @@ page_create_throttle(pgcnt_t npages, int flags)
tf = throttlefree -
((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
for (;;) {
fm = 0;
@@ -1579,7 +1598,7 @@ checkagain:
}
ASSERT(proc_pageout != NULL);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
"page_create_sleep_start: freemem %ld needfree %ld",
@@ -2226,7 +2245,7 @@ page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
if (nscan < desscan && freemem < minfree) {
TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
"pageout_cv_signal:freemem %ld", freemem);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
}
pp = rootpp;
@@ -2355,7 +2374,7 @@ page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
if (nscan < desscan && freemem < minfree) {
TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
"pageout_cv_signal:freemem %ld", freemem);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
}
/*
diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c
index 1b8d12eb8d..a206320a30 100644
--- a/usr/src/uts/common/vm/vm_pvn.c
+++ b/usr/src/uts/common/vm/vm_pvn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -432,7 +433,14 @@ pvn_write_done(page_t *plist, int flags)
page_io_unlock(pp);
page_unlock(pp);
}
- } else if (flags & B_INVAL) {
+ } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
+ /*
+ * If B_INVALCURONLY is set, then we handle that case
+ * in the next conditional if hat_page_is_mapped()
+ * indicates that there are no additional mappings
+ * to the page.
+ */
+
/*
* XXX - Failed writes with B_INVAL set are
* not handled appropriately.
@@ -573,8 +581,9 @@ pvn_write_done(page_t *plist, int flags)
}
/*
- * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
- * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
+ * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
+ * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
+ * B_DELWRI indicates that this page is part of a kluster
* operation and is only to be considered if it doesn't involve any
* waiting here. B_TRUNC indicates that the file is being truncated
* and so no i/o needs to be done. B_FORCE indicates that the page
@@ -628,13 +637,17 @@ pvn_getdirty(page_t *pp, int flags)
* If we want to free or invalidate the page then
* we need to unload it so that anyone who wants
* it will have to take a minor fault to get it.
+ * If we are only invalidating the page for the
+ * current process, then pass in a different flag.
* Otherwise, we're just writing the page back so we
* need to sync up the hardware and software mod bit to
* detect any future modifications. We clear the
* software mod bit when we put the page on the dirty
* list.
*/
- if (flags & (B_INVAL | B_FREE)) {
+ if (flags & B_INVALCURONLY) {
+ (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
+ } else if (flags & (B_INVAL | B_FREE)) {
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
} else {
(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
@@ -646,7 +659,7 @@ pvn_getdirty(page_t *pp, int flags)
* list after all.
*/
page_io_unlock(pp);
- if (flags & B_INVAL) {
+ if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else if (flags & B_FREE) {
@@ -658,6 +671,9 @@ pvn_getdirty(page_t *pp, int flags)
* of VOP_PUTPAGE() who prefer freeing the
* page _only_ if no one else is accessing it.
* E.g. segmap_release()
+ * We also take this path for B_INVALCURONLY and
+ * let page_release call VN_DISPOSE if no one else is
+ * using the page.
*
* The above hat_ismod() check is useless because:
* (1) we may not be holding SE_EXCL lock;
@@ -682,7 +698,7 @@ pvn_getdirty(page_t *pp, int flags)
* We'll detect the fact that they used it when the
* i/o is done and avoid freeing the page.
*/
- if (flags & B_FREE)
+ if (flags & (B_FREE | B_INVALCURONLY))
page_downgrade(pp);
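
The pvn_getdirty() dispatch above is the sole consumer of HAT_CURPROC_PGUNLOAD in this diff. A condensed view of the three-way unload decision, restated from the hunk for readability (this mirrors the diff; it is not new logic):

    if (flags & B_INVALCURONLY) {
            /* Invalidate for the calling process only. */
            (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
    } else if (flags & (B_INVAL | B_FREE)) {
            /* Invalidate or free: unload every translation. */
            (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
    } else {
            /* Plain writeback: just sync the mod bits. */
            (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
    }
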
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index e542e8e479..01c2666e91 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,6 +25,10 @@
*/
/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+/*
* vm_usage
*
* This file implements the getvmusage() private system call.
@@ -114,7 +118,7 @@
* For accurate counting of map-shared and COW-shared pages.
*
* - visited private anons (refcnt > 1) for each collective.
- * (entity->vme_anon_hash)
+ * (entity->vme_anon)
* For accurate counting of COW-shared pages.
*
* The common accounting structure is the vmu_entity_t, which represents
@@ -152,6 +156,7 @@
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
#include <sys/avl.h>
#include <vm/anon.h>
#include <vm/as.h>
@@ -199,6 +204,14 @@ typedef struct vmu_object {
} vmu_object_t;
/*
+ * Node for tree of visited COW anons.
+ */
+typedef struct vmu_anon {
+ avl_node_t vma_node;
+ uintptr_t vma_addr;
+} vmu_anon_t;
+
+/*
* Entity by which to count results.
*
* The entity structure keeps the current rss/swap counts for each entity
@@ -221,7 +234,7 @@ typedef struct vmu_entity {
struct vmu_entity *vme_next_calc;
mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
- mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
+ avl_tree_t vme_anon; /* COW anons visited for entity */
vmusage_t vme_result; /* identifies entity and results */
} vmu_entity_t;
@@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2)
}
/*
+ * Comparison routine for our AVL tree of anon structures.
+ */
+static int
+vmu_anon_cmp(const void *lhs, const void *rhs)
+{
+ const vmu_anon_t *l = lhs, *r = rhs;
+
+ if (l->vma_addr == r->vma_addr)
+ return (0);
+
+ if (l->vma_addr < r->vma_addr)
+ return (-1);
+
+ return (1);
+}
+
+/*
* Save a bound on the free list.
*/
static void
@@ -363,13 +393,18 @@ static void
vmu_free_entity(mod_hash_val_t val)
{
vmu_entity_t *entity = (vmu_entity_t *)val;
+ vmu_anon_t *anon;
+ void *cookie = NULL;
if (entity->vme_vnode_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_vnode_hash);
if (entity->vme_amp_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_amp_hash);
- if (entity->vme_anon_hash != NULL)
- i_mod_hash_clear_nosync(entity->vme_anon_hash);
+
+ while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
+ kmem_free(anon, sizeof (vmu_anon_t));
+
+ avl_destroy(&entity->vme_anon);
entity->vme_next = vmu_data.vmu_free_entities;
vmu_data.vmu_free_entities = entity;
@@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid)
"vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
sizeof (struct anon_map));
- if (entity->vme_anon_hash == NULL)
- entity->vme_anon_hash = mod_hash_create_ptrhash(
- "vmusage anon hash", VMUSAGE_HASH_SIZE,
- mod_hash_null_valdtor, sizeof (struct anon));
+ VERIFY(avl_first(&entity->vme_anon) == NULL);
+
+ avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
+ offsetof(struct vmu_anon, vma_node));
entity->vme_next = vmu_data.vmu_entities;
vmu_data.vmu_entities = entity;
@@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id)
zone->vmz_id = id;
- if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
@@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
}
static int
-vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
+vmu_find_insert_anon(vmu_entity_t *entity, void *key)
{
- int ret;
- caddr_t val;
+ vmu_anon_t anon, *ap;
- ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t *)&val);
+ anon.vma_addr = (uintptr_t)key;
- if (ret == 0)
+ if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
return (0);
- ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t)key, (mod_hash_hndl_t)0);
+ ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
+ ap->vma_addr = (uintptr_t)key;
- ASSERT(ret == 0);
+ avl_add(&entity->vme_anon, ap);
return (1);
}
@@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
next = AVL_NEXT(tree, next);
continue;
}
+
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
@@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
(page = page_exists(vn, off)) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
@@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
continue;
}
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
@@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
if (vnode->v_pages != NULL &&
(page = page_exists(vnode, ptob(index))) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
@@ -1304,6 +1351,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
}
/*
+ * Pages on the free list aren't counted for the rss.
+ */
+ if (PP_ISFREE(page))
+ continue;
+
+ /*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
@@ -1320,8 +1373,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
* Track COW anons per entity so
* they are not double counted.
*/
- if (vmu_find_insert_anon(entity->vme_anon_hash,
- (caddr_t)ap) == 0)
+ if (vmu_find_insert_anon(entity, ap) == 0)
continue;
result->vmu_rss_all += (pgcnt << PAGESHIFT);
@@ -1461,8 +1513,9 @@ vmu_calculate_proc(proc_t *p)
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
- (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
- VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
+ VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
VMUSAGE_ALL_EUSERS)) {
ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
@@ -1595,8 +1648,7 @@ vmu_free_extra()
mod_hash_destroy_hash(te->vme_vnode_hash);
if (te->vme_amp_hash != NULL)
mod_hash_destroy_hash(te->vme_amp_hash);
- if (te->vme_anon_hash != NULL)
- mod_hash_destroy_hash(te->vme_anon_hash);
+ VERIFY(avl_first(&te->vme_anon) == NULL);
kmem_free(te, sizeof (vmu_entity_t));
}
while (vmu_data.vmu_free_zones != NULL) {
@@ -1617,13 +1669,42 @@ vmu_free_extra()
extern kcondvar_t *pr_pid_cv;
+static void
+vmu_get_zone_rss(zoneid_t zid)
+{
+ vmu_zone_t *zone;
+ zone_t *zp;
+ int ret;
+ uint_t pgcnt;
+
+ if ((zp = zone_find_by_id(zid)) == NULL)
+ return;
+
+ ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)zid, (mod_hash_val_t *)&zone);
+ if (ret != 0) {
+ zone = vmu_alloc_zone(zid);
+ ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)zid,
+ (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
+ ASSERT(ret == 0);
+ }
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ pgcnt = zone_pdata[zid].zpers_pg_cnt;
+ zone->vmz_zone->vme_result.vmu_rss_all = (size_t)ptob(pgcnt);
+ zone->vmz_zone->vme_result.vmu_swap_all = zp->zone_max_swap;
+
+ zone_rele(zp);
+}
+
/*
* Determine which entity types are relevant and allocate the hashes to
- * track them. Then walk the process table and count rss and swap
- * for each process'es address space. Address space object such as
- * vnodes, amps and anons are tracked per entity, so that they are
- * not double counted in the results.
- *
+ * track them. First get the zone rss using the data we already have. Then,
+ * if necessary, walk the process table and count rss and swap for each
+ * process's address space. Address space objects such as vnodes, amps and
+ * anons are tracked per entity, so that they are not double counted in the
+ * results.
*/
static void
vmu_calculate()
@@ -1631,6 +1712,7 @@ vmu_calculate()
int i = 0;
int ret;
proc_t *p;
+ uint_t zone_flags = 0;
vmu_clear_calc();
@@ -1638,9 +1720,34 @@ vmu_calculate()
vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
ALL_ZONES);
+ zone_flags = vmu_data.vmu_calc_flags & VMUSAGE_ZONE_FLAGS;
+ if (zone_flags != 0) {
+ /*
+ * Use the accurate zone RSS data we already keep track of.
+ */
+ int i;
+
+ for (i = 0; i <= MAX_ZONEID; i++) {
+ if (zone_pdata[i].zpers_pg_cnt > 0) {
+ vmu_get_zone_rss(i);
+ }
+ }
+ }
+
+	/* If we only needed zone data, we're done. */
+ if ((vmu_data.vmu_calc_flags & ~VMUSAGE_ZONE_FLAGS) == 0) {
+ return;
+ }
+
+ DTRACE_PROBE(vmu__calculate__all);
+ vmu_data.vmu_calc_flags &= ~VMUSAGE_ZONE_FLAGS;
+
/*
* Walk process table and calculate rss of each proc.
*
+ * Since we already obtained all zone rss above, the following loop
+ * executes with the VMUSAGE_ZONE_FLAGS cleared.
+ *
* Pidlock and p_lock cannot be held while doing the rss calculation.
* This is because:
* 1. The calculation allocates using KM_SLEEP.
@@ -1695,6 +1802,12 @@ again:
mutex_exit(&pidlock);
vmu_free_extra();
+
+ /*
+ * Restore any caller-supplied zone flags we blocked during
+ * the process-table walk.
+ */
+ vmu_data.vmu_calc_flags |= zone_flags;
}
/*
@@ -1745,7 +1858,7 @@ vmu_cache_rele(vmu_cache_t *cache)
*/
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
- uint_t flags, int cpflg)
+ uint_t flags, id_t req_zone_id, int cpflg)
{
vmusage_t *result, *out_result;
vmusage_t dummy;
@@ -1764,7 +1877,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
/* figure out what results the caller is interested in. */
if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
types |= VMUSAGE_SYSTEM;
- if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+ if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
types |= VMUSAGE_ZONE;
if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS))
@@ -1827,26 +1940,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
continue;
}
- /* Skip "other zone" results if not requested */
- if (result->vmu_zoneid != curproc->p_zone->zone_id) {
- if (result->vmu_type == VMUSAGE_ZONE &&
- (flags & VMUSAGE_ALL_ZONES) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_PROJECTS &&
- (flags & (VMUSAGE_ALL_PROJECTS |
- VMUSAGE_COL_PROJECTS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_TASKS &&
- (flags & VMUSAGE_ALL_TASKS) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_RUSERS &&
- (flags & (VMUSAGE_ALL_RUSERS |
- VMUSAGE_COL_RUSERS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_EUSERS &&
- (flags & (VMUSAGE_ALL_EUSERS |
- VMUSAGE_COL_EUSERS)) == 0)
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ flags & VMUSAGE_A_ZONE) {
+ /* Skip non-requested zone results */
+ if (result->vmu_zoneid != req_zone_id)
continue;
+ } else {
+ /* Skip "other zone" results if not requested */
+ if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ (flags & VMUSAGE_ALL_ZONES) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_PROJECTS &&
+ (flags & (VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_COL_PROJECTS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_TASKS &&
+ (flags & VMUSAGE_ALL_TASKS) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_RUSERS &&
+ (flags & (VMUSAGE_ALL_RUSERS |
+ VMUSAGE_COL_RUSERS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_EUSERS &&
+ (flags & (VMUSAGE_ALL_EUSERS |
+ VMUSAGE_COL_EUSERS)) == 0)
+ continue;
+ }
}
count++;
if (out_result != NULL) {
@@ -1902,10 +2022,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
int cacherecent = 0;
hrtime_t now;
uint_t flags_orig;
+ id_t req_zone_id;
/*
* Non-global zones cannot request system wide and/or collated
- * results, or the system result, so munge the flags accordingly.
+ * results, or the system result, or usage of another zone, so munge
+ * the flags accordingly.
*/
flags_orig = flags;
if (curproc->p_zone != global_zone) {
@@ -1925,6 +2047,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
flags &= ~VMUSAGE_SYSTEM;
flags |= VMUSAGE_ZONE;
}
+ if (flags & VMUSAGE_A_ZONE) {
+ flags &= ~VMUSAGE_A_ZONE;
+ flags |= VMUSAGE_ZONE;
+ }
}
/* Check for unknown flags */
@@ -1935,6 +2061,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
if ((flags & VMUSAGE_MASK) == 0)
return (set_errno(EINVAL));
+ /* If requesting results for a specific zone, get the zone ID */
+ if (flags & VMUSAGE_A_ZONE) {
+ size_t bufsize;
+ vmusage_t zreq;
+
+ if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
+ return (set_errno(EFAULT));
+		/* Requested zone ID is passed in buf, so zero length is not allowed */
+ if (bufsize == 0)
+ return (set_errno(EINVAL));
+ if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
+ return (set_errno(EFAULT));
+ req_zone_id = zreq.vmu_id;
+ }
+
mutex_enter(&vmu_data.vmu_lock);
now = gethrtime();
@@ -1954,7 +2095,7 @@ start:
mutex_exit(&vmu_data.vmu_lock);
ret = vmu_copyout_results(cache, buf, nres, flags_orig,
- cpflg);
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
if (vmu_data.vmu_pending_waiters > 0)
@@ -2011,7 +2152,8 @@ start:
mutex_exit(&vmu_data.vmu_lock);
/* copy cache */
- ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
+ ret = vmu_copyout_results(cache, buf, nres, flags_orig,
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
mutex_exit(&vmu_data.vmu_lock);
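
With VMUSAGE_A_ZONE, the requested zone ID rides in the first vmusage_t of the caller's buffer, which is why a zero-length buffer is rejected above. A hedged sketch of a global-zone caller using the private getvmusage(2) interface; the prototype matches the existing libc usage, while the flag itself is new in this diff:

    #include <sys/vm_usage.h>
    #include <stdio.h>

    /* Sketch: fetch RSS/swap for one specific zone from the global zone. */
    static int
    print_zone_usage(id_t zoneid)
    {
            vmusage_t buf[4];
            size_t nres = sizeof (buf) / sizeof (buf[0]);
            size_t i;

            buf[0].vmu_id = zoneid;  /* requested zone, per this diff */
            if (getvmusage(VMUSAGE_A_ZONE, 5, buf, &nres) != 0)
                    return (-1);
            for (i = 0; i < nres; i++)
                    (void) printf("zone %d rss=%llu swap=%llu\n",
                        (int)buf[i].vmu_zoneid,
                        (u_longlong_t)buf[i].vmu_rss_all,
                        (u_longlong_t)buf[i].vmu_swap_all);
            return (0);
    }
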
diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c
index 761597653b..c21476df89 100644
--- a/usr/src/uts/common/xen/io/xnb.c
+++ b/usr/src/uts/common/xen/io/xnb.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#ifdef DEBUG
@@ -251,8 +252,8 @@ xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
* because it doesn't cover all of the interesting cases :-(
*/
mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
-
- return (mac_fix_cksum(mp));
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ return (mp);
}
mblk_t *